diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv4 | |
download | talos-obmc-linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.gz talos-obmc-linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.zip |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'net/ipv4')
155 files changed, 82733 insertions, 0 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig new file mode 100644 index 000000000000..6d3e8b1bd1f2 --- /dev/null +++ b/net/ipv4/Kconfig @@ -0,0 +1,411 @@ +# +# IP configuration +# +config IP_MULTICAST + bool "IP: multicasting" + depends on INET + help + This is code for addressing several networked computers at once, + enlarging your kernel by about 2 KB. You need multicasting if you + intend to participate in the MBONE, a high bandwidth network on top + of the Internet which carries audio and video broadcasts. More + information about the MBONE is on the WWW at + <http://www-itg.lbl.gov/mbone/>. Information about the multicast + capabilities of the various network cards is contained in + <file:Documentation/networking/multicast.txt>. For most people, it's + safe to say N. + +config IP_ADVANCED_ROUTER + bool "IP: advanced router" + depends on INET + ---help--- + If you intend to run your Linux box mostly as a router, i.e. as a + computer that forwards and redistributes network packets, say Y; you + will then be presented with several options that allow more precise + control about the routing process. + + The answer to this question won't directly affect the kernel: + answering N will just cause the configurator to skip all the + questions about advanced routing. + + Note that your box can only act as a router if you enable IP + forwarding in your kernel; you can do that by saying Y to "/proc + file system support" and "Sysctl support" below and executing the + line + + echo "1" > /proc/sys/net/ipv4/ip_forward + + at boot time after the /proc file system has been mounted. + + If you turn on IP forwarding, you will also get the rp_filter, which + automatically rejects incoming packets if the routing table entry + for their source address doesn't match the network interface they're + arriving on. This has security advantages because it prevents the + so-called IP spoofing, however it can pose problems if you use + asymmetric routing (packets from you to a host take a different path + than packets from that host to you) or if you operate a non-routing + host which has several IP addresses on different interfaces. To turn + rp_filter off use: + + echo 0 > /proc/sys/net/ipv4/conf/<device>/rp_filter + or + echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter + + If unsure, say N here. + +config IP_MULTIPLE_TABLES + bool "IP: policy routing" + depends on IP_ADVANCED_ROUTER + ---help--- + Normally, a router decides what to do with a received packet based + solely on the packet's final destination address. If you say Y here, + the Linux router will also be able to take the packet's source + address into account. Furthermore, the TOS (Type-Of-Service) field + of the packet can be used for routing decisions as well. + + If you are interested in this, please see the preliminary + documentation at <http://www.compendium.com.ar/policy-routing.txt> + and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>. + You will need supporting software from + <ftp://ftp.tux.org/pub/net/ip-routing/>. + + If unsure, say N. + +config IP_ROUTE_FWMARK + bool "IP: use netfilter MARK value as routing key" + depends on IP_MULTIPLE_TABLES && NETFILTER + help + If you say Y here, you will be able to specify different routes for + packets with different mark values (see iptables(8), MARK target). + +config IP_ROUTE_MULTIPATH + bool "IP: equal cost multipath" + depends on IP_ADVANCED_ROUTER + help + Normally, the routing tables specify a single action to be taken in + a deterministic manner for a given packet. If you say Y here + however, it becomes possible to attach several actions to a packet + pattern, in effect specifying several alternative paths to travel + for those packets. The router considers all these paths to be of + equal "cost" and chooses one of them in a non-deterministic fashion + if a matching packet arrives. + +config IP_ROUTE_MULTIPATH_CACHED + bool "IP: equal cost multipath with caching support (EXPERIMENTAL)" + depends on: IP_ROUTE_MULTIPATH + help + Normally, equal cost multipath routing is not supported by the + routing cache. If you say Y here, alternative routes are cached + and on cache lookup a route is chosen in a configurable fashion. + + If unsure, say N. + +config IP_ROUTE_MULTIPATH_RR + tristate "MULTIPATH: round robin algorithm" + depends on IP_ROUTE_MULTIPATH_CACHED + help + Mulitpath routes are chosen according to Round Robin + +config IP_ROUTE_MULTIPATH_RANDOM + tristate "MULTIPATH: random algorithm" + depends on IP_ROUTE_MULTIPATH_CACHED + help + Multipath routes are chosen in a random fashion. Actually, + there is no weight for a route. The advantage of this policy + is that it is implemented stateless and therefore introduces only + a very small delay. + +config IP_ROUTE_MULTIPATH_WRANDOM + tristate "MULTIPATH: weighted random algorithm" + depends on IP_ROUTE_MULTIPATH_CACHED + help + Multipath routes are chosen in a weighted random fashion. + The per route weights are the weights visible via ip route 2. As the + corresponding state management introduces some overhead routing delay + is increased. + +config IP_ROUTE_MULTIPATH_DRR + tristate "MULTIPATH: interface round robin algorithm" + depends on IP_ROUTE_MULTIPATH_CACHED + help + Connections are distributed in a round robin fashion over the + available interfaces. This policy makes sense if the connections + should be primarily distributed on interfaces and not on routes. + +config IP_ROUTE_VERBOSE + bool "IP: verbose route monitoring" + depends on IP_ADVANCED_ROUTER + help + If you say Y here, which is recommended, then the kernel will print + verbose messages regarding the routing, for example warnings about + received packets which look strange and could be evidence of an + attack or a misconfigured system somewhere. The information is + handled by the klogd daemon which is responsible for kernel messages + ("man klogd"). + +config IP_PNP + bool "IP: kernel level autoconfiguration" + depends on INET + help + This enables automatic configuration of IP addresses of devices and + of the routing table during kernel boot, based on either information + supplied on the kernel command line or by BOOTP or RARP protocols. + You need to say Y only for diskless machines requiring network + access to boot (in which case you want to say Y to "Root file system + on NFS" as well), because all other machines configure the network + in their startup scripts. + +config IP_PNP_DHCP + bool "IP: DHCP support" + depends on IP_PNP + ---help--- + If you want your Linux box to mount its whole root file system (the + one containing the directory /) from some other computer over the + net via NFS and you want the IP address of your computer to be + discovered automatically at boot time using the DHCP protocol (a + special protocol designed for doing this job), say Y here. In case + the boot ROM of your network card was designed for booting Linux and + does DHCP itself, providing all necessary information on the kernel + command line, you can say N here. + + If unsure, say Y. Note that if you want to use DHCP, a DHCP server + must be operating on your network. Read + <file:Documentation/nfsroot.txt> for details. + +config IP_PNP_BOOTP + bool "IP: BOOTP support" + depends on IP_PNP + ---help--- + If you want your Linux box to mount its whole root file system (the + one containing the directory /) from some other computer over the + net via NFS and you want the IP address of your computer to be + discovered automatically at boot time using the BOOTP protocol (a + special protocol designed for doing this job), say Y here. In case + the boot ROM of your network card was designed for booting Linux and + does BOOTP itself, providing all necessary information on the kernel + command line, you can say N here. If unsure, say Y. Note that if you + want to use BOOTP, a BOOTP server must be operating on your network. + Read <file:Documentation/nfsroot.txt> for details. + +config IP_PNP_RARP + bool "IP: RARP support" + depends on IP_PNP + help + If you want your Linux box to mount its whole root file system (the + one containing the directory /) from some other computer over the + net via NFS and you want the IP address of your computer to be + discovered automatically at boot time using the RARP protocol (an + older protocol which is being obsoleted by BOOTP and DHCP), say Y + here. Note that if you want to use RARP, a RARP server must be + operating on your network. Read <file:Documentation/nfsroot.txt> for + details. + +# not yet ready.. +# bool ' IP: ARP support' CONFIG_IP_PNP_ARP +config NET_IPIP + tristate "IP: tunneling" + depends on INET + select INET_TUNNEL + ---help--- + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This particular tunneling driver implements + encapsulation of IP within IP, which sounds kind of pointless, but + can be useful if you want to make your (or some other) machine + appear on a different network than it physically is, or to use + mobile-IP facilities (allowing laptops to seamlessly move between + networks without changing their IP addresses). + + Saying Y to this option will produce two modules ( = code which can + be inserted in and removed from the running kernel whenever you + want). Most people won't need this and can say N. + +config NET_IPGRE + tristate "IP: GRE tunnels over IP" + depends on INET + select XFRM + help + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This particular tunneling driver implements + GRE (Generic Routing Encapsulation) and at this time allows + encapsulating of IPv4 or IPv6 over existing IPv4 infrastructure. + This driver is useful if the other endpoint is a Cisco router: Cisco + likes GRE much better than the other Linux tunneling driver ("IP + tunneling" above). In addition, GRE allows multicast redistribution + through the tunnel. + +config NET_IPGRE_BROADCAST + bool "IP: broadcast GRE over IP" + depends on IP_MULTICAST && NET_IPGRE + help + One application of GRE/IP is to construct a broadcast WAN (Wide Area + Network), which looks like a normal Ethernet LAN (Local Area + Network), but can be distributed all over the Internet. If you want + to do that, say Y here and to "IP multicast routing" below. + +config IP_MROUTE + bool "IP: multicast routing" + depends on IP_MULTICAST + help + This is used if you want your machine to act as a router for IP + packets that have several destination addresses. It is needed on the + MBONE, a high bandwidth network on top of the Internet which carries + audio and video broadcasts. In order to do that, you would most + likely run the program mrouted. Information about the multicast + capabilities of the various network cards is contained in + <file:Documentation/networking/multicast.txt>. If you haven't heard + about it, you don't need it. + +config IP_PIMSM_V1 + bool "IP: PIM-SM version 1 support" + depends on IP_MROUTE + help + Kernel side support for Sparse Mode PIM (Protocol Independent + Multicast) version 1. This multicast routing protocol is used widely + because Cisco supports it. You need special software to use it + (pimd-v1). Please see <http://netweb.usc.edu/pim/> for more + information about PIM. + + Say Y if you want to use PIM-SM v1. Note that you can say N here if + you just want to use Dense Mode PIM. + +config IP_PIMSM_V2 + bool "IP: PIM-SM version 2 support" + depends on IP_MROUTE + help + Kernel side support for Sparse Mode PIM version 2. In order to use + this, you need an experimental routing daemon supporting it (pimd or + gated-5). This routing protocol is not used widely, so say N unless + you want to play with it. + +config ARPD + bool "IP: ARP daemon support (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + ---help--- + Normally, the kernel maintains an internal cache which maps IP + addresses to hardware addresses on the local network, so that + Ethernet/Token Ring/ etc. frames are sent to the proper address on + the physical networking layer. For small networks having a few + hundred directly connected hosts or less, keeping this address + resolution (ARP) cache inside the kernel works well. However, + maintaining an internal ARP cache does not work well for very large + switched networks, and will use a lot of kernel memory if TCP/IP + connections are made to many machines on the network. + + If you say Y here, the kernel's internal ARP cache will never grow + to more than 256 entries (the oldest entries are expired in a LIFO + manner) and communication will be attempted with the user space ARP + daemon arpd. Arpd then answers the address resolution request either + from its own cache or by asking the net. + + This code is experimental and also obsolete. If you want to use it, + you need to find a version of the daemon arpd on the net somewhere, + and you should also say Y to "Kernel/User network link driver", + below. If unsure, say N. + +config SYN_COOKIES + bool "IP: TCP syncookie support (disabled per default)" + depends on INET + ---help--- + Normal TCP/IP networking is open to an attack known as "SYN + flooding". This denial-of-service attack prevents legitimate remote + users from being able to connect to your computer during an ongoing + attack and requires very little work from the attacker, who can + operate from anywhere on the Internet. + + SYN cookies provide protection against this type of attack. If you + say Y here, the TCP/IP stack will use a cryptographic challenge + protocol known as "SYN cookies" to enable legitimate users to + continue to connect, even when your machine is under attack. There + is no need for the legitimate users to change their TCP/IP software; + SYN cookies work transparently to them. For technical information + about SYN cookies, check out <http://cr.yp.to/syncookies.html>. + + If you are SYN flooded, the source address reported by the kernel is + likely to have been forged by the attacker; it is only reported as + an aid in tracing the packets to their actual source and should not + be taken as absolute truth. + + SYN cookies may prevent correct error reporting on clients when the + server is really overloaded. If this happens frequently better turn + them off. + + If you say Y here, note that SYN cookies aren't enabled by default; + you can enable them by saying Y to "/proc file system support" and + "Sysctl support" below and executing the command + + echo 1 >/proc/sys/net/ipv4/tcp_syncookies + + at boot time after the /proc file system has been mounted. + + If unsure, say N. + +config INET_AH + tristate "IP: AH transformation" + depends on INET + select XFRM + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + ---help--- + Support for IPsec AH. + + If unsure, say Y. + +config INET_ESP + tristate "IP: ESP transformation" + depends on INET + select XFRM + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + select CRYPTO_DES + ---help--- + Support for IPsec ESP. + + If unsure, say Y. + +config INET_IPCOMP + tristate "IP: IPComp transformation" + depends on INET + select XFRM + select INET_TUNNEL + select CRYPTO + select CRYPTO_DEFLATE + ---help--- + Support for IP Payload Compression Protocol (IPComp) (RFC3173), + typically needed for IPsec. + + If unsure, say Y. + +config INET_TUNNEL + tristate "IP: tunnel transformation" + depends on INET + select XFRM + ---help--- + Support for generic IP tunnel transformation, which is required by + the IP tunneling module as well as tunnel mode IPComp. + + If unsure, say Y. + +config IP_TCPDIAG + tristate "IP: TCP socket monitoring interface" + depends on INET + default y + ---help--- + Support for TCP socket monitoring interface used by native Linux + tools such as ss. ss is included in iproute2, currently downloadable + at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support + and have selected IPv6 as a module, you need to build this as a + module too. + + If unsure, say Y. + +config IP_TCPDIAG_IPV6 + def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) + +source "net/ipv4/ipvs/Kconfig" + diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile new file mode 100644 index 000000000000..8b379627ebb6 --- /dev/null +++ b/net/ipv4/Makefile @@ -0,0 +1,33 @@ +# +# Makefile for the Linux TCP/IP (INET) layer. +# + +obj-y := utils.o route.o inetpeer.o protocol.o \ + ip_input.o ip_fragment.o ip_forward.o ip_options.o \ + ip_output.o ip_sockglue.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ + datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ + sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o + +obj-$(CONFIG_PROC_FS) += proc.o +obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o +obj-$(CONFIG_IP_MROUTE) += ipmr.o +obj-$(CONFIG_NET_IPIP) += ipip.o +obj-$(CONFIG_NET_IPGRE) += ip_gre.o +obj-$(CONFIG_SYN_COOKIES) += syncookies.o +obj-$(CONFIG_INET_AH) += ah4.o +obj-$(CONFIG_INET_ESP) += esp4.o +obj-$(CONFIG_INET_IPCOMP) += ipcomp.o +obj-$(CONFIG_INET_TUNNEL) += xfrm4_tunnel.o +obj-$(CONFIG_IP_PNP) += ipconfig.o +obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o +obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o +obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o +obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o +obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_IP_VS) += ipvs/ +obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o +obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o + +obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ + xfrm4_output.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c new file mode 100644 index 000000000000..c34dab67e461 --- /dev/null +++ b/net/ipv4/af_inet.c @@ -0,0 +1,1188 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * PF_INET protocol family socket handler. + * + * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Florian La Roche, <flla@stud.uni-sb.de> + * Alan Cox, <A.Cox@swansea.ac.uk> + * + * Changes (see also sock.c) + * + * piggy, + * Karl Knutson : Socket protocol table + * A.N.Kuznetsov : Socket death error in accept(). + * John Richardson : Fix non blocking error in connect() + * so sockets that fail to connect + * don't return -EINPROGRESS. + * Alan Cox : Asynchronous I/O support + * Alan Cox : Keep correct socket pointer on sock + * structures + * when accept() ed + * Alan Cox : Semantics of SO_LINGER aren't state + * moved to close when you look carefully. + * With this fixed and the accept bug fixed + * some RPC stuff seems happier. + * Niibe Yutaka : 4.4BSD style write async I/O + * Alan Cox, + * Tony Gale : Fixed reuse semantics. + * Alan Cox : bind() shouldn't abort existing but dead + * sockets. Stops FTP netin:.. I hope. + * Alan Cox : bind() works correctly for RAW sockets. + * Note that FreeBSD at least was broken + * in this respect so be careful with + * compatibility tests... + * Alan Cox : routing cache support + * Alan Cox : memzero the socket structure for + * compactness. + * Matt Day : nonblock connect error handler + * Alan Cox : Allow large numbers of pending sockets + * (eg for big web sites), but only if + * specifically application requested. + * Alan Cox : New buffering throughout IP. Used + * dumbly. + * Alan Cox : New buffering now used smartly. + * Alan Cox : BSD rather than common sense + * interpretation of listen. + * Germano Caronni : Assorted small races. + * Alan Cox : sendmsg/recvmsg basic support. + * Alan Cox : Only sendmsg/recvmsg now supported. + * Alan Cox : Locked down bind (see security list). + * Alan Cox : Loosened bind a little. + * Mike McLagan : ADD/DEL DLCI Ioctls + * Willy Konynenberg : Transparent proxying support. + * David S. Miller : New socket lookup architecture. + * Some other random speedups. + * Cyrus Durgin : Cleaned up file for kmod hacks. + * Andi Kleen : Fix inet_stream_connect TCP race. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/stat.h> +#include <linux/init.h> +#include <linux/poll.h> +#include <linux/netfilter_ipv4.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +#include <linux/smp_lock.h> +#include <linux/inet.h> +#include <linux/igmp.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/arp.h> +#include <net/route.h> +#include <net/ip_fib.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/raw.h> +#include <net/icmp.h> +#include <net/ipip.h> +#include <net/inet_common.h> +#include <net/xfrm.h> +#ifdef CONFIG_IP_MROUTE +#include <linux/mroute.h> +#endif + +DEFINE_SNMP_STAT(struct linux_mib, net_statistics); + +#ifdef INET_REFCNT_DEBUG +atomic_t inet_sock_nr; +#endif + +extern void ip_mc_drop_socket(struct sock *sk); + +/* The inetsw table contains everything that inet_create needs to + * build a new socket. + */ +static struct list_head inetsw[SOCK_MAX]; +static DEFINE_SPINLOCK(inetsw_lock); + +/* New destruction routine */ + +void inet_sock_destruct(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + + __skb_queue_purge(&sk->sk_receive_queue); + __skb_queue_purge(&sk->sk_error_queue); + + if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { + printk("Attempt to release TCP socket in state %d %p\n", + sk->sk_state, sk); + return; + } + if (!sock_flag(sk, SOCK_DEAD)) { + printk("Attempt to release alive inet socket %p\n", sk); + return; + } + + BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); + BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); + BUG_TRAP(!sk->sk_wmem_queued); + BUG_TRAP(!sk->sk_forward_alloc); + + if (inet->opt) + kfree(inet->opt); + dst_release(sk->sk_dst_cache); +#ifdef INET_REFCNT_DEBUG + atomic_dec(&inet_sock_nr); + printk(KERN_DEBUG "INET socket %p released, %d are still alive\n", + sk, atomic_read(&inet_sock_nr)); +#endif +} + +/* + * The routines beyond this point handle the behaviour of an AF_INET + * socket object. Mostly it punts to the subprotocols of IP to do + * the work. + */ + +/* + * Automatically bind an unbound socket. + */ + +static int inet_autobind(struct sock *sk) +{ + struct inet_sock *inet; + /* We may need to bind the socket. */ + lock_sock(sk); + inet = inet_sk(sk); + if (!inet->num) { + if (sk->sk_prot->get_port(sk, 0)) { + release_sock(sk); + return -EAGAIN; + } + inet->sport = htons(inet->num); + } + release_sock(sk); + return 0; +} + +/* + * Move a socket into listening state. + */ +int inet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + unsigned char old_state; + int err; + + lock_sock(sk); + + err = -EINVAL; + if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) + goto out; + + old_state = sk->sk_state; + if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) + goto out; + + /* Really, if the socket is already in listen state + * we can only allow the backlog to be adjusted. + */ + if (old_state != TCP_LISTEN) { + err = tcp_listen_start(sk); + if (err) + goto out; + } + sk->sk_max_ack_backlog = backlog; + err = 0; + +out: + release_sock(sk); + return err; +} + +/* + * Create an inet socket. + */ + +static int inet_create(struct socket *sock, int protocol) +{ + struct sock *sk; + struct list_head *p; + struct inet_protosw *answer; + struct inet_sock *inet; + struct proto *answer_prot; + unsigned char answer_flags; + char answer_no_check; + int err; + + sock->state = SS_UNCONNECTED; + + /* Look for the requested type/protocol pair. */ + answer = NULL; + rcu_read_lock(); + list_for_each_rcu(p, &inetsw[sock->type]) { + answer = list_entry(p, struct inet_protosw, list); + + /* Check the non-wild match. */ + if (protocol == answer->protocol) { + if (protocol != IPPROTO_IP) + break; + } else { + /* Check for the two wild cases. */ + if (IPPROTO_IP == protocol) { + protocol = answer->protocol; + break; + } + if (IPPROTO_IP == answer->protocol) + break; + } + answer = NULL; + } + + err = -ESOCKTNOSUPPORT; + if (!answer) + goto out_rcu_unlock; + err = -EPERM; + if (answer->capability > 0 && !capable(answer->capability)) + goto out_rcu_unlock; + err = -EPROTONOSUPPORT; + if (!protocol) + goto out_rcu_unlock; + + sock->ops = answer->ops; + answer_prot = answer->prot; + answer_no_check = answer->no_check; + answer_flags = answer->flags; + rcu_read_unlock(); + + BUG_TRAP(answer_prot->slab != NULL); + + err = -ENOBUFS; + sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1); + if (sk == NULL) + goto out; + + err = 0; + sk->sk_no_check = answer_no_check; + if (INET_PROTOSW_REUSE & answer_flags) + sk->sk_reuse = 1; + + inet = inet_sk(sk); + + if (SOCK_RAW == sock->type) { + inet->num = protocol; + if (IPPROTO_RAW == protocol) + inet->hdrincl = 1; + } + + if (ipv4_config.no_pmtu_disc) + inet->pmtudisc = IP_PMTUDISC_DONT; + else + inet->pmtudisc = IP_PMTUDISC_WANT; + + inet->id = 0; + + sock_init_data(sock, sk); + + sk->sk_destruct = inet_sock_destruct; + sk->sk_family = PF_INET; + sk->sk_protocol = protocol; + sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + + inet->uc_ttl = -1; + inet->mc_loop = 1; + inet->mc_ttl = 1; + inet->mc_index = 0; + inet->mc_list = NULL; + +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet_sock_nr); +#endif + + if (inet->num) { + /* It assumes that any protocol which allows + * the user to assign a number at socket + * creation time automatically + * shares. + */ + inet->sport = htons(inet->num); + /* Add to protocol hash chains. */ + sk->sk_prot->hash(sk); + } + + if (sk->sk_prot->init) { + err = sk->sk_prot->init(sk); + if (err) + sk_common_release(sk); + } +out: + return err; +out_rcu_unlock: + rcu_read_unlock(); + goto out; +} + + +/* + * The peer socket should always be NULL (or else). When we call this + * function we are destroying the object and from then on nobody + * should refer to it. + */ +int inet_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + long timeout; + + /* Applications forget to leave groups before exiting */ + ip_mc_drop_socket(sk); + + /* If linger is set, we don't return until the close + * is complete. Otherwise we return immediately. The + * actually closing is done the same either way. + * + * If the close is due to the process exiting, we never + * linger.. + */ + timeout = 0; + if (sock_flag(sk, SOCK_LINGER) && + !(current->flags & PF_EXITING)) + timeout = sk->sk_lingertime; + sock->sk = NULL; + sk->sk_prot->close(sk, timeout); + } + return 0; +} + +/* It is off by default, see below. */ +int sysctl_ip_nonlocal_bind; + +int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + unsigned short snum; + int chk_addr_ret; + int err; + + /* If the socket has its own bind function then use it. (RAW) */ + if (sk->sk_prot->bind) { + err = sk->sk_prot->bind(sk, uaddr, addr_len); + goto out; + } + err = -EINVAL; + if (addr_len < sizeof(struct sockaddr_in)) + goto out; + + chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + + /* Not specified by any standard per-se, however it breaks too + * many applications when removed. It is unfortunate since + * allowing applications to make a non-local bind solves + * several problems with systems using dynamic addressing. + * (ie. your servers still start up even if your ISDN link + * is temporarily down) + */ + err = -EADDRNOTAVAIL; + if (!sysctl_ip_nonlocal_bind && + !inet->freebind && + addr->sin_addr.s_addr != INADDR_ANY && + chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && + chk_addr_ret != RTN_BROADCAST) + goto out; + + snum = ntohs(addr->sin_port); + err = -EACCES; + if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + goto out; + + /* We keep a pair of addresses. rcv_saddr is the one + * used by hash lookups, and saddr is used for transmit. + * + * In the BSD API these are the same except where it + * would be illegal to use them (multicast/broadcast) in + * which case the sending device address is used. + */ + lock_sock(sk); + + /* Check these errors (active socket, double bind). */ + err = -EINVAL; + if (sk->sk_state != TCP_CLOSE || inet->num) + goto out_release_sock; + + inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; + if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + inet->saddr = 0; /* Use device */ + + /* Make sure we are allowed to bind here. */ + if (sk->sk_prot->get_port(sk, snum)) { + inet->saddr = inet->rcv_saddr = 0; + err = -EADDRINUSE; + goto out_release_sock; + } + + if (inet->rcv_saddr) + sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (snum) + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + inet->sport = htons(inet->num); + inet->daddr = 0; + inet->dport = 0; + sk_dst_reset(sk); + err = 0; +out_release_sock: + release_sock(sk); +out: + return err; +} + +int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + + if (uaddr->sa_family == AF_UNSPEC) + return sk->sk_prot->disconnect(sk, flags); + + if (!inet_sk(sk)->num && inet_autobind(sk)) + return -EAGAIN; + return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); +} + +static long inet_wait_for_connect(struct sock *sk, long timeo) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + + /* Basic assumption: if someone sets sk->sk_err, he _must_ + * change state of the socket from TCP_SYN_*. + * Connect() does not allow to get error notifications + * without closing the socket. + */ + while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + if (signal_pending(current) || !timeo) + break; + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); + return timeo; +} + +/* + * Connect to a remote host. There is regrettably still a little + * TCP 'magic' in here. + */ +int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + int err; + long timeo; + + lock_sock(sk); + + if (uaddr->sa_family == AF_UNSPEC) { + err = sk->sk_prot->disconnect(sk, flags); + sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; + goto out; + } + + switch (sock->state) { + default: + err = -EINVAL; + goto out; + case SS_CONNECTED: + err = -EISCONN; + goto out; + case SS_CONNECTING: + err = -EALREADY; + /* Fall out of switch with err, set for this state */ + break; + case SS_UNCONNECTED: + err = -EISCONN; + if (sk->sk_state != TCP_CLOSE) + goto out; + + err = sk->sk_prot->connect(sk, uaddr, addr_len); + if (err < 0) + goto out; + + sock->state = SS_CONNECTING; + + /* Just entered SS_CONNECTING state; the only + * difference is that return value in non-blocking + * case is EINPROGRESS, rather than EALREADY. + */ + err = -EINPROGRESS; + break; + } + + timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + /* Error code is set above */ + if (!timeo || !inet_wait_for_connect(sk, timeo)) + goto out; + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out; + } + + /* Connection was closed by RST, timeout, ICMP error + * or another process disconnected us. + */ + if (sk->sk_state == TCP_CLOSE) + goto sock_error; + + /* sk->sk_err may be not zero now, if RECVERR was ordered by user + * and error was received after socket entered established state. + * Hence, it is handled normally after connect() return successfully. + */ + + sock->state = SS_CONNECTED; + err = 0; +out: + release_sock(sk); + return err; + +sock_error: + err = sock_error(sk) ? : -ECONNABORTED; + sock->state = SS_UNCONNECTED; + if (sk->sk_prot->disconnect(sk, flags)) + sock->state = SS_DISCONNECTING; + goto out; +} + +/* + * Accept a pending connection. The TCP layer now gives BSD semantics. + */ + +int inet_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk1 = sock->sk; + int err = -EINVAL; + struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err); + + if (!sk2) + goto do_err; + + lock_sock(sk2); + + BUG_TRAP((1 << sk2->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)); + + sock_graft(sk2, newsock); + + newsock->state = SS_CONNECTED; + err = 0; + release_sock(sk2); +do_err: + return err; +} + + +/* + * This does both peername and sockname. + */ +int inet_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + + sin->sin_family = AF_INET; + if (peer) { + if (!inet->dport || + (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && + peer == 1)) + return -ENOTCONN; + sin->sin_port = inet->dport; + sin->sin_addr.s_addr = inet->daddr; + } else { + __u32 addr = inet->rcv_saddr; + if (!addr) + addr = inet->saddr; + sin->sin_port = inet->sport; + sin->sin_addr.s_addr = addr; + } + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + *uaddr_len = sizeof(*sin); + return 0; +} + +int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size) +{ + struct sock *sk = sock->sk; + + /* We may need to bind the socket. */ + if (!inet_sk(sk)->num && inet_autobind(sk)) + return -EAGAIN; + + return sk->sk_prot->sendmsg(iocb, sk, msg, size); +} + + +static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +{ + struct sock *sk = sock->sk; + + /* We may need to bind the socket. */ + if (!inet_sk(sk)->num && inet_autobind(sk)) + return -EAGAIN; + + if (sk->sk_prot->sendpage) + return sk->sk_prot->sendpage(sk, page, offset, size, flags); + return sock_no_sendpage(sock, page, offset, size, flags); +} + + +int inet_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + int err = 0; + + /* This should really check to make sure + * the socket is a TCP socket. (WHY AC...) + */ + how++; /* maps 0->1 has the advantage of making bit 1 rcvs and + 1->2 bit 2 snds. + 2->3 */ + if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ + return -EINVAL; + + lock_sock(sk); + if (sock->state == SS_CONNECTING) { + if ((1 << sk->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) + sock->state = SS_DISCONNECTING; + else + sock->state = SS_CONNECTED; + } + + switch (sk->sk_state) { + case TCP_CLOSE: + err = -ENOTCONN; + /* Hack to wake up other listeners, who can poll for + POLLHUP, even on eg. unconnected UDP sockets -- RR */ + default: + sk->sk_shutdown |= how; + if (sk->sk_prot->shutdown) + sk->sk_prot->shutdown(sk, how); + break; + + /* Remaining two branches are temporary solution for missing + * close() in multithreaded environment. It is _not_ a good idea, + * but we have no choice until close() is repaired at VFS level. + */ + case TCP_LISTEN: + if (!(how & RCV_SHUTDOWN)) + break; + /* Fall through */ + case TCP_SYN_SENT: + err = sk->sk_prot->disconnect(sk, O_NONBLOCK); + sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; + break; + } + + /* Wake up anyone sleeping in poll. */ + sk->sk_state_change(sk); + release_sock(sk); + return err; +} + +/* + * ioctl() calls you can issue on an INET socket. Most of these are + * device configuration and stuff and very rarely used. Some ioctls + * pass on to the socket itself. + * + * NOTE: I like the idea of a module for the config stuff. ie ifconfig + * loads the devconfigure module does its configuring and unloads it. + * There's a good 20K of config code hanging around the kernel. + */ + +int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err = 0; + + switch (cmd) { + case SIOCGSTAMP: + err = sock_get_timestamp(sk, (struct timeval __user *)arg); + break; + case SIOCADDRT: + case SIOCDELRT: + case SIOCRTMSG: + err = ip_rt_ioctl(cmd, (void __user *)arg); + break; + case SIOCDARP: + case SIOCGARP: + case SIOCSARP: + err = arp_ioctl(cmd, (void __user *)arg); + break; + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCSIFPFLAGS: + case SIOCGIFPFLAGS: + case SIOCSIFFLAGS: + err = devinet_ioctl(cmd, (void __user *)arg); + break; + default: + if (!sk->sk_prot->ioctl || + (err = sk->sk_prot->ioctl(sk, cmd, arg)) == + -ENOIOCTLCMD) + err = dev_ioctl(cmd, (void __user *)arg); + break; + } + return err; +} + +struct proto_ops inet_stream_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet_getname, + .poll = tcp_poll, + .ioctl = inet_ioctl, + .listen = inet_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = tcp_sendpage +}; + +struct proto_ops inet_dgram_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = inet_getname, + .poll = udp_poll, + .ioctl = inet_ioctl, + .listen = sock_no_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +}; + +/* + * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without + * udp_poll + */ +static struct proto_ops inet_sockraw_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = inet_getname, + .poll = datagram_poll, + .ioctl = inet_ioctl, + .listen = sock_no_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +}; + +static struct net_proto_family inet_family_ops = { + .family = PF_INET, + .create = inet_create, + .owner = THIS_MODULE, +}; + + +extern void tcp_init(void); +extern void tcp_v4_init(struct net_proto_family *); + +/* Upon startup we insert all the elements in inetsw_array[] into + * the linked list inetsw. + */ +static struct inet_protosw inetsw_array[] = +{ + { + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + .prot = &tcp_prot, + .ops = &inet_stream_ops, + .capability = -1, + .no_check = 0, + .flags = INET_PROTOSW_PERMANENT, + }, + + { + .type = SOCK_DGRAM, + .protocol = IPPROTO_UDP, + .prot = &udp_prot, + .ops = &inet_dgram_ops, + .capability = -1, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_PERMANENT, + }, + + + { + .type = SOCK_RAW, + .protocol = IPPROTO_IP, /* wild card */ + .prot = &raw_prot, + .ops = &inet_sockraw_ops, + .capability = CAP_NET_RAW, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_REUSE, + } +}; + +#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw)) + +void inet_register_protosw(struct inet_protosw *p) +{ + struct list_head *lh; + struct inet_protosw *answer; + int protocol = p->protocol; + struct list_head *last_perm; + + spin_lock_bh(&inetsw_lock); + + if (p->type >= SOCK_MAX) + goto out_illegal; + + /* If we are trying to override a permanent protocol, bail. */ + answer = NULL; + last_perm = &inetsw[p->type]; + list_for_each(lh, &inetsw[p->type]) { + answer = list_entry(lh, struct inet_protosw, list); + + /* Check only the non-wild match. */ + if (INET_PROTOSW_PERMANENT & answer->flags) { + if (protocol == answer->protocol) + break; + last_perm = lh; + } + + answer = NULL; + } + if (answer) + goto out_permanent; + + /* Add the new entry after the last permanent entry if any, so that + * the new entry does not override a permanent entry when matched with + * a wild-card protocol. But it is allowed to override any existing + * non-permanent entry. This means that when we remove this entry, the + * system automatically returns to the old behavior. + */ + list_add_rcu(&p->list, last_perm); +out: + spin_unlock_bh(&inetsw_lock); + + synchronize_net(); + + return; + +out_permanent: + printk(KERN_ERR "Attempt to override permanent protocol %d.\n", + protocol); + goto out; + +out_illegal: + printk(KERN_ERR + "Ignoring attempt to register invalid socket type %d.\n", + p->type); + goto out; +} + +void inet_unregister_protosw(struct inet_protosw *p) +{ + if (INET_PROTOSW_PERMANENT & p->flags) { + printk(KERN_ERR + "Attempt to unregister permanent protocol %d.\n", + p->protocol); + } else { + spin_lock_bh(&inetsw_lock); + list_del_rcu(&p->list); + spin_unlock_bh(&inetsw_lock); + + synchronize_net(); + } +} + +#ifdef CONFIG_IP_MULTICAST +static struct net_protocol igmp_protocol = { + .handler = igmp_rcv, +}; +#endif + +static struct net_protocol tcp_protocol = { + .handler = tcp_v4_rcv, + .err_handler = tcp_v4_err, + .no_policy = 1, +}; + +static struct net_protocol udp_protocol = { + .handler = udp_rcv, + .err_handler = udp_err, + .no_policy = 1, +}; + +static struct net_protocol icmp_protocol = { + .handler = icmp_rcv, +}; + +static int __init init_ipv4_mibs(void) +{ + net_statistics[0] = alloc_percpu(struct linux_mib); + net_statistics[1] = alloc_percpu(struct linux_mib); + ip_statistics[0] = alloc_percpu(struct ipstats_mib); + ip_statistics[1] = alloc_percpu(struct ipstats_mib); + icmp_statistics[0] = alloc_percpu(struct icmp_mib); + icmp_statistics[1] = alloc_percpu(struct icmp_mib); + tcp_statistics[0] = alloc_percpu(struct tcp_mib); + tcp_statistics[1] = alloc_percpu(struct tcp_mib); + udp_statistics[0] = alloc_percpu(struct udp_mib); + udp_statistics[1] = alloc_percpu(struct udp_mib); + if (! + (net_statistics[0] && net_statistics[1] && ip_statistics[0] + && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1] + && udp_statistics[0] && udp_statistics[1])) + return -ENOMEM; + + (void) tcp_mib_init(); + + return 0; +} + +static int ipv4_proc_init(void); +extern void ipfrag_init(void); + +static int __init inet_init(void) +{ + struct sk_buff *dummy_skb; + struct inet_protosw *q; + struct list_head *r; + int rc = -EINVAL; + + if (sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)) { + printk(KERN_CRIT "%s: panic\n", __FUNCTION__); + goto out; + } + + rc = proto_register(&tcp_prot, 1); + if (rc) + goto out; + + rc = proto_register(&udp_prot, 1); + if (rc) + goto out_unregister_tcp_proto; + + rc = proto_register(&raw_prot, 1); + if (rc) + goto out_unregister_udp_proto; + + /* + * Tell SOCKET that we are alive... + */ + + (void)sock_register(&inet_family_ops); + + /* + * Add all the base protocols. + */ + + if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) + printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n"); + if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) + printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n"); + if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) + printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n"); +#ifdef CONFIG_IP_MULTICAST + if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) + printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n"); +#endif + + /* Register the socket-side information for inet_create. */ + for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) + INIT_LIST_HEAD(r); + + for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) + inet_register_protosw(q); + + /* + * Set the ARP module up + */ + + arp_init(); + + /* + * Set the IP module up + */ + + ip_init(); + + tcp_v4_init(&inet_family_ops); + + /* Setup TCP slab cache for open requests. */ + tcp_init(); + + + /* + * Set the ICMP layer up + */ + + icmp_init(&inet_family_ops); + + /* + * Initialise the multicast router + */ +#if defined(CONFIG_IP_MROUTE) + ip_mr_init(); +#endif + /* + * Initialise per-cpu ipv4 mibs + */ + + if(init_ipv4_mibs()) + printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ; + + ipv4_proc_init(); + + ipfrag_init(); + + rc = 0; +out: + return rc; +out_unregister_tcp_proto: + proto_unregister(&tcp_prot); +out_unregister_udp_proto: + proto_unregister(&udp_prot); + goto out; +} + +module_init(inet_init); + +/* ------------------------------------------------------------------------ */ + +#ifdef CONFIG_PROC_FS +extern int fib_proc_init(void); +extern void fib_proc_exit(void); +extern int ip_misc_proc_init(void); +extern int raw_proc_init(void); +extern void raw_proc_exit(void); +extern int tcp4_proc_init(void); +extern void tcp4_proc_exit(void); +extern int udp4_proc_init(void); +extern void udp4_proc_exit(void); + +static int __init ipv4_proc_init(void) +{ + int rc = 0; + + if (raw_proc_init()) + goto out_raw; + if (tcp4_proc_init()) + goto out_tcp; + if (udp4_proc_init()) + goto out_udp; + if (fib_proc_init()) + goto out_fib; + if (ip_misc_proc_init()) + goto out_misc; +out: + return rc; +out_misc: + fib_proc_exit(); +out_fib: + udp4_proc_exit(); +out_udp: + tcp4_proc_exit(); +out_tcp: + raw_proc_exit(); +out_raw: + rc = -ENOMEM; + goto out; +} + +#else /* CONFIG_PROC_FS */ +static int __init ipv4_proc_init(void) +{ + return 0; +} +#endif /* CONFIG_PROC_FS */ + +MODULE_ALIAS_NETPROTO(PF_INET); + +EXPORT_SYMBOL(inet_accept); +EXPORT_SYMBOL(inet_bind); +EXPORT_SYMBOL(inet_dgram_connect); +EXPORT_SYMBOL(inet_dgram_ops); +EXPORT_SYMBOL(inet_getname); +EXPORT_SYMBOL(inet_ioctl); +EXPORT_SYMBOL(inet_listen); +EXPORT_SYMBOL(inet_register_protosw); +EXPORT_SYMBOL(inet_release); +EXPORT_SYMBOL(inet_sendmsg); +EXPORT_SYMBOL(inet_shutdown); +EXPORT_SYMBOL(inet_sock_destruct); +EXPORT_SYMBOL(inet_stream_connect); +EXPORT_SYMBOL(inet_stream_ops); +EXPORT_SYMBOL(inet_unregister_protosw); +EXPORT_SYMBOL(net_statistics); + +#ifdef INET_REFCNT_DEBUG +EXPORT_SYMBOL(inet_sock_nr); +#endif diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c new file mode 100644 index 000000000000..0e98f2235b6e --- /dev/null +++ b/net/ipv4/ah4.c @@ -0,0 +1,335 @@ +#include <linux/config.h> +#include <linux/module.h> +#include <net/ip.h> +#include <net/xfrm.h> +#include <net/ah.h> +#include <linux/crypto.h> +#include <linux/pfkeyv2.h> +#include <net/icmp.h> +#include <asm/scatterlist.h> + + +/* Clear mutable options and find final destination to substitute + * into IP header for icv calculation. Options are already checked + * for validity, so paranoia is not required. */ + +static int ip_clear_mutable_options(struct iphdr *iph, u32 *daddr) +{ + unsigned char * optptr = (unsigned char*)(iph+1); + int l = iph->ihl*4 - sizeof(struct iphdr); + int optlen; + + while (l > 0) { + switch (*optptr) { + case IPOPT_END: + return 0; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) + return -EINVAL; + switch (*optptr) { + case IPOPT_SEC: + case 0x85: /* Some "Extended Security" crap. */ + case 0x86: /* Another "Commercial Security" crap. */ + case IPOPT_RA: + case 0x80|21: /* RFC1770 */ + break; + case IPOPT_LSRR: + case IPOPT_SSRR: + if (optlen < 6) + return -EINVAL; + memcpy(daddr, optptr+optlen-4, 4); + /* Fall through */ + default: + memset(optptr+2, 0, optlen-2); + } + l -= optlen; + optptr += optlen; + } + return 0; +} + +static int ah_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + struct iphdr *iph, *top_iph; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + union { + struct iphdr iph; + char buf[60]; + } tmp_iph; + + top_iph = skb->nh.iph; + iph = &tmp_iph.iph; + + iph->tos = top_iph->tos; + iph->ttl = top_iph->ttl; + iph->frag_off = top_iph->frag_off; + + if (top_iph->ihl != 5) { + iph->daddr = top_iph->daddr; + memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); + err = ip_clear_mutable_options(top_iph, &top_iph->daddr); + if (err) + goto error; + } + + ah = (struct ip_auth_hdr *)((char *)top_iph+top_iph->ihl*4); + ah->nexthdr = top_iph->protocol; + + top_iph->tos = 0; + top_iph->tot_len = htons(skb->len); + top_iph->frag_off = 0; + top_iph->ttl = 0; + top_iph->protocol = IPPROTO_AH; + top_iph->check = 0; + + ahp = x->data; + ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len) >> 2) - 2; + + ah->reserved = 0; + ah->spi = x->id.spi; + ah->seq_no = htonl(++x->replay.oseq); + ahp->icv(ahp, skb, ah->auth_data); + + top_iph->tos = iph->tos; + top_iph->ttl = iph->ttl; + top_iph->frag_off = iph->frag_off; + if (top_iph->ihl != 5) { + top_iph->daddr = iph->daddr; + memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); + } + + ip_send_check(top_iph); + + err = 0; + +error: + return err; +} + +static int ah_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + int ah_hlen; + struct iphdr *iph; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + char work_buf[60]; + + if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) + goto out; + + ah = (struct ip_auth_hdr*)skb->data; + ahp = x->data; + ah_hlen = (ah->hdrlen + 2) << 2; + + if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len)) + goto out; + + if (!pskb_may_pull(skb, ah_hlen)) + goto out; + + /* We are going to _remove_ AH header to keep sockets happy, + * so... Later this can change. */ + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto out; + + skb->ip_summed = CHECKSUM_NONE; + + ah = (struct ip_auth_hdr*)skb->data; + iph = skb->nh.iph; + + memcpy(work_buf, iph, iph->ihl*4); + + iph->ttl = 0; + iph->tos = 0; + iph->frag_off = 0; + iph->check = 0; + if (iph->ihl != 5) { + u32 dummy; + if (ip_clear_mutable_options(iph, &dummy)) + goto out; + } + { + u8 auth_data[MAX_AH_AUTH_LEN]; + + memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); + skb_push(skb, skb->data - skb->nh.raw); + ahp->icv(ahp, skb, ah->auth_data); + if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { + x->stats.integrity_failed++; + goto out; + } + } + ((struct iphdr*)work_buf)->protocol = ah->nexthdr; + skb->nh.raw = skb_pull(skb, ah_hlen); + memcpy(skb->nh.raw, work_buf, iph->ihl*4); + skb->nh.iph->tot_len = htons(skb->len); + skb_pull(skb, skb->nh.iph->ihl*4); + skb->h.raw = skb->data; + + return 0; + +out: + return -EINVAL; +} + +static void ah4_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr*)skb->data; + struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2)); + struct xfrm_state *x; + + if (skb->h.icmph->type != ICMP_DEST_UNREACH || + skb->h.icmph->code != ICMP_FRAG_NEEDED) + return; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); + if (!x) + return; + printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", + ntohl(ah->spi), ntohl(iph->daddr)); + xfrm_state_put(x); +} + +static int ah_init_state(struct xfrm_state *x, void *args) +{ + struct ah_data *ahp = NULL; + struct xfrm_algo_desc *aalg_desc; + + if (!x->aalg) + goto error; + + /* null auth can use a zero length key */ + if (x->aalg->alg_key_len > 512) + goto error; + + if (x->encap) + goto error; + + ahp = kmalloc(sizeof(*ahp), GFP_KERNEL); + if (ahp == NULL) + return -ENOMEM; + + memset(ahp, 0, sizeof(*ahp)); + + ahp->key = x->aalg->alg_key; + ahp->key_len = (x->aalg->alg_key_len+7)/8; + ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0); + if (!ahp->tfm) + goto error; + ahp->icv = ah_hmac_digest; + + /* + * Lookup the algorithm description maintained by xfrm_algo, + * verify crypto transform properties, and store information + * we need for AH processing. This lookup cannot fail here + * after a successful crypto_alloc_tfm(). + */ + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_tfm_alg_digestsize(ahp->tfm)) { + printk(KERN_INFO "AH: %s digestsize %u != %hu\n", + x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + + BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); + + ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL); + if (!ahp->work_icv) + goto error; + + x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); + if (x->props.mode) + x->props.header_len += sizeof(struct iphdr); + x->data = ahp; + + return 0; + +error: + if (ahp) { + if (ahp->work_icv) + kfree(ahp->work_icv); + if (ahp->tfm) + crypto_free_tfm(ahp->tfm); + kfree(ahp); + } + return -EINVAL; +} + +static void ah_destroy(struct xfrm_state *x) +{ + struct ah_data *ahp = x->data; + + if (!ahp) + return; + + if (ahp->work_icv) { + kfree(ahp->work_icv); + ahp->work_icv = NULL; + } + if (ahp->tfm) { + crypto_free_tfm(ahp->tfm); + ahp->tfm = NULL; + } + kfree(ahp); +} + + +static struct xfrm_type ah_type = +{ + .description = "AH4", + .owner = THIS_MODULE, + .proto = IPPROTO_AH, + .init_state = ah_init_state, + .destructor = ah_destroy, + .input = ah_input, + .output = ah_output +}; + +static struct net_protocol ah4_protocol = { + .handler = xfrm4_rcv, + .err_handler = ah4_err, + .no_policy = 1, +}; + +static int __init ah4_init(void) +{ + if (xfrm_register_type(&ah_type, AF_INET) < 0) { + printk(KERN_INFO "ip ah init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { + printk(KERN_INFO "ip ah init: can't add protocol\n"); + xfrm_unregister_type(&ah_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit ah4_fini(void) +{ + if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) + printk(KERN_INFO "ip ah close: can't remove protocol\n"); + if (xfrm_unregister_type(&ah_type, AF_INET) < 0) + printk(KERN_INFO "ip ah close: can't remove xfrm type\n"); +} + +module_init(ah4_init); +module_exit(ah4_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c new file mode 100644 index 000000000000..a642fd612853 --- /dev/null +++ b/net/ipv4/arp.c @@ -0,0 +1,1425 @@ +/* linux/net/inet/arp.c + * + * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $ + * + * Copyright (C) 1994 by Florian La Roche + * + * This module implements the Address Resolution Protocol ARP (RFC 826), + * which is used to convert IP addresses (or in the future maybe other + * high-level addresses) into a low-level hardware address (like an Ethernet + * address). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Alan Cox : Removed the Ethernet assumptions in + * Florian's code + * Alan Cox : Fixed some small errors in the ARP + * logic + * Alan Cox : Allow >4K in /proc + * Alan Cox : Make ARP add its own protocol entry + * Ross Martin : Rewrote arp_rcv() and arp_get_info() + * Stephen Henson : Add AX25 support to arp_get_info() + * Alan Cox : Drop data when a device is downed. + * Alan Cox : Use init_timer(). + * Alan Cox : Double lock fixes. + * Martin Seine : Move the arphdr structure + * to if_arp.h for compatibility. + * with BSD based programs. + * Andrew Tridgell : Added ARP netmask code and + * re-arranged proxy handling. + * Alan Cox : Changed to use notifiers. + * Niibe Yutaka : Reply for this device or proxies only. + * Alan Cox : Don't proxy across hardware types! + * Jonathan Naylor : Added support for NET/ROM. + * Mike Shaver : RFC1122 checks. + * Jonathan Naylor : Only lookup the hardware address for + * the correct hardware type. + * Germano Caronni : Assorted subtle races. + * Craig Schlenter : Don't modify permanent entry + * during arp_rcv. + * Russ Nelson : Tidied up a few bits. + * Alexey Kuznetsov: Major changes to caching and behaviour, + * eg intelligent arp probing and + * generation + * of host down events. + * Alan Cox : Missing unlock in device events. + * Eckes : ARP ioctl control errors. + * Alexey Kuznetsov: Arp free fix. + * Manuel Rodriguez: Gratuitous ARP. + * Jonathan Layes : Added arpd support through kerneld + * message queue (960314) + * Mike Shaver : /proc/sys/net/ipv4/arp_* support + * Mike McLagan : Routing by source + * Stuart Cheshire : Metricom and grat arp fixes + * *** FOR 2.1 clean this up *** + * Lawrence V. Stefani: (08/12/96) Added FDDI support. + * Alan Cox : Took the AP1000 nasty FDDI hack and + * folded into the mainstream FDDI code. + * Ack spit, Linus how did you allow that + * one in... + * Jes Sorensen : Make FDDI work again in 2.1.x and + * clean up the APFDDI & gen. FDDI bits. + * Alexey Kuznetsov: new arp state machine; + * now it is in net/core/neighbour.c. + * Krzysztof Halasa: Added Frame Relay ARP support. + * Arnaldo C. Melo : convert /proc/net/arp to seq_file + * Shmulik Hen: Split arp_send to arp_create and + * arp_xmit so intermediate drivers like + * bonding can change the skb before + * sending (e.g. insert 8021q tag). + * Harald Welte : convert to make use of jenkins hash + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/config.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/mm.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/fddidevice.h> +#include <linux/if_arp.h> +#include <linux/trdevice.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/stat.h> +#include <linux/init.h> +#include <linux/net.h> +#include <linux/rcupdate.h> +#include <linux/jhash.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#include <net/ip.h> +#include <net/icmp.h> +#include <net/route.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <net/arp.h> +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#include <net/ax25.h> +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) +#include <net/netrom.h> +#endif +#endif +#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) +#include <net/atmclip.h> +struct neigh_table *clip_tbl_hook; +#endif + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include <linux/netfilter_arp.h> + +/* + * Interface to generic neighbour cache. + */ +static u32 arp_hash(const void *pkey, const struct net_device *dev); +static int arp_constructor(struct neighbour *neigh); +static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); +static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); +static void parp_redo(struct sk_buff *skb); + +static struct neigh_ops arp_generic_ops = { + .family = AF_INET, + .solicit = arp_solicit, + .error_report = arp_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_connected_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +static struct neigh_ops arp_hh_ops = { + .family = AF_INET, + .solicit = arp_solicit, + .error_report = arp_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_resolve_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +static struct neigh_ops arp_direct_ops = { + .family = AF_INET, + .output = dev_queue_xmit, + .connected_output = dev_queue_xmit, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +struct neigh_ops arp_broken_ops = { + .family = AF_INET, + .solicit = arp_solicit, + .error_report = arp_error_report, + .output = neigh_compat_output, + .connected_output = neigh_compat_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +struct neigh_table arp_tbl = { + .family = AF_INET, + .entry_size = sizeof(struct neighbour) + 4, + .key_len = 4, + .hash = arp_hash, + .constructor = arp_constructor, + .proxy_redo = parp_redo, + .id = "arp_cache", + .parms = { + .tbl = &arp_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + .locktime = 1 * HZ, + }, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, +}; + +int arp_mc_map(u32 addr, u8 *haddr, struct net_device *dev, int dir) +{ + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_FDDI: + case ARPHRD_IEEE802: + ip_eth_mc_map(addr, haddr); + return 0; + case ARPHRD_IEEE802_TR: + ip_tr_mc_map(addr, haddr); + return 0; + case ARPHRD_INFINIBAND: + ip_ib_mc_map(addr, haddr); + return 0; + default: + if (dir) { + memcpy(haddr, dev->broadcast, dev->addr_len); + return 0; + } + } + return -EINVAL; +} + + +static u32 arp_hash(const void *pkey, const struct net_device *dev) +{ + return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd); +} + +static int arp_constructor(struct neighbour *neigh) +{ + u32 addr = *(u32*)neigh->primary_key; + struct net_device *dev = neigh->dev; + struct in_device *in_dev; + struct neigh_parms *parms; + + neigh->type = inet_addr_type(addr); + + rcu_read_lock(); + in_dev = rcu_dereference(__in_dev_get(dev)); + if (in_dev == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + + parms = in_dev->arp_parms; + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + rcu_read_unlock(); + + if (dev->hard_header == NULL) { + neigh->nud_state = NUD_NOARP; + neigh->ops = &arp_direct_ops; + neigh->output = neigh->ops->queue_xmit; + } else { + /* Good devices (checked by reading texts, but only Ethernet is + tested) + + ARPHRD_ETHER: (ethernet, apfddi) + ARPHRD_FDDI: (fddi) + ARPHRD_IEEE802: (tr) + ARPHRD_METRICOM: (strip) + ARPHRD_ARCNET: + etc. etc. etc. + + ARPHRD_IPDDP will also work, if author repairs it. + I did not it, because this driver does not work even + in old paradigm. + */ + +#if 1 + /* So... these "amateur" devices are hopeless. + The only thing, that I can say now: + It is very sad that we need to keep ugly obsolete + code to make them happy. + + They should be moved to more reasonable state, now + they use rebuild_header INSTEAD OF hard_start_xmit!!! + Besides that, they are sort of out of date + (a lot of redundant clones/copies, useless in 2.1), + I wonder why people believe that they work. + */ + switch (dev->type) { + default: + break; + case ARPHRD_ROSE: +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case ARPHRD_NETROM: +#endif + neigh->ops = &arp_broken_ops; + neigh->output = neigh->ops->output; + return 0; +#endif + ;} +#endif + if (neigh->type == RTN_MULTICAST) { + neigh->nud_state = NUD_NOARP; + arp_mc_map(addr, neigh->ha, dev, 1); + } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->dev_addr, dev->addr_len); + } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->broadcast, dev->addr_len); + } + if (dev->hard_header_cache) + neigh->ops = &arp_hh_ops; + else + neigh->ops = &arp_generic_ops; + if (neigh->nud_state&NUD_VALID) + neigh->output = neigh->ops->connected_output; + else + neigh->output = neigh->ops->output; + } + return 0; +} + +static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + dst_link_failure(skb); + kfree_skb(skb); +} + +static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) +{ + u32 saddr = 0; + u8 *dst_ha = NULL; + struct net_device *dev = neigh->dev; + u32 target = *(u32*)neigh->primary_key; + int probes = atomic_read(&neigh->probes); + struct in_device *in_dev = in_dev_get(dev); + + if (!in_dev) + return; + + switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { + default: + case 0: /* By default announce any local IP */ + if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL) + saddr = skb->nh.iph->saddr; + break; + case 1: /* Restrict announcements of saddr in same subnet */ + if (!skb) + break; + saddr = skb->nh.iph->saddr; + if (inet_addr_type(saddr) == RTN_LOCAL) { + /* saddr should be known to target */ + if (inet_addr_onlink(in_dev, target, saddr)) + break; + } + saddr = 0; + break; + case 2: /* Avoid secondary IPs, get a primary/preferred one */ + break; + } + + if (in_dev) + in_dev_put(in_dev); + if (!saddr) + saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); + + if ((probes -= neigh->parms->ucast_probes) < 0) { + if (!(neigh->nud_state&NUD_VALID)) + printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); + dst_ha = neigh->ha; + read_lock_bh(&neigh->lock); + } else if ((probes -= neigh->parms->app_probes) < 0) { +#ifdef CONFIG_ARPD + neigh_app_ns(neigh); +#endif + return; + } + + arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, + dst_ha, dev->dev_addr, NULL); + if (dst_ha) + read_unlock_bh(&neigh->lock); +} + +static int arp_ignore(struct in_device *in_dev, struct net_device *dev, + u32 sip, u32 tip) +{ + int scope; + + switch (IN_DEV_ARP_IGNORE(in_dev)) { + case 0: /* Reply, the tip is already validated */ + return 0; + case 1: /* Reply only if tip is configured on the incoming interface */ + sip = 0; + scope = RT_SCOPE_HOST; + break; + case 2: /* + * Reply only if tip is configured on the incoming interface + * and is in same subnet as sip + */ + scope = RT_SCOPE_HOST; + break; + case 3: /* Do not reply for scope host addresses */ + sip = 0; + scope = RT_SCOPE_LINK; + dev = NULL; + break; + case 4: /* Reserved */ + case 5: + case 6: + case 7: + return 0; + case 8: /* Do not reply */ + return 1; + default: + return 0; + } + return !inet_confirm_addr(dev, sip, tip, scope); +} + +static int arp_filter(__u32 sip, __u32 tip, struct net_device *dev) +{ + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip, + .saddr = tip } } }; + struct rtable *rt; + int flag = 0; + /*unsigned long now; */ + + if (ip_route_output_key(&rt, &fl) < 0) + return 1; + if (rt->u.dst.dev != dev) { + NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); + flag = 1; + } + ip_rt_put(rt); + return flag; +} + +/* OBSOLETE FUNCTIONS */ + +/* + * Find an arp mapping in the cache. If not found, post a request. + * + * It is very UGLY routine: it DOES NOT use skb->dst->neighbour, + * even if it exists. It is supposed that skb->dev was mangled + * by a virtual device (eql, shaper). Nobody but broken devices + * is allowed to use this function, it is scheduled to be removed. --ANK + */ + +static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct net_device * dev) +{ + switch (addr_hint) { + case RTN_LOCAL: + printk(KERN_DEBUG "ARP: arp called for own IP address\n"); + memcpy(haddr, dev->dev_addr, dev->addr_len); + return 1; + case RTN_MULTICAST: + arp_mc_map(paddr, haddr, dev, 1); + return 1; + case RTN_BROADCAST: + memcpy(haddr, dev->broadcast, dev->addr_len); + return 1; + } + return 0; +} + + +int arp_find(unsigned char *haddr, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + u32 paddr; + struct neighbour *n; + + if (!skb->dst) { + printk(KERN_DEBUG "arp_find is called with dst==NULL\n"); + kfree_skb(skb); + return 1; + } + + paddr = ((struct rtable*)skb->dst)->rt_gateway; + + if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) + return 0; + + n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); + + if (n) { + n->used = jiffies; + if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { + read_lock_bh(&n->lock); + memcpy(haddr, n->ha, dev->addr_len); + read_unlock_bh(&n->lock); + neigh_release(n); + return 0; + } + neigh_release(n); + } else + kfree_skb(skb); + return 1; +} + +/* END OF OBSOLETE FUNCTIONS */ + +int arp_bind_neighbour(struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + struct neighbour *n = dst->neighbour; + + if (dev == NULL) + return -EINVAL; + if (n == NULL) { + u32 nexthop = ((struct rtable*)dst)->rt_gateway; + if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) + nexthop = 0; + n = __neigh_lookup_errno( +#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) + dev->type == ARPHRD_ATM ? clip_tbl_hook : +#endif + &arp_tbl, &nexthop, dev); + if (IS_ERR(n)) + return PTR_ERR(n); + dst->neighbour = n; + } + return 0; +} + +/* + * Check if we can use proxy ARP for this path + */ + +static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) +{ + struct in_device *out_dev; + int imi, omi = -1; + + if (!IN_DEV_PROXY_ARP(in_dev)) + return 0; + + if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0) + return 1; + if (imi == -1) + return 0; + + /* place to check for proxy_arp for routes */ + + if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) { + omi = IN_DEV_MEDIUM_ID(out_dev); + in_dev_put(out_dev); + } + return (omi != imi && omi != -1); +} + +/* + * Interface to link layer: send routine and receive handler. + */ + +/* + * Create an arp packet. If (dest_hw == NULL), we create a broadcast + * message. + */ +struct sk_buff *arp_create(int type, int ptype, u32 dest_ip, + struct net_device *dev, u32 src_ip, + unsigned char *dest_hw, unsigned char *src_hw, + unsigned char *target_hw) +{ + struct sk_buff *skb; + struct arphdr *arp; + unsigned char *arp_ptr; + + /* + * Allocate a buffer + */ + + skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4) + + LL_RESERVED_SPACE(dev), GFP_ATOMIC); + if (skb == NULL) + return NULL; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb->nh.raw = skb->data; + arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4)); + skb->dev = dev; + skb->protocol = htons(ETH_P_ARP); + if (src_hw == NULL) + src_hw = dev->dev_addr; + if (dest_hw == NULL) + dest_hw = dev->broadcast; + + /* + * Fill the device header for the ARP frame + */ + if (dev->hard_header && + dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0) + goto out; + + /* + * Fill out the arp protocol part. + * + * The arp hardware type should match the device type, except for FDDI, + * which (according to RFC 1390) should always equal 1 (Ethernet). + */ + /* + * Exceptions everywhere. AX.25 uses the AX.25 PID value not the + * DIX code for the protocol. Make these device structure fields. + */ + switch (dev->type) { + default: + arp->ar_hrd = htons(dev->type); + arp->ar_pro = htons(ETH_P_IP); + break; + +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: + arp->ar_hrd = htons(ARPHRD_AX25); + arp->ar_pro = htons(AX25_P_IP); + break; + +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case ARPHRD_NETROM: + arp->ar_hrd = htons(ARPHRD_NETROM); + arp->ar_pro = htons(AX25_P_IP); + break; +#endif +#endif + +#ifdef CONFIG_FDDI + case ARPHRD_FDDI: + arp->ar_hrd = htons(ARPHRD_ETHER); + arp->ar_pro = htons(ETH_P_IP); + break; +#endif +#ifdef CONFIG_TR + case ARPHRD_IEEE802_TR: + arp->ar_hrd = htons(ARPHRD_IEEE802); + arp->ar_pro = htons(ETH_P_IP); + break; +#endif + } + + arp->ar_hln = dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr=(unsigned char *)(arp+1); + + memcpy(arp_ptr, src_hw, dev->addr_len); + arp_ptr+=dev->addr_len; + memcpy(arp_ptr, &src_ip,4); + arp_ptr+=4; + if (target_hw != NULL) + memcpy(arp_ptr, target_hw, dev->addr_len); + else + memset(arp_ptr, 0, dev->addr_len); + arp_ptr+=dev->addr_len; + memcpy(arp_ptr, &dest_ip, 4); + + return skb; + +out: + kfree_skb(skb); + return NULL; +} + +/* + * Send an arp packet. + */ +void arp_xmit(struct sk_buff *skb) +{ + /* Send it off, maybe filter it using firewalling first. */ + NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); +} + +/* + * Create and send an arp packet. + */ +void arp_send(int type, int ptype, u32 dest_ip, + struct net_device *dev, u32 src_ip, + unsigned char *dest_hw, unsigned char *src_hw, + unsigned char *target_hw) +{ + struct sk_buff *skb; + + /* + * No arp on this interface. + */ + + if (dev->flags&IFF_NOARP) + return; + + skb = arp_create(type, ptype, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + if (skb == NULL) { + return; + } + + arp_xmit(skb); +} + +static void parp_redo(struct sk_buff *skb) +{ + nf_reset(skb); + arp_rcv(skb, skb->dev, NULL); +} + +/* + * Process an arp request. + */ + +static int arp_process(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct in_device *in_dev = in_dev_get(dev); + struct arphdr *arp; + unsigned char *arp_ptr; + struct rtable *rt; + unsigned char *sha, *tha; + u32 sip, tip; + u16 dev_type = dev->type; + int addr_type; + struct neighbour *n; + + /* arp_rcv below verifies the ARP header and verifies the device + * is ARP'able. + */ + + if (in_dev == NULL) + goto out; + + arp = skb->nh.arph; + + switch (dev_type) { + default: + if (arp->ar_pro != htons(ETH_P_IP) || + htons(dev_type) != arp->ar_hrd) + goto out; + break; +#ifdef CONFIG_NET_ETHERNET + case ARPHRD_ETHER: +#endif +#ifdef CONFIG_TR + case ARPHRD_IEEE802_TR: +#endif +#ifdef CONFIG_FDDI + case ARPHRD_FDDI: +#endif +#ifdef CONFIG_NET_FC + case ARPHRD_IEEE802: +#endif +#if defined(CONFIG_NET_ETHERNET) || defined(CONFIG_TR) || \ + defined(CONFIG_FDDI) || defined(CONFIG_NET_FC) + /* + * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802 + * devices, according to RFC 2625) devices will accept ARP + * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). + * This is the case also of FDDI, where the RFC 1390 says that + * FDDI devices should accept ARP hardware of (1) Ethernet, + * however, to be more robust, we'll accept both 1 (Ethernet) + * or 6 (IEEE 802.2) + */ + if ((arp->ar_hrd != htons(ARPHRD_ETHER) && + arp->ar_hrd != htons(ARPHRD_IEEE802)) || + arp->ar_pro != htons(ETH_P_IP)) + goto out; + break; +#endif +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: + if (arp->ar_pro != htons(AX25_P_IP) || + arp->ar_hrd != htons(ARPHRD_AX25)) + goto out; + break; +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case ARPHRD_NETROM: + if (arp->ar_pro != htons(AX25_P_IP) || + arp->ar_hrd != htons(ARPHRD_NETROM)) + goto out; + break; +#endif +#endif + } + + /* Understand only these message types */ + + if (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST)) + goto out; + +/* + * Extract fields + */ + arp_ptr= (unsigned char *)(arp+1); + sha = arp_ptr; + arp_ptr += dev->addr_len; + memcpy(&sip, arp_ptr, 4); + arp_ptr += 4; + tha = arp_ptr; + arp_ptr += dev->addr_len; + memcpy(&tip, arp_ptr, 4); +/* + * Check for bad requests for 127.x.x.x and requests for multicast + * addresses. If this is one such, delete it. + */ + if (LOOPBACK(tip) || MULTICAST(tip)) + goto out; + +/* + * Special case: We must set Frame Relay source Q.922 address + */ + if (dev_type == ARPHRD_DLCI) + sha = dev->broadcast; + +/* + * Process entry. The idea here is we want to send a reply if it is a + * request for us or if it is a request for someone else that we hold + * a proxy for. We want to add an entry to our cache if it is a reply + * to us or if it is a request for our address. + * (The assumption for this last is that if someone is requesting our + * address, they are probably intending to talk to us, so it saves time + * if we cache their address. Their address is also probably not in + * our cache, since ours is not in their cache.) + * + * Putting this another way, we only care about replies if they are to + * us, in which case we add them to the cache. For requests, we care + * about those for us and those for our proxies. We reply to both, + * and in the case of requests for us we add the requester to the arp + * cache. + */ + + /* Special case: IPv4 duplicate address detection packet (RFC2131) */ + if (sip == 0) { + if (arp->ar_op == htons(ARPOP_REQUEST) && + inet_addr_type(tip) == RTN_LOCAL && + !arp_ignore(in_dev,dev,sip,tip)) + arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr); + goto out; + } + + if (arp->ar_op == htons(ARPOP_REQUEST) && + ip_route_input(skb, tip, sip, 0, dev) == 0) { + + rt = (struct rtable*)skb->dst; + addr_type = rt->rt_type; + + if (addr_type == RTN_LOCAL) { + n = neigh_event_ns(&arp_tbl, sha, &sip, dev); + if (n) { + int dont_send = 0; + + if (!dont_send) + dont_send |= arp_ignore(in_dev,dev,sip,tip); + if (!dont_send && IN_DEV_ARPFILTER(in_dev)) + dont_send |= arp_filter(sip,tip,dev); + if (!dont_send) + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + + neigh_release(n); + } + goto out; + } else if (IN_DEV_FORWARD(in_dev)) { + if ((rt->rt_flags&RTCF_DNAT) || + (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && + (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) { + n = neigh_event_ns(&arp_tbl, sha, &sip, dev); + if (n) + neigh_release(n); + + if (skb->stamp.tv_sec == LOCALLY_ENQUEUED || + skb->pkt_type == PACKET_HOST || + in_dev->arp_parms->proxy_delay == 0) { + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + } else { + pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); + in_dev_put(in_dev); + return 0; + } + goto out; + } + } + } + + /* Update our ARP tables */ + + n = __neigh_lookup(&arp_tbl, &sip, dev, 0); + +#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP + /* Unsolicited ARP is not accepted by default. + It is possible, that this option should be enabled for some + devices (strip is candidate) + */ + if (n == NULL && + arp->ar_op == htons(ARPOP_REPLY) && + inet_addr_type(sip) == RTN_UNICAST) + n = __neigh_lookup(&arp_tbl, &sip, dev, -1); +#endif + + if (n) { + int state = NUD_REACHABLE; + int override; + + /* If several different ARP replies follows back-to-back, + use the FIRST one. It is possible, if several proxy + agents are active. Taking the first reply prevents + arp trashing and chooses the fastest router. + */ + override = time_after(jiffies, n->updated + n->parms->locktime); + + /* Broadcast replies and request packets + do not assert neighbour reachability. + */ + if (arp->ar_op != htons(ARPOP_REPLY) || + skb->pkt_type != PACKET_HOST) + state = NUD_STALE; + neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0); + neigh_release(n); + } + +out: + if (in_dev) + in_dev_put(in_dev); + kfree_skb(skb); + return 0; +} + + +/* + * Receive an arp request from the device layer. + */ + +int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct arphdr *arp; + + /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ + if (!pskb_may_pull(skb, (sizeof(struct arphdr) + + (2 * dev->addr_len) + + (2 * sizeof(u32))))) + goto freeskb; + + arp = skb->nh.arph; + if (arp->ar_hln != dev->addr_len || + dev->flags & IFF_NOARP || + skb->pkt_type == PACKET_OTHERHOST || + skb->pkt_type == PACKET_LOOPBACK || + arp->ar_pln != 4) + goto freeskb; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + goto out_of_mem; + + return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); + +freeskb: + kfree_skb(skb); +out_of_mem: + return 0; +} + +/* + * User level interface (ioctl) + */ + +/* + * Set (create) an ARP cache entry. + */ + +static int arp_req_set(struct arpreq *r, struct net_device * dev) +{ + u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + int err; + + if (r->arp_flags&ATF_PUBL) { + u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr; + if (mask && mask != 0xFFFFFFFF) + return -EINVAL; + if (!dev && (r->arp_flags & ATF_COM)) { + dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data); + if (!dev) + return -ENODEV; + } + if (mask) { + if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL) + return -ENOBUFS; + return 0; + } + if (dev == NULL) { + ipv4_devconf.proxy_arp = 1; + return 0; + } + if (__in_dev_get(dev)) { + __in_dev_get(dev)->cnf.proxy_arp = 1; + return 0; + } + return -ENXIO; + } + + if (r->arp_flags & ATF_PERM) + r->arp_flags |= ATF_COM; + if (dev == NULL) { + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } } }; + struct rtable * rt; + if ((err = ip_route_output_key(&rt, &fl)) != 0) + return err; + dev = rt->u.dst.dev; + ip_rt_put(rt); + if (!dev) + return -EINVAL; + } + switch (dev->type) { +#ifdef CONFIG_FDDI + case ARPHRD_FDDI: + /* + * According to RFC 1390, FDDI devices should accept ARP + * hardware types of 1 (Ethernet). However, to be more + * robust, we'll accept hardware types of either 1 (Ethernet) + * or 6 (IEEE 802.2). + */ + if (r->arp_ha.sa_family != ARPHRD_FDDI && + r->arp_ha.sa_family != ARPHRD_ETHER && + r->arp_ha.sa_family != ARPHRD_IEEE802) + return -EINVAL; + break; +#endif + default: + if (r->arp_ha.sa_family != dev->type) + return -EINVAL; + break; + } + + neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); + err = PTR_ERR(neigh); + if (!IS_ERR(neigh)) { + unsigned state = NUD_STALE; + if (r->arp_flags & ATF_PERM) + state = NUD_PERMANENT; + err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? + r->arp_ha.sa_data : NULL, state, + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); + } + return err; +} + +static unsigned arp_state_to_flags(struct neighbour *neigh) +{ + unsigned flags = 0; + if (neigh->nud_state&NUD_PERMANENT) + flags = ATF_PERM|ATF_COM; + else if (neigh->nud_state&NUD_VALID) + flags = ATF_COM; + return flags; +} + +/* + * Get an ARP cache entry. + */ + +static int arp_req_get(struct arpreq *r, struct net_device *dev) +{ + u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + int err = -ENXIO; + + neigh = neigh_lookup(&arp_tbl, &ip, dev); + if (neigh) { + read_lock_bh(&neigh->lock); + memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); + r->arp_flags = arp_state_to_flags(neigh); + read_unlock_bh(&neigh->lock); + r->arp_ha.sa_family = dev->type; + strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); + neigh_release(neigh); + err = 0; + } + return err; +} + +static int arp_req_delete(struct arpreq *r, struct net_device * dev) +{ + int err; + u32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + + if (r->arp_flags & ATF_PUBL) { + u32 mask = + ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; + if (mask == 0xFFFFFFFF) + return pneigh_delete(&arp_tbl, &ip, dev); + if (mask == 0) { + if (dev == NULL) { + ipv4_devconf.proxy_arp = 0; + return 0; + } + if (__in_dev_get(dev)) { + __in_dev_get(dev)->cnf.proxy_arp = 0; + return 0; + } + return -ENXIO; + } + return -EINVAL; + } + + if (dev == NULL) { + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } } }; + struct rtable * rt; + if ((err = ip_route_output_key(&rt, &fl)) != 0) + return err; + dev = rt->u.dst.dev; + ip_rt_put(rt); + if (!dev) + return -EINVAL; + } + err = -ENXIO; + neigh = neigh_lookup(&arp_tbl, &ip, dev); + if (neigh) { + if (neigh->nud_state&~NUD_NOARP) + err = neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); + } + return err; +} + +/* + * Handle an ARP layer I/O control request. + */ + +int arp_ioctl(unsigned int cmd, void __user *arg) +{ + int err; + struct arpreq r; + struct net_device *dev = NULL; + + switch (cmd) { + case SIOCDARP: + case SIOCSARP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + case SIOCGARP: + err = copy_from_user(&r, arg, sizeof(struct arpreq)); + if (err) + return -EFAULT; + break; + default: + return -EINVAL; + } + + if (r.arp_pa.sa_family != AF_INET) + return -EPFNOSUPPORT; + + if (!(r.arp_flags & ATF_PUBL) && + (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) + return -EINVAL; + if (!(r.arp_flags & ATF_NETMASK)) + ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = + htonl(0xFFFFFFFFUL); + rtnl_lock(); + if (r.arp_dev[0]) { + err = -ENODEV; + if ((dev = __dev_get_by_name(r.arp_dev)) == NULL) + goto out; + + /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ + if (!r.arp_ha.sa_family) + r.arp_ha.sa_family = dev->type; + err = -EINVAL; + if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type) + goto out; + } else if (cmd == SIOCGARP) { + err = -ENODEV; + goto out; + } + + switch(cmd) { + case SIOCDARP: + err = arp_req_delete(&r, dev); + break; + case SIOCSARP: + err = arp_req_set(&r, dev); + break; + case SIOCGARP: + err = arp_req_get(&r, dev); + if (!err && copy_to_user(arg, &r, sizeof(r))) + err = -EFAULT; + break; + } +out: + rtnl_unlock(); + return err; +} + +static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + switch (event) { + case NETDEV_CHANGEADDR: + neigh_changeaddr(&arp_tbl, dev); + rt_cache_flush(0); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block arp_netdev_notifier = { + .notifier_call = arp_netdev_event, +}; + +/* Note, that it is not on notifier chain. + It is necessary, that this routine was called after route cache will be + flushed. + */ +void arp_ifdown(struct net_device *dev) +{ + neigh_ifdown(&arp_tbl, dev); +} + + +/* + * Called once on startup. + */ + +static struct packet_type arp_packet_type = { + .type = __constant_htons(ETH_P_ARP), + .func = arp_rcv, +}; + +static int arp_proc_init(void); + +void __init arp_init(void) +{ + neigh_table_init(&arp_tbl); + + dev_add_pack(&arp_packet_type); + arp_proc_init(); +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, + NET_IPV4_NEIGH, "ipv4", NULL, NULL); +#endif + register_netdevice_notifier(&arp_netdev_notifier); +} + +#ifdef CONFIG_PROC_FS +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + +/* ------------------------------------------------------------------------ */ +/* + * ax25 -> ASCII conversion + */ +static char *ax2asc2(ax25_address *a, char *buf) +{ + char c, *s; + int n; + + for (n = 0, s = buf; n < 6; n++) { + c = (a->ax25_call[n] >> 1) & 0x7F; + + if (c != ' ') *s++ = c; + } + + *s++ = '-'; + + if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { + *s++ = '1'; + n -= 10; + } + + *s++ = n + '0'; + *s++ = '\0'; + + if (*buf == '\0' || *buf == '-') + return "*"; + + return buf; + +} +#endif /* CONFIG_AX25 */ + +#define HBUFFERLEN 30 + +static void arp_format_neigh_entry(struct seq_file *seq, + struct neighbour *n) +{ + char hbuffer[HBUFFERLEN]; + const char hexbuf[] = "0123456789ABCDEF"; + int k, j; + char tbuf[16]; + struct net_device *dev = n->dev; + int hatype = dev->type; + + read_lock(&n->lock); + /* Convert hardware address to XX:XX:XX:XX ... form. */ +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) + ax2asc2((ax25_address *)n->ha, hbuffer); + else { +#endif + for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { + hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15]; + hbuffer[k++] = hexbuf[n->ha[j] & 15]; + hbuffer[k++] = ':'; + } + hbuffer[--k] = 0; +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + } +#endif + sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key)); + seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", + tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); + read_unlock(&n->lock); +} + +static void arp_format_pneigh_entry(struct seq_file *seq, + struct pneigh_entry *n) +{ + struct net_device *dev = n->dev; + int hatype = dev ? dev->type : 0; + char tbuf[16]; + + sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->key)); + seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", + tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", + dev ? dev->name : "*"); +} + +static int arp_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "IP address HW type Flags " + "HW address Mask Device\n"); + } else { + struct neigh_seq_state *state = seq->private; + + if (state->flags & NEIGH_SEQ_IS_PNEIGH) + arp_format_pneigh_entry(seq, v); + else + arp_format_neigh_entry(seq, v); + } + + return 0; +} + +static void *arp_seq_start(struct seq_file *seq, loff_t *pos) +{ + /* Don't want to confuse "arp -a" w/ magic entries, + * so we tell the generic iterator to skip NUD_NOARP. + */ + return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP); +} + +/* ------------------------------------------------------------------------ */ + +static struct seq_operations arp_seq_ops = { + .start = arp_seq_start, + .next = neigh_seq_next, + .stop = neigh_seq_stop, + .show = arp_seq_show, +}; + +static int arp_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct neigh_seq_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + memset(s, 0, sizeof(*s)); + rc = seq_open(file, &arp_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations arp_seq_fops = { + .owner = THIS_MODULE, + .open = arp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int __init arp_proc_init(void) +{ + if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops)) + return -ENOMEM; + return 0; +} + +#else /* CONFIG_PROC_FS */ + +static int __init arp_proc_init(void) +{ + return 0; +} + +#endif /* CONFIG_PROC_FS */ + +EXPORT_SYMBOL(arp_broken_ops); +EXPORT_SYMBOL(arp_find); +EXPORT_SYMBOL(arp_rcv); +EXPORT_SYMBOL(arp_create); +EXPORT_SYMBOL(arp_xmit); +EXPORT_SYMBOL(arp_send); +EXPORT_SYMBOL(arp_tbl); + +#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) +EXPORT_SYMBOL(clip_tbl_hook); +#endif diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c new file mode 100644 index 000000000000..b1db561f2542 --- /dev/null +++ b/net/ipv4/datagram.c @@ -0,0 +1,73 @@ +/* + * common UDP/RAW code + * Linux INET implementation + * + * Authors: + * Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/in.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/route.h> + +int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; + struct rtable *rt; + u32 saddr; + int oif; + int err; + + + if (addr_len < sizeof(*usin)) + return -EINVAL; + + if (usin->sin_family != AF_INET) + return -EAFNOSUPPORT; + + sk_dst_reset(sk); + + oif = sk->sk_bound_dev_if; + saddr = inet->saddr; + if (MULTICAST(usin->sin_addr.s_addr)) { + if (!oif) + oif = inet->mc_index; + if (!saddr) + saddr = inet->mc_addr; + } + err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, + RT_CONN_FLAGS(sk), oif, + sk->sk_protocol, + inet->sport, usin->sin_port, sk); + if (err) + return err; + if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { + ip_rt_put(rt); + return -EACCES; + } + if (!inet->saddr) + inet->saddr = rt->rt_src; /* Update source address */ + if (!inet->rcv_saddr) + inet->rcv_saddr = rt->rt_src; + inet->daddr = rt->rt_dst; + inet->dport = usin->sin_port; + sk->sk_state = TCP_ESTABLISHED; + inet->id = jiffies; + + sk_dst_set(sk, &rt->u.dst); + return(0); +} + +EXPORT_SYMBOL(ip4_datagram_connect); + diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c new file mode 100644 index 000000000000..eea7ef010776 --- /dev/null +++ b/net/ipv4/devinet.c @@ -0,0 +1,1508 @@ +/* + * NET3 IP device support routines. + * + * Version: $Id: devinet.c,v 1.44 2001/10/31 21:55:54 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Derived from the IP parts of dev.c 1.0.19 + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * + * Additional Authors: + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * Alexey Kuznetsov: pa_* fields are replaced with ifaddr + * lists. + * Cyrus Durgin: updated for kmod + * Matthias Andree: in devinet_ioctl, compare label and + * address (4.4BSD alias style support), + * fall back to comparing just the label + * if no match found. + */ + +#include <linux/config.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif +#include <linux/kmod.h> + +#include <net/ip.h> +#include <net/route.h> +#include <net/ip_fib.h> + +struct ipv4_devconf ipv4_devconf = { + .accept_redirects = 1, + .send_redirects = 1, + .secure_redirects = 1, + .shared_media = 1, +}; + +static struct ipv4_devconf ipv4_devconf_dflt = { + .accept_redirects = 1, + .send_redirects = 1, + .secure_redirects = 1, + .shared_media = 1, + .accept_source_route = 1, +}; + +static void rtmsg_ifa(int event, struct in_ifaddr *); + +static struct notifier_block *inetaddr_chain; +static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, + int destroy); +#ifdef CONFIG_SYSCTL +static void devinet_sysctl_register(struct in_device *in_dev, + struct ipv4_devconf *p); +static void devinet_sysctl_unregister(struct ipv4_devconf *p); +#endif + +/* Locks all the inet devices. */ + +static struct in_ifaddr *inet_alloc_ifa(void) +{ + struct in_ifaddr *ifa = kmalloc(sizeof(*ifa), GFP_KERNEL); + + if (ifa) { + memset(ifa, 0, sizeof(*ifa)); + INIT_RCU_HEAD(&ifa->rcu_head); + } + + return ifa; +} + +static void inet_rcu_free_ifa(struct rcu_head *head) +{ + struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); + if (ifa->ifa_dev) + in_dev_put(ifa->ifa_dev); + kfree(ifa); +} + +static inline void inet_free_ifa(struct in_ifaddr *ifa) +{ + call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); +} + +void in_dev_finish_destroy(struct in_device *idev) +{ + struct net_device *dev = idev->dev; + + BUG_TRAP(!idev->ifa_list); + BUG_TRAP(!idev->mc_list); +#ifdef NET_REFCNT_DEBUG + printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n", + idev, dev ? dev->name : "NIL"); +#endif + dev_put(dev); + if (!idev->dead) + printk("Freeing alive in_device %p\n", idev); + else { + kfree(idev); + } +} + +struct in_device *inetdev_init(struct net_device *dev) +{ + struct in_device *in_dev; + + ASSERT_RTNL(); + + in_dev = kmalloc(sizeof(*in_dev), GFP_KERNEL); + if (!in_dev) + goto out; + memset(in_dev, 0, sizeof(*in_dev)); + INIT_RCU_HEAD(&in_dev->rcu_head); + memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf)); + in_dev->cnf.sysctl = NULL; + in_dev->dev = dev; + if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) + goto out_kfree; + /* Reference in_dev->dev */ + dev_hold(dev); +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, + NET_IPV4_NEIGH, "ipv4", NULL, NULL); +#endif + + /* Account for reference dev->ip_ptr */ + in_dev_hold(in_dev); + rcu_assign_pointer(dev->ip_ptr, in_dev); + +#ifdef CONFIG_SYSCTL + devinet_sysctl_register(in_dev, &in_dev->cnf); +#endif + ip_mc_init_dev(in_dev); + if (dev->flags & IFF_UP) + ip_mc_up(in_dev); +out: + return in_dev; +out_kfree: + kfree(in_dev); + in_dev = NULL; + goto out; +} + +static void in_dev_rcu_put(struct rcu_head *head) +{ + struct in_device *idev = container_of(head, struct in_device, rcu_head); + in_dev_put(idev); +} + +static void inetdev_destroy(struct in_device *in_dev) +{ + struct in_ifaddr *ifa; + struct net_device *dev; + + ASSERT_RTNL(); + + dev = in_dev->dev; + if (dev == &loopback_dev) + return; + + in_dev->dead = 1; + + ip_mc_destroy_dev(in_dev); + + while ((ifa = in_dev->ifa_list) != NULL) { + inet_del_ifa(in_dev, &in_dev->ifa_list, 0); + inet_free_ifa(ifa); + } + +#ifdef CONFIG_SYSCTL + devinet_sysctl_unregister(&in_dev->cnf); +#endif + + dev->ip_ptr = NULL; + +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(in_dev->arp_parms); +#endif + neigh_parms_release(&arp_tbl, in_dev->arp_parms); + arp_ifdown(dev); + + call_rcu(&in_dev->rcu_head, in_dev_rcu_put); +} + +int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b) +{ + rcu_read_lock(); + for_primary_ifa(in_dev) { + if (inet_ifa_match(a, ifa)) { + if (!b || inet_ifa_match(b, ifa)) { + rcu_read_unlock(); + return 1; + } + } + } endfor_ifa(in_dev); + rcu_read_unlock(); + return 0; +} + +static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, + int destroy) +{ + struct in_ifaddr *ifa1 = *ifap; + + ASSERT_RTNL(); + + /* 1. Deleting primary ifaddr forces deletion all secondaries */ + + if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { + struct in_ifaddr *ifa; + struct in_ifaddr **ifap1 = &ifa1->ifa_next; + + while ((ifa = *ifap1) != NULL) { + if (!(ifa->ifa_flags & IFA_F_SECONDARY) || + ifa1->ifa_mask != ifa->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, ifa)) { + ifap1 = &ifa->ifa_next; + continue; + } + + *ifap1 = ifa->ifa_next; + + rtmsg_ifa(RTM_DELADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); + inet_free_ifa(ifa); + } + } + + /* 2. Unlink it */ + + *ifap = ifa1->ifa_next; + + /* 3. Announce address deletion */ + + /* Send message first, then call notifier. + At first sight, FIB update triggered by notifier + will refer to already deleted ifaddr, that could confuse + netlink listeners. It is not true: look, gated sees + that route deleted and if it still thinks that ifaddr + is valid, it will try to restore deleted routes... Grr. + So that, this order is correct. + */ + rtmsg_ifa(RTM_DELADDR, ifa1); + notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); + if (destroy) { + inet_free_ifa(ifa1); + + if (!in_dev->ifa_list) + inetdev_destroy(in_dev); + } +} + +static int inet_insert_ifa(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct in_ifaddr *ifa1, **ifap, **last_primary; + + ASSERT_RTNL(); + + if (!ifa->ifa_local) { + inet_free_ifa(ifa); + return 0; + } + + ifa->ifa_flags &= ~IFA_F_SECONDARY; + last_primary = &in_dev->ifa_list; + + for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; + ifap = &ifa1->ifa_next) { + if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && + ifa->ifa_scope <= ifa1->ifa_scope) + last_primary = &ifa1->ifa_next; + if (ifa1->ifa_mask == ifa->ifa_mask && + inet_ifa_match(ifa1->ifa_address, ifa)) { + if (ifa1->ifa_local == ifa->ifa_local) { + inet_free_ifa(ifa); + return -EEXIST; + } + if (ifa1->ifa_scope != ifa->ifa_scope) { + inet_free_ifa(ifa); + return -EINVAL; + } + ifa->ifa_flags |= IFA_F_SECONDARY; + } + } + + if (!(ifa->ifa_flags & IFA_F_SECONDARY)) { + net_srandom(ifa->ifa_local); + ifap = last_primary; + } + + ifa->ifa_next = *ifap; + *ifap = ifa; + + /* Send message first, then call notifier. + Notifier will trigger FIB update, so that + listeners of netlink will know about new ifaddr */ + rtmsg_ifa(RTM_NEWADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); + + return 0; +} + +static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) +{ + struct in_device *in_dev = __in_dev_get(dev); + + ASSERT_RTNL(); + + if (!in_dev) { + in_dev = inetdev_init(dev); + if (!in_dev) { + inet_free_ifa(ifa); + return -ENOBUFS; + } + } + if (ifa->ifa_dev != in_dev) { + BUG_TRAP(!ifa->ifa_dev); + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + } + if (LOOPBACK(ifa->ifa_local)) + ifa->ifa_scope = RT_SCOPE_HOST; + return inet_insert_ifa(ifa); +} + +struct in_device *inetdev_by_index(int ifindex) +{ + struct net_device *dev; + struct in_device *in_dev = NULL; + read_lock(&dev_base_lock); + dev = __dev_get_by_index(ifindex); + if (dev) + in_dev = in_dev_get(dev); + read_unlock(&dev_base_lock); + return in_dev; +} + +/* Called only from RTNL semaphored context. No locks. */ + +struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, + u32 mask) +{ + ASSERT_RTNL(); + + for_primary_ifa(in_dev) { + if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) + return ifa; + } endfor_ifa(in_dev); + return NULL; +} + +static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct in_device *in_dev; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in_ifaddr *ifa, **ifap; + + ASSERT_RTNL(); + + if ((in_dev = inetdev_by_index(ifm->ifa_index)) == NULL) + goto out; + __in_dev_put(in_dev); + + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) { + if ((rta[IFA_LOCAL - 1] && + memcmp(RTA_DATA(rta[IFA_LOCAL - 1]), + &ifa->ifa_local, 4)) || + (rta[IFA_LABEL - 1] && + rtattr_strcmp(rta[IFA_LABEL - 1], ifa->ifa_label)) || + (rta[IFA_ADDRESS - 1] && + (ifm->ifa_prefixlen != ifa->ifa_prefixlen || + !inet_ifa_match(*(u32*)RTA_DATA(rta[IFA_ADDRESS - 1]), + ifa)))) + continue; + inet_del_ifa(in_dev, ifap, 1); + return 0; + } +out: + return -EADDRNOTAVAIL; +} + +static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct net_device *dev; + struct in_device *in_dev; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in_ifaddr *ifa; + int rc = -EINVAL; + + ASSERT_RTNL(); + + if (ifm->ifa_prefixlen > 32 || !rta[IFA_LOCAL - 1]) + goto out; + + rc = -ENODEV; + if ((dev = __dev_get_by_index(ifm->ifa_index)) == NULL) + goto out; + + rc = -ENOBUFS; + if ((in_dev = __in_dev_get(dev)) == NULL) { + in_dev = inetdev_init(dev); + if (!in_dev) + goto out; + } + + if ((ifa = inet_alloc_ifa()) == NULL) + goto out; + + if (!rta[IFA_ADDRESS - 1]) + rta[IFA_ADDRESS - 1] = rta[IFA_LOCAL - 1]; + memcpy(&ifa->ifa_local, RTA_DATA(rta[IFA_LOCAL - 1]), 4); + memcpy(&ifa->ifa_address, RTA_DATA(rta[IFA_ADDRESS - 1]), 4); + ifa->ifa_prefixlen = ifm->ifa_prefixlen; + ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); + if (rta[IFA_BROADCAST - 1]) + memcpy(&ifa->ifa_broadcast, + RTA_DATA(rta[IFA_BROADCAST - 1]), 4); + if (rta[IFA_ANYCAST - 1]) + memcpy(&ifa->ifa_anycast, RTA_DATA(rta[IFA_ANYCAST - 1]), 4); + ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_scope = ifm->ifa_scope; + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + if (rta[IFA_LABEL - 1]) + rtattr_strlcpy(ifa->ifa_label, rta[IFA_LABEL - 1], IFNAMSIZ); + else + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + + rc = inet_insert_ifa(ifa); +out: + return rc; +} + +/* + * Determine a default network mask, based on the IP address. + */ + +static __inline__ int inet_abc_len(u32 addr) +{ + int rc = -1; /* Something else, probably a multicast. */ + + if (ZERONET(addr)) + rc = 0; + else { + addr = ntohl(addr); + + if (IN_CLASSA(addr)) + rc = 8; + else if (IN_CLASSB(addr)) + rc = 16; + else if (IN_CLASSC(addr)) + rc = 24; + } + + return rc; +} + + +int devinet_ioctl(unsigned int cmd, void __user *arg) +{ + struct ifreq ifr; + struct sockaddr_in sin_orig; + struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; + struct in_device *in_dev; + struct in_ifaddr **ifap = NULL; + struct in_ifaddr *ifa = NULL; + struct net_device *dev; + char *colon; + int ret = -EFAULT; + int tryaddrmatch = 0; + + /* + * Fetch the caller's info block into kernel space + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + goto out; + ifr.ifr_name[IFNAMSIZ - 1] = 0; + + /* save original address for comparison */ + memcpy(&sin_orig, sin, sizeof(*sin)); + + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + +#ifdef CONFIG_KMOD + dev_load(ifr.ifr_name); +#endif + + switch(cmd) { + case SIOCGIFADDR: /* Get interface address */ + case SIOCGIFBRDADDR: /* Get the broadcast address */ + case SIOCGIFDSTADDR: /* Get the destination address */ + case SIOCGIFNETMASK: /* Get the netmask for the interface */ + /* Note that these ioctls will not sleep, + so that we do not impose a lock. + One day we will be forced to put shlock here (I mean SMP) + */ + tryaddrmatch = (sin_orig.sin_family == AF_INET); + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + break; + + case SIOCSIFFLAGS: + ret = -EACCES; + if (!capable(CAP_NET_ADMIN)) + goto out; + break; + case SIOCSIFADDR: /* Set interface address (and family) */ + case SIOCSIFBRDADDR: /* Set the broadcast address */ + case SIOCSIFDSTADDR: /* Set the destination address */ + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + ret = -EACCES; + if (!capable(CAP_NET_ADMIN)) + goto out; + ret = -EINVAL; + if (sin->sin_family != AF_INET) + goto out; + break; + default: + ret = -EINVAL; + goto out; + } + + rtnl_lock(); + + ret = -ENODEV; + if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL) + goto done; + + if (colon) + *colon = ':'; + + if ((in_dev = __in_dev_get(dev)) != NULL) { + if (tryaddrmatch) { + /* Matthias Andree */ + /* compare label and address (4.4BSD style) */ + /* note: we only do this for a limited set of ioctls + and only if the original address family was AF_INET. + This is checked above. */ + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) { + if (!strcmp(ifr.ifr_name, ifa->ifa_label) && + sin_orig.sin_addr.s_addr == + ifa->ifa_address) { + break; /* found */ + } + } + } + /* we didn't get a match, maybe the application is + 4.3BSD-style and passed in junk so we fall back to + comparing just the label */ + if (!ifa) { + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) + if (!strcmp(ifr.ifr_name, ifa->ifa_label)) + break; + } + } + + ret = -EADDRNOTAVAIL; + if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) + goto done; + + switch(cmd) { + case SIOCGIFADDR: /* Get interface address */ + sin->sin_addr.s_addr = ifa->ifa_local; + goto rarok; + + case SIOCGIFBRDADDR: /* Get the broadcast address */ + sin->sin_addr.s_addr = ifa->ifa_broadcast; + goto rarok; + + case SIOCGIFDSTADDR: /* Get the destination address */ + sin->sin_addr.s_addr = ifa->ifa_address; + goto rarok; + + case SIOCGIFNETMASK: /* Get the netmask for the interface */ + sin->sin_addr.s_addr = ifa->ifa_mask; + goto rarok; + + case SIOCSIFFLAGS: + if (colon) { + ret = -EADDRNOTAVAIL; + if (!ifa) + break; + ret = 0; + if (!(ifr.ifr_flags & IFF_UP)) + inet_del_ifa(in_dev, ifap, 1); + break; + } + ret = dev_change_flags(dev, ifr.ifr_flags); + break; + + case SIOCSIFADDR: /* Set interface address (and family) */ + ret = -EINVAL; + if (inet_abc_len(sin->sin_addr.s_addr) < 0) + break; + + if (!ifa) { + ret = -ENOBUFS; + if ((ifa = inet_alloc_ifa()) == NULL) + break; + if (colon) + memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); + else + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + } else { + ret = 0; + if (ifa->ifa_local == sin->sin_addr.s_addr) + break; + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_broadcast = 0; + ifa->ifa_anycast = 0; + } + + ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; + + if (!(dev->flags & IFF_POINTOPOINT)) { + ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); + ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); + if ((dev->flags & IFF_BROADCAST) && + ifa->ifa_prefixlen < 31) + ifa->ifa_broadcast = ifa->ifa_address | + ~ifa->ifa_mask; + } else { + ifa->ifa_prefixlen = 32; + ifa->ifa_mask = inet_make_mask(32); + } + ret = inet_set_ifa(dev, ifa); + break; + + case SIOCSIFBRDADDR: /* Set the broadcast address */ + ret = 0; + if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_broadcast = sin->sin_addr.s_addr; + inet_insert_ifa(ifa); + } + break; + + case SIOCSIFDSTADDR: /* Set the destination address */ + ret = 0; + if (ifa->ifa_address == sin->sin_addr.s_addr) + break; + ret = -EINVAL; + if (inet_abc_len(sin->sin_addr.s_addr) < 0) + break; + ret = 0; + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_address = sin->sin_addr.s_addr; + inet_insert_ifa(ifa); + break; + + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + + /* + * The mask we set must be legal. + */ + ret = -EINVAL; + if (bad_mask(sin->sin_addr.s_addr, 0)) + break; + ret = 0; + if (ifa->ifa_mask != sin->sin_addr.s_addr) { + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_mask = sin->sin_addr.s_addr; + ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); + + /* See if current broadcast address matches + * with current netmask, then recalculate + * the broadcast address. Otherwise it's a + * funny address, so don't touch it since + * the user seems to know what (s)he's doing... + */ + if ((dev->flags & IFF_BROADCAST) && + (ifa->ifa_prefixlen < 31) && + (ifa->ifa_broadcast == + (ifa->ifa_local|~ifa->ifa_mask))) { + ifa->ifa_broadcast = (ifa->ifa_local | + ~sin->sin_addr.s_addr); + } + inet_insert_ifa(ifa); + } + break; + } +done: + rtnl_unlock(); +out: + return ret; +rarok: + rtnl_unlock(); + ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0; + goto out; +} + +static int inet_gifconf(struct net_device *dev, char __user *buf, int len) +{ + struct in_device *in_dev = __in_dev_get(dev); + struct in_ifaddr *ifa; + struct ifreq ifr; + int done = 0; + + if (!in_dev || (ifa = in_dev->ifa_list) == NULL) + goto out; + + for (; ifa; ifa = ifa->ifa_next) { + if (!buf) { + done += sizeof(ifr); + continue; + } + if (len < (int) sizeof(ifr)) + break; + memset(&ifr, 0, sizeof(struct ifreq)); + if (ifa->ifa_label) + strcpy(ifr.ifr_name, ifa->ifa_label); + else + strcpy(ifr.ifr_name, dev->name); + + (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; + (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = + ifa->ifa_local; + + if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) { + done = -EFAULT; + break; + } + buf += sizeof(struct ifreq); + len -= sizeof(struct ifreq); + done += sizeof(struct ifreq); + } +out: + return done; +} + +u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope) +{ + u32 addr = 0; + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get(dev); + if (!in_dev) + goto no_in_dev; + + for_primary_ifa(in_dev) { + if (ifa->ifa_scope > scope) + continue; + if (!dst || inet_ifa_match(dst, ifa)) { + addr = ifa->ifa_local; + break; + } + if (!addr) + addr = ifa->ifa_local; + } endfor_ifa(in_dev); +no_in_dev: + rcu_read_unlock(); + + if (addr) + goto out; + + /* Not loopback addresses on loopback should be preferred + in this case. It is importnat that lo is the first interface + in dev_base list. + */ + read_lock(&dev_base_lock); + rcu_read_lock(); + for (dev = dev_base; dev; dev = dev->next) { + if ((in_dev = __in_dev_get(dev)) == NULL) + continue; + + for_primary_ifa(in_dev) { + if (ifa->ifa_scope != RT_SCOPE_LINK && + ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + goto out_unlock_both; + } + } endfor_ifa(in_dev); + } +out_unlock_both: + read_unlock(&dev_base_lock); + rcu_read_unlock(); +out: + return addr; +} + +static u32 confirm_addr_indev(struct in_device *in_dev, u32 dst, + u32 local, int scope) +{ + int same = 0; + u32 addr = 0; + + for_ifa(in_dev) { + if (!addr && + (local == ifa->ifa_local || !local) && + ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + if (same) + break; + } + if (!same) { + same = (!local || inet_ifa_match(local, ifa)) && + (!dst || inet_ifa_match(dst, ifa)); + if (same && addr) { + if (local || !dst) + break; + /* Is the selected addr into dst subnet? */ + if (inet_ifa_match(addr, ifa)) + break; + /* No, then can we use new local src? */ + if (ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + break; + } + /* search for large dst subnet for addr */ + same = 0; + } + } + } endfor_ifa(in_dev); + + return same? addr : 0; +} + +/* + * Confirm that local IP address exists using wildcards: + * - dev: only on this interface, 0=any interface + * - dst: only in the same subnet as dst, 0=any dst + * - local: address, 0=autoselect the local address + * - scope: maximum allowed scope value for the local address + */ +u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scope) +{ + u32 addr = 0; + struct in_device *in_dev; + + if (dev) { + rcu_read_lock(); + if ((in_dev = __in_dev_get(dev))) + addr = confirm_addr_indev(in_dev, dst, local, scope); + rcu_read_unlock(); + + return addr; + } + + read_lock(&dev_base_lock); + rcu_read_lock(); + for (dev = dev_base; dev; dev = dev->next) { + if ((in_dev = __in_dev_get(dev))) { + addr = confirm_addr_indev(in_dev, dst, local, scope); + if (addr) + break; + } + } + rcu_read_unlock(); + read_unlock(&dev_base_lock); + + return addr; +} + +/* + * Device notifier + */ + +int register_inetaddr_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&inetaddr_chain, nb); +} + +int unregister_inetaddr_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&inetaddr_chain, nb); +} + +/* Rename ifa_labels for a device name change. Make some effort to preserve existing + * alias numbering and to create unique labels if possible. +*/ +static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) +{ + struct in_ifaddr *ifa; + int named = 0; + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + char old[IFNAMSIZ], *dot; + + memcpy(old, ifa->ifa_label, IFNAMSIZ); + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + if (named++ == 0) + continue; + dot = strchr(ifa->ifa_label, ':'); + if (dot == NULL) { + sprintf(old, ":%d", named); + dot = old; + } + if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) { + strcat(ifa->ifa_label, dot); + } else { + strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); + } + } +} + +/* Called only under RTNL semaphore */ + +static int inetdev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct in_device *in_dev = __in_dev_get(dev); + + ASSERT_RTNL(); + + if (!in_dev) { + if (event == NETDEV_REGISTER && dev == &loopback_dev) { + in_dev = inetdev_init(dev); + if (!in_dev) + panic("devinet: Failed to create loopback\n"); + in_dev->cnf.no_xfrm = 1; + in_dev->cnf.no_policy = 1; + } + goto out; + } + + switch (event) { + case NETDEV_REGISTER: + printk(KERN_DEBUG "inetdev_event: bug\n"); + dev->ip_ptr = NULL; + break; + case NETDEV_UP: + if (dev->mtu < 68) + break; + if (dev == &loopback_dev) { + struct in_ifaddr *ifa; + if ((ifa = inet_alloc_ifa()) != NULL) { + ifa->ifa_local = + ifa->ifa_address = htonl(INADDR_LOOPBACK); + ifa->ifa_prefixlen = 8; + ifa->ifa_mask = inet_make_mask(8); + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + ifa->ifa_scope = RT_SCOPE_HOST; + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + inet_insert_ifa(ifa); + } + } + ip_mc_up(in_dev); + break; + case NETDEV_DOWN: + ip_mc_down(in_dev); + break; + case NETDEV_CHANGEMTU: + if (dev->mtu >= 68) + break; + /* MTU falled under 68, disable IP */ + case NETDEV_UNREGISTER: + inetdev_destroy(in_dev); + break; + case NETDEV_CHANGENAME: + /* Do not notify about label change, this event is + * not interesting to applications using netlink. + */ + inetdev_changename(dev, in_dev); + +#ifdef CONFIG_SYSCTL + devinet_sysctl_unregister(&in_dev->cnf); + neigh_sysctl_unregister(in_dev->arp_parms); + neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, + NET_IPV4_NEIGH, "ipv4", NULL, NULL); + devinet_sysctl_register(in_dev, &in_dev->cnf); +#endif + break; + } +out: + return NOTIFY_DONE; +} + +static struct notifier_block ip_netdev_notifier = { + .notifier_call =inetdev_event, +}; + +static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, + u32 pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + ifm = NLMSG_DATA(nlh); + ifm->ifa_family = AF_INET; + ifm->ifa_prefixlen = ifa->ifa_prefixlen; + ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT; + ifm->ifa_scope = ifa->ifa_scope; + ifm->ifa_index = ifa->ifa_dev->dev->ifindex; + if (ifa->ifa_address) + RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address); + if (ifa->ifa_local) + RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local); + if (ifa->ifa_broadcast) + RTA_PUT(skb, IFA_BROADCAST, 4, &ifa->ifa_broadcast); + if (ifa->ifa_anycast) + RTA_PUT(skb, IFA_ANYCAST, 4, &ifa->ifa_anycast); + if (ifa->ifa_label[0]) + RTA_PUT(skb, IFA_LABEL, IFNAMSIZ, &ifa->ifa_label); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, ip_idx; + struct net_device *dev; + struct in_device *in_dev; + struct in_ifaddr *ifa; + int s_ip_idx, s_idx = cb->args[0]; + + s_ip_idx = ip_idx = cb->args[1]; + read_lock(&dev_base_lock); + for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_ip_idx = 0; + rcu_read_lock(); + if ((in_dev = __in_dev_get(dev)) == NULL) { + rcu_read_unlock(); + continue; + } + + for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; + ifa = ifa->ifa_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWADDR) <= 0) { + rcu_read_unlock(); + goto done; + } + } + rcu_read_unlock(); + } + +done: + read_unlock(&dev_base_lock); + cb->args[0] = idx; + cb->args[1] = ip_idx; + + return skb->len; +} + +static void rtmsg_ifa(int event, struct in_ifaddr* ifa) +{ + int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + 128); + struct sk_buff *skb = alloc_skb(size, GFP_KERNEL); + + if (!skb) + netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); + else if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { + kfree_skb(skb); + netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); + } else { + NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL); + } +} + +static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX - RTM_BASE + 1] = { + [4] = { .doit = inet_rtm_newaddr, }, + [5] = { .doit = inet_rtm_deladdr, }, + [6] = { .dumpit = inet_dump_ifaddr, }, + [8] = { .doit = inet_rtm_newroute, }, + [9] = { .doit = inet_rtm_delroute, }, + [10] = { .doit = inet_rtm_getroute, .dumpit = inet_dump_fib, }, +#ifdef CONFIG_IP_MULTIPLE_TABLES + [16] = { .doit = inet_rtm_newrule, }, + [17] = { .doit = inet_rtm_delrule, }, + [18] = { .dumpit = inet_dump_rules, }, +#endif +}; + +#ifdef CONFIG_SYSCTL + +void inet_forward_change(void) +{ + struct net_device *dev; + int on = ipv4_devconf.forwarding; + + ipv4_devconf.accept_redirects = !on; + ipv4_devconf_dflt.forwarding = on; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev; dev = dev->next) { + struct in_device *in_dev; + rcu_read_lock(); + in_dev = __in_dev_get(dev); + if (in_dev) + in_dev->cnf.forwarding = on; + rcu_read_unlock(); + } + read_unlock(&dev_base_lock); + + rt_cache_flush(0); +} + +static int devinet_sysctl_forward(ctl_table *ctl, int write, + struct file* filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write && *valp != val) { + if (valp == &ipv4_devconf.forwarding) + inet_forward_change(); + else if (valp != &ipv4_devconf_dflt.forwarding) + rt_cache_flush(0); + } + + return ret; +} + +int ipv4_doint_and_flush(ctl_table *ctl, int write, + struct file* filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write && *valp != val) + rt_cache_flush(0); + + return ret; +} + +int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + int *valp = table->data; + int new; + + if (!newval || !newlen) + return 0; + + if (newlen != sizeof(int)) + return -EINVAL; + + if (get_user(new, (int __user *)newval)) + return -EFAULT; + + if (new == *valp) + return 0; + + if (oldval && oldlenp) { + size_t len; + + if (get_user(len, oldlenp)) + return -EFAULT; + + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, valp, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + *valp = new; + rt_cache_flush(0); + return 1; +} + + +static struct devinet_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table devinet_vars[__NET_IPV4_CONF_MAX]; + ctl_table devinet_dev[2]; + ctl_table devinet_conf_dir[2]; + ctl_table devinet_proto_dir[2]; + ctl_table devinet_root_dir[2]; +} devinet_sysctl = { + .devinet_vars = { + { + .ctl_name = NET_IPV4_CONF_FORWARDING, + .procname = "forwarding", + .data = &ipv4_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &devinet_sysctl_forward, + }, + { + .ctl_name = NET_IPV4_CONF_MC_FORWARDING, + .procname = "mc_forwarding", + .data = &ipv4_devconf.mc_forwarding, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_ACCEPT_REDIRECTS, + .procname = "accept_redirects", + .data = &ipv4_devconf.accept_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_SECURE_REDIRECTS, + .procname = "secure_redirects", + .data = &ipv4_devconf.secure_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_SHARED_MEDIA, + .procname = "shared_media", + .data = &ipv4_devconf.shared_media, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_RP_FILTER, + .procname = "rp_filter", + .data = &ipv4_devconf.rp_filter, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_SEND_REDIRECTS, + .procname = "send_redirects", + .data = &ipv4_devconf.send_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, + .procname = "accept_source_route", + .data = &ipv4_devconf.accept_source_route, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_PROXY_ARP, + .procname = "proxy_arp", + .data = &ipv4_devconf.proxy_arp, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_MEDIUM_ID, + .procname = "medium_id", + .data = &ipv4_devconf.medium_id, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_BOOTP_RELAY, + .procname = "bootp_relay", + .data = &ipv4_devconf.bootp_relay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_LOG_MARTIANS, + .procname = "log_martians", + .data = &ipv4_devconf.log_martians, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_TAG, + .procname = "tag", + .data = &ipv4_devconf.tag, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_ARPFILTER, + .procname = "arp_filter", + .data = &ipv4_devconf.arp_filter, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_ARP_ANNOUNCE, + .procname = "arp_announce", + .data = &ipv4_devconf.arp_announce, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_ARP_IGNORE, + .procname = "arp_ignore", + .data = &ipv4_devconf.arp_ignore, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_NOXFRM, + .procname = "disable_xfrm", + .data = &ipv4_devconf.no_xfrm, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_doint_and_flush, + .strategy = &ipv4_doint_and_flush_strategy, + }, + { + .ctl_name = NET_IPV4_CONF_NOPOLICY, + .procname = "disable_policy", + .data = &ipv4_devconf.no_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_doint_and_flush, + .strategy = &ipv4_doint_and_flush_strategy, + }, + { + .ctl_name = NET_IPV4_CONF_FORCE_IGMP_VERSION, + .procname = "force_igmp_version", + .data = &ipv4_devconf.force_igmp_version, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_doint_and_flush, + .strategy = &ipv4_doint_and_flush_strategy, + }, + }, + .devinet_dev = { + { + .ctl_name = NET_PROTO_CONF_ALL, + .procname = "all", + .mode = 0555, + .child = devinet_sysctl.devinet_vars, + }, + }, + .devinet_conf_dir = { + { + .ctl_name = NET_IPV4_CONF, + .procname = "conf", + .mode = 0555, + .child = devinet_sysctl.devinet_dev, + }, + }, + .devinet_proto_dir = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = devinet_sysctl.devinet_conf_dir, + }, + }, + .devinet_root_dir = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = devinet_sysctl.devinet_proto_dir, + }, + }, +}; + +static void devinet_sysctl_register(struct in_device *in_dev, + struct ipv4_devconf *p) +{ + int i; + struct net_device *dev = in_dev ? in_dev->dev : NULL; + struct devinet_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); + char *dev_name = NULL; + + if (!t) + return; + memcpy(t, &devinet_sysctl, sizeof(*t)); + for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { + t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; + t->devinet_vars[i].de = NULL; + } + + if (dev) { + dev_name = dev->name; + t->devinet_dev[0].ctl_name = dev->ifindex; + } else { + dev_name = "default"; + t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; + } + + /* + * Make a copy of dev_name, because '.procname' is regarded as const + * by sysctl and we wouldn't want anyone to change it under our feet + * (see SIOCSIFNAME). + */ + dev_name = net_sysctl_strdup(dev_name); + if (!dev_name) + goto free; + + t->devinet_dev[0].procname = dev_name; + t->devinet_dev[0].child = t->devinet_vars; + t->devinet_dev[0].de = NULL; + t->devinet_conf_dir[0].child = t->devinet_dev; + t->devinet_conf_dir[0].de = NULL; + t->devinet_proto_dir[0].child = t->devinet_conf_dir; + t->devinet_proto_dir[0].de = NULL; + t->devinet_root_dir[0].child = t->devinet_proto_dir; + t->devinet_root_dir[0].de = NULL; + + t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0); + if (!t->sysctl_header) + goto free_procname; + + p->sysctl = t; + return; + + /* error path */ + free_procname: + kfree(dev_name); + free: + kfree(t); + return; +} + +static void devinet_sysctl_unregister(struct ipv4_devconf *p) +{ + if (p->sysctl) { + struct devinet_sysctl_table *t = p->sysctl; + p->sysctl = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t->devinet_dev[0].procname); + kfree(t); + } +} +#endif + +void __init devinet_init(void) +{ + register_gifconf(PF_INET, inet_gifconf); + register_netdevice_notifier(&ip_netdev_notifier); + rtnetlink_links[PF_INET] = inet_rtnetlink_table; +#ifdef CONFIG_SYSCTL + devinet_sysctl.sysctl_header = + register_sysctl_table(devinet_sysctl.devinet_root_dir, 0); + devinet_sysctl_register(NULL, &ipv4_devconf_dflt); +#endif +} + +EXPORT_SYMBOL(devinet_ioctl); +EXPORT_SYMBOL(in_dev_finish_destroy); +EXPORT_SYMBOL(inet_select_addr); +EXPORT_SYMBOL(inetdev_by_index); +EXPORT_SYMBOL(register_inetaddr_notifier); +EXPORT_SYMBOL(unregister_inetaddr_notifier); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c new file mode 100644 index 000000000000..053a883247ba --- /dev/null +++ b/net/ipv4/esp4.c @@ -0,0 +1,510 @@ +#include <linux/config.h> +#include <linux/module.h> +#include <net/ip.h> +#include <net/xfrm.h> +#include <net/esp.h> +#include <asm/scatterlist.h> +#include <linux/crypto.h> +#include <linux/pfkeyv2.h> +#include <linux/random.h> +#include <net/icmp.h> +#include <net/udp.h> + +/* decapsulation data for use when post-processing */ +struct esp_decap_data { + xfrm_address_t saddr; + __u16 sport; + __u8 proto; +}; + +static int esp_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + struct iphdr *top_iph; + struct ip_esp_hdr *esph; + struct crypto_tfm *tfm; + struct esp_data *esp; + struct sk_buff *trailer; + int blksize; + int clen; + int alen; + int nfrags; + + /* Strip IP+ESP header. */ + __skb_pull(skb, skb->h.raw - skb->data); + /* Now skb is pure payload to encrypt */ + + err = -ENOMEM; + + /* Round to block size */ + clen = skb->len; + + esp = x->data; + alen = esp->auth.icv_trunc_len; + tfm = esp->conf.tfm; + blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3; + clen = (clen + 2 + blksize-1)&~(blksize-1); + if (esp->conf.padlen) + clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1); + + if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0) + goto error; + + /* Fill padding... */ + do { + int i; + for (i=0; i<clen-skb->len - 2; i++) + *(u8*)(trailer->tail + i) = i+1; + } while (0); + *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2; + pskb_put(skb, trailer, clen - skb->len); + + __skb_push(skb, skb->data - skb->nh.raw); + top_iph = skb->nh.iph; + esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4); + top_iph->tot_len = htons(skb->len + alen); + *(u8*)(trailer->tail - 1) = top_iph->protocol; + + /* this is non-NULL only with UDP Encapsulation */ + if (x->encap) { + struct xfrm_encap_tmpl *encap = x->encap; + struct udphdr *uh; + u32 *udpdata32; + + uh = (struct udphdr *)esph; + uh->source = encap->encap_sport; + uh->dest = encap->encap_dport; + uh->len = htons(skb->len + alen - top_iph->ihl*4); + uh->check = 0; + + switch (encap->encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + esph = (struct ip_esp_hdr *)(uh + 1); + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + udpdata32 = (u32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + esph = (struct ip_esp_hdr *)(udpdata32 + 2); + break; + } + + top_iph->protocol = IPPROTO_UDP; + } else + top_iph->protocol = IPPROTO_ESP; + + esph->spi = x->id.spi; + esph->seq_no = htonl(++x->replay.oseq); + + if (esp->conf.ivlen) + crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + + do { + struct scatterlist *sg = &esp->sgbuf[0]; + + if (unlikely(nfrags > ESP_NUM_FAST_SG)) { + sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); + if (!sg) + goto error; + } + skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen); + crypto_cipher_encrypt(tfm, sg, sg, clen); + if (unlikely(sg != &esp->sgbuf[0])) + kfree(sg); + } while (0); + + if (esp->conf.ivlen) { + memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + } + + if (esp->auth.icv_full_len) { + esp->auth.icv(esp, skb, (u8*)esph-skb->data, + sizeof(struct ip_esp_hdr) + esp->conf.ivlen+clen, trailer->tail); + pskb_put(skb, trailer, alen); + } + + ip_send_check(top_iph); + + err = 0; + +error: + return err; +} + +/* + * Note: detecting truncated vs. non-truncated authentication data is very + * expensive, so we only support truncated data, which is the recommended + * and common case. + */ +static int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + struct iphdr *iph; + struct ip_esp_hdr *esph; + struct esp_data *esp = x->data; + struct sk_buff *trailer; + int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm); + int alen = esp->auth.icv_trunc_len; + int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; + int nfrags; + int encap_len = 0; + + if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) + goto out; + + if (elen <= 0 || (elen & (blksize-1))) + goto out; + + /* If integrity check is required, do this. */ + if (esp->auth.icv_full_len) { + u8 sum[esp->auth.icv_full_len]; + u8 sum1[alen]; + + esp->auth.icv(esp, skb, 0, skb->len-alen, sum); + + if (skb_copy_bits(skb, skb->len-alen, sum1, alen)) + BUG(); + + if (unlikely(memcmp(sum, sum1, alen))) { + x->stats.integrity_failed++; + goto out; + } + } + + if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + + skb->ip_summed = CHECKSUM_NONE; + + esph = (struct ip_esp_hdr*)skb->data; + iph = skb->nh.iph; + + /* Get ivec. This can be wrong, check against another impls. */ + if (esp->conf.ivlen) + crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm)); + + { + u8 nexthdr[2]; + struct scatterlist *sg = &esp->sgbuf[0]; + u8 workbuf[60]; + int padlen; + + if (unlikely(nfrags > ESP_NUM_FAST_SG)) { + sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); + if (!sg) + goto out; + } + skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen); + crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen); + if (unlikely(sg != &esp->sgbuf[0])) + kfree(sg); + + if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2)) + BUG(); + + padlen = nexthdr[0]; + if (padlen+2 >= elen) + goto out; + + /* ... check padding bits here. Silly. :-) */ + + if (x->encap && decap && decap->decap_type) { + struct esp_decap_data *encap_data; + struct udphdr *uh = (struct udphdr *) (iph+1); + + encap_data = (struct esp_decap_data *) (decap->decap_data); + encap_data->proto = 0; + + switch (decap->decap_type) { + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + encap_data->proto = AF_INET; + encap_data->saddr.a4 = iph->saddr; + encap_data->sport = uh->source; + encap_len = (void*)esph - (void*)uh; + break; + + default: + goto out; + } + } + + iph->protocol = nexthdr[1]; + pskb_trim(skb, skb->len - alen - padlen - 2); + memcpy(workbuf, skb->nh.raw, iph->ihl*4); + skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen); + skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen; + memcpy(skb->nh.raw, workbuf, iph->ihl*4); + skb->nh.iph->tot_len = htons(skb->len); + } + + return 0; + +out: + return -EINVAL; +} + +static int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + + if (x->encap) { + struct xfrm_encap_tmpl *encap; + struct esp_decap_data *decap_data; + + encap = x->encap; + decap_data = (struct esp_decap_data *)(decap->decap_data); + + /* first, make sure that the decap type == the encap type */ + if (encap->encap_type != decap->decap_type) + return -EINVAL; + + switch (encap->encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + /* + * 1) if the NAT-T peer's IP or port changed then + * advertize the change to the keying daemon. + * This is an inbound SA, so just compare + * SRC ports. + */ + if (decap_data->proto == AF_INET && + (decap_data->saddr.a4 != x->props.saddr.a4 || + decap_data->sport != encap->encap_sport)) { + xfrm_address_t ipaddr; + + ipaddr.a4 = decap_data->saddr.a4; + km_new_mapping(x, &ipaddr, decap_data->sport); + + /* XXX: perhaps add an extra + * policy check here, to see + * if we should allow or + * reject a packet from a + * different source + * address/port. + */ + } + + /* + * 2) ignore UDP/TCP checksums in case + * of NAT-T in Transport Mode, or + * perform other post-processing fixes + * as per * draft-ietf-ipsec-udp-encaps-06, + * section 3.1.2 + */ + if (!x->props.mode) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + break; + } + } + return 0; +} + +static u32 esp4_get_max_size(struct xfrm_state *x, int mtu) +{ + struct esp_data *esp = x->data; + u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm); + + if (x->props.mode) { + mtu = (mtu + 2 + blksize-1)&~(blksize-1); + } else { + /* The worst case. */ + mtu += 2 + blksize; + } + if (esp->conf.padlen) + mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1); + + return mtu + x->props.header_len + esp->auth.icv_trunc_len; +} + +static void esp4_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr*)skb->data; + struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2)); + struct xfrm_state *x; + + if (skb->h.icmph->type != ICMP_DEST_UNREACH || + skb->h.icmph->code != ICMP_FRAG_NEEDED) + return; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); + if (!x) + return; + NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", + ntohl(esph->spi), ntohl(iph->daddr))); + xfrm_state_put(x); +} + +static void esp_destroy(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + + if (!esp) + return; + + if (esp->conf.tfm) { + crypto_free_tfm(esp->conf.tfm); + esp->conf.tfm = NULL; + } + if (esp->conf.ivec) { + kfree(esp->conf.ivec); + esp->conf.ivec = NULL; + } + if (esp->auth.tfm) { + crypto_free_tfm(esp->auth.tfm); + esp->auth.tfm = NULL; + } + if (esp->auth.work_icv) { + kfree(esp->auth.work_icv); + esp->auth.work_icv = NULL; + } + kfree(esp); +} + +static int esp_init_state(struct xfrm_state *x, void *args) +{ + struct esp_data *esp = NULL; + + /* null auth and encryption can have zero length keys */ + if (x->aalg) { + if (x->aalg->alg_key_len > 512) + goto error; + } + if (x->ealg == NULL) + goto error; + + esp = kmalloc(sizeof(*esp), GFP_KERNEL); + if (esp == NULL) + return -ENOMEM; + + memset(esp, 0, sizeof(*esp)); + + if (x->aalg) { + struct xfrm_algo_desc *aalg_desc; + + esp->auth.key = x->aalg->alg_key; + esp->auth.key_len = (x->aalg->alg_key_len+7)/8; + esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0); + if (esp->auth.tfm == NULL) + goto error; + esp->auth.icv = esp_hmac_digest; + + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_tfm_alg_digestsize(esp->auth.tfm)) { + NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_tfm_alg_digestsize(esp->auth.tfm), + aalg_desc->uinfo.auth.icv_fullbits/8)); + goto error; + } + + esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + + esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL); + if (!esp->auth.work_icv) + goto error; + } + esp->conf.key = x->ealg->alg_key; + esp->conf.key_len = (x->ealg->alg_key_len+7)/8; + if (x->props.ealgo == SADB_EALG_NULL) + esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB); + else + esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC); + if (esp->conf.tfm == NULL) + goto error; + esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm); + esp->conf.padlen = 0; + if (esp->conf.ivlen) { + esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL); + if (unlikely(esp->conf.ivec == NULL)) + goto error; + get_random_bytes(esp->conf.ivec, esp->conf.ivlen); + } + if (crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len)) + goto error; + x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen; + if (x->props.mode) + x->props.header_len += sizeof(struct iphdr); + if (x->encap) { + struct xfrm_encap_tmpl *encap = x->encap; + + switch (encap->encap_type) { + default: + goto error; + case UDP_ENCAP_ESPINUDP: + x->props.header_len += sizeof(struct udphdr); + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); + break; + } + } + x->data = esp; + x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len; + return 0; + +error: + x->data = esp; + esp_destroy(x); + x->data = NULL; + return -EINVAL; +} + +static struct xfrm_type esp_type = +{ + .description = "ESP4", + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, + .init_state = esp_init_state, + .destructor = esp_destroy, + .get_max_size = esp4_get_max_size, + .input = esp_input, + .post_input = esp_post_input, + .output = esp_output +}; + +static struct net_protocol esp4_protocol = { + .handler = xfrm4_rcv, + .err_handler = esp4_err, + .no_policy = 1, +}; + +static int __init esp4_init(void) +{ + struct xfrm_decap_state decap; + + if (sizeof(struct esp_decap_data) < + sizeof(decap.decap_data)) { + extern void decap_data_too_small(void); + + decap_data_too_small(); + } + + if (xfrm_register_type(&esp_type, AF_INET) < 0) { + printk(KERN_INFO "ip esp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { + printk(KERN_INFO "ip esp init: can't add protocol\n"); + xfrm_unregister_type(&esp_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit esp4_fini(void) +{ + if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) + printk(KERN_INFO "ip esp close: can't remove protocol\n"); + if (xfrm_unregister_type(&esp_type, AF_INET) < 0) + printk(KERN_INFO "ip esp close: can't remove xfrm type\n"); +} + +module_init(esp4_init); +module_exit(esp4_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c new file mode 100644 index 000000000000..563e7d612706 --- /dev/null +++ b/net/ipv4/fib_frontend.c @@ -0,0 +1,611 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: FIB frontend. + * + * Version: $Id: fib_frontend.c,v 1.26 2001/10/31 21:55:54 davem Exp $ + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/init.h> + +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <net/arp.h> +#include <net/ip_fib.h> + +#define FFprint(a...) printk(KERN_DEBUG a) + +#ifndef CONFIG_IP_MULTIPLE_TABLES + +#define RT_TABLE_MIN RT_TABLE_MAIN + +struct fib_table *ip_fib_local_table; +struct fib_table *ip_fib_main_table; + +#else + +#define RT_TABLE_MIN 1 + +struct fib_table *fib_tables[RT_TABLE_MAX+1]; + +struct fib_table *__fib_new_table(int id) +{ + struct fib_table *tb; + + tb = fib_hash_init(id); + if (!tb) + return NULL; + fib_tables[id] = tb; + return tb; +} + + +#endif /* CONFIG_IP_MULTIPLE_TABLES */ + + +static void fib_flush(void) +{ + int flushed = 0; +#ifdef CONFIG_IP_MULTIPLE_TABLES + struct fib_table *tb; + int id; + + for (id = RT_TABLE_MAX; id>0; id--) { + if ((tb = fib_get_table(id))==NULL) + continue; + flushed += tb->tb_flush(tb); + } +#else /* CONFIG_IP_MULTIPLE_TABLES */ + flushed += ip_fib_main_table->tb_flush(ip_fib_main_table); + flushed += ip_fib_local_table->tb_flush(ip_fib_local_table); +#endif /* CONFIG_IP_MULTIPLE_TABLES */ + + if (flushed) + rt_cache_flush(-1); +} + +/* + * Find the first device with a given source address. + */ + +struct net_device * ip_dev_find(u32 addr) +{ + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; + struct fib_result res; + struct net_device *dev = NULL; + +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + if (!ip_fib_local_table || + ip_fib_local_table->tb_lookup(ip_fib_local_table, &fl, &res)) + return NULL; + if (res.type != RTN_LOCAL) + goto out; + dev = FIB_RES_DEV(res); + + if (dev) + dev_hold(dev); +out: + fib_res_put(&res); + return dev; +} + +unsigned inet_addr_type(u32 addr) +{ + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; + struct fib_result res; + unsigned ret = RTN_BROADCAST; + + if (ZERONET(addr) || BADCLASS(addr)) + return RTN_BROADCAST; + if (MULTICAST(addr)) + return RTN_MULTICAST; + +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + if (ip_fib_local_table) { + ret = RTN_UNICAST; + if (!ip_fib_local_table->tb_lookup(ip_fib_local_table, + &fl, &res)) { + ret = res.type; + fib_res_put(&res); + } + } + return ret; +} + +/* Given (packet source, input interface) and optional (dst, oif, tos): + - (main) check, that source is valid i.e. not broadcast or our local + address. + - figure out what "logical" interface this packet arrived + and calculate "specific destination" address. + - check, that packet arrived from expected physical interface. + */ + +int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, + struct net_device *dev, u32 *spec_dst, u32 *itag) +{ + struct in_device *in_dev; + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = src, + .saddr = dst, + .tos = tos } }, + .iif = oif }; + struct fib_result res; + int no_addr, rpf; + int ret; + + no_addr = rpf = 0; + rcu_read_lock(); + in_dev = __in_dev_get(dev); + if (in_dev) { + no_addr = in_dev->ifa_list == NULL; + rpf = IN_DEV_RPFILTER(in_dev); + } + rcu_read_unlock(); + + if (in_dev == NULL) + goto e_inval; + + if (fib_lookup(&fl, &res)) + goto last_resort; + if (res.type != RTN_UNICAST) + goto e_inval_res; + *spec_dst = FIB_RES_PREFSRC(res); + fib_combine_itag(itag, &res); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) +#else + if (FIB_RES_DEV(res) == dev) +#endif + { + ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + fib_res_put(&res); + return ret; + } + fib_res_put(&res); + if (no_addr) + goto last_resort; + if (rpf) + goto e_inval; + fl.oif = dev->ifindex; + + ret = 0; + if (fib_lookup(&fl, &res) == 0) { + if (res.type == RTN_UNICAST) { + *spec_dst = FIB_RES_PREFSRC(res); + ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + } + fib_res_put(&res); + } + return ret; + +last_resort: + if (rpf) + goto e_inval; + *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + *itag = 0; + return 0; + +e_inval_res: + fib_res_put(&res); +e_inval: + return -EINVAL; +} + +#ifndef CONFIG_IP_NOSIOCRT + +/* + * Handle IP routing ioctl calls. These are used to manipulate the routing tables + */ + +int ip_rt_ioctl(unsigned int cmd, void __user *arg) +{ + int err; + struct kern_rta rta; + struct rtentry r; + struct { + struct nlmsghdr nlh; + struct rtmsg rtm; + } req; + + switch (cmd) { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (copy_from_user(&r, arg, sizeof(struct rtentry))) + return -EFAULT; + rtnl_lock(); + err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, &r); + if (err == 0) { + if (cmd == SIOCDELRT) { + struct fib_table *tb = fib_get_table(req.rtm.rtm_table); + err = -ESRCH; + if (tb) + err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL); + } else { + struct fib_table *tb = fib_new_table(req.rtm.rtm_table); + err = -ENOBUFS; + if (tb) + err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); + } + if (rta.rta_mx) + kfree(rta.rta_mx); + } + rtnl_unlock(); + return err; + } + return -EINVAL; +} + +#else + +int ip_rt_ioctl(unsigned int cmd, void *arg) +{ + return -EINVAL; +} + +#endif + +static int inet_check_attr(struct rtmsg *r, struct rtattr **rta) +{ + int i; + + for (i=1; i<=RTA_MAX; i++) { + struct rtattr *attr = rta[i-1]; + if (attr) { + if (RTA_PAYLOAD(attr) < 4) + return -EINVAL; + if (i != RTA_MULTIPATH && i != RTA_METRICS) + rta[i-1] = (struct rtattr*)RTA_DATA(attr); + } + } + return 0; +} + +int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct fib_table * tb; + struct rtattr **rta = arg; + struct rtmsg *r = NLMSG_DATA(nlh); + + if (inet_check_attr(r, rta)) + return -EINVAL; + + tb = fib_get_table(r->rtm_table); + if (tb) + return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb)); + return -ESRCH; +} + +int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct fib_table * tb; + struct rtattr **rta = arg; + struct rtmsg *r = NLMSG_DATA(nlh); + + if (inet_check_attr(r, rta)) + return -EINVAL; + + tb = fib_new_table(r->rtm_table); + if (tb) + return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb)); + return -ENOBUFS; +} + +int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct fib_table *tb; + + if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) && + ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED) + return ip_rt_dump(skb, cb); + + s_t = cb->args[0]; + if (s_t == 0) + s_t = cb->args[0] = RT_TABLE_MIN; + + for (t=s_t; t<=RT_TABLE_MAX; t++) { + if (t < s_t) continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + if ((tb = fib_get_table(t))==NULL) + continue; + if (tb->tb_dump(tb, skb, cb) < 0) + break; + } + + cb->args[0] = t; + + return skb->len; +} + +/* Prepare and feed intra-kernel routing request. + Really, it should be netlink message, but :-( netlink + can be not configured, so that we feed it directly + to fib engine. It is legal, because all events occur + only when netlink is already locked. + */ + +static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa) +{ + struct fib_table * tb; + struct { + struct nlmsghdr nlh; + struct rtmsg rtm; + } req; + struct kern_rta rta; + + memset(&req.rtm, 0, sizeof(req.rtm)); + memset(&rta, 0, sizeof(rta)); + + if (type == RTN_UNICAST) + tb = fib_new_table(RT_TABLE_MAIN); + else + tb = fib_new_table(RT_TABLE_LOCAL); + + if (tb == NULL) + return; + + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = cmd; + req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND; + req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = 0; + + req.rtm.rtm_dst_len = dst_len; + req.rtm.rtm_table = tb->tb_id; + req.rtm.rtm_protocol = RTPROT_KERNEL; + req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST); + req.rtm.rtm_type = type; + + rta.rta_dst = &dst; + rta.rta_prefsrc = &ifa->ifa_local; + rta.rta_oif = &ifa->ifa_dev->dev->ifindex; + + if (cmd == RTM_NEWROUTE) + tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); + else + tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL); +} + +static void fib_add_ifaddr(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct net_device *dev = in_dev->dev; + struct in_ifaddr *prim = ifa; + u32 mask = ifa->ifa_mask; + u32 addr = ifa->ifa_local; + u32 prefix = ifa->ifa_address&mask; + + if (ifa->ifa_flags&IFA_F_SECONDARY) { + prim = inet_ifa_byprefix(in_dev, prefix, mask); + if (prim == NULL) { + printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n"); + return; + } + } + + fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); + + if (!(dev->flags&IFF_UP)) + return; + + /* Add broadcast address, if it is explicitly assigned. */ + if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF) + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + + if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && + (prefix != addr || ifa->ifa_prefixlen < 32)) { + fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : + RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); + + /* Add network specific broadcasts, when it takes a sense */ + if (ifa->ifa_prefixlen < 31) { + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); + } + } +} + +static void fib_del_ifaddr(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct net_device *dev = in_dev->dev; + struct in_ifaddr *ifa1; + struct in_ifaddr *prim = ifa; + u32 brd = ifa->ifa_address|~ifa->ifa_mask; + u32 any = ifa->ifa_address&ifa->ifa_mask; +#define LOCAL_OK 1 +#define BRD_OK 2 +#define BRD0_OK 4 +#define BRD1_OK 8 + unsigned ok = 0; + + if (!(ifa->ifa_flags&IFA_F_SECONDARY)) + fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : + RTN_UNICAST, any, ifa->ifa_prefixlen, prim); + else { + prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); + if (prim == NULL) { + printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n"); + return; + } + } + + /* Deletion is more complicated than add. + We should take care of not to delete too much :-) + + Scan address list to be sure that addresses are really gone. + */ + + for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + if (ifa->ifa_local == ifa1->ifa_local) + ok |= LOCAL_OK; + if (ifa->ifa_broadcast == ifa1->ifa_broadcast) + ok |= BRD_OK; + if (brd == ifa1->ifa_broadcast) + ok |= BRD1_OK; + if (any == ifa1->ifa_broadcast) + ok |= BRD0_OK; + } + + if (!(ok&BRD_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + if (!(ok&BRD1_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); + if (!(ok&BRD0_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + if (!(ok&LOCAL_OK)) { + fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); + + /* Check, that this local address finally disappeared. */ + if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) { + /* And the last, but not the least thing. + We must flush stray FIB entries. + + First of all, we scan fib_info list searching + for stray nexthop entries, then ignite fib_flush. + */ + if (fib_sync_down(ifa->ifa_local, NULL, 0)) + fib_flush(); + } + } +#undef LOCAL_OK +#undef BRD_OK +#undef BRD0_OK +#undef BRD1_OK +} + +static void fib_disable_ip(struct net_device *dev, int force) +{ + if (fib_sync_down(0, dev, force)) + fib_flush(); + rt_cache_flush(0); + arp_ifdown(dev); +} + +static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr*)ptr; + + switch (event) { + case NETDEV_UP: + fib_add_ifaddr(ifa); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_sync_up(ifa->ifa_dev->dev); +#endif + rt_cache_flush(-1); + break; + case NETDEV_DOWN: + fib_del_ifaddr(ifa); + if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) { + /* Last address was deleted from this interface. + Disable IP. + */ + fib_disable_ip(ifa->ifa_dev->dev, 1); + } else { + rt_cache_flush(-1); + } + break; + } + return NOTIFY_DONE; +} + +static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct in_device *in_dev = __in_dev_get(dev); + + if (event == NETDEV_UNREGISTER) { + fib_disable_ip(dev, 2); + return NOTIFY_DONE; + } + + if (!in_dev) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + for_ifa(in_dev) { + fib_add_ifaddr(ifa); + } endfor_ifa(in_dev); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_sync_up(dev); +#endif + rt_cache_flush(-1); + break; + case NETDEV_DOWN: + fib_disable_ip(dev, 0); + break; + case NETDEV_CHANGEMTU: + case NETDEV_CHANGE: + rt_cache_flush(0); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block fib_inetaddr_notifier = { + .notifier_call =fib_inetaddr_event, +}; + +static struct notifier_block fib_netdev_notifier = { + .notifier_call =fib_netdev_event, +}; + +void __init ip_fib_init(void) +{ +#ifndef CONFIG_IP_MULTIPLE_TABLES + ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); + ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); +#else + fib_rules_init(); +#endif + + register_netdevice_notifier(&fib_netdev_notifier); + register_inetaddr_notifier(&fib_inetaddr_notifier); +} + +EXPORT_SYMBOL(inet_addr_type); +EXPORT_SYMBOL(ip_dev_find); +EXPORT_SYMBOL(ip_rt_ioctl); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c new file mode 100644 index 000000000000..6506dcc01b46 --- /dev/null +++ b/net/ipv4/fib_hash.c @@ -0,0 +1,1086 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 FIB: lookup engine and maintenance routines. + * + * Version: $Id: fib_hash.c,v 1.13 2001/10/31 21:55:54 davem Exp $ + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/init.h> + +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <net/ip_fib.h> + +#include "fib_lookup.h" + +static kmem_cache_t *fn_hash_kmem; +static kmem_cache_t *fn_alias_kmem; + +struct fib_node { + struct hlist_node fn_hash; + struct list_head fn_alias; + u32 fn_key; +}; + +struct fn_zone { + struct fn_zone *fz_next; /* Next not empty zone */ + struct hlist_head *fz_hash; /* Hash table pointer */ + int fz_nent; /* Number of entries */ + + int fz_divisor; /* Hash divisor */ + u32 fz_hashmask; /* (fz_divisor - 1) */ +#define FZ_HASHMASK(fz) ((fz)->fz_hashmask) + + int fz_order; /* Zone order */ + u32 fz_mask; +#define FZ_MASK(fz) ((fz)->fz_mask) +}; + +/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask + * can be cheaper than memory lookup, so that FZ_* macros are used. + */ + +struct fn_hash { + struct fn_zone *fn_zones[33]; + struct fn_zone *fn_zone_list; +}; + +static inline u32 fn_hash(u32 key, struct fn_zone *fz) +{ + u32 h = ntohl(key)>>(32 - fz->fz_order); + h ^= (h>>20); + h ^= (h>>10); + h ^= (h>>5); + h &= FZ_HASHMASK(fz); + return h; +} + +static inline u32 fz_key(u32 dst, struct fn_zone *fz) +{ + return dst & FZ_MASK(fz); +} + +static DEFINE_RWLOCK(fib_hash_lock); +static unsigned int fib_hash_genid; + +#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head)) + +static struct hlist_head *fz_hash_alloc(int divisor) +{ + unsigned long size = divisor * sizeof(struct hlist_head); + + if (size <= PAGE_SIZE) { + return kmalloc(size, GFP_KERNEL); + } else { + return (struct hlist_head *) + __get_free_pages(GFP_KERNEL, get_order(size)); + } +} + +/* The fib hash lock must be held when this is called. */ +static inline void fn_rebuild_zone(struct fn_zone *fz, + struct hlist_head *old_ht, + int old_divisor) +{ + int i; + + for (i = 0; i < old_divisor; i++) { + struct hlist_node *node, *n; + struct fib_node *f; + + hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { + struct hlist_head *new_head; + + hlist_del(&f->fn_hash); + + new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; + hlist_add_head(&f->fn_hash, new_head); + } + } +} + +static void fz_hash_free(struct hlist_head *hash, int divisor) +{ + unsigned long size = divisor * sizeof(struct hlist_head); + + if (size <= PAGE_SIZE) + kfree(hash); + else + free_pages((unsigned long)hash, get_order(size)); +} + +static void fn_rehash_zone(struct fn_zone *fz) +{ + struct hlist_head *ht, *old_ht; + int old_divisor, new_divisor; + u32 new_hashmask; + + old_divisor = fz->fz_divisor; + + switch (old_divisor) { + case 16: + new_divisor = 256; + break; + case 256: + new_divisor = 1024; + break; + default: + if ((old_divisor << 1) > FZ_MAX_DIVISOR) { + printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor); + return; + } + new_divisor = (old_divisor << 1); + break; + } + + new_hashmask = (new_divisor - 1); + +#if RT_CACHE_DEBUG >= 2 + printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor); +#endif + + ht = fz_hash_alloc(new_divisor); + + if (ht) { + memset(ht, 0, new_divisor * sizeof(struct hlist_head)); + + write_lock_bh(&fib_hash_lock); + old_ht = fz->fz_hash; + fz->fz_hash = ht; + fz->fz_hashmask = new_hashmask; + fz->fz_divisor = new_divisor; + fn_rebuild_zone(fz, old_ht, old_divisor); + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + + fz_hash_free(old_ht, old_divisor); + } +} + +static inline void fn_free_node(struct fib_node * f) +{ + kmem_cache_free(fn_hash_kmem, f); +} + +static inline void fn_free_alias(struct fib_alias *fa) +{ + fib_release_info(fa->fa_info); + kmem_cache_free(fn_alias_kmem, fa); +} + +static struct fn_zone * +fn_new_zone(struct fn_hash *table, int z) +{ + int i; + struct fn_zone *fz = kmalloc(sizeof(struct fn_zone), GFP_KERNEL); + if (!fz) + return NULL; + + memset(fz, 0, sizeof(struct fn_zone)); + if (z) { + fz->fz_divisor = 16; + } else { + fz->fz_divisor = 1; + } + fz->fz_hashmask = (fz->fz_divisor - 1); + fz->fz_hash = fz_hash_alloc(fz->fz_divisor); + if (!fz->fz_hash) { + kfree(fz); + return NULL; + } + memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *)); + fz->fz_order = z; + fz->fz_mask = inet_make_mask(z); + + /* Find the first not empty zone with more specific mask */ + for (i=z+1; i<=32; i++) + if (table->fn_zones[i]) + break; + write_lock_bh(&fib_hash_lock); + if (i>32) { + /* No more specific masks, we are the first. */ + fz->fz_next = table->fn_zone_list; + table->fn_zone_list = fz; + } else { + fz->fz_next = table->fn_zones[i]->fz_next; + table->fn_zones[i]->fz_next = fz; + } + table->fn_zones[z] = fz; + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + return fz; +} + +static int +fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +{ + int err; + struct fn_zone *fz; + struct fn_hash *t = (struct fn_hash*)tb->tb_data; + + read_lock(&fib_hash_lock); + for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { + struct hlist_head *head; + struct hlist_node *node; + struct fib_node *f; + u32 k = fz_key(flp->fl4_dst, fz); + + head = &fz->fz_hash[fn_hash(k, fz)]; + hlist_for_each_entry(f, node, head, fn_hash) { + if (f->fn_key != k) + continue; + + err = fib_semantic_match(&f->fn_alias, + flp, res, + f->fn_key, fz->fz_mask, + fz->fz_order); + if (err <= 0) + goto out; + } + } + err = 1; +out: + read_unlock(&fib_hash_lock); + return err; +} + +static int fn_hash_last_dflt=-1; + +static void +fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +{ + int order, last_idx; + struct hlist_node *node; + struct fib_node *f; + struct fib_info *fi = NULL; + struct fib_info *last_resort; + struct fn_hash *t = (struct fn_hash*)tb->tb_data; + struct fn_zone *fz = t->fn_zones[0]; + + if (fz == NULL) + return; + + last_idx = -1; + last_resort = NULL; + order = -1; + + read_lock(&fib_hash_lock); + hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { + struct fib_alias *fa; + + list_for_each_entry(fa, &f->fn_alias, fa_list) { + struct fib_info *next_fi = fa->fa_info; + + if (fa->fa_scope != res->scope || + fa->fa_type != RTN_UNICAST) + continue; + + if (next_fi->fib_priority > res->fi->fib_priority) + break; + if (!next_fi->fib_nh[0].nh_gw || + next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) + continue; + fa->fa_state |= FA_S_ACCESSED; + + if (fi == NULL) { + if (next_fi != res->fi) + break; + } else if (!fib_detect_death(fi, order, &last_resort, + &last_idx, &fn_hash_last_dflt)) { + if (res->fi) + fib_info_put(res->fi); + res->fi = fi; + atomic_inc(&fi->fib_clntref); + fn_hash_last_dflt = order; + goto out; + } + fi = next_fi; + order++; + } + } + + if (order <= 0 || fi == NULL) { + fn_hash_last_dflt = -1; + goto out; + } + + if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) { + if (res->fi) + fib_info_put(res->fi); + res->fi = fi; + atomic_inc(&fi->fib_clntref); + fn_hash_last_dflt = order; + goto out; + } + + if (last_idx >= 0) { + if (res->fi) + fib_info_put(res->fi); + res->fi = last_resort; + if (last_resort) + atomic_inc(&last_resort->fib_clntref); + } + fn_hash_last_dflt = last_idx; +out: + read_unlock(&fib_hash_lock); +} + +/* Insert node F to FZ. */ +static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) +{ + struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; + + hlist_add_head(&f->fn_hash, head); +} + +/* Return the node in FZ matching KEY. */ +static struct fib_node *fib_find_node(struct fn_zone *fz, u32 key) +{ + struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)]; + struct hlist_node *node; + struct fib_node *f; + + hlist_for_each_entry(f, node, head, fn_hash) { + if (f->fn_key == key) + return f; + } + + return NULL; +} + +static int +fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, + struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct fn_hash *table = (struct fn_hash *) tb->tb_data; + struct fib_node *new_f, *f; + struct fib_alias *fa, *new_fa; + struct fn_zone *fz; + struct fib_info *fi; + int z = r->rtm_dst_len; + int type = r->rtm_type; + u8 tos = r->rtm_tos; + u32 key; + int err; + + if (z > 32) + return -EINVAL; + fz = table->fn_zones[z]; + if (!fz && !(fz = fn_new_zone(table, z))) + return -ENOBUFS; + + key = 0; + if (rta->rta_dst) { + u32 dst; + memcpy(&dst, rta->rta_dst, 4); + if (dst & ~FZ_MASK(fz)) + return -EINVAL; + key = fz_key(dst, fz); + } + + if ((fi = fib_create_info(r, rta, n, &err)) == NULL) + return err; + + if (fz->fz_nent > (fz->fz_divisor<<1) && + fz->fz_divisor < FZ_MAX_DIVISOR && + (z==32 || (1<<z) > fz->fz_divisor)) + fn_rehash_zone(fz); + + f = fib_find_node(fz, key); + + if (!f) + fa = NULL; + else + fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority); + + /* Now fa, if non-NULL, points to the first fib alias + * with the same keys [prefix,tos,priority], if such key already + * exists or to the node before which we will insert new one. + * + * If fa is NULL, we will need to allocate a new one and + * insert to the head of f. + * + * If f is NULL, no fib node matched the destination key + * and we need to allocate a new one of those as well. + */ + + if (fa && fa->fa_tos == tos && + fa->fa_info->fib_priority == fi->fib_priority) { + struct fib_alias *fa_orig; + + err = -EEXIST; + if (n->nlmsg_flags & NLM_F_EXCL) + goto out; + + if (n->nlmsg_flags & NLM_F_REPLACE) { + struct fib_info *fi_drop; + u8 state; + + write_lock_bh(&fib_hash_lock); + fi_drop = fa->fa_info; + fa->fa_info = fi; + fa->fa_type = type; + fa->fa_scope = r->rtm_scope; + state = fa->fa_state; + fa->fa_state &= ~FA_S_ACCESSED; + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + + fib_release_info(fi_drop); + if (state & FA_S_ACCESSED) + rt_cache_flush(-1); + return 0; + } + + /* Error if we find a perfect match which + * uses the same scope, type, and nexthop + * information. + */ + fa_orig = fa; + fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); + list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { + if (fa->fa_tos != tos) + break; + if (fa->fa_info->fib_priority != fi->fib_priority) + break; + if (fa->fa_type == type && + fa->fa_scope == r->rtm_scope && + fa->fa_info == fi) + goto out; + } + if (!(n->nlmsg_flags & NLM_F_APPEND)) + fa = fa_orig; + } + + err = -ENOENT; + if (!(n->nlmsg_flags&NLM_F_CREATE)) + goto out; + + err = -ENOBUFS; + new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); + if (new_fa == NULL) + goto out; + + new_f = NULL; + if (!f) { + new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL); + if (new_f == NULL) + goto out_free_new_fa; + + INIT_HLIST_NODE(&new_f->fn_hash); + INIT_LIST_HEAD(&new_f->fn_alias); + new_f->fn_key = key; + f = new_f; + } + + new_fa->fa_info = fi; + new_fa->fa_tos = tos; + new_fa->fa_type = type; + new_fa->fa_scope = r->rtm_scope; + new_fa->fa_state = 0; + + /* + * Insert new entry to the list. + */ + + write_lock_bh(&fib_hash_lock); + if (new_f) + fib_insert_node(fz, new_f); + list_add_tail(&new_fa->fa_list, + (fa ? &fa->fa_list : &f->fn_alias)); + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + + if (new_f) + fz->fz_nent++; + rt_cache_flush(-1); + + rtmsg_fib(RTM_NEWROUTE, key, new_fa, z, tb->tb_id, n, req); + return 0; + +out_free_new_fa: + kmem_cache_free(fn_alias_kmem, new_fa); +out: + fib_release_info(fi); + return err; +} + + +static int +fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, + struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + struct fib_node *f; + struct fib_alias *fa, *fa_to_delete; + int z = r->rtm_dst_len; + struct fn_zone *fz; + u32 key; + u8 tos = r->rtm_tos; + + if (z > 32) + return -EINVAL; + if ((fz = table->fn_zones[z]) == NULL) + return -ESRCH; + + key = 0; + if (rta->rta_dst) { + u32 dst; + memcpy(&dst, rta->rta_dst, 4); + if (dst & ~FZ_MASK(fz)) + return -EINVAL; + key = fz_key(dst, fz); + } + + f = fib_find_node(fz, key); + + if (!f) + fa = NULL; + else + fa = fib_find_alias(&f->fn_alias, tos, 0); + if (!fa) + return -ESRCH; + + fa_to_delete = NULL; + fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); + list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (fa->fa_tos != tos) + break; + + if ((!r->rtm_type || + fa->fa_type == r->rtm_type) && + (r->rtm_scope == RT_SCOPE_NOWHERE || + fa->fa_scope == r->rtm_scope) && + (!r->rtm_protocol || + fi->fib_protocol == r->rtm_protocol) && + fib_nh_match(r, n, rta, fi) == 0) { + fa_to_delete = fa; + break; + } + } + + if (fa_to_delete) { + int kill_fn; + + fa = fa_to_delete; + rtmsg_fib(RTM_DELROUTE, key, fa, z, tb->tb_id, n, req); + + kill_fn = 0; + write_lock_bh(&fib_hash_lock); + list_del(&fa->fa_list); + if (list_empty(&f->fn_alias)) { + hlist_del(&f->fn_hash); + kill_fn = 1; + } + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + + if (fa->fa_state & FA_S_ACCESSED) + rt_cache_flush(-1); + fn_free_alias(fa); + if (kill_fn) { + fn_free_node(f); + fz->fz_nent--; + } + + return 0; + } + return -ESRCH; +} + +static int fn_flush_list(struct fn_zone *fz, int idx) +{ + struct hlist_head *head = &fz->fz_hash[idx]; + struct hlist_node *node, *n; + struct fib_node *f; + int found = 0; + + hlist_for_each_entry_safe(f, node, n, head, fn_hash) { + struct fib_alias *fa, *fa_node; + int kill_f; + + kill_f = 0; + list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (fi && (fi->fib_flags&RTNH_F_DEAD)) { + write_lock_bh(&fib_hash_lock); + list_del(&fa->fa_list); + if (list_empty(&f->fn_alias)) { + hlist_del(&f->fn_hash); + kill_f = 1; + } + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + + fn_free_alias(fa); + found++; + } + } + if (kill_f) { + fn_free_node(f); + fz->fz_nent--; + } + } + return found; +} + +static int fn_hash_flush(struct fib_table *tb) +{ + struct fn_hash *table = (struct fn_hash *) tb->tb_data; + struct fn_zone *fz; + int found = 0; + + for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { + int i; + + for (i = fz->fz_divisor - 1; i >= 0; i--) + found += fn_flush_list(fz, i); + } + return found; +} + + +static inline int +fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, + struct fib_table *tb, + struct fn_zone *fz, + struct hlist_head *head) +{ + struct hlist_node *node; + struct fib_node *f; + int i, s_i; + + s_i = cb->args[3]; + i = 0; + hlist_for_each_entry(f, node, head, fn_hash) { + struct fib_alias *fa; + + list_for_each_entry(fa, &f->fn_alias, fa_list) { + if (i < s_i) + goto next; + + if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, + tb->tb_id, + fa->fa_type, + fa->fa_scope, + &f->fn_key, + fz->fz_order, + fa->fa_tos, + fa->fa_info) < 0) { + cb->args[3] = i; + return -1; + } + next: + i++; + } + } + cb->args[3] = i; + return skb->len; +} + +static inline int +fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, + struct fib_table *tb, + struct fn_zone *fz) +{ + int h, s_h; + + s_h = cb->args[2]; + for (h=0; h < fz->fz_divisor; h++) { + if (h < s_h) continue; + if (h > s_h) + memset(&cb->args[3], 0, + sizeof(cb->args) - 3*sizeof(cb->args[0])); + if (fz->fz_hash == NULL || + hlist_empty(&fz->fz_hash[h])) + continue; + if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h])<0) { + cb->args[2] = h; + return -1; + } + } + cb->args[2] = h; + return skb->len; +} + +static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) +{ + int m, s_m; + struct fn_zone *fz; + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + + s_m = cb->args[1]; + read_lock(&fib_hash_lock); + for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { + if (m < s_m) continue; + if (m > s_m) + memset(&cb->args[2], 0, + sizeof(cb->args) - 2*sizeof(cb->args[0])); + if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { + cb->args[1] = m; + read_unlock(&fib_hash_lock); + return -1; + } + } + read_unlock(&fib_hash_lock); + cb->args[1] = m; + return skb->len; +} + +#ifdef CONFIG_IP_MULTIPLE_TABLES +struct fib_table * fib_hash_init(int id) +#else +struct fib_table * __init fib_hash_init(int id) +#endif +{ + struct fib_table *tb; + + if (fn_hash_kmem == NULL) + fn_hash_kmem = kmem_cache_create("ip_fib_hash", + sizeof(struct fib_node), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (fn_alias_kmem == NULL) + fn_alias_kmem = kmem_cache_create("ip_fib_alias", + sizeof(struct fib_alias), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), + GFP_KERNEL); + if (tb == NULL) + return NULL; + + tb->tb_id = id; + tb->tb_lookup = fn_hash_lookup; + tb->tb_insert = fn_hash_insert; + tb->tb_delete = fn_hash_delete; + tb->tb_flush = fn_hash_flush; + tb->tb_select_default = fn_hash_select_default; + tb->tb_dump = fn_hash_dump; + memset(tb->tb_data, 0, sizeof(struct fn_hash)); + return tb; +} + +/* ------------------------------------------------------------------------ */ +#ifdef CONFIG_PROC_FS + +struct fib_iter_state { + struct fn_zone *zone; + int bucket; + struct hlist_head *hash_head; + struct fib_node *fn; + struct fib_alias *fa; + loff_t pos; + unsigned int genid; + int valid; +}; + +static struct fib_alias *fib_get_first(struct seq_file *seq) +{ + struct fib_iter_state *iter = seq->private; + struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data; + + iter->bucket = 0; + iter->hash_head = NULL; + iter->fn = NULL; + iter->fa = NULL; + iter->pos = 0; + iter->genid = fib_hash_genid; + iter->valid = 1; + + for (iter->zone = table->fn_zone_list; iter->zone; + iter->zone = iter->zone->fz_next) { + int maxslot; + + if (!iter->zone->fz_nent) + continue; + + iter->hash_head = iter->zone->fz_hash; + maxslot = iter->zone->fz_divisor; + + for (iter->bucket = 0; iter->bucket < maxslot; + ++iter->bucket, ++iter->hash_head) { + struct hlist_node *node; + struct fib_node *fn; + + hlist_for_each_entry(fn,node,iter->hash_head,fn_hash) { + struct fib_alias *fa; + + list_for_each_entry(fa,&fn->fn_alias,fa_list) { + iter->fn = fn; + iter->fa = fa; + goto out; + } + } + } + } +out: + return iter->fa; +} + +static struct fib_alias *fib_get_next(struct seq_file *seq) +{ + struct fib_iter_state *iter = seq->private; + struct fib_node *fn; + struct fib_alias *fa; + + /* Advance FA, if any. */ + fn = iter->fn; + fa = iter->fa; + if (fa) { + BUG_ON(!fn); + list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) { + iter->fa = fa; + goto out; + } + } + + fa = iter->fa = NULL; + + /* Advance FN. */ + if (fn) { + struct hlist_node *node = &fn->fn_hash; + hlist_for_each_entry_continue(fn, node, fn_hash) { + iter->fn = fn; + + list_for_each_entry(fa, &fn->fn_alias, fa_list) { + iter->fa = fa; + goto out; + } + } + } + + fn = iter->fn = NULL; + + /* Advance hash chain. */ + if (!iter->zone) + goto out; + + for (;;) { + struct hlist_node *node; + int maxslot; + + maxslot = iter->zone->fz_divisor; + + while (++iter->bucket < maxslot) { + iter->hash_head++; + + hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { + list_for_each_entry(fa, &fn->fn_alias, fa_list) { + iter->fn = fn; + iter->fa = fa; + goto out; + } + } + } + + iter->zone = iter->zone->fz_next; + + if (!iter->zone) + goto out; + + iter->bucket = 0; + iter->hash_head = iter->zone->fz_hash; + + hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { + list_for_each_entry(fa, &fn->fn_alias, fa_list) { + iter->fn = fn; + iter->fa = fa; + goto out; + } + } + } +out: + iter->pos++; + return fa; +} + +static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) +{ + struct fib_iter_state *iter = seq->private; + struct fib_alias *fa; + + if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) { + fa = iter->fa; + pos -= iter->pos; + } else + fa = fib_get_first(seq); + + if (fa) + while (pos && (fa = fib_get_next(seq))) + --pos; + return pos ? NULL : fa; +} + +static void *fib_seq_start(struct seq_file *seq, loff_t *pos) +{ + void *v = NULL; + + read_lock(&fib_hash_lock); + if (ip_fib_main_table) + v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + return v; +} + +static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq); +} + +static void fib_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&fib_hash_lock); +} + +static unsigned fib_flag_trans(int type, u32 mask, struct fib_info *fi) +{ + static unsigned type2flags[RTN_MAX + 1] = { + [7] = RTF_REJECT, [8] = RTF_REJECT, + }; + unsigned flags = type2flags[type]; + + if (fi && fi->fib_nh->nh_gw) + flags |= RTF_GATEWAY; + if (mask == 0xFFFFFFFF) + flags |= RTF_HOST; + flags |= RTF_UP; + return flags; +} + +/* + * This outputs /proc/net/route. + * + * It always works in backward compatibility mode. + * The format of the file is not supposed to be changed. + */ +static int fib_seq_show(struct seq_file *seq, void *v) +{ + struct fib_iter_state *iter; + char bf[128]; + u32 prefix, mask; + unsigned flags; + struct fib_node *f; + struct fib_alias *fa; + struct fib_info *fi; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " + "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" + "\tWindow\tIRTT"); + goto out; + } + + iter = seq->private; + f = iter->fn; + fa = iter->fa; + fi = fa->fa_info; + prefix = f->fn_key; + mask = FZ_MASK(iter->zone); + flags = fib_flag_trans(fa->fa_type, mask, fi); + if (fi) + snprintf(bf, sizeof(bf), + "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", + fi->fib_dev ? fi->fib_dev->name : "*", prefix, + fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, + mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), + fi->fib_window, + fi->fib_rtt >> 3); + else + snprintf(bf, sizeof(bf), + "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", + prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); + seq_printf(seq, "%-127s\n", bf); +out: + return 0; +} + +static struct seq_operations fib_seq_ops = { + .start = fib_seq_start, + .next = fib_seq_next, + .stop = fib_seq_stop, + .show = fib_seq_show, +}; + +static int fib_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct fib_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &fib_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations fib_seq_fops = { + .owner = THIS_MODULE, + .open = fib_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init fib_proc_init(void) +{ + if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops)) + return -ENOMEM; + return 0; +} + +void __init fib_proc_exit(void) +{ + proc_net_remove("route"); +} +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h new file mode 100644 index 000000000000..ac4485f75e97 --- /dev/null +++ b/net/ipv4/fib_lookup.h @@ -0,0 +1,43 @@ +#ifndef _FIB_LOOKUP_H +#define _FIB_LOOKUP_H + +#include <linux/types.h> +#include <linux/list.h> +#include <net/ip_fib.h> + +struct fib_alias { + struct list_head fa_list; + struct fib_info *fa_info; + u8 fa_tos; + u8 fa_type; + u8 fa_scope; + u8 fa_state; +}; + +#define FA_S_ACCESSED 0x01 + +/* Exported by fib_semantics.c */ +extern int fib_semantic_match(struct list_head *head, + const struct flowi *flp, + struct fib_result *res, __u32 zone, __u32 mask, + int prefixlen); +extern void fib_release_info(struct fib_info *); +extern struct fib_info *fib_create_info(const struct rtmsg *r, + struct kern_rta *rta, + const struct nlmsghdr *, + int *err); +extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *, + struct kern_rta *rta, struct fib_info *fi); +extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + u8 tb_id, u8 type, u8 scope, void *dst, + int dst_len, u8 tos, struct fib_info *fi); +extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, + int z, int tb_id, + struct nlmsghdr *n, struct netlink_skb_parms *req); +extern struct fib_alias *fib_find_alias(struct list_head *fah, + u8 tos, u32 prio); +extern int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, + int *last_idx, int *dflt); + +#endif /* _FIB_LOOKUP_H */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c new file mode 100644 index 000000000000..39d0aadb9a2a --- /dev/null +++ b/net/ipv4/fib_rules.c @@ -0,0 +1,437 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: policy rules. + * + * Version: $Id: fib_rules.c,v 1.17 2001/10/31 21:55:54 davem Exp $ + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Rani Assaf : local_rule cannot be deleted + * Marc Boucher : routing by fwmark + */ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/init.h> + +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <net/ip_fib.h> + +#define FRprintk(a...) + +struct fib_rule +{ + struct fib_rule *r_next; + atomic_t r_clntref; + u32 r_preference; + unsigned char r_table; + unsigned char r_action; + unsigned char r_dst_len; + unsigned char r_src_len; + u32 r_src; + u32 r_srcmask; + u32 r_dst; + u32 r_dstmask; + u32 r_srcmap; + u8 r_flags; + u8 r_tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + u32 r_fwmark; +#endif + int r_ifindex; +#ifdef CONFIG_NET_CLS_ROUTE + __u32 r_tclassid; +#endif + char r_ifname[IFNAMSIZ]; + int r_dead; +}; + +static struct fib_rule default_rule = { + .r_clntref = ATOMIC_INIT(2), + .r_preference = 0x7FFF, + .r_table = RT_TABLE_DEFAULT, + .r_action = RTN_UNICAST, +}; + +static struct fib_rule main_rule = { + .r_next = &default_rule, + .r_clntref = ATOMIC_INIT(2), + .r_preference = 0x7FFE, + .r_table = RT_TABLE_MAIN, + .r_action = RTN_UNICAST, +}; + +static struct fib_rule local_rule = { + .r_next = &main_rule, + .r_clntref = ATOMIC_INIT(2), + .r_table = RT_TABLE_LOCAL, + .r_action = RTN_UNICAST, +}; + +static struct fib_rule *fib_rules = &local_rule; +static DEFINE_RWLOCK(fib_rules_lock); + +int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct fib_rule *r, **rp; + int err = -ESRCH; + + for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) { + if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) && + rtm->rtm_src_len == r->r_src_len && + rtm->rtm_dst_len == r->r_dst_len && + (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) && + rtm->rtm_tos == r->r_tos && +#ifdef CONFIG_IP_ROUTE_FWMARK + (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) && +#endif + (!rtm->rtm_type || rtm->rtm_type == r->r_action) && + (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) && + (!rta[RTA_IIF-1] || rtattr_strcmp(rta[RTA_IIF-1], r->r_ifname) == 0) && + (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) { + err = -EPERM; + if (r == &local_rule) + break; + + write_lock_bh(&fib_rules_lock); + *rp = r->r_next; + r->r_dead = 1; + write_unlock_bh(&fib_rules_lock); + fib_rule_put(r); + err = 0; + break; + } + } + return err; +} + +/* Allocate new unique table id */ + +static struct fib_table *fib_empty_table(void) +{ + int id; + + for (id = 1; id <= RT_TABLE_MAX; id++) + if (fib_tables[id] == NULL) + return __fib_new_table(id); + return NULL; +} + +void fib_rule_put(struct fib_rule *r) +{ + if (atomic_dec_and_test(&r->r_clntref)) { + if (r->r_dead) + kfree(r); + else + printk("Freeing alive rule %p\n", r); + } +} + +int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct fib_rule *r, *new_r, **rp; + unsigned char table_id; + + if (rtm->rtm_src_len > 32 || rtm->rtm_dst_len > 32 || + (rtm->rtm_tos & ~IPTOS_TOS_MASK)) + return -EINVAL; + + if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ) + return -EINVAL; + + table_id = rtm->rtm_table; + if (table_id == RT_TABLE_UNSPEC) { + struct fib_table *table; + if (rtm->rtm_type == RTN_UNICAST) { + if ((table = fib_empty_table()) == NULL) + return -ENOBUFS; + table_id = table->tb_id; + } + } + + new_r = kmalloc(sizeof(*new_r), GFP_KERNEL); + if (!new_r) + return -ENOMEM; + memset(new_r, 0, sizeof(*new_r)); + if (rta[RTA_SRC-1]) + memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 4); + if (rta[RTA_DST-1]) + memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 4); + if (rta[RTA_GATEWAY-1]) + memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 4); + new_r->r_src_len = rtm->rtm_src_len; + new_r->r_dst_len = rtm->rtm_dst_len; + new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len); + new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len); + new_r->r_tos = rtm->rtm_tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + if (rta[RTA_PROTOINFO-1]) + memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4); +#endif + new_r->r_action = rtm->rtm_type; + new_r->r_flags = rtm->rtm_flags; + if (rta[RTA_PRIORITY-1]) + memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4); + new_r->r_table = table_id; + if (rta[RTA_IIF-1]) { + struct net_device *dev; + rtattr_strlcpy(new_r->r_ifname, rta[RTA_IIF-1], IFNAMSIZ); + new_r->r_ifindex = -1; + dev = __dev_get_by_name(new_r->r_ifname); + if (dev) + new_r->r_ifindex = dev->ifindex; + } +#ifdef CONFIG_NET_CLS_ROUTE + if (rta[RTA_FLOW-1]) + memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4); +#endif + + rp = &fib_rules; + if (!new_r->r_preference) { + r = fib_rules; + if (r && (r = r->r_next) != NULL) { + rp = &fib_rules->r_next; + if (r->r_preference) + new_r->r_preference = r->r_preference - 1; + } + } + + while ( (r = *rp) != NULL ) { + if (r->r_preference > new_r->r_preference) + break; + rp = &r->r_next; + } + + new_r->r_next = r; + atomic_inc(&new_r->r_clntref); + write_lock_bh(&fib_rules_lock); + *rp = new_r; + write_unlock_bh(&fib_rules_lock); + return 0; +} + +#ifdef CONFIG_NET_CLS_ROUTE +u32 fib_rules_tclass(struct fib_result *res) +{ + if (res->r) + return res->r->r_tclassid; + return 0; +} +#endif + + +static void fib_rules_detach(struct net_device *dev) +{ + struct fib_rule *r; + + for (r=fib_rules; r; r=r->r_next) { + if (r->r_ifindex == dev->ifindex) { + write_lock_bh(&fib_rules_lock); + r->r_ifindex = -1; + write_unlock_bh(&fib_rules_lock); + } + } +} + +static void fib_rules_attach(struct net_device *dev) +{ + struct fib_rule *r; + + for (r=fib_rules; r; r=r->r_next) { + if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) { + write_lock_bh(&fib_rules_lock); + r->r_ifindex = dev->ifindex; + write_unlock_bh(&fib_rules_lock); + } + } +} + +int fib_lookup(const struct flowi *flp, struct fib_result *res) +{ + int err; + struct fib_rule *r, *policy; + struct fib_table *tb; + + u32 daddr = flp->fl4_dst; + u32 saddr = flp->fl4_src; + +FRprintk("Lookup: %u.%u.%u.%u <- %u.%u.%u.%u ", + NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src)); + read_lock(&fib_rules_lock); + for (r = fib_rules; r; r=r->r_next) { + if (((saddr^r->r_src) & r->r_srcmask) || + ((daddr^r->r_dst) & r->r_dstmask) || + (r->r_tos && r->r_tos != flp->fl4_tos) || +#ifdef CONFIG_IP_ROUTE_FWMARK + (r->r_fwmark && r->r_fwmark != flp->fl4_fwmark) || +#endif + (r->r_ifindex && r->r_ifindex != flp->iif)) + continue; + +FRprintk("tb %d r %d ", r->r_table, r->r_action); + switch (r->r_action) { + case RTN_UNICAST: + policy = r; + break; + case RTN_UNREACHABLE: + read_unlock(&fib_rules_lock); + return -ENETUNREACH; + default: + case RTN_BLACKHOLE: + read_unlock(&fib_rules_lock); + return -EINVAL; + case RTN_PROHIBIT: + read_unlock(&fib_rules_lock); + return -EACCES; + } + + if ((tb = fib_get_table(r->r_table)) == NULL) + continue; + err = tb->tb_lookup(tb, flp, res); + if (err == 0) { + res->r = policy; + if (policy) + atomic_inc(&policy->r_clntref); + read_unlock(&fib_rules_lock); + return 0; + } + if (err < 0 && err != -EAGAIN) { + read_unlock(&fib_rules_lock); + return err; + } + } +FRprintk("FAILURE\n"); + read_unlock(&fib_rules_lock); + return -ENETUNREACH; +} + +void fib_select_default(const struct flowi *flp, struct fib_result *res) +{ + if (res->r && res->r->r_action == RTN_UNICAST && + FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) { + struct fib_table *tb; + if ((tb = fib_get_table(res->r->r_table)) != NULL) + tb->tb_select_default(tb, flp, res); + } +} + +static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + if (event == NETDEV_UNREGISTER) + fib_rules_detach(dev); + else if (event == NETDEV_REGISTER) + fib_rules_attach(dev); + return NOTIFY_DONE; +} + + +static struct notifier_block fib_rules_notifier = { + .notifier_call =fib_rules_event, +}; + +static __inline__ int inet_fill_rule(struct sk_buff *skb, + struct fib_rule *r, + struct netlink_callback *cb) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_INET; + rtm->rtm_dst_len = r->r_dst_len; + rtm->rtm_src_len = r->r_src_len; + rtm->rtm_tos = r->r_tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + if (r->r_fwmark) + RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark); +#endif + rtm->rtm_table = r->r_table; + rtm->rtm_protocol = 0; + rtm->rtm_scope = 0; + rtm->rtm_type = r->r_action; + rtm->rtm_flags = r->r_flags; + + if (r->r_dst_len) + RTA_PUT(skb, RTA_DST, 4, &r->r_dst); + if (r->r_src_len) + RTA_PUT(skb, RTA_SRC, 4, &r->r_src); + if (r->r_ifname[0]) + RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname); + if (r->r_preference) + RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference); + if (r->r_srcmap) + RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap); +#ifdef CONFIG_NET_CLS_ROUTE + if (r->r_tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &r->r_tclassid); +#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->args[0]; + struct fib_rule *r; + + read_lock(&fib_rules_lock); + for (r=fib_rules, idx=0; r; r = r->r_next, idx++) { + if (idx < s_idx) + continue; + if (inet_fill_rule(skb, r, cb) < 0) + break; + } + read_unlock(&fib_rules_lock); + cb->args[0] = idx; + + return skb->len; +} + +void __init fib_rules_init(void) +{ + register_netdevice_notifier(&fib_rules_notifier); +} diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c new file mode 100644 index 000000000000..029362d66135 --- /dev/null +++ b/net/ipv4/fib_semantics.c @@ -0,0 +1,1332 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: semantics. + * + * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $ + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/init.h> + +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <net/ip_fib.h> +#include <net/ip_mp_alg.h> + +#include "fib_lookup.h" + +#define FSprintk(a...) + +static DEFINE_RWLOCK(fib_info_lock); +static struct hlist_head *fib_info_hash; +static struct hlist_head *fib_info_laddrhash; +static unsigned int fib_hash_size; +static unsigned int fib_info_cnt; + +#define DEVINDEX_HASHBITS 8 +#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) +static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +static DEFINE_SPINLOCK(fib_multipath_lock); + +#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ +for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \ +for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +/* Hope, that gcc will optimize it to get rid of dummy loop */ + +#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \ +for (nhsel=0; nhsel < 1; nhsel++) + +#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \ +for (nhsel=0; nhsel < 1; nhsel++) + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define endfor_nexthops(fi) } + + +static struct +{ + int error; + u8 scope; +} fib_props[RTA_MAX + 1] = { + { + .error = 0, + .scope = RT_SCOPE_NOWHERE, + }, /* RTN_UNSPEC */ + { + .error = 0, + .scope = RT_SCOPE_UNIVERSE, + }, /* RTN_UNICAST */ + { + .error = 0, + .scope = RT_SCOPE_HOST, + }, /* RTN_LOCAL */ + { + .error = 0, + .scope = RT_SCOPE_LINK, + }, /* RTN_BROADCAST */ + { + .error = 0, + .scope = RT_SCOPE_LINK, + }, /* RTN_ANYCAST */ + { + .error = 0, + .scope = RT_SCOPE_UNIVERSE, + }, /* RTN_MULTICAST */ + { + .error = -EINVAL, + .scope = RT_SCOPE_UNIVERSE, + }, /* RTN_BLACKHOLE */ + { + .error = -EHOSTUNREACH, + .scope = RT_SCOPE_UNIVERSE, + }, /* RTN_UNREACHABLE */ + { + .error = -EACCES, + .scope = RT_SCOPE_UNIVERSE, + }, /* RTN_PROHIBIT */ + { + .error = -EAGAIN, + .scope = RT_SCOPE_UNIVERSE, + }, /* RTN_THROW */ + { + .error = -EINVAL, + .scope = RT_SCOPE_NOWHERE, + }, /* RTN_NAT */ + { + .error = -EINVAL, + .scope = RT_SCOPE_NOWHERE, + }, /* RTN_XRESOLVE */ +}; + + +/* Release a nexthop info record */ + +void free_fib_info(struct fib_info *fi) +{ + if (fi->fib_dead == 0) { + printk("Freeing alive fib_info %p\n", fi); + return; + } + change_nexthops(fi) { + if (nh->nh_dev) + dev_put(nh->nh_dev); + nh->nh_dev = NULL; + } endfor_nexthops(fi); + fib_info_cnt--; + kfree(fi); +} + +void fib_release_info(struct fib_info *fi) +{ + write_lock(&fib_info_lock); + if (fi && --fi->fib_treeref == 0) { + hlist_del(&fi->fib_hash); + if (fi->fib_prefsrc) + hlist_del(&fi->fib_lhash); + change_nexthops(fi) { + if (!nh->nh_dev) + continue; + hlist_del(&nh->nh_hash); + } endfor_nexthops(fi) + fi->fib_dead = 1; + fib_info_put(fi); + } + write_unlock(&fib_info_lock); +} + +static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) +{ + const struct fib_nh *onh = ofi->fib_nh; + + for_nexthops(fi) { + if (nh->nh_oif != onh->nh_oif || + nh->nh_gw != onh->nh_gw || + nh->nh_scope != onh->nh_scope || +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->nh_weight != onh->nh_weight || +#endif +#ifdef CONFIG_NET_CLS_ROUTE + nh->nh_tclassid != onh->nh_tclassid || +#endif + ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) + return -1; + onh++; + } endfor_nexthops(fi); + return 0; +} + +static inline unsigned int fib_info_hashfn(const struct fib_info *fi) +{ + unsigned int mask = (fib_hash_size - 1); + unsigned int val = fi->fib_nhs; + + val ^= fi->fib_protocol; + val ^= fi->fib_prefsrc; + val ^= fi->fib_priority; + + return (val ^ (val >> 7) ^ (val >> 12)) & mask; +} + +static struct fib_info *fib_find_info(const struct fib_info *nfi) +{ + struct hlist_head *head; + struct hlist_node *node; + struct fib_info *fi; + unsigned int hash; + + hash = fib_info_hashfn(nfi); + head = &fib_info_hash[hash]; + + hlist_for_each_entry(fi, node, head, fib_hash) { + if (fi->fib_nhs != nfi->fib_nhs) + continue; + if (nfi->fib_protocol == fi->fib_protocol && + nfi->fib_prefsrc == fi->fib_prefsrc && + nfi->fib_priority == fi->fib_priority && + memcmp(nfi->fib_metrics, fi->fib_metrics, + sizeof(fi->fib_metrics)) == 0 && + ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && + (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) + return fi; + } + + return NULL; +} + +static inline unsigned int fib_devindex_hashfn(unsigned int val) +{ + unsigned int mask = DEVINDEX_HASHSIZE - 1; + + return (val ^ + (val >> DEVINDEX_HASHBITS) ^ + (val >> (DEVINDEX_HASHBITS * 2))) & mask; +} + +/* Check, that the gateway is already configured. + Used only by redirect accept routine. + */ + +int ip_fib_check_default(u32 gw, struct net_device *dev) +{ + struct hlist_head *head; + struct hlist_node *node; + struct fib_nh *nh; + unsigned int hash; + + read_lock(&fib_info_lock); + + hash = fib_devindex_hashfn(dev->ifindex); + head = &fib_info_devhash[hash]; + hlist_for_each_entry(nh, node, head, nh_hash) { + if (nh->nh_dev == dev && + nh->nh_gw == gw && + !(nh->nh_flags&RTNH_F_DEAD)) { + read_unlock(&fib_info_lock); + return 0; + } + } + + read_unlock(&fib_info_lock); + + return -1; +} + +void rtmsg_fib(int event, u32 key, struct fib_alias *fa, + int z, int tb_id, + struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct sk_buff *skb; + u32 pid = req ? req->pid : 0; + int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return; + + if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id, + fa->fa_type, fa->fa_scope, &key, z, + fa->fa_tos, + fa->fa_info) < 0) { + kfree_skb(skb); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE; + if (n->nlmsg_flags&NLM_F_ECHO) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL); + if (n->nlmsg_flags&NLM_F_ECHO) + netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); +} + +/* Return the first fib alias matching TOS with + * priority less than or equal to PRIO. + */ +struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) +{ + if (fah) { + struct fib_alias *fa; + list_for_each_entry(fa, fah, fa_list) { + if (fa->fa_tos > tos) + continue; + if (fa->fa_info->fib_priority >= prio || + fa->fa_tos < tos) + return fa; + } + } + return NULL; +} + +int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, int *last_idx, int *dflt) +{ + struct neighbour *n; + int state = NUD_NONE; + + n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); + if (n) { + state = n->nud_state; + neigh_release(n); + } + if (state==NUD_REACHABLE) + return 0; + if ((state&NUD_VALID) && order != *dflt) + return 0; + if ((state&NUD_VALID) || + (*last_idx<0 && order > *dflt)) { + *last_resort = fi; + *last_idx = order; + } + return 1; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type) +{ + while (RTA_OK(attr,attrlen)) { + if (attr->rta_type == type) + return *(u32*)RTA_DATA(attr); + attr = RTA_NEXT(attr, attrlen); + } + return 0; +} + +static int +fib_count_nexthops(struct rtattr *rta) +{ + int nhs = 0; + struct rtnexthop *nhp = RTA_DATA(rta); + int nhlen = RTA_PAYLOAD(rta); + + while (nhlen >= (int)sizeof(struct rtnexthop)) { + if ((nhlen -= nhp->rtnh_len) < 0) + return 0; + nhs++; + nhp = RTNH_NEXT(nhp); + }; + return nhs; +} + +static int +fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r) +{ + struct rtnexthop *nhp = RTA_DATA(rta); + int nhlen = RTA_PAYLOAD(rta); + + change_nexthops(fi) { + int attrlen = nhlen - sizeof(struct rtnexthop); + if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + return -EINVAL; + nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags; + nh->nh_oif = nhp->rtnh_ifindex; + nh->nh_weight = nhp->rtnh_hops + 1; + if (attrlen) { + nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); +#ifdef CONFIG_NET_CLS_ROUTE + nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW); +#endif + } + nhp = RTNH_NEXT(nhp); + } endfor_nexthops(fi); + return 0; +} + +#endif + +int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta, + struct fib_info *fi) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH + struct rtnexthop *nhp; + int nhlen; +#endif + + if (rta->rta_priority && + *rta->rta_priority != fi->fib_priority) + return 1; + + if (rta->rta_oif || rta->rta_gw) { + if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) && + (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0)) + return 0; + return 1; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (rta->rta_mp == NULL) + return 0; + nhp = RTA_DATA(rta->rta_mp); + nhlen = RTA_PAYLOAD(rta->rta_mp); + + for_nexthops(fi) { + int attrlen = nhlen - sizeof(struct rtnexthop); + u32 gw; + + if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + return -EINVAL; + if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif) + return 1; + if (attrlen) { + gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); + if (gw && gw != nh->nh_gw) + return 1; +#ifdef CONFIG_NET_CLS_ROUTE + gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW); + if (gw && gw != nh->nh_tclassid) + return 1; +#endif + } + nhp = RTNH_NEXT(nhp); + } endfor_nexthops(fi); +#endif + return 0; +} + + +/* + Picture + ------- + + Semantics of nexthop is very messy by historical reasons. + We have to take into account, that: + a) gateway can be actually local interface address, + so that gatewayed route is direct. + b) gateway must be on-link address, possibly + described not by an ifaddr, but also by a direct route. + c) If both gateway and interface are specified, they should not + contradict. + d) If we use tunnel routes, gateway could be not on-link. + + Attempt to reconcile all of these (alas, self-contradictory) conditions + results in pretty ugly and hairy code with obscure logic. + + I chose to generalized it instead, so that the size + of code does not increase practically, but it becomes + much more general. + Every prefix is assigned a "scope" value: "host" is local address, + "link" is direct route, + [ ... "site" ... "interior" ... ] + and "universe" is true gateway route with global meaning. + + Every prefix refers to a set of "nexthop"s (gw, oif), + where gw must have narrower scope. This recursion stops + when gw has LOCAL scope or if "nexthop" is declared ONLINK, + which means that gw is forced to be on link. + + Code is still hairy, but now it is apparently logically + consistent and very flexible. F.e. as by-product it allows + to co-exists in peace independent exterior and interior + routing processes. + + Normally it looks as following. + + {universe prefix} -> (gw, oif) [scope link] + | + |-> {link prefix} -> (gw, oif) [scope local] + | + |-> {local prefix} (terminal node) + */ + +static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh) +{ + int err; + + if (nh->nh_gw) { + struct fib_result res; + +#ifdef CONFIG_IP_ROUTE_PERVASIVE + if (nh->nh_flags&RTNH_F_PERVASIVE) + return 0; +#endif + if (nh->nh_flags&RTNH_F_ONLINK) { + struct net_device *dev; + + if (r->rtm_scope >= RT_SCOPE_LINK) + return -EINVAL; + if (inet_addr_type(nh->nh_gw) != RTN_UNICAST) + return -EINVAL; + if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL) + return -ENODEV; + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; + nh->nh_dev = dev; + dev_hold(dev); + nh->nh_scope = RT_SCOPE_LINK; + return 0; + } + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = nh->nh_gw, + .scope = r->rtm_scope + 1 } }, + .oif = nh->nh_oif }; + + /* It is not necessary, but requires a bit of thinking */ + if (fl.fl4_scope < RT_SCOPE_LINK) + fl.fl4_scope = RT_SCOPE_LINK; + if ((err = fib_lookup(&fl, &res)) != 0) + return err; + } + err = -EINVAL; + if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) + goto out; + nh->nh_scope = res.scope; + nh->nh_oif = FIB_RES_OIF(res); + if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) + goto out; + dev_hold(nh->nh_dev); + err = -ENETDOWN; + if (!(nh->nh_dev->flags & IFF_UP)) + goto out; + err = 0; +out: + fib_res_put(&res); + return err; + } else { + struct in_device *in_dev; + + if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) + return -EINVAL; + + in_dev = inetdev_by_index(nh->nh_oif); + if (in_dev == NULL) + return -ENODEV; + if (!(in_dev->dev->flags&IFF_UP)) { + in_dev_put(in_dev); + return -ENETDOWN; + } + nh->nh_dev = in_dev->dev; + dev_hold(nh->nh_dev); + nh->nh_scope = RT_SCOPE_HOST; + in_dev_put(in_dev); + } + return 0; +} + +static inline unsigned int fib_laddr_hashfn(u32 val) +{ + unsigned int mask = (fib_hash_size - 1); + + return (val ^ (val >> 7) ^ (val >> 14)) & mask; +} + +static struct hlist_head *fib_hash_alloc(int bytes) +{ + if (bytes <= PAGE_SIZE) + return kmalloc(bytes, GFP_KERNEL); + else + return (struct hlist_head *) + __get_free_pages(GFP_KERNEL, get_order(bytes)); +} + +static void fib_hash_free(struct hlist_head *hash, int bytes) +{ + if (!hash) + return; + + if (bytes <= PAGE_SIZE) + kfree(hash); + else + free_pages((unsigned long) hash, get_order(bytes)); +} + +static void fib_hash_move(struct hlist_head *new_info_hash, + struct hlist_head *new_laddrhash, + unsigned int new_size) +{ + unsigned int old_size = fib_hash_size; + unsigned int i; + + write_lock(&fib_info_lock); + fib_hash_size = new_size; + + for (i = 0; i < old_size; i++) { + struct hlist_head *head = &fib_info_hash[i]; + struct hlist_node *node, *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, node, n, head, fib_hash) { + struct hlist_head *dest; + unsigned int new_hash; + + hlist_del(&fi->fib_hash); + + new_hash = fib_info_hashfn(fi); + dest = &new_info_hash[new_hash]; + hlist_add_head(&fi->fib_hash, dest); + } + } + fib_info_hash = new_info_hash; + + for (i = 0; i < old_size; i++) { + struct hlist_head *lhead = &fib_info_laddrhash[i]; + struct hlist_node *node, *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) { + struct hlist_head *ldest; + unsigned int new_hash; + + hlist_del(&fi->fib_lhash); + + new_hash = fib_laddr_hashfn(fi->fib_prefsrc); + ldest = &new_laddrhash[new_hash]; + hlist_add_head(&fi->fib_lhash, ldest); + } + } + fib_info_laddrhash = new_laddrhash; + + write_unlock(&fib_info_lock); +} + +struct fib_info * +fib_create_info(const struct rtmsg *r, struct kern_rta *rta, + const struct nlmsghdr *nlh, int *errp) +{ + int err; + struct fib_info *fi = NULL; + struct fib_info *ofi; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int nhs = 1; +#else + const int nhs = 1; +#endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + u32 mp_alg = IP_MP_ALG_NONE; +#endif + + /* Fast check to catch the most weird cases */ + if (fib_props[r->rtm_type].scope > r->rtm_scope) + goto err_inval; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (rta->rta_mp) { + nhs = fib_count_nexthops(rta->rta_mp); + if (nhs == 0) + goto err_inval; + } +#endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (rta->rta_mp_alg) { + mp_alg = *rta->rta_mp_alg; + + if (mp_alg < IP_MP_ALG_NONE || + mp_alg > IP_MP_ALG_MAX) + goto err_inval; + } +#endif + + err = -ENOBUFS; + if (fib_info_cnt >= fib_hash_size) { + unsigned int new_size = fib_hash_size << 1; + struct hlist_head *new_info_hash; + struct hlist_head *new_laddrhash; + unsigned int bytes; + + if (!new_size) + new_size = 1; + bytes = new_size * sizeof(struct hlist_head *); + new_info_hash = fib_hash_alloc(bytes); + new_laddrhash = fib_hash_alloc(bytes); + if (!new_info_hash || !new_laddrhash) { + fib_hash_free(new_info_hash, bytes); + fib_hash_free(new_laddrhash, bytes); + } else { + memset(new_info_hash, 0, bytes); + memset(new_laddrhash, 0, bytes); + + fib_hash_move(new_info_hash, new_laddrhash, new_size); + } + + if (!fib_hash_size) + goto failure; + } + + fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); + if (fi == NULL) + goto failure; + fib_info_cnt++; + memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh)); + + fi->fib_protocol = r->rtm_protocol; + + fi->fib_nhs = nhs; + change_nexthops(fi) { + nh->nh_parent = fi; + } endfor_nexthops(fi) + + fi->fib_flags = r->rtm_flags; + if (rta->rta_priority) + fi->fib_priority = *rta->rta_priority; + if (rta->rta_mx) { + int attrlen = RTA_PAYLOAD(rta->rta_mx); + struct rtattr *attr = RTA_DATA(rta->rta_mx); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > RTAX_MAX) + goto err_inval; + fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr); + } + attr = RTA_NEXT(attr, attrlen); + } + } + if (rta->rta_prefsrc) + memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4); + + if (rta->rta_mp) { +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0) + goto failure; + if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif) + goto err_inval; + if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4)) + goto err_inval; +#ifdef CONFIG_NET_CLS_ROUTE + if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4)) + goto err_inval; +#endif +#else + goto err_inval; +#endif + } else { + struct fib_nh *nh = fi->fib_nh; + if (rta->rta_oif) + nh->nh_oif = *rta->rta_oif; + if (rta->rta_gw) + memcpy(&nh->nh_gw, rta->rta_gw, 4); +#ifdef CONFIG_NET_CLS_ROUTE + if (rta->rta_flow) + memcpy(&nh->nh_tclassid, rta->rta_flow, 4); +#endif + nh->nh_flags = r->rtm_flags; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->nh_weight = 1; +#endif + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + fi->fib_mp_alg = mp_alg; +#endif + + if (fib_props[r->rtm_type].error) { + if (rta->rta_gw || rta->rta_oif || rta->rta_mp) + goto err_inval; + goto link_it; + } + + if (r->rtm_scope > RT_SCOPE_HOST) + goto err_inval; + + if (r->rtm_scope == RT_SCOPE_HOST) { + struct fib_nh *nh = fi->fib_nh; + + /* Local address is added. */ + if (nhs != 1 || nh->nh_gw) + goto err_inval; + nh->nh_scope = RT_SCOPE_NOWHERE; + nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif); + err = -ENODEV; + if (nh->nh_dev == NULL) + goto failure; + } else { + change_nexthops(fi) { + if ((err = fib_check_nh(r, fi, nh)) != 0) + goto failure; + } endfor_nexthops(fi) + } + + if (fi->fib_prefsrc) { + if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL || + memcmp(&fi->fib_prefsrc, rta->rta_dst, 4)) + if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) + goto err_inval; + } + +link_it: + if ((ofi = fib_find_info(fi)) != NULL) { + fi->fib_dead = 1; + free_fib_info(fi); + ofi->fib_treeref++; + return ofi; + } + + fi->fib_treeref++; + atomic_inc(&fi->fib_clntref); + write_lock(&fib_info_lock); + hlist_add_head(&fi->fib_hash, + &fib_info_hash[fib_info_hashfn(fi)]); + if (fi->fib_prefsrc) { + struct hlist_head *head; + + head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)]; + hlist_add_head(&fi->fib_lhash, head); + } + change_nexthops(fi) { + struct hlist_head *head; + unsigned int hash; + + if (!nh->nh_dev) + continue; + hash = fib_devindex_hashfn(nh->nh_dev->ifindex); + head = &fib_info_devhash[hash]; + hlist_add_head(&nh->nh_hash, head); + } endfor_nexthops(fi) + write_unlock(&fib_info_lock); + return fi; + +err_inval: + err = -EINVAL; + +failure: + *errp = err; + if (fi) { + fi->fib_dead = 1; + free_fib_info(fi); + } + return NULL; +} + +int fib_semantic_match(struct list_head *head, const struct flowi *flp, + struct fib_result *res, __u32 zone, __u32 mask, + int prefixlen) +{ + struct fib_alias *fa; + int nh_sel = 0; + + list_for_each_entry(fa, head, fa_list) { + int err; + + if (fa->fa_tos && + fa->fa_tos != flp->fl4_tos) + continue; + + if (fa->fa_scope < flp->fl4_scope) + continue; + + fa->fa_state |= FA_S_ACCESSED; + + err = fib_props[fa->fa_type].error; + if (err == 0) { + struct fib_info *fi = fa->fa_info; + + if (fi->fib_flags & RTNH_F_DEAD) + continue; + + switch (fa->fa_type) { + case RTN_UNICAST: + case RTN_LOCAL: + case RTN_BROADCAST: + case RTN_ANYCAST: + case RTN_MULTICAST: + for_nexthops(fi) { + if (nh->nh_flags&RTNH_F_DEAD) + continue; + if (!flp->oif || flp->oif == nh->nh_oif) + break; + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (nhsel < fi->fib_nhs) { + nh_sel = nhsel; + goto out_fill_res; + } +#else + if (nhsel < 1) { + goto out_fill_res; + } +#endif + endfor_nexthops(fi); + continue; + + default: + printk(KERN_DEBUG "impossible 102\n"); + return -EINVAL; + }; + } + return err; + } + return 1; + +out_fill_res: + res->prefixlen = prefixlen; + res->nh_sel = nh_sel; + res->type = fa->fa_type; + res->scope = fa->fa_scope; + res->fi = fa->fa_info; +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + res->netmask = mask; + res->network = zone & + (0xFFFFFFFF >> (32 - prefixlen)); +#endif + atomic_inc(&res->fi->fib_clntref); + return 0; +} + +/* Find appropriate source address to this destination */ + +u32 __fib_res_prefsrc(struct fib_result *res) +{ + return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope); +} + +int +fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos, + struct fib_info *fi) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_INET; + rtm->rtm_dst_len = dst_len; + rtm->rtm_src_len = 0; + rtm->rtm_tos = tos; + rtm->rtm_table = tb_id; + rtm->rtm_type = type; + rtm->rtm_flags = fi->fib_flags; + rtm->rtm_scope = scope; + if (rtm->rtm_dst_len) + RTA_PUT(skb, RTA_DST, 4, dst); + rtm->rtm_protocol = fi->fib_protocol; + if (fi->fib_priority) + RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority); +#ifdef CONFIG_NET_CLS_ROUTE + if (fi->fib_nh[0].nh_tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid); +#endif + if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) + goto rtattr_failure; + if (fi->fib_prefsrc) + RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc); + if (fi->fib_nhs == 1) { + if (fi->fib_nh->nh_gw) + RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw); + if (fi->fib_nh->nh_oif) + RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif); + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (fi->fib_nhs > 1) { + struct rtnexthop *nhp; + struct rtattr *mp_head; + if (skb_tailroom(skb) <= RTA_SPACE(0)) + goto rtattr_failure; + mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0)); + + for_nexthops(fi) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = nh->nh_flags & 0xFF; + nhp->rtnh_hops = nh->nh_weight-1; + nhp->rtnh_ifindex = nh->nh_oif; + if (nh->nh_gw) + RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw); + nhp->rtnh_len = skb->tail - (unsigned char*)nhp; + } endfor_nexthops(fi); + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb->tail - (u8*)mp_head; + } +#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifndef CONFIG_IP_NOSIOCRT + +int +fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, + struct kern_rta *rta, struct rtentry *r) +{ + int plen; + u32 *ptr; + + memset(rtm, 0, sizeof(*rtm)); + memset(rta, 0, sizeof(*rta)); + + if (r->rt_dst.sa_family != AF_INET) + return -EAFNOSUPPORT; + + /* Check mask for validity: + a) it must be contiguous. + b) destination must have all host bits clear. + c) if application forgot to set correct family (AF_INET), + reject request unless it is absolutely clear i.e. + both family and mask are zero. + */ + plen = 32; + ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr; + if (!(r->rt_flags&RTF_HOST)) { + u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr; + if (r->rt_genmask.sa_family != AF_INET) { + if (mask || r->rt_genmask.sa_family) + return -EAFNOSUPPORT; + } + if (bad_mask(mask, *ptr)) + return -EINVAL; + plen = inet_mask_len(mask); + } + + nl->nlmsg_flags = NLM_F_REQUEST; + nl->nlmsg_pid = 0; + nl->nlmsg_seq = 0; + nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm)); + if (cmd == SIOCDELRT) { + nl->nlmsg_type = RTM_DELROUTE; + nl->nlmsg_flags = 0; + } else { + nl->nlmsg_type = RTM_NEWROUTE; + nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE; + rtm->rtm_protocol = RTPROT_BOOT; + } + + rtm->rtm_dst_len = plen; + rta->rta_dst = ptr; + + if (r->rt_metric) { + *(u32*)&r->rt_pad3 = r->rt_metric - 1; + rta->rta_priority = (u32*)&r->rt_pad3; + } + if (r->rt_flags&RTF_REJECT) { + rtm->rtm_scope = RT_SCOPE_HOST; + rtm->rtm_type = RTN_UNREACHABLE; + return 0; + } + rtm->rtm_scope = RT_SCOPE_NOWHERE; + rtm->rtm_type = RTN_UNICAST; + + if (r->rt_dev) { + char *colon; + struct net_device *dev; + char devname[IFNAMSIZ]; + + if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1)) + return -EFAULT; + devname[IFNAMSIZ-1] = 0; + colon = strchr(devname, ':'); + if (colon) + *colon = 0; + dev = __dev_get_by_name(devname); + if (!dev) + return -ENODEV; + rta->rta_oif = &dev->ifindex; + if (colon) { + struct in_ifaddr *ifa; + struct in_device *in_dev = __in_dev_get(dev); + if (!in_dev) + return -ENODEV; + *colon = ':'; + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) + if (strcmp(ifa->ifa_label, devname) == 0) + break; + if (ifa == NULL) + return -ENODEV; + rta->rta_prefsrc = &ifa->ifa_local; + } + } + + ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr; + if (r->rt_gateway.sa_family == AF_INET && *ptr) { + rta->rta_gw = ptr; + if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST) + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + } + + if (cmd == SIOCDELRT) + return 0; + + if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL) + return -EINVAL; + + if (rtm->rtm_scope == RT_SCOPE_NOWHERE) + rtm->rtm_scope = RT_SCOPE_LINK; + + if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) { + struct rtattr *rec; + struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL); + if (mx == NULL) + return -ENOMEM; + rta->rta_mx = mx; + mx->rta_type = RTA_METRICS; + mx->rta_len = RTA_LENGTH(0); + if (r->rt_flags&RTF_MTU) { + rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len)); + rec->rta_type = RTAX_ADVMSS; + rec->rta_len = RTA_LENGTH(4); + mx->rta_len += RTA_LENGTH(4); + *(u32*)RTA_DATA(rec) = r->rt_mtu - 40; + } + if (r->rt_flags&RTF_WINDOW) { + rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len)); + rec->rta_type = RTAX_WINDOW; + rec->rta_len = RTA_LENGTH(4); + mx->rta_len += RTA_LENGTH(4); + *(u32*)RTA_DATA(rec) = r->rt_window; + } + if (r->rt_flags&RTF_IRTT) { + rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len)); + rec->rta_type = RTAX_RTT; + rec->rta_len = RTA_LENGTH(4); + mx->rta_len += RTA_LENGTH(4); + *(u32*)RTA_DATA(rec) = r->rt_irtt<<3; + } + } + return 0; +} + +#endif + +/* + Update FIB if: + - local address disappeared -> we must delete all the entries + referring to it. + - device went down -> we must shutdown all nexthops going via it. + */ + +int fib_sync_down(u32 local, struct net_device *dev, int force) +{ + int ret = 0; + int scope = RT_SCOPE_NOWHERE; + + if (force) + scope = -1; + + if (local && fib_info_laddrhash) { + unsigned int hash = fib_laddr_hashfn(local); + struct hlist_head *head = &fib_info_laddrhash[hash]; + struct hlist_node *node; + struct fib_info *fi; + + hlist_for_each_entry(fi, node, head, fib_lhash) { + if (fi->fib_prefsrc == local) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } + } + } + + if (dev) { + struct fib_info *prev_fi = NULL; + unsigned int hash = fib_devindex_hashfn(dev->ifindex); + struct hlist_head *head = &fib_info_devhash[hash]; + struct hlist_node *node; + struct fib_nh *nh; + + hlist_for_each_entry(nh, node, head, nh_hash) { + struct fib_info *fi = nh->nh_parent; + int dead; + + BUG_ON(!fi->fib_nhs); + if (nh->nh_dev != dev || fi == prev_fi) + continue; + prev_fi = fi; + dead = 0; + change_nexthops(fi) { + if (nh->nh_flags&RTNH_F_DEAD) + dead++; + else if (nh->nh_dev == dev && + nh->nh_scope != scope) { + nh->nh_flags |= RTNH_F_DEAD; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + spin_lock_bh(&fib_multipath_lock); + fi->fib_power -= nh->nh_power; + nh->nh_power = 0; + spin_unlock_bh(&fib_multipath_lock); +#endif + dead++; + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (force > 1 && nh->nh_dev == dev) { + dead = fi->fib_nhs; + break; + } +#endif + } endfor_nexthops(fi) + if (dead == fi->fib_nhs) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } + } + } + + return ret; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* + Dead device goes up. We wake up dead nexthops. + It takes sense only on multipath routes. + */ + +int fib_sync_up(struct net_device *dev) +{ + struct fib_info *prev_fi; + unsigned int hash; + struct hlist_head *head; + struct hlist_node *node; + struct fib_nh *nh; + int ret; + + if (!(dev->flags&IFF_UP)) + return 0; + + prev_fi = NULL; + hash = fib_devindex_hashfn(dev->ifindex); + head = &fib_info_devhash[hash]; + ret = 0; + + hlist_for_each_entry(nh, node, head, nh_hash) { + struct fib_info *fi = nh->nh_parent; + int alive; + + BUG_ON(!fi->fib_nhs); + if (nh->nh_dev != dev || fi == prev_fi) + continue; + + prev_fi = fi; + alive = 0; + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + alive++; + continue; + } + if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) + continue; + if (nh->nh_dev != dev || __in_dev_get(dev) == NULL) + continue; + alive++; + spin_lock_bh(&fib_multipath_lock); + nh->nh_power = 0; + nh->nh_flags &= ~RTNH_F_DEAD; + spin_unlock_bh(&fib_multipath_lock); + } endfor_nexthops(fi) + + if (alive > 0) { + fi->fib_flags &= ~RTNH_F_DEAD; + ret++; + } + } + + return ret; +} + +/* + The algorithm is suboptimal, but it provides really + fair weighted route distribution. + */ + +void fib_select_multipath(const struct flowi *flp, struct fib_result *res) +{ + struct fib_info *fi = res->fi; + int w; + + spin_lock_bh(&fib_multipath_lock); + if (fi->fib_power <= 0) { + int power = 0; + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + power += nh->nh_weight; + nh->nh_power = nh->nh_weight; + } + } endfor_nexthops(fi); + fi->fib_power = power; + if (power <= 0) { + spin_unlock_bh(&fib_multipath_lock); + /* Race condition: route has just become dead. */ + res->nh_sel = 0; + return; + } + } + + + /* w should be random number [0..fi->fib_power-1], + it is pretty bad approximation. + */ + + w = jiffies % fi->fib_power; + + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { + if ((w -= nh->nh_power) <= 0) { + nh->nh_power--; + fi->fib_power--; + res->nh_sel = nhsel; + spin_unlock_bh(&fib_multipath_lock); + return; + } + } + } endfor_nexthops(fi); + + /* Race condition: route has just become dead. */ + res->nh_sel = 0; + spin_unlock_bh(&fib_multipath_lock); +} +#endif diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c new file mode 100644 index 000000000000..85bf0d3e294b --- /dev/null +++ b/net/ipv4/icmp.c @@ -0,0 +1,1143 @@ +/* + * NET3: Implementation of the ICMP protocol layer. + * + * Alan Cox, <alan@redhat.com> + * + * Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Some of the function names and the icmp unreach table for this + * module were derived from [icmp.c 1.0.11 06/02/93] by + * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting. + * Other than that this module is a complete rewrite. + * + * Fixes: + * Clemens Fruhwirth : introduce global icmp rate limiting + * with icmp type masking ability instead + * of broken per type icmp timeouts. + * Mike Shaver : RFC1122 checks. + * Alan Cox : Multicast ping reply as self. + * Alan Cox : Fix atomicity lockup in ip_build_xmit + * call. + * Alan Cox : Added 216,128 byte paths to the MTU + * code. + * Martin Mares : RFC1812 checks. + * Martin Mares : Can be configured to follow redirects + * if acting as a router _without_ a + * routing protocol (RFC 1812). + * Martin Mares : Echo requests may be configured to + * be ignored (RFC 1812). + * Martin Mares : Limitation of ICMP error message + * transmit rate (RFC 1812). + * Martin Mares : TOS and Precedence set correctly + * (RFC 1812). + * Martin Mares : Now copying as much data from the + * original packet as we can without + * exceeding 576 bytes (RFC 1812). + * Willy Konynenberg : Transparent proxying support. + * Keith Owens : RFC1191 correction for 4.2BSD based + * path MTU bug. + * Thomas Quinot : ICMP Dest Unreach codes up to 15 are + * valid (RFC 1812). + * Andi Kleen : Check all packet lengths properly + * and moved all kfree_skb() up to + * icmp_rcv. + * Andi Kleen : Move the rate limit bookkeeping + * into the dest entry and use a token + * bucket filter (thanks to ANK). Make + * the rates sysctl configurable. + * Yu Tianli : Fixed two ugly bugs in icmp_send + * - IP option length was accounted wrongly + * - ICMP header length was not accounted + * at all. + * Tristan Greaves : Added sysctl option to ignore bogus + * broadcast responses from broken routers. + * + * To Fix: + * + * - Should use skb_pull() instead of all the manual checking. + * This would also greatly simply some upper layer error handlers. --AK + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/string.h> +#include <linux/netfilter_ipv4.h> +#include <net/snmp.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/protocol.h> +#include <net/icmp.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/raw.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/init.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <net/checksum.h> + +/* + * Build xmit assembly blocks + */ + +struct icmp_bxm { + struct sk_buff *skb; + int offset; + int data_len; + + struct { + struct icmphdr icmph; + __u32 times[3]; + } data; + int head_len; + struct ip_options replyopts; + unsigned char optbuf[40]; +}; + +/* + * Statistics + */ +DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); + +/* An array of errno for error messages from dest unreach. */ +/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ + +struct icmp_err icmp_err_convert[] = { + { + .errno = ENETUNREACH, /* ICMP_NET_UNREACH */ + .fatal = 0, + }, + { + .errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */ + .fatal = 0, + }, + { + .errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */, + .fatal = 1, + }, + { + .errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */ + .fatal = 1, + }, + { + .errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */ + .fatal = 0, + }, + { + .errno = EOPNOTSUPP, /* ICMP_SR_FAILED */ + .fatal = 0, + }, + { + .errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */ + .fatal = 1, + }, + { + .errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */ + .fatal = 1, + }, + { + .errno = ENONET, /* ICMP_HOST_ISOLATED */ + .fatal = 1, + }, + { + .errno = ENETUNREACH, /* ICMP_NET_ANO */ + .fatal = 1, + }, + { + .errno = EHOSTUNREACH, /* ICMP_HOST_ANO */ + .fatal = 1, + }, + { + .errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */ + .fatal = 0, + }, + { + .errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */ + .fatal = 0, + }, + { + .errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */ + .fatal = 1, + }, + { + .errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */ + .fatal = 1, + }, + { + .errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */ + .fatal = 1, + }, +}; + +/* Control parameters for ECHO replies. */ +int sysctl_icmp_echo_ignore_all; +int sysctl_icmp_echo_ignore_broadcasts; + +/* Control parameter - ignore bogus broadcast responses? */ +int sysctl_icmp_ignore_bogus_error_responses; + +/* + * Configurable global rate limit. + * + * ratelimit defines tokens/packet consumed for dst->rate_token bucket + * ratemask defines which icmp types are ratelimited by setting + * it's bit position. + * + * default: + * dest unreachable (3), source quench (4), + * time exceeded (11), parameter problem (12) + */ + +int sysctl_icmp_ratelimit = 1 * HZ; +int sysctl_icmp_ratemask = 0x1818; + +/* + * ICMP control array. This specifies what to do with each ICMP. + */ + +struct icmp_control { + int output_entry; /* Field for increment on output */ + int input_entry; /* Field for increment on input */ + void (*handler)(struct sk_buff *skb); + short error; /* This ICMP is classed as an error message */ +}; + +static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; + +/* + * The ICMP socket(s). This is the most convenient way to flow control + * our ICMP output as well as maintain a clean interface throughout + * all layers. All Socketless IP sends will soon be gone. + * + * On SMP we have one ICMP socket per-cpu. + */ +static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL; +#define icmp_socket __get_cpu_var(__icmp_socket) + +static __inline__ int icmp_xmit_lock(void) +{ + local_bh_disable(); + + if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.slock))) { + /* This can happen if the output path signals a + * dst_link_failure() for an outgoing ICMP packet. + */ + local_bh_enable(); + return 1; + } + return 0; +} + +static void icmp_xmit_unlock(void) +{ + spin_unlock_bh(&icmp_socket->sk->sk_lock.slock); +} + +/* + * Send an ICMP frame. + */ + +/* + * Check transmit rate limitation for given message. + * The rate information is held in the destination cache now. + * This function is generic and could be used for other purposes + * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. + * + * Note that the same dst_entry fields are modified by functions in + * route.c too, but these work for packet destinations while xrlim_allow + * works for icmp destinations. This means the rate limiting information + * for one "ip object" is shared - and these ICMPs are twice limited: + * by source and by destination. + * + * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate + * SHOULD allow setting of rate limits + * + * Shared between ICMPv4 and ICMPv6. + */ +#define XRLIM_BURST_FACTOR 6 +int xrlim_allow(struct dst_entry *dst, int timeout) +{ + unsigned long now; + int rc = 0; + + now = jiffies; + dst->rate_tokens += now - dst->rate_last; + dst->rate_last = now; + if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout) + dst->rate_tokens = XRLIM_BURST_FACTOR * timeout; + if (dst->rate_tokens >= timeout) { + dst->rate_tokens -= timeout; + rc = 1; + } + return rc; +} + +static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code) +{ + struct dst_entry *dst = &rt->u.dst; + int rc = 1; + + if (type > NR_ICMP_TYPES) + goto out; + + /* Don't limit PMTU discovery. */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + goto out; + + /* No rate limit on loopback */ + if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) + goto out; + + /* Limit if icmp type is enabled in ratemask. */ + if ((1 << type) & sysctl_icmp_ratemask) + rc = xrlim_allow(dst, sysctl_icmp_ratelimit); +out: + return rc; +} + +/* + * Maintain the counters used in the SNMP statistics for outgoing ICMP + */ +static void icmp_out_count(int type) +{ + if (type <= NR_ICMP_TYPES) { + ICMP_INC_STATS(icmp_pointers[type].output_entry); + ICMP_INC_STATS(ICMP_MIB_OUTMSGS); + } +} + +/* + * Checksum each fragment, and on the first include the headers and final + * checksum. + */ +static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, + struct sk_buff *skb) +{ + struct icmp_bxm *icmp_param = (struct icmp_bxm *)from; + unsigned int csum; + + csum = skb_copy_and_csum_bits(icmp_param->skb, + icmp_param->offset + offset, + to, len, 0); + + skb->csum = csum_block_add(skb->csum, csum, odd); + if (icmp_pointers[icmp_param->data.icmph.type].error) + nf_ct_attach(skb, icmp_param->skb); + return 0; +} + +static void icmp_push_reply(struct icmp_bxm *icmp_param, + struct ipcm_cookie *ipc, struct rtable *rt) +{ + struct sk_buff *skb; + + ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, + icmp_param->data_len+icmp_param->head_len, + icmp_param->head_len, + ipc, rt, MSG_DONTWAIT); + + if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { + struct icmphdr *icmph = skb->h.icmph; + unsigned int csum = 0; + struct sk_buff *skb1; + + skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) { + csum = csum_add(csum, skb1->csum); + } + csum = csum_partial_copy_nocheck((void *)&icmp_param->data, + (char *)icmph, + icmp_param->head_len, csum); + icmph->checksum = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + ip_push_pending_frames(icmp_socket->sk); + } +} + +/* + * Driving logic for building and sending ICMP messages. + */ + +static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) +{ + struct sock *sk = icmp_socket->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipcm_cookie ipc; + struct rtable *rt = (struct rtable *)skb->dst; + u32 daddr; + + if (ip_options_echo(&icmp_param->replyopts, skb)) + goto out; + + if (icmp_xmit_lock()) + return; + + icmp_param->data.icmph.checksum = 0; + icmp_out_count(icmp_param->data.icmph.type); + + inet->tos = skb->nh.iph->tos; + daddr = ipc.addr = rt->rt_src; + ipc.opt = NULL; + if (icmp_param->replyopts.optlen) { + ipc.opt = &icmp_param->replyopts; + if (ipc.opt->srr) + daddr = icmp_param->replyopts.faddr; + } + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = rt->rt_spec_dst, + .tos = RT_TOS(skb->nh.iph->tos) } }, + .proto = IPPROTO_ICMP }; + if (ip_route_output_key(&rt, &fl)) + goto out_unlock; + } + if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, + icmp_param->data.icmph.code)) + icmp_push_reply(icmp_param, &ipc, rt); + ip_rt_put(rt); +out_unlock: + icmp_xmit_unlock(); +out:; +} + + +/* + * Send an ICMP message in response to a situation + * + * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header. + * MAY send more (we do). + * MUST NOT change this header information. + * MUST NOT reply to a multicast/broadcast IP address. + * MUST NOT reply to a multicast/broadcast MAC address. + * MUST reply to only the first fragment. + */ + +void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info) +{ + struct iphdr *iph; + int room; + struct icmp_bxm icmp_param; + struct rtable *rt = (struct rtable *)skb_in->dst; + struct ipcm_cookie ipc; + u32 saddr; + u8 tos; + + if (!rt) + goto out; + + /* + * Find the original header. It is expected to be valid, of course. + * Check this, icmp_send is called from the most obscure devices + * sometimes. + */ + iph = skb_in->nh.iph; + + if ((u8 *)iph < skb_in->head || (u8 *)(iph + 1) > skb_in->tail) + goto out; + + /* + * No replies to physical multicast/broadcast + */ + if (skb_in->pkt_type != PACKET_HOST) + goto out; + + /* + * Now check at the protocol level + */ + if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + goto out; + + /* + * Only reply to fragment 0. We byte re-order the constant + * mask for efficiency. + */ + if (iph->frag_off & htons(IP_OFFSET)) + goto out; + + /* + * If we send an ICMP error to an ICMP error a mess would result.. + */ + if (icmp_pointers[type].error) { + /* + * We are an error, check if we are replying to an + * ICMP error + */ + if (iph->protocol == IPPROTO_ICMP) { + u8 _inner_type, *itp; + + itp = skb_header_pointer(skb_in, + skb_in->nh.raw + + (iph->ihl << 2) + + offsetof(struct icmphdr, + type) - + skb_in->data, + sizeof(_inner_type), + &_inner_type); + if (itp == NULL) + goto out; + + /* + * Assume any unknown ICMP type is an error. This + * isn't specified by the RFC, but think about it.. + */ + if (*itp > NR_ICMP_TYPES || + icmp_pointers[*itp].error) + goto out; + } + } + + if (icmp_xmit_lock()) + return; + + /* + * Construct source address and options. + */ + + saddr = iph->daddr; + if (!(rt->rt_flags & RTCF_LOCAL)) + saddr = 0; + + tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | + IPTOS_PREC_INTERNETCONTROL) : + iph->tos; + + if (ip_options_echo(&icmp_param.replyopts, skb_in)) + goto ende; + + + /* + * Prepare data for ICMP header. + */ + + icmp_param.data.icmph.type = type; + icmp_param.data.icmph.code = code; + icmp_param.data.icmph.un.gateway = info; + icmp_param.data.icmph.checksum = 0; + icmp_param.skb = skb_in; + icmp_param.offset = skb_in->nh.raw - skb_in->data; + icmp_out_count(icmp_param.data.icmph.type); + inet_sk(icmp_socket->sk)->tos = tos; + ipc.addr = iph->saddr; + ipc.opt = &icmp_param.replyopts; + + { + struct flowi fl = { + .nl_u = { + .ip4_u = { + .daddr = icmp_param.replyopts.srr ? + icmp_param.replyopts.faddr : + iph->saddr, + .saddr = saddr, + .tos = RT_TOS(tos) + } + }, + .proto = IPPROTO_ICMP, + .uli_u = { + .icmpt = { + .type = type, + .code = code + } + } + }; + if (ip_route_output_key(&rt, &fl)) + goto out_unlock; + } + + if (!icmpv4_xrlim_allow(rt, type, code)) + goto ende; + + /* RFC says return as much as we can without exceeding 576 bytes. */ + + room = dst_mtu(&rt->u.dst); + if (room > 576) + room = 576; + room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; + room -= sizeof(struct icmphdr); + + icmp_param.data_len = skb_in->len - icmp_param.offset; + if (icmp_param.data_len > room) + icmp_param.data_len = room; + icmp_param.head_len = sizeof(struct icmphdr); + + icmp_push_reply(&icmp_param, &ipc, rt); +ende: + ip_rt_put(rt); +out_unlock: + icmp_xmit_unlock(); +out:; +} + + +/* + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. + */ + +static void icmp_unreach(struct sk_buff *skb) +{ + struct iphdr *iph; + struct icmphdr *icmph; + int hash, protocol; + struct net_protocol *ipprot; + struct sock *raw_sk; + u32 info = 0; + + /* + * Incomplete header ? + * Only checks for the IP header, there should be an + * additional check for longer headers in upper levels. + */ + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out_err; + + icmph = skb->h.icmph; + iph = (struct iphdr *)skb->data; + + if (iph->ihl < 5) /* Mangled header, drop. */ + goto out_err; + + if (icmph->type == ICMP_DEST_UNREACH) { + switch (icmph->code & 15) { + case ICMP_NET_UNREACH: + case ICMP_HOST_UNREACH: + case ICMP_PROT_UNREACH: + case ICMP_PORT_UNREACH: + break; + case ICMP_FRAG_NEEDED: + if (ipv4_config.no_pmtu_disc) { + LIMIT_NETDEBUG( + printk(KERN_INFO "ICMP: %u.%u.%u.%u: " + "fragmentation needed " + "and DF set.\n", + NIPQUAD(iph->daddr))); + } else { + info = ip_rt_frag_needed(iph, + ntohs(icmph->un.frag.mtu)); + if (!info) + goto out; + } + break; + case ICMP_SR_FAILED: + LIMIT_NETDEBUG( + printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source " + "Route Failed.\n", + NIPQUAD(iph->daddr))); + break; + default: + break; + } + if (icmph->code > NR_ICMP_UNREACH) + goto out; + } else if (icmph->type == ICMP_PARAMETERPROB) + info = ntohl(icmph->un.gateway) >> 24; + + /* + * Throw it at our lower layers + * + * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed + * header. + * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the + * transport layer. + * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to + * transport layer. + */ + + /* + * Check the other end isnt violating RFC 1122. Some routers send + * bogus responses to broadcast frames. If you see this message + * first check your netmask matches at both ends, if it does then + * get the other vendor to fix their kit. + */ + + if (!sysctl_icmp_ignore_bogus_error_responses && + inet_addr_type(iph->daddr) == RTN_BROADCAST) { + if (net_ratelimit()) + printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP " + "type %u, code %u " + "error to a broadcast: %u.%u.%u.%u on %s\n", + NIPQUAD(skb->nh.iph->saddr), + icmph->type, icmph->code, + NIPQUAD(iph->daddr), + skb->dev->name); + goto out; + } + + /* Checkin full IP header plus 8 bytes of protocol to + * avoid additional coding at protocol handlers. + */ + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) + goto out; + + iph = (struct iphdr *)skb->data; + protocol = iph->protocol; + + /* + * Deliver ICMP message to raw sockets. Pretty useless feature? + */ + + /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ + hash = protocol & (MAX_INET_PROTOS - 1); + read_lock(&raw_v4_lock); + if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) { + while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr, + iph->saddr, + skb->dev->ifindex)) != NULL) { + raw_err(raw_sk, skb, info); + raw_sk = sk_next(raw_sk); + iph = (struct iphdr *)skb->data; + } + } + read_unlock(&raw_v4_lock); + + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[hash]); + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, info); + rcu_read_unlock(); + +out: + return; +out_err: + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + goto out; +} + + +/* + * Handle ICMP_REDIRECT. + */ + +static void icmp_redirect(struct sk_buff *skb) +{ + struct iphdr *iph; + unsigned long ip; + + if (skb->len < sizeof(struct iphdr)) + goto out_err; + + /* + * Get the copied header of the packet that caused the redirect + */ + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + iph = (struct iphdr *)skb->data; + ip = iph->daddr; + + switch (skb->h.icmph->code & 7) { + case ICMP_REDIR_NET: + case ICMP_REDIR_NETTOS: + /* + * As per RFC recommendations now handle it as a host redirect. + */ + case ICMP_REDIR_HOST: + case ICMP_REDIR_HOSTTOS: + ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway, + iph->saddr, iph->tos, skb->dev); + break; + } +out: + return; +out_err: + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + goto out; +} + +/* + * Handle ICMP_ECHO ("ping") requests. + * + * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo + * requests. + * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be + * included in the reply. + * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring + * echo requests, MUST have default=NOT. + * See also WRT handling of options once they are done and working. + */ + +static void icmp_echo(struct sk_buff *skb) +{ + if (!sysctl_icmp_echo_ignore_all) { + struct icmp_bxm icmp_param; + + icmp_param.data.icmph = *skb->h.icmph; + icmp_param.data.icmph.type = ICMP_ECHOREPLY; + icmp_param.skb = skb; + icmp_param.offset = 0; + icmp_param.data_len = skb->len; + icmp_param.head_len = sizeof(struct icmphdr); + icmp_reply(&icmp_param, skb); + } +} + +/* + * Handle ICMP Timestamp requests. + * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests. + * SHOULD be in the kernel for minimum random latency. + * MUST be accurate to a few minutes. + * MUST be updated at least at 15Hz. + */ +static void icmp_timestamp(struct sk_buff *skb) +{ + struct timeval tv; + struct icmp_bxm icmp_param; + /* + * Too short. + */ + if (skb->len < 4) + goto out_err; + + /* + * Fill in the current time as ms since midnight UT: + */ + do_gettimeofday(&tv); + icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 + + tv.tv_usec / 1000); + icmp_param.data.times[2] = icmp_param.data.times[1]; + if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) + BUG(); + icmp_param.data.icmph = *skb->h.icmph; + icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; + icmp_param.data.icmph.code = 0; + icmp_param.skb = skb; + icmp_param.offset = 0; + icmp_param.data_len = 0; + icmp_param.head_len = sizeof(struct icmphdr) + 12; + icmp_reply(&icmp_param, skb); +out: + return; +out_err: + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + goto out; +} + + +/* + * Handle ICMP_ADDRESS_MASK requests. (RFC950) + * + * RFC1122 (3.2.2.9). A host MUST only send replies to + * ADDRESS_MASK requests if it's been configured as an address mask + * agent. Receiving a request doesn't constitute implicit permission to + * act as one. Of course, implementing this correctly requires (SHOULD) + * a way to turn the functionality on and off. Another one for sysctl(), + * I guess. -- MS + * + * RFC1812 (4.3.3.9). A router MUST implement it. + * A router SHOULD have switch turning it on/off. + * This switch MUST be ON by default. + * + * Gratuitous replies, zero-source replies are not implemented, + * that complies with RFC. DO NOT implement them!!! All the idea + * of broadcast addrmask replies as specified in RFC950 is broken. + * The problem is that it is not uncommon to have several prefixes + * on one physical interface. Moreover, addrmask agent can even be + * not aware of existing another prefixes. + * If source is zero, addrmask agent cannot choose correct prefix. + * Gratuitous mask announcements suffer from the same problem. + * RFC1812 explains it, but still allows to use ADDRMASK, + * that is pretty silly. --ANK + * + * All these rules are so bizarre, that I removed kernel addrmask + * support at all. It is wrong, it is obsolete, nobody uses it in + * any case. --ANK + * + * Furthermore you can do it with a usermode address agent program + * anyway... + */ + +static void icmp_address(struct sk_buff *skb) +{ +#if 0 + if (net_ratelimit()) + printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n"); +#endif +} + +/* + * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain + * loudly if an inconsistency is found. + */ + +static void icmp_address_reply(struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable *)skb->dst; + struct net_device *dev = skb->dev; + struct in_device *in_dev; + struct in_ifaddr *ifa; + + if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) + goto out; + + in_dev = in_dev_get(dev); + if (!in_dev) + goto out; + rcu_read_lock(); + if (in_dev->ifa_list && + IN_DEV_LOG_MARTIANS(in_dev) && + IN_DEV_FORWARD(in_dev)) { + u32 _mask, *mp; + + mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask); + if (mp == NULL) + BUG(); + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + if (*mp == ifa->ifa_mask && + inet_ifa_match(rt->rt_src, ifa)) + break; + } + if (!ifa && net_ratelimit()) { + printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from " + "%s/%u.%u.%u.%u\n", + NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src)); + } + } + rcu_read_unlock(); + in_dev_put(in_dev); +out:; +} + +static void icmp_discard(struct sk_buff *skb) +{ +} + +/* + * Deal with incoming ICMP packets. + */ +int icmp_rcv(struct sk_buff *skb) +{ + struct icmphdr *icmph; + struct rtable *rt = (struct rtable *)skb->dst; + + ICMP_INC_STATS_BH(ICMP_MIB_INMSGS); + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + NETDEBUG(if (net_ratelimit()) + printk(KERN_DEBUG "icmp v4 hw csum failure\n")); + case CHECKSUM_NONE: + if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) + goto error; + default:; + } + + if (!pskb_pull(skb, sizeof(struct icmphdr))) + goto error; + + icmph = skb->h.icmph; + + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) + goto error; + + + /* + * Parse the ICMP message + */ + + if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { + /* + * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be + * silently ignored (we let user decide with a sysctl). + * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently + * discarded if to broadcast/multicast. + */ + if (icmph->type == ICMP_ECHO && + sysctl_icmp_echo_ignore_broadcasts) { + goto error; + } + if (icmph->type != ICMP_ECHO && + icmph->type != ICMP_TIMESTAMP && + icmph->type != ICMP_ADDRESS && + icmph->type != ICMP_ADDRESSREPLY) { + goto error; + } + } + + ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry); + icmp_pointers[icmph->type].handler(skb); + +drop: + kfree_skb(skb); + return 0; +error: + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + goto drop; +} + +/* + * This table is the definition of how we handle ICMP. + */ +static struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { + [ICMP_ECHOREPLY] = { + .output_entry = ICMP_MIB_OUTECHOREPS, + .input_entry = ICMP_MIB_INECHOREPS, + .handler = icmp_discard, + }, + [1] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_INERRORS, + .handler = icmp_discard, + .error = 1, + }, + [2] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_INERRORS, + .handler = icmp_discard, + .error = 1, + }, + [ICMP_DEST_UNREACH] = { + .output_entry = ICMP_MIB_OUTDESTUNREACHS, + .input_entry = ICMP_MIB_INDESTUNREACHS, + .handler = icmp_unreach, + .error = 1, + }, + [ICMP_SOURCE_QUENCH] = { + .output_entry = ICMP_MIB_OUTSRCQUENCHS, + .input_entry = ICMP_MIB_INSRCQUENCHS, + .handler = icmp_unreach, + .error = 1, + }, + [ICMP_REDIRECT] = { + .output_entry = ICMP_MIB_OUTREDIRECTS, + .input_entry = ICMP_MIB_INREDIRECTS, + .handler = icmp_redirect, + .error = 1, + }, + [6] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_INERRORS, + .handler = icmp_discard, + .error = 1, + }, + [7] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_INERRORS, + .handler = icmp_discard, + .error = 1, + }, + [ICMP_ECHO] = { + .output_entry = ICMP_MIB_OUTECHOS, + .input_entry = ICMP_MIB_INECHOS, + .handler = icmp_echo, + }, + [9] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_INERRORS, + .handler = icmp_discard, + .error = 1, + }, + [10] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_INERRORS, + .handler = icmp_discard, + .error = 1, + }, + [ICMP_TIME_EXCEEDED] = { + .output_entry = ICMP_MIB_OUTTIMEEXCDS, + .input_entry = ICMP_MIB_INTIMEEXCDS, + .handler = icmp_unreach, + .error = 1, + }, + [ICMP_PARAMETERPROB] = { + .output_entry = ICMP_MIB_OUTPARMPROBS, + .input_entry = ICMP_MIB_INPARMPROBS, + .handler = icmp_unreach, + .error = 1, + }, + [ICMP_TIMESTAMP] = { + .output_entry = ICMP_MIB_OUTTIMESTAMPS, + .input_entry = ICMP_MIB_INTIMESTAMPS, + .handler = icmp_timestamp, + }, + [ICMP_TIMESTAMPREPLY] = { + .output_entry = ICMP_MIB_OUTTIMESTAMPREPS, + .input_entry = ICMP_MIB_INTIMESTAMPREPS, + .handler = icmp_discard, + }, + [ICMP_INFO_REQUEST] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_DUMMY, + .handler = icmp_discard, + }, + [ICMP_INFO_REPLY] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_DUMMY, + .handler = icmp_discard, + }, + [ICMP_ADDRESS] = { + .output_entry = ICMP_MIB_OUTADDRMASKS, + .input_entry = ICMP_MIB_INADDRMASKS, + .handler = icmp_address, + }, + [ICMP_ADDRESSREPLY] = { + .output_entry = ICMP_MIB_OUTADDRMASKREPS, + .input_entry = ICMP_MIB_INADDRMASKREPS, + .handler = icmp_address_reply, + }, +}; + +void __init icmp_init(struct net_proto_family *ops) +{ + struct inet_sock *inet; + int i; + + for (i = 0; i < NR_CPUS; i++) { + int err; + + if (!cpu_possible(i)) + continue; + + err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP, + &per_cpu(__icmp_socket, i)); + + if (err < 0) + panic("Failed to create the ICMP control socket.\n"); + + per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC; + + /* Enough space for 2 64K ICMP packets, including + * sk_buff struct overhead. + */ + per_cpu(__icmp_socket, i)->sk->sk_sndbuf = + (2 * ((64 * 1024) + sizeof(struct sk_buff))); + + inet = inet_sk(per_cpu(__icmp_socket, i)->sk); + inet->uc_ttl = -1; + inet->pmtudisc = IP_PMTUDISC_DONT; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this socket to see incoming + * packets. + */ + per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk); + } +} + +EXPORT_SYMBOL(icmp_err_convert); +EXPORT_SYMBOL(icmp_send); +EXPORT_SYMBOL(icmp_statistics); +EXPORT_SYMBOL(xrlim_allow); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c new file mode 100644 index 000000000000..1f3183168a90 --- /dev/null +++ b/net/ipv4/igmp.c @@ -0,0 +1,2473 @@ +/* + * Linux NET3: Internet Group Management Protocol [IGMP] + * + * This code implements the IGMP protocol as defined in RFC1112. There has + * been a further revision of this protocol since which is now supported. + * + * If you have trouble with this module be careful what gcc you have used, + * the older version didn't come out right using gcc 2.5.8, the newer one + * seems to fall out with gcc 2.6.2. + * + * Version: $Id: igmp.c,v 1.47 2002/02/01 22:01:03 davem Exp $ + * + * Authors: + * Alan Cox <Alan.Cox@linux.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * + * Alan Cox : Added lots of __inline__ to optimise + * the memory usage of all the tiny little + * functions. + * Alan Cox : Dumped the header building experiment. + * Alan Cox : Minor tweaks ready for multicast routing + * and extended IGMP protocol. + * Alan Cox : Removed a load of inline directives. Gcc 2.5.8 + * writes utterly bogus code otherwise (sigh) + * fixed IGMP loopback to behave in the manner + * desired by mrouted, fixed the fact it has been + * broken since 1.3.6 and cleaned up a few minor + * points. + * + * Chih-Jen Chang : Tried to revise IGMP to Version 2 + * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu + * The enhancements are mainly based on Steve Deering's + * ipmulti-3.5 source code. + * Chih-Jen Chang : Added the igmp_get_mrouter_info and + * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of + * the mrouted version on that device. + * Chih-Jen Chang : Added the max_resp_time parameter to + * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter + * to identify the multicast router version + * and do what the IGMP version 2 specified. + * Chih-Jen Chang : Added a timer to revert to IGMP V2 router + * Tsu-Sheng Tsao if the specified time expired. + * Alan Cox : Stop IGMP from 0.0.0.0 being accepted. + * Alan Cox : Use GFP_ATOMIC in the right places. + * Christian Daudt : igmp timer wasn't set for local group + * memberships but was being deleted, + * which caused a "del_timer() called + * from %p with timer not initialized\n" + * message (960131). + * Christian Daudt : removed del_timer from + * igmp_timer_expire function (960205). + * Christian Daudt : igmp_heard_report now only calls + * igmp_timer_expire if tm->running is + * true (960216). + * Malcolm Beattie : ttl comparison wrong in igmp_rcv made + * igmp_heard_query never trigger. Expiry + * miscalculation fixed in igmp_heard_query + * and random() made to return unsigned to + * prevent negative expiry times. + * Alexey Kuznetsov: Wrong group leaving behaviour, backport + * fix from pending 2.1.x patches. + * Alan Cox: Forget to enable FDDI support earlier. + * Alexey Kuznetsov: Fixed leaving groups on device down. + * Alexey Kuznetsov: Accordance to igmp-v2-06 draft. + * David L Stevens: IGMPv3 support, with help from + * Vinay Kulkarni + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/if_arp.h> +#include <linux/rtnetlink.h> +#include <linux/times.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/sock.h> +#include <net/checksum.h> +#include <linux/netfilter_ipv4.h> +#ifdef CONFIG_IP_MROUTE +#include <linux/mroute.h> +#endif +#ifdef CONFIG_PROC_FS +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#endif + +#define IP_MAX_MEMBERSHIPS 20 +#define IP_MAX_MSF 10 + +#ifdef CONFIG_IP_MULTICAST +/* Parameter names and values are taken from igmp-v2-06 draft */ + +#define IGMP_V1_Router_Present_Timeout (400*HZ) +#define IGMP_V2_Router_Present_Timeout (400*HZ) +#define IGMP_Unsolicited_Report_Interval (10*HZ) +#define IGMP_Query_Response_Interval (10*HZ) +#define IGMP_Unsolicited_Report_Count 2 + + +#define IGMP_Initial_Report_Delay (1) + +/* IGMP_Initial_Report_Delay is not from IGMP specs! + * IGMP specs require to report membership immediately after + * joining a group, but we delay the first report by a + * small interval. It seems more natural and still does not + * contradict to specs provided this delay is small enough. + */ + +#define IGMP_V1_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 1 || \ + (in_dev)->cnf.force_igmp_version == 1 || \ + ((in_dev)->mr_v1_seen && \ + time_before(jiffies, (in_dev)->mr_v1_seen))) +#define IGMP_V2_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 2 || \ + (in_dev)->cnf.force_igmp_version == 2 || \ + ((in_dev)->mr_v2_seen && \ + time_before(jiffies, (in_dev)->mr_v2_seen))) + +static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); +static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr); +static void igmpv3_clear_delrec(struct in_device *in_dev); +static int sf_setstate(struct ip_mc_list *pmc); +static void sf_markstate(struct ip_mc_list *pmc); +#endif +static void ip_mc_clear_src(struct ip_mc_list *pmc); +static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode, + int sfcount, __u32 *psfsrc, int delta); + +static void ip_ma_put(struct ip_mc_list *im) +{ + if (atomic_dec_and_test(&im->refcnt)) { + in_dev_put(im->interface); + kfree(im); + } +} + +#ifdef CONFIG_IP_MULTICAST + +/* + * Timer management + */ + +static __inline__ void igmp_stop_timer(struct ip_mc_list *im) +{ + spin_lock_bh(&im->lock); + if (del_timer(&im->timer)) + atomic_dec(&im->refcnt); + im->tm_running=0; + im->reporter = 0; + im->unsolicit_count = 0; + spin_unlock_bh(&im->lock); +} + +/* It must be called with locked im->lock */ +static void igmp_start_timer(struct ip_mc_list *im, int max_delay) +{ + int tv=net_random() % max_delay; + + im->tm_running=1; + if (!mod_timer(&im->timer, jiffies+tv+2)) + atomic_inc(&im->refcnt); +} + +static void igmp_gq_start_timer(struct in_device *in_dev) +{ + int tv = net_random() % in_dev->mr_maxdelay; + + in_dev->mr_gq_running = 1; + if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2)) + in_dev_hold(in_dev); +} + +static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) +{ + int tv = net_random() % delay; + + if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) + in_dev_hold(in_dev); +} + +static void igmp_mod_timer(struct ip_mc_list *im, int max_delay) +{ + spin_lock_bh(&im->lock); + im->unsolicit_count = 0; + if (del_timer(&im->timer)) { + if ((long)(im->timer.expires-jiffies) < max_delay) { + add_timer(&im->timer); + im->tm_running=1; + spin_unlock_bh(&im->lock); + return; + } + atomic_dec(&im->refcnt); + } + igmp_start_timer(im, max_delay); + spin_unlock_bh(&im->lock); +} + + +/* + * Send an IGMP report. + */ + +#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4) + + +static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type, + int gdeleted, int sdeleted) +{ + switch (type) { + case IGMPV3_MODE_IS_INCLUDE: + case IGMPV3_MODE_IS_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + return !(pmc->gsquery && !psf->sf_gsresp); + case IGMPV3_CHANGE_TO_INCLUDE: + if (gdeleted || sdeleted) + return 0; + return psf->sf_count[MCAST_INCLUDE] != 0; + case IGMPV3_CHANGE_TO_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + if (pmc->sfcount[MCAST_EXCLUDE] == 0 || + psf->sf_count[MCAST_INCLUDE]) + return 0; + return pmc->sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + case IGMPV3_ALLOW_NEW_SOURCES: + if (gdeleted || !psf->sf_crcount) + return 0; + return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted; + case IGMPV3_BLOCK_OLD_SOURCES: + if (pmc->sfmode == MCAST_INCLUDE) + return gdeleted || (psf->sf_crcount && sdeleted); + return psf->sf_crcount && !gdeleted && !sdeleted; + } + return 0; +} + +static int +igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) +{ + struct ip_sf_list *psf; + int scount = 0; + + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) + continue; + scount++; + } + return scount; +} + +static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) +{ + struct sk_buff *skb; + struct rtable *rt; + struct iphdr *pip; + struct igmpv3_report *pig; + + skb = alloc_skb(size + LL_RESERVED_SPACE(dev), GFP_ATOMIC); + if (skb == NULL) + return NULL; + + { + struct flowi fl = { .oif = dev->ifindex, + .nl_u = { .ip4_u = { + .daddr = IGMPV3_ALL_MCR } }, + .proto = IPPROTO_IGMP }; + if (ip_route_output_key(&rt, &fl)) { + kfree_skb(skb); + return NULL; + } + } + if (rt->rt_src == 0) { + kfree_skb(skb); + ip_rt_put(rt); + return NULL; + } + + skb->dst = &rt->u.dst; + skb->dev = dev; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); + + pip->version = 4; + pip->ihl = (sizeof(struct iphdr)+4)>>2; + pip->tos = 0xc0; + pip->frag_off = htons(IP_DF); + pip->ttl = 1; + pip->daddr = rt->rt_dst; + pip->saddr = rt->rt_src; + pip->protocol = IPPROTO_IGMP; + pip->tot_len = 0; /* filled in later */ + ip_select_ident(pip, &rt->u.dst, NULL); + ((u8*)&pip[1])[0] = IPOPT_RA; + ((u8*)&pip[1])[1] = 4; + ((u8*)&pip[1])[2] = 0; + ((u8*)&pip[1])[3] = 0; + + pig =(struct igmpv3_report *)skb_put(skb, sizeof(*pig)); + skb->h.igmph = (struct igmphdr *)pig; + pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT; + pig->resv1 = 0; + pig->csum = 0; + pig->resv2 = 0; + pig->ngrec = 0; + return skb; +} + +static int igmpv3_sendpack(struct sk_buff *skb) +{ + struct iphdr *pip = skb->nh.iph; + struct igmphdr *pig = skb->h.igmph; + int iplen, igmplen; + + iplen = skb->tail - (unsigned char *)skb->nh.iph; + pip->tot_len = htons(iplen); + ip_send_check(pip); + + igmplen = skb->tail - (unsigned char *)skb->h.igmph; + pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen); + + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev, + dst_output); +} + +static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) +{ + return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc,type,gdel,sdel); +} + +static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, + int type, struct igmpv3_grec **ppgr) +{ + struct net_device *dev = pmc->interface->dev; + struct igmpv3_report *pih; + struct igmpv3_grec *pgr; + + if (!skb) + skb = igmpv3_newpack(dev, dev->mtu); + if (!skb) + return NULL; + pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec)); + pgr->grec_type = type; + pgr->grec_auxwords = 0; + pgr->grec_nsrcs = 0; + pgr->grec_mca = pmc->multiaddr; + pih = (struct igmpv3_report *)skb->h.igmph; + pih->ngrec = htons(ntohs(pih->ngrec)+1); + *ppgr = pgr; + return skb; +} + +#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ + skb_tailroom(skb)) : 0) + +static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, + int type, int gdeleted, int sdeleted) +{ + struct net_device *dev = pmc->interface->dev; + struct igmpv3_report *pih; + struct igmpv3_grec *pgr = NULL; + struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; + int scount, first, isquery, truncate; + + if (pmc->multiaddr == IGMP_ALL_HOSTS) + return skb; + + isquery = type == IGMPV3_MODE_IS_INCLUDE || + type == IGMPV3_MODE_IS_EXCLUDE; + truncate = type == IGMPV3_MODE_IS_EXCLUDE || + type == IGMPV3_CHANGE_TO_EXCLUDE; + + psf_list = sdeleted ? &pmc->tomb : &pmc->sources; + + if (!*psf_list) { + if (type == IGMPV3_ALLOW_NEW_SOURCES || + type == IGMPV3_BLOCK_OLD_SOURCES) + return skb; + if (pmc->crcount || isquery) { + /* make sure we have room for group header and at + * least one source. + */ + if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)+ + sizeof(__u32)) { + igmpv3_sendpack(skb); + skb = NULL; /* add_grhead will get a new one */ + } + skb = add_grhead(skb, pmc, type, &pgr); + } + return skb; + } + pih = skb ? (struct igmpv3_report *)skb->h.igmph : NULL; + + /* EX and TO_EX get a fresh packet, if needed */ + if (truncate) { + if (pih && pih->ngrec && + AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { + if (skb) + igmpv3_sendpack(skb); + skb = igmpv3_newpack(dev, dev->mtu); + } + } + first = 1; + scount = 0; + psf_prev = NULL; + for (psf=*psf_list; psf; psf=psf_next) { + u32 *psrc; + + psf_next = psf->sf_next; + + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { + psf_prev = psf; + continue; + } + + /* clear marks on query responses */ + if (isquery) + psf->sf_gsresp = 0; + + if (AVAILABLE(skb) < sizeof(u32) + + first*sizeof(struct igmpv3_grec)) { + if (truncate && !first) + break; /* truncate these */ + if (pgr) + pgr->grec_nsrcs = htons(scount); + if (skb) + igmpv3_sendpack(skb); + skb = igmpv3_newpack(dev, dev->mtu); + first = 1; + scount = 0; + } + if (first) { + skb = add_grhead(skb, pmc, type, &pgr); + first = 0; + } + psrc = (u32 *)skb_put(skb, sizeof(u32)); + *psrc = psf->sf_inaddr; + scount++; + if ((type == IGMPV3_ALLOW_NEW_SOURCES || + type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) { + psf->sf_crcount--; + if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *psf_list = psf->sf_next; + kfree(psf); + continue; + } + } + psf_prev = psf; + } + if (pgr) + pgr->grec_nsrcs = htons(scount); + + if (isquery) + pmc->gsquery = 0; /* clear query state on report */ + return skb; +} + +static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) +{ + struct sk_buff *skb = NULL; + int type; + + if (!pmc) { + read_lock(&in_dev->mc_list_lock); + for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { + if (pmc->multiaddr == IGMP_ALL_HOSTS) + continue; + spin_lock_bh(&pmc->lock); + if (pmc->sfcount[MCAST_EXCLUDE]) + type = IGMPV3_MODE_IS_EXCLUDE; + else + type = IGMPV3_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->lock); + } + read_unlock(&in_dev->mc_list_lock); + } else { + spin_lock_bh(&pmc->lock); + if (pmc->sfcount[MCAST_EXCLUDE]) + type = IGMPV3_MODE_IS_EXCLUDE; + else + type = IGMPV3_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->lock); + } + if (!skb) + return 0; + return igmpv3_sendpack(skb); +} + +/* + * remove zero-count source records from a source filter list + */ +static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) +{ + struct ip_sf_list *psf_prev, *psf_next, *psf; + + psf_prev = NULL; + for (psf=*ppsf; psf; psf = psf_next) { + psf_next = psf->sf_next; + if (psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *ppsf = psf->sf_next; + kfree(psf); + } else + psf_prev = psf; + } +} + +static void igmpv3_send_cr(struct in_device *in_dev) +{ + struct ip_mc_list *pmc, *pmc_prev, *pmc_next; + struct sk_buff *skb = NULL; + int type, dtype; + + read_lock(&in_dev->mc_list_lock); + spin_lock_bh(&in_dev->mc_tomb_lock); + + /* deleted MCA's */ + pmc_prev = NULL; + for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) { + pmc_next = pmc->next; + if (pmc->sfmode == MCAST_INCLUDE) { + type = IGMPV3_BLOCK_OLD_SOURCES; + dtype = IGMPV3_BLOCK_OLD_SOURCES; + skb = add_grec(skb, pmc, type, 1, 0); + skb = add_grec(skb, pmc, dtype, 1, 1); + } + if (pmc->crcount) { + pmc->crcount--; + if (pmc->sfmode == MCAST_EXCLUDE) { + type = IGMPV3_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 1, 0); + } + if (pmc->crcount == 0) { + igmpv3_clear_zeros(&pmc->tomb); + igmpv3_clear_zeros(&pmc->sources); + } + } + if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) { + if (pmc_prev) + pmc_prev->next = pmc_next; + else + in_dev->mc_tomb = pmc_next; + in_dev_put(pmc->interface); + kfree(pmc); + } else + pmc_prev = pmc; + } + spin_unlock_bh(&in_dev->mc_tomb_lock); + + /* change recs */ + for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { + spin_lock_bh(&pmc->lock); + if (pmc->sfcount[MCAST_EXCLUDE]) { + type = IGMPV3_BLOCK_OLD_SOURCES; + dtype = IGMPV3_ALLOW_NEW_SOURCES; + } else { + type = IGMPV3_ALLOW_NEW_SOURCES; + dtype = IGMPV3_BLOCK_OLD_SOURCES; + } + skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ + + /* filter mode changes */ + if (pmc->crcount) { + pmc->crcount--; + if (pmc->sfmode == MCAST_EXCLUDE) + type = IGMPV3_CHANGE_TO_EXCLUDE; + else + type = IGMPV3_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + } + spin_unlock_bh(&pmc->lock); + } + read_unlock(&in_dev->mc_list_lock); + + if (!skb) + return; + (void) igmpv3_sendpack(skb); +} + +static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, + int type) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct igmphdr *ih; + struct rtable *rt; + struct net_device *dev = in_dev->dev; + u32 group = pmc ? pmc->multiaddr : 0; + u32 dst; + + if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) + return igmpv3_send_report(in_dev, pmc); + else if (type == IGMP_HOST_LEAVE_MESSAGE) + dst = IGMP_ALL_ROUTER; + else + dst = group; + + { + struct flowi fl = { .oif = dev->ifindex, + .nl_u = { .ip4_u = { .daddr = dst } }, + .proto = IPPROTO_IGMP }; + if (ip_route_output_key(&rt, &fl)) + return -1; + } + if (rt->rt_src == 0) { + ip_rt_put(rt); + return -1; + } + + skb=alloc_skb(IGMP_SIZE+LL_RESERVED_SPACE(dev), GFP_ATOMIC); + if (skb == NULL) { + ip_rt_put(rt); + return -1; + } + + skb->dst = &rt->u.dst; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); + + iph->version = 4; + iph->ihl = (sizeof(struct iphdr)+4)>>2; + iph->tos = 0xc0; + iph->frag_off = htons(IP_DF); + iph->ttl = 1; + iph->daddr = dst; + iph->saddr = rt->rt_src; + iph->protocol = IPPROTO_IGMP; + iph->tot_len = htons(IGMP_SIZE); + ip_select_ident(iph, &rt->u.dst, NULL); + ((u8*)&iph[1])[0] = IPOPT_RA; + ((u8*)&iph[1])[1] = 4; + ((u8*)&iph[1])[2] = 0; + ((u8*)&iph[1])[3] = 0; + ip_send_check(iph); + + ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); + ih->type=type; + ih->code=0; + ih->csum=0; + ih->group=group; + ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr)); + + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + dst_output); +} + +static void igmp_gq_timer_expire(unsigned long data) +{ + struct in_device *in_dev = (struct in_device *)data; + + in_dev->mr_gq_running = 0; + igmpv3_send_report(in_dev, NULL); + __in_dev_put(in_dev); +} + +static void igmp_ifc_timer_expire(unsigned long data) +{ + struct in_device *in_dev = (struct in_device *)data; + + igmpv3_send_cr(in_dev); + if (in_dev->mr_ifc_count) { + in_dev->mr_ifc_count--; + igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval); + } + __in_dev_put(in_dev); +} + +static void igmp_ifc_event(struct in_device *in_dev) +{ + if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) + return; + in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + igmp_ifc_start_timer(in_dev, 1); +} + + +static void igmp_timer_expire(unsigned long data) +{ + struct ip_mc_list *im=(struct ip_mc_list *)data; + struct in_device *in_dev = im->interface; + + spin_lock(&im->lock); + im->tm_running=0; + + if (im->unsolicit_count) { + im->unsolicit_count--; + igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); + } + im->reporter = 1; + spin_unlock(&im->lock); + + if (IGMP_V1_SEEN(in_dev)) + igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT); + else if (IGMP_V2_SEEN(in_dev)) + igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT); + else + igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT); + + ip_ma_put(im); +} + +static void igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __u32 *srcs) +{ + struct ip_sf_list *psf; + int i, scount; + + scount = 0; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; i<nsrcs; i++) + if (srcs[i] == psf->sf_inaddr) { + psf->sf_gsresp = 1; + scount++; + break; + } + } +} + +static void igmp_heard_report(struct in_device *in_dev, u32 group) +{ + struct ip_mc_list *im; + + /* Timers are only set for non-local groups */ + + if (group == IGMP_ALL_HOSTS) + return; + + read_lock(&in_dev->mc_list_lock); + for (im=in_dev->mc_list; im!=NULL; im=im->next) { + if (im->multiaddr == group) { + igmp_stop_timer(im); + break; + } + } + read_unlock(&in_dev->mc_list_lock); +} + +static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, + int len) +{ + struct igmphdr *ih = skb->h.igmph; + struct igmpv3_query *ih3 = (struct igmpv3_query *)ih; + struct ip_mc_list *im; + u32 group = ih->group; + int max_delay; + int mark = 0; + + + if (len == 8) { + if (ih->code == 0) { + /* Alas, old v1 router presents here. */ + + max_delay = IGMP_Query_Response_Interval; + in_dev->mr_v1_seen = jiffies + + IGMP_V1_Router_Present_Timeout; + group = 0; + } else { + /* v2 router present */ + max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); + in_dev->mr_v2_seen = jiffies + + IGMP_V2_Router_Present_Timeout; + } + /* cancel the interface change timer */ + in_dev->mr_ifc_count = 0; + if (del_timer(&in_dev->mr_ifc_timer)) + __in_dev_put(in_dev); + /* clear deleted report items */ + igmpv3_clear_delrec(in_dev); + } else if (len < 12) { + return; /* ignore bogus packet; freed by caller */ + } else { /* v3 */ + if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) + return; + + ih3 = (struct igmpv3_query *) skb->h.raw; + if (ih3->nsrcs) { + if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) + + ntohs(ih3->nsrcs)*sizeof(__u32))) + return; + ih3 = (struct igmpv3_query *) skb->h.raw; + } + + max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); + if (!max_delay) + max_delay = 1; /* can't mod w/ 0 */ + in_dev->mr_maxdelay = max_delay; + if (ih3->qrv) + in_dev->mr_qrv = ih3->qrv; + if (!group) { /* general query */ + if (ih3->nsrcs) + return; /* no sources allowed */ + igmp_gq_start_timer(in_dev); + return; + } + /* mark sources to include, if group & source-specific */ + mark = ih3->nsrcs != 0; + } + + /* + * - Start the timers in all of our membership records + * that the query applies to for the interface on + * which the query arrived excl. those that belong + * to a "local" group (224.0.0.X) + * - For timers already running check if they need to + * be reset. + * - Use the igmp->igmp_code field as the maximum + * delay possible + */ + read_lock(&in_dev->mc_list_lock); + for (im=in_dev->mc_list; im!=NULL; im=im->next) { + if (group && group != im->multiaddr) + continue; + if (im->multiaddr == IGMP_ALL_HOSTS) + continue; + spin_lock_bh(&im->lock); + if (im->tm_running) + im->gsquery = im->gsquery && mark; + else + im->gsquery = mark; + if (im->gsquery) + igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs); + spin_unlock_bh(&im->lock); + igmp_mod_timer(im, max_delay); + } + read_unlock(&in_dev->mc_list_lock); +} + +int igmp_rcv(struct sk_buff *skb) +{ + /* This basically follows the spec line by line -- see RFC1112 */ + struct igmphdr *ih; + struct in_device *in_dev = in_dev_get(skb->dev); + int len = skb->len; + + if (in_dev==NULL) { + kfree_skb(skb); + return 0; + } + + if (!pskb_may_pull(skb, sizeof(struct igmphdr)) || + (u16)csum_fold(skb_checksum(skb, 0, len, 0))) { + in_dev_put(in_dev); + kfree_skb(skb); + return 0; + } + + ih = skb->h.igmph; + switch (ih->type) { + case IGMP_HOST_MEMBERSHIP_QUERY: + igmp_heard_query(in_dev, skb, len); + break; + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + case IGMPV3_HOST_MEMBERSHIP_REPORT: + /* Is it our report looped back? */ + if (((struct rtable*)skb->dst)->fl.iif == 0) + break; + igmp_heard_report(in_dev, ih->group); + break; + case IGMP_PIM: +#ifdef CONFIG_IP_PIMSM_V1 + in_dev_put(in_dev); + return pim_rcv_v1(skb); +#endif + case IGMP_DVMRP: + case IGMP_TRACE: + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_MTRACE: + case IGMP_MTRACE_RESP: + break; + default: + NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); + } + in_dev_put(in_dev); + kfree_skb(skb); + return 0; +} + +#endif + + +/* + * Add a filter to a device + */ + +static void ip_mc_filter_add(struct in_device *in_dev, u32 addr) +{ + char buf[MAX_ADDR_LEN]; + struct net_device *dev = in_dev->dev; + + /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. + We will get multicast token leakage, when IFF_MULTICAST + is changed. This check should be done in dev->set_multicast_list + routine. Something sort of: + if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } + --ANK + */ + if (arp_mc_map(addr, buf, dev, 0) == 0) + dev_mc_add(dev,buf,dev->addr_len,0); +} + +/* + * Remove a filter from a device + */ + +static void ip_mc_filter_del(struct in_device *in_dev, u32 addr) +{ + char buf[MAX_ADDR_LEN]; + struct net_device *dev = in_dev->dev; + + if (arp_mc_map(addr, buf, dev, 0) == 0) + dev_mc_delete(dev,buf,dev->addr_len,0); +} + +#ifdef CONFIG_IP_MULTICAST +/* + * deleted ip_mc_list manipulation + */ +static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) +{ + struct ip_mc_list *pmc; + + /* this is an "ip_mc_list" for convenience; only the fields below + * are actually used. In particular, the refcnt and users are not + * used for management of the delete list. Using the same structure + * for deleted items allows change reports to use common code with + * non-deleted or query-response MCA's. + */ + pmc = (struct ip_mc_list *)kmalloc(sizeof(*pmc), GFP_KERNEL); + if (!pmc) + return; + memset(pmc, 0, sizeof(*pmc)); + spin_lock_bh(&im->lock); + pmc->interface = im->interface; + in_dev_hold(in_dev); + pmc->multiaddr = im->multiaddr; + pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + pmc->sfmode = im->sfmode; + if (pmc->sfmode == MCAST_INCLUDE) { + struct ip_sf_list *psf; + + pmc->tomb = im->tomb; + pmc->sources = im->sources; + im->tomb = im->sources = NULL; + for (psf=pmc->sources; psf; psf=psf->sf_next) + psf->sf_crcount = pmc->crcount; + } + spin_unlock_bh(&im->lock); + + spin_lock_bh(&in_dev->mc_tomb_lock); + pmc->next = in_dev->mc_tomb; + in_dev->mc_tomb = pmc; + spin_unlock_bh(&in_dev->mc_tomb_lock); +} + +static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr) +{ + struct ip_mc_list *pmc, *pmc_prev; + struct ip_sf_list *psf, *psf_next; + + spin_lock_bh(&in_dev->mc_tomb_lock); + pmc_prev = NULL; + for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) { + if (pmc->multiaddr == multiaddr) + break; + pmc_prev = pmc; + } + if (pmc) { + if (pmc_prev) + pmc_prev->next = pmc->next; + else + in_dev->mc_tomb = pmc->next; + } + spin_unlock_bh(&in_dev->mc_tomb_lock); + if (pmc) { + for (psf=pmc->tomb; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + in_dev_put(pmc->interface); + kfree(pmc); + } +} + +static void igmpv3_clear_delrec(struct in_device *in_dev) +{ + struct ip_mc_list *pmc, *nextpmc; + + spin_lock_bh(&in_dev->mc_tomb_lock); + pmc = in_dev->mc_tomb; + in_dev->mc_tomb = NULL; + spin_unlock_bh(&in_dev->mc_tomb_lock); + + for (; pmc; pmc = nextpmc) { + nextpmc = pmc->next; + ip_mc_clear_src(pmc); + in_dev_put(pmc->interface); + kfree(pmc); + } + /* clear dead sources, too */ + read_lock(&in_dev->mc_list_lock); + for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { + struct ip_sf_list *psf, *psf_next; + + spin_lock_bh(&pmc->lock); + psf = pmc->tomb; + pmc->tomb = NULL; + spin_unlock_bh(&pmc->lock); + for (; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + } + read_unlock(&in_dev->mc_list_lock); +} +#endif + +static void igmp_group_dropped(struct ip_mc_list *im) +{ + struct in_device *in_dev = im->interface; +#ifdef CONFIG_IP_MULTICAST + int reporter; +#endif + + if (im->loaded) { + im->loaded = 0; + ip_mc_filter_del(in_dev, im->multiaddr); + } + +#ifdef CONFIG_IP_MULTICAST + if (im->multiaddr == IGMP_ALL_HOSTS) + return; + + reporter = im->reporter; + igmp_stop_timer(im); + + if (!in_dev->dead) { + if (IGMP_V1_SEEN(in_dev)) + goto done; + if (IGMP_V2_SEEN(in_dev)) { + if (reporter) + igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); + goto done; + } + /* IGMPv3 */ + igmpv3_add_delrec(in_dev, im); + + igmp_ifc_event(in_dev); + } +done: +#endif + ip_mc_clear_src(im); +} + +static void igmp_group_added(struct ip_mc_list *im) +{ + struct in_device *in_dev = im->interface; + + if (im->loaded == 0) { + im->loaded = 1; + ip_mc_filter_add(in_dev, im->multiaddr); + } + +#ifdef CONFIG_IP_MULTICAST + if (im->multiaddr == IGMP_ALL_HOSTS) + return; + + if (in_dev->dead) + return; + if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { + spin_lock_bh(&im->lock); + igmp_start_timer(im, IGMP_Initial_Report_Delay); + spin_unlock_bh(&im->lock); + return; + } + /* else, v3 */ + + im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + igmp_ifc_event(in_dev); +#endif +} + + +/* + * Multicast list managers + */ + + +/* + * A socket has joined a multicast group on device dev. + */ + +void ip_mc_inc_group(struct in_device *in_dev, u32 addr) +{ + struct ip_mc_list *im; + + ASSERT_RTNL(); + + for (im=in_dev->mc_list; im; im=im->next) { + if (im->multiaddr == addr) { + im->users++; + ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); + goto out; + } + } + + im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL); + if (!im) + goto out; + + im->users=1; + im->interface=in_dev; + in_dev_hold(in_dev); + im->multiaddr=addr; + /* initial mode is (EX, empty) */ + im->sfmode = MCAST_EXCLUDE; + im->sfcount[MCAST_INCLUDE] = 0; + im->sfcount[MCAST_EXCLUDE] = 1; + im->sources = NULL; + im->tomb = NULL; + im->crcount = 0; + atomic_set(&im->refcnt, 1); + spin_lock_init(&im->lock); +#ifdef CONFIG_IP_MULTICAST + im->tm_running=0; + init_timer(&im->timer); + im->timer.data=(unsigned long)im; + im->timer.function=&igmp_timer_expire; + im->unsolicit_count = IGMP_Unsolicited_Report_Count; + im->reporter = 0; + im->gsquery = 0; +#endif + im->loaded = 0; + write_lock_bh(&in_dev->mc_list_lock); + im->next=in_dev->mc_list; + in_dev->mc_list=im; + write_unlock_bh(&in_dev->mc_list_lock); +#ifdef CONFIG_IP_MULTICAST + igmpv3_del_delrec(in_dev, im->multiaddr); +#endif + igmp_group_added(im); + if (!in_dev->dead) + ip_rt_multicast_event(in_dev); +out: + return; +} + +/* + * A socket has left a multicast group on device dev + */ + +void ip_mc_dec_group(struct in_device *in_dev, u32 addr) +{ + struct ip_mc_list *i, **ip; + + ASSERT_RTNL(); + + for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { + if (i->multiaddr==addr) { + if (--i->users == 0) { + write_lock_bh(&in_dev->mc_list_lock); + *ip = i->next; + write_unlock_bh(&in_dev->mc_list_lock); + igmp_group_dropped(i); + + if (!in_dev->dead) + ip_rt_multicast_event(in_dev); + + ip_ma_put(i); + return; + } + break; + } + } +} + +/* Device going down */ + +void ip_mc_down(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + ASSERT_RTNL(); + + for (i=in_dev->mc_list; i; i=i->next) + igmp_group_dropped(i); + +#ifdef CONFIG_IP_MULTICAST + in_dev->mr_ifc_count = 0; + if (del_timer(&in_dev->mr_ifc_timer)) + __in_dev_put(in_dev); + in_dev->mr_gq_running = 0; + if (del_timer(&in_dev->mr_gq_timer)) + __in_dev_put(in_dev); + igmpv3_clear_delrec(in_dev); +#endif + + ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); +} + +void ip_mc_init_dev(struct in_device *in_dev) +{ + ASSERT_RTNL(); + + in_dev->mc_tomb = NULL; +#ifdef CONFIG_IP_MULTICAST + in_dev->mr_gq_running = 0; + init_timer(&in_dev->mr_gq_timer); + in_dev->mr_gq_timer.data=(unsigned long) in_dev; + in_dev->mr_gq_timer.function=&igmp_gq_timer_expire; + in_dev->mr_ifc_count = 0; + init_timer(&in_dev->mr_ifc_timer); + in_dev->mr_ifc_timer.data=(unsigned long) in_dev; + in_dev->mr_ifc_timer.function=&igmp_ifc_timer_expire; + in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; +#endif + + rwlock_init(&in_dev->mc_list_lock); + spin_lock_init(&in_dev->mc_tomb_lock); +} + +/* Device going up */ + +void ip_mc_up(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + ASSERT_RTNL(); + + ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); + + for (i=in_dev->mc_list; i; i=i->next) + igmp_group_added(i); +} + +/* + * Device is about to be destroyed: clean up. + */ + +void ip_mc_destroy_dev(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + ASSERT_RTNL(); + + /* Deactivate timers */ + ip_mc_down(in_dev); + + write_lock_bh(&in_dev->mc_list_lock); + while ((i = in_dev->mc_list) != NULL) { + in_dev->mc_list = i->next; + write_unlock_bh(&in_dev->mc_list_lock); + + igmp_group_dropped(i); + ip_ma_put(i); + + write_lock_bh(&in_dev->mc_list_lock); + } + write_unlock_bh(&in_dev->mc_list_lock); +} + +static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) +{ + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = imr->imr_multiaddr.s_addr } } }; + struct rtable *rt; + struct net_device *dev = NULL; + struct in_device *idev = NULL; + + if (imr->imr_ifindex) { + idev = inetdev_by_index(imr->imr_ifindex); + if (idev) + __in_dev_put(idev); + return idev; + } + if (imr->imr_address.s_addr) { + dev = ip_dev_find(imr->imr_address.s_addr); + if (!dev) + return NULL; + __dev_put(dev); + } + + if (!dev && !ip_route_output_key(&rt, &fl)) { + dev = rt->u.dst.dev; + ip_rt_put(rt); + } + if (dev) { + imr->imr_ifindex = dev->ifindex; + idev = __in_dev_get(dev); + } + return idev; +} + +/* + * Join a socket to a group + */ +int sysctl_igmp_max_memberships = IP_MAX_MEMBERSHIPS; +int sysctl_igmp_max_msf = IP_MAX_MSF; + + +static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, + __u32 *psfsrc) +{ + struct ip_sf_list *psf, *psf_prev; + int rv = 0; + + psf_prev = NULL; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (psf->sf_inaddr == *psfsrc) + break; + psf_prev = psf; + } + if (!psf || psf->sf_count[sfmode] == 0) { + /* source filter not found, or count wrong => bug */ + return -ESRCH; + } + psf->sf_count[sfmode]--; + if (psf->sf_count[sfmode] == 0) { + ip_rt_multicast_event(pmc->interface); + } + if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { +#ifdef CONFIG_IP_MULTICAST + struct in_device *in_dev = pmc->interface; +#endif + + /* no more filters for this source */ + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + pmc->sources = psf->sf_next; +#ifdef CONFIG_IP_MULTICAST + if (psf->sf_oldin && + !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { + psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + psf->sf_next = pmc->tomb; + pmc->tomb = psf; + rv = 1; + } else +#endif + kfree(psf); + } + return rv; +} + +#ifndef CONFIG_IP_MULTICAST +#define igmp_ifc_event(x) do { } while (0) +#endif + +static int ip_mc_del_src(struct in_device *in_dev, __u32 *pmca, int sfmode, + int sfcount, __u32 *psfsrc, int delta) +{ + struct ip_mc_list *pmc; + int changerec = 0; + int i, err; + + if (!in_dev) + return -ENODEV; + read_lock(&in_dev->mc_list_lock); + for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { + if (*pmca == pmc->multiaddr) + break; + } + if (!pmc) { + /* MCA not found?? bug */ + read_unlock(&in_dev->mc_list_lock); + return -ESRCH; + } + spin_lock_bh(&pmc->lock); + read_unlock(&in_dev->mc_list_lock); +#ifdef CONFIG_IP_MULTICAST + sf_markstate(pmc); +#endif + if (!delta) { + err = -EINVAL; + if (!pmc->sfcount[sfmode]) + goto out_unlock; + pmc->sfcount[sfmode]--; + } + err = 0; + for (i=0; i<sfcount; i++) { + int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); + + changerec |= rv > 0; + if (!err && rv < 0) + err = rv; + } + if (pmc->sfmode == MCAST_EXCLUDE && + pmc->sfcount[MCAST_EXCLUDE] == 0 && + pmc->sfcount[MCAST_INCLUDE]) { +#ifdef CONFIG_IP_MULTICAST + struct ip_sf_list *psf; +#endif + + /* filter mode change */ + pmc->sfmode = MCAST_INCLUDE; +#ifdef CONFIG_IP_MULTICAST + pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + in_dev->mr_ifc_count = pmc->crcount; + for (psf=pmc->sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + igmp_ifc_event(pmc->interface); + } else if (sf_setstate(pmc) || changerec) { + igmp_ifc_event(pmc->interface); +#endif + } +out_unlock: + spin_unlock_bh(&pmc->lock); + return err; +} + +/* + * Add multicast single-source filter to the interface list + */ +static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, + __u32 *psfsrc, int delta) +{ + struct ip_sf_list *psf, *psf_prev; + + psf_prev = NULL; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (psf->sf_inaddr == *psfsrc) + break; + psf_prev = psf; + } + if (!psf) { + psf = (struct ip_sf_list *)kmalloc(sizeof(*psf), GFP_ATOMIC); + if (!psf) + return -ENOBUFS; + memset(psf, 0, sizeof(*psf)); + psf->sf_inaddr = *psfsrc; + if (psf_prev) { + psf_prev->sf_next = psf; + } else + pmc->sources = psf; + } + psf->sf_count[sfmode]++; + if (psf->sf_count[sfmode] == 1) { + ip_rt_multicast_event(pmc->interface); + } + return 0; +} + +#ifdef CONFIG_IP_MULTICAST +static void sf_markstate(struct ip_mc_list *pmc) +{ + struct ip_sf_list *psf; + int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; + + for (psf=pmc->sources; psf; psf=psf->sf_next) + if (pmc->sfcount[MCAST_EXCLUDE]) { + psf->sf_oldin = mca_xcount == + psf->sf_count[MCAST_EXCLUDE] && + !psf->sf_count[MCAST_INCLUDE]; + } else + psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; +} + +static int sf_setstate(struct ip_mc_list *pmc) +{ + struct ip_sf_list *psf; + int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; + int qrv = pmc->interface->mr_qrv; + int new_in, rv; + + rv = 0; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (pmc->sfcount[MCAST_EXCLUDE]) { + new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && + !psf->sf_count[MCAST_INCLUDE]; + } else + new_in = psf->sf_count[MCAST_INCLUDE] != 0; + if (new_in != psf->sf_oldin) { + psf->sf_crcount = qrv; + rv++; + } + } + return rv; +} +#endif + +/* + * Add multicast source filter list to the interface list + */ +static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode, + int sfcount, __u32 *psfsrc, int delta) +{ + struct ip_mc_list *pmc; + int isexclude; + int i, err; + + if (!in_dev) + return -ENODEV; + read_lock(&in_dev->mc_list_lock); + for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { + if (*pmca == pmc->multiaddr) + break; + } + if (!pmc) { + /* MCA not found?? bug */ + read_unlock(&in_dev->mc_list_lock); + return -ESRCH; + } + spin_lock_bh(&pmc->lock); + read_unlock(&in_dev->mc_list_lock); + +#ifdef CONFIG_IP_MULTICAST + sf_markstate(pmc); +#endif + isexclude = pmc->sfmode == MCAST_EXCLUDE; + if (!delta) + pmc->sfcount[sfmode]++; + err = 0; + for (i=0; i<sfcount; i++) { + err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta); + if (err) + break; + } + if (err) { + int j; + + pmc->sfcount[sfmode]--; + for (j=0; j<i; j++) + (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); + } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { +#ifdef CONFIG_IP_MULTICAST + struct in_device *in_dev = pmc->interface; + struct ip_sf_list *psf; +#endif + + /* filter mode change */ + if (pmc->sfcount[MCAST_EXCLUDE]) + pmc->sfmode = MCAST_EXCLUDE; + else if (pmc->sfcount[MCAST_INCLUDE]) + pmc->sfmode = MCAST_INCLUDE; +#ifdef CONFIG_IP_MULTICAST + /* else no filters; keep old mode for reports */ + + pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + in_dev->mr_ifc_count = pmc->crcount; + for (psf=pmc->sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + igmp_ifc_event(in_dev); + } else if (sf_setstate(pmc)) { + igmp_ifc_event(in_dev); +#endif + } + spin_unlock_bh(&pmc->lock); + return err; +} + +static void ip_mc_clear_src(struct ip_mc_list *pmc) +{ + struct ip_sf_list *psf, *nextpsf; + + for (psf=pmc->tomb; psf; psf=nextpsf) { + nextpsf = psf->sf_next; + kfree(psf); + } + pmc->tomb = NULL; + for (psf=pmc->sources; psf; psf=nextpsf) { + nextpsf = psf->sf_next; + kfree(psf); + } + pmc->sources = NULL; + pmc->sfmode = MCAST_EXCLUDE; + pmc->sfcount[MCAST_EXCLUDE] = 0; + pmc->sfcount[MCAST_EXCLUDE] = 1; +} + + +/* + * Join a multicast group + */ +int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) +{ + int err; + u32 addr = imr->imr_multiaddr.s_addr; + struct ip_mc_socklist *iml, *i; + struct in_device *in_dev; + struct inet_sock *inet = inet_sk(sk); + int count = 0; + + if (!MULTICAST(addr)) + return -EINVAL; + + rtnl_shlock(); + + in_dev = ip_mc_find_dev(imr); + + if (!in_dev) { + iml = NULL; + err = -ENODEV; + goto done; + } + + iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); + + err = -EADDRINUSE; + for (i = inet->mc_list; i; i = i->next) { + if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { + /* New style additions are reference counted */ + if (imr->imr_address.s_addr == 0) { + i->count++; + err = 0; + } + goto done; + } + count++; + } + err = -ENOBUFS; + if (iml == NULL || count >= sysctl_igmp_max_memberships) + goto done; + memcpy(&iml->multi, imr, sizeof(*imr)); + iml->next = inet->mc_list; + iml->count = 1; + iml->sflist = NULL; + iml->sfmode = MCAST_EXCLUDE; + inet->mc_list = iml; + ip_mc_inc_group(in_dev, addr); + iml = NULL; + err = 0; + +done: + rtnl_shunlock(); + if (iml) + sock_kfree_s(sk, iml, sizeof(*iml)); + return err; +} + +static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, + struct in_device *in_dev) +{ + int err; + + if (iml->sflist == 0) { + /* any-source empty exclude case */ + return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, + iml->sfmode, 0, NULL, 0); + } + err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, + iml->sfmode, iml->sflist->sl_count, + iml->sflist->sl_addr, 0); + sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max)); + iml->sflist = NULL; + return err; +} + +/* + * Ask a socket to leave a group. + */ + +int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_mc_socklist *iml, **imlp; + + rtnl_lock(); + for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { + if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && + iml->multi.imr_address.s_addr==imr->imr_address.s_addr && + (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) { + struct in_device *in_dev; + + in_dev = inetdev_by_index(iml->multi.imr_ifindex); + if (in_dev) + (void) ip_mc_leave_src(sk, iml, in_dev); + if (--iml->count) { + rtnl_unlock(); + if (in_dev) + in_dev_put(in_dev); + return 0; + } + + *imlp = iml->next; + + if (in_dev) { + ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr); + in_dev_put(in_dev); + } + rtnl_unlock(); + sock_kfree_s(sk, iml, sizeof(*iml)); + return 0; + } + } + rtnl_unlock(); + return -EADDRNOTAVAIL; +} + +int ip_mc_source(int add, int omode, struct sock *sk, struct + ip_mreq_source *mreqs, int ifindex) +{ + int err; + struct ip_mreqn imr; + u32 addr = mreqs->imr_multiaddr; + struct ip_mc_socklist *pmc; + struct in_device *in_dev = NULL; + struct inet_sock *inet = inet_sk(sk); + struct ip_sf_socklist *psl; + int i, j, rv; + + if (!MULTICAST(addr)) + return -EINVAL; + + rtnl_shlock(); + + imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; + imr.imr_address.s_addr = mreqs->imr_interface; + imr.imr_ifindex = ifindex; + in_dev = ip_mc_find_dev(&imr); + + if (!in_dev) { + err = -ENODEV; + goto done; + } + err = -EADDRNOTAVAIL; + + for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + /* if a source filter was set, must be the same mode as before */ + if (pmc->sflist) { + if (pmc->sfmode != omode) + goto done; + } else if (pmc->sfmode != omode) { + /* allow mode switches for empty-set filters */ + ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); + ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0, + NULL, 0); + pmc->sfmode = omode; + } + + psl = pmc->sflist; + if (!add) { + if (!psl) + goto done; + rv = !0; + for (i=0; i<psl->sl_count; i++) { + rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, + sizeof(__u32)); + if (rv == 0) + break; + } + if (rv) /* source not found */ + goto done; + + /* update the interface filter */ + ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, + &mreqs->imr_sourceaddr, 1); + + for (j=i+1; j<psl->sl_count; j++) + psl->sl_addr[j-1] = psl->sl_addr[j]; + psl->sl_count--; + err = 0; + goto done; + } + /* else, add a new source to the filter */ + + if (psl && psl->sl_count >= sysctl_igmp_max_msf) { + err = -ENOBUFS; + goto done; + } + if (!psl || psl->sl_count == psl->sl_max) { + struct ip_sf_socklist *newpsl; + int count = IP_SFBLOCK; + + if (psl) + count += psl->sl_max; + newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, + IP_SFLSIZE(count), GFP_KERNEL); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = count; + newpsl->sl_count = count - IP_SFBLOCK; + if (psl) { + for (i=0; i<psl->sl_count; i++) + newpsl->sl_addr[i] = psl->sl_addr[i]; + sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + } + pmc->sflist = psl = newpsl; + } + rv = 1; /* > 0 for insert logic below if sl_count is 0 */ + for (i=0; i<psl->sl_count; i++) { + rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, + sizeof(__u32)); + if (rv == 0) + break; + } + if (rv == 0) /* address already there is an error */ + goto done; + for (j=psl->sl_count-1; j>=i; j--) + psl->sl_addr[j+1] = psl->sl_addr[j]; + psl->sl_addr[i] = mreqs->imr_sourceaddr; + psl->sl_count++; + err = 0; + /* update the interface list */ + ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, + &mreqs->imr_sourceaddr, 1); +done: + rtnl_shunlock(); + return err; +} + +int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) +{ + int err; + struct ip_mreqn imr; + u32 addr = msf->imsf_multiaddr; + struct ip_mc_socklist *pmc; + struct in_device *in_dev; + struct inet_sock *inet = inet_sk(sk); + struct ip_sf_socklist *newpsl, *psl; + + if (!MULTICAST(addr)) + return -EINVAL; + if (msf->imsf_fmode != MCAST_INCLUDE && + msf->imsf_fmode != MCAST_EXCLUDE) + return -EINVAL; + + rtnl_shlock(); + + imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; + imr.imr_address.s_addr = msf->imsf_interface; + imr.imr_ifindex = ifindex; + in_dev = ip_mc_find_dev(&imr); + + if (!in_dev) { + err = -ENODEV; + goto done; + } + err = -EADDRNOTAVAIL; + + for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && + pmc->multi.imr_ifindex == imr.imr_ifindex) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + if (msf->imsf_numsrc) { + newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, + IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc; + memcpy(newpsl->sl_addr, msf->imsf_slist, + msf->imsf_numsrc * sizeof(msf->imsf_slist[0])); + err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr, + msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); + if (err) { + sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max)); + goto done; + } + } else + newpsl = NULL; + psl = pmc->sflist; + if (psl) { + (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, + psl->sl_count, psl->sl_addr, 0); + sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + } else + (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, + 0, NULL, 0); + pmc->sflist = newpsl; + pmc->sfmode = msf->imsf_fmode; +done: + rtnl_shunlock(); + return err; +} + +int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, + struct ip_msfilter __user *optval, int __user *optlen) +{ + int err, len, count, copycount; + struct ip_mreqn imr; + u32 addr = msf->imsf_multiaddr; + struct ip_mc_socklist *pmc; + struct in_device *in_dev; + struct inet_sock *inet = inet_sk(sk); + struct ip_sf_socklist *psl; + + if (!MULTICAST(addr)) + return -EINVAL; + + rtnl_shlock(); + + imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; + imr.imr_address.s_addr = msf->imsf_interface; + imr.imr_ifindex = 0; + in_dev = ip_mc_find_dev(&imr); + + if (!in_dev) { + err = -ENODEV; + goto done; + } + err = -EADDRNOTAVAIL; + + for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && + pmc->multi.imr_ifindex == imr.imr_ifindex) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + msf->imsf_fmode = pmc->sfmode; + psl = pmc->sflist; + rtnl_shunlock(); + if (!psl) { + len = 0; + count = 0; + } else { + count = psl->sl_count; + } + copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc; + len = copycount * sizeof(psl->sl_addr[0]); + msf->imsf_numsrc = count; + if (put_user(IP_MSFILTER_SIZE(copycount), optlen) || + copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) { + return -EFAULT; + } + if (len && + copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len)) + return -EFAULT; + return 0; +done: + rtnl_shunlock(); + return err; +} + +int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, + struct group_filter __user *optval, int __user *optlen) +{ + int err, i, count, copycount; + struct sockaddr_in *psin; + u32 addr; + struct ip_mc_socklist *pmc; + struct inet_sock *inet = inet_sk(sk); + struct ip_sf_socklist *psl; + + psin = (struct sockaddr_in *)&gsf->gf_group; + if (psin->sin_family != AF_INET) + return -EINVAL; + addr = psin->sin_addr.s_addr; + if (!MULTICAST(addr)) + return -EINVAL; + + rtnl_shlock(); + + err = -EADDRNOTAVAIL; + + for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + if (pmc->multi.imr_multiaddr.s_addr == addr && + pmc->multi.imr_ifindex == gsf->gf_interface) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + gsf->gf_fmode = pmc->sfmode; + psl = pmc->sflist; + rtnl_shunlock(); + count = psl ? psl->sl_count : 0; + copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; + gsf->gf_numsrc = count; + if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || + copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { + return -EFAULT; + } + for (i=0; i<copycount; i++) { + struct sockaddr_in *psin; + struct sockaddr_storage ss; + + psin = (struct sockaddr_in *)&ss; + memset(&ss, 0, sizeof(ss)); + psin->sin_family = AF_INET; + psin->sin_addr.s_addr = psl->sl_addr[i]; + if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + return -EFAULT; + } + return 0; +done: + rtnl_shunlock(); + return err; +} + +/* + * check if a multicast source filter allows delivery for a given <src,dst,intf> + */ +int ip_mc_sf_allow(struct sock *sk, u32 loc_addr, u32 rmt_addr, int dif) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_mc_socklist *pmc; + struct ip_sf_socklist *psl; + int i; + + if (!MULTICAST(loc_addr)) + return 1; + + for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + if (pmc->multi.imr_multiaddr.s_addr == loc_addr && + pmc->multi.imr_ifindex == dif) + break; + } + if (!pmc) + return 1; + psl = pmc->sflist; + if (!psl) + return pmc->sfmode == MCAST_EXCLUDE; + + for (i=0; i<psl->sl_count; i++) { + if (psl->sl_addr[i] == rmt_addr) + break; + } + if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) + return 0; + if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) + return 0; + return 1; +} + +/* + * A socket is closing. + */ + +void ip_mc_drop_socket(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_mc_socklist *iml; + + if (inet->mc_list == NULL) + return; + + rtnl_lock(); + while ((iml = inet->mc_list) != NULL) { + struct in_device *in_dev; + inet->mc_list = iml->next; + + if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) { + (void) ip_mc_leave_src(sk, iml, in_dev); + ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); + in_dev_put(in_dev); + } + sock_kfree_s(sk, iml, sizeof(*iml)); + + } + rtnl_unlock(); +} + +int ip_check_mc(struct in_device *in_dev, u32 mc_addr, u32 src_addr, u16 proto) +{ + struct ip_mc_list *im; + struct ip_sf_list *psf; + int rv = 0; + + read_lock(&in_dev->mc_list_lock); + for (im=in_dev->mc_list; im; im=im->next) { + if (im->multiaddr == mc_addr) + break; + } + if (im && proto == IPPROTO_IGMP) { + rv = 1; + } else if (im) { + if (src_addr) { + for (psf=im->sources; psf; psf=psf->sf_next) { + if (psf->sf_inaddr == src_addr) + break; + } + if (psf) + rv = psf->sf_count[MCAST_INCLUDE] || + psf->sf_count[MCAST_EXCLUDE] != + im->sfcount[MCAST_EXCLUDE]; + else + rv = im->sfcount[MCAST_EXCLUDE] != 0; + } else + rv = 1; /* unspecified source; tentatively allow */ + } + read_unlock(&in_dev->mc_list_lock); + return rv; +} + +#if defined(CONFIG_PROC_FS) +struct igmp_mc_iter_state { + struct net_device *dev; + struct in_device *in_dev; +}; + +#define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private) + +static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) +{ + struct ip_mc_list *im = NULL; + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + + for (state->dev = dev_base, state->in_dev = NULL; + state->dev; + state->dev = state->dev->next) { + struct in_device *in_dev; + in_dev = in_dev_get(state->dev); + if (!in_dev) + continue; + read_lock(&in_dev->mc_list_lock); + im = in_dev->mc_list; + if (im) { + state->in_dev = in_dev; + break; + } + read_unlock(&in_dev->mc_list_lock); + in_dev_put(in_dev); + } + return im; +} + +static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) +{ + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + im = im->next; + while (!im) { + if (likely(state->in_dev != NULL)) { + read_unlock(&state->in_dev->mc_list_lock); + in_dev_put(state->in_dev); + } + state->dev = state->dev->next; + if (!state->dev) { + state->in_dev = NULL; + break; + } + state->in_dev = in_dev_get(state->dev); + if (!state->in_dev) + continue; + read_lock(&state->in_dev->mc_list_lock); + im = state->in_dev->mc_list; + } + return im; +} + +static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ip_mc_list *im = igmp_mc_get_first(seq); + if (im) + while (pos && (im = igmp_mc_get_next(seq, im)) != NULL) + --pos; + return pos ? NULL : im; +} + +static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&dev_base_lock); + return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_mc_list *im; + if (v == SEQ_START_TOKEN) + im = igmp_mc_get_first(seq); + else + im = igmp_mc_get_next(seq, v); + ++*pos; + return im; +} + +static void igmp_mc_seq_stop(struct seq_file *seq, void *v) +{ + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + if (likely(state->in_dev != NULL)) { + read_unlock(&state->in_dev->mc_list_lock); + in_dev_put(state->in_dev); + state->in_dev = NULL; + } + state->dev = NULL; + read_unlock(&dev_base_lock); +} + +static int igmp_mc_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); + else { + struct ip_mc_list *im = (struct ip_mc_list *)v; + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + char *querier; +#ifdef CONFIG_IP_MULTICAST + querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : + IGMP_V2_SEEN(state->in_dev) ? "V2" : + "V3"; +#else + querier = "NONE"; +#endif + + if (state->in_dev->mc_list == im) { + seq_printf(seq, "%d\t%-10s: %5d %7s\n", + state->dev->ifindex, state->dev->name, state->dev->mc_count, querier); + } + + seq_printf(seq, + "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", + im->multiaddr, im->users, + im->tm_running, im->tm_running ? + jiffies_to_clock_t(im->timer.expires-jiffies) : 0, + im->reporter); + } + return 0; +} + +static struct seq_operations igmp_mc_seq_ops = { + .start = igmp_mc_seq_start, + .next = igmp_mc_seq_next, + .stop = igmp_mc_seq_stop, + .show = igmp_mc_seq_show, +}; + +static int igmp_mc_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct igmp_mc_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + rc = seq_open(file, &igmp_mc_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations igmp_mc_seq_fops = { + .owner = THIS_MODULE, + .open = igmp_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +struct igmp_mcf_iter_state { + struct net_device *dev; + struct in_device *idev; + struct ip_mc_list *im; +}; + +#define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private) + +static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) +{ + struct ip_sf_list *psf = NULL; + struct ip_mc_list *im = NULL; + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + + for (state->dev = dev_base, state->idev = NULL, state->im = NULL; + state->dev; + state->dev = state->dev->next) { + struct in_device *idev; + idev = in_dev_get(state->dev); + if (unlikely(idev == NULL)) + continue; + read_lock(&idev->mc_list_lock); + im = idev->mc_list; + if (likely(im != NULL)) { + spin_lock_bh(&im->lock); + psf = im->sources; + if (likely(psf != NULL)) { + state->im = im; + state->idev = idev; + break; + } + spin_unlock_bh(&im->lock); + } + read_unlock(&idev->mc_list_lock); + in_dev_put(idev); + } + return psf; +} + +static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf) +{ + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + + psf = psf->sf_next; + while (!psf) { + spin_unlock_bh(&state->im->lock); + state->im = state->im->next; + while (!state->im) { + if (likely(state->idev != NULL)) { + read_unlock(&state->idev->mc_list_lock); + in_dev_put(state->idev); + } + state->dev = state->dev->next; + if (!state->dev) { + state->idev = NULL; + goto out; + } + state->idev = in_dev_get(state->dev); + if (!state->idev) + continue; + read_lock(&state->idev->mc_list_lock); + state->im = state->idev->mc_list; + } + if (!state->im) + break; + spin_lock_bh(&state->im->lock); + psf = state->im->sources; + } +out: + return psf; +} + +static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ip_sf_list *psf = igmp_mcf_get_first(seq); + if (psf) + while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL) + --pos; + return pos ? NULL : psf; +} + +static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&dev_base_lock); + return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_sf_list *psf; + if (v == SEQ_START_TOKEN) + psf = igmp_mcf_get_first(seq); + else + psf = igmp_mcf_get_next(seq, v); + ++*pos; + return psf; +} + +static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) +{ + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + if (likely(state->im != NULL)) { + spin_unlock_bh(&state->im->lock); + state->im = NULL; + } + if (likely(state->idev != NULL)) { + read_unlock(&state->idev->mc_list_lock); + in_dev_put(state->idev); + state->idev = NULL; + } + state->dev = NULL; + read_unlock(&dev_base_lock); +} + +static int igmp_mcf_seq_show(struct seq_file *seq, void *v) +{ + struct ip_sf_list *psf = (struct ip_sf_list *)v; + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "%3s %6s " + "%10s %10s %6s %6s\n", "Idx", + "Device", "MCA", + "SRC", "INC", "EXC"); + } else { + seq_printf(seq, + "%3d %6.6s 0x%08x " + "0x%08x %6lu %6lu\n", + state->dev->ifindex, state->dev->name, + ntohl(state->im->multiaddr), + ntohl(psf->sf_inaddr), + psf->sf_count[MCAST_INCLUDE], + psf->sf_count[MCAST_EXCLUDE]); + } + return 0; +} + +static struct seq_operations igmp_mcf_seq_ops = { + .start = igmp_mcf_seq_start, + .next = igmp_mcf_seq_next, + .stop = igmp_mcf_seq_stop, + .show = igmp_mcf_seq_show, +}; + +static int igmp_mcf_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct igmp_mcf_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + rc = seq_open(file, &igmp_mcf_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations igmp_mcf_seq_fops = { + .owner = THIS_MODULE, + .open = igmp_mcf_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init igmp_mc_proc_init(void) +{ + proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops); + proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops); + return 0; +} +#endif + +EXPORT_SYMBOL(ip_mc_dec_group); +EXPORT_SYMBOL(ip_mc_inc_group); +EXPORT_SYMBOL(ip_mc_join_group); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c new file mode 100644 index 000000000000..95473953c406 --- /dev/null +++ b/net/ipv4/inetpeer.c @@ -0,0 +1,460 @@ +/* + * INETPEER - A storage for permanent information about peers + * + * This source is covered by the GNU GPL, the same as all kernel sources. + * + * Version: $Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $ + * + * Authors: Andrey V. Savochkin <saw@msu.ru> + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/random.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/net.h> +#include <net/inetpeer.h> + +/* + * Theory of operations. + * We keep one entry for each peer IP address. The nodes contains long-living + * information about the peer which doesn't depend on routes. + * At this moment this information consists only of ID field for the next + * outgoing IP packet. This field is incremented with each packet as encoded + * in inet_getid() function (include/net/inetpeer.h). + * At the moment of writing this notes identifier of IP packets is generated + * to be unpredictable using this code only for packets subjected + * (actually or potentially) to defragmentation. I.e. DF packets less than + * PMTU in size uses a constant ID and do not use this code (see + * ip_select_ident() in include/net/ip.h). + * + * Route cache entries hold references to our nodes. + * New cache entries get references via lookup by destination IP address in + * the avl tree. The reference is grabbed only when it's needed i.e. only + * when we try to output IP packet which needs an unpredictable ID (see + * __ip_select_ident() in net/ipv4/route.c). + * Nodes are removed only when reference counter goes to 0. + * When it's happened the node may be removed when a sufficient amount of + * time has been passed since its last use. The less-recently-used entry can + * also be removed if the pool is overloaded i.e. if the total amount of + * entries is greater-or-equal than the threshold. + * + * Node pool is organised as an AVL tree. + * Such an implementation has been chosen not just for fun. It's a way to + * prevent easy and efficient DoS attacks by creating hash collisions. A huge + * amount of long living nodes in a single hash slot would significantly delay + * lookups performed with disabled BHs. + * + * Serialisation issues. + * 1. Nodes may appear in the tree only with the pool write lock held. + * 2. Nodes may disappear from the tree only with the pool write lock held + * AND reference count being 0. + * 3. Nodes appears and disappears from unused node list only under + * "inet_peer_unused_lock". + * 4. Global variable peer_total is modified under the pool lock. + * 5. struct inet_peer fields modification: + * avl_left, avl_right, avl_parent, avl_height: pool lock + * unused_next, unused_prevp: unused node list lock + * refcnt: atomically against modifications on other CPU; + * usually under some other lock to prevent node disappearing + * dtime: unused node list lock + * v4daddr: unchangeable + * ip_id_count: idlock + */ + +/* Exported for inet_getid inline function. */ +DEFINE_SPINLOCK(inet_peer_idlock); + +static kmem_cache_t *peer_cachep; + +#define node_height(x) x->avl_height +static struct inet_peer peer_fake_node = { + .avl_left = &peer_fake_node, + .avl_right = &peer_fake_node, + .avl_height = 0 +}; +#define peer_avl_empty (&peer_fake_node) +static struct inet_peer *peer_root = peer_avl_empty; +static DEFINE_RWLOCK(peer_pool_lock); +#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ + +static volatile int peer_total; +/* Exported for sysctl_net_ipv4. */ +int inet_peer_threshold = 65536 + 128; /* start to throw entries more + * aggressively at this stage */ +int inet_peer_minttl = 120 * HZ; /* TTL under high load: 120 sec */ +int inet_peer_maxttl = 10 * 60 * HZ; /* usual time to live: 10 min */ + +static struct inet_peer *inet_peer_unused_head; +/* Exported for inet_putpeer inline function. */ +struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head; +DEFINE_SPINLOCK(inet_peer_unused_lock); +#define PEER_MAX_CLEANUP_WORK 30 + +static void peer_check_expire(unsigned long dummy); +static struct timer_list peer_periodic_timer = + TIMER_INITIALIZER(peer_check_expire, 0, 0); + +/* Exported for sysctl_net_ipv4. */ +int inet_peer_gc_mintime = 10 * HZ, + inet_peer_gc_maxtime = 120 * HZ; + +/* Called from ip_output.c:ip_init */ +void __init inet_initpeers(void) +{ + struct sysinfo si; + + /* Use the straight interface to information about memory. */ + si_meminfo(&si); + /* The values below were suggested by Alexey Kuznetsov + * <kuznet@ms2.inr.ac.ru>. I don't have any opinion about the values + * myself. --SAW + */ + if (si.totalram <= (32768*1024)/PAGE_SIZE) + inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */ + if (si.totalram <= (16384*1024)/PAGE_SIZE) + inet_peer_threshold >>= 1; /* about 512KB */ + if (si.totalram <= (8192*1024)/PAGE_SIZE) + inet_peer_threshold >>= 2; /* about 128KB */ + + peer_cachep = kmem_cache_create("inet_peer_cache", + sizeof(struct inet_peer), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (!peer_cachep) + panic("cannot create inet_peer_cache"); + + /* All the timers, started at system startup tend + to synchronize. Perturb it a bit. + */ + peer_periodic_timer.expires = jiffies + + net_random() % inet_peer_gc_maxtime + + inet_peer_gc_maxtime; + add_timer(&peer_periodic_timer); +} + +/* Called with or without local BH being disabled. */ +static void unlink_from_unused(struct inet_peer *p) +{ + spin_lock_bh(&inet_peer_unused_lock); + if (p->unused_prevp != NULL) { + /* On unused list. */ + *p->unused_prevp = p->unused_next; + if (p->unused_next != NULL) + p->unused_next->unused_prevp = p->unused_prevp; + else + inet_peer_unused_tailp = p->unused_prevp; + p->unused_prevp = NULL; /* mark it as removed */ + } + spin_unlock_bh(&inet_peer_unused_lock); +} + +/* Called with local BH disabled and the pool lock held. */ +#define lookup(daddr) \ +({ \ + struct inet_peer *u, **v; \ + stackptr = stack; \ + *stackptr++ = &peer_root; \ + for (u = peer_root; u != peer_avl_empty; ) { \ + if (daddr == u->v4daddr) \ + break; \ + if (daddr < u->v4daddr) \ + v = &u->avl_left; \ + else \ + v = &u->avl_right; \ + *stackptr++ = v; \ + u = *v; \ + } \ + u; \ +}) + +/* Called with local BH disabled and the pool write lock held. */ +#define lookup_rightempty(start) \ +({ \ + struct inet_peer *u, **v; \ + *stackptr++ = &start->avl_left; \ + v = &start->avl_left; \ + for (u = *v; u->avl_right != peer_avl_empty; ) { \ + v = &u->avl_right; \ + *stackptr++ = v; \ + u = *v; \ + } \ + u; \ +}) + +/* Called with local BH disabled and the pool write lock held. + * Variable names are the proof of operation correctness. + * Look into mm/map_avl.c for more detail description of the ideas. */ +static void peer_avl_rebalance(struct inet_peer **stack[], + struct inet_peer ***stackend) +{ + struct inet_peer **nodep, *node, *l, *r; + int lh, rh; + + while (stackend > stack) { + nodep = *--stackend; + node = *nodep; + l = node->avl_left; + r = node->avl_right; + lh = node_height(l); + rh = node_height(r); + if (lh > rh + 1) { /* l: RH+2 */ + struct inet_peer *ll, *lr, *lrl, *lrr; + int lrh; + ll = l->avl_left; + lr = l->avl_right; + lrh = node_height(lr); + if (lrh <= node_height(ll)) { /* ll: RH+1 */ + node->avl_left = lr; /* lr: RH or RH+1 */ + node->avl_right = r; /* r: RH */ + node->avl_height = lrh + 1; /* RH+1 or RH+2 */ + l->avl_left = ll; /* ll: RH+1 */ + l->avl_right = node; /* node: RH+1 or RH+2 */ + l->avl_height = node->avl_height + 1; + *nodep = l; + } else { /* ll: RH, lr: RH+1 */ + lrl = lr->avl_left; /* lrl: RH or RH-1 */ + lrr = lr->avl_right; /* lrr: RH or RH-1 */ + node->avl_left = lrr; /* lrr: RH or RH-1 */ + node->avl_right = r; /* r: RH */ + node->avl_height = rh + 1; /* node: RH+1 */ + l->avl_left = ll; /* ll: RH */ + l->avl_right = lrl; /* lrl: RH or RH-1 */ + l->avl_height = rh + 1; /* l: RH+1 */ + lr->avl_left = l; /* l: RH+1 */ + lr->avl_right = node; /* node: RH+1 */ + lr->avl_height = rh + 2; + *nodep = lr; + } + } else if (rh > lh + 1) { /* r: LH+2 */ + struct inet_peer *rr, *rl, *rlr, *rll; + int rlh; + rr = r->avl_right; + rl = r->avl_left; + rlh = node_height(rl); + if (rlh <= node_height(rr)) { /* rr: LH+1 */ + node->avl_right = rl; /* rl: LH or LH+1 */ + node->avl_left = l; /* l: LH */ + node->avl_height = rlh + 1; /* LH+1 or LH+2 */ + r->avl_right = rr; /* rr: LH+1 */ + r->avl_left = node; /* node: LH+1 or LH+2 */ + r->avl_height = node->avl_height + 1; + *nodep = r; + } else { /* rr: RH, rl: RH+1 */ + rlr = rl->avl_right; /* rlr: LH or LH-1 */ + rll = rl->avl_left; /* rll: LH or LH-1 */ + node->avl_right = rll; /* rll: LH or LH-1 */ + node->avl_left = l; /* l: LH */ + node->avl_height = lh + 1; /* node: LH+1 */ + r->avl_right = rr; /* rr: LH */ + r->avl_left = rlr; /* rlr: LH or LH-1 */ + r->avl_height = lh + 1; /* r: LH+1 */ + rl->avl_right = r; /* r: LH+1 */ + rl->avl_left = node; /* node: LH+1 */ + rl->avl_height = lh + 2; + *nodep = rl; + } + } else { + node->avl_height = (lh > rh ? lh : rh) + 1; + } + } +} + +/* Called with local BH disabled and the pool write lock held. */ +#define link_to_pool(n) \ +do { \ + n->avl_height = 1; \ + n->avl_left = peer_avl_empty; \ + n->avl_right = peer_avl_empty; \ + **--stackptr = n; \ + peer_avl_rebalance(stack, stackptr); \ +} while(0) + +/* May be called with local BH enabled. */ +static void unlink_from_pool(struct inet_peer *p) +{ + int do_free; + + do_free = 0; + + write_lock_bh(&peer_pool_lock); + /* Check the reference counter. It was artificially incremented by 1 + * in cleanup() function to prevent sudden disappearing. If the + * reference count is still 1 then the node is referenced only as `p' + * here and from the pool. So under the exclusive pool lock it's safe + * to remove the node and free it later. */ + if (atomic_read(&p->refcnt) == 1) { + struct inet_peer **stack[PEER_MAXDEPTH]; + struct inet_peer ***stackptr, ***delp; + if (lookup(p->v4daddr) != p) + BUG(); + delp = stackptr - 1; /* *delp[0] == p */ + if (p->avl_left == peer_avl_empty) { + *delp[0] = p->avl_right; + --stackptr; + } else { + /* look for a node to insert instead of p */ + struct inet_peer *t; + t = lookup_rightempty(p); + if (*stackptr[-1] != t) + BUG(); + **--stackptr = t->avl_left; + /* t is removed, t->v4daddr > x->v4daddr for any + * x in p->avl_left subtree. + * Put t in the old place of p. */ + *delp[0] = t; + t->avl_left = p->avl_left; + t->avl_right = p->avl_right; + t->avl_height = p->avl_height; + if (delp[1] != &p->avl_left) + BUG(); + delp[1] = &t->avl_left; /* was &p->avl_left */ + } + peer_avl_rebalance(stack, stackptr); + peer_total--; + do_free = 1; + } + write_unlock_bh(&peer_pool_lock); + + if (do_free) + kmem_cache_free(peer_cachep, p); + else + /* The node is used again. Decrease the reference counter + * back. The loop "cleanup -> unlink_from_unused + * -> unlink_from_pool -> putpeer -> link_to_unused + * -> cleanup (for the same node)" + * doesn't really exist because the entry will have a + * recent deletion time and will not be cleaned again soon. */ + inet_putpeer(p); +} + +/* May be called with local BH enabled. */ +static int cleanup_once(unsigned long ttl) +{ + struct inet_peer *p; + + /* Remove the first entry from the list of unused nodes. */ + spin_lock_bh(&inet_peer_unused_lock); + p = inet_peer_unused_head; + if (p != NULL) { + if (time_after(p->dtime + ttl, jiffies)) { + /* Do not prune fresh entries. */ + spin_unlock_bh(&inet_peer_unused_lock); + return -1; + } + inet_peer_unused_head = p->unused_next; + if (p->unused_next != NULL) + p->unused_next->unused_prevp = p->unused_prevp; + else + inet_peer_unused_tailp = p->unused_prevp; + p->unused_prevp = NULL; /* mark as not on the list */ + /* Grab an extra reference to prevent node disappearing + * before unlink_from_pool() call. */ + atomic_inc(&p->refcnt); + } + spin_unlock_bh(&inet_peer_unused_lock); + + if (p == NULL) + /* It means that the total number of USED entries has + * grown over inet_peer_threshold. It shouldn't really + * happen because of entry limits in route cache. */ + return -1; + + unlink_from_pool(p); + return 0; +} + +/* Called with or without local BH being disabled. */ +struct inet_peer *inet_getpeer(__u32 daddr, int create) +{ + struct inet_peer *p, *n; + struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; + + /* Look up for the address quickly. */ + read_lock_bh(&peer_pool_lock); + p = lookup(daddr); + if (p != peer_avl_empty) + atomic_inc(&p->refcnt); + read_unlock_bh(&peer_pool_lock); + + if (p != peer_avl_empty) { + /* The existing node has been found. */ + /* Remove the entry from unused list if it was there. */ + unlink_from_unused(p); + return p; + } + + if (!create) + return NULL; + + /* Allocate the space outside the locked region. */ + n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); + if (n == NULL) + return NULL; + n->v4daddr = daddr; + atomic_set(&n->refcnt, 1); + n->ip_id_count = secure_ip_id(daddr); + n->tcp_ts_stamp = 0; + + write_lock_bh(&peer_pool_lock); + /* Check if an entry has suddenly appeared. */ + p = lookup(daddr); + if (p != peer_avl_empty) + goto out_free; + + /* Link the node. */ + link_to_pool(n); + n->unused_prevp = NULL; /* not on the list */ + peer_total++; + write_unlock_bh(&peer_pool_lock); + + if (peer_total >= inet_peer_threshold) + /* Remove one less-recently-used entry. */ + cleanup_once(0); + + return n; + +out_free: + /* The appropriate node is already in the pool. */ + atomic_inc(&p->refcnt); + write_unlock_bh(&peer_pool_lock); + /* Remove the entry from unused list if it was there. */ + unlink_from_unused(p); + /* Free preallocated the preallocated node. */ + kmem_cache_free(peer_cachep, n); + return p; +} + +/* Called with local BH disabled. */ +static void peer_check_expire(unsigned long dummy) +{ + int i; + int ttl; + + if (peer_total >= inet_peer_threshold) + ttl = inet_peer_minttl; + else + ttl = inet_peer_maxttl + - (inet_peer_maxttl - inet_peer_minttl) / HZ * + peer_total / inet_peer_threshold * HZ; + for (i = 0; i < PEER_MAX_CLEANUP_WORK && !cleanup_once(ttl); i++); + + /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime + * interval depending on the total number of entries (more entries, + * less interval). */ + peer_periodic_timer.expires = jiffies + + inet_peer_gc_maxtime + - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * + peer_total / inet_peer_threshold * HZ; + add_timer(&peer_periodic_timer); +} + +EXPORT_SYMBOL(inet_peer_idlock); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c new file mode 100644 index 000000000000..77094aac6c28 --- /dev/null +++ b/net/ipv4/ip_forward.c @@ -0,0 +1,127 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP forwarding functionality. + * + * Version: $Id: ip_forward.c,v 1.48 2000/12/13 18:31:48 davem Exp $ + * + * Authors: see ip.c + * + * Fixes: + * Many : Split from ip.c , see ip_input.c for + * history. + * Dave Gregorich : NULL ip_rt_put fix for multicast + * routing. + * Jos Vos : Add call_out_firewall before sending, + * use output device for accounting. + * Jos Vos : Call forward firewall after routing + * (always use output device). + * Mike McLagan : Routing by source + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/icmp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/netfilter_ipv4.h> +#include <net/checksum.h> +#include <linux/route.h> +#include <net/route.h> +#include <net/xfrm.h> + +static inline int ip_forward_finish(struct sk_buff *skb) +{ + struct ip_options * opt = &(IPCB(skb)->opt); + + IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + + if (unlikely(opt->optlen)) + ip_forward_options(skb); + + return dst_output(skb); +} + +int ip_forward(struct sk_buff *skb) +{ + struct iphdr *iph; /* Our header */ + struct rtable *rt; /* Route we use */ + struct ip_options * opt = &(IPCB(skb)->opt); + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) + goto drop; + + if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) + return NET_RX_SUCCESS; + + if (skb->pkt_type != PACKET_HOST) + goto drop; + + skb->ip_summed = CHECKSUM_NONE; + + /* + * According to the RFC, we must first decrease the TTL field. If + * that reaches zero, we must reply an ICMP control message telling + * that the packet's lifetime expired. + */ + + iph = skb->nh.iph; + + if (iph->ttl <= 1) + goto too_many_hops; + + if (!xfrm4_route_forward(skb)) + goto drop; + + iph = skb->nh.iph; + rt = (struct rtable*)skb->dst; + + if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + goto sr_failed; + + /* We are about to mangle packet. Copy it! */ + if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) + goto drop; + iph = skb->nh.iph; + + /* Decrease ttl after skb cow done */ + ip_decrease_ttl(iph); + + /* + * We now generate an ICMP HOST REDIRECT giving the route + * we calculated. + */ + if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr) + ip_rt_send_redirect(skb); + + skb->priority = rt_tos2priority(iph->tos); + + return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev, + ip_forward_finish); + +sr_failed: + /* + * Strict routing permits no gatewaying + */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); + goto drop; + +too_many_hops: + /* Tell the sender its packet died... */ + icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c new file mode 100644 index 000000000000..7f68e27eb4ea --- /dev/null +++ b/net/ipv4/ip_fragment.c @@ -0,0 +1,691 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP fragmentation functionality. + * + * Version: $Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $ + * + * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> + * Alan Cox <Alan.Cox@linux.org> + * + * Fixes: + * Alan Cox : Split from ip.c , see ip_input.c for history. + * David S. Miller : Begin massive cleanup... + * Andi Kleen : Add sysctls. + * xxxx : Overlapfrag bug. + * Ultima : ip_expire() kernel panic. + * Bill Hawes : Frag accounting and evictor fixes. + * John McDonald : 0 length frag bug. + * Alexey Kuznetsov: SMP races, threading, cleanup. + * Patrick McHardy : LRU queue of frag heads for evictor. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/jiffies.h> +#include <linux/skbuff.h> +#include <linux/list.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <linux/jhash.h> +#include <linux/random.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/checksum.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/inet.h> +#include <linux/netfilter_ipv4.h> + +/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 + * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c + * as well. Or notify me, at least. --ANK + */ + +/* Fragment cache limits. We will commit 256K at one time. Should we + * cross that limit we will prune down to 192K. This should cope with + * even the most extreme cases without allowing an attacker to measurably + * harm machine performance. + */ +int sysctl_ipfrag_high_thresh = 256*1024; +int sysctl_ipfrag_low_thresh = 192*1024; + +/* Important NOTE! Fragment queue must be destroyed before MSL expires. + * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. + */ +int sysctl_ipfrag_time = IP_FRAG_TIME; + +struct ipfrag_skb_cb +{ + struct inet_skb_parm h; + int offset; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb)) + +/* Describe an entry in the "incomplete datagrams" queue. */ +struct ipq { + struct ipq *next; /* linked list pointers */ + struct list_head lru_list; /* lru list member */ + u32 user; + u32 saddr; + u32 daddr; + u16 id; + u8 protocol; + u8 last_in; +#define COMPLETE 4 +#define FIRST_IN 2 +#define LAST_IN 1 + + struct sk_buff *fragments; /* linked list of received fragments */ + int len; /* total length of original datagram */ + int meat; + spinlock_t lock; + atomic_t refcnt; + struct timer_list timer; /* when will this queue expire? */ + struct ipq **pprev; + int iif; + struct timeval stamp; +}; + +/* Hash table. */ + +#define IPQ_HASHSZ 64 + +/* Per-bucket lock is easy to add now. */ +static struct ipq *ipq_hash[IPQ_HASHSZ]; +static DEFINE_RWLOCK(ipfrag_lock); +static u32 ipfrag_hash_rnd; +static LIST_HEAD(ipq_lru_list); +int ip_frag_nqueues = 0; + +static __inline__ void __ipq_unlink(struct ipq *qp) +{ + if(qp->next) + qp->next->pprev = qp->pprev; + *qp->pprev = qp->next; + list_del(&qp->lru_list); + ip_frag_nqueues--; +} + +static __inline__ void ipq_unlink(struct ipq *ipq) +{ + write_lock(&ipfrag_lock); + __ipq_unlink(ipq); + write_unlock(&ipfrag_lock); +} + +static unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot) +{ + return jhash_3words((u32)id << 16 | prot, saddr, daddr, + ipfrag_hash_rnd) & (IPQ_HASHSZ - 1); +} + +static struct timer_list ipfrag_secret_timer; +int sysctl_ipfrag_secret_interval = 10 * 60 * HZ; + +static void ipfrag_secret_rebuild(unsigned long dummy) +{ + unsigned long now = jiffies; + int i; + + write_lock(&ipfrag_lock); + get_random_bytes(&ipfrag_hash_rnd, sizeof(u32)); + for (i = 0; i < IPQ_HASHSZ; i++) { + struct ipq *q; + + q = ipq_hash[i]; + while (q) { + struct ipq *next = q->next; + unsigned int hval = ipqhashfn(q->id, q->saddr, + q->daddr, q->protocol); + + if (hval != i) { + /* Unlink. */ + if (q->next) + q->next->pprev = q->pprev; + *q->pprev = q->next; + + /* Relink to new hash chain. */ + if ((q->next = ipq_hash[hval]) != NULL) + q->next->pprev = &q->next; + ipq_hash[hval] = q; + q->pprev = &ipq_hash[hval]; + } + + q = next; + } + } + write_unlock(&ipfrag_lock); + + mod_timer(&ipfrag_secret_timer, now + sysctl_ipfrag_secret_interval); +} + +atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */ + +/* Memory Tracking Functions. */ +static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work) +{ + if (work) + *work -= skb->truesize; + atomic_sub(skb->truesize, &ip_frag_mem); + kfree_skb(skb); +} + +static __inline__ void frag_free_queue(struct ipq *qp, int *work) +{ + if (work) + *work -= sizeof(struct ipq); + atomic_sub(sizeof(struct ipq), &ip_frag_mem); + kfree(qp); +} + +static __inline__ struct ipq *frag_alloc_queue(void) +{ + struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC); + + if(!qp) + return NULL; + atomic_add(sizeof(struct ipq), &ip_frag_mem); + return qp; +} + + +/* Destruction primitives. */ + +/* Complete destruction of ipq. */ +static void ip_frag_destroy(struct ipq *qp, int *work) +{ + struct sk_buff *fp; + + BUG_TRAP(qp->last_in&COMPLETE); + BUG_TRAP(del_timer(&qp->timer) == 0); + + /* Release all fragment data. */ + fp = qp->fragments; + while (fp) { + struct sk_buff *xp = fp->next; + + frag_kfree_skb(fp, work); + fp = xp; + } + + /* Finally, release the queue descriptor itself. */ + frag_free_queue(qp, work); +} + +static __inline__ void ipq_put(struct ipq *ipq, int *work) +{ + if (atomic_dec_and_test(&ipq->refcnt)) + ip_frag_destroy(ipq, work); +} + +/* Kill ipq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. + */ +static void ipq_kill(struct ipq *ipq) +{ + if (del_timer(&ipq->timer)) + atomic_dec(&ipq->refcnt); + + if (!(ipq->last_in & COMPLETE)) { + ipq_unlink(ipq); + atomic_dec(&ipq->refcnt); + ipq->last_in |= COMPLETE; + } +} + +/* Memory limiting on fragments. Evictor trashes the oldest + * fragment queue until we are back under the threshold. + */ +static void ip_evictor(void) +{ + struct ipq *qp; + struct list_head *tmp; + int work; + + work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh; + if (work <= 0) + return; + + while (work > 0) { + read_lock(&ipfrag_lock); + if (list_empty(&ipq_lru_list)) { + read_unlock(&ipfrag_lock); + return; + } + tmp = ipq_lru_list.next; + qp = list_entry(tmp, struct ipq, lru_list); + atomic_inc(&qp->refcnt); + read_unlock(&ipfrag_lock); + + spin_lock(&qp->lock); + if (!(qp->last_in&COMPLETE)) + ipq_kill(qp); + spin_unlock(&qp->lock); + + ipq_put(qp, &work); + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + } +} + +/* + * Oops, a fragment queue timed out. Kill it and send an ICMP reply. + */ +static void ip_expire(unsigned long arg) +{ + struct ipq *qp = (struct ipq *) arg; + + spin_lock(&qp->lock); + + if (qp->last_in & COMPLETE) + goto out; + + ipq_kill(qp); + + IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT); + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + + if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) { + struct sk_buff *head = qp->fragments; + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + if ((head->dev = dev_get_by_index(qp->iif)) != NULL) { + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); + dev_put(head->dev); + } + } +out: + spin_unlock(&qp->lock); + ipq_put(qp, NULL); +} + +/* Creation primitives. */ + +static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) +{ + struct ipq *qp; + + write_lock(&ipfrag_lock); +#ifdef CONFIG_SMP + /* With SMP race we have to recheck hash table, because + * such entry could be created on other cpu, while we + * promoted read lock to write lock. + */ + for(qp = ipq_hash[hash]; qp; qp = qp->next) { + if(qp->id == qp_in->id && + qp->saddr == qp_in->saddr && + qp->daddr == qp_in->daddr && + qp->protocol == qp_in->protocol && + qp->user == qp_in->user) { + atomic_inc(&qp->refcnt); + write_unlock(&ipfrag_lock); + qp_in->last_in |= COMPLETE; + ipq_put(qp_in, NULL); + return qp; + } + } +#endif + qp = qp_in; + + if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) + atomic_inc(&qp->refcnt); + + atomic_inc(&qp->refcnt); + if((qp->next = ipq_hash[hash]) != NULL) + qp->next->pprev = &qp->next; + ipq_hash[hash] = qp; + qp->pprev = &ipq_hash[hash]; + INIT_LIST_HEAD(&qp->lru_list); + list_add_tail(&qp->lru_list, &ipq_lru_list); + ip_frag_nqueues++; + write_unlock(&ipfrag_lock); + return qp; +} + +/* Add an entry to the 'ipq' queue for a newly received IP datagram. */ +static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) +{ + struct ipq *qp; + + if ((qp = frag_alloc_queue()) == NULL) + goto out_nomem; + + qp->protocol = iph->protocol; + qp->last_in = 0; + qp->id = iph->id; + qp->saddr = iph->saddr; + qp->daddr = iph->daddr; + qp->user = user; + qp->len = 0; + qp->meat = 0; + qp->fragments = NULL; + qp->iif = 0; + + /* Initialize a timer for this entry. */ + init_timer(&qp->timer); + qp->timer.data = (unsigned long) qp; /* pointer to queue */ + qp->timer.function = ip_expire; /* expire function */ + spin_lock_init(&qp->lock); + atomic_set(&qp->refcnt, 1); + + return ip_frag_intern(hash, qp); + +out_nomem: + NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n")); + return NULL; +} + +/* Find the correct entry in the "incomplete datagrams" queue for + * this IP datagram, and create new one, if nothing is found. + */ +static inline struct ipq *ip_find(struct iphdr *iph, u32 user) +{ + __u16 id = iph->id; + __u32 saddr = iph->saddr; + __u32 daddr = iph->daddr; + __u8 protocol = iph->protocol; + unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); + struct ipq *qp; + + read_lock(&ipfrag_lock); + for(qp = ipq_hash[hash]; qp; qp = qp->next) { + if(qp->id == id && + qp->saddr == saddr && + qp->daddr == daddr && + qp->protocol == protocol && + qp->user == user) { + atomic_inc(&qp->refcnt); + read_unlock(&ipfrag_lock); + return qp; + } + } + read_unlock(&ipfrag_lock); + + return ip_frag_create(hash, iph, user); +} + +/* Add new segment to existing queue. */ +static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) +{ + struct sk_buff *prev, *next; + int flags, offset; + int ihl, end; + + if (qp->last_in & COMPLETE) + goto err; + + offset = ntohs(skb->nh.iph->frag_off); + flags = offset & ~IP_OFFSET; + offset &= IP_OFFSET; + offset <<= 3; /* offset is in 8-byte chunks */ + ihl = skb->nh.iph->ihl * 4; + + /* Determine the position of this fragment. */ + end = offset + skb->len - ihl; + + /* Is this the final fragment? */ + if ((flags & IP_MF) == 0) { + /* If we already have some bits beyond end + * or have different end, the segment is corrrupted. + */ + if (end < qp->len || + ((qp->last_in & LAST_IN) && end != qp->len)) + goto err; + qp->last_in |= LAST_IN; + qp->len = end; + } else { + if (end&7) { + end &= ~7; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + if (end > qp->len) { + /* Some bits beyond end -> corruption. */ + if (qp->last_in & LAST_IN) + goto err; + qp->len = end; + } + } + if (end == offset) + goto err; + + if (pskb_pull(skb, ihl) == NULL) + goto err; + if (pskb_trim(skb, end-offset)) + goto err; + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = NULL; + for(next = qp->fragments; next != NULL; next = next->next) { + if (FRAG_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + + /* We found where to put this one. Check for overlap with + * preceding fragment, and, if needed, align things so that + * any overlaps are eliminated. + */ + if (prev) { + int i = (FRAG_CB(prev)->offset + prev->len) - offset; + + if (i > 0) { + offset += i; + if (end <= offset) + goto err; + if (!pskb_pull(skb, i)) + goto err; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + } + + while (next && FRAG_CB(next)->offset < end) { + int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */ + + if (i < next->len) { + /* Eat head of the next overlapped fragment + * and leave the loop. The next ones cannot overlap. + */ + if (!pskb_pull(next, i)) + goto err; + FRAG_CB(next)->offset += i; + qp->meat -= i; + if (next->ip_summed != CHECKSUM_UNNECESSARY) + next->ip_summed = CHECKSUM_NONE; + break; + } else { + struct sk_buff *free_it = next; + + /* Old fragmnet is completely overridden with + * new one drop it. + */ + next = next->next; + + if (prev) + prev->next = next; + else + qp->fragments = next; + + qp->meat -= free_it->len; + frag_kfree_skb(free_it, NULL); + } + } + + FRAG_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. */ + skb->next = next; + if (prev) + prev->next = skb; + else + qp->fragments = skb; + + if (skb->dev) + qp->iif = skb->dev->ifindex; + skb->dev = NULL; + qp->stamp = skb->stamp; + qp->meat += skb->len; + atomic_add(skb->truesize, &ip_frag_mem); + if (offset == 0) + qp->last_in |= FIRST_IN; + + write_lock(&ipfrag_lock); + list_move_tail(&qp->lru_list, &ipq_lru_list); + write_unlock(&ipfrag_lock); + + return; + +err: + kfree_skb(skb); +} + + +/* Build a new IP datagram from all its fragments. */ + +static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) +{ + struct iphdr *iph; + struct sk_buff *fp, *head = qp->fragments; + int len; + int ihlen; + + ipq_kill(qp); + + BUG_TRAP(head != NULL); + BUG_TRAP(FRAG_CB(head)->offset == 0); + + /* Allocate a new buffer for the datagram. */ + ihlen = head->nh.iph->ihl*4; + len = ihlen + qp->len; + + if(len > 65535) + goto out_oversize; + + /* Head of list must not be cloned. */ + if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + goto out_nomem; + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. */ + if (skb_shinfo(head)->frag_list) { + struct sk_buff *clone; + int i, plen = 0; + + if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) + goto out_nomem; + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_shinfo(head)->frag_list = NULL; + for (i=0; i<skb_shinfo(head)->nr_frags; i++) + plen += skb_shinfo(head)->frags[i].size; + clone->len = clone->data_len = head->data_len - plen; + head->data_len -= clone->len; + head->len -= clone->len; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + atomic_add(clone->truesize, &ip_frag_mem); + } + + skb_shinfo(head)->frag_list = head->next; + skb_push(head, head->data - head->nh.raw); + atomic_sub(head->truesize, &ip_frag_mem); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + atomic_sub(fp->truesize, &ip_frag_mem); + } + + head->next = NULL; + head->dev = dev; + head->stamp = qp->stamp; + + iph = head->nh.iph; + iph->frag_off = 0; + iph->tot_len = htons(len); + IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS); + qp->fragments = NULL; + return head; + +out_nomem: + NETDEBUG(if (net_ratelimit()) + printk(KERN_ERR + "IP: queue_glue: no memory for gluing queue %p\n", + qp)); + goto out_fail; +out_oversize: + if (net_ratelimit()) + printk(KERN_INFO + "Oversized IP packet from %d.%d.%d.%d.\n", + NIPQUAD(qp->saddr)); +out_fail: + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + return NULL; +} + +/* Process an incoming IP datagram fragment. */ +struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) +{ + struct iphdr *iph = skb->nh.iph; + struct ipq *qp; + struct net_device *dev; + + IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); + + /* Start by cleaning up the memory. */ + if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh) + ip_evictor(); + + dev = skb->dev; + + /* Lookup (or create) queue header */ + if ((qp = ip_find(iph, user)) != NULL) { + struct sk_buff *ret = NULL; + + spin_lock(&qp->lock); + + ip_frag_queue(qp, skb); + + if (qp->last_in == (FIRST_IN|LAST_IN) && + qp->meat == qp->len) + ret = ip_frag_reasm(qp, dev); + + spin_unlock(&qp->lock); + ipq_put(qp, NULL); + return ret; + } + + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + kfree_skb(skb); + return NULL; +} + +void ipfrag_init(void) +{ + ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ + (jiffies ^ (jiffies >> 6))); + + init_timer(&ipfrag_secret_timer); + ipfrag_secret_timer.function = ipfrag_secret_rebuild; + ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval; + add_timer(&ipfrag_secret_timer); +} + +EXPORT_SYMBOL(ip_defrag); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c new file mode 100644 index 000000000000..884835522224 --- /dev/null +++ b/net/ipv4/ip_gre.c @@ -0,0 +1,1290 @@ +/* + * Linux NET3: GRE over IP protocol decoder. + * + * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <asm/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/in6.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/netfilter_ipv4.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/ipip.h> +#include <net/arp.h> +#include <net/checksum.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> + +#ifdef CONFIG_IPV6 +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#endif + +/* + Problems & solutions + -------------------- + + 1. The most important issue is detecting local dead loops. + They would cause complete host lockup in transmit, which + would be "resolved" by stack overflow or, if queueing is enabled, + with infinite looping in net_bh. + + We cannot track such dead loops during route installation, + it is infeasible task. The most general solutions would be + to keep skb->encapsulation counter (sort of local ttl), + and silently drop packet when it expires. It is the best + solution, but it supposes maintaing new variable in ALL + skb, even if no tunneling is used. + + Current solution: t->recursion lock breaks dead loops. It looks + like dev->tbusy flag, but I preferred new variable, because + the semantics is different. One day, when hard_start_xmit + will be multithreaded we will have to use skb->encapsulation. + + + + 2. Networking dead loops would not kill routers, but would really + kill network. IP hop limit plays role of "t->recursion" in this case, + if we copy it from packet being encapsulated to upper header. + It is very good solution, but it introduces two problems: + + - Routing protocols, using packets with ttl=1 (OSPF, RIP2), + do not work over tunnels. + - traceroute does not work. I planned to relay ICMP from tunnel, + so that this problem would be solved and traceroute output + would even more informative. This idea appeared to be wrong: + only Linux complies to rfc1812 now (yes, guys, Linux is the only + true router now :-)), all routers (at least, in neighbourhood of mine) + return only 8 bytes of payload. It is the end. + + Hence, if we want that OSPF worked or traceroute said something reasonable, + we should search for another solution. + + One of them is to parse packet trying to detect inner encapsulation + made by our node. It is difficult or even impossible, especially, + taking into account fragmentation. TO be short, tt is not solution at all. + + Current solution: The solution was UNEXPECTEDLY SIMPLE. + We force DF flag on tunnels with preconfigured hop limit, + that is ALL. :-) Well, it does not remove the problem completely, + but exponential growth of network traffic is changed to linear + (branches, that exceed pmtu are pruned) and tunnel mtu + fastly degrades to value <68, where looping stops. + Yes, it is not good if there exists a router in the loop, + which does not force DF, even when encapsulating packets have DF set. + But it is not our problem! Nobody could accuse us, we made + all that we could make. Even if it is your gated who injected + fatal route to network, even if it were you who configured + fatal static route: you are innocent. :-) + + + + 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain + practically identical code. It would be good to glue them + together, but it is not very evident, how to make them modular. + sit is integral part of IPv6, ipip and gre are naturally modular. + We could extract common parts (hash table, ioctl etc) + to a separate module (ip_tunnel.c). + + Alexey Kuznetsov. + */ + +static int ipgre_tunnel_init(struct net_device *dev); +static void ipgre_tunnel_setup(struct net_device *dev); + +/* Fallback tunnel: no source, no destination, no key, no options */ + +static int ipgre_fb_tunnel_init(struct net_device *dev); + +static struct net_device *ipgre_fb_tunnel_dev; + +/* Tunnel hash table */ + +/* + 4 hash tables: + + 3: (remote,local) + 2: (remote,*) + 1: (*,local) + 0: (*,*) + + We require exact key match i.e. if a key is present in packet + it will match only tunnel with the same key; if it is not present, + it will match only keyless tunnel. + + All keysless packets, if not matched configured keyless tunnels + will match fallback tunnel. + */ + +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) + +static struct ip_tunnel *tunnels[4][HASH_SIZE]; + +#define tunnels_r_l (tunnels[3]) +#define tunnels_r (tunnels[2]) +#define tunnels_l (tunnels[1]) +#define tunnels_wc (tunnels[0]) + +static DEFINE_RWLOCK(ipgre_lock); + +/* Given src, dst and key, find appropriate for input tunnel. */ + +static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(key); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + } + for (t = tunnels_r[h0^h1]; t; t = t->next) { + if (remote == t->parms.iph.daddr) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr || + (local == t->parms.iph.daddr && MULTICAST(local))) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + } + for (t = tunnels_wc[h1]; t; t = t->next) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + + if (ipgre_fb_tunnel_dev->flags&IFF_UP) + return ipgre_fb_tunnel_dev->priv; + return NULL; +} + +static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t) +{ + u32 remote = t->parms.iph.daddr; + u32 local = t->parms.iph.saddr; + u32 key = t->parms.i_key; + unsigned h = HASH(key); + int prio = 0; + + if (local) + prio |= 1; + if (remote && !MULTICAST(remote)) { + prio |= 2; + h ^= HASH(remote); + } + + return &tunnels[prio][h]; +} + +static void ipgre_tunnel_link(struct ip_tunnel *t) +{ + struct ip_tunnel **tp = ipgre_bucket(t); + + t->next = *tp; + write_lock_bh(&ipgre_lock); + *tp = t; + write_unlock_bh(&ipgre_lock); +} + +static void ipgre_tunnel_unlink(struct ip_tunnel *t) +{ + struct ip_tunnel **tp; + + for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) { + if (t == *tp) { + write_lock_bh(&ipgre_lock); + *tp = t->next; + write_unlock_bh(&ipgre_lock); + break; + } + } +} + +static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create) +{ + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + u32 key = parms->i_key; + struct ip_tunnel *t, **tp, *nt; + struct net_device *dev; + unsigned h = HASH(key); + int prio = 0; + char name[IFNAMSIZ]; + + if (local) + prio |= 1; + if (remote && !MULTICAST(remote)) { + prio |= 2; + h ^= HASH(remote); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { + if (key == t->parms.i_key) + return t; + } + } + if (!create) + return NULL; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else { + int i; + for (i=1; i<100; i++) { + sprintf(name, "gre%d", i); + if (__dev_get_by_name(name) == NULL) + break; + } + if (i==100) + goto failed; + } + + dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); + if (!dev) + return NULL; + + dev->init = ipgre_tunnel_init; + nt = dev->priv; + nt->parms = *parms; + + if (register_netdevice(dev) < 0) { + free_netdev(dev); + goto failed; + } + + nt = dev->priv; + nt->parms = *parms; + + dev_hold(dev); + ipgre_tunnel_link(nt); + /* Do not decrement MOD_USE_COUNT here. */ + return nt; + +failed: + return NULL; +} + +static void ipgre_tunnel_uninit(struct net_device *dev) +{ + ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv); + dev_put(dev); +} + + +static void ipgre_err(struct sk_buff *skb, u32 info) +{ +#ifndef I_WISH_WORLD_WERE_PERFECT + +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + + Moreover, Cisco "wise men" put GRE key to the third word + in GRE header. It makes impossible maintaining even soft state for keyed + GRE tunnels with enabled checksum. Tell them "thank you". + + Well, I wonder, rfc1812 was written by Cisco employee, + what the hell these idiots break standrads established + by themself??? + */ + + struct iphdr *iph = (struct iphdr*)skb->data; + u16 *p = (u16*)(skb->data+(iph->ihl<<2)); + int grehlen = (iph->ihl<<2) + 4; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; + u16 flags; + + flags = p[0]; + if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { + if (flags&(GRE_VERSION|GRE_ROUTING)) + return; + if (flags&GRE_KEY) { + grehlen += 4; + if (flags&GRE_CSUM) + grehlen += 4; + } + } + + /* If only 8 bytes returned, keyed message will be dropped here */ + if (skb_headlen(skb) < grehlen) + return; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + read_lock(&ipgre_lock); + t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0); + if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr)) + goto out; + + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + goto out; + + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +out: + read_unlock(&ipgre_lock); + return; +#else + struct iphdr *iph = (struct iphdr*)dp; + struct iphdr *eiph; + u16 *p = (u16*)(dp+(iph->ihl<<2)); + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + u16 flags; + int grehlen = (iph->ihl<<2) + 4; + struct sk_buff *skb2; + struct flowi fl; + struct rtable *rt; + + if (p[1] != htons(ETH_P_IP)) + return; + + flags = p[0]; + if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { + if (flags&(GRE_VERSION|GRE_ROUTING)) + return; + if (flags&GRE_CSUM) + grehlen += 4; + if (flags&GRE_KEY) + grehlen += 4; + if (flags&GRE_SEQ) + grehlen += 4; + } + if (len < grehlen + sizeof(struct iphdr)) + return; + eiph = (struct iphdr*)(dp + grehlen); + + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < (iph->ihl<<2)) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? + */ + rel_type = ICMP_PARAMETERPROB; + rel_info = skb->h.icmph->un.gateway - grehlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* And it is the only really necessary thing :-) */ + rel_info = ntohs(skb->h.icmph->un.frag.mtu); + if (rel_info < grehlen+68) + return; + rel_info -= grehlen; + /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */ + if (rel_info > ntohs(eiph->tot_len)) + return; + break; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. --ANK + */ + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + /* Prepare fake skb to feed it to icmp_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)eiph); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + memset(&fl, 0, sizeof(fl)); + fl.fl4_dst = eiph->saddr; + fl.fl4_tos = RT_TOS(eiph->tos); + fl.proto = IPPROTO_GRE; + if (ip_route_output_key(&rt, &fl)) { + kfree_skb(skb2); + return; + } + skb2->dev = rt->u.dst.dev; + + /* route "incoming" packet */ + if (rt->rt_flags&RTCF_LOCAL) { + ip_rt_put(rt); + rt = NULL; + fl.fl4_dst = eiph->daddr; + fl.fl4_src = eiph->saddr; + fl.fl4_tos = eiph->tos; + if (ip_route_output_key(&rt, &fl) || + rt->u.dst.dev->type != ARPHRD_IPGRE) { + ip_rt_put(rt); + kfree_skb(skb2); + return; + } + } else { + ip_rt_put(rt); + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || + skb2->dst->dev->type != ARPHRD_IPGRE) { + kfree_skb(skb2); + return; + } + } + + /* change mtu on this route */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + if (rel_info > dst_mtu(skb2->dst)) { + kfree_skb(skb2); + return; + } + skb2->dst->ops->update_pmtu(skb2->dst, rel_info); + rel_info = htonl(rel_info); + } else if (type == ICMP_TIME_EXCEEDED) { + struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; + if (t->parms.iph.ttl) { + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + } + } + + icmp_send(skb2, rel_type, rel_code, rel_info); + kfree_skb(skb2); +#endif +} + +static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) +{ + if (INET_ECN_is_ce(iph->tos)) { + if (skb->protocol == htons(ETH_P_IP)) { + IP_ECN_set_ce(skb->nh.iph); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + IP6_ECN_set_ce(skb->nh.ipv6h); + } + } +} + +static inline u8 +ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb) +{ + u8 inner = 0; + if (skb->protocol == htons(ETH_P_IP)) + inner = old_iph->tos; + else if (skb->protocol == htons(ETH_P_IPV6)) + inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph); + return INET_ECN_encapsulate(tos, inner); +} + +static int ipgre_rcv(struct sk_buff *skb) +{ + struct iphdr *iph; + u8 *h; + u16 flags; + u16 csum = 0; + u32 key = 0; + u32 seqno = 0; + struct ip_tunnel *tunnel; + int offset = 4; + + if (!pskb_may_pull(skb, 16)) + goto drop_nolock; + + iph = skb->nh.iph; + h = skb->data; + flags = *(u16*)h; + + if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { + /* - Version must be 0. + - We do not support routing headers. + */ + if (flags&(GRE_VERSION|GRE_ROUTING)) + goto drop_nolock; + + if (flags&GRE_CSUM) { + if (skb->ip_summed == CHECKSUM_HW) { + csum = (u16)csum_fold(skb->csum); + if (csum) + skb->ip_summed = CHECKSUM_NONE; + } + if (skb->ip_summed == CHECKSUM_NONE) { + skb->csum = skb_checksum(skb, 0, skb->len, 0); + skb->ip_summed = CHECKSUM_HW; + csum = (u16)csum_fold(skb->csum); + } + offset += 4; + } + if (flags&GRE_KEY) { + key = *(u32*)(h + offset); + offset += 4; + } + if (flags&GRE_SEQ) { + seqno = ntohl(*(u32*)(h + offset)); + offset += 4; + } + } + + read_lock(&ipgre_lock); + if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) { + secpath_reset(skb); + + skb->protocol = *(u16*)(h + 2); + /* WCCP version 1 and 2 protocol decoding. + * - Change protocol to IP + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (flags == 0 && + skb->protocol == __constant_htons(ETH_P_WCCP)) { + skb->protocol = __constant_htons(ETH_P_IP); + if ((*(h + offset) & 0xF0) != 0x40) + offset += 4; + } + + skb->mac.raw = skb->nh.raw; + skb->nh.raw = __pskb_pull(skb, offset); + skb_postpull_rcsum(skb, skb->mac.raw, offset); + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->pkt_type = PACKET_HOST; +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (MULTICAST(iph->daddr)) { + /* Looped back packet, drop it! */ + if (((struct rtable*)skb->dst)->fl.iif == 0) + goto drop; + tunnel->stat.multicast++; + skb->pkt_type = PACKET_BROADCAST; + } +#endif + + if (((flags&GRE_CSUM) && csum) || + (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { + tunnel->stat.rx_crc_errors++; + tunnel->stat.rx_errors++; + goto drop; + } + if (tunnel->parms.i_flags&GRE_SEQ) { + if (!(flags&GRE_SEQ) || + (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { + tunnel->stat.rx_fifo_errors++; + tunnel->stat.rx_errors++; + goto drop; + } + tunnel->i_seqno = seqno + 1; + } + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + nf_reset(skb); + ipgre_ecn_decapsulate(iph, skb); + netif_rx(skb); + read_unlock(&ipgre_lock); + return(0); + } + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + +drop: + read_unlock(&ipgre_lock); +drop_nolock: + kfree_skb(skb); + return(0); +} + +static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *old_iph = skb->nh.iph; + struct iphdr *tiph; + u8 tos; + u16 df; + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + int gre_hlen; + u32 dst; + int mtu; + + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } + + if (dev->hard_header) { + gre_hlen = 0; + tiph = (struct iphdr*)skb->data; + } else { + gre_hlen = tunnel->hlen; + tiph = &tunnel->parms.iph; + } + + if ((dst = tiph->daddr) == 0) { + /* NBMA tunnel */ + + if (skb->dst == NULL) { + tunnel->stat.tx_fifo_errors++; + goto tx_error; + } + + if (skb->protocol == htons(ETH_P_IP)) { + rt = (struct rtable*)skb->dst; + if ((dst = rt->rt_gateway) == 0) + goto tx_error_icmp; + } +#ifdef CONFIG_IPV6 + else if (skb->protocol == htons(ETH_P_IPV6)) { + struct in6_addr *addr6; + int addr_type; + struct neighbour *neigh = skb->dst->neighbour; + + if (neigh == NULL) + goto tx_error; + + addr6 = (struct in6_addr*)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &skb->nh.ipv6h->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + goto tx_error_icmp; + + dst = addr6->s6_addr32[3]; + } +#endif + else + goto tx_error; + } + + tos = tiph->tos; + if (tos&1) { + if (skb->protocol == htons(ETH_P_IP)) + tos = old_iph->tos; + tos &= ~1; + } + + { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) } }, + .proto = IPPROTO_GRE }; + if (ip_route_output_key(&rt, &fl)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error; + } + } + tdev = rt->u.dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + tunnel->stat.collisions++; + goto tx_error; + } + + df = tiph->frag_off; + if (df) + mtu = dst_mtu(&rt->u.dst) - tunnel->hlen; + else + mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; + + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + if (skb->protocol == htons(ETH_P_IP)) { + df |= (old_iph->frag_off&htons(IP_DF)); + + if ((old_iph->frag_off&htons(IP_DF)) && + mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } + } +#ifdef CONFIG_IPV6 + else if (skb->protocol == htons(ETH_P_IPV6)) { + struct rt6_info *rt6 = (struct rt6_info*)skb->dst; + + if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) { + if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) || + rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + skb->dst->metrics[RTAX_MTU-1] = mtu; + } + } + + if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + ip_rt_put(rt); + goto tx_error; + } + } +#endif + + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen; + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + stats->tx_dropped++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; + } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + dev_kfree_skb(skb); + skb = new_skb; + old_iph = skb->nh.iph; + } + + skb->h.raw = skb->nh.raw; + skb->nh.raw = skb_push(skb, gre_hlen); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = IPPROTO_GRE; + iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) { + if (skb->protocol == htons(ETH_P_IP)) + iph->ttl = old_iph->ttl; +#ifdef CONFIG_IPV6 + else if (skb->protocol == htons(ETH_P_IPV6)) + iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit; +#endif + else + iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); + } + + ((u16*)(iph+1))[0] = tunnel->parms.o_flags; + ((u16*)(iph+1))[1] = skb->protocol; + + if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { + u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4); + + if (tunnel->parms.o_flags&GRE_SEQ) { + ++tunnel->o_seqno; + *ptr = htonl(tunnel->o_seqno); + ptr--; + } + if (tunnel->parms.o_flags&GRE_KEY) { + *ptr = tunnel->parms.o_key; + ptr--; + } + if (tunnel->parms.o_flags&GRE_CSUM) { + *ptr = 0; + *(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr)); + } + } + + nf_reset(skb); + + IPTUNNEL_XMIT(); + tunnel->recursion--; + return 0; + +tx_error_icmp: + dst_link_failure(skb); + +tx_error: + stats->tx_errors++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; +} + +static int +ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == ipgre_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipgre_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || + p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || + ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + + if (!(p.i_flags&GRE_KEY)) + p.i_key = 0; + if (!(p.o_flags&GRE_KEY)) + p.o_key = 0; + + t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + unsigned nflags=0; + + t = (struct ip_tunnel*)dev->priv; + + if (MULTICAST(p.iph.daddr)) + nflags = IFF_BROADCAST; + else if (p.iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { + err = -EINVAL; + break; + } + ipgre_tunnel_unlink(t); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + t->parms.i_key = p.i_key; + t->parms.o_key = p.o_key; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipgre_tunnel_link(t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == ipgre_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipgre_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == ipgre_fb_tunnel_dev->priv) + goto done; + dev = t->dev; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + return err; +} + +static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); +} + +static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +#ifdef CONFIG_NET_IPGRE_BROADCAST +/* Nice toy. Unfortunately, useless in real life :-) + It allows to construct virtual multiprotocol broadcast "LAN" + over the Internet, provided multicast routing is tuned. + + + I have no idea was this bicycle invented before me, + so that I had to set ARPHRD_IPGRE to a random value. + I have an impression, that Cisco could make something similar, + but this feature is apparently missing in IOS<=11.2(8). + + I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks + with broadcast 224.66.66.66. If you have access to mbone, play with me :-) + + ping -t 255 224.66.66.66 + + If nobody answers, mbone does not work. + + ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255 + ip addr add 10.66.66.<somewhat>/24 dev Universe + ifconfig Universe up + ifconfig Universe add fe80::<Your_real_addr>/10 + ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96 + ftp 10.66.66.66 + ... + ftp fec0:6666:6666::193.233.7.65 + ... + + */ + +static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, + void *daddr, void *saddr, unsigned len) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); + u16 *p = (u16*)(iph+1); + + memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); + p[0] = t->parms.o_flags; + p[1] = htons(type); + + /* + * Set the source hardware address. + */ + + if (saddr) + memcpy(&iph->saddr, saddr, 4); + + if (daddr) { + memcpy(&iph->daddr, daddr, 4); + return t->hlen; + } + if (iph->daddr && !MULTICAST(iph->daddr)) + return t->hlen; + + return -t->hlen; +} + +static int ipgre_open(struct net_device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + + if (MULTICAST(t->parms.iph.daddr)) { + struct flowi fl = { .oif = t->parms.link, + .nl_u = { .ip4_u = + { .daddr = t->parms.iph.daddr, + .saddr = t->parms.iph.saddr, + .tos = RT_TOS(t->parms.iph.tos) } }, + .proto = IPPROTO_GRE }; + struct rtable *rt; + if (ip_route_output_key(&rt, &fl)) + return -EADDRNOTAVAIL; + dev = rt->u.dst.dev; + ip_rt_put(rt); + if (__in_dev_get(dev) == NULL) + return -EADDRNOTAVAIL; + t->mlink = dev->ifindex; + ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr); + } + return 0; +} + +static int ipgre_close(struct net_device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + if (MULTICAST(t->parms.iph.daddr) && t->mlink) { + struct in_device *in_dev = inetdev_by_index(t->mlink); + if (in_dev) { + ip_mc_dec_group(in_dev, t->parms.iph.daddr); + in_dev_put(in_dev); + } + } + return 0; +} + +#endif + +static void ipgre_tunnel_setup(struct net_device *dev) +{ + SET_MODULE_OWNER(dev); + dev->uninit = ipgre_tunnel_uninit; + dev->destructor = free_netdev; + dev->hard_start_xmit = ipgre_tunnel_xmit; + dev->get_stats = ipgre_tunnel_get_stats; + dev->do_ioctl = ipgre_tunnel_ioctl; + dev->change_mtu = ipgre_tunnel_change_mtu; + + dev->type = ARPHRD_IPGRE; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4; + dev->mtu = 1500 - sizeof(struct iphdr) - 4; + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; +} + +static int ipgre_tunnel_init(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + int hlen = LL_MAX_HEADER; + int mtu = 1500; + int addend = sizeof(struct iphdr) + 4; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + + /* Guess output device to choose reasonable mtu and hard_header_len */ + + if (iph->daddr) { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_GRE }; + struct rtable *rt; + if (!ip_route_output_key(&rt, &fl)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + + dev->flags |= IFF_POINTOPOINT; + +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (MULTICAST(iph->daddr)) { + if (!iph->saddr) + return -EINVAL; + dev->flags = IFF_BROADCAST; + dev->hard_header = ipgre_header; + dev->open = ipgre_open; + dev->stop = ipgre_close; + } +#endif + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(tunnel->parms.link); + + if (tdev) { + hlen = tdev->hard_header_len; + mtu = tdev->mtu; + } + dev->iflink = tunnel->parms.link; + + /* Precalculate GRE options length */ + if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { + if (tunnel->parms.o_flags&GRE_CSUM) + addend += 4; + if (tunnel->parms.o_flags&GRE_KEY) + addend += 4; + if (tunnel->parms.o_flags&GRE_SEQ) + addend += 4; + } + dev->hard_header_len = hlen + addend; + dev->mtu = mtu - addend; + tunnel->hlen = addend; + return 0; +} + +int __init ipgre_fb_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct iphdr *iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + iph->version = 4; + iph->protocol = IPPROTO_GRE; + iph->ihl = 5; + tunnel->hlen = sizeof(struct iphdr) + 4; + + dev_hold(dev); + tunnels_wc[0] = tunnel; + return 0; +} + + +static struct net_protocol ipgre_protocol = { + .handler = ipgre_rcv, + .err_handler = ipgre_err, +}; + + +/* + * And now the modules code and kernel interface. + */ + +static int __init ipgre_init(void) +{ + int err; + + printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); + + if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) { + printk(KERN_INFO "ipgre init: can't add protocol\n"); + return -EAGAIN; + } + + ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", + ipgre_tunnel_setup); + if (!ipgre_fb_tunnel_dev) { + err = -ENOMEM; + goto err1; + } + + ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init; + + if ((err = register_netdev(ipgre_fb_tunnel_dev))) + goto err2; +out: + return err; +err2: + free_netdev(ipgre_fb_tunnel_dev); +err1: + inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); + goto out; +} + +static void ipgre_fini(void) +{ + if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) + printk(KERN_INFO "ipgre close: can't remove protocol\n"); + + unregister_netdev(ipgre_fb_tunnel_dev); +} + +module_init(ipgre_init); +module_exit(ipgre_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c new file mode 100644 index 000000000000..a0d0833034be --- /dev/null +++ b/net/ipv4/ip_input.c @@ -0,0 +1,431 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The Internet Protocol (IP) module. + * + * Version: $Id: ip_input.c,v 1.55 2002/01/12 07:39:45 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Donald Becker, <becker@super.org> + * Alan Cox, <Alan.Cox@linux.org> + * Richard Underwood + * Stefan Becker, <stefanb@yello.ping.de> + * Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * + * + * Fixes: + * Alan Cox : Commented a couple of minor bits of surplus code + * Alan Cox : Undefining IP_FORWARD doesn't include the code + * (just stops a compiler warning). + * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes + * are junked rather than corrupting things. + * Alan Cox : Frames to bad broadcast subnets are dumped + * We used to process them non broadcast and + * boy could that cause havoc. + * Alan Cox : ip_forward sets the free flag on the + * new frame it queues. Still crap because + * it copies the frame but at least it + * doesn't eat memory too. + * Alan Cox : Generic queue code and memory fixes. + * Fred Van Kempen : IP fragment support (borrowed from NET2E) + * Gerhard Koerting: Forward fragmented frames correctly. + * Gerhard Koerting: Fixes to my fix of the above 8-). + * Gerhard Koerting: IP interface addressing fix. + * Linus Torvalds : More robustness checks + * Alan Cox : Even more checks: Still not as robust as it ought to be + * Alan Cox : Save IP header pointer for later + * Alan Cox : ip option setting + * Alan Cox : Use ip_tos/ip_ttl settings + * Alan Cox : Fragmentation bogosity removed + * (Thanks to Mark.Bush@prg.ox.ac.uk) + * Dmitry Gorodchanin : Send of a raw packet crash fix. + * Alan Cox : Silly ip bug when an overlength + * fragment turns up. Now frees the + * queue. + * Linus Torvalds/ : Memory leakage on fragmentation + * Alan Cox : handling. + * Gerhard Koerting: Forwarding uses IP priority hints + * Teemu Rantanen : Fragment problems. + * Alan Cox : General cleanup, comments and reformat + * Alan Cox : SNMP statistics + * Alan Cox : BSD address rule semantics. Also see + * UDP as there is a nasty checksum issue + * if you do things the wrong way. + * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file + * Alan Cox : IP options adjust sk->priority. + * Pedro Roque : Fix mtu/length error in ip_forward. + * Alan Cox : Avoid ip_chk_addr when possible. + * Richard Underwood : IP multicasting. + * Alan Cox : Cleaned up multicast handlers. + * Alan Cox : RAW sockets demultiplex in the BSD style. + * Gunther Mayer : Fix the SNMP reporting typo + * Alan Cox : Always in group 224.0.0.1 + * Pauline Middelink : Fast ip_checksum update when forwarding + * Masquerading support. + * Alan Cox : Multicast loopback error for 224.0.0.1 + * Alan Cox : IP_MULTICAST_LOOP option. + * Alan Cox : Use notifiers. + * Bjorn Ekwall : Removed ip_csum (from slhc.c too) + * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) + * Stefan Becker : Send out ICMP HOST REDIRECT + * Arnt Gulbrandsen : ip_build_xmit + * Alan Cox : Per socket routing cache + * Alan Cox : Fixed routing cache, added header cache. + * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. + * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. + * Alan Cox : Incoming IP option handling. + * Alan Cox : Set saddr on raw output frames as per BSD. + * Alan Cox : Stopped broadcast source route explosions. + * Alan Cox : Can disable source routing + * Takeshi Sone : Masquerading didn't work. + * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. + * Alan Cox : Memory leaks, tramples, misc debugging. + * Alan Cox : Fixed multicast (by popular demand 8)) + * Alan Cox : Fixed forwarding (by even more popular demand 8)) + * Alan Cox : Fixed SNMP statistics [I think] + * Gerhard Koerting : IP fragmentation forwarding fix + * Alan Cox : Device lock against page fault. + * Alan Cox : IP_HDRINCL facility. + * Werner Almesberger : Zero fragment bug + * Alan Cox : RAW IP frame length bug + * Alan Cox : Outgoing firewall on build_xmit + * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel + * Alan Cox : Multicast routing hooks + * Jos Vos : Do accounting *before* call_in_firewall + * Willy Konynenberg : Transparent proxying support + * + * + * + * To Fix: + * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient + * and could be made very efficient with the addition of some virtual memory hacks to permit + * the allocation of a buffer that can then be 'grown' by twiddling page tables. + * Output fragmentation wants updating along with the buffer management to use a single + * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet + * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause + * fragmentation anyway. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/system.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/config.h> + +#include <linux/net.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> + +#include <net/snmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/arp.h> +#include <net/icmp.h> +#include <net/raw.h> +#include <net/checksum.h> +#include <linux/netfilter_ipv4.h> +#include <net/xfrm.h> +#include <linux/mroute.h> +#include <linux/netlink.h> + +/* + * SNMP management statistics + */ + +DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics); + +/* + * Process Router Attention IP option + */ +int ip_call_ra_chain(struct sk_buff *skb) +{ + struct ip_ra_chain *ra; + u8 protocol = skb->nh.iph->protocol; + struct sock *last = NULL; + + read_lock(&ip_ra_lock); + for (ra = ip_ra_chain; ra; ra = ra->next) { + struct sock *sk = ra->sk; + + /* If socket is bound to an interface, only report + * the packet if it came from that interface. + */ + if (sk && inet_sk(sk)->num == protocol && + (!sk->sk_bound_dev_if || + sk->sk_bound_dev_if == skb->dev->ifindex)) { + if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN); + if (skb == NULL) { + read_unlock(&ip_ra_lock); + return 1; + } + } + if (last) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + raw_rcv(last, skb2); + } + last = sk; + } + } + + if (last) { + raw_rcv(last, skb); + read_unlock(&ip_ra_lock); + return 1; + } + read_unlock(&ip_ra_lock); + return 0; +} + +static inline int ip_local_deliver_finish(struct sk_buff *skb) +{ + int ihl = skb->nh.iph->ihl*4; + +#ifdef CONFIG_NETFILTER_DEBUG + nf_debug_ip_local_deliver(skb); +#endif /*CONFIG_NETFILTER_DEBUG*/ + + __skb_pull(skb, ihl); + + /* Free reference early: we don't need it any more, and it may + hold ip_conntrack module loaded indefinitely. */ + nf_reset(skb); + + /* Point into the IP datagram, just past the header. */ + skb->h.raw = skb->data; + + rcu_read_lock(); + { + /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ + int protocol = skb->nh.iph->protocol; + int hash; + struct sock *raw_sk; + struct net_protocol *ipprot; + + resubmit: + hash = protocol & (MAX_INET_PROTOS - 1); + raw_sk = sk_head(&raw_v4_htable[hash]); + + /* If there maybe a raw socket we must check - if not we + * don't care less + */ + if (raw_sk) + raw_v4_input(skb, skb->nh.iph, hash); + + if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { + int ret; + + if (!ipprot->no_policy && + !xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + goto out; + } + ret = ipprot->handler(skb); + if (ret < 0) { + protocol = -ret; + goto resubmit; + } + IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + } else { + if (!raw_sk) { + if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PROT_UNREACH, 0); + } + } else + IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + kfree_skb(skb); + } + } + out: + rcu_read_unlock(); + + return 0; +} + +/* + * Deliver IP Packets to the higher protocol layers. + */ +int ip_local_deliver(struct sk_buff *skb) +{ + /* + * Reassemble IP fragments. + */ + + if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER); + if (!skb) + return 0; + } + + return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL, + ip_local_deliver_finish); +} + +static inline int ip_rcv_finish(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct iphdr *iph = skb->nh.iph; + + /* + * Initialise the virtual path cache for the packet. It describes + * how the packet travels inside Linux networking. + */ + if (skb->dst == NULL) { + if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) + goto drop; + } + +#ifdef CONFIG_NET_CLS_ROUTE + if (skb->dst->tclassid) { + struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); + u32 idx = skb->dst->tclassid; + st[idx&0xFF].o_packets++; + st[idx&0xFF].o_bytes+=skb->len; + st[(idx>>16)&0xFF].i_packets++; + st[(idx>>16)&0xFF].i_bytes+=skb->len; + } +#endif + + if (iph->ihl > 5) { + struct ip_options *opt; + + /* It looks as overkill, because not all + IP options require packet mangling. + But it is the easiest for now, especially taking + into account that combination of IP options + and running sniffer is extremely rare condition. + --ANK (980813) + */ + + if (skb_cow(skb, skb_headroom(skb))) { + IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto drop; + } + iph = skb->nh.iph; + + if (ip_options_compile(NULL, skb)) + goto inhdr_error; + + opt = &(IPCB(skb)->opt); + if (opt->srr) { + struct in_device *in_dev = in_dev_get(dev); + if (in_dev) { + if (!IN_DEV_SOURCE_ROUTE(in_dev)) { + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) + printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n", + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + in_dev_put(in_dev); + goto drop; + } + in_dev_put(in_dev); + } + if (ip_options_rcv_srr(skb)) + goto drop; + } + } + + return dst_input(skb); + +inhdr_error: + IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +/* + * Main IP Receive routine. + */ +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct iphdr *iph; + + /* When the interface is in promisc. mode, drop all the crap + * that it receives, do not try to analyse it. + */ + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES); + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { + IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto out; + } + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto inhdr_error; + + iph = skb->nh.iph; + + /* + * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum. + * + * Is the datagram acceptable? + * + * 1. Length at least the size of an ip header + * 2. Version of 4 + * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] + * 4. Doesn't have a bogus length + */ + + if (iph->ihl < 5 || iph->version != 4) + goto inhdr_error; + + if (!pskb_may_pull(skb, iph->ihl*4)) + goto inhdr_error; + + iph = skb->nh.iph; + + if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + goto inhdr_error; + + { + __u32 len = ntohs(iph->tot_len); + if (skb->len < len || len < (iph->ihl<<2)) + goto inhdr_error; + + /* Our transport medium may have padded the buffer out. Now we know it + * is IP we can trim to the true length of the frame. + * Note this now means skb->len holds ntohs(iph->tot_len). + */ + if (pskb_trim_rcsum(skb, len)) { + IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto drop; + } + } + + return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, + ip_rcv_finish); + +inhdr_error: + IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); +drop: + kfree_skb(skb); +out: + return NET_RX_DROP; +} + +EXPORT_SYMBOL(ip_rcv); +EXPORT_SYMBOL(ip_statistics); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c new file mode 100644 index 000000000000..6d89f3f3e701 --- /dev/null +++ b/net/ipv4/ip_options.c @@ -0,0 +1,625 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The options processing module for ip.c + * + * Version: $Id: ip_options.c,v 1.21 2001/09/01 00:31:50 davem Exp $ + * + * Authors: A.N.Kuznetsov + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <asm/uaccess.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> + +/* + * Write options to IP header, record destination address to + * source route option, address of outgoing interface + * (we should already know it, so that this function is allowed be + * called only after routing decision) and timestamp, + * if we originate this datagram. + * + * daddr is real destination address, next hop is recorded in IP header. + * saddr is address of outgoing interface. + */ + +void ip_options_build(struct sk_buff * skb, struct ip_options * opt, + u32 daddr, struct rtable *rt, int is_frag) +{ + unsigned char * iph = skb->nh.raw; + + memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); + memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen); + opt = &(IPCB(skb)->opt); + opt->is_data = 0; + + if (opt->srr) + memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4); + + if (!is_frag) { + if (opt->rr_needaddr) + ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt); + if (opt->ts_needaddr) + ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt); + if (opt->ts_needtime) { + struct timeval tv; + __u32 midtime; + do_gettimeofday(&tv); + midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000); + memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4); + } + return; + } + if (opt->rr) { + memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]); + opt->rr = 0; + opt->rr_needaddr = 0; + } + if (opt->ts) { + memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]); + opt->ts = 0; + opt->ts_needaddr = opt->ts_needtime = 0; + } +} + +/* + * Provided (sopt, skb) points to received options, + * build in dopt compiled option set appropriate for answering. + * i.e. invert SRR option, copy anothers, + * and grab room in RR/TS options. + * + * NOTE: dopt cannot point to skb. + */ + +int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) +{ + struct ip_options *sopt; + unsigned char *sptr, *dptr; + int soffset, doffset; + int optlen; + u32 daddr; + + memset(dopt, 0, sizeof(struct ip_options)); + + dopt->is_data = 1; + + sopt = &(IPCB(skb)->opt); + + if (sopt->optlen == 0) { + dopt->optlen = 0; + return 0; + } + + sptr = skb->nh.raw; + dptr = dopt->__data; + + if (skb->dst) + daddr = ((struct rtable*)skb->dst)->rt_spec_dst; + else + daddr = skb->nh.iph->daddr; + + if (sopt->rr) { + optlen = sptr[sopt->rr+1]; + soffset = sptr[sopt->rr+2]; + dopt->rr = dopt->optlen + sizeof(struct iphdr); + memcpy(dptr, sptr+sopt->rr, optlen); + if (sopt->rr_needaddr && soffset <= optlen) { + if (soffset + 3 > optlen) + return -EINVAL; + dptr[2] = soffset + 4; + dopt->rr_needaddr = 1; + } + dptr += optlen; + dopt->optlen += optlen; + } + if (sopt->ts) { + optlen = sptr[sopt->ts+1]; + soffset = sptr[sopt->ts+2]; + dopt->ts = dopt->optlen + sizeof(struct iphdr); + memcpy(dptr, sptr+sopt->ts, optlen); + if (soffset <= optlen) { + if (sopt->ts_needaddr) { + if (soffset + 3 > optlen) + return -EINVAL; + dopt->ts_needaddr = 1; + soffset += 4; + } + if (sopt->ts_needtime) { + if (soffset + 3 > optlen) + return -EINVAL; + if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) { + dopt->ts_needtime = 1; + soffset += 4; + } else { + dopt->ts_needtime = 0; + + if (soffset + 8 <= optlen) { + __u32 addr; + + memcpy(&addr, sptr+soffset-1, 4); + if (inet_addr_type(addr) != RTN_LOCAL) { + dopt->ts_needtime = 1; + soffset += 8; + } + } + } + } + dptr[2] = soffset; + } + dptr += optlen; + dopt->optlen += optlen; + } + if (sopt->srr) { + unsigned char * start = sptr+sopt->srr; + u32 faddr; + + optlen = start[1]; + soffset = start[2]; + doffset = 0; + if (soffset > optlen) + soffset = optlen + 1; + soffset -= 4; + if (soffset > 3) { + memcpy(&faddr, &start[soffset-1], 4); + for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4) + memcpy(&dptr[doffset-1], &start[soffset-1], 4); + /* + * RFC1812 requires to fix illegal source routes. + */ + if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0) + doffset -= 4; + } + if (doffset > 3) { + memcpy(&start[doffset-1], &daddr, 4); + dopt->faddr = faddr; + dptr[0] = start[0]; + dptr[1] = doffset+3; + dptr[2] = 4; + dptr += doffset+3; + dopt->srr = dopt->optlen + sizeof(struct iphdr); + dopt->optlen += doffset+3; + dopt->is_strictroute = sopt->is_strictroute; + } + } + while (dopt->optlen & 3) { + *dptr++ = IPOPT_END; + dopt->optlen++; + } + return 0; +} + +/* + * Options "fragmenting", just fill options not + * allowed in fragments with NOOPs. + * Simple and stupid 8), but the most efficient way. + */ + +void ip_options_fragment(struct sk_buff * skb) +{ + unsigned char * optptr = skb->nh.raw; + struct ip_options * opt = &(IPCB(skb)->opt); + int l = opt->optlen; + int optlen; + + while (l > 0) { + switch (*optptr) { + case IPOPT_END: + return; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) + return; + if (!IPOPT_COPIED(*optptr)) + memset(optptr, IPOPT_NOOP, optlen); + l -= optlen; + optptr += optlen; + } + opt->ts = 0; + opt->rr = 0; + opt->rr_needaddr = 0; + opt->ts_needaddr = 0; + opt->ts_needtime = 0; + return; +} + +/* + * Verify options and fill pointers in struct options. + * Caller should clear *opt, and set opt->data. + * If opt == NULL, then skb->data should point to IP header. + */ + +int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) +{ + int l; + unsigned char * iph; + unsigned char * optptr; + int optlen; + unsigned char * pp_ptr = NULL; + struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL; + + if (!opt) { + opt = &(IPCB(skb)->opt); + memset(opt, 0, sizeof(struct ip_options)); + iph = skb->nh.raw; + opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr); + optptr = iph + sizeof(struct iphdr); + opt->is_data = 0; + } else { + optptr = opt->is_data ? opt->__data : (unsigned char*)&(skb->nh.iph[1]); + iph = optptr - sizeof(struct iphdr); + } + + for (l = opt->optlen; l > 0; ) { + switch (*optptr) { + case IPOPT_END: + for (optptr++, l--; l>0; optptr++, l--) { + if (*optptr != IPOPT_END) { + *optptr = IPOPT_END; + opt->is_changed = 1; + } + } + goto eol; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) { + pp_ptr = optptr; + goto error; + } + switch (*optptr) { + case IPOPT_SSRR: + case IPOPT_LSRR: + if (optlen < 3) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 4) { + pp_ptr = optptr + 2; + goto error; + } + /* NB: cf RFC-1812 5.2.4.1 */ + if (opt->srr) { + pp_ptr = optptr; + goto error; + } + if (!skb) { + if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) { + pp_ptr = optptr + 1; + goto error; + } + memcpy(&opt->faddr, &optptr[3], 4); + if (optlen > 7) + memmove(&optptr[3], &optptr[7], optlen-7); + } + opt->is_strictroute = (optptr[0] == IPOPT_SSRR); + opt->srr = optptr - iph; + break; + case IPOPT_RR: + if (opt->rr) { + pp_ptr = optptr; + goto error; + } + if (optlen < 3) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 4) { + pp_ptr = optptr + 2; + goto error; + } + if (optptr[2] <= optlen) { + if (optptr[2]+3 > optlen) { + pp_ptr = optptr + 2; + goto error; + } + if (skb) { + memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + opt->is_changed = 1; + } + optptr[2] += 4; + opt->rr_needaddr = 1; + } + opt->rr = optptr - iph; + break; + case IPOPT_TIMESTAMP: + if (opt->ts) { + pp_ptr = optptr; + goto error; + } + if (optlen < 4) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 5) { + pp_ptr = optptr + 2; + goto error; + } + if (optptr[2] <= optlen) { + __u32 * timeptr = NULL; + if (optptr[2]+3 > optptr[1]) { + pp_ptr = optptr + 2; + goto error; + } + switch (optptr[3]&0xF) { + case IPOPT_TS_TSONLY: + opt->ts = optptr - iph; + if (skb) + timeptr = (__u32*)&optptr[optptr[2]-1]; + opt->ts_needtime = 1; + optptr[2] += 4; + break; + case IPOPT_TS_TSANDADDR: + if (optptr[2]+7 > optptr[1]) { + pp_ptr = optptr + 2; + goto error; + } + opt->ts = optptr - iph; + if (skb) { + memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + timeptr = (__u32*)&optptr[optptr[2]+3]; + } + opt->ts_needaddr = 1; + opt->ts_needtime = 1; + optptr[2] += 8; + break; + case IPOPT_TS_PRESPEC: + if (optptr[2]+7 > optptr[1]) { + pp_ptr = optptr + 2; + goto error; + } + opt->ts = optptr - iph; + { + u32 addr; + memcpy(&addr, &optptr[optptr[2]-1], 4); + if (inet_addr_type(addr) == RTN_UNICAST) + break; + if (skb) + timeptr = (__u32*)&optptr[optptr[2]+3]; + } + opt->ts_needtime = 1; + optptr[2] += 8; + break; + default: + if (!skb && !capable(CAP_NET_RAW)) { + pp_ptr = optptr + 3; + goto error; + } + break; + } + if (timeptr) { + struct timeval tv; + __u32 midtime; + do_gettimeofday(&tv); + midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000); + memcpy(timeptr, &midtime, sizeof(__u32)); + opt->is_changed = 1; + } + } else { + unsigned overflow = optptr[3]>>4; + if (overflow == 15) { + pp_ptr = optptr + 3; + goto error; + } + opt->ts = optptr - iph; + if (skb) { + optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); + opt->is_changed = 1; + } + } + break; + case IPOPT_RA: + if (optlen < 4) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] == 0 && optptr[3] == 0) + opt->router_alert = optptr - iph; + break; + case IPOPT_SEC: + case IPOPT_SID: + default: + if (!skb && !capable(CAP_NET_RAW)) { + pp_ptr = optptr; + goto error; + } + break; + } + l -= optlen; + optptr += optlen; + } + +eol: + if (!pp_ptr) + return 0; + +error: + if (skb) { + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24)); + } + return -EINVAL; +} + + +/* + * Undo all the changes done by ip_options_compile(). + */ + +void ip_options_undo(struct ip_options * opt) +{ + if (opt->srr) { + unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr); + memmove(optptr+7, optptr+3, optptr[1]-7); + memcpy(optptr+3, &opt->faddr, 4); + } + if (opt->rr_needaddr) { + unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr); + optptr[2] -= 4; + memset(&optptr[optptr[2]-1], 0, 4); + } + if (opt->ts) { + unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr); + if (opt->ts_needtime) { + optptr[2] -= 4; + memset(&optptr[optptr[2]-1], 0, 4); + if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC) + optptr[2] -= 4; + } + if (opt->ts_needaddr) { + optptr[2] -= 4; + memset(&optptr[optptr[2]-1], 0, 4); + } + } +} + +int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user) +{ + struct ip_options *opt; + + opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL); + if (!opt) + return -ENOMEM; + memset(opt, 0, sizeof(struct ip_options)); + if (optlen) { + if (user) { + if (copy_from_user(opt->__data, data, optlen)) { + kfree(opt); + return -EFAULT; + } + } else + memcpy(opt->__data, data, optlen); + } + while (optlen & 3) + opt->__data[optlen++] = IPOPT_END; + opt->optlen = optlen; + opt->is_data = 1; + opt->is_setbyuser = 1; + if (optlen && ip_options_compile(opt, NULL)) { + kfree(opt); + return -EINVAL; + } + if (*optp) + kfree(*optp); + *optp = opt; + return 0; +} + +void ip_forward_options(struct sk_buff *skb) +{ + struct ip_options * opt = &(IPCB(skb)->opt); + unsigned char * optptr; + struct rtable *rt = (struct rtable*)skb->dst; + unsigned char *raw = skb->nh.raw; + + if (opt->rr_needaddr) { + optptr = (unsigned char *)raw + opt->rr; + ip_rt_get_source(&optptr[optptr[2]-5], rt); + opt->is_changed = 1; + } + if (opt->srr_is_hit) { + int srrptr, srrspace; + + optptr = raw + opt->srr; + + for ( srrptr=optptr[2], srrspace = optptr[1]; + srrptr <= srrspace; + srrptr += 4 + ) { + if (srrptr + 3 > srrspace) + break; + if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0) + break; + } + if (srrptr + 3 <= srrspace) { + opt->is_changed = 1; + ip_rt_get_source(&optptr[srrptr-1], rt); + skb->nh.iph->daddr = rt->rt_dst; + optptr[2] = srrptr+4; + } else if (net_ratelimit()) + printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); + if (opt->ts_needaddr) { + optptr = raw + opt->ts; + ip_rt_get_source(&optptr[optptr[2]-9], rt); + opt->is_changed = 1; + } + } + if (opt->is_changed) { + opt->is_changed = 0; + ip_send_check(skb->nh.iph); + } +} + +int ip_options_rcv_srr(struct sk_buff *skb) +{ + struct ip_options *opt = &(IPCB(skb)->opt); + int srrspace, srrptr; + u32 nexthop; + struct iphdr *iph = skb->nh.iph; + unsigned char * optptr = skb->nh.raw + opt->srr; + struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt2; + int err; + + if (!opt->srr) + return 0; + + if (skb->pkt_type != PACKET_HOST) + return -EINVAL; + if (rt->rt_type == RTN_UNICAST) { + if (!opt->is_strictroute) + return 0; + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24)); + return -EINVAL; + } + if (rt->rt_type != RTN_LOCAL) + return -EINVAL; + + for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { + if (srrptr + 3 > srrspace) { + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); + return -EINVAL; + } + memcpy(&nexthop, &optptr[srrptr-1], 4); + + rt = (struct rtable*)skb->dst; + skb->dst = NULL; + err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); + rt2 = (struct rtable*)skb->dst; + if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { + ip_rt_put(rt2); + skb->dst = &rt->u.dst; + return -EINVAL; + } + ip_rt_put(rt); + if (rt2->rt_type != RTN_LOCAL) + break; + /* Superfast 8) loopback forward */ + memcpy(&iph->daddr, &optptr[srrptr-1], 4); + opt->is_changed = 1; + } + if (srrptr <= srrspace) { + opt->srr_is_hit = 1; + opt->is_changed = 1; + } + return 0; +} + +EXPORT_SYMBOL(ip_options_compile); +EXPORT_SYMBOL(ip_options_undo); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c new file mode 100644 index 000000000000..30ab7b6ab761 --- /dev/null +++ b/net/ipv4/ip_output.c @@ -0,0 +1,1359 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The Internet Protocol (IP) output module. + * + * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Donald Becker, <becker@super.org> + * Alan Cox, <Alan.Cox@linux.org> + * Richard Underwood + * Stefan Becker, <stefanb@yello.ping.de> + * Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Hirokazu Takahashi, <taka@valinux.co.jp> + * + * See ip_input.c for original log + * + * Fixes: + * Alan Cox : Missing nonblock feature in ip_build_xmit. + * Mike Kilburn : htons() missing in ip_build_xmit. + * Bradford Johnson: Fix faulty handling of some frames when + * no route is found. + * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit + * (in case if packet not accepted by + * output firewall rules) + * Mike McLagan : Routing by source + * Alexey Kuznetsov: use new route cache + * Andi Kleen: Fix broken PMTU recovery and remove + * some redundant tests. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Andi Kleen : Replace ip_reply with ip_send_reply. + * Andi Kleen : Split fast and slow ip_build_xmit path + * for decreased register pressure on x86 + * and more readibility. + * Marc Boucher : When call_out_firewall returns FW_QUEUE, + * silently drop skb instead of failing with -EPERM. + * Detlev Wengorz : Copy protocol for fragments. + * Hirokazu Takahashi: HW checksumming for outgoing UDP + * datagrams. + * Hirokazu Takahashi: sendfile() on UDP works now. + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/config.h> + +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/init.h> + +#include <net/snmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/arp.h> +#include <net/icmp.h> +#include <net/raw.h> +#include <net/checksum.h> +#include <net/inetpeer.h> +#include <net/checksum.h> +#include <linux/igmp.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_bridge.h> +#include <linux/mroute.h> +#include <linux/netlink.h> + +/* + * Shall we try to damage output packets if routing dev changes? + */ + +int sysctl_ip_dynaddr; +int sysctl_ip_default_ttl = IPDEFTTL; + +/* Generate a checksum for an outgoing IP datagram. */ +__inline__ void ip_send_check(struct iphdr *iph) +{ + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); +} + +/* dev_loopback_xmit for use with netfilter. */ +static int ip_dev_loopback_xmit(struct sk_buff *newskb) +{ + newskb->mac.raw = newskb->data; + __skb_pull(newskb, newskb->nh.raw - newskb->data); + newskb->pkt_type = PACKET_LOOPBACK; + newskb->ip_summed = CHECKSUM_UNNECESSARY; + BUG_TRAP(newskb->dst); + +#ifdef CONFIG_NETFILTER_DEBUG + nf_debug_ip_loopback_xmit(newskb); +#endif + netif_rx(newskb); + return 0; +} + +static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) +{ + int ttl = inet->uc_ttl; + + if (ttl < 0) + ttl = dst_metric(dst, RTAX_HOPLIMIT); + return ttl; +} + +/* + * Add an ip header to a skbuff and send it out. + * + */ +int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, + u32 saddr, u32 daddr, struct ip_options *opt) +{ + struct inet_sock *inet = inet_sk(sk); + struct rtable *rt = (struct rtable *)skb->dst; + struct iphdr *iph; + + /* Build the IP header. */ + if (opt) + iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen); + else + iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr)); + + iph->version = 4; + iph->ihl = 5; + iph->tos = inet->tos; + if (ip_dont_fragment(sk, &rt->u.dst)) + iph->frag_off = htons(IP_DF); + else + iph->frag_off = 0; + iph->ttl = ip_select_ttl(inet, &rt->u.dst); + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->protocol = sk->sk_protocol; + iph->tot_len = htons(skb->len); + ip_select_ident(iph, &rt->u.dst, sk); + skb->nh.iph = iph; + + if (opt && opt->optlen) { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, daddr, rt, 0); + } + ip_send_check(iph); + + skb->priority = sk->sk_priority; + + /* Send it out. */ + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + dst_output); +} + +static inline int ip_finish_output2(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct hh_cache *hh = dst->hh; + struct net_device *dev = dst->dev; + int hh_len = LL_RESERVED_SPACE(dev); + + /* Be paranoid, rather than too clever. */ + if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { + struct sk_buff *skb2; + + skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); + if (skb2 == NULL) { + kfree_skb(skb); + return -ENOMEM; + } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + kfree_skb(skb); + skb = skb2; + } + +#ifdef CONFIG_NETFILTER_DEBUG + nf_debug_ip_finish_output2(skb); +#endif /*CONFIG_NETFILTER_DEBUG*/ + + if (hh) { + int hh_alen; + + read_lock_bh(&hh->hh_lock); + hh_alen = HH_DATA_ALIGN(hh->hh_len); + memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); + read_unlock_bh(&hh->hh_lock); + skb_push(skb, hh->hh_len); + return hh->hh_output(skb); + } else if (dst->neighbour) + return dst->neighbour->output(skb); + + if (net_ratelimit()) + printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); + kfree_skb(skb); + return -EINVAL; +} + +int ip_finish_output(struct sk_buff *skb) +{ + struct net_device *dev = skb->dst->dev; + + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + + return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + ip_finish_output2); +} + +int ip_mc_output(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct rtable *rt = (struct rtable*)skb->dst; + struct net_device *dev = rt->u.dst.dev; + + /* + * If the indicated interface is up and running, send the packet. + */ + IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + + /* + * Multicasts are looped back for other local users + */ + + if (rt->rt_flags&RTCF_MULTICAST) { + if ((!sk || inet_sk(sk)->mc_loop) +#ifdef CONFIG_IP_MROUTE + /* Small optimization: do not loopback not local frames, + which returned after forwarding; they will be dropped + by ip_mr_input in any case. + Note, that local frames are looped back to be delivered + to local recipients. + + This check is duplicated in ip_mr_input at the moment. + */ + && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) +#endif + ) { + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + if (newskb) + NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, + newskb->dev, + ip_dev_loopback_xmit); + } + + /* Multicasts with ttl 0 must not go beyond the host */ + + if (skb->nh.iph->ttl == 0) { + kfree_skb(skb); + return 0; + } + } + + if (rt->rt_flags&RTCF_BROADCAST) { + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + if (newskb) + NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, + newskb->dev, ip_dev_loopback_xmit); + } + + if (skb->len > dst_mtu(&rt->u.dst)) + return ip_fragment(skb, ip_finish_output); + else + return ip_finish_output(skb); +} + +int ip_output(struct sk_buff *skb) +{ + IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + + if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size) + return ip_fragment(skb, ip_finish_output); + else + return ip_finish_output(skb); +} + +int ip_queue_xmit(struct sk_buff *skb, int ipfragok) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct ip_options *opt = inet->opt; + struct rtable *rt; + struct iphdr *iph; + + /* Skip all of this if the packet is already routed, + * f.e. by something like SCTP. + */ + rt = (struct rtable *) skb->dst; + if (rt != NULL) + goto packet_routed; + + /* Make sure we can route this packet. */ + rt = (struct rtable *)__sk_dst_check(sk, 0); + if (rt == NULL) { + u32 daddr; + + /* Use correct destination address if we have options. */ + daddr = inet->daddr; + if(opt && opt->srr) + daddr = opt->faddr; + + { + struct flowi fl = { .oif = sk->sk_bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = inet->saddr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = sk->sk_protocol, + .uli_u = { .ports = + { .sport = inet->sport, + .dport = inet->dport } } }; + + /* If this fails, retransmit mechanism of transport layer will + * keep trying until route appears or the connection times + * itself out. + */ + if (ip_route_output_flow(&rt, &fl, sk, 0)) + goto no_route; + } + __sk_dst_set(sk, &rt->u.dst); + tcp_v4_setup_caps(sk, &rt->u.dst); + } + skb->dst = dst_clone(&rt->u.dst); + +packet_routed: + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + goto no_route; + + /* OK, we know where to send it, allocate and build IP header. */ + iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); + iph->tot_len = htons(skb->len); + if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) + iph->frag_off = htons(IP_DF); + else + iph->frag_off = 0; + iph->ttl = ip_select_ttl(inet, &rt->u.dst); + iph->protocol = sk->sk_protocol; + iph->saddr = rt->rt_src; + iph->daddr = rt->rt_dst; + skb->nh.iph = iph; + /* Transport layer set skb->h.foo itself. */ + + if (opt && opt->optlen) { + iph->ihl += opt->optlen >> 2; + ip_options_build(skb, opt, inet->daddr, rt, 0); + } + + ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs); + + /* Add an IP checksum. */ + ip_send_check(iph); + + skb->priority = sk->sk_priority; + + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + dst_output); + +no_route: + IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + kfree_skb(skb); + return -EHOSTUNREACH; +} + + +static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) +{ + to->pkt_type = from->pkt_type; + to->priority = from->priority; + to->protocol = from->protocol; + to->security = from->security; + dst_release(to->dst); + to->dst = dst_clone(from->dst); + to->dev = from->dev; + + /* Copy the flags to each fragment. */ + IPCB(to)->flags = IPCB(from)->flags; + +#ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; +#endif +#ifdef CONFIG_NETFILTER + to->nfmark = from->nfmark; + to->nfcache = from->nfcache; + /* Connection association is same as pre-frag packet */ + nf_conntrack_put(to->nfct); + to->nfct = from->nfct; + nf_conntrack_get(to->nfct); + to->nfctinfo = from->nfctinfo; +#ifdef CONFIG_BRIDGE_NETFILTER + nf_bridge_put(to->nf_bridge); + to->nf_bridge = from->nf_bridge; + nf_bridge_get(to->nf_bridge); +#endif +#ifdef CONFIG_NETFILTER_DEBUG + to->nf_debug = from->nf_debug; +#endif +#endif +} + +/* + * This IP datagram is too large to be sent in one piece. Break it up into + * smaller pieces (each of size equal to IP header plus + * a block of the data of the original IP data part) that will yet fit in a + * single device frame, and queue such a frame for sending. + */ + +int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) +{ + struct iphdr *iph; + int raw = 0; + int ptr; + struct net_device *dev; + struct sk_buff *skb2; + unsigned int mtu, hlen, left, len, ll_rs; + int offset; + int not_last_frag; + struct rtable *rt = (struct rtable*)skb->dst; + int err = 0; + + dev = rt->u.dst.dev; + + /* + * Point into the IP datagram header. + */ + + iph = skb->nh.iph; + + if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(dst_mtu(&rt->u.dst))); + kfree_skb(skb); + return -EMSGSIZE; + } + + /* + * Setup starting values. + */ + + hlen = iph->ihl * 4; + mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + + /* When frag_list is given, use it. First, check its validity: + * some transformers could create wrong frag_list or break existing + * one, it is not prohibited. In this case fall back to copying. + * + * LATER: this step can be merged to real generation of fragments, + * we can switch to copy when see the first bad fragment. + */ + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *frag; + int first_len = skb_pagelen(skb); + + if (first_len - hlen > mtu || + ((first_len - hlen) & 7) || + (iph->frag_off & htons(IP_MF|IP_OFFSET)) || + skb_cloned(skb)) + goto slow_path; + + for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + /* Correct geometry. */ + if (frag->len > mtu || + ((frag->len & 7) && frag->next) || + skb_headroom(frag) < hlen) + goto slow_path; + + /* Partially cloned skb? */ + if (skb_shared(frag)) + goto slow_path; + } + + /* Everything is OK. Generate! */ + + err = 0; + offset = 0; + frag = skb_shinfo(skb)->frag_list; + skb_shinfo(skb)->frag_list = NULL; + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + iph->tot_len = htons(first_len); + iph->frag_off = htons(IP_MF); + ip_send_check(iph); + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. */ + if (frag) { + frag->ip_summed = CHECKSUM_NONE; + frag->h.raw = frag->data; + frag->nh.raw = __skb_push(frag, hlen); + memcpy(frag->nh.raw, iph, hlen); + iph = frag->nh.iph; + iph->tot_len = htons(frag->len); + ip_copy_metadata(frag, skb); + if (offset == 0) + ip_options_fragment(frag); + offset += skb->len - hlen; + iph->frag_off = htons(offset>>3); + if (frag->next != NULL) + iph->frag_off |= htons(IP_MF); + /* Ready, complete checksum */ + ip_send_check(iph); + } + + err = output(skb); + + if (err || !frag) + break; + + skb = frag; + frag = skb->next; + skb->next = NULL; + } + + if (err == 0) { + IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + return 0; + } + + while (frag) { + skb = frag->next; + kfree_skb(frag); + frag = skb; + } + IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + return err; + } + +slow_path: + left = skb->len - hlen; /* Space per frame */ + ptr = raw + hlen; /* Where to start from */ + +#ifdef CONFIG_BRIDGE_NETFILTER + /* for bridged IP traffic encapsulated inside f.e. a vlan header, + * we need to make room for the encapsulating header */ + ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb)); + mtu -= nf_bridge_pad(skb); +#else + ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev); +#endif + /* + * Fragment the datagram. + */ + + offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; + not_last_frag = iph->frag_off & htons(IP_MF); + + /* + * Keep copying data until we run out. + */ + + while(left > 0) { + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) + len = mtu; + /* IF: we are not sending upto and including the packet end + then align the next start on an eight byte boundary */ + if (len < left) { + len &= ~7; + } + /* + * Allocate buffer. + */ + + if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { + NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); + err = -ENOMEM; + goto fail; + } + + /* + * Set up data on packet + */ + + ip_copy_metadata(skb2, skb); + skb_reserve(skb2, ll_rs); + skb_put(skb2, len + hlen); + skb2->nh.raw = skb2->data; + skb2->h.raw = skb2->data + hlen; + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + + memcpy(skb2->nh.raw, skb->data, hlen); + + /* + * Copy a block of the IP datagram. + */ + if (skb_copy_bits(skb, ptr, skb2->h.raw, len)) + BUG(); + left -= len; + + /* + * Fill in the new header fields. + */ + iph = skb2->nh.iph; + iph->frag_off = htons((offset >> 3)); + + /* ANK: dirty, but effective trick. Upgrade options only if + * the segment to be fragmented was THE FIRST (otherwise, + * options are already fixed) and make it ONCE + * on the initial skb, so that all the following fragments + * will inherit fixed options. + */ + if (offset == 0) + ip_options_fragment(skb); + + /* + * Added AC : If we are fragmenting a fragment that's not the + * last fragment then keep MF on each bit + */ + if (left > 0 || not_last_frag) + iph->frag_off |= htons(IP_MF); + ptr += len; + offset += len; + + /* + * Put this fragment into the sending queue. + */ + + IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + + iph->tot_len = htons(len + hlen); + + ip_send_check(iph); + + err = output(skb2); + if (err) + goto fail; + } + kfree_skb(skb); + IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + return err; + +fail: + kfree_skb(skb); + IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + return err; +} + +int +ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) +{ + struct iovec *iov = from; + + if (skb->ip_summed == CHECKSUM_HW) { + if (memcpy_fromiovecend(to, iov, offset, len) < 0) + return -EFAULT; + } else { + unsigned int csum = 0; + if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) + return -EFAULT; + skb->csum = csum_block_add(skb->csum, csum, odd); + } + return 0; +} + +static inline unsigned int +csum_page(struct page *page, int offset, int copy) +{ + char *kaddr; + unsigned int csum; + kaddr = kmap(page); + csum = csum_partial(kaddr + offset, copy, 0); + kunmap(page); + return csum; +} + +/* + * ip_append_data() and ip_append_page() can make one large IP datagram + * from many pieces of data. Each pieces will be holded on the socket + * until ip_push_pending_frames() is called. Each piece can be a page + * or non-page data. + * + * Not only UDP, other transport protocols - e.g. raw sockets - can use + * this interface potentially. + * + * LATER: length must be adjusted by pad at tail, when it is required. + */ +int ip_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable *rt, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct sk_buff *skb; + + struct ip_options *opt = NULL; + int hh_len; + int exthdrlen; + int mtu; + int copy; + int err; + int offset = 0; + unsigned int maxfraglen, fragheaderlen; + int csummode = CHECKSUM_NONE; + + if (flags&MSG_PROBE) + return 0; + + if (skb_queue_empty(&sk->sk_write_queue)) { + /* + * setup for corking. + */ + opt = ipc->opt; + if (opt) { + if (inet->cork.opt == NULL) { + inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); + if (unlikely(inet->cork.opt == NULL)) + return -ENOBUFS; + } + memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen); + inet->cork.flags |= IPCORK_OPT; + inet->cork.addr = ipc->addr; + } + dst_hold(&rt->u.dst); + inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); + inet->cork.rt = rt; + inet->cork.length = 0; + sk->sk_sndmsg_page = NULL; + sk->sk_sndmsg_off = 0; + if ((exthdrlen = rt->u.dst.header_len) != 0) { + length += exthdrlen; + transhdrlen += exthdrlen; + } + } else { + rt = inet->cork.rt; + if (inet->cork.flags & IPCORK_OPT) + opt = inet->cork.opt; + + transhdrlen = 0; + exthdrlen = 0; + mtu = inet->cork.fragsize; + } + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + + fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + + if (inet->cork.length + length > 0xFFFF - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); + return -EMSGSIZE; + } + + /* + * transhdrlen > 0 means that this is the first fragment and we wish + * it won't be fragmented in the future. + */ + if (transhdrlen && + length + fragheaderlen <= mtu && + rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && + !exthdrlen) + csummode = CHECKSUM_HW; + + inet->cork.length += length; + + /* So, what's going on in the loop below? + * + * We use calculated fragment length to generate chained skb, + * each of segments is IP fragment ready for sending to network after + * adding appropriate IP header. + */ + + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + goto alloc_new_skb; + + while (length > 0) { + /* Check if the remaining data fits into current packet. */ + copy = mtu - skb->len; + if (copy < length) + copy = maxfraglen - skb->len; + if (copy <= 0) { + char *data; + unsigned int datalen; + unsigned int fraglen; + unsigned int fraggap; + unsigned int alloclen; + struct sk_buff *skb_prev; +alloc_new_skb: + skb_prev = skb; + if (skb_prev) + fraggap = skb_prev->len - maxfraglen; + else + fraggap = 0; + + /* + * If remaining data exceeds the mtu, + * we know we need more fragment(s). + */ + datalen = length + fraggap; + if (datalen > mtu - fragheaderlen) + datalen = maxfraglen - fragheaderlen; + fraglen = datalen + fragheaderlen; + + if ((flags & MSG_MORE) && + !(rt->u.dst.dev->features&NETIF_F_SG)) + alloclen = mtu; + else + alloclen = datalen + fragheaderlen; + + /* The last fragment gets additional space at tail. + * Note, with MSG_MORE we overallocate on fragments, + * because we have no idea what fragment will be + * the last. + */ + if (datalen == length) + alloclen += rt->u.dst.trailer_len; + + if (transhdrlen) { + skb = sock_alloc_send_skb(sk, + alloclen + hh_len + 15, + (flags & MSG_DONTWAIT), &err); + } else { + skb = NULL; + if (atomic_read(&sk->sk_wmem_alloc) <= + 2 * sk->sk_sndbuf) + skb = sock_wmalloc(sk, + alloclen + hh_len + 15, 1, + sk->sk_allocation); + if (unlikely(skb == NULL)) + err = -ENOBUFS; + } + if (skb == NULL) + goto error; + + /* + * Fill in the control structures + */ + skb->ip_summed = csummode; + skb->csum = 0; + skb_reserve(skb, hh_len); + + /* + * Find where to start putting bytes. + */ + data = skb_put(skb, fraglen); + skb->nh.raw = data + exthdrlen; + data += fragheaderlen; + skb->h.raw = data + exthdrlen; + + if (fraggap) { + skb->csum = skb_copy_and_csum_bits( + skb_prev, maxfraglen, + data + transhdrlen, fraggap, 0); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + data += fraggap; + skb_trim(skb_prev, maxfraglen); + } + + copy = datalen - transhdrlen - fraggap; + if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + offset += copy; + length -= datalen - fraggap; + transhdrlen = 0; + exthdrlen = 0; + csummode = CHECKSUM_NONE; + + /* + * Put the packet on the pending queue. + */ + __skb_queue_tail(&sk->sk_write_queue, skb); + continue; + } + + if (copy > length) + copy = length; + + if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + unsigned int off; + + off = skb->len; + if (getfrag(from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { + __skb_trim(skb, off); + err = -EFAULT; + goto error; + } + } else { + int i = skb_shinfo(skb)->nr_frags; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; + struct page *page = sk->sk_sndmsg_page; + int off = sk->sk_sndmsg_off; + unsigned int left; + + if (page && (left = PAGE_SIZE - off) > 0) { + if (copy >= left) + copy = left; + if (page != frag->page) { + if (i == MAX_SKB_FRAGS) { + err = -EMSGSIZE; + goto error; + } + get_page(page); + skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); + frag = &skb_shinfo(skb)->frags[i]; + } + } else if (i < MAX_SKB_FRAGS) { + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + page = alloc_pages(sk->sk_allocation, 0); + if (page == NULL) { + err = -ENOMEM; + goto error; + } + sk->sk_sndmsg_page = page; + sk->sk_sndmsg_off = 0; + + skb_fill_page_desc(skb, i, page, 0, 0); + frag = &skb_shinfo(skb)->frags[i]; + skb->truesize += PAGE_SIZE; + atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); + } else { + err = -EMSGSIZE; + goto error; + } + if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { + err = -EFAULT; + goto error; + } + sk->sk_sndmsg_off += copy; + frag->size += copy; + skb->len += copy; + skb->data_len += copy; + } + offset += copy; + length -= copy; + } + + return 0; + +error: + inet->cork.length -= length; + IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return err; +} + +ssize_t ip_append_page(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct sk_buff *skb; + struct rtable *rt; + struct ip_options *opt = NULL; + int hh_len; + int mtu; + int len; + int err; + unsigned int maxfraglen, fragheaderlen, fraggap; + + if (inet->hdrincl) + return -EPERM; + + if (flags&MSG_PROBE) + return 0; + + if (skb_queue_empty(&sk->sk_write_queue)) + return -EINVAL; + + rt = inet->cork.rt; + if (inet->cork.flags & IPCORK_OPT) + opt = inet->cork.opt; + + if (!(rt->u.dst.dev->features&NETIF_F_SG)) + return -EOPNOTSUPP; + + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + mtu = inet->cork.fragsize; + + fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + + if (inet->cork.length + size > 0xFFFF - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu); + return -EMSGSIZE; + } + + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + return -EINVAL; + + inet->cork.length += size; + + while (size > 0) { + int i; + + /* Check if the remaining data fits into current packet. */ + len = mtu - skb->len; + if (len < size) + len = maxfraglen - skb->len; + if (len <= 0) { + struct sk_buff *skb_prev; + char *data; + struct iphdr *iph; + int alloclen; + + skb_prev = skb; + if (skb_prev) + fraggap = skb_prev->len - maxfraglen; + else + fraggap = 0; + + alloclen = fragheaderlen + hh_len + fraggap + 15; + skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); + if (unlikely(!skb)) { + err = -ENOBUFS; + goto error; + } + + /* + * Fill in the control structures + */ + skb->ip_summed = CHECKSUM_NONE; + skb->csum = 0; + skb_reserve(skb, hh_len); + + /* + * Find where to start putting bytes. + */ + data = skb_put(skb, fragheaderlen + fraggap); + skb->nh.iph = iph = (struct iphdr *)data; + data += fragheaderlen; + skb->h.raw = data; + + if (fraggap) { + skb->csum = skb_copy_and_csum_bits( + skb_prev, maxfraglen, + data, fraggap, 0); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + skb_trim(skb_prev, maxfraglen); + } + + /* + * Put the packet on the pending queue. + */ + __skb_queue_tail(&sk->sk_write_queue, skb); + continue; + } + + i = skb_shinfo(skb)->nr_frags; + if (len > size) + len = size; + if (skb_can_coalesce(skb, i, page, offset)) { + skb_shinfo(skb)->frags[i-1].size += len; + } else if (i < MAX_SKB_FRAGS) { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, len); + } else { + err = -EMSGSIZE; + goto error; + } + + if (skb->ip_summed == CHECKSUM_NONE) { + unsigned int csum; + csum = csum_page(page, offset, len); + skb->csum = csum_block_add(skb->csum, csum, skb->len); + } + + skb->len += len; + skb->data_len += len; + offset += len; + size -= len; + } + return 0; + +error: + inet->cork.length -= size; + IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return err; +} + +/* + * Combined all pending IP fragments on the socket as one IP datagram + * and push them out. + */ +int ip_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb, *tmp_skb; + struct sk_buff **tail_skb; + struct inet_sock *inet = inet_sk(sk); + struct ip_options *opt = NULL; + struct rtable *rt = inet->cork.rt; + struct iphdr *iph; + int df = 0; + __u8 ttl; + int err = 0; + + if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) + goto out; + tail_skb = &(skb_shinfo(skb)->frag_list); + + /* move skb->data to ip header from ext header */ + if (skb->data < skb->nh.raw) + __skb_pull(skb, skb->nh.raw - skb->data); + while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); + *tail_skb = tmp_skb; + tail_skb = &(tmp_skb->next); + skb->len += tmp_skb->len; + skb->data_len += tmp_skb->len; + skb->truesize += tmp_skb->truesize; + __sock_put(tmp_skb->sk); + tmp_skb->destructor = NULL; + tmp_skb->sk = NULL; + } + + /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow + * to fragment the frame generated here. No matter, what transforms + * how transforms change size of the packet, it will come out. + */ + if (inet->pmtudisc != IP_PMTUDISC_DO) + skb->local_df = 1; + + /* DF bit is set when we want to see DF on outgoing frames. + * If local_df is set too, we still allow to fragment this frame + * locally. */ + if (inet->pmtudisc == IP_PMTUDISC_DO || + (skb->len <= dst_mtu(&rt->u.dst) && + ip_dont_fragment(sk, &rt->u.dst))) + df = htons(IP_DF); + + if (inet->cork.flags & IPCORK_OPT) + opt = inet->cork.opt; + + if (rt->rt_type == RTN_MULTICAST) + ttl = inet->mc_ttl; + else + ttl = ip_select_ttl(inet, &rt->u.dst); + + iph = (struct iphdr *)skb->data; + iph->version = 4; + iph->ihl = 5; + if (opt) { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, inet->cork.addr, rt, 0); + } + iph->tos = inet->tos; + iph->tot_len = htons(skb->len); + iph->frag_off = df; + if (!df) { + __ip_select_ident(iph, &rt->u.dst, 0); + } else { + iph->id = htons(inet->id++); + } + iph->ttl = ttl; + iph->protocol = sk->sk_protocol; + iph->saddr = rt->rt_src; + iph->daddr = rt->rt_dst; + ip_send_check(iph); + + skb->priority = sk->sk_priority; + skb->dst = dst_clone(&rt->u.dst); + + /* Netfilter gets whole the not fragmented skb. */ + err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, + skb->dst->dev, dst_output); + if (err) { + if (err > 0) + err = inet->recverr ? net_xmit_errno(err) : 0; + if (err) + goto error; + } + +out: + inet->cork.flags &= ~IPCORK_OPT; + if (inet->cork.opt) { + kfree(inet->cork.opt); + inet->cork.opt = NULL; + } + if (inet->cork.rt) { + ip_rt_put(inet->cork.rt); + inet->cork.rt = NULL; + } + return err; + +error: + IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + goto out; +} + +/* + * Throw away all pending data on the socket. + */ +void ip_flush_pending_frames(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct sk_buff *skb; + + while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) + kfree_skb(skb); + + inet->cork.flags &= ~IPCORK_OPT; + if (inet->cork.opt) { + kfree(inet->cork.opt); + inet->cork.opt = NULL; + } + if (inet->cork.rt) { + ip_rt_put(inet->cork.rt); + inet->cork.rt = NULL; + } +} + + +/* + * Fetch data from kernel space and fill in checksum if needed. + */ +static int ip_reply_glue_bits(void *dptr, char *to, int offset, + int len, int odd, struct sk_buff *skb) +{ + unsigned int csum; + + csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0); + skb->csum = csum_block_add(skb->csum, csum, odd); + return 0; +} + +/* + * Generic function to send a packet as reply to another packet. + * Used to send TCP resets so far. ICMP should use this function too. + * + * Should run single threaded per socket because it uses the sock + * structure to pass arguments. + * + * LATER: switch from ip_build_xmit to ip_append_* + */ +void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, + unsigned int len) +{ + struct inet_sock *inet = inet_sk(sk); + struct { + struct ip_options opt; + char data[40]; + } replyopts; + struct ipcm_cookie ipc; + u32 daddr; + struct rtable *rt = (struct rtable*)skb->dst; + + if (ip_options_echo(&replyopts.opt, skb)) + return; + + daddr = ipc.addr = rt->rt_src; + ipc.opt = NULL; + + if (replyopts.opt.optlen) { + ipc.opt = &replyopts.opt; + + if (ipc.opt->srr) + daddr = replyopts.opt.faddr; + } + + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = rt->rt_spec_dst, + .tos = RT_TOS(skb->nh.iph->tos) } }, + /* Not quite clean, but right. */ + .uli_u = { .ports = + { .sport = skb->h.th->dest, + .dport = skb->h.th->source } }, + .proto = sk->sk_protocol }; + if (ip_route_output_key(&rt, &fl)) + return; + } + + /* And let IP do all the hard work. + + This chunk is not reenterable, hence spinlock. + Note that it uses the fact, that this function is called + with locally disabled BH and that sk cannot be already spinlocked. + */ + bh_lock_sock(sk); + inet->tos = skb->nh.iph->tos; + sk->sk_priority = skb->priority; + sk->sk_protocol = skb->nh.iph->protocol; + ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, + &ipc, rt, MSG_DONTWAIT); + if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { + if (arg->csumoffset >= 0) + *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum)); + skb->ip_summed = CHECKSUM_NONE; + ip_push_pending_frames(sk); + } + + bh_unlock_sock(sk); + + ip_rt_put(rt); +} + +/* + * IP protocol layer initialiser + */ + +static struct packet_type ip_packet_type = { + .type = __constant_htons(ETH_P_IP), + .func = ip_rcv, +}; + +/* + * IP registers the packet type and then calls the subprotocol initialisers + */ + +void __init ip_init(void) +{ + dev_add_pack(&ip_packet_type); + + ip_rt_init(); + inet_initpeers(); + +#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) + igmp_mc_proc_init(); +#endif +} + +EXPORT_SYMBOL(ip_finish_output); +EXPORT_SYMBOL(ip_fragment); +EXPORT_SYMBOL(ip_generic_getfrag); +EXPORT_SYMBOL(ip_queue_xmit); +EXPORT_SYMBOL(ip_send_check); + +#ifdef CONFIG_SYSCTL +EXPORT_SYMBOL(sysctl_ip_default_ttl); +#endif diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c new file mode 100644 index 000000000000..47012b93cad2 --- /dev/null +++ b/net/ipv4/ip_sockglue.c @@ -0,0 +1,1093 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP to API glue. + * + * Version: $Id: ip_sockglue.c,v 1.62 2002/02/01 22:01:04 davem Exp $ + * + * Authors: see ip.c + * + * Fixes: + * Many : Split from ip.c , see ip.c for history. + * Martin Mares : TOS setting fixed. + * Alan Cox : Fixed a couple of oopses in Martin's + * TOS tweaks. + * Mike McLagan : Routing by source + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/tcp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/igmp.h> +#include <linux/netfilter.h> +#include <linux/route.h> +#include <linux/mroute.h> +#include <net/route.h> +#include <net/xfrm.h> +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#include <net/transp_v6.h> +#endif + +#include <linux/errqueue.h> +#include <asm/uaccess.h> + +#define IP_CMSG_PKTINFO 1 +#define IP_CMSG_TTL 2 +#define IP_CMSG_TOS 4 +#define IP_CMSG_RECVOPTS 8 +#define IP_CMSG_RETOPTS 16 + +/* + * SOL_IP control messages. + */ + +static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct in_pktinfo info; + struct rtable *rt = (struct rtable *)skb->dst; + + info.ipi_addr.s_addr = skb->nh.iph->daddr; + if (rt) { + info.ipi_ifindex = rt->rt_iif; + info.ipi_spec_dst.s_addr = rt->rt_spec_dst; + } else { + info.ipi_ifindex = 0; + info.ipi_spec_dst.s_addr = 0; + } + + put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); +} + +static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) +{ + int ttl = skb->nh.iph->ttl; + put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl); +} + +static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) +{ + put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos); +} + +static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) +{ + if (IPCB(skb)->opt.optlen == 0) + return; + + put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1); +} + + +static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) +{ + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options * opt = (struct ip_options*)optbuf; + + if (IPCB(skb)->opt.optlen == 0) + return; + + if (ip_options_echo(opt, skb)) { + msg->msg_flags |= MSG_CTRUNC; + return; + } + ip_options_undo(opt); + + put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); +} + + +void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) +{ + struct inet_sock *inet = inet_sk(skb->sk); + unsigned flags = inet->cmsg_flags; + + /* Ordered by supposed usage frequency */ + if (flags & 1) + ip_cmsg_recv_pktinfo(msg, skb); + if ((flags>>=1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_ttl(msg, skb); + if ((flags>>=1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_tos(msg, skb); + if ((flags>>=1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_opts(msg, skb); + if ((flags>>=1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_retopts(msg, skb); +} + +int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) +{ + int err; + struct cmsghdr *cmsg; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_IP) + continue; + switch (cmsg->cmsg_type) { + case IP_RETOPTS: + err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0); + if (err) + return err; + break; + case IP_PKTINFO: + { + struct in_pktinfo *info; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) + return -EINVAL; + info = (struct in_pktinfo *)CMSG_DATA(cmsg); + ipc->oif = info->ipi_ifindex; + ipc->addr = info->ipi_spec_dst.s_addr; + break; + } + default: + return -EINVAL; + } + } + return 0; +} + + +/* Special input handler for packets caught by router alert option. + They are selected only by protocol field, and then processed likely + local ones; but only if someone wants them! Otherwise, router + not running rsvpd will kill RSVP. + + It is user level problem, what it will make with them. + I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), + but receiver should be enough clever f.e. to forward mtrace requests, + sent to multicast group to reach destination designated router. + */ +struct ip_ra_chain *ip_ra_chain; +DEFINE_RWLOCK(ip_ra_lock); + +int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) +{ + struct ip_ra_chain *ra, *new_ra, **rap; + + if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW) + return -EINVAL; + + new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + + write_lock_bh(&ip_ra_lock); + for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) { + if (ra->sk == sk) { + if (on) { + write_unlock_bh(&ip_ra_lock); + if (new_ra) + kfree(new_ra); + return -EADDRINUSE; + } + *rap = ra->next; + write_unlock_bh(&ip_ra_lock); + + if (ra->destructor) + ra->destructor(sk); + sock_put(sk); + kfree(ra); + return 0; + } + } + if (new_ra == NULL) { + write_unlock_bh(&ip_ra_lock); + return -ENOBUFS; + } + new_ra->sk = sk; + new_ra->destructor = destructor; + + new_ra->next = ra; + *rap = new_ra; + sock_hold(sk); + write_unlock_bh(&ip_ra_lock); + + return 0; +} + +void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + u16 port, u32 info, u8 *payload) +{ + struct inet_sock *inet = inet_sk(sk); + struct sock_exterr_skb *serr; + + if (!inet->recverr) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_ICMP; + serr->ee.ee_type = skb->h.icmph->type; + serr->ee.ee_code = skb->h.icmph->code; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw; + serr->port = port; + + skb->h.raw = payload; + if (!skb_pull(skb, payload - skb->data) || + sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +void ip_local_error(struct sock *sk, int err, u32 daddr, u16 port, u32 info) +{ + struct inet_sock *inet = inet_sk(sk); + struct sock_exterr_skb *serr; + struct iphdr *iph; + struct sk_buff *skb; + + if (!inet->recverr) + return; + + skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC); + if (!skb) + return; + + iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr)); + skb->nh.iph = iph; + iph->daddr = daddr; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_type = 0; + serr->ee.ee_code = 0; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw; + serr->port = port; + + skb->h.raw = skb->tail; + __skb_pull(skb, skb->tail - skb->data); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +/* + * Handle MSG_ERRQUEUE + */ +int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + struct sockaddr_in *sin; + struct { + struct sock_extended_err ee; + struct sockaddr_in offender; + } errhdr; + int err; + int copied; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + + sin = (struct sockaddr_in *)msg->msg_name; + if (sin) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = *(u32*)(skb->nh.raw + serr->addr_offset); + sin->sin_port = serr->port; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + + memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); + sin = &errhdr.offender; + sin->sin_family = AF_UNSPEC; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { + struct inet_sock *inet = inet_sk(sk); + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = skb->nh.iph->saddr; + sin->sin_port = 0; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + } + + put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr); + + /* Now we could try to dump offended packet options */ + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_irq(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_irq(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else + spin_unlock_irq(&sk->sk_error_queue.lock); + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + + +/* + * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on + * an IP socket. + */ + +int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) +{ + struct inet_sock *inet = inet_sk(sk); + int val=0,err; + + if (level != SOL_IP) + return -ENOPROTOOPT; + + if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) | + (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) | + (1<<IP_RETOPTS) | (1<<IP_TOS) | + (1<<IP_TTL) | (1<<IP_HDRINCL) | + (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | + (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND))) || + optname == IP_MULTICAST_TTL || + optname == IP_MULTICAST_LOOP) { + if (optlen >= sizeof(int)) { + if (get_user(val, (int __user *) optval)) + return -EFAULT; + } else if (optlen >= sizeof(char)) { + unsigned char ucval; + + if (get_user(ucval, (unsigned char __user *) optval)) + return -EFAULT; + val = (int) ucval; + } + } + + /* If optlen==0, it is equivalent to val == 0 */ + +#ifdef CONFIG_IP_MROUTE + if (optname >= MRT_BASE && optname <= (MRT_BASE + 10)) + return ip_mroute_setsockopt(sk,optname,optval,optlen); +#endif + + err = 0; + lock_sock(sk); + + switch (optname) { + case IP_OPTIONS: + { + struct ip_options * opt = NULL; + if (optlen > 40 || optlen < 0) + goto e_inval; + err = ip_options_get(&opt, optval, optlen, 1); + if (err) + break; + if (sk->sk_type == SOCK_STREAM) { + struct tcp_sock *tp = tcp_sk(sk); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (sk->sk_family == PF_INET || + (!((1 << sk->sk_state) & + (TCPF_LISTEN | TCPF_CLOSE)) && + inet->daddr != LOOPBACK4_IPV6)) { +#endif + if (inet->opt) + tp->ext_header_len -= inet->opt->optlen; + if (opt) + tp->ext_header_len += opt->optlen; + tcp_sync_mss(sk, tp->pmtu_cookie); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + } +#endif + } + opt = xchg(&inet->opt, opt); + if (opt) + kfree(opt); + break; + } + case IP_PKTINFO: + if (val) + inet->cmsg_flags |= IP_CMSG_PKTINFO; + else + inet->cmsg_flags &= ~IP_CMSG_PKTINFO; + break; + case IP_RECVTTL: + if (val) + inet->cmsg_flags |= IP_CMSG_TTL; + else + inet->cmsg_flags &= ~IP_CMSG_TTL; + break; + case IP_RECVTOS: + if (val) + inet->cmsg_flags |= IP_CMSG_TOS; + else + inet->cmsg_flags &= ~IP_CMSG_TOS; + break; + case IP_RECVOPTS: + if (val) + inet->cmsg_flags |= IP_CMSG_RECVOPTS; + else + inet->cmsg_flags &= ~IP_CMSG_RECVOPTS; + break; + case IP_RETOPTS: + if (val) + inet->cmsg_flags |= IP_CMSG_RETOPTS; + else + inet->cmsg_flags &= ~IP_CMSG_RETOPTS; + break; + case IP_TOS: /* This sets both TOS and Precedence */ + if (sk->sk_type == SOCK_STREAM) { + val &= ~3; + val |= inet->tos & 3; + } + if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && + !capable(CAP_NET_ADMIN)) { + err = -EPERM; + break; + } + if (inet->tos != val) { + inet->tos = val; + sk->sk_priority = rt_tos2priority(val); + sk_dst_reset(sk); + } + break; + case IP_TTL: + if (optlen<1) + goto e_inval; + if (val != -1 && (val < 1 || val>255)) + goto e_inval; + inet->uc_ttl = val; + break; + case IP_HDRINCL: + if (sk->sk_type != SOCK_RAW) { + err = -ENOPROTOOPT; + break; + } + inet->hdrincl = val ? 1 : 0; + break; + case IP_MTU_DISCOVER: + if (val<0 || val>2) + goto e_inval; + inet->pmtudisc = val; + break; + case IP_RECVERR: + inet->recverr = !!val; + if (!val) + skb_queue_purge(&sk->sk_error_queue); + break; + case IP_MULTICAST_TTL: + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + if (optlen<1) + goto e_inval; + if (val==-1) + val = 1; + if (val < 0 || val > 255) + goto e_inval; + inet->mc_ttl = val; + break; + case IP_MULTICAST_LOOP: + if (optlen<1) + goto e_inval; + inet->mc_loop = !!val; + break; + case IP_MULTICAST_IF: + { + struct ip_mreqn mreq; + struct net_device *dev = NULL; + + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + /* + * Check the arguments are allowable + */ + + err = -EFAULT; + if (optlen >= sizeof(struct ip_mreqn)) { + if (copy_from_user(&mreq,optval,sizeof(mreq))) + break; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (optlen >= sizeof(struct in_addr) && + copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr))) + break; + } + + if (!mreq.imr_ifindex) { + if (mreq.imr_address.s_addr == INADDR_ANY) { + inet->mc_index = 0; + inet->mc_addr = 0; + err = 0; + break; + } + dev = ip_dev_find(mreq.imr_address.s_addr); + if (dev) { + mreq.imr_ifindex = dev->ifindex; + dev_put(dev); + } + } else + dev = __dev_get_by_index(mreq.imr_ifindex); + + + err = -EADDRNOTAVAIL; + if (!dev) + break; + + err = -EINVAL; + if (sk->sk_bound_dev_if && + mreq.imr_ifindex != sk->sk_bound_dev_if) + break; + + inet->mc_index = mreq.imr_ifindex; + inet->mc_addr = mreq.imr_address.s_addr; + err = 0; + break; + } + + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + { + struct ip_mreqn mreq; + + if (optlen < sizeof(struct ip_mreq)) + goto e_inval; + err = -EFAULT; + if (optlen >= sizeof(struct ip_mreqn)) { + if(copy_from_user(&mreq,optval,sizeof(mreq))) + break; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq))) + break; + } + + if (optname == IP_ADD_MEMBERSHIP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + break; + } + case IP_MSFILTER: + { + extern int sysctl_optmem_max; + extern int sysctl_igmp_max_msf; + struct ip_msfilter *msf; + + if (optlen < IP_MSFILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + err = -ENOBUFS; + break; + } + msf = (struct ip_msfilter *)kmalloc(optlen, GFP_KERNEL); + if (msf == 0) { + err = -ENOBUFS; + break; + } + err = -EFAULT; + if (copy_from_user(msf, optval, optlen)) { + kfree(msf); + break; + } + /* numsrc >= (1G-4) overflow in 32 bits */ + if (msf->imsf_numsrc >= 0x3ffffffcU || + msf->imsf_numsrc > sysctl_igmp_max_msf) { + kfree(msf); + err = -ENOBUFS; + break; + } + if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) { + kfree(msf); + err = -EINVAL; + break; + } + err = ip_mc_msfilter(sk, msf, 0); + kfree(msf); + break; + } + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + { + struct ip_mreq_source mreqs; + int omode, add; + + if (optlen != sizeof(struct ip_mreq_source)) + goto e_inval; + if (copy_from_user(&mreqs, optval, sizeof(mreqs))) { + err = -EFAULT; + break; + } + if (optname == IP_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == IP_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) { + struct ip_mreqn mreq; + + mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; + mreq.imr_address.s_addr = mreqs.imr_interface; + mreq.imr_ifindex = 0; + err = ip_mc_join_group(sk, &mreq); + if (err) + break; + omode = MCAST_INCLUDE; + add = 1; + } else /*IP_DROP_SOURCE_MEMBERSHIP */ { + omode = MCAST_INCLUDE; + add = 0; + } + err = ip_mc_source(add, omode, sk, &mreqs, 0); + break; + } + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + { + struct group_req greq; + struct sockaddr_in *psin; + struct ip_mreqn mreq; + + if (optlen < sizeof(struct group_req)) + goto e_inval; + err = -EFAULT; + if(copy_from_user(&greq, optval, sizeof(greq))) + break; + psin = (struct sockaddr_in *)&greq.gr_group; + if (psin->sin_family != AF_INET) + goto e_inval; + memset(&mreq, 0, sizeof(mreq)); + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_ifindex = greq.gr_interface; + + if (optname == MCAST_JOIN_GROUP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + break; + } + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + { + struct group_source_req greqs; + struct ip_mreq_source mreqs; + struct sockaddr_in *psin; + int omode, add; + + if (optlen != sizeof(struct group_source_req)) + goto e_inval; + if (copy_from_user(&greqs, optval, sizeof(greqs))) { + err = -EFAULT; + break; + } + if (greqs.gsr_group.ss_family != AF_INET || + greqs.gsr_source.ss_family != AF_INET) { + err = -EADDRNOTAVAIL; + break; + } + psin = (struct sockaddr_in *)&greqs.gsr_group; + mreqs.imr_multiaddr = psin->sin_addr.s_addr; + psin = (struct sockaddr_in *)&greqs.gsr_source; + mreqs.imr_sourceaddr = psin->sin_addr.s_addr; + mreqs.imr_interface = 0; /* use index for mc_source */ + + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct ip_mreqn mreq; + + psin = (struct sockaddr_in *)&greqs.gsr_group; + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_address.s_addr = 0; + mreq.imr_ifindex = greqs.gsr_interface; + err = ip_mc_join_group(sk, &mreq); + if (err) + break; + greqs.gsr_interface = mreq.imr_ifindex; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + err = ip_mc_source(add, omode, sk, &mreqs, + greqs.gsr_interface); + break; + } + case MCAST_MSFILTER: + { + extern int sysctl_optmem_max; + extern int sysctl_igmp_max_msf; + struct sockaddr_in *psin; + struct ip_msfilter *msf = NULL; + struct group_filter *gsf = NULL; + int msize, i, ifindex; + + if (optlen < GROUP_FILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + err = -ENOBUFS; + break; + } + gsf = (struct group_filter *)kmalloc(optlen,GFP_KERNEL); + if (gsf == 0) { + err = -ENOBUFS; + break; + } + err = -EFAULT; + if (copy_from_user(gsf, optval, optlen)) { + goto mc_msf_out; + } + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (gsf->gf_numsrc >= 0x1ffffff || + gsf->gf_numsrc > sysctl_igmp_max_msf) { + err = -ENOBUFS; + goto mc_msf_out; + } + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { + err = -EINVAL; + goto mc_msf_out; + } + msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); + msf = (struct ip_msfilter *)kmalloc(msize,GFP_KERNEL); + if (msf == 0) { + err = -ENOBUFS; + goto mc_msf_out; + } + ifindex = gsf->gf_interface; + psin = (struct sockaddr_in *)&gsf->gf_group; + if (psin->sin_family != AF_INET) { + err = -EADDRNOTAVAIL; + goto mc_msf_out; + } + msf->imsf_multiaddr = psin->sin_addr.s_addr; + msf->imsf_interface = 0; + msf->imsf_fmode = gsf->gf_fmode; + msf->imsf_numsrc = gsf->gf_numsrc; + err = -EADDRNOTAVAIL; + for (i=0; i<gsf->gf_numsrc; ++i) { + psin = (struct sockaddr_in *)&gsf->gf_slist[i]; + + if (psin->sin_family != AF_INET) + goto mc_msf_out; + msf->imsf_slist[i] = psin->sin_addr.s_addr; + } + kfree(gsf); + gsf = NULL; + + err = ip_mc_msfilter(sk, msf, ifindex); +mc_msf_out: + if (msf) + kfree(msf); + if (gsf) + kfree(gsf); + break; + } + case IP_ROUTER_ALERT: + err = ip_ra_control(sk, val ? 1 : 0, NULL); + break; + + case IP_FREEBIND: + if (optlen<1) + goto e_inval; + inet->freebind = !!val; + break; + + case IP_IPSEC_POLICY: + case IP_XFRM_POLICY: + err = xfrm_user_policy(sk, optname, optval, optlen); + break; + + default: +#ifdef CONFIG_NETFILTER + err = nf_setsockopt(sk, PF_INET, optname, optval, + optlen); +#else + err = -ENOPROTOOPT; +#endif + break; + } + release_sock(sk); + return err; + +e_inval: + release_sock(sk); + return -EINVAL; +} + +/* + * Get the options. Note for future reference. The GET of IP options gets the + * _received_ ones. The set sets the _sent_ ones. + */ + +int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) +{ + struct inet_sock *inet = inet_sk(sk); + int val; + int len; + + if(level!=SOL_IP) + return -EOPNOTSUPP; + +#ifdef CONFIG_IP_MROUTE + if(optname>=MRT_BASE && optname <=MRT_BASE+10) + { + return ip_mroute_getsockopt(sk,optname,optval,optlen); + } +#endif + + if(get_user(len,optlen)) + return -EFAULT; + if(len < 0) + return -EINVAL; + + lock_sock(sk); + + switch(optname) { + case IP_OPTIONS: + { + unsigned char optbuf[sizeof(struct ip_options)+40]; + struct ip_options * opt = (struct ip_options*)optbuf; + opt->optlen = 0; + if (inet->opt) + memcpy(optbuf, inet->opt, + sizeof(struct ip_options)+ + inet->opt->optlen); + release_sock(sk); + + if (opt->optlen == 0) + return put_user(0, optlen); + + ip_options_undo(opt); + + len = min_t(unsigned int, len, opt->optlen); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, opt->__data, len)) + return -EFAULT; + return 0; + } + case IP_PKTINFO: + val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0; + break; + case IP_RECVTTL: + val = (inet->cmsg_flags & IP_CMSG_TTL) != 0; + break; + case IP_RECVTOS: + val = (inet->cmsg_flags & IP_CMSG_TOS) != 0; + break; + case IP_RECVOPTS: + val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0; + break; + case IP_RETOPTS: + val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0; + break; + case IP_TOS: + val = inet->tos; + break; + case IP_TTL: + val = (inet->uc_ttl == -1 ? + sysctl_ip_default_ttl : + inet->uc_ttl); + break; + case IP_HDRINCL: + val = inet->hdrincl; + break; + case IP_MTU_DISCOVER: + val = inet->pmtudisc; + break; + case IP_MTU: + { + struct dst_entry *dst; + val = 0; + dst = sk_dst_get(sk); + if (dst) { + val = dst_mtu(dst); + dst_release(dst); + } + if (!val) { + release_sock(sk); + return -ENOTCONN; + } + break; + } + case IP_RECVERR: + val = inet->recverr; + break; + case IP_MULTICAST_TTL: + val = inet->mc_ttl; + break; + case IP_MULTICAST_LOOP: + val = inet->mc_loop; + break; + case IP_MULTICAST_IF: + { + struct in_addr addr; + len = min_t(unsigned int, len, sizeof(struct in_addr)); + addr.s_addr = inet->mc_addr; + release_sock(sk); + + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, &addr, len)) + return -EFAULT; + return 0; + } + case IP_MSFILTER: + { + struct ip_msfilter msf; + int err; + + if (len < IP_MSFILTER_SIZE(0)) { + release_sock(sk); + return -EINVAL; + } + if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { + release_sock(sk); + return -EFAULT; + } + err = ip_mc_msfget(sk, &msf, + (struct ip_msfilter __user *)optval, optlen); + release_sock(sk); + return err; + } + case MCAST_MSFILTER: + { + struct group_filter gsf; + int err; + + if (len < GROUP_FILTER_SIZE(0)) { + release_sock(sk); + return -EINVAL; + } + if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { + release_sock(sk); + return -EFAULT; + } + err = ip_mc_gsfget(sk, &gsf, + (struct group_filter __user *)optval, optlen); + release_sock(sk); + return err; + } + case IP_PKTOPTIONS: + { + struct msghdr msg; + + release_sock(sk); + + if (sk->sk_type != SOCK_STREAM) + return -ENOPROTOOPT; + + msg.msg_control = optval; + msg.msg_controllen = len; + msg.msg_flags = 0; + + if (inet->cmsg_flags & IP_CMSG_PKTINFO) { + struct in_pktinfo info; + + info.ipi_addr.s_addr = inet->rcv_saddr; + info.ipi_spec_dst.s_addr = inet->rcv_saddr; + info.ipi_ifindex = inet->mc_index; + put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); + } + if (inet->cmsg_flags & IP_CMSG_TTL) { + int hlim = inet->mc_ttl; + put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); + } + len -= msg.msg_controllen; + return put_user(len, optlen); + } + case IP_FREEBIND: + val = inet->freebind; + break; + default: +#ifdef CONFIG_NETFILTER + val = nf_getsockopt(sk, PF_INET, optname, optval, + &len); + release_sock(sk); + if (val >= 0) + val = put_user(len, optlen); + return val; +#else + release_sock(sk); + return -ENOPROTOOPT; +#endif + } + release_sock(sk); + + if (len < sizeof(int) && len > 0 && val>=0 && val<255) { + unsigned char ucval = (unsigned char)val; + len = 1; + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&ucval,1)) + return -EFAULT; + } else { + len = min_t(unsigned int, sizeof(int), len); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&val,len)) + return -EFAULT; + } + return 0; +} + +EXPORT_SYMBOL(ip_cmsg_recv); + +#ifdef CONFIG_IP_SCTP_MODULE +EXPORT_SYMBOL(ip_getsockopt); +EXPORT_SYMBOL(ip_setsockopt); +#endif diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c new file mode 100644 index 000000000000..1a23c5263b99 --- /dev/null +++ b/net/ipv4/ipcomp.c @@ -0,0 +1,524 @@ +/* + * IP Payload Compression Protocol (IPComp) - RFC3173. + * + * Copyright (c) 2003 James Morris <jmorris@intercode.com.au> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Todo: + * - Tunable compression parameters. + * - Compression stats. + * - Adaptive compression. + */ +#include <linux/config.h> +#include <linux/module.h> +#include <asm/scatterlist.h> +#include <asm/semaphore.h> +#include <linux/crypto.h> +#include <linux/pfkeyv2.h> +#include <linux/percpu.h> +#include <linux/smp.h> +#include <linux/list.h> +#include <linux/vmalloc.h> +#include <linux/rtnetlink.h> +#include <net/ip.h> +#include <net/xfrm.h> +#include <net/icmp.h> +#include <net/ipcomp.h> + +struct ipcomp_tfms { + struct list_head list; + struct crypto_tfm **tfms; + int users; +}; + +static DECLARE_MUTEX(ipcomp_resource_sem); +static void **ipcomp_scratches; +static int ipcomp_scratch_users; +static LIST_HEAD(ipcomp_tfms_list); + +static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) +{ + int err, plen, dlen; + struct iphdr *iph; + struct ipcomp_data *ipcd = x->data; + u8 *start, *scratch; + struct crypto_tfm *tfm; + int cpu; + + plen = skb->len; + dlen = IPCOMP_SCRATCH_SIZE; + start = skb->data; + + cpu = get_cpu(); + scratch = *per_cpu_ptr(ipcomp_scratches, cpu); + tfm = *per_cpu_ptr(ipcd->tfms, cpu); + + err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); + if (err) + goto out; + + if (dlen < (plen + sizeof(struct ip_comp_hdr))) { + err = -EINVAL; + goto out; + } + + err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC); + if (err) + goto out; + + skb_put(skb, dlen - plen); + memcpy(skb->data, scratch, dlen); + iph = skb->nh.iph; + iph->tot_len = htons(dlen + iph->ihl * 4); +out: + put_cpu(); + return err; +} + +static int ipcomp_input(struct xfrm_state *x, + struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + u8 nexthdr; + int err = 0; + struct iphdr *iph; + union { + struct iphdr iph; + char buf[60]; + } tmp_iph; + + + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { + err = -ENOMEM; + goto out; + } + + skb->ip_summed = CHECKSUM_NONE; + + /* Remove ipcomp header and decompress original payload */ + iph = skb->nh.iph; + memcpy(&tmp_iph, iph, iph->ihl * 4); + nexthdr = *(u8 *)skb->data; + skb_pull(skb, sizeof(struct ip_comp_hdr)); + skb->nh.raw += sizeof(struct ip_comp_hdr); + memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4); + iph = skb->nh.iph; + iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr)); + iph->protocol = nexthdr; + skb->h.raw = skb->data; + err = ipcomp_decompress(x, skb); + +out: + return err; +} + +static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) +{ + int err, plen, dlen, ihlen; + struct iphdr *iph = skb->nh.iph; + struct ipcomp_data *ipcd = x->data; + u8 *start, *scratch; + struct crypto_tfm *tfm; + int cpu; + + ihlen = iph->ihl * 4; + plen = skb->len - ihlen; + dlen = IPCOMP_SCRATCH_SIZE; + start = skb->data + ihlen; + + cpu = get_cpu(); + scratch = *per_cpu_ptr(ipcomp_scratches, cpu); + tfm = *per_cpu_ptr(ipcd->tfms, cpu); + + err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); + if (err) + goto out; + + if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) { + err = -EMSGSIZE; + goto out; + } + + memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen); + put_cpu(); + + pskb_trim(skb, ihlen + dlen + sizeof(struct ip_comp_hdr)); + return 0; + +out: + put_cpu(); + return err; +} + +static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + struct iphdr *iph; + struct ip_comp_hdr *ipch; + struct ipcomp_data *ipcd = x->data; + int hdr_len = 0; + + iph = skb->nh.iph; + iph->tot_len = htons(skb->len); + hdr_len = iph->ihl * 4; + if ((skb->len - hdr_len) < ipcd->threshold) { + /* Don't bother compressing */ + goto out_ok; + } + + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { + goto out_ok; + } + + err = ipcomp_compress(x, skb); + iph = skb->nh.iph; + + if (err) { + goto out_ok; + } + + /* Install ipcomp header, convert into ipcomp datagram. */ + iph->tot_len = htons(skb->len); + ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4); + ipch->nexthdr = iph->protocol; + ipch->flags = 0; + ipch->cpi = htons((u16 )ntohl(x->id.spi)); + iph->protocol = IPPROTO_COMP; + ip_send_check(iph); + return 0; + +out_ok: + if (x->props.mode) + ip_send_check(iph); + return 0; +} + +static void ipcomp4_err(struct sk_buff *skb, u32 info) +{ + u32 spi; + struct iphdr *iph = (struct iphdr *)skb->data; + struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + struct xfrm_state *x; + + if (skb->h.icmph->type != ICMP_DEST_UNREACH || + skb->h.icmph->code != ICMP_FRAG_NEEDED) + return; + + spi = ntohl(ntohs(ipch->cpi)); + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, + spi, IPPROTO_COMP, AF_INET); + if (!x) + return; + NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", + spi, NIPQUAD(iph->daddr))); + xfrm_state_put(x); +} + +/* We always hold one tunnel user reference to indicate a tunnel */ +static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) +{ + struct xfrm_state *t; + + t = xfrm_state_alloc(); + if (t == NULL) + goto out; + + t->id.proto = IPPROTO_IPIP; + t->id.spi = x->props.saddr.a4; + t->id.daddr.a4 = x->id.daddr.a4; + memcpy(&t->sel, &x->sel, sizeof(t->sel)); + t->props.family = AF_INET; + t->props.mode = 1; + t->props.saddr.a4 = x->props.saddr.a4; + t->props.flags = x->props.flags; + + t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family); + if (t->type == NULL) + goto error; + + if (t->type->init_state(t, NULL)) + goto error; + + t->km.state = XFRM_STATE_VALID; + atomic_set(&t->tunnel_users, 1); +out: + return t; + +error: + t->km.state = XFRM_STATE_DEAD; + xfrm_state_put(t); + t = NULL; + goto out; +} + +/* + * Must be protected by xfrm_cfg_sem. State and tunnel user references are + * always incremented on success. + */ +static int ipcomp_tunnel_attach(struct xfrm_state *x) +{ + int err = 0; + struct xfrm_state *t; + + t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4, + x->props.saddr.a4, IPPROTO_IPIP, AF_INET); + if (!t) { + t = ipcomp_tunnel_create(x); + if (!t) { + err = -EINVAL; + goto out; + } + xfrm_state_insert(t); + xfrm_state_hold(t); + } + x->tunnel = t; + atomic_inc(&t->tunnel_users); +out: + return err; +} + +static void ipcomp_free_scratches(void) +{ + int i; + void **scratches; + + if (--ipcomp_scratch_users) + return; + + scratches = ipcomp_scratches; + if (!scratches) + return; + + for_each_cpu(i) { + void *scratch = *per_cpu_ptr(scratches, i); + if (scratch) + vfree(scratch); + } + + free_percpu(scratches); +} + +static void **ipcomp_alloc_scratches(void) +{ + int i; + void **scratches; + + if (ipcomp_scratch_users++) + return ipcomp_scratches; + + scratches = alloc_percpu(void *); + if (!scratches) + return NULL; + + ipcomp_scratches = scratches; + + for_each_cpu(i) { + void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE); + if (!scratch) + return NULL; + *per_cpu_ptr(scratches, i) = scratch; + } + + return scratches; +} + +static void ipcomp_free_tfms(struct crypto_tfm **tfms) +{ + struct ipcomp_tfms *pos; + int cpu; + + list_for_each_entry(pos, &ipcomp_tfms_list, list) { + if (pos->tfms == tfms) + break; + } + + BUG_TRAP(pos); + + if (--pos->users) + return; + + list_del(&pos->list); + kfree(pos); + + if (!tfms) + return; + + for_each_cpu(cpu) { + struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu); + if (tfm) + crypto_free_tfm(tfm); + } + free_percpu(tfms); +} + +static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name) +{ + struct ipcomp_tfms *pos; + struct crypto_tfm **tfms; + int cpu; + + /* This can be any valid CPU ID so we don't need locking. */ + cpu = smp_processor_id(); + + list_for_each_entry(pos, &ipcomp_tfms_list, list) { + struct crypto_tfm *tfm; + + tfms = pos->tfms; + tfm = *per_cpu_ptr(tfms, cpu); + + if (!strcmp(crypto_tfm_alg_name(tfm), alg_name)) { + pos->users++; + return tfms; + } + } + + pos = kmalloc(sizeof(*pos), GFP_KERNEL); + if (!pos) + return NULL; + + pos->users = 1; + INIT_LIST_HEAD(&pos->list); + list_add(&pos->list, &ipcomp_tfms_list); + + pos->tfms = tfms = alloc_percpu(struct crypto_tfm *); + if (!tfms) + goto error; + + for_each_cpu(cpu) { + struct crypto_tfm *tfm = crypto_alloc_tfm(alg_name, 0); + if (!tfm) + goto error; + *per_cpu_ptr(tfms, cpu) = tfm; + } + + return tfms; + +error: + ipcomp_free_tfms(tfms); + return NULL; +} + +static void ipcomp_free_data(struct ipcomp_data *ipcd) +{ + if (ipcd->tfms) + ipcomp_free_tfms(ipcd->tfms); + ipcomp_free_scratches(); +} + +static void ipcomp_destroy(struct xfrm_state *x) +{ + struct ipcomp_data *ipcd = x->data; + if (!ipcd) + return; + xfrm_state_delete_tunnel(x); + down(&ipcomp_resource_sem); + ipcomp_free_data(ipcd); + up(&ipcomp_resource_sem); + kfree(ipcd); +} + +static int ipcomp_init_state(struct xfrm_state *x, void *args) +{ + int err; + struct ipcomp_data *ipcd; + struct xfrm_algo_desc *calg_desc; + + err = -EINVAL; + if (!x->calg) + goto out; + + if (x->encap) + goto out; + + err = -ENOMEM; + ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL); + if (!ipcd) + goto out; + + memset(ipcd, 0, sizeof(*ipcd)); + x->props.header_len = 0; + if (x->props.mode) + x->props.header_len += sizeof(struct iphdr); + + down(&ipcomp_resource_sem); + if (!ipcomp_alloc_scratches()) + goto error; + + ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name); + if (!ipcd->tfms) + goto error; + up(&ipcomp_resource_sem); + + if (x->props.mode) { + err = ipcomp_tunnel_attach(x); + if (err) + goto error_tunnel; + } + + calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0); + BUG_ON(!calg_desc); + ipcd->threshold = calg_desc->uinfo.comp.threshold; + x->data = ipcd; + err = 0; +out: + return err; + +error_tunnel: + down(&ipcomp_resource_sem); +error: + ipcomp_free_data(ipcd); + up(&ipcomp_resource_sem); + kfree(ipcd); + goto out; +} + +static struct xfrm_type ipcomp_type = { + .description = "IPCOMP4", + .owner = THIS_MODULE, + .proto = IPPROTO_COMP, + .init_state = ipcomp_init_state, + .destructor = ipcomp_destroy, + .input = ipcomp_input, + .output = ipcomp_output +}; + +static struct net_protocol ipcomp4_protocol = { + .handler = xfrm4_rcv, + .err_handler = ipcomp4_err, + .no_policy = 1, +}; + +static int __init ipcomp4_init(void) +{ + if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) { + printk(KERN_INFO "ipcomp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { + printk(KERN_INFO "ipcomp init: can't add protocol\n"); + xfrm_unregister_type(&ipcomp_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit ipcomp4_fini(void) +{ + if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) + printk(KERN_INFO "ip ipcomp close: can't remove protocol\n"); + if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) + printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n"); +} + +module_init(ipcomp4_init); +module_exit(ipcomp4_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173"); +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); + diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c new file mode 100644 index 000000000000..f2509034ce72 --- /dev/null +++ b/net/ipv4/ipconfig.c @@ -0,0 +1,1507 @@ +/* + * $Id: ipconfig.c,v 1.46 2002/02/01 22:01:04 davem Exp $ + * + * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or + * user-supplied information to configure own IP address and routes. + * + * Copyright (C) 1996-1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * + * Derived from network configuration code in fs/nfs/nfsroot.c, + * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me. + * + * BOOTP rewritten to construct and analyse packets itself instead + * of misusing the IP layer. num_bugs_causing_wrong_arp_replies--; + * -- MJ, December 1998 + * + * Fixed ip_auto_config_setup calling at startup in the new "Linker Magic" + * initialization scheme. + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999 + * + * DHCP support added. To users this looks like a whole separate + * protocol, but we know it's just a bag on the side of BOOTP. + * -- Chip Salzenberg <chip@valinux.com>, May 2000 + * + * Ported DHCP support from 2.2.16 to 2.4.0-test4 + * -- Eric Biederman <ebiederman@lnxi.com>, 30 Aug 2000 + * + * Merged changes from 2.2.19 into 2.4.3 + * -- Eric Biederman <ebiederman@lnxi.com>, 22 April Aug 2001 + * + * Multiple Nameservers in /proc/net/pnp + * -- Josef Siemes <jsiemes@web.de>, Aug 2002 + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/utsname.h> +#include <linux/in.h> +#include <linux/if.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/socket.h> +#include <linux/route.h> +#include <linux/udp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/major.h> +#include <linux/root_dev.h> +#include <linux/delay.h> +#include <net/arp.h> +#include <net/ip.h> +#include <net/ipconfig.h> + +#include <asm/uaccess.h> +#include <net/checksum.h> +#include <asm/processor.h> + +/* Define this to allow debugging output */ +#undef IPCONFIG_DEBUG + +#ifdef IPCONFIG_DEBUG +#define DBG(x) printk x +#else +#define DBG(x) do { } while(0) +#endif + +#if defined(CONFIG_IP_PNP_DHCP) +#define IPCONFIG_DHCP +#endif +#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP) +#define IPCONFIG_BOOTP +#endif +#if defined(CONFIG_IP_PNP_RARP) +#define IPCONFIG_RARP +#endif +#if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP) +#define IPCONFIG_DYNAMIC +#endif + +/* Define the friendly delay before and after opening net devices */ +#define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */ +#define CONF_POST_OPEN 1 /* After opening: 1 second */ + +/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ +#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ +#define CONF_SEND_RETRIES 6 /* Send six requests per open */ +#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */ +#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */ +#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */ +#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */ +#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */ +#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers + - '3' from resolv.h */ + + +/* + * Public IP configuration + */ + +/* This is used by platforms which might be able to set the ipconfig + * variables using firmware environment vars. If this is set, it will + * ignore such firmware variables. + */ +int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */ + +static int ic_enable __initdata = 0; /* IP config enabled? */ + +/* Protocol choice */ +int ic_proto_enabled __initdata = 0 +#ifdef IPCONFIG_BOOTP + | IC_BOOTP +#endif +#ifdef CONFIG_IP_PNP_DHCP + | IC_USE_DHCP +#endif +#ifdef IPCONFIG_RARP + | IC_RARP +#endif + ; + +static int ic_host_name_set __initdata = 0; /* Host name set by us? */ + +u32 ic_myaddr = INADDR_NONE; /* My IP address */ +static u32 ic_netmask = INADDR_NONE; /* Netmask for local subnet */ +u32 ic_gateway = INADDR_NONE; /* Gateway IP address */ + +u32 ic_servaddr = INADDR_NONE; /* Boot server IP address */ + +u32 root_server_addr = INADDR_NONE; /* Address of NFS server */ +u8 root_server_path[256] = { 0, }; /* Path to mount as root */ + +/* Persistent data: */ + +static int ic_proto_used; /* Protocol used, if any */ +static u32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */ +static u8 ic_domain[64]; /* DNS (not NIS) domain name */ + +/* + * Private state. + */ + +/* Name of user-selected boot device */ +static char user_dev_name[IFNAMSIZ] __initdata = { 0, }; + +/* Protocols supported by available interfaces */ +static int ic_proto_have_if __initdata = 0; + +#ifdef IPCONFIG_DYNAMIC +static DEFINE_SPINLOCK(ic_recv_lock); +static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */ +#endif +#ifdef IPCONFIG_DHCP +static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */ +#endif + + +/* + * Network devices + */ + +struct ic_device { + struct ic_device *next; + struct net_device *dev; + unsigned short flags; + short able; + u32 xid; +}; + +static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ +static struct net_device *ic_dev __initdata = NULL; /* Selected device */ + +static int __init ic_open_devs(void) +{ + struct ic_device *d, **last; + struct net_device *dev; + unsigned short oflags; + + last = &ic_first_dev; + rtnl_shlock(); + + /* bring loopback device up first */ + if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0) + printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name); + + for (dev = dev_base; dev; dev = dev->next) { + if (dev == &loopback_dev) + continue; + if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : + (!(dev->flags & IFF_LOOPBACK) && + (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && + strncmp(dev->name, "dummy", 5))) { + int able = 0; + if (dev->mtu >= 364) + able |= IC_BOOTP; + else + printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu); + if (!(dev->flags & IFF_NOARP)) + able |= IC_RARP; + able &= ic_proto_enabled; + if (ic_proto_enabled && !able) + continue; + oflags = dev->flags; + if (dev_change_flags(dev, oflags | IFF_UP) < 0) { + printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); + continue; + } + if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { + rtnl_shunlock(); + return -1; + } + d->dev = dev; + *last = d; + last = &d->next; + d->flags = oflags; + d->able = able; + if (able & IC_BOOTP) + get_random_bytes(&d->xid, sizeof(u32)); + else + d->xid = 0; + ic_proto_have_if |= able; + DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n", + dev->name, able, d->xid)); + } + } + rtnl_shunlock(); + + *last = NULL; + + if (!ic_first_dev) { + if (user_dev_name[0]) + printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); + else + printk(KERN_ERR "IP-Config: No network devices available.\n"); + return -1; + } + return 0; +} + +static void __init ic_close_devs(void) +{ + struct ic_device *d, *next; + struct net_device *dev; + + rtnl_shlock(); + next = ic_first_dev; + while ((d = next)) { + next = d->next; + dev = d->dev; + if (dev != ic_dev) { + DBG(("IP-Config: Downing %s\n", dev->name)); + dev_change_flags(dev, d->flags); + } + kfree(d); + } + rtnl_shunlock(); +} + +/* + * Interface to various network functions. + */ + +static inline void +set_sockaddr(struct sockaddr_in *sin, u32 addr, u16 port) +{ + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = addr; + sin->sin_port = port; +} + +static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg) +{ + int res; + + mm_segment_t oldfs = get_fs(); + set_fs(get_ds()); + res = devinet_ioctl(cmd, (struct ifreq __user *) arg); + set_fs(oldfs); + return res; +} + +static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg) +{ + int res; + + mm_segment_t oldfs = get_fs(); + set_fs(get_ds()); + res = ip_rt_ioctl(cmd, (void __user *) arg); + set_fs(oldfs); + return res; +} + +/* + * Set up interface addresses and routes. + */ + +static int __init ic_setup_if(void) +{ + struct ifreq ir; + struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr; + int err; + + memset(&ir, 0, sizeof(ir)); + strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); + set_sockaddr(sin, ic_myaddr, 0); + if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err); + return -1; + } + set_sockaddr(sin, ic_netmask, 0); + if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err); + return -1; + } + set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); + if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err); + return -1; + } + return 0; +} + +static int __init ic_setup_routes(void) +{ + /* No need to setup device routes, only the default route... */ + + if (ic_gateway != INADDR_NONE) { + struct rtentry rm; + int err; + + memset(&rm, 0, sizeof(rm)); + if ((ic_gateway ^ ic_myaddr) & ic_netmask) { + printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n"); + return -1; + } + set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0); + set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0); + set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0); + rm.rt_flags = RTF_UP | RTF_GATEWAY; + if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { + printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err); + return -1; + } + } + + return 0; +} + +/* + * Fill in default values for all missing parameters. + */ + +static int __init ic_defaults(void) +{ + /* + * At this point we have no userspace running so need not + * claim locks on system_utsname + */ + + if (!ic_host_name_set) + sprintf(system_utsname.nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr)); + + if (root_server_addr == INADDR_NONE) + root_server_addr = ic_servaddr; + + if (ic_netmask == INADDR_NONE) { + if (IN_CLASSA(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSA_NET); + else if (IN_CLASSB(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSB_NET); + else if (IN_CLASSC(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSC_NET); + else { + printk(KERN_ERR "IP-Config: Unable to guess netmask for address %u.%u.%u.%u\n", + NIPQUAD(ic_myaddr)); + return -1; + } + printk("IP-Config: Guessing netmask %u.%u.%u.%u\n", NIPQUAD(ic_netmask)); + } + + return 0; +} + +/* + * RARP support. + */ + +#ifdef IPCONFIG_RARP + +static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); + +static struct packet_type rarp_packet_type __initdata = { + .type = __constant_htons(ETH_P_RARP), + .func = ic_rarp_recv, +}; + +static inline void ic_rarp_init(void) +{ + dev_add_pack(&rarp_packet_type); +} + +static inline void ic_rarp_cleanup(void) +{ + dev_remove_pack(&rarp_packet_type); +} + +/* + * Process received RARP packet. + */ +static int __init +ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct arphdr *rarp; + unsigned char *rarp_ptr; + unsigned long sip, tip; + unsigned char *sha, *tha; /* s for "source", t for "target" */ + struct ic_device *d; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + + if (!pskb_may_pull(skb, sizeof(struct arphdr))) + goto drop; + + /* Basic sanity checks can be done without the lock. */ + rarp = (struct arphdr *)skb->h.raw; + + /* If this test doesn't pass, it's not IP, or we should + * ignore it anyway. + */ + if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd)) + goto drop; + + /* If it's not a RARP reply, delete it. */ + if (rarp->ar_op != htons(ARPOP_RREPLY)) + goto drop; + + /* If it's not Ethernet, delete it. */ + if (rarp->ar_pro != htons(ETH_P_IP)) + goto drop; + + if (!pskb_may_pull(skb, + sizeof(struct arphdr) + + (2 * dev->addr_len) + + (2 * 4))) + goto drop; + + /* OK, it is all there and looks valid, process... */ + rarp = (struct arphdr *)skb->h.raw; + rarp_ptr = (unsigned char *) (rarp + 1); + + /* One reply at a time, please. */ + spin_lock(&ic_recv_lock); + + /* If we already have a reply, just drop the packet */ + if (ic_got_reply) + goto drop_unlock; + + /* Find the ic_device that the packet arrived on */ + d = ic_first_dev; + while (d && d->dev != dev) + d = d->next; + if (!d) + goto drop_unlock; /* should never happen */ + + /* Extract variable-width fields */ + sha = rarp_ptr; + rarp_ptr += dev->addr_len; + memcpy(&sip, rarp_ptr, 4); + rarp_ptr += 4; + tha = rarp_ptr; + rarp_ptr += dev->addr_len; + memcpy(&tip, rarp_ptr, 4); + + /* Discard packets which are not meant for us. */ + if (memcmp(tha, dev->dev_addr, dev->addr_len)) + goto drop_unlock; + + /* Discard packets which are not from specified server. */ + if (ic_servaddr != INADDR_NONE && ic_servaddr != sip) + goto drop_unlock; + + /* We have a winner! */ + ic_dev = dev; + if (ic_myaddr == INADDR_NONE) + ic_myaddr = tip; + ic_servaddr = sip; + ic_got_reply = IC_RARP; + +drop_unlock: + /* Show's over. Nothing to see here. */ + spin_unlock(&ic_recv_lock); + +drop: + /* Throw the packet out. */ + kfree_skb(skb); + return 0; +} + + +/* + * Send RARP request packet over a single interface. + */ +static void __init ic_rarp_send_if(struct ic_device *d) +{ + struct net_device *dev = d->dev; + arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL, + dev->dev_addr, dev->dev_addr); +} +#endif + +/* + * DHCP/BOOTP support. + */ + +#ifdef IPCONFIG_BOOTP + +struct bootp_pkt { /* BOOTP packet format */ + struct iphdr iph; /* IP header */ + struct udphdr udph; /* UDP header */ + u8 op; /* 1=request, 2=reply */ + u8 htype; /* HW address type */ + u8 hlen; /* HW address length */ + u8 hops; /* Used only by gateways */ + u32 xid; /* Transaction ID */ + u16 secs; /* Seconds since we started */ + u16 flags; /* Just what it says */ + u32 client_ip; /* Client's IP address if known */ + u32 your_ip; /* Assigned IP address */ + u32 server_ip; /* (Next, e.g. NFS) Server's IP address */ + u32 relay_ip; /* IP address of BOOTP relay */ + u8 hw_addr[16]; /* Client's HW address */ + u8 serv_name[64]; /* Server host name */ + u8 boot_file[128]; /* Name of boot file */ + u8 exten[312]; /* DHCP options / BOOTP vendor extensions */ +}; + +/* packet ops */ +#define BOOTP_REQUEST 1 +#define BOOTP_REPLY 2 + +/* DHCP message types */ +#define DHCPDISCOVER 1 +#define DHCPOFFER 2 +#define DHCPREQUEST 3 +#define DHCPDECLINE 4 +#define DHCPACK 5 +#define DHCPNAK 6 +#define DHCPRELEASE 7 +#define DHCPINFORM 8 + +static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); + +static struct packet_type bootp_packet_type __initdata = { + .type = __constant_htons(ETH_P_IP), + .func = ic_bootp_recv, +}; + + +/* + * Initialize DHCP/BOOTP extension fields in the request. + */ + +static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 }; + +#ifdef IPCONFIG_DHCP + +static void __init +ic_dhcp_init_options(u8 *options) +{ + u8 mt = ((ic_servaddr == INADDR_NONE) + ? DHCPDISCOVER : DHCPREQUEST); + u8 *e = options; + +#ifdef IPCONFIG_DEBUG + printk("DHCP: Sending message type %d\n", mt); +#endif + + memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */ + e += 4; + + *e++ = 53; /* DHCP message type */ + *e++ = 1; + *e++ = mt; + + if (mt == DHCPREQUEST) { + *e++ = 54; /* Server ID (IP address) */ + *e++ = 4; + memcpy(e, &ic_servaddr, 4); + e += 4; + + *e++ = 50; /* Requested IP address */ + *e++ = 4; + memcpy(e, &ic_myaddr, 4); + e += 4; + } + + /* always? */ + { + static const u8 ic_req_params[] = { + 1, /* Subnet mask */ + 3, /* Default gateway */ + 6, /* DNS server */ + 12, /* Host name */ + 15, /* Domain name */ + 17, /* Boot path */ + 40, /* NIS domain name */ + }; + + *e++ = 55; /* Parameter request list */ + *e++ = sizeof(ic_req_params); + memcpy(e, ic_req_params, sizeof(ic_req_params)); + e += sizeof(ic_req_params); + } + + *e++ = 255; /* End of the list */ +} + +#endif /* IPCONFIG_DHCP */ + +static void __init ic_bootp_init_ext(u8 *e) +{ + memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */ + e += 4; + *e++ = 1; /* Subnet mask request */ + *e++ = 4; + e += 4; + *e++ = 3; /* Default gateway request */ + *e++ = 4; + e += 4; + *e++ = 5; /* Name server request */ + *e++ = 8; + e += 8; + *e++ = 12; /* Host name request */ + *e++ = 32; + e += 32; + *e++ = 40; /* NIS Domain name request */ + *e++ = 32; + e += 32; + *e++ = 17; /* Boot path */ + *e++ = 40; + e += 40; + + *e++ = 57; /* set extension buffer size for reply */ + *e++ = 2; + *e++ = 1; /* 128+236+8+20+14, see dhcpd sources */ + *e++ = 150; + + *e++ = 255; /* End of the list */ +} + + +/* + * Initialize the DHCP/BOOTP mechanism. + */ +static inline void ic_bootp_init(void) +{ + int i; + + for (i = 0; i < CONF_NAMESERVERS_MAX; i++) + ic_nameservers[i] = INADDR_NONE; + + dev_add_pack(&bootp_packet_type); +} + + +/* + * DHCP/BOOTP cleanup. + */ +static inline void ic_bootp_cleanup(void) +{ + dev_remove_pack(&bootp_packet_type); +} + + +/* + * Send DHCP/BOOTP request to single interface. + */ +static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff) +{ + struct net_device *dev = d->dev; + struct sk_buff *skb; + struct bootp_pkt *b; + int hh_len = LL_RESERVED_SPACE(dev); + struct iphdr *h; + + /* Allocate packet */ + skb = alloc_skb(sizeof(struct bootp_pkt) + hh_len + 15, GFP_KERNEL); + if (!skb) + return; + skb_reserve(skb, hh_len); + b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); + memset(b, 0, sizeof(struct bootp_pkt)); + + /* Construct IP header */ + skb->nh.iph = h = &b->iph; + h->version = 4; + h->ihl = 5; + h->tot_len = htons(sizeof(struct bootp_pkt)); + h->frag_off = htons(IP_DF); + h->ttl = 64; + h->protocol = IPPROTO_UDP; + h->daddr = INADDR_BROADCAST; + h->check = ip_fast_csum((unsigned char *) h, h->ihl); + + /* Construct UDP header */ + b->udph.source = htons(68); + b->udph.dest = htons(67); + b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr)); + /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ + + /* Construct DHCP/BOOTP header */ + b->op = BOOTP_REQUEST; + if (dev->type < 256) /* check for false types */ + b->htype = dev->type; + else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */ + b->htype = ARPHRD_IEEE802; + else if (dev->type == ARPHRD_FDDI) + b->htype = ARPHRD_ETHER; + else { + printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name); + b->htype = dev->type; /* can cause undefined behavior */ + } + b->hlen = dev->addr_len; + b->your_ip = INADDR_NONE; + b->server_ip = INADDR_NONE; + memcpy(b->hw_addr, dev->dev_addr, dev->addr_len); + b->secs = htons(jiffies_diff / HZ); + b->xid = d->xid; + + /* add DHCP options or BOOTP extensions */ +#ifdef IPCONFIG_DHCP + if (ic_proto_enabled & IC_USE_DHCP) + ic_dhcp_init_options(b->exten); + else +#endif + ic_bootp_init_ext(b->exten); + + /* Chain packet down the line... */ + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + if ((dev->hard_header && + dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) || + dev_queue_xmit(skb) < 0) + printk("E"); +} + + +/* + * Copy BOOTP-supplied string if not already set. + */ +static int __init ic_bootp_string(char *dest, char *src, int len, int max) +{ + if (!len) + return 0; + if (len > max-1) + len = max-1; + memcpy(dest, src, len); + dest[len] = '\0'; + return 1; +} + + +/* + * Process BOOTP extensions. + */ +static void __init ic_do_bootp_ext(u8 *ext) +{ + u8 servers; + int i; + +#ifdef IPCONFIG_DEBUG + u8 *c; + + printk("DHCP/BOOTP: Got extension %d:",*ext); + for(c=ext+2; c<ext+2+ext[1]; c++) + printk(" %02x", *c); + printk("\n"); +#endif + + switch (*ext++) { + case 1: /* Subnet mask */ + if (ic_netmask == INADDR_NONE) + memcpy(&ic_netmask, ext+1, 4); + break; + case 3: /* Default gateway */ + if (ic_gateway == INADDR_NONE) + memcpy(&ic_gateway, ext+1, 4); + break; + case 6: /* DNS server */ + servers= *ext/4; + if (servers > CONF_NAMESERVERS_MAX) + servers = CONF_NAMESERVERS_MAX; + for (i = 0; i < servers; i++) { + if (ic_nameservers[i] == INADDR_NONE) + memcpy(&ic_nameservers[i], ext+1+4*i, 4); + } + break; + case 12: /* Host name */ + ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN); + ic_host_name_set = 1; + break; + case 15: /* Domain name (DNS) */ + ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); + break; + case 17: /* Root path */ + if (!root_server_path[0]) + ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path)); + break; + case 40: /* NIS Domain name (_not_ DNS) */ + ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN); + break; + } +} + + +/* + * Receive BOOTP reply. + */ +static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct bootp_pkt *b; + struct iphdr *h; + struct ic_device *d; + int len, ext_len; + + /* Perform verifications before taking the lock. */ + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + + if (!pskb_may_pull(skb, + sizeof(struct iphdr) + + sizeof(struct udphdr))) + goto drop; + + b = (struct bootp_pkt *) skb->nh.iph; + h = &b->iph; + + if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) + goto drop; + + /* Fragments are not supported */ + if (h->frag_off & htons(IP_OFFSET | IP_MF)) { + if (net_ratelimit()) + printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented " + "reply.\n"); + goto drop; + } + + if (skb->len < ntohs(h->tot_len)) + goto drop; + + if (ip_fast_csum((char *) h, h->ihl)) + goto drop; + + if (b->udph.source != htons(67) || b->udph.dest != htons(68)) + goto drop; + + if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) + goto drop; + + len = ntohs(b->udph.len) - sizeof(struct udphdr); + ext_len = len - (sizeof(*b) - + sizeof(struct iphdr) - + sizeof(struct udphdr) - + sizeof(b->exten)); + if (ext_len < 0) + goto drop; + + /* Ok the front looks good, make sure we can get at the rest. */ + if (!pskb_may_pull(skb, skb->len)) + goto drop; + + b = (struct bootp_pkt *) skb->nh.iph; + h = &b->iph; + + /* One reply at a time, please. */ + spin_lock(&ic_recv_lock); + + /* If we already have a reply, just drop the packet */ + if (ic_got_reply) + goto drop_unlock; + + /* Find the ic_device that the packet arrived on */ + d = ic_first_dev; + while (d && d->dev != dev) + d = d->next; + if (!d) + goto drop_unlock; /* should never happen */ + + /* Is it a reply to our BOOTP request? */ + if (b->op != BOOTP_REPLY || + b->xid != d->xid) { + if (net_ratelimit()) + printk(KERN_ERR "DHCP/BOOTP: Reply not for us, " + "op[%x] xid[%x]\n", + b->op, b->xid); + goto drop_unlock; + } + + /* Parse extensions */ + if (ext_len >= 4 && + !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */ + u8 *end = (u8 *) b + ntohs(b->iph.tot_len); + u8 *ext; + +#ifdef IPCONFIG_DHCP + if (ic_proto_enabled & IC_USE_DHCP) { + u32 server_id = INADDR_NONE; + int mt = 0; + + ext = &b->exten[4]; + while (ext < end && *ext != 0xff) { + u8 *opt = ext++; + if (*opt == 0) /* Padding */ + continue; + ext += *ext + 1; + if (ext >= end) + break; + switch (*opt) { + case 53: /* Message type */ + if (opt[1]) + mt = opt[2]; + break; + case 54: /* Server ID (IP address) */ + if (opt[1] >= 4) + memcpy(&server_id, opt + 2, 4); + break; + }; + } + +#ifdef IPCONFIG_DEBUG + printk("DHCP: Got message type %d\n", mt); +#endif + + switch (mt) { + case DHCPOFFER: + /* While in the process of accepting one offer, + * ignore all others. + */ + if (ic_myaddr != INADDR_NONE) + goto drop_unlock; + + /* Let's accept that offer. */ + ic_myaddr = b->your_ip; + ic_servaddr = server_id; +#ifdef IPCONFIG_DEBUG + printk("DHCP: Offered address %u.%u.%u.%u", + NIPQUAD(ic_myaddr)); + printk(" by server %u.%u.%u.%u\n", + NIPQUAD(ic_servaddr)); +#endif + /* The DHCP indicated server address takes + * precedence over the bootp header one if + * they are different. + */ + if ((server_id != INADDR_NONE) && + (b->server_ip != server_id)) + b->server_ip = ic_servaddr; + break; + + case DHCPACK: + if (memcmp(dev->dev_addr, b->hw_addr, dev->addr_len) != 0) + goto drop_unlock; + + /* Yeah! */ + break; + + default: + /* Urque. Forget it*/ + ic_myaddr = INADDR_NONE; + ic_servaddr = INADDR_NONE; + goto drop_unlock; + }; + + ic_dhcp_msgtype = mt; + + } +#endif /* IPCONFIG_DHCP */ + + ext = &b->exten[4]; + while (ext < end && *ext != 0xff) { + u8 *opt = ext++; + if (*opt == 0) /* Padding */ + continue; + ext += *ext + 1; + if (ext < end) + ic_do_bootp_ext(opt); + } + } + + /* We have a winner! */ + ic_dev = dev; + ic_myaddr = b->your_ip; + ic_servaddr = b->server_ip; + if (ic_gateway == INADDR_NONE && b->relay_ip) + ic_gateway = b->relay_ip; + if (ic_nameservers[0] == INADDR_NONE) + ic_nameservers[0] = ic_servaddr; + ic_got_reply = IC_BOOTP; + +drop_unlock: + /* Show's over. Nothing to see here. */ + spin_unlock(&ic_recv_lock); + +drop: + /* Throw the packet out. */ + kfree_skb(skb); + + return 0; +} + + +#endif + + +/* + * Dynamic IP configuration -- DHCP, BOOTP, RARP. + */ + +#ifdef IPCONFIG_DYNAMIC + +static int __init ic_dynamic(void) +{ + int retries; + struct ic_device *d; + unsigned long start_jiffies, timeout, jiff; + int do_bootp = ic_proto_have_if & IC_BOOTP; + int do_rarp = ic_proto_have_if & IC_RARP; + + /* + * If none of DHCP/BOOTP/RARP was selected, return with an error. + * This routine gets only called when some pieces of information + * are missing, and without DHCP/BOOTP/RARP we are unable to get it. + */ + if (!ic_proto_enabled) { + printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); + return -1; + } + +#ifdef IPCONFIG_BOOTP + if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP) + printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n"); +#endif +#ifdef IPCONFIG_RARP + if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP) + printk(KERN_ERR "RARP: No suitable device found.\n"); +#endif + + if (!ic_proto_have_if) + /* Error message already printed */ + return -1; + + /* + * Setup protocols + */ +#ifdef IPCONFIG_BOOTP + if (do_bootp) + ic_bootp_init(); +#endif +#ifdef IPCONFIG_RARP + if (do_rarp) + ic_rarp_init(); +#endif + + /* + * Send requests and wait, until we get an answer. This loop + * seems to be a terrible waste of CPU time, but actually there is + * only one process running at all, so we don't need to use any + * scheduler functions. + * [Actually we could now, but the nothing else running note still + * applies.. - AC] + */ + printk(KERN_NOTICE "Sending %s%s%s requests .", + do_bootp + ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "", + (do_bootp && do_rarp) ? " and " : "", + do_rarp ? "RARP" : ""); + + start_jiffies = jiffies; + d = ic_first_dev; + retries = CONF_SEND_RETRIES; + get_random_bytes(&timeout, sizeof(timeout)); + timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); + for(;;) { +#ifdef IPCONFIG_BOOTP + if (do_bootp && (d->able & IC_BOOTP)) + ic_bootp_send_if(d, jiffies - start_jiffies); +#endif +#ifdef IPCONFIG_RARP + if (do_rarp && (d->able & IC_RARP)) + ic_rarp_send_if(d); +#endif + + jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout); + while (time_before(jiffies, jiff) && !ic_got_reply) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(1); + } +#ifdef IPCONFIG_DHCP + /* DHCP isn't done until we get a DHCPACK. */ + if ((ic_got_reply & IC_BOOTP) + && (ic_proto_enabled & IC_USE_DHCP) + && ic_dhcp_msgtype != DHCPACK) + { + ic_got_reply = 0; + printk(","); + continue; + } +#endif /* IPCONFIG_DHCP */ + + if (ic_got_reply) { + printk(" OK\n"); + break; + } + + if ((d = d->next)) + continue; + + if (! --retries) { + printk(" timed out!\n"); + break; + } + + d = ic_first_dev; + + timeout = timeout CONF_TIMEOUT_MULT; + if (timeout > CONF_TIMEOUT_MAX) + timeout = CONF_TIMEOUT_MAX; + + printk("."); + } + +#ifdef IPCONFIG_BOOTP + if (do_bootp) + ic_bootp_cleanup(); +#endif +#ifdef IPCONFIG_RARP + if (do_rarp) + ic_rarp_cleanup(); +#endif + + if (!ic_got_reply) + return -1; + + printk("IP-Config: Got %s answer from %u.%u.%u.%u, ", + ((ic_got_reply & IC_RARP) ? "RARP" + : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), + NIPQUAD(ic_servaddr)); + printk("my address is %u.%u.%u.%u\n", NIPQUAD(ic_myaddr)); + + return 0; +} + +#endif /* IPCONFIG_DYNAMIC */ + +#ifdef CONFIG_PROC_FS + +static int pnp_seq_show(struct seq_file *seq, void *v) +{ + int i; + + if (ic_proto_used & IC_PROTO) + seq_printf(seq, "#PROTO: %s\n", + (ic_proto_used & IC_RARP) ? "RARP" + : (ic_proto_used & IC_USE_DHCP) ? "DHCP" : "BOOTP"); + else + seq_puts(seq, "#MANUAL\n"); + + if (ic_domain[0]) + seq_printf(seq, + "domain %s\n", ic_domain); + for (i = 0; i < CONF_NAMESERVERS_MAX; i++) { + if (ic_nameservers[i] != INADDR_NONE) + seq_printf(seq, + "nameserver %u.%u.%u.%u\n", + NIPQUAD(ic_nameservers[i])); + } + if (ic_servaddr != INADDR_NONE) + seq_printf(seq, + "bootserver %u.%u.%u.%u\n", + NIPQUAD(ic_servaddr)); + return 0; +} + +static int pnp_seq_open(struct inode *indoe, struct file *file) +{ + return single_open(file, pnp_seq_show, NULL); +} + +static struct file_operations pnp_seq_fops = { + .owner = THIS_MODULE, + .open = pnp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_PROC_FS */ + +/* + * Extract IP address from the parameter string if needed. Note that we + * need to have root_server_addr set _before_ IPConfig gets called as it + * can override it. + */ +u32 __init root_nfs_parse_addr(char *name) +{ + u32 addr; + int octets = 0; + char *cp, *cq; + + cp = cq = name; + while (octets < 4) { + while (*cp >= '0' && *cp <= '9') + cp++; + if (cp == cq || cp - cq > 3) + break; + if (*cp == '.' || octets == 3) + octets++; + if (octets < 4) + cp++; + cq = cp; + } + if (octets == 4 && (*cp == ':' || *cp == '\0')) { + if (*cp == ':') + *cp++ = '\0'; + addr = in_aton(name); + memmove(name, cp, strlen(cp) + 1); + } else + addr = INADDR_NONE; + + return addr; +} + +/* + * IP Autoconfig dispatcher. + */ + +static int __init ip_auto_config(void) +{ + u32 addr; + +#ifdef CONFIG_PROC_FS + proc_net_fops_create("pnp", S_IRUGO, &pnp_seq_fops); +#endif /* CONFIG_PROC_FS */ + + if (!ic_enable) + return 0; + + DBG(("IP-Config: Entered.\n")); +#ifdef IPCONFIG_DYNAMIC + try_try_again: +#endif + /* Give hardware a chance to settle */ + msleep(CONF_PRE_OPEN); + + /* Setup all network devices */ + if (ic_open_devs() < 0) + return -1; + + /* Give drivers a chance to settle */ + ssleep(CONF_POST_OPEN); + + /* + * If the config information is insufficient (e.g., our IP address or + * IP address of the boot server is missing or we have multiple network + * interfaces and no default was set), use BOOTP or RARP to get the + * missing values. + */ + if (ic_myaddr == INADDR_NONE || +#ifdef CONFIG_ROOT_NFS + (MAJOR(ROOT_DEV) == UNNAMED_MAJOR + && root_server_addr == INADDR_NONE + && ic_servaddr == INADDR_NONE) || +#endif + ic_first_dev->next) { +#ifdef IPCONFIG_DYNAMIC + + int retries = CONF_OPEN_RETRIES; + + if (ic_dynamic() < 0) { + ic_close_devs(); + + /* + * I don't know why, but sometimes the + * eepro100 driver (at least) gets upset and + * doesn't work the first time it's opened. + * But then if you close it and reopen it, it + * works just fine. So we need to try that at + * least once before giving up. + * + * Also, if the root will be NFS-mounted, we + * have nowhere to go if DHCP fails. So we + * just have to keep trying forever. + * + * -- Chip + */ +#ifdef CONFIG_ROOT_NFS + if (ROOT_DEV == Root_NFS) { + printk(KERN_ERR + "IP-Config: Retrying forever (NFS root)...\n"); + goto try_try_again; + } +#endif + + if (--retries) { + printk(KERN_ERR + "IP-Config: Reopening network devices...\n"); + goto try_try_again; + } + + /* Oh, well. At least we tried. */ + printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n"); + return -1; + } +#else /* !DYNAMIC */ + printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); + ic_close_devs(); + return -1; +#endif /* IPCONFIG_DYNAMIC */ + } else { + /* Device selected manually or only one device -> use it */ + ic_dev = ic_first_dev->dev; + } + + addr = root_nfs_parse_addr(root_server_path); + if (root_server_addr == INADDR_NONE) + root_server_addr = addr; + + /* + * Use defaults whereever applicable. + */ + if (ic_defaults() < 0) + return -1; + + /* + * Close all network devices except the device we've + * autoconfigured and set up routes. + */ + ic_close_devs(); + if (ic_setup_if() < 0 || ic_setup_routes() < 0) + return -1; + + /* + * Record which protocol was actually used. + */ +#ifdef IPCONFIG_DYNAMIC + ic_proto_used = ic_got_reply | (ic_proto_enabled & IC_USE_DHCP); +#endif + +#ifndef IPCONFIG_SILENT + /* + * Clue in the operator. + */ + printk("IP-Config: Complete:"); + printk("\n device=%s", ic_dev->name); + printk(", addr=%u.%u.%u.%u", NIPQUAD(ic_myaddr)); + printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask)); + printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway)); + printk(",\n host=%s, domain=%s, nis-domain=%s", + system_utsname.nodename, ic_domain, system_utsname.domainname); + printk(",\n bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr)); + printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr)); + printk(", rootpath=%s", root_server_path); + printk("\n"); +#endif /* !SILENT */ + + return 0; +} + +late_initcall(ip_auto_config); + + +/* + * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel + * command line parameter. It consists of option fields separated by colons in + * the following order: + * + * <client-ip>:<server-ip>:<gw-ip>:<netmask>:<host name>:<device>:<PROTO> + * + * Any of the fields can be empty which means to use a default value: + * <client-ip> - address given by BOOTP or RARP + * <server-ip> - address of host returning BOOTP or RARP packet + * <gw-ip> - none, or the address returned by BOOTP + * <netmask> - automatically determined from <client-ip>, or the + * one returned by BOOTP + * <host name> - <client-ip> in ASCII notation, or the name returned + * by BOOTP + * <device> - use all available devices + * <PROTO>: + * off|none - don't do autoconfig at all (DEFAULT) + * on|any - use any configured protocol + * dhcp|bootp|rarp - use only the specified protocol + * both - use both BOOTP and RARP (not DHCP) + */ +static int __init ic_proto_name(char *name) +{ + if (!strcmp(name, "on") || !strcmp(name, "any")) { + return 1; + } +#ifdef CONFIG_IP_PNP_DHCP + else if (!strcmp(name, "dhcp")) { + ic_proto_enabled &= ~IC_RARP; + return 1; + } +#endif +#ifdef CONFIG_IP_PNP_BOOTP + else if (!strcmp(name, "bootp")) { + ic_proto_enabled &= ~(IC_RARP | IC_USE_DHCP); + return 1; + } +#endif +#ifdef CONFIG_IP_PNP_RARP + else if (!strcmp(name, "rarp")) { + ic_proto_enabled &= ~(IC_BOOTP | IC_USE_DHCP); + return 1; + } +#endif +#ifdef IPCONFIG_DYNAMIC + else if (!strcmp(name, "both")) { + ic_proto_enabled &= ~IC_USE_DHCP; /* backward compat :-( */ + return 1; + } +#endif + return 0; +} + +static int __init ip_auto_config_setup(char *addrs) +{ + char *cp, *ip, *dp; + int num = 0; + + ic_set_manually = 1; + + ic_enable = (*addrs && + (strcmp(addrs, "off") != 0) && + (strcmp(addrs, "none") != 0)); + if (!ic_enable) + return 1; + + if (ic_proto_name(addrs)) + return 1; + + /* Parse the whole string */ + ip = addrs; + while (ip && *ip) { + if ((cp = strchr(ip, ':'))) + *cp++ = '\0'; + if (strlen(ip) > 0) { + DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip)); + switch (num) { + case 0: + if ((ic_myaddr = in_aton(ip)) == INADDR_ANY) + ic_myaddr = INADDR_NONE; + break; + case 1: + if ((ic_servaddr = in_aton(ip)) == INADDR_ANY) + ic_servaddr = INADDR_NONE; + break; + case 2: + if ((ic_gateway = in_aton(ip)) == INADDR_ANY) + ic_gateway = INADDR_NONE; + break; + case 3: + if ((ic_netmask = in_aton(ip)) == INADDR_ANY) + ic_netmask = INADDR_NONE; + break; + case 4: + if ((dp = strchr(ip, '.'))) { + *dp++ = '\0'; + strlcpy(system_utsname.domainname, dp, + sizeof(system_utsname.domainname)); + } + strlcpy(system_utsname.nodename, ip, + sizeof(system_utsname.nodename)); + ic_host_name_set = 1; + break; + case 5: + strlcpy(user_dev_name, ip, sizeof(user_dev_name)); + break; + case 6: + ic_proto_name(ip); + break; + } + } + ip = cp; + num++; + } + + return 1; +} + +static int __init nfsaddrs_config_setup(char *addrs) +{ + return ip_auto_config_setup(addrs); +} + +__setup("ip=", ip_auto_config_setup); +__setup("nfsaddrs=", nfsaddrs_config_setup); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c new file mode 100644 index 000000000000..68a78731f722 --- /dev/null +++ b/net/ipv4/ipip.c @@ -0,0 +1,905 @@ +/* + * Linux NET3: IP/IP protocol decoder. + * + * Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $ + * + * Authors: + * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 + * + * Fixes: + * Alan Cox : Merged and made usable non modular (its so tiny its silly as + * a module taking up 2 pages). + * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) + * to keep ip_forward happy. + * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). + * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL + * David Woodhouse : Perform some basic ICMP handling. + * IPIP Routing without decapsulation. + * Carlos Picoto : GRE over IP support + * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. + * I do not want to merge them together. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +/* tunnel.c: an IP tunnel driver + + The purpose of this driver is to provide an IP tunnel through + which you can tunnel network traffic transparently across subnets. + + This was written by looking at Nick Holloway's dummy driver + Thanks for the great code! + + -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 + + Minor tweaks: + Cleaned up the code a little and added some pre-1.3.0 tweaks. + dev->hard_header/hard_header_len changed to use no headers. + Comments/bracketing tweaked. + Made the tunnels use dev->name not tunnel: when error reporting. + Added tx_dropped stat + + -Alan Cox (Alan.Cox@linux.org) 21 March 95 + + Reworked: + Changed to tunnel to destination gateway in addition to the + tunnel's pointopoint address + Almost completely rewritten + Note: There is currently no firewall or ICMP handling done. + + -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 + +*/ + +/* Things I wish I had known when writing the tunnel driver: + + When the tunnel_xmit() function is called, the skb contains the + packet to be sent (plus a great deal of extra info), and dev + contains the tunnel device that _we_ are. + + When we are passed a packet, we are expected to fill in the + source address with our source IP address. + + What is the proper way to allocate, copy and free a buffer? + After you allocate it, it is a "0 length" chunk of memory + starting at zero. If you want to add headers to the buffer + later, you'll have to call "skb_reserve(skb, amount)" with + the amount of memory you want reserved. Then, you call + "skb_put(skb, amount)" with the amount of space you want in + the buffer. skb_put() returns a pointer to the top (#0) of + that buffer. skb->len is set to the amount of space you have + "allocated" with skb_put(). You can then write up to skb->len + bytes to that buffer. If you need more, you can call skb_put() + again with the additional amount of space you need. You can + find out how much more space you can allocate by calling + "skb_tailroom(skb)". + Now, to add header space, call "skb_push(skb, header_len)". + This creates space at the beginning of the buffer and returns + a pointer to this new space. If later you need to strip a + header from a buffer, call "skb_pull(skb, header_len)". + skb_headroom() will return how much space is left at the top + of the buffer (before the main data). Remember, this headroom + space must be reserved before the skb_put() function is called. + */ + +/* + This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c + + For comments look at net/ipv4/ip_gre.c --ANK + */ + + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <asm/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/netfilter_ipv4.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/ipip.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> + +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) + +static int ipip_fb_tunnel_init(struct net_device *dev); +static int ipip_tunnel_init(struct net_device *dev); +static void ipip_tunnel_setup(struct net_device *dev); + +static struct net_device *ipip_fb_tunnel_dev; + +static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_r[HASH_SIZE]; +static struct ip_tunnel *tunnels_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_wc[1]; +static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l }; + +static DEFINE_RWLOCK(ipip_lock); + +static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_r[h0]; t; t = t->next) { + if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) + return t; + } + if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + return t; + return NULL; +} + +static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t) +{ + u32 remote = t->parms.iph.daddr; + u32 local = t->parms.iph.saddr; + unsigned h = 0; + int prio = 0; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + return &tunnels[prio][h]; +} + + +static void ipip_tunnel_unlink(struct ip_tunnel *t) +{ + struct ip_tunnel **tp; + + for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) { + if (t == *tp) { + write_lock_bh(&ipip_lock); + *tp = t->next; + write_unlock_bh(&ipip_lock); + break; + } + } +} + +static void ipip_tunnel_link(struct ip_tunnel *t) +{ + struct ip_tunnel **tp = ipip_bucket(t); + + t->next = *tp; + write_lock_bh(&ipip_lock); + *tp = t; + write_unlock_bh(&ipip_lock); +} + +static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create) +{ + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + struct ip_tunnel *t, **tp, *nt; + struct net_device *dev; + unsigned h = 0; + int prio = 0; + char name[IFNAMSIZ]; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) + return t; + } + if (!create) + return NULL; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else { + int i; + for (i=1; i<100; i++) { + sprintf(name, "tunl%d", i); + if (__dev_get_by_name(name) == NULL) + break; + } + if (i==100) + goto failed; + } + + dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); + if (dev == NULL) + return NULL; + + nt = dev->priv; + SET_MODULE_OWNER(dev); + dev->init = ipip_tunnel_init; + nt->parms = *parms; + + if (register_netdevice(dev) < 0) { + free_netdev(dev); + goto failed; + } + + dev_hold(dev); + ipip_tunnel_link(nt); + /* Do not decrement MOD_USE_COUNT here. */ + return nt; + +failed: + return NULL; +} + +static void ipip_tunnel_uninit(struct net_device *dev) +{ + if (dev == ipip_fb_tunnel_dev) { + write_lock_bh(&ipip_lock); + tunnels_wc[0] = NULL; + write_unlock_bh(&ipip_lock); + } else + ipip_tunnel_unlink((struct ip_tunnel*)dev->priv); + dev_put(dev); +} + +static void ipip_err(struct sk_buff *skb, void *__unused) +{ +#ifndef I_WISH_WORLD_WERE_PERFECT + +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + */ + struct iphdr *iph = (struct iphdr*)skb->data; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + read_lock(&ipip_lock); + t = ipip_tunnel_lookup(iph->daddr, iph->saddr); + if (t == NULL || t->parms.iph.daddr == 0) + goto out; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + goto out; + + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +out: + read_unlock(&ipip_lock); + return; +#else + struct iphdr *iph = (struct iphdr*)dp; + int hlen = iph->ihl<<2; + struct iphdr *eiph; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + struct sk_buff *skb2; + struct flowi fl; + struct rtable *rt; + + if (len < hlen + sizeof(struct iphdr)) + return; + eiph = (struct iphdr*)(dp + hlen); + + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < hlen) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? + */ + rel_type = ICMP_PARAMETERPROB; + rel_info = skb->h.icmph->un.gateway - hlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* And it is the only really necessary thing :-) */ + rel_info = ntohs(skb->h.icmph->un.frag.mtu); + if (rel_info < hlen+68) + return; + rel_info -= hlen; + /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */ + if (rel_info > ntohs(eiph->tot_len)) + return; + break; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. --ANK + */ + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + /* Prepare fake skb to feed it to icmp_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)eiph); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + memset(&fl, 0, sizeof(fl)); + fl.fl4_daddr = eiph->saddr; + fl.fl4_tos = RT_TOS(eiph->tos); + fl.proto = IPPROTO_IPIP; + if (ip_route_output_key(&rt, &key)) { + kfree_skb(skb2); + return; + } + skb2->dev = rt->u.dst.dev; + + /* route "incoming" packet */ + if (rt->rt_flags&RTCF_LOCAL) { + ip_rt_put(rt); + rt = NULL; + fl.fl4_daddr = eiph->daddr; + fl.fl4_src = eiph->saddr; + fl.fl4_tos = eiph->tos; + if (ip_route_output_key(&rt, &fl) || + rt->u.dst.dev->type != ARPHRD_TUNNEL) { + ip_rt_put(rt); + kfree_skb(skb2); + return; + } + } else { + ip_rt_put(rt); + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || + skb2->dst->dev->type != ARPHRD_TUNNEL) { + kfree_skb(skb2); + return; + } + } + + /* change mtu on this route */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + if (rel_info > dst_mtu(skb2->dst)) { + kfree_skb(skb2); + return; + } + skb2->dst->ops->update_pmtu(skb2->dst, rel_info); + rel_info = htonl(rel_info); + } else if (type == ICMP_TIME_EXCEEDED) { + struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; + if (t->parms.iph.ttl) { + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + } + } + + icmp_send(skb2, rel_type, rel_code, rel_info); + kfree_skb(skb2); + return; +#endif +} + +static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff *skb) +{ + struct iphdr *inner_iph = skb->nh.iph; + + if (INET_ECN_is_ce(outer_iph->tos)) + IP_ECN_set_ce(inner_iph); +} + +static int ipip_rcv(struct sk_buff *skb) +{ + struct iphdr *iph; + struct ip_tunnel *tunnel; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + iph = skb->nh.iph; + + read_lock(&ipip_lock); + if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + read_unlock(&ipip_lock); + kfree_skb(skb); + return 0; + } + + secpath_reset(skb); + + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = htons(ETH_P_IP); + skb->pkt_type = PACKET_HOST; + + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + nf_reset(skb); + ipip_ecn_decapsulate(iph, skb); + netif_rx(skb); + read_unlock(&ipip_lock); + return 0; + } + read_unlock(&ipip_lock); + +out: + return -1; +} + +/* + * This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. + */ + +static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *tiph = &tunnel->parms.iph; + u8 tos = tunnel->parms.iph.tos; + u16 df = tiph->frag_off; + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *old_iph = skb->nh.iph; + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + u32 dst = tiph->daddr; + int mtu; + + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } + + if (skb->protocol != htons(ETH_P_IP)) + goto tx_error; + + if (tos&1) + tos = old_iph->tos; + + if (!dst) { + /* NBMA tunnel */ + if ((rt = (struct rtable*)skb->dst) == NULL) { + tunnel->stat.tx_fifo_errors++; + goto tx_error; + } + if ((dst = rt->rt_gateway) == 0) + goto tx_error_icmp; + } + + { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) } }, + .proto = IPPROTO_IPIP }; + if (ip_route_output_key(&rt, &fl)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error_icmp; + } + } + tdev = rt->u.dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + tunnel->stat.collisions++; + goto tx_error; + } + + if (tiph->frag_off) + mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + else + mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; + + if (mtu < 68) { + tunnel->stat.collisions++; + ip_rt_put(rt); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + df |= (old_iph->frag_off&htons(IP_DF)); + + if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } + + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + stats->tx_dropped++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; + } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + dev_kfree_skb(skb); + skb = new_skb; + old_iph = skb->nh.iph; + } + + skb->h.raw = skb->nh.raw; + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) + iph->ttl = old_iph->ttl; + + nf_reset(skb); + + IPTUNNEL_XMIT(); + tunnel->recursion--; + return 0; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + stats->tx_errors++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; +} + +static int +ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == ipip_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipip_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || + p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + + t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (dev != ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { + err = -EINVAL; + break; + } + t = (struct ip_tunnel*)dev->priv; + ipip_tunnel_unlink(t); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipip_tunnel_link(t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == ipip_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipip_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t->dev == ipip_fb_tunnel_dev) + goto done; + dev = t->dev; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + return err; +} + +static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); +} + +static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static void ipip_tunnel_setup(struct net_device *dev) +{ + SET_MODULE_OWNER(dev); + dev->uninit = ipip_tunnel_uninit; + dev->hard_start_xmit = ipip_tunnel_xmit; + dev->get_stats = ipip_tunnel_get_stats; + dev->do_ioctl = ipip_tunnel_ioctl; + dev->change_mtu = ipip_tunnel_change_mtu; + dev->destructor = free_netdev; + + dev->type = ARPHRD_TUNNEL; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = 1500 - sizeof(struct iphdr); + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; +} + +static int ipip_tunnel_init(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + + if (iph->daddr) { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_IPIP }; + struct rtable *rt; + if (!ip_route_output_key(&rt, &fl)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + } + dev->iflink = tunnel->parms.link; + + return 0; +} + +static int __init ipip_fb_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = dev->priv; + struct iphdr *iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + iph->version = 4; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + + dev_hold(dev); + tunnels_wc[0] = tunnel; + return 0; +} + +static struct xfrm_tunnel ipip_handler = { + .handler = ipip_rcv, + .err_handler = ipip_err, +}; + +static char banner[] __initdata = + KERN_INFO "IPv4 over IPv4 tunneling driver\n"; + +static int __init ipip_init(void) +{ + int err; + + printk(banner); + + if (xfrm4_tunnel_register(&ipip_handler) < 0) { + printk(KERN_INFO "ipip init: can't register tunnel\n"); + return -EAGAIN; + } + + ipip_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), + "tunl0", + ipip_tunnel_setup); + if (!ipip_fb_tunnel_dev) { + err = -ENOMEM; + goto err1; + } + + ipip_fb_tunnel_dev->init = ipip_fb_tunnel_init; + + if ((err = register_netdev(ipip_fb_tunnel_dev))) + goto err2; + out: + return err; + err2: + free_netdev(ipip_fb_tunnel_dev); + err1: + xfrm4_tunnel_deregister(&ipip_handler); + goto out; +} + +static void __exit ipip_fini(void) +{ + if (xfrm4_tunnel_deregister(&ipip_handler) < 0) + printk(KERN_INFO "ipip close: can't deregister tunnel\n"); + + unregister_netdev(ipip_fb_tunnel_dev); +} + +module_init(ipip_init); +module_exit(ipip_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c new file mode 100644 index 000000000000..e21c049ec62a --- /dev/null +++ b/net/ipv4/ipmr.c @@ -0,0 +1,1900 @@ +/* + * IP multicast routing support for mrouted 3.6/3.8 + * + * (c) 1995 Alan Cox, <alan@redhat.com> + * Linux Consultancy and Custom Driver Development + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $ + * + * Fixes: + * Michael Chastain : Incorrect size of copying. + * Alan Cox : Added the cache manager code + * Alan Cox : Fixed the clone/copy bug and device race. + * Mike McLagan : Routing by source + * Malcolm Beattie : Buffer handling fixes. + * Alexey Kuznetsov : Double buffer free and other fixes. + * SVR Anand : Fixed several multicast bugs and problems. + * Alexey Kuznetsov : Status, optimisations and more. + * Brad Parker : Better behaviour on mrouted upcall + * overflow. + * Carlos Picoto : PIMv1 Support + * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header + * Relax this requrement to work with older peers. + * + */ + +#include <linux/config.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/raw.h> +#include <linux/notifier.h> +#include <linux/if_arp.h> +#include <linux/netfilter_ipv4.h> +#include <net/ipip.h> +#include <net/checksum.h> + +#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) +#define CONFIG_IP_PIMSM 1 +#endif + +static struct sock *mroute_socket; + + +/* Big lock, protecting vif table, mrt cache and mroute socket state. + Note that the changes are semaphored via rtnl_lock. + */ + +static DEFINE_RWLOCK(mrt_lock); + +/* + * Multicast router control variables + */ + +static struct vif_device vif_table[MAXVIFS]; /* Devices */ +static int maxvif; + +#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL) + +static int mroute_do_assert; /* Set in PIM assert */ +static int mroute_do_pim; + +static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */ + +static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */ +static atomic_t cache_resolve_queue_len; /* Size of unresolved */ + +/* Special spinlock for queue of unresolved entries */ +static DEFINE_SPINLOCK(mfc_unres_lock); + +/* We return to original Alan's scheme. Hash table of resolved + entries is changed only in process context and protected + with weak lock mrt_lock. Queue of unresolved entries is protected + with strong spinlock mfc_unres_lock. + + In this case data path is free of exclusive locks at all. + */ + +static kmem_cache_t *mrt_cachep; + +static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); +static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); +static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); + +#ifdef CONFIG_IP_PIMSM_V2 +static struct net_protocol pim_protocol; +#endif + +static struct timer_list ipmr_expire_timer; + +/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ + +static +struct net_device *ipmr_new_tunnel(struct vifctl *v) +{ + struct net_device *dev; + + dev = __dev_get_by_name("tunl0"); + + if (dev) { + int err; + struct ifreq ifr; + mm_segment_t oldfs; + struct ip_tunnel_parm p; + struct in_device *in_dev; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + sprintf(p.name, "dvmrp%d", v->vifc_vifi); + ifr.ifr_ifru.ifru_data = (void*)&p; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + + dev = NULL; + + if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) { + dev->flags |= IFF_MULTICAST; + + in_dev = __in_dev_get(dev); + if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL) + goto failure; + in_dev->cnf.rp_filter = 0; + + if (dev_open(dev)) + goto failure; + } + } + return dev; + +failure: + /* allow the register to be completed before unregistering. */ + rtnl_unlock(); + rtnl_lock(); + + unregister_netdevice(dev); + return NULL; +} + +#ifdef CONFIG_IP_PIMSM + +static int reg_vif_num = -1; + +static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) +{ + read_lock(&mrt_lock); + ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len; + ((struct net_device_stats*)dev->priv)->tx_packets++; + ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); + read_unlock(&mrt_lock); + kfree_skb(skb); + return 0; +} + +static struct net_device_stats *reg_vif_get_stats(struct net_device *dev) +{ + return (struct net_device_stats*)dev->priv; +} + +static void reg_vif_setup(struct net_device *dev) +{ + dev->type = ARPHRD_PIMREG; + dev->mtu = 1500 - sizeof(struct iphdr) - 8; + dev->flags = IFF_NOARP; + dev->hard_start_xmit = reg_vif_xmit; + dev->get_stats = reg_vif_get_stats; + dev->destructor = free_netdev; +} + +static struct net_device *ipmr_reg_vif(void) +{ + struct net_device *dev; + struct in_device *in_dev; + + dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg", + reg_vif_setup); + + if (dev == NULL) + return NULL; + + if (register_netdevice(dev)) { + free_netdev(dev); + return NULL; + } + dev->iflink = 0; + + if ((in_dev = inetdev_init(dev)) == NULL) + goto failure; + + in_dev->cnf.rp_filter = 0; + + if (dev_open(dev)) + goto failure; + + return dev; + +failure: + /* allow the register to be completed before unregistering. */ + rtnl_unlock(); + rtnl_lock(); + + unregister_netdevice(dev); + return NULL; +} +#endif + +/* + * Delete a VIF entry + */ + +static int vif_delete(int vifi) +{ + struct vif_device *v; + struct net_device *dev; + struct in_device *in_dev; + + if (vifi < 0 || vifi >= maxvif) + return -EADDRNOTAVAIL; + + v = &vif_table[vifi]; + + write_lock_bh(&mrt_lock); + dev = v->dev; + v->dev = NULL; + + if (!dev) { + write_unlock_bh(&mrt_lock); + return -EADDRNOTAVAIL; + } + +#ifdef CONFIG_IP_PIMSM + if (vifi == reg_vif_num) + reg_vif_num = -1; +#endif + + if (vifi+1 == maxvif) { + int tmp; + for (tmp=vifi-1; tmp>=0; tmp--) { + if (VIF_EXISTS(tmp)) + break; + } + maxvif = tmp+1; + } + + write_unlock_bh(&mrt_lock); + + dev_set_allmulti(dev, -1); + + if ((in_dev = __in_dev_get(dev)) != NULL) { + in_dev->cnf.mc_forwarding--; + ip_rt_multicast_event(in_dev); + } + + if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) + unregister_netdevice(dev); + + dev_put(dev); + return 0; +} + +/* Destroy an unresolved cache entry, killing queued skbs + and reporting error to netlink readers. + */ + +static void ipmr_destroy_unres(struct mfc_cache *c) +{ + struct sk_buff *skb; + + atomic_dec(&cache_resolve_queue_len); + + while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) { + if (skb->nh.iph->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT; + netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); + } else + kfree_skb(skb); + } + + kmem_cache_free(mrt_cachep, c); +} + + +/* Single timer process for all the unresolved queue. */ + +static void ipmr_expire_process(unsigned long dummy) +{ + unsigned long now; + unsigned long expires; + struct mfc_cache *c, **cp; + + if (!spin_trylock(&mfc_unres_lock)) { + mod_timer(&ipmr_expire_timer, jiffies+HZ/10); + return; + } + + if (atomic_read(&cache_resolve_queue_len) == 0) + goto out; + + now = jiffies; + expires = 10*HZ; + cp = &mfc_unres_queue; + + while ((c=*cp) != NULL) { + if (time_after(c->mfc_un.unres.expires, now)) { + unsigned long interval = c->mfc_un.unres.expires - now; + if (interval < expires) + expires = interval; + cp = &c->next; + continue; + } + + *cp = c->next; + + ipmr_destroy_unres(c); + } + + if (atomic_read(&cache_resolve_queue_len)) + mod_timer(&ipmr_expire_timer, jiffies + expires); + +out: + spin_unlock(&mfc_unres_lock); +} + +/* Fill oifs list. It is called under write locked mrt_lock. */ + +static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls) +{ + int vifi; + + cache->mfc_un.res.minvif = MAXVIFS; + cache->mfc_un.res.maxvif = 0; + memset(cache->mfc_un.res.ttls, 255, MAXVIFS); + + for (vifi=0; vifi<maxvif; vifi++) { + if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) { + cache->mfc_un.res.ttls[vifi] = ttls[vifi]; + if (cache->mfc_un.res.minvif > vifi) + cache->mfc_un.res.minvif = vifi; + if (cache->mfc_un.res.maxvif <= vifi) + cache->mfc_un.res.maxvif = vifi + 1; + } + } +} + +static int vif_add(struct vifctl *vifc, int mrtsock) +{ + int vifi = vifc->vifc_vifi; + struct vif_device *v = &vif_table[vifi]; + struct net_device *dev; + struct in_device *in_dev; + + /* Is vif busy ? */ + if (VIF_EXISTS(vifi)) + return -EADDRINUSE; + + switch (vifc->vifc_flags) { +#ifdef CONFIG_IP_PIMSM + case VIFF_REGISTER: + /* + * Special Purpose VIF in PIM + * All the packets will be sent to the daemon + */ + if (reg_vif_num >= 0) + return -EADDRINUSE; + dev = ipmr_reg_vif(); + if (!dev) + return -ENOBUFS; + break; +#endif + case VIFF_TUNNEL: + dev = ipmr_new_tunnel(vifc); + if (!dev) + return -ENOBUFS; + break; + case 0: + dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr); + if (!dev) + return -EADDRNOTAVAIL; + __dev_put(dev); + break; + default: + return -EINVAL; + } + + if ((in_dev = __in_dev_get(dev)) == NULL) + return -EADDRNOTAVAIL; + in_dev->cnf.mc_forwarding++; + dev_set_allmulti(dev, +1); + ip_rt_multicast_event(in_dev); + + /* + * Fill in the VIF structures + */ + v->rate_limit=vifc->vifc_rate_limit; + v->local=vifc->vifc_lcl_addr.s_addr; + v->remote=vifc->vifc_rmt_addr.s_addr; + v->flags=vifc->vifc_flags; + if (!mrtsock) + v->flags |= VIFF_STATIC; + v->threshold=vifc->vifc_threshold; + v->bytes_in = 0; + v->bytes_out = 0; + v->pkt_in = 0; + v->pkt_out = 0; + v->link = dev->ifindex; + if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) + v->link = dev->iflink; + + /* And finish update writing critical data */ + write_lock_bh(&mrt_lock); + dev_hold(dev); + v->dev=dev; +#ifdef CONFIG_IP_PIMSM + if (v->flags&VIFF_REGISTER) + reg_vif_num = vifi; +#endif + if (vifi+1 > maxvif) + maxvif = vifi+1; + write_unlock_bh(&mrt_lock); + return 0; +} + +static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp) +{ + int line=MFC_HASH(mcastgrp,origin); + struct mfc_cache *c; + + for (c=mfc_cache_array[line]; c; c = c->next) { + if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp) + break; + } + return c; +} + +/* + * Allocate a multicast cache entry + */ +static struct mfc_cache *ipmr_cache_alloc(void) +{ + struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL); + if(c==NULL) + return NULL; + memset(c, 0, sizeof(*c)); + c->mfc_un.res.minvif = MAXVIFS; + return c; +} + +static struct mfc_cache *ipmr_cache_alloc_unres(void) +{ + struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC); + if(c==NULL) + return NULL; + memset(c, 0, sizeof(*c)); + skb_queue_head_init(&c->mfc_un.unres.unresolved); + c->mfc_un.unres.expires = jiffies + 10*HZ; + return c; +} + +/* + * A cache entry has gone into a resolved state from queued + */ + +static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) +{ + struct sk_buff *skb; + + /* + * Play the pending entries through our router + */ + + while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) { + if (skb->nh.iph->version == 0) { + int err; + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + + if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) { + nlh->nlmsg_len = skb->tail - (u8*)nlh; + } else { + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; + } + err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); + } else + ip_mr_forward(skb, c, 0); + } +} + +/* + * Bounce a cache query up to mrouted. We could use netlink for this but mrouted + * expects the following bizarre scheme. + * + * Called under mrt_lock. + */ + +static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) +{ + struct sk_buff *skb; + int ihl = pkt->nh.iph->ihl<<2; + struct igmphdr *igmp; + struct igmpmsg *msg; + int ret; + +#ifdef CONFIG_IP_PIMSM + if (assert == IGMPMSG_WHOLEPKT) + skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); + else +#endif + skb = alloc_skb(128, GFP_ATOMIC); + + if(!skb) + return -ENOBUFS; + +#ifdef CONFIG_IP_PIMSM + if (assert == IGMPMSG_WHOLEPKT) { + /* Ugly, but we have no choice with this interface. + Duplicate old header, fix ihl, length etc. + And all this only to mangle msg->im_msgtype and + to set msg->im_mbz to "mbz" :-) + */ + msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr)); + skb->nh.raw = skb->h.raw = (u8*)msg; + memcpy(msg, pkt->nh.raw, sizeof(struct iphdr)); + msg->im_msgtype = IGMPMSG_WHOLEPKT; + msg->im_mbz = 0; + msg->im_vif = reg_vif_num; + skb->nh.iph->ihl = sizeof(struct iphdr) >> 2; + skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr)); + } else +#endif + { + + /* + * Copy the IP header + */ + + skb->nh.iph = (struct iphdr *)skb_put(skb, ihl); + memcpy(skb->data,pkt->data,ihl); + skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */ + msg = (struct igmpmsg*)skb->nh.iph; + msg->im_vif = vifi; + skb->dst = dst_clone(pkt->dst); + + /* + * Add our header + */ + + igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr)); + igmp->type = + msg->im_msgtype = assert; + igmp->code = 0; + skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */ + skb->h.raw = skb->nh.raw; + } + + if (mroute_socket == NULL) { + kfree_skb(skb); + return -EINVAL; + } + + /* + * Deliver to mrouted + */ + if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) { + if (net_ratelimit()) + printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); + kfree_skb(skb); + } + + return ret; +} + +/* + * Queue a packet for resolution. It gets locked cache entry! + */ + +static int +ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) +{ + int err; + struct mfc_cache *c; + + spin_lock_bh(&mfc_unres_lock); + for (c=mfc_unres_queue; c; c=c->next) { + if (c->mfc_mcastgrp == skb->nh.iph->daddr && + c->mfc_origin == skb->nh.iph->saddr) + break; + } + + if (c == NULL) { + /* + * Create a new entry if allowable + */ + + if (atomic_read(&cache_resolve_queue_len)>=10 || + (c=ipmr_cache_alloc_unres())==NULL) { + spin_unlock_bh(&mfc_unres_lock); + + kfree_skb(skb); + return -ENOBUFS; + } + + /* + * Fill in the new cache entry + */ + c->mfc_parent=-1; + c->mfc_origin=skb->nh.iph->saddr; + c->mfc_mcastgrp=skb->nh.iph->daddr; + + /* + * Reflect first query at mrouted. + */ + if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) { + /* If the report failed throw the cache entry + out - Brad Parker + */ + spin_unlock_bh(&mfc_unres_lock); + + kmem_cache_free(mrt_cachep, c); + kfree_skb(skb); + return err; + } + + atomic_inc(&cache_resolve_queue_len); + c->next = mfc_unres_queue; + mfc_unres_queue = c; + + mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires); + } + + /* + * See if we can append the packet + */ + if (c->mfc_un.unres.unresolved.qlen>3) { + kfree_skb(skb); + err = -ENOBUFS; + } else { + skb_queue_tail(&c->mfc_un.unres.unresolved,skb); + err = 0; + } + + spin_unlock_bh(&mfc_unres_lock); + return err; +} + +/* + * MFC cache manipulation by user space mroute daemon + */ + +static int ipmr_mfc_delete(struct mfcctl *mfc) +{ + int line; + struct mfc_cache *c, **cp; + + line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); + + for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) { + if (c->mfc_origin == mfc->mfcc_origin.s_addr && + c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { + write_lock_bh(&mrt_lock); + *cp = c->next; + write_unlock_bh(&mrt_lock); + + kmem_cache_free(mrt_cachep, c); + return 0; + } + } + return -ENOENT; +} + +static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) +{ + int line; + struct mfc_cache *uc, *c, **cp; + + line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); + + for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) { + if (c->mfc_origin == mfc->mfcc_origin.s_addr && + c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) + break; + } + + if (c != NULL) { + write_lock_bh(&mrt_lock); + c->mfc_parent = mfc->mfcc_parent; + ipmr_update_threshoulds(c, mfc->mfcc_ttls); + if (!mrtsock) + c->mfc_flags |= MFC_STATIC; + write_unlock_bh(&mrt_lock); + return 0; + } + + if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr)) + return -EINVAL; + + c=ipmr_cache_alloc(); + if (c==NULL) + return -ENOMEM; + + c->mfc_origin=mfc->mfcc_origin.s_addr; + c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr; + c->mfc_parent=mfc->mfcc_parent; + ipmr_update_threshoulds(c, mfc->mfcc_ttls); + if (!mrtsock) + c->mfc_flags |= MFC_STATIC; + + write_lock_bh(&mrt_lock); + c->next = mfc_cache_array[line]; + mfc_cache_array[line] = c; + write_unlock_bh(&mrt_lock); + + /* + * Check to see if we resolved a queued list. If so we + * need to send on the frames and tidy up. + */ + spin_lock_bh(&mfc_unres_lock); + for (cp = &mfc_unres_queue; (uc=*cp) != NULL; + cp = &uc->next) { + if (uc->mfc_origin == c->mfc_origin && + uc->mfc_mcastgrp == c->mfc_mcastgrp) { + *cp = uc->next; + if (atomic_dec_and_test(&cache_resolve_queue_len)) + del_timer(&ipmr_expire_timer); + break; + } + } + spin_unlock_bh(&mfc_unres_lock); + + if (uc) { + ipmr_cache_resolve(uc, c); + kmem_cache_free(mrt_cachep, uc); + } + return 0; +} + +/* + * Close the multicast socket, and clear the vif tables etc + */ + +static void mroute_clean_tables(struct sock *sk) +{ + int i; + + /* + * Shut down all active vif entries + */ + for(i=0; i<maxvif; i++) { + if (!(vif_table[i].flags&VIFF_STATIC)) + vif_delete(i); + } + + /* + * Wipe the cache + */ + for (i=0;i<MFC_LINES;i++) { + struct mfc_cache *c, **cp; + + cp = &mfc_cache_array[i]; + while ((c = *cp) != NULL) { + if (c->mfc_flags&MFC_STATIC) { + cp = &c->next; + continue; + } + write_lock_bh(&mrt_lock); + *cp = c->next; + write_unlock_bh(&mrt_lock); + + kmem_cache_free(mrt_cachep, c); + } + } + + if (atomic_read(&cache_resolve_queue_len) != 0) { + struct mfc_cache *c; + + spin_lock_bh(&mfc_unres_lock); + while (mfc_unres_queue != NULL) { + c = mfc_unres_queue; + mfc_unres_queue = c->next; + spin_unlock_bh(&mfc_unres_lock); + + ipmr_destroy_unres(c); + + spin_lock_bh(&mfc_unres_lock); + } + spin_unlock_bh(&mfc_unres_lock); + } +} + +static void mrtsock_destruct(struct sock *sk) +{ + rtnl_lock(); + if (sk == mroute_socket) { + ipv4_devconf.mc_forwarding--; + + write_lock_bh(&mrt_lock); + mroute_socket=NULL; + write_unlock_bh(&mrt_lock); + + mroute_clean_tables(sk); + } + rtnl_unlock(); +} + +/* + * Socket options and virtual interface manipulation. The whole + * virtual interface system is a complete heap, but unfortunately + * that's how BSD mrouted happens to think. Maybe one day with a proper + * MOSPF/PIM router set up we can clean this up. + */ + +int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen) +{ + int ret; + struct vifctl vif; + struct mfcctl mfc; + + if(optname!=MRT_INIT) + { + if(sk!=mroute_socket && !capable(CAP_NET_ADMIN)) + return -EACCES; + } + + switch(optname) + { + case MRT_INIT: + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->num != IPPROTO_IGMP) + return -EOPNOTSUPP; + if(optlen!=sizeof(int)) + return -ENOPROTOOPT; + + rtnl_lock(); + if (mroute_socket) { + rtnl_unlock(); + return -EADDRINUSE; + } + + ret = ip_ra_control(sk, 1, mrtsock_destruct); + if (ret == 0) { + write_lock_bh(&mrt_lock); + mroute_socket=sk; + write_unlock_bh(&mrt_lock); + + ipv4_devconf.mc_forwarding++; + } + rtnl_unlock(); + return ret; + case MRT_DONE: + if (sk!=mroute_socket) + return -EACCES; + return ip_ra_control(sk, 0, NULL); + case MRT_ADD_VIF: + case MRT_DEL_VIF: + if(optlen!=sizeof(vif)) + return -EINVAL; + if (copy_from_user(&vif,optval,sizeof(vif))) + return -EFAULT; + if(vif.vifc_vifi >= MAXVIFS) + return -ENFILE; + rtnl_lock(); + if (optname==MRT_ADD_VIF) { + ret = vif_add(&vif, sk==mroute_socket); + } else { + ret = vif_delete(vif.vifc_vifi); + } + rtnl_unlock(); + return ret; + + /* + * Manipulate the forwarding caches. These live + * in a sort of kernel/user symbiosis. + */ + case MRT_ADD_MFC: + case MRT_DEL_MFC: + if(optlen!=sizeof(mfc)) + return -EINVAL; + if (copy_from_user(&mfc,optval, sizeof(mfc))) + return -EFAULT; + rtnl_lock(); + if (optname==MRT_DEL_MFC) + ret = ipmr_mfc_delete(&mfc); + else + ret = ipmr_mfc_add(&mfc, sk==mroute_socket); + rtnl_unlock(); + return ret; + /* + * Control PIM assert. + */ + case MRT_ASSERT: + { + int v; + if(get_user(v,(int __user *)optval)) + return -EFAULT; + mroute_do_assert=(v)?1:0; + return 0; + } +#ifdef CONFIG_IP_PIMSM + case MRT_PIM: + { + int v, ret; + if(get_user(v,(int __user *)optval)) + return -EFAULT; + v = (v)?1:0; + rtnl_lock(); + ret = 0; + if (v != mroute_do_pim) { + mroute_do_pim = v; + mroute_do_assert = v; +#ifdef CONFIG_IP_PIMSM_V2 + if (mroute_do_pim) + ret = inet_add_protocol(&pim_protocol, + IPPROTO_PIM); + else + ret = inet_del_protocol(&pim_protocol, + IPPROTO_PIM); + if (ret < 0) + ret = -EAGAIN; +#endif + } + rtnl_unlock(); + return ret; + } +#endif + /* + * Spurious command, or MRT_VERSION which you cannot + * set. + */ + default: + return -ENOPROTOOPT; + } +} + +/* + * Getsock opt support for the multicast routing system. + */ + +int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen) +{ + int olr; + int val; + + if(optname!=MRT_VERSION && +#ifdef CONFIG_IP_PIMSM + optname!=MRT_PIM && +#endif + optname!=MRT_ASSERT) + return -ENOPROTOOPT; + + if (get_user(olr, optlen)) + return -EFAULT; + + olr = min_t(unsigned int, olr, sizeof(int)); + if (olr < 0) + return -EINVAL; + + if(put_user(olr,optlen)) + return -EFAULT; + if(optname==MRT_VERSION) + val=0x0305; +#ifdef CONFIG_IP_PIMSM + else if(optname==MRT_PIM) + val=mroute_do_pim; +#endif + else + val=mroute_do_assert; + if(copy_to_user(optval,&val,olr)) + return -EFAULT; + return 0; +} + +/* + * The IP multicast ioctl support routines. + */ + +int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) +{ + struct sioc_sg_req sr; + struct sioc_vif_req vr; + struct vif_device *vif; + struct mfc_cache *c; + + switch(cmd) + { + case SIOCGETVIFCNT: + if (copy_from_user(&vr,arg,sizeof(vr))) + return -EFAULT; + if(vr.vifi>=maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif=&vif_table[vr.vifi]; + if(VIF_EXISTS(vr.vifi)) { + vr.icount=vif->pkt_in; + vr.ocount=vif->pkt_out; + vr.ibytes=vif->bytes_in; + vr.obytes=vif->bytes_out; + read_unlock(&mrt_lock); + + if (copy_to_user(arg,&vr,sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT: + if (copy_from_user(&sr,arg,sizeof(sr))) + return -EFAULT; + + read_lock(&mrt_lock); + c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; + read_unlock(&mrt_lock); + + if (copy_to_user(arg,&sr,sizeof(sr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} + + +static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct vif_device *v; + int ct; + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + v=&vif_table[0]; + for(ct=0;ct<maxvif;ct++,v++) { + if (v->dev==ptr) + vif_delete(ct); + } + return NOTIFY_DONE; +} + + +static struct notifier_block ip_mr_notifier={ + .notifier_call = ipmr_device_event, +}; + +/* + * Encapsulate a packet by attaching a valid IPIP header to it. + * This avoids tunnel drivers and other mess and gives us the speed so + * important for multicast video. + */ + +static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr) +{ + struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr)); + + iph->version = 4; + iph->tos = skb->nh.iph->tos; + iph->ttl = skb->nh.iph->ttl; + iph->frag_off = 0; + iph->daddr = daddr; + iph->saddr = saddr; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + iph->tot_len = htons(skb->len); + ip_select_ident(iph, skb->dst, NULL); + ip_send_check(iph); + + skb->h.ipiph = skb->nh.iph; + skb->nh.iph = iph; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + nf_reset(skb); +} + +static inline int ipmr_forward_finish(struct sk_buff *skb) +{ + struct ip_options * opt = &(IPCB(skb)->opt); + + IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + + if (unlikely(opt->optlen)) + ip_forward_options(skb); + + return dst_output(skb); +} + +/* + * Processing handlers for ipmr_forward + */ + +static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) +{ + struct iphdr *iph = skb->nh.iph; + struct vif_device *vif = &vif_table[vifi]; + struct net_device *dev; + struct rtable *rt; + int encap = 0; + + if (vif->dev == NULL) + goto out_free; + +#ifdef CONFIG_IP_PIMSM + if (vif->flags & VIFF_REGISTER) { + vif->pkt_out++; + vif->bytes_out+=skb->len; + ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len; + ((struct net_device_stats*)vif->dev->priv)->tx_packets++; + ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); + kfree_skb(skb); + return; + } +#endif + + if (vif->flags&VIFF_TUNNEL) { + struct flowi fl = { .oif = vif->link, + .nl_u = { .ip4_u = + { .daddr = vif->remote, + .saddr = vif->local, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_IPIP }; + if (ip_route_output_key(&rt, &fl)) + goto out_free; + encap = sizeof(struct iphdr); + } else { + struct flowi fl = { .oif = vif->link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_IPIP }; + if (ip_route_output_key(&rt, &fl)) + goto out_free; + } + + dev = rt->u.dst.dev; + + if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) { + /* Do not fragment multicasts. Alas, IPv4 does not + allow to send ICMP, so that packets will disappear + to blackhole. + */ + + IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS); + ip_rt_put(rt); + goto out_free; + } + + encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len; + + if (skb_cow(skb, encap)) { + ip_rt_put(rt); + goto out_free; + } + + vif->pkt_out++; + vif->bytes_out+=skb->len; + + dst_release(skb->dst); + skb->dst = &rt->u.dst; + iph = skb->nh.iph; + ip_decrease_ttl(iph); + + /* FIXME: forward and output firewalls used to be called here. + * What do we do with netfilter? -- RR */ + if (vif->flags & VIFF_TUNNEL) { + ip_encap(skb, vif->local, vif->remote); + /* FIXME: extra output firewall step used to be here. --RR */ + ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++; + ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len; + } + + IPCB(skb)->flags |= IPSKB_FORWARDED; + + /* + * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally + * not only before forwarding, but after forwarding on all output + * interfaces. It is clear, if mrouter runs a multicasting + * program, it should receive packets not depending to what interface + * program is joined. + * If we will not make it, the program will have to join on all + * interfaces. On the other hand, multihoming host (or router, but + * not mrouter) cannot join to more than one interface - it will + * result in receiving multiple packets. + */ + NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev, + ipmr_forward_finish); + return; + +out_free: + kfree_skb(skb); + return; +} + +static int ipmr_find_vif(struct net_device *dev) +{ + int ct; + for (ct=maxvif-1; ct>=0; ct--) { + if (vif_table[ct].dev == dev) + break; + } + return ct; +} + +/* "local" means that we should preserve one skb (for local delivery) */ + +static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local) +{ + int psend = -1; + int vif, ct; + + vif = cache->mfc_parent; + cache->mfc_un.res.pkt++; + cache->mfc_un.res.bytes += skb->len; + + /* + * Wrong interface: drop packet and (maybe) send PIM assert. + */ + if (vif_table[vif].dev != skb->dev) { + int true_vifi; + + if (((struct rtable*)skb->dst)->fl.iif == 0) { + /* It is our own packet, looped back. + Very complicated situation... + + The best workaround until routing daemons will be + fixed is not to redistribute packet, if it was + send through wrong interface. It means, that + multicast applications WILL NOT work for + (S,G), which have default multicast route pointing + to wrong oif. In any case, it is not a good + idea to use multicasting applications on router. + */ + goto dont_forward; + } + + cache->mfc_un.res.wrong_if++; + true_vifi = ipmr_find_vif(skb->dev); + + if (true_vifi >= 0 && mroute_do_assert && + /* pimsm uses asserts, when switching from RPT to SPT, + so that we cannot check that packet arrived on an oif. + It is bad, but otherwise we would need to move pretty + large chunk of pimd to kernel. Ough... --ANK + */ + (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) && + time_after(jiffies, + cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { + cache->mfc_un.res.last_assert = jiffies; + ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF); + } + goto dont_forward; + } + + vif_table[vif].pkt_in++; + vif_table[vif].bytes_in+=skb->len; + + /* + * Forward the frame + */ + for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { + if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) { + if (psend != -1) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ipmr_queue_xmit(skb2, cache, psend); + } + psend=ct; + } + } + if (psend != -1) { + if (local) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ipmr_queue_xmit(skb2, cache, psend); + } else { + ipmr_queue_xmit(skb, cache, psend); + return 0; + } + } + +dont_forward: + if (!local) + kfree_skb(skb); + return 0; +} + + +/* + * Multicast packets for forwarding arrive here + */ + +int ip_mr_input(struct sk_buff *skb) +{ + struct mfc_cache *cache; + int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL; + + /* Packet is looped back after forward, it should not be + forwarded second time, but still can be delivered locally. + */ + if (IPCB(skb)->flags&IPSKB_FORWARDED) + goto dont_forward; + + if (!local) { + if (IPCB(skb)->opt.router_alert) { + if (ip_call_ra_chain(skb)) + return 0; + } else if (skb->nh.iph->protocol == IPPROTO_IGMP){ + /* IGMPv1 (and broken IGMPv2 implementations sort of + Cisco IOS <= 11.2(8)) do not put router alert + option to IGMP packets destined to routable + groups. It is very bad, because it means + that we can forward NO IGMP messages. + */ + read_lock(&mrt_lock); + if (mroute_socket) { + raw_rcv(mroute_socket, skb); + read_unlock(&mrt_lock); + return 0; + } + read_unlock(&mrt_lock); + } + } + + read_lock(&mrt_lock); + cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr); + + /* + * No usable cache entry + */ + if (cache==NULL) { + int vif; + + if (local) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + ip_local_deliver(skb); + if (skb2 == NULL) { + read_unlock(&mrt_lock); + return -ENOBUFS; + } + skb = skb2; + } + + vif = ipmr_find_vif(skb->dev); + if (vif >= 0) { + int err = ipmr_cache_unresolved(vif, skb); + read_unlock(&mrt_lock); + + return err; + } + read_unlock(&mrt_lock); + kfree_skb(skb); + return -ENODEV; + } + + ip_mr_forward(skb, cache, local); + + read_unlock(&mrt_lock); + + if (local) + return ip_local_deliver(skb); + + return 0; + +dont_forward: + if (local) + return ip_local_deliver(skb); + kfree_skb(skb); + return 0; +} + +#ifdef CONFIG_IP_PIMSM_V1 +/* + * Handle IGMP messages of PIMv1 + */ + +int pim_rcv_v1(struct sk_buff * skb) +{ + struct igmphdr *pim; + struct iphdr *encap; + struct net_device *reg_dev = NULL; + + if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) + goto drop; + + pim = (struct igmphdr*)skb->h.raw; + + if (!mroute_do_pim || + skb->len < sizeof(*pim) + sizeof(*encap) || + pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) + goto drop; + + encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr)); + /* + Check that: + a. packet is really destinted to a multicast group + b. packet is not a NULL-REGISTER + c. packet is not truncated + */ + if (!MULTICAST(encap->daddr) || + encap->tot_len == 0 || + ntohs(encap->tot_len) + sizeof(*pim) > skb->len) + goto drop; + + read_lock(&mrt_lock); + if (reg_vif_num >= 0) + reg_dev = vif_table[reg_vif_num].dev; + if (reg_dev) + dev_hold(reg_dev); + read_unlock(&mrt_lock); + + if (reg_dev == NULL) + goto drop; + + skb->mac.raw = skb->nh.raw; + skb_pull(skb, (u8*)encap - skb->data); + skb->nh.iph = (struct iphdr *)skb->data; + skb->dev = reg_dev; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = htons(ETH_P_IP); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + dst_release(skb->dst); + skb->dst = NULL; + ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; + ((struct net_device_stats*)reg_dev->priv)->rx_packets++; + nf_reset(skb); + netif_rx(skb); + dev_put(reg_dev); + return 0; + drop: + kfree_skb(skb); + return 0; +} +#endif + +#ifdef CONFIG_IP_PIMSM_V2 +static int pim_rcv(struct sk_buff * skb) +{ + struct pimreghdr *pim; + struct iphdr *encap; + struct net_device *reg_dev = NULL; + + if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) + goto drop; + + pim = (struct pimreghdr*)skb->h.raw; + if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || + (pim->flags&PIM_NULL_REGISTER) || + (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && + (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))) + goto drop; + + /* check if the inner packet is destined to mcast group */ + encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr)); + if (!MULTICAST(encap->daddr) || + encap->tot_len == 0 || + ntohs(encap->tot_len) + sizeof(*pim) > skb->len) + goto drop; + + read_lock(&mrt_lock); + if (reg_vif_num >= 0) + reg_dev = vif_table[reg_vif_num].dev; + if (reg_dev) + dev_hold(reg_dev); + read_unlock(&mrt_lock); + + if (reg_dev == NULL) + goto drop; + + skb->mac.raw = skb->nh.raw; + skb_pull(skb, (u8*)encap - skb->data); + skb->nh.iph = (struct iphdr *)skb->data; + skb->dev = reg_dev; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = htons(ETH_P_IP); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + dst_release(skb->dst); + ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; + ((struct net_device_stats*)reg_dev->priv)->rx_packets++; + skb->dst = NULL; + nf_reset(skb); + netif_rx(skb); + dev_put(reg_dev); + return 0; + drop: + kfree_skb(skb); + return 0; +} +#endif + +static int +ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) +{ + int ct; + struct rtnexthop *nhp; + struct net_device *dev = vif_table[c->mfc_parent].dev; + u8 *b = skb->tail; + struct rtattr *mp_head; + + if (dev) + RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); + + mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0)); + + for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { + if (c->mfc_un.res.ttls[ct] < 255) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = 0; + nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; + nhp->rtnh_ifindex = vif_table[ct].dev->ifindex; + nhp->rtnh_len = sizeof(*nhp); + } + } + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb->tail - (u8*)mp_head; + rtm->rtm_type = RTN_MULTICAST; + return 1; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -EMSGSIZE; +} + +int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) +{ + int err; + struct mfc_cache *cache; + struct rtable *rt = (struct rtable*)skb->dst; + + read_lock(&mrt_lock); + cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); + + if (cache==NULL) { + struct net_device *dev; + int vif; + + if (nowait) { + read_unlock(&mrt_lock); + return -EAGAIN; + } + + dev = skb->dev; + if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) { + read_unlock(&mrt_lock); + return -ENODEV; + } + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + skb->nh.iph->ihl = sizeof(struct iphdr)>>2; + skb->nh.iph->saddr = rt->rt_src; + skb->nh.iph->daddr = rt->rt_dst; + skb->nh.iph->version = 0; + err = ipmr_cache_unresolved(vif, skb); + read_unlock(&mrt_lock); + return err; + } + + if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) + cache->mfc_flags |= MFC_NOTIFY; + err = ipmr_fill_mroute(skb, cache, rtm); + read_unlock(&mrt_lock); + return err; +} + +#ifdef CONFIG_PROC_FS +/* + * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif + */ +struct ipmr_vif_iter { + int ct; +}; + +static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter, + loff_t pos) +{ + for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) { + if(!VIF_EXISTS(iter->ct)) + continue; + if (pos-- == 0) + return &vif_table[iter->ct]; + } + return NULL; +} + +static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&mrt_lock); + return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ipmr_vif_iter *iter = seq->private; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ipmr_vif_seq_idx(iter, 0); + + while (++iter->ct < maxvif) { + if(!VIF_EXISTS(iter->ct)) + continue; + return &vif_table[iter->ct]; + } + return NULL; +} + +static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&mrt_lock); +} + +static int ipmr_vif_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); + } else { + const struct vif_device *vif = v; + const char *name = vif->dev ? vif->dev->name : "none"; + + seq_printf(seq, + "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", + vif - vif_table, + name, vif->bytes_in, vif->pkt_in, + vif->bytes_out, vif->pkt_out, + vif->flags, vif->local, vif->remote); + } + return 0; +} + +static struct seq_operations ipmr_vif_seq_ops = { + .start = ipmr_vif_seq_start, + .next = ipmr_vif_seq_next, + .stop = ipmr_vif_seq_stop, + .show = ipmr_vif_seq_show, +}; + +static int ipmr_vif_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &ipmr_vif_seq_ops); + if (rc) + goto out_kfree; + + s->ct = 0; + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; + +} + +static struct file_operations ipmr_vif_fops = { + .owner = THIS_MODULE, + .open = ipmr_vif_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +struct ipmr_mfc_iter { + struct mfc_cache **cache; + int ct; +}; + + +static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos) +{ + struct mfc_cache *mfc; + + it->cache = mfc_cache_array; + read_lock(&mrt_lock); + for (it->ct = 0; it->ct < MFC_LINES; it->ct++) + for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) + if (pos-- == 0) + return mfc; + read_unlock(&mrt_lock); + + it->cache = &mfc_unres_queue; + spin_lock_bh(&mfc_unres_lock); + for(mfc = mfc_unres_queue; mfc; mfc = mfc->next) + if (pos-- == 0) + return mfc; + spin_unlock_bh(&mfc_unres_lock); + + it->cache = NULL; + return NULL; +} + + +static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ipmr_mfc_iter *it = seq->private; + it->cache = NULL; + it->ct = 0; + return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct mfc_cache *mfc = v; + struct ipmr_mfc_iter *it = seq->private; + + ++*pos; + + if (v == SEQ_START_TOKEN) + return ipmr_mfc_seq_idx(seq->private, 0); + + if (mfc->next) + return mfc->next; + + if (it->cache == &mfc_unres_queue) + goto end_of_list; + + BUG_ON(it->cache != mfc_cache_array); + + while (++it->ct < MFC_LINES) { + mfc = mfc_cache_array[it->ct]; + if (mfc) + return mfc; + } + + /* exhausted cache_array, show unresolved */ + read_unlock(&mrt_lock); + it->cache = &mfc_unres_queue; + it->ct = 0; + + spin_lock_bh(&mfc_unres_lock); + mfc = mfc_unres_queue; + if (mfc) + return mfc; + + end_of_list: + spin_unlock_bh(&mfc_unres_lock); + it->cache = NULL; + + return NULL; +} + +static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) +{ + struct ipmr_mfc_iter *it = seq->private; + + if (it->cache == &mfc_unres_queue) + spin_unlock_bh(&mfc_unres_lock); + else if (it->cache == mfc_cache_array) + read_unlock(&mrt_lock); +} + +static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) +{ + int n; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Group Origin Iif Pkts Bytes Wrong Oifs\n"); + } else { + const struct mfc_cache *mfc = v; + const struct ipmr_mfc_iter *it = seq->private; + + seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld", + (unsigned long) mfc->mfc_mcastgrp, + (unsigned long) mfc->mfc_origin, + mfc->mfc_parent, + mfc->mfc_un.res.pkt, + mfc->mfc_un.res.bytes, + mfc->mfc_un.res.wrong_if); + + if (it->cache != &mfc_unres_queue) { + for(n = mfc->mfc_un.res.minvif; + n < mfc->mfc_un.res.maxvif; n++ ) { + if(VIF_EXISTS(n) + && mfc->mfc_un.res.ttls[n] < 255) + seq_printf(seq, + " %2d:%-3d", + n, mfc->mfc_un.res.ttls[n]); + } + } + seq_putc(seq, '\n'); + } + return 0; +} + +static struct seq_operations ipmr_mfc_seq_ops = { + .start = ipmr_mfc_seq_start, + .next = ipmr_mfc_seq_next, + .stop = ipmr_mfc_seq_stop, + .show = ipmr_mfc_seq_show, +}; + +static int ipmr_mfc_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &ipmr_mfc_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; + +} + +static struct file_operations ipmr_mfc_fops = { + .owner = THIS_MODULE, + .open = ipmr_mfc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +#ifdef CONFIG_IP_PIMSM_V2 +static struct net_protocol pim_protocol = { + .handler = pim_rcv, +}; +#endif + + +/* + * Setup for IP multicast routing + */ + +void __init ip_mr_init(void) +{ + mrt_cachep = kmem_cache_create("ip_mrt_cache", + sizeof(struct mfc_cache), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!mrt_cachep) + panic("cannot allocate ip_mrt_cache"); + + init_timer(&ipmr_expire_timer); + ipmr_expire_timer.function=ipmr_expire_process; + register_netdevice_notifier(&ip_mr_notifier); +#ifdef CONFIG_PROC_FS + proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops); + proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops); +#endif +} diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig new file mode 100644 index 000000000000..63a82b4b64bb --- /dev/null +++ b/net/ipv4/ipvs/Kconfig @@ -0,0 +1,244 @@ +# +# IP Virtual Server configuration +# +menu "IP: Virtual Server Configuration" + depends on INET && NETFILTER + +config IP_VS + tristate "IP virtual server support (EXPERIMENTAL)" + depends on INET && NETFILTER + ---help--- + IP Virtual Server support will let you build a high-performance + virtual server based on cluster of two or more real servers. This + option must be enabled for at least one of the clustered computers + that will take care of intercepting incoming connections to a + single IP address and scheduling them to real servers. + + Three request dispatching techniques are implemented, they are + virtual server via NAT, virtual server via tunneling and virtual + server via direct routing. The several scheduling algorithms can + be used to choose which server the connection is directed to, + thus load balancing can be achieved among the servers. For more + information and its administration program, please visit the + following URL: <http://www.linuxvirtualserver.org/>. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_DEBUG + bool "IP virtual server debugging" + depends on IP_VS + ---help--- + Say Y here if you want to get additional messages useful in + debugging the IP virtual server code. You can change the debug + level in /proc/sys/net/ipv4/vs/debug_level + +config IP_VS_TAB_BITS + int "IPVS connection table size (the Nth power of 2)" + depends on IP_VS + default "12" + ---help--- + The IPVS connection hash table uses the chaining scheme to handle + hash collisions. Using a big IPVS connection hash table will greatly + reduce conflicts when there are hundreds of thousands of connections + in the hash table. + + Note the table size must be power of 2. The table size will be the + value of 2 to the your input number power. The number to choose is + from 8 to 20, the default number is 12, which means the table size + is 4096. Don't input the number too small, otherwise you will lose + performance on it. You can adapt the table size yourself, according + to your virtual server application. It is good to set the table size + not far less than the number of connections per second multiplying + average lasting time of connection in the table. For example, your + virtual server gets 200 connections per second, the connection lasts + for 200 seconds in average in the connection table, the table size + should be not far less than 200x200, it is good to set the table + size 32768 (2**15). + + Another note that each connection occupies 128 bytes effectively and + each hash entry uses 8 bytes, so you can estimate how much memory is + needed for your box. + +comment "IPVS transport protocol load balancing support" + depends on IP_VS + +config IP_VS_PROTO_TCP + bool "TCP load balancing support" + depends on IP_VS + ---help--- + This option enables support for load balancing TCP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_UDP + bool "UDP load balancing support" + depends on IP_VS + ---help--- + This option enables support for load balancing UDP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_ESP + bool "ESP load balancing support" + depends on IP_VS + ---help--- + This option enables support for load balancing ESP (Encapsultion + Security Payload) transport protocol. Say Y if unsure. + +config IP_VS_PROTO_AH + bool "AH load balancing support" + depends on IP_VS + ---help--- + This option enables support for load balancing AH (Authentication + Header) transport protocol. Say Y if unsure. + +comment "IPVS scheduler" + depends on IP_VS + +config IP_VS_RR + tristate "round-robin scheduling" + depends on IP_VS + ---help--- + The robin-robin scheduling algorithm simply directs network + connections to different real servers in a round-robin manner. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WRR + tristate "weighted round-robin scheduling" + depends on IP_VS + ---help--- + The weighted robin-robin scheduling algorithm directs network + connections to different real servers based on server weights + in a round-robin manner. Servers with higher weights receive + new connections first than those with less weights, and servers + with higher weights get more connections than those with less + weights and servers with equal weights get equal connections. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LC + tristate "least-connection scheduling" + depends on IP_VS + ---help--- + The least-connection scheduling algorithm directs network + connections to the server with the least number of active + connections. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WLC + tristate "weighted least-connection scheduling" + depends on IP_VS + ---help--- + The weighted least-connection scheduling algorithm directs network + connections to the server with the least active connections + normalized by the server weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLC + tristate "locality-based least-connection scheduling" + depends on IP_VS + ---help--- + The locality-based least-connection scheduling algorithm is for + destination IP load balancing. It is usually used in cache cluster. + This algorithm usually directs packet destined for an IP address to + its server if the server is alive and under load. If the server is + overloaded (its active connection numbers is larger than its weight) + and there is a server in its half load, then allocate the weighted + least-connection server to this IP address. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLCR + tristate "locality-based least-connection with replication scheduling" + depends on IP_VS + ---help--- + The locality-based least-connection with replication scheduling + algorithm is also for destination IP load balancing. It is + usually used in cache cluster. It differs from the LBLC scheduling + as follows: the load balancer maintains mappings from a target + to a set of server nodes that can serve the target. Requests for + a target are assigned to the least-connection node in the target's + server set. If all the node in the server set are over loaded, + it picks up a least-connection node in the cluster and adds it + in the sever set for the target. If the server set has not been + modified for the specified time, the most loaded node is removed + from the server set, in order to avoid high degree of replication. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_DH + tristate "destination hashing scheduling" + depends on IP_VS + ---help--- + The destination hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their destination IP addresses. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SH + tristate "source hashing scheduling" + depends on IP_VS + ---help--- + The source hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their source IP addresses. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SED + tristate "shortest expected delay scheduling" + depends on IP_VS + ---help--- + The shortest expected delay scheduling algorithm assigns network + connections to the server with the shortest expected delay. The + expected delay that the job will experience is (Ci + 1) / Ui if + sent to the ith server, in which Ci is the number of connections + on the the ith server and Ui is the fixed service rate (weight) + of the ith server. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_NQ + tristate "never queue scheduling" + depends on IP_VS + ---help--- + The never queue scheduling algorithm adopts a two-speed model. + When there is an idle server available, the job will be sent to + the idle server, instead of waiting for a fast one. When there + is no idle server available, the job will be sent to the server + that minimize its expected delay (The Shortest Expected Delay + scheduling algorithm). + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +comment 'IPVS application helper' + depends on IP_VS + +config IP_VS_FTP + tristate "FTP protocol helper" + depends on IP_VS && IP_VS_PROTO_TCP + ---help--- + FTP is a protocol that transfers IP address and/or port number in + the payload. In the virtual server via Network Address Translation, + the IP address and port number of real servers cannot be sent to + clients in ftp connections directly, so FTP protocol helper is + required for tracking the connection and mangling it back to that of + virtual service. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +endmenu diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile new file mode 100644 index 000000000000..a788461a40c9 --- /dev/null +++ b/net/ipv4/ipvs/Makefile @@ -0,0 +1,34 @@ +# +# Makefile for the IPVS modules on top of IPv4. +# + +# IPVS transport protocol load balancing support +ip_vs_proto-objs-y := +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o + +ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ + ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ + ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o \ + $(ip_vs_proto-objs-y) + + +# IPVS core +obj-$(CONFIG_IP_VS) += ip_vs.o + +# IPVS schedulers +obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o +obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o +obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o +obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o +obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o +obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o +obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o +obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o +obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o + +# IPVS application helpers +obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c new file mode 100644 index 000000000000..d9212addd193 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -0,0 +1,658 @@ +/* + * ip_vs_app.c: Application module support for IPVS + * + * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference + * is that ip_vs_app module handles the reverse direction (incoming requests + * and outgoing responses). + * + * IP_MASQ_APP application masquerading module + * + * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <net/protocol.h> +#include <asm/system.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +#include <net/ip_vs.h> + +EXPORT_SYMBOL(register_ip_vs_app); +EXPORT_SYMBOL(unregister_ip_vs_app); +EXPORT_SYMBOL(register_ip_vs_app_inc); + +/* ipvs application list head */ +static LIST_HEAD(ip_vs_app_list); +static DECLARE_MUTEX(__ip_vs_app_mutex); + + +/* + * Get an ip_vs_app object + */ +static inline int ip_vs_app_get(struct ip_vs_app *app) +{ + /* test and get the module atomically */ + if (app->module) + return try_module_get(app->module); + else + return 1; +} + + +static inline void ip_vs_app_put(struct ip_vs_app *app) +{ + if (app->module) + module_put(app->module); +} + + +/* + * Allocate/initialize app incarnation and register it in proto apps. + */ +static int +ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) +{ + struct ip_vs_protocol *pp; + struct ip_vs_app *inc; + int ret; + + if (!(pp = ip_vs_proto_get(proto))) + return -EPROTONOSUPPORT; + + if (!pp->unregister_app) + return -EOPNOTSUPP; + + inc = kmalloc(sizeof(struct ip_vs_app), GFP_KERNEL); + if (!inc) + return -ENOMEM; + memcpy(inc, app, sizeof(*inc)); + INIT_LIST_HEAD(&inc->p_list); + INIT_LIST_HEAD(&inc->incs_list); + inc->app = app; + inc->port = htons(port); + atomic_set(&inc->usecnt, 0); + + if (app->timeouts) { + inc->timeout_table = + ip_vs_create_timeout_table(app->timeouts, + app->timeouts_size); + if (!inc->timeout_table) { + ret = -ENOMEM; + goto out; + } + } + + ret = pp->register_app(inc); + if (ret) + goto out; + + list_add(&inc->a_list, &app->incs_list); + IP_VS_DBG(9, "%s application %s:%u registered\n", + pp->name, inc->name, inc->port); + + return 0; + + out: + if (inc->timeout_table) + kfree(inc->timeout_table); + kfree(inc); + return ret; +} + + +/* + * Release app incarnation + */ +static void +ip_vs_app_inc_release(struct ip_vs_app *inc) +{ + struct ip_vs_protocol *pp; + + if (!(pp = ip_vs_proto_get(inc->protocol))) + return; + + if (pp->unregister_app) + pp->unregister_app(inc); + + IP_VS_DBG(9, "%s App %s:%u unregistered\n", + pp->name, inc->name, inc->port); + + list_del(&inc->a_list); + + if (inc->timeout_table != NULL) + kfree(inc->timeout_table); + kfree(inc); +} + + +/* + * Get reference to app inc (only called from softirq) + * + */ +int ip_vs_app_inc_get(struct ip_vs_app *inc) +{ + int result; + + atomic_inc(&inc->usecnt); + if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) + atomic_dec(&inc->usecnt); + return result; +} + + +/* + * Put the app inc (only called from timer or net softirq) + */ +void ip_vs_app_inc_put(struct ip_vs_app *inc) +{ + ip_vs_app_put(inc->app); + atomic_dec(&inc->usecnt); +} + + +/* + * Register an application incarnation in protocol applications + */ +int +register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) +{ + int result; + + down(&__ip_vs_app_mutex); + + result = ip_vs_app_inc_new(app, proto, port); + + up(&__ip_vs_app_mutex); + + return result; +} + + +/* + * ip_vs_app registration routine + */ +int register_ip_vs_app(struct ip_vs_app *app) +{ + /* increase the module use count */ + ip_vs_use_count_inc(); + + down(&__ip_vs_app_mutex); + + list_add(&app->a_list, &ip_vs_app_list); + + up(&__ip_vs_app_mutex); + + return 0; +} + + +/* + * ip_vs_app unregistration routine + * We are sure there are no app incarnations attached to services + */ +void unregister_ip_vs_app(struct ip_vs_app *app) +{ + struct ip_vs_app *inc, *nxt; + + down(&__ip_vs_app_mutex); + + list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { + ip_vs_app_inc_release(inc); + } + + list_del(&app->a_list); + + up(&__ip_vs_app_mutex); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + + +#if 0000 +/* + * Get reference to app by name (called from user context) + */ +struct ip_vs_app *ip_vs_app_get_by_name(char *appname) +{ + struct ip_vs_app *app, *a = NULL; + + down(&__ip_vs_app_mutex); + + list_for_each_entry(ent, &ip_vs_app_list, a_list) { + if (strcmp(app->name, appname)) + continue; + + /* softirq may call ip_vs_app_get too, so the caller + must disable softirq on the current CPU */ + if (ip_vs_app_get(app)) + a = app; + break; + } + + up(&__ip_vs_app_mutex); + + return a; +} +#endif + + +/* + * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) + */ +int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) +{ + return pp->app_conn_bind(cp); +} + + +/* + * Unbind cp from application incarnation (called by cp destructor) + */ +void ip_vs_unbind_app(struct ip_vs_conn *cp) +{ + struct ip_vs_app *inc = cp->app; + + if (!inc) + return; + + if (inc->unbind_conn) + inc->unbind_conn(inc, cp); + if (inc->done_conn) + inc->done_conn(inc, cp); + ip_vs_app_inc_put(inc); + cp->app = NULL; +} + + +/* + * Fixes th->seq based on ip_vs_seq info. + */ +static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. + */ + if (vseq->delta || vseq->previous_delta) { + if(after(seq, vseq->init_seq)) { + th->seq = htonl(seq + vseq->delta); + IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", + vseq->delta); + } else { + th->seq = htonl(seq + vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " + "(%d) to seq\n", vseq->previous_delta); + } + } +} + + +/* + * Fixes th->ack_seq based on ip_vs_seq info. + */ +static inline void +vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 ack_seq = ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + if (vseq->delta || vseq->previous_delta) { + /* since ack_seq is the number of octet that is expected + to receive next, so compare it with init_seq+delta */ + if(after(ack_seq, vseq->init_seq+vseq->delta)) { + th->ack_seq = htonl(ack_seq - vseq->delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " + "(%d) from ack_seq\n", vseq->delta); + + } else { + th->ack_seq = htonl(ack_seq - vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " + "previous_delta (%d) from ack_seq\n", + vseq->previous_delta); + } + } +} + + +/* + * Updates ip_vs_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. + */ +static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, + unsigned flag, __u32 seq, int diff) +{ + /* spinlock is to keep updating cp->flags atomic */ + spin_lock(&cp->lock); + if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { + vseq->previous_delta = vseq->delta; + vseq->delta += diff; + vseq->init_seq = seq; + cp->flags |= flag; + } + spin_unlock(&cp->lock); +} + +static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb, + struct ip_vs_app *app) +{ + int diff; + unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4; + struct tcphdr *th; + __u32 seq; + + if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_seq(&cp->out_seq, th); + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_ack_seq(&cp->in_seq, th); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + if (!app->pkt_out(app, cp, pskb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->out_seq, + IP_VS_CONN_F_OUT_SEQ, seq, diff); + + return 1; +} + +/* + * Output pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL + * returns false if it can't handle packet (oom) + */ +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_out(cp, pskb, app); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + return app->pkt_out(app, cp, pskb, NULL); +} + + +static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb, + struct ip_vs_app *app) +{ + int diff; + unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4; + struct tcphdr *th; + __u32 seq; + + if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_seq(&cp->in_seq, th); + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_ack_seq(&cp->out_seq, th); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + if (!app->pkt_in(app, cp, pskb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->in_seq, + IP_VS_CONN_F_IN_SEQ, seq, diff); + + return 1; +} + +/* + * Input pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL. + * returns false if can't handle packet (oom). + */ +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_in(cp, pskb, app); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + return app->pkt_in(app, cp, pskb, NULL); +} + + +#ifdef CONFIG_PROC_FS +/* + * /proc/net/ip_vs_app entry function + */ + +static struct ip_vs_app *ip_vs_app_idx(loff_t pos) +{ + struct ip_vs_app *app, *inc; + + list_for_each_entry(app, &ip_vs_app_list, a_list) { + list_for_each_entry(inc, &app->incs_list, a_list) { + if (pos-- == 0) + return inc; + } + } + return NULL; + +} + +static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) +{ + down(&__ip_vs_app_mutex); + + return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_app *inc, *app; + struct list_head *e; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_app_idx(0); + + inc = v; + app = inc->app; + + if ((e = inc->a_list.next) != &app->incs_list) + return list_entry(e, struct ip_vs_app, a_list); + + /* go on to next application */ + for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { + app = list_entry(e, struct ip_vs_app, a_list); + list_for_each_entry(inc, &app->incs_list, a_list) { + return inc; + } + } + return NULL; +} + +static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) +{ + up(&__ip_vs_app_mutex); +} + +static int ip_vs_app_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "prot port usecnt name\n"); + else { + const struct ip_vs_app *inc = v; + + seq_printf(seq, "%-3s %-7u %-6d %-17s\n", + ip_vs_proto_name(inc->protocol), + ntohs(inc->port), + atomic_read(&inc->usecnt), + inc->name); + } + return 0; +} + +static struct seq_operations ip_vs_app_seq_ops = { + .start = ip_vs_app_seq_start, + .next = ip_vs_app_seq_next, + .stop = ip_vs_app_seq_stop, + .show = ip_vs_app_seq_show, +}; + +static int ip_vs_app_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_app_seq_ops); +} + +static struct file_operations ip_vs_app_fops = { + .owner = THIS_MODULE, + .open = ip_vs_app_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + + +/* + * Replace a segment of data with a new segment + */ +int ip_vs_skb_replace(struct sk_buff *skb, int pri, + char *o_buf, int o_len, char *n_buf, int n_len) +{ + struct iphdr *iph; + int diff; + int o_offset; + int o_left; + + EnterFunction(9); + + diff = n_len - o_len; + o_offset = o_buf - (char *)skb->data; + /* The length of left data after o_buf+o_len in the skb data */ + o_left = skb->len - (o_offset + o_len); + + if (diff <= 0) { + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + skb_trim(skb, skb->len + diff); + } else if (diff <= skb_tailroom(skb)) { + skb_put(skb, diff); + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + } else { + if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) + return -ENOMEM; + skb_put(skb, diff); + memmove(skb->data + o_offset + n_len, + skb->data + o_offset + o_len, o_left); + memcpy(skb->data + o_offset, n_buf, n_len); + } + + /* must update the iph total length here */ + iph = skb->nh.iph; + iph->tot_len = htons(skb->len); + + LeaveFunction(9); + return 0; +} + + +int ip_vs_app_init(void) +{ + /* we will replace it with proc_net_ipvs_create() soon */ + proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops); + return 0; +} + + +void ip_vs_app_cleanup(void) +{ + proc_net_remove("ip_vs_app"); +} diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c new file mode 100644 index 000000000000..fd6feb5499fe --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -0,0 +1,920 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. Many code here is taken from IP MASQ code of kernel 2.2. + * + * Changes: + * + */ + +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include <linux/proc_fs.h> /* for proc_net_* */ +#include <linux/seq_file.h> +#include <linux/jhash.h> +#include <linux/random.h> + +#include <net/ip_vs.h> + + +/* + * Connection hash table: for input and output packets lookups of IPVS + */ +static struct list_head *ip_vs_conn_tab; + +/* SLAB cache for IPVS connections */ +static kmem_cache_t *ip_vs_conn_cachep; + +/* counter for current IPVS connections */ +static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); + +/* counter for no client port connections */ +static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); + +/* random value for IPVS connection hash */ +static unsigned int ip_vs_conn_rnd; + +/* + * Fine locking granularity for big connection hash table + */ +#define CT_LOCKARRAY_BITS 4 +#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) +#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) + +struct ip_vs_aligned_lock +{ + rwlock_t l; +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +/* lock array for conn table */ +static struct ip_vs_aligned_lock +__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; + +static inline void ct_read_lock(unsigned key) +{ + read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_read_unlock(unsigned key) +{ + read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_lock(unsigned key) +{ + write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_unlock(unsigned key) +{ + write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_read_lock_bh(unsigned key) +{ + read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_read_unlock_bh(unsigned key) +{ + read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_lock_bh(unsigned key) +{ + write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_unlock_bh(unsigned key) +{ + write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + + +/* + * Returns hash value for IPVS connection entry + */ +static unsigned int ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port) +{ + return jhash_3words(addr, port, proto, ip_vs_conn_rnd) + & IP_VS_CONN_TAB_MASK; +} + + +/* + * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. + * returns bool success. + */ +static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + /* Hash by protocol, client address and port */ + hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); + + ct_write_lock(hash); + + if (!(cp->flags & IP_VS_CONN_F_HASHED)) { + list_add(&cp->c_list, &ip_vs_conn_tab[hash]); + cp->flags |= IP_VS_CONN_F_HASHED; + atomic_inc(&cp->refcnt); + ret = 1; + } else { + IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + ret = 0; + } + + ct_write_unlock(hash); + + return ret; +} + + +/* + * UNhashes ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + /* unhash it and decrease its reference counter */ + hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); + + ct_write_lock(hash); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + list_del(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + atomic_dec(&cp->refcnt); + ret = 1; + } else + ret = 0; + + ct_write_unlock(hash); + + return ret; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from OUTside-to-INside. + * s_addr, s_port: pkt source address (foreign host) + * d_addr, d_port: pkt dest address (load balancer) + */ +static inline struct ip_vs_conn *__ip_vs_conn_in_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp; + + hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (s_addr==cp->caddr && s_port==cp->cport && + d_port==cp->vport && d_addr==cp->vaddr && + protocol==cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ct_read_unlock(hash); + return cp; + } + } + + ct_read_unlock(hash); + + return NULL; +} + +struct ip_vs_conn *ip_vs_conn_in_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + struct ip_vs_conn *cp; + + cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) + cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); + + IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + ip_vs_proto_name(protocol), + NIPQUAD(s_addr), ntohs(s_port), + NIPQUAD(d_addr), ntohs(d_port), + cp?"hit":"not hit"); + + return cp; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. + * s_addr, s_port: pkt source address (inside host) + * d_addr, d_port: pkt dest address (foreign host) + */ +struct ip_vs_conn *ip_vs_conn_out_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp, *ret=NULL; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (d_addr == cp->caddr && d_port == cp->cport && + s_port == cp->dport && s_addr == cp->daddr && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ret = cp; + break; + } + } + + ct_read_unlock(hash); + + IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + ip_vs_proto_name(protocol), + NIPQUAD(s_addr), ntohs(s_port), + NIPQUAD(d_addr), ntohs(d_port), + ret?"hit":"not hit"); + + return ret; +} + + +/* + * Put back the conn and restart its timer with its timeout + */ +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + /* reset it expire in its timeout */ + mod_timer(&cp->timer, jiffies+cp->timeout); + + __ip_vs_conn_put(cp); +} + + +/* + * Fill a no_client_port connection with a client port number + */ +void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport) +{ + if (ip_vs_conn_unhash(cp)) { + spin_lock(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) { + atomic_dec(&ip_vs_conn_no_cport_cnt); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; + } + spin_unlock(&cp->lock); + + /* hash on new dport */ + ip_vs_conn_hash(cp); + } +} + + +/* + * Bind a connection entry with the corresponding packet_xmit. + * Called by ip_vs_conn_new. + */ +static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit; + break; + } +} + + +static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->activeconns) + + atomic_read(&dest->inactconns); +} + +/* + * Bind a connection entry with a virtual service destination + * Called just after a new connection entry is created. + */ +static inline void +ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) +{ + /* if dest is NULL, then return directly */ + if (!dest) + return; + + /* Increase the refcnt counter of the dest */ + atomic_inc(&dest->refcnt); + + /* Bind with the destination and its corresponding transmitter */ + cp->flags |= atomic_read(&dest->conn_flags); + cp->dest = dest; + + IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + ip_vs_proto_name(cp->protocol), + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) { + /* It is a normal connection, so increase the inactive + connection counter because it is in TCP SYNRECV + state (inactive) or other protocol inacive state */ + atomic_inc(&dest->inactconns); + } else { + /* It is a persistent connection/template, so increase + the peristent connection counter */ + atomic_inc(&dest->persistconns); + } + + if (dest->u_threshold != 0 && + ip_vs_dest_totalconns(dest) >= dest->u_threshold) + dest->flags |= IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Unbind a connection entry with its VS destination + * Called by the ip_vs_conn_expire function. + */ +static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest = cp->dest; + + if (!dest) + return; + + IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + ip_vs_proto_name(cp->protocol), + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) { + /* It is a normal connection, so decrease the inactconns + or activeconns counter */ + if (cp->flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->inactconns); + } else { + atomic_dec(&dest->activeconns); + } + } else { + /* It is a persistent connection/template, so decrease + the peristent connection counter */ + atomic_dec(&dest->persistconns); + } + + if (dest->l_threshold != 0) { + if (ip_vs_dest_totalconns(dest) < dest->l_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else if (dest->u_threshold != 0) { + if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } + + /* + * Simply decrease the refcnt of the dest, because the + * dest will be either in service's destination list + * or in the trash. + */ + atomic_dec(&dest->refcnt); +} + + +/* + * Checking if the destination of a connection template is available. + * If available, return 1, otherwise invalidate this connection + * template and return 0. + */ +int ip_vs_check_template(struct ip_vs_conn *ct) +{ + struct ip_vs_dest *dest = ct->dest; + + /* + * Checking the dest server status. + */ + if ((dest == NULL) || + !(dest->flags & IP_VS_DEST_F_AVAILABLE) || + (sysctl_ip_vs_expire_quiescent_template && + (atomic_read(&dest->weight) == 0))) { + IP_VS_DBG(9, "check_template: dest not available for " + "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "-> d:%u.%u.%u.%u:%d\n", + ip_vs_proto_name(ct->protocol), + NIPQUAD(ct->caddr), ntohs(ct->cport), + NIPQUAD(ct->vaddr), ntohs(ct->vport), + NIPQUAD(ct->daddr), ntohs(ct->dport)); + + /* + * Invalidate the connection template + */ + if (ct->cport) { + if (ip_vs_conn_unhash(ct)) { + ct->dport = 65535; + ct->vport = 65535; + ct->cport = 0; + ip_vs_conn_hash(ct); + } + } + + /* + * Simply decrease the refcnt of the template, + * don't restart its timer. + */ + atomic_dec(&ct->refcnt); + return 0; + } + return 1; +} + +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + + cp->timeout = 60*HZ; + + /* + * hey, I'm using it + */ + atomic_inc(&cp->refcnt); + + /* + * do I control anybody? + */ + if (atomic_read(&cp->n_control)) + goto expire_later; + + /* + * unhash it if it is hashed in the conn table + */ + if (!ip_vs_conn_unhash(cp)) + goto expire_later; + + /* + * refcnt==1 implies I'm the only one referrer + */ + if (likely(atomic_read(&cp->refcnt) == 1)) { + /* delete the timer if it is activated by other users */ + if (timer_pending(&cp->timer)) + del_timer(&cp->timer); + + /* does anybody control me? */ + if (cp->control) + ip_vs_control_del(cp); + + if (unlikely(cp->app != NULL)) + ip_vs_unbind_app(cp); + ip_vs_unbind_dest(cp); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) + atomic_dec(&ip_vs_conn_no_cport_cnt); + atomic_dec(&ip_vs_conn_count); + + kmem_cache_free(ip_vs_conn_cachep, cp); + return; + } + + /* hash it back to the table */ + ip_vs_conn_hash(cp); + + expire_later: + IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n", + atomic_read(&cp->refcnt)-1, + atomic_read(&cp->n_control)); + + ip_vs_conn_put(cp); +} + + +void ip_vs_conn_expire_now(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) + mod_timer(&cp->timer, jiffies); + __ip_vs_conn_put(cp); +} + + +/* + * Create a new connection entry and hash it into the ip_vs_conn_tab + */ +struct ip_vs_conn * +ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport, + __u32 daddr, __u16 dport, unsigned flags, + struct ip_vs_dest *dest) +{ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); + if (cp == NULL) { + IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); + return NULL; + } + + memset(cp, 0, sizeof(*cp)); + INIT_LIST_HEAD(&cp->c_list); + init_timer(&cp->timer); + cp->timer.data = (unsigned long)cp; + cp->timer.function = ip_vs_conn_expire; + cp->protocol = proto; + cp->caddr = caddr; + cp->cport = cport; + cp->vaddr = vaddr; + cp->vport = vport; + cp->daddr = daddr; + cp->dport = dport; + cp->flags = flags; + spin_lock_init(&cp->lock); + + /* + * Set the entry is referenced by the current thread before hashing + * it in the table, so that other thread run ip_vs_random_dropentry + * but cannot drop this entry. + */ + atomic_set(&cp->refcnt, 1); + + atomic_set(&cp->n_control, 0); + atomic_set(&cp->in_pkts, 0); + + atomic_inc(&ip_vs_conn_count); + if (flags & IP_VS_CONN_F_NO_CPORT) + atomic_inc(&ip_vs_conn_no_cport_cnt); + + /* Bind the connection with a destination server */ + ip_vs_bind_dest(cp, dest); + + /* Set its state and timeout */ + cp->state = 0; + cp->timeout = 3*HZ; + + /* Bind its packet transmitter */ + ip_vs_bind_xmit(cp); + + if (unlikely(pp && atomic_read(&pp->appcnt))) + ip_vs_bind_app(cp, pp); + + /* Hash it in the ip_vs_conn_tab finally */ + ip_vs_conn_hash(cp); + + return cp; +} + + +/* + * /proc/net/ip_vs_conn entries + */ +#ifdef CONFIG_PROC_FS + +static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) +{ + int idx; + struct ip_vs_conn *cp; + + for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + ct_read_lock_bh(idx); + list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + if (pos-- == 0) { + seq->private = &ip_vs_conn_tab[idx]; + return cp; + } + } + ct_read_unlock_bh(idx); + } + + return NULL; +} + +static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) +{ + seq->private = NULL; + return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; +} + +static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_conn *cp = v; + struct list_head *e, *l = seq->private; + int idx; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_conn_array(seq, 0); + + /* more on same hash chain? */ + if ((e = cp->c_list.next) != l) + return list_entry(e, struct ip_vs_conn, c_list); + + idx = l - ip_vs_conn_tab; + ct_read_unlock_bh(idx); + + while (++idx < IP_VS_CONN_TAB_SIZE) { + ct_read_lock_bh(idx); + list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + seq->private = &ip_vs_conn_tab[idx]; + return cp; + } + ct_read_unlock_bh(idx); + } + seq->private = NULL; + return NULL; +} + +static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) +{ + struct list_head *l = seq->private; + + if (l) + ct_read_unlock_bh(l - ip_vs_conn_tab); +} + +static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); + else { + const struct ip_vs_conn *cp = v; + + seq_printf(seq, + "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr), ntohs(cp->cport), + ntohl(cp->vaddr), ntohs(cp->vport), + ntohl(cp->daddr), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static struct seq_operations ip_vs_conn_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_seq_show, +}; + +static int ip_vs_conn_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_conn_seq_ops); +} + +static struct file_operations ip_vs_conn_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + + +/* + * Randomly drop connection entries before running out of memory + */ +static inline int todrop_entry(struct ip_vs_conn *cp) +{ + /* + * The drop rate array needs tuning for real environments. + * Called from timer bh only => no locking + */ + static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static char todrop_counter[9] = {0}; + int i; + + /* if the conn entry hasn't lasted for 60 seconds, don't drop it. + This will leave enough time for normal connection to get + through. */ + if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) + return 0; + + /* Don't drop the entry if its number of incoming packets is not + located in [0, 8] */ + i = atomic_read(&cp->in_pkts); + if (i > 8 || i < 0) return 0; + + if (!todrop_rate[i]) return 0; + if (--todrop_counter[i] > 0) return 0; + + todrop_counter[i] = todrop_rate[i]; + return 1; +} + + +void ip_vs_random_dropentry(void) +{ + int idx; + struct ip_vs_conn *cp; + struct ip_vs_conn *ct; + + /* + * Randomly scan 1/32 of the whole table every second + */ + for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { + unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; + + /* + * Lock is actually needed in this loop. + */ + ct_write_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT)) + /* connection template */ + continue; + + if (cp->protocol == IPPROTO_TCP) { + switch(cp->state) { + case IP_VS_TCP_S_SYN_RECV: + case IP_VS_TCP_S_SYNACK: + break; + + case IP_VS_TCP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + + default: + continue; + } + } else { + if (!todrop_entry(cp)) + continue; + } + + /* + * Drop the entry, and drop its ct if not referenced + */ + atomic_inc(&cp->refcnt); + ct_write_unlock(hash); + + if ((ct = cp->control)) + atomic_inc(&ct->refcnt); + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (ct) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(ct); + } + ct_write_lock(hash); + } + ct_write_unlock(hash); + } +} + + +/* + * Flush all the connection entries in the ip_vs_conn_tab + */ +static void ip_vs_conn_flush(void) +{ + int idx; + struct ip_vs_conn *cp; + struct ip_vs_conn *ct; + + flush_again: + for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { + /* + * Lock is actually needed in this loop. + */ + ct_write_lock_bh(idx); + + list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + atomic_inc(&cp->refcnt); + ct_write_unlock(idx); + + if ((ct = cp->control)) + atomic_inc(&ct->refcnt); + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (ct) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(ct); + } + ct_write_lock(idx); + } + ct_write_unlock_bh(idx); + } + + /* the counter may be not NULL, because maybe some conn entries + are run by slow timer handler or unhashed but still referred */ + if (atomic_read(&ip_vs_conn_count) != 0) { + schedule(); + goto flush_again; + } +} + + +int ip_vs_conn_init(void) +{ + int idx; + + /* + * Allocate the connection hash table and initialize its list heads + */ + ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); + if (!ip_vs_conn_tab) + return -ENOMEM; + + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; + } + + IP_VS_INFO("Connection hash table configured " + "(size=%d, memory=%ldKbytes)\n", + IP_VS_CONN_TAB_SIZE, + (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); + IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", + sizeof(struct ip_vs_conn)); + + for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); + } + + for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { + rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); + } + + proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops); + + /* calculate the random value for connection hash */ + get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); + + return 0; +} + + +void ip_vs_conn_cleanup(void) +{ + /* flush all the connection entries first */ + ip_vs_conn_flush(); + + /* Release the empty cache */ + kmem_cache_destroy(ip_vs_conn_cachep); + proc_net_remove("ip_vs_conn"); + vfree(ip_vs_conn_tab); +} diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c new file mode 100644 index 000000000000..5fb257dd07cb --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -0,0 +1,1191 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. + * + * Changes: + * Paul `Rusty' Russell properly handle non-linear skbs + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/icmp.h> + +#include <net/ip.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/icmp.h> /* for icmp_send */ +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +EXPORT_SYMBOL(register_ip_vs_scheduler); +EXPORT_SYMBOL(unregister_ip_vs_scheduler); +EXPORT_SYMBOL(ip_vs_skb_replace); +EXPORT_SYMBOL(ip_vs_proto_name); +EXPORT_SYMBOL(ip_vs_conn_new); +EXPORT_SYMBOL(ip_vs_conn_in_get); +EXPORT_SYMBOL(ip_vs_conn_out_get); +#ifdef CONFIG_IP_VS_PROTO_TCP +EXPORT_SYMBOL(ip_vs_tcp_conn_listen); +#endif +EXPORT_SYMBOL(ip_vs_conn_put); +#ifdef CONFIG_IP_VS_DEBUG +EXPORT_SYMBOL(ip_vs_get_debug_level); +#endif +EXPORT_SYMBOL(ip_vs_make_skb_writable); + + +/* ID used in ICMP lookups */ +#define icmp_id(icmph) (((icmph)->un).echo.id) + +const char *ip_vs_proto_name(unsigned proto) +{ + static char buf[20]; + + switch (proto) { + case IPPROTO_IP: + return "IP"; + case IPPROTO_UDP: + return "UDP"; + case IPPROTO_TCP: + return "TCP"; + case IPPROTO_ICMP: + return "ICMP"; + default: + sprintf(buf, "IP_%d", proto); + return buf; + } +} + +void ip_vs_init_hash_table(struct list_head *table, int rows) +{ + while (--rows >= 0) + INIT_LIST_HEAD(&table[rows]); +} + +static inline void +ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.inpkts++; + dest->stats.inbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.inpkts++; + dest->svc->stats.inbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.inpkts++; + ip_vs_stats.inbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.outpkts++; + dest->stats.outbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.outpkts++; + dest->svc->stats.outbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.outpkts++; + ip_vs_stats.outbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) +{ + spin_lock(&cp->dest->stats.lock); + cp->dest->stats.conns++; + spin_unlock(&cp->dest->stats.lock); + + spin_lock(&svc->stats.lock); + svc->stats.conns++; + spin_unlock(&svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.conns++; + spin_unlock(&ip_vs_stats.lock); +} + + +static inline int +ip_vs_set_state(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + if (unlikely(!pp->state_transition)) + return 0; + return pp->state_transition(cp, direction, skb, pp); +} + + +int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len) +{ + struct sk_buff *skb = *pskb; + + /* skb is already used, better copy skb and its payload */ + if (unlikely(skb_shared(skb) || skb->sk)) + goto copy_skb; + + /* skb data is already used, copy it */ + if (unlikely(skb_cloned(skb))) + goto copy_data; + + return pskb_may_pull(skb, writable_len); + + copy_data: + if (unlikely(writable_len > skb->len)) + return 0; + return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + + copy_skb: + if (unlikely(writable_len > skb->len)) + return 0; + skb = skb_copy(skb, GFP_ATOMIC); + if (!skb) + return 0; + BUG_ON(skb_is_nonlinear(skb)); + + /* Rest of kernel will get very unhappy if we pass it a + suddenly-orphaned skbuff */ + if ((*pskb)->sk) + skb_set_owner_w(skb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = skb; + return 1; +} + +/* + * IPVS persistent scheduling function + * It creates a connection entry according to its template if exists, + * or selects a server and creates a connection entry plus a template. + * Locking: we are svc user (svc->refcnt), so we hold all dests too + * Protocols supported: TCP, UDP + */ +static struct ip_vs_conn * +ip_vs_sched_persist(struct ip_vs_service *svc, + const struct sk_buff *skb, + __u16 ports[2]) +{ + struct ip_vs_conn *cp = NULL; + struct iphdr *iph = skb->nh.iph; + struct ip_vs_dest *dest; + struct ip_vs_conn *ct; + __u16 dport; /* destination port to forward */ + __u32 snet; /* source network of the client, after masking */ + + /* Mask saddr with the netmask to adjust template granularity */ + snet = iph->saddr & svc->netmask; + + IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " + "mnet %u.%u.%u.%u\n", + NIPQUAD(iph->saddr), ntohs(ports[0]), + NIPQUAD(iph->daddr), ntohs(ports[1]), + NIPQUAD(snet)); + + /* + * As far as we know, FTP is a very complicated network protocol, and + * it uses control connection and data connections. For active FTP, + * FTP server initialize data connection to the client, its source port + * is often 20. For passive FTP, FTP server tells the clients the port + * that it passively listens to, and the client issues the data + * connection. In the tunneling or direct routing mode, the load + * balancer is on the client-to-server half of connection, the port + * number is unknown to the load balancer. So, a conn template like + * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP + * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> + * is created for other persistent services. + */ + if (ports[1] == svc->port) { + /* Check if a template already exists */ + if (svc->port != FTPPORT) + ct = ip_vs_conn_in_get(iph->protocol, snet, 0, + iph->daddr, ports[1]); + else + ct = ip_vs_conn_in_get(iph->protocol, snet, 0, + iph->daddr, 0); + + if (!ct || !ip_vs_check_template(ct)) { + /* + * No template found or the dest of the connection + * template is not available. + */ + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template like <protocol,caddr,0, + * vaddr,vport,daddr,dport> for non-ftp service, + * and <protocol,caddr,0,vaddr,0,daddr,0> + * for ftp service. + */ + if (svc->port != FTPPORT) + ct = ip_vs_conn_new(iph->protocol, + snet, 0, + iph->daddr, + ports[1], + dest->addr, dest->port, + 0, + dest); + else + ct = ip_vs_conn_new(iph->protocol, + snet, 0, + iph->daddr, 0, + dest->addr, 0, + 0, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = dest->port; + } else { + /* + * Note: persistent fwmark-based services and persistent + * port zero service are handled here. + * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> + * port zero template: <protocol,caddr,0,vaddr,0,daddr,0> + */ + if (svc->fwmark) + ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0, + htonl(svc->fwmark), 0); + else + ct = ip_vs_conn_in_get(iph->protocol, snet, 0, + iph->daddr, 0); + + if (!ct || !ip_vs_check_template(ct)) { + /* + * If it is not persistent port zero, return NULL, + * otherwise create a connection template. + */ + if (svc->port) + return NULL; + + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template according to the service + */ + if (svc->fwmark) + ct = ip_vs_conn_new(IPPROTO_IP, + snet, 0, + htonl(svc->fwmark), 0, + dest->addr, 0, + 0, + dest); + else + ct = ip_vs_conn_new(iph->protocol, + snet, 0, + iph->daddr, 0, + dest->addr, 0, + 0, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = ports[1]; + } + + /* + * Create a new connection according to the template + */ + cp = ip_vs_conn_new(iph->protocol, + iph->saddr, ports[0], + iph->daddr, ports[1], + dest->addr, dport, + 0, + dest); + if (cp == NULL) { + ip_vs_conn_put(ct); + return NULL; + } + + /* + * Add its control + */ + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * IPVS main scheduling function + * It selects a server according to the virtual service, and + * creates a connection entry. + * Protocols supported: TCP, UDP + */ +struct ip_vs_conn * +ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_conn *cp = NULL; + struct iphdr *iph = skb->nh.iph; + struct ip_vs_dest *dest; + __u16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, iph->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + /* + * Persistent service + */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, skb, pptr); + + /* + * Non-persistent service + */ + if (!svc->fwmark && pptr[1] != svc->port) { + if (!svc->port) + IP_VS_ERR("Schedule: port zero only supported " + "in persistent services, " + "check your ipvs configuration\n"); + return NULL; + } + + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "Schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a connection entry. + */ + cp = ip_vs_conn_new(iph->protocol, + iph->saddr, pptr[0], + iph->daddr, pptr[1], + dest->addr, dest->port?dest->port:pptr[1], + 0, + dest); + if (cp == NULL) + return NULL; + + IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " + "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n", + ip_vs_fwd_tag(cp), + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * Pass or drop the packet. + * Called by ip_vs_in, when the virtual service is available but + * no destination is available for a new connection. + */ +int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + __u16 _ports[2], *pptr; + struct iphdr *iph = skb->nh.iph; + + pptr = skb_header_pointer(skb, iph->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) { + ip_vs_service_put(svc); + return NF_DROP; + } + + /* if it is fwmark-based service, the cache_bypass sysctl is up + and the destination is RTN_UNICAST (and not local), then create + a cache_bypass connection entry */ + if (sysctl_ip_vs_cache_bypass && svc->fwmark + && (inet_addr_type(iph->daddr) == RTN_UNICAST)) { + int ret, cs; + struct ip_vs_conn *cp; + + ip_vs_service_put(svc); + + /* create a new connection entry */ + IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); + cp = ip_vs_conn_new(iph->protocol, + iph->saddr, pptr[0], + iph->daddr, pptr[1], + 0, 0, + IP_VS_CONN_F_BYPASS, + NULL); + if (cp == NULL) + return NF_DROP; + + /* statistics */ + ip_vs_in_stats(cp, skb); + + /* set state */ + cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + + /* transmit the first SYN packet */ + ret = cp->packet_xmit(skb, cp, pp); + /* do not touch skb anymore */ + + atomic_inc(&cp->in_pkts); + ip_vs_conn_put(cp); + return ret; + } + + /* + * When the virtual ftp service is presented, packets destined + * for other services on the VIP may get here (except services + * listed in the ipvs table), pass the packets, because it is + * not ipvs job to decide to drop the packets. + */ + if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { + ip_vs_service_put(svc); + return NF_ACCEPT; + } + + ip_vs_service_put(svc); + + /* + * Notify the client that the destination is unreachable, and + * release the socket buffer. + * Since it is in IP layer, the TCP socket is not actually + * created, the TCP RST packet cannot be sent, instead that + * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ + */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + return NF_DROP; +} + + +/* + * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING + * chain, and is used for VS/NAT. + * It detects packets for VS/NAT connections and sends the packets + * immediately. This can avoid that iptable_nat mangles the packets + * for VS/NAT. + */ +static unsigned int ip_vs_post_routing(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY)) + return NF_ACCEPT; + + /* The packet was sent from IPVS, exit this chain */ + (*okfn)(*pskb); + + return NF_STOLEN; +} + +u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) +{ + return (u16) csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); +} + +static inline struct sk_buff * +ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + skb = ip_defrag(skb, user); + if (skb) + ip_send_check(skb->nh.iph); + return skb; +} + +/* + * Packet has been made sufficiently writable in caller + * - inout: 1=in->out, 0=out->in + */ +void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct iphdr *iph = skb->nh.iph; + unsigned int icmp_offset = iph->ihl*4; + struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset); + struct iphdr *ciph = (struct iphdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr; + ip_send_check(iph); + ciph->daddr = cp->vaddr; + ip_send_check(ciph); + } else { + iph->daddr = cp->daddr; + ip_send_check(iph); + ciph->saddr = cp->daddr; + ip_send_check(ciph); + } + + /* the TCP/UDP port */ + if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { + __u16 *ports = (void *)ciph + ciph->ihl*4; + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMP"); + else + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMP"); +} + +/* + * Handle ICMP messages in the inside-to-outside direction (outgoing). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + * (Only used in VS/NAT) + */ +static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) +{ + struct sk_buff *skb = *pskb; + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl, verdict; + + *related = 1; + + /* reassemble IP fragments */ + if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT); + if (!skb) + return NF_STOLEN; + *pskb = skb; + } + + iph = skb->nh.iph; + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", + ic->type, ntohs(icmp_id(ic)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); + + offset += cih->ihl * 4; + + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(skb, pp, cih, offset, 1); + if (!cp) + return NF_ACCEPT; + + verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != 0) { + IP_VS_ERR("shouldn't reach here, because the box is on the" + "half connection in the tun/dr module.\n"); + } + + /* Ensure the checksum is correct */ + if (skb->ip_summed != CHECKSUM_UNNECESSARY && + ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + goto out; + } + + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + offset += 2 * sizeof(__u16); + if (!ip_vs_make_skb_writable(pskb, offset)) + goto out; + skb = *pskb; + + ip_vs_nat_icmp(skb, pp, cp, 1); + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->nfcache |= NFC_IPVS_PROPERTY; + verdict = NF_ACCEPT; + + out: + __ip_vs_conn_put(cp); + + return verdict; +} + +static inline int is_tcp_reset(const struct sk_buff *skb) +{ + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + return th->rst; +} + +/* + * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT. + * Check if outgoing packet belongs to the established ip_vs_conn, + * rewrite addresses of the packet and send it on its way... + */ +static unsigned int +ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *pskb; + struct iphdr *iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + int ihl; + + EnterFunction(11); + + if (skb->nfcache & NFC_IPVS_PROPERTY) + return NF_ACCEPT; + + iph = skb->nh.iph; + if (unlikely(iph->protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_out_icmp(pskb, &related); + + if (related) + return verdict; + skb = *pskb; + iph = skb->nh.iph; + } + + pp = ip_vs_proto_get(iph->protocol); + if (unlikely(!pp)) + return NF_ACCEPT; + + /* reassemble IP fragments */ + if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) && + !pp->dont_defrag)) { + skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT); + if (!skb) + return NF_STOLEN; + iph = skb->nh.iph; + *pskb = skb; + } + + ihl = iph->ihl << 2; + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(skb, pp, iph, ihl, 0); + + if (unlikely(!cp)) { + if (sysctl_ip_vs_nat_icmp_send && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP)) { + __u16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, ihl, + sizeof(_ports), _ports); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_lookup_real_service(iph->protocol, + iph->saddr, pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. + */ + if (iph->protocol != IPPROTO_TCP + || !is_tcp_reset(skb)) { + icmp_send(skb,ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; + } + } + } + IP_VS_DBG_PKT(12, pp, skb, 0, + "packet continues traversal as normal"); + return NF_ACCEPT; + } + + IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); + + if (!ip_vs_make_skb_writable(pskb, ihl)) + goto drop; + + /* mangle the packet */ + if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp)) + goto drop; + skb = *pskb; + skb->nh.iph->saddr = cp->vaddr; + ip_send_check(skb->nh.iph); + + IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); + + ip_vs_out_stats(cp, skb); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + ip_vs_conn_put(cp); + + skb->nfcache |= NFC_IPVS_PROPERTY; + + LeaveFunction(11); + return NF_ACCEPT; + + drop: + ip_vs_conn_put(cp); + kfree_skb(*pskb); + return NF_STOLEN; +} + + +/* + * Handle ICMP messages in the outside-to-inside direction (incoming). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int +ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum) +{ + struct sk_buff *skb = *pskb; + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl, verdict; + + *related = 1; + + /* reassemble IP fragments */ + if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + skb = ip_vs_gather_frags(skb, + hooknum == NF_IP_LOCAL_IN ? + IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD); + if (!skb) + return NF_STOLEN; + *pskb = skb; + } + + iph = skb->nh.iph; + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", + ic->type, ntohs(icmp_id(ic)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); + + offset += cih->ihl * 4; + + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(skb, pp, cih, offset, 1); + if (!cp) + return NF_ACCEPT; + + verdict = NF_DROP; + + /* Ensure the checksum is correct */ + if (skb->ip_summed != CHECKSUM_UNNECESSARY && + ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + goto out; + } + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); + /* do not touch skb anymore */ + + out: + __ip_vs_conn_put(cp); + + return verdict; +} + +/* + * Check if it's for virtual services, look it up, + * and send it on its way... + */ +static unsigned int +ip_vs_in(unsigned int hooknum, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *pskb; + struct iphdr *iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + int ret, restart; + int ihl; + + /* + * Big tappo: only PACKET_HOST (neither loopback nor mcasts) + * ... don't know why 1st test DOES NOT include 2nd (?) + */ + if (unlikely(skb->pkt_type != PACKET_HOST + || skb->dev == &loopback_dev || skb->sk)) { + IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", + skb->pkt_type, + skb->nh.iph->protocol, + NIPQUAD(skb->nh.iph->daddr)); + return NF_ACCEPT; + } + + iph = skb->nh.iph; + if (unlikely(iph->protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum); + + if (related) + return verdict; + skb = *pskb; + iph = skb->nh.iph; + } + + /* Protocol supported? */ + pp = ip_vs_proto_get(iph->protocol); + if (unlikely(!pp)) + return NF_ACCEPT; + + ihl = iph->ihl << 2; + + /* + * Check if the packet belongs to an existing connection entry + */ + cp = pp->conn_in_get(skb, pp, iph, ihl, 0); + + if (unlikely(!cp)) { + int v; + + if (!pp->conn_schedule(skb, pp, &v, &cp)) + return v; + } + + if (unlikely(!cp)) { + /* sorry, all this trouble for a no-hit :) */ + IP_VS_DBG_PKT(12, pp, skb, 0, + "packet continues traversal as normal"); + return NF_ACCEPT; + } + + IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); + + /* Check the server status */ + if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + /* the destination server is not available */ + + if (sysctl_ip_vs_expire_nodest_conn) { + /* try to expire the connection immediately */ + ip_vs_conn_expire_now(cp); + } else { + /* don't restart its timer, and silently + drop the packet. */ + __ip_vs_conn_put(cp); + } + return NF_DROP; + } + + ip_vs_in_stats(cp, skb); + restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + if (cp->packet_xmit) + ret = cp->packet_xmit(skb, cp, pp); + /* do not touch skb anymore */ + else { + IP_VS_DBG_RL("warning: packet_xmit is null"); + ret = NF_ACCEPT; + } + + /* increase its packet counter and check if it is needed + to be synchronized */ + atomic_inc(&cp->in_pkts); + if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && + (cp->protocol != IPPROTO_TCP || + cp->state == IP_VS_TCP_S_ESTABLISHED) && + (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] + == sysctl_ip_vs_sync_threshold[0])) + ip_vs_sync_conn(cp); + + ip_vs_conn_put(cp); + return ret; +} + + +/* + * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP + * related packets destined for 0.0.0.0/0. + * When fwmark-based virtual service is used, such as transparent + * cache cluster, TCP packets can be marked and routed to ip_vs_in, + * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and + * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain + * and send them to ip_vs_in_icmp. + */ +static unsigned int +ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + + if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP) + return NF_ACCEPT; + + return ip_vs_in_icmp(pskb, &r, hooknum); +} + + +/* After packet filtering, forward packet through VS/DR, VS/TUN, + or VS/NAT(change destination), so that filtering rules can be + applied to IPVS. */ +static struct nf_hook_ops ip_vs_in_ops = { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = 100, +}; + +/* After packet filtering, change source only for VS/NAT */ +static struct nf_hook_ops ip_vs_out_ops = { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = 100, +}; + +/* After packet filtering (but before ip_vs_out_icmp), catch icmp + destined for 0.0.0.0/0, which is for incoming IPVS connections */ +static struct nf_hook_ops ip_vs_forward_icmp_ops = { + .hook = ip_vs_forward_icmp, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = 99, +}; + +/* Before the netfilter connection tracking, exit from POST_ROUTING */ +static struct nf_hook_ops ip_vs_post_routing_ops = { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC-1, +}; + + +/* + * Initialize IP Virtual Server + */ +static int __init ip_vs_init(void) +{ + int ret; + + ret = ip_vs_control_init(); + if (ret < 0) { + IP_VS_ERR("can't setup control.\n"); + goto cleanup_nothing; + } + + ip_vs_protocol_init(); + + ret = ip_vs_app_init(); + if (ret < 0) { + IP_VS_ERR("can't setup application helper.\n"); + goto cleanup_protocol; + } + + ret = ip_vs_conn_init(); + if (ret < 0) { + IP_VS_ERR("can't setup connection table.\n"); + goto cleanup_app; + } + + ret = nf_register_hook(&ip_vs_in_ops); + if (ret < 0) { + IP_VS_ERR("can't register in hook.\n"); + goto cleanup_conn; + } + + ret = nf_register_hook(&ip_vs_out_ops); + if (ret < 0) { + IP_VS_ERR("can't register out hook.\n"); + goto cleanup_inops; + } + ret = nf_register_hook(&ip_vs_post_routing_ops); + if (ret < 0) { + IP_VS_ERR("can't register post_routing hook.\n"); + goto cleanup_outops; + } + ret = nf_register_hook(&ip_vs_forward_icmp_ops); + if (ret < 0) { + IP_VS_ERR("can't register forward_icmp hook.\n"); + goto cleanup_postroutingops; + } + + IP_VS_INFO("ipvs loaded.\n"); + return ret; + + cleanup_postroutingops: + nf_unregister_hook(&ip_vs_post_routing_ops); + cleanup_outops: + nf_unregister_hook(&ip_vs_out_ops); + cleanup_inops: + nf_unregister_hook(&ip_vs_in_ops); + cleanup_conn: + ip_vs_conn_cleanup(); + cleanup_app: + ip_vs_app_cleanup(); + cleanup_protocol: + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + cleanup_nothing: + return ret; +} + +static void __exit ip_vs_cleanup(void) +{ + nf_unregister_hook(&ip_vs_forward_icmp_ops); + nf_unregister_hook(&ip_vs_post_routing_ops); + nf_unregister_hook(&ip_vs_out_ops); + nf_unregister_hook(&ip_vs_in_ops); + ip_vs_conn_cleanup(); + ip_vs_app_cleanup(); + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + IP_VS_INFO("ipvs unloaded.\n"); +} + +module_init(ip_vs_init); +module_exit(ip_vs_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c new file mode 100644 index 000000000000..218d9701036e --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -0,0 +1,2391 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/workqueue.h> +#include <linux/swap.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip.h> +#include <net/sock.h> + +#include <asm/uaccess.h> + +#include <net/ip_vs.h> + +/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ +static DECLARE_MUTEX(__ip_vs_mutex); + +/* lock for service table */ +static DEFINE_RWLOCK(__ip_vs_svc_lock); + +/* lock for table with the real services */ +static DEFINE_RWLOCK(__ip_vs_rs_lock); + +/* lock for state and timeout tables */ +static DEFINE_RWLOCK(__ip_vs_securetcp_lock); + +/* lock for drop entry handling */ +static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); + +/* lock for drop packet handling */ +static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); + +/* 1/rate drop and drop-entry variables */ +int ip_vs_drop_rate = 0; +int ip_vs_drop_counter = 0; +static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); + +/* number of virtual services */ +static int ip_vs_num_services = 0; + +/* sysctl variables */ +static int sysctl_ip_vs_drop_entry = 0; +static int sysctl_ip_vs_drop_packet = 0; +static int sysctl_ip_vs_secure_tcp = 0; +static int sysctl_ip_vs_amemthresh = 1024; +static int sysctl_ip_vs_am_droprate = 10; +int sysctl_ip_vs_cache_bypass = 0; +int sysctl_ip_vs_expire_nodest_conn = 0; +int sysctl_ip_vs_expire_quiescent_template = 0; +int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; +int sysctl_ip_vs_nat_icmp_send = 0; + + +#ifdef CONFIG_IP_VS_DEBUG +static int sysctl_ip_vs_debug_level = 0; + +int ip_vs_get_debug_level(void) +{ + return sysctl_ip_vs_debug_level; +} +#endif + +/* + * update_defense_level is called from keventd and from sysctl. + */ +static void update_defense_level(void) +{ + struct sysinfo i; + static int old_secure_tcp = 0; + int availmem; + int nomem; + int to_change = -1; + + /* we only count free and buffered memory (in pages) */ + si_meminfo(&i); + availmem = i.freeram + i.bufferram; + /* however in linux 2.5 the i.bufferram is total page cache size, + we need adjust it */ + /* si_swapinfo(&i); */ + /* availmem = availmem - (i.totalswap - i.freeswap); */ + + nomem = (availmem < sysctl_ip_vs_amemthresh); + + /* drop_entry */ + spin_lock(&__ip_vs_dropentry_lock); + switch (sysctl_ip_vs_drop_entry) { + case 0: + atomic_set(&ip_vs_dropentry, 0); + break; + case 1: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + sysctl_ip_vs_drop_entry = 2; + } else { + atomic_set(&ip_vs_dropentry, 0); + } + break; + case 2: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + } else { + atomic_set(&ip_vs_dropentry, 0); + sysctl_ip_vs_drop_entry = 1; + }; + break; + case 3: + atomic_set(&ip_vs_dropentry, 1); + break; + } + spin_unlock(&__ip_vs_dropentry_lock); + + /* drop_packet */ + spin_lock(&__ip_vs_droppacket_lock); + switch (sysctl_ip_vs_drop_packet) { + case 0: + ip_vs_drop_rate = 0; + break; + case 1: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-availmem); + sysctl_ip_vs_drop_packet = 2; + } else { + ip_vs_drop_rate = 0; + } + break; + case 2: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-availmem); + } else { + ip_vs_drop_rate = 0; + sysctl_ip_vs_drop_packet = 1; + } + break; + case 3: + ip_vs_drop_rate = sysctl_ip_vs_am_droprate; + break; + } + spin_unlock(&__ip_vs_droppacket_lock); + + /* secure_tcp */ + write_lock(&__ip_vs_securetcp_lock); + switch (sysctl_ip_vs_secure_tcp) { + case 0: + if (old_secure_tcp >= 2) + to_change = 0; + break; + case 1: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + sysctl_ip_vs_secure_tcp = 2; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + } + break; + case 2: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + sysctl_ip_vs_secure_tcp = 1; + } + break; + case 3: + if (old_secure_tcp < 2) + to_change = 1; + break; + } + old_secure_tcp = sysctl_ip_vs_secure_tcp; + if (to_change >= 0) + ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); + write_unlock(&__ip_vs_securetcp_lock); +} + + +/* + * Timer for checking the defense + */ +#define DEFENSE_TIMER_PERIOD 1*HZ +static void defense_work_handler(void *data); +static DECLARE_WORK(defense_work, defense_work_handler, NULL); + +static void defense_work_handler(void *data) +{ + update_defense_level(); + if (atomic_read(&ip_vs_dropentry)) + ip_vs_random_dropentry(); + + schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); +} + +int +ip_vs_use_count_inc(void) +{ + return try_module_get(THIS_MODULE); +} + +void +ip_vs_use_count_dec(void) +{ + module_put(THIS_MODULE); +} + + +/* + * Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* the service table hashed by <protocol, addr, port> */ +static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +/* the service table hashed by fwmark */ +static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + +/* + * Hash table: for real service lookups + */ +#define IP_VS_RTAB_BITS 4 +#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) +#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) + +static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; + +/* + * Trash for destinations + */ +static LIST_HEAD(ip_vs_dest_trash); + +/* + * FTP & NULL virtual service counters + */ +static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); +static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); + + +/* + * Returns hash value for virtual service + */ +static __inline__ unsigned +ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port) +{ + register unsigned porth = ntohs(port); + + return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) + & IP_VS_SVC_TAB_MASK; +} + +/* + * Returns hash value of fwmark for virtual service lookup + */ +static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) +{ + return fwmark & IP_VS_SVC_TAB_MASK; +} + +/* + * Hashes a service in the ip_vs_svc_table by <proto,addr,port> + * or in the ip_vs_svc_fwm_table by fwmark. + * Should be called with locked tables. + */ +static int ip_vs_svc_hash(struct ip_vs_service *svc) +{ + unsigned hash; + + if (svc->flags & IP_VS_SVC_F_HASHED) { + IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Hash it by <protocol,addr,port> in ip_vs_svc_table + */ + hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); + list_add(&svc->s_list, &ip_vs_svc_table[hash]); + } else { + /* + * Hash it by fwmark in ip_vs_svc_fwm_table + */ + hash = ip_vs_svc_fwm_hashkey(svc->fwmark); + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + } + + svc->flags |= IP_VS_SVC_F_HASHED; + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + return 1; +} + + +/* + * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. + * Should be called with locked tables. + */ +static int ip_vs_svc_unhash(struct ip_vs_service *svc) +{ + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { + IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* Remove it from the ip_vs_svc_table table */ + list_del(&svc->s_list); + } else { + /* Remove it from the ip_vs_svc_fwm_table table */ + list_del(&svc->f_list); + } + + svc->flags &= ~IP_VS_SVC_F_HASHED; + atomic_dec(&svc->refcnt); + return 1; +} + + +/* + * Get service by {proto,addr,port} in the service table. + */ +static __inline__ struct ip_vs_service * +__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashkey(protocol, vaddr, vport); + + list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ + if ((svc->addr == vaddr) + && (svc->port == vport) + && (svc->protocol == protocol)) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + + +/* + * Get service by {fwmark} in the service table. + */ +static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashkey(fwmark); + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { + if (svc->fwmark == fwmark) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + +struct ip_vs_service * +ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport) +{ + struct ip_vs_service *svc; + + read_lock(&__ip_vs_svc_lock); + + /* + * Check the table hashed by fwmark first + */ + if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark))) + goto out; + + /* + * Check the table hashed by <protocol,addr,port> + * for "full" addressed entries + */ + svc = __ip_vs_service_get(protocol, vaddr, vport); + + if (svc == NULL + && protocol == IPPROTO_TCP + && atomic_read(&ip_vs_ftpsvc_counter) + && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { + /* + * Check if ftp service entry exists, the packet + * might belong to FTP data connections. + */ + svc = __ip_vs_service_get(protocol, vaddr, FTPPORT); + } + + if (svc == NULL + && atomic_read(&ip_vs_nullsvc_counter)) { + /* + * Check if the catch-all port (port zero) exists + */ + svc = __ip_vs_service_get(protocol, vaddr, 0); + } + + out: + read_unlock(&__ip_vs_svc_lock); + + IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + NIPQUAD(vaddr), ntohs(vport), + svc?"hit":"not hit"); + + return svc; +} + + +static inline void +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + atomic_inc(&svc->refcnt); + dest->svc = svc; +} + +static inline void +__ip_vs_unbind_svc(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = dest->svc; + + dest->svc = NULL; + if (atomic_dec_and_test(&svc->refcnt)) + kfree(svc); +} + + +/* + * Returns hash value for real service + */ +static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port) +{ + register unsigned porth = ntohs(port); + + return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) + & IP_VS_RTAB_MASK; +} + +/* + * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>. + * should be called with locked tables. + */ +static int ip_vs_rs_hash(struct ip_vs_dest *dest) +{ + unsigned hash; + + if (!list_empty(&dest->d_list)) { + return 0; + } + + /* + * Hash by proto,addr,port, + * which are the parameters of the real service. + */ + hash = ip_vs_rs_hashkey(dest->addr, dest->port); + list_add(&dest->d_list, &ip_vs_rtable[hash]); + + return 1; +} + +/* + * UNhashes ip_vs_dest from ip_vs_rtable. + * should be called with locked tables. + */ +static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +{ + /* + * Remove it from the ip_vs_rtable table. + */ + if (!list_empty(&dest->d_list)) { + list_del(&dest->d_list); + INIT_LIST_HEAD(&dest->d_list); + } + + return 1; +} + +/* + * Lookup real service by <proto,addr,port> in the real service table. + */ +struct ip_vs_dest * +ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport) +{ + unsigned hash; + struct ip_vs_dest *dest; + + /* + * Check for "full" addressed entries + * Return the first found entry + */ + hash = ip_vs_rs_hashkey(daddr, dport); + + read_lock(&__ip_vs_rs_lock); + list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { + if ((dest->addr == daddr) + && (dest->port == dport) + && ((dest->protocol == protocol) || + dest->vfwmark)) { + /* HIT */ + read_unlock(&__ip_vs_rs_lock); + return dest; + } + } + read_unlock(&__ip_vs_rs_lock); + + return NULL; +} + +/* + * Lookup destination by {addr,port} in the given service + */ +static struct ip_vs_dest * +ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) +{ + struct ip_vs_dest *dest; + + /* + * Find the destination for the given service + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->addr == daddr) && (dest->port == dport)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + + +/* + * Lookup dest by {svc,addr,port} in the destination trash. + * The destination trash is used to hold the destinations that are removed + * from the service table but are still referenced by some conn entries. + * The reason to add the destination trash is when the dest is temporary + * down (either by administrator or by monitor program), the dest can be + * picked back from the trash, the remaining connections to the dest can + * continue, and the counting information of the dest is also useful for + * scheduling. + */ +static struct ip_vs_dest * +ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) +{ + struct ip_vs_dest *dest, *nxt; + + /* + * Find the destination in trash + */ + list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " + "refcnt=%d\n", + dest->vfwmark, + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (dest->addr == daddr && + dest->port == dport && + dest->vfwmark == svc->fwmark && + dest->protocol == svc->protocol && + (svc->fwmark || + (dest->vaddr == svc->addr && + dest->vport == svc->port))) { + /* HIT */ + return dest; + } + + /* + * Try to purge the destination from trash if not referenced + */ + if (atomic_read(&dest->refcnt) == 1) { + IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u " + "from trash\n", + dest->vfwmark, + NIPQUAD(dest->addr), ntohs(dest->port)); + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } + } + + return NULL; +} + + +/* + * Clean up all the destinations in the trash + * Called by the ip_vs_control_cleanup() + * + * When the ip_vs_control_clearup is activated by ipvs module exit, + * the service tables must have been flushed and all the connections + * are expired, and the refcnt of each destination in the trash must + * be 1, so we simply release them here. + */ +static void ip_vs_trash_cleanup(void) +{ + struct ip_vs_dest *dest, *nxt; + + list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } +} + + +static void +ip_vs_zero_stats(struct ip_vs_stats *stats) +{ + spin_lock_bh(&stats->lock); + memset(stats, 0, (char *)&stats->lock - (char *)stats); + spin_unlock_bh(&stats->lock); + ip_vs_zero_estimator(stats); +} + +/* + * Update a destination in the given service + */ +static void +__ip_vs_update_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, struct ip_vs_dest_user *udest) +{ + int conn_flags; + + /* set the weight and the flags */ + atomic_set(&dest->weight, udest->weight); + conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; + + /* check if local node and update the flags */ + if (inet_addr_type(udest->addr) == RTN_LOCAL) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } + + /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ + if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { + conn_flags |= IP_VS_CONN_F_NOOUTPUT; + } else { + /* + * Put the real service in ip_vs_rtable if not present. + * For now only for NAT! + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_hash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + } + atomic_set(&dest->conn_flags, conn_flags); + + /* bind the service */ + if (!dest->svc) { + __ip_vs_bind_svc(dest, svc); + } else { + if (dest->svc != svc) { + __ip_vs_unbind_svc(dest); + ip_vs_zero_stats(&dest->stats); + __ip_vs_bind_svc(dest, svc); + } + } + + /* set the dest status flags */ + dest->flags |= IP_VS_DEST_F_AVAILABLE; + + if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + dest->u_threshold = udest->u_threshold; + dest->l_threshold = udest->l_threshold; +} + + +/* + * Create a destination for the given service + */ +static int +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, + struct ip_vs_dest **dest_p) +{ + struct ip_vs_dest *dest; + unsigned atype; + + EnterFunction(2); + + atype = inet_addr_type(udest->addr); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + + dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); + if (dest == NULL) { + IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); + return -ENOMEM; + } + memset(dest, 0, sizeof(struct ip_vs_dest)); + + dest->protocol = svc->protocol; + dest->vaddr = svc->addr; + dest->vport = svc->port; + dest->vfwmark = svc->fwmark; + dest->addr = udest->addr; + dest->port = udest->port; + + atomic_set(&dest->activeconns, 0); + atomic_set(&dest->inactconns, 0); + atomic_set(&dest->persistconns, 0); + atomic_set(&dest->refcnt, 0); + + INIT_LIST_HEAD(&dest->d_list); + spin_lock_init(&dest->dst_lock); + spin_lock_init(&dest->stats.lock); + __ip_vs_update_dest(svc, dest, udest); + ip_vs_new_estimator(&dest->stats); + + *dest_p = dest; + + LeaveFunction(2); + return 0; +} + + +/* + * Add a destination into an existing service + */ +static int +ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) +{ + struct ip_vs_dest *dest; + __u32 daddr = udest->addr; + __u16 dport = udest->port; + int ret; + + EnterFunction(2); + + if (udest->weight < 0) { + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " + "upper threshold\n"); + return -ERANGE; + } + + /* + * Check if the dest already exists in the list + */ + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest != NULL) { + IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); + return -EEXIST; + } + + /* + * Check if the dest already exists in the trash and + * is from the same service + */ + dest = ip_vs_trash_get_dest(svc, daddr, dport); + if (dest != NULL) { + IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " + "refcnt=%d, service %u/%u.%u.%u.%u:%u\n", + NIPQUAD(daddr), ntohs(dport), + atomic_read(&dest->refcnt), + dest->vfwmark, + NIPQUAD(dest->vaddr), + ntohs(dest->vport)); + __ip_vs_update_dest(svc, dest, udest); + + /* + * Get the destination from the trash + */ + list_del(&dest->n_list); + + ip_vs_new_estimator(&dest->stats); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + return 0; + } + + /* + * Allocate and initialize the dest structure + */ + ret = ip_vs_new_dest(svc, udest, &dest); + if (ret) { + return ret; + } + + /* + * Add the dest entry into the list + */ + atomic_inc(&dest->refcnt); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Edit a destination in the given service + */ +static int +ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) +{ + struct ip_vs_dest *dest; + __u32 daddr = udest->addr; + __u16 dport = udest->port; + + EnterFunction(2); + + if (udest->weight < 0) { + IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " + "upper threshold\n"); + return -ERANGE; + } + + /* + * Lookup the destination list + */ + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); + return -ENOENT; + } + + __ip_vs_update_dest(svc, dest, udest); + + write_lock_bh(&__ip_vs_svc_lock); + + /* Wait until all other svc users go away */ + while (atomic_read(&svc->usecnt) > 1) {}; + + /* call the update_service, because server weight may be changed */ + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Delete a destination (must be already unlinked from the service) + */ +static void __ip_vs_del_dest(struct ip_vs_dest *dest) +{ + ip_vs_kill_estimator(&dest->stats); + + /* + * Remove it from the d-linked list with the real services. + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_unhash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + + /* + * Decrease the refcnt of the dest, and free the dest + * if nobody refers to it (refcnt=0). Otherwise, throw + * the destination into the trash. + */ + if (atomic_dec_and_test(&dest->refcnt)) { + ip_vs_dst_reset(dest); + /* simply decrease svc->refcnt here, let the caller check + and release the service if nobody refers to it. + Only user context can release destination and service, + and only one user context can update virtual service at a + time, so the operation here is OK */ + atomic_dec(&dest->svc->refcnt); + kfree(dest); + } else { + IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + list_add(&dest->n_list, &ip_vs_dest_trash); + atomic_inc(&dest->refcnt); + } +} + + +/* + * Unlink a destination from the given service + */ +static void __ip_vs_unlink_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + int svcupd) +{ + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + + /* + * Remove it from the d-linked destination list. + */ + list_del(&dest->n_list); + svc->num_dests--; + if (svcupd) { + /* + * Call the update_service function of its scheduler + */ + svc->scheduler->update_service(svc); + } +} + + +/* + * Delete a destination server in the given service + */ +static int +ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest) +{ + struct ip_vs_dest *dest; + __u32 daddr = udest->addr; + __u16 dport = udest->port; + + EnterFunction(2); + + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* + * Unlink dest from the service + */ + __ip_vs_unlink_dest(svc, dest, 1); + + write_unlock_bh(&__ip_vs_svc_lock); + + /* + * Delete the destination + */ + __ip_vs_del_dest(dest); + + LeaveFunction(2); + + return 0; +} + + +/* + * Add a service into the service hash table + */ +static int +ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) +{ + int ret = 0; + struct ip_vs_scheduler *sched = NULL; + struct ip_vs_service *svc = NULL; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* Lookup the scheduler by 'u->sched_name' */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s not found\n", + u->sched_name); + ret = -ENOENT; + goto out_mod_dec; + } + + svc = (struct ip_vs_service *) + kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); + if (svc == NULL) { + IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); + ret = -ENOMEM; + goto out_err; + } + memset(svc, 0, sizeof(struct ip_vs_service)); + + /* I'm the first user of the service */ + atomic_set(&svc->usecnt, 1); + atomic_set(&svc->refcnt, 0); + + svc->protocol = u->protocol; + svc->addr = u->addr; + svc->port = u->port; + svc->fwmark = u->fwmark; + svc->flags = u->flags; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + INIT_LIST_HEAD(&svc->destinations); + rwlock_init(&svc->sched_lock); + spin_lock_init(&svc->stats.lock); + + /* Bind the scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + + /* Update the virtual service counters */ + if (svc->port == FTPPORT) + atomic_inc(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_inc(&ip_vs_nullsvc_counter); + + ip_vs_new_estimator(&svc->stats); + ip_vs_num_services++; + + /* Hash the service into the service table */ + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_hash(svc); + write_unlock_bh(&__ip_vs_svc_lock); + + *svc_p = svc; + return 0; + + out_err: + if (svc != NULL) { + if (svc->scheduler) + ip_vs_unbind_scheduler(svc); + if (svc->inc) { + local_bh_disable(); + ip_vs_app_inc_put(svc->inc); + local_bh_enable(); + } + kfree(svc); + } + ip_vs_scheduler_put(sched); + + out_mod_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +/* + * Edit a service and bind it with a new scheduler + */ +static int +ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u) +{ + struct ip_vs_scheduler *sched, *old_sched; + int ret = 0; + + /* + * Lookup the scheduler, by 'u->sched_name' + */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s not found\n", + u->sched_name); + return -ENOENT; + } + old_sched = sched; + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* + * Set the flags and timeout value + */ + svc->flags = u->flags | IP_VS_SVC_F_HASHED; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + old_sched = svc->scheduler; + if (sched != old_sched) { + /* + * Unbind the old scheduler + */ + if ((ret = ip_vs_unbind_scheduler(svc))) { + old_sched = sched; + goto out; + } + + /* + * Bind the new scheduler + */ + if ((ret = ip_vs_bind_scheduler(svc, sched))) { + /* + * If ip_vs_bind_scheduler fails, restore the old + * scheduler. + * The main reason of failure is out of memory. + * + * The question is if the old scheduler can be + * restored all the time. TODO: if it cannot be + * restored some time, we must delete the service, + * otherwise the system may crash. + */ + ip_vs_bind_scheduler(svc, old_sched); + old_sched = sched; + goto out; + } + } + + out: + write_unlock_bh(&__ip_vs_svc_lock); + + if (old_sched) + ip_vs_scheduler_put(old_sched); + + return ret; +} + + +/* + * Delete a service from the service list + * - The service must be unlinked, unlocked and not referenced! + * - We are called under _bh lock + */ +static void __ip_vs_del_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *nxt; + struct ip_vs_scheduler *old_sched; + + ip_vs_num_services--; + ip_vs_kill_estimator(&svc->stats); + + /* Unbind scheduler */ + old_sched = svc->scheduler; + ip_vs_unbind_scheduler(svc); + if (old_sched) + ip_vs_scheduler_put(old_sched); + + /* Unbind app inc */ + if (svc->inc) { + ip_vs_app_inc_put(svc->inc); + svc->inc = NULL; + } + + /* + * Unlink the whole destination list + */ + list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { + __ip_vs_unlink_dest(svc, dest, 0); + __ip_vs_del_dest(dest); + } + + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_dec(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_dec(&ip_vs_nullsvc_counter); + + /* + * Free the service if nobody refers to it + */ + if (atomic_read(&svc->refcnt) == 0) + kfree(svc); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + + /* + * Unhash it from the service table + */ + write_lock_bh(&__ip_vs_svc_lock); + + ip_vs_svc_unhash(svc); + + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + __ip_vs_del_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + return 0; +} + + +/* + * Flush all the virtual services + */ +static int ip_vs_flush(void) +{ + int idx; + struct ip_vs_service *svc, *nxt; + + /* + * Flush the service table hashed by <protocol,addr,port> + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + /* + * Flush the service table hashed by fwmark + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, + &ip_vs_svc_fwm_table[idx], f_list) { + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + return 0; +} + + +/* + * Zero counters in a service or all services + */ +static int ip_vs_zero_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + + write_lock_bh(&__ip_vs_svc_lock); + list_for_each_entry(dest, &svc->destinations, n_list) { + ip_vs_zero_stats(&dest->stats); + } + ip_vs_zero_stats(&svc->stats); + write_unlock_bh(&__ip_vs_svc_lock); + return 0; +} + +static int ip_vs_zero_all(void) +{ + int idx; + struct ip_vs_service *svc; + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + ip_vs_zero_service(svc); + } + } + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + ip_vs_zero_service(svc); + } + } + + ip_vs_zero_stats(&ip_vs_stats); + return 0; +} + + +static int +proc_do_defense_mode(ctl_table *table, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 3)) { + /* Restore the correct value */ + *valp = val; + } else { + local_bh_disable(); + update_defense_level(); + local_bh_enable(); + } + } + return rc; +} + + +static int +proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val[2]; + int rc; + + /* backup the value first */ + memcpy(val, valp, sizeof(val)); + + rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + /* Restore the correct value */ + memcpy(valp, val, sizeof(val)); + } + return rc; +} + + +/* + * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + */ + +static struct ctl_table vs_vars[] = { + { + .ctl_name = NET_IPV4_VS_AMEMTHRESH, + .procname = "amemthresh", + .data = &sysctl_ip_vs_amemthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_IP_VS_DEBUG + { + .ctl_name = NET_IPV4_VS_DEBUG_LEVEL, + .procname = "debug_level", + .data = &sysctl_ip_vs_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = NET_IPV4_VS_AMDROPRATE, + .procname = "am_droprate", + .data = &sysctl_ip_vs_am_droprate, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_VS_DROP_ENTRY, + .procname = "drop_entry", + .data = &sysctl_ip_vs_drop_entry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, + { + .ctl_name = NET_IPV4_VS_DROP_PACKET, + .procname = "drop_packet", + .data = &sysctl_ip_vs_drop_packet, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, + { + .ctl_name = NET_IPV4_VS_SECURE_TCP, + .procname = "secure_tcp", + .data = &sysctl_ip_vs_secure_tcp, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, +#if 0 + { + .ctl_name = NET_IPV4_VS_TO_ES, + .procname = "timeout_established", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_SS, + .procname = "timeout_synsent", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_SR, + .procname = "timeout_synrecv", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_FW, + .procname = "timeout_finwait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_TW, + .procname = "timeout_timewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_CL, + .procname = "timeout_close", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_CW, + .procname = "timeout_closewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_LA, + .procname = "timeout_lastack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_LI, + .procname = "timeout_listen", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_SA, + .procname = "timeout_synack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_UDP, + .procname = "timeout_udp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_ICMP, + .procname = "timeout_icmp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, +#endif + { + .ctl_name = NET_IPV4_VS_CACHE_BYPASS, + .procname = "cache_bypass", + .data = &sysctl_ip_vs_cache_bypass, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN, + .procname = "expire_nodest_conn", + .data = &sysctl_ip_vs_expire_nodest_conn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE, + .procname = "expire_quiescent_template", + .data = &sysctl_ip_vs_expire_quiescent_template, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD, + .procname = "sync_threshold", + .data = &sysctl_ip_vs_sync_threshold, + .maxlen = sizeof(sysctl_ip_vs_sync_threshold), + .mode = 0644, + .proc_handler = &proc_do_sync_threshold, + }, + { + .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND, + .procname = "nat_icmp_send", + .data = &sysctl_ip_vs_nat_icmp_send, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table vs_table[] = { + { + .ctl_name = NET_IPV4_VS, + .procname = "vs", + .mode = 0555, + .child = vs_vars + }, + { .ctl_name = 0 } +}; + +static ctl_table ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = vs_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table vs_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipv4_table, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sysctl_header; + +#ifdef CONFIG_PROC_FS + +struct ip_vs_iter { + struct list_head *table; + int bucket; +}; + +/* + * Write the contents of the VS rule table to a PROCfs file. + * (It is kept just for backward compatibility) + */ +static inline const char *ip_vs_fwd_name(unsigned flags) +{ + switch (flags & IP_VS_CONN_F_FWD_MASK) { + case IP_VS_CONN_F_LOCALNODE: + return "Local"; + case IP_VS_CONN_F_TUNNEL: + return "Tunnel"; + case IP_VS_CONN_F_DROUTE: + return "Route"; + default: + return "Masq"; + } +} + + +/* Get the Nth entry in the two lists */ +static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) +{ + struct ip_vs_iter *iter = seq->private; + int idx; + struct ip_vs_service *svc; + + /* look in hash by protocol */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (pos-- == 0){ + iter->table = ip_vs_svc_table; + iter->bucket = idx; + return svc; + } + } + } + + /* keep looking in fwmark */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (pos-- == 0) { + iter->table = ip_vs_svc_fwm_table; + iter->bucket = idx; + return svc; + } + } + } + + return NULL; +} + +static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) +{ + + read_lock_bh(&__ip_vs_svc_lock); + return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; +} + + +static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *e; + struct ip_vs_iter *iter; + struct ip_vs_service *svc; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_info_array(seq,0); + + svc = v; + iter = seq->private; + + if (iter->table == ip_vs_svc_table) { + /* next service in table hashed by protocol */ + if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, s_list); + + + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], + s_list) { + return svc; + } + } + + iter->table = ip_vs_svc_fwm_table; + iter->bucket = -1; + goto scan_fwmark; + } + + /* next service in hashed by fwmark */ + if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, f_list); + + scan_fwmark: + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], + f_list) + return svc; + } + + return NULL; +} + +static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&__ip_vs_svc_lock); +} + + +static int ip_vs_info_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "IP Virtual Server version %d.%d.%d (size=%d)\n", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + seq_puts(seq, + "Prot LocalAddress:Port Scheduler Flags\n"); + seq_puts(seq, + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); + } else { + const struct ip_vs_service *svc = v; + const struct ip_vs_iter *iter = seq->private; + const struct ip_vs_dest *dest; + + if (iter->table == ip_vs_svc_table) + seq_printf(seq, "%s %08X:%04X %s ", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr), + ntohs(svc->port), + svc->scheduler->name); + else + seq_printf(seq, "FWM %08X %s ", + svc->fwmark, svc->scheduler->name); + + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + seq_printf(seq, "persistent %d %08X\n", + svc->timeout, + ntohl(svc->netmask)); + else + seq_putc(seq, '\n'); + + list_for_each_entry(dest, &svc->destinations, n_list) { + seq_printf(seq, + " -> %08X:%04X %-7s %-6d %-10d %-10d\n", + ntohl(dest->addr), ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + } + } + return 0; +} + +static struct seq_operations ip_vs_info_seq_ops = { + .start = ip_vs_info_seq_start, + .next = ip_vs_info_seq_next, + .stop = ip_vs_info_seq_stop, + .show = ip_vs_info_seq_show, +}; + +static int ip_vs_info_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &ip_vs_info_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations ip_vs_info_fops = { + .owner = THIS_MODULE, + .open = ip_vs_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif + +struct ip_vs_stats ip_vs_stats; + +#ifdef CONFIG_PROC_FS +static int ip_vs_stats_show(struct seq_file *seq, void *v) +{ + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + " Conns Packets Packets Bytes Bytes\n"); + + spin_lock_bh(&ip_vs_stats.lock); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns, + ip_vs_stats.inpkts, ip_vs_stats.outpkts, + (unsigned long long) ip_vs_stats.inbytes, + (unsigned long long) ip_vs_stats.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq,"%8X %8X %8X %16X %16X\n", + ip_vs_stats.cps, + ip_vs_stats.inpps, + ip_vs_stats.outpps, + ip_vs_stats.inbps, + ip_vs_stats.outbps); + spin_unlock_bh(&ip_vs_stats.lock); + + return 0; +} + +static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, ip_vs_stats_show, NULL); +} + +static struct file_operations ip_vs_stats_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +/* + * Set timeout values for tcp tcpfin udp in the timeout_table. + */ +static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) +{ + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", + u->tcp_timeout, + u->tcp_fin_timeout, + u->udp_timeout); + +#ifdef CONFIG_IP_VS_PROTO_TCP + if (u->tcp_timeout) { + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] + = u->tcp_timeout * HZ; + } + + if (u->tcp_fin_timeout) { + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] + = u->tcp_fin_timeout * HZ; + } +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP + if (u->udp_timeout) { + ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] + = u->udp_timeout * HZ; + } +#endif + return 0; +} + + +#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) +#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ + sizeof(struct ip_vs_dest_user)) +#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) +#define MAX_ARG_LEN SVCDEST_ARG_LEN + +static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { + [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, + [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, +}; + +static int +do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + unsigned char arg[MAX_ARG_LEN]; + struct ip_vs_service_user *usvc; + struct ip_vs_service *svc; + struct ip_vs_dest_user *udest; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (len != set_arglen[SET_CMDID(cmd)]) { + IP_VS_ERR("set_ctl: len %u != %u\n", + len, set_arglen[SET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, len) != 0) + return -EFAULT; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + if (down_interruptible(&__ip_vs_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + + if (cmd == IP_VS_SO_SET_FLUSH) { + /* Flush the virtual service */ + ret = ip_vs_flush(); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_TIMEOUT) { + /* Set timeout values for (tcp tcpfin udp) */ + ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = stop_sync_thread(dm->state); + goto out_unlock; + } + + usvc = (struct ip_vs_service_user *)arg; + udest = (struct ip_vs_dest_user *)(usvc + 1); + + if (cmd == IP_VS_SO_SET_ZERO) { + /* if no service address is set, zero counters in all */ + if (!usvc->fwmark && !usvc->addr && !usvc->port) { + ret = ip_vs_zero_all(); + goto out_unlock; + } + } + + /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ + if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) { + IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", + usvc->protocol, NIPQUAD(usvc->addr), + ntohs(usvc->port), usvc->sched_name); + ret = -EFAULT; + goto out_unlock; + } + + /* Lookup the exact service by <protocol, addr, port> or fwmark */ + if (usvc->fwmark == 0) + svc = __ip_vs_service_get(usvc->protocol, + usvc->addr, usvc->port); + else + svc = __ip_vs_svc_fwm_get(usvc->fwmark); + + if (cmd != IP_VS_SO_SET_ADD + && (svc == NULL || svc->protocol != usvc->protocol)) { + ret = -ESRCH; + goto out_unlock; + } + + switch (cmd) { + case IP_VS_SO_SET_ADD: + if (svc != NULL) + ret = -EEXIST; + else + ret = ip_vs_add_service(usvc, &svc); + break; + case IP_VS_SO_SET_EDIT: + ret = ip_vs_edit_service(svc, usvc); + break; + case IP_VS_SO_SET_DEL: + ret = ip_vs_del_service(svc); + if (!ret) + goto out_unlock; + break; + case IP_VS_SO_SET_ZERO: + ret = ip_vs_zero_service(svc); + break; + case IP_VS_SO_SET_ADDDEST: + ret = ip_vs_add_dest(svc, udest); + break; + case IP_VS_SO_SET_EDITDEST: + ret = ip_vs_edit_dest(svc, udest); + break; + case IP_VS_SO_SET_DELDEST: + ret = ip_vs_del_dest(svc, udest); + break; + default: + ret = -EINVAL; + } + + if (svc) + ip_vs_service_put(svc); + + out_unlock: + up(&__ip_vs_mutex); + out_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +static void +ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +{ + spin_lock_bh(&src->lock); + memcpy(dst, src, (char*)&src->lock - (char*)src); + spin_unlock_bh(&src->lock); +} + +static void +ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) +{ + dst->protocol = src->protocol; + dst->addr = src->addr; + dst->port = src->port; + dst->fwmark = src->fwmark; + strcpy(dst->sched_name, src->scheduler->name); + dst->flags = src->flags; + dst->timeout = src->timeout / HZ; + dst->netmask = src->netmask; + dst->num_dests = src->num_dests; + ip_vs_copy_stats(&dst->stats, &src->stats); +} + +static inline int +__ip_vs_get_service_entries(const struct ip_vs_get_services *get, + struct ip_vs_get_services __user *uptr) +{ + int idx, count=0; + struct ip_vs_service *svc; + struct ip_vs_service_entry entry; + int ret = 0; + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (count >= get->num_services) + goto out; + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (count >= get->num_services) + goto out; + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + out: + return ret; +} + +static inline int +__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, + struct ip_vs_get_dests __user *uptr) +{ + struct ip_vs_service *svc; + int ret = 0; + + if (get->fwmark) + svc = __ip_vs_svc_fwm_get(get->fwmark); + else + svc = __ip_vs_service_get(get->protocol, + get->addr, get->port); + if (svc) { + int count = 0; + struct ip_vs_dest *dest; + struct ip_vs_dest_entry entry; + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (count >= get->num_dests) + break; + + entry.addr = dest->addr; + entry.port = dest->port; + entry.conn_flags = atomic_read(&dest->conn_flags); + entry.weight = atomic_read(&dest->weight); + entry.u_threshold = dest->u_threshold; + entry.l_threshold = dest->l_threshold; + entry.activeconns = atomic_read(&dest->activeconns); + entry.inactconns = atomic_read(&dest->inactconns); + entry.persistconns = atomic_read(&dest->persistconns); + ip_vs_copy_stats(&entry.stats, &dest->stats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + break; + } + count++; + } + ip_vs_service_put(svc); + } else + ret = -ESRCH; + return ret; +} + +static inline void +__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) +{ +#ifdef CONFIG_IP_VS_PROTO_TCP + u->tcp_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + u->udp_timeout = + ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; +#endif +} + + +#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) +#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) +#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) +#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) +#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) + +static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { + [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, + [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, +}; + +static int +do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + unsigned char arg[128]; + int ret = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (*len < get_arglen[GET_CMDID(cmd)]) { + IP_VS_ERR("get_ctl: len %u < %u\n", + *len, get_arglen[GET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) + return -EFAULT; + + if (down_interruptible(&__ip_vs_mutex)) + return -ERESTARTSYS; + + switch (cmd) { + case IP_VS_SO_GET_VERSION: + { + char buf[64]; + + sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + if (copy_to_user(user, buf, strlen(buf)+1) != 0) { + ret = -EFAULT; + goto out; + } + *len = strlen(buf)+1; + } + break; + + case IP_VS_SO_GET_INFO: + { + struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; + info.size = IP_VS_CONN_TAB_SIZE; + info.num_services = ip_vs_num_services; + if (copy_to_user(user, &info, sizeof(info)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_SERVICES: + { + struct ip_vs_get_services *get; + int size; + + get = (struct ip_vs_get_services *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_service_entry) * get->num_services; + if (*len != size) { + IP_VS_ERR("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_service_entries(get, user); + } + break; + + case IP_VS_SO_GET_SERVICE: + { + struct ip_vs_service_entry *entry; + struct ip_vs_service *svc; + + entry = (struct ip_vs_service_entry *)arg; + if (entry->fwmark) + svc = __ip_vs_svc_fwm_get(entry->fwmark); + else + svc = __ip_vs_service_get(entry->protocol, + entry->addr, entry->port); + if (svc) { + ip_vs_copy_service(entry, svc); + if (copy_to_user(user, entry, sizeof(*entry)) != 0) + ret = -EFAULT; + ip_vs_service_put(svc); + } else + ret = -ESRCH; + } + break; + + case IP_VS_SO_GET_DESTS: + { + struct ip_vs_get_dests *get; + int size; + + get = (struct ip_vs_get_dests *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_dest_entry) * get->num_dests; + if (*len != size) { + IP_VS_ERR("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_dest_entries(get, user); + } + break; + + case IP_VS_SO_GET_TIMEOUT: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); + if (copy_to_user(user, &t, sizeof(t)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_DAEMON: + { + struct ip_vs_daemon_user d[2]; + + memset(&d, 0, sizeof(d)); + if (ip_vs_sync_state & IP_VS_STATE_MASTER) { + d[0].state = IP_VS_STATE_MASTER; + strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn); + d[0].syncid = ip_vs_master_syncid; + } + if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { + d[1].state = IP_VS_STATE_BACKUP; + strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn); + d[1].syncid = ip_vs_backup_syncid; + } + if (copy_to_user(user, &d, sizeof(d)) != 0) + ret = -EFAULT; + } + break; + + default: + ret = -EINVAL; + } + + out: + up(&__ip_vs_mutex); + return ret; +} + + +static struct nf_sockopt_ops ip_vs_sockopts = { + .pf = PF_INET, + .set_optmin = IP_VS_BASE_CTL, + .set_optmax = IP_VS_SO_SET_MAX+1, + .set = do_ip_vs_set_ctl, + .get_optmin = IP_VS_BASE_CTL, + .get_optmax = IP_VS_SO_GET_MAX+1, + .get = do_ip_vs_get_ctl, +}; + + +int ip_vs_control_init(void) +{ + int ret; + int idx; + + EnterFunction(2); + + ret = nf_register_sockopt(&ip_vs_sockopts); + if (ret) { + IP_VS_ERR("cannot register sockopt.\n"); + return ret; + } + + proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops); + proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops); + + sysctl_header = register_sysctl_table(vs_root_table, 0); + + /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_rtable[idx]); + } + + memset(&ip_vs_stats, 0, sizeof(ip_vs_stats)); + spin_lock_init(&ip_vs_stats.lock); + ip_vs_new_estimator(&ip_vs_stats); + + /* Hook the defense timer */ + schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); + + LeaveFunction(2); + return 0; +} + + +void ip_vs_control_cleanup(void) +{ + EnterFunction(2); + ip_vs_trash_cleanup(); + cancel_rearming_delayed_work(&defense_work); + ip_vs_kill_estimator(&ip_vs_stats); + unregister_sysctl_table(sysctl_header); + proc_net_remove("ip_vs_stats"); + proc_net_remove("ip_vs"); + nf_unregister_sockopt(&ip_vs_sockopts); + LeaveFunction(2); +} diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c new file mode 100644 index 000000000000..f3bc320dce93 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -0,0 +1,258 @@ +/* + * IPVS: Destination Hashing scheduling module + * + * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * Inspired by the consistent hashing scheduler patch from + * Thomas Proell <proellt@gmx.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The dh algorithm is to select server by the hash key of destination IP + * address. The pseudo code is as follows: + * + * n <- servernode[dest_ip]; + * if (n is dead) OR + * (n is overloaded) OR (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet destination IP address to the current server + * array. If the dh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + + +/* + * IPVS DH bucket + */ +struct ip_vs_dh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS DH entry hash table + */ +#ifndef CONFIG_IP_VS_DH_TAB_BITS +#define CONFIG_IP_VS_DH_TAB_BITS 8 +#endif +#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS +#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) +#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS DH entry + */ +static inline unsigned ip_vs_dh_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr) +{ + return (tbl[ip_vs_dh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_dh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { + if (list_empty(p)) { + b->dest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +{ + int i; + struct ip_vs_dh_bucket *b; + + b = tbl; + for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { + if (b->dest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_dh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl; + + /* allocate the DH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Destination hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_dh_bucket *tbl; + struct iphdr *iph = skb->nh.iph; + + IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_dh_bucket *)svc->sched_data; + dest = ip_vs_dh_get(tbl, iph->daddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS DH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_dh_scheduler = +{ + .name = "dh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_dh_init_svc, + .done_service = ip_vs_dh_done_svc, + .update_service = ip_vs_dh_update_svc, + .schedule = ip_vs_dh_schedule, +}; + + +static int __init ip_vs_dh_init(void) +{ + INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +static void __exit ip_vs_dh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +module_init(ip_vs_dh_init); +module_exit(ip_vs_dh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c new file mode 100644 index 000000000000..67b3e2fc1fa1 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -0,0 +1,200 @@ +/* + * ip_vs_est.c: simple rate estimator for IPVS + * + * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ +#include <linux/kernel.h> +#include <linux/types.h> + +#include <net/ip_vs.h> + +/* + This code is to estimate rate in a shorter interval (such as 8 + seconds) for virtual services and real servers. For measure rate in a + long interval, it is easy to implement a user level daemon which + periodically reads those statistical counters and measure rate. + + Currently, the measurement is activated by slow timer handler. Hope + this measurement will not introduce too much load. + + We measure rate during the last 8 seconds every 2 seconds: + + avgrate = avgrate*(1-W) + rate*W + + where W = 2^(-2) + + NOTES. + + * The stored value for average bps is scaled by 2^5, so that maximal + rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. + + * A lot code is taken from net/sched/estimator.c + */ + + +struct ip_vs_estimator +{ + struct ip_vs_estimator *next; + struct ip_vs_stats *stats; + + u32 last_conns; + u32 last_inpkts; + u32 last_outpkts; + u64 last_inbytes; + u64 last_outbytes; + + u32 cps; + u32 inpps; + u32 outpps; + u32 inbps; + u32 outbps; +}; + + +static struct ip_vs_estimator *est_list = NULL; +static DEFINE_RWLOCK(est_lock); +static struct timer_list est_timer; + +static void estimation_timer(unsigned long arg) +{ + struct ip_vs_estimator *e; + struct ip_vs_stats *s; + u32 n_conns; + u32 n_inpkts, n_outpkts; + u64 n_inbytes, n_outbytes; + u32 rate; + + read_lock(&est_lock); + for (e = est_list; e; e = e->next) { + s = e->stats; + + spin_lock(&s->lock); + n_conns = s->conns; + n_inpkts = s->inpkts; + n_outpkts = s->outpkts; + n_inbytes = s->inbytes; + n_outbytes = s->outbytes; + + /* scaled by 2^10, but divided 2 seconds */ + rate = (n_conns - e->last_conns)<<9; + e->last_conns = n_conns; + e->cps += ((long)rate - (long)e->cps)>>2; + s->cps = (e->cps+0x1FF)>>10; + + rate = (n_inpkts - e->last_inpkts)<<9; + e->last_inpkts = n_inpkts; + e->inpps += ((long)rate - (long)e->inpps)>>2; + s->inpps = (e->inpps+0x1FF)>>10; + + rate = (n_outpkts - e->last_outpkts)<<9; + e->last_outpkts = n_outpkts; + e->outpps += ((long)rate - (long)e->outpps)>>2; + s->outpps = (e->outpps+0x1FF)>>10; + + rate = (n_inbytes - e->last_inbytes)<<4; + e->last_inbytes = n_inbytes; + e->inbps += ((long)rate - (long)e->inbps)>>2; + s->inbps = (e->inbps+0xF)>>5; + + rate = (n_outbytes - e->last_outbytes)<<4; + e->last_outbytes = n_outbytes; + e->outbps += ((long)rate - (long)e->outbps)>>2; + s->outbps = (e->outbps+0xF)>>5; + spin_unlock(&s->lock); + } + read_unlock(&est_lock); + mod_timer(&est_timer, jiffies + 2*HZ); +} + +int ip_vs_new_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est; + + est = kmalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOMEM; + + memset(est, 0, sizeof(*est)); + est->stats = stats; + est->last_conns = stats->conns; + est->cps = stats->cps<<10; + + est->last_inpkts = stats->inpkts; + est->inpps = stats->inpps<<10; + + est->last_outpkts = stats->outpkts; + est->outpps = stats->outpps<<10; + + est->last_inbytes = stats->inbytes; + est->inbps = stats->inbps<<5; + + est->last_outbytes = stats->outbytes; + est->outbps = stats->outbps<<5; + + write_lock_bh(&est_lock); + est->next = est_list; + if (est->next == NULL) { + init_timer(&est_timer); + est_timer.expires = jiffies + 2*HZ; + est_timer.function = estimation_timer; + add_timer(&est_timer); + } + est_list = est; + write_unlock_bh(&est_lock); + return 0; +} + +void ip_vs_kill_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est, **pest; + int killed = 0; + + write_lock_bh(&est_lock); + pest = &est_list; + while ((est=*pest) != NULL) { + if (est->stats != stats) { + pest = &est->next; + continue; + } + *pest = est->next; + kfree(est); + killed++; + } + if (killed && est_list == NULL) + del_timer_sync(&est_timer); + write_unlock_bh(&est_lock); +} + +void ip_vs_zero_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *e; + + write_lock_bh(&est_lock); + for (e = est_list; e; e = e->next) { + if (e->stats != stats) + continue; + + /* set counters zero */ + e->last_conns = 0; + e->last_inpkts = 0; + e->last_outpkts = 0; + e->last_inbytes = 0; + e->last_outbytes = 0; + e->cps = 0; + e->inpps = 0; + e->outpps = 0; + e->inbps = 0; + e->outbps = 0; + } + write_unlock_bh(&est_lock); +} diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c new file mode 100644 index 000000000000..a19a33ceb811 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -0,0 +1,400 @@ +/* + * ip_vs_ftp.c: IPVS ftp application module + * + * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * Changes: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference + * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. + * + * IP_MASQ_FTP ftp masquerading module + * + * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 + * + * Author: Wouter Gadeyne + * + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <net/protocol.h> +#include <net/tcp.h> + +#include <net/ip_vs.h> + + +#define SERVER_STRING "227 Entering Passive Mode (" +#define CLIENT_STRING "PORT " + + +/* + * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper + * First port is set to the default port. + */ +static int ports[IP_VS_APP_MAX_PORTS] = {21, 0}; +module_param_array(ports, int, NULL, 0); + +/* + * Debug level + */ +#ifdef CONFIG_IP_VS_DEBUG +static int debug=0; +module_param(debug, int, 0); +#endif + + +/* Dummy variable */ +static int ip_vs_ftp_pasv; + + +static int +ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +static int +ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +/* + * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern" and terminated with the "term" character. + * <addr,port> is in network order. + */ +static int ip_vs_ftp_get_addrport(char *data, char *data_limit, + const char *pattern, size_t plen, char term, + __u32 *addr, __u16 *port, + char **start, char **end) +{ + unsigned char p[6]; + int i = 0; + + if (data_limit - data < plen) { + /* check if there is partial match */ + if (strnicmp(data, pattern, data_limit - data) == 0) + return -1; + else + return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { + return 0; + } + *start = data + plen; + + for (data = *start; *data != term; data++) { + if (data == data_limit) + return -1; + } + *end = data; + + memset(p, 0, sizeof(p)); + for (data = *start; data != *end; data++) { + if (*data >= '0' && *data <= '9') { + p[i] = p[i]*10 + *data - '0'; + } else if (*data == ',' && i < 5) { + i++; + } else { + /* unexpected character */ + return -1; + } + } + + if (i != 5) + return -1; + + *addr = (p[3]<<24) | (p[2]<<16) | (p[1]<<8) | p[0]; + *port = (p[5]<<8) | p[4]; + return 1; +} + + +/* + * Look at outgoing ftp packets to catch the response to a PASV command + * from the server (inside-to-outside). + * When we see one, we build a connection entry with the client address, + * client port 0 (unknown at the moment), the server address and the + * server port. Mark the current connection entry as a control channel + * of the new entry. All this work is just to make the data connection + * can be scheduled to the right server later. + * + * The outgoing packet should be something like + * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". + * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + */ +static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff **pskb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + char *start, *end; + __u32 from; + __u16 port; + struct ip_vs_conn *n_cp; + char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ + unsigned buf_len; + int ret; + + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. */ + if (!ip_vs_make_skb_writable(pskb, (*pskb)->len)) + return 0; + + if (cp->app_data == &ip_vs_ftp_pasv) { + iph = (*pskb)->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)th + (th->doff << 2); + data_limit = (*pskb)->tail; + + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING, + sizeof(SERVER_STRING)-1, ')', + &from, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> " + "%u.%u.%u.%u:%d detected\n", + NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0); + + /* + * Now update or create an connection entry for it + */ + n_cp = ip_vs_conn_out_get(iph->protocol, from, port, + cp->caddr, 0); + if (!n_cp) { + n_cp = ip_vs_conn_new(IPPROTO_TCP, + cp->caddr, 0, + cp->vaddr, port, + from, port, + IP_VS_CONN_F_NO_CPORT, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Replace the old passive address with the new one + */ + from = n_cp->vaddr; + port = n_cp->vport; + sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), + port&255, (port>>8)&255); + buf_len = strlen(buf); + + /* + * Calculate required delta-offset to keep TCP happy + */ + *diff = buf_len - (end-start); + + if (*diff == 0) { + /* simply replace it with new passive address */ + memcpy(start, buf, buf_len); + ret = 1; + } else { + ret = !ip_vs_skb_replace(*pskb, GFP_ATOMIC, start, + end-start, buf, buf_len); + } + + cp->app_data = NULL; + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + return ret; + } + return 1; +} + + +/* + * Look at incoming ftp packets to catch the PASV/PORT command + * (outside-to-inside). + * + * The incoming packet having the PORT command should be something like + * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". + * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. + * In this case, we create a connection entry using the client address and + * port, so that the active ftp data connection from the server can reach + * the client. + */ +static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff **pskb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_start, *data_limit; + char *start, *end; + __u32 to; + __u16 port; + struct ip_vs_conn *n_cp; + + /* no diff required for incoming packets */ + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. */ + if (!ip_vs_make_skb_writable(pskb, (*pskb)->len)) + return 0; + + /* + * Detecting whether it is passive + */ + iph = (*pskb)->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* Since there may be OPTIONS in the TCP packet and the HLEN is + the length of the header in 32-bit multiples, it is accurate + to calculate data address by th+HLEN*4 */ + data = data_start = (char *)th + (th->doff << 2); + data_limit = (*pskb)->tail; + + while (data <= data_limit - 6) { + if (strnicmp(data, "PASV\r\n", 6) == 0) { + /* Passive mode on */ + IP_VS_DBG(1-debug, "got PASV at %zd of %zd\n", + data - data_start, + data_limit - data_start); + cp->app_data = &ip_vs_ftp_pasv; + return 1; + } + data++; + } + + /* + * To support virtual FTP server, the scenerio is as follows: + * FTP client ----> Load Balancer ----> FTP server + * First detect the port number in the application data, + * then create a new connection entry for the coming data + * connection. + */ + if (ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING, sizeof(CLIENT_STRING)-1, + '\r', &to, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n", + NIPQUAD(to), ntohs(port)); + + /* Passive mode off */ + cp->app_data = NULL; + + /* + * Now update or create a connection entry for it + */ + IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0); + + n_cp = ip_vs_conn_in_get(iph->protocol, + to, port, + cp->vaddr, htons(ntohs(cp->vport)-1)); + if (!n_cp) { + n_cp = ip_vs_conn_new(IPPROTO_TCP, + to, port, + cp->vaddr, htons(ntohs(cp->vport)-1), + cp->daddr, htons(ntohs(cp->dport)-1), + 0, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Move tunnel to listen state + */ + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + + return 1; +} + + +static struct ip_vs_app ip_vs_ftp = { + .name = "ftp", + .type = IP_VS_APP_TYPE_FTP, + .protocol = IPPROTO_TCP, + .module = THIS_MODULE, + .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), + .init_conn = ip_vs_ftp_init_conn, + .done_conn = ip_vs_ftp_done_conn, + .bind_conn = NULL, + .unbind_conn = NULL, + .pkt_out = ip_vs_ftp_out, + .pkt_in = ip_vs_ftp_in, +}; + + +/* + * ip_vs_ftp initialization + */ +static int __init ip_vs_ftp_init(void) +{ + int i, ret; + struct ip_vs_app *app = &ip_vs_ftp; + + ret = register_ip_vs_app(app); + if (ret) + return ret; + + for (i=0; i<IP_VS_APP_MAX_PORTS; i++) { + if (!ports[i]) + continue; + ret = register_ip_vs_app_inc(app, app->protocol, ports[i]); + if (ret) + break; + IP_VS_DBG(1-debug, "%s: loaded support on port[%d] = %d\n", + app->name, i, ports[i]); + } + + if (ret) + unregister_ip_vs_app(app); + + return ret; +} + + +/* + * ip_vs_ftp finish. + */ +static void __exit ip_vs_ftp_exit(void) +{ + unregister_ip_vs_app(&ip_vs_ftp); +} + + +module_init(ip_vs_ftp_init); +module_exit(ip_vs_ftp_exit); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c new file mode 100644 index 000000000000..c035838b780a --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -0,0 +1,624 @@ +/* + * IPVS: Locality-Based Least-Connection scheduling module + * + * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Martin Hamilton : fixed the terrible locking bugs + * *lock(tbl->lock) ==> *lock(&tbl->lock) + * Wensong Zhang : fixed the uninitilized tbl->lock bug + * Wensong Zhang : added doing full expiration check to + * collect stale entries of 24+ hours when + * no partial expire check in a half hour + * Julian Anastasov : replaced del_timer call with del_timer_sync + * to avoid the possible race between timer + * handler and del_timer thread in SMP + * + */ + +/* + * The lblc algorithm is as follows (pseudo code): + * + * if cachenode[dest_ip] is null then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * else + * n <- cachenode[dest_ip]; + * if (n is dead) OR + * (n.conns>n.weight AND + * there is a node m with m.conns<m.weight/2) then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * + * return n; + * + * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing + * me to write this module. + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +/* for sysctl */ +#include <linux/fs.h> +#include <linux/sysctl.h> + +#include <net/ip_vs.h> + + +/* + * It is for garbage collection of stale IPVS lblc entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 +static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblc entry hash table + */ +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 +#endif +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) + + +/* + * IPVS lblc entry represents an association between destination + * IP address and its destination server + */ +struct ip_vs_lblc_entry { + struct list_head list; + __u32 addr; /* destination IP address */ + struct ip_vs_dest *dest; /* real server (cache) */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblc hash table + */ +struct ip_vs_lblc_table { + rwlock_t lock; /* lock for this table */ + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLC sysctl table + */ + +static ctl_table vs_vars_table[] = { + { + .ctl_name = NET_IPV4_VS_LBLC_EXPIRE, + .procname = "lblc_expiration", + .data = &sysctl_ip_vs_lblc_expiration, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table vs_table[] = { + { + .ctl_name = NET_IPV4_VS, + .procname = "vs", + .mode = 0555, + .child = vs_vars_table + }, + { .ctl_name = 0 } +}; + +static ctl_table ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = vs_table + }, + { .ctl_name = 0 } +}; + +static ctl_table lblc_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipv4_table + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sysctl_header; + +/* + * new/free a ip_vs_lblc_entry, which is a mapping of a destionation + * IP address to a server. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); + if (en == NULL) { + IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); + return NULL; + } + + INIT_LIST_HEAD(&en->list); + en->addr = daddr; + + atomic_inc(&dest->refcnt); + en->dest = dest; + + return en; +} + + +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +{ + list_del(&en->list); + /* + * We don't kfree dest because it is refered either by its service + * or the trash dest list. + */ + atomic_dec(&en->dest->refcnt); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLC entry + */ +static inline unsigned ip_vs_lblc_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblc_table. + * returns bool success. + */ +static int +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) +{ + unsigned hash; + + if (!list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Hash by destination IP address + */ + hash = ip_vs_lblc_hashkey(en->addr); + + write_lock(&tbl->lock); + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); + write_unlock(&tbl->lock); + + return 1; +} + + +#if 0000 +/* + * Unhash ip_vs_lblc_entry from ip_vs_lblc_table. + * returns bool success. + */ +static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl, + struct ip_vs_lblc_entry *en) +{ + if (list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Remove it from the table + */ + write_lock(&tbl->lock); + list_del(&en->list); + INIT_LIST_HEAD(&en->list); + write_unlock(&tbl->lock); + + return 1; +} +#endif + + +/* + * Get ip_vs_lblc_entry associated with supplied parameters. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr) +{ + unsigned hash; + struct ip_vs_lblc_entry *en; + + hash = ip_vs_lblc_hashkey(addr); + + read_lock(&tbl->lock); + + list_for_each_entry(en, &tbl->bucket[hash], list) { + if (en->addr == addr) { + /* HIT */ + read_unlock(&tbl->lock); + return en; + } + } + + read_unlock(&tbl->lock); + + return NULL; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +{ + int i; + struct ip_vs_lblc_entry *en, *nxt; + + for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { + write_lock(&tbl->lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } +} + + +static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) +{ + unsigned long now = jiffies; + int i, j; + struct ip_vs_lblc_entry *en, *nxt; + + for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLC_TAB_MASK; + + write_lock(&tbl->lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, + en->lastuse + sysctl_ip_vs_lblc_expiration)) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblc table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblc_check_expire(unsigned long data) +{ + struct ip_vs_lblc_table *tbl; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblc_entry *en, *nxt; + + tbl = (struct ip_vs_lblc_table *)data; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblc_full_check(tbl); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLC_TAB_MASK; + + write_lock(&tbl->lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&tbl->lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblc_table *tbl; + + /* + * Allocate the ip_vs_lblc_table for this service + */ + tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_lblc_table)); + + /* + * Initialize the hash buckets + */ + for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { + INIT_LIST_HEAD(&tbl->bucket[i]); + } + rwlock_init(&tbl->lock); + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + init_timer(&tbl->periodic_timer); + tbl->periodic_timer.data = (unsigned long)tbl; + tbl->periodic_timer.function = ip_vs_lblc_check_expire; + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; + add_timer(&tbl->periodic_timer); + + return 0; +} + + +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblc_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_lblc_table)); + + return 0; +} + + +static int ip_vs_lblc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_lblc_table *tbl; + struct ip_vs_lblc_entry *en; + struct iphdr *iph = skb->nh.iph; + + IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_lblc_table *)svc->sched_data; + en = ip_vs_lblc_get(tbl, iph->daddr); + if (en == NULL) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + en = ip_vs_lblc_new(iph->daddr, dest); + if (en == NULL) { + return NULL; + } + ip_vs_lblc_hash(tbl, en); + } else { + dest = en->dest; + if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest, svc)) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + atomic_dec(&en->dest->refcnt); + atomic_inc(&dest->refcnt); + en->dest = dest; + } + } + en->lastuse = jiffies; + + IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(en->addr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLC Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = +{ + .name = "lblc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_lblc_init_svc, + .done_service = ip_vs_lblc_done_svc, + .update_service = ip_vs_lblc_update_svc, + .schedule = ip_vs_lblc_schedule, +}; + + +static int __init ip_vs_lblc_init(void) +{ + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); + sysctl_header = register_sysctl_table(lblc_root_table, 0); + return register_ip_vs_scheduler(&ip_vs_lblc_scheduler); +} + + +static void __exit ip_vs_lblc_cleanup(void) +{ + unregister_sysctl_table(sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); +} + + +module_init(ip_vs_lblc_init); +module_exit(ip_vs_lblc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c new file mode 100644 index 000000000000..22b5dd55d271 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -0,0 +1,888 @@ +/* + * IPVS: Locality-Based Least-Connection with Replication scheduler + * + * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Julian Anastasov : Added the missing (dest->weight>0) + * condition in the ip_vs_dest_set_max. + * + */ + +/* + * The lblc/r algorithm is as follows (pseudo code): + * + * if serverSet[dest_ip] is null then + * n, serverSet[dest_ip] <- {weighted least-conn node}; + * else + * n <- {least-conn (alive) node in serverSet[dest_ip]}; + * if (n is null) OR + * (n.conns>n.weight AND + * there is a node m with m.conns<m.weight/2) then + * n <- {weighted least-conn node}; + * add n to serverSet[dest_ip]; + * if |serverSet[dest_ip]| > 1 AND + * now - serverSet[dest_ip].lastMod > T then + * m <- {most conn node in serverSet[dest_ip]}; + * remove m from serverSet[dest_ip]; + * if serverSet[dest_ip] changed then + * serverSet[dest_ip].lastMod <- now; + * + * return n; + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +/* for sysctl */ +#include <linux/fs.h> +#include <linux/sysctl.h> +/* for proc_net_create/proc_net_remove */ +#include <linux/proc_fs.h> + +#include <net/ip_vs.h> + + +/* + * It is for garbage collection of stale IPVS lblcr entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 +static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblcr entry hash table + */ +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 +#endif +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) + + +/* + * IPVS destination set structure and operations + */ +struct ip_vs_dest_list { + struct ip_vs_dest_list *next; /* list link */ + struct ip_vs_dest *dest; /* destination server */ +}; + +struct ip_vs_dest_set { + atomic_t size; /* set size */ + unsigned long lastmod; /* last modified time */ + struct ip_vs_dest_list *list; /* destination list */ + rwlock_t lock; /* lock for this list */ +}; + + +static struct ip_vs_dest_list * +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e; + + for (e=set->list; e!=NULL; e=e->next) { + if (e->dest == dest) + /* already existed */ + return NULL; + } + + e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC); + if (e == NULL) { + IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); + return NULL; + } + + atomic_inc(&dest->refcnt); + e->dest = dest; + + /* link it to the list */ + write_lock(&set->lock); + e->next = set->list; + set->list = e; + atomic_inc(&set->size); + write_unlock(&set->lock); + + set->lastmod = jiffies; + return e; +} + +static void +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + if (e->dest == dest) { + /* HIT */ + *ep = e->next; + atomic_dec(&set->size); + set->lastmod = jiffies; + atomic_dec(&e->dest->refcnt); + kfree(e); + break; + } + ep = &e->next; + } + write_unlock(&set->lock); +} + +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + *ep = e->next; + /* + * We don't kfree dest because it is refered either + * by its service or by the trash dest list. + */ + atomic_dec(&e->dest->refcnt); + kfree(e); + } + write_unlock(&set->lock); +} + +/* get weighted least-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + if (set == NULL) + return NULL; + + read_lock(&set->lock); + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + least = e->dest; + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if ((atomic_read(&least->weight) > 0) + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + read_unlock(&set->lock); + return NULL; + + /* find the destination with the weighted least load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if ((loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + least = dest; + loh = doh; + } + } + read_unlock(&set->lock); + + IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + return least; +} + + +/* get weighted most-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *most; + int moh, doh; + + if (set == NULL) + return NULL; + + read_lock(&set->lock); + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + most = e->dest; + if (atomic_read(&most->weight) > 0) { + moh = atomic_read(&most->activeconns) * 50 + + atomic_read(&most->inactconns); + goto nextstage; + } + } + read_unlock(&set->lock); + return NULL; + + /* find the destination with the weighted most load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ + if ((moh * atomic_read(&dest->weight) < + doh * atomic_read(&most->weight)) + && (atomic_read(&dest->weight) > 0)) { + most = dest; + moh = doh; + } + } + read_unlock(&set->lock); + + IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(most->addr), ntohs(most->port), + atomic_read(&most->activeconns), + atomic_read(&most->refcnt), + atomic_read(&most->weight), moh); + return most; +} + + +/* + * IPVS lblcr entry represents an association between destination + * IP address and its destination server set + */ +struct ip_vs_lblcr_entry { + struct list_head list; + __u32 addr; /* destination IP address */ + struct ip_vs_dest_set set; /* destination server set */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblcr hash table + */ +struct ip_vs_lblcr_table { + rwlock_t lock; /* lock for this table */ + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLCR sysctl table + */ + +static ctl_table vs_vars_table[] = { + { + .ctl_name = NET_IPV4_VS_LBLCR_EXPIRE, + .procname = "lblcr_expiration", + .data = &sysctl_ip_vs_lblcr_expiration, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table vs_table[] = { + { + .ctl_name = NET_IPV4_VS, + .procname = "vs", + .mode = 0555, + .child = vs_vars_table + }, + { .ctl_name = 0 } +}; + +static ctl_table ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = vs_table + }, + { .ctl_name = 0 } +}; + +static ctl_table lblcr_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipv4_table + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sysctl_header; + +/* + * new/free a ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. + */ +static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr) +{ + struct ip_vs_lblcr_entry *en; + + en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC); + if (en == NULL) { + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); + return NULL; + } + + INIT_LIST_HEAD(&en->list); + en->addr = daddr; + + /* initilize its dest set */ + atomic_set(&(en->set.size), 0); + en->set.list = NULL; + rwlock_init(&en->set.lock); + + return en; +} + + +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) +{ + list_del(&en->list); + ip_vs_dest_set_eraseall(&en->set); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLCR entry + */ +static inline unsigned ip_vs_lblcr_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblcr_table. + * returns bool success. + */ +static int +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) +{ + unsigned hash; + + if (!list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Hash by destination IP address + */ + hash = ip_vs_lblcr_hashkey(en->addr); + + write_lock(&tbl->lock); + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); + write_unlock(&tbl->lock); + + return 1; +} + + +#if 0000 +/* + * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table. + * returns bool success. + */ +static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl, + struct ip_vs_lblcr_entry *en) +{ + if (list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Remove it from the table + */ + write_lock(&tbl->lock); + list_del(&en->list); + INIT_LIST_HEAD(&en->list); + write_unlock(&tbl->lock); + + return 1; +} +#endif + + +/* + * Get ip_vs_lblcr_entry associated with supplied parameters. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr) +{ + unsigned hash; + struct ip_vs_lblcr_entry *en; + + hash = ip_vs_lblcr_hashkey(addr); + + read_lock(&tbl->lock); + + list_for_each_entry(en, &tbl->bucket[hash], list) { + if (en->addr == addr) { + /* HIT */ + read_unlock(&tbl->lock); + return en; + } + } + + read_unlock(&tbl->lock); + + return NULL; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +{ + int i; + struct ip_vs_lblcr_entry *en, *nxt; + + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { + write_lock(&tbl->lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } +} + + +static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) +{ + unsigned long now = jiffies; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLCR_TAB_MASK; + + write_lock(&tbl->lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, + now)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblcr table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblcr_check_expire(unsigned long data) +{ + struct ip_vs_lblcr_table *tbl; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + tbl = (struct ip_vs_lblcr_table *)data; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblcr_full_check(tbl); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLCR_TAB_MASK; + + write_lock(&tbl->lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&tbl->lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +#ifdef CONFIG_IP_VS_LBLCR_DEBUG +static struct ip_vs_lblcr_table *lblcr_table_list; + +/* + * /proc/net/ip_vs_lblcr to display the mappings of + * destination IP address <==> its serverSet + */ +static int +ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length) +{ + off_t pos=0, begin; + int len=0, size; + struct ip_vs_lblcr_table *tbl; + unsigned long now = jiffies; + int i; + struct ip_vs_lblcr_entry *en; + + tbl = lblcr_table_list; + + size = sprintf(buffer, "LastTime Dest IP address Server set\n"); + pos += size; + len += size; + + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { + read_lock_bh(&tbl->lock); + list_for_each_entry(en, &tbl->bucket[i], list) { + char tbuf[16]; + struct ip_vs_dest_list *d; + + sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr)); + size = sprintf(buffer+len, "%8lu %-16s ", + now-en->lastuse, tbuf); + + read_lock(&en->set.lock); + for (d=en->set.list; d!=NULL; d=d->next) { + size += sprintf(buffer+len+size, + "%u.%u.%u.%u ", + NIPQUAD(d->dest->addr)); + } + read_unlock(&en->set.lock); + size += sprintf(buffer+len+size, "\n"); + len += size; + pos += size; + if (pos <= offset) + len=0; + if (pos >= offset+length) { + read_unlock_bh(&tbl->lock); + goto done; + } + } + read_unlock_bh(&tbl->lock); + } + + done: + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + if(len>length) + len = length; + return len; +} +#endif + + +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblcr_table *tbl; + + /* + * Allocate the ip_vs_lblcr_table for this service + */ + tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_lblcr_table)); + + /* + * Initialize the hash buckets + */ + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { + INIT_LIST_HEAD(&tbl->bucket[i]); + } + rwlock_init(&tbl->lock); + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + init_timer(&tbl->periodic_timer); + tbl->periodic_timer.data = (unsigned long)tbl; + tbl->periodic_timer.function = ip_vs_lblcr_check_expire; + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; + add_timer(&tbl->periodic_timer); + +#ifdef CONFIG_IP_VS_LBLCR_DEBUG + lblcr_table_list = tbl; +#endif + return 0; +} + + +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblcr_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_lblcr_table)); + + return 0; +} + + +static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_lblcr_table *tbl; + struct ip_vs_lblcr_entry *en; + struct iphdr *iph = skb->nh.iph; + + IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_lblcr_table *)svc->sched_data; + en = ip_vs_lblcr_get(tbl, iph->daddr); + if (en == NULL) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + en = ip_vs_lblcr_new(iph->daddr); + if (en == NULL) { + return NULL; + } + ip_vs_dest_set_insert(&en->set, dest); + ip_vs_lblcr_hash(tbl, en); + } else { + dest = ip_vs_dest_set_min(&en->set); + if (!dest || is_overloaded(dest, svc)) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + ip_vs_dest_set_insert(&en->set, dest); + } + if (atomic_read(&en->set.size) > 1 && + jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { + struct ip_vs_dest *m; + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + } + } + en->lastuse = jiffies; + + IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(en->addr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLCR Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = +{ + .name = "lblcr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_lblcr_init_svc, + .done_service = ip_vs_lblcr_done_svc, + .update_service = ip_vs_lblcr_update_svc, + .schedule = ip_vs_lblcr_schedule, +}; + + +static int __init ip_vs_lblcr_init(void) +{ + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); + sysctl_header = register_sysctl_table(lblcr_root_table, 0); +#ifdef CONFIG_IP_VS_LBLCR_DEBUG + proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo); +#endif + return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); +} + + +static void __exit ip_vs_lblcr_cleanup(void) +{ +#ifdef CONFIG_IP_VS_LBLCR_DEBUG + proc_net_remove("ip_vs_lblcr"); +#endif + unregister_sysctl_table(sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); +} + + +module_init(ip_vs_lblcr_init); +module_exit(ip_vs_lblcr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c new file mode 100644 index 000000000000..d88fef90a641 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_lc.c @@ -0,0 +1,123 @@ +/* + * IPVS: Least-Connection Scheduling module + * + * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : added the ip_vs_lc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + + +static int ip_vs_lc_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int ip_vs_lc_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int ip_vs_lc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. (This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); + + /* + * Simply select the server with the least number of + * (activeconns<<5) + inactconns + * Except whose weight is equal to zero. + * If the weight is equal to zero, it means that the server is + * quiesced, the existing connections to the server still get + * served, but no new connection is assigned to the server. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->weight) == 0) + continue; + doh = ip_vs_lc_dest_overhead(dest); + if (!least || doh < loh) { + least = dest; + loh = doh; + } + } + + if (least) + IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_lc_scheduler = { + .name = "lc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_lc_init_svc, + .done_service = ip_vs_lc_done_svc, + .update_service = ip_vs_lc_update_svc, + .schedule = ip_vs_lc_schedule, +}; + + +static int __init ip_vs_lc_init(void) +{ + INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; +} + +static void __exit ip_vs_lc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); +} + +module_init(ip_vs_lc_init); +module_exit(ip_vs_lc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c new file mode 100644 index 000000000000..bc2a9e5f2a7b --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_nq.c @@ -0,0 +1,161 @@ +/* + * IPVS: Never Queue scheduling module + * + * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The NQ algorithm adopts a two-speed model. When there is an idle server + * available, the job will be sent to the idle server, instead of waiting + * for a fast one. When there is no idle server available, the job will be + * sent to the server that minimize its expected delay (The Shortest + * Expected Delay scheduling algorithm). + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me. + * + * The difference between NQ and SED is that NQ can improve overall + * system utilization. + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + + +static int +ip_vs_nq_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_nq_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_nq_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + + if (dest->flags & IP_VS_DEST_F_OVERLOAD || + !atomic_read(&dest->weight)) + continue; + + doh = ip_vs_nq_dest_overhead(dest); + + /* return the server directly if it is idle */ + if (atomic_read(&dest->activeconns) == 0) { + least = dest; + loh = doh; + goto out; + } + + if (!least || + (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight))) { + least = dest; + loh = doh; + } + } + + if (!least) + return NULL; + + out: + IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_nq_scheduler = +{ + .name = "nq", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_nq_init_svc, + .done_service = ip_vs_nq_done_svc, + .update_service = ip_vs_nq_update_svc, + .schedule = ip_vs_nq_schedule, +}; + + +static int __init ip_vs_nq_init(void) +{ + INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +static void __exit ip_vs_nq_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +module_init(ip_vs_nq_init); +module_exit(ip_vs_nq_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c new file mode 100644 index 000000000000..253c46252bd5 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -0,0 +1,244 @@ +/* + * ip_vs_proto.c: transport protocol load balancing support for IPVS + * + * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <asm/system.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> + +#include <net/ip_vs.h> + + +/* + * IPVS protocols can only be registered/unregistered when the ipvs + * module is loaded/unloaded, so no lock is needed in accessing the + * ipvs protocol table. + */ + +#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ +#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) + +static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; + + +/* + * register an ipvs protocol + */ +static int register_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp->next = ip_vs_proto_table[hash]; + ip_vs_proto_table[hash] = pp; + + if (pp->init != NULL) + pp->init(pp); + + return 0; +} + + +/* + * unregister an ipvs protocol + */ +static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + struct ip_vs_protocol **pp_p; + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp_p = &ip_vs_proto_table[hash]; + for (; *pp_p; pp_p = &(*pp_p)->next) { + if (*pp_p == pp) { + *pp_p = pp->next; + if (pp->exit != NULL) + pp->exit(pp); + return 0; + } + } + + return -ESRCH; +} + + +/* + * get ip_vs_protocol object by its proto. + */ +struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) +{ + struct ip_vs_protocol *pp; + unsigned hash = IP_VS_PROTO_HASH(proto); + + for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { + if (pp->protocol == proto) + return pp; + } + + return NULL; +} + + +/* + * Propagate event for state change to all protocols + */ +void ip_vs_protocol_timeout_change(int flags) +{ + struct ip_vs_protocol *pp; + int i; + + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { + if (pp->timeout_change) + pp->timeout_change(pp, flags); + } + } +} + + +int * +ip_vs_create_timeout_table(int *table, int size) +{ + int *t; + + t = kmalloc(size, GFP_ATOMIC); + if (t == NULL) + return NULL; + memcpy(t, table, size); + return t; +} + + +/* + * Set timeout value for state specified by name + */ +int +ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) +{ + int i; + + if (!table || !name || !to) + return -EINVAL; + + for (i = 0; i < num; i++) { + if (strcmp(names[i], name)) + continue; + table[i] = to * HZ; + return 0; + } + return -ENOENT; +} + + +const char * ip_vs_state_name(__u16 proto, int state) +{ + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + if (pp == NULL || pp->state_name == NULL) + return "ERR!"; + return pp->state_name(state); +} + + +void +ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[128]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else if (ih->frag_off & __constant_htons(IP_OFFSET)) + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else { + __u16 _ports[2], *pptr +; + pptr = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, + NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else + sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", + pp->name, + NIPQUAD(ih->saddr), + ntohs(pptr[0]), + NIPQUAD(ih->daddr), + ntohs(pptr[1])); + } + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + + +int ip_vs_protocol_init(void) +{ + char protocols[64]; +#define REGISTER_PROTOCOL(p) \ + do { \ + register_ip_vs_protocol(p); \ + strcat(protocols, ", "); \ + strcat(protocols, (p)->name); \ + } while (0) + + protocols[0] = '\0'; + protocols[2] = '\0'; +#ifdef CONFIG_IP_VS_PROTO_TCP + REGISTER_PROTOCOL(&ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + REGISTER_PROTOCOL(&ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_ICMP + REGISTER_PROTOCOL(&ip_vs_protocol_icmp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + REGISTER_PROTOCOL(&ip_vs_protocol_ah); +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + REGISTER_PROTOCOL(&ip_vs_protocol_esp); +#endif + IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); + + return 0; +} + + +void ip_vs_protocol_cleanup(void) +{ + struct ip_vs_protocol *pp; + int i; + + /* unregister all the ipvs protocols */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pp = ip_vs_proto_table[i]) != NULL) + unregister_ip_vs_protocol(pp); + } +} diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c new file mode 100644 index 000000000000..453e94a0bbd7 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -0,0 +1,177 @@ +/* + * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS + * + * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ + * + * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 + * Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + + +static struct ip_vs_conn * +ah_conn_in_get(const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct iphdr *iph, + unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->saddr, + __constant_htons(PORT_ISAKMP), + iph->daddr, + __constant_htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->daddr, + __constant_htons(PORT_ISAKMP), + iph->saddr, + __constant_htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? "ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->saddr, + __constant_htons(PORT_ISAKMP), + iph->daddr, + __constant_htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->daddr, + __constant_htons(PORT_ISAKMP), + iph->saddr, + __constant_htons(PORT_ISAKMP)); + } + + if (!cp) { + IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? "ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static int +ah_conn_schedule(struct sk_buff *skb, + struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * AH is only related traffic. Pass the packet to IP stack. + */ + *verdict = NF_ACCEPT; + return 0; +} + + +static void +ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + + +static void ah_init(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +static void ah_exit(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +struct ip_vs_protocol ip_vs_protocol_ah = { + .name = "AH", + .protocol = IPPROTO_AH, + .dont_defrag = 1, + .init = ah_init, + .exit = ah_exit, + .conn_schedule = ah_conn_schedule, + .conn_in_get = ah_conn_in_get, + .conn_out_get = ah_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_debug_packet, + .timeout_change = NULL, /* ISAKMP */ + .set_state_timeout = NULL, +}; diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c new file mode 100644 index 000000000000..478e5c7c7e8e --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -0,0 +1,175 @@ +/* + * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS + * + * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ + * + * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 + * Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + + +static struct ip_vs_conn * +esp_conn_in_get(const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct iphdr *iph, + unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->saddr, + __constant_htons(PORT_ISAKMP), + iph->daddr, + __constant_htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->daddr, + __constant_htons(PORT_ISAKMP), + iph->saddr, + __constant_htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? "ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->saddr, + __constant_htons(PORT_ISAKMP), + iph->daddr, + __constant_htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->daddr, + __constant_htons(PORT_ISAKMP), + iph->saddr, + __constant_htons(PORT_ISAKMP)); + } + + if (!cp) { + IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? "ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static int +esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * ESP is only related traffic. Pass the packet to IP stack. + */ + *verdict = NF_ACCEPT; + return 0; +} + + +static void +esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + + +static void esp_init(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +static void esp_exit(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .dont_defrag = 1, + .init = esp_init, + .exit = esp_exit, + .conn_schedule = esp_conn_schedule, + .conn_in_get = esp_conn_in_get, + .conn_out_get = esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c new file mode 100644 index 000000000000..191e94aa1c1f --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_icmp.c @@ -0,0 +1,182 @@ +/* + * ip_vs_proto_icmp.c: ICMP load balancing support for IP Virtual Server + * + * Authors: Julian Anastasov <ja@ssi.bg>, March 2002 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/icmp.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +static int icmp_timeouts[1] = { 1*60*HZ }; + +static char * icmp_state_name_table[1] = { "ICMP" }; + +static struct ip_vs_conn * +icmp_conn_in_get(const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct iphdr *iph, + unsigned int proto_off, + int inverse) +{ +#if 0 + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(iph->protocol, + iph->saddr, 0, + iph->daddr, 0); + } else { + cp = ip_vs_conn_in_get(iph->protocol, + iph->daddr, 0, + iph->saddr, 0); + } + + return cp; + +#else + return NULL; +#endif +} + +static struct ip_vs_conn * +icmp_conn_out_get(const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct iphdr *iph, + unsigned int proto_off, + int inverse) +{ +#if 0 + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(iph->protocol, + iph->saddr, 0, + iph->daddr, 0); + } else { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->daddr, 0, + iph->saddr, 0); + } + + return cp; +#else + return NULL; +#endif +} + +static int +icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + *verdict = NF_ACCEPT; + return 0; +} + +static int +icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) { + if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for"); + return 0; + } + } + } + return 1; +} + +static void +icmp_debug_packet(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[256]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else if (ih->frag_off & __constant_htons(IP_OFFSET)) + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else { + struct icmphdr _icmph, *ic; + + ic = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_icmph), &_icmph); + if (ic == NULL) + sprintf(buf, "%s TRUNCATED to %u bytes\n", + pp->name, skb->len - offset); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr), + ic->type, ic->code); + } + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + +static int +icmp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL]; + return 1; +} + +static int +icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + int num; + char **names; + + num = IP_VS_ICMP_S_LAST; + names = icmp_state_name_table; + return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to); +} + + +static void icmp_init(struct ip_vs_protocol *pp) +{ + pp->timeout_table = icmp_timeouts; +} + +static void icmp_exit(struct ip_vs_protocol *pp) +{ +} + +struct ip_vs_protocol ip_vs_protocol_icmp = { + .name = "ICMP", + .protocol = IPPROTO_ICMP, + .dont_defrag = 0, + .init = icmp_init, + .exit = icmp_exit, + .conn_schedule = icmp_conn_schedule, + .conn_in_get = icmp_conn_in_get, + .conn_out_get = icmp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = icmp_csum_check, + .state_transition = icmp_state_transition, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = icmp_debug_packet, + .timeout_change = NULL, + .set_state_timeout = icmp_set_state_timeout, +}; diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c new file mode 100644 index 000000000000..e65de675da74 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -0,0 +1,640 @@ +/* + * ip_vs_proto_tcp.c: TCP load balancing support for IPVS + * + * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include <linux/kernel.h> +#include <linux/ip.h> +#include <linux/tcp.h> /* for tcphdr */ +#include <net/ip.h> +#include <net/tcp.h> /* for csum_tcpudp_magic */ +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +static struct ip_vs_conn * +tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + __u16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + return ip_vs_conn_in_get(iph->protocol, + iph->saddr, pptr[0], + iph->daddr, pptr[1]); + } else { + return ip_vs_conn_in_get(iph->protocol, + iph->daddr, pptr[1], + iph->saddr, pptr[0]); + } +} + +static struct ip_vs_conn * +tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + __u16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + return ip_vs_conn_out_get(iph->protocol, + iph->saddr, pptr[0], + iph->daddr, pptr[1]); + } else { + return ip_vs_conn_out_get(iph->protocol, + iph->daddr, pptr[1], + iph->saddr, pptr[0]); + } +} + + +static int +tcp_conn_schedule(struct sk_buff *skb, + struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + struct ip_vs_service *svc; + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + *verdict = NF_DROP; + return 0; + } + + if (th->syn && + (svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol, + skb->nh.iph->daddr, th->dest))) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, skb); + if (!*cpp) { + *verdict = ip_vs_leave(svc, skb, pp); + return 0; + } + ip_vs_service_put(svc); + } + return 1; +} + + +static inline void +tcp_fast_csum_update(struct tcphdr *tcph, u32 oldip, u32 newip, + u16 oldport, u16 newport) +{ + tcph->check = + ip_vs_check_diff(~oldip, newip, + ip_vs_check_diff(oldport ^ 0xFFFF, + newport, tcph->check)); +} + + +static int +tcp_snat_handler(struct sk_buff **pskb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4; + + /* csum_check requires unshared skb */ + if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(*pskb, pp)) + return 0; + + /* Call application helper if needed */ + if (!ip_vs_app_pkt_out(cp, pskb)) + return 0; + } + + tcph = (void *)(*pskb)->nh.iph + tcphoff; + tcph->source = cp->vport; + + /* Adjust TCP checksums */ + if (!cp->app) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr, + cp->dport, cp->vport); + if ((*pskb)->ip_summed == CHECKSUM_HW) + (*pskb)->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + (*pskb)->csum = skb_checksum(*pskb, tcphoff, + (*pskb)->len - tcphoff, 0); + tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, + (*pskb)->len - tcphoff, + cp->protocol, + (*pskb)->csum); + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, tcph->check, + (char*)&(tcph->check) - (char*)tcph); + } + return 1; +} + + +static int +tcp_dnat_handler(struct sk_buff **pskb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4; + + /* csum_check requires unshared skb */ + if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(*pskb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn and iph ack_seq stuff + */ + if (!ip_vs_app_pkt_in(cp, pskb)) + return 0; + } + + tcph = (void *)(*pskb)->nh.iph + tcphoff; + tcph->dest = cp->dport; + + /* + * Adjust TCP checksums + */ + if (!cp->app) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr, + cp->vport, cp->dport); + if ((*pskb)->ip_summed == CHECKSUM_HW) + (*pskb)->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + (*pskb)->csum = skb_checksum(*pskb, tcphoff, + (*pskb)->len - tcphoff, 0); + tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, + (*pskb)->len - tcphoff, + cp->protocol, + (*pskb)->csum); + (*pskb)->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int tcphoff = skb->nh.iph->ihl*4; + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); + case CHECKSUM_HW: + if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len - tcphoff, + skb->nh.iph->protocol, skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* CHECKSUM_UNNECESSARY */ + break; + } + + return 1; +} + + +#define TCP_DIR_INPUT 0 +#define TCP_DIR_OUTPUT 4 +#define TCP_DIR_INPUT_ONLY 8 + +static int tcp_state_off[IP_VS_DIR_LAST] = { + [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, + [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, + [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, +}; + +/* + * Timeout table[state] + */ +static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 120*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + + +#if 0 + +/* FIXME: This is going to die */ + +static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 10*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 100*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + +#endif + +static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = "NONE", + [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", + [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", + [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", + [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", + [IP_VS_TCP_S_CLOSE] = "CLOSE", + [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", + [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", + [IP_VS_TCP_S_LISTEN] = "LISTEN", + [IP_VS_TCP_S_SYNACK] = "SYNACK", + [IP_VS_TCP_S_LAST] = "BUG!", +}; + +#define sNO IP_VS_TCP_S_NONE +#define sES IP_VS_TCP_S_ESTABLISHED +#define sSS IP_VS_TCP_S_SYN_SENT +#define sSR IP_VS_TCP_S_SYN_RECV +#define sFW IP_VS_TCP_S_FIN_WAIT +#define sTW IP_VS_TCP_S_TIME_WAIT +#define sCL IP_VS_TCP_S_CLOSE +#define sCW IP_VS_TCP_S_CLOSE_WAIT +#define sLA IP_VS_TCP_S_LAST_ACK +#define sLI IP_VS_TCP_S_LISTEN +#define sSA IP_VS_TCP_S_SYNACK + +struct tcp_states_t { + int next_state[IP_VS_TCP_S_LAST]; +}; + +static const char * tcp_state_name(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return "ERR!"; + return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; +} + +static struct tcp_states_t tcp_states [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t tcp_states_dos [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, +/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t *tcp_state_table = tcp_states; + + +static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) +{ + int on = (flags & 1); /* secure_tcp */ + + /* + ** FIXME: change secure_tcp to independent sysctl var + ** or make it per-service or per-app because it is valid + ** for most if not for all of the applications. Something + ** like "capabilities" (flags) for each object. + */ + tcp_state_table = (on? tcp_states_dos : tcp_states); +} + +static int +tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, + tcp_state_name_table, sname, to); +} + +static inline int tcp_state_idx(struct tcphdr *th) +{ + if (th->rst) + return 3; + if (th->syn) + return 0; + if (th->fin) + return 1; + if (th->ack) + return 2; + return -1; +} + +static inline void +set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, + int direction, struct tcphdr *th) +{ + int state_idx; + int new_state = IP_VS_TCP_S_CLOSE; + int state_off = tcp_state_off[direction]; + + /* + * Update state offset to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (state_off == TCP_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + state_off = TCP_DIR_INPUT_ONLY; + } + + if ((state_idx = tcp_state_idx(th)) < 0) { + IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); + goto tcp_state_out; + } + + new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; + + tcp_state_out: + if (new_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" + "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n", + pp->name, + (state_off==TCP_DIR_OUTPUT)?"output ":"input ", + th->syn? 'S' : '.', + th->fin? 'F' : '.', + th->ack? 'A' : '.', + th->rst? 'R' : '.', + NIPQUAD(cp->daddr), ntohs(cp->dport), + NIPQUAD(cp->caddr), ntohs(cp->cport), + tcp_state_name(cp->state), + tcp_state_name(new_state), + atomic_read(&cp->refcnt)); + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + + cp->timeout = pp->timeout_table[cp->state = new_state]; +} + + +/* + * Handle state transitions + */ +static int +tcp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + + spin_lock(&cp->lock); + set_tcp_state(pp, cp, direction, th); + spin_unlock(&cp->lock); + + return 1; +} + + +/* + * Hash table for TCP application incarnations + */ +#define TCP_APP_TAB_BITS 4 +#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) +#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) + +static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; +static DEFINE_SPINLOCK(tcp_app_lock); + +static inline __u16 tcp_app_hashkey(__u16 port) +{ + return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK; +} + + +static int tcp_register_app(struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash, port = inc->port; + int ret = 0; + + hash = tcp_app_hashkey(port); + + spin_lock_bh(&tcp_app_lock); + list_for_each_entry(i, &tcp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &tcp_apps[hash]); + atomic_inc(&ip_vs_protocol_tcp.appcnt); + + out: + spin_unlock_bh(&tcp_app_lock); + return ret; +} + + +static void +tcp_unregister_app(struct ip_vs_app *inc) +{ + spin_lock_bh(&tcp_app_lock); + atomic_dec(&ip_vs_protocol_tcp.appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&tcp_app_lock); +} + + +static int +tcp_app_conn_bind(struct ip_vs_conn *cp) +{ + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = tcp_app_hashkey(cp->vport); + + spin_lock(&tcp_app_lock); + list_for_each_entry(inc, &tcp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&tcp_app_lock); + + IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" + "%u.%u.%u.%u:%u to app %s on port %u\n", + __FUNCTION__, + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&tcp_app_lock); + + out: + return result; +} + + +/* + * Set LISTEN timeout. (ip_vs_conn_put will setup timer) + */ +void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) +{ + spin_lock(&cp->lock); + cp->state = IP_VS_TCP_S_LISTEN; + cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; + spin_unlock(&cp->lock); +} + + +static void tcp_init(struct ip_vs_protocol *pp) +{ + IP_VS_INIT_HASH_TABLE(tcp_apps); + pp->timeout_table = tcp_timeouts; +} + + +static void tcp_exit(struct ip_vs_protocol *pp) +{ +} + + +struct ip_vs_protocol ip_vs_protocol_tcp = { + .name = "TCP", + .protocol = IPPROTO_TCP, + .dont_defrag = 0, + .appcnt = ATOMIC_INIT(0), + .init = tcp_init, + .exit = tcp_exit, + .register_app = tcp_register_app, + .unregister_app = tcp_unregister_app, + .conn_schedule = tcp_conn_schedule, + .conn_in_get = tcp_conn_in_get, + .conn_out_get = tcp_conn_out_get, + .snat_handler = tcp_snat_handler, + .dnat_handler = tcp_dnat_handler, + .csum_check = tcp_csum_check, + .state_name = tcp_state_name, + .state_transition = tcp_state_transition, + .app_conn_bind = tcp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = tcp_timeout_change, + .set_state_timeout = tcp_set_state_timeout, +}; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c new file mode 100644 index 000000000000..8ae5f2e0aefa --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -0,0 +1,427 @@ +/* + * ip_vs_proto_udp.c: UDP load balancing support for IPVS + * + * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include <linux/kernel.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +static struct ip_vs_conn * +udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + struct ip_vs_conn *cp; + __u16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(iph->protocol, + iph->saddr, pptr[0], + iph->daddr, pptr[1]); + } else { + cp = ip_vs_conn_in_get(iph->protocol, + iph->daddr, pptr[1], + iph->saddr, pptr[0]); + } + + return cp; +} + + +static struct ip_vs_conn * +udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + struct ip_vs_conn *cp; + __u16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(iph->protocol, + iph->saddr, pptr[0], + iph->daddr, pptr[1]); + } else { + cp = ip_vs_conn_out_get(iph->protocol, + iph->daddr, pptr[1], + iph->saddr, pptr[0]); + } + + return cp; +} + + +static int +udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + struct ip_vs_service *svc; + struct udphdr _udph, *uh; + + uh = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { + *verdict = NF_DROP; + return 0; + } + + if ((svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol, + skb->nh.iph->daddr, uh->dest))) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, skb); + if (!*cpp) { + *verdict = ip_vs_leave(svc, skb, pp); + return 0; + } + ip_vs_service_put(svc); + } + return 1; +} + + +static inline void +udp_fast_csum_update(struct udphdr *uhdr, u32 oldip, u32 newip, + u16 oldport, u16 newport) +{ + uhdr->check = + ip_vs_check_diff(~oldip, newip, + ip_vs_check_diff(oldport ^ 0xFFFF, + newport, uhdr->check)); + if (!uhdr->check) + uhdr->check = 0xFFFF; +} + +static int +udp_snat_handler(struct sk_buff **pskb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff = (*pskb)->nh.iph->ihl * 4; + + /* csum_check requires unshared skb */ + if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(*pskb, pp)) + return 0; + + /* + * Call application helper if needed + */ + if (!ip_vs_app_pkt_out(cp, pskb)) + return 0; + } + + udph = (void *)(*pskb)->nh.iph + udphoff; + udph->source = cp->vport; + + /* + * Adjust UDP checksums + */ + if (!cp->app && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(udph, cp->daddr, cp->vaddr, + cp->dport, cp->vport); + if ((*pskb)->ip_summed == CHECKSUM_HW) + (*pskb)->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + (*pskb)->csum = skb_checksum(*pskb, udphoff, + (*pskb)->len - udphoff, 0); + udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, + (*pskb)->len - udphoff, + cp->protocol, + (*pskb)->csum); + if (udph->check == 0) + udph->check = 0xFFFF; + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, udph->check, + (char*)&(udph->check) - (char*)udph); + } + return 1; +} + + +static int +udp_dnat_handler(struct sk_buff **pskb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff = (*pskb)->nh.iph->ihl * 4; + + /* csum_check requires unshared skb */ + if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(*pskb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn + */ + if (!ip_vs_app_pkt_in(cp, pskb)) + return 0; + } + + udph = (void *)(*pskb)->nh.iph + udphoff; + udph->dest = cp->dport; + + /* + * Adjust UDP checksums + */ + if (!cp->app && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(udph, cp->vaddr, cp->daddr, + cp->vport, cp->dport); + if ((*pskb)->ip_summed == CHECKSUM_HW) + (*pskb)->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + (*pskb)->csum = skb_checksum(*pskb, udphoff, + (*pskb)->len - udphoff, 0); + udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, + (*pskb)->len - udphoff, + cp->protocol, + (*pskb)->csum); + if (udph->check == 0) + udph->check = 0xFFFF; + (*pskb)->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + struct udphdr _udph, *uh; + unsigned int udphoff = skb->nh.iph->ihl*4; + + uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); + if (uh == NULL) + return 0; + + if (uh->check != 0) { + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, udphoff, + skb->len - udphoff, 0); + case CHECKSUM_HW: + if (csum_tcpudp_magic(skb->nh.iph->saddr, + skb->nh.iph->daddr, + skb->len - udphoff, + skb->nh.iph->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* CHECKSUM_UNNECESSARY */ + break; + } + } + return 1; +} + + +/* + * Note: the caller guarantees that only one of register_app, + * unregister_app or app_conn_bind is called each time. + */ + +#define UDP_APP_TAB_BITS 4 +#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) +#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) + +static struct list_head udp_apps[UDP_APP_TAB_SIZE]; +static DEFINE_SPINLOCK(udp_app_lock); + +static inline __u16 udp_app_hashkey(__u16 port) +{ + return ((port >> UDP_APP_TAB_BITS) ^ port) & UDP_APP_TAB_MASK; +} + + +static int udp_register_app(struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash, port = inc->port; + int ret = 0; + + hash = udp_app_hashkey(port); + + + spin_lock_bh(&udp_app_lock); + list_for_each_entry(i, &udp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &udp_apps[hash]); + atomic_inc(&ip_vs_protocol_udp.appcnt); + + out: + spin_unlock_bh(&udp_app_lock); + return ret; +} + + +static void +udp_unregister_app(struct ip_vs_app *inc) +{ + spin_lock_bh(&udp_app_lock); + atomic_dec(&ip_vs_protocol_udp.appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&udp_app_lock); +} + + +static int udp_app_conn_bind(struct ip_vs_conn *cp) +{ + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = udp_app_hashkey(cp->vport); + + spin_lock(&udp_app_lock); + list_for_each_entry(inc, &udp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&udp_app_lock); + + IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" + "%u.%u.%u.%u:%u to app %s on port %u\n", + __FUNCTION__, + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&udp_app_lock); + + out: + return result; +} + + +static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = 5*60*HZ, + [IP_VS_UDP_S_LAST] = 2*HZ, +}; + +static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = "UDP", + [IP_VS_UDP_S_LAST] = "BUG!", +}; + + +static int +udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, + udp_state_name_table, sname, to); +} + +static const char * udp_state_name(int state) +{ + if (state >= IP_VS_UDP_S_LAST) + return "ERR!"; + return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; +} + +static int +udp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; + return 1; +} + +static void udp_init(struct ip_vs_protocol *pp) +{ + IP_VS_INIT_HASH_TABLE(udp_apps); + pp->timeout_table = udp_timeouts; +} + +static void udp_exit(struct ip_vs_protocol *pp) +{ +} + + +struct ip_vs_protocol ip_vs_protocol_udp = { + .name = "UDP", + .protocol = IPPROTO_UDP, + .dont_defrag = 0, + .init = udp_init, + .exit = udp_exit, + .conn_schedule = udp_conn_schedule, + .conn_in_get = udp_conn_in_get, + .conn_out_get = udp_conn_out_get, + .snat_handler = udp_snat_handler, + .dnat_handler = udp_dnat_handler, + .csum_check = udp_csum_check, + .state_transition = udp_state_transition, + .state_name = udp_state_name, + .register_app = udp_register_app, + .unregister_app = udp_unregister_app, + .app_conn_bind = udp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, + .set_state_timeout = udp_set_state_timeout, +}; diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c new file mode 100644 index 000000000000..b23bab231cab --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_rr.c @@ -0,0 +1,118 @@ +/* + * IPVS: Round-Robin Scheduling module + * + * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes/Changes: + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest + * Julian Anastasov : fixed the NULL pointer access bug in debugging + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_rr_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + + +static int ip_vs_rr_init_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +static int ip_vs_rr_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +/* + * Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct list_head *p, *q; + struct ip_vs_dest *dest; + + IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); + + write_lock(&svc->sched_lock); + p = (struct list_head *)svc->sched_data; + p = p->next; + q = p; + do { + /* skip list head */ + if (q == &svc->destinations) { + q = q->next; + continue; + } + + dest = list_entry(q, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + q = q->next; + } while (q != p); + write_unlock(&svc->sched_lock); + return NULL; + + out: + svc->sched_data = q; + write_unlock(&svc->sched_lock); + IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_rr_scheduler = { + .name = "rr", /* name */ + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_rr_init_svc, + .done_service = ip_vs_rr_done_svc, + .update_service = ip_vs_rr_update_svc, + .schedule = ip_vs_rr_schedule, +}; + +static int __init ip_vs_rr_init(void) +{ + INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +static void __exit ip_vs_rr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +module_init(ip_vs_rr_init); +module_exit(ip_vs_rr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c new file mode 100644 index 000000000000..0f7c56a225bd --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sched.c @@ -0,0 +1,251 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <asm/string.h> +#include <linux/kmod.h> + +#include <net/ip_vs.h> + +/* + * IPVS scheduler list + */ +static LIST_HEAD(ip_vs_schedulers); + +/* lock for service table */ +static DEFINE_RWLOCK(__ip_vs_sched_lock); + + +/* + * Bind a service with a scheduler + */ +int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler) +{ + int ret; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + if (scheduler == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); + return -EINVAL; + } + + svc->scheduler = scheduler; + + if (scheduler->init_service) { + ret = scheduler->init_service(svc); + if (ret) { + IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); + return ret; + } + } + + return 0; +} + + +/* + * Unbind a service with its scheduler + */ +int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +{ + struct ip_vs_scheduler *sched; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + + sched = svc->scheduler; + if (sched == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); + return -EINVAL; + } + + if (sched->done_service) { + if (sched->done_service(svc) != 0) { + IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); + return -EINVAL; + } + } + + svc->scheduler = NULL; + return 0; +} + + +/* + * Get scheduler in the scheduler list by name + */ +static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", + sched_name); + + read_lock_bh(&__ip_vs_sched_lock); + + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + /* + * Test and get the modules atomically + */ + if (sched->module && !try_module_get(sched->module)) { + /* + * This scheduler is just deleted + */ + continue; + } + if (strcmp(sched_name, sched->name)==0) { + /* HIT */ + read_unlock_bh(&__ip_vs_sched_lock); + return sched; + } + if (sched->module) + module_put(sched->module); + } + + read_unlock_bh(&__ip_vs_sched_lock); + return NULL; +} + + +/* + * Lookup scheduler and try to load it if it doesn't exist + */ +struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + /* + * Search for the scheduler by sched_name + */ + sched = ip_vs_sched_getbyname(sched_name); + + /* + * If scheduler not found, load the module and search again + */ + if (sched == NULL) { + request_module("ip_vs_%s", sched_name); + sched = ip_vs_sched_getbyname(sched_name); + } + + return sched; +} + +void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) +{ + if (scheduler->module) + module_put(scheduler->module); +} + + +/* + * Register a scheduler in the scheduler list + */ +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + struct ip_vs_scheduler *sched; + + if (!scheduler) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + if (!scheduler->name) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); + return -EINVAL; + } + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* + * Make sure that the scheduler with this name doesn't exist + * in the scheduler list. + */ + sched = ip_vs_sched_getbyname(scheduler->name); + if (sched) { + ip_vs_scheduler_put(sched); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already existed in the system\n", scheduler->name); + return -EINVAL; + } + + write_lock_bh(&__ip_vs_sched_lock); + + if (scheduler->n_list.next != &scheduler->n_list) { + write_unlock_bh(&__ip_vs_sched_lock); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already linked\n", scheduler->name); + return -EINVAL; + } + + /* + * Add it into the d-linked scheduler list + */ + list_add(&scheduler->n_list, &ip_vs_schedulers); + write_unlock_bh(&__ip_vs_sched_lock); + + IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); + + return 0; +} + + +/* + * Unregister a scheduler from the scheduler list + */ +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + if (!scheduler) { + IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + write_lock_bh(&__ip_vs_sched_lock); + if (scheduler->n_list.next == &scheduler->n_list) { + write_unlock_bh(&__ip_vs_sched_lock); + IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " + "is not in the list. failed\n", scheduler->name); + return -EINVAL; + } + + /* + * Remove it from the d-linked scheduler list + */ + list_del(&scheduler->n_list); + write_unlock_bh(&__ip_vs_sched_lock); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); + + return 0; +} diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c new file mode 100644 index 000000000000..ff366f7390d9 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sed.c @@ -0,0 +1,163 @@ +/* + * IPVS: Shortest Expected Delay scheduling module + * + * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The SED algorithm attempts to minimize each job's expected delay until + * completion. The expected delay that the job will experience is + * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of + * jobs on the the ith server and Ui is the fixed service rate (weight) of + * the ith server. The SED algorithm adopts a greedy policy that each does + * what is in its own best interest, i.e. to join the queue which would + * minimize its expected delay of completion. + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me. + * + * The difference between SED and WLC is that SED includes the incoming + * job in the cost function (the increment of 1). SED may outperform + * WLC, while scheduling big jobs under larger heterogeneous systems + * (the server weight varies a lot). + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + + +static int +ip_vs_sed_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_sed_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_sed_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_sed_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_sed_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_sed_scheduler = +{ + .name = "sed", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_sed_init_svc, + .done_service = ip_vs_sed_done_svc, + .update_service = ip_vs_sed_update_svc, + .schedule = ip_vs_sed_schedule, +}; + + +static int __init ip_vs_sed_init(void) +{ + INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +static void __exit ip_vs_sed_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +module_init(ip_vs_sed_init); +module_exit(ip_vs_sed_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c new file mode 100644 index 000000000000..6f7c50e44a39 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -0,0 +1,255 @@ +/* + * IPVS: Source Hashing scheduling module + * + * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The sh algorithm is to select server by the hash key of source IP + * address. The pseudo code is as follows: + * + * n <- servernode[src_ip]; + * if (n is dead) OR + * (n is overloaded) or (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet source IP address to the current server + * array. If the sh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + + +/* + * IPVS SH bucket + */ +struct ip_vs_sh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS SH entry hash table + */ +#ifndef CONFIG_IP_VS_SH_TAB_BITS +#define CONFIG_IP_VS_SH_TAB_BITS 8 +#endif +#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS +#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) +#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS SH entry + */ +static inline unsigned ip_vs_sh_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr) +{ + return (tbl[ip_vs_sh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_sh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { + if (list_empty(p)) { + b->dest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +{ + int i; + struct ip_vs_sh_bucket *b; + + b = tbl; + for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { + if (b->dest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_sh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl; + + /* allocate the SH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Source Hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_sh_bucket *tbl; + struct iphdr *iph = skb->nh.iph; + + IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_sh_bucket *)svc->sched_data; + dest = ip_vs_sh_get(tbl, iph->saddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->saddr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS SH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_sh_scheduler = +{ + .name = "sh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_sh_init_svc, + .done_service = ip_vs_sh_done_svc, + .update_service = ip_vs_sh_update_svc, + .schedule = ip_vs_sh_schedule, +}; + + +static int __init ip_vs_sh_init(void) +{ + INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +static void __exit ip_vs_sh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +module_init(ip_vs_sh_init); +module_exit(ip_vs_sh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c new file mode 100644 index 000000000000..25c479550a32 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -0,0 +1,892 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * ip_vs_sync: sync connection info from master load balancer to backups + * through multicast + * + * Changes: + * Alexandre Cassen : Added master & backup support at a time. + * Alexandre Cassen : Added SyncID support for incoming sync + * messages filtering. + * Justin Ossevoort : Fix endian problem on sync message size. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/net.h> +#include <linux/completion.h> +#include <linux/delay.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/igmp.h> /* for ip_mc_join_group */ + +#include <net/ip.h> +#include <net/sock.h> +#include <asm/uaccess.h> /* for get_fs and set_fs */ + +#include <net/ip_vs.h> + +#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ +#define IP_VS_SYNC_PORT 8848 /* multicast port */ + + +/* + * IPVS sync connection entry + */ +struct ip_vs_sync_conn { + __u8 reserved; + + /* Protocol, addresses and port numbers */ + __u8 protocol; /* Which protocol (TCP/UDP) */ + __u16 cport; + __u16 vport; + __u16 dport; + __u32 caddr; /* client address */ + __u32 vaddr; /* virtual address */ + __u32 daddr; /* destination address */ + + /* Flags and state transition */ + __u16 flags; /* status flags */ + __u16 state; /* state info */ + + /* The sequence options start here */ +}; + +struct ip_vs_sync_conn_options { + struct ip_vs_seq in_seq; /* incoming seq. struct */ + struct ip_vs_seq out_seq; /* outgoing seq. struct */ +}; + +#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ) +#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) +#define FULL_CONN_SIZE \ +(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) + + +/* + The master mulitcasts messages to the backup load balancers in the + following format. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | . | + | . | + | . | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (n) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +#define SYNC_MESG_HEADER_LEN 4 + +struct ip_vs_sync_mesg { + __u8 nr_conns; + __u8 syncid; + __u16 size; + + /* ip_vs_sync_conn entries start here */ +}; + +/* the maximum length of sync (sending/receiving) message */ +static int sync_send_mesg_maxlen; +static int sync_recv_mesg_maxlen; + +struct ip_vs_sync_buff { + struct list_head list; + unsigned long firstuse; + + /* pointers for the message data */ + struct ip_vs_sync_mesg *mesg; + unsigned char *head; + unsigned char *end; +}; + + +/* the sync_buff list head and the lock */ +static LIST_HEAD(ip_vs_sync_queue); +static DEFINE_SPINLOCK(ip_vs_sync_lock); + +/* current sync_buff for accepting new conn entries */ +static struct ip_vs_sync_buff *curr_sb = NULL; +static DEFINE_SPINLOCK(curr_sb_lock); + +/* ipvs sync daemon state */ +volatile int ip_vs_sync_state = IP_VS_STATE_NONE; +volatile int ip_vs_master_syncid = 0; +volatile int ip_vs_backup_syncid = 0; + +/* multicast interface name */ +char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; +char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + +/* multicast addr */ +static struct sockaddr_in mcast_addr; + + +static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +{ + spin_lock(&ip_vs_sync_lock); + list_add_tail(&sb->list, &ip_vs_sync_queue); + spin_unlock(&ip_vs_sync_lock); +} + +static inline struct ip_vs_sync_buff * sb_dequeue(void) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ip_vs_sync_lock); + if (list_empty(&ip_vs_sync_queue)) { + sb = NULL; + } else { + sb = list_entry(ip_vs_sync_queue.next, + struct ip_vs_sync_buff, + list); + list_del(&sb->list); + } + spin_unlock_bh(&ip_vs_sync_lock); + + return sb; +} + +static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) +{ + struct ip_vs_sync_buff *sb; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { + kfree(sb); + return NULL; + } + sb->mesg->nr_conns = 0; + sb->mesg->syncid = ip_vs_master_syncid; + sb->mesg->size = 4; + sb->head = (unsigned char *)sb->mesg + 4; + sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; + sb->firstuse = jiffies; + return sb; +} + +static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) +{ + kfree(sb->mesg); + kfree(sb); +} + +/* + * Get the current sync buffer if it has been created for more + * than the specified time or the specified time is zero. + */ +static inline struct ip_vs_sync_buff * +get_curr_sync_buff(unsigned long time) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&curr_sb_lock); + if (curr_sb && (time == 0 || + time_before(jiffies - curr_sb->firstuse, time))) { + sb = curr_sb; + curr_sb = NULL; + } else + sb = NULL; + spin_unlock_bh(&curr_sb_lock); + return sb; +} + + +/* + * Add an ip_vs_conn information into the current sync_buff. + * Called by ip_vs_in. + */ +void ip_vs_sync_conn(struct ip_vs_conn *cp) +{ + struct ip_vs_sync_mesg *m; + struct ip_vs_sync_conn *s; + int len; + + spin_lock(&curr_sb_lock); + if (!curr_sb) { + if (!(curr_sb=ip_vs_sync_buff_create())) { + spin_unlock(&curr_sb_lock); + IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; + m = curr_sb->mesg; + s = (struct ip_vs_sync_conn *)curr_sb->head; + + /* copy members */ + s->protocol = cp->protocol; + s->cport = cp->cport; + s->vport = cp->vport; + s->dport = cp->dport; + s->caddr = cp->caddr; + s->vaddr = cp->vaddr; + s->daddr = cp->daddr; + s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); + s->state = htons(cp->state); + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + struct ip_vs_sync_conn_options *opt = + (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(opt, &cp->in_seq, sizeof(*opt)); + } + + m->nr_conns++; + m->size += len; + curr_sb->head += len; + + /* check if there is a space for next one */ + if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { + sb_queue_tail(curr_sb); + curr_sb = NULL; + } + spin_unlock(&curr_sb_lock); + + /* synchronize its controller if it has */ + if (cp->control) + ip_vs_sync_conn(cp->control); +} + + +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. + */ +static void ip_vs_process_message(const char *buffer, const size_t buflen) +{ + struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; + struct ip_vs_sync_conn *s; + struct ip_vs_sync_conn_options *opt; + struct ip_vs_conn *cp; + char *p; + int i; + + /* Convert size back to host byte order */ + m->size = ntohs(m->size); + + if (buflen != m->size) { + IP_VS_ERR("bogus message\n"); + return; + } + + /* SyncID sanity check */ + if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { + IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", + m->syncid); + return; + } + + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); + for (i=0; i<m->nr_conns; i++) { + s = (struct ip_vs_sync_conn *)p; + cp = ip_vs_conn_in_get(s->protocol, + s->caddr, s->cport, + s->vaddr, s->vport); + if (!cp) { + cp = ip_vs_conn_new(s->protocol, + s->caddr, s->cport, + s->vaddr, s->vport, + s->daddr, s->dport, + ntohs(s->flags), NULL); + if (!cp) { + IP_VS_ERR("ip_vs_conn_new failed\n"); + return; + } + cp->state = ntohs(s->state); + } else if (!cp->dest) { + /* it is an entry created by the synchronization */ + cp->state = ntohs(s->state); + cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED; + } /* Note that we don't touch its state and flags + if it is a normal entry. */ + + if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) { + opt = (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(&cp->in_seq, opt, sizeof(*opt)); + p += FULL_CONN_SIZE; + } else + p += SIMPLE_CONN_SIZE; + + atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); + cp->timeout = IP_VS_SYNC_CONN_TIMEOUT; + ip_vs_conn_put(cp); + + if (p > buffer+buflen) { + IP_VS_ERR("bogus message\n"); + return; + } + } +} + + +/* + * Setup loopback of outgoing multicasts on a sending socket + */ +static void set_mcast_loop(struct sock *sk, u_char loop) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ + lock_sock(sk); + inet->mc_loop = loop ? 1 : 0; + release_sock(sk); +} + +/* + * Specify TTL for outgoing multicasts on a sending socket + */ +static void set_mcast_ttl(struct sock *sk, u_char ttl) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ + lock_sock(sk); + inet->mc_ttl = ttl; + release_sock(sk); +} + +/* + * Specifiy default interface for outgoing multicasts + */ +static int set_mcast_if(struct sock *sk, char *ifname) +{ + struct net_device *dev; + struct inet_sock *inet = inet_sk(sk); + + if ((dev = __dev_get_by_name(ifname)) == NULL) + return -ENODEV; + + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + inet->mc_index = dev->ifindex; + /* inet->mc_addr = 0; */ + release_sock(sk); + + return 0; +} + + +/* + * Set the maximum length of sync message according to the + * specified interface's MTU. + */ +static int set_sync_mesg_maxlen(int sync_state) +{ + struct net_device *dev; + int num; + + if (sync_state == IP_VS_STATE_MASTER) { + if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL) + return -ENODEV; + + num = (dev->mtu - sizeof(struct iphdr) - + sizeof(struct udphdr) - + SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; + sync_send_mesg_maxlen = + SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num; + IP_VS_DBG(7, "setting the maximum length of sync sending " + "message %d.\n", sync_send_mesg_maxlen); + } else if (sync_state == IP_VS_STATE_BACKUP) { + if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL) + return -ENODEV; + + sync_recv_mesg_maxlen = dev->mtu - + sizeof(struct iphdr) - sizeof(struct udphdr); + IP_VS_DBG(7, "setting the maximum length of sync receiving " + "message %d.\n", sync_recv_mesg_maxlen); + } + + return 0; +} + + +/* + * Join a multicast group. + * the group is specified by a class D multicast address 224.0.0.0/8 + * in the in_addr structure passed in as a parameter. + */ +static int +join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) +{ + struct ip_mreqn mreq; + struct net_device *dev; + int ret; + + memset(&mreq, 0, sizeof(mreq)); + memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); + + if ((dev = __dev_get_by_name(ifname)) == NULL) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + mreq.imr_ifindex = dev->ifindex; + + lock_sock(sk); + ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); + + return ret; +} + + +static int bind_mcastif_addr(struct socket *sock, char *ifname) +{ + struct net_device *dev; + u32 addr; + struct sockaddr_in sin; + + if ((dev = __dev_get_by_name(ifname)) == NULL) + return -ENODEV; + + addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + if (!addr) + IP_VS_ERR("You probably need to specify IP address on " + "multicast interface.\n"); + + IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", + ifname, NIPQUAD(addr)); + + /* Now bind the socket with the address of multicast interface */ + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + sin.sin_port = 0; + + return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); +} + +/* + * Set up sending multicast socket over UDP + */ +static struct socket * make_send_sock(void) +{ + struct socket *sock; + + /* First create a socket */ + if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return NULL; + } + + if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) { + IP_VS_ERR("Error setting outbound mcast interface\n"); + goto error; + } + + set_mcast_loop(sock->sk, 0); + set_mcast_ttl(sock->sk, 1); + + if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) { + IP_VS_ERR("Error binding address of the mcast interface\n"); + goto error; + } + + if (sock->ops->connect(sock, + (struct sockaddr*)&mcast_addr, + sizeof(struct sockaddr), 0) < 0) { + IP_VS_ERR("Error connecting to the multicast addr\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return NULL; +} + + +/* + * Set up receiving multicast socket over UDP + */ +static struct socket * make_receive_sock(void) +{ + struct socket *sock; + + /* First create a socket */ + if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return NULL; + } + + /* it is equivalent to the REUSEADDR option in user-space */ + sock->sk->sk_reuse = 1; + + if (sock->ops->bind(sock, + (struct sockaddr*)&mcast_addr, + sizeof(struct sockaddr)) < 0) { + IP_VS_ERR("Error binding to the multicast addr\n"); + goto error; + } + + /* join the multicast group */ + if (join_mcast_group(sock->sk, + (struct in_addr*)&mcast_addr.sin_addr, + ip_vs_backup_mcast_ifn) < 0) { + IP_VS_ERR("Error joining to the multicast group\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return NULL; +} + + +static int +ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) +{ + struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; + struct kvec iov; + int len; + + EnterFunction(7); + iov.iov_base = (void *)buffer; + iov.iov_len = length; + + len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); + + LeaveFunction(7); + return len; +} + +static void +ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) +{ + int msize; + + msize = msg->size; + + /* Put size in network byte order */ + msg->size = htons(msg->size); + + if (ip_vs_send_async(sock, (char *)msg, msize) != msize) + IP_VS_ERR("ip_vs_send_async error\n"); +} + +static int +ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) +{ + struct msghdr msg = {NULL,}; + struct kvec iov; + int len; + + EnterFunction(7); + + /* Receive a packet */ + iov.iov_base = buffer; + iov.iov_len = (size_t)buflen; + + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); + + if (len < 0) + return -1; + + LeaveFunction(7); + return len; +} + + +static DECLARE_WAIT_QUEUE_HEAD(sync_wait); +static pid_t sync_master_pid = 0; +static pid_t sync_backup_pid = 0; + +static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait); +static int stop_master_sync = 0; +static int stop_backup_sync = 0; + +static void sync_master_loop(void) +{ + struct socket *sock; + struct ip_vs_sync_buff *sb; + + /* create the sending multicast socket */ + sock = make_send_sock(); + if (!sock) + return; + + IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " + "syncid = %d\n", + ip_vs_master_mcast_ifn, ip_vs_master_syncid); + + for (;;) { + while ((sb=sb_dequeue())) { + ip_vs_send_sync_msg(sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + /* check if entries stay in curr_sb for 2 seconds */ + if ((sb = get_curr_sync_buff(2*HZ))) { + ip_vs_send_sync_msg(sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + if (stop_master_sync) + break; + + ssleep(1); + } + + /* clean up the sync_buff queue */ + while ((sb=sb_dequeue())) { + ip_vs_sync_buff_release(sb); + } + + /* clean up the current sync_buff */ + if ((sb = get_curr_sync_buff(0))) { + ip_vs_sync_buff_release(sb); + } + + /* release the sending multicast socket */ + sock_release(sock); +} + + +static void sync_backup_loop(void) +{ + struct socket *sock; + char *buf; + int len; + + if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) { + IP_VS_ERR("sync_backup_loop: kmalloc error\n"); + return; + } + + /* create the receiving multicast socket */ + sock = make_receive_sock(); + if (!sock) + goto out; + + IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " + "syncid = %d\n", + ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); + + for (;;) { + /* do you have data now? */ + while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) { + if ((len = + ip_vs_receive(sock, buf, + sync_recv_mesg_maxlen)) <= 0) { + IP_VS_ERR("receiving message error\n"); + break; + } + /* disable bottom half, because it accessed the data + shared by softirq while getting/creating conns */ + local_bh_disable(); + ip_vs_process_message(buf, len); + local_bh_enable(); + } + + if (stop_backup_sync) + break; + + ssleep(1); + } + + /* release the sending multicast socket */ + sock_release(sock); + + out: + kfree(buf); +} + + +static void set_sync_pid(int sync_state, pid_t sync_pid) +{ + if (sync_state == IP_VS_STATE_MASTER) + sync_master_pid = sync_pid; + else if (sync_state == IP_VS_STATE_BACKUP) + sync_backup_pid = sync_pid; +} + +static void set_stop_sync(int sync_state, int set) +{ + if (sync_state == IP_VS_STATE_MASTER) + stop_master_sync = set; + else if (sync_state == IP_VS_STATE_BACKUP) + stop_backup_sync = set; + else { + stop_master_sync = set; + stop_backup_sync = set; + } +} + +static int sync_thread(void *startup) +{ + DECLARE_WAITQUEUE(wait, current); + mm_segment_t oldmm; + int state; + const char *name; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) { + state = IP_VS_STATE_MASTER; + name = "ipvs_syncmaster"; + } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) { + state = IP_VS_STATE_BACKUP; + name = "ipvs_syncbackup"; + } else { + IP_VS_BUG(); + ip_vs_use_count_dec(); + return -EINVAL; + } + + daemonize(name); + + oldmm = get_fs(); + set_fs(KERNEL_DS); + + /* Block all signals */ + spin_lock_irq(¤t->sighand->siglock); + siginitsetinv(¤t->blocked, 0); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + /* set the maximum length of sync message */ + set_sync_mesg_maxlen(state); + + /* set up multicast address */ + mcast_addr.sin_family = AF_INET; + mcast_addr.sin_port = htons(IP_VS_SYNC_PORT); + mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP); + + add_wait_queue(&sync_wait, &wait); + + set_sync_pid(state, current->pid); + complete((struct completion *)startup); + + /* processing master/backup loop here */ + if (state == IP_VS_STATE_MASTER) + sync_master_loop(); + else if (state == IP_VS_STATE_BACKUP) + sync_backup_loop(); + else IP_VS_BUG(); + + remove_wait_queue(&sync_wait, &wait); + + /* thread exits */ + set_sync_pid(state, 0); + IP_VS_INFO("sync thread stopped!\n"); + + set_fs(oldmm); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + set_stop_sync(state, 0); + wake_up(&stop_sync_wait); + + return 0; +} + + +static int fork_sync_thread(void *startup) +{ + pid_t pid; + + /* fork the sync thread here, then the parent process of the + sync thread is the init process after this thread exits. */ + repeat: + if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) { + IP_VS_ERR("could not create sync_thread due to %d... " + "retrying.\n", pid); + ssleep(1); + goto repeat; + } + + return 0; +} + + +int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) +{ + DECLARE_COMPLETION(startup); + pid_t pid; + + if ((state == IP_VS_STATE_MASTER && sync_master_pid) || + (state == IP_VS_STATE_BACKUP && sync_backup_pid)) + return -EEXIST; + + IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); + IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n", + sizeof(struct ip_vs_sync_conn)); + + ip_vs_sync_state |= state; + if (state == IP_VS_STATE_MASTER) { + strcpy(ip_vs_master_mcast_ifn, mcast_ifn); + ip_vs_master_syncid = syncid; + } else { + strcpy(ip_vs_backup_mcast_ifn, mcast_ifn); + ip_vs_backup_syncid = syncid; + } + + repeat: + if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) { + IP_VS_ERR("could not create fork_sync_thread due to %d... " + "retrying.\n", pid); + ssleep(1); + goto repeat; + } + + wait_for_completion(&startup); + + return 0; +} + + +int stop_sync_thread(int state) +{ + DECLARE_WAITQUEUE(wait, current); + + if ((state == IP_VS_STATE_MASTER && !sync_master_pid) || + (state == IP_VS_STATE_BACKUP && !sync_backup_pid)) + return -ESRCH; + + IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); + IP_VS_INFO("stopping sync thread %d ...\n", + (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid); + + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&stop_sync_wait, &wait); + set_stop_sync(state, 1); + ip_vs_sync_state -= state; + wake_up(&sync_wait); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&stop_sync_wait, &wait); + + /* Note: no need to reap the sync thread, because its parent + process is the init process */ + + if ((state == IP_VS_STATE_MASTER && stop_master_sync) || + (state == IP_VS_STATE_BACKUP && stop_backup_sync)) + IP_VS_BUG(); + + return 0; +} diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c new file mode 100644 index 000000000000..8a9d913261d8 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_wlc.c @@ -0,0 +1,151 @@ +/* + * IPVS: Weighted Least-Connection Scheduling module + * + * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest + * Wensong Zhang : changed to use the inactconns in scheduling + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wlc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + + +static int +ip_vs_wlc_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_wlc_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_wlc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. (This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_wlc_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_wlc_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_wlc_scheduler = +{ + .name = "wlc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_wlc_init_svc, + .done_service = ip_vs_wlc_done_svc, + .update_service = ip_vs_wlc_update_svc, + .schedule = ip_vs_wlc_schedule, +}; + + +static int __init ip_vs_wlc_init(void) +{ + INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +static void __exit ip_vs_wlc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +module_init(ip_vs_wlc_init); +module_exit(ip_vs_wlc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c new file mode 100644 index 000000000000..749fa044eca5 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_wrr.c @@ -0,0 +1,235 @@ +/* + * IPVS: Weighted Round-Robin Scheduling module + * + * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wrr_update_svc + * Julian Anastasov : fixed the bug of returning destination + * with weight 0 when all weights are zero + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + +/* + * current destination pointer for weighted round-robin scheduling + */ +struct ip_vs_wrr_mark { + struct list_head *cl; /* current list head */ + int cw; /* current weight */ + int mw; /* maximum weight */ + int di; /* decreasing interval */ +}; + + +/* + * Get the gcd of server weights + */ +static int gcd(int a, int b) +{ + int c; + + while ((c = a % b)) { + a = b; + b = c; + } + return b; +} + +static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight; + int g = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + weight = atomic_read(&dest->weight); + if (weight > 0) { + if (g > 0) + g = gcd(weight, g); + else + g = weight; + } + } + return g ? g : 1; +} + + +/* + * Get the maximum weight of the service destinations. + */ +static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (atomic_read(&dest->weight) > weight) + weight = atomic_read(&dest->weight); + } + + return weight; +} + + +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark; + + /* + * Allocate the mark variable for WRR scheduling + */ + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); + if (mark == NULL) { + IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); + return -ENOMEM; + } + mark->cl = &svc->destinations; + mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + svc->sched_data = mark; + + return 0; +} + + +static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +{ + /* + * Release the mark variable + */ + kfree(svc->sched_data); + + return 0; +} + + +static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark = svc->sched_data; + + mark->cl = &svc->destinations; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + if (mark->cw > mark->mw) + mark->cw = 0; + return 0; +} + + +/* + * Weighted Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_wrr_mark *mark = svc->sched_data; + struct list_head *p; + + IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); + + /* + * This loop will always terminate, because mark->cw in (0, max_weight] + * and at least one server has its weight equal to max_weight. + */ + write_lock(&svc->sched_lock); + p = mark->cl; + while (1) { + if (mark->cl == &svc->destinations) { + /* it is at the head of the destination list */ + + if (mark->cl == mark->cl->next) { + /* no dest entry */ + dest = NULL; + goto out; + } + + mark->cl = svc->destinations.next; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* + * Still zero, which means no available servers. + */ + if (mark->cw == 0) { + mark->cl = &svc->destinations; + IP_VS_INFO("ip_vs_wrr_schedule(): " + "no available servers\n"); + dest = NULL; + goto out; + } + } + } else + mark->cl = mark->cl->next; + + if (mark->cl != &svc->destinations) { + /* not at the head of the list */ + dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) >= mark->cw) { + /* got it */ + break; + } + } + + if (mark->cl == p && mark->cw == mark->di) { + /* back to the start, and no dest is found. + It is only possible when all dests are OVERLOADED */ + dest = NULL; + goto out; + } + } + + IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), + atomic_read(&dest->weight)); + + out: + write_unlock(&svc->sched_lock); + return dest; +} + + +static struct ip_vs_scheduler ip_vs_wrr_scheduler = { + .name = "wrr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_wrr_init_svc, + .done_service = ip_vs_wrr_done_svc, + .update_service = ip_vs_wrr_update_svc, + .schedule = ip_vs_wrr_schedule, +}; + +static int __init ip_vs_wrr_init(void) +{ + INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; +} + +static void __exit ip_vs_wrr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); +} + +module_init(ip_vs_wrr_init); +module_exit(ip_vs_wrr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c new file mode 100644 index 000000000000..faa6176bbeb1 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -0,0 +1,563 @@ +/* + * ip_vs_xmit.c: various packet transmitters for IPVS + * + * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include <linux/kernel.h> +#include <linux/ip.h> +#include <linux/tcp.h> /* for tcphdr */ +#include <net/tcp.h> /* for csum_tcpudp_magic */ +#include <net/udp.h> +#include <net/icmp.h> /* for icmp_send */ +#include <net/route.h> /* for ip_route_output */ +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +/* + * Destination cache to speed up outgoing route lookup + */ +static inline void +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = dst; + dest->dst_rtos = rtos; + dst_release(old_dst); +} + +static inline struct dst_entry * +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) +{ + struct dst_entry *dst = dest->dst_cache; + + if (!dst) + return NULL; + if ((dst->obsolete || rtos != dest->dst_rtos) && + dst->ops->check(dst, cookie) == NULL) { + dest->dst_cache = NULL; + dst_release(dst); + return NULL; + } + dst_hold(dst); + return dst; +} + +static inline struct rtable * +__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) +{ + struct rtable *rt; /* Route to the other host */ + struct ip_vs_dest *dest = cp->dest; + + if (dest) { + spin_lock(&dest->dst_lock); + if (!(rt = (struct rtable *) + __ip_vs_dst_check(dest, rtos, 0))) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = dest->addr, + .saddr = 0, + .tos = rtos, } }, + }; + + if (ip_route_output_key(&rt, &fl)) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip_route_output error, " + "dest: %u.%u.%u.%u\n", + NIPQUAD(dest->addr)); + return NULL; + } + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); + IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", + NIPQUAD(dest->addr), + atomic_read(&rt->u.dst.__refcnt), rtos); + } + spin_unlock(&dest->dst_lock); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = cp->daddr, + .saddr = 0, + .tos = rtos, } }, + }; + + if (ip_route_output_key(&rt, &fl)) { + IP_VS_DBG_RL("ip_route_output error, dest: " + "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); + return NULL; + } + } + + return rt; +} + + +/* + * Release dest->dst_cache before a dest is removed + */ +void +ip_vs_dst_reset(struct ip_vs_dest *dest) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = NULL; + dst_release(old_dst); +} + +#define IP_VS_XMIT(skb, rt) \ +do { \ + nf_reset_debug(skb); \ + (skb)->nfcache |= NFC_IPVS_PROPERTY; \ + (skb)->ip_summed = CHECKSUM_NONE; \ + NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ + (rt)->u.dst.dev, dst_output); \ +} while (0) + + +/* + * NULL transmitter (do nothing except return NF_ACCEPT) + */ +int +ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + /* we do not touch skb and do not need pskb ptr */ + return NF_ACCEPT; +} + + +/* + * Bypass transmitter + * Let packets bypass the destination when the destination is not + * available, it may be only used in transparent cache cluster. + */ +int +ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = skb->nh.iph; + u8 tos = iph->tos; + int mtu; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = 0, + .tos = RT_TOS(tos), } }, + }; + + EnterFunction(10); + + if (ip_route_output_key(&rt, &fl)) { + IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " + "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(skb->nh.iph); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + + +/* + * NAT transmitter (only for outside-to-inside nat forwarding) + * Not used for related ICMP + */ +int +ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + int mtu; + struct iphdr *iph = skb->nh.iph; + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __u16 _pt, *p; + p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr))) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp)) + goto tx_error; + skb->nh.iph->daddr = cp->daddr; + ip_send_check(skb->nh.iph); + + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + LeaveFunction(10); + kfree_skb(skb); + return NF_STOLEN; + tx_error_put: + ip_rt_put(rt); + goto tx_error; +} + + +/* + * IP Tunneling transmitter + * + * This function encapsulates the packet in a new IP packet, its + * destination will be set to cp->daddr. Most code of this function + * is taken from ipip.c. + * + * It is used in VS/TUN cluster. The load balancer selects a real + * server from a cluster based on a scheduling algorithm, + * encapsulates the request packet and forwards it to the selected + * server. For example, all real servers are configured with + * "ifconfig tunl0 <Virtual IP Address> up". When the server receives + * the encapsulated packet, it will decapsulate the packet, processe + * the request and return the response packets directly to the client + * without passing the load balancer. This can greatly increase the + * scalability of virtual server. + * + * Used for ANY protocol + */ +int +ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *old_iph = skb->nh.iph; + u8 tos = old_iph->tos; + u16 df = old_iph->frag_off; + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + int mtu; + + EnterFunction(10); + + if (skb->protocol != __constant_htons(ETH_P_IP)) { + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " + "ETH_P_IP: %d, skb protocol: %d\n", + __constant_htons(ETH_P_IP), skb->protocol); + goto tx_error; + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) + goto tx_error_icmp; + + tdev = rt->u.dst.dev; + + mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + if (mtu < 68) { + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + df |= (old_iph->frag_off&__constant_htons(IP_DF)); + + if ((old_iph->frag_off&__constant_htons(IP_DF)) + && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + kfree_skb(skb); + IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = skb->nh.iph; + } + + skb->h.raw = (void *) old_iph; + + /* fix old IP header checksum */ + ip_send_check(old_iph); + + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->ttl = old_iph->ttl; + iph->tot_len = htons(skb->len); + ip_select_ident(iph, &rt->u.dst, NULL); + ip_send_check(iph); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(skb, rt); + + LeaveFunction(10); + + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + + +/* + * Direct Routing transmitter + * Used for ANY protocol + */ +int +ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = skb->nh.iph; + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(skb->nh.iph); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + + +/* + * ICMP packet transmitter + * called by the ip_vs_in_icmp + */ +int +ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset) +{ + struct rtable *rt; /* Route to the other host */ + int mtu; + int rc; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + __ip_vs_conn_put(cp); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!ip_vs_make_skb_writable(&skb, offset)) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop the old route when skb is not shared */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + ip_vs_nat_icmp(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(skb, rt); + + rc = NF_STOLEN; + goto out; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; + out: + LeaveFunction(10); + return rc; + tx_error_put: + ip_rt_put(rt); + goto tx_error; +} diff --git a/net/ipv4/multipath.c b/net/ipv4/multipath.c new file mode 100644 index 000000000000..4e9ca7c76407 --- /dev/null +++ b/net/ipv4/multipath.c @@ -0,0 +1,55 @@ +/* multipath.c: IPV4 multipath algorithm support. + * + * Copyright (C) 2004, 2005 Einar Lueck <elueck@de.ibm.com> + * Copyright (C) 2005 David S. Miller <davem@davemloft.net> + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/spinlock.h> + +#include <net/ip_mp_alg.h> + +static DEFINE_SPINLOCK(alg_table_lock); +struct ip_mp_alg_ops *ip_mp_alg_table[IP_MP_ALG_MAX + 1]; + +int multipath_alg_register(struct ip_mp_alg_ops *ops, enum ip_mp_alg n) +{ + struct ip_mp_alg_ops **slot; + int err; + + if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX || + !ops->mp_alg_select_route) + return -EINVAL; + + spin_lock(&alg_table_lock); + slot = &ip_mp_alg_table[n]; + if (*slot != NULL) { + err = -EBUSY; + } else { + *slot = ops; + err = 0; + } + spin_unlock(&alg_table_lock); + + return err; +} +EXPORT_SYMBOL(multipath_alg_register); + +void multipath_alg_unregister(struct ip_mp_alg_ops *ops, enum ip_mp_alg n) +{ + struct ip_mp_alg_ops **slot; + + if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX) + return; + + spin_lock(&alg_table_lock); + slot = &ip_mp_alg_table[n]; + if (*slot == ops) + *slot = NULL; + spin_unlock(&alg_table_lock); + + synchronize_net(); +} +EXPORT_SYMBOL(multipath_alg_unregister); diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c new file mode 100644 index 000000000000..9349686131fc --- /dev/null +++ b/net/ipv4/multipath_drr.c @@ -0,0 +1,265 @@ +/* + * Device round robin policy for multipath. + * + * + * Version: $Id: multipath_drr.c,v 1.1.2.1 2004/09/16 07:42:34 elueck Exp $ + * + * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/raw.h> +#include <linux/notifier.h> +#include <linux/if_arp.h> +#include <linux/netfilter_ipv4.h> +#include <net/ipip.h> +#include <net/checksum.h> +#include <net/ip_mp_alg.h> + +struct multipath_device { + int ifi; /* interface index of device */ + atomic_t usecount; + int allocated; +}; + +#define MULTIPATH_MAX_DEVICECANDIDATES 10 + +static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES]; +static DEFINE_SPINLOCK(state_lock); +static struct rtable *last_selection = NULL; + +static int inline __multipath_findslot(void) +{ + int i; + + for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) { + if (state[i].allocated == 0) + return i; + } + return -1; +} + +static int inline __multipath_finddev(int ifindex) +{ + int i; + + for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) { + if (state[i].allocated != 0 && + state[i].ifi == ifindex) + return i; + } + return -1; +} + +static int drr_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + int devidx; + + switch (event) { + case NETDEV_UNREGISTER: + case NETDEV_DOWN: + spin_lock_bh(&state_lock); + + devidx = __multipath_finddev(dev->ifindex); + if (devidx != -1) { + state[devidx].allocated = 0; + state[devidx].ifi = 0; + atomic_set(&state[devidx].usecount, 0); + } + + spin_unlock_bh(&state_lock); + break; + }; + + return NOTIFY_DONE; +} + +struct notifier_block drr_dev_notifier = { + .notifier_call = drr_dev_event, +}; + +static void drr_remove(struct rtable *rt) +{ + if (last_selection == rt) + last_selection = NULL; +} + +static void drr_safe_inc(atomic_t *usecount) +{ + int n; + + atomic_inc(usecount); + + n = atomic_read(usecount); + if (n <= 0) { + int i; + + spin_lock_bh(&state_lock); + + for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) + atomic_set(&state[i].usecount, 0); + + spin_unlock_bh(&state_lock); + } +} + +static void drr_select_route(const struct flowi *flp, + struct rtable *first, struct rtable **rp) +{ + struct rtable *nh, *result, *cur_min; + int min_usecount = -1; + int devidx = -1; + int cur_min_devidx = -1; + + /* if necessary and possible utilize the old alternative */ + if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 && + last_selection != NULL) { + result = last_selection; + *rp = result; + return; + } + + /* 1. make sure all alt. nexthops have the same GC related data */ + /* 2. determine the new candidate to be returned */ + result = NULL; + cur_min = NULL; + for (nh = rcu_dereference(first); nh; + nh = rcu_dereference(nh->u.rt_next)) { + if ((nh->u.dst.flags & DST_BALANCED) != 0 && + multipath_comparekeys(&nh->fl, flp)) { + int nh_ifidx = nh->u.dst.dev->ifindex; + + nh->u.dst.lastuse = jiffies; + nh->u.dst.__use++; + if (result != NULL) + continue; + + /* search for the output interface */ + + /* this is not SMP safe, only add/remove are + * SMP safe as wrong usecount updates have no big + * impact + */ + devidx = __multipath_finddev(nh_ifidx); + if (devidx == -1) { + /* add the interface to the array + * SMP safe + */ + spin_lock_bh(&state_lock); + + /* due to SMP: search again */ + devidx = __multipath_finddev(nh_ifidx); + if (devidx == -1) { + /* add entry for device */ + devidx = __multipath_findslot(); + if (devidx == -1) { + /* unlikely but possible */ + continue; + } + + state[devidx].allocated = 1; + state[devidx].ifi = nh_ifidx; + atomic_set(&state[devidx].usecount, 0); + min_usecount = 0; + } + + spin_unlock_bh(&state_lock); + } + + if (min_usecount == 0) { + /* if the device has not been used it is + * the primary target + */ + drr_safe_inc(&state[devidx].usecount); + result = nh; + } else { + int count = + atomic_read(&state[devidx].usecount); + + if (min_usecount == -1 || + count < min_usecount) { + cur_min = nh; + cur_min_devidx = devidx; + min_usecount = count; + } + } + } + } + + if (!result) { + if (cur_min) { + drr_safe_inc(&state[cur_min_devidx].usecount); + result = cur_min; + } else { + result = first; + } + } + + *rp = result; + last_selection = result; +} + +static struct ip_mp_alg_ops drr_ops = { + .mp_alg_select_route = drr_select_route, + .mp_alg_remove = drr_remove, +}; + +static int __init drr_init(void) +{ + int err = register_netdevice_notifier(&drr_dev_notifier); + + if (err) + return err; + + err = multipath_alg_register(&drr_ops, IP_MP_ALG_RR); + if (err) + goto fail; + + return 0; + +fail: + unregister_netdevice_notifier(&drr_dev_notifier); + return err; +} + +static void __exit drr_exit(void) +{ + unregister_netdevice_notifier(&drr_dev_notifier); + multipath_alg_unregister(&drr_ops, IP_MP_ALG_DRR); +} + +module_init(drr_init); +module_exit(drr_exit); diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c new file mode 100644 index 000000000000..805a16e47de5 --- /dev/null +++ b/net/ipv4/multipath_random.c @@ -0,0 +1,128 @@ +/* + * Random policy for multipath. + * + * + * Version: $Id: multipath_random.c,v 1.1.2.3 2004/09/21 08:42:11 elueck Exp $ + * + * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/raw.h> +#include <linux/notifier.h> +#include <linux/if_arp.h> +#include <linux/netfilter_ipv4.h> +#include <net/ipip.h> +#include <net/checksum.h> +#include <net/ip_mp_alg.h> + +#define MULTIPATH_MAX_CANDIDATES 40 + +/* interface to random number generation */ +static unsigned int RANDOM_SEED = 93186752; + +static inline unsigned int random(unsigned int ubound) +{ + static unsigned int a = 1588635695, + q = 2, + r = 1117695901; + + RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q); + + return RANDOM_SEED % ubound; +} + + +static void random_select_route(const struct flowi *flp, + struct rtable *first, + struct rtable **rp) +{ + struct rtable *rt; + struct rtable *decision; + unsigned char candidate_count = 0; + + /* count all candidate */ + for (rt = rcu_dereference(first); rt; + rt = rcu_dereference(rt->u.rt_next)) { + if ((rt->u.dst.flags & DST_BALANCED) != 0 && + multipath_comparekeys(&rt->fl, flp)) + ++candidate_count; + } + + /* choose a random candidate */ + decision = first; + if (candidate_count > 1) { + unsigned char i = 0; + unsigned char candidate_no = (unsigned char) + random(candidate_count); + + /* find chosen candidate and adjust GC data for all candidates + * to ensure they stay in cache + */ + for (rt = first; rt; rt = rt->u.rt_next) { + if ((rt->u.dst.flags & DST_BALANCED) != 0 && + multipath_comparekeys(&rt->fl, flp)) { + rt->u.dst.lastuse = jiffies; + + if (i == candidate_no) + decision = rt; + + if (i >= candidate_count) + break; + + i++; + } + } + } + + decision->u.dst.__use++; + *rp = decision; +} + +static struct ip_mp_alg_ops random_ops = { + .mp_alg_select_route = random_select_route, +}; + +static int __init random_init(void) +{ + return multipath_alg_register(&random_ops, IP_MP_ALG_RANDOM); +} + +static void __exit random_exit(void) +{ + multipath_alg_unregister(&random_ops, IP_MP_ALG_RANDOM); +} + +module_init(random_init); +module_exit(random_exit); diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c new file mode 100644 index 000000000000..554a82568160 --- /dev/null +++ b/net/ipv4/multipath_rr.c @@ -0,0 +1,115 @@ +/* + * Round robin policy for multipath. + * + * + * Version: $Id: multipath_rr.c,v 1.1.2.2 2004/09/16 07:42:34 elueck Exp $ + * + * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/raw.h> +#include <linux/notifier.h> +#include <linux/if_arp.h> +#include <linux/netfilter_ipv4.h> +#include <net/ipip.h> +#include <net/checksum.h> +#include <net/ip_mp_alg.h> + +#define MULTIPATH_MAX_CANDIDATES 40 + +static struct rtable* last_used = NULL; + +static void rr_remove(struct rtable *rt) +{ + if (last_used == rt) + last_used = NULL; +} + +static void rr_select_route(const struct flowi *flp, + struct rtable *first, struct rtable **rp) +{ + struct rtable *nh, *result, *min_use_cand = NULL; + int min_use = -1; + + /* if necessary and possible utilize the old alternative */ + if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 && + last_used != NULL) { + result = last_used; + goto out; + } + + /* 1. make sure all alt. nexthops have the same GC related data + * 2. determine the new candidate to be returned + */ + result = NULL; + for (nh = rcu_dereference(first); nh; + nh = rcu_dereference(nh->u.rt_next)) { + if ((nh->u.dst.flags & DST_BALANCED) != 0 && + multipath_comparekeys(&nh->fl, flp)) { + nh->u.dst.lastuse = jiffies; + + if (min_use == -1 || nh->u.dst.__use < min_use) { + min_use = nh->u.dst.__use; + min_use_cand = nh; + } + } + } + result = min_use_cand; + if (!result) + result = first; + +out: + last_used = result; + result->u.dst.__use++; + *rp = result; +} + +static struct ip_mp_alg_ops rr_ops = { + .mp_alg_select_route = rr_select_route, + .mp_alg_remove = rr_remove, +}; + +static int __init rr_init(void) +{ + return multipath_alg_register(&rr_ops, IP_MP_ALG_RR); +} + +static void __exit rr_exit(void) +{ + multipath_alg_unregister(&rr_ops, IP_MP_ALG_RR); +} + +module_init(rr_init); +module_exit(rr_exit); diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c new file mode 100644 index 000000000000..10b23e1bece6 --- /dev/null +++ b/net/ipv4/multipath_wrandom.c @@ -0,0 +1,344 @@ +/* + * Weighted random policy for multipath. + * + * + * Version: $Id: multipath_wrandom.c,v 1.1.2.3 2004/09/22 07:51:40 elueck Exp $ + * + * Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/raw.h> +#include <linux/notifier.h> +#include <linux/if_arp.h> +#include <linux/netfilter_ipv4.h> +#include <net/ipip.h> +#include <net/checksum.h> +#include <net/ip_fib.h> +#include <net/ip_mp_alg.h> + +#define MULTIPATH_STATE_SIZE 15 + +struct multipath_candidate { + struct multipath_candidate *next; + int power; + struct rtable *rt; +}; + +struct multipath_dest { + struct list_head list; + + const struct fib_nh *nh_info; + __u32 netmask; + __u32 network; + unsigned char prefixlen; + + struct rcu_head rcu; +}; + +struct multipath_bucket { + struct list_head head; + spinlock_t lock; +}; + +struct multipath_route { + struct list_head list; + + int oif; + __u32 gw; + struct list_head dests; + + struct rcu_head rcu; +}; + +/* state: primarily weight per route information */ +static struct multipath_bucket state[MULTIPATH_STATE_SIZE]; + +/* interface to random number generation */ +static unsigned int RANDOM_SEED = 93186752; + +static inline unsigned int random(unsigned int ubound) +{ + static unsigned int a = 1588635695, + q = 2, + r = 1117695901; + RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q); + return RANDOM_SEED % ubound; +} + +static unsigned char __multipath_lookup_weight(const struct flowi *fl, + const struct rtable *rt) +{ + const int state_idx = rt->idev->dev->ifindex % MULTIPATH_STATE_SIZE; + struct multipath_route *r; + struct multipath_route *target_route = NULL; + struct multipath_dest *d; + int weight = 1; + + /* lookup the weight information for a certain route */ + rcu_read_lock(); + + /* find state entry for gateway or add one if necessary */ + list_for_each_entry_rcu(r, &state[state_idx].head, list) { + if (r->gw == rt->rt_gateway && + r->oif == rt->idev->dev->ifindex) { + target_route = r; + break; + } + } + + if (!target_route) { + /* this should not happen... but we are prepared */ + printk( KERN_CRIT"%s: missing state for gateway: %u and " \ + "device %d\n", __FUNCTION__, rt->rt_gateway, + rt->idev->dev->ifindex); + goto out; + } + + /* find state entry for destination */ + list_for_each_entry_rcu(d, &target_route->dests, list) { + __u32 targetnetwork = fl->fl4_dst & + (0xFFFFFFFF >> (32 - d->prefixlen)); + + if ((targetnetwork & d->netmask) == d->network) { + weight = d->nh_info->nh_weight; + goto out; + } + } + +out: + rcu_read_unlock(); + return weight; +} + +static void wrandom_init_state(void) +{ + int i; + + for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) { + INIT_LIST_HEAD(&state[i].head); + spin_lock_init(&state[i].lock); + } +} + +static void wrandom_select_route(const struct flowi *flp, + struct rtable *first, + struct rtable **rp) +{ + struct rtable *rt; + struct rtable *decision; + struct multipath_candidate *first_mpc = NULL; + struct multipath_candidate *mpc, *last_mpc = NULL; + int power = 0; + int last_power; + int selector; + const size_t size_mpc = sizeof(struct multipath_candidate); + + /* collect all candidates and identify their weights */ + for (rt = rcu_dereference(first); rt; + rt = rcu_dereference(rt->u.rt_next)) { + if ((rt->u.dst.flags & DST_BALANCED) != 0 && + multipath_comparekeys(&rt->fl, flp)) { + struct multipath_candidate* mpc = + (struct multipath_candidate*) + kmalloc(size_mpc, GFP_KERNEL); + + if (!mpc) + return; + + power += __multipath_lookup_weight(flp, rt) * 10000; + + mpc->power = power; + mpc->rt = rt; + mpc->next = NULL; + + if (!first_mpc) + first_mpc = mpc; + else + last_mpc->next = mpc; + + last_mpc = mpc; + } + } + + /* choose a weighted random candidate */ + decision = first; + selector = random(power); + last_power = 0; + + /* select candidate, adjust GC data and cleanup local state */ + decision = first; + last_mpc = NULL; + for (mpc = first_mpc; mpc; mpc = mpc->next) { + mpc->rt->u.dst.lastuse = jiffies; + if (last_power <= selector && selector < mpc->power) + decision = mpc->rt; + + last_power = mpc->power; + if (last_mpc) + kfree(last_mpc); + + last_mpc = mpc; + } + + if (last_mpc) { + /* concurrent __multipath_flush may lead to !last_mpc */ + kfree(last_mpc); + } + + decision->u.dst.__use++; + *rp = decision; +} + +static void wrandom_set_nhinfo(__u32 network, + __u32 netmask, + unsigned char prefixlen, + const struct fib_nh *nh) +{ + const int state_idx = nh->nh_oif % MULTIPATH_STATE_SIZE; + struct multipath_route *r, *target_route = NULL; + struct multipath_dest *d, *target_dest = NULL; + + /* store the weight information for a certain route */ + spin_lock(&state[state_idx].lock); + + /* find state entry for gateway or add one if necessary */ + list_for_each_entry_rcu(r, &state[state_idx].head, list) { + if (r->gw == nh->nh_gw && r->oif == nh->nh_oif) { + target_route = r; + break; + } + } + + if (!target_route) { + const size_t size_rt = sizeof(struct multipath_route); + target_route = (struct multipath_route *) + kmalloc(size_rt, GFP_KERNEL); + + target_route->gw = nh->nh_gw; + target_route->oif = nh->nh_oif; + memset(&target_route->rcu, 0, sizeof(struct rcu_head)); + INIT_LIST_HEAD(&target_route->dests); + + list_add_rcu(&target_route->list, &state[state_idx].head); + } + + /* find state entry for destination or add one if necessary */ + list_for_each_entry_rcu(d, &target_route->dests, list) { + if (d->nh_info == nh) { + target_dest = d; + break; + } + } + + if (!target_dest) { + const size_t size_dst = sizeof(struct multipath_dest); + target_dest = (struct multipath_dest*) + kmalloc(size_dst, GFP_KERNEL); + + target_dest->nh_info = nh; + target_dest->network = network; + target_dest->netmask = netmask; + target_dest->prefixlen = prefixlen; + memset(&target_dest->rcu, 0, sizeof(struct rcu_head)); + + list_add_rcu(&target_dest->list, &target_route->dests); + } + /* else: we already stored this info for another destination => + * we are finished + */ + + spin_unlock(&state[state_idx].lock); +} + +static void __multipath_free(struct rcu_head *head) +{ + struct multipath_route *rt = container_of(head, struct multipath_route, + rcu); + kfree(rt); +} + +static void __multipath_free_dst(struct rcu_head *head) +{ + struct multipath_dest *dst = container_of(head, + struct multipath_dest, + rcu); + kfree(dst); +} + +static void wrandom_flush(void) +{ + int i; + + /* defere delete to all entries */ + for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) { + struct multipath_route *r; + + spin_lock(&state[i].lock); + list_for_each_entry_rcu(r, &state[i].head, list) { + struct multipath_dest *d; + list_for_each_entry_rcu(d, &r->dests, list) { + list_del_rcu(&d->list); + call_rcu(&d->rcu, + __multipath_free_dst); + } + list_del_rcu(&r->list); + call_rcu(&r->rcu, + __multipath_free); + } + + spin_unlock(&state[i].lock); + } +} + +static struct ip_mp_alg_ops wrandom_ops = { + .mp_alg_select_route = wrandom_select_route, + .mp_alg_flush = wrandom_flush, + .mp_alg_set_nhinfo = wrandom_set_nhinfo, +}; + +static int __init wrandom_init(void) +{ + wrandom_init_state(); + + return multipath_alg_register(&wrandom_ops, IP_MP_ALG_WRANDOM); +} + +static void __exit wrandom_exit(void) +{ + multipath_alg_unregister(&wrandom_ops, IP_MP_ALG_WRANDOM); +} + +module_init(wrandom_init); +module_exit(wrandom_exit); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig new file mode 100644 index 000000000000..46d4cb1c06f0 --- /dev/null +++ b/net/ipv4/netfilter/Kconfig @@ -0,0 +1,696 @@ +# +# IP netfilter configuration +# + +menu "IP: Netfilter Configuration" + depends on INET && NETFILTER + +# connection tracking, helpers and protocols +config IP_NF_CONNTRACK + tristate "Connection tracking (required for masq/NAT)" + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is required to do Masquerading or other kinds of Network + Address Translation (except for Fast NAT). It can also be used to + enhance packet filtering (see `Connection state match support' + below). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_CT_ACCT + bool "Connection tracking flow accounting" + depends on IP_NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + keep per-flow packet and byte counters. + + Those counters can be used for flow-based accounting or the + `connbytes' match. + + If unsure, say `N'. + +config IP_NF_CONNTRACK_MARK + bool 'Connection mark tracking support' + help + This option enables support for connection marks, used by the + `CONNMARK' target and `connmark' match. Similar to the mark value + of packets, but this mark value is kept in the conntrack session + instead of the individual packets. + +config IP_NF_CT_PROTO_SCTP + tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' + depends on IP_NF_CONNTRACK && EXPERIMENTAL + help + With this option enabled, the connection tracking code will + be able to do state tracking on SCTP connections. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config IP_NF_FTP + tristate "FTP protocol support" + depends on IP_NF_CONNTRACK + help + Tracking FTP connections is problematic: special helpers are + required for tracking them, and doing masquerading and other forms + of Network Address Translation on them. + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_IRC + tristate "IRC protocol support" + depends on IP_NF_CONNTRACK + ---help--- + There is a commonly-used extension to IRC called + Direct Client-to-Client Protocol (DCC). This enables users to send + files to each other, and also chat to each other without the need + of a server. DCC Sending is used anywhere you send files over IRC, + and DCC Chat is most commonly used by Eggdrop bots. If you are + using NAT, this extension will enable you to send files and initiate + chats. Note that you do NOT need this extension to get files or + have others initiate chats, or everything else in IRC. + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_TFTP + tristate "TFTP protocol support" + depends on IP_NF_CONNTRACK + help + TFTP connection tracking helper, this is required depending + on how restrictive your ruleset is. + If you are using a tftp client behind -j SNAT or -j MASQUERADING + you will need this. + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_AMANDA + tristate "Amanda backup protocol support" + depends on IP_NF_CONNTRACK + help + If you are running the Amanda backup package <http://www.amanda.org/> + on this machine or machines that will be MASQUERADED through this + machine, then you may want to enable this feature. This allows the + connection tracking and natting code to allow the sub-channels that + Amanda requires for communication of the backup data, messages and + index. + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_QUEUE + tristate "Userspace queueing via NETLINK" + help + Netfilter has the ability to queue packets to user space: the + netlink device can be used to access them using this driver. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_IPTABLES + tristate "IP tables support (required for filtering/masq/NAT)" + help + iptables is a general, extensible packet identification framework. + The packet filtering and full NAT (masquerading, port forwarding, + etc) subsystems now use this: say `Y' or `M' here if you want to use + either of those. + + To compile it as a module, choose M here. If unsure, say N. + +# The matches. +config IP_NF_MATCH_LIMIT + tristate "limit match support" + depends on IP_NF_IPTABLES + help + limit matching allows you to control the rate at which a rule can be + matched: mainly useful in combination with the LOG target ("LOG + target support", below) and to avoid some Denial of Service attacks. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_IPRANGE + tristate "IP range match support" + depends on IP_NF_IPTABLES + help + This option makes possible to match IP addresses against IP address + ranges. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_MAC + tristate "MAC address match support" + depends on IP_NF_IPTABLES + help + MAC matching allows you to match packets based on the source + Ethernet address of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_PKTTYPE + tristate "Packet type match support" + depends on IP_NF_IPTABLES + help + Packet type matching allows you to match a packet by + its "class", eg. BROADCAST, MULTICAST, ... + + Typical usage: + iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_MARK + tristate "netfilter MARK match support" + depends on IP_NF_IPTABLES + help + Netfilter mark matching allows you to match packets based on the + `nfmark' value in the packet. This can be set by the MARK target + (see below). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_MULTIPORT + tristate "Multiple port match support" + depends on IP_NF_IPTABLES + help + Multiport matching allows you to match TCP or UDP packets based on + a series of source or destination ports: normally a rule can only + match a single range of ports. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_TOS + tristate "TOS match support" + depends on IP_NF_IPTABLES + help + TOS matching allows you to match packets based on the Type Of + Service fields of the IP packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_RECENT + tristate "recent match support" + depends on IP_NF_IPTABLES + help + This match is used for creating one or many lists of recently + used addresses and then matching against that/those list(s). + + Short options are available by using 'iptables -m recent -h' + Official Website: <http://snowman.net/projects/ipt_recent/> + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_ECN + tristate "ECN match support" + depends on IP_NF_IPTABLES + help + This option adds a `ECN' match, which allows you to match against + the IPv4 and TCP header ECN fields. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_DSCP + tristate "DSCP match support" + depends on IP_NF_IPTABLES + help + This option adds a `DSCP' match, which allows you to match against + the IPv4 header DSCP field (DSCP codepoint). + + The DSCP codepoint can have any value between 0x0 and 0x4f. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_AH_ESP + tristate "AH/ESP match support" + depends on IP_NF_IPTABLES + help + These two match extensions (`ah' and `esp') allow you to match a + range of SPIs inside AH or ESP headers of IPSec packets. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_LENGTH + tristate "LENGTH match support" + depends on IP_NF_IPTABLES + help + This option allows you to match the length of a packet against a + specific value or range of values. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_TTL + tristate "TTL match support" + depends on IP_NF_IPTABLES + help + This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user + to match packets by their TTL value. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_TCPMSS + tristate "tcpmss match support" + depends on IP_NF_IPTABLES + help + This option adds a `tcpmss' match, which allows you to examine the + MSS value of TCP SYN packets, which control the maximum packet size + for that connection. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_HELPER + tristate "Helper match support" + depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + help + Helper matching allows you to match packets in dynamic connections + tracked by a conntrack-helper, ie. ip_conntrack_ftp + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_MATCH_STATE + tristate "Connection state match support" + depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + help + Connection state matching allows you to match packets based on their + relationship to a tracked connection (ie. previous packets). This + is a powerful tool for packet classification. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_CONNTRACK + tristate "Connection tracking match support" + depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + help + This is a general conntrack match module, a superset of the state match. + + It allows matching on additional conntrack information, which is + useful in complex configurations, such as NAT gateways with multiple + internet links or tunnels. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_OWNER + tristate "Owner match support" + depends on IP_NF_IPTABLES + help + Packet owner matching allows you to match locally-generated packets + based on who created them: the user, group, process or session. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_PHYSDEV + tristate "Physdev match support" + depends on IP_NF_IPTABLES && BRIDGE_NETFILTER + help + Physdev packet matching matches against the physical bridge ports + the IP packet arrived on or will leave by. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_ADDRTYPE + tristate 'address type match support' + depends on IP_NF_IPTABLES + help + This option allows you to match what routing thinks of an address, + eg. UNICAST, LOCAL, BROADCAST, ... + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config IP_NF_MATCH_REALM + tristate 'realm match support' + depends on IP_NF_IPTABLES + select NET_CLS_ROUTE + help + This option adds a `realm' match, which allows you to use the realm + key from the routing subsystem inside iptables. + + This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option + in tc world. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config IP_NF_MATCH_SCTP + tristate 'SCTP protocol match support' + depends on IP_NF_IPTABLES + help + With this option enabled, you will be able to use the iptables + `sctp' match in order to match on SCTP source/destination ports + and SCTP chunk types. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config IP_NF_MATCH_COMMENT + tristate 'comment match support' + depends on IP_NF_IPTABLES + help + This option adds a `comment' dummy-match, which allows you to put + comments in your iptables ruleset. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config IP_NF_MATCH_CONNMARK + tristate 'Connection mark match support' + depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES + help + This option adds a `connmark' match, which allows you to match the + connection mark value previously set for the session by `CONNMARK'. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. The module will be called + ipt_connmark.o. If unsure, say `N'. + +config IP_NF_MATCH_HASHLIMIT + tristate 'hashlimit match support' + depends on IP_NF_IPTABLES + help + This option adds a new iptables `hashlimit' match. + + As opposed to `limit', this match dynamically crates a hash table + of limit buckets, based on your selection of source/destination + ip addresses and/or ports. + + It enables you to express policies like `10kpps for any given + destination IP' or `500pps from any given source IP' with a single + IPtables rule. + +# `filter', generic and specific targets +config IP_NF_FILTER + tristate "Packet filtering" + depends on IP_NF_IPTABLES + help + Packet filtering defines a table `filter', which has a series of + rules for simple packet filtering at local input, forwarding and + local output. See the man page for iptables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_REJECT + tristate "REJECT target support" + depends on IP_NF_FILTER + help + The REJECT target allows a filtering rule to specify that an ICMP + error should be issued in response to an incoming packet, rather + than silently being dropped. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_LOG + tristate "LOG target support" + depends on IP_NF_IPTABLES + help + This option adds a `LOG' target, which allows you to create rules in + any iptables table which records the packet header to the syslog. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_ULOG + tristate "ULOG target support" + depends on IP_NF_IPTABLES + ---help--- + This option adds a `ULOG' target, which allows you to create rules in + any iptables table. The packet is passed to a userspace logging + daemon using netlink multicast sockets; unlike the LOG target + which can only be viewed through syslog. + + The apropriate userspace logging daemon (ulogd) may be obtained from + <http://www.gnumonks.org/projects/ulogd/> + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_TCPMSS + tristate "TCPMSS target support" + depends on IP_NF_IPTABLES + ---help--- + This option adds a `TCPMSS' target, which allows you to alter the + MSS value of TCP SYN packets, to control the maximum size for that + connection (usually limiting it to your outgoing interface's MTU + minus 40). + + This is used to overcome criminally braindead ISPs or servers which + block ICMP Fragmentation Needed packets. The symptoms of this + problem are that everything works fine from your Linux + firewall/router, but machines behind it can never exchange large + packets: + 1) Web browsers connect, then hang with no data received. + 2) Small mail works fine, but large emails hang. + 3) ssh works fine, but scp hangs after initial handshaking. + + Workaround: activate this option and add a rule to your firewall + configuration like: + + iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \ + -j TCPMSS --clamp-mss-to-pmtu + + To compile it as a module, choose M here. If unsure, say N. + +# NAT + specific targets +config IP_NF_NAT + tristate "Full NAT" + depends on IP_NF_IPTABLES && IP_NF_CONNTRACK + help + The Full NAT option allows masquerading, port forwarding and other + forms of full Network Address Port Translation. It is controlled by + the `nat' table in iptables: see the man page for iptables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_NAT_NEEDED + bool + depends on IP_NF_NAT != n + default y + +config IP_NF_TARGET_MASQUERADE + tristate "MASQUERADE target support" + depends on IP_NF_NAT + help + Masquerading is a special case of NAT: all outgoing connections are + changed to seem to come from a particular interface's address, and + if the interface goes down, those connections are lost. This is + only useful for dialup accounts with dynamic IP address (ie. your IP + address will be different on next dialup). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_REDIRECT + tristate "REDIRECT target support" + depends on IP_NF_NAT + help + REDIRECT is a special case of NAT: all incoming connections are + mapped onto the incoming interface's address, causing the packets to + come to the local machine instead of passing through. This is + useful for transparent proxies. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_NETMAP + tristate "NETMAP target support" + depends on IP_NF_NAT + help + NETMAP is an implementation of static 1:1 NAT mapping of network + addresses. It maps the network address part, while keeping the host + address part intact. It is similar to Fast NAT, except that + Netfilter's connection tracking doesn't work well with Fast NAT. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_SAME + tristate "SAME target support" + depends on IP_NF_NAT + help + This option adds a `SAME' target, which works like the standard SNAT + target, but attempts to give clients the same IP for all connections. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_NAT_SNMP_BASIC + tristate "Basic SNMP-ALG support (EXPERIMENTAL)" + depends on EXPERIMENTAL && IP_NF_NAT + ---help--- + + This module implements an Application Layer Gateway (ALG) for + SNMP payloads. In conjunction with NAT, it allows a network + management system to access multiple private networks with + conflicting addresses. It works by modifying IP addresses + inside SNMP payloads to match IP-layer NAT mapping. + + This is the "basic" form of SNMP-ALG, as described in RFC 2962 + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_NAT_IRC + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_IRC=y + default m if IP_NF_IRC=m + +# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y), +# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker. Argh. +config IP_NF_NAT_FTP + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_FTP=y + default m if IP_NF_FTP=m + +config IP_NF_NAT_TFTP + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_TFTP=y + default m if IP_NF_TFTP=m + +config IP_NF_NAT_AMANDA + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_AMANDA=y + default m if IP_NF_AMANDA=m + +# mangle + specific targets +config IP_NF_MANGLE + tristate "Packet mangling" + depends on IP_NF_IPTABLES + help + This option adds a `mangle' table to iptables: see the man page for + iptables(8). This table is used for various packet alterations + which can effect how the packet is routed. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_TOS + tristate "TOS target support" + depends on IP_NF_MANGLE + help + This option adds a `TOS' target, which allows you to create rules in + the `mangle' table which alter the Type Of Service field of an IP + packet prior to routing. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_ECN + tristate "ECN target support" + depends on IP_NF_MANGLE + ---help--- + This option adds a `ECN' target, which can be used in the iptables mangle + table. + + You can use this target to remove the ECN bits from the IPv4 header of + an IP packet. This is particularly useful, if you need to work around + existing ECN blackholes on the internet, but don't want to disable + ECN support in general. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_DSCP + tristate "DSCP target support" + depends on IP_NF_MANGLE + help + This option adds a `DSCP' match, which allows you to match against + the IPv4 header DSCP field (DSCP codepoint). + + The DSCP codepoint can have any value between 0x0 and 0x4f. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_MARK + tristate "MARK target support" + depends on IP_NF_MANGLE + help + This option adds a `MARK' target, which allows you to create rules + in the `mangle' table which alter the netfilter mark (nfmark) field + associated with the packet prior to routing. This can change + the routing method (see `Use netfilter MARK value as routing + key') and can also be used by other subsystems to change their + behavior. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_CLASSIFY + tristate "CLASSIFY target support" + depends on IP_NF_MANGLE + help + This option adds a `CLASSIFY' target, which enables the user to set + the priority of a packet. Some qdiscs can use this value for + classification, among these are: + + atm, cbq, dsmark, pfifo_fast, htb, prio + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_CONNMARK + tristate 'CONNMARK target support' + depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE + help + This option adds a `CONNMARK' target, which allows one to manipulate + the connection mark value. Similar to the MARK target, but + affects the connection mark value rather than the packet mark value. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. The module will be called + ipt_CONNMARK.o. If unsure, say `N'. + +config IP_NF_TARGET_CLUSTERIP + tristate "CLUSTERIP target support (EXPERIMENTAL)" + depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL + help + The CLUSTERIP target allows you to build load-balancing clusters of + network servers without having a dedicated load-balancing + router/server/switch. + + To compile it as a module, choose M here. If unsure, say N. + +# raw + specific targets +config IP_NF_RAW + tristate 'raw table support (required for NOTRACK/TRACE)' + depends on IP_NF_IPTABLES + help + This option adds a `raw' table to iptables. This table is the very + first in the netfilter framework and hooks in at the PREROUTING + and OUTPUT chains. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config IP_NF_TARGET_NOTRACK + tristate 'NOTRACK target support' + depends on IP_NF_RAW + depends on IP_NF_CONNTRACK + help + The NOTRACK target allows a select rule to specify + which packets *not* to enter the conntrack/NAT + subsystem with all the consequences (no ICMP error tracking, + no protocol helpers for the selected packets). + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + + +# ARP tables +config IP_NF_ARPTABLES + tristate "ARP tables support" + help + arptables is a general, extensible packet identification framework. + The ARP packet filtering and mangling (manipulation)subsystems + use this: say Y or M here if you want to use either of those. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_ARPFILTER + tristate "ARP packet filtering" + depends on IP_NF_ARPTABLES + help + ARP packet filtering defines a table `filter', which has a series of + rules for simple ARP packet filtering at local input and + local output. On a bridge, you can also specify filtering rules + for forwarded ARP packets. See the man page for arptables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_ARP_MANGLE + tristate "ARP payload mangling" + depends on IP_NF_ARPTABLES + help + Allows altering the ARP packet payload: source and destination + hardware and network addresses. + +endmenu + diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile new file mode 100644 index 000000000000..45796d5924dd --- /dev/null +++ b/net/ipv4/netfilter/Makefile @@ -0,0 +1,89 @@ +# +# Makefile for the netfilter modules on top of IPv4. +# + +# objects for the standalone - connection tracking / NAT +ip_conntrack-objs := ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o +iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o + +# connection tracking +obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o + +# SCTP protocol connection tracking +obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o + +# connection tracking helpers +obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o +obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o +obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o +obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o + +# NAT helpers +obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o +obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o +obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o +obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o + +# generic IP tables +obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o + +# the three instances of ip_tables +obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o +obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o +obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o +obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o + +# matches +obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o +obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o +obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o +obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o +obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o +obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o +obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o +obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o +obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o +obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o +obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o +obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o +obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o +obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o +obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o +obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o +obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o +obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o +obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o +obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o +obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o +obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o +obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o +obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o +obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o + +# targets +obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o +obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o +obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o +obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o +obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o +obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o +obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o +obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o +obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o +obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o +obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o +obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o +obj-$(CONFIG_IP_NF_TARGET_CONNMARK) += ipt_CONNMARK.o +obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o +obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o +obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o +obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o + +# generic ARP tables +obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o +obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o + +# just filtering instance of ARP tables for now +obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o + +obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c new file mode 100644 index 000000000000..df79f5ed6a0a --- /dev/null +++ b/net/ipv4/netfilter/arp_tables.c @@ -0,0 +1,1333 @@ +/* + * Packet matching code for ARP packets. + * + * Based heavily, if not almost entirely, upon ip_tables.c framework. + * + * Some ARP specific bits are: + * + * Copyright (C) 2002 David S. Miller (davem@redhat.com) + * + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/kmod.h> +#include <linux/vmalloc.h> +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/init.h> + +#include <asm/uaccess.h> +#include <asm/semaphore.h> + +#include <linux/netfilter_arp/arp_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); +MODULE_DESCRIPTION("arptables core"); + +/*#define DEBUG_ARP_TABLES*/ +/*#define DEBUG_ARP_TABLES_USER*/ + +#ifdef DEBUG_ARP_TABLES +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_ARP_TABLES_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define ARP_NF_ASSERT(x) \ +do { \ + if (!(x)) \ + printk("ARP_NF_ASSERT: %s:%s:%u\n", \ + __FUNCTION__, __FILE__, __LINE__); \ +} while(0) +#else +#define ARP_NF_ASSERT(x) +#endif +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +static DECLARE_MUTEX(arpt_mutex); + +#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) +#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/listhelp.h> + +struct arpt_table_info { + unsigned int size; + unsigned int number; + unsigned int initial_entries; + unsigned int hook_entry[NF_ARP_NUMHOOKS]; + unsigned int underflow[NF_ARP_NUMHOOKS]; + char entries[0] __attribute__((aligned(SMP_CACHE_BYTES))); +}; + +static LIST_HEAD(arpt_target); +static LIST_HEAD(arpt_tables); +#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) + +#ifdef CONFIG_SMP +#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) +#else +#define TABLE_OFFSET(t,p) 0 +#endif + +static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, + char *hdr_addr, int len) +{ + int i, ret; + + if (len > ARPT_DEV_ADDR_LEN_MAX) + len = ARPT_DEV_ADDR_LEN_MAX; + + ret = 0; + for (i = 0; i < len; i++) + ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; + + return (ret != 0); +} + +/* Returns whether packet matches rule or not. */ +static inline int arp_packet_match(const struct arphdr *arphdr, + struct net_device *dev, + const char *indev, + const char *outdev, + const struct arpt_arp *arpinfo) +{ + char *arpptr = (char *)(arphdr + 1); + char *src_devaddr, *tgt_devaddr; + u32 src_ipaddr, tgt_ipaddr; + int i, ret; + +#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg)) + + if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, + ARPT_INV_ARPOP)) { + dprintf("ARP operation field mismatch.\n"); + dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n", + arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask); + return 0; + } + + if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd, + ARPT_INV_ARPHRD)) { + dprintf("ARP hardware address format mismatch.\n"); + dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n", + arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask); + return 0; + } + + if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro, + ARPT_INV_ARPPRO)) { + dprintf("ARP protocol address format mismatch.\n"); + dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n", + arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask); + return 0; + } + + if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln, + ARPT_INV_ARPHLN)) { + dprintf("ARP hardware address length mismatch.\n"); + dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n", + arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask); + return 0; + } + + src_devaddr = arpptr; + arpptr += dev->addr_len; + memcpy(&src_ipaddr, arpptr, sizeof(u32)); + arpptr += sizeof(u32); + tgt_devaddr = arpptr; + arpptr += dev->addr_len; + memcpy(&tgt_ipaddr, arpptr, sizeof(u32)); + + if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len), + ARPT_INV_SRCDEVADDR) || + FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len), + ARPT_INV_TGTDEVADDR)) { + dprintf("Source or target device address mismatch.\n"); + + return 0; + } + + if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr, + ARPT_INV_SRCIP) || + FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr), + ARPT_INV_TGTIP)) { + dprintf("Source or target IP address mismatch.\n"); + + dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n", + NIPQUAD(src_ipaddr), + NIPQUAD(arpinfo->smsk.s_addr), + NIPQUAD(arpinfo->src.s_addr), + arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : ""); + dprintf("TGT: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n", + NIPQUAD(tgt_ipaddr), + NIPQUAD(arpinfo->tmsk.s_addr), + NIPQUAD(arpinfo->tgt.s_addr), + arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : ""); + return 0; + } + + /* Look for ifname matches. */ + for (i = 0, ret = 0; i < IFNAMSIZ; i++) { + ret |= (indev[i] ^ arpinfo->iniface[i]) + & arpinfo->iniface_mask[i]; + } + + if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, arpinfo->iniface, + arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":""); + return 0; + } + + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + unsigned long odev; + memcpy(&odev, outdev + i*sizeof(unsigned long), + sizeof(unsigned long)); + ret |= (odev + ^ ((const unsigned long *)arpinfo->outiface)[i]) + & ((const unsigned long *)arpinfo->outiface_mask)[i]; + } + + if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, arpinfo->outiface, + arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":""); + return 0; + } + + return 1; +} + +static inline int arp_checkentry(const struct arpt_arp *arp) +{ + if (arp->flags & ~ARPT_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + arp->flags & ~ARPT_F_MASK); + return 0; + } + if (arp->invflags & ~ARPT_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + arp->invflags & ~ARPT_INV_MASK); + return 0; + } + + return 1; +} + +static unsigned int arpt_error(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + if (net_ratelimit()) + printk("arp_tables: error: '%s'\n", (char *)targinfo); + + return NF_DROP; +} + +static inline struct arpt_entry *get_entry(void *base, unsigned int offset) +{ + return (struct arpt_entry *)(base + offset); +} + +unsigned int arpt_do_table(struct sk_buff **pskb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct arpt_table *table, + void *userdata) +{ + static const char nulldevname[IFNAMSIZ]; + unsigned int verdict = NF_DROP; + struct arphdr *arp; + int hotdrop = 0; + struct arpt_entry *e, *back; + const char *indev, *outdev; + void *table_base; + + /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ + if (!pskb_may_pull((*pskb), (sizeof(struct arphdr) + + (2 * (*pskb)->dev->addr_len) + + (2 * sizeof(u32))))) + return NF_DROP; + + indev = in ? in->name : nulldevname; + outdev = out ? out->name : nulldevname; + + read_lock_bh(&table->lock); + table_base = (void *)table->private->entries + + TABLE_OFFSET(table->private, + smp_processor_id()); + e = get_entry(table_base, table->private->hook_entry[hook]); + back = get_entry(table_base, table->private->underflow[hook]); + + arp = (*pskb)->nh.arph; + do { + if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) { + struct arpt_entry_target *t; + int hdr_len; + + hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + + (2 * (*pskb)->dev->addr_len); + ADD_COUNTER(e->counters, hdr_len, 1); + + t = arpt_get_target(e); + + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct arpt_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? */ + if (v != ARPT_RETURN) { + verdict = (unsigned)(-v) - 1; + break; + } + e = back; + back = get_entry(table_base, + back->comefrom); + continue; + } + if (table_base + v + != (void *)e + e->next_offset) { + /* Save old back ptr in next entry */ + struct arpt_entry *next + = (void *)e + e->next_offset; + next->comefrom = + (void *)back - table_base; + + /* set back pointer to next entry */ + back = next; + } + + e = get_entry(table_base, v); + } else { + /* Targets which reenter must return + * abs. verdicts + */ + verdict = t->u.kernel.target->target(pskb, + hook, + in, out, + t->data, + userdata); + + /* Target might have changed stuff. */ + arp = (*pskb)->nh.arph; + + if (verdict == ARPT_CONTINUE) + e = (void *)e + e->next_offset; + else + /* Verdict */ + break; + } + } else { + e = (void *)e + e->next_offset; + } + } while (!hotdrop); + read_unlock_bh(&table->lock); + + if (hotdrop) + return NF_DROP; + else + return verdict; +} + +static inline void *find_inlist_lock_noload(struct list_head *head, + const char *name, + int *error, + struct semaphore *mutex) +{ + void *ret; + + *error = down_interruptible(mutex); + if (*error != 0) + return NULL; + + ret = list_named_find(head, name); + if (!ret) { + *error = -ENOENT; + up(mutex); + } + return ret; +} + +#ifndef CONFIG_KMOD +#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m)) +#else +static void * +find_inlist_lock(struct list_head *head, + const char *name, + const char *prefix, + int *error, + struct semaphore *mutex) +{ + void *ret; + + ret = find_inlist_lock_noload(head, name, error, mutex); + if (!ret) { + duprintf("find_inlist: loading `%s%s'.\n", prefix, name); + request_module("%s%s", prefix, name); + ret = find_inlist_lock_noload(head, name, error, mutex); + } + + return ret; +} +#endif + +static inline struct arpt_table *arpt_find_table_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&arpt_tables, name, "arptable_", error, mutex); +} + +static struct arpt_target *arpt_find_target_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&arpt_target, name, "arpt_", error, mutex); +} + +/* All zeroes == unconditional rule. */ +static inline int unconditional(const struct arpt_arp *arp) +{ + unsigned int i; + + for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++) + if (((__u32 *)arp)[i]) + return 0; + + return 1; +} + +/* Figures out from what hook each rule can be called: returns 0 if + * there are loops. Puts hook bitmask in comefrom. + */ +static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + * to 0 as we leave), and comefrom to save source hook bitmask. + */ + for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct arpt_entry *e + = (struct arpt_entry *)(newinfo->entries + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. */ + e->counters.pcnt = pos; + + for (;;) { + struct arpt_standard_target *t + = (void *)arpt_get_target(e); + + if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) { + printk("arptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom + |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); + + /* Unconditional return/END. */ + if (e->target_offset == sizeof(struct arpt_entry) + && (strcmp(t->target.u.user.name, + ARPT_STANDARD_TARGET) == 0) + && t->verdict < 0 + && unconditional(&e->arp)) { + unsigned int oldpos, size; + + /* Return: backtrack through the last + * big jump. + */ + do { + e->comefrom ^= (1<<NF_ARP_NUMHOOKS); + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct arpt_entry *) + (newinfo->entries + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct arpt_entry *) + (newinfo->entries + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + ARPT_STANDARD_TARGET) == 0 + && newpos >= 0) { + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct arpt_entry *) + (newinfo->entries + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static inline int standard_check(const struct arpt_entry_target *t, + unsigned int max_offset) +{ + struct arpt_standard_target *targ = (void *)t; + + /* Check standard info. */ + if (t->u.target_size + != ARPT_ALIGN(sizeof(struct arpt_standard_target))) { + duprintf("arpt_standard_check: target size %u != %Zu\n", + t->u.target_size, + ARPT_ALIGN(sizeof(struct arpt_standard_target))); + return 0; + } + + if (targ->verdict >= 0 + && targ->verdict > max_offset - sizeof(struct arpt_entry)) { + duprintf("arpt_standard_check: bad verdict (%i)\n", + targ->verdict); + return 0; + } + + if (targ->verdict < -NF_MAX_VERDICT - 1) { + duprintf("arpt_standard_check: bad negative verdict (%i)\n", + targ->verdict); + return 0; + } + return 1; +} + +static struct arpt_target arpt_standard_target; + +static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct arpt_entry_target *t; + struct arpt_target *target; + int ret; + + if (!arp_checkentry(&e->arp)) { + duprintf("arp_tables: arp check failed %p %s.\n", e, name); + return -EINVAL; + } + + t = arpt_get_target(e); + target = arpt_find_target_lock(t->u.user.name, &ret, &arpt_mutex); + if (!target) { + duprintf("check_entry: `%s' not found\n", t->u.user.name); + goto out; + } + if (!try_module_get((target->me))) { + ret = -ENOENT; + goto out_unlock; + } + t->u.kernel.target = target; + up(&arpt_mutex); + + if (t->u.kernel.target == &arpt_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto out; + } + } else if (t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, t->data, + t->u.target_size + - sizeof(*t), + e->comefrom)) { + module_put(t->u.kernel.target->me); + duprintf("arp_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + goto out; + } + + (*i)++; + return 0; + +out_unlock: + up(&arpt_mutex); +out: + return ret; +} + +static inline int check_entry_size_and_hooks(struct arpt_entry *e, + struct arpt_table_info *newinfo, + unsigned char *base, + unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int *i) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 + || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* FIXME: underflows must be unconditional, standard verdicts + < 0 (not ARPT_RETURN). --RR */ + + /* Clear counters and comefrom */ + e->counters = ((struct arpt_counters) { 0, 0 }); + e->comefrom = 0; + + (*i)++; + return 0; +} + +static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) +{ + struct arpt_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + t = arpt_get_target(e); + if (t->u.kernel.target->destroy) + t->u.kernel.target->destroy(t->data, + t->u.target_size - sizeof(*t)); + module_put(t->u.kernel.target->me); + return 0; +} + +/* Checks and translates the user-supplied table segment (held in + * newinfo). + */ +static int translate_table(const char *name, + unsigned int valid_hooks, + struct arpt_table_info *newinfo, + unsigned int size, + unsigned int number, + const unsigned int *hook_entries, + const unsigned int *underflows) +{ + unsigned int i; + int ret; + + newinfo->size = size; + newinfo->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + + /* Walk through entries, checking offsets. */ + ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry_size_and_hooks, + newinfo, + newinfo->entries, + newinfo->entries + size, + hook_entries, underflows, &i); + duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); + if (ret != 0) + return ret; + + if (i != number) { + duprintf("translate_table: %u not %u entries\n", + i, number); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, valid_hooks)) { + duprintf("Looping hook\n"); + return -ELOOP; + } + + /* Finally, each sanity check must pass */ + i = 0; + ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry, name, size, &i); + + if (ret != 0) { + ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + cleanup_entry, &i); + return ret; + } + + /* And one copy for every other CPU */ + for (i = 1; i < num_possible_cpus(); i++) { + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, + newinfo->entries, + SMP_ALIGN(newinfo->size)); + } + + return ret; +} + +static struct arpt_table_info *replace_table(struct arpt_table *table, + unsigned int num_counters, + struct arpt_table_info *newinfo, + int *error) +{ + struct arpt_table_info *oldinfo; + + /* Do the substitution. */ + write_lock_bh(&table->lock); + /* Check inside lock: is the old number correct? */ + if (num_counters != table->private->number) { + duprintf("num_counters != table->private->number (%u/%u)\n", + num_counters, table->private->number); + write_unlock_bh(&table->lock); + *error = -EAGAIN; + return NULL; + } + oldinfo = table->private; + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); + + return oldinfo; +} + +/* Gets counters. */ +static inline int add_entry_to_counter(const struct arpt_entry *e, + struct arpt_counters total[], + unsigned int *i) +{ + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + +static void get_counters(const struct arpt_table_info *t, + struct arpt_counters counters[]) +{ + unsigned int cpu; + unsigned int i; + + for (cpu = 0; cpu < num_possible_cpus(); cpu++) { + i = 0; + ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + t->size, + add_entry_to_counter, + counters, + &i); + } +} + +static int copy_entries_to_user(unsigned int total_size, + struct arpt_table *table, + void __user *userptr) +{ + unsigned int off, num, countersize; + struct arpt_entry *e; + struct arpt_counters *counters; + int ret = 0; + + /* We need atomic snapshot of counters: rest doesn't change + * (other than comefrom, which userspace doesn't care + * about). + */ + countersize = sizeof(struct arpt_counters) * table->private->number; + counters = vmalloc(countersize); + + if (counters == NULL) + return -ENOMEM; + + /* First, sum counters... */ + memset(counters, 0, countersize); + write_lock_bh(&table->lock); + get_counters(table->private, counters); + write_unlock_bh(&table->lock); + + /* ... then copy entire thing from CPU 0... */ + if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + struct arpt_entry_target *t; + + e = (struct arpt_entry *)(table->private->entries + off); + if (copy_to_user(userptr + off + + offsetof(struct arpt_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + t = arpt_get_target(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct arpt_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +static int get_entries(const struct arpt_get_entries *entries, + struct arpt_get_entries __user *uptr) +{ + int ret; + struct arpt_table *t; + + t = arpt_find_table_lock(entries->name, &ret, &arpt_mutex); + if (t) { + duprintf("t->private->number = %u\n", + t->private->number); + if (entries->size == t->private->size) + ret = copy_entries_to_user(t->private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + t->private->size, + entries->size); + ret = -EINVAL; + } + up(&arpt_mutex); + } else + duprintf("get_entries: Can't find %s!\n", + entries->name); + + return ret; +} + +static int do_replace(void __user *user, unsigned int len) +{ + int ret; + struct arpt_replace tmp; + struct arpt_table *t; + struct arpt_table_info *newinfo, *oldinfo; + struct arpt_counters *counters; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* Hack: Causes ipchains to give correct error msg --RR */ + if (len != sizeof(tmp) + tmp.size) + return -ENOPROTOOPT; + + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ + if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) + return -ENOMEM; + + newinfo = vmalloc(sizeof(struct arpt_table_info) + + SMP_ALIGN(tmp.size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + if (copy_from_user(newinfo->entries, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + counters = vmalloc(tmp.num_counters * sizeof(struct arpt_counters)); + if (!counters) { + ret = -ENOMEM; + goto free_newinfo; + } + memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters)); + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo_counters; + + duprintf("arp_tables: Translated table\n"); + + t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex); + if (!t) + goto free_newinfo_counters_untrans; + + /* You lied! */ + if (tmp.valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + tmp.valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto free_newinfo_counters_untrans_unlock; + } + + /* Get a reference in advance, we're not allowed fail later */ + if (!try_module_get(t->me)) { + ret = -EBUSY; + goto free_newinfo_counters_untrans_unlock; + } + + oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if ((oldinfo->number > oldinfo->initial_entries) || + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + if ((oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + + /* Get the old counters. */ + get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ + ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); + vfree(oldinfo); + if (copy_to_user(tmp.counters, counters, + sizeof(struct arpt_counters) * tmp.num_counters) != 0) + ret = -EFAULT; + vfree(counters); + up(&arpt_mutex); + return ret; + + put_module: + module_put(t->me); + free_newinfo_counters_untrans_unlock: + up(&arpt_mutex); + free_newinfo_counters_untrans: + ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); + free_newinfo_counters: + vfree(counters); + free_newinfo: + vfree(newinfo); + return ret; +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. + */ +static inline int add_counter_to_entry(struct arpt_entry *e, + const struct arpt_counters addme[], + unsigned int *i) +{ + + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +static int do_add_counters(void __user *user, unsigned int len) +{ + unsigned int i; + struct arpt_counters_info tmp, *paddc; + struct arpt_table *t; + int ret; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct arpt_counters)) + return -EINVAL; + + paddc = vmalloc(len); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user, len) != 0) { + ret = -EFAULT; + goto free; + } + + t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex); + if (!t) + goto free; + + write_lock_bh(&t->lock); + if (t->private->number != paddc->num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + ARPT_ENTRY_ITERATE(t->private->entries, + t->private->size, + add_counter_to_entry, + paddc->counters, + &i); + unlock_up_free: + write_unlock_bh(&t->lock); + up(&arpt_mutex); + free: + vfree(paddc); + + return ret; +} + +static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case ARPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len); + break; + + default: + duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_GET_INFO: { + char name[ARPT_TABLE_MAXNAMELEN]; + struct arpt_table *t; + + if (*len != sizeof(struct arpt_getinfo)) { + duprintf("length %u != %Zu\n", *len, + sizeof(struct arpt_getinfo)); + ret = -EINVAL; + break; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) { + ret = -EFAULT; + break; + } + name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; + t = arpt_find_table_lock(name, &ret, &arpt_mutex); + if (t) { + struct arpt_getinfo info; + + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, t->private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, t->private->underflow, + sizeof(info.underflow)); + info.num_entries = t->private->number; + info.size = t->private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + + up(&arpt_mutex); + } + } + break; + + case ARPT_SO_GET_ENTRIES: { + struct arpt_get_entries get; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); + ret = -EINVAL; + } else if (copy_from_user(&get, user, sizeof(get)) != 0) { + ret = -EFAULT; + } else if (*len != sizeof(struct arpt_get_entries) + get.size) { + duprintf("get_entries: %u != %Zu\n", *len, + sizeof(struct arpt_get_entries) + get.size); + ret = -EINVAL; + } else + ret = get_entries(&get, user); + break; + } + + default: + duprintf("do_arpt_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +/* Registration hooks for targets. */ +int arpt_register_target(struct arpt_target *target) +{ + int ret; + + ret = down_interruptible(&arpt_mutex); + if (ret != 0) + return ret; + + if (!list_named_insert(&arpt_target, target)) { + duprintf("arpt_register_target: `%s' already in list!\n", + target->name); + ret = -EINVAL; + } + up(&arpt_mutex); + return ret; +} + +void arpt_unregister_target(struct arpt_target *target) +{ + down(&arpt_mutex); + LIST_DELETE(&arpt_target, target); + up(&arpt_mutex); +} + +int arpt_register_table(struct arpt_table *table, + const struct arpt_replace *repl) +{ + int ret; + struct arpt_table_info *newinfo; + static struct arpt_table_info bootstrap + = { 0, 0, 0, { 0 }, { 0 }, { } }; + + newinfo = vmalloc(sizeof(struct arpt_table_info) + + SMP_ALIGN(repl->size) * num_possible_cpus()); + if (!newinfo) { + ret = -ENOMEM; + return ret; + } + memcpy(newinfo->entries, repl->entries, repl->size); + + ret = translate_table(table->name, table->valid_hooks, + newinfo, repl->size, + repl->num_entries, + repl->hook_entry, + repl->underflow); + duprintf("arpt_register_table: translate table gives %d\n", ret); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + ret = down_interruptible(&arpt_mutex); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + /* Don't autoload: we'd eat our tail... */ + if (list_named_find(&arpt_tables, table->name)) { + ret = -EEXIST; + goto free_unlock; + } + + /* Simplifies replace_table code. */ + table->private = &bootstrap; + if (!replace_table(table, 0, newinfo, &ret)) + goto free_unlock; + + duprintf("table->private->number = %u\n", + table->private->number); + + /* save number of initial entries */ + table->private->initial_entries = table->private->number; + + rwlock_init(&table->lock); + list_prepend(&arpt_tables, table); + + unlock: + up(&arpt_mutex); + return ret; + + free_unlock: + vfree(newinfo); + goto unlock; +} + +void arpt_unregister_table(struct arpt_table *table) +{ + down(&arpt_mutex); + LIST_DELETE(&arpt_tables, table); + up(&arpt_mutex); + + /* Decrease module usage counts and free resources */ + ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, + cleanup_entry, NULL); + vfree(table->private); +} + +/* The built-in targets: standard (NULL) and error. */ +static struct arpt_target arpt_standard_target = { + .name = ARPT_STANDARD_TARGET, +}; + +static struct arpt_target arpt_error_target = { + .name = ARPT_ERROR_TARGET, + .target = arpt_error, +}; + +static struct nf_sockopt_ops arpt_sockopts = { + .pf = PF_INET, + .set_optmin = ARPT_BASE_CTL, + .set_optmax = ARPT_SO_SET_MAX+1, + .set = do_arpt_set_ctl, + .get_optmin = ARPT_BASE_CTL, + .get_optmax = ARPT_SO_GET_MAX+1, + .get = do_arpt_get_ctl, +}; + +#ifdef CONFIG_PROC_FS +static inline int print_name(const struct arpt_table *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if ((*count)++ >= start_offset) { + unsigned int namelen; + + namelen = sprintf(buffer + *pos, "%s\n", t->name); + if (*pos + namelen > length) { + /* Stop iterating */ + return 1; + } + *pos += namelen; + } + return 0; +} + +static int arpt_get_tables(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&arpt_mutex) != 0) + return 0; + + LIST_FIND(&arpt_tables, print_name, struct arpt_table *, + offset, buffer, length, &pos, &count); + + up(&arpt_mutex); + + /* `start' hack - see fs/proc/generic.c line ~105 */ + *start=(char *)((unsigned long)count-offset); + return pos; +} +#endif /*CONFIG_PROC_FS*/ + +static int __init init(void) +{ + int ret; + + /* Noone else will be downing sem now, so we won't sleep */ + down(&arpt_mutex); + list_append(&arpt_target, &arpt_standard_target); + list_append(&arpt_target, &arpt_error_target); + up(&arpt_mutex); + + /* Register setsockopt */ + ret = nf_register_sockopt(&arpt_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); + return ret; + } + +#ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *proc; + + proc = proc_net_create("arp_tables_names", 0, arpt_get_tables); + if (!proc) { + nf_unregister_sockopt(&arpt_sockopts); + return -ENOMEM; + } + proc->owner = THIS_MODULE; + } +#endif + + printk("arp_tables: (C) 2002 David S. Miller\n"); + return 0; +} + +static void __exit fini(void) +{ + nf_unregister_sockopt(&arpt_sockopts); +#ifdef CONFIG_PROC_FS + proc_net_remove("arp_tables_names"); +#endif +} + +EXPORT_SYMBOL(arpt_register_table); +EXPORT_SYMBOL(arpt_unregister_table); +EXPORT_SYMBOL(arpt_do_table); +EXPORT_SYMBOL(arpt_register_target); +EXPORT_SYMBOL(arpt_unregister_target); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c new file mode 100644 index 000000000000..3e592ec86482 --- /dev/null +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -0,0 +1,104 @@ +/* module that allows mangling of the arp payload */ +#include <linux/module.h> +#include <linux/netfilter_arp/arpt_mangle.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); +MODULE_DESCRIPTION("arptables arp payload mangle target"); + +static unsigned int +target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in, + const struct net_device *out, const void *targinfo, void *userinfo) +{ + const struct arpt_mangle *mangle = targinfo; + struct arphdr *arp; + unsigned char *arpptr; + int pln, hln; + + if (skb_shared(*pskb) || skb_cloned(*pskb)) { + struct sk_buff *nskb; + + nskb = skb_copy(*pskb, GFP_ATOMIC); + if (!nskb) + return NF_DROP; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + + arp = (*pskb)->nh.arph; + arpptr = (*pskb)->nh.raw + sizeof(*arp); + pln = arp->ar_pln; + hln = arp->ar_hln; + /* We assume that pln and hln were checked in the match */ + if (mangle->flags & ARPT_MANGLE_SDEV) { + if (ARPT_DEV_ADDR_LEN_MAX < hln || + (arpptr + hln > (**pskb).tail)) + return NF_DROP; + memcpy(arpptr, mangle->src_devaddr, hln); + } + arpptr += hln; + if (mangle->flags & ARPT_MANGLE_SIP) { + if (ARPT_MANGLE_ADDR_LEN_MAX < pln || + (arpptr + pln > (**pskb).tail)) + return NF_DROP; + memcpy(arpptr, &mangle->u_s.src_ip, pln); + } + arpptr += pln; + if (mangle->flags & ARPT_MANGLE_TDEV) { + if (ARPT_DEV_ADDR_LEN_MAX < hln || + (arpptr + hln > (**pskb).tail)) + return NF_DROP; + memcpy(arpptr, mangle->tgt_devaddr, hln); + } + arpptr += hln; + if (mangle->flags & ARPT_MANGLE_TIP) { + if (ARPT_MANGLE_ADDR_LEN_MAX < pln || + (arpptr + pln > (**pskb).tail)) + return NF_DROP; + memcpy(arpptr, &mangle->u_t.tgt_ip, pln); + } + return mangle->target; +} + +static int +checkentry(const char *tablename, const struct arpt_entry *e, void *targinfo, + unsigned int targinfosize, unsigned int hook_mask) +{ + const struct arpt_mangle *mangle = targinfo; + + if (mangle->flags & ~ARPT_MANGLE_MASK || + !(mangle->flags & ARPT_MANGLE_MASK)) + return 0; + + if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && + mangle->target != ARPT_CONTINUE) + return 0; + return 1; +} + +static struct arpt_target arpt_mangle_reg += { + .name = "mangle", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (arpt_register_target(&arpt_mangle_reg)) + return -EINVAL; + + return 0; +} + +static void __exit fini(void) +{ + arpt_unregister_target(&arpt_mangle_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c new file mode 100644 index 000000000000..0d759f5a4ef0 --- /dev/null +++ b/net/ipv4/netfilter/arptable_filter.c @@ -0,0 +1,214 @@ +/* + * Filtering ARP tables module. + * + * Copyright (C) 2002 David S. Miller (davem@redhat.com) + * + */ + +#include <linux/module.h> +#include <linux/netfilter_arp/arp_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); +MODULE_DESCRIPTION("arptables filter table"); + +#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ + (1 << NF_ARP_FORWARD)) + +/* Standard entry. */ +struct arpt_standard +{ + struct arpt_entry entry; + struct arpt_standard_target target; +}; + +struct arpt_error_target +{ + struct arpt_entry_target target; + char errorname[ARPT_FUNCTION_MAXNAMELEN]; +}; + +struct arpt_error +{ + struct arpt_entry entry; + struct arpt_error_target target; +}; + +static struct +{ + struct arpt_replace repl; + struct arpt_standard entries[3]; + struct arpt_error term; +} initial_table __initdata += { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error), + { [NF_ARP_IN] = 0, + [NF_ARP_OUT] = sizeof(struct arpt_standard), + [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), }, + { [NF_ARP_IN] = 0, + [NF_ARP_OUT] = sizeof(struct arpt_standard), + [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), }, + 0, NULL, { } }, + { + /* ARP_IN */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_standard), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } + }, + /* ARP_OUT */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_standard), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } + }, + /* ARP_FORWARD */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_standard), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } + } + }, + /* ERROR */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_error), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_error_target)), ARPT_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct arpt_table packet_filter = { + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .private = NULL, + .me = THIS_MODULE, +}; + +/* The work comes in here from netfilter.c */ +static unsigned int arpt_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return arpt_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static struct nf_hook_ops arpt_ops[] = { + { + .hook = arpt_hook, + .owner = THIS_MODULE, + .pf = NF_ARP, + .hooknum = NF_ARP_IN, + }, + { + .hook = arpt_hook, + .owner = THIS_MODULE, + .pf = NF_ARP, + .hooknum = NF_ARP_OUT, + }, + { + .hook = arpt_hook, + .owner = THIS_MODULE, + .pf = NF_ARP, + .hooknum = NF_ARP_FORWARD, + }, +}; + +static int __init init(void) +{ + int ret, i; + + /* Register table */ + ret = arpt_register_table(&packet_filter, &initial_table.repl); + if (ret < 0) + return ret; + + for (i = 0; i < ARRAY_SIZE(arpt_ops); i++) + if ((ret = nf_register_hook(&arpt_ops[i])) < 0) + goto cleanup_hooks; + return ret; + +cleanup_hooks: + while (--i >= 0) + nf_unregister_hook(&arpt_ops[i]); + + arpt_unregister_table(&packet_filter); + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(arpt_ops); i++) + nf_unregister_hook(&arpt_ops[i]); + + arpt_unregister_table(&packet_filter); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c new file mode 100644 index 000000000000..3dbddd062605 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -0,0 +1,167 @@ +/* Amanda extension for IP connection tracking, Version 0.2 + * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> + * based on HW's ip_conntrack_irc.c as well as other modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Module load syntax: + * insmod ip_conntrack_amanda.o [master_timeout=n] + * + * Where master_timeout is the timeout (in seconds) of the master + * connection (port 10080). This defaults to 5 minutes but if + * your clients take longer than 5 minutes to do their work + * before getting back to the Amanda server, you can increase + * this value. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/moduleparam.h> +#include <net/checksum.h> +#include <net/udp.h> + +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> + +static unsigned int master_timeout = 300; + +MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); +MODULE_DESCRIPTION("Amanda connection tracking module"); +MODULE_LICENSE("GPL"); +module_param(master_timeout, int, 0600); +MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); + +static char *conns[] = { "DATA ", "MESG ", "INDEX " }; + +/* This is slow, but it's simple. --RR */ +static char amanda_buffer[65536]; +static DECLARE_LOCK(amanda_buffer_lock); + +unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp); +EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); + +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) +{ + struct ip_conntrack_expect *exp; + char *data, *data_limit, *tmp; + unsigned int dataoff, i; + u_int16_t port, len; + int ret = NF_ACCEPT; + + /* Only look at packets from the Amanda server */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + /* increase the UDP timeout of the master connection as replies from + * Amanda clients to the server can be quite delayed */ + ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ); + + /* No data? */ + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + if (dataoff >= (*pskb)->len) { + if (net_ratelimit()) + printk("amanda_help: skblen = %u\n", (*pskb)->len); + return NF_ACCEPT; + } + + LOCK_BH(&amanda_buffer_lock); + skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); + data = amanda_buffer; + data_limit = amanda_buffer + (*pskb)->len - dataoff; + *data_limit = '\0'; + + /* Search for the CONNECT string */ + data = strstr(data, "CONNECT "); + if (!data) + goto out; + data += strlen("CONNECT "); + + /* Only search first line. */ + if ((tmp = strchr(data, '\n'))) + *tmp = '\0'; + + for (i = 0; i < ARRAY_SIZE(conns); i++) { + char *match = strstr(data, conns[i]); + if (!match) + continue; + tmp = data = match + strlen(conns[i]); + port = simple_strtoul(data, &data, 10); + len = data - tmp; + if (port == 0 || len > 5) + break; + + exp = ip_conntrack_expect_alloc(); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + exp->expectfn = NULL; + exp->master = ct; + + exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; + exp->tuple.src.u.tcp.port = 0; + exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; + exp->tuple.dst.protonum = IPPROTO_TCP; + exp->tuple.dst.u.tcp.port = htons(port); + + exp->mask.src.ip = 0xFFFFFFFF; + exp->mask.src.u.tcp.port = 0; + exp->mask.dst.ip = 0xFFFFFFFF; + exp->mask.dst.protonum = 0xFF; + exp->mask.dst.u.tcp.port = 0xFFFF; + + if (ip_nat_amanda_hook) + ret = ip_nat_amanda_hook(pskb, ctinfo, + tmp - amanda_buffer, + len, exp); + else if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + ret = NF_DROP; + } + } + +out: + UNLOCK_BH(&amanda_buffer_lock); + return ret; +} + +static struct ip_conntrack_helper amanda_helper = { + .max_expected = ARRAY_SIZE(conns), + .timeout = 180, + .me = THIS_MODULE, + .help = help, + .name = "amanda", + + .tuple = { .src = { .u = { __constant_htons(10080) } }, + .dst = { .protonum = IPPROTO_UDP }, + }, + .mask = { .src = { .u = { 0xFFFF } }, + .dst = { .protonum = 0xFF }, + }, +}; + +static void __exit fini(void) +{ + ip_conntrack_helper_unregister(&amanda_helper); +} + +static int __init init(void) +{ + return ip_conntrack_helper_register(&amanda_helper); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c new file mode 100644 index 000000000000..28d9425d5c39 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -0,0 +1,1247 @@ +/* Connection state tracking for netfilter. This is separated from, + but required by, the NAT layer; it can also be used by an iptables + extension. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 23 Apr 2001: Harald Welte <laforge@gnumonks.org> + * - new API and handling of conntrack/nat helpers + * - now capable of multiple expectations for one master + * 16 Jul 2002: Harald Welte <laforge@gnumonks.org> + * - add usage/reference counts to ip_conntrack_expect + * - export ip_conntrack[_expect]_{find_get,put} functions + * */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/icmp.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> +#include <net/checksum.h> +#include <net/ip.h> +#include <linux/stddef.h> +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/err.h> +#include <linux/percpu.h> +#include <linux/moduleparam.h> + +/* This rwlock protects the main hash table, protocol/helper/expected + registrations, conntrack timers*/ +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) + +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#define IP_CONNTRACK_VERSION "2.1" + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_RWLOCK(ip_conntrack_lock); + +/* ip_conntrack_standalone needs this */ +atomic_t ip_conntrack_count = ATOMIC_INIT(0); + +void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; +LIST_HEAD(ip_conntrack_expect_list); +struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; +static LIST_HEAD(helpers); +unsigned int ip_conntrack_htable_size = 0; +int ip_conntrack_max; +struct list_head *ip_conntrack_hash; +static kmem_cache_t *ip_conntrack_cachep; +static kmem_cache_t *ip_conntrack_expect_cachep; +struct ip_conntrack ip_conntrack_untracked; +unsigned int ip_ct_log_invalid; +static LIST_HEAD(unconfirmed); +static int ip_conntrack_vmalloc; + +DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); + +void +ip_conntrack_put(struct ip_conntrack *ct) +{ + IP_NF_ASSERT(ct); + nf_conntrack_put(&ct->ct_general); +} + +static int ip_conntrack_hash_rnd_initted; +static unsigned int ip_conntrack_hash_rnd; + +static u_int32_t +hash_conntrack(const struct ip_conntrack_tuple *tuple) +{ +#if 0 + dump_tuple(tuple); +#endif + return (jhash_3words(tuple->src.ip, + (tuple->dst.ip ^ tuple->dst.protonum), + (tuple->src.u.all | (tuple->dst.u.all << 16)), + ip_conntrack_hash_rnd) % ip_conntrack_htable_size); +} + +int +ip_ct_get_tuple(const struct iphdr *iph, + const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_protocol *protocol) +{ + /* Never happen */ + if (iph->frag_off & htons(IP_OFFSET)) { + printk("ip_conntrack_core: Frag of proto %u.\n", + iph->protocol); + return 0; + } + + tuple->src.ip = iph->saddr; + tuple->dst.ip = iph->daddr; + tuple->dst.protonum = iph->protocol; + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + return protocol->pkt_to_tuple(skb, dataoff, tuple); +} + +int +ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, + const struct ip_conntrack_tuple *orig, + const struct ip_conntrack_protocol *protocol) +{ + inverse->src.ip = orig->dst.ip; + inverse->dst.ip = orig->src.ip; + inverse->dst.protonum = orig->dst.protonum; + inverse->dst.dir = !orig->dst.dir; + + return protocol->invert_tuple(inverse, orig); +} + + +/* ip_conntrack_expect helper functions */ +static void destroy_expect(struct ip_conntrack_expect *exp) +{ + ip_conntrack_put(exp->master); + IP_NF_ASSERT(!timer_pending(&exp->timeout)); + kmem_cache_free(ip_conntrack_expect_cachep, exp); + CONNTRACK_STAT_INC(expect_delete); +} + +static void unlink_expect(struct ip_conntrack_expect *exp) +{ + MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); + list_del(&exp->list); + /* Logically in destroy_expect, but we hold the lock here. */ + exp->master->expecting--; +} + +static void expectation_timed_out(unsigned long ul_expect) +{ + struct ip_conntrack_expect *exp = (void *)ul_expect; + + WRITE_LOCK(&ip_conntrack_lock); + unlink_expect(exp); + WRITE_UNLOCK(&ip_conntrack_lock); + destroy_expect(exp); +} + +/* If an expectation for this connection is found, it gets delete from + * global list then returned. */ +static struct ip_conntrack_expect * +find_expectation(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + /* If master is not in hash table yet (ie. packet hasn't left + this machine yet), how can other end know about expected? + Hence these are not the droids you are looking for (if + master ct never got confirmed, we'd hold a reference to it + and weird things would happen to future packets). */ + if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) + && is_confirmed(i->master) + && del_timer(&i->timeout)) { + unlink_expect(i); + return i; + } + } + return NULL; +} + +/* delete all expectations for this conntrack */ +static void remove_expectations(struct ip_conntrack *ct) +{ + struct ip_conntrack_expect *i, *tmp; + + /* Optimization: most connection never expect any others. */ + if (ct->expecting == 0) + return; + + list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { + if (i->master == ct && del_timer(&i->timeout)) { + unlink_expect(i); + destroy_expect(i); + } + } +} + +static void +clean_from_lists(struct ip_conntrack *ct) +{ + unsigned int ho, hr; + + DEBUGP("clean_from_lists(%p)\n", ct); + MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); + + ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); + + /* Destroy all pending expectations */ + remove_expectations(ct); +} + +static void +destroy_conntrack(struct nf_conntrack *nfct) +{ + struct ip_conntrack *ct = (struct ip_conntrack *)nfct; + struct ip_conntrack_protocol *proto; + + DEBUGP("destroy_conntrack(%p)\n", ct); + IP_NF_ASSERT(atomic_read(&nfct->use) == 0); + IP_NF_ASSERT(!timer_pending(&ct->timeout)); + + /* To make sure we don't get any weird locking issues here: + * destroy_conntrack() MUST NOT be called with a write lock + * to ip_conntrack_lock!!! -HW */ + proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + if (proto && proto->destroy) + proto->destroy(ct); + + if (ip_conntrack_destroyed) + ip_conntrack_destroyed(ct); + + WRITE_LOCK(&ip_conntrack_lock); + /* Expectations will have been removed in clean_from_lists, + * except TFTP can create an expectation on the first packet, + * before connection is in the list, so we need to clean here, + * too. */ + remove_expectations(ct); + + /* We overload first tuple to link into unconfirmed list. */ + if (!is_confirmed(ct)) { + BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + } + + CONNTRACK_STAT_INC(delete); + WRITE_UNLOCK(&ip_conntrack_lock); + + if (ct->master) + ip_conntrack_put(ct->master); + + DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); + kmem_cache_free(ip_conntrack_cachep, ct); + atomic_dec(&ip_conntrack_count); +} + +static void death_by_timeout(unsigned long ul_conntrack) +{ + struct ip_conntrack *ct = (void *)ul_conntrack; + + WRITE_LOCK(&ip_conntrack_lock); + /* Inside lock so preempt is disabled on module removal path. + * Otherwise we can get spurious warnings. */ + CONNTRACK_STAT_INC(delete_list); + clean_from_lists(ct); + WRITE_UNLOCK(&ip_conntrack_lock); + ip_conntrack_put(ct); +} + +static inline int +conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, + const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + return tuplehash_to_ctrack(i) != ignored_conntrack + && ip_ct_tuple_equal(tuple, &i->tuple); +} + +static struct ip_conntrack_tuple_hash * +__ip_conntrack_find(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + struct ip_conntrack_tuple_hash *h; + unsigned int hash = hash_conntrack(tuple); + + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + list_for_each_entry(h, &ip_conntrack_hash[hash], list) { + if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { + CONNTRACK_STAT_INC(found); + return h; + } + CONNTRACK_STAT_INC(searched); + } + + return NULL; +} + +/* Find a connection corresponding to a tuple. */ +struct ip_conntrack_tuple_hash * +ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + struct ip_conntrack_tuple_hash *h; + + READ_LOCK(&ip_conntrack_lock); + h = __ip_conntrack_find(tuple, ignored_conntrack); + if (h) + atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); + READ_UNLOCK(&ip_conntrack_lock); + + return h; +} + +/* Confirm a connection given skb; places it in hash table */ +int +__ip_conntrack_confirm(struct sk_buff **pskb) +{ + unsigned int hash, repl_hash; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + ct = ip_conntrack_get(*pskb, &ctinfo); + + /* ipt_REJECT uses ip_conntrack_attach to attach related + ICMP/TCP RST packets in other direction. Actual packet + which created connection will be IP_CT_NEW or for an + expected connection, IP_CT_RELATED. */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + /* We're not in hash table, and we refuse to set up related + connections for unconfirmed conns. But packet copies and + REJECT will give spurious warnings here. */ + /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ + + /* No external references means noone else could have + confirmed us. */ + IP_NF_ASSERT(!is_confirmed(ct)); + DEBUGP("Confirming conntrack %p\n", ct); + + WRITE_LOCK(&ip_conntrack_lock); + + /* See if there's one in the list already, including reverse: + NAT could have grabbed it without realizing, since we're + not in the hash. If there is, we lost race. */ + if (!LIST_FIND(&ip_conntrack_hash[hash], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) + && !LIST_FIND(&ip_conntrack_hash[repl_hash], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { + /* Remove from unconfirmed list */ + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + + list_prepend(&ip_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + list_prepend(&ip_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY]); + /* Timer relative to confirmation time, not original + setting time, otherwise we'd get timer wrap in + weird delay cases. */ + ct->timeout.expires += jiffies; + add_timer(&ct->timeout); + atomic_inc(&ct->ct_general.use); + set_bit(IPS_CONFIRMED_BIT, &ct->status); + CONNTRACK_STAT_INC(insert); + WRITE_UNLOCK(&ip_conntrack_lock); + return NF_ACCEPT; + } + + CONNTRACK_STAT_INC(insert_failed); + WRITE_UNLOCK(&ip_conntrack_lock); + + return NF_DROP; +} + +/* Returns true if a connection correspondings to the tuple (required + for NAT). */ +int +ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + struct ip_conntrack_tuple_hash *h; + + READ_LOCK(&ip_conntrack_lock); + h = __ip_conntrack_find(tuple, ignored_conntrack); + READ_UNLOCK(&ip_conntrack_lock); + + return h != NULL; +} + +/* There's a small race here where we may free a just-assured + connection. Too bad: we're in trouble anyway. */ +static inline int unreplied(const struct ip_conntrack_tuple_hash *i) +{ + return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status)); +} + +static int early_drop(struct list_head *chain) +{ + /* Traverse backwards: gives us oldest, which is roughly LRU */ + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack *ct = NULL; + int dropped = 0; + + READ_LOCK(&ip_conntrack_lock); + h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *); + if (h) { + ct = tuplehash_to_ctrack(h); + atomic_inc(&ct->ct_general.use); + } + READ_UNLOCK(&ip_conntrack_lock); + + if (!ct) + return dropped; + + if (del_timer(&ct->timeout)) { + death_by_timeout((unsigned long)ct); + dropped = 1; + CONNTRACK_STAT_INC(early_drop); + } + ip_conntrack_put(ct); + return dropped; +} + +static inline int helper_cmp(const struct ip_conntrack_helper *i, + const struct ip_conntrack_tuple *rtuple) +{ + return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); +} + +static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) +{ + return LIST_FIND(&helpers, helper_cmp, + struct ip_conntrack_helper *, + tuple); +} + +/* Allocate a new conntrack: we return -ENOMEM if classification + failed due to stress. Otherwise it really is unclassifiable. */ +static struct ip_conntrack_tuple_hash * +init_conntrack(const struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *protocol, + struct sk_buff *skb) +{ + struct ip_conntrack *conntrack; + struct ip_conntrack_tuple repl_tuple; + size_t hash; + struct ip_conntrack_expect *exp; + + if (!ip_conntrack_hash_rnd_initted) { + get_random_bytes(&ip_conntrack_hash_rnd, 4); + ip_conntrack_hash_rnd_initted = 1; + } + + hash = hash_conntrack(tuple); + + if (ip_conntrack_max + && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { + /* Try dropping from this hash chain. */ + if (!early_drop(&ip_conntrack_hash[hash])) { + if (net_ratelimit()) + printk(KERN_WARNING + "ip_conntrack: table full, dropping" + " packet.\n"); + return ERR_PTR(-ENOMEM); + } + } + + if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + + conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); + if (!conntrack) { + DEBUGP("Can't allocate conntrack.\n"); + return ERR_PTR(-ENOMEM); + } + + memset(conntrack, 0, sizeof(*conntrack)); + atomic_set(&conntrack->ct_general.use, 1); + conntrack->ct_general.destroy = destroy_conntrack; + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; + if (!protocol->new(conntrack, skb)) { + kmem_cache_free(ip_conntrack_cachep, conntrack); + return NULL; + } + /* Don't set timer yet: wait for confirmation */ + init_timer(&conntrack->timeout); + conntrack->timeout.data = (unsigned long)conntrack; + conntrack->timeout.function = death_by_timeout; + + WRITE_LOCK(&ip_conntrack_lock); + exp = find_expectation(tuple); + + if (exp) { + DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", + conntrack, exp); + /* Welcome, Mr. Bond. We've been expecting you... */ + __set_bit(IPS_EXPECTED_BIT, &conntrack->status); + conntrack->master = exp->master; +#if CONFIG_IP_NF_CONNTRACK_MARK + conntrack->mark = exp->master->mark; +#endif + nf_conntrack_get(&conntrack->master->ct_general); + CONNTRACK_STAT_INC(expect_new); + } else { + conntrack->helper = ip_ct_find_helper(&repl_tuple); + + CONNTRACK_STAT_INC(new); + } + + /* Overload tuple linked list to put us in unconfirmed list. */ + list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); + + atomic_inc(&ip_conntrack_count); + WRITE_UNLOCK(&ip_conntrack_lock); + + if (exp) { + if (exp->expectfn) + exp->expectfn(conntrack, exp); + destroy_expect(exp); + } + + return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; +} + +/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ +static inline struct ip_conntrack * +resolve_normal_ct(struct sk_buff *skb, + struct ip_conntrack_protocol *proto, + int *set_reply, + unsigned int hooknum, + enum ip_conntrack_info *ctinfo) +{ + struct ip_conntrack_tuple tuple; + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack *ct; + + IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0); + + if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, + &tuple,proto)) + return NULL; + + /* look for tuple match */ + h = ip_conntrack_find_get(&tuple, NULL); + if (!h) { + h = init_conntrack(&tuple, proto, skb); + if (!h) + return NULL; + if (IS_ERR(h)) + return (void *)h; + } + ct = tuplehash_to_ctrack(h); + + /* It exists; we have (non-exclusive) reference. */ + if (DIRECTION(h) == IP_CT_DIR_REPLY) { + *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; + /* Please set reply bit if this packet OK */ + *set_reply = 1; + } else { + /* Once we've had two way comms, always ESTABLISHED. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + DEBUGP("ip_conntrack_in: normal packet for %p\n", + ct); + *ctinfo = IP_CT_ESTABLISHED; + } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { + DEBUGP("ip_conntrack_in: related packet for %p\n", + ct); + *ctinfo = IP_CT_RELATED; + } else { + DEBUGP("ip_conntrack_in: new packet for %p\n", + ct); + *ctinfo = IP_CT_NEW; + } + *set_reply = 0; + } + skb->nfct = &ct->ct_general; + skb->nfctinfo = *ctinfo; + return ct; +} + +/* Netfilter hook itself. */ +unsigned int ip_conntrack_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + struct ip_conntrack_protocol *proto; + int set_reply; + int ret; + + /* Previously seen (loopback or untracked)? Ignore. */ + if ((*pskb)->nfct) { + CONNTRACK_STAT_INC(ignore); + return NF_ACCEPT; + } + + /* Never happen */ + if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { + if (net_ratelimit()) { + printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n", + (*pskb)->nh.iph->protocol, hooknum); + } + return NF_DROP; + } + + /* FIXME: Do this right please. --RR */ + (*pskb)->nfcache |= NFC_UNKNOWN; + +/* Doesn't cover locally-generated broadcast, so not worth it. */ +#if 0 + /* Ignore broadcast: no `connection'. */ + if ((*pskb)->pkt_type == PACKET_BROADCAST) { + printk("Broadcast packet!\n"); + return NF_ACCEPT; + } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) + == htonl(0x000000FF)) { + printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n", + NIPQUAD((*pskb)->nh.iph->saddr), + NIPQUAD((*pskb)->nh.iph->daddr), + (*pskb)->sk, (*pskb)->pkt_type); + } +#endif + + proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); + + /* It may be an special packet, error, unclean... + * inverse of the return code tells to the netfilter + * core what to do with the packet. */ + if (proto->error != NULL + && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) { + CONNTRACK_STAT_INC(error); + CONNTRACK_STAT_INC(invalid); + return -ret; + } + + if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) { + /* Not valid part of a connection */ + CONNTRACK_STAT_INC(invalid); + return NF_ACCEPT; + } + + if (IS_ERR(ct)) { + /* Too stressed to deal. */ + CONNTRACK_STAT_INC(drop); + return NF_DROP; + } + + IP_NF_ASSERT((*pskb)->nfct); + + ret = proto->packet(ct, *pskb, ctinfo); + if (ret < 0) { + /* Invalid: inverse of the return code tells + * the netfilter core what to do*/ + nf_conntrack_put((*pskb)->nfct); + (*pskb)->nfct = NULL; + CONNTRACK_STAT_INC(invalid); + return -ret; + } + + if (set_reply) + set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + + return ret; +} + +int invert_tuplepr(struct ip_conntrack_tuple *inverse, + const struct ip_conntrack_tuple *orig) +{ + return ip_ct_invert_tuple(inverse, orig, + ip_ct_find_proto(orig->dst.protonum)); +} + +/* Would two expected things clash? */ +static inline int expect_clash(const struct ip_conntrack_expect *a, + const struct ip_conntrack_expect *b) +{ + /* Part covered by intersection of masks must be unequal, + otherwise they clash */ + struct ip_conntrack_tuple intersect_mask + = { { a->mask.src.ip & b->mask.src.ip, + { a->mask.src.u.all & b->mask.src.u.all } }, + { a->mask.dst.ip & b->mask.dst.ip, + { a->mask.dst.u.all & b->mask.dst.u.all }, + a->mask.dst.protonum & b->mask.dst.protonum } }; + + return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); +} + +static inline int expect_matches(const struct ip_conntrack_expect *a, + const struct ip_conntrack_expect *b) +{ + return a->master == b->master + && ip_ct_tuple_equal(&a->tuple, &b->tuple) + && ip_ct_tuple_equal(&a->mask, &b->mask); +} + +/* Generally a bad idea to call this: could have matched already. */ +void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp) +{ + struct ip_conntrack_expect *i; + + WRITE_LOCK(&ip_conntrack_lock); + /* choose the the oldest expectation to evict */ + list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { + if (expect_matches(i, exp) && del_timer(&i->timeout)) { + unlink_expect(i); + WRITE_UNLOCK(&ip_conntrack_lock); + destroy_expect(i); + return; + } + } + WRITE_UNLOCK(&ip_conntrack_lock); +} + +struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) +{ + struct ip_conntrack_expect *new; + + new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC); + if (!new) { + DEBUGP("expect_related: OOM allocating expect\n"); + return NULL; + } + new->master = NULL; + return new; +} + +void ip_conntrack_expect_free(struct ip_conntrack_expect *expect) +{ + kmem_cache_free(ip_conntrack_expect_cachep, expect); +} + +static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) +{ + atomic_inc(&exp->master->ct_general.use); + exp->master->expecting++; + list_add(&exp->list, &ip_conntrack_expect_list); + + if (exp->master->helper->timeout) { + init_timer(&exp->timeout); + exp->timeout.data = (unsigned long)exp; + exp->timeout.function = expectation_timed_out; + exp->timeout.expires + = jiffies + exp->master->helper->timeout * HZ; + add_timer(&exp->timeout); + } else + exp->timeout.function = NULL; + + CONNTRACK_STAT_INC(expect_create); +} + +/* Race with expectations being used means we could have none to find; OK. */ +static void evict_oldest_expect(struct ip_conntrack *master) +{ + struct ip_conntrack_expect *i; + + list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { + if (i->master == master) { + if (del_timer(&i->timeout)) { + unlink_expect(i); + destroy_expect(i); + } + break; + } + } +} + +static inline int refresh_timer(struct ip_conntrack_expect *i) +{ + if (!del_timer(&i->timeout)) + return 0; + + i->timeout.expires = jiffies + i->master->helper->timeout*HZ; + add_timer(&i->timeout); + return 1; +} + +int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) +{ + struct ip_conntrack_expect *i; + int ret; + + DEBUGP("ip_conntrack_expect_related %p\n", related_to); + DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); + DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); + + WRITE_LOCK(&ip_conntrack_lock); + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. */ + if (refresh_timer(i)) { + ret = 0; + /* We don't need the one they've given us. */ + ip_conntrack_expect_free(expect); + goto out; + } + } else if (expect_clash(i, expect)) { + ret = -EBUSY; + goto out; + } + } + + /* Will be over limit? */ + if (expect->master->helper->max_expected && + expect->master->expecting >= expect->master->helper->max_expected) + evict_oldest_expect(expect->master); + + ip_conntrack_expect_insert(expect); + ret = 0; +out: + WRITE_UNLOCK(&ip_conntrack_lock); + return ret; +} + +/* Alter reply tuple (maybe alter helper). This is for NAT, and is + implicitly racy: see __ip_conntrack_confirm */ +void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, + const struct ip_conntrack_tuple *newreply) +{ + WRITE_LOCK(&ip_conntrack_lock); + /* Should be unconfirmed, so not in hash table yet */ + IP_NF_ASSERT(!is_confirmed(conntrack)); + + DEBUGP("Altering reply tuple of %p to ", conntrack); + DUMP_TUPLE(newreply); + + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; + if (!conntrack->master && conntrack->expecting == 0) + conntrack->helper = ip_ct_find_helper(newreply); + WRITE_UNLOCK(&ip_conntrack_lock); +} + +int ip_conntrack_helper_register(struct ip_conntrack_helper *me) +{ + BUG_ON(me->timeout == 0); + WRITE_LOCK(&ip_conntrack_lock); + list_prepend(&helpers, me); + WRITE_UNLOCK(&ip_conntrack_lock); + + return 0; +} + +static inline int unhelp(struct ip_conntrack_tuple_hash *i, + const struct ip_conntrack_helper *me) +{ + if (tuplehash_to_ctrack(i)->helper == me) + tuplehash_to_ctrack(i)->helper = NULL; + return 0; +} + +void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) +{ + unsigned int i; + struct ip_conntrack_expect *exp, *tmp; + + /* Need write lock here, to delete helper. */ + WRITE_LOCK(&ip_conntrack_lock); + LIST_DELETE(&helpers, me); + + /* Get rid of expectations */ + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { + if (exp->master->helper == me && del_timer(&exp->timeout)) { + unlink_expect(exp); + destroy_expect(exp); + } + } + /* Get rid of expecteds, set helpers to NULL. */ + LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me); + for (i = 0; i < ip_conntrack_htable_size; i++) + LIST_FIND_W(&ip_conntrack_hash[i], unhelp, + struct ip_conntrack_tuple_hash *, me); + WRITE_UNLOCK(&ip_conntrack_lock); + + /* Someone could be still looking at the helper in a bh. */ + synchronize_net(); +} + +static inline void ct_add_counters(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb) +{ +#ifdef CONFIG_IP_NF_CT_ACCT + if (skb) { + ct->counters[CTINFO2DIR(ctinfo)].packets++; + ct->counters[CTINFO2DIR(ctinfo)].bytes += + ntohs(skb->nh.iph->tot_len); + } +#endif +} + +/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */ +void ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies) +{ + IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct); + + /* If not in hash table, timer will not be active yet */ + if (!is_confirmed(ct)) { + ct->timeout.expires = extra_jiffies; + ct_add_counters(ct, ctinfo, skb); + } else { + WRITE_LOCK(&ip_conntrack_lock); + /* Need del_timer for race avoidance (may already be dying). */ + if (del_timer(&ct->timeout)) { + ct->timeout.expires = jiffies + extra_jiffies; + add_timer(&ct->timeout); + } + ct_add_counters(ct, ctinfo, skb); + WRITE_UNLOCK(&ip_conntrack_lock); + } +} + +/* Returns new sk_buff, or NULL */ +struct sk_buff * +ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + struct sock *sk = skb->sk; +#ifdef CONFIG_NETFILTER_DEBUG + unsigned int olddebug = skb->nf_debug; +#endif + + if (sk) { + sock_hold(sk); + skb_orphan(skb); + } + + local_bh_disable(); + skb = ip_defrag(skb, user); + local_bh_enable(); + + if (!skb) { + if (sk) + sock_put(sk); + return skb; + } + + if (sk) { + skb_set_owner_w(skb, sk); + sock_put(sk); + } + + ip_send_check(skb->nh.iph); + skb->nfcache |= NFC_ALTERED; +#ifdef CONFIG_NETFILTER_DEBUG + /* Packet path as if nothing had happened. */ + skb->nf_debug = olddebug; +#endif + return skb; +} + +/* Used by ipt_REJECT. */ +static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + /* This ICMP is in reverse direction to the packet which caused it */ + ct = ip_conntrack_get(skb, &ctinfo); + + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; + else + ctinfo = IP_CT_RELATED; + + /* Attach to new skbuff, and increment count */ + nskb->nfct = &ct->ct_general; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nskb->nfct); +} + +static inline int +do_iter(const struct ip_conntrack_tuple_hash *i, + int (*iter)(struct ip_conntrack *i, void *data), + void *data) +{ + return iter(tuplehash_to_ctrack(i), data); +} + +/* Bring out ya dead! */ +static struct ip_conntrack_tuple_hash * +get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data), + void *data, unsigned int *bucket) +{ + struct ip_conntrack_tuple_hash *h = NULL; + + WRITE_LOCK(&ip_conntrack_lock); + for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { + h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, + struct ip_conntrack_tuple_hash *, iter, data); + if (h) + break; + } + if (!h) + h = LIST_FIND_W(&unconfirmed, do_iter, + struct ip_conntrack_tuple_hash *, iter, data); + if (h) + atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); + WRITE_UNLOCK(&ip_conntrack_lock); + + return h; +} + +void +ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data) +{ + struct ip_conntrack_tuple_hash *h; + unsigned int bucket = 0; + + while ((h = get_next_corpse(iter, data, &bucket)) != NULL) { + struct ip_conntrack *ct = tuplehash_to_ctrack(h); + /* Time to push up daises... */ + if (del_timer(&ct->timeout)) + death_by_timeout((unsigned long)ct); + /* ... else the timer will get him soon. */ + + ip_conntrack_put(ct); + } +} + +/* Fast function for those who don't want to parse /proc (and I don't + blame them). */ +/* Reversing the socket's dst/src point of view gives us the reply + mapping. */ +static int +getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple; + + IP_CT_TUPLE_U_BLANK(&tuple); + tuple.src.ip = inet->rcv_saddr; + tuple.src.u.tcp.port = inet->sport; + tuple.dst.ip = inet->daddr; + tuple.dst.u.tcp.port = inet->dport; + tuple.dst.protonum = IPPROTO_TCP; + + /* We only do TCP at the moment: is there a better way? */ + if (strcmp(sk->sk_prot->name, "TCP")) { + DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int) *len < sizeof(struct sockaddr_in)) { + DEBUGP("SO_ORIGINAL_DST: len %u not %u\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = ip_conntrack_find_get(&tuple, NULL); + if (h) { + struct sockaddr_in sin; + struct ip_conntrack *ct = tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.ip; + + DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + ip_conntrack_put(ct); + if (copy_to_user(user, &sin, sizeof(sin)) != 0) + return -EFAULT; + else + return 0; + } + DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", + NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port), + NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} + +static struct nf_sockopt_ops so_getorigdst = { + .pf = PF_INET, + .get_optmin = SO_ORIGINAL_DST, + .get_optmax = SO_ORIGINAL_DST+1, + .get = &getorigdst, +}; + +static int kill_all(struct ip_conntrack *i, void *data) +{ + return 1; +} + +static void free_conntrack_hash(void) +{ + if (ip_conntrack_vmalloc) + vfree(ip_conntrack_hash); + else + free_pages((unsigned long)ip_conntrack_hash, + get_order(sizeof(struct list_head) + * ip_conntrack_htable_size)); +} + +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void ip_conntrack_cleanup(void) +{ + ip_ct_attach = NULL; + /* This makes sure all current packets have passed through + netfilter framework. Roll on, two-stage module + delete... */ + synchronize_net(); + + i_see_dead_people: + ip_ct_iterate_cleanup(kill_all, NULL); + if (atomic_read(&ip_conntrack_count) != 0) { + schedule(); + goto i_see_dead_people; + } + + kmem_cache_destroy(ip_conntrack_cachep); + kmem_cache_destroy(ip_conntrack_expect_cachep); + free_conntrack_hash(); + nf_unregister_sockopt(&so_getorigdst); +} + +static int hashsize; +module_param(hashsize, int, 0400); + +int __init ip_conntrack_init(void) +{ + unsigned int i; + int ret; + + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB + * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ + if (hashsize) { + ip_conntrack_htable_size = hashsize; + } else { + ip_conntrack_htable_size + = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct list_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + ip_conntrack_htable_size = 8192; + if (ip_conntrack_htable_size < 16) + ip_conntrack_htable_size = 16; + } + ip_conntrack_max = 8 * ip_conntrack_htable_size; + + printk("ip_conntrack version %s (%u buckets, %d max)" + " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION, + ip_conntrack_htable_size, ip_conntrack_max, + sizeof(struct ip_conntrack)); + + ret = nf_register_sockopt(&so_getorigdst); + if (ret != 0) { + printk(KERN_ERR "Unable to register netfilter socket option\n"); + return ret; + } + + /* AK: the hash table is twice as big than needed because it + uses list_head. it would be much nicer to caches to use a + single pointer list head here. */ + ip_conntrack_vmalloc = 0; + ip_conntrack_hash + =(void*)__get_free_pages(GFP_KERNEL, + get_order(sizeof(struct list_head) + *ip_conntrack_htable_size)); + if (!ip_conntrack_hash) { + ip_conntrack_vmalloc = 1; + printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n"); + ip_conntrack_hash = vmalloc(sizeof(struct list_head) + * ip_conntrack_htable_size); + } + if (!ip_conntrack_hash) { + printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); + goto err_unreg_sockopt; + } + + ip_conntrack_cachep = kmem_cache_create("ip_conntrack", + sizeof(struct ip_conntrack), 0, + 0, NULL, NULL); + if (!ip_conntrack_cachep) { + printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); + goto err_free_hash; + } + + ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect", + sizeof(struct ip_conntrack_expect), + 0, 0, NULL, NULL); + if (!ip_conntrack_expect_cachep) { + printk(KERN_ERR "Unable to create ip_expect slab cache\n"); + goto err_free_conntrack_slab; + } + + /* Don't NEED lock here, but good form anyway. */ + WRITE_LOCK(&ip_conntrack_lock); + for (i = 0; i < MAX_IP_CT_PROTO; i++) + ip_ct_protos[i] = &ip_conntrack_generic_protocol; + /* Sew in builtin protocols. */ + ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; + ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; + ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; + WRITE_UNLOCK(&ip_conntrack_lock); + + for (i = 0; i < ip_conntrack_htable_size; i++) + INIT_LIST_HEAD(&ip_conntrack_hash[i]); + + /* For use by ipt_REJECT */ + ip_ct_attach = ip_conntrack_attach; + + /* Set up fake conntrack: + - to never be deleted, not in any hashes */ + atomic_set(&ip_conntrack_untracked.ct_general.use, 1); + /* - and look it like as a confirmed connection */ + set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); + + return ret; + +err_free_conntrack_slab: + kmem_cache_destroy(ip_conntrack_cachep); +err_free_hash: + free_conntrack_hash(); +err_unreg_sockopt: + nf_unregister_sockopt(&so_getorigdst); + + return -ENOMEM; +} diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c new file mode 100644 index 000000000000..12b88cbb11db --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -0,0 +1,501 @@ +/* FTP extension for IP connection tracking. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/ctype.h> +#include <net/checksum.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> +#include <linux/moduleparam.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +MODULE_DESCRIPTION("ftp connection tracking helper"); + +/* This is slow, but it's simple. --RR */ +static char ftp_buffer[65536]; + +static DECLARE_LOCK(ip_ftp_lock); + +#define MAX_PORTS 8 +static int ports[MAX_PORTS]; +static int ports_c; +module_param_array(ports, int, &ports_c, 0400); + +static int loose; +module_param(loose, int, 0600); + +unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + enum ip_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp, + u32 *seq); +EXPORT_SYMBOL_GPL(ip_nat_ftp_hook); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int try_rfc959(const char *, size_t, u_int32_t [], char); +static int try_eprt(const char *, size_t, u_int32_t [], char); +static int try_epsv_response(const char *, size_t, u_int32_t [], char); + +static struct ftp_search { + enum ip_conntrack_dir dir; + const char *pattern; + size_t plen; + char skip; + char term; + enum ip_ct_ftp_type ftptype; + int (*getnum)(const char *, size_t, u_int32_t[], char); +} search[] = { + { + IP_CT_DIR_ORIGINAL, + "PORT", sizeof("PORT") - 1, ' ', '\r', + IP_CT_FTP_PORT, + try_rfc959, + }, + { + IP_CT_DIR_REPLY, + "227 ", sizeof("227 ") - 1, '(', ')', + IP_CT_FTP_PASV, + try_rfc959, + }, + { + IP_CT_DIR_ORIGINAL, + "EPRT", sizeof("EPRT") - 1, ' ', '\r', + IP_CT_FTP_EPRT, + try_eprt, + }, + { + IP_CT_DIR_REPLY, + "229 ", sizeof("229 ") - 1, '(', ')', + IP_CT_FTP_EPSV, + try_epsv_response, + }, +}; + +static int try_number(const char *data, size_t dlen, u_int32_t array[], + int array_size, char sep, char term) +{ + u_int32_t i, len; + + memset(array, 0, sizeof(array[0])*array_size); + + /* Keep data pointing at next char. */ + for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) { + if (*data >= '0' && *data <= '9') { + array[i] = array[i]*10 + *data - '0'; + } + else if (*data == sep) + i++; + else { + /* Unexpected character; true if it's the + terminator and we're finished. */ + if (*data == term && i == array_size - 1) + return len; + + DEBUGP("Char %u (got %u nums) `%u' unexpected\n", + len, i, *data); + return 0; + } + } + DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep); + + return 0; +} + +/* Returns 0, or length of numbers: 192,168,1,1,5,6 */ +static int try_rfc959(const char *data, size_t dlen, u_int32_t array[6], + char term) +{ + return try_number(data, dlen, array, 6, ',', term); +} + +/* Grab port: number up to delimiter */ +static int get_port(const char *data, int start, size_t dlen, char delim, + u_int32_t array[2]) +{ + u_int16_t port = 0; + int i; + + for (i = start; i < dlen; i++) { + /* Finished? */ + if (data[i] == delim) { + if (port == 0) + break; + array[0] = port >> 8; + array[1] = port; + return i + 1; + } + else if (data[i] >= '0' && data[i] <= '9') + port = port*10 + data[i] - '0'; + else /* Some other crap */ + break; + } + return 0; +} + +/* Returns 0, or length of numbers: |1|132.235.1.2|6275| */ +static int try_eprt(const char *data, size_t dlen, u_int32_t array[6], + char term) +{ + char delim; + int length; + + /* First character is delimiter, then "1" for IPv4, then + delimiter again. */ + if (dlen <= 3) return 0; + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 + || data[1] != '1' || data[2] != delim) + return 0; + + DEBUGP("EPRT: Got |1|!\n"); + /* Now we have IP address. */ + length = try_number(data + 3, dlen - 3, array, 4, '.', delim); + if (length == 0) + return 0; + + DEBUGP("EPRT: Got IP address!\n"); + /* Start offset includes initial "|1|", and trailing delimiter */ + return get_port(data, 3 + length + 1, dlen, delim, array+4); +} + +/* Returns 0, or length of numbers: |||6446| */ +static int try_epsv_response(const char *data, size_t dlen, u_int32_t array[6], + char term) +{ + char delim; + + /* Three delimiters. */ + if (dlen <= 3) return 0; + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 + || data[1] != delim || data[2] != delim) + return 0; + + return get_port(data, 3, dlen, delim, array+4); +} + +/* Return 1 for match, 0 for accept, -1 for partial. */ +static int find_pattern(const char *data, size_t dlen, + const char *pattern, size_t plen, + char skip, char term, + unsigned int *numoff, + unsigned int *numlen, + u_int32_t array[6], + int (*getnum)(const char *, size_t, u_int32_t[], char)) +{ + size_t i; + + DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen); + if (dlen == 0) + return 0; + + if (dlen <= plen) { + /* Short packet: try for partial? */ + if (strnicmp(data, pattern, dlen) == 0) + return -1; + else return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { +#if 0 + size_t i; + + DEBUGP("ftp: string mismatch\n"); + for (i = 0; i < plen; i++) { + DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n", + i, data[i], data[i], + pattern[i], pattern[i]); + } +#endif + return 0; + } + + DEBUGP("Pattern matches!\n"); + /* Now we've found the constant string, try to skip + to the 'skip' character */ + for (i = plen; data[i] != skip; i++) + if (i == dlen - 1) return -1; + + /* Skip over the last character */ + i++; + + DEBUGP("Skipped up to `%c'!\n", skip); + + *numoff = i; + *numlen = getnum(data + i, dlen - i, array, term); + if (!*numlen) + return -1; + + DEBUGP("Match succeeded!\n"); + return 1; +} + +/* Look up to see if we're just after a \n. */ +static int find_nl_seq(u16 seq, const struct ip_ct_ftp_master *info, int dir) +{ + unsigned int i; + + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) + if (info->seq_aft_nl[dir][i] == seq) + return 1; + return 0; +} + +/* We don't update if it's older than what we have. */ +static void update_nl_seq(u16 nl_seq, struct ip_ct_ftp_master *info, int dir) +{ + unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; + + /* Look for oldest: if we find exact match, we're done. */ + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { + if (info->seq_aft_nl[dir][i] == nl_seq) + return; + + if (oldest == info->seq_aft_nl_num[dir] + || before(info->seq_aft_nl[dir][i], oldest)) + oldest = i; + } + + if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) + info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; + else if (oldest != NUM_SEQ_TO_REMEMBER) + info->seq_aft_nl[dir][oldest] = nl_seq; +} + +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + struct tcphdr _tcph, *th; + char *fb_ptr; + int ret; + u32 seq, array[6] = { 0 }; + int dir = CTINFO2DIR(ctinfo); + unsigned int matchlen, matchoff; + struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info; + struct ip_conntrack_expect *exp; + unsigned int i; + int found = 0, ends_in_nl; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED + && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { + DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo); + return NF_ACCEPT; + } + + th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4; + /* No data? */ + if (dataoff >= (*pskb)->len) { + DEBUGP("ftp: pskblen = %u\n", (*pskb)->len); + return NF_ACCEPT; + } + datalen = (*pskb)->len - dataoff; + + LOCK_BH(&ip_ftp_lock); + fb_ptr = skb_header_pointer(*pskb, dataoff, + (*pskb)->len - dataoff, ftp_buffer); + BUG_ON(fb_ptr == NULL); + + ends_in_nl = (fb_ptr[datalen - 1] == '\n'); + seq = ntohl(th->seq) + datalen; + + /* Look up to see if we're just after a \n. */ + if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { + /* Now if this ends in \n, update ftp info. */ + DEBUGP("ip_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n", + ct_ftp_info->seq_aft_nl[0][dir] + old_seq_aft_nl_set ? "":"(UNSET) ", old_seq_aft_nl); + ret = NF_ACCEPT; + goto out_update_nl; + } + + /* Initialize IP array to expected address (it's not mentioned + in EPSV responses) */ + array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF; + array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF; + array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; + array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; + + for (i = 0; i < ARRAY_SIZE(search); i++) { + if (search[i].dir != dir) continue; + + found = find_pattern(fb_ptr, (*pskb)->len - dataoff, + search[i].pattern, + search[i].plen, + search[i].skip, + search[i].term, + &matchoff, &matchlen, + array, + search[i].getnum); + if (found) break; + } + if (found == -1) { + /* We don't usually drop packets. After all, this is + connection tracking, not packet filtering. + However, it is necessary for accurate tracking in + this case. */ + if (net_ratelimit()) + printk("conntrack_ftp: partial %s %u+%u\n", + search[i].pattern, + ntohl(th->seq), datalen); + ret = NF_DROP; + goto out; + } else if (found == 0) { /* No match */ + ret = NF_ACCEPT; + goto out_update_nl; + } + + DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n", + fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff); + + /* Allocate expectation which will be inserted */ + exp = ip_conntrack_expect_alloc(); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + /* We refer to the reverse direction ("!dir") tuples here, + * because we're expecting something in the other direction. + * Doesn't matter unless NAT is happening. */ + exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; + + if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]) + != ct->tuplehash[dir].tuple.src.ip) { + /* Enrico Scholz's passive FTP to partially RNAT'd ftp + server: it really wants us to connect to a + different IP address. Simply don't record it for + NAT. */ + DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n", + array[0], array[1], array[2], array[3], + NIPQUAD(ct->tuplehash[dir].tuple.src.ip)); + + /* Thanks to Cristiano Lincoln Mattos + <lincoln@cesar.org.br> for reporting this potential + problem (DMZ machines opening holes to internal + networks, or the packet filter itself). */ + if (!loose) { + ret = NF_ACCEPT; + ip_conntrack_expect_free(exp); + goto out_update_nl; + } + exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16) + | (array[2] << 8) | array[3]); + } + + exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; + exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]); + exp->tuple.src.u.tcp.port = 0; /* Don't care. */ + exp->tuple.dst.protonum = IPPROTO_TCP; + exp->mask = ((struct ip_conntrack_tuple) + { { 0xFFFFFFFF, { 0 } }, + { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); + + exp->expectfn = NULL; + exp->master = ct; + + /* Now, NAT might want to mangle the packet, and register the + * (possibly changed) expectation itself. */ + if (ip_nat_ftp_hook) + ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + matchoff, matchlen, exp, &seq); + else { + /* Can't expect this? Best to drop packet now. */ + if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + ret = NF_DROP; + } else + ret = NF_ACCEPT; + } + +out_update_nl: + /* Now if this ends in \n, update ftp info. Seq may have been + * adjusted by NAT code. */ + if (ends_in_nl) + update_nl_seq(seq, ct_ftp_info,dir); + out: + UNLOCK_BH(&ip_ftp_lock); + return ret; +} + +static struct ip_conntrack_helper ftp[MAX_PORTS]; +static char ftp_names[MAX_PORTS][10]; + +/* Not __exit: called from init() */ +static void fini(void) +{ + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("ip_ct_ftp: unregistering helper for port %d\n", + ports[i]); + ip_conntrack_helper_unregister(&ftp[i]); + } +} + +static int __init init(void) +{ + int i, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = FTP_PORT; + + for (i = 0; i < ports_c; i++) { + ftp[i].tuple.src.u.tcp.port = htons(ports[i]); + ftp[i].tuple.dst.protonum = IPPROTO_TCP; + ftp[i].mask.src.u.tcp.port = 0xFFFF; + ftp[i].mask.dst.protonum = 0xFF; + ftp[i].max_expected = 1; + ftp[i].timeout = 5 * 60; /* 5 minutes */ + ftp[i].me = THIS_MODULE; + ftp[i].help = help; + + tmpname = &ftp_names[i][0]; + if (ports[i] == FTP_PORT) + sprintf(tmpname, "ftp"); + else + sprintf(tmpname, "ftp-%d", ports[i]); + ftp[i].name = tmpname; + + DEBUGP("ip_ct_ftp: registering helper for port %d\n", + ports[i]); + ret = ip_conntrack_helper_register(&ftp[i]); + + if (ret) { + fini(); + return ret; + } + } + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c new file mode 100644 index 000000000000..33cc7348b6ee --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -0,0 +1,313 @@ +/* IRC extension for IP connection tracking, Version 1.21 + * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org> + * based on RR's ip_conntrack_ftp.c + * + * ip_conntrack_irc.c,v 1.21 2002/02/05 14:49:26 laforge Exp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + ** + * Module load syntax: + * insmod ip_conntrack_irc.o ports=port1,port2,...port<MAX_PORTS> + * max_dcc_channels=n dcc_timeout=secs + * + * please give the ports of all IRC servers You wish to connect to. + * If You don't specify ports, the default will be port 6667. + * With max_dcc_channels you can define the maximum number of not + * yet answered DCC channels per IRC session (default 8). + * With dcc_timeout you can specify how long the system waits for + * an expected DCC channel (default 300 seconds). + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <net/checksum.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_irc.h> +#include <linux/moduleparam.h> + +#define MAX_PORTS 8 +static int ports[MAX_PORTS]; +static int ports_c; +static int max_dcc_channels = 8; +static unsigned int dcc_timeout = 300; +/* This is slow, but it's simple. --RR */ +static char irc_buffer[65536]; +static DECLARE_LOCK(irc_buffer_lock); + +unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp); +EXPORT_SYMBOL_GPL(ip_nat_irc_hook); + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); +MODULE_LICENSE("GPL"); +module_param_array(ports, int, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of IRC servers"); +module_param(max_dcc_channels, int, 0400); +MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); +module_param(dcc_timeout, int, 0400); +MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels"); + +static char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; +#define MINMATCHLEN 5 + +#if 0 +#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s:" format, \ + __FILE__, __FUNCTION__ , ## args) +#else +#define DEBUGP(format, args...) +#endif + +static int parse_dcc(char *data, char *data_end, u_int32_t *ip, + u_int16_t *port, char **ad_beg_p, char **ad_end_p) +/* tries to get the ip_addr and port out of a dcc command + return value: -1 on failure, 0 on success + data pointer to first byte of DCC command data + data_end pointer to last byte of dcc command data + ip returns parsed ip of dcc command + port returns parsed port of dcc command + ad_beg_p returns pointer to first byte of addr data + ad_end_p returns pointer to last byte of addr data */ +{ + + /* at least 12: "AAAAAAAA P\1\n" */ + while (*data++ != ' ') + if (data > data_end - 12) + return -1; + + *ad_beg_p = data; + *ip = simple_strtoul(data, &data, 10); + + /* skip blanks between ip and port */ + while (*data == ' ') { + if (data >= data_end) + return -1; + data++; + } + + *port = simple_strtoul(data, &data, 10); + *ad_end_p = data; + + return 0; +} + +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff; + struct tcphdr _tcph, *th; + char *data, *data_limit, *ib_ptr; + int dir = CTINFO2DIR(ctinfo); + struct ip_conntrack_expect *exp; + u32 seq; + u_int32_t dcc_ip; + u_int16_t dcc_port; + int i, ret = NF_ACCEPT; + char *addr_beg_p, *addr_end_p; + + DEBUGP("entered\n"); + + /* If packet is coming from IRC server */ + if (dir == IP_CT_DIR_REPLY) + return NF_ACCEPT; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED + && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { + DEBUGP("Conntrackinfo = %u\n", ctinfo); + return NF_ACCEPT; + } + + /* Not a full tcp header? */ + th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + /* No data? */ + dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4; + if (dataoff >= (*pskb)->len) + return NF_ACCEPT; + + LOCK_BH(&irc_buffer_lock); + ib_ptr = skb_header_pointer(*pskb, dataoff, + (*pskb)->len - dataoff, irc_buffer); + BUG_ON(ib_ptr == NULL); + + data = ib_ptr; + data_limit = ib_ptr + (*pskb)->len - dataoff; + + /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 + * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ + while (data < (data_limit - (19 + MINMATCHLEN))) { + if (memcmp(data, "\1DCC ", 5)) { + data++; + continue; + } + + data += 5; + /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ + + DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n", + NIPQUAD(iph->saddr), ntohs(th->source), + NIPQUAD(iph->daddr), ntohs(th->dest)); + + for (i = 0; i < ARRAY_SIZE(dccprotos); i++) { + if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) { + /* no match */ + continue; + } + + DEBUGP("DCC %s detected\n", dccprotos[i]); + data += strlen(dccprotos[i]); + /* we have at least + * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid + * data left (== 14/13 bytes) */ + if (parse_dcc((char *)data, data_limit, &dcc_ip, + &dcc_port, &addr_beg_p, &addr_end_p)) { + /* unable to parse */ + DEBUGP("unable to parse dcc command\n"); + continue; + } + DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n", + HIPQUAD(dcc_ip), dcc_port); + + /* dcc_ip can be the internal OR external (NAT'ed) IP + * Tiago Sousa <mirage@kaotik.org> */ + if (ct->tuplehash[dir].tuple.src.ip != htonl(dcc_ip) + && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip != htonl(dcc_ip)) { + if (net_ratelimit()) + printk(KERN_WARNING + "Forged DCC command from " + "%u.%u.%u.%u: %u.%u.%u.%u:%u\n", + NIPQUAD(ct->tuplehash[dir].tuple.src.ip), + HIPQUAD(dcc_ip), dcc_port); + + continue; + } + + exp = ip_conntrack_expect_alloc(); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + /* save position of address in dcc string, + * necessary for NAT */ + DEBUGP("tcph->seq = %u\n", th->seq); + seq = ntohl(th->seq) + (addr_beg_p - ib_ptr); + + /* We refer to the reverse direction ("!dir") + * tuples here, because we're expecting + * something in the other * direction. + * Doesn't matter unless NAT is happening. */ + exp->tuple = ((struct ip_conntrack_tuple) + { { 0, { 0 } }, + { ct->tuplehash[!dir].tuple.dst.ip, + { .tcp = { htons(dcc_port) } }, + IPPROTO_TCP }}); + exp->mask = ((struct ip_conntrack_tuple) + { { 0, { 0 } }, + { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); + exp->expectfn = NULL; + exp->master = ct; + if (ip_nat_irc_hook) + ret = ip_nat_irc_hook(pskb, ctinfo, + addr_beg_p - ib_ptr, + addr_end_p - addr_beg_p, + exp); + else if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + ret = NF_DROP; + } + goto out; + } /* for .. NUM_DCCPROTO */ + } /* while data < ... */ + + out: + UNLOCK_BH(&irc_buffer_lock); + return ret; +} + +static struct ip_conntrack_helper irc_helpers[MAX_PORTS]; +static char irc_names[MAX_PORTS][10]; + +static void fini(void); + +static int __init init(void) +{ + int i, ret; + struct ip_conntrack_helper *hlpr; + char *tmpname; + + if (max_dcc_channels < 1) { + printk("ip_conntrack_irc: max_dcc_channels must be a positive integer\n"); + return -EBUSY; + } + if (dcc_timeout < 0) { + printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n"); + return -EBUSY; + } + + /* If no port given, default to standard irc port */ + if (ports_c == 0) + ports[ports_c++] = IRC_PORT; + + for (i = 0; i < ports_c; i++) { + hlpr = &irc_helpers[i]; + hlpr->tuple.src.u.tcp.port = htons(ports[i]); + hlpr->tuple.dst.protonum = IPPROTO_TCP; + hlpr->mask.src.u.tcp.port = 0xFFFF; + hlpr->mask.dst.protonum = 0xFF; + hlpr->max_expected = max_dcc_channels; + hlpr->timeout = dcc_timeout; + hlpr->me = THIS_MODULE; + hlpr->help = help; + + tmpname = &irc_names[i][0]; + if (ports[i] == IRC_PORT) + sprintf(tmpname, "irc"); + else + sprintf(tmpname, "irc-%d", i); + hlpr->name = tmpname; + + DEBUGP("port #%d: %d\n", i, ports[i]); + + ret = ip_conntrack_helper_register(hlpr); + + if (ret) { + printk("ip_conntrack_irc: ERROR registering port %d\n", + ports[i]); + fini(); + return -EBUSY; + } + } + return 0; +} + +/* This function is intentionally _NOT_ defined as __exit, because + * it is needed by the init function */ +static void fini(void) +{ + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("unregistering port %d\n", + ports[i]); + ip_conntrack_helper_unregister(&irc_helpers[i]); + } +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c new file mode 100644 index 000000000000..88c3712bd251 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c @@ -0,0 +1,75 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> + +unsigned long ip_ct_generic_timeout = 600*HZ; + +static int generic_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +static int generic_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int generic_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + return 0; +} + +/* Print out the private part of the conntrack. */ +static int generic_print_conntrack(struct seq_file *s, + const struct ip_conntrack *state) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int packet(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout); + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int new(struct ip_conntrack *conntrack, const struct sk_buff *skb) +{ + return 1; +} + +struct ip_conntrack_protocol ip_conntrack_generic_protocol = +{ + .proto = 0, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .print_conntrack = generic_print_conntrack, + .packet = packet, + .new = new, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c new file mode 100644 index 000000000000..602c74db3252 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -0,0 +1,279 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/in.h> +#include <linux/icmp.h> +#include <linux/seq_file.h> +#include <net/ip.h> +#include <net/checksum.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> + +unsigned long ip_ct_icmp_timeout = 30*HZ; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int icmp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + struct icmphdr _hdr, *hp; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->dst.u.icmp.type = hp->type; + tuple->src.u.icmp.id = hp->un.echo.id; + tuple->dst.u.icmp.code = hp->code; + + return 1; +} + +static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + /* Add 1; spaces filled with 0. */ + static u_int8_t invmap[] + = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1}; + + if (orig->dst.u.icmp.type >= sizeof(invmap) + || !invmap[orig->dst.u.icmp.type]) + return 0; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmp_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Print out the private part of the conntrack. */ +static int icmp_print_conntrack(struct seq_file *s, + const struct ip_conntrack *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmp_packet(struct ip_conntrack *ct, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + /* Try to delete connection immediately after all replies: + won't actually vanish as we still have skb, and del_timer + means this will only run once even if count hits zero twice + (theoretically possible with SMP) */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + if (atomic_dec_and_test(&ct->proto.icmp.count) + && del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + } else { + atomic_inc(&ct->proto.icmp.count); + ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int icmp_new(struct ip_conntrack *conntrack, + const struct sk_buff *skb) +{ + static u_int8_t valid_new[] + = { [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 }; + + if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) + || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { + /* Can't create a new ICMP `conn' with this. */ + DEBUGP("icmp: can't create new conn with type %u\n", + conntrack->tuplehash[0].tuple.dst.u.icmp.type); + DUMP_TUPLE(&conntrack->tuplehash[0].tuple); + return 0; + } + atomic_set(&conntrack->proto.icmp.count, 0); + return 1; +} + +static int +icmp_error_message(struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct ip_conntrack_tuple innertuple, origtuple; + struct { + struct icmphdr icmp; + struct iphdr ip; + } _in, *inside; + struct ip_conntrack_protocol *innerproto; + struct ip_conntrack_tuple_hash *h; + int dataoff; + + IP_NF_ASSERT(skb->nfct == NULL); + + /* Not enough header? */ + inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); + if (inside == NULL) + return NF_ACCEPT; + + /* Ignore ICMP's containing fragments (shouldn't happen) */ + if (inside->ip.frag_off & htons(IP_OFFSET)) { + DEBUGP("icmp_error_track: fragment of proto %u\n", + inside->ip.protocol); + return NF_ACCEPT; + } + + innerproto = ip_ct_find_proto(inside->ip.protocol); + dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; + /* Are they talking about one of our connections? */ + if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { + DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); + return NF_ACCEPT; + } + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { + DEBUGP("icmp_error_track: Can't invert tuple\n"); + return NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = ip_conntrack_find_get(&innertuple, NULL); + if (!h) { + /* Locally generated ICMPs will match inverted if they + haven't been SNAT'ed yet */ + /* FIXME: NAT code has to handle half-done double NAT --RR */ + if (hooknum == NF_IP_LOCAL_OUT) + h = ip_conntrack_find_get(&origtuple, NULL); + + if (!h) { + DEBUGP("icmp_error_track: no match\n"); + return NF_ACCEPT; + } + /* Reverse direction from that found */ + if (DIRECTION(h) != IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } else { + if (DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return -NF_ACCEPT; +} + +/* Small and modified version of icmp_rcv */ +static int +icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct icmphdr _ih, *icmph; + + /* Not enough header? */ + icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); + if (icmph == NULL) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: short packet "); + return -NF_ACCEPT; + } + + /* See ip_conntrack_proto_tcp.c */ + if (hooknum != NF_IP_PRE_ROUTING) + goto checksum_skipped; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: bad HW ICMP checksum "); + return -NF_ACCEPT; + case CHECKSUM_NONE: + if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: bad ICMP checksum "); + return -NF_ACCEPT; + } + default: + break; + } + +checksum_skipped: + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: invalid ICMP type "); + return -NF_ACCEPT; + } + + /* Need to track icmp error message? */ + if (icmph->type != ICMP_DEST_UNREACH + && icmph->type != ICMP_SOURCE_QUENCH + && icmph->type != ICMP_TIME_EXCEEDED + && icmph->type != ICMP_PARAMETERPROB + && icmph->type != ICMP_REDIRECT) + return NF_ACCEPT; + + return icmp_error_message(skb, ctinfo, hooknum); +} + +struct ip_conntrack_protocol ip_conntrack_protocol_icmp = +{ + .proto = IPPROTO_ICMP, + .name = "icmp", + .pkt_to_tuple = icmp_pkt_to_tuple, + .invert_tuple = icmp_invert_tuple, + .print_tuple = icmp_print_tuple, + .print_conntrack = icmp_print_conntrack, + .packet = icmp_packet, + .new = icmp_new, + .error = icmp_error, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c new file mode 100644 index 000000000000..ff8c34a860ff --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -0,0 +1,649 @@ +/* + * Connection tracking protocol helper module for SCTP. + * + * SCTP is defined in RFC 2960. References to various sections in this code + * are to this RFC. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * Added support for proc manipulation of timeouts. + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/sctp.h> +#include <linux/string.h> +#include <linux/seq_file.h> + +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/lockhelp.h> + +#if 0 +#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) +#else +#define DEBUGP(format, args...) +#endif + +/* Protects conntrack->proto.sctp */ +static DECLARE_RWLOCK(sctp_lock); + +/* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR + + And so for me for SCTP :D -Kiran */ + +static const char *sctp_conntrack_names[] = { + "NONE", + "CLOSED", + "COOKIE_WAIT", + "COOKIE_ECHOED", + "ESTABLISHED", + "SHUTDOWN_SENT", + "SHUTDOWN_RECD", + "SHUTDOWN_ACK_SENT", +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +static unsigned long ip_ct_sctp_timeout_closed = 10 SECS; +static unsigned long ip_ct_sctp_timeout_cookie_wait = 3 SECS; +static unsigned long ip_ct_sctp_timeout_cookie_echoed = 3 SECS; +static unsigned long ip_ct_sctp_timeout_established = 5 DAYS; +static unsigned long ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; +static unsigned long ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; +static unsigned long ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; + +static unsigned long * sctp_timeouts[] += { NULL, /* SCTP_CONNTRACK_NONE */ + &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ + &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ + &ip_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */ + &ip_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */ + &ip_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */ + &ip_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */ + &ip_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */ + }; + +#define sNO SCTP_CONNTRACK_NONE +#define sCL SCTP_CONNTRACK_CLOSED +#define sCW SCTP_CONNTRACK_COOKIE_WAIT +#define sCE SCTP_CONNTRACK_COOKIE_ECHOED +#define sES SCTP_CONNTRACK_ESTABLISHED +#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT +#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD +#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sIV SCTP_CONNTRACK_MAX + +/* + These are the descriptions of the states: + +NOTE: These state names are tantalizingly similar to the states of an +SCTP endpoint. But the interpretation of the states is a little different, +considering that these are the states of the connection and not of an end +point. Please note the subtleties. -Kiran + +NONE - Nothing so far. +COOKIE WAIT - We have seen an INIT chunk in the original direction, or also + an INIT_ACK chunk in the reply direction. +COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. +ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. +SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. +SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite + to that of the SHUTDOWN chunk. +CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of + the SHUTDOWN chunk. Connection is closed. +*/ + +/* TODO + - I have assumed that the first INIT is in the original direction. + This messes things when an INIT comes in the reply direction in CLOSED + state. + - Check the error type in the reply dir before transitioning from +cookie echoed to closed. + - Sec 5.2.4 of RFC 2960 + - Multi Homing support. +*/ + +/* SCTP conntrack state transitions */ +static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} + }, + { +/* REPLY */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} + } +}; + +static int sctp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + sctp_sctphdr_t _hdr, *hp; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.sctp.port = hp->source; + tuple->dst.u.sctp.port = hp->dest; + return 1; +} + +static int sctp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + tuple->src.u.sctp.port = orig->dst.u.sctp.port; + tuple->dst.u.sctp.port = orig->src.u.sctp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int sctp_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.sctp.port), + ntohs(tuple->dst.u.sctp.port)); +} + +/* Print out the private part of the conntrack. */ +static int sctp_print_conntrack(struct seq_file *s, + const struct ip_conntrack *conntrack) +{ + enum sctp_conntrack state; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + READ_LOCK(&sctp_lock); + state = conntrack->proto.sctp.state; + READ_UNLOCK(&sctp_lock); + + return seq_printf(s, "%s ", sctp_conntrack_names[state]); +} + +#define for_each_sctp_chunk(skb, sch, _sch, offset, count) \ +for (offset = skb->nh.iph->ihl * 4 + sizeof(sctp_sctphdr_t), count = 0; \ + offset < skb->len && \ + (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \ + offset += (htons(sch->length) + 3) & ~3, count++) + +/* Some validity checks to make sure the chunks are fine */ +static int do_basic_checks(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + char *map) +{ + u_int32_t offset, count; + sctp_chunkhdr_t _sch, *sch; + int flag; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + flag = 0; + + for_each_sctp_chunk (skb, sch, _sch, offset, count) { + DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type); + + if (sch->type == SCTP_CID_INIT + || sch->type == SCTP_CID_INIT_ACK + || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + flag = 1; + } + + /* Cookie Ack/Echo chunks not the first OR + Init / Init Ack / Shutdown compl chunks not the only chunks */ + if ((sch->type == SCTP_CID_COOKIE_ACK + || sch->type == SCTP_CID_COOKIE_ECHO + || flag) + && count !=0 ) { + DEBUGP("Basic checks failed\n"); + return 1; + } + + if (map) { + set_bit(sch->type, (void *)map); + } + } + + DEBUGP("Basic checks passed\n"); + return 0; +} + +static int new_state(enum ip_conntrack_dir dir, + enum sctp_conntrack cur_state, + int chunk_type) +{ + int i; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + DEBUGP("Chunk type: %d\n", chunk_type); + + switch (chunk_type) { + case SCTP_CID_INIT: + DEBUGP("SCTP_CID_INIT\n"); + i = 0; break; + case SCTP_CID_INIT_ACK: + DEBUGP("SCTP_CID_INIT_ACK\n"); + i = 1; break; + case SCTP_CID_ABORT: + DEBUGP("SCTP_CID_ABORT\n"); + i = 2; break; + case SCTP_CID_SHUTDOWN: + DEBUGP("SCTP_CID_SHUTDOWN\n"); + i = 3; break; + case SCTP_CID_SHUTDOWN_ACK: + DEBUGP("SCTP_CID_SHUTDOWN_ACK\n"); + i = 4; break; + case SCTP_CID_ERROR: + DEBUGP("SCTP_CID_ERROR\n"); + i = 5; break; + case SCTP_CID_COOKIE_ECHO: + DEBUGP("SCTP_CID_COOKIE_ECHO\n"); + i = 6; break; + case SCTP_CID_COOKIE_ACK: + DEBUGP("SCTP_CID_COOKIE_ACK\n"); + i = 7; break; + case SCTP_CID_SHUTDOWN_COMPLETE: + DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n"); + i = 8; break; + default: + /* Other chunks like DATA, SACK, HEARTBEAT and + its ACK do not cause a change in state */ + DEBUGP("Unknown chunk type, Will stay in %s\n", + sctp_conntrack_names[cur_state]); + return cur_state; + } + + DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", + dir, sctp_conntrack_names[cur_state], chunk_type, + sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); + + return sctp_conntracks[dir][i][cur_state]; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int sctp_packet(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + enum sctp_conntrack newconntrack, oldsctpstate; + struct iphdr *iph = skb->nh.iph; + sctp_sctphdr_t _sctph, *sh; + sctp_chunkhdr_t _sch, *sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph); + if (sh == NULL) + return -1; + + if (do_basic_checks(conntrack, skb, map) != 0) + return -1; + + /* Check the verification tag (Sec 8.5) */ + if (!test_bit(SCTP_CID_INIT, (void *)map) + && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map) + && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map) + && !test_bit(SCTP_CID_ABORT, (void *)map) + && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map) + && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + DEBUGP("Verification tag check failed\n"); + return -1; + } + + oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, count) { + WRITE_LOCK(&sctp_lock); + + /* Special cases of Verification tag check (Sec 8.5.1) */ + if (sch->type == SCTP_CID_INIT) { + /* Sec 8.5.1 (A) */ + if (sh->vtag != 0) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_ABORT) { + /* Sec 8.5.1 (B) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) + && !(sh->vtag == conntrack->proto.sctp.vtag + [1 - CTINFO2DIR(ctinfo)])) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + /* Sec 8.5.1 (C) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) + && !(sh->vtag == conntrack->proto.sctp.vtag + [1 - CTINFO2DIR(ctinfo)] + && (sch->flags & 1))) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_COOKIE_ECHO) { + /* Sec 8.5.1 (D) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + } + + oldsctpstate = conntrack->proto.sctp.state; + newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type); + + /* Invalid */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", + CTINFO2DIR(ctinfo), sch->type, oldsctpstate); + WRITE_UNLOCK(&sctp_lock); + return -1; + } + + /* If it is an INIT or an INIT ACK note down the vtag */ + if (sch->type == SCTP_CID_INIT + || sch->type == SCTP_CID_INIT_ACK) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + DEBUGP("Setting vtag %x for dir %d\n", + ih->init_tag, !CTINFO2DIR(ctinfo)); + conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag; + } + + conntrack->proto.sctp.state = newconntrack; + WRITE_UNLOCK(&sctp_lock); + } + + ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); + + if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED + && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY + && newconntrack == SCTP_CONNTRACK_ESTABLISHED) { + DEBUGP("Setting assured bit\n"); + set_bit(IPS_ASSURED_BIT, &conntrack->status); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int sctp_new(struct ip_conntrack *conntrack, + const struct sk_buff *skb) +{ + enum sctp_conntrack newconntrack; + struct iphdr *iph = skb->nh.iph; + sctp_sctphdr_t _sctph, *sh; + sctp_chunkhdr_t _sch, *sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph); + if (sh == NULL) + return 0; + + if (do_basic_checks(conntrack, skb, map) != 0) + return 0; + + /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ + if ((test_bit (SCTP_CID_ABORT, (void *)map)) + || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)) + || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) { + return 0; + } + + newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, count) { + /* Don't need lock here: this conntrack not in circulation yet */ + newconntrack = new_state (IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch->type); + + /* Invalid: delete conntrack */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("ip_conntrack_sctp: invalid new deleting.\n"); + return 0; + } + + /* Copy the vtag into the state info */ + if (sch->type == SCTP_CID_INIT) { + if (sh->vtag == 0) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) + return 0; + + DEBUGP("Setting vtag %x for new conn\n", + ih->init_tag); + + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = + ih->init_tag; + } else { + /* Sec 8.5.1 (A) */ + return 0; + } + } + /* If it is a shutdown ack OOTB packet, we expect a return + shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ + else { + DEBUGP("Setting vtag %x for new conn OOTB\n", + sh->vtag); + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; + } + + conntrack->proto.sctp.state = newconntrack; + } + + return 1; +} + +static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = { + .proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .new = sctp_new, + .destroy = NULL, + .me = THIS_MODULE +}; + +#ifdef CONFIG_SYSCTL +static ctl_table ip_ct_sysctl_table[] = { + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, + .procname = "ip_conntrack_sctp_timeout_closed", + .data = &ip_ct_sctp_timeout_closed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, + .procname = "ip_conntrack_sctp_timeout_cookie_wait", + .data = &ip_ct_sctp_timeout_cookie_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, + .procname = "ip_conntrack_sctp_timeout_cookie_echoed", + .data = &ip_ct_sctp_timeout_cookie_echoed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, + .procname = "ip_conntrack_sctp_timeout_established", + .data = &ip_ct_sctp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, + .procname = "ip_conntrack_sctp_timeout_shutdown_sent", + .data = &ip_ct_sctp_timeout_shutdown_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, + .procname = "ip_conntrack_sctp_timeout_shutdown_recd", + .data = &ip_ct_sctp_timeout_shutdown_recd, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, + .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent", + .data = &ip_ct_sctp_timeout_shutdown_ack_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_netfilter_table[] = { + { + .ctl_name = NET_IPV4_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = ip_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = ip_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ip_ct_ipv4_table, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *ip_ct_sysctl_header; +#endif + +static int __init init(void) +{ + int ret; + + ret = ip_conntrack_protocol_register(&ip_conntrack_protocol_sctp); + if (ret) { + printk("ip_conntrack_proto_sctp: protocol register failed\n"); + goto out; + } + +#ifdef CONFIG_SYSCTL + ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0); + if (ip_ct_sysctl_header == NULL) { + ret = -ENOMEM; + printk("ip_conntrack_proto_sctp: can't register to sysctl.\n"); + goto cleanup; + } +#endif + + return ret; + +#ifdef CONFIG_SYSCTL + cleanup: + ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); +#endif + out: + DEBUGP("SCTP conntrack module loading %s\n", + ret ? "failed": "succeeded"); + return ret; +} + +static void __exit fini(void) +{ + ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ip_ct_sysctl_header); +#endif + DEBUGP("SCTP conntrack module unloaded\n"); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c new file mode 100644 index 000000000000..e800b16fc920 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -0,0 +1,1098 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>: + * - Real stateful connection tracking + * - Modified state transitions table + * - Window scaling support added + * - SACK support added + * + * Willy Tarreau: + * - State table bugfixes + * - More robust state changes + * - Tuning timer parameters + * + * version 2.2 + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/spinlock.h> + +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/lockhelp.h> + +#if 0 +#define DEBUGP printk +#define DEBUGP_VARS +#else +#define DEBUGP(format, args...) +#endif + +/* Protects conntrack->proto.tcp */ +static DECLARE_RWLOCK(tcp_lock); + +/* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ +int ip_ct_tcp_be_liberal = 0; + +/* When connection is picked up from the middle, how many packets are required + to pass in each direction when we assume we are in sync - if any side uses + window scaling, we lost the game. + If it is set to zero, we disable picking up already established + connections. */ +int ip_ct_tcp_loose = 3; + +/* Max number of the retransmitted packets without receiving an (acceptable) + ACK from the destination. If this number is reached, a shorter timer + will be started. */ +int ip_ct_tcp_max_retrans = 3; + + /* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR */ + +static const char *tcp_conntrack_names[] = { + "NONE", + "SYN_SENT", + "SYN_RECV", + "ESTABLISHED", + "FIN_WAIT", + "CLOSE_WAIT", + "LAST_ACK", + "TIME_WAIT", + "CLOSE", + "LISTEN" +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +unsigned long ip_ct_tcp_timeout_syn_sent = 2 MINS; +unsigned long ip_ct_tcp_timeout_syn_recv = 60 SECS; +unsigned long ip_ct_tcp_timeout_established = 5 DAYS; +unsigned long ip_ct_tcp_timeout_fin_wait = 2 MINS; +unsigned long ip_ct_tcp_timeout_close_wait = 60 SECS; +unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS; +unsigned long ip_ct_tcp_timeout_time_wait = 2 MINS; +unsigned long ip_ct_tcp_timeout_close = 10 SECS; + +/* RFC1122 says the R2 limit should be at least 100 seconds. + Linux uses 15 packets as limit, which corresponds + to ~13-30min depending on RTO. */ +unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS; + +static unsigned long * tcp_timeouts[] += { NULL, /* TCP_CONNTRACK_NONE */ + &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ + &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ + &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ + &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ + &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ + &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ + &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ + &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ + NULL, /* TCP_CONNTRACK_LISTEN */ + }; + +#define sNO TCP_CONNTRACK_NONE +#define sSS TCP_CONNTRACK_SYN_SENT +#define sSR TCP_CONNTRACK_SYN_RECV +#define sES TCP_CONNTRACK_ESTABLISHED +#define sFW TCP_CONNTRACK_FIN_WAIT +#define sCW TCP_CONNTRACK_CLOSE_WAIT +#define sLA TCP_CONNTRACK_LAST_ACK +#define sTW TCP_CONNTRACK_TIME_WAIT +#define sCL TCP_CONNTRACK_CLOSE +#define sLI TCP_CONNTRACK_LISTEN +#define sIV TCP_CONNTRACK_MAX +#define sIG TCP_CONNTRACK_IGNORE + +/* What TCP flags are set from RST/SYN/FIN/ACK. */ +enum tcp_bit_set { + TCP_SYN_SET, + TCP_SYNACK_SET, + TCP_FIN_SET, + TCP_ACK_SET, + TCP_RST_SET, + TCP_NONE_SET, +}; + +/* + * The TCP state transition table needs a few words... + * + * We are the man in the middle. All the packets go through us + * but might get lost in transit to the destination. + * It is assumed that the destinations can't receive segments + * we haven't seen. + * + * The checked segment is in window, but our windows are *not* + * equivalent with the ones of the sender/receiver. We always + * try to guess the state of the current sender. + * + * The meaning of the states are: + * + * NONE: initial state + * SYN_SENT: SYN-only packet seen + * SYN_RECV: SYN-ACK packet seen + * ESTABLISHED: ACK packet seen + * FIN_WAIT: FIN packet seen + * CLOSE_WAIT: ACK seen (after FIN) + * LAST_ACK: FIN seen (after FIN) + * TIME_WAIT: last ACK seen + * CLOSE: closed connection + * + * LISTEN state is not used. + * + * Packets marked as IGNORED (sIG): + * if they may be either invalid or valid + * and the receiver may send back a connection + * closing RST or a SYN/ACK. + * + * Packets marked as INVALID (sIV): + * if they are invalid + * or we do not support the request (simultaneous open) + */ +static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, +/* + * sNO -> sSS Initialize a new connection + * sSS -> sSS Retransmitted SYN + * sSR -> sIG Late retransmitted SYN? + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sSS Reopened connection (RFC 1122). + * sCL -> sSS + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * A SYN/ACK from the client is always invalid: + * - either it tries to set up a simultaneous open, which is + * not supported; + * - or the firewall has just been inserted between the two hosts + * during the session set-up. The SYN will be retransmitted + * by the true client (or it'll time out). + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sNO -> sIV Too late and no reason to do anything... + * sSS -> sIV Client migth not send FIN in this state: + * we enforce waiting for a SYN/ACK reply first. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions, waiting for + * the last ACK. + * Migth be a retransmitted FIN as well... + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. Remain in the same state. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sNO -> sES Assumed. + * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sSR -> sES Established state is reached. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. Remain in the same state. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + }, + { +/* REPLY */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * sNO -> sIV Never reached. + * sSS -> sIV Simultaneous open, not supported + * sSR -> sIV Simultaneous open, not supported. + * sES -> sIV Server may not initiate a connection. + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV Reopened connection, but server may not do it. + * sCL -> sIV + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, +/* + * sSS -> sSR Standard open. + * sSR -> sSR Retransmitted SYN/ACK. + * sES -> sIG Late retransmitted SYN/ACK? + * sFW -> sIG Might be SYN/ACK answering ignored SYN + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sSS -> sIV Server might not send FIN in this state. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions. + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sSS -> sIV Might be a half-open connection. + * sSR -> sSR Might answer late resent SYN. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + } +}; + +static int tcp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + struct tcphdr _hdr, *hp; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.tcp.port = hp->source; + tuple->dst.u.tcp.port = hp->dest; + + return 1; +} + +static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + tuple->src.u.tcp.port = orig->dst.u.tcp.port; + tuple->dst.u.tcp.port = orig->src.u.tcp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int tcp_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.tcp.port), + ntohs(tuple->dst.u.tcp.port)); +} + +/* Print out the private part of the conntrack. */ +static int tcp_print_conntrack(struct seq_file *s, + const struct ip_conntrack *conntrack) +{ + enum tcp_conntrack state; + + READ_LOCK(&tcp_lock); + state = conntrack->proto.tcp.state; + READ_UNLOCK(&tcp_lock); + + return seq_printf(s, "%s ", tcp_conntrack_names[state]); +} + +static unsigned int get_conntrack_index(const struct tcphdr *tcph) +{ + if (tcph->rst) return TCP_RST_SET; + else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); + else if (tcph->fin) return TCP_FIN_SET; + else if (tcph->ack) return TCP_ACK_SET; + else return TCP_NONE_SET; +} + +/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering + in IP Filter' by Guido van Rooij. + + http://www.nluug.nl/events/sane2000/papers.html + http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz + + The boundaries and the conditions are changed according to RFC793: + the packet must intersect the window (i.e. segments may be + after the right or before the left edge) and thus receivers may ACK + segments after the right edge of the window. + + td_maxend = max(sack + max(win,1)) seen in reply packets + td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets + td_maxwin += seq + len - sender.td_maxend + if seq + len > sender.td_maxend + td_end = max(seq + len) seen in sent packets + + I. Upper bound for valid data: seq <= sender.td_maxend + II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin + III. Upper bound for valid ack: sack <= receiver.td_end + IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW + + where sack is the highest right edge of sack block found in the packet. + + The upper bound limit for a valid ack is not ignored - + we doesn't have to deal with fragments. +*/ + +static inline __u32 segment_seq_plus_len(__u32 seq, + size_t len, + struct iphdr *iph, + struct tcphdr *tcph) +{ + return (seq + len - (iph->ihl + tcph->doff)*4 + + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); +} + +/* Fixme: what about big packets? */ +#define MAXACKWINCONST 66000 +#define MAXACKWINDOW(sender) \ + ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ + : MAXACKWINCONST) + +/* + * Simplified tcp_parse_options routine from tcp_input.c + */ +static void tcp_options(const struct sk_buff *skb, + struct iphdr *iph, + struct tcphdr *tcph, + struct ip_ct_tcp_state *state) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + + if (!length) + return; + + ptr = skb_header_pointer(skb, + (iph->ihl * 4) + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + state->td_scale = + state->flags = 0; + + while (length > 0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK_PERM + && opsize == TCPOLEN_SACK_PERM) + state->flags |= IP_CT_TCP_FLAG_SACK_PERM; + else if (opcode == TCPOPT_WINDOW + && opsize == TCPOLEN_WINDOW) { + state->td_scale = *(u_int8_t *)ptr; + + if (state->td_scale > 14) { + /* See RFC1323 */ + state->td_scale = 14; + } + state->flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static void tcp_sack(const struct sk_buff *skb, + struct iphdr *iph, + struct tcphdr *tcph, + __u32 *sack) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + __u32 tmp; + + if (!length) + return; + + ptr = skb_header_pointer(skb, + (iph->ihl * 4) + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + /* Fast path for timestamp-only option */ + if (length == TCPOLEN_TSTAMP_ALIGNED*4 + && *(__u32 *)ptr == + __constant_ntohl((TCPOPT_NOP << 24) + | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return; + + while (length > 0) { + int opcode=*ptr++; + int opsize, i; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK + && opsize >= (TCPOLEN_SACK_BASE + + TCPOLEN_SACK_PERBLOCK) + && !((opsize - TCPOLEN_SACK_BASE) + % TCPOLEN_SACK_PERBLOCK)) { + for (i = 0; + i < (opsize - TCPOLEN_SACK_BASE); + i += TCPOLEN_SACK_PERBLOCK) { + tmp = ntohl(*((u_int32_t *)(ptr+i)+1)); + + if (after(tmp, *sack)) + *sack = tmp; + } + return; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static int tcp_in_window(struct ip_ct_tcp *state, + enum ip_conntrack_dir dir, + unsigned int index, + const struct sk_buff *skb, + struct iphdr *iph, + struct tcphdr *tcph) +{ + struct ip_ct_tcp_state *sender = &state->seen[dir]; + struct ip_ct_tcp_state *receiver = &state->seen[!dir]; + __u32 seq, ack, sack, end, win, swin; + int res; + + /* + * Get the required data from the packet. + */ + seq = ntohl(tcph->seq); + ack = sack = ntohl(tcph->ack_seq); + win = ntohs(tcph->window); + end = segment_seq_plus_len(seq, skb->len, iph, tcph); + + if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) + tcp_sack(skb, iph, tcph, &sack); + + DEBUGP("tcp_in_window: START\n"); + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack=%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + if (sender->td_end == 0) { + /* + * Initialize sender data. + */ + if (tcph->syn && tcph->ack) { + /* + * Outgoing SYN-ACK in reply to a SYN. + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, iph, tcph, sender); + /* + * RFC 1323: + * Both sides must send the Window Scale option + * to enable window scaling in either direction. + */ + if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE + && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) + sender->td_scale = + receiver->td_scale = 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + sender->td_end = end; + sender->td_maxwin = (win == 0 ? 1 : win); + sender->td_maxend = end + sender->td_maxwin; + } + } else if (((state->state == TCP_CONNTRACK_SYN_SENT + && dir == IP_CT_DIR_ORIGINAL) + || (state->state == TCP_CONNTRACK_SYN_RECV + && dir == IP_CT_DIR_REPLY)) + && after(end, sender->td_end)) { + /* + * RFC 793: "if a TCP is reinitialized ... then it need + * not wait at all; it must only be sure to use sequence + * numbers larger than those recently used." + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, iph, tcph, sender); + } + + if (!(tcph->ack)) { + /* + * If there is no ACK, just pretend it was set and OK. + */ + ack = sack = receiver->td_end; + } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == + (TCP_FLAG_ACK|TCP_FLAG_RST)) + && (ack == 0)) { + /* + * Broken TCP stacks, that set ACK in RST packets as well + * with zero ack value. + */ + ack = sack = receiver->td_end; + } + + if (seq == end + && (!tcph->rst + || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + /* + * Packets contains no data: we assume it is valid + * and check the ack value only. + * However RST segments are always validated by their + * SEQ number, except when seq == 0 (reset sent answering + * SYN. + */ + seq = end = sender->td_end; + + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack =%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n", + before(seq, sender->td_maxend + 1), + after(end, sender->td_end - receiver->td_maxwin - 1), + before(sack, receiver->td_end + 1), + after(ack, receiver->td_end - MAXACKWINDOW(sender))); + + if (sender->loose || receiver->loose || + (before(seq, sender->td_maxend + 1) && + after(end, sender->td_end - receiver->td_maxwin - 1) && + before(sack, receiver->td_end + 1) && + after(ack, receiver->td_end - MAXACKWINDOW(sender)))) { + /* + * Take into account window scaling (RFC 1323). + */ + if (!tcph->syn) + win <<= sender->td_scale; + + /* + * Update sender data. + */ + swin = win + (sack - ack); + if (sender->td_maxwin < swin) + sender->td_maxwin = swin; + if (after(end, sender->td_end)) + sender->td_end = end; + /* + * Update receiver data. + */ + if (after(end, sender->td_maxend)) + receiver->td_maxwin += end - sender->td_maxend; + if (after(sack + win, receiver->td_maxend - 1)) { + receiver->td_maxend = sack + win; + if (win == 0) + receiver->td_maxend++; + } + + /* + * Check retransmissions. + */ + if (index == TCP_ACK_SET) { + if (state->last_dir == dir + && state->last_seq == seq + && state->last_ack == ack + && state->last_end == end) + state->retrans++; + else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->retrans = 0; + } + } + /* + * Close the window of disabled window tracking :-) + */ + if (sender->loose) + sender->loose--; + + res = 1; + } else { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: %s ", + before(seq, sender->td_maxend + 1) ? + after(end, sender->td_end - receiver->td_maxwin - 1) ? + before(sack, receiver->td_end + 1) ? + after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG" + : "ACK is under the lower bound (possible overly delayed ACK)" + : "ACK is over the upper bound (ACKed data not seen yet)" + : "SEQ is under the lower bound (already ACKed data retransmitted)" + : "SEQ is over the upper bound (over the window of the receiver)"); + + res = ip_ct_tcp_be_liberal; + } + + DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " + "receiver end=%u maxend=%u maxwin=%u\n", + res, sender->td_end, sender->td_maxend, sender->td_maxwin, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin); + + return res; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +/* Update sender->td_end after NAT successfully mangled the packet */ +void ip_conntrack_tcp_update(struct sk_buff *skb, + struct ip_conntrack *conntrack, + enum ip_conntrack_dir dir) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4; + __u32 end; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; +#endif + + end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); + + WRITE_LOCK(&tcp_lock); + /* + * We have to worry for the ack in the reply packet only... + */ + if (after(end, conntrack->proto.tcp.seen[dir].td_end)) + conntrack->proto.tcp.seen[dir].td_end = end; + conntrack->proto.tcp.last_end = end; + WRITE_UNLOCK(&tcp_lock); + DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); +} + +#endif + +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 + +/* table of valid flag combinations - ECE and CWR are always valid */ +static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = +{ + [TH_SYN] = 1, + [TH_SYN|TH_ACK] = 1, + [TH_RST] = 1, + [TH_RST|TH_ACK] = 1, + [TH_RST|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK] = 1, + [TH_ACK] = 1, + [TH_ACK|TH_PUSH] = 1, + [TH_ACK|TH_URG] = 1, + [TH_ACK|TH_URG|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_URG] = 1, + [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1, +}; + +/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ +static int tcp_error(struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr _tcph, *th; + unsigned int tcplen = skb->len - iph->ihl * 4; + u_int8_t tcpflags; + + /* Smaller that minimal TCP header? */ + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: short packet "); + return -NF_ACCEPT; + } + + /* Not whole TCP header or malformed packet */ + if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + */ + /* FIXME: Source route IP option packets --RR */ + if (hooknum == NF_IP_PRE_ROUTING + && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: bad TCP checksum "); + return -NF_ACCEPT; + } + + /* Check TCP flags. */ + tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); + if (!tcp_valid_flags[tcpflags]) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid TCP flag combination "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int tcp_packet(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + enum tcp_conntrack new_state, old_state; + enum ip_conntrack_dir dir; + struct iphdr *iph = skb->nh.iph; + struct tcphdr *th, _tcph; + unsigned long timeout; + unsigned int index; + + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + WRITE_LOCK(&tcp_lock); + old_state = conntrack->proto.tcp.state; + dir = CTINFO2DIR(ctinfo); + index = get_conntrack_index(th); + new_state = tcp_conntracks[dir][index][old_state]; + + switch (new_state) { + case TCP_CONNTRACK_IGNORE: + /* Either SYN in ORIGINAL + * or SYN/ACK in REPLY. */ + if (index == TCP_SYNACK_SET + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && conntrack->proto.tcp.last_dir != dir + && ntohl(th->ack_seq) == + conntrack->proto.tcp.last_end) { + /* This SYN/ACK acknowledges a SYN that we earlier + * ignored as invalid. This means that the client and + * the server are both in sync, while the firewall is + * not. We kill this session and block the SYN/ACK so + * that the client cannot but retransmit its SYN and + * thus initiate a clean new session. + */ + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: killing out of sync session "); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_DROP; + } + conntrack->proto.tcp.last_index = index; + conntrack->proto.tcp.last_dir = dir; + conntrack->proto.tcp.last_seq = ntohl(th->seq); + conntrack->proto.tcp.last_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); + + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid packet ignored "); + return NF_ACCEPT; + case TCP_CONNTRACK_MAX: + /* Invalid packet */ + DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", + dir, get_conntrack_index(th), + old_state); + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid state "); + return -NF_ACCEPT; + case TCP_CONNTRACK_SYN_SENT: + if (old_state < TCP_CONNTRACK_TIME_WAIT) + break; + if ((conntrack->proto.tcp.seen[dir].flags & + IP_CT_TCP_FLAG_CLOSE_INIT) + || after(ntohl(th->seq), + conntrack->proto.tcp.seen[dir].td_end)) { + /* Attempt to reopen a closed connection. + * Delete this connection and look up again. */ + WRITE_UNLOCK(&tcp_lock); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_REPEAT; + } else { + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid SYN"); + return -NF_ACCEPT; + } + case TCP_CONNTRACK_CLOSE: + if (index == TCP_RST_SET + && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { + /* RST sent to invalid SYN we had let trough + * SYN was in window then, tear down connection. + * We skip window checking, because packet might ACK + * segments we ignored in the SYN. */ + goto in_window; + } + /* Just fall trough */ + default: + /* Keep compilers happy. */ + break; + } + + if (!tcp_in_window(&conntrack->proto.tcp, dir, index, + skb, iph, th)) { + WRITE_UNLOCK(&tcp_lock); + return -NF_ACCEPT; + } + in_window: + /* From now on we have got in-window packets */ + conntrack->proto.tcp.last_index = index; + + DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", + NIPQUAD(iph->saddr), ntohs(th->source), + NIPQUAD(iph->daddr), ntohs(th->dest), + (th->syn ? 1 : 0), (th->ack ? 1 : 0), + (th->fin ? 1 : 0), (th->rst ? 1 : 0), + old_state, new_state); + + conntrack->proto.tcp.state = new_state; + if (old_state != new_state + && (new_state == TCP_CONNTRACK_FIN_WAIT + || new_state == TCP_CONNTRACK_CLOSE)) + conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans + && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans + ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; + WRITE_UNLOCK(&tcp_lock); + + if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + /* If only reply is a RST, we can consider ourselves not to + have an established connection: this is a fairly common + problem case, so we can delete the conntrack + immediately. --RR */ + if (th->rst) { + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return NF_ACCEPT; + } + } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + && (old_state == TCP_CONNTRACK_SYN_RECV + || old_state == TCP_CONNTRACK_ESTABLISHED) + && new_state == TCP_CONNTRACK_ESTABLISHED) { + /* Set ASSURED if we see see valid ack in ESTABLISHED + after SYN_RECV or a valid answer for a picked up + connection. */ + set_bit(IPS_ASSURED_BIT, &conntrack->status); + } + ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int tcp_new(struct ip_conntrack *conntrack, + const struct sk_buff *skb) +{ + enum tcp_conntrack new_state; + struct iphdr *iph = skb->nh.iph; + struct tcphdr *th, _tcph; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; +#endif + + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + /* Don't need lock here: this conntrack not in circulation yet */ + new_state + = tcp_conntracks[0][get_conntrack_index(th)] + [TCP_CONNTRACK_NONE]; + + /* Invalid: delete conntrack */ + if (new_state >= TCP_CONNTRACK_MAX) { + DEBUGP("ip_ct_tcp: invalid new deleting.\n"); + return 0; + } + + if (new_state == TCP_CONNTRACK_SYN_SENT) { + /* SYN packet */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + iph, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end; + + tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]); + conntrack->proto.tcp.seen[1].flags = 0; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = 0; + } else if (ip_ct_tcp_loose == 0) { + /* Don't try to pick up connections. */ + return 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + iph, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end + + conntrack->proto.tcp.seen[0].td_maxwin; + conntrack->proto.tcp.seen[0].td_scale = 0; + + /* We assume SACK. Should we assume window scaling too? */ + conntrack->proto.tcp.seen[0].flags = + conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose; + } + + conntrack->proto.tcp.seen[1].td_end = 0; + conntrack->proto.tcp.seen[1].td_maxend = 0; + conntrack->proto.tcp.seen[1].td_maxwin = 1; + conntrack->proto.tcp.seen[1].td_scale = 0; + + /* tcp_packet will set them */ + conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; + conntrack->proto.tcp.last_index = TCP_NONE_SET; + + DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + return 1; +} + +struct ip_conntrack_protocol ip_conntrack_protocol_tcp = +{ + .proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .new = tcp_new, + .error = tcp_error, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c new file mode 100644 index 000000000000..5bc28a224623 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -0,0 +1,146 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/in.h> +#include <linux/udp.h> +#include <linux/seq_file.h> +#include <net/checksum.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> + +unsigned long ip_ct_udp_timeout = 30*HZ; +unsigned long ip_ct_udp_timeout_stream = 180*HZ; + +static int udp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + struct udphdr _hdr, *hp; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.udp.port = hp->source; + tuple->dst.u.udp.port = hp->dest; + + return 1; +} + +static int udp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + tuple->src.u.udp.port = orig->dst.u.udp.port; + tuple->dst.u.udp.port = orig->src.u.udp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int udp_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); +} + +/* Print out the private part of the conntrack. */ +static int udp_print_conntrack(struct seq_file *s, + const struct ip_conntrack *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, and may modify conntracktype */ +static int udp_packet(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + ip_ct_refresh_acct(conntrack, ctinfo, skb, + ip_ct_udp_timeout_stream); + /* Also, more likely to be important, and not a probe */ + set_bit(IPS_ASSURED_BIT, &conntrack->status); + } else + ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int udp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) +{ + return 1; +} + +static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct iphdr *iph = skb->nh.iph; + unsigned int udplen = skb->len - iph->ihl * 4; + struct udphdr _hdr, *hdr; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); + if (hdr == NULL) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_udp: short packet "); + return -NF_ACCEPT; + } + + /* Truncated/malformed packets */ + if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_udp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Packet with no checksum */ + if (!hdr->check) + return NF_ACCEPT; + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + * FIXME: Source route IP option packets --RR */ + if (hooknum == NF_IP_PRE_ROUTING + && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, iph->ihl*4, udplen, 0))) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_udp: bad UDP checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +struct ip_conntrack_protocol ip_conntrack_protocol_udp = +{ + .proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .print_conntrack = udp_print_conntrack, + .packet = udp_packet, + .new = udp_new, + .error = udp_error, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c new file mode 100644 index 000000000000..80a7bde2a57a --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -0,0 +1,961 @@ +/* This file contains all the functions required for the standalone + ip_conntrack module. + + These are not required by the compatibility layer. +*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/percpu.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif +#include <net/checksum.h> +#include <net/ip.h> + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) + +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +MODULE_LICENSE("GPL"); + +extern atomic_t ip_conntrack_count; +DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); + +static int kill_proto(struct ip_conntrack *i, void *data) +{ + return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == + *((u_int8_t *) data)); +} + +#ifdef CONFIG_PROC_FS +static int +print_tuple(struct seq_file *s, const struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *proto) +{ + seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", + NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip)); + return proto->print_tuple(s, tuple); +} + +#ifdef CONFIG_IP_NF_CT_ACCT +static unsigned int +seq_print_counters(struct seq_file *s, + const struct ip_conntrack_counter *counter) +{ + return seq_printf(s, "packets=%llu bytes=%llu ", + (unsigned long long)counter->packets, + (unsigned long long)counter->bytes); +} +#else +#define seq_print_counters(x, y) 0 +#endif + +struct ct_iter_state { + unsigned int bucket; +}; + +static struct list_head *ct_get_first(struct seq_file *seq) +{ + struct ct_iter_state *st = seq->private; + + for (st->bucket = 0; + st->bucket < ip_conntrack_htable_size; + st->bucket++) { + if (!list_empty(&ip_conntrack_hash[st->bucket])) + return ip_conntrack_hash[st->bucket].next; + } + return NULL; +} + +static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head) +{ + struct ct_iter_state *st = seq->private; + + head = head->next; + while (head == &ip_conntrack_hash[st->bucket]) { + if (++st->bucket >= ip_conntrack_htable_size) + return NULL; + head = ip_conntrack_hash[st->bucket].next; + } + return head; +} + +static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos) +{ + struct list_head *head = ct_get_first(seq); + + if (head) + while (pos && (head = ct_get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *ct_seq_start(struct seq_file *seq, loff_t *pos) +{ + READ_LOCK(&ip_conntrack_lock); + return ct_get_idx(seq, *pos); +} + +static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return ct_get_next(s, v); +} + +static void ct_seq_stop(struct seq_file *s, void *v) +{ + READ_UNLOCK(&ip_conntrack_lock); +} + +static int ct_seq_show(struct seq_file *s, void *v) +{ + const struct ip_conntrack_tuple_hash *hash = v; + const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash); + struct ip_conntrack_protocol *proto; + + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + IP_NF_ASSERT(conntrack); + + /* we only want to print DIR_ORIGINAL */ + if (DIRECTION(hash)) + return 0; + + proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); + IP_NF_ASSERT(proto); + + if (seq_printf(s, "%-8s %u %ld ", + proto->name, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, + timer_pending(&conntrack->timeout) + ? (long)(conntrack->timeout.expires - jiffies)/HZ + : 0) != 0) + return -ENOSPC; + + if (proto->print_conntrack(s, conntrack)) + return -ENOSPC; + + if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + proto)) + return -ENOSPC; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL])) + return -ENOSPC; + + if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) + if (seq_printf(s, "[UNREPLIED] ")) + return -ENOSPC; + + if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, + proto)) + return -ENOSPC; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY])) + return -ENOSPC; + + if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) + if (seq_printf(s, "[ASSURED] ")) + return -ENOSPC; + +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (seq_printf(s, "mark=%lu ", conntrack->mark)) + return -ENOSPC; +#endif + + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) + return -ENOSPC; + + return 0; +} + +static struct seq_operations ct_seq_ops = { + .start = ct_seq_start, + .next = ct_seq_next, + .stop = ct_seq_stop, + .show = ct_seq_show +}; + +static int ct_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct ct_iter_state *st; + int ret; + + st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL); + if (st == NULL) + return -ENOMEM; + ret = seq_open(file, &ct_seq_ops); + if (ret) + goto out_free; + seq = file->private_data; + seq->private = st; + memset(st, 0, sizeof(struct ct_iter_state)); + return ret; +out_free: + kfree(st); + return ret; +} + +static struct file_operations ct_file_ops = { + .owner = THIS_MODULE, + .open = ct_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* expects */ +static void *exp_seq_start(struct seq_file *s, loff_t *pos) +{ + struct list_head *e = &ip_conntrack_expect_list; + loff_t i; + + /* strange seq_file api calls stop even if we fail, + * thus we need to grab lock since stop unlocks */ + READ_LOCK(&ip_conntrack_lock); + + if (list_empty(e)) + return NULL; + + for (i = 0; i <= *pos; i++) { + e = e->next; + if (e == &ip_conntrack_expect_list) + return NULL; + } + return e; +} + +static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct list_head *e = v; + + e = e->next; + + if (e == &ip_conntrack_expect_list) + return NULL; + + return e; +} + +static void exp_seq_stop(struct seq_file *s, void *v) +{ + READ_UNLOCK(&ip_conntrack_lock); +} + +static int exp_seq_show(struct seq_file *s, void *v) +{ + struct ip_conntrack_expect *expect = v; + + if (expect->timeout.function) + seq_printf(s, "%ld ", timer_pending(&expect->timeout) + ? (long)(expect->timeout.expires - jiffies)/HZ : 0); + else + seq_printf(s, "- "); + + seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); + + print_tuple(s, &expect->tuple, + ip_ct_find_proto(expect->tuple.dst.protonum)); + return seq_putc(s, '\n'); +} + +static struct seq_operations exp_seq_ops = { + .start = exp_seq_start, + .next = exp_seq_next, + .stop = exp_seq_stop, + .show = exp_seq_show +}; + +static int exp_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &exp_seq_ops); +} + +static struct file_operations exp_file_ops = { + .owner = THIS_MODULE, + .open = exp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return &per_cpu(ip_conntrack_stat, cpu); + } + + return NULL; +} + +static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int cpu; + + for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return &per_cpu(ip_conntrack_stat, cpu); + } + + return NULL; +} + +static void ct_cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int ct_cpu_seq_show(struct seq_file *seq, void *v) +{ + unsigned int nr_conntracks = atomic_read(&ip_conntrack_count); + struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); + return 0; + } + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x %08x %08x %08x %08x \n", + nr_conntracks, + st->searched, + st->found, + st->new, + st->invalid, + st->ignore, + st->delete, + st->delete_list, + st->insert, + st->insert_failed, + st->drop, + st->early_drop, + st->error, + + st->expect_new, + st->expect_create, + st->expect_delete + ); + return 0; +} + +static struct seq_operations ct_cpu_seq_ops = { + .start = ct_cpu_seq_start, + .next = ct_cpu_seq_next, + .stop = ct_cpu_seq_stop, + .show = ct_cpu_seq_show, +}; + +static int ct_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ct_cpu_seq_ops); +} + +static struct file_operations ct_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = ct_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +static unsigned int ip_confirm(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + /* This is where we call the helper: as the packet goes out. */ + ct = ip_conntrack_get(*pskb, &ctinfo); + if (ct && ct->helper) { + unsigned int ret; + ret = ct->helper->help(pskb, ct, ctinfo); + if (ret != NF_ACCEPT) + return ret; + } + + /* We've seen it coming out the other side: confirm it */ + return ip_conntrack_confirm(pskb); +} + +static unsigned int ip_conntrack_defrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) + /* Previously seen (loopback)? Ignore. Do this before + fragment check. */ + if ((*pskb)->nfct) + return NF_ACCEPT; +#endif + + /* Gather fragments. */ + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = ip_ct_gather_frags(*pskb, + hooknum == NF_IP_PRE_ROUTING ? + IP_DEFRAG_CONNTRACK_IN : + IP_DEFRAG_CONNTRACK_OUT); + if (!*pskb) + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ip_refrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct rtable *rt = (struct rtable *)(*pskb)->dst; + + /* We've seen it coming out the other side: confirm */ + if (ip_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) + return NF_DROP; + + /* Local packets are never produced too large for their + interface. We degfragment them at LOCAL_OUT, however, + so we have to refragment them here. */ + if ((*pskb)->len > dst_mtu(&rt->u.dst) && + !skb_shinfo(*pskb)->tso_size) { + /* No hook can be after us, so this should be OK. */ + ip_fragment(*pskb, okfn); + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ip_conntrack_local(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + return ip_conntrack_in(hooknum, pskb, in, out, okfn); +} + +/* Connection tracking may drop packets, but never alters them, so + make it the first hook. */ +static struct nf_hook_ops ip_conntrack_defrag_ops = { + .hook = ip_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ip_conntrack_in_ops = { + .hook = ip_conntrack_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ip_conntrack_defrag_local_out_ops = { + .hook = ip_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ip_conntrack_local_out_ops = { + .hook = ip_conntrack_local, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK, +}; + +/* Refragmenter; last chance. */ +static struct nf_hook_ops ip_conntrack_out_ops = { + .hook = ip_refrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_LAST, +}; + +static struct nf_hook_ops ip_conntrack_local_in_ops = { + .hook = ip_confirm, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_LAST-1, +}; + +/* Sysctl support */ + +#ifdef CONFIG_SYSCTL + +/* From ip_conntrack_core.c */ +extern int ip_conntrack_max; +extern unsigned int ip_conntrack_htable_size; + +/* From ip_conntrack_proto_tcp.c */ +extern unsigned long ip_ct_tcp_timeout_syn_sent; +extern unsigned long ip_ct_tcp_timeout_syn_recv; +extern unsigned long ip_ct_tcp_timeout_established; +extern unsigned long ip_ct_tcp_timeout_fin_wait; +extern unsigned long ip_ct_tcp_timeout_close_wait; +extern unsigned long ip_ct_tcp_timeout_last_ack; +extern unsigned long ip_ct_tcp_timeout_time_wait; +extern unsigned long ip_ct_tcp_timeout_close; +extern unsigned long ip_ct_tcp_timeout_max_retrans; +extern int ip_ct_tcp_loose; +extern int ip_ct_tcp_be_liberal; +extern int ip_ct_tcp_max_retrans; + +/* From ip_conntrack_proto_udp.c */ +extern unsigned long ip_ct_udp_timeout; +extern unsigned long ip_ct_udp_timeout_stream; + +/* From ip_conntrack_proto_icmp.c */ +extern unsigned long ip_ct_icmp_timeout; + +/* From ip_conntrack_proto_icmp.c */ +extern unsigned long ip_ct_generic_timeout; + +/* Log invalid packets of a given protocol */ +static int log_invalid_proto_min = 0; +static int log_invalid_proto_max = 255; + +static struct ctl_table_header *ip_ct_sysctl_header; + +static ctl_table ip_ct_sysctl_table[] = { + { + .ctl_name = NET_IPV4_NF_CONNTRACK_MAX, + .procname = "ip_conntrack_max", + .data = &ip_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, + .procname = "ip_conntrack_count", + .data = &ip_conntrack_count, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS, + .procname = "ip_conntrack_buckets", + .data = &ip_conntrack_htable_size, + .maxlen = sizeof(unsigned int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, + .procname = "ip_conntrack_tcp_timeout_syn_sent", + .data = &ip_ct_tcp_timeout_syn_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, + .procname = "ip_conntrack_tcp_timeout_syn_recv", + .data = &ip_ct_tcp_timeout_syn_recv, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, + .procname = "ip_conntrack_tcp_timeout_established", + .data = &ip_ct_tcp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, + .procname = "ip_conntrack_tcp_timeout_fin_wait", + .data = &ip_ct_tcp_timeout_fin_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, + .procname = "ip_conntrack_tcp_timeout_close_wait", + .data = &ip_ct_tcp_timeout_close_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, + .procname = "ip_conntrack_tcp_timeout_last_ack", + .data = &ip_ct_tcp_timeout_last_ack, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, + .procname = "ip_conntrack_tcp_timeout_time_wait", + .data = &ip_ct_tcp_timeout_time_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, + .procname = "ip_conntrack_tcp_timeout_close", + .data = &ip_ct_tcp_timeout_close, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, + .procname = "ip_conntrack_udp_timeout", + .data = &ip_ct_udp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, + .procname = "ip_conntrack_udp_timeout_stream", + .data = &ip_ct_udp_timeout_stream, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, + .procname = "ip_conntrack_icmp_timeout", + .data = &ip_ct_icmp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, + .procname = "ip_conntrack_generic_timeout", + .data = &ip_ct_generic_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, + .procname = "ip_conntrack_log_invalid", + .data = &ip_ct_log_invalid, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &log_invalid_proto_min, + .extra2 = &log_invalid_proto_max, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, + .procname = "ip_conntrack_tcp_timeout_max_retrans", + .data = &ip_ct_tcp_timeout_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE, + .procname = "ip_conntrack_tcp_loose", + .data = &ip_ct_tcp_loose, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, + .procname = "ip_conntrack_tcp_be_liberal", + .data = &ip_ct_tcp_be_liberal, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, + .procname = "ip_conntrack_tcp_max_retrans", + .data = &ip_ct_tcp_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +#define NET_IP_CONNTRACK_MAX 2089 + +static ctl_table ip_ct_netfilter_table[] = { + { + .ctl_name = NET_IPV4_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = ip_ct_sysctl_table, + }, + { + .ctl_name = NET_IP_CONNTRACK_MAX, + .procname = "ip_conntrack_max", + .data = &ip_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = ip_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ip_ct_ipv4_table, + }, + { .ctl_name = 0 } +}; + +EXPORT_SYMBOL(ip_ct_log_invalid); +#endif /* CONFIG_SYSCTL */ + +static int init_or_cleanup(int init) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc, *proc_exp, *proc_stat; +#endif + int ret = 0; + + if (!init) goto cleanup; + + ret = ip_conntrack_init(); + if (ret < 0) + goto cleanup_nothing; + +#ifdef CONFIG_PROC_FS + ret = -ENOMEM; + proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops); + if (!proc) goto cleanup_init; + + proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440, + &exp_file_ops); + if (!proc_exp) goto cleanup_proc; + + proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); + if (!proc_stat) + goto cleanup_proc_exp; + + proc_stat->proc_fops = &ct_cpu_seq_fops; + proc_stat->owner = THIS_MODULE; +#endif + + ret = nf_register_hook(&ip_conntrack_defrag_ops); + if (ret < 0) { + printk("ip_conntrack: can't register pre-routing defrag hook.\n"); + goto cleanup_proc_stat; + } + ret = nf_register_hook(&ip_conntrack_defrag_local_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local_out defrag hook.\n"); + goto cleanup_defragops; + } + ret = nf_register_hook(&ip_conntrack_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register pre-routing hook.\n"); + goto cleanup_defraglocalops; + } + ret = nf_register_hook(&ip_conntrack_local_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local out hook.\n"); + goto cleanup_inops; + } + ret = nf_register_hook(&ip_conntrack_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register post-routing hook.\n"); + goto cleanup_inandlocalops; + } + ret = nf_register_hook(&ip_conntrack_local_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local in hook.\n"); + goto cleanup_inoutandlocalops; + } +#ifdef CONFIG_SYSCTL + ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0); + if (ip_ct_sysctl_header == NULL) { + printk("ip_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_localinops; + } +#endif + + return ret; + + cleanup: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ip_ct_sysctl_header); + cleanup_localinops: +#endif + nf_unregister_hook(&ip_conntrack_local_in_ops); + cleanup_inoutandlocalops: + nf_unregister_hook(&ip_conntrack_out_ops); + cleanup_inandlocalops: + nf_unregister_hook(&ip_conntrack_local_out_ops); + cleanup_inops: + nf_unregister_hook(&ip_conntrack_in_ops); + cleanup_defraglocalops: + nf_unregister_hook(&ip_conntrack_defrag_local_out_ops); + cleanup_defragops: + nf_unregister_hook(&ip_conntrack_defrag_ops); + cleanup_proc_stat: +#ifdef CONFIG_PROC_FS + remove_proc_entry("ip_conntrack", proc_net_stat); + cleanup_proc_exp: + proc_net_remove("ip_conntrack_expect"); + cleanup_proc: + proc_net_remove("ip_conntrack"); + cleanup_init: +#endif /* CONFIG_PROC_FS */ + ip_conntrack_cleanup(); + cleanup_nothing: + return ret; +} + +/* FIXME: Allow NULL functions and sub in pointers to generic for + them. --RR */ +int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto) +{ + int ret = 0; + + WRITE_LOCK(&ip_conntrack_lock); + if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { + ret = -EBUSY; + goto out; + } + ip_ct_protos[proto->proto] = proto; + out: + WRITE_UNLOCK(&ip_conntrack_lock); + return ret; +} + +void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) +{ + WRITE_LOCK(&ip_conntrack_lock); + ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; + WRITE_UNLOCK(&ip_conntrack_lock); + + /* Somebody could be still looking at the proto in bh. */ + synchronize_net(); + + /* Remove all contrack entries for this protocol */ + ip_ct_iterate_cleanup(kill_proto, &proto->proto); +} + +static int __init init(void) +{ + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +/* Some modules need us, but don't depend directly on any symbol. + They should call this. */ +void need_ip_conntrack(void) +{ +} + +EXPORT_SYMBOL(ip_conntrack_protocol_register); +EXPORT_SYMBOL(ip_conntrack_protocol_unregister); +EXPORT_SYMBOL(ip_ct_get_tuple); +EXPORT_SYMBOL(invert_tuplepr); +EXPORT_SYMBOL(ip_conntrack_alter_reply); +EXPORT_SYMBOL(ip_conntrack_destroyed); +EXPORT_SYMBOL(need_ip_conntrack); +EXPORT_SYMBOL(ip_conntrack_helper_register); +EXPORT_SYMBOL(ip_conntrack_helper_unregister); +EXPORT_SYMBOL(ip_ct_iterate_cleanup); +EXPORT_SYMBOL(ip_ct_refresh_acct); +EXPORT_SYMBOL(ip_ct_protos); +EXPORT_SYMBOL(ip_ct_find_proto); +EXPORT_SYMBOL(ip_conntrack_expect_alloc); +EXPORT_SYMBOL(ip_conntrack_expect_free); +EXPORT_SYMBOL(ip_conntrack_expect_related); +EXPORT_SYMBOL(ip_conntrack_unexpect_related); +EXPORT_SYMBOL(ip_conntrack_tuple_taken); +EXPORT_SYMBOL(ip_ct_gather_frags); +EXPORT_SYMBOL(ip_conntrack_htable_size); +EXPORT_SYMBOL(ip_conntrack_lock); +EXPORT_SYMBOL(ip_conntrack_hash); +EXPORT_SYMBOL(ip_conntrack_untracked); +EXPORT_SYMBOL_GPL(ip_conntrack_find_get); +EXPORT_SYMBOL_GPL(ip_conntrack_put); +#ifdef CONFIG_IP_NF_NAT_NEEDED +EXPORT_SYMBOL(ip_conntrack_tcp_update); +#endif diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c new file mode 100644 index 000000000000..992fac3e36ee --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_tftp.c @@ -0,0 +1,159 @@ +/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Version: 0.0.7 + * + * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org> + * - port to newnat API + * + */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/udp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_tftp.h> +#include <linux/moduleparam.h> + +MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); +MODULE_DESCRIPTION("tftp connection tracking helper"); +MODULE_LICENSE("GPL"); + +#define MAX_PORTS 8 +static int ports[MAX_PORTS]; +static int ports_c; +module_param_array(ports, int, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of tftp servers"); + +#if 0 +#define DEBUGP(format, args...) printk("%s:%s:" format, \ + __FILE__, __FUNCTION__ , ## args) +#else +#define DEBUGP(format, args...) +#endif + +unsigned int (*ip_nat_tftp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp); +EXPORT_SYMBOL_GPL(ip_nat_tftp_hook); + +static int tftp_help(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + struct tftphdr _tftph, *tfh; + struct ip_conntrack_expect *exp; + unsigned int ret = NF_ACCEPT; + + tfh = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr), + sizeof(_tftph), &_tftph); + if (tfh == NULL) + return NF_ACCEPT; + + switch (ntohs(tfh->opcode)) { + /* RRQ and WRQ works the same way */ + case TFTP_OPCODE_READ: + case TFTP_OPCODE_WRITE: + DEBUGP(""); + DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + exp = ip_conntrack_expect_alloc(); + if (exp == NULL) + return NF_DROP; + + exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + exp->mask.src.ip = 0xffffffff; + exp->mask.dst.ip = 0xffffffff; + exp->mask.dst.u.udp.port = 0xffff; + exp->mask.dst.protonum = 0xff; + exp->expectfn = NULL; + exp->master = ct; + + DEBUGP("expect: "); + DUMP_TUPLE(&exp->tuple); + DUMP_TUPLE(&exp->mask); + if (ip_nat_tftp_hook) + ret = ip_nat_tftp_hook(pskb, ctinfo, exp); + else if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + ret = NF_DROP; + } + break; + case TFTP_OPCODE_DATA: + case TFTP_OPCODE_ACK: + DEBUGP("Data/ACK opcode\n"); + break; + case TFTP_OPCODE_ERROR: + DEBUGP("Error opcode\n"); + break; + default: + DEBUGP("Unknown opcode\n"); + } + return NF_ACCEPT; +} + +static struct ip_conntrack_helper tftp[MAX_PORTS]; +static char tftp_names[MAX_PORTS][10]; + +static void fini(void) +{ + int i; + + for (i = 0 ; i < ports_c; i++) { + DEBUGP("unregistering helper for port %d\n", + ports[i]); + ip_conntrack_helper_unregister(&tftp[i]); + } +} + +static int __init init(void) +{ + int i, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = TFTP_PORT; + + for (i = 0; i < ports_c; i++) { + /* Create helper structure */ + memset(&tftp[i], 0, sizeof(struct ip_conntrack_helper)); + + tftp[i].tuple.dst.protonum = IPPROTO_UDP; + tftp[i].tuple.src.u.udp.port = htons(ports[i]); + tftp[i].mask.dst.protonum = 0xFF; + tftp[i].mask.src.u.udp.port = 0xFFFF; + tftp[i].max_expected = 1; + tftp[i].timeout = 5 * 60; /* 5 minutes */ + tftp[i].me = THIS_MODULE; + tftp[i].help = tftp_help; + + tmpname = &tftp_names[i][0]; + if (ports[i] == TFTP_PORT) + sprintf(tmpname, "tftp"); + else + sprintf(tmpname, "tftp-%d", i); + tftp[i].name = tmpname; + + DEBUGP("port #%d: %d\n", i, ports[i]); + + ret=ip_conntrack_helper_register(&tftp[i]); + if (ret) { + printk("ERROR registering helper for port %d\n", + ports[i]); + fini(); + return(ret); + } + } + return(0); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c new file mode 100644 index 000000000000..da1f412583ed --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_amanda.c @@ -0,0 +1,88 @@ +/* Amanda extension for TCP NAT alteration. + * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> + * based on a copy of HW's ip_nat_irc.c as well as other modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Module load syntax: + * insmod ip_nat_amanda.o + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <net/tcp.h> +#include <net/udp.h> + +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> + + +MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); +MODULE_DESCRIPTION("Amanda NAT helper"); +MODULE_LICENSE("GPL"); + +static unsigned int help(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp) +{ + char buffer[sizeof("65535")]; + u_int16_t port; + unsigned int ret; + + /* Connection comes from client. */ + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_ORIGINAL; + + /* When you see the packet, we need to NAT it the same as the + * this one (ie. same IP: it will be TCP and master is UDP). */ + exp->expectfn = ip_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + exp->tuple.dst.u.tcp.port = htons(port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (port == 0) { + ip_conntrack_expect_free(exp); + return NF_DROP; + } + + sprintf(buffer, "%u", port); + ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo, + matchoff, matchlen, + buffer, strlen(buffer)); + if (ret != NF_ACCEPT) + ip_conntrack_unexpect_related(exp); + return ret; +} + +static void __exit fini(void) +{ + ip_nat_amanda_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_amanda_hook); + ip_nat_amanda_hook = help; + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c new file mode 100644 index 000000000000..162ceacfc29a --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -0,0 +1,556 @@ +/* NAT for netfilter; shared with compatibility layer. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv4.h> +#include <linux/vmalloc.h> +#include <net/checksum.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/tcp.h> /* For tcp_prot in getorigdst */ +#include <linux/icmp.h> +#include <linux/udp.h> +#include <linux/jhash.h> + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_RWLOCK(ip_nat_lock); + +/* Calculated at init based on memory size */ +static unsigned int ip_nat_htable_size; + +static struct list_head *bysource; +struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; + + +/* We keep an extra hash for each conntrack, for fast searching. */ +static inline unsigned int +hash_by_src(const struct ip_conntrack_tuple *tuple) +{ + /* Original src, to ensure we map it consistently if poss. */ + return jhash_3words(tuple->src.ip, tuple->src.u.all, + tuple->dst.protonum, 0) % ip_nat_htable_size; +} + +/* Noone using conntrack by the time this called. */ +static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn) +{ + if (!(conn->status & IPS_NAT_DONE_MASK)) + return; + + WRITE_LOCK(&ip_nat_lock); + list_del(&conn->nat.info.bysource); + WRITE_UNLOCK(&ip_nat_lock); +} + +/* We do checksum mangling, so if they were wrong before they're still + * wrong. Also works for incomplete packets (eg. ICMP dest + * unreachables.) */ +u_int16_t +ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck) +{ + u_int32_t diffs[] = { oldvalinv, newval }; + return csum_fold(csum_partial((char *)diffs, sizeof(diffs), + oldcheck^0xFFFF)); +} + +/* Is this tuple already taken? (not by us) */ +int +ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + /* Conntrack tracking doesn't keep track of outgoing tuples; only + incoming ones. NAT means they don't have a fixed mapping, + so we invert the tuple and look for the incoming reply. + + We could keep a separate hash if this proves too slow. */ + struct ip_conntrack_tuple reply; + + invert_tuplepr(&reply, tuple); + return ip_conntrack_tuple_taken(&reply, ignored_conntrack); +} + +/* If we source map this tuple so reply looks like reply_tuple, will + * that meet the constraints of range. */ +static int +in_range(const struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range) +{ + struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum); + + /* If we are supposed to map IPs, then we must be in the + range specified, otherwise let this drag us onto a new src IP. */ + if (range->flags & IP_NAT_RANGE_MAP_IPS) { + if (ntohl(tuple->src.ip) < ntohl(range->min_ip) + || ntohl(tuple->src.ip) > ntohl(range->max_ip)) + return 0; + } + + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) + || proto->in_range(tuple, IP_NAT_MANIP_SRC, + &range->min, &range->max)) + return 1; + + return 0; +} + +static inline int +same_src(const struct ip_conntrack *ct, + const struct ip_conntrack_tuple *tuple) +{ + return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum + == tuple->dst.protonum + && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip + == tuple->src.ip + && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all + == tuple->src.u.all); +} + +/* Only called for SRC manip */ +static int +find_appropriate_src(const struct ip_conntrack_tuple *tuple, + struct ip_conntrack_tuple *result, + const struct ip_nat_range *range) +{ + unsigned int h = hash_by_src(tuple); + struct ip_conntrack *ct; + + READ_LOCK(&ip_nat_lock); + list_for_each_entry(ct, &bysource[h], nat.info.bysource) { + if (same_src(ct, tuple)) { + /* Copy source part from reply tuple. */ + invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + if (in_range(result, range)) { + READ_UNLOCK(&ip_nat_lock); + return 1; + } + } + } + READ_UNLOCK(&ip_nat_lock); + return 0; +} + +/* For [FUTURE] fragmentation handling, we want the least-used + src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus + if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports + 1-65535, we don't do pro-rata allocation based on ports; we choose + the ip with the lowest src-ip/dst-ip/proto usage. +*/ +static void +find_best_ips_proto(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + const struct ip_conntrack *conntrack, + enum ip_nat_manip_type maniptype) +{ + u_int32_t *var_ipp; + /* Host order */ + u_int32_t minip, maxip, j; + + /* No IP mapping? Do nothing. */ + if (!(range->flags & IP_NAT_RANGE_MAP_IPS)) + return; + + if (maniptype == IP_NAT_MANIP_SRC) + var_ipp = &tuple->src.ip; + else + var_ipp = &tuple->dst.ip; + + /* Fast path: only one choice. */ + if (range->min_ip == range->max_ip) { + *var_ipp = range->min_ip; + return; + } + + /* Hashing source and destination IPs gives a fairly even + * spread in practice (if there are a small number of IPs + * involved, there usually aren't that many connections + * anyway). The consistency means that servers see the same + * client coming from the same IP (some Internet Banking sites + * like this), even across reboots. */ + minip = ntohl(range->min_ip); + maxip = ntohl(range->max_ip); + j = jhash_2words(tuple->src.ip, tuple->dst.ip, 0); + *var_ipp = htonl(minip + j % (maxip - minip + 1)); +} + +/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING, + * we change the source to map into the range. For NF_IP_PRE_ROUTING + * and NF_IP_LOCAL_OUT, we change the destination to map into the + * range. It might not be possible to get a unique tuple, but we try. + * At worst (or if we race), we will end up with a final duplicate in + * __ip_conntrack_confirm and drop the packet. */ +static void +get_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig_tuple, + const struct ip_nat_range *range, + struct ip_conntrack *conntrack, + enum ip_nat_manip_type maniptype) +{ + struct ip_nat_protocol *proto + = ip_nat_find_proto(orig_tuple->dst.protonum); + + /* 1) If this srcip/proto/src-proto-part is currently mapped, + and that same mapping gives a unique tuple within the given + range, use that. + + This is only required for source (ie. NAT/masq) mappings. + So far, we don't do local source mappings, so multiple + manips not an issue. */ + if (maniptype == IP_NAT_MANIP_SRC) { + if (find_appropriate_src(orig_tuple, tuple, range)) { + DEBUGP("get_unique_tuple: Found current src map\n"); + if (!ip_nat_used_tuple(tuple, conntrack)) + return; + } + } + + /* 2) Select the least-used IP/proto combination in the given + range. */ + *tuple = *orig_tuple; + find_best_ips_proto(tuple, range, conntrack, maniptype); + + /* 3) The per-protocol part of the manip is made to map into + the range to make a unique tuple. */ + + /* Only bother mapping if it's not already in range and unique */ + if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) + || proto->in_range(tuple, maniptype, &range->min, &range->max)) + && !ip_nat_used_tuple(tuple, conntrack)) + return; + + /* Last change: get protocol to try to obtain unique tuple. */ + proto->unique_tuple(tuple, range, maniptype, conntrack); +} + +unsigned int +ip_nat_setup_info(struct ip_conntrack *conntrack, + const struct ip_nat_range *range, + unsigned int hooknum) +{ + struct ip_conntrack_tuple curr_tuple, new_tuple; + struct ip_nat_info *info = &conntrack->nat.info; + int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK); + enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + || hooknum == NF_IP_POST_ROUTING + || hooknum == NF_IP_LOCAL_IN + || hooknum == NF_IP_LOCAL_OUT); + BUG_ON(ip_nat_initialized(conntrack, maniptype)); + + /* What we've got will look like inverse of reply. Normally + this is what is in the conntrack, except for prior + manipulations (future optimization: if num_manips == 0, + orig_tp = + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ + invert_tuplepr(&curr_tuple, + &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple); + + get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype); + + if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) { + struct ip_conntrack_tuple reply; + + /* Alter conntrack table so will recognize replies. */ + invert_tuplepr(&reply, &new_tuple); + ip_conntrack_alter_reply(conntrack, &reply); + + /* Non-atomic: we own this at the moment. */ + if (maniptype == IP_NAT_MANIP_SRC) + conntrack->status |= IPS_SRC_NAT; + else + conntrack->status |= IPS_DST_NAT; + } + + /* Place in source hash if this is the first time. */ + if (have_to_hash) { + unsigned int srchash + = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple); + WRITE_LOCK(&ip_nat_lock); + list_add(&info->bysource, &bysource[srchash]); + WRITE_UNLOCK(&ip_nat_lock); + } + + /* It's done. */ + if (maniptype == IP_NAT_MANIP_DST) + set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status); + else + set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status); + + return NF_ACCEPT; +} + +/* Returns true if succeeded. */ +static int +manip_pkt(u_int16_t proto, + struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *target, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph; + + (*pskb)->nfcache |= NFC_ALTERED; + if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph))) + return 0; + + iph = (void *)(*pskb)->data + iphdroff; + + /* Manipulate protcol part. */ + if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff, + target, maniptype)) + return 0; + + iph = (void *)(*pskb)->data + iphdroff; + + if (maniptype == IP_NAT_MANIP_SRC) { + iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip, + iph->check); + iph->saddr = target->src.ip; + } else { + iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip, + iph->check); + iph->daddr = target->dst.ip; + } + return 1; +} + +/* Do packet manipulations according to ip_nat_setup_info. */ +unsigned int nat_packet(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned int hooknum, + struct sk_buff **pskb) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned long statusbit; + enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum); + + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) + && (hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_IN)) { + DEBUGP("ip_nat_core: adjusting sequence number\n"); + /* future: put this in a l4-proto specific function, + * and call this function here. */ + if (!ip_nat_seq_adjust(pskb, ct, ctinfo)) + return NF_DROP; + } + + if (mtype == IP_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply dir. */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + /* Non-atomic: these bits don't change. */ + if (ct->status & statusbit) { + struct ip_conntrack_tuple target; + + /* We are aiming to look like inverse of other direction. */ + invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + + if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype)) + return NF_DROP; + } + return NF_ACCEPT; +} + +/* Dir is direction ICMP is coming from (opposite to packet it contains) */ +int icmp_reply_translation(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_nat_manip_type manip, + enum ip_conntrack_dir dir) +{ + struct { + struct icmphdr icmp; + struct iphdr ip; + } *inside; + struct ip_conntrack_tuple inner, target; + int hdrlen = (*pskb)->nh.iph->ihl * 4; + + if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside))) + return 0; + + inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + + /* We're actually going to mangle it beyond trivial checksum + adjustment, so make sure the current checksum is correct. */ + if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) { + hdrlen = (*pskb)->nh.iph->ihl * 4; + if ((u16)csum_fold(skb_checksum(*pskb, hdrlen, + (*pskb)->len - hdrlen, 0))) + return 0; + } + + /* Must be RELATED */ + IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED || + (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY); + + /* Redirects on non-null nats must be dropped, else they'll + start talking to each other without our translation, and be + confused... --RR */ + if (inside->icmp.type == ICMP_REDIRECT) { + /* If NAT isn't finished, assume it and drop. */ + if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) + return 0; + + if (ct->status & IPS_NAT_MASK) + return 0; + } + + DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n", + *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); + + if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + + sizeof(struct icmphdr) + inside->ip.ihl*4, + &inner, ip_ct_find_proto(inside->ip.protocol))) + return 0; + + /* Change inner back to look like incoming packet. We do the + opposite manip on this hook to normal, because it might not + pass all hooks (locally-generated ICMP). Consider incoming + packet: PREROUTING (DST manip), routing produces ICMP, goes + through POSTROUTING (which must correct the DST manip). */ + if (!manip_pkt(inside->ip.protocol, pskb, + (*pskb)->nh.iph->ihl*4 + + sizeof(inside->icmp), + &ct->tuplehash[!dir].tuple, + !manip)) + return 0; + + /* Reloading "inside" here since manip_pkt inner. */ + inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + inside->icmp.checksum = 0; + inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen, + (*pskb)->len - hdrlen, + 0)); + + /* Change outer to look the reply to an incoming packet + * (proto 0 means don't invert per-proto part). */ + + /* Obviously, we need to NAT destination IP, but source IP + should be NAT'ed only if it is from a NAT'd host. + + Explanation: some people use NAT for anonymizing. Also, + CERT recommends dropping all packets from private IP + addresses (although ICMP errors from internal links with + such addresses are not too uncommon, as Alan Cox points + out) */ + if (manip != IP_NAT_MANIP_SRC + || ((*pskb)->nh.iph->saddr == ct->tuplehash[dir].tuple.src.ip)) { + invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + if (!manip_pkt(0, pskb, 0, &target, manip)) + return 0; + } + + return 1; +} + +/* Protocol registration. */ +int ip_nat_protocol_register(struct ip_nat_protocol *proto) +{ + int ret = 0; + + WRITE_LOCK(&ip_nat_lock); + if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { + ret = -EBUSY; + goto out; + } + ip_nat_protos[proto->protonum] = proto; + out: + WRITE_UNLOCK(&ip_nat_lock); + return ret; +} + +/* Noone stores the protocol anywhere; simply delete it. */ +void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) +{ + WRITE_LOCK(&ip_nat_lock); + ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; + WRITE_UNLOCK(&ip_nat_lock); + + /* Someone could be still looking at the proto in a bh. */ + synchronize_net(); +} + +int __init ip_nat_init(void) +{ + size_t i; + + /* Leave them the same for the moment. */ + ip_nat_htable_size = ip_conntrack_htable_size; + + /* One vmalloc for both hash tables */ + bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size); + if (!bysource) + return -ENOMEM; + + /* Sew in builtin protocols. */ + WRITE_LOCK(&ip_nat_lock); + for (i = 0; i < MAX_IP_NAT_PROTO; i++) + ip_nat_protos[i] = &ip_nat_unknown_protocol; + ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; + ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; + ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; + WRITE_UNLOCK(&ip_nat_lock); + + for (i = 0; i < ip_nat_htable_size; i++) { + INIT_LIST_HEAD(&bysource[i]); + } + + /* FIXME: Man, this is a hack. <SIGH> */ + IP_NF_ASSERT(ip_conntrack_destroyed == NULL); + ip_conntrack_destroyed = &ip_nat_cleanup_conntrack; + + /* Initialize fake conntrack so that NAT will skip it */ + ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK; + return 0; +} + +/* Clear NAT section of all conntracks, in case we're loaded again. */ +static int clean_nat(struct ip_conntrack *i, void *data) +{ + memset(&i->nat, 0, sizeof(i->nat)); + i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); + return 0; +} + +/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */ +void ip_nat_cleanup(void) +{ + ip_ct_iterate_cleanup(&clean_nat, NULL); + ip_conntrack_destroyed = NULL; + vfree(bysource); +} diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c new file mode 100644 index 000000000000..c6000e794ad6 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_ftp.c @@ -0,0 +1,183 @@ +/* FTP extension for TCP NAT alteration. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/netfilter_ipv4.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/moduleparam.h> +#include <net/tcp.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +MODULE_DESCRIPTION("ftp NAT helper"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* FIXME: Time out? --RR */ + +static int +mangle_rfc959_packet(struct sk_buff **pskb, + u_int32_t newip, + u_int16_t port, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + u32 *seq) +{ + char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")]; + + sprintf(buffer, "%u,%u,%u,%u,%u,%u", + NIPQUAD(newip), port>>8, port&0xFF); + + DEBUGP("calling ip_nat_mangle_tcp_packet\n"); + + *seq += strlen(buffer) - matchlen; + return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, + matchlen, buffer, strlen(buffer)); +} + +/* |1|132.235.1.2|6275| */ +static int +mangle_eprt_packet(struct sk_buff **pskb, + u_int32_t newip, + u_int16_t port, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + u32 *seq) +{ + char buffer[sizeof("|1|255.255.255.255|65535|")]; + + sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port); + + DEBUGP("calling ip_nat_mangle_tcp_packet\n"); + + *seq += strlen(buffer) - matchlen; + return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, + matchlen, buffer, strlen(buffer)); +} + +/* |1|132.235.1.2|6275| */ +static int +mangle_epsv_packet(struct sk_buff **pskb, + u_int32_t newip, + u_int16_t port, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + u32 *seq) +{ + char buffer[sizeof("|||65535|")]; + + sprintf(buffer, "|||%u|", port); + + DEBUGP("calling ip_nat_mangle_tcp_packet\n"); + + *seq += strlen(buffer) - matchlen; + return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, + matchlen, buffer, strlen(buffer)); +} + +static int (*mangle[])(struct sk_buff **, u_int32_t, u_int16_t, + unsigned int, + unsigned int, + struct ip_conntrack *, + enum ip_conntrack_info, + u32 *seq) += { [IP_CT_FTP_PORT] = mangle_rfc959_packet, + [IP_CT_FTP_PASV] = mangle_rfc959_packet, + [IP_CT_FTP_EPRT] = mangle_eprt_packet, + [IP_CT_FTP_EPSV] = mangle_epsv_packet +}; + +/* So, this packet has hit the connection tracking matching code. + Mangle it, and change the expectation to match the new version. */ +static unsigned int ip_nat_ftp(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + enum ip_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp, + u32 *seq) +{ + u_int32_t newip; + u_int16_t port; + int dir = CTINFO2DIR(ctinfo); + struct ip_conntrack *ct = exp->master; + + DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); + + /* Connection will come from wherever this packet goes, hence !dir */ + newip = ct->tuplehash[!dir].tuple.dst.ip; + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = !dir; + + /* When you see the packet, we need to NAT it the same as the + * this one. */ + exp->expectfn = ip_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + exp->tuple.dst.u.tcp.port = htons(port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (port == 0) { + ip_conntrack_expect_free(exp); + return NF_DROP; + } + + if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo, + seq)) { + ip_conntrack_unexpect_related(exp); + return NF_DROP; + } + return NF_ACCEPT; +} + +static void __exit fini(void) +{ + ip_nat_ftp_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_ftp_hook); + ip_nat_ftp_hook = ip_nat_ftp; + return 0; +} + +/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ +static int warn_set(const char *val, struct kernel_param *kp) +{ + printk(KERN_INFO __stringify(KBUILD_MODNAME) + ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c new file mode 100644 index 000000000000..1637b96d8c01 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_helper.c @@ -0,0 +1,430 @@ +/* ip_nat_helper.c - generic support functions for NAT helpers + * + * (C) 2000-2002 Harald Welte <laforge@netfilter.org> + * (C) 2003-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 14 Jan 2002 Harald Welte <laforge@gnumonks.org>: + * - add support for SACK adjustment + * 14 Mar 2002 Harald Welte <laforge@gnumonks.org>: + * - merge SACK support into newnat API + * 16 Aug 2002 Brian J. Murrell <netfilter@interlinx.bc.ca>: + * - make ip_nat_resize_packet more generic (TCP and UDP) + * - add ip_nat_mangle_udp_packet + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kmod.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv4.h> +#include <net/checksum.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/tcp.h> +#include <net/udp.h> + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#define DUMP_OFFSET(x) printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", x->offset_before, x->offset_after, x->correction_pos); +#else +#define DEBUGP(format, args...) +#define DUMP_OFFSET(x) +#endif + +static DECLARE_LOCK(ip_nat_seqofs_lock); + +/* Setup TCP sequence correction given this change at this sequence */ +static inline void +adjust_tcp_sequence(u32 seq, + int sizediff, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + int dir; + struct ip_nat_seq *this_way, *other_way; + + DEBUGP("ip_nat_resize_packet: old_size = %u, new_size = %u\n", + (*skb)->len, new_size); + + dir = CTINFO2DIR(ctinfo); + + this_way = &ct->nat.info.seq[dir]; + other_way = &ct->nat.info.seq[!dir]; + + DEBUGP("ip_nat_resize_packet: Seq_offset before: "); + DUMP_OFFSET(this_way); + + LOCK_BH(&ip_nat_seqofs_lock); + + /* SYN adjust. If it's uninitialized, or this is after last + * correction, record it: we don't handle more than one + * adjustment in the window, but do deal with common case of a + * retransmit */ + if (this_way->offset_before == this_way->offset_after + || before(this_way->correction_pos, seq)) { + this_way->correction_pos = seq; + this_way->offset_before = this_way->offset_after; + this_way->offset_after += sizediff; + } + UNLOCK_BH(&ip_nat_seqofs_lock); + + DEBUGP("ip_nat_resize_packet: Seq_offset after: "); + DUMP_OFFSET(this_way); +} + +/* Frobs data inside this packet, which is linear. */ +static void mangle_contents(struct sk_buff *skb, + unsigned int dataoff, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + unsigned char *data; + + BUG_ON(skb_is_nonlinear(skb)); + data = (unsigned char *)skb->nh.iph + dataoff; + + /* move post-replacement */ + memmove(data + match_offset + rep_len, + data + match_offset + match_len, + skb->tail - (data + match_offset + match_len)); + + /* insert data from buffer */ + memcpy(data + match_offset, rep_buffer, rep_len); + + /* update skb info */ + if (rep_len > match_len) { + DEBUGP("ip_nat_mangle_packet: Extending packet by " + "%u from %u bytes\n", rep_len - match_len, + skb->len); + skb_put(skb, rep_len - match_len); + } else { + DEBUGP("ip_nat_mangle_packet: Shrinking packet from " + "%u from %u bytes\n", match_len - rep_len, + skb->len); + __skb_trim(skb, skb->len + rep_len - match_len); + } + + /* fix IP hdr checksum information */ + skb->nh.iph->tot_len = htons(skb->len); + ip_send_check(skb->nh.iph); +} + +/* Unusual, but possible case. */ +static int enlarge_skb(struct sk_buff **pskb, unsigned int extra) +{ + struct sk_buff *nskb; + + if ((*pskb)->len + extra > 65535) + return 0; + + nskb = skb_copy_expand(*pskb, skb_headroom(*pskb), extra, GFP_ATOMIC); + if (!nskb) + return 0; + + /* Transfer socket to new skb. */ + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); +#ifdef CONFIG_NETFILTER_DEBUG + nskb->nf_debug = (*pskb)->nf_debug; +#endif + kfree_skb(*pskb); + *pskb = nskb; + return 1; +} + +/* Generic function for mangling variable-length address changes inside + * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX + * command in FTP). + * + * Takes care about all the nasty sequence number changes, checksumming, + * skb enlargement, ... + * + * */ +int +ip_nat_mangle_tcp_packet(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + struct iphdr *iph; + struct tcphdr *tcph; + int datalen; + + if (!skb_ip_make_writable(pskb, (*pskb)->len)) + return 0; + + if (rep_len > match_len + && rep_len - match_len > skb_tailroom(*pskb) + && !enlarge_skb(pskb, rep_len - match_len)) + return 0; + + SKB_LINEAR_ASSERT(*pskb); + + iph = (*pskb)->nh.iph; + tcph = (void *)iph + iph->ihl*4; + + mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4, + match_offset, match_len, rep_buffer, rep_len); + + datalen = (*pskb)->len - iph->ihl*4; + tcph->check = 0; + tcph->check = tcp_v4_check(tcph, datalen, iph->saddr, iph->daddr, + csum_partial((char *)tcph, datalen, 0)); + + if (rep_len != match_len) { + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + adjust_tcp_sequence(ntohl(tcph->seq), + (int)rep_len - (int)match_len, + ct, ctinfo); + /* Tell TCP window tracking about seq change */ + ip_conntrack_tcp_update(*pskb, ct, CTINFO2DIR(ctinfo)); + } + return 1; +} + +/* Generic function for mangling variable-length address changes inside + * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX + * command in the Amanda protocol) + * + * Takes care about all the nasty sequence number changes, checksumming, + * skb enlargement, ... + * + * XXX - This function could be merged with ip_nat_mangle_tcp_packet which + * should be fairly easy to do. + */ +int +ip_nat_mangle_udp_packet(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + struct iphdr *iph; + struct udphdr *udph; + + /* UDP helpers might accidentally mangle the wrong packet */ + iph = (*pskb)->nh.iph; + if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) + + match_offset + match_len) + return 0; + + if (!skb_ip_make_writable(pskb, (*pskb)->len)) + return 0; + + if (rep_len > match_len + && rep_len - match_len > skb_tailroom(*pskb) + && !enlarge_skb(pskb, rep_len - match_len)) + return 0; + + iph = (*pskb)->nh.iph; + udph = (void *)iph + iph->ihl*4; + mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph), + match_offset, match_len, rep_buffer, rep_len); + + /* update the length of the UDP packet */ + udph->len = htons((*pskb)->len - iph->ihl*4); + + /* fix udp checksum if udp checksum was previously calculated */ + if (udph->check) { + int datalen = (*pskb)->len - iph->ihl * 4; + udph->check = 0; + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, IPPROTO_UDP, + csum_partial((char *)udph, + datalen, 0)); + } + + return 1; +} + +/* Adjust one found SACK option including checksum correction */ +static void +sack_adjust(struct sk_buff *skb, + struct tcphdr *tcph, + unsigned int sackoff, + unsigned int sackend, + struct ip_nat_seq *natseq) +{ + while (sackoff < sackend) { + struct tcp_sack_block *sack; + u_int32_t new_start_seq, new_end_seq; + + sack = (void *)skb->data + sackoff; + if (after(ntohl(sack->start_seq) - natseq->offset_before, + natseq->correction_pos)) + new_start_seq = ntohl(sack->start_seq) + - natseq->offset_after; + else + new_start_seq = ntohl(sack->start_seq) + - natseq->offset_before; + new_start_seq = htonl(new_start_seq); + + if (after(ntohl(sack->end_seq) - natseq->offset_before, + natseq->correction_pos)) + new_end_seq = ntohl(sack->end_seq) + - natseq->offset_after; + else + new_end_seq = ntohl(sack->end_seq) + - natseq->offset_before; + new_end_seq = htonl(new_end_seq); + + DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", + ntohl(sack->start_seq), new_start_seq, + ntohl(sack->end_seq), new_end_seq); + + tcph->check = + ip_nat_cheat_check(~sack->start_seq, new_start_seq, + ip_nat_cheat_check(~sack->end_seq, + new_end_seq, + tcph->check)); + sack->start_seq = new_start_seq; + sack->end_seq = new_end_seq; + sackoff += sizeof(*sack); + } +} + +/* TCP SACK sequence number adjustment */ +static inline unsigned int +ip_nat_sack_adjust(struct sk_buff **pskb, + struct tcphdr *tcph, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dir, optoff, optend; + + optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); + optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; + + if (!skb_ip_make_writable(pskb, optend)) + return 0; + + dir = CTINFO2DIR(ctinfo); + + while (optoff < optend) { + /* Usually: option, length. */ + unsigned char *op = (*pskb)->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + /* no partial options */ + if (optoff + 1 == optend + || optoff + op[1] > optend + || op[1] < 2) + return 0; + if (op[0] == TCPOPT_SACK + && op[1] >= 2+TCPOLEN_SACK_PERBLOCK + && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) + sack_adjust(*pskb, tcph, optoff+2, + optoff+op[1], + &ct->nat.info.seq[!dir]); + optoff += op[1]; + } + } + return 1; +} + +/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ +int +ip_nat_seq_adjust(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + struct tcphdr *tcph; + int dir, newseq, newack; + struct ip_nat_seq *this_way, *other_way; + + dir = CTINFO2DIR(ctinfo); + + this_way = &ct->nat.info.seq[dir]; + other_way = &ct->nat.info.seq[!dir]; + + if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + return 0; + + tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + if (after(ntohl(tcph->seq), this_way->correction_pos)) + newseq = ntohl(tcph->seq) + this_way->offset_after; + else + newseq = ntohl(tcph->seq) + this_way->offset_before; + newseq = htonl(newseq); + + if (after(ntohl(tcph->ack_seq) - other_way->offset_before, + other_way->correction_pos)) + newack = ntohl(tcph->ack_seq) - other_way->offset_after; + else + newack = ntohl(tcph->ack_seq) - other_way->offset_before; + newack = htonl(newack); + + tcph->check = ip_nat_cheat_check(~tcph->seq, newseq, + ip_nat_cheat_check(~tcph->ack_seq, + newack, + tcph->check)); + + DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n", + ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), + ntohl(newack)); + + tcph->seq = newseq; + tcph->ack_seq = newack; + + if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo)) + return 0; + + ip_conntrack_tcp_update(*pskb, ct, dir); + + return 1; +} + +/* Setup NAT on this expected conntrack so it follows master. */ +/* If we fail to get a free NAT slot, we'll get dropped on confirm */ +void ip_nat_follow_master(struct ip_conntrack *ct, + struct ip_conntrack_expect *exp) +{ + struct ip_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.dst.ip; + /* hook doesn't matter, but it has to do source manip */ + ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = exp->saved_proto; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.src.ip; + /* hook doesn't matter, but it has to do destination manip */ + ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); +} diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c new file mode 100644 index 000000000000..9c1ca3381d56 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_irc.c @@ -0,0 +1,125 @@ +/* IRC extension for TCP NAT alteration. + * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> + * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation + * based on a copy of RR's ip_nat_ftp.c + * + * ip_nat_irc.c,v 1.16 2001/12/06 07:42:10 laforge Exp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/netfilter_ipv4.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/kernel.h> +#include <net/tcp.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_conntrack_irc.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/moduleparam.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); +MODULE_DESCRIPTION("IRC (DCC) NAT helper"); +MODULE_LICENSE("GPL"); + +static unsigned int help(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp) +{ + u_int16_t port; + unsigned int ret; + + /* "4294967296 65635 " */ + char buffer[18]; + + DEBUGP("IRC_NAT: info (seq %u + %u) in %u\n", + expect->seq, exp_irc_info->len, + ntohl(tcph->seq)); + + /* Reply comes from server. */ + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_REPLY; + + /* When you see the packet, we need to NAT it the same as the + * this one. */ + exp->expectfn = ip_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + exp->tuple.dst.u.tcp.port = htons(port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (port == 0) { + ip_conntrack_expect_free(exp); + return NF_DROP; + } + + /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 + * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 + * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27 + * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits, + * 255.255.255.255==4294967296, 10 digits) + * P: bound port (min 1 d, max 5d (65635)) + * F: filename (min 1 d ) + * S: size (min 1 d ) + * 0x01, \n: terminators + */ + + /* AAA = "us", ie. where server normally talks to. */ + sprintf(buffer, "%u %u", + ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip), + port); + DEBUGP("ip_nat_irc: Inserting '%s' == %u.%u.%u.%u, port %u\n", + buffer, NIPQUAD(exp->tuple.src.ip), port); + + ret = ip_nat_mangle_tcp_packet(pskb, exp->master, ctinfo, + matchoff, matchlen, buffer, + strlen(buffer)); + if (ret != NF_ACCEPT) + ip_conntrack_unexpect_related(exp); + return ret; +} + +static void __exit fini(void) +{ + ip_nat_irc_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_irc_hook); + ip_nat_irc_hook = help; + return 0; +} + +/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ +static int warn_set(const char *val, struct kernel_param *kp) +{ + printk(KERN_INFO __stringify(KBUILD_MODNAME) + ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c new file mode 100644 index 000000000000..a558cf0eee8a --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -0,0 +1,115 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/if.h> + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +static int +icmp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + return (tuple->src.u.icmp.id >= min->icmp.id + && tuple->src.u.icmp.id <= max->icmp.id); +} + +static int +icmp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t id; + unsigned int range_size + = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1; + unsigned int i; + + /* If no range specified... */ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) + range_size = 0xFFFF; + + for (i = 0; i < range_size; i++, id++) { + tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) + return 1; + } + return 0; +} + +static int +icmp_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); + struct icmphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + + if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + return 0; + + hdr = (struct icmphdr *)((*pskb)->data + hdroff); + + hdr->checksum = ip_nat_cheat_check(hdr->un.echo.id ^ 0xFFFF, + tuple->src.u.icmp.id, + hdr->checksum); + hdr->un.echo.id = tuple->src.u.icmp.id; + return 1; +} + +static unsigned int +icmp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.icmp.id) + len += sprintf(buffer + len, "id=%u ", + ntohs(match->src.u.icmp.id)); + + if (mask->dst.u.icmp.type) + len += sprintf(buffer + len, "type=%u ", + ntohs(match->dst.u.icmp.type)); + + if (mask->dst.u.icmp.code) + len += sprintf(buffer + len, "code=%u ", + ntohs(match->dst.u.icmp.code)); + + return len; +} + +static unsigned int +icmp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.icmp.id != 0 || range->max.icmp.id != 0xFFFF) + return sprintf(buffer, "id %u-%u ", + ntohs(range->min.icmp.id), + ntohs(range->max.icmp.id)); + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_icmp += { "ICMP", IPPROTO_ICMP, + icmp_manip_pkt, + icmp_in_range, + icmp_unique_tuple, + icmp_print, + icmp_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c new file mode 100644 index 000000000000..a91cfceff272 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -0,0 +1,178 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/if.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> + +static int +tcp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + u_int16_t port; + + if (maniptype == IP_NAT_MANIP_SRC) + port = tuple->src.u.tcp.port; + else + port = tuple->dst.u.tcp.port; + + return ntohs(port) >= ntohs(min->tcp.port) + && ntohs(port) <= ntohs(max->tcp.port); +} + +static int +tcp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t port, *portptr; + unsigned int range_size, min, i; + + if (maniptype == IP_NAT_MANIP_SRC) + portptr = &tuple->src.u.tcp.port; + else + portptr = &tuple->dst.u.tcp.port; + + /* If no range specified... */ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == IP_NAT_MANIP_DST) + return 0; + + /* Map privileged onto privileged. */ + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr)<512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min.tcp.port); + range_size = ntohs(range->max.tcp.port) - min + 1; + } + + for (i = 0; i < range_size; i++, port++) { + *portptr = htons(min + port % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) { + return 1; + } + } + return 0; +} + +static int +tcp_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); + struct tcphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + u32 oldip, newip; + u16 *portptr, newport, oldport; + int hdrsize = 8; /* TCP connection tracking guarantees this much */ + + /* this could be a inner header returned in icmp packet; in such + cases we cannot update the checksum field since it is outside of + the 8 bytes of transport layer headers we are guaranteed */ + if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) + hdrsize = sizeof(struct tcphdr); + + if (!skb_ip_make_writable(pskb, hdroff + hdrsize)) + return 0; + + iph = (struct iphdr *)((*pskb)->data + iphdroff); + hdr = (struct tcphdr *)((*pskb)->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.ip; + newport = tuple->src.u.tcp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.ip; + newport = tuple->dst.u.tcp.port; + portptr = &hdr->dest; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return 1; + + hdr->check = ip_nat_cheat_check(~oldip, newip, + ip_nat_cheat_check(oldport ^ 0xFFFF, + newport, + hdr->check)); + return 1; +} + +static unsigned int +tcp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.tcp.port) + len += sprintf(buffer + len, "srcpt=%u ", + ntohs(match->src.u.tcp.port)); + + + if (mask->dst.u.tcp.port) + len += sprintf(buffer + len, "dstpt=%u ", + ntohs(match->dst.u.tcp.port)); + + return len; +} + +static unsigned int +tcp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) { + if (range->min.tcp.port == range->max.tcp.port) + return sprintf(buffer, "port %u ", + ntohs(range->min.tcp.port)); + else + return sprintf(buffer, "ports %u-%u ", + ntohs(range->min.tcp.port), + ntohs(range->max.tcp.port)); + } + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_tcp += { "TCP", IPPROTO_TCP, + tcp_manip_pkt, + tcp_in_range, + tcp_unique_tuple, + tcp_print, + tcp_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c new file mode 100644 index 000000000000..c669e3b5f5d0 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -0,0 +1,165 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/if.h> + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +static int +udp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + u_int16_t port; + + if (maniptype == IP_NAT_MANIP_SRC) + port = tuple->src.u.udp.port; + else + port = tuple->dst.u.udp.port; + + return ntohs(port) >= ntohs(min->udp.port) + && ntohs(port) <= ntohs(max->udp.port); +} + +static int +udp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t port, *portptr; + unsigned int range_size, min, i; + + if (maniptype == IP_NAT_MANIP_SRC) + portptr = &tuple->src.u.udp.port; + else + portptr = &tuple->dst.u.udp.port; + + /* If no range specified... */ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == IP_NAT_MANIP_DST) + return 0; + + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr)<512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min.udp.port); + range_size = ntohs(range->max.udp.port) - min + 1; + } + + for (i = 0; i < range_size; i++, port++) { + *portptr = htons(min + port % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) + return 1; + } + return 0; +} + +static int +udp_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); + struct udphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + u32 oldip, newip; + u16 *portptr, newport; + + if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + return 0; + + iph = (struct iphdr *)((*pskb)->data + iphdroff); + hdr = (struct udphdr *)((*pskb)->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.ip; + newport = tuple->src.u.udp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.ip; + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } + if (hdr->check) /* 0 is a special case meaning no checksum */ + hdr->check = ip_nat_cheat_check(~oldip, newip, + ip_nat_cheat_check(*portptr ^ 0xFFFF, + newport, + hdr->check)); + *portptr = newport; + return 1; +} + +static unsigned int +udp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.udp.port) + len += sprintf(buffer + len, "srcpt=%u ", + ntohs(match->src.u.udp.port)); + + + if (mask->dst.u.udp.port) + len += sprintf(buffer + len, "dstpt=%u ", + ntohs(match->dst.u.udp.port)); + + return len; +} + +static unsigned int +udp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.udp.port != 0 || range->max.udp.port != 0xFFFF) { + if (range->min.udp.port == range->max.udp.port) + return sprintf(buffer, "port %u ", + ntohs(range->min.udp.port)); + else + return sprintf(buffer, "ports %u-%u ", + ntohs(range->min.udp.port), + ntohs(range->max.udp.port)); + } + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_udp += { "UDP", IPPROTO_UDP, + udp_manip_pkt, + udp_in_range, + udp_unique_tuple, + udp_print, + udp_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c new file mode 100644 index 000000000000..f5525bd58d16 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -0,0 +1,70 @@ +/* The "unknown" protocol. This is what is used for protocols we + * don't understand. It's returned by ip_ct_find_proto(). + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/if.h> + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +static int unknown_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type manip_type, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + return 1; +} + +static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + /* Sorry: we can't help you; if it's not unique, we can't frob + anything. */ + return 0; +} + +static int +unknown_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + return 1; +} + +static unsigned int +unknown_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + return 0; +} + +static unsigned int +unknown_print_range(char *buffer, const struct ip_nat_range *range) +{ + return 0; +} + +struct ip_nat_protocol ip_nat_unknown_protocol = { + "unknown", 0, + unknown_manip_pkt, + unknown_in_range, + unknown_unique_tuple, + unknown_print, + unknown_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c new file mode 100644 index 000000000000..581f097f5a24 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_rule.c @@ -0,0 +1,319 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Everything about the rules for NAT. */ +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/module.h> +#include <linux/kmod.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <net/checksum.h> +#include <net/route.h> +#include <linux/bitops.h> + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT)) + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +} nat_initial_table __initdata += { { "nat", NAT_VALID_HOOKS, 4, + sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + { [NF_IP_PRE_ROUTING] = 0, + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, + { [NF_IP_PRE_ROUTING] = 0, + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, + 0, NULL, { } }, + { + /* PRE_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* POST_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_error), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ipt_table nat_table = { + .name = "nat", + .valid_hooks = NAT_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE, +}; + +/* Source NAT */ +static unsigned int ipt_snat_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + const struct ip_nat_multi_range_compat *mr = targinfo; + + IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); + + ct = ip_conntrack_get(*pskb, &ctinfo); + + /* Connection must be valid and new. */ + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED + || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); + IP_NF_ASSERT(out); + + return ip_nat_setup_info(ct, &mr->range[0], hooknum); +} + +/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */ +static void warn_if_extra_mangle(u32 dstip, u32 srcip) +{ + static int warned = 0; + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } }; + struct rtable *rt; + + if (ip_route_output_key(&rt, &fl) != 0) + return; + + if (rt->rt_src != srcip && !warned) { + printk("NAT: no longer support implicit source local NAT\n"); + printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n", + NIPQUAD(srcip), NIPQUAD(dstip)); + warned = 1; + } + ip_rt_put(rt); +} + +static unsigned int ipt_dnat_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + const struct ip_nat_multi_range_compat *mr = targinfo; + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + || hooknum == NF_IP_LOCAL_OUT); + + ct = ip_conntrack_get(*pskb, &ctinfo); + + /* Connection must be valid and new. */ + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + if (hooknum == NF_IP_LOCAL_OUT + && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) + warn_if_extra_mangle((*pskb)->nh.iph->daddr, + mr->range[0].min_ip); + + return ip_nat_setup_info(ct, &mr->range[0], hooknum); +} + +static int ipt_snat_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ip_nat_multi_range_compat *mr = targinfo; + + /* Must be a valid range */ + if (mr->rangesize != 1) { + printk("SNAT: multiple ranges no longer supported\n"); + return 0; + } + + if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) { + DEBUGP("SNAT: Target size %u wrong for %u ranges\n", + targinfosize, mr->rangesize); + return 0; + } + + /* Only allow these for NAT. */ + if (strcmp(tablename, "nat") != 0) { + DEBUGP("SNAT: wrong table %s\n", tablename); + return 0; + } + + if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) { + DEBUGP("SNAT: hook mask 0x%x bad\n", hook_mask); + return 0; + } + return 1; +} + +static int ipt_dnat_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ip_nat_multi_range_compat *mr = targinfo; + + /* Must be a valid range */ + if (mr->rangesize != 1) { + printk("DNAT: multiple ranges no longer supported\n"); + return 0; + } + + if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) { + DEBUGP("DNAT: Target size %u wrong for %u ranges\n", + targinfosize, mr->rangesize); + return 0; + } + + /* Only allow these for NAT. */ + if (strcmp(tablename, "nat") != 0) { + DEBUGP("DNAT: wrong table %s\n", tablename); + return 0; + } + + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) { + DEBUGP("DNAT: hook mask 0x%x bad\n", hook_mask); + return 0; + } + + return 1; +} + +inline unsigned int +alloc_null_binding(struct ip_conntrack *conntrack, + struct ip_nat_info *info, + unsigned int hooknum) +{ + /* Force range to this IP; let proto decide mapping for + per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). + Use reply in case it's already been mangled (eg local packet). + */ + u_int32_t ip + = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC + ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip + : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); + struct ip_nat_range range + = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; + + DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack, + NIPQUAD(ip)); + return ip_nat_setup_info(conntrack, &range, hooknum); +} + +int ip_nat_rule_find(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + struct ip_conntrack *ct, + struct ip_nat_info *info) +{ + int ret; + + ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL); + + if (ret == NF_ACCEPT) { + if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum))) + /* NUL mapping */ + ret = alloc_null_binding(ct, info, hooknum); + } + return ret; +} + +static struct ipt_target ipt_snat_reg = { + .name = "SNAT", + .target = ipt_snat_target, + .checkentry = ipt_snat_checkentry, +}; + +static struct ipt_target ipt_dnat_reg = { + .name = "DNAT", + .target = ipt_dnat_target, + .checkentry = ipt_dnat_checkentry, +}; + +int __init ip_nat_rule_init(void) +{ + int ret; + + ret = ipt_register_table(&nat_table, &nat_initial_table.repl); + if (ret != 0) + return ret; + ret = ipt_register_target(&ipt_snat_reg); + if (ret != 0) + goto unregister_table; + + ret = ipt_register_target(&ipt_dnat_reg); + if (ret != 0) + goto unregister_snat; + + return ret; + + unregister_snat: + ipt_unregister_target(&ipt_snat_reg); + unregister_table: + ipt_unregister_table(&nat_table); + + return ret; +} + +void ip_nat_rule_cleanup(void) +{ + ipt_unregister_target(&ipt_dnat_reg); + ipt_unregister_target(&ipt_snat_reg); + ipt_unregister_table(&nat_table); +} diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c new file mode 100644 index 000000000000..2a48b6e635ae --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -0,0 +1,1347 @@ +/* + * ip_nat_snmp_basic.c + * + * Basic SNMP Application Layer Gateway + * + * This IP NAT module is intended for use with SNMP network + * discovery and monitoring applications where target networks use + * conflicting private address realms. + * + * Static NAT is used to remap the networks from the view of the network + * management system at the IP layer, and this module remaps some application + * layer addresses to match. + * + * The simplest form of ALG is performed, where only tagged IP addresses + * are modified. The module does not need to be MIB aware and only scans + * messages at the ASN.1/BER level. + * + * Currently, only SNMPv1 and SNMPv2 are supported. + * + * More information on ALG and associated issues can be found in + * RFC 2962 + * + * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory + * McLean & Jochen Friedrich, stripped down for use in the kernel. + * + * Copyright (c) 2000 RP Internet (www.rpi.net.au). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: James Morris <jmorris@intercode.com.au> + * + * Updates: + * 2000-08-06: Convert to new helper API (Harald Welte). + * + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/moduleparam.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/ip.h> +#include <net/checksum.h> +#include <net/udp.h> +#include <asm/uaccess.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); +MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); + +#define SNMP_PORT 161 +#define SNMP_TRAP_PORT 162 +#define NOCT1(n) (u_int8_t )((n) & 0xff) + +static int debug; +static DEFINE_SPINLOCK(snmp_lock); + +/* + * Application layer address mapping mimics the NAT mapping, but + * only for the first octet in this case (a more flexible system + * can be implemented if needed). + */ +struct oct1_map +{ + u_int8_t from; + u_int8_t to; +}; + + +/***************************************************************************** + * + * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) + * + *****************************************************************************/ + +/* Class */ +#define ASN1_UNI 0 /* Universal */ +#define ASN1_APL 1 /* Application */ +#define ASN1_CTX 2 /* Context */ +#define ASN1_PRV 3 /* Private */ + +/* Tag */ +#define ASN1_EOC 0 /* End Of Contents */ +#define ASN1_BOL 1 /* Boolean */ +#define ASN1_INT 2 /* Integer */ +#define ASN1_BTS 3 /* Bit String */ +#define ASN1_OTS 4 /* Octet String */ +#define ASN1_NUL 5 /* Null */ +#define ASN1_OJI 6 /* Object Identifier */ +#define ASN1_OJD 7 /* Object Description */ +#define ASN1_EXT 8 /* External */ +#define ASN1_SEQ 16 /* Sequence */ +#define ASN1_SET 17 /* Set */ +#define ASN1_NUMSTR 18 /* Numerical String */ +#define ASN1_PRNSTR 19 /* Printable String */ +#define ASN1_TEXSTR 20 /* Teletext String */ +#define ASN1_VIDSTR 21 /* Video String */ +#define ASN1_IA5STR 22 /* IA5 String */ +#define ASN1_UNITIM 23 /* Universal Time */ +#define ASN1_GENTIM 24 /* General Time */ +#define ASN1_GRASTR 25 /* Graphical String */ +#define ASN1_VISSTR 26 /* Visible String */ +#define ASN1_GENSTR 27 /* General String */ + +/* Primitive / Constructed methods*/ +#define ASN1_PRI 0 /* Primitive */ +#define ASN1_CON 1 /* Constructed */ + +/* + * Error codes. + */ +#define ASN1_ERR_NOERROR 0 +#define ASN1_ERR_DEC_EMPTY 2 +#define ASN1_ERR_DEC_EOC_MISMATCH 3 +#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 +#define ASN1_ERR_DEC_BADVALUE 5 + +/* + * ASN.1 context. + */ +struct asn1_ctx +{ + int error; /* Error condition */ + unsigned char *pointer; /* Octet just to be decoded */ + unsigned char *begin; /* First octet */ + unsigned char *end; /* Octet after last octet */ +}; + +/* + * Octet string (not null terminated) + */ +struct asn1_octstr +{ + unsigned char *data; + unsigned int len; +}; + +static void asn1_open(struct asn1_ctx *ctx, + unsigned char *buf, + unsigned int len) +{ + ctx->begin = buf; + ctx->end = buf + len; + ctx->pointer = buf; + ctx->error = ASN1_ERR_NOERROR; +} + +static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) +{ + if (ctx->pointer >= ctx->end) { + ctx->error = ASN1_ERR_DEC_EMPTY; + return 0; + } + *ch = *(ctx->pointer)++; + return 1; +} + +static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) +{ + unsigned char ch; + + *tag = 0; + + do + { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *tag <<= 7; + *tag |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static unsigned char asn1_id_decode(struct asn1_ctx *ctx, + unsigned int *cls, + unsigned int *con, + unsigned int *tag) +{ + unsigned char ch; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *cls = (ch & 0xC0) >> 6; + *con = (ch & 0x20) >> 5; + *tag = (ch & 0x1F); + + if (*tag == 0x1F) { + if (!asn1_tag_decode(ctx, tag)) + return 0; + } + return 1; +} + +static unsigned char asn1_length_decode(struct asn1_ctx *ctx, + unsigned int *def, + unsigned int *len) +{ + unsigned char ch, cnt; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch == 0x80) + *def = 0; + else { + *def = 1; + + if (ch < 0x80) + *len = ch; + else { + cnt = (unsigned char) (ch & 0x7F); + *len = 0; + + while (cnt > 0) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *len <<= 8; + *len |= ch; + cnt--; + } + } + } + return 1; +} + +static unsigned char asn1_header_decode(struct asn1_ctx *ctx, + unsigned char **eoc, + unsigned int *cls, + unsigned int *con, + unsigned int *tag) +{ + unsigned int def, len; + + if (!asn1_id_decode(ctx, cls, con, tag)) + return 0; + + if (!asn1_length_decode(ctx, &def, &len)) + return 0; + + if (def) + *eoc = ctx->pointer + len; + else + *eoc = NULL; + return 1; +} + +static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) +{ + unsigned char ch; + + if (eoc == 0) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + return 1; + } else { + if (ctx->pointer != eoc) { + ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; + return 0; + } + return 1; + } +} + +static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc) +{ + ctx->pointer = eoc; + return 1; +} + +static unsigned char asn1_long_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = (signed char) ch; + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof (long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned int *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) len = 0; + else len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof (unsigned int)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) len = 0; + else len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof (unsigned long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned char **octets, + unsigned int *len) +{ + unsigned char *ptr; + + *len = 0; + + *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); + if (*octets == NULL) { + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + + ptr = *octets; + while (ctx->pointer < eoc) { + if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { + kfree(*octets); + *octets = NULL; + return 0; + } + (*len)++; + } + return 1; +} + +static unsigned char asn1_subid_decode(struct asn1_ctx *ctx, + unsigned long *subid) +{ + unsigned char ch; + + *subid = 0; + + do { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *subid <<= 7; + *subid |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned long **oid, + unsigned int *len) +{ + unsigned long subid; + unsigned int size; + unsigned long *optr; + + size = eoc - ctx->pointer + 1; + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); + if (*oid == NULL) { + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + + optr = *oid; + + if (!asn1_subid_decode(ctx, &subid)) { + kfree(*oid); + *oid = NULL; + return 0; + } + + if (subid < 40) { + optr [0] = 0; + optr [1] = subid; + } else if (subid < 80) { + optr [0] = 1; + optr [1] = subid - 40; + } else { + optr [0] = 2; + optr [1] = subid - 80; + } + + *len = 2; + optr += 2; + + while (ctx->pointer < eoc) { + if (++(*len) > size) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + kfree(*oid); + *oid = NULL; + return 0; + } + + if (!asn1_subid_decode(ctx, optr++)) { + kfree(*oid); + *oid = NULL; + return 0; + } + } + return 1; +} + +/***************************************************************************** + * + * SNMP decoding routines (gxsnmp author Dirk Wisse) + * + *****************************************************************************/ + +/* SNMP Versions */ +#define SNMP_V1 0 +#define SNMP_V2C 1 +#define SNMP_V2 2 +#define SNMP_V3 3 + +/* Default Sizes */ +#define SNMP_SIZE_COMM 256 +#define SNMP_SIZE_OBJECTID 128 +#define SNMP_SIZE_BUFCHR 256 +#define SNMP_SIZE_BUFINT 128 +#define SNMP_SIZE_SMALLOBJECTID 16 + +/* Requests */ +#define SNMP_PDU_GET 0 +#define SNMP_PDU_NEXT 1 +#define SNMP_PDU_RESPONSE 2 +#define SNMP_PDU_SET 3 +#define SNMP_PDU_TRAP1 4 +#define SNMP_PDU_BULK 5 +#define SNMP_PDU_INFORM 6 +#define SNMP_PDU_TRAP2 7 + +/* Errors */ +#define SNMP_NOERROR 0 +#define SNMP_TOOBIG 1 +#define SNMP_NOSUCHNAME 2 +#define SNMP_BADVALUE 3 +#define SNMP_READONLY 4 +#define SNMP_GENERROR 5 +#define SNMP_NOACCESS 6 +#define SNMP_WRONGTYPE 7 +#define SNMP_WRONGLENGTH 8 +#define SNMP_WRONGENCODING 9 +#define SNMP_WRONGVALUE 10 +#define SNMP_NOCREATION 11 +#define SNMP_INCONSISTENTVALUE 12 +#define SNMP_RESOURCEUNAVAILABLE 13 +#define SNMP_COMMITFAILED 14 +#define SNMP_UNDOFAILED 15 +#define SNMP_AUTHORIZATIONERROR 16 +#define SNMP_NOTWRITABLE 17 +#define SNMP_INCONSISTENTNAME 18 + +/* General SNMP V1 Traps */ +#define SNMP_TRAP_COLDSTART 0 +#define SNMP_TRAP_WARMSTART 1 +#define SNMP_TRAP_LINKDOWN 2 +#define SNMP_TRAP_LINKUP 3 +#define SNMP_TRAP_AUTFAILURE 4 +#define SNMP_TRAP_EQPNEIGHBORLOSS 5 +#define SNMP_TRAP_ENTSPECIFIC 6 + +/* SNMPv1 Types */ +#define SNMP_NULL 0 +#define SNMP_INTEGER 1 /* l */ +#define SNMP_OCTETSTR 2 /* c */ +#define SNMP_DISPLAYSTR 2 /* c */ +#define SNMP_OBJECTID 3 /* ul */ +#define SNMP_IPADDR 4 /* uc */ +#define SNMP_COUNTER 5 /* ul */ +#define SNMP_GAUGE 6 /* ul */ +#define SNMP_TIMETICKS 7 /* ul */ +#define SNMP_OPAQUE 8 /* c */ + +/* Additional SNMPv2 Types */ +#define SNMP_UINTEGER 5 /* ul */ +#define SNMP_BITSTR 9 /* uc */ +#define SNMP_NSAP 10 /* uc */ +#define SNMP_COUNTER64 11 /* ul */ +#define SNMP_NOSUCHOBJECT 12 +#define SNMP_NOSUCHINSTANCE 13 +#define SNMP_ENDOFMIBVIEW 14 + +union snmp_syntax +{ + unsigned char uc[0]; /* 8 bit unsigned */ + char c[0]; /* 8 bit signed */ + unsigned long ul[0]; /* 32 bit unsigned */ + long l[0]; /* 32 bit signed */ +}; + +struct snmp_object +{ + unsigned long *id; + unsigned int id_len; + unsigned short type; + unsigned int syntax_len; + union snmp_syntax syntax; +}; + +struct snmp_request +{ + unsigned long id; + unsigned int error_status; + unsigned int error_index; +}; + +struct snmp_v1_trap +{ + unsigned long *id; + unsigned int id_len; + unsigned long ip_address; /* pointer */ + unsigned int general; + unsigned int specific; + unsigned long time; +}; + +/* SNMP types */ +#define SNMP_IPA 0 +#define SNMP_CNT 1 +#define SNMP_GGE 2 +#define SNMP_TIT 3 +#define SNMP_OPQ 4 +#define SNMP_C64 6 + +/* SNMP errors */ +#define SERR_NSO 0 +#define SERR_NSI 1 +#define SERR_EOM 2 + +static inline void mangle_address(unsigned char *begin, + unsigned char *addr, + const struct oct1_map *map, + u_int16_t *check); +struct snmp_cnv +{ + unsigned int class; + unsigned int tag; + int syntax; +}; + +static struct snmp_cnv snmp_conv [] = +{ + {ASN1_UNI, ASN1_NUL, SNMP_NULL}, + {ASN1_UNI, ASN1_INT, SNMP_INTEGER}, + {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR}, + {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR}, + {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID}, + {ASN1_APL, SNMP_IPA, SNMP_IPADDR}, + {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */ + {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */ + {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS}, + {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE}, + + /* SNMPv2 data types and errors */ + {ASN1_UNI, ASN1_BTS, SNMP_BITSTR}, + {ASN1_APL, SNMP_C64, SNMP_COUNTER64}, + {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT}, + {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE}, + {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW}, + {0, 0, -1} +}; + +static unsigned char snmp_tag_cls2syntax(unsigned int tag, + unsigned int cls, + unsigned short *syntax) +{ + struct snmp_cnv *cnv; + + cnv = snmp_conv; + + while (cnv->syntax != -1) { + if (cnv->tag == tag && cnv->class == cls) { + *syntax = cnv->syntax; + return 1; + } + cnv++; + } + return 0; +} + +static unsigned char snmp_object_decode(struct asn1_ctx *ctx, + struct snmp_object **obj) +{ + unsigned int cls, con, tag, len, idlen; + unsigned short type; + unsigned char *eoc, *end, *p; + unsigned long *lp, *id; + unsigned long ul; + long l; + + *obj = NULL; + id = NULL; + + if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) + return 0; + + if (!asn1_oid_decode(ctx, end, &id, &idlen)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) { + kfree(id); + return 0; + } + + if (con != ASN1_PRI) { + kfree(id); + return 0; + } + + if (!snmp_tag_cls2syntax(tag, cls, &type)) { + kfree(id); + return 0; + } + + switch (type) { + case SNMP_INTEGER: + len = sizeof(long); + if (!asn1_long_decode(ctx, end, &l)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, + GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + (*obj)->syntax.l[0] = l; + break; + case SNMP_OCTETSTR: + case SNMP_OPAQUE: + if (!asn1_octets_decode(ctx, end, &p, &len)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, + GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + memcpy((*obj)->syntax.c, p, len); + kfree(p); + break; + case SNMP_NULL: + case SNMP_NOSUCHOBJECT: + case SNMP_NOSUCHINSTANCE: + case SNMP_ENDOFMIBVIEW: + len = 0; + *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + if (!asn1_null_decode(ctx, end)) { + kfree(id); + kfree(*obj); + *obj = NULL; + return 0; + } + break; + case SNMP_OBJECTID: + if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { + kfree(id); + return 0; + } + len *= sizeof(unsigned long); + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + memcpy((*obj)->syntax.ul, lp, len); + kfree(lp); + break; + case SNMP_IPADDR: + if (!asn1_octets_decode(ctx, end, &p, &len)) { + kfree(id); + return 0; + } + if (len != 4) { + kfree(p); + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(p); + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + memcpy((*obj)->syntax.uc, p, len); + kfree(p); + break; + case SNMP_COUNTER: + case SNMP_GAUGE: + case SNMP_TIMETICKS: + len = sizeof(unsigned long); + if (!asn1_ulong_decode(ctx, end, &ul)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + (*obj)->syntax.ul[0] = ul; + break; + default: + kfree(id); + return 0; + } + + (*obj)->syntax_len = len; + (*obj)->type = type; + (*obj)->id = id; + (*obj)->id_len = idlen; + + if (!asn1_eoc_decode(ctx, eoc)) { + kfree(id); + kfree(*obj); + *obj = NULL; + return 0; + } + return 1; +} + +static unsigned char snmp_request_decode(struct asn1_ctx *ctx, + struct snmp_request *request) +{ + unsigned int cls, con, tag; + unsigned char *end; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + + if (!asn1_ulong_decode(ctx, end, &request->id)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + + if (!asn1_uint_decode(ctx, end, &request->error_status)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + + if (!asn1_uint_decode(ctx, end, &request->error_index)) + return 0; + + return 1; +} + +/* + * Fast checksum update for possibly oddly-aligned UDP byte, from the + * code example in the draft. + */ +static void fast_csum(unsigned char *csum, + const unsigned char *optr, + const unsigned char *nptr, + int odd) +{ + long x, old, new; + + x = csum[0] * 256 + csum[1]; + + x =~ x & 0xFFFF; + + if (odd) old = optr[0] * 256; + else old = optr[0]; + + x -= old & 0xFFFF; + if (x <= 0) { + x--; + x &= 0xFFFF; + } + + if (odd) new = nptr[0] * 256; + else new = nptr[0]; + + x += new & 0xFFFF; + if (x & 0x10000) { + x++; + x &= 0xFFFF; + } + + x =~ x & 0xFFFF; + csum[0] = x / 256; + csum[1] = x & 0xFF; +} + +/* + * Mangle IP address. + * - begin points to the start of the snmp messgae + * - addr points to the start of the address + */ +static inline void mangle_address(unsigned char *begin, + unsigned char *addr, + const struct oct1_map *map, + u_int16_t *check) +{ + if (map->from == NOCT1(*addr)) { + u_int32_t old; + + if (debug) + memcpy(&old, (unsigned char *)addr, sizeof(old)); + + *addr = map->to; + + /* Update UDP checksum if being used */ + if (*check) { + unsigned char odd = !((addr - begin) % 2); + + fast_csum((unsigned char *)check, + &map->from, &map->to, odd); + + } + + if (debug) + printk(KERN_DEBUG "bsalg: mapped %u.%u.%u.%u to " + "%u.%u.%u.%u\n", NIPQUAD(old), NIPQUAD(*addr)); + } +} + +static unsigned char snmp_trap_decode(struct asn1_ctx *ctx, + struct snmp_v1_trap *trap, + const struct oct1_map *map, + u_int16_t *check) +{ + unsigned int cls, con, tag, len; + unsigned char *end; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) + return 0; + + if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_id_free; + + if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) || + (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS))) + goto err_id_free; + + if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len)) + goto err_id_free; + + /* IPv4 only */ + if (len != 4) + goto err_addr_free; + + mangle_address(ctx->begin, ctx->pointer - 4, map, check); + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_addr_free; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + goto err_addr_free; + + if (!asn1_uint_decode(ctx, end, &trap->general)) + goto err_addr_free; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_addr_free; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + goto err_addr_free; + + if (!asn1_uint_decode(ctx, end, &trap->specific)) + goto err_addr_free; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_addr_free; + + if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) || + (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT))) + goto err_addr_free; + + if (!asn1_ulong_decode(ctx, end, &trap->time)) + goto err_addr_free; + + return 1; + +err_id_free: + kfree(trap->id); + +err_addr_free: + kfree((unsigned long *)trap->ip_address); + + return 0; +} + +/***************************************************************************** + * + * Misc. routines + * + *****************************************************************************/ + +static void hex_dump(unsigned char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (i && !(i % 16)) + printk("\n"); + printk("%02x ", *(buf + i)); + } + printk("\n"); +} + +/* + * Parse and mangle SNMP message according to mapping. + * (And this is the fucking 'basic' method). + */ +static int snmp_parse_mangle(unsigned char *msg, + u_int16_t len, + const struct oct1_map *map, + u_int16_t *check) +{ + unsigned char *eoc, *end; + unsigned int cls, con, tag, vers, pdutype; + struct asn1_ctx ctx; + struct asn1_octstr comm; + struct snmp_object **obj; + + if (debug > 1) + hex_dump(msg, len); + + asn1_open(&ctx, msg, len); + + /* + * Start of SNMP message. + */ + if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) + return 0; + if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) + return 0; + + /* + * Version 1 or 2 handled. + */ + if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag)) + return 0; + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + if (!asn1_uint_decode (&ctx, end, &vers)) + return 0; + if (debug > 1) + printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1); + if (vers > 1) + return 1; + + /* + * Community. + */ + if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag)) + return 0; + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS) + return 0; + if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len)) + return 0; + if (debug > 1) { + unsigned int i; + + printk(KERN_DEBUG "bsalg: community: "); + for (i = 0; i < comm.len; i++) + printk("%c", comm.data[i]); + printk("\n"); + } + kfree(comm.data); + + /* + * PDU type + */ + if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype)) + return 0; + if (cls != ASN1_CTX || con != ASN1_CON) + return 0; + if (debug > 1) { + unsigned char *pdus[] = { + [SNMP_PDU_GET] = "get", + [SNMP_PDU_NEXT] = "get-next", + [SNMP_PDU_RESPONSE] = "response", + [SNMP_PDU_SET] = "set", + [SNMP_PDU_TRAP1] = "trapv1", + [SNMP_PDU_BULK] = "bulk", + [SNMP_PDU_INFORM] = "inform", + [SNMP_PDU_TRAP2] = "trapv2" + }; + + if (pdutype > SNMP_PDU_TRAP2) + printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype); + else + printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]); + } + if (pdutype != SNMP_PDU_RESPONSE && + pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2) + return 1; + + /* + * Request header or v1 trap + */ + if (pdutype == SNMP_PDU_TRAP1) { + struct snmp_v1_trap trap; + unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check); + + /* Discard trap allocations regardless */ + kfree(trap.id); + kfree((unsigned long *)trap.ip_address); + + if (!ret) + return ret; + + } else { + struct snmp_request req; + + if (!snmp_request_decode(&ctx, &req)) + return 0; + + if (debug > 1) + printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u " + "error_index=%u\n", req.id, req.error_status, + req.error_index); + } + + /* + * Loop through objects, look for IP addresses to mangle. + */ + if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) + return 0; + + obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); + if (obj == NULL) { + if (net_ratelimit()) + printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__); + return 0; + } + + while (!asn1_eoc_decode(&ctx, eoc)) { + unsigned int i; + + if (!snmp_object_decode(&ctx, obj)) { + if (*obj) { + if ((*obj)->id) + kfree((*obj)->id); + kfree(*obj); + } + kfree(obj); + return 0; + } + + if (debug > 1) { + printk(KERN_DEBUG "bsalg: object: "); + for (i = 0; i < (*obj)->id_len; i++) { + if (i > 0) + printk("."); + printk("%lu", (*obj)->id[i]); + } + printk(": type=%u\n", (*obj)->type); + + } + + if ((*obj)->type == SNMP_IPADDR) + mangle_address(ctx.begin, ctx.pointer - 4 , map, check); + + kfree((*obj)->id); + kfree(*obj); + } + kfree(obj); + + if (!asn1_eoc_decode(&ctx, eoc)) + return 0; + + return 1; +} + +/***************************************************************************** + * + * NAT routines. + * + *****************************************************************************/ + +/* + * SNMP translation routine. + */ +static int snmp_translate(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + struct sk_buff **pskb) +{ + struct iphdr *iph = (*pskb)->nh.iph; + struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); + u_int16_t udplen = ntohs(udph->len); + u_int16_t paylen = udplen - sizeof(struct udphdr); + int dir = CTINFO2DIR(ctinfo); + struct oct1_map map; + + /* + * Determine mappping for application layer addresses based + * on NAT manipulations for the packet. + */ + if (dir == IP_CT_DIR_ORIGINAL) { + /* SNAT traps */ + map.from = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip); + map.to = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip); + } else { + /* DNAT replies */ + map.from = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); + map.to = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip); + } + + if (map.from == map.to) + return NF_ACCEPT; + + if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), + paylen, &map, &udph->check)) { + if (net_ratelimit()) + printk(KERN_WARNING "bsalg: parser failed\n"); + return NF_DROP; + } + return NF_ACCEPT; +} + +/* We don't actually set up expectations, just adjust internal IP + * addresses if this is being NATted */ +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + int dir = CTINFO2DIR(ctinfo); + unsigned int ret; + struct iphdr *iph = (*pskb)->nh.iph; + struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); + + /* SNMP replies and originating SNMP traps get mangled */ + if (udph->source == ntohs(SNMP_PORT) && dir != IP_CT_DIR_REPLY) + return NF_ACCEPT; + if (udph->dest == ntohs(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + /* No NAT? */ + if (!(ct->status & IPS_NAT_MASK)) + return NF_ACCEPT; + + /* + * Make sure the packet length is ok. So far, we were only guaranteed + * to have a valid length IP header plus 8 bytes, which means we have + * enough room for a UDP header. Just verify the UDP length field so we + * can mess around with the payload. + */ + if (ntohs(udph->len) != (*pskb)->len - (iph->ihl << 2)) { + if (net_ratelimit()) + printk(KERN_WARNING "SNMP: dropping malformed packet " + "src=%u.%u.%u.%u dst=%u.%u.%u.%u\n", + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + return NF_DROP; + } + + if (!skb_ip_make_writable(pskb, (*pskb)->len)) + return NF_DROP; + + spin_lock_bh(&snmp_lock); + ret = snmp_translate(ct, ctinfo, pskb); + spin_unlock_bh(&snmp_lock); + return ret; +} + +static struct ip_conntrack_helper snmp_helper = { + .max_expected = 0, + .timeout = 180, + .me = THIS_MODULE, + .help = help, + .name = "snmp", + + .tuple = { .src = { .u = { __constant_htons(SNMP_PORT) } }, + .dst = { .protonum = IPPROTO_UDP }, + }, + .mask = { .src = { .u = { 0xFFFF } }, + .dst = { .protonum = 0xFF }, + }, +}; + +static struct ip_conntrack_helper snmp_trap_helper = { + .max_expected = 0, + .timeout = 180, + .me = THIS_MODULE, + .help = help, + .name = "snmp_trap", + + .tuple = { .src = { .u = { __constant_htons(SNMP_TRAP_PORT) } }, + .dst = { .protonum = IPPROTO_UDP }, + }, + .mask = { .src = { .u = { 0xFFFF } }, + .dst = { .protonum = 0xFF }, + }, +}; + +/***************************************************************************** + * + * Module stuff. + * + *****************************************************************************/ + +static int __init init(void) +{ + int ret = 0; + + ret = ip_conntrack_helper_register(&snmp_helper); + if (ret < 0) + return ret; + ret = ip_conntrack_helper_register(&snmp_trap_helper); + if (ret < 0) { + ip_conntrack_helper_unregister(&snmp_helper); + return ret; + } + return ret; +} + +static void __exit fini(void) +{ + ip_conntrack_helper_unregister(&snmp_helper); + ip_conntrack_helper_unregister(&snmp_trap_helper); +} + +module_init(init); +module_exit(fini); + +module_param(debug, bool, 0600); diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c new file mode 100644 index 000000000000..dec4a74212cd --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -0,0 +1,349 @@ +/* This file contains all the functions required for the standalone + ip_nat module. + + These are not required by the compatibility layer. +*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * 23 Apr 2001: Harald Welte <laforge@gnumonks.org> + * - new API and handling of conntrack/nat helpers + * - now capable of multiple expectations for one master + * */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/icmp.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <net/ip.h> +#include <net/checksum.h> +#include <linux/spinlock.h> + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#define HOOKNAME(hooknum) ((hooknum) == NF_IP_POST_ROUTING ? "POST_ROUTING" \ + : ((hooknum) == NF_IP_PRE_ROUTING ? "PRE_ROUTING" \ + : ((hooknum) == NF_IP_LOCAL_OUT ? "LOCAL_OUT" \ + : ((hooknum) == NF_IP_LOCAL_IN ? "LOCAL_IN" \ + : "*ERROR*"))) + +static unsigned int +ip_nat_fn(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + struct ip_nat_info *info; + /* maniptype == SRC for postrouting. */ + enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); + + /* We never see fragments: conntrack defrags on pre-routing + and local-out, and ip_nat_out protects post-routing. */ + IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off + & htons(IP_MF|IP_OFFSET))); + + (*pskb)->nfcache |= NFC_UNKNOWN; + + /* If we had a hardware checksum before, it's now invalid */ + if ((*pskb)->ip_summed == CHECKSUM_HW) + if (skb_checksum_help(*pskb, (out == NULL))) + return NF_DROP; + + ct = ip_conntrack_get(*pskb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + have dropped it. Hence it's the user's responsibilty to + packet filter it out, or implement conntrack/NAT for that + protocol. 8) --RR */ + if (!ct) { + /* Exception: ICMP redirect to new connection (not in + hash table yet). We must not let this through, in + case we're doing NAT to the same network. */ + if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { + struct icmphdr _hdr, *hp; + + hp = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4, + sizeof(_hdr), &_hdr); + if (hp != NULL && + hp->type == ICMP_REDIRECT) + return NF_DROP; + } + return NF_ACCEPT; + } + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED+IP_CT_IS_REPLY: + if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { + if (!icmp_reply_translation(pskb, ct, maniptype, + CTINFO2DIR(ctinfo))) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + info = &ct->nat.info; + + /* Seen it before? This can happen for loopback, retrans, + or local packets.. */ + if (!ip_nat_initialized(ct, maniptype)) { + unsigned int ret; + + /* LOCAL_IN hook doesn't have a chain! */ + if (hooknum == NF_IP_LOCAL_IN) + ret = alloc_null_binding(ct, info, hooknum); + else + ret = ip_nat_rule_find(pskb, hooknum, + in, out, ct, + info); + + if (ret != NF_ACCEPT) { + return ret; + } + } else + DEBUGP("Already setup manip %s for ct %p\n", + maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", + ct); + break; + + default: + /* ESTABLISHED */ + IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED + || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); + info = &ct->nat.info; + } + + IP_NF_ASSERT(info); + return nat_packet(ct, ctinfo, hooknum, pskb); +} + +static unsigned int +ip_nat_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + u_int32_t saddr, daddr; + unsigned int ret; + + saddr = (*pskb)->nh.iph->saddr; + daddr = (*pskb)->nh.iph->daddr; + + ret = ip_nat_fn(hooknum, pskb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN + && ((*pskb)->nh.iph->saddr != saddr + || (*pskb)->nh.iph->daddr != daddr)) { + dst_release((*pskb)->dst); + (*pskb)->dst = NULL; + } + return ret; +} + +static unsigned int +ip_nat_out(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) + return NF_ACCEPT; + + /* We can hit fragment here; forwarded packets get + defragmented by connection tracking coming in, then + fragmented (grr) by the forward code. + + In future: If we have nfct != NULL, AND we have NAT + initialized, AND there is no helper, then we can do full + NAPT on the head, and IP-address-only NAT on the rest. + + I'm starting to have nightmares about fragments. */ + + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = ip_ct_gather_frags(*pskb, IP_DEFRAG_NAT_OUT); + + if (!*pskb) + return NF_STOLEN; + } + + return ip_nat_fn(hooknum, pskb, in, out, okfn); +} + +static unsigned int +ip_nat_local_fn(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + u_int32_t saddr, daddr; + unsigned int ret; + + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) + return NF_ACCEPT; + + saddr = (*pskb)->nh.iph->saddr; + daddr = (*pskb)->nh.iph->daddr; + + ret = ip_nat_fn(hooknum, pskb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN + && ((*pskb)->nh.iph->saddr != saddr + || (*pskb)->nh.iph->daddr != daddr)) + return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + return ret; +} + +/* We must be after connection tracking and before packet filtering. */ + +/* Before packet filtering, change destination */ +static struct nf_hook_ops ip_nat_in_ops = { + .hook = ip_nat_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_NAT_DST, +}; + +/* After packet filtering, change source */ +static struct nf_hook_ops ip_nat_out_ops = { + .hook = ip_nat_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC, +}; + +/* Before packet filtering, change destination */ +static struct nf_hook_ops ip_nat_local_out_ops = { + .hook = ip_nat_local_fn, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST, +}; + +/* After packet filtering, change source for reply packets of LOCAL_OUT DNAT */ +static struct nf_hook_ops ip_nat_local_in_ops = { + .hook = ip_nat_fn, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC, +}; + +static int init_or_cleanup(int init) +{ + int ret = 0; + + need_ip_conntrack(); + + if (!init) goto cleanup; + + ret = ip_nat_rule_init(); + if (ret < 0) { + printk("ip_nat_init: can't setup rules.\n"); + goto cleanup_nothing; + } + ret = ip_nat_init(); + if (ret < 0) { + printk("ip_nat_init: can't setup rules.\n"); + goto cleanup_rule_init; + } + ret = nf_register_hook(&ip_nat_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register in hook.\n"); + goto cleanup_nat; + } + ret = nf_register_hook(&ip_nat_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register out hook.\n"); + goto cleanup_inops; + } + ret = nf_register_hook(&ip_nat_local_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register local out hook.\n"); + goto cleanup_outops; + } + ret = nf_register_hook(&ip_nat_local_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register local in hook.\n"); + goto cleanup_localoutops; + } + return ret; + + cleanup: + nf_unregister_hook(&ip_nat_local_in_ops); + cleanup_localoutops: + nf_unregister_hook(&ip_nat_local_out_ops); + cleanup_outops: + nf_unregister_hook(&ip_nat_out_ops); + cleanup_inops: + nf_unregister_hook(&ip_nat_in_ops); + cleanup_nat: + ip_nat_cleanup(); + cleanup_rule_init: + ip_nat_rule_cleanup(); + cleanup_nothing: + MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock); + return ret; +} + +static int __init init(void) +{ + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +EXPORT_SYMBOL(ip_nat_setup_info); +EXPORT_SYMBOL(ip_nat_protocol_register); +EXPORT_SYMBOL(ip_nat_protocol_unregister); +EXPORT_SYMBOL(ip_nat_cheat_check); +EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); +EXPORT_SYMBOL(ip_nat_mangle_udp_packet); +EXPORT_SYMBOL(ip_nat_used_tuple); +EXPORT_SYMBOL(ip_nat_follow_master); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c new file mode 100644 index 000000000000..0343e0d64674 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_tftp.c @@ -0,0 +1,70 @@ +/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Version: 0.0.7 + * + * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org> + * - Port to newnat API + * + * This module currently supports DNAT: + * iptables -t nat -A PREROUTING -d x.x.x.x -j DNAT --to-dest x.x.x.y + * + * and SNAT: + * iptables -t nat -A POSTROUTING { -j MASQUERADE , -j SNAT --to-source x.x.x.x } + * + * It has not been tested with + * -j SNAT --to-source x.x.x.x-x.x.x.y since I only have one external ip + * If you do test this please let me know if it works or not. + * + */ + +#include <linux/module.h> +#include <linux/netfilter_ipv4.h> +#include <linux/ip.h> +#include <linux/udp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_tftp.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/moduleparam.h> + +MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); +MODULE_DESCRIPTION("tftp NAT helper"); +MODULE_LICENSE("GPL"); + +static unsigned int help(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp) +{ + exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_REPLY; + exp->expectfn = ip_nat_follow_master; + if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + return NF_DROP; + } + return NF_ACCEPT; +} + +static void __exit fini(void) +{ + ip_nat_tftp_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_tftp_hook); + ip_nat_tftp_hook = help; + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c new file mode 100644 index 000000000000..9e40dffc204f --- /dev/null +++ b/net/ipv4/netfilter/ip_queue.c @@ -0,0 +1,741 @@ +/* + * This is a module which is used for queueing IPv4 packets and + * communicating with userspace via netlink. + * + * (C) 2000-2002 James Morris <jmorris@intercode.com.au> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 2000-03-27: Simplified code (thanks to Andi Kleen for clues). + * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report). + * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian + * Zander). + * 2000-08-01: Added Nick Williams' MAC support. + * 2002-06-25: Code cleanup. + * 2005-01-10: Added /proc counter for dropped packets; fixed so + * packets aren't delivered to user space if they're going + * to be dropped. + * + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/ip.h> +#include <linux/notifier.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_queue.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netlink.h> +#include <linux/spinlock.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/security.h> +#include <net/sock.h> +#include <net/route.h> + +#define IPQ_QMAX_DEFAULT 1024 +#define IPQ_PROC_FS_NAME "ip_queue" +#define NET_IPQ_QMAX 2088 +#define NET_IPQ_QMAX_NAME "ip_queue_maxlen" + +struct ipq_rt_info { + __u8 tos; + __u32 daddr; + __u32 saddr; +}; + +struct ipq_queue_entry { + struct list_head list; + struct nf_info *info; + struct sk_buff *skb; + struct ipq_rt_info rt_info; +}; + +typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); + +static unsigned char copy_mode = IPQ_COPY_NONE; +static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT; +static DEFINE_RWLOCK(queue_lock); +static int peer_pid; +static unsigned int copy_range; +static unsigned int queue_total; +static unsigned int queue_dropped = 0; +static unsigned int queue_user_dropped = 0; +static struct sock *ipqnl; +static LIST_HEAD(queue_list); +static DECLARE_MUTEX(ipqnl_sem); + +static void +ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) +{ + nf_reinject(entry->skb, entry->info, verdict); + kfree(entry); +} + +static inline void +__ipq_enqueue_entry(struct ipq_queue_entry *entry) +{ + list_add(&entry->list, &queue_list); + queue_total++; +} + +/* + * Find and return a queued entry matched by cmpfn, or return the last + * entry if cmpfn is NULL. + */ +static inline struct ipq_queue_entry * +__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct list_head *p; + + list_for_each_prev(p, &queue_list) { + struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p; + + if (!cmpfn || cmpfn(entry, data)) + return entry; + } + return NULL; +} + +static inline void +__ipq_dequeue_entry(struct ipq_queue_entry *entry) +{ + list_del(&entry->list); + queue_total--; +} + +static inline struct ipq_queue_entry * +__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct ipq_queue_entry *entry; + + entry = __ipq_find_entry(cmpfn, data); + if (entry == NULL) + return NULL; + + __ipq_dequeue_entry(entry); + return entry; +} + + +static inline void +__ipq_flush(int verdict) +{ + struct ipq_queue_entry *entry; + + while ((entry = __ipq_find_dequeue_entry(NULL, 0))) + ipq_issue_verdict(entry, verdict); +} + +static inline int +__ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status = 0; + + switch(mode) { + case IPQ_COPY_NONE: + case IPQ_COPY_META: + copy_mode = mode; + copy_range = 0; + break; + + case IPQ_COPY_PACKET: + copy_mode = mode; + copy_range = range; + if (copy_range > 0xFFFF) + copy_range = 0xFFFF; + break; + + default: + status = -EINVAL; + + } + return status; +} + +static inline void +__ipq_reset(void) +{ + peer_pid = 0; + net_disable_timestamp(); + __ipq_set_mode(IPQ_COPY_NONE, 0); + __ipq_flush(NF_DROP); +} + +static struct ipq_queue_entry * +ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct ipq_queue_entry *entry; + + write_lock_bh(&queue_lock); + entry = __ipq_find_dequeue_entry(cmpfn, data); + write_unlock_bh(&queue_lock); + return entry; +} + +static void +ipq_flush(int verdict) +{ + write_lock_bh(&queue_lock); + __ipq_flush(verdict); + write_unlock_bh(&queue_lock); +} + +static struct sk_buff * +ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) +{ + unsigned char *old_tail; + size_t size = 0; + size_t data_len = 0; + struct sk_buff *skb; + struct ipq_packet_msg *pmsg; + struct nlmsghdr *nlh; + + read_lock_bh(&queue_lock); + + switch (copy_mode) { + case IPQ_COPY_META: + case IPQ_COPY_NONE: + size = NLMSG_SPACE(sizeof(*pmsg)); + data_len = 0; + break; + + case IPQ_COPY_PACKET: + if (copy_range == 0 || copy_range > entry->skb->len) + data_len = entry->skb->len; + else + data_len = copy_range; + + size = NLMSG_SPACE(sizeof(*pmsg) + data_len); + break; + + default: + *errp = -EINVAL; + read_unlock_bh(&queue_lock); + return NULL; + } + + read_unlock_bh(&queue_lock); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail= skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); + pmsg = NLMSG_DATA(nlh); + memset(pmsg, 0, sizeof(*pmsg)); + + pmsg->packet_id = (unsigned long )entry; + pmsg->data_len = data_len; + pmsg->timestamp_sec = entry->skb->stamp.tv_sec; + pmsg->timestamp_usec = entry->skb->stamp.tv_usec; + pmsg->mark = entry->skb->nfmark; + pmsg->hook = entry->info->hook; + pmsg->hw_protocol = entry->skb->protocol; + + if (entry->info->indev) + strcpy(pmsg->indev_name, entry->info->indev->name); + else + pmsg->indev_name[0] = '\0'; + + if (entry->info->outdev) + strcpy(pmsg->outdev_name, entry->info->outdev->name); + else + pmsg->outdev_name[0] = '\0'; + + if (entry->info->indev && entry->skb->dev) { + pmsg->hw_type = entry->skb->dev->type; + if (entry->skb->dev->hard_header_parse) + pmsg->hw_addrlen = + entry->skb->dev->hard_header_parse(entry->skb, + pmsg->hw_addr); + } + + if (data_len) + if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) + BUG(); + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: + if (skb) + kfree_skb(skb); + *errp = -EINVAL; + printk(KERN_ERR "ip_queue: error creating packet message\n"); + return NULL; +} + +static int +ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) +{ + int status = -EINVAL; + struct sk_buff *nskb; + struct ipq_queue_entry *entry; + + if (copy_mode == IPQ_COPY_NONE) + return -EAGAIN; + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n"); + return -ENOMEM; + } + + entry->info = info; + entry->skb = skb; + + if (entry->info->hook == NF_IP_LOCAL_OUT) { + struct iphdr *iph = skb->nh.iph; + + entry->rt_info.tos = iph->tos; + entry->rt_info.daddr = iph->daddr; + entry->rt_info.saddr = iph->saddr; + } + + nskb = ipq_build_packet_message(entry, &status); + if (nskb == NULL) + goto err_out_free; + + write_lock_bh(&queue_lock); + + if (!peer_pid) + goto err_out_free_nskb; + + if (queue_total >= queue_maxlen) { + queue_dropped++; + status = -ENOSPC; + if (net_ratelimit()) + printk (KERN_WARNING "ip_queue: full at %d entries, " + "dropping packets(s). Dropped: %d\n", queue_total, + queue_dropped); + goto err_out_free_nskb; + } + + /* netlink_unicast will either free the nskb or attach it to a socket */ + status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); + if (status < 0) { + queue_user_dropped++; + goto err_out_unlock; + } + + __ipq_enqueue_entry(entry); + + write_unlock_bh(&queue_lock); + return status; + +err_out_free_nskb: + kfree_skb(nskb); + +err_out_unlock: + write_unlock_bh(&queue_lock); + +err_out_free: + kfree(entry); + return status; +} + +static int +ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) +{ + int diff; + struct iphdr *user_iph = (struct iphdr *)v->payload; + + if (v->data_len < sizeof(*user_iph)) + return 0; + diff = v->data_len - e->skb->len; + if (diff < 0) + skb_trim(e->skb, v->data_len); + else if (diff > 0) { + if (v->data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + struct sk_buff *newskb; + + newskb = skb_copy_expand(e->skb, + skb_headroom(e->skb), + diff, + GFP_ATOMIC); + if (newskb == NULL) { + printk(KERN_WARNING "ip_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + if (e->skb->sk) + skb_set_owner_w(newskb, e->skb->sk); + kfree_skb(e->skb); + e->skb = newskb; + } + skb_put(e->skb, diff); + } + if (!skb_ip_make_writable(&e->skb, v->data_len)) + return -ENOMEM; + memcpy(e->skb->data, v->payload, v->data_len); + e->skb->nfcache |= NFC_ALTERED; + + /* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + if (e->info->hook == NF_IP_LOCAL_OUT) { + struct iphdr *iph = e->skb->nh.iph; + + if (!(iph->tos == e->rt_info.tos + && iph->daddr == e->rt_info.daddr + && iph->saddr == e->rt_info.saddr)) + return ip_route_me_harder(&e->skb); + } + return 0; +} + +static inline int +id_cmp(struct ipq_queue_entry *e, unsigned long id) +{ + return (id == (unsigned long )e); +} + +static int +ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) +{ + struct ipq_queue_entry *entry; + + if (vmsg->value > NF_MAX_VERDICT) + return -EINVAL; + + entry = ipq_find_dequeue_entry(id_cmp, vmsg->id); + if (entry == NULL) + return -ENOENT; + else { + int verdict = vmsg->value; + + if (vmsg->data_len && vmsg->data_len == len) + if (ipq_mangle_ipv4(vmsg, entry) < 0) + verdict = NF_DROP; + + ipq_issue_verdict(entry, verdict); + return 0; + } +} + +static int +ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status; + + write_lock_bh(&queue_lock); + status = __ipq_set_mode(mode, range); + write_unlock_bh(&queue_lock); + return status; +} + +static int +ipq_receive_peer(struct ipq_peer_msg *pmsg, + unsigned char type, unsigned int len) +{ + int status = 0; + + if (len < sizeof(*pmsg)) + return -EINVAL; + + switch (type) { + case IPQM_MODE: + status = ipq_set_mode(pmsg->msg.mode.value, + pmsg->msg.mode.range); + break; + + case IPQM_VERDICT: + if (pmsg->msg.verdict.value > NF_MAX_VERDICT) + status = -EINVAL; + else + status = ipq_set_verdict(&pmsg->msg.verdict, + len - sizeof(*pmsg)); + break; + default: + status = -EINVAL; + } + return status; +} + +static int +dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex) +{ + if (entry->info->indev) + if (entry->info->indev->ifindex == ifindex) + return 1; + + if (entry->info->outdev) + if (entry->info->outdev->ifindex == ifindex) + return 1; + + return 0; +} + +static void +ipq_dev_drop(int ifindex) +{ + struct ipq_queue_entry *entry; + + while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL) + ipq_issue_verdict(entry, NF_DROP); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static inline void +ipq_rcv_skb(struct sk_buff *skb) +{ + int status, type, pid, flags, nlmsglen, skblen; + struct nlmsghdr *nlh; + + skblen = skb->len; + if (skblen < sizeof(*nlh)) + return; + + nlh = (struct nlmsghdr *)skb->data; + nlmsglen = nlh->nlmsg_len; + if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) + return; + + pid = nlh->nlmsg_pid; + flags = nlh->nlmsg_flags; + + if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) + RCV_SKB_FAIL(-EINVAL); + + if (flags & MSG_TRUNC) + RCV_SKB_FAIL(-ECOMM); + + type = nlh->nlmsg_type; + if (type < NLMSG_NOOP || type >= IPQM_MAX) + RCV_SKB_FAIL(-EINVAL); + + if (type <= IPQM_BASE) + return; + + if (security_netlink_recv(skb)) + RCV_SKB_FAIL(-EPERM); + + write_lock_bh(&queue_lock); + + if (peer_pid) { + if (peer_pid != pid) { + write_unlock_bh(&queue_lock); + RCV_SKB_FAIL(-EBUSY); + } + } else { + net_enable_timestamp(); + peer_pid = pid; + } + + write_unlock_bh(&queue_lock); + + status = ipq_receive_peer(NLMSG_DATA(nlh), type, + skblen - NLMSG_LENGTH(0)); + if (status < 0) + RCV_SKB_FAIL(status); + + if (flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + return; +} + +static void +ipq_rcv_sk(struct sock *sk, int len) +{ + do { + struct sk_buff *skb; + + if (down_trylock(&ipqnl_sem)) + return; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + ipq_rcv_skb(skb); + kfree_skb(skb); + } + + up(&ipqnl_sem); + + } while (ipqnl && ipqnl->sk_receive_queue.qlen); +} + +static int +ipq_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + ipq_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block ipq_dev_notifier = { + .notifier_call = ipq_rcv_dev_event, +}; + +static int +ipq_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_FIREWALL && n->pid) { + write_lock_bh(&queue_lock); + if (n->pid == peer_pid) + __ipq_reset(); + write_unlock_bh(&queue_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block ipq_nl_notifier = { + .notifier_call = ipq_rcv_nl_event, +}; + +static struct ctl_table_header *ipq_sysctl_header; + +static ctl_table ipq_table[] = { + { + .ctl_name = NET_IPQ_QMAX, + .procname = NET_IPQ_QMAX_NAME, + .data = &queue_maxlen, + .maxlen = sizeof(queue_maxlen), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .ctl_name = 0 } +}; + +static ctl_table ipq_dir_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = ipq_table + }, + { .ctl_name = 0 } +}; + +static ctl_table ipq_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipq_dir_table + }, + { .ctl_name = 0 } +}; + +#ifdef CONFIG_PROC_FS +static int +ipq_get_info(char *buffer, char **start, off_t offset, int length) +{ + int len; + + read_lock_bh(&queue_lock); + + len = sprintf(buffer, + "Peer PID : %d\n" + "Copy mode : %hu\n" + "Copy range : %u\n" + "Queue length : %u\n" + "Queue max. length : %u\n" + "Queue dropped : %u\n" + "Netlink dropped : %u\n", + peer_pid, + copy_mode, + copy_range, + queue_total, + queue_maxlen, + queue_dropped, + queue_user_dropped); + + read_unlock_bh(&queue_lock); + + *start = buffer + offset; + len -= offset; + if (len > length) + len = length; + else if (len < 0) + len = 0; + return len; +} +#endif /* CONFIG_PROC_FS */ + +static int +init_or_cleanup(int init) +{ + int status = -ENOMEM; + struct proc_dir_entry *proc; + + if (!init) + goto cleanup; + + netlink_register_notifier(&ipq_nl_notifier); + ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk); + if (ipqnl == NULL) { + printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + + proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info); + if (proc) + proc->owner = THIS_MODULE; + else { + printk(KERN_ERR "ip_queue: failed to create proc entry\n"); + goto cleanup_ipqnl; + } + + register_netdevice_notifier(&ipq_dev_notifier); + ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); + + status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL); + if (status < 0) { + printk(KERN_ERR "ip_queue: failed to register queue handler\n"); + goto cleanup_sysctl; + } + return status; + +cleanup: + nf_unregister_queue_handler(PF_INET); + synchronize_net(); + ipq_flush(NF_DROP); + +cleanup_sysctl: + unregister_sysctl_table(ipq_sysctl_header); + unregister_netdevice_notifier(&ipq_dev_notifier); + proc_net_remove(IPQ_PROC_FS_NAME); + +cleanup_ipqnl: + sock_release(ipqnl->sk_socket); + down(&ipqnl_sem); + up(&ipqnl_sem); + +cleanup_netlink_notifier: + netlink_unregister_notifier(&ipq_nl_notifier); + return status; +} + +static int __init init(void) +{ + + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +MODULE_DESCRIPTION("IPv4 packet queue handler"); +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); +MODULE_LICENSE("GPL"); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c new file mode 100644 index 000000000000..8a54f92b8496 --- /dev/null +++ b/net/ipv4/netfilter/ip_tables.c @@ -0,0 +1,1964 @@ +/* + * Packet matching code. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 19 Jan 2002 Harald Welte <laforge@gnumonks.org> + * - increase module usage count as soon as we have rules inside + * a table + */ +#include <linux/config.h> +#include <linux/cache.h> +#include <linux/skbuff.h> +#include <linux/kmod.h> +#include <linux/vmalloc.h> +#include <linux/netdevice.h> +#include <linux/module.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <net/ip.h> +#include <asm/uaccess.h> +#include <asm/semaphore.h> +#include <linux/proc_fs.h> +#include <linux/err.h> + +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("IPv4 packet filter"); + +/*#define DEBUG_IP_FIREWALL*/ +/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ +/*#define DEBUG_IP_FIREWALL_USER*/ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define IP_NF_ASSERT(x) \ +do { \ + if (!(x)) \ + printk("IP_NF_ASSERT: %s:%s:%u\n", \ + __FUNCTION__, __FILE__, __LINE__); \ +} while(0) +#else +#define IP_NF_ASSERT(x) +#endif +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +static DECLARE_MUTEX(ipt_mutex); + +/* Must have mutex */ +#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) +#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +/* All the better to debug you with... */ +#define static +#define inline +#endif + +/* + We keep a set of rules for each CPU, so we can avoid write-locking + them in the softirq when updating the counters and therefore + only need to read-lock in the softirq; doing a write_lock_bh() in user + context stops packets coming through and allows user context to read + the counters or update the rules. + + To be cache friendly on SMP, we arrange them like so: + [ n-entries ] + ... cache-align padding ... + [ n-entries ] + + Hence the start of any table is given by get_table() below. */ + +/* The table itself */ +struct ipt_table_info +{ + /* Size per table */ + unsigned int size; + /* Number of entries: FIXME. --RR */ + unsigned int number; + /* Initial number of entries. Needed for module usage count */ + unsigned int initial_entries; + + /* Entry points and underflows */ + unsigned int hook_entry[NF_IP_NUMHOOKS]; + unsigned int underflow[NF_IP_NUMHOOKS]; + + /* ipt_entry tables: one per CPU */ + char entries[0] ____cacheline_aligned; +}; + +static LIST_HEAD(ipt_target); +static LIST_HEAD(ipt_match); +static LIST_HEAD(ipt_tables); +#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) + +#ifdef CONFIG_SMP +#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) +#else +#define TABLE_OFFSET(t,p) 0 +#endif + +#if 0 +#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) +#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) +#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0) +#endif + +/* Returns whether matches rule or not. */ +static inline int +ip_packet_match(const struct iphdr *ip, + const char *indev, + const char *outdev, + const struct ipt_ip *ipinfo, + int isfrag) +{ + size_t i; + unsigned long ret; + +#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg)) + + if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, + IPT_INV_SRCIP) + || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, + IPT_INV_DSTIP)) { + dprintf("Source or dest mismatch.\n"); + + dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n", + NIPQUAD(ip->saddr), + NIPQUAD(ipinfo->smsk.s_addr), + NIPQUAD(ipinfo->src.s_addr), + ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : ""); + dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n", + NIPQUAD(ip->daddr), + NIPQUAD(ipinfo->dmsk.s_addr), + NIPQUAD(ipinfo->dst.s_addr), + ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : ""); + return 0; + } + + /* Look for ifname matches; this should unroll nicely. */ + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + ret |= (((const unsigned long *)indev)[i] + ^ ((const unsigned long *)ipinfo->iniface)[i]) + & ((const unsigned long *)ipinfo->iniface_mask)[i]; + } + + if (FWINV(ret != 0, IPT_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, ipinfo->iniface, + ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":""); + return 0; + } + + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + ret |= (((const unsigned long *)outdev)[i] + ^ ((const unsigned long *)ipinfo->outiface)[i]) + & ((const unsigned long *)ipinfo->outiface_mask)[i]; + } + + if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, ipinfo->outiface, + ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":""); + return 0; + } + + /* Check specific protocol */ + if (ipinfo->proto + && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { + dprintf("Packet protocol %hi does not match %hi.%s\n", + ip->protocol, ipinfo->proto, + ipinfo->invflags&IPT_INV_PROTO ? " (INV)":""); + return 0; + } + + /* If we have a fragment rule but the packet is not a fragment + * then we return zero */ + if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) { + dprintf("Fragment rule but not fragment.%s\n", + ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : ""); + return 0; + } + + return 1; +} + +static inline int +ip_checkentry(const struct ipt_ip *ip) +{ + if (ip->flags & ~IPT_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + ip->flags & ~IPT_F_MASK); + return 0; + } + if (ip->invflags & ~IPT_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + ip->invflags & ~IPT_INV_MASK); + return 0; + } + return 1; +} + +static unsigned int +ipt_error(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + if (net_ratelimit()) + printk("ip_tables: error: `%s'\n", (char *)targinfo); + + return NF_DROP; +} + +static inline +int do_match(struct ipt_entry_match *m, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int offset, + int *hotdrop) +{ + /* Stop iteration if it doesn't match */ + if (!m->u.kernel.match->match(skb, in, out, m->data, offset, hotdrop)) + return 1; + else + return 0; +} + +static inline struct ipt_entry * +get_entry(void *base, unsigned int offset) +{ + return (struct ipt_entry *)(base + offset); +} + +/* Returns one of the generic firewall policies, like NF_ACCEPT. */ +unsigned int +ipt_do_table(struct sk_buff **pskb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct ipt_table *table, + void *userdata) +{ + static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); + u_int16_t offset; + struct iphdr *ip; + u_int16_t datalen; + int hotdrop = 0; + /* Initializing verdict to NF_DROP keeps gcc happy. */ + unsigned int verdict = NF_DROP; + const char *indev, *outdev; + void *table_base; + struct ipt_entry *e, *back; + + /* Initialization */ + ip = (*pskb)->nh.iph; + datalen = (*pskb)->len - ip->ihl * 4; + indev = in ? in->name : nulldevname; + outdev = out ? out->name : nulldevname; + /* We handle fragments by dealing with the first fragment as + * if it was a normal packet. All other fragments are treated + * normally, except that they will NEVER match rules that ask + * things we don't know, ie. tcp syn flag or ports). If the + * rule is also a fragment-specific rule, non-fragments won't + * match it. */ + offset = ntohs(ip->frag_off) & IP_OFFSET; + + read_lock_bh(&table->lock); + IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + table_base = (void *)table->private->entries + + TABLE_OFFSET(table->private, smp_processor_id()); + e = get_entry(table_base, table->private->hook_entry[hook]); + +#ifdef CONFIG_NETFILTER_DEBUG + /* Check noone else using our table */ + if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac + && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) { + printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n", + smp_processor_id(), + table->name, + &((struct ipt_entry *)table_base)->comefrom, + ((struct ipt_entry *)table_base)->comefrom); + } + ((struct ipt_entry *)table_base)->comefrom = 0x57acc001; +#endif + + /* For return from builtin chain */ + back = get_entry(table_base, table->private->underflow[hook]); + + do { + IP_NF_ASSERT(e); + IP_NF_ASSERT(back); + (*pskb)->nfcache |= e->nfcache; + if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { + struct ipt_entry_target *t; + + if (IPT_MATCH_ITERATE(e, do_match, + *pskb, in, out, + offset, &hotdrop) != 0) + goto no_match; + + ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); + + t = ipt_get_target(e); + IP_NF_ASSERT(t->u.kernel.target); + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct ipt_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? */ + if (v != IPT_RETURN) { + verdict = (unsigned)(-v) - 1; + break; + } + e = back; + back = get_entry(table_base, + back->comefrom); + continue; + } + if (table_base + v + != (void *)e + e->next_offset) { + /* Save old back ptr in next entry */ + struct ipt_entry *next + = (void *)e + e->next_offset; + next->comefrom + = (void *)back - table_base; + /* set back pointer to next entry */ + back = next; + } + + e = get_entry(table_base, v); + } else { + /* Targets which reenter must return + abs. verdicts */ +#ifdef CONFIG_NETFILTER_DEBUG + ((struct ipt_entry *)table_base)->comefrom + = 0xeeeeeeec; +#endif + verdict = t->u.kernel.target->target(pskb, + in, out, + hook, + t->data, + userdata); + +#ifdef CONFIG_NETFILTER_DEBUG + if (((struct ipt_entry *)table_base)->comefrom + != 0xeeeeeeec + && verdict == IPT_CONTINUE) { + printk("Target %s reentered!\n", + t->u.kernel.target->name); + verdict = NF_DROP; + } + ((struct ipt_entry *)table_base)->comefrom + = 0x57acc001; +#endif + /* Target might have changed stuff. */ + ip = (*pskb)->nh.iph; + datalen = (*pskb)->len - ip->ihl * 4; + + if (verdict == IPT_CONTINUE) + e = (void *)e + e->next_offset; + else + /* Verdict */ + break; + } + } else { + + no_match: + e = (void *)e + e->next_offset; + } + } while (!hotdrop); + +#ifdef CONFIG_NETFILTER_DEBUG + ((struct ipt_entry *)table_base)->comefrom = 0xdead57ac; +#endif + read_unlock_bh(&table->lock); + +#ifdef DEBUG_ALLOW_ALL + return NF_ACCEPT; +#else + if (hotdrop) + return NF_DROP; + else return verdict; +#endif +} + +/* + * These are weird, but module loading must not be done with mutex + * held (since they will register), and we have to have a single + * function to use try_then_request_module(). + */ + +/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ +static inline struct ipt_table *find_table_lock(const char *name) +{ + struct ipt_table *t; + + if (down_interruptible(&ipt_mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(t, &ipt_tables, list) + if (strcmp(t->name, name) == 0 && try_module_get(t->me)) + return t; + up(&ipt_mutex); + return NULL; +} + +/* Find match, grabs ref. Returns ERR_PTR() on error. */ +static inline struct ipt_match *find_match(const char *name, u8 revision) +{ + struct ipt_match *m; + int err = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(m, &ipt_match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision == revision) { + if (try_module_get(m->me)) { + up(&ipt_mutex); + return m; + } + } else + err = -EPROTOTYPE; /* Found something. */ + } + } + up(&ipt_mutex); + return ERR_PTR(err); +} + +/* Find target, grabs ref. Returns ERR_PTR() on error. */ +static inline struct ipt_target *find_target(const char *name, u8 revision) +{ + struct ipt_target *t; + int err = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(t, &ipt_target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision == revision) { + if (try_module_get(t->me)) { + up(&ipt_mutex); + return t; + } + } else + err = -EPROTOTYPE; /* Found something. */ + } + } + up(&ipt_mutex); + return ERR_PTR(err); +} + +struct ipt_target *ipt_find_target(const char *name, u8 revision) +{ + struct ipt_target *target; + + target = try_then_request_module(find_target(name, revision), + "ipt_%s", name); + if (IS_ERR(target) || !target) + return NULL; + return target; +} + +static int match_revfn(const char *name, u8 revision, int *bestp) +{ + struct ipt_match *m; + int have_rev = 0; + + list_for_each_entry(m, &ipt_match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision > *bestp) + *bestp = m->revision; + if (m->revision == revision) + have_rev = 1; + } + } + return have_rev; +} + +static int target_revfn(const char *name, u8 revision, int *bestp) +{ + struct ipt_target *t; + int have_rev = 0; + + list_for_each_entry(t, &ipt_target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision > *bestp) + *bestp = t->revision; + if (t->revision == revision) + have_rev = 1; + } + } + return have_rev; +} + +/* Returns true or false (if no such extension at all) */ +static inline int find_revision(const char *name, u8 revision, + int (*revfn)(const char *, u8, int *), + int *err) +{ + int have_rev, best = -1; + + if (down_interruptible(&ipt_mutex) != 0) { + *err = -EINTR; + return 1; + } + have_rev = revfn(name, revision, &best); + up(&ipt_mutex); + + /* Nothing at all? Return 0 to try loading module. */ + if (best == -1) { + *err = -ENOENT; + return 0; + } + + *err = best; + if (!have_rev) + *err = -EPROTONOSUPPORT; + return 1; +} + + +/* All zeroes == unconditional rule. */ +static inline int +unconditional(const struct ipt_ip *ip) +{ + unsigned int i; + + for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++) + if (((__u32 *)ip)[i]) + return 0; + + return 1; +} + +/* Figures out from what hook each rule can be called: returns 0 if + there are loops. Puts hook bitmask in comefrom. */ +static int +mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + to 0 as we leave), and comefrom to save source hook bitmask */ + for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct ipt_entry *e + = (struct ipt_entry *)(newinfo->entries + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. */ + e->counters.pcnt = pos; + + for (;;) { + struct ipt_standard_target *t + = (void *)ipt_get_target(e); + + if (e->comefrom & (1 << NF_IP_NUMHOOKS)) { + printk("iptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom + |= ((1 << hook) | (1 << NF_IP_NUMHOOKS)); + + /* Unconditional return/END. */ + if (e->target_offset == sizeof(struct ipt_entry) + && (strcmp(t->target.u.user.name, + IPT_STANDARD_TARGET) == 0) + && t->verdict < 0 + && unconditional(&e->ip)) { + unsigned int oldpos, size; + + /* Return: backtrack through the last + big jump. */ + do { + e->comefrom ^= (1<<NF_IP_NUMHOOKS); +#ifdef DEBUG_IP_FIREWALL_USER + if (e->comefrom + & (1 << NF_IP_NUMHOOKS)) { + duprintf("Back unset " + "on hook %u " + "rule %u\n", + hook, pos); + } +#endif + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct ipt_entry *) + (newinfo->entries + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct ipt_entry *) + (newinfo->entries + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + IPT_STANDARD_TARGET) == 0 + && newpos >= 0) { + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct ipt_entry *) + (newinfo->entries + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static inline int +cleanup_match(struct ipt_entry_match *m, unsigned int *i) +{ + if (i && (*i)-- == 0) + return 1; + + if (m->u.kernel.match->destroy) + m->u.kernel.match->destroy(m->data, + m->u.match_size - sizeof(*m)); + module_put(m->u.kernel.match->me); + return 0; +} + +static inline int +standard_check(const struct ipt_entry_target *t, + unsigned int max_offset) +{ + struct ipt_standard_target *targ = (void *)t; + + /* Check standard info. */ + if (t->u.target_size + != IPT_ALIGN(sizeof(struct ipt_standard_target))) { + duprintf("standard_check: target size %u != %u\n", + t->u.target_size, + IPT_ALIGN(sizeof(struct ipt_standard_target))); + return 0; + } + + if (targ->verdict >= 0 + && targ->verdict > max_offset - sizeof(struct ipt_entry)) { + duprintf("ipt_standard_check: bad verdict (%i)\n", + targ->verdict); + return 0; + } + + if (targ->verdict < -NF_MAX_VERDICT - 1) { + duprintf("ipt_standard_check: bad negative verdict (%i)\n", + targ->verdict); + return 0; + } + return 1; +} + +static inline int +check_match(struct ipt_entry_match *m, + const char *name, + const struct ipt_ip *ip, + unsigned int hookmask, + unsigned int *i) +{ + struct ipt_match *match; + + match = try_then_request_module(find_match(m->u.user.name, + m->u.user.revision), + "ipt_%s", m->u.user.name); + if (IS_ERR(match) || !match) { + duprintf("check_match: `%s' not found\n", m->u.user.name); + return match ? PTR_ERR(match) : -ENOENT; + } + m->u.kernel.match = match; + + if (m->u.kernel.match->checkentry + && !m->u.kernel.match->checkentry(name, ip, m->data, + m->u.match_size - sizeof(*m), + hookmask)) { + module_put(m->u.kernel.match->me); + duprintf("ip_tables: check failed for `%s'.\n", + m->u.kernel.match->name); + return -EINVAL; + } + + (*i)++; + return 0; +} + +static struct ipt_target ipt_standard_target; + +static inline int +check_entry(struct ipt_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct ipt_entry_target *t; + struct ipt_target *target; + int ret; + unsigned int j; + + if (!ip_checkentry(&e->ip)) { + duprintf("ip_tables: ip check failed %p %s.\n", e, name); + return -EINVAL; + } + + j = 0; + ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j); + if (ret != 0) + goto cleanup_matches; + + t = ipt_get_target(e); + target = try_then_request_module(find_target(t->u.user.name, + t->u.user.revision), + "ipt_%s", t->u.user.name); + if (IS_ERR(target) || !target) { + duprintf("check_entry: `%s' not found\n", t->u.user.name); + ret = target ? PTR_ERR(target) : -ENOENT; + goto cleanup_matches; + } + t->u.kernel.target = target; + + if (t->u.kernel.target == &ipt_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto cleanup_matches; + } + } else if (t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, t->data, + t->u.target_size + - sizeof(*t), + e->comefrom)) { + module_put(t->u.kernel.target->me); + duprintf("ip_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + goto cleanup_matches; + } + + (*i)++; + return 0; + + cleanup_matches: + IPT_MATCH_ITERATE(e, cleanup_match, &j); + return ret; +} + +static inline int +check_entry_size_and_hooks(struct ipt_entry *e, + struct ipt_table_info *newinfo, + unsigned char *base, + unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int *i) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 + || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_IP_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* FIXME: underflows must be unconditional, standard verdicts + < 0 (not IPT_RETURN). --RR */ + + /* Clear counters and comefrom */ + e->counters = ((struct ipt_counters) { 0, 0 }); + e->comefrom = 0; + + (*i)++; + return 0; +} + +static inline int +cleanup_entry(struct ipt_entry *e, unsigned int *i) +{ + struct ipt_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + /* Cleanup all matches */ + IPT_MATCH_ITERATE(e, cleanup_match, NULL); + t = ipt_get_target(e); + if (t->u.kernel.target->destroy) + t->u.kernel.target->destroy(t->data, + t->u.target_size - sizeof(*t)); + module_put(t->u.kernel.target->me); + return 0; +} + +/* Checks and translates the user-supplied table segment (held in + newinfo) */ +static int +translate_table(const char *name, + unsigned int valid_hooks, + struct ipt_table_info *newinfo, + unsigned int size, + unsigned int number, + const unsigned int *hook_entries, + const unsigned int *underflows) +{ + unsigned int i; + int ret; + + newinfo->size = size; + newinfo->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_IP_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + /* Walk through entries, checking offsets. */ + ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry_size_and_hooks, + newinfo, + newinfo->entries, + newinfo->entries + size, + hook_entries, underflows, &i); + if (ret != 0) + return ret; + + if (i != number) { + duprintf("translate_table: %u not %u entries\n", + i, number); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_IP_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, valid_hooks)) + return -ELOOP; + + /* Finally, each sanity check must pass */ + i = 0; + ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry, name, size, &i); + + if (ret != 0) { + IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + cleanup_entry, &i); + return ret; + } + + /* And one copy for every other CPU */ + for (i = 1; i < num_possible_cpus(); i++) { + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, + newinfo->entries, + SMP_ALIGN(newinfo->size)); + } + + return ret; +} + +static struct ipt_table_info * +replace_table(struct ipt_table *table, + unsigned int num_counters, + struct ipt_table_info *newinfo, + int *error) +{ + struct ipt_table_info *oldinfo; + +#ifdef CONFIG_NETFILTER_DEBUG + { + struct ipt_entry *table_base; + unsigned int i; + + for (i = 0; i < num_possible_cpus(); i++) { + table_base = + (void *)newinfo->entries + + TABLE_OFFSET(newinfo, i); + + table_base->comefrom = 0xdead57ac; + } + } +#endif + + /* Do the substitution. */ + write_lock_bh(&table->lock); + /* Check inside lock: is the old number correct? */ + if (num_counters != table->private->number) { + duprintf("num_counters != table->private->number (%u/%u)\n", + num_counters, table->private->number); + write_unlock_bh(&table->lock); + *error = -EAGAIN; + return NULL; + } + oldinfo = table->private; + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); + + return oldinfo; +} + +/* Gets counters. */ +static inline int +add_entry_to_counter(const struct ipt_entry *e, + struct ipt_counters total[], + unsigned int *i) +{ + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + +static void +get_counters(const struct ipt_table_info *t, + struct ipt_counters counters[]) +{ + unsigned int cpu; + unsigned int i; + + for (cpu = 0; cpu < num_possible_cpus(); cpu++) { + i = 0; + IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + t->size, + add_entry_to_counter, + counters, + &i); + } +} + +static int +copy_entries_to_user(unsigned int total_size, + struct ipt_table *table, + void __user *userptr) +{ + unsigned int off, num, countersize; + struct ipt_entry *e; + struct ipt_counters *counters; + int ret = 0; + + /* We need atomic snapshot of counters: rest doesn't change + (other than comefrom, which userspace doesn't care + about). */ + countersize = sizeof(struct ipt_counters) * table->private->number; + counters = vmalloc(countersize); + + if (counters == NULL) + return -ENOMEM; + + /* First, sum counters... */ + memset(counters, 0, countersize); + write_lock_bh(&table->lock); + get_counters(table->private, counters); + write_unlock_bh(&table->lock); + + /* ... then copy entire thing from CPU 0... */ + if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + unsigned int i; + struct ipt_entry_match *m; + struct ipt_entry_target *t; + + e = (struct ipt_entry *)(table->private->entries + off); + if (copy_to_user(userptr + off + + offsetof(struct ipt_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + for (i = sizeof(struct ipt_entry); + i < e->target_offset; + i += m->u.match_size) { + m = (void *)e + i; + + if (copy_to_user(userptr + off + i + + offsetof(struct ipt_entry_match, + u.user.name), + m->u.kernel.match->name, + strlen(m->u.kernel.match->name)+1) + != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + t = ipt_get_target(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct ipt_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +static int +get_entries(const struct ipt_get_entries *entries, + struct ipt_get_entries __user *uptr) +{ + int ret; + struct ipt_table *t; + + t = find_table_lock(entries->name); + if (t && !IS_ERR(t)) { + duprintf("t->private->number = %u\n", + t->private->number); + if (entries->size == t->private->size) + ret = copy_entries_to_user(t->private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + t->private->size, + entries->size); + ret = -EINVAL; + } + module_put(t->me); + up(&ipt_mutex); + } else + ret = t ? PTR_ERR(t) : -ENOENT; + + return ret; +} + +static int +do_replace(void __user *user, unsigned int len) +{ + int ret; + struct ipt_replace tmp; + struct ipt_table *t; + struct ipt_table_info *newinfo, *oldinfo; + struct ipt_counters *counters; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* Hack: Causes ipchains to give correct error msg --RR */ + if (len != sizeof(tmp) + tmp.size) + return -ENOPROTOOPT; + + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ + if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) + return -ENOMEM; + + newinfo = vmalloc(sizeof(struct ipt_table_info) + + SMP_ALIGN(tmp.size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + if (copy_from_user(newinfo->entries, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters)); + if (!counters) { + ret = -ENOMEM; + goto free_newinfo; + } + memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters)); + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo_counters; + + duprintf("ip_tables: Translated table\n"); + + t = try_then_request_module(find_table_lock(tmp.name), + "iptable_%s", tmp.name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free_newinfo_counters_untrans; + } + + /* You lied! */ + if (tmp.valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + tmp.valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto put_module; + } + + oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if ((oldinfo->number > oldinfo->initial_entries) || + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + if ((oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + + /* Get the old counters. */ + get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ + IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); + vfree(oldinfo); + if (copy_to_user(tmp.counters, counters, + sizeof(struct ipt_counters) * tmp.num_counters) != 0) + ret = -EFAULT; + vfree(counters); + up(&ipt_mutex); + return ret; + + put_module: + module_put(t->me); + up(&ipt_mutex); + free_newinfo_counters_untrans: + IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + free_newinfo_counters: + vfree(counters); + free_newinfo: + vfree(newinfo); + return ret; +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static inline int +add_counter_to_entry(struct ipt_entry *e, + const struct ipt_counters addme[], + unsigned int *i) +{ +#if 0 + duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", + *i, + (long unsigned int)e->counters.pcnt, + (long unsigned int)e->counters.bcnt, + (long unsigned int)addme[*i].pcnt, + (long unsigned int)addme[*i].bcnt); +#endif + + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +static int +do_add_counters(void __user *user, unsigned int len) +{ + unsigned int i; + struct ipt_counters_info tmp, *paddc; + struct ipt_table *t; + int ret = 0; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) + return -EINVAL; + + paddc = vmalloc(len); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user, len) != 0) { + ret = -EFAULT; + goto free; + } + + t = find_table_lock(tmp.name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free; + } + + write_lock_bh(&t->lock); + if (t->private->number != paddc->num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + IPT_ENTRY_ITERATE(t->private->entries, + t->private->size, + add_counter_to_entry, + paddc->counters, + &i); + unlock_up_free: + write_unlock_bh(&t->lock); + up(&ipt_mutex); + module_put(t->me); + free: + vfree(paddc); + + return ret; +} + +static int +do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IPT_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case IPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len); + break; + + default: + duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int +do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IPT_SO_GET_INFO: { + char name[IPT_TABLE_MAXNAMELEN]; + struct ipt_table *t; + + if (*len != sizeof(struct ipt_getinfo)) { + duprintf("length %u != %u\n", *len, + sizeof(struct ipt_getinfo)); + ret = -EINVAL; + break; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) { + ret = -EFAULT; + break; + } + name[IPT_TABLE_MAXNAMELEN-1] = '\0'; + + t = try_then_request_module(find_table_lock(name), + "iptable_%s", name); + if (t && !IS_ERR(t)) { + struct ipt_getinfo info; + + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, t->private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, t->private->underflow, + sizeof(info.underflow)); + info.num_entries = t->private->number; + info.size = t->private->size; + memcpy(info.name, name, sizeof(info.name)); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + up(&ipt_mutex); + module_put(t->me); + } else + ret = t ? PTR_ERR(t) : -ENOENT; + } + break; + + case IPT_SO_GET_ENTRIES: { + struct ipt_get_entries get; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %u\n", *len, sizeof(get)); + ret = -EINVAL; + } else if (copy_from_user(&get, user, sizeof(get)) != 0) { + ret = -EFAULT; + } else if (*len != sizeof(struct ipt_get_entries) + get.size) { + duprintf("get_entries: %u != %u\n", *len, + sizeof(struct ipt_get_entries) + get.size); + ret = -EINVAL; + } else + ret = get_entries(&get, user); + break; + } + + case IPT_SO_GET_REVISION_MATCH: + case IPT_SO_GET_REVISION_TARGET: { + struct ipt_get_revision rev; + int (*revfn)(const char *, u8, int *); + + if (*len != sizeof(rev)) { + ret = -EINVAL; + break; + } + if (copy_from_user(&rev, user, sizeof(rev)) != 0) { + ret = -EFAULT; + break; + } + + if (cmd == IPT_SO_GET_REVISION_TARGET) + revfn = target_revfn; + else + revfn = match_revfn; + + try_then_request_module(find_revision(rev.name, rev.revision, + revfn, &ret), + "ipt_%s", rev.name); + break; + } + + default: + duprintf("do_ipt_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +/* Registration hooks for targets. */ +int +ipt_register_target(struct ipt_target *target) +{ + int ret; + + ret = down_interruptible(&ipt_mutex); + if (ret != 0) + return ret; + list_add(&target->list, &ipt_target); + up(&ipt_mutex); + return ret; +} + +void +ipt_unregister_target(struct ipt_target *target) +{ + down(&ipt_mutex); + LIST_DELETE(&ipt_target, target); + up(&ipt_mutex); +} + +int +ipt_register_match(struct ipt_match *match) +{ + int ret; + + ret = down_interruptible(&ipt_mutex); + if (ret != 0) + return ret; + + list_add(&match->list, &ipt_match); + up(&ipt_mutex); + + return ret; +} + +void +ipt_unregister_match(struct ipt_match *match) +{ + down(&ipt_mutex); + LIST_DELETE(&ipt_match, match); + up(&ipt_mutex); +} + +int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) +{ + int ret; + struct ipt_table_info *newinfo; + static struct ipt_table_info bootstrap + = { 0, 0, 0, { 0 }, { 0 }, { } }; + + newinfo = vmalloc(sizeof(struct ipt_table_info) + + SMP_ALIGN(repl->size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + memcpy(newinfo->entries, repl->entries, repl->size); + + ret = translate_table(table->name, table->valid_hooks, + newinfo, repl->size, + repl->num_entries, + repl->hook_entry, + repl->underflow); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + ret = down_interruptible(&ipt_mutex); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + /* Don't autoload: we'd eat our tail... */ + if (list_named_find(&ipt_tables, table->name)) { + ret = -EEXIST; + goto free_unlock; + } + + /* Simplifies replace_table code. */ + table->private = &bootstrap; + if (!replace_table(table, 0, newinfo, &ret)) + goto free_unlock; + + duprintf("table->private->number = %u\n", + table->private->number); + + /* save number of initial entries */ + table->private->initial_entries = table->private->number; + + rwlock_init(&table->lock); + list_prepend(&ipt_tables, table); + + unlock: + up(&ipt_mutex); + return ret; + + free_unlock: + vfree(newinfo); + goto unlock; +} + +void ipt_unregister_table(struct ipt_table *table) +{ + down(&ipt_mutex); + LIST_DELETE(&ipt_tables, table); + up(&ipt_mutex); + + /* Decrease module usage counts and free resources */ + IPT_ENTRY_ITERATE(table->private->entries, table->private->size, + cleanup_entry, NULL); + vfree(table->private); +} + +/* Returns 1 if the port is matched by the range, 0 otherwise */ +static inline int +port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert) +{ + int ret; + + ret = (port >= min && port <= max) ^ invert; + return ret; +} + +static int +tcp_find_option(u_int8_t option, + const struct sk_buff *skb, + unsigned int optlen, + int invert, + int *hotdrop) +{ + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; + unsigned int i; + + duprintf("tcp_match: finding option\n"); + + if (!optlen) + return invert; + + /* If we don't have the whole header, drop packet. */ + op = skb_header_pointer(skb, + skb->nh.iph->ihl*4 + sizeof(struct tcphdr), + optlen, _opt); + if (op == NULL) { + *hotdrop = 1; + return 0; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) return !invert; + if (op[i] < 2) i++; + else i += op[i+1]?:1; + } + + return invert; +} + +static int +tcp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct tcphdr _tcph, *th; + const struct ipt_tcp *tcpinfo = matchinfo; + + if (offset) { + /* To quote Alan: + + Don't allow a fragment of TCP 8 bytes in. Nobody normal + causes this. Its a cracker trying to break in by doing a + flag overwrite to pass the direction checks. + */ + if (offset == 1) { + duprintf("Dropping evil TCP offset=1 frag.\n"); + *hotdrop = 1; + } + /* Must not be a fragment. */ + return 0; + } + +#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg)) + + th = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil TCP offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], + ntohs(th->source), + !!(tcpinfo->invflags & IPT_TCP_INV_SRCPT))) + return 0; + if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], + ntohs(th->dest), + !!(tcpinfo->invflags & IPT_TCP_INV_DSTPT))) + return 0; + if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) + == tcpinfo->flg_cmp, + IPT_TCP_INV_FLAGS)) + return 0; + if (tcpinfo->option) { + if (th->doff * 4 < sizeof(_tcph)) { + *hotdrop = 1; + return 0; + } + if (!tcp_find_option(tcpinfo->option, skb, + th->doff*4 - sizeof(_tcph), + tcpinfo->invflags & IPT_TCP_INV_OPTION, + hotdrop)) + return 0; + } + return 1; +} + +/* Called when user tries to insert an entry of this type. */ +static int +tcp_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_tcp *tcpinfo = matchinfo; + + /* Must specify proto == TCP, and no unknown invflags */ + return ip->proto == IPPROTO_TCP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_tcp)) + && !(tcpinfo->invflags & ~IPT_TCP_INV_MASK); +} + +static int +udp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct udphdr _udph, *uh; + const struct ipt_udp *udpinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + uh = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil UDP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return port_match(udpinfo->spts[0], udpinfo->spts[1], + ntohs(uh->source), + !!(udpinfo->invflags & IPT_UDP_INV_SRCPT)) + && port_match(udpinfo->dpts[0], udpinfo->dpts[1], + ntohs(uh->dest), + !!(udpinfo->invflags & IPT_UDP_INV_DSTPT)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +udp_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ipt_udp *udpinfo = matchinfo; + + /* Must specify proto == UDP, and no unknown invflags */ + if (ip->proto != IPPROTO_UDP || (ip->invflags & IPT_INV_PROTO)) { + duprintf("ipt_udp: Protocol %u != %u\n", ip->proto, + IPPROTO_UDP); + return 0; + } + if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_udp))) { + duprintf("ipt_udp: matchsize %u != %u\n", + matchinfosize, IPT_ALIGN(sizeof(struct ipt_udp))); + return 0; + } + if (udpinfo->invflags & ~IPT_UDP_INV_MASK) { + duprintf("ipt_udp: unknown flags %X\n", + udpinfo->invflags); + return 0; + } + + return 1; +} + +/* Returns 1 if the type and code is matched by the range, 0 otherwise */ +static inline int +icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, + u_int8_t type, u_int8_t code, + int invert) +{ + return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code)) + ^ invert; +} + +static int +icmp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct icmphdr _icmph, *ic; + const struct ipt_icmp *icmpinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + ic = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_icmph), &_icmph); + if (ic == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("Dropping evil ICMP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return icmp_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + ic->type, ic->code, + !!(icmpinfo->invflags&IPT_ICMP_INV)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +icmp_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_icmp *icmpinfo = matchinfo; + + /* Must specify proto == ICMP, and no unknown invflags */ + return ip->proto == IPPROTO_ICMP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_icmp)) + && !(icmpinfo->invflags & ~IPT_ICMP_INV); +} + +/* The built-in targets: standard (NULL) and error. */ +static struct ipt_target ipt_standard_target = { + .name = IPT_STANDARD_TARGET, +}; + +static struct ipt_target ipt_error_target = { + .name = IPT_ERROR_TARGET, + .target = ipt_error, +}; + +static struct nf_sockopt_ops ipt_sockopts = { + .pf = PF_INET, + .set_optmin = IPT_BASE_CTL, + .set_optmax = IPT_SO_SET_MAX+1, + .set = do_ipt_set_ctl, + .get_optmin = IPT_BASE_CTL, + .get_optmax = IPT_SO_GET_MAX+1, + .get = do_ipt_get_ctl, +}; + +static struct ipt_match tcp_matchstruct = { + .name = "tcp", + .match = &tcp_match, + .checkentry = &tcp_checkentry, +}; + +static struct ipt_match udp_matchstruct = { + .name = "udp", + .match = &udp_match, + .checkentry = &udp_checkentry, +}; + +static struct ipt_match icmp_matchstruct = { + .name = "icmp", + .match = &icmp_match, + .checkentry = &icmp_checkentry, +}; + +#ifdef CONFIG_PROC_FS +static inline int print_name(const char *i, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if ((*count)++ >= start_offset) { + unsigned int namelen; + + namelen = sprintf(buffer + *pos, "%s\n", + i + sizeof(struct list_head)); + if (*pos + namelen > length) { + /* Stop iterating */ + return 1; + } + *pos += namelen; + } + return 0; +} + +static inline int print_target(const struct ipt_target *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if (t == &ipt_standard_target || t == &ipt_error_target) + return 0; + return print_name((char *)t, start_offset, buffer, length, pos, count); +} + +static int ipt_get_tables(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return 0; + + LIST_FIND(&ipt_tables, print_name, void *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); + + /* `start' hack - see fs/proc/generic.c line ~105 */ + *start=(char *)((unsigned long)count-offset); + return pos; +} + +static int ipt_get_targets(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return 0; + + LIST_FIND(&ipt_target, print_target, struct ipt_target *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); + + *start = (char *)((unsigned long)count - offset); + return pos; +} + +static int ipt_get_matches(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return 0; + + LIST_FIND(&ipt_match, print_name, void *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); + + *start = (char *)((unsigned long)count - offset); + return pos; +} + +static struct { char *name; get_info_t *get_info; } ipt_proc_entry[] = +{ { "ip_tables_names", ipt_get_tables }, + { "ip_tables_targets", ipt_get_targets }, + { "ip_tables_matches", ipt_get_matches }, + { NULL, NULL} }; +#endif /*CONFIG_PROC_FS*/ + +static int __init init(void) +{ + int ret; + + /* Noone else will be downing sem now, so we won't sleep */ + down(&ipt_mutex); + list_append(&ipt_target, &ipt_standard_target); + list_append(&ipt_target, &ipt_error_target); + list_append(&ipt_match, &tcp_matchstruct); + list_append(&ipt_match, &udp_matchstruct); + list_append(&ipt_match, &icmp_matchstruct); + up(&ipt_mutex); + + /* Register setsockopt */ + ret = nf_register_sockopt(&ipt_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); + return ret; + } + +#ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *proc; + int i; + + for (i = 0; ipt_proc_entry[i].name; i++) { + proc = proc_net_create(ipt_proc_entry[i].name, 0, + ipt_proc_entry[i].get_info); + if (!proc) { + while (--i >= 0) + proc_net_remove(ipt_proc_entry[i].name); + nf_unregister_sockopt(&ipt_sockopts); + return -ENOMEM; + } + proc->owner = THIS_MODULE; + } + } +#endif + + printk("ip_tables: (C) 2000-2002 Netfilter core team\n"); + return 0; +} + +static void __exit fini(void) +{ + nf_unregister_sockopt(&ipt_sockopts); +#ifdef CONFIG_PROC_FS + { + int i; + for (i = 0; ipt_proc_entry[i].name; i++) + proc_net_remove(ipt_proc_entry[i].name); + } +#endif +} + +EXPORT_SYMBOL(ipt_register_table); +EXPORT_SYMBOL(ipt_unregister_table); +EXPORT_SYMBOL(ipt_register_match); +EXPORT_SYMBOL(ipt_unregister_match); +EXPORT_SYMBOL(ipt_do_table); +EXPORT_SYMBOL(ipt_register_target); +EXPORT_SYMBOL(ipt_unregister_target); +EXPORT_SYMBOL(ipt_find_target); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c new file mode 100644 index 000000000000..9842e6e23184 --- /dev/null +++ b/net/ipv4/netfilter/ipt_CLASSIFY.c @@ -0,0 +1,92 @@ +/* + * This is a module which is used for setting the skb->priority field + * of an skb for qdisc classification. + */ + +/* (C) 2001-2002 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_CLASSIFY.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("iptables qdisc classification target module"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_classify_target_info *clinfo = targinfo; + + if((*pskb)->priority != clinfo->priority) { + (*pskb)->priority = clinfo->priority; + (*pskb)->nfcache |= NFC_ALTERED; + } + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_classify_target_info))){ + printk(KERN_ERR "CLASSIFY: invalid size (%u != %Zu).\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_classify_target_info))); + return 0; + } + + if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) | + (1 << NF_IP_POST_ROUTING))) { + printk(KERN_ERR "CLASSIFY: only valid in LOCAL_OUT, FORWARD " + "and POST_ROUTING.\n"); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_ERR "CLASSIFY: can only be called from " + "\"mangle\" table, not \"%s\".\n", + tablename); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_classify_reg = { + .name = "CLASSIFY", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_classify_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_classify_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c new file mode 100644 index 000000000000..0f12e3a3dc73 --- /dev/null +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -0,0 +1,761 @@ +/* Cluster IP hashmark target + * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> + * based on ideas of Fabio Olive Leite <olive@unixforge.org> + * + * Development of this code funded by SuSE Linux AG, http://www.suse.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/module.h> +#include <linux/config.h> +#include <linux/proc_fs.h> +#include <linux/jhash.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/if_arp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +#include <net/checksum.h> + +#include <linux/netfilter_arp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/lockhelp.h> + +#define CLUSTERIP_VERSION "0.6" + +#define DEBUG_CLUSTERIP + +#ifdef DEBUG_CLUSTERIP +#define DEBUGP printk +#else +#define DEBUGP +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("iptables target for CLUSTERIP"); + +struct clusterip_config { + struct list_head list; /* list of all configs */ + atomic_t refcount; /* reference count */ + + u_int32_t clusterip; /* the IP address */ + u_int8_t clustermac[ETH_ALEN]; /* the MAC address */ + struct net_device *dev; /* device */ + u_int16_t num_total_nodes; /* total number of nodes */ + u_int16_t num_local_nodes; /* number of local nodes */ + u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; /* node number array */ + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; /* proc dir entry */ +#endif + enum clusterip_hashmode hash_mode; /* which hashing mode */ + u_int32_t hash_initval; /* hash initialization */ +}; + +static LIST_HEAD(clusterip_configs); + +/* clusterip_lock protects the clusterip_configs list _AND_ the configurable + * data within all structurses (num_local_nodes, local_nodes[]) */ +static DECLARE_RWLOCK(clusterip_lock); + +#ifdef CONFIG_PROC_FS +static struct file_operations clusterip_proc_fops; +static struct proc_dir_entry *clusterip_procdir; +#endif + +static inline void +clusterip_config_get(struct clusterip_config *c) { + atomic_inc(&c->refcount); +} + +static inline void +clusterip_config_put(struct clusterip_config *c) { + if (atomic_dec_and_test(&c->refcount)) { + WRITE_LOCK(&clusterip_lock); + list_del(&c->list); + WRITE_UNLOCK(&clusterip_lock); + dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0); + dev_put(c->dev); + kfree(c); + } +} + + +static struct clusterip_config * +__clusterip_config_find(u_int32_t clusterip) +{ + struct list_head *pos; + + MUST_BE_READ_LOCKED(&clusterip_lock); + list_for_each(pos, &clusterip_configs) { + struct clusterip_config *c = list_entry(pos, + struct clusterip_config, list); + if (c->clusterip == clusterip) { + return c; + } + } + + return NULL; +} + +static inline struct clusterip_config * +clusterip_config_find_get(u_int32_t clusterip) +{ + struct clusterip_config *c; + + READ_LOCK(&clusterip_lock); + c = __clusterip_config_find(clusterip); + if (!c) { + READ_UNLOCK(&clusterip_lock); + return NULL; + } + atomic_inc(&c->refcount); + READ_UNLOCK(&clusterip_lock); + + return c; +} + +static struct clusterip_config * +clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip, + struct net_device *dev) +{ + struct clusterip_config *c; + char buffer[16]; + + c = kmalloc(sizeof(*c), GFP_ATOMIC); + if (!c) + return NULL; + + memset(c, 0, sizeof(*c)); + c->dev = dev; + c->clusterip = ip; + memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); + c->num_total_nodes = i->num_total_nodes; + c->num_local_nodes = i->num_local_nodes; + memcpy(&c->local_nodes, &i->local_nodes, sizeof(&c->local_nodes)); + c->hash_mode = i->hash_mode; + c->hash_initval = i->hash_initval; + atomic_set(&c->refcount, 1); + +#ifdef CONFIG_PROC_FS + /* create proc dir entry */ + sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip)); + c->pde = create_proc_entry(buffer, S_IWUSR|S_IRUSR, clusterip_procdir); + if (!c->pde) { + kfree(c); + return NULL; + } + c->pde->proc_fops = &clusterip_proc_fops; + c->pde->data = c; +#endif + + WRITE_LOCK(&clusterip_lock); + list_add(&c->list, &clusterip_configs); + WRITE_UNLOCK(&clusterip_lock); + + return c; +} + +static int +clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum) +{ + int i; + + WRITE_LOCK(&clusterip_lock); + + if (c->num_local_nodes >= CLUSTERIP_MAX_NODES + || nodenum > CLUSTERIP_MAX_NODES) { + WRITE_UNLOCK(&clusterip_lock); + return 1; + } + + /* check if we alrady have this number in our array */ + for (i = 0; i < c->num_local_nodes; i++) { + if (c->local_nodes[i] == nodenum) { + WRITE_UNLOCK(&clusterip_lock); + return 1; + } + } + + c->local_nodes[c->num_local_nodes++] = nodenum; + + WRITE_UNLOCK(&clusterip_lock); + return 0; +} + +static int +clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) +{ + int i; + + WRITE_LOCK(&clusterip_lock); + + if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) { + WRITE_UNLOCK(&clusterip_lock); + return 1; + } + + for (i = 0; i < c->num_local_nodes; i++) { + if (c->local_nodes[i] == nodenum) { + int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1)); + memmove(&c->local_nodes[i], &c->local_nodes[i+1], size); + c->num_local_nodes--; + WRITE_UNLOCK(&clusterip_lock); + return 0; + } + } + + WRITE_UNLOCK(&clusterip_lock); + return 1; +} + +static inline u_int32_t +clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config) +{ + struct iphdr *iph = skb->nh.iph; + unsigned long hashval; + u_int16_t sport, dport; + struct tcphdr *th; + struct udphdr *uh; + struct icmphdr *ih; + + switch (iph->protocol) { + case IPPROTO_TCP: + th = (void *)iph+iph->ihl*4; + sport = ntohs(th->source); + dport = ntohs(th->dest); + break; + case IPPROTO_UDP: + uh = (void *)iph+iph->ihl*4; + sport = ntohs(uh->source); + dport = ntohs(uh->dest); + break; + case IPPROTO_ICMP: + ih = (void *)iph+iph->ihl*4; + sport = ntohs(ih->un.echo.id); + dport = (ih->type<<8)|ih->code; + break; + default: + if (net_ratelimit()) { + printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n", + iph->protocol); + } + sport = dport = 0; + } + + switch (config->hash_mode) { + case CLUSTERIP_HASHMODE_SIP: + hashval = jhash_1word(ntohl(iph->saddr), + config->hash_initval); + break; + case CLUSTERIP_HASHMODE_SIP_SPT: + hashval = jhash_2words(ntohl(iph->saddr), sport, + config->hash_initval); + break; + case CLUSTERIP_HASHMODE_SIP_SPT_DPT: + hashval = jhash_3words(ntohl(iph->saddr), sport, dport, + config->hash_initval); + break; + default: + /* to make gcc happy */ + hashval = 0; + /* This cannot happen, unless the check function wasn't called + * at rule load time */ + printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode); + BUG(); + break; + } + + /* node numbers are 1..n, not 0..n */ + return ((hashval % config->num_total_nodes)+1); +} + +static inline int +clusterip_responsible(struct clusterip_config *config, u_int32_t hash) +{ + int i; + + READ_LOCK(&clusterip_lock); + + if (config->num_local_nodes == 0) { + READ_UNLOCK(&clusterip_lock); + return 0; + } + + for (i = 0; i < config->num_local_nodes; i++) { + if (config->local_nodes[i] == hash) { + READ_UNLOCK(&clusterip_lock); + return 1; + } + } + + READ_UNLOCK(&clusterip_lock); + + return 0; +} + +/*********************************************************************** + * IPTABLES TARGET + ***********************************************************************/ + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_clusterip_tgt_info *cipinfo = targinfo; + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); + u_int32_t hash; + + /* don't need to clusterip_config_get() here, since refcount + * is only decremented by destroy() - and ip_tables guarantees + * that the ->target() function isn't called after ->destroy() */ + + if (!ct) { + printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); + /* FIXME: need to drop invalid ones, since replies + * to outgoing connections of other nodes will be + * marked as INVALID */ + return NF_DROP; + } + + /* special case: ICMP error handling. conntrack distinguishes between + * error messages (RELATED) and information requests (see below) */ + if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP + && (ctinfo == IP_CT_RELATED + || ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY)) + return IPT_CONTINUE; + + /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, + * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here + * on, which all have an ID field [relevant for hashing]. */ + + hash = clusterip_hashfn(*pskb, cipinfo->config); + + switch (ctinfo) { + case IP_CT_NEW: + ct->mark = hash; + break; + case IP_CT_RELATED: + case IP_CT_RELATED+IP_CT_IS_REPLY: + /* FIXME: we don't handle expectations at the + * moment. they can arrive on a different node than + * the master connection (e.g. FTP passive mode) */ + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED+IP_CT_IS_REPLY: + break; + default: + break; + } + +#ifdef DEBUG_CLUSTERP + DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); +#endif + DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark); + if (!clusterip_responsible(cipinfo->config, hash)) { + DEBUGP("not responsible\n"); + return NF_DROP; + } + DEBUGP("responsible\n"); + + /* despite being received via linklayer multicast, this is + * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */ + (*pskb)->pkt_type = PACKET_HOST; + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_clusterip_tgt_info *cipinfo = targinfo; + + struct clusterip_config *config; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info))) { + printk(KERN_WARNING "CLUSTERIP: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info))); + return 0; + } + + if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && + cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && + cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { + printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n", + cipinfo->hash_mode); + return 0; + + } + if (e->ip.dmsk.s_addr != 0xffffffff + || e->ip.dst.s_addr == 0) { + printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n"); + return 0; + } + + /* FIXME: further sanity checks */ + + config = clusterip_config_find_get(e->ip.dst.s_addr); + if (!config) { + if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { + printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr)); + return 0; + } else { + struct net_device *dev; + + if (e->ip.iniface[0] == '\0') { + printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n"); + return 0; + } + + dev = dev_get_by_name(e->ip.iniface); + if (!dev) { + printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface); + return 0; + } + + config = clusterip_config_init(cipinfo, + e->ip.dst.s_addr, dev); + if (!config) { + printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n"); + dev_put(dev); + return 0; + } + dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0); + } + } + + cipinfo->config = config; + + return 1; +} + +/* drop reference count of cluster config when rule is deleted */ +static void destroy(void *matchinfo, unsigned int matchinfosize) +{ + struct ipt_clusterip_tgt_info *cipinfo = matchinfo; + + /* we first remove the proc entry and then drop the reference + * count. In case anyone still accesses the file, the open/close + * functions are also incrementing the refcount on their own */ +#ifdef CONFIG_PROC_FS + remove_proc_entry(cipinfo->config->pde->name, + cipinfo->config->pde->parent); +#endif + clusterip_config_put(cipinfo->config); +} + +static struct ipt_target clusterip_tgt = { + .name = "CLUSTERIP", + .target = &target, + .checkentry = &checkentry, + .destroy = &destroy, + .me = THIS_MODULE +}; + + +/*********************************************************************** + * ARP MANGLING CODE + ***********************************************************************/ + +/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */ +struct arp_payload { + u_int8_t src_hw[ETH_ALEN]; + u_int32_t src_ip; + u_int8_t dst_hw[ETH_ALEN]; + u_int32_t dst_ip; +} __attribute__ ((packed)); + +#ifdef CLUSTERIP_DEBUG +static void arp_print(struct arp_payload *payload) +{ +#define HBUFFERLEN 30 + char hbuffer[HBUFFERLEN]; + int j,k; + const char hexbuf[]= "0123456789abcdef"; + + for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) { + hbuffer[k++]=hexbuf[(payload->src_hw[j]>>4)&15]; + hbuffer[k++]=hexbuf[payload->src_hw[j]&15]; + hbuffer[k++]=':'; + } + hbuffer[--k]='\0'; + + printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n", + NIPQUAD(payload->src_ip), hbuffer, + NIPQUAD(payload->dst_ip)); +} +#endif + +static unsigned int +arp_mangle(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct arphdr *arp = (*pskb)->nh.arph; + struct arp_payload *payload; + struct clusterip_config *c; + + /* we don't care about non-ethernet and non-ipv4 ARP */ + if (arp->ar_hrd != htons(ARPHRD_ETHER) + || arp->ar_pro != htons(ETH_P_IP) + || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) + return NF_ACCEPT; + + /* we only want to mangle arp replies */ + if (arp->ar_op != htons(ARPOP_REPLY)) + return NF_ACCEPT; + + payload = (void *)(arp+1); + + /* if there is no clusterip configuration for the arp reply's + * source ip, we don't want to mangle it */ + c = clusterip_config_find_get(payload->src_ip); + if (!c) + return NF_ACCEPT; + + /* normally the linux kernel always replies to arp queries of + * addresses on different interfacs. However, in the CLUSTERIP case + * this wouldn't work, since we didn't subscribe the mcast group on + * other interfaces */ + if (c->dev != out) { + DEBUGP("CLUSTERIP: not mangling arp reply on different " + "interface: cip'%s'-skb'%s'\n", c->dev->name, out->name); + clusterip_config_put(c); + return NF_ACCEPT; + } + + /* mangle reply hardware address */ + memcpy(payload->src_hw, c->clustermac, arp->ar_hln); + +#ifdef CLUSTERIP_DEBUG + DEBUGP(KERN_DEBUG "CLUSTERIP mangled arp reply: "); + arp_print(payload); +#endif + + clusterip_config_put(c); + + return NF_ACCEPT; +} + +static struct nf_hook_ops cip_arp_ops = { + .hook = arp_mangle, + .pf = NF_ARP, + .hooknum = NF_ARP_OUT, + .priority = -1 +}; + +/*********************************************************************** + * PROC DIR HANDLING + ***********************************************************************/ + +#ifdef CONFIG_PROC_FS + +static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct clusterip_config *c = pde->data; + unsigned int *nodeidx; + + READ_LOCK(&clusterip_lock); + if (*pos >= c->num_local_nodes) + return NULL; + + nodeidx = kmalloc(sizeof(unsigned int), GFP_KERNEL); + if (!nodeidx) + return ERR_PTR(-ENOMEM); + + *nodeidx = *pos; + return nodeidx; +} + +static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct clusterip_config *c = pde->data; + unsigned int *nodeidx = (unsigned int *)v; + + *pos = ++(*nodeidx); + if (*pos >= c->num_local_nodes) { + kfree(v); + return NULL; + } + return nodeidx; +} + +static void clusterip_seq_stop(struct seq_file *s, void *v) +{ + kfree(v); + + READ_UNLOCK(&clusterip_lock); +} + +static int clusterip_seq_show(struct seq_file *s, void *v) +{ + struct proc_dir_entry *pde = s->private; + struct clusterip_config *c = pde->data; + unsigned int *nodeidx = (unsigned int *)v; + + if (*nodeidx != 0) + seq_putc(s, ','); + seq_printf(s, "%u", c->local_nodes[*nodeidx]); + + if (*nodeidx == c->num_local_nodes-1) + seq_putc(s, '\n'); + + return 0; +} + +static struct seq_operations clusterip_seq_ops = { + .start = clusterip_seq_start, + .next = clusterip_seq_next, + .stop = clusterip_seq_stop, + .show = clusterip_seq_show, +}; + +static int clusterip_proc_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &clusterip_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + struct proc_dir_entry *pde = PDE(inode); + struct clusterip_config *c = pde->data; + + sf->private = pde; + + clusterip_config_get(c); + } + + return ret; +} + +static int clusterip_proc_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct clusterip_config *c = pde->data; + int ret; + + ret = seq_release(inode, file); + + if (!ret) + clusterip_config_put(c); + + return ret; +} + +static ssize_t clusterip_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *ofs) +{ +#define PROC_WRITELEN 10 + char buffer[PROC_WRITELEN+1]; + struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode); + struct clusterip_config *c = pde->data; + unsigned long nodenum; + + if (copy_from_user(buffer, input, PROC_WRITELEN)) + return -EFAULT; + + if (*buffer == '+') { + nodenum = simple_strtoul(buffer+1, NULL, 10); + if (clusterip_add_node(c, nodenum)) + return -ENOMEM; + } else if (*buffer == '-') { + nodenum = simple_strtoul(buffer+1, NULL,10); + if (clusterip_del_node(c, nodenum)) + return -ENOENT; + } else + return -EIO; + + return size; +} + +static struct file_operations clusterip_proc_fops = { + .owner = THIS_MODULE, + .open = clusterip_proc_open, + .read = seq_read, + .write = clusterip_proc_write, + .llseek = seq_lseek, + .release = clusterip_proc_release, +}; + +#endif /* CONFIG_PROC_FS */ + +static int init_or_cleanup(int fini) +{ + int ret; + + if (fini) + goto cleanup; + + if (ipt_register_target(&clusterip_tgt)) { + ret = -EINVAL; + goto cleanup_none; + } + + if (nf_register_hook(&cip_arp_ops) < 0) { + ret = -EINVAL; + goto cleanup_target; + } + +#ifdef CONFIG_PROC_FS + clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net); + if (!clusterip_procdir) { + printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n"); + ret = -ENOMEM; + goto cleanup_hook; + } +#endif /* CONFIG_PROC_FS */ + + printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n", + CLUSTERIP_VERSION); + + return 0; + +cleanup: + printk(KERN_NOTICE "ClusterIP Version %s unloading\n", + CLUSTERIP_VERSION); +#ifdef CONFIG_PROC_FS + remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent); +#endif +cleanup_hook: + nf_unregister_hook(&cip_arp_ops); +cleanup_target: + ipt_unregister_target(&clusterip_tgt); +cleanup_none: + return -EINVAL; +} + +static int __init init(void) +{ + return init_or_cleanup(0); +} + +static void __exit fini(void) +{ + init_or_cleanup(1); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c new file mode 100644 index 000000000000..30ddd3e18eb7 --- /dev/null +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -0,0 +1,118 @@ +/* This kernel module is used to modify the connection mark values, or + * to optionally restore the skb nfmark from the connection mark + * + * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * by Henrik Nordstrom <hno@marasystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>"); +MODULE_DESCRIPTION("IP tables CONNMARK matching module"); +MODULE_LICENSE("GPL"); + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_CONNMARK.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_connmark_target_info *markinfo = targinfo; + unsigned long diff; + unsigned long nfmark; + unsigned long newmark; + + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); + if (ct) { + switch(markinfo->mode) { + case IPT_CONNMARK_SET: + newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; + if (newmark != ct->mark) + ct->mark = newmark; + break; + case IPT_CONNMARK_SAVE: + newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); + if (ct->mark != newmark) + ct->mark = newmark; + break; + case IPT_CONNMARK_RESTORE: + nfmark = (*pskb)->nfmark; + diff = (ct->mark ^ nfmark) & markinfo->mask; + if (diff != 0) { + (*pskb)->nfmark = nfmark ^ diff; + (*pskb)->nfcache |= NFC_ALTERED; + } + break; + } + } + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_connmark_target_info *matchinfo = targinfo; + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_connmark_target_info))) { + printk(KERN_WARNING "CONNMARK: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_connmark_target_info))); + return 0; + } + + if (matchinfo->mode == IPT_CONNMARK_RESTORE) { + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "CONNMARK: restore can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + } + + return 1; +} + +static struct ipt_target ipt_connmark_reg = { + .name = "CONNMARK", + .target = &target, + .checkentry = &checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_connmark_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_connmark_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c new file mode 100644 index 000000000000..3ea4509099f9 --- /dev/null +++ b/net/ipv4/netfilter/ipt_DSCP.c @@ -0,0 +1,106 @@ +/* iptables module for setting the IPv4 DSCP field, Version 1.8 + * + * (C) 2002 by Harald Welte <laforge@netfilter.org> + * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * See RFC2474 for a description of the DSCP field within the IP Header. + * + * ipt_DSCP.c,v 1.8 2002/08/06 18:41:57 laforge Exp +*/ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_DSCP.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("iptables DSCP modification module"); +MODULE_LICENSE("GPL"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_DSCP_info *dinfo = targinfo; + u_int8_t sh_dscp = ((dinfo->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK); + + + if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { + u_int16_t diffs[2]; + + if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + return NF_DROP; + + diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; + (*pskb)->nh.iph->tos = ((*pskb)->nh.iph->tos & ~IPT_DSCP_MASK) + | sh_dscp; + diffs[1] = htons((*pskb)->nh.iph->tos); + (*pskb)->nh.iph->check + = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + (*pskb)->nh.iph->check + ^ 0xFFFF)); + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const u_int8_t dscp = ((struct ipt_DSCP_info *)targinfo)->dscp; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_DSCP_info))) { + printk(KERN_WARNING "DSCP: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_DSCP_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "DSCP: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if ((dscp > IPT_DSCP_MAX)) { + printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_dscp_reg = { + .name = "DSCP", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_dscp_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_dscp_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c new file mode 100644 index 000000000000..ada9911118e9 --- /dev/null +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -0,0 +1,175 @@ +/* iptables module for the IPv4 and TCP ECN bits, Version 1.5 + * + * (C) 2002 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * ipt_ECN.c,v 1.5 2002/08/18 19:36:51 laforge Exp +*/ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_ECN.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("iptables ECN modification module"); + +/* set ECT codepoint from IP header. + * return 0 if there was an error. */ +static inline int +set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) +{ + if (((*pskb)->nh.iph->tos & IPT_ECN_IP_MASK) + != (einfo->ip_ect & IPT_ECN_IP_MASK)) { + u_int16_t diffs[2]; + + if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + return 0; + + diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; + (*pskb)->nh.iph->tos &= ~IPT_ECN_IP_MASK; + (*pskb)->nh.iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK); + diffs[1] = htons((*pskb)->nh.iph->tos); + (*pskb)->nh.iph->check + = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + (*pskb)->nh.iph->check + ^0xFFFF)); + (*pskb)->nfcache |= NFC_ALTERED; + } + return 1; +} + +/* Return 0 if there was an error. */ +static inline int +set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) +{ + struct tcphdr _tcph, *tcph; + u_int16_t diffs[2]; + + /* Not enought header? */ + tcph = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (!tcph) + return 0; + + if (!(einfo->operation & IPT_ECN_OP_SET_ECE + || tcph->ece == einfo->proto.tcp.ece) + && (!(einfo->operation & IPT_ECN_OP_SET_CWR + || tcph->cwr == einfo->proto.tcp.cwr))) + return 1; + + if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + return 0; + tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; + + diffs[0] = ((u_int16_t *)tcph)[6]; + if (einfo->operation & IPT_ECN_OP_SET_ECE) + tcph->ece = einfo->proto.tcp.ece; + if (einfo->operation & IPT_ECN_OP_SET_CWR) + tcph->cwr = einfo->proto.tcp.cwr; + diffs[1] = ((u_int16_t *)tcph)[6]; + diffs[0] = diffs[0] ^ 0xFFFF; + + if ((*pskb)->ip_summed != CHECKSUM_HW) + tcph->check = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + tcph->check^0xFFFF)); + else + if (skb_checksum_help(*pskb, inward)) + return 0; + (*pskb)->nfcache |= NFC_ALTERED; + return 1; +} + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_ECN_info *einfo = targinfo; + + if (einfo->operation & IPT_ECN_OP_SET_IP) + if (!set_ect_ip(pskb, einfo)) + return NF_DROP; + + if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) + && (*pskb)->nh.iph->protocol == IPPROTO_TCP) + if (!set_ect_tcp(pskb, einfo, (out == NULL))) + return NF_DROP; + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ECN_info))) { + printk(KERN_WARNING "ECN: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_ECN_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "ECN: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (einfo->operation & IPT_ECN_OP_MASK) { + printk(KERN_WARNING "ECN: unsupported ECN operation %x\n", + einfo->operation); + return 0; + } + if (einfo->ip_ect & ~IPT_ECN_IP_MASK) { + printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n", + einfo->ip_ect); + return 0; + } + + if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) + && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & IPT_INV_PROTO))) { + printk(KERN_WARNING "ECN: cannot use TCP operations on a " + "non-tcp rule\n"); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_ecn_reg = { + .name = "ECN", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_ecn_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_ecn_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c new file mode 100644 index 000000000000..ef08733d26da --- /dev/null +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -0,0 +1,485 @@ +/* + * This is a module which is used for logging packets. + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_LOG.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables syslog logging module"); + +static unsigned int nflog = 1; +module_param(nflog, int, 0400); +MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Use lock to serialize, so printks don't overlap */ +static DEFINE_SPINLOCK(log_lock); + +/* One level of recursion won't kill us */ +static void dump_packet(const struct ipt_log_info *info, + const struct sk_buff *skb, + unsigned int iphoff) +{ + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { + printk("TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ + printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", + NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) + printk("CE "); + if (ntohs(ih->frag_off) & IP_DF) + printk("DF "); + if (ntohs(ih->frag_off) & IP_MF) + printk("MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) + printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((info->logflags & IPT_LOG_IPOPT) + && ih->ihl * 4 > sizeof(struct iphdr)) { + unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; + unsigned int i, optsize; + + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { + printk("TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + printk("OPT ("); + for (i = 0; i < optsize; i++) + printk("%02X", op[i]); + printk(") "); + } + + switch (ih->protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + /* Max length: 10 "PROTO=TCP " */ + printk("PROTO=TCP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + printk("SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (info->logflags & IPT_LOG_TCPSEQ) + printk("SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + /* Max length: 13 "WINDOW=65535 " */ + printk("WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3F " */ + printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + printk("CWR "); + if (th->ece) + printk("ECE "); + if (th->urg) + printk("URG "); + if (th->ack) + printk("ACK "); + if (th->psh) + printk("PSH "); + if (th->rst) + printk("RST "); + if (th->syn) + printk("SYN "); + if (th->fin) + printk("FIN "); + /* Max length: 11 "URGP=65535 " */ + printk("URGP=%u ", ntohs(th->urg_ptr)); + + if ((info->logflags & IPT_LOG_TCPOPT) + && th->doff * 4 > sizeof(struct tcphdr)) { + unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; + unsigned char *op; + unsigned int i, optsize; + + optsize = th->doff * 4 - sizeof(struct tcphdr); + op = skb_header_pointer(skb, + iphoff+ih->ihl*4+sizeof(_tcph), + optsize, _opt); + if (op == NULL) { + printk("TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + printk("OPT ("); + for (i = 0; i < optsize; i++) + printk("%02X", op[i]); + printk(") "); + } + break; + } + case IPPROTO_UDP: { + struct udphdr _udph, *uh; + + /* Max length: 10 "PROTO=UDP " */ + printk("PROTO=UDP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + printk("SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); + break; + } + case IPPROTO_ICMP: { + struct icmphdr _icmph, *ich; + static size_t required_len[NR_ICMP_TYPES+1] + = { [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] + = 8 + sizeof(struct iphdr), + [ICMP_SOURCE_QUENCH] + = 8 + sizeof(struct iphdr), + [ICMP_REDIRECT] + = 8 + sizeof(struct iphdr), + [ICMP_ECHO] = 4, + [ICMP_TIME_EXCEEDED] + = 8 + sizeof(struct iphdr), + [ICMP_PARAMETERPROB] + = 8 + sizeof(struct iphdr), + [ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ + printk("PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + printk("TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES + && required_len[ich->type] + && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + switch (ich->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + printk("ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + printk("PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + printk("GATEWAY=%u.%u.%u.%u ", + NIPQUAD(ich->un.gateway)); + /* Fall through */ + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. */ + printk("["); + dump_packet(info, skb, + iphoff + ih->ihl*4+sizeof(_icmph)); + printk("] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH + && ich->code == ICMP_FRAG_NEEDED) + printk("MTU=%u ", ntohs(ich->un.frag.mtu)); + } + break; + } + /* Max Length */ + case IPPROTO_AH: { + struct ip_auth_hdr _ahdr, *ah; + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 9 "PROTO=AH " */ + printk("PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + printk("SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + struct ip_esp_hdr _esph, *eh; + + /* Max length: 10 "PROTO=ESP " */ + printk("PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + printk("SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + printk("PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) + printk("UID=%u ", skb->sk->sk_socket->file->f_uid); + read_unlock_bh(&skb->sk->sk_callback_lock); + } + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* UDP: 10+max(25,20) = 35 */ + /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one level deep) */ + /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 252 = 803 */ +} + +static void +ipt_log_packet(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct ipt_log_info *loginfo, + const char *level_string, + const char *prefix) +{ + spin_lock_bh(&log_lock); + printk(level_string); + printk("%sIN=%s OUT=%s ", + prefix == NULL ? loginfo->prefix : prefix, + in ? in->name : "", + out ? out->name : ""); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + struct net_device *physindev = skb->nf_bridge->physindev; + struct net_device *physoutdev = skb->nf_bridge->physoutdev; + + if (physindev && in != physindev) + printk("PHYSIN=%s ", physindev->name); + if (physoutdev && out != physoutdev) + printk("PHYSOUT=%s ", physoutdev->name); + } +#endif + + if (in && !out) { + /* MAC logging for input chain only. */ + printk("MAC="); + if (skb->dev && skb->dev->hard_header_len + && skb->mac.raw != (void*)skb->nh.iph) { + int i; + unsigned char *p = skb->mac.raw; + for (i = 0; i < skb->dev->hard_header_len; i++,p++) + printk("%02x%c", *p, + i==skb->dev->hard_header_len - 1 + ? ' ':':'); + } else + printk(" "); + } + + dump_packet(loginfo, skb, 0); + printk("\n"); + spin_unlock_bh(&log_lock); +} + +static unsigned int +ipt_log_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_log_info *loginfo = targinfo; + char level_string[4] = "< >"; + + level_string[1] = '0' + (loginfo->level % 8); + ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); + + return IPT_CONTINUE; +} + +static void +ipt_logfn(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ + struct ipt_log_info loginfo = { + .level = 0, + .logflags = IPT_LOG_MASK, + .prefix = "" + }; + + ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); +} + +static int ipt_log_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ipt_log_info *loginfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_log_info))) { + DEBUGP("LOG: targinfosize %u != %u\n", + targinfosize, IPT_ALIGN(sizeof(struct ipt_log_info))); + return 0; + } + + if (loginfo->level >= 8) { + DEBUGP("LOG: level %u >= 8\n", loginfo->level); + return 0; + } + + if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { + DEBUGP("LOG: prefix term %i\n", + loginfo->prefix[sizeof(loginfo->prefix)-1]); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_log_reg = { + .name = "LOG", + .target = ipt_log_target, + .checkentry = ipt_log_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (ipt_register_target(&ipt_log_reg)) + return -EINVAL; + if (nflog) + nf_log_register(PF_INET, &ipt_logfn); + + return 0; +} + +static void __exit fini(void) +{ + if (nflog) + nf_log_unregister(PF_INET, &ipt_logfn); + ipt_unregister_target(&ipt_log_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c new file mode 100644 index 000000000000..33c6f9b63b8d --- /dev/null +++ b/net/ipv4/netfilter/ipt_MARK.c @@ -0,0 +1,162 @@ +/* This is a module which is used for setting the NFMARK field of an skb. */ + +/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_MARK.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +MODULE_DESCRIPTION("iptables MARK modification module"); + +static unsigned int +target_v0(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_mark_target_info *markinfo = targinfo; + + if((*pskb)->nfmark != markinfo->mark) { + (*pskb)->nfmark = markinfo->mark; + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + +static unsigned int +target_v1(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_mark_target_info_v1 *markinfo = targinfo; + int mark = 0; + + switch (markinfo->mode) { + case IPT_MARK_SET: + mark = markinfo->mark; + break; + + case IPT_MARK_AND: + mark = (*pskb)->nfmark & markinfo->mark; + break; + + case IPT_MARK_OR: + mark = (*pskb)->nfmark | markinfo->mark; + break; + } + + if((*pskb)->nfmark != mark) { + (*pskb)->nfmark = mark; + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + + +static int +checkentry_v0(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { + printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_mark_target_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + return 1; +} + +static int +checkentry_v1(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_mark_target_info_v1 *markinfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))){ + printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (markinfo->mode != IPT_MARK_SET + && markinfo->mode != IPT_MARK_AND + && markinfo->mode != IPT_MARK_OR) { + printk(KERN_WARNING "MARK: unknown mode %u\n", + markinfo->mode); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_mark_reg_v0 = { + .name = "MARK", + .target = target_v0, + .checkentry = checkentry_v0, + .me = THIS_MODULE, + .revision = 0, +}; + +static struct ipt_target ipt_mark_reg_v1 = { + .name = "MARK", + .target = target_v1, + .checkentry = checkentry_v1, + .me = THIS_MODULE, + .revision = 1, +}; + +static int __init init(void) +{ + int err; + + err = ipt_register_target(&ipt_mark_reg_v0); + if (!err) { + err = ipt_register_target(&ipt_mark_reg_v1); + if (err) + ipt_unregister_target(&ipt_mark_reg_v0); + } + return err; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_mark_reg_v0); + ipt_unregister_target(&ipt_mark_reg_v1); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c new file mode 100644 index 000000000000..57e9f6cf1c36 --- /dev/null +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -0,0 +1,207 @@ +/* Masquerade. Simple mapping which alters range to a local IP address + (depending on route). */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <net/protocol.h> +#include <net/ip.h> +#include <net/checksum.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables MASQUERADE target module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Lock protects masq region inside conntrack */ +static DECLARE_RWLOCK(masq_lock); + +/* FIXME: Multiple targets. --RR */ +static int +masquerade_check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip_nat_multi_range_compat *mr = targinfo; + + if (strcmp(tablename, "nat") != 0) { + DEBUGP("masquerade_check: bad table `%s'.\n", tablename); + return 0; + } + if (targinfosize != IPT_ALIGN(sizeof(*mr))) { + DEBUGP("masquerade_check: size %u != %u.\n", + targinfosize, sizeof(*mr)); + return 0; + } + if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) { + DEBUGP("masquerade_check: bad hooks %x.\n", hook_mask); + return 0; + } + if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { + DEBUGP("masquerade_check: bad MAP_IPS.\n"); + return 0; + } + if (mr->rangesize != 1) { + DEBUGP("masquerade_check: bad rangesize %u.\n", mr->rangesize); + return 0; + } + return 1; +} + +static unsigned int +masquerade_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + const struct ip_nat_multi_range_compat *mr; + struct ip_nat_range newrange; + struct rtable *rt; + u_int32_t newsrc; + + IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); + + /* FIXME: For the moment, don't do local packets, breaks + testsuite for 2.3.49 --RR */ + if ((*pskb)->sk) + return NF_ACCEPT; + + ct = ip_conntrack_get(*pskb, &ctinfo); + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED + || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); + + mr = targinfo; + rt = (struct rtable *)(*pskb)->dst; + newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE); + if (!newsrc) { + printk("MASQUERADE: %s ate my IP address\n", out->name); + return NF_DROP; + } + + WRITE_LOCK(&masq_lock); + ct->nat.masq_index = out->ifindex; + WRITE_UNLOCK(&masq_lock); + + /* Transfer from original range. */ + newrange = ((struct ip_nat_range) + { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, + newsrc, newsrc, + mr->range[0].min, mr->range[0].max }); + + /* Hand modified range to generic setup. */ + return ip_nat_setup_info(ct, &newrange, hooknum); +} + +static inline int +device_cmp(struct ip_conntrack *i, void *ifindex) +{ + int ret; + + READ_LOCK(&masq_lock); + ret = (i->nat.masq_index == (int)(long)ifindex); + READ_UNLOCK(&masq_lock); + + return ret; +} + +static int masq_device_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + + if (event == NETDEV_DOWN) { + /* Device was downed. Search entire table for + conntracks which were associated with that device, + and forget them. */ + IP_NF_ASSERT(dev->ifindex != 0); + + ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); + } + + return NOTIFY_DONE; +} + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + + if (event == NETDEV_DOWN) { + /* IP address was deleted. Search entire table for + conntracks which were associated with that device, + and forget them. */ + IP_NF_ASSERT(dev->ifindex != 0); + + ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); + } + + return NOTIFY_DONE; +} + +static struct notifier_block masq_dev_notifier = { + .notifier_call = masq_device_event, +}; + +static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, +}; + +static struct ipt_target masquerade = { + .name = "MASQUERADE", + .target = masquerade_target, + .checkentry = masquerade_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int ret; + + ret = ipt_register_target(&masquerade); + + if (ret == 0) { + /* Register for device down reports */ + register_netdevice_notifier(&masq_dev_notifier); + /* Register IP address change reports */ + register_inetaddr_notifier(&masq_inet_notifier); + } + + return ret; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&masquerade); + unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c new file mode 100644 index 000000000000..06254b29d034 --- /dev/null +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -0,0 +1,117 @@ +/* NETMAP - static NAT mapping of IP network addresses (1:1). + * The mapping can be applied to source (POSTROUTING), + * destination (PREROUTING), or both (with separate rules). + */ + +/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/ip.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> + +#define MODULENAME "NETMAP" +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>"); +MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int +check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip_nat_multi_range_compat *mr = targinfo; + + if (strcmp(tablename, "nat") != 0) { + DEBUGP(MODULENAME":check: bad table `%s'.\n", tablename); + return 0; + } + if (targinfosize != IPT_ALIGN(sizeof(*mr))) { + DEBUGP(MODULENAME":check: size %u.\n", targinfosize); + return 0; + } + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) { + DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask); + return 0; + } + if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { + DEBUGP(MODULENAME":check: bad MAP_IPS.\n"); + return 0; + } + if (mr->rangesize != 1) { + DEBUGP(MODULENAME":check: bad rangesize %u.\n", mr->rangesize); + return 0; + } + return 1; +} + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + u_int32_t new_ip, netmask; + const struct ip_nat_multi_range_compat *mr = targinfo; + struct ip_nat_range newrange; + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + || hooknum == NF_IP_POST_ROUTING); + ct = ip_conntrack_get(*pskb, &ctinfo); + + netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); + + if (hooknum == NF_IP_PRE_ROUTING) + new_ip = (*pskb)->nh.iph->daddr & ~netmask; + else + new_ip = (*pskb)->nh.iph->saddr & ~netmask; + new_ip |= mr->range[0].min_ip & netmask; + + newrange = ((struct ip_nat_range) + { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, + new_ip, new_ip, + mr->range[0].min, mr->range[0].max }); + + /* Hand modified range to generic setup. */ + return ip_nat_setup_info(ct, &newrange, hooknum); +} + +static struct ipt_target target_module = { + .name = MODULENAME, + .target = target, + .checkentry = check, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_target(&target_module); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&target_module); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c new file mode 100644 index 000000000000..a4bb9b3bc292 --- /dev/null +++ b/net/ipv4/netfilter/ipt_NOTRACK.c @@ -0,0 +1,76 @@ +/* This is a module which is used for setting up fake conntracks + * on packets so that they are not seen by the conntrack/NAT code. + */ +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + /* Previously seen (loopback)? Ignore. */ + if ((*pskb)->nfct != NULL) + return IPT_CONTINUE; + + /* Attach fake conntrack entry. + If there is a real ct entry correspondig to this packet, + it'll hang aroun till timing out. We don't deal with it + for performance reasons. JK */ + (*pskb)->nfct = &ip_conntrack_untracked.ct_general; + (*pskb)->nfctinfo = IP_CT_NEW; + nf_conntrack_get((*pskb)->nfct); + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != 0) { + printk(KERN_WARNING "NOTRACK: targinfosize %u != 0\n", + targinfosize); + return 0; + } + + if (strcmp(tablename, "raw") != 0) { + printk(KERN_WARNING "NOTRACK: can only be called from \"raw\" table, not \"%s\"\n", tablename); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_notrack_reg = { + .name = "NOTRACK", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + if (ipt_register_target(&ipt_notrack_reg)) + return -EINVAL; + + return 0; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_notrack_reg); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c new file mode 100644 index 000000000000..d2e13447678e --- /dev/null +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -0,0 +1,129 @@ +/* Redirect. Simple mapping which alters dst to a local IP address. */ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <linux/inetdevice.h> +#include <net/protocol.h> +#include <net/checksum.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables REDIRECT target module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* FIXME: Take multiple ranges --RR */ +static int +redirect_check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip_nat_multi_range_compat *mr = targinfo; + + if (strcmp(tablename, "nat") != 0) { + DEBUGP("redirect_check: bad table `%s'.\n", table); + return 0; + } + if (targinfosize != IPT_ALIGN(sizeof(*mr))) { + DEBUGP("redirect_check: size %u.\n", targinfosize); + return 0; + } + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) { + DEBUGP("redirect_check: bad hooks %x.\n", hook_mask); + return 0; + } + if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { + DEBUGP("redirect_check: bad MAP_IPS.\n"); + return 0; + } + if (mr->rangesize != 1) { + DEBUGP("redirect_check: bad rangesize %u.\n", mr->rangesize); + return 0; + } + return 1; +} + +static unsigned int +redirect_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + u_int32_t newdst; + const struct ip_nat_multi_range_compat *mr = targinfo; + struct ip_nat_range newrange; + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + || hooknum == NF_IP_LOCAL_OUT); + + ct = ip_conntrack_get(*pskb, &ctinfo); + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + /* Local packets: make them go to loopback */ + if (hooknum == NF_IP_LOCAL_OUT) + newdst = htonl(0x7F000001); + else { + struct in_device *indev; + + /* Device might not have an associated in_device. */ + indev = (struct in_device *)(*pskb)->dev->ip_ptr; + if (indev == NULL || indev->ifa_list == NULL) + return NF_DROP; + + /* Grab first address on interface. */ + newdst = indev->ifa_list->ifa_local; + } + + /* Transfer from original range. */ + newrange = ((struct ip_nat_range) + { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, + newdst, newdst, + mr->range[0].min, mr->range[0].max }); + + /* Hand modified range to generic setup. */ + return ip_nat_setup_info(ct, &newrange, hooknum); +} + +static struct ipt_target redirect_reg = { + .name = "REDIRECT", + .target = redirect_target, + .checkentry = redirect_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&redirect_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&redirect_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c new file mode 100644 index 000000000000..266d64979286 --- /dev/null +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -0,0 +1,335 @@ +/* + * This is a module which is used for rejecting packets. + * Added support for customized reject packets (Jozsef Kadlecsik). + * Added support for ICMP type-3-code-13 (Maciej Soltysiak). [RFC 1812] + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/tcp.h> +#include <net/route.h> +#include <net/dst.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_REJECT.h> +#ifdef CONFIG_BRIDGE_NETFILTER +#include <linux/netfilter_bridge.h> +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables REJECT target module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static inline struct rtable *route_reverse(struct sk_buff *skb, + struct tcphdr *tcph, int hook) +{ + struct iphdr *iph = skb->nh.iph; + struct dst_entry *odst; + struct flowi fl = {}; + struct rtable *rt; + + /* We don't require ip forwarding to be enabled to be able to + * send a RST reply for bridged traffic. */ + if (hook != NF_IP_FORWARD +#ifdef CONFIG_BRIDGE_NETFILTER + || (skb->nf_bridge && skb->nf_bridge->mask & BRNF_BRIDGED) +#endif + ) { + fl.nl_u.ip4_u.daddr = iph->saddr; + if (hook == NF_IP_LOCAL_IN) + fl.nl_u.ip4_u.saddr = iph->daddr; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); + + if (ip_route_output_key(&rt, &fl) != 0) + return NULL; + } else { + /* non-local src, find valid iif to satisfy + * rp-filter when calling ip_route_input. */ + fl.nl_u.ip4_u.daddr = iph->daddr; + if (ip_route_output_key(&rt, &fl) != 0) + return NULL; + + odst = skb->dst; + if (ip_route_input(skb, iph->saddr, iph->daddr, + RT_TOS(iph->tos), rt->u.dst.dev) != 0) { + dst_release(&rt->u.dst); + return NULL; + } + dst_release(&rt->u.dst); + rt = (struct rtable *)skb->dst; + skb->dst = odst; + + fl.nl_u.ip4_u.daddr = iph->saddr; + fl.nl_u.ip4_u.saddr = iph->daddr; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); + } + + if (rt->u.dst.error) { + dst_release(&rt->u.dst); + return NULL; + } + + fl.proto = IPPROTO_TCP; + fl.fl_ip_sport = tcph->dest; + fl.fl_ip_dport = tcph->source; + + if (xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0)) { + dst_release(&rt->u.dst); + rt = NULL; + } + + return rt; +} + +/* Send RST reply */ +static void send_reset(struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + struct tcphdr _otcph, *oth, *tcph; + struct rtable *rt; + u_int16_t tmp_port; + u_int32_t tmp_addr; + int needs_ack; + int hh_len; + + /* IP header checks: fragment. */ + if (oldskb->nh.iph->frag_off & htons(IP_OFFSET)) + return; + + oth = skb_header_pointer(oldskb, oldskb->nh.iph->ihl * 4, + sizeof(_otcph), &_otcph); + if (oth == NULL) + return; + + /* No RST for RST. */ + if (oth->rst) + return; + + /* FIXME: Check checksum --RR */ + if ((rt = route_reverse(oldskb, oth, hook)) == NULL) + return; + + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + + /* We need a linear, writeable skb. We also need to expand + headroom in case hh_len of incoming interface < hh_len of + outgoing interface */ + nskb = skb_copy_expand(oldskb, hh_len, skb_tailroom(oldskb), + GFP_ATOMIC); + if (!nskb) { + dst_release(&rt->u.dst); + return; + } + + dst_release(nskb->dst); + nskb->dst = &rt->u.dst; + + /* This packet will not be the same as the other: clear nf fields */ + nf_reset(nskb); + nskb->nfcache = 0; + nskb->nfmark = 0; +#ifdef CONFIG_BRIDGE_NETFILTER + nf_bridge_put(nskb->nf_bridge); + nskb->nf_bridge = NULL; +#endif + + tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); + + /* Swap source and dest */ + tmp_addr = nskb->nh.iph->saddr; + nskb->nh.iph->saddr = nskb->nh.iph->daddr; + nskb->nh.iph->daddr = tmp_addr; + tmp_port = tcph->source; + tcph->source = tcph->dest; + tcph->dest = tmp_port; + + /* Truncate to length (no data) */ + tcph->doff = sizeof(struct tcphdr)/4; + skb_trim(nskb, nskb->nh.iph->ihl*4 + sizeof(struct tcphdr)); + nskb->nh.iph->tot_len = htons(nskb->len); + + if (tcph->ack) { + needs_ack = 0; + tcph->seq = oth->ack_seq; + tcph->ack_seq = 0; + } else { + needs_ack = 1; + tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + + oldskb->len - oldskb->nh.iph->ihl*4 + - (oth->doff<<2)); + tcph->seq = 0; + } + + /* Reset flags */ + ((u_int8_t *)tcph)[13] = 0; + tcph->rst = 1; + tcph->ack = needs_ack; + + tcph->window = 0; + tcph->urg_ptr = 0; + + /* Adjust TCP checksum */ + tcph->check = 0; + tcph->check = tcp_v4_check(tcph, sizeof(struct tcphdr), + nskb->nh.iph->saddr, + nskb->nh.iph->daddr, + csum_partial((char *)tcph, + sizeof(struct tcphdr), 0)); + + /* Adjust IP TTL, DF */ + nskb->nh.iph->ttl = MAXTTL; + /* Set DF, id = 0 */ + nskb->nh.iph->frag_off = htons(IP_DF); + nskb->nh.iph->id = 0; + + /* Adjust IP checksum */ + nskb->nh.iph->check = 0; + nskb->nh.iph->check = ip_fast_csum((unsigned char *)nskb->nh.iph, + nskb->nh.iph->ihl); + + /* "Never happens" */ + if (nskb->len > dst_mtu(nskb->dst)) + goto free_nskb; + + nf_ct_attach(nskb, oldskb); + + NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev, + dst_output); + return; + + free_nskb: + kfree_skb(nskb); +} + +static inline void send_unreach(struct sk_buff *skb_in, int code) +{ + icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); +} + +static unsigned int reject(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_reject_info *reject = targinfo; + + /* Our naive response construction doesn't deal with IP + options, and probably shouldn't try. */ + if ((*pskb)->nh.iph->ihl<<2 != sizeof(struct iphdr)) + return NF_DROP; + + /* WARNING: This code causes reentry within iptables. + This means that the iptables jump stack is now crap. We + must return an absolute verdict. --RR */ + switch (reject->with) { + case IPT_ICMP_NET_UNREACHABLE: + send_unreach(*pskb, ICMP_NET_UNREACH); + break; + case IPT_ICMP_HOST_UNREACHABLE: + send_unreach(*pskb, ICMP_HOST_UNREACH); + break; + case IPT_ICMP_PROT_UNREACHABLE: + send_unreach(*pskb, ICMP_PROT_UNREACH); + break; + case IPT_ICMP_PORT_UNREACHABLE: + send_unreach(*pskb, ICMP_PORT_UNREACH); + break; + case IPT_ICMP_NET_PROHIBITED: + send_unreach(*pskb, ICMP_NET_ANO); + break; + case IPT_ICMP_HOST_PROHIBITED: + send_unreach(*pskb, ICMP_HOST_ANO); + break; + case IPT_ICMP_ADMIN_PROHIBITED: + send_unreach(*pskb, ICMP_PKT_FILTERED); + break; + case IPT_TCP_RESET: + send_reset(*pskb, hooknum); + case IPT_ICMP_ECHOREPLY: + /* Doesn't happen. */ + break; + } + + return NF_DROP; +} + +static int check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ipt_reject_info *rejinfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_reject_info))) { + DEBUGP("REJECT: targinfosize %u != 0\n", targinfosize); + return 0; + } + + /* Only allow these for packet filtering. */ + if (strcmp(tablename, "filter") != 0) { + DEBUGP("REJECT: bad table `%s'.\n", tablename); + return 0; + } + if ((hook_mask & ~((1 << NF_IP_LOCAL_IN) + | (1 << NF_IP_FORWARD) + | (1 << NF_IP_LOCAL_OUT))) != 0) { + DEBUGP("REJECT: bad hook mask %X\n", hook_mask); + return 0; + } + + if (rejinfo->with == IPT_ICMP_ECHOREPLY) { + printk("REJECT: ECHOREPLY no longer supported.\n"); + return 0; + } else if (rejinfo->with == IPT_TCP_RESET) { + /* Must specify that it's a TCP packet */ + if (e->ip.proto != IPPROTO_TCP + || (e->ip.invflags & IPT_INV_PROTO)) { + DEBUGP("REJECT: TCP_RESET invalid for non-tcp\n"); + return 0; + } + } + + return 1; +} + +static struct ipt_target ipt_reject_reg = { + .name = "REJECT", + .target = reject, + .checkentry = check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_reject_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_reject_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c new file mode 100644 index 000000000000..7a0536d864ac --- /dev/null +++ b/net/ipv4/netfilter/ipt_SAME.c @@ -0,0 +1,211 @@ +/* Same. Just like SNAT, only try to make the connections + * between client A and server B always have the same source ip. + * + * (C) 2000 Paul `Rusty' Russell + * (C) 2001 Martin Josefsson + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 010320 Martin Josefsson <gandalf@wlug.westbo.se> + * * copied ipt_BALANCE.c to ipt_SAME.c and changed a few things. + * 010728 Martin Josefsson <gandalf@wlug.westbo.se> + * * added --nodst to not include destination-ip in new source + * calculations. + * * added some more sanity-checks. + * 010729 Martin Josefsson <gandalf@wlug.westbo.se> + * * fixed a buggy if-statement in same_check(), should have + * used ntohl() but didn't. + * * added support for multiple ranges. IPT_SAME_MAX_RANGE is + * defined in linux/include/linux/netfilter_ipv4/ipt_SAME.h + * and is currently set to 10. + * * added support for 1-address range, nice to have now that + * we have multiple ranges. + */ +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <linux/inetdevice.h> +#include <net/protocol.h> +#include <net/checksum.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ipt_SAME.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Josefsson <gandalf@wlug.westbo.se>"); +MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int +same_check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + unsigned int count, countess, rangeip, index = 0; + struct ipt_same_info *mr = targinfo; + + mr->ipnum = 0; + + if (strcmp(tablename, "nat") != 0) { + DEBUGP("same_check: bad table `%s'.\n", tablename); + return 0; + } + if (targinfosize != IPT_ALIGN(sizeof(*mr))) { + DEBUGP("same_check: size %u.\n", targinfosize); + return 0; + } + if (hook_mask & ~(1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING)) { + DEBUGP("same_check: bad hooks %x.\n", hook_mask); + return 0; + } + if (mr->rangesize < 1) { + DEBUGP("same_check: need at least one dest range.\n"); + return 0; + } + if (mr->rangesize > IPT_SAME_MAX_RANGE) { + DEBUGP("same_check: too many ranges specified, maximum " + "is %u ranges\n", + IPT_SAME_MAX_RANGE); + return 0; + } + for (count = 0; count < mr->rangesize; count++) { + if (ntohl(mr->range[count].min_ip) > + ntohl(mr->range[count].max_ip)) { + DEBUGP("same_check: min_ip is larger than max_ip in " + "range `%u.%u.%u.%u-%u.%u.%u.%u'.\n", + NIPQUAD(mr->range[count].min_ip), + NIPQUAD(mr->range[count].max_ip)); + return 0; + } + if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) { + DEBUGP("same_check: bad MAP_IPS.\n"); + return 0; + } + rangeip = (ntohl(mr->range[count].max_ip) - + ntohl(mr->range[count].min_ip) + 1); + mr->ipnum += rangeip; + + DEBUGP("same_check: range %u, ipnum = %u\n", count, rangeip); + } + DEBUGP("same_check: total ipaddresses = %u\n", mr->ipnum); + + mr->iparray = kmalloc((sizeof(u_int32_t) * mr->ipnum), GFP_KERNEL); + if (!mr->iparray) { + DEBUGP("same_check: Couldn't allocate %u bytes " + "for %u ipaddresses!\n", + (sizeof(u_int32_t) * mr->ipnum), mr->ipnum); + return 0; + } + DEBUGP("same_check: Allocated %u bytes for %u ipaddresses.\n", + (sizeof(u_int32_t) * mr->ipnum), mr->ipnum); + + for (count = 0; count < mr->rangesize; count++) { + for (countess = ntohl(mr->range[count].min_ip); + countess <= ntohl(mr->range[count].max_ip); + countess++) { + mr->iparray[index] = countess; + DEBUGP("same_check: Added ipaddress `%u.%u.%u.%u' " + "in index %u.\n", + HIPQUAD(countess), index); + index++; + } + } + return 1; +} + +static void +same_destroy(void *targinfo, + unsigned int targinfosize) +{ + struct ipt_same_info *mr = targinfo; + + kfree(mr->iparray); + + DEBUGP("same_destroy: Deallocated %u bytes for %u ipaddresses.\n", + (sizeof(u_int32_t) * mr->ipnum), mr->ipnum); +} + +static unsigned int +same_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + u_int32_t tmpip, aindex, new_ip; + const struct ipt_same_info *same = targinfo; + struct ip_nat_range newrange; + const struct ip_conntrack_tuple *t; + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING || + hooknum == NF_IP_POST_ROUTING); + ct = ip_conntrack_get(*pskb, &ctinfo); + + t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + + /* Base new source on real src ip and optionally dst ip, + giving some hope for consistency across reboots. + Here we calculate the index in same->iparray which + holds the ipaddress we should use */ + + tmpip = ntohl(t->src.ip); + + if (!(same->info & IPT_SAME_NODST)) + tmpip += ntohl(t->dst.ip); + + aindex = tmpip % same->ipnum; + + new_ip = htonl(same->iparray[aindex]); + + DEBUGP("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, " + "new src=%u.%u.%u.%u\n", + NIPQUAD(t->src.ip), NIPQUAD(t->dst.ip), + NIPQUAD(new_ip)); + + /* Transfer from original range. */ + newrange = ((struct ip_nat_range) + { same->range[0].flags, new_ip, new_ip, + /* FIXME: Use ports from correct range! */ + same->range[0].min, same->range[0].max }); + + /* Hand modified range to generic setup. */ + return ip_nat_setup_info(ct, &newrange, hooknum); +} + +static struct ipt_target same_reg = { + .name = "SAME", + .target = same_target, + .checkentry = same_check, + .destroy = same_destroy, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&same_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&same_reg); +} + +module_init(init); +module_exit(fini); + diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c new file mode 100644 index 000000000000..1049050b2bfb --- /dev/null +++ b/net/ipv4/netfilter/ipt_TCPMSS.c @@ -0,0 +1,262 @@ +/* + * This is a module which is used for setting the MSS option in TCP packets. + * + * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/ip.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_TCPMSS.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +MODULE_DESCRIPTION("iptables TCP MSS modification module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static u_int16_t +cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck) +{ + u_int32_t diffs[] = { oldvalinv, newval }; + return csum_fold(csum_partial((char *)diffs, sizeof(diffs), + oldcheck^0xFFFF)); +} + +static inline unsigned int +optlen(const u_int8_t *opt, unsigned int offset) +{ + /* Beware zero-length options: make finite progress */ + if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) return 1; + else return opt[offset+1]; +} + +static unsigned int +ipt_tcpmss_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_tcpmss_info *tcpmssinfo = targinfo; + struct tcphdr *tcph; + struct iphdr *iph; + u_int16_t tcplen, newtotlen, oldval, newmss; + unsigned int i; + u_int8_t *opt; + + if (!skb_ip_make_writable(pskb, (*pskb)->len)) + return NF_DROP; + + iph = (*pskb)->nh.iph; + tcplen = (*pskb)->len - iph->ihl*4; + + tcph = (void *)iph + iph->ihl*4; + + /* Since it passed flags test in tcp match, we know it is is + not a fragment, and has data >= tcp header length. SYN + packets should not contain data: if they did, then we risk + running over MTU, sending Frag Needed and breaking things + badly. --RR */ + if (tcplen != tcph->doff*4) { + if (net_ratelimit()) + printk(KERN_ERR + "ipt_tcpmss_target: bad length (%d bytes)\n", + (*pskb)->len); + return NF_DROP; + } + + if(tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) { + if(!(*pskb)->dst) { + if (net_ratelimit()) + printk(KERN_ERR + "ipt_tcpmss_target: no dst?! can't determine path-MTU\n"); + return NF_DROP; /* or IPT_CONTINUE ?? */ + } + + if(dst_mtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) { + if (net_ratelimit()) + printk(KERN_ERR + "ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", dst_mtu((*pskb)->dst)); + return NF_DROP; /* or IPT_CONTINUE ?? */ + } + + newmss = dst_mtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr); + } else + newmss = tcpmssinfo->mss; + + opt = (u_int8_t *)tcph; + for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)){ + if ((opt[i] == TCPOPT_MSS) && + ((tcph->doff*4 - i) >= TCPOLEN_MSS) && + (opt[i+1] == TCPOLEN_MSS)) { + u_int16_t oldmss; + + oldmss = (opt[i+2] << 8) | opt[i+3]; + + if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) && + (oldmss <= newmss)) + return IPT_CONTINUE; + + opt[i+2] = (newmss & 0xff00) >> 8; + opt[i+3] = (newmss & 0x00ff); + + tcph->check = cheat_check(htons(oldmss)^0xFFFF, + htons(newmss), + tcph->check); + + DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu" + "->%u.%u.%u.%u:%hu changed TCP MSS option" + " (from %u to %u)\n", + NIPQUAD((*pskb)->nh.iph->saddr), + ntohs(tcph->source), + NIPQUAD((*pskb)->nh.iph->daddr), + ntohs(tcph->dest), + oldmss, newmss); + goto retmodified; + } + } + + /* + * MSS Option not found ?! add it.. + */ + if (skb_tailroom((*pskb)) < TCPOLEN_MSS) { + struct sk_buff *newskb; + + newskb = skb_copy_expand(*pskb, skb_headroom(*pskb), + TCPOLEN_MSS, GFP_ATOMIC); + if (!newskb) { + if (net_ratelimit()) + printk(KERN_ERR "ipt_tcpmss_target:" + " unable to allocate larger skb\n"); + return NF_DROP; + } + + kfree_skb(*pskb); + *pskb = newskb; + iph = (*pskb)->nh.iph; + tcph = (void *)iph + iph->ihl*4; + } + + skb_put((*pskb), TCPOLEN_MSS); + + opt = (u_int8_t *)tcph + sizeof(struct tcphdr); + memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr)); + + tcph->check = cheat_check(htons(tcplen) ^ 0xFFFF, + htons(tcplen + TCPOLEN_MSS), tcph->check); + tcplen += TCPOLEN_MSS; + + opt[0] = TCPOPT_MSS; + opt[1] = TCPOLEN_MSS; + opt[2] = (newmss & 0xff00) >> 8; + opt[3] = (newmss & 0x00ff); + + tcph->check = cheat_check(~0, *((u_int32_t *)opt), tcph->check); + + oldval = ((u_int16_t *)tcph)[6]; + tcph->doff += TCPOLEN_MSS/4; + tcph->check = cheat_check(oldval ^ 0xFFFF, + ((u_int16_t *)tcph)[6], tcph->check); + + newtotlen = htons(ntohs(iph->tot_len) + TCPOLEN_MSS); + iph->check = cheat_check(iph->tot_len ^ 0xFFFF, + newtotlen, iph->check); + iph->tot_len = newtotlen; + + DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu" + "->%u.%u.%u.%u:%hu added TCP MSS option (%u)\n", + NIPQUAD((*pskb)->nh.iph->saddr), + ntohs(tcph->source), + NIPQUAD((*pskb)->nh.iph->daddr), + ntohs(tcph->dest), + newmss); + + retmodified: + /* We never hw checksum SYN packets. */ + BUG_ON((*pskb)->ip_summed == CHECKSUM_HW); + + (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; + return IPT_CONTINUE; +} + +#define TH_SYN 0x02 + +static inline int find_syn_match(const struct ipt_entry_match *m) +{ + const struct ipt_tcp *tcpinfo = (const struct ipt_tcp *)m->data; + + if (strcmp(m->u.kernel.match->name, "tcp") == 0 + && (tcpinfo->flg_cmp & TH_SYN) + && !(tcpinfo->invflags & IPT_TCP_INV_FLAGS)) + return 1; + + return 0; +} + +/* Must specify -p tcp --syn/--tcp-flags SYN */ +static int +ipt_tcpmss_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ipt_tcpmss_info *tcpmssinfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tcpmss_info))) { + DEBUGP("ipt_tcpmss_checkentry: targinfosize %u != %u\n", + targinfosize, IPT_ALIGN(sizeof(struct ipt_tcpmss_info))); + return 0; + } + + + if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) && + ((hook_mask & ~((1 << NF_IP_FORWARD) + | (1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING))) != 0)) { + printk("TCPMSS: path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); + return 0; + } + + if (e->ip.proto == IPPROTO_TCP + && !(e->ip.invflags & IPT_INV_PROTO) + && IPT_MATCH_ITERATE(e, find_syn_match)) + return 1; + + printk("TCPMSS: Only works on TCP SYN packets\n"); + return 0; +} + +static struct ipt_target ipt_tcpmss_reg = { + .name = "TCPMSS", + .target = ipt_tcpmss_target, + .checkentry = ipt_tcpmss_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_tcpmss_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_tcpmss_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c new file mode 100644 index 000000000000..85c70d240f8b --- /dev/null +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -0,0 +1,105 @@ +/* This is a module which is used for setting the TOS field of a packet. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_TOS.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables TOS mangling module"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_tos_target_info *tosinfo = targinfo; + + if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { + u_int16_t diffs[2]; + + if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + return NF_DROP; + + diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; + (*pskb)->nh.iph->tos + = ((*pskb)->nh.iph->tos & IPTOS_PREC_MASK) + | tosinfo->tos; + diffs[1] = htons((*pskb)->nh.iph->tos); + (*pskb)->nh.iph->check + = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + (*pskb)->nh.iph->check + ^0xFFFF)); + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tos_target_info))) { + printk(KERN_WARNING "TOS: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_tos_target_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "TOS: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (tos != IPTOS_LOWDELAY + && tos != IPTOS_THROUGHPUT + && tos != IPTOS_RELIABILITY + && tos != IPTOS_MINCOST + && tos != IPTOS_NORMALSVC) { + printk(KERN_WARNING "TOS: bad tos value %#x\n", tos); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_tos_reg = { + .name = "TOS", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_tos_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_tos_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c new file mode 100644 index 000000000000..6f2cefbe16cd --- /dev/null +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -0,0 +1,419 @@ +/* + * netfilter module for userspace packet logging daemons + * + * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> + * + * 2000/09/22 ulog-cprange feature added + * 2001/01/04 in-kernel queue as proposed by Sebastian Zander + * <zander@fokus.gmd.de> + * 2001/01/30 per-rule nlgroup conflicts with global queue. + * nlgroup now global (sysctl) + * 2001/04/19 ulog-queue reworked, now fixed buffer size specified at + * module loadtime -HW + * 2002/07/07 remove broken nflog_rcv() function -HW + * 2002/08/29 fix shifted/unshifted nlgroup bug -HW + * 2002/10/30 fix uninitialized mac_len field - <Anders K. Pedersen> + * 2004/10/25 fix erroneous calculation of 'len' parameter to NLMSG_PUT + * resulting in bogus 'error during NLMSG_PUT' messages. + * + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This module accepts two parameters: + * + * nlbufsiz: + * The parameter specifies how big the buffer for each netlink multicast + * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will + * get accumulated in the kernel until they are sent to userspace. It is + * NOT possible to allocate more than 128kB, and it is strongly discouraged, + * because atomically allocating 128kB inside the network rx softirq is not + * reliable. Please also keep in mind that this buffer size is allocated for + * each nlgroup you are using, so the total kernel memory usage increases + * by that factor. + * + * flushtimeout: + * Specify, after how many hundredths of a second the queue should be + * flushed even if it is not full yet. + * + * ipt_ULOG.c,v 1.22 2002/10/30 09:07:31 laforge Exp + */ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/spinlock.h> +#include <linux/socket.h> +#include <linux/skbuff.h> +#include <linux/kernel.h> +#include <linux/timer.h> +#include <linux/netlink.h> +#include <linux/netdevice.h> +#include <linux/mm.h> +#include <linux/moduleparam.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_ULOG.h> +#include <linux/netfilter_ipv4/lockhelp.h> +#include <net/sock.h> +#include <linux/bitops.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); +MODULE_DESCRIPTION("iptables userspace logging module"); + +#define ULOG_NL_EVENT 111 /* Harald's favorite number */ +#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ + +#if 0 +#define DEBUGP(format, args...) printk("%s:%s:" format, \ + __FILE__, __FUNCTION__ , ## args) +#else +#define DEBUGP(format, args...) +#endif + +#define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0) + +static unsigned int nlbufsiz = 4096; +module_param(nlbufsiz, uint, 0600); /* FIXME: Check size < 128k --RR */ +MODULE_PARM_DESC(nlbufsiz, "netlink buffer size"); + +static unsigned int flushtimeout = 10; +module_param(flushtimeout, int, 0600); +MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); + +static unsigned int nflog = 1; +module_param(nflog, int, 0400); +MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); + +/* global data structures */ + +typedef struct { + unsigned int qlen; /* number of nlmsgs' in the skb */ + struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ + struct sk_buff *skb; /* the pre-allocated skb */ + struct timer_list timer; /* the timer function */ +} ulog_buff_t; + +static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */ + +static struct sock *nflognl; /* our socket */ +static DECLARE_LOCK(ulog_lock); /* spinlock */ + +/* send one ulog_buff_t to userspace */ +static void ulog_send(unsigned int nlgroupnum) +{ + ulog_buff_t *ub = &ulog_buffers[nlgroupnum]; + + if (timer_pending(&ub->timer)) { + DEBUGP("ipt_ULOG: ulog_send: timer was pending, deleting\n"); + del_timer(&ub->timer); + } + + /* last nlmsg needs NLMSG_DONE */ + if (ub->qlen > 1) + ub->lastnlh->nlmsg_type = NLMSG_DONE; + + NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum); + DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n", + ub->qlen, nlgroupnum); + netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC); + + ub->qlen = 0; + ub->skb = NULL; + ub->lastnlh = NULL; + +} + + +/* timer function to flush queue in flushtimeout time */ +static void ulog_timer(unsigned long data) +{ + DEBUGP("ipt_ULOG: timer function called, calling ulog_send\n"); + + /* lock to protect against somebody modifying our structure + * from ipt_ulog_target at the same time */ + LOCK_BH(&ulog_lock); + ulog_send(data); + UNLOCK_BH(&ulog_lock); +} + +static struct sk_buff *ulog_alloc_skb(unsigned int size) +{ + struct sk_buff *skb; + + /* alloc skb which should be big enough for a whole + * multipart message. WARNING: has to be <= 131000 + * due to slab allocator restrictions */ + + skb = alloc_skb(nlbufsiz, GFP_ATOMIC); + if (!skb) { + PRINTR("ipt_ULOG: can't alloc whole buffer %ub!\n", + nlbufsiz); + + /* try to allocate only as much as we need for + * current packet */ + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + PRINTR("ipt_ULOG: can't even allocate %ub\n", size); + } + + return skb; +} + +static void ipt_ulog_packet(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct ipt_ulog_info *loginfo, + const char *prefix) +{ + ulog_buff_t *ub; + ulog_packet_msg_t *pm; + size_t size, copy_len; + struct nlmsghdr *nlh; + + /* ffs == find first bit set, necessary because userspace + * is already shifting groupnumber, but we need unshifted. + * ffs() returns [1..32], we need [0..31] */ + unsigned int groupnum = ffs(loginfo->nl_group) - 1; + + /* calculate the size of the skb needed */ + if ((loginfo->copy_range == 0) || + (loginfo->copy_range > skb->len)) { + copy_len = skb->len; + } else { + copy_len = loginfo->copy_range; + } + + size = NLMSG_SPACE(sizeof(*pm) + copy_len); + + ub = &ulog_buffers[groupnum]; + + LOCK_BH(&ulog_lock); + + if (!ub->skb) { + if (!(ub->skb = ulog_alloc_skb(size))) + goto alloc_failure; + } else if (ub->qlen >= loginfo->qthreshold || + size > skb_tailroom(ub->skb)) { + /* either the queue len is too high or we don't have + * enough room in nlskb left. send it to userspace. */ + + ulog_send(groupnum); + + if (!(ub->skb = ulog_alloc_skb(size))) + goto alloc_failure; + } + + DEBUGP("ipt_ULOG: qlen %d, qthreshold %d\n", ub->qlen, + loginfo->qthreshold); + + /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ + nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, + sizeof(*pm)+copy_len); + ub->qlen++; + + pm = NLMSG_DATA(nlh); + + /* We might not have a timestamp, get one */ + if (skb->stamp.tv_sec == 0) + do_gettimeofday((struct timeval *)&skb->stamp); + + /* copy hook, prefix, timestamp, payload, etc. */ + pm->data_len = copy_len; + pm->timestamp_sec = skb->stamp.tv_sec; + pm->timestamp_usec = skb->stamp.tv_usec; + pm->mark = skb->nfmark; + pm->hook = hooknum; + if (prefix != NULL) + strncpy(pm->prefix, prefix, sizeof(pm->prefix)); + else if (loginfo->prefix[0] != '\0') + strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); + else + *(pm->prefix) = '\0'; + + if (in && in->hard_header_len > 0 + && skb->mac.raw != (void *) skb->nh.iph + && in->hard_header_len <= ULOG_MAC_LEN) { + memcpy(pm->mac, skb->mac.raw, in->hard_header_len); + pm->mac_len = in->hard_header_len; + } else + pm->mac_len = 0; + + if (in) + strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); + else + pm->indev_name[0] = '\0'; + + if (out) + strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); + else + pm->outdev_name[0] = '\0'; + + /* copy_len <= skb->len, so can't fail. */ + if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) + BUG(); + + /* check if we are building multi-part messages */ + if (ub->qlen > 1) { + ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; + } + + ub->lastnlh = nlh; + + /* if timer isn't already running, start it */ + if (!timer_pending(&ub->timer)) { + ub->timer.expires = jiffies + flushtimeout * HZ / 100; + add_timer(&ub->timer); + } + + /* if threshold is reached, send message to userspace */ + if (ub->qlen >= loginfo->qthreshold) { + if (loginfo->qthreshold > 1) + nlh->nlmsg_type = NLMSG_DONE; + ulog_send(groupnum); + } + + UNLOCK_BH(&ulog_lock); + + return; + +nlmsg_failure: + PRINTR("ipt_ULOG: error during NLMSG_PUT\n"); + +alloc_failure: + PRINTR("ipt_ULOG: Error building netlink message\n"); + + UNLOCK_BH(&ulog_lock); +} + +static unsigned int ipt_ulog_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, void *userinfo) +{ + struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo; + + ipt_ulog_packet(hooknum, *pskb, in, out, loginfo, NULL); + + return IPT_CONTINUE; +} + +static void ipt_logfn(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ + struct ipt_ulog_info loginfo = { + .nl_group = ULOG_DEFAULT_NLGROUP, + .copy_range = 0, + .qthreshold = ULOG_DEFAULT_QTHRESHOLD, + .prefix = "" + }; + + ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); +} + +static int ipt_ulog_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hookmask) +{ + struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ulog_info))) { + DEBUGP("ipt_ULOG: targinfosize %u != 0\n", targinfosize); + return 0; + } + + if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { + DEBUGP("ipt_ULOG: prefix term %i\n", + loginfo->prefix[sizeof(loginfo->prefix) - 1]); + return 0; + } + + if (loginfo->qthreshold > ULOG_MAX_QLEN) { + DEBUGP("ipt_ULOG: queue threshold %i > MAX_QLEN\n", + loginfo->qthreshold); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_ulog_reg = { + .name = "ULOG", + .target = ipt_ulog_target, + .checkentry = ipt_ulog_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int i; + + DEBUGP("ipt_ULOG: init module\n"); + + if (nlbufsiz >= 128*1024) { + printk("Netlink buffer has to be <= 128kB\n"); + return -EINVAL; + } + + /* initialize ulog_buffers */ + for (i = 0; i < ULOG_MAXNLGROUPS; i++) { + init_timer(&ulog_buffers[i].timer); + ulog_buffers[i].timer.function = ulog_timer; + ulog_buffers[i].timer.data = i; + } + + nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL); + if (!nflognl) + return -ENOMEM; + + if (ipt_register_target(&ipt_ulog_reg) != 0) { + sock_release(nflognl->sk_socket); + return -EINVAL; + } + if (nflog) + nf_log_register(PF_INET, &ipt_logfn); + + return 0; +} + +static void __exit fini(void) +{ + ulog_buff_t *ub; + int i; + + DEBUGP("ipt_ULOG: cleanup_module\n"); + + if (nflog) + nf_log_unregister(PF_INET, &ipt_logfn); + ipt_unregister_target(&ipt_ulog_reg); + sock_release(nflognl->sk_socket); + + /* remove pending timers and free allocated skb's */ + for (i = 0; i < ULOG_MAXNLGROUPS; i++) { + ub = &ulog_buffers[i]; + if (timer_pending(&ub->timer)) { + DEBUGP("timer was pending, deleting\n"); + del_timer(&ub->timer); + } + + if (ub->skb) { + kfree_skb(ub->skb); + ub->skb = NULL; + } + } + +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c new file mode 100644 index 000000000000..f5909a4c3fc7 --- /dev/null +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -0,0 +1,77 @@ +/* + * iptables module to match inet_addr_type() of an ip. + * + * Copyright (c) 2004 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/ip.h> +#include <net/route.h> + +#include <linux/netfilter_ipv4/ipt_addrtype.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("iptables addrtype match"); + +static inline int match_type(u_int32_t addr, u_int16_t mask) +{ + return !!(mask & (1 << inet_addr_type(addr))); +} + +static int match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, + int offset, int *hotdrop) +{ + const struct ipt_addrtype_info *info = matchinfo; + const struct iphdr *iph = skb->nh.iph; + int ret = 1; + + if (info->source) + ret &= match_type(iph->saddr, info->source)^info->invert_source; + if (info->dest) + ret &= match_type(iph->daddr, info->dest)^info->invert_dest; + + return ret; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_addrtype_info))) { + printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu)\n.", + matchsize, IPT_ALIGN(sizeof(struct ipt_addrtype_info))); + return 0; + } + + return 1; +} + +static struct ipt_match addrtype_match = { + .name = "addrtype", + .match = match, + .checkentry = checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&addrtype_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&addrtype_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c new file mode 100644 index 000000000000..a0fea847cb72 --- /dev/null +++ b/net/ipv4/netfilter/ipt_ah.c @@ -0,0 +1,117 @@ +/* Kernel module to match AH parameters. */ +/* (C) 1999-2000 Yon Uriarte <yon@astaro.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> + +#include <linux/netfilter_ipv4/ipt_ah.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>"); +MODULE_DESCRIPTION("iptables AH SPI match module"); + +#ifdef DEBUG_CONNTRACK +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline int +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert) +{ + int r=0; + duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', + min,spi,max); + r=(spi >= min && spi <= max) ^ invert; + duprintf(" result %s\n",r? "PASS" : "FAILED"); + return r; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ip_auth_hdr _ahdr, *ah; + const struct ipt_ah *ahinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + ah = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("Dropping evil AH tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return spi_match(ahinfo->spis[0], ahinfo->spis[1], + ntohl(ah->spi), + !!(ahinfo->invflags & IPT_AH_INV_SPI)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ipt_ah *ahinfo = matchinfo; + + /* Must specify proto == AH, and no unknown invflags */ + if (ip->proto != IPPROTO_AH || (ip->invflags & IPT_INV_PROTO)) { + duprintf("ipt_ah: Protocol %u != %u\n", ip->proto, + IPPROTO_AH); + return 0; + } + if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_ah))) { + duprintf("ipt_ah: matchsize %u != %u\n", + matchinfosize, IPT_ALIGN(sizeof(struct ipt_ah))); + return 0; + } + if (ahinfo->invflags & ~IPT_AH_INV_MASK) { + duprintf("ipt_ah: unknown flags %X\n", + ahinfo->invflags); + return 0; + } + + return 1; +} + +static struct ipt_match ah_match = { + .name = "ah", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&ah_match); +} + +static void __exit cleanup(void) +{ + ipt_unregister_match(&ah_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv4/netfilter/ipt_comment.c b/net/ipv4/netfilter/ipt_comment.c new file mode 100644 index 000000000000..6b76a1ea5245 --- /dev/null +++ b/net/ipv4/netfilter/ipt_comment.c @@ -0,0 +1,59 @@ +/* + * Implements a dummy match to allow attaching comments to rules + * + * 2003-05-13 Brad Fisher (brad@info-link.net) + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_comment.h> + +MODULE_AUTHOR("Brad Fisher <brad@info-link.net>"); +MODULE_DESCRIPTION("iptables comment match module"); +MODULE_LICENSE("GPL"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + /* We always match */ + return 1; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + /* Check the size */ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_comment_info))) + return 0; + return 1; +} + +static struct ipt_match comment_match = { + .name = "comment", + .match = match, + .checkentry = checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&comment_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&comment_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c new file mode 100644 index 000000000000..2706f96cea55 --- /dev/null +++ b/net/ipv4/netfilter/ipt_connmark.c @@ -0,0 +1,81 @@ +/* This kernel module matches connection mark values set by the + * CONNMARK target + * + * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * by Henrik Nordstrom <hno@marasystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>"); +MODULE_DESCRIPTION("IP tables connmark match module"); +MODULE_LICENSE("GPL"); + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_connmark.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_connmark_info *info = matchinfo; + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); + if (!ct) + return 0; + + return ((ct->mark & info->mask) == info->mark) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) + return 0; + + return 1; +} + +static struct ipt_match connmark_match = { + .name = "connmark", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&connmark_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&connmark_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c new file mode 100644 index 000000000000..c1d22801b7cf --- /dev/null +++ b/net/ipv4/netfilter/ipt_conntrack.c @@ -0,0 +1,136 @@ +/* Kernel module to match connection tracking information. + * Superset of Rusty's minimalistic state match. + * + * (C) 2001 Marc Boucher (marc@mbsi.ca). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_conntrack.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +MODULE_DESCRIPTION("iptables connection tracking match module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_conntrack_info *sinfo = matchinfo; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + unsigned int statebit; + + ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); + +#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg)) + + if (ct == &ip_conntrack_untracked) + statebit = IPT_CONNTRACK_STATE_UNTRACKED; + else if (ct) + statebit = IPT_CONNTRACK_STATE_BIT(ctinfo); + else + statebit = IPT_CONNTRACK_STATE_INVALID; + + if(sinfo->flags & IPT_CONNTRACK_STATE) { + if (ct) { + if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip != + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip) + statebit |= IPT_CONNTRACK_STATE_SNAT; + + if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip != + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip) + statebit |= IPT_CONNTRACK_STATE_DNAT; + } + + if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_PROTO) { + if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_ORIGDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_REPLSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_REPLDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_STATUS) { + if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_EXPIRES) { + unsigned long expires; + + if(!ct) + return 0; + + expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0; + + if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES)) + return 0; + } + + return 1; +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_conntrack_info))) + return 0; + + return 1; +} + +static struct ipt_match conntrack_match = { + .name = "conntrack", + .match = &match, + .checkentry = &check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + need_ip_conntrack(); + return ipt_register_match(&conntrack_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&conntrack_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_dscp.c b/net/ipv4/netfilter/ipt_dscp.c new file mode 100644 index 000000000000..5df52a64a5d4 --- /dev/null +++ b/net/ipv4/netfilter/ipt_dscp.c @@ -0,0 +1,63 @@ +/* IP tables module for matching the value of the IPv4 DSCP field + * + * ipt_dscp.c,v 1.3 2002/08/05 19:00:21 laforge Exp + * + * (C) 2002 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ipt_dscp.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("iptables DSCP matching module"); +MODULE_LICENSE("GPL"); + +static int match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, + int offset, int *hotdrop) +{ + const struct ipt_dscp_info *info = matchinfo; + const struct iphdr *iph = skb->nh.iph; + + u_int8_t sh_dscp = ((info->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK); + + return ((iph->tos&IPT_DSCP_MASK) == sh_dscp) ^ info->invert; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_dscp_info))) + return 0; + + return 1; +} + +static struct ipt_match dscp_match = { + .name = "dscp", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&dscp_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&dscp_match); + +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c new file mode 100644 index 000000000000..b6f7181e89cc --- /dev/null +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -0,0 +1,131 @@ +/* IP tables module for matching the value of the IPv4 and TCP ECN bits + * + * ipt_ecn.c,v 1.3 2002/05/29 15:09:00 laforge Exp + * + * (C) 2002 by Harald Welte <laforge@gnumonks.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/tcp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_ecn.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("iptables ECN matching module"); +MODULE_LICENSE("GPL"); + +static inline int match_ip(const struct sk_buff *skb, + const struct ipt_ecn_info *einfo) +{ + return ((skb->nh.iph->tos&IPT_ECN_IP_MASK) == einfo->ip_ect); +} + +static inline int match_tcp(const struct sk_buff *skb, + const struct ipt_ecn_info *einfo, + int *hotdrop) +{ + struct tcphdr _tcph, *th; + + /* In practice, TCP match does this, so can't fail. But let's + * be good citizens. + */ + th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + *hotdrop = 0; + return 0; + } + + if (einfo->operation & IPT_ECN_OP_MATCH_ECE) { + if (einfo->invert & IPT_ECN_OP_MATCH_ECE) { + if (th->ece == 1) + return 0; + } else { + if (th->ece == 0) + return 0; + } + } + + if (einfo->operation & IPT_ECN_OP_MATCH_CWR) { + if (einfo->invert & IPT_ECN_OP_MATCH_CWR) { + if (th->cwr == 1) + return 0; + } else { + if (th->cwr == 0) + return 0; + } + } + + return 1; +} + +static int match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, + int offset, int *hotdrop) +{ + const struct ipt_ecn_info *info = matchinfo; + + if (info->operation & IPT_ECN_OP_MATCH_IP) + if (!match_ip(skb, info)) + return 0; + + if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { + if (skb->nh.iph->protocol != IPPROTO_TCP) + return 0; + if (!match_tcp(skb, info, hotdrop)) + return 0; + } + + return 1; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_ecn_info *info = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_ecn_info))) + return 0; + + if (info->operation & IPT_ECN_OP_MATCH_MASK) + return 0; + + if (info->invert & IPT_ECN_OP_MATCH_MASK) + return 0; + + if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) + && ip->proto != IPPROTO_TCP) { + printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for" + " non-tcp packets\n"); + return 0; + } + + return 1; +} + +static struct ipt_match ecn_match = { + .name = "ecn", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&ecn_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&ecn_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_esp.c b/net/ipv4/netfilter/ipt_esp.c new file mode 100644 index 000000000000..e1d0dd31e117 --- /dev/null +++ b/net/ipv4/netfilter/ipt_esp.c @@ -0,0 +1,118 @@ +/* Kernel module to match ESP parameters. */ + +/* (C) 1999-2000 Yon Uriarte <yon@astaro.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> + +#include <linux/netfilter_ipv4/ipt_esp.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>"); +MODULE_DESCRIPTION("iptables ESP SPI match module"); + +#ifdef DEBUG_CONNTRACK +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline int +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert) +{ + int r=0; + duprintf("esp spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', + min,spi,max); + r=(spi >= min && spi <= max) ^ invert; + duprintf(" result %s\n",r? "PASS" : "FAILED"); + return r; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ip_esp_hdr _esp, *eh; + const struct ipt_esp *espinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + eh = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_esp), &_esp); + if (eh == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("Dropping evil ESP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return spi_match(espinfo->spis[0], espinfo->spis[1], + ntohl(eh->spi), + !!(espinfo->invflags & IPT_ESP_INV_SPI)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ipt_esp *espinfo = matchinfo; + + /* Must specify proto == ESP, and no unknown invflags */ + if (ip->proto != IPPROTO_ESP || (ip->invflags & IPT_INV_PROTO)) { + duprintf("ipt_esp: Protocol %u != %u\n", ip->proto, + IPPROTO_ESP); + return 0; + } + if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_esp))) { + duprintf("ipt_esp: matchsize %u != %u\n", + matchinfosize, IPT_ALIGN(sizeof(struct ipt_esp))); + return 0; + } + if (espinfo->invflags & ~IPT_ESP_INV_MASK) { + duprintf("ipt_esp: unknown flags %X\n", + espinfo->invflags); + return 0; + } + + return 1; +} + +static struct ipt_match esp_match = { + .name = "esp", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&esp_match); +} + +static void __exit cleanup(void) +{ + ipt_unregister_match(&esp_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c new file mode 100644 index 000000000000..f1937190cd77 --- /dev/null +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -0,0 +1,731 @@ +/* iptables match extension to limit the number of packets per second + * seperately for each hashbucket (sourceip/sourceport/dstip/dstport) + * + * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> + * + * $Id: ipt_hashlimit.c 3244 2004-10-20 16:24:29Z laforge@netfilter.org $ + * + * Development of this code was funded by Astaro AG, http://www.astaro.com/ + * + * based on ipt_limit.c by: + * Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> + * Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> + * Rusty Russell <rusty@rustcorp.com.au> + * + * The general idea is to create a hash table for every dstip and have a + * seperate limit counter per tuple. This way you can do something like 'limit + * the number of syn packets for each of my internal addresses. + * + * Ideally this would just be implemented as a general 'hash' match, which would + * allow us to attach any iptables target to it's hash buckets. But this is + * not possible in the current iptables architecture. As always, pkttables for + * 2.7.x will help ;) + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/list.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_hashlimit.h> +#include <linux/netfilter_ipv4/lockhelp.h> + +/* FIXME: this is just for IP_NF_ASSERRT */ +#include <linux/netfilter_ipv4/ip_conntrack.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("iptables match for limiting per hash-bucket"); + +/* need to declare this at the top */ +static struct proc_dir_entry *hashlimit_procdir; +static struct file_operations dl_file_ops; + +/* hash table crap */ + +struct dsthash_dst { + u_int32_t src_ip; + u_int32_t dst_ip; + /* ports have to be consecutive !!! */ + u_int16_t src_port; + u_int16_t dst_port; +}; + +struct dsthash_ent { + /* static / read-only parts in the beginning */ + struct hlist_node node; + struct dsthash_dst dst; + + /* modified structure members in the end */ + unsigned long expires; /* precalculated expiry time */ + struct { + unsigned long prev; /* last modification */ + u_int32_t credit; + u_int32_t credit_cap, cost; + } rateinfo; +}; + +struct ipt_hashlimit_htable { + struct hlist_node node; /* global list of all htables */ + atomic_t use; + + struct hashlimit_cfg cfg; /* config */ + + /* used internally */ + spinlock_t lock; /* lock for list_head */ + u_int32_t rnd; /* random seed for hash */ + struct timer_list timer; /* timer for gc */ + atomic_t count; /* number entries in table */ + + /* seq_file stuff */ + struct proc_dir_entry *pde; + + struct hlist_head hash[0]; /* hashtable itself */ +}; + +static DECLARE_LOCK(hashlimit_lock); /* protects htables list */ +static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ +static HLIST_HEAD(hashlimit_htables); +static kmem_cache_t *hashlimit_cachep; + +static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) +{ + return (ent->dst.dst_ip == b->dst_ip + && ent->dst.dst_port == b->dst_port + && ent->dst.src_port == b->src_port + && ent->dst.src_ip == b->src_ip); +} + +static inline u_int32_t +hash_dst(const struct ipt_hashlimit_htable *ht, const struct dsthash_dst *dst) +{ + return (jhash_3words(dst->dst_ip, (dst->dst_port<<16 | dst->src_port), + dst->src_ip, ht->rnd) % ht->cfg.size); +} + +static inline struct dsthash_ent * +__dsthash_find(const struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst) +{ + struct dsthash_ent *ent; + struct hlist_node *pos; + u_int32_t hash = hash_dst(ht, dst); + + if (!hlist_empty(&ht->hash[hash])) + hlist_for_each_entry(ent, pos, &ht->hash[hash], node) { + if (dst_cmp(ent, dst)) { + return ent; + } + } + + return NULL; +} + +/* allocate dsthash_ent, initialize dst, put in htable and lock it */ +static struct dsthash_ent * +__dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst) +{ + struct dsthash_ent *ent; + + /* initialize hash with random val at the time we allocate + * the first hashtable entry */ + if (!ht->rnd) + get_random_bytes(&ht->rnd, 4); + + if (ht->cfg.max && + atomic_read(&ht->count) >= ht->cfg.max) { + /* FIXME: do something. question is what.. */ + if (net_ratelimit()) + printk(KERN_WARNING + "ipt_hashlimit: max count of %u reached\n", + ht->cfg.max); + return NULL; + } + + ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC); + if (!ent) { + if (net_ratelimit()) + printk(KERN_ERR + "ipt_hashlimit: can't allocate dsthash_ent\n"); + return NULL; + } + + atomic_inc(&ht->count); + + ent->dst.dst_ip = dst->dst_ip; + ent->dst.dst_port = dst->dst_port; + ent->dst.src_ip = dst->src_ip; + ent->dst.src_port = dst->src_port; + + hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]); + + return ent; +} + +static inline void +__dsthash_free(struct ipt_hashlimit_htable *ht, struct dsthash_ent *ent) +{ + hlist_del(&ent->node); + kmem_cache_free(hashlimit_cachep, ent); + atomic_dec(&ht->count); +} +static void htable_gc(unsigned long htlong); + +static int htable_create(struct ipt_hashlimit_info *minfo) +{ + int i; + unsigned int size; + struct ipt_hashlimit_htable *hinfo; + + if (minfo->cfg.size) + size = minfo->cfg.size; + else { + size = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct list_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + size = 8192; + if (size < 16) + size = 16; + } + /* FIXME: don't use vmalloc() here or anywhere else -HW */ + hinfo = vmalloc(sizeof(struct ipt_hashlimit_htable) + + (sizeof(struct list_head) * size)); + if (!hinfo) { + printk(KERN_ERR "ipt_hashlimit: Unable to create hashtable\n"); + return -1; + } + minfo->hinfo = hinfo; + + /* copy match config into hashtable config */ + memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg)); + hinfo->cfg.size = size; + if (!hinfo->cfg.max) + hinfo->cfg.max = 8 * hinfo->cfg.size; + else if (hinfo->cfg.max < hinfo->cfg.size) + hinfo->cfg.max = hinfo->cfg.size; + + for (i = 0; i < hinfo->cfg.size; i++) + INIT_HLIST_HEAD(&hinfo->hash[i]); + + atomic_set(&hinfo->count, 0); + atomic_set(&hinfo->use, 1); + hinfo->rnd = 0; + spin_lock_init(&hinfo->lock); + hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); + if (!hinfo->pde) { + vfree(hinfo); + return -1; + } + hinfo->pde->proc_fops = &dl_file_ops; + hinfo->pde->data = hinfo; + + init_timer(&hinfo->timer); + hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval); + hinfo->timer.data = (unsigned long )hinfo; + hinfo->timer.function = htable_gc; + add_timer(&hinfo->timer); + + LOCK_BH(&hashlimit_lock); + hlist_add_head(&hinfo->node, &hashlimit_htables); + UNLOCK_BH(&hashlimit_lock); + + return 0; +} + +static int select_all(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he) +{ + return 1; +} + +static int select_gc(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he) +{ + return (jiffies >= he->expires); +} + +static void htable_selective_cleanup(struct ipt_hashlimit_htable *ht, + int (*select)(struct ipt_hashlimit_htable *ht, + struct dsthash_ent *he)) +{ + int i; + + IP_NF_ASSERT(ht->cfg.size && ht->cfg.max); + + /* lock hash table and iterate over it */ + spin_lock_bh(&ht->lock); + for (i = 0; i < ht->cfg.size; i++) { + struct dsthash_ent *dh; + struct hlist_node *pos, *n; + hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) { + if ((*select)(ht, dh)) + __dsthash_free(ht, dh); + } + } + spin_unlock_bh(&ht->lock); +} + +/* hash table garbage collector, run by timer */ +static void htable_gc(unsigned long htlong) +{ + struct ipt_hashlimit_htable *ht = (struct ipt_hashlimit_htable *)htlong; + + htable_selective_cleanup(ht, select_gc); + + /* re-add the timer accordingly */ + ht->timer.expires = jiffies + msecs_to_jiffies(ht->cfg.gc_interval); + add_timer(&ht->timer); +} + +static void htable_destroy(struct ipt_hashlimit_htable *hinfo) +{ + /* remove timer, if it is pending */ + if (timer_pending(&hinfo->timer)) + del_timer(&hinfo->timer); + + /* remove proc entry */ + remove_proc_entry(hinfo->pde->name, hashlimit_procdir); + + htable_selective_cleanup(hinfo, select_all); + vfree(hinfo); +} + +static struct ipt_hashlimit_htable *htable_find_get(char *name) +{ + struct ipt_hashlimit_htable *hinfo; + struct hlist_node *pos; + + LOCK_BH(&hashlimit_lock); + hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) { + if (!strcmp(name, hinfo->pde->name)) { + atomic_inc(&hinfo->use); + UNLOCK_BH(&hashlimit_lock); + return hinfo; + } + } + UNLOCK_BH(&hashlimit_lock); + + return NULL; +} + +static void htable_put(struct ipt_hashlimit_htable *hinfo) +{ + if (atomic_dec_and_test(&hinfo->use)) { + LOCK_BH(&hashlimit_lock); + hlist_del(&hinfo->node); + UNLOCK_BH(&hashlimit_lock); + htable_destroy(hinfo); + } +} + + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To get the maximum range, we multiply by this factor (ie. you get N + credits per jiffy). We want to allow a rate as low as 1 per day + (slowest userspace tool allows), which means + CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie. +*/ +#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) + +/* Repeated shift and or gives us all 1s, final shift and add 1 gives + * us the power of 2 below the theoretical max, so GCC simply does a + * shift. */ +#define _POW2_BELOW2(x) ((x)|((x)>>1)) +#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) +#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) +#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) +#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) + +#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + +/* Precision saver. */ +static inline u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / IPT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / IPT_HASHLIMIT_SCALE; +} + +static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) +{ + dh->rateinfo.credit += (now - xchg(&dh->rateinfo.prev, now)) + * CREDITS_PER_JIFFY; + if (dh->rateinfo.credit > dh->rateinfo.credit_cap) + dh->rateinfo.credit = dh->rateinfo.credit_cap; +} + +static inline int get_ports(const struct sk_buff *skb, int offset, + u16 ports[2]) +{ + union { + struct tcphdr th; + struct udphdr uh; + sctp_sctphdr_t sctph; + } hdr_u, *ptr_u; + + /* Must not be a fragment. */ + if (offset) + return 1; + + /* Must be big enough to read ports (both UDP and TCP have + them at the start). */ + ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u); + if (!ptr_u) + return 1; + + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + ports[0] = ptr_u->th.source; + ports[1] = ptr_u->th.dest; + break; + case IPPROTO_UDP: + ports[0] = ptr_u->uh.source; + ports[1] = ptr_u->uh.dest; + break; + case IPPROTO_SCTP: + ports[0] = ptr_u->sctph.source; + ports[1] = ptr_u->sctph.dest; + break; + default: + /* all other protocols don't supprot per-port hash + * buckets */ + ports[0] = ports[1] = 0; + break; + } + + return 0; +} + + +static int +hashlimit_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ipt_hashlimit_info *r = + ((struct ipt_hashlimit_info *)matchinfo)->u.master; + struct ipt_hashlimit_htable *hinfo = r->hinfo; + unsigned long now = jiffies; + struct dsthash_ent *dh; + struct dsthash_dst dst; + + /* build 'dst' according to hinfo->cfg and current packet */ + memset(&dst, 0, sizeof(dst)); + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DIP) + dst.dst_ip = skb->nh.iph->daddr; + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SIP) + dst.src_ip = skb->nh.iph->saddr; + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT + ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { + u_int16_t ports[2]; + if (get_ports(skb, offset, ports)) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + *hotdrop = 1; + return 0; + } + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) + dst.src_port = ports[0]; + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT) + dst.dst_port = ports[1]; + } + + spin_lock_bh(&hinfo->lock); + dh = __dsthash_find(hinfo, &dst); + if (!dh) { + dh = __dsthash_alloc_init(hinfo, &dst); + + if (!dh) { + /* enomem... don't match == DROP */ + if (net_ratelimit()) + printk(KERN_ERR "%s: ENOMEM\n", __FUNCTION__); + spin_unlock_bh(&hinfo->lock); + return 0; + } + + dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); + + dh->rateinfo.prev = jiffies; + dh->rateinfo.credit = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); + dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); + dh->rateinfo.cost = user2credits(hinfo->cfg.avg); + + spin_unlock_bh(&hinfo->lock); + return 1; + } + + /* update expiration timeout */ + dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); + + rateinfo_recalc(dh, now); + if (dh->rateinfo.credit >= dh->rateinfo.cost) { + /* We're underlimit. */ + dh->rateinfo.credit -= dh->rateinfo.cost; + spin_unlock_bh(&hinfo->lock); + return 1; + } + + spin_unlock_bh(&hinfo->lock); + + /* default case: we're overlimit, thus don't match */ + return 0; +} + +static int +hashlimit_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_hashlimit_info *r = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_hashlimit_info))) + return 0; + + /* Check for overflow. */ + if (r->cfg.burst == 0 + || user2credits(r->cfg.avg * r->cfg.burst) < + user2credits(r->cfg.avg)) { + printk(KERN_ERR "ipt_hashlimit: Overflow, try lower: %u/%u\n", + r->cfg.avg, r->cfg.burst); + return 0; + } + + if (r->cfg.mode == 0 + || r->cfg.mode > (IPT_HASHLIMIT_HASH_DPT + |IPT_HASHLIMIT_HASH_DIP + |IPT_HASHLIMIT_HASH_SIP + |IPT_HASHLIMIT_HASH_SPT)) + return 0; + + if (!r->cfg.gc_interval) + return 0; + + if (!r->cfg.expire) + return 0; + + /* This is the best we've got: We cannot release and re-grab lock, + * since checkentry() is called before ip_tables.c grabs ipt_mutex. + * We also cannot grab the hashtable spinlock, since htable_create will + * call vmalloc, and that can sleep. And we cannot just re-search + * the list of htable's in htable_create(), since then we would + * create duplicate proc files. -HW */ + down(&hlimit_mutex); + r->hinfo = htable_find_get(r->name); + if (!r->hinfo && (htable_create(r) != 0)) { + up(&hlimit_mutex); + return 0; + } + up(&hlimit_mutex); + + /* Ugly hack: For SMP, we only want to use one set */ + r->u.master = r; + + return 1; +} + +static void +hashlimit_destroy(void *matchinfo, unsigned int matchsize) +{ + struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo; + + htable_put(r->hinfo); +} + +static struct ipt_match ipt_hashlimit = { + .name = "hashlimit", + .match = hashlimit_match, + .checkentry = hashlimit_checkentry, + .destroy = hashlimit_destroy, + .me = THIS_MODULE +}; + +/* PROC stuff */ + +static void *dl_seq_start(struct seq_file *s, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct ipt_hashlimit_htable *htable = pde->data; + unsigned int *bucket; + + spin_lock_bh(&htable->lock); + if (*pos >= htable->cfg.size) + return NULL; + + bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC); + if (!bucket) + return ERR_PTR(-ENOMEM); + + *bucket = *pos; + return bucket; +} + +static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct ipt_hashlimit_htable *htable = pde->data; + unsigned int *bucket = (unsigned int *)v; + + *pos = ++(*bucket); + if (*pos >= htable->cfg.size) { + kfree(v); + return NULL; + } + return bucket; +} + +static void dl_seq_stop(struct seq_file *s, void *v) +{ + struct proc_dir_entry *pde = s->private; + struct ipt_hashlimit_htable *htable = pde->data; + unsigned int *bucket = (unsigned int *)v; + + kfree(bucket); + + spin_unlock_bh(&htable->lock); +} + +static inline int dl_seq_real_show(struct dsthash_ent *ent, struct seq_file *s) +{ + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies); + + return seq_printf(s, "%ld %u.%u.%u.%u:%u->%u.%u.%u.%u:%u %u %u %u\n", + (long)(ent->expires - jiffies)/HZ, + NIPQUAD(ent->dst.src_ip), ntohs(ent->dst.src_port), + NIPQUAD(ent->dst.dst_ip), ntohs(ent->dst.dst_port), + ent->rateinfo.credit, ent->rateinfo.credit_cap, + ent->rateinfo.cost); +} + +static int dl_seq_show(struct seq_file *s, void *v) +{ + struct proc_dir_entry *pde = s->private; + struct ipt_hashlimit_htable *htable = pde->data; + unsigned int *bucket = (unsigned int *)v; + struct dsthash_ent *ent; + struct hlist_node *pos; + + if (!hlist_empty(&htable->hash[*bucket])) + hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) { + if (dl_seq_real_show(ent, s)) { + /* buffer was filled and unable to print that tuple */ + return 1; + } + } + + return 0; +} + +static struct seq_operations dl_seq_ops = { + .start = dl_seq_start, + .next = dl_seq_next, + .stop = dl_seq_stop, + .show = dl_seq_show +}; + +static int dl_proc_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &dl_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + sf->private = PDE(inode); + } + return ret; +} + +static struct file_operations dl_file_ops = { + .owner = THIS_MODULE, + .open = dl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static int init_or_fini(int fini) +{ + int ret = 0; + + if (fini) + goto cleanup; + + if (ipt_register_match(&ipt_hashlimit)) { + ret = -EINVAL; + goto cleanup_nothing; + } + + hashlimit_cachep = kmem_cache_create("ipt_hashlimit", + sizeof(struct dsthash_ent), 0, + 0, NULL, NULL); + if (!hashlimit_cachep) { + printk(KERN_ERR "Unable to create ipt_hashlimit slab cache\n"); + ret = -ENOMEM; + goto cleanup_unreg_match; + } + + hashlimit_procdir = proc_mkdir("ipt_hashlimit", proc_net); + if (!hashlimit_procdir) { + printk(KERN_ERR "Unable to create proc dir entry\n"); + ret = -ENOMEM; + goto cleanup_free_slab; + } + + return ret; + +cleanup: + remove_proc_entry("ipt_hashlimit", proc_net); +cleanup_free_slab: + kmem_cache_destroy(hashlimit_cachep); +cleanup_unreg_match: + ipt_unregister_match(&ipt_hashlimit); +cleanup_nothing: + return ret; + +} + +static int __init init(void) +{ + return init_or_fini(0); +} + +static void __exit fini(void) +{ + init_or_fini(1); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c new file mode 100644 index 000000000000..33fdf364d3d3 --- /dev/null +++ b/net/ipv4/netfilter/ipt_helper.c @@ -0,0 +1,113 @@ +/* iptables module to match on related connections */ +/* + * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 19 Mar 2002 Harald Welte <laforge@gnumonks.org>: + * - Port to newnat infrastructure + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_helper.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>"); +MODULE_DESCRIPTION("iptables helper match module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_helper_info *info = matchinfo; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + int ret = info->invert; + + ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); + if (!ct) { + DEBUGP("ipt_helper: Eek! invalid conntrack?\n"); + return ret; + } + + if (!ct->master) { + DEBUGP("ipt_helper: conntrack %p has no master\n", ct); + return ret; + } + + READ_LOCK(&ip_conntrack_lock); + if (!ct->master->helper) { + DEBUGP("ipt_helper: master ct %p has no helper\n", + exp->expectant); + goto out_unlock; + } + + DEBUGP("master's name = %s , info->name = %s\n", + ct->master->helper->name, info->name); + + if (info->name[0] == '\0') + ret ^= 1; + else + ret ^= !strncmp(ct->master->helper->name, info->name, + strlen(ct->master->helper->name)); +out_unlock: + READ_UNLOCK(&ip_conntrack_lock); + return ret; +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_helper_info *info = matchinfo; + + info->name[29] = '\0'; + + /* verify size */ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_helper_info))) + return 0; + + return 1; +} + +static struct ipt_match helper_match = { + .name = "helper", + .match = &match, + .checkentry = &check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + need_ip_conntrack(); + return ipt_register_match(&helper_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&helper_match); +} + +module_init(init); +module_exit(fini); + diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c new file mode 100644 index 000000000000..b835b7b2e560 --- /dev/null +++ b/net/ipv4/netfilter/ipt_iprange.c @@ -0,0 +1,99 @@ +/* + * iptables module to match IP address ranges + * + * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_iprange.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("iptables arbitrary IP range match module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, int *hotdrop) +{ + const struct ipt_iprange_info *info = matchinfo; + const struct iphdr *iph = skb->nh.iph; + + if (info->flags & IPRANGE_SRC) { + if (((ntohl(iph->saddr) < ntohl(info->src.min_ip)) + || (ntohl(iph->saddr) > ntohl(info->src.max_ip))) + ^ !!(info->flags & IPRANGE_SRC_INV)) { + DEBUGP("src IP %u.%u.%u.%u NOT in range %s" + "%u.%u.%u.%u-%u.%u.%u.%u\n", + NIPQUAD(iph->saddr), + info->flags & IPRANGE_SRC_INV ? "(INV) " : "", + NIPQUAD(info->src.min_ip), + NIPQUAD(info->src.max_ip)); + return 0; + } + } + if (info->flags & IPRANGE_DST) { + if (((ntohl(iph->daddr) < ntohl(info->dst.min_ip)) + || (ntohl(iph->daddr) > ntohl(info->dst.max_ip))) + ^ !!(info->flags & IPRANGE_DST_INV)) { + DEBUGP("dst IP %u.%u.%u.%u NOT in range %s" + "%u.%u.%u.%u-%u.%u.%u.%u\n", + NIPQUAD(iph->daddr), + info->flags & IPRANGE_DST_INV ? "(INV) " : "", + NIPQUAD(info->dst.min_ip), + NIPQUAD(info->dst.max_ip)); + return 0; + } + } + return 1; +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + /* verify size */ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_iprange_info))) + return 0; + + return 1; +} + +static struct ipt_match iprange_match = +{ + .list = { NULL, NULL }, + .name = "iprange", + .match = &match, + .checkentry = &check, + .destroy = NULL, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&iprange_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&iprange_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_length.c b/net/ipv4/netfilter/ipt_length.c new file mode 100644 index 000000000000..4eabcfbda9d1 --- /dev/null +++ b/net/ipv4/netfilter/ipt_length.c @@ -0,0 +1,64 @@ +/* Kernel module to match packet length. */ +/* (C) 1999-2001 James Morris <jmorros@intercode.com.au> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ipt_length.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); +MODULE_DESCRIPTION("IP tables packet length matching module"); +MODULE_LICENSE("GPL"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_length_info *info = matchinfo; + u_int16_t pktlen = ntohs(skb->nh.iph->tot_len); + + return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_length_info))) + return 0; + + return 1; +} + +static struct ipt_match length_match = { + .name = "length", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&length_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&length_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c new file mode 100644 index 000000000000..0c24dcc703a5 --- /dev/null +++ b/net/ipv4/netfilter/ipt_limit.c @@ -0,0 +1,157 @@ +/* Kernel module to control the rate + * + * 2 September 1999: Changed from the target RATE to the match + * `limit', removed logging. Did I mention that + * Alexey is a fucking genius? + * Rusty Russell (rusty@rustcorp.com.au). */ + +/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> + * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_limit.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); +MODULE_DESCRIPTION("iptables rate limit match"); + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +static DEFINE_SPINLOCK(limit_lock); + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To get the maxmum range, we multiply by this factor (ie. you get N + credits per jiffy). We want to allow a rate as low as 1 per day + (slowest userspace tool allows), which means + CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */ +#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) + +/* Repeated shift and or gives us all 1s, final shift and add 1 gives + * us the power of 2 below the theoretical max, so GCC simply does a + * shift. */ +#define _POW2_BELOW2(x) ((x)|((x)>>1)) +#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) +#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) +#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) +#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) + +#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + +static int +ipt_limit_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ipt_rateinfo *r = ((struct ipt_rateinfo *)matchinfo)->master; + unsigned long now = jiffies; + + spin_lock_bh(&limit_lock); + r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY; + if (r->credit > r->credit_cap) + r->credit = r->credit_cap; + + if (r->credit >= r->cost) { + /* We're not limited. */ + r->credit -= r->cost; + spin_unlock_bh(&limit_lock); + return 1; + } + + spin_unlock_bh(&limit_lock); + return 0; +} + +/* Precision saver. */ +static u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / IPT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / IPT_LIMIT_SCALE; +} + +static int +ipt_limit_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_rateinfo *r = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_rateinfo))) + return 0; + + /* Check for overflow. */ + if (r->burst == 0 + || user2credits(r->avg * r->burst) < user2credits(r->avg)) { + printk("Overflow in ipt_limit, try lower: %u/%u\n", + r->avg, r->burst); + return 0; + } + + /* User avg in seconds * IPT_LIMIT_SCALE: convert to jiffies * + 128. */ + r->prev = jiffies; + r->credit = user2credits(r->avg * r->burst); /* Credits full. */ + r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ + r->cost = user2credits(r->avg); + + /* For SMP, we only want to use one set of counters. */ + r->master = r; + + return 1; +} + +static struct ipt_match ipt_limit_reg = { + .name = "limit", + .match = ipt_limit_match, + .checkentry = ipt_limit_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (ipt_register_match(&ipt_limit_reg)) + return -EINVAL; + return 0; +} + +static void __exit fini(void) +{ + ipt_unregister_match(&ipt_limit_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c new file mode 100644 index 000000000000..11a459e33f25 --- /dev/null +++ b/net/ipv4/netfilter/ipt_mac.c @@ -0,0 +1,79 @@ +/* Kernel module to match MAC address parameters. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/if_ether.h> + +#include <linux/netfilter_ipv4/ipt_mac.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables mac matching module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_mac_info *info = matchinfo; + + /* Is mac pointer valid? */ + return (skb->mac.raw >= skb->head + && (skb->mac.raw + ETH_HLEN) <= skb->data + /* If so, compare... */ + && ((memcmp(eth_hdr(skb)->h_source, info->srcaddr, ETH_ALEN) + == 0) ^ info->invert)); +} + +static int +ipt_mac_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + /* FORWARD isn't always valid, but it's nice to be able to do --RR */ + if (hook_mask + & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN) + | (1 << NF_IP_FORWARD))) { + printk("ipt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n"); + return 0; + } + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_mac_info))) + return 0; + + return 1; +} + +static struct ipt_match mac_match = { + .name = "mac", + .match = &match, + .checkentry = &ipt_mac_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&mac_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&mac_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c new file mode 100644 index 000000000000..8955728127b9 --- /dev/null +++ b/net/ipv4/netfilter/ipt_mark.c @@ -0,0 +1,64 @@ +/* Kernel module to match NFMARK values. */ + +/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ipt_mark.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +MODULE_DESCRIPTION("iptables mark matching module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_mark_info *info = matchinfo; + + return ((skb->nfmark & info->mask) == info->mark) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) + return 0; + + return 1; +} + +static struct ipt_match mark_match = { + .name = "mark", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&mark_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&mark_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c new file mode 100644 index 000000000000..99e8188162e2 --- /dev/null +++ b/net/ipv4/netfilter/ipt_multiport.c @@ -0,0 +1,212 @@ +/* Kernel module to match one of a list of TCP/UDP ports: ports are in + the same place so we can treat them as equal. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/udp.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ipt_multiport.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables multiple port match module"); + +#if 0 +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +/* Returns 1 if the port is matched by the test, 0 otherwise. */ +static inline int +ports_match(const u_int16_t *portlist, enum ipt_multiport_flags flags, + u_int8_t count, u_int16_t src, u_int16_t dst) +{ + unsigned int i; + for (i=0; i<count; i++) { + if (flags != IPT_MULTIPORT_DESTINATION + && portlist[i] == src) + return 1; + + if (flags != IPT_MULTIPORT_SOURCE + && portlist[i] == dst) + return 1; + } + + return 0; +} + +/* Returns 1 if the port is matched by the test, 0 otherwise. */ +static inline int +ports_match_v1(const struct ipt_multiport_v1 *minfo, + u_int16_t src, u_int16_t dst) +{ + unsigned int i; + u_int16_t s, e; + + for (i=0; i < minfo->count; i++) { + s = minfo->ports[i]; + + if (minfo->pflags[i]) { + /* range port matching */ + e = minfo->ports[++i]; + duprintf("src or dst matches with %d-%d?\n", s, e); + + if (minfo->flags == IPT_MULTIPORT_SOURCE + && src >= s && src <= e) + return 1 ^ minfo->invert; + if (minfo->flags == IPT_MULTIPORT_DESTINATION + && dst >= s && dst <= e) + return 1 ^ minfo->invert; + if (minfo->flags == IPT_MULTIPORT_EITHER + && ((dst >= s && dst <= e) + || (src >= s && src <= e))) + return 1 ^ minfo->invert; + } else { + /* exact port matching */ + duprintf("src or dst matches with %d?\n", s); + + if (minfo->flags == IPT_MULTIPORT_SOURCE + && src == s) + return 1 ^ minfo->invert; + if (minfo->flags == IPT_MULTIPORT_DESTINATION + && dst == s) + return 1 ^ minfo->invert; + if (minfo->flags == IPT_MULTIPORT_EITHER + && (src == s || dst == s)) + return 1 ^ minfo->invert; + } + } + + return minfo->invert; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + u16 _ports[2], *pptr; + const struct ipt_multiport *multiinfo = matchinfo; + + if (offset) + return 0; + + pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_ports), _ports); + if (pptr == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("ipt_multiport:" + " Dropping evil offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return ports_match(multiinfo->ports, + multiinfo->flags, multiinfo->count, + ntohs(pptr[0]), ntohs(pptr[1])); +} + +static int +match_v1(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + u16 _ports[2], *pptr; + const struct ipt_multiport_v1 *multiinfo = matchinfo; + + if (offset) + return 0; + + pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_ports), _ports); + if (pptr == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("ipt_multiport:" + " Dropping evil offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1])); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport))); +} + +static int +checkentry_v1(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport_v1))); +} + +static struct ipt_match multiport_match = { + .name = "multiport", + .revision = 0, + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static struct ipt_match multiport_match_v1 = { + .name = "multiport", + .revision = 1, + .match = &match_v1, + .checkentry = &checkentry_v1, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int err; + + err = ipt_register_match(&multiport_match); + if (!err) { + err = ipt_register_match(&multiport_match_v1); + if (err) + ipt_unregister_match(&multiport_match); + } + + return err; +} + +static void __exit fini(void) +{ + ipt_unregister_match(&multiport_match); + ipt_unregister_match(&multiport_match_v1); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c new file mode 100644 index 000000000000..3b9065e06381 --- /dev/null +++ b/net/ipv4/netfilter/ipt_owner.c @@ -0,0 +1,217 @@ +/* Kernel module to match various things tied to sockets associated with + locally generated outgoing packets. */ + +/* (C) 2000 Marc Boucher <marc@mbsi.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/file.h> +#include <net/sock.h> + +#include <linux/netfilter_ipv4/ipt_owner.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +MODULE_DESCRIPTION("iptables owner match"); + +static int +match_comm(const struct sk_buff *skb, const char *comm) +{ + struct task_struct *g, *p; + struct files_struct *files; + int i; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if(strncmp(p->comm, comm, sizeof(p->comm))) + continue; + + task_lock(p); + files = p->files; + if(files) { + spin_lock(&files->file_lock); + for (i=0; i < files->max_fds; i++) { + if (fcheck_files(files, i) == + skb->sk->sk_socket->file) { + spin_unlock(&files->file_lock); + task_unlock(p); + read_unlock(&tasklist_lock); + return 1; + } + } + spin_unlock(&files->file_lock); + } + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + return 0; +} + +static int +match_pid(const struct sk_buff *skb, pid_t pid) +{ + struct task_struct *p; + struct files_struct *files; + int i; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto out; + task_lock(p); + files = p->files; + if(files) { + spin_lock(&files->file_lock); + for (i=0; i < files->max_fds; i++) { + if (fcheck_files(files, i) == + skb->sk->sk_socket->file) { + spin_unlock(&files->file_lock); + task_unlock(p); + read_unlock(&tasklist_lock); + return 1; + } + } + spin_unlock(&files->file_lock); + } + task_unlock(p); +out: + read_unlock(&tasklist_lock); + return 0; +} + +static int +match_sid(const struct sk_buff *skb, pid_t sid) +{ + struct task_struct *g, *p; + struct file *file = skb->sk->sk_socket->file; + int i, found=0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + struct files_struct *files; + if (p->signal->session != sid) + continue; + + task_lock(p); + files = p->files; + if (files) { + spin_lock(&files->file_lock); + for (i=0; i < files->max_fds; i++) { + if (fcheck_files(files, i) == file) { + found = 1; + break; + } + } + spin_unlock(&files->file_lock); + } + task_unlock(p); + if (found) + goto out; + } while_each_thread(g, p); +out: + read_unlock(&tasklist_lock); + + return found; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_owner_info *info = matchinfo; + + if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file) + return 0; + + if(info->match & IPT_OWNER_UID) { + if ((skb->sk->sk_socket->file->f_uid != info->uid) ^ + !!(info->invert & IPT_OWNER_UID)) + return 0; + } + + if(info->match & IPT_OWNER_GID) { + if ((skb->sk->sk_socket->file->f_gid != info->gid) ^ + !!(info->invert & IPT_OWNER_GID)) + return 0; + } + + if(info->match & IPT_OWNER_PID) { + if (!match_pid(skb, info->pid) ^ + !!(info->invert & IPT_OWNER_PID)) + return 0; + } + + if(info->match & IPT_OWNER_SID) { + if (!match_sid(skb, info->sid) ^ + !!(info->invert & IPT_OWNER_SID)) + return 0; + } + + if(info->match & IPT_OWNER_COMM) { + if (!match_comm(skb, info->comm) ^ + !!(info->invert & IPT_OWNER_COMM)) + return 0; + } + + return 1; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (hook_mask + & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { + printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); + return 0; + } + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_owner_info))) { + printk("Matchsize %u != %Zu\n", matchsize, + IPT_ALIGN(sizeof(struct ipt_owner_info))); + return 0; + } +#ifdef CONFIG_SMP + /* files->file_lock can not be used in a BH */ + if (((struct ipt_owner_info *)matchinfo)->match + & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { + printk("ipt_owner: pid, sid and command matching is broken " + "on SMP.\n"); + return 0; + } +#endif + return 1; +} + +static struct ipt_match owner_match = { + .name = "owner", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&owner_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&owner_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c new file mode 100644 index 000000000000..1a53924041fc --- /dev/null +++ b/net/ipv4/netfilter/ipt_physdev.c @@ -0,0 +1,134 @@ +/* Kernel module to match the bridge port in and + * out device for IP packets coming into contact with a bridge. */ + +/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv4/ipt_physdev.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_bridge.h> +#define MATCH 1 +#define NOMATCH 0 + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); +MODULE_DESCRIPTION("iptables bridge physical device match module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + int i; + static const char nulldevname[IFNAMSIZ]; + const struct ipt_physdev_info *info = matchinfo; + unsigned int ret; + const char *indev, *outdev; + struct nf_bridge_info *nf_bridge; + + /* Not a bridged IP packet or no info available yet: + * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if + * the destination device will be a bridge. */ + if (!(nf_bridge = skb->nf_bridge)) { + /* Return MATCH if the invert flags of the used options are on */ + if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) && + !(info->invert & IPT_PHYSDEV_OP_BRIDGED)) + return NOMATCH; + if ((info->bitmask & IPT_PHYSDEV_OP_ISIN) && + !(info->invert & IPT_PHYSDEV_OP_ISIN)) + return NOMATCH; + if ((info->bitmask & IPT_PHYSDEV_OP_ISOUT) && + !(info->invert & IPT_PHYSDEV_OP_ISOUT)) + return NOMATCH; + if ((info->bitmask & IPT_PHYSDEV_OP_IN) && + !(info->invert & IPT_PHYSDEV_OP_IN)) + return NOMATCH; + if ((info->bitmask & IPT_PHYSDEV_OP_OUT) && + !(info->invert & IPT_PHYSDEV_OP_OUT)) + return NOMATCH; + return MATCH; + } + + /* This only makes sense in the FORWARD and POSTROUTING chains */ + if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) && + (!!(nf_bridge->mask & BRNF_BRIDGED) ^ + !(info->invert & IPT_PHYSDEV_OP_BRIDGED))) + return NOMATCH; + + if ((info->bitmask & IPT_PHYSDEV_OP_ISIN && + (!nf_bridge->physindev ^ !!(info->invert & IPT_PHYSDEV_OP_ISIN))) || + (info->bitmask & IPT_PHYSDEV_OP_ISOUT && + (!nf_bridge->physoutdev ^ !!(info->invert & IPT_PHYSDEV_OP_ISOUT)))) + return NOMATCH; + + if (!(info->bitmask & IPT_PHYSDEV_OP_IN)) + goto match_outdev; + indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname; + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { + ret |= (((const unsigned int *)indev)[i] + ^ ((const unsigned int *)info->physindev)[i]) + & ((const unsigned int *)info->in_mask)[i]; + } + + if ((ret == 0) ^ !(info->invert & IPT_PHYSDEV_OP_IN)) + return NOMATCH; + +match_outdev: + if (!(info->bitmask & IPT_PHYSDEV_OP_OUT)) + return MATCH; + outdev = nf_bridge->physoutdev ? + nf_bridge->physoutdev->name : nulldevname; + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { + ret |= (((const unsigned int *)outdev)[i] + ^ ((const unsigned int *)info->physoutdev)[i]) + & ((const unsigned int *)info->out_mask)[i]; + } + + return (ret != 0) ^ !(info->invert & IPT_PHYSDEV_OP_OUT); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_physdev_info *info = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_physdev_info))) + return 0; + if (!(info->bitmask & IPT_PHYSDEV_OP_MASK) || + info->bitmask & ~IPT_PHYSDEV_OP_MASK) + return 0; + return 1; +} + +static struct ipt_match physdev_match = { + .name = "physdev", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&physdev_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&physdev_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_pkttype.c b/net/ipv4/netfilter/ipt_pkttype.c new file mode 100644 index 000000000000..8ddb1dc5e5ae --- /dev/null +++ b/net/ipv4/netfilter/ipt_pkttype.c @@ -0,0 +1,70 @@ +/* (C) 1999-2001 Michal Ludvig <michal@logix.cz> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> + +#include <linux/netfilter_ipv4/ipt_pkttype.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>"); +MODULE_DESCRIPTION("IP tables match to match on linklayer packet type"); + +static int match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_pkttype_info *info = matchinfo; + + return (skb->pkt_type == info->pkttype) ^ info->invert; +} + +static int checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ +/* + if (hook_mask + & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN) + | (1 << NF_IP_FORWARD))) { + printk("ipt_pkttype: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n"); + return 0; + } +*/ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_pkttype_info))) + return 0; + + return 1; +} + +static struct ipt_match pkttype_match = { + .name = "pkttype", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&pkttype_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&pkttype_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_realm.c b/net/ipv4/netfilter/ipt_realm.c new file mode 100644 index 000000000000..54a6897ebaa6 --- /dev/null +++ b/net/ipv4/netfilter/ipt_realm.c @@ -0,0 +1,76 @@ +/* IP tables module for matching the routing realm + * + * $Id: ipt_realm.c,v 1.3 2004/03/05 13:25:40 laforge Exp $ + * + * (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <net/route.h> + +#include <linux/netfilter_ipv4/ipt_realm.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("iptables realm match"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_realm_info *info = matchinfo; + struct dst_entry *dst = skb->dst; + + return (info->id == (dst->tclassid & info->mask)) ^ info->invert; +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (hook_mask + & ~((1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) | + (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN))) { + printk("ipt_realm: only valid for POST_ROUTING, LOCAL_OUT, " + "LOCAL_IN or FORWARD.\n"); + return 0; + } + if (matchsize != IPT_ALIGN(sizeof(struct ipt_realm_info))) { + printk("ipt_realm: invalid matchsize.\n"); + return 0; + } + return 1; +} + +static struct ipt_match realm_match = { + .name = "realm", + .match = match, + .checkentry = check, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&realm_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&realm_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c new file mode 100644 index 000000000000..25ab9fabdcba --- /dev/null +++ b/net/ipv4/netfilter/ipt_recent.c @@ -0,0 +1,1002 @@ +/* Kernel module to check if the source address has been seen recently. */ +/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */ +/* Author: Stephen Frost <sfrost@snowman.net> */ +/* Project Page: http://snowman.net/projects/ipt_recent/ */ +/* This software is distributed under the terms of the GPL, Version 2 */ +/* This copyright does not cover user programs that use kernel services + * by normal system calls. */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <asm/uaccess.h> +#include <linux/ctype.h> +#include <linux/ip.h> +#include <linux/vmalloc.h> +#include <linux/moduleparam.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_recent.h> + +#undef DEBUG +#define HASH_LOG 9 + +/* Defaults, these can be overridden on the module command-line. */ +static int ip_list_tot = 100; +static int ip_pkt_list_tot = 20; +static int ip_list_hash_size = 0; +static int ip_list_perms = 0644; +#ifdef DEBUG +static int debug = 1; +#endif + +static char version[] = +KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>. http://snowman.net/projects/ipt_recent/\n"; + +MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>"); +MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER); +MODULE_LICENSE("GPL"); +module_param(ip_list_tot, int, 0400); +module_param(ip_pkt_list_tot, int, 0400); +module_param(ip_list_hash_size, int, 0400); +module_param(ip_list_perms, int, 0400); +#ifdef DEBUG +module_param(debug, int, 0600); +MODULE_PARM_DESC(debug,"debugging level, defaults to 1"); +#endif +MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); +MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember"); +MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs"); +MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files"); + +/* Structure of our list of recently seen addresses. */ +struct recent_ip_list { + u_int32_t addr; + u_int8_t ttl; + unsigned long last_seen; + unsigned long *last_pkts; + u_int32_t oldest_pkt; + u_int32_t hash_entry; + u_int32_t time_pos; +}; + +struct time_info_list { + u_int32_t position; + u_int32_t time; +}; + +/* Structure of our linked list of tables of recent lists. */ +struct recent_ip_tables { + char name[IPT_RECENT_NAME_LEN]; + int count; + int time_pos; + struct recent_ip_list *table; + struct recent_ip_tables *next; + spinlock_t list_lock; + int *hash_table; + struct time_info_list *time_info; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *status_proc; +#endif /* CONFIG_PROC_FS */ +}; + +/* Our current list of addresses we have recently seen. + * Only added to on a --set, and only updated on --set || --update + */ +static struct recent_ip_tables *r_tables = NULL; + +/* We protect r_list with this spinlock so two processors are not modifying + * the list at the same time. + */ +static DEFINE_SPINLOCK(recent_lock); + +#ifdef CONFIG_PROC_FS +/* Our /proc/net/ipt_recent entry */ +static struct proc_dir_entry *proc_net_ipt_recent = NULL; +#endif + +/* Function declaration for later. */ +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop); + +/* Function to hash a given address into the hash table of table_size size */ +static int hash_func(unsigned int addr, int table_size) +{ + int result = 0; + unsigned int value = addr; + do { result ^= value; } while((value >>= HASH_LOG)); + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n", + result & (table_size - 1), + addr, + table_size); +#endif + + return(result & (table_size - 1)); +} + +#ifdef CONFIG_PROC_FS +/* This is the function which produces the output for our /proc output + * interface which lists each IP address, the last seen time and the + * other recent times the address was seen. + */ + +static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data) +{ + int len = 0, count, last_len = 0, pkt_count; + off_t pos = 0; + off_t begin = 0; + struct recent_ip_tables *curr_table; + + curr_table = (struct recent_ip_tables*) data; + + spin_lock_bh(&curr_table->list_lock); + for(count = 0; count < ip_list_tot; count++) { + if(!curr_table->table[count].addr) continue; + last_len = len; + len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr)); + len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl); + len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen); + len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt); + len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]); + for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) { + if(!curr_table->table[count].last_pkts[pkt_count]) break; + len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]); + } + len += sprintf(buffer+len,"\n"); + pos = begin + len; + if(pos < offset) { len = 0; begin = pos; } + if(pos > offset + length) { len = last_len; break; } + } + + *start = buffer + (offset - begin); + len -= (offset - begin); + if(len > length) len = length; + + spin_unlock_bh(&curr_table->list_lock); + return len; +} + +/* ip_recent_ctrl provides an interface for users to modify the table + * directly. This allows adding entries, removing entries, and + * flushing the entire table. + * This is done by opening up the appropriate table for writing and + * sending one of: + * xx.xx.xx.xx -- Add entry to table with current time + * +xx.xx.xx.xx -- Add entry to table with current time + * -xx.xx.xx.xx -- Remove entry from table + * clear -- Flush table, remove all entries + */ + +static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data) +{ + static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff }; + u_int32_t val; + int base, used = 0; + char c, *cp; + union iaddr { + uint8_t bytes[4]; + uint32_t word; + } res; + uint8_t *pp = res.bytes; + int digit; + + char buffer[20]; + int len, check_set = 0, count; + u_int32_t addr = 0; + struct sk_buff *skb; + struct ipt_recent_info *info; + struct recent_ip_tables *curr_table; + + curr_table = (struct recent_ip_tables*) data; + + if(size > 20) len = 20; else len = size; + + if(copy_from_user(buffer,input,len)) return -EFAULT; + + if(len < 20) buffer[len] = '\0'; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer); +#endif + + cp = buffer; + while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; } + + /* Check if we are asked to flush the entire table */ + if(!memcmp(cp,"clear",5)) { + used += 5; + spin_lock_bh(&curr_table->list_lock); + curr_table->time_pos = 0; + for(count = 0; count < ip_list_hash_size; count++) { + curr_table->hash_table[count] = -1; + } + for(count = 0; count < ip_list_tot; count++) { + curr_table->table[count].last_seen = 0; + curr_table->table[count].addr = 0; + curr_table->table[count].ttl = 0; + memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); + curr_table->table[count].oldest_pkt = 0; + curr_table->table[count].time_pos = 0; + curr_table->time_info[count].position = count; + curr_table->time_info[count].time = 0; + } + spin_unlock_bh(&curr_table->list_lock); + return used; + } + + check_set = IPT_RECENT_SET; + switch(*cp) { + case '+': check_set = IPT_RECENT_SET; cp++; used++; break; + case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break; + default: if(!isdigit(*cp)) return (used+1); break; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set); +#endif + /* Get addr (effectively inet_aton()) */ + /* Shamelessly stolen from libc, a function in the kernel for doing + * this would, of course, be greatly preferred, but our options appear + * to be rather limited, so we will just do it ourselves here. + */ + res.word = 0; + + c = *cp; + for(;;) { + if(!isdigit(c)) return used; + val = 0; base = 10; digit = 0; + if(c == '0') { + c = *++cp; + if(c == 'x' || c == 'X') base = 16, c = *++cp; + else { base = 8; digit = 1; } + } + for(;;) { + if(isascii(c) && isdigit(c)) { + if(base == 8 && (c == '8' || c == '0')) return used; + val = (val * base) + (c - '0'); + c = *++cp; + digit = 1; + } else if(base == 16 && isascii(c) && isxdigit(c)) { + val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A')); + c = *++cp; + digit = 1; + } else break; + } + if(c == '.') { + if(pp > res.bytes + 2 || val > 0xff) return used; + *pp++ = val; + c = *++cp; + } else break; + } + used = cp - buffer; + if(c != '\0' && (!isascii(c) || !isspace(c))) return used; + if(c == '\n') used++; + if(!digit) return used; + + if(val > max[pp - res.bytes]) return used; + addr = res.word | htonl(val); + + if(!addr && check_set == IPT_RECENT_SET) return used; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used); +#endif + + /* Set up and just call match */ + info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL); + if(!info) { return -ENOMEM; } + info->seconds = 0; + info->hit_count = 0; + info->check_set = check_set; + info->invert = 0; + info->side = IPT_RECENT_SOURCE; + strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN); + info->name[IPT_RECENT_NAME_LEN-1] = '\0'; + + skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL); + if (!skb) { + used = -ENOMEM; + goto out_free_info; + } + skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL); + if (!skb->nh.iph) { + used = -ENOMEM; + goto out_free_skb; + } + + skb->nh.iph->saddr = addr; + skb->nh.iph->daddr = 0; + /* Clear ttl since we have no way of knowing it */ + skb->nh.iph->ttl = 0; + match(skb,NULL,NULL,info,0,NULL); + + kfree(skb->nh.iph); +out_free_skb: + kfree(skb); +out_free_info: + kfree(info); + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used); +#endif + return used; +} + +#endif /* CONFIG_PROC_FS */ + +/* 'match' is our primary function, called by the kernel whenever a rule is + * hit with our module as an option to it. + * What this function does depends on what was specifically asked of it by + * the user: + * --set -- Add or update last seen time of the source address of the packet + * -- matchinfo->check_set == IPT_RECENT_SET + * --rcheck -- Just check if the source address is in the list + * -- matchinfo->check_set == IPT_RECENT_CHECK + * --update -- If the source address is in the list, update last_seen + * -- matchinfo->check_set == IPT_RECENT_UPDATE + * --remove -- If the source address is in the list, remove it + * -- matchinfo->check_set == IPT_RECENT_REMOVE + * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds + * -- matchinfo->seconds + * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times + * -- matchinfo->hit_count + * --seconds and --hitcount can be combined + */ +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + int pkt_count, hits_found, ans; + unsigned long now; + const struct ipt_recent_info *info = matchinfo; + u_int32_t addr = 0, time_temp; + u_int8_t ttl = skb->nh.iph->ttl; + int *hash_table; + int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1; + struct time_info_list *time_info; + struct recent_ip_tables *curr_table; + struct recent_ip_tables *last_table; + struct recent_ip_list *r_list; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n"); +#endif + + /* Default is false ^ info->invert */ + ans = info->invert; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name); +#endif + + /* if out != NULL then routing has been done and TTL changed. + * We change it back here internally for match what came in before routing. */ + if(out) ttl++; + + /* Find the right table */ + spin_lock_bh(&recent_lock); + curr_table = r_tables; + while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) ); + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name); +#endif + + spin_unlock_bh(&recent_lock); + + /* Table with this name not found, match impossible */ + if(!curr_table) { return ans; } + + /* Make sure no one is changing the list while we work with it */ + spin_lock_bh(&curr_table->list_lock); + + r_list = curr_table->table; + if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr; + + if(!addr) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr); +#endif + spin_unlock_bh(&curr_table->list_lock); + return ans; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl); +#endif + + /* Get jiffies now in case they changed while we were waiting for a lock */ + now = jiffies; + hash_table = curr_table->hash_table; + time_info = curr_table->time_info; + + orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size); + /* Hash entry at this result used */ + /* Check for TTL match if requested. If TTL is zero then a match would never + * happen, so match regardless of existing TTL in that case. Zero means the + * entry was added via the /proc interface anyway, so we will just use the + * first TTL we get for that IP address. */ + if(info->check_set & IPT_RECENT_TTL) { + while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr && + (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) { + /* Collision in hash table */ + hash_result = (hash_result + 1) % ip_list_hash_size; + } + } else { + while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) { + /* Collision in hash table */ + hash_result = (hash_result + 1) % ip_list_hash_size; + } + } + + if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) { + /* IP not in list and not asked to SET */ + spin_unlock_bh(&curr_table->list_lock); + return ans; + } + + /* Check if we need to handle the collision, do not need to on REMOVE */ + if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n", + orig_hash_result, + hash_result, + r_list[hash_table[orig_hash_result]].addr, + addr); +#endif + + /* We had a collision. + * orig_hash_result is where we started, hash_result is where we ended up. + * So, swap them because we are likely to see the same guy again sooner */ +#ifdef DEBUG + if(debug) { + printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]); + printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n", + r_list[hash_table[orig_hash_result]].hash_entry); + } +#endif + + r_list[hash_table[orig_hash_result]].hash_entry = hash_result; + + + temp = hash_table[orig_hash_result]; +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]); +#endif + hash_table[orig_hash_result] = hash_table[hash_result]; + hash_table[hash_result] = temp; + temp = hash_result; + hash_result = orig_hash_result; + orig_hash_result = temp; + time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result]; + if(hash_table[hash_result] != -1) { + r_list[hash_table[hash_result]].hash_entry = hash_result; + time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n"); +#endif + } + + if(hash_table[hash_result] == -1) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n", + hash_result, addr); +#endif + + /* New item found and IPT_RECENT_SET, so we need to add it */ + location = time_info[curr_table->time_pos].position; + hash_table[r_list[location].hash_entry] = -1; + hash_table[hash_result] = location; + memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); + r_list[location].time_pos = curr_table->time_pos; + r_list[location].addr = addr; + r_list[location].ttl = ttl; + r_list[location].last_seen = now; + r_list[location].oldest_pkt = 1; + r_list[location].last_pkts[0] = now; + r_list[location].hash_entry = hash_result; + time_info[curr_table->time_pos].time = r_list[location].last_seen; + curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot; + + ans = !info->invert; + } else { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n", + hash_result, + addr); +#endif + + /* Existing item found */ + location = hash_table[hash_result]; + /* We have a match on address, now to make sure it meets all requirements for a + * full match. */ + if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) { + if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert; + if(info->seconds && !info->hit_count) { + if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert; + } + if(info->seconds && info->hit_count) { + for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { + if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++; + } + if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; + } + if(info->hit_count && !info->seconds) { + for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { + if(r_list[location].last_pkts[pkt_count] == 0) break; + hits_found++; + } + if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; + } + } +#ifdef DEBUG + if(debug) { + if(ans) + printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr); + else + printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr); + } +#endif + + /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the + * current timestamp to the last_seen. */ + if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n"); +#endif + /* Have to update our time info */ + time_loc = r_list[location].time_pos; + time_info[time_loc].time = now; + time_info[time_loc].position = location; + while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { + time_temp = time_info[time_loc].time; + time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; + time_info[(time_loc+1)%ip_list_tot].time = time_temp; + time_temp = time_info[time_loc].position; + time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; + time_info[(time_loc+1)%ip_list_tot].position = time_temp; + r_list[time_info[time_loc].position].time_pos = time_loc; + r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; + time_loc = (time_loc+1) % ip_list_tot; + } + r_list[location].time_pos = time_loc; + r_list[location].ttl = ttl; + r_list[location].last_pkts[r_list[location].oldest_pkt] = now; + r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot; + r_list[location].last_seen = now; + } + /* If we have been asked to remove the entry from the list, just set it to 0 */ + if(info->check_set & IPT_RECENT_REMOVE) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result); +#endif + /* Check if this is part of a collision chain */ + while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) { + orig_hash_result++; + if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) { + /* Found collision chain, how deep does this rabbit hole go? */ +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n"); +#endif + end_collision_chain = orig_hash_result; + } + } + if(end_collision_chain != -1) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n"); +#endif + /* Part of a collision chain, swap it with the end of the chain + * before removing. */ + r_list[hash_table[end_collision_chain]].hash_entry = hash_result; + temp = hash_table[end_collision_chain]; + hash_table[end_collision_chain] = hash_table[hash_result]; + hash_table[hash_result] = temp; + time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; + hash_result = end_collision_chain; + r_list[hash_table[hash_result]].hash_entry = hash_result; + time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; + } + location = hash_table[hash_result]; + hash_table[r_list[location].hash_entry] = -1; + time_loc = r_list[location].time_pos; + time_info[time_loc].time = 0; + time_info[time_loc].position = location; + while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { + time_temp = time_info[time_loc].time; + time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; + time_info[(time_loc+1)%ip_list_tot].time = time_temp; + time_temp = time_info[time_loc].position; + time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; + time_info[(time_loc+1)%ip_list_tot].position = time_temp; + r_list[time_info[time_loc].position].time_pos = time_loc; + r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; + time_loc = (time_loc+1) % ip_list_tot; + } + r_list[location].time_pos = time_loc; + r_list[location].last_seen = 0; + r_list[location].addr = 0; + r_list[location].ttl = 0; + memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); + r_list[location].oldest_pkt = 0; + ans = !info->invert; + } + spin_unlock_bh(&curr_table->list_lock); + return ans; + } + + spin_unlock_bh(&curr_table->list_lock); +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n"); +#endif + return ans; +} + +/* This function is to verify that the rule given during the userspace iptables + * command is correct. + * If the command is valid then we check if the table name referred to by the + * rule exists, if not it is created. + */ +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + int flag = 0, c; + unsigned long *hold; + const struct ipt_recent_info *info = matchinfo; + struct recent_ip_tables *curr_table, *find_table, *last_table; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n"); +#endif + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return 0; + + /* seconds and hit_count only valid for CHECK/UPDATE */ + if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; } + if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; } + if(info->check_set & IPT_RECENT_CHECK) flag++; + if(info->check_set & IPT_RECENT_UPDATE) flag++; + + /* One and only one of these should ever be set */ + if(flag != 1) return 0; + + /* Name must be set to something */ + if(!info->name || !info->name[0]) return 0; + + /* Things look good, create a list for this if it does not exist */ + /* Lock the linked list while we play with it */ + spin_lock_bh(&recent_lock); + + /* Look for an entry with this name already created */ + /* Finds the end of the list and the entry before the end if current name does not exist */ + find_table = r_tables; + while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); + + /* If a table already exists just increment the count on that table and return */ + if(find_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name); +#endif + find_table->count++; + spin_unlock_bh(&recent_lock); + return 1; + } + + spin_unlock_bh(&recent_lock); + + /* Table with this name not found */ + /* Allocate memory for new linked list item */ + +#ifdef DEBUG + if(debug) { + printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name); + printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables)); + } +#endif + + curr_table = vmalloc(sizeof(struct recent_ip_tables)); + if(curr_table == NULL) return 0; + + spin_lock_init(&curr_table->list_lock); + curr_table->next = NULL; + curr_table->count = 1; + curr_table->time_pos = 0; + strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN); + curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0'; + + /* Allocate memory for this table and the list of packets in each entry. */ +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n", + sizeof(struct recent_ip_list)*ip_list_tot, + info->name); +#endif + + curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot); + if(curr_table->table == NULL) { vfree(curr_table); return 0; } + memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot); +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n", + sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot); +#endif + + hold = vmalloc(sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot); +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n"); +#endif + if(hold == NULL) { + printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n"); + vfree(curr_table->table); + vfree(curr_table); + return 0; + } + for(c = 0; c < ip_list_tot; c++) { + curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot; + } + + /* Allocate memory for the hash table */ +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n", + sizeof(int)*ip_list_hash_size); +#endif + + curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size); + if(!curr_table->hash_table) { + printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n"); + vfree(hold); + vfree(curr_table->table); + vfree(curr_table); + return 0; + } + + for(c = 0; c < ip_list_hash_size; c++) { + curr_table->hash_table[c] = -1; + } + + /* Allocate memory for the time info */ +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n", + sizeof(struct time_info_list)*ip_list_tot); +#endif + + curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot); + if(!curr_table->time_info) { + printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n"); + vfree(curr_table->hash_table); + vfree(hold); + vfree(curr_table->table); + vfree(curr_table); + return 0; + } + for(c = 0; c < ip_list_tot; c++) { + curr_table->time_info[c].position = c; + curr_table->time_info[c].time = 0; + } + + /* Put the new table in place */ + spin_lock_bh(&recent_lock); + find_table = r_tables; + while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); + + /* If a table already exists just increment the count on that table and return */ + if(find_table) { + find_table->count++; + spin_unlock_bh(&recent_lock); +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name); +#endif + vfree(curr_table->time_info); + vfree(curr_table->hash_table); + vfree(hold); + vfree(curr_table->table); + vfree(curr_table); + return 1; + } + if(!last_table) r_tables = curr_table; else last_table->next = curr_table; + + spin_unlock_bh(&recent_lock); + +#ifdef CONFIG_PROC_FS + /* Create our proc 'status' entry. */ + curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent); + if (!curr_table->status_proc) { + printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n"); + /* Destroy the created table */ + spin_lock_bh(&recent_lock); + last_table = NULL; + curr_table = r_tables; + if(!curr_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n"); +#endif + spin_unlock_bh(&recent_lock); + return 0; + } + while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); + if(!curr_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n"); +#endif + spin_unlock_bh(&recent_lock); + return 0; + } + if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; + spin_unlock_bh(&recent_lock); + vfree(curr_table->time_info); + vfree(curr_table->hash_table); + vfree(hold); + vfree(curr_table->table); + vfree(curr_table); + return 0; + } + + curr_table->status_proc->owner = THIS_MODULE; + curr_table->status_proc->data = curr_table; + wmb(); + curr_table->status_proc->read_proc = ip_recent_get_info; + curr_table->status_proc->write_proc = ip_recent_ctrl; +#endif /* CONFIG_PROC_FS */ + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n"); +#endif + + return 1; +} + +/* This function is called in the event that a rule matching this module is + * removed. + * When this happens we need to check if there are no other rules matching + * the table given. If that is the case then we remove the table and clean + * up its memory. + */ +static void +destroy(void *matchinfo, unsigned int matchsize) +{ + const struct ipt_recent_info *info = matchinfo; + struct recent_ip_tables *curr_table, *last_table; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n"); +#endif + + if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return; + + /* Lock the linked list while we play with it */ + spin_lock_bh(&recent_lock); + + /* Look for an entry with this name already created */ + /* Finds the end of the list and the entry before the end if current name does not exist */ + last_table = NULL; + curr_table = r_tables; + if(!curr_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n"); +#endif + spin_unlock_bh(&recent_lock); + return; + } + while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); + + /* If a table does not exist then do nothing and return */ + if(!curr_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n"); +#endif + spin_unlock_bh(&recent_lock); + return; + } + + curr_table->count--; + + /* If count is still non-zero then there are still rules referenceing it so we do nothing */ + if(curr_table->count) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n"); +#endif + spin_unlock_bh(&recent_lock); + return; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n"); +#endif + + /* Count must be zero so we remove this table from the list */ + if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; + + spin_unlock_bh(&recent_lock); + + /* lock to make sure any late-runners still using this after we removed it from + * the list finish up then remove everything */ + spin_lock_bh(&curr_table->list_lock); + spin_unlock_bh(&curr_table->list_lock); + +#ifdef CONFIG_PROC_FS + if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent); +#endif /* CONFIG_PROC_FS */ + vfree(curr_table->table[0].last_pkts); + vfree(curr_table->table); + vfree(curr_table->hash_table); + vfree(curr_table->time_info); + vfree(curr_table); + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n"); +#endif + + return; +} + +/* This is the structure we pass to ipt_register to register our + * module with iptables. + */ +static struct ipt_match recent_match = { + .name = "recent", + .match = &match, + .checkentry = &checkentry, + .destroy = &destroy, + .me = THIS_MODULE +}; + +/* Kernel module initialization. */ +static int __init init(void) +{ + int err, count; + + printk(version); +#ifdef CONFIG_PROC_FS + proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net); + if(!proc_net_ipt_recent) return -ENOMEM; +#endif + + if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) { + printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n"); + ip_list_hash_size = 0; + } + + if(!ip_list_hash_size) { + ip_list_hash_size = ip_list_tot*3; + count = 2*2; + while(ip_list_hash_size > count) count = count*2; + ip_list_hash_size = count; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size); +#endif + + err = ipt_register_match(&recent_match); + if (err) + remove_proc_entry("ipt_recent", proc_net); + return err; +} + +/* Kernel module destruction. */ +static void __exit fini(void) +{ + ipt_unregister_match(&recent_match); + + remove_proc_entry("ipt_recent",proc_net); +} + +/* Register our module with the kernel. */ +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_sctp.c b/net/ipv4/netfilter/ipt_sctp.c new file mode 100644 index 000000000000..fe2b327bcaa4 --- /dev/null +++ b/net/ipv4/netfilter/ipt_sctp.c @@ -0,0 +1,203 @@ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/ip.h> +#include <linux/sctp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_sctp.h> + +#ifdef DEBUG_SCTP +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ + || (!!((invflag) & (option)) ^ (cond))) + +static int +match_flags(const struct ipt_sctp_flag_info *flag_info, + const int flag_count, + u_int8_t chunktype, + u_int8_t chunkflags) +{ + int i; + + for (i = 0; i < flag_count; i++) { + if (flag_info[i].chunktype == chunktype) { + return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag; + } + } + + return 1; +} + +static int +match_packet(const struct sk_buff *skb, + const u_int32_t *chunkmap, + int chunk_match_type, + const struct ipt_sctp_flag_info *flag_info, + const int flag_count, + int *hotdrop) +{ + int offset; + u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)]; + sctp_chunkhdr_t _sch, *sch; + +#ifdef DEBUG_SCTP + int i = 0; +#endif + + if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) { + SCTP_CHUNKMAP_COPY(chunkmapcopy, chunkmap); + } + + offset = skb->nh.iph->ihl * 4 + sizeof (sctp_sctphdr_t); + do { + sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch); + if (sch == NULL) { + duprintf("Dropping invalid SCTP packet.\n"); + *hotdrop = 1; + return 0; + } + + duprintf("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d\tflags: %x\n", + ++i, offset, sch->type, htons(sch->length), sch->flags); + + offset += (htons(sch->length) + 3) & ~3; + + duprintf("skb->len: %d\toffset: %d\n", skb->len, offset); + + if (SCTP_CHUNKMAP_IS_SET(chunkmap, sch->type)) { + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ANY: + if (match_flags(flag_info, flag_count, + sch->type, sch->flags)) { + return 1; + } + break; + + case SCTP_CHUNK_MATCH_ALL: + if (match_flags(flag_info, flag_count, + sch->type, sch->flags)) { + SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type); + } + break; + + case SCTP_CHUNK_MATCH_ONLY: + if (!match_flags(flag_info, flag_count, + sch->type, sch->flags)) { + return 0; + } + break; + } + } else { + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ONLY: + return 0; + } + } + } while (offset < skb->len); + + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ALL: + return SCTP_CHUNKMAP_IS_CLEAR(chunkmap); + case SCTP_CHUNK_MATCH_ANY: + return 0; + case SCTP_CHUNK_MATCH_ONLY: + return 1; + } + + /* This will never be reached, but required to stop compiler whine */ + return 0; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_sctp_info *info; + sctp_sctphdr_t _sh, *sh; + + info = (const struct ipt_sctp_info *)matchinfo; + + if (offset) { + duprintf("Dropping non-first fragment.. FIXME\n"); + return 0; + } + + sh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_sh), &_sh); + if (sh == NULL) { + duprintf("Dropping evil TCP offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest)); + + return SCCHECK(((ntohs(sh->source) >= info->spts[0]) + && (ntohs(sh->source) <= info->spts[1])), + IPT_SCTP_SRC_PORTS, info->flags, info->invflags) + && SCCHECK(((ntohs(sh->dest) >= info->dpts[0]) + && (ntohs(sh->dest) <= info->dpts[1])), + IPT_SCTP_DEST_PORTS, info->flags, info->invflags) + && SCCHECK(match_packet(skb, info->chunkmap, info->chunk_match_type, + info->flag_info, info->flag_count, + hotdrop), + IPT_SCTP_CHUNK_TYPES, info->flags, info->invflags); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_sctp_info *info; + + info = (const struct ipt_sctp_info *)matchinfo; + + return ip->proto == IPPROTO_SCTP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_sctp_info)) + && !(info->flags & ~IPT_SCTP_VALID_FLAGS) + && !(info->invflags & ~IPT_SCTP_VALID_FLAGS) + && !(info->invflags & ~info->flags) + && ((!(info->flags & IPT_SCTP_CHUNK_TYPES)) || + (info->chunk_match_type & + (SCTP_CHUNK_MATCH_ALL + | SCTP_CHUNK_MATCH_ANY + | SCTP_CHUNK_MATCH_ONLY))); +} + +static struct ipt_match sctp_match = +{ + .list = { NULL, NULL}, + .name = "sctp", + .match = &match, + .checkentry = &checkentry, + .destroy = NULL, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&sctp_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&sctp_match); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Match for SCTP protocol packets"); + diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c new file mode 100644 index 000000000000..b1511b97ea5f --- /dev/null +++ b/net/ipv4/netfilter/ipt_state.c @@ -0,0 +1,74 @@ +/* Kernel module to match connection tracking information. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_state.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +MODULE_DESCRIPTION("iptables connection tracking state match module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_state_info *sinfo = matchinfo; + enum ip_conntrack_info ctinfo; + unsigned int statebit; + + if (skb->nfct == &ip_conntrack_untracked.ct_general) + statebit = IPT_STATE_UNTRACKED; + else if (!ip_conntrack_get(skb, &ctinfo)) + statebit = IPT_STATE_INVALID; + else + statebit = IPT_STATE_BIT(ctinfo); + + return (sinfo->statemask & statebit); +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_state_info))) + return 0; + + return 1; +} + +static struct ipt_match state_match = { + .name = "state", + .match = &match, + .checkentry = &check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + need_ip_conntrack(); + return ipt_register_match(&state_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&state_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_tcpmss.c b/net/ipv4/netfilter/ipt_tcpmss.c new file mode 100644 index 000000000000..4dc9b16ab4a3 --- /dev/null +++ b/net/ipv4/netfilter/ipt_tcpmss.c @@ -0,0 +1,127 @@ +/* Kernel module to match TCP MSS values. */ + +/* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv4/ipt_tcpmss.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +#define TH_SYN 0x02 + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +MODULE_DESCRIPTION("iptables TCP MSS match module"); + +/* Returns 1 if the mss option is set and matched by the range, 0 otherwise */ +static inline int +mssoption_match(u_int16_t min, u_int16_t max, + const struct sk_buff *skb, + int invert, + int *hotdrop) +{ + struct tcphdr _tcph, *th; + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + u8 _opt[15 * 4 - sizeof(_tcph)], *op; + unsigned int i, optlen; + + /* If we don't have the whole header, drop packet. */ + th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) + goto dropit; + + /* Malformed. */ + if (th->doff*4 < sizeof(*th)) + goto dropit; + + optlen = th->doff*4 - sizeof(*th); + if (!optlen) + goto out; + + /* Truncated options. */ + op = skb_header_pointer(skb, skb->nh.iph->ihl * 4 + sizeof(*th), + optlen, _opt); + if (op == NULL) + goto dropit; + + for (i = 0; i < optlen; ) { + if (op[i] == TCPOPT_MSS + && (optlen - i) >= TCPOLEN_MSS + && op[i+1] == TCPOLEN_MSS) { + u_int16_t mssval; + + mssval = (op[i+2] << 8) | op[i+3]; + + return (mssval >= min && mssval <= max) ^ invert; + } + if (op[i] < 2) i++; + else i += op[i+1]?:1; + } +out: + return invert; + + dropit: + *hotdrop = 1; + return 0; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_tcpmss_match_info *info = matchinfo; + + return mssoption_match(info->mss_min, info->mss_max, skb, + info->invert, hotdrop); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_tcpmss_match_info))) + return 0; + + /* Must specify -p tcp */ + if (ip->proto != IPPROTO_TCP || (ip->invflags & IPT_INV_PROTO)) { + printk("tcpmss: Only works on TCP packets\n"); + return 0; + } + + return 1; +} + +static struct ipt_match tcpmss_match = { + .name = "tcpmss", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&tcpmss_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&tcpmss_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c new file mode 100644 index 000000000000..086a1bb61e3e --- /dev/null +++ b/net/ipv4/netfilter/ipt_tos.c @@ -0,0 +1,64 @@ +/* Kernel module to match TOS values. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ipt_tos.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("iptables TOS match module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_tos_info *info = matchinfo; + + return (skb->nh.iph->tos == info->tos) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_tos_info))) + return 0; + + return 1; +} + +static struct ipt_match tos_match = { + .name = "tos", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&tos_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&tos_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c new file mode 100644 index 000000000000..219aa9de88cc --- /dev/null +++ b/net/ipv4/netfilter/ipt_ttl.c @@ -0,0 +1,79 @@ +/* IP tables module for matching the value of the TTL + * + * ipt_ttl.c,v 1.5 2000/11/13 11:16:08 laforge Exp + * + * (C) 2000,2001 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ipt_ttl.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("IP tables TTL matching module"); +MODULE_LICENSE("GPL"); + +static int match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, + int offset, int *hotdrop) +{ + const struct ipt_ttl_info *info = matchinfo; + + switch (info->mode) { + case IPT_TTL_EQ: + return (skb->nh.iph->ttl == info->ttl); + break; + case IPT_TTL_NE: + return (!(skb->nh.iph->ttl == info->ttl)); + break; + case IPT_TTL_LT: + return (skb->nh.iph->ttl < info->ttl); + break; + case IPT_TTL_GT: + return (skb->nh.iph->ttl > info->ttl); + break; + default: + printk(KERN_WARNING "ipt_ttl: unknown mode %d\n", + info->mode); + return 0; + } + + return 0; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_ttl_info))) + return 0; + + return 1; +} + +static struct ipt_match ttl_match = { + .name = "ttl", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&ttl_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&ttl_match); + +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c new file mode 100644 index 000000000000..260a4f0a2a90 --- /dev/null +++ b/net/ipv4/netfilter/iptable_filter.c @@ -0,0 +1,194 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables filter table"); + +#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT)) + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +} initial_table __initdata += { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + { [NF_IP_LOCAL_IN] = 0, + [NF_IP_FORWARD] = sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, + { [NF_IP_LOCAL_IN] = 0, + [NF_IP_FORWARD] = sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, + 0, NULL, { } }, + { + /* LOCAL_IN */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* FORWARD */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_error), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ipt_table packet_filter = { + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ipt_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static unsigned int +ipt_local_out_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + + return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static struct nf_hook_ops ipt_ops[] = { + { + .hook = ipt_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_FILTER, + }, + { + .hook = ipt_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = NF_IP_PRI_FILTER, + }, + { + .hook = ipt_local_out_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_FILTER, + }, +}; + +/* Default to forward because I got too much mail already. */ +static int forward = NF_ACCEPT; +module_param(forward, bool, 0000); + +static int __init init(void) +{ + int ret; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + printk("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + /* Entry 1 is the FORWARD hook */ + initial_table.entries[1].target.verdict = -forward - 1; + + /* Register table */ + ret = ipt_register_table(&packet_filter, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + ret = nf_register_hook(&ipt_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + return ret; + + cleanup_hook1: + nf_unregister_hook(&ipt_ops[1]); + cleanup_hook0: + nf_unregister_hook(&ipt_ops[0]); + cleanup_table: + ipt_unregister_table(&packet_filter); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ipt_ops[i]); + + ipt_unregister_table(&packet_filter); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c new file mode 100644 index 000000000000..160eb11b6e2f --- /dev/null +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -0,0 +1,260 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Extended to all five netfilter hooks by Brad Chapman & Harald Welte + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/route.h> +#include <linux/ip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables mangle table"); + +#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | \ + (1 << NF_IP_LOCAL_IN) | \ + (1 << NF_IP_FORWARD) | \ + (1 << NF_IP_LOCAL_OUT) | \ + (1 << NF_IP_POST_ROUTING)) + +/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */ +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[5]; + struct ipt_error term; +} initial_table __initdata += { { "mangle", MANGLE_VALID_HOOKS, 6, + sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error), + { [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard), + [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 }, + { [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard), + [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 }, + 0, NULL, { } }, + { + /* PRE_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_IN */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* FORWARD */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* POST_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + }, + /* ERROR */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_error), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ipt_table packet_mangler = { + .name = "mangle", + .valid_hooks = MANGLE_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE, +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ipt_route_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); +} + +static unsigned int +ipt_local_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + u_int8_t tos; + u_int32_t saddr, daddr; + unsigned long nfmark; + + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + + /* Save things which could affect route */ + nfmark = (*pskb)->nfmark; + saddr = (*pskb)->nh.iph->saddr; + daddr = (*pskb)->nh.iph->daddr; + tos = (*pskb)->nh.iph->tos; + + ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); + /* Reroute for ANY change. */ + if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE + && ((*pskb)->nh.iph->saddr != saddr + || (*pskb)->nh.iph->daddr != daddr +#ifdef CONFIG_IP_ROUTE_FWMARK + || (*pskb)->nfmark != nfmark +#endif + || (*pskb)->nh.iph->tos != tos)) + return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + + return ret; +} + +static struct nf_hook_ops ipt_ops[] = { + { + .hook = ipt_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_MANGLE, + }, + { + .hook = ipt_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_MANGLE, + }, + { + .hook = ipt_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = NF_IP_PRI_MANGLE, + }, + { + .hook = ipt_local_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_MANGLE, + }, + { + .hook = ipt_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_MANGLE, + }, +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = ipt_register_table(&packet_mangler, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + ret = nf_register_hook(&ipt_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + ret = nf_register_hook(&ipt_ops[3]); + if (ret < 0) + goto cleanup_hook2; + + ret = nf_register_hook(&ipt_ops[4]); + if (ret < 0) + goto cleanup_hook3; + + return ret; + + cleanup_hook3: + nf_unregister_hook(&ipt_ops[3]); + cleanup_hook2: + nf_unregister_hook(&ipt_ops[2]); + cleanup_hook1: + nf_unregister_hook(&ipt_ops[1]); + cleanup_hook0: + nf_unregister_hook(&ipt_ops[0]); + cleanup_table: + ipt_unregister_table(&packet_mangler); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ipt_ops[i]); + + ipt_unregister_table(&packet_mangler); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c new file mode 100644 index 000000000000..01b4a3c814d3 --- /dev/null +++ b/net/ipv4/netfilter/iptable_raw.c @@ -0,0 +1,156 @@ +/* + * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT . + * + * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + */ +#include <linux/module.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +#define RAW_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT)) + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[2]; + struct ipt_error term; +} initial_table __initdata = { + .repl = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .num_entries = 3, + .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error), + .hook_entry = { + [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) }, + .underflow = { + [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) }, + }, + .entries = { + /* PRE_ROUTING */ + { + .entry = { + .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), + }, + .target = { + .target = { + .u = { + .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), + }, + }, + .verdict = -NF_ACCEPT - 1, + }, + }, + + /* LOCAL_OUT */ + { + .entry = { + .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), + }, + .target = { + .target = { + .u = { + .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), + }, + }, + .verdict = -NF_ACCEPT - 1, + }, + }, + }, + /* ERROR */ + .term = { + .entry = { + .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_error), + }, + .target = { + .target = { + .u = { + .user = { + .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)), + .name = IPT_ERROR_TARGET, + }, + }, + }, + .errorname = "ERROR", + }, + } +}; + +static struct ipt_table packet_raw = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ipt_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_raw, NULL); +} + +/* 'raw' is the very first table. */ +static struct nf_hook_ops ipt_ops[] = { + { + .hook = ipt_hook, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_RAW + }, + { + .hook = ipt_hook, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_RAW + }, +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = ipt_register_table(&packet_raw, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + return ret; + + cleanup_hook0: + nf_unregister_hook(&ipt_ops[0]); + cleanup_table: + ipt_unregister_table(&packet_raw); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ipt_ops[i]); + + ipt_unregister_table(&packet_raw); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c new file mode 100644 index 000000000000..912bbcc7f415 --- /dev/null +++ b/net/ipv4/proc.c @@ -0,0 +1,382 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * This file implements the various access functions for the + * PROC file system. It is mainly used for debugging and + * statistics. + * + * Version: $Id: proc.c,v 1.45 2001/05/16 16:45:35 davem Exp $ + * + * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> + * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de> + * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de> + * + * Fixes: + * Alan Cox : UDP sockets show the rxqueue/txqueue + * using hint flag for the netinfo. + * Pauline Middelink : identd support + * Alan Cox : Make /proc safer. + * Erik Schoenfelder : /proc/net/snmp + * Alan Cox : Handle dead sockets properly. + * Gerhard Koerting : Show both timers + * Alan Cox : Allow inode to be NULL (kernel socket) + * Andi Kleen : Add support for open_requests and + * split functions for more readibility. + * Andi Kleen : Add support for /proc/net/netstat + * Arnaldo C. Melo : Convert to seq_file + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/types.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <net/sock.h> +#include <net/raw.h> + +static int fold_prot_inuse(struct proto *proto) +{ + int res = 0; + int cpu; + + for (cpu = 0; cpu < NR_CPUS; cpu++) + res += proto->stats[cpu].inuse; + + return res; +} + +/* + * Report socket allocation statistics [mea@utu.fi] + */ +static int sockstat_seq_show(struct seq_file *seq, void *v) +{ + /* From net/socket.c */ + extern void socket_seq_show(struct seq_file *seq); + + socket_seq_show(seq); + seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", + fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), + tcp_tw_count, atomic_read(&tcp_sockets_allocated), + atomic_read(&tcp_memory_allocated)); + seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); + seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); + seq_printf(seq, "FRAG: inuse %d memory %d\n", ip_frag_nqueues, + atomic_read(&ip_frag_mem)); + return 0; +} + +static int sockstat_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, sockstat_seq_show, NULL); +} + +static struct file_operations sockstat_seq_fops = { + .owner = THIS_MODULE, + .open = sockstat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static unsigned long +fold_field(void *mib[], int offt) +{ + unsigned long res = 0; + int i; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); + res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); + } + return res; +} + +/* snmp items */ +static struct snmp_mib snmp4_ipstats_list[] = { + SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES), + SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS), + SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS), + SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), + SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), + SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS), + SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS), + SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS), + SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS), + SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), + SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), + SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS), + SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS), + SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS), + SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS), + SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS), + SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES), + SNMP_MIB_SENTINEL +}; + +static struct snmp_mib snmp4_icmp_list[] = { + SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS), + SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS), + SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS), + SNMP_MIB_ITEM("InTimeExcds", ICMP_MIB_INTIMEEXCDS), + SNMP_MIB_ITEM("InParmProbs", ICMP_MIB_INPARMPROBS), + SNMP_MIB_ITEM("InSrcQuenchs", ICMP_MIB_INSRCQUENCHS), + SNMP_MIB_ITEM("InRedirects", ICMP_MIB_INREDIRECTS), + SNMP_MIB_ITEM("InEchos", ICMP_MIB_INECHOS), + SNMP_MIB_ITEM("InEchoReps", ICMP_MIB_INECHOREPS), + SNMP_MIB_ITEM("InTimestamps", ICMP_MIB_INTIMESTAMPS), + SNMP_MIB_ITEM("InTimestampReps", ICMP_MIB_INTIMESTAMPREPS), + SNMP_MIB_ITEM("InAddrMasks", ICMP_MIB_INADDRMASKS), + SNMP_MIB_ITEM("InAddrMaskReps", ICMP_MIB_INADDRMASKREPS), + SNMP_MIB_ITEM("OutMsgs", ICMP_MIB_OUTMSGS), + SNMP_MIB_ITEM("OutErrors", ICMP_MIB_OUTERRORS), + SNMP_MIB_ITEM("OutDestUnreachs", ICMP_MIB_OUTDESTUNREACHS), + SNMP_MIB_ITEM("OutTimeExcds", ICMP_MIB_OUTTIMEEXCDS), + SNMP_MIB_ITEM("OutParmProbs", ICMP_MIB_OUTPARMPROBS), + SNMP_MIB_ITEM("OutSrcQuenchs", ICMP_MIB_OUTSRCQUENCHS), + SNMP_MIB_ITEM("OutRedirects", ICMP_MIB_OUTREDIRECTS), + SNMP_MIB_ITEM("OutEchos", ICMP_MIB_OUTECHOS), + SNMP_MIB_ITEM("OutEchoReps", ICMP_MIB_OUTECHOREPS), + SNMP_MIB_ITEM("OutTimestamps", ICMP_MIB_OUTTIMESTAMPS), + SNMP_MIB_ITEM("OutTimestampReps", ICMP_MIB_OUTTIMESTAMPREPS), + SNMP_MIB_ITEM("OutAddrMasks", ICMP_MIB_OUTADDRMASKS), + SNMP_MIB_ITEM("OutAddrMaskReps", ICMP_MIB_OUTADDRMASKREPS), + SNMP_MIB_SENTINEL +}; + +static struct snmp_mib snmp4_tcp_list[] = { + SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM), + SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN), + SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX), + SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN), + SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS), + SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS), + SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS), + SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS), + SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB), + SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS), + SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS), + SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS), + SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS), + SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), + SNMP_MIB_SENTINEL +}; + +static struct snmp_mib snmp4_udp_list[] = { + SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS), + SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS), + SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS), + SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_SENTINEL +}; + +static struct snmp_mib snmp4_net_list[] = { + SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT), + SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV), + SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED), + SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS), + SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED), + SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED), + SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED), + SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS), + SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS), + SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER), + SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED), + SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED), + SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), + SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED), + SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), + SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), + SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), + SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), + SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), + SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), + SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), + SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED), + SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG), + SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE), + SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED), + SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), + SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER), + SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), + SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), + SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), + SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), + SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), + SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER), + SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), + SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), + SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), + SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO), + SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), + SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), + SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), + SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS), + SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), + SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), + SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), + SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES), + SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS), + SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), + SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), + SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), + SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), + SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), + SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), + SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), + SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), + SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), + SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), + SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), + SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN), + SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), + SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), + SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), + SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT), + SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER), + SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED), + SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES), + SNMP_MIB_SENTINEL +}; + +/* + * Called from the PROCfs module. This outputs /proc/net/snmp. + */ +static int snmp_seq_show(struct seq_file *seq, void *v) +{ + int i; + + seq_puts(seq, "Ip: Forwarding DefaultTTL"); + + for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_ipstats_list[i].name); + + seq_printf(seq, "\nIp: %d %d", + ipv4_devconf.forwarding ? 1 : 2, sysctl_ip_default_ttl); + + for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) + seq_printf(seq, " %lu", + fold_field((void **) ip_statistics, + snmp4_ipstats_list[i].entry)); + + seq_puts(seq, "\nIcmp:"); + for (i = 0; snmp4_icmp_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_icmp_list[i].name); + + seq_puts(seq, "\nIcmp:"); + for (i = 0; snmp4_icmp_list[i].name != NULL; i++) + seq_printf(seq, " %lu", + fold_field((void **) icmp_statistics, + snmp4_icmp_list[i].entry)); + + seq_puts(seq, "\nTcp:"); + for (i = 0; snmp4_tcp_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_tcp_list[i].name); + + seq_puts(seq, "\nTcp:"); + for (i = 0; snmp4_tcp_list[i].name != NULL; i++) { + /* MaxConn field is signed, RFC 2012 */ + if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) + seq_printf(seq, " %ld", + fold_field((void **) tcp_statistics, + snmp4_tcp_list[i].entry)); + else + seq_printf(seq, " %lu", + fold_field((void **) tcp_statistics, + snmp4_tcp_list[i].entry)); + } + + seq_puts(seq, "\nUdp:"); + for (i = 0; snmp4_udp_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_udp_list[i].name); + + seq_puts(seq, "\nUdp:"); + for (i = 0; snmp4_udp_list[i].name != NULL; i++) + seq_printf(seq, " %lu", + fold_field((void **) udp_statistics, + snmp4_udp_list[i].entry)); + + seq_putc(seq, '\n'); + return 0; +} + +static int snmp_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, snmp_seq_show, NULL); +} + +static struct file_operations snmp_seq_fops = { + .owner = THIS_MODULE, + .open = snmp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Output /proc/net/netstat + */ +static int netstat_seq_show(struct seq_file *seq, void *v) +{ + int i; + + seq_puts(seq, "TcpExt:"); + for (i = 0; snmp4_net_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_net_list[i].name); + + seq_puts(seq, "\nTcpExt:"); + for (i = 0; snmp4_net_list[i].name != NULL; i++) + seq_printf(seq, " %lu", + fold_field((void **) net_statistics, + snmp4_net_list[i].entry)); + + seq_putc(seq, '\n'); + return 0; +} + +static int netstat_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, netstat_seq_show, NULL); +} + +static struct file_operations netstat_seq_fops = { + .owner = THIS_MODULE, + .open = netstat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int __init ip_misc_proc_init(void) +{ + int rc = 0; + + if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops)) + goto out_netstat; + + if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops)) + goto out_snmp; + + if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops)) + goto out_sockstat; +out: + return rc; +out_sockstat: + proc_net_remove("snmp"); +out_snmp: + proc_net_remove("netstat"); +out_netstat: + rc = -ENOMEM; + goto out; +} + diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c new file mode 100644 index 000000000000..90a587cacaa4 --- /dev/null +++ b/net/ipv4/protocol.c @@ -0,0 +1,101 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * INET protocol dispatch tables. + * + * Version: $Id: protocol.c,v 1.14 2001/05/18 02:25:49 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * + * Fixes: + * Alan Cox : Ahah! udp icmp errors don't work because + * udp_err is never called! + * Alan Cox : Added new fields for init and ready for + * proper fragmentation (_NO_ 4K limits!) + * Richard Colella : Hang on hash collision + * Vince Laviano : Modified inet_del_protocol() to correctly + * maintain copy bit. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/config.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/timer.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/ipip.h> +#include <linux/igmp.h> + +struct net_protocol *inet_protos[MAX_INET_PROTOS]; +static DEFINE_SPINLOCK(inet_proto_lock); + +/* + * Add a protocol handler to the hash tables + */ + +int inet_add_protocol(struct net_protocol *prot, unsigned char protocol) +{ + int hash, ret; + + hash = protocol & (MAX_INET_PROTOS - 1); + + spin_lock_bh(&inet_proto_lock); + if (inet_protos[hash]) { + ret = -1; + } else { + inet_protos[hash] = prot; + ret = 0; + } + spin_unlock_bh(&inet_proto_lock); + + return ret; +} + +/* + * Remove a protocol from the hash tables. + */ + +int inet_del_protocol(struct net_protocol *prot, unsigned char protocol) +{ + int hash, ret; + + hash = protocol & (MAX_INET_PROTOS - 1); + + spin_lock_bh(&inet_proto_lock); + if (inet_protos[hash] == prot) { + inet_protos[hash] = NULL; + ret = 0; + } else { + ret = -1; + } + spin_unlock_bh(&inet_proto_lock); + + synchronize_net(); + + return ret; +} + +EXPORT_SYMBOL(inet_add_protocol); +EXPORT_SYMBOL(inet_del_protocol); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c new file mode 100644 index 000000000000..93624a32eb9a --- /dev/null +++ b/net/ipv4/raw.c @@ -0,0 +1,888 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * RAW - implementation of IP "raw" sockets. + * + * Version: $Id: raw.c,v 1.64 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * + * Fixes: + * Alan Cox : verify_area() fixed up + * Alan Cox : ICMP error handling + * Alan Cox : EMSGSIZE if you send too big a packet + * Alan Cox : Now uses generic datagrams and shared + * skbuff library. No more peek crashes, + * no more backlogs + * Alan Cox : Checks sk->broadcast. + * Alan Cox : Uses skb_free_datagram/skb_copy_datagram + * Alan Cox : Raw passes ip options too + * Alan Cox : Setsocketopt added + * Alan Cox : Fixed error return for broadcasts + * Alan Cox : Removed wake_up calls + * Alan Cox : Use ttl/tos + * Alan Cox : Cleaned up old debugging + * Alan Cox : Use new kernel side addresses + * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. + * Alan Cox : BSD style RAW socket demultiplexing. + * Alan Cox : Beginnings of mrouted support. + * Alan Cox : Added IP_HDRINCL option. + * Alan Cox : Skip broadcast check if BSDism set. + * David S. Miller : New socket lookup architecture. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/atomic.h> +#include <asm/byteorder.h> +#include <asm/current.h> +#include <asm/uaccess.h> +#include <asm/ioctls.h> +#include <linux/types.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/errno.h> +#include <linux/aio.h> +#include <linux/kernel.h> +#include <linux/spinlock.h> +#include <linux/sockios.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/mroute.h> +#include <linux/netdevice.h> +#include <linux/in_route.h> +#include <linux/route.h> +#include <linux/tcp.h> +#include <linux/skbuff.h> +#include <net/dst.h> +#include <net/sock.h> +#include <linux/gfp.h> +#include <linux/ip.h> +#include <linux/net.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/raw.h> +#include <net/snmp.h> +#include <net/inet_common.h> +#include <net/checksum.h> +#include <net/xfrm.h> +#include <linux/rtnetlink.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +struct hlist_head raw_v4_htable[RAWV4_HTABLE_SIZE]; +DEFINE_RWLOCK(raw_v4_lock); + +static void raw_v4_hash(struct sock *sk) +{ + struct hlist_head *head = &raw_v4_htable[inet_sk(sk)->num & + (RAWV4_HTABLE_SIZE - 1)]; + + write_lock_bh(&raw_v4_lock); + sk_add_node(sk, head); + sock_prot_inc_use(sk->sk_prot); + write_unlock_bh(&raw_v4_lock); +} + +static void raw_v4_unhash(struct sock *sk) +{ + write_lock_bh(&raw_v4_lock); + if (sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + write_unlock_bh(&raw_v4_lock); +} + +struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, + unsigned long raddr, unsigned long laddr, + int dif) +{ + struct hlist_node *node; + + sk_for_each_from(sk, node) { + struct inet_sock *inet = inet_sk(sk); + + if (inet->num == num && + !(inet->daddr && inet->daddr != raddr) && + !(inet->rcv_saddr && inet->rcv_saddr != laddr) && + !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + goto found; /* gotcha */ + } + sk = NULL; +found: + return sk; +} + +/* + * 0 - deliver + * 1 - block + */ +static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) +{ + int type; + + if (!pskb_may_pull(skb, sizeof(struct icmphdr))) + return 1; + + type = skb->h.icmph->type; + if (type < 32) { + __u32 data = raw_sk(sk)->filter.data; + + return ((1 << type) & data) != 0; + } + + /* Do not block unknown ICMP types */ + return 0; +} + +/* IP input processing comes here for RAW socket delivery. + * Caller owns SKB, so we must make clones. + * + * RFC 1122: SHOULD pass TOS value up to the transport layer. + * -> It does. And not only TOS, but all IP header. + */ +void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +{ + struct sock *sk; + struct hlist_head *head; + + read_lock(&raw_v4_lock); + head = &raw_v4_htable[hash]; + if (hlist_empty(head)) + goto out; + sk = __raw_v4_lookup(__sk_head(head), iph->protocol, + iph->saddr, iph->daddr, + skb->dev->ifindex); + + while (sk) { + if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { + struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); + + /* Not releasing hash table! */ + if (clone) + raw_rcv(sk, clone); + } + sk = __raw_v4_lookup(sk_next(sk), iph->protocol, + iph->saddr, iph->daddr, + skb->dev->ifindex); + } +out: + read_unlock(&raw_v4_lock); +} + +void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) +{ + struct inet_sock *inet = inet_sk(sk); + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int err = 0; + int harderr = 0; + + /* Report error on raw socket, if: + 1. User requested ip_recverr. + 2. Socket is connected (otherwise the error indication + is useless without ip_recverr and error is hard. + */ + if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED) + return; + + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + return; + case ICMP_PARAMETERPROB: + err = EPROTO; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + err = EHOSTUNREACH; + if (code > NR_ICMP_UNREACH) + break; + err = icmp_err_convert[code].errno; + harderr = icmp_err_convert[code].fatal; + if (code == ICMP_FRAG_NEEDED) { + harderr = inet->pmtudisc != IP_PMTUDISC_DONT; + err = EMSGSIZE; + } + } + + if (inet->recverr) { + struct iphdr *iph = (struct iphdr*)skb->data; + u8 *payload = skb->data + (iph->ihl << 2); + + if (inet->hdrincl) + payload = skb->data; + ip_icmp_error(sk, skb, err, 0, info, payload); + } + + if (inet->recverr || harderr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } +} + +static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) +{ + /* Charge it to the socket. */ + + if (sock_queue_rcv_skb(sk, skb) < 0) { + /* FIXME: increment a raw drops counter here */ + kfree_skb(skb); + return NET_RX_DROP; + } + + return NET_RX_SUCCESS; +} + +int raw_rcv(struct sock *sk, struct sk_buff *skb) +{ + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return NET_RX_DROP; + } + + skb_push(skb, skb->data - skb->nh.raw); + + raw_rcv_skb(sk, skb); + return 0; +} + +static int raw_send_hdrinc(struct sock *sk, void *from, int length, + struct rtable *rt, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + int hh_len; + struct iphdr *iph; + struct sk_buff *skb; + int err; + + if (length > rt->u.dst.dev->mtu) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, + rt->u.dst.dev->mtu); + return -EMSGSIZE; + } + if (flags&MSG_PROBE) + goto out; + + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + + skb = sock_alloc_send_skb(sk, length+hh_len+15, + flags&MSG_DONTWAIT, &err); + if (skb == NULL) + goto error; + skb_reserve(skb, hh_len); + + skb->priority = sk->sk_priority; + skb->dst = dst_clone(&rt->u.dst); + + skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); + + skb->ip_summed = CHECKSUM_NONE; + + skb->h.raw = skb->nh.raw; + err = memcpy_fromiovecend((void *)iph, from, 0, length); + if (err) + goto error_fault; + + /* We don't modify invalid header */ + if (length >= sizeof(*iph) && iph->ihl * 4 <= length) { + if (!iph->saddr) + iph->saddr = rt->rt_src; + iph->check = 0; + iph->tot_len = htons(length); + if (!iph->id) + ip_select_ident(iph, &rt->u.dst, NULL); + + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + } + + err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + dst_output); + if (err > 0) + err = inet->recverr ? net_xmit_errno(err) : 0; + if (err) + goto error; +out: + return 0; + +error_fault: + err = -EFAULT; + kfree_skb(skb); +error: + IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return err; +} + +static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) +{ + struct iovec *iov; + u8 __user *type = NULL; + u8 __user *code = NULL; + int probed = 0; + int i; + + if (!msg->msg_iov) + return; + + for (i = 0; i < msg->msg_iovlen; i++) { + iov = &msg->msg_iov[i]; + if (!iov) + continue; + + switch (fl->proto) { + case IPPROTO_ICMP: + /* check if one-byte field is readable or not. */ + if (iov->iov_base && iov->iov_len < 1) + break; + + if (!type) { + type = iov->iov_base; + /* check if code field is readable or not. */ + if (iov->iov_len > 1) + code = type + 1; + } else if (!code) + code = iov->iov_base; + + if (type && code) { + get_user(fl->fl_icmp_type, type); + __get_user(fl->fl_icmp_code, code); + probed = 1; + } + break; + default: + probed = 1; + break; + } + if (probed) + break; + } +} + +static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipcm_cookie ipc; + struct rtable *rt = NULL; + int free = 0; + u32 daddr; + u32 saddr; + u8 tos; + int err; + + err = -EMSGSIZE; + if (len < 0 || len > 0xFFFF) + goto out; + + /* + * Check the flags. + */ + + err = -EOPNOTSUPP; + if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ + goto out; /* compatibility */ + + /* + * Get and verify the address. + */ + + if (msg->msg_namelen) { + struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name; + err = -EINVAL; + if (msg->msg_namelen < sizeof(*usin)) + goto out; + if (usin->sin_family != AF_INET) { + static int complained; + if (!complained++) + printk(KERN_INFO "%s forgot to set AF_INET in " + "raw sendmsg. Fix it!\n", + current->comm); + err = -EAFNOSUPPORT; + if (usin->sin_family) + goto out; + } + daddr = usin->sin_addr.s_addr; + /* ANK: I did not forget to get protocol from port field. + * I just do not know, who uses this weirdness. + * IP_HDRINCL is much more convenient. + */ + } else { + err = -EDESTADDRREQ; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + daddr = inet->daddr; + } + + ipc.addr = inet->saddr; + ipc.opt = NULL; + ipc.oif = sk->sk_bound_dev_if; + + if (msg->msg_controllen) { + err = ip_cmsg_send(msg, &ipc); + if (err) + goto out; + if (ipc.opt) + free = 1; + } + + saddr = ipc.addr; + ipc.addr = daddr; + + if (!ipc.opt) + ipc.opt = inet->opt; + + if (ipc.opt) { + err = -EINVAL; + /* Linux does not mangle headers on raw sockets, + * so that IP options + IP_HDRINCL is non-sense. + */ + if (inet->hdrincl) + goto done; + if (ipc.opt->srr) { + if (!daddr) + goto done; + daddr = ipc.opt->faddr; + } + } + tos = RT_CONN_FLAGS(sk); + if (msg->msg_flags & MSG_DONTROUTE) + tos |= RTO_ONLINK; + + if (MULTICAST(daddr)) { + if (!ipc.oif) + ipc.oif = inet->mc_index; + if (!saddr) + saddr = inet->mc_addr; + } + + { + struct flowi fl = { .oif = ipc.oif, + .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = saddr, + .tos = tos } }, + .proto = inet->hdrincl ? IPPROTO_RAW : + sk->sk_protocol, + }; + if (!inet->hdrincl) + raw_probe_proto_opt(&fl, msg); + + err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); + } + if (err) + goto done; + + err = -EACCES; + if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) + goto done; + + if (msg->msg_flags & MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + if (inet->hdrincl) + err = raw_send_hdrinc(sk, msg->msg_iov, len, + rt, msg->msg_flags); + + else { + if (!ipc.addr) + ipc.addr = rt->rt_dst; + lock_sock(sk); + err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, + &ipc, rt, msg->msg_flags); + if (err) + ip_flush_pending_frames(sk); + else if (!(msg->msg_flags & MSG_MORE)) + err = ip_push_pending_frames(sk); + release_sock(sk); + } +done: + if (free) + kfree(ipc.opt); + ip_rt_put(rt); + +out: return err < 0 ? err : len; + +do_confirm: + dst_confirm(&rt->u.dst); + if (!(msg->msg_flags & MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto done; +} + +static void raw_close(struct sock *sk, long timeout) +{ + /* + * Raw sockets may have direct kernel refereneces. Kill them. + */ + ip_ra_control(sk, 0, NULL); + + sk_common_release(sk); +} + +/* This gets rid of all the nasties in af_inet. -DaveM */ +static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; + int ret = -EINVAL; + int chk_addr_ret; + + if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) + goto out; + chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + ret = -EADDRNOTAVAIL; + if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) + goto out; + inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; + if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + inet->saddr = 0; /* Use device */ + sk_dst_reset(sk); + ret = 0; +out: return ret; +} + +/* + * This should be easy, if there is something there + * we return it, otherwise we block. + */ + +static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + size_t copied = 0; + int err = -EOPNOTSUPP; + struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + struct sk_buff *skb; + + if (flags & MSG_OOB) + goto out; + + if (addr_len) + *addr_len = sizeof(*sin); + + if (flags & MSG_ERRQUEUE) { + err = ip_recv_error(sk, msg, len); + goto out; + } + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto done; + + sock_recv_timestamp(msg, sk, skb); + + /* Copy the address. */ + if (sin) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = skb->nh.iph->saddr; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + if (flags & MSG_TRUNC) + copied = skb->len; +done: + skb_free_datagram(sk, skb); +out: return err ? err : copied; +} + +static int raw_init(struct sock *sk) +{ + struct raw_sock *rp = raw_sk(sk); + + if (inet_sk(sk)->num == IPPROTO_ICMP) + memset(&rp->filter, 0, sizeof(rp->filter)); + return 0; +} + +static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen) +{ + if (optlen > sizeof(struct icmp_filter)) + optlen = sizeof(struct icmp_filter); + if (copy_from_user(&raw_sk(sk)->filter, optval, optlen)) + return -EFAULT; + return 0; +} + +static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) +{ + int len, ret = -EFAULT; + + if (get_user(len, optlen)) + goto out; + ret = -EINVAL; + if (len < 0) + goto out; + if (len > sizeof(struct icmp_filter)) + len = sizeof(struct icmp_filter); + ret = -EFAULT; + if (put_user(len, optlen) || + copy_to_user(optval, &raw_sk(sk)->filter, len)) + goto out; + ret = 0; +out: return ret; +} + +static int raw_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + if (level != SOL_RAW) + return ip_setsockopt(sk, level, optname, optval, optlen); + + if (optname == ICMP_FILTER) { + if (inet_sk(sk)->num != IPPROTO_ICMP) + return -EOPNOTSUPP; + else + return raw_seticmpfilter(sk, optval, optlen); + } + return -ENOPROTOOPT; +} + +static int raw_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level != SOL_RAW) + return ip_getsockopt(sk, level, optname, optval, optlen); + + if (optname == ICMP_FILTER) { + if (inet_sk(sk)->num != IPPROTO_ICMP) + return -EOPNOTSUPP; + else + return raw_geticmpfilter(sk, optval, optlen); + } + return -ENOPROTOOPT; +} + +static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch (cmd) { + case SIOCOUTQ: { + int amount = atomic_read(&sk->sk_wmem_alloc); + return put_user(amount, (int __user *)arg); + } + case SIOCINQ: { + struct sk_buff *skb; + int amount = 0; + + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) + amount = skb->len; + spin_unlock_irq(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + + default: +#ifdef CONFIG_IP_MROUTE + return ipmr_ioctl(sk, cmd, (void __user *)arg); +#else + return -ENOIOCTLCMD; +#endif + } +} + +struct proto raw_prot = { + .name = "RAW", + .owner = THIS_MODULE, + .close = raw_close, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = raw_ioctl, + .init = raw_init, + .setsockopt = raw_setsockopt, + .getsockopt = raw_getsockopt, + .sendmsg = raw_sendmsg, + .recvmsg = raw_recvmsg, + .bind = raw_bind, + .backlog_rcv = raw_rcv_skb, + .hash = raw_v4_hash, + .unhash = raw_v4_unhash, + .obj_size = sizeof(struct raw_sock), +}; + +#ifdef CONFIG_PROC_FS +struct raw_iter_state { + int bucket; +}; + +#define raw_seq_private(seq) ((struct raw_iter_state *)(seq)->private) + +static struct sock *raw_get_first(struct seq_file *seq) +{ + struct sock *sk; + struct raw_iter_state* state = raw_seq_private(seq); + + for (state->bucket = 0; state->bucket < RAWV4_HTABLE_SIZE; ++state->bucket) { + struct hlist_node *node; + + sk_for_each(sk, node, &raw_v4_htable[state->bucket]) + if (sk->sk_family == PF_INET) + goto found; + } + sk = NULL; +found: + return sk; +} + +static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) +{ + struct raw_iter_state* state = raw_seq_private(seq); + + do { + sk = sk_next(sk); +try_again: + ; + } while (sk && sk->sk_family != PF_INET); + + if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) { + sk = sk_head(&raw_v4_htable[state->bucket]); + goto try_again; + } + return sk; +} + +static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) +{ + struct sock *sk = raw_get_first(seq); + + if (sk) + while (pos && (sk = raw_get_next(seq, sk)) != NULL) + --pos; + return pos ? NULL : sk; +} + +static void *raw_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&raw_v4_lock); + return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *sk; + + if (v == SEQ_START_TOKEN) + sk = raw_get_first(seq); + else + sk = raw_get_next(seq, v); + ++*pos; + return sk; +} + +static void raw_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&raw_v4_lock); +} + +static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i) +{ + struct inet_sock *inet = inet_sk(sp); + unsigned int dest = inet->daddr, + src = inet->rcv_saddr; + __u16 destp = 0, + srcp = inet->num; + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p", + i, src, srcp, dest, destp, sp->sk_state, + atomic_read(&sp->sk_wmem_alloc), + atomic_read(&sp->sk_rmem_alloc), + 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp); + return tmpbuf; +} + +static int raw_seq_show(struct seq_file *seq, void *v) +{ + char tmpbuf[129]; + + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-127s\n", + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout " + "inode"); + else { + struct raw_iter_state *state = raw_seq_private(seq); + + seq_printf(seq, "%-127s\n", + get_raw_sock(v, tmpbuf, state->bucket)); + } + return 0; +} + +static struct seq_operations raw_seq_ops = { + .start = raw_seq_start, + .next = raw_seq_next, + .stop = raw_seq_stop, + .show = raw_seq_show, +}; + +static int raw_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct raw_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + rc = seq_open(file, &raw_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations raw_seq_fops = { + .owner = THIS_MODULE, + .open = raw_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init raw_proc_init(void) +{ + if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops)) + return -ENOMEM; + return 0; +} + +void __init raw_proc_exit(void) +{ + proc_net_remove("raw"); +} +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c new file mode 100644 index 000000000000..9f91a116d919 --- /dev/null +++ b/net/ipv4/route.c @@ -0,0 +1,3177 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * ROUTE - implementation of the IP router. + * + * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Linus Torvalds, <Linus.Torvalds@helsinki.fi> + * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Fixes: + * Alan Cox : Verify area fixes. + * Alan Cox : cli() protects routing changes + * Rui Oliveira : ICMP routing table updates + * (rco@di.uminho.pt) Routing table insertion and update + * Linus Torvalds : Rewrote bits to be sensible + * Alan Cox : Added BSD route gw semantics + * Alan Cox : Super /proc >4K + * Alan Cox : MTU in route table + * Alan Cox : MSS actually. Also added the window + * clamper. + * Sam Lantinga : Fixed route matching in rt_del() + * Alan Cox : Routing cache support. + * Alan Cox : Removed compatibility cruft. + * Alan Cox : RTF_REJECT support. + * Alan Cox : TCP irtt support. + * Jonathan Naylor : Added Metric support. + * Miquel van Smoorenburg : BSD API fixes. + * Miquel van Smoorenburg : Metrics. + * Alan Cox : Use __u32 properly + * Alan Cox : Aligned routing errors more closely with BSD + * our system is still very different. + * Alan Cox : Faster /proc handling + * Alexey Kuznetsov : Massive rework to support tree based routing, + * routing caches and better behaviour. + * + * Olaf Erb : irtt wasn't being copied right. + * Bjorn Ekwall : Kerneld route support. + * Alan Cox : Multicast fixed (I hope) + * Pavel Krauz : Limited broadcast fixed + * Mike McLagan : Routing by source + * Alexey Kuznetsov : End of old history. Split to fib.c and + * route.c and rewritten from scratch. + * Andi Kleen : Load-limit warning messages. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Vitaly E. Lavrov : Race condition in ip_route_input_slow. + * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. + * Vladimir V. Ivanov : IP rule info (flowid) is really useful. + * Marc Boucher : routing by fwmark + * Robert Olsson : Added rt_cache statistics + * Arnaldo C. Melo : Convert proc stuff to seq_file + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/pkt_sched.h> +#include <linux/mroute.h> +#include <linux/netfilter_ipv4.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/rcupdate.h> +#include <linux/times.h> +#include <net/protocol.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/inetpeer.h> +#include <net/sock.h> +#include <net/ip_fib.h> +#include <net/arp.h> +#include <net/tcp.h> +#include <net/icmp.h> +#include <net/xfrm.h> +#include <net/ip_mp_alg.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#define RT_FL_TOS(oldflp) \ + ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) + +#define IP_MAX_MTU 0xFFF0 + +#define RT_GC_TIMEOUT (300*HZ) + +static int ip_rt_min_delay = 2 * HZ; +static int ip_rt_max_delay = 10 * HZ; +static int ip_rt_max_size; +static int ip_rt_gc_timeout = RT_GC_TIMEOUT; +static int ip_rt_gc_interval = 60 * HZ; +static int ip_rt_gc_min_interval = HZ / 2; +static int ip_rt_redirect_number = 9; +static int ip_rt_redirect_load = HZ / 50; +static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1)); +static int ip_rt_error_cost = HZ; +static int ip_rt_error_burst = 5 * HZ; +static int ip_rt_gc_elasticity = 8; +static int ip_rt_mtu_expires = 10 * 60 * HZ; +static int ip_rt_min_pmtu = 512 + 20 + 20; +static int ip_rt_min_advmss = 256; +static int ip_rt_secret_interval = 10 * 60 * HZ; +static unsigned long rt_deadline; + +#define RTprint(a...) printk(KERN_DEBUG a) + +static struct timer_list rt_flush_timer; +static struct timer_list rt_periodic_timer; +static struct timer_list rt_secret_timer; + +/* + * Interface to generic destination cache. + */ + +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); +static void ipv4_dst_destroy(struct dst_entry *dst); +static void ipv4_dst_ifdown(struct dst_entry *dst, + struct net_device *dev, int how); +static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); +static void ipv4_link_failure(struct sk_buff *skb); +static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); +static int rt_garbage_collect(void); + + +static struct dst_ops ipv4_dst_ops = { + .family = AF_INET, + .protocol = __constant_htons(ETH_P_IP), + .gc = rt_garbage_collect, + .check = ipv4_dst_check, + .destroy = ipv4_dst_destroy, + .ifdown = ipv4_dst_ifdown, + .negative_advice = ipv4_negative_advice, + .link_failure = ipv4_link_failure, + .update_pmtu = ip_rt_update_pmtu, + .entry_size = sizeof(struct rtable), +}; + +#define ECN_OR_COST(class) TC_PRIO_##class + +__u8 ip_tos2prio[16] = { + TC_PRIO_BESTEFFORT, + ECN_OR_COST(FILLER), + TC_PRIO_BESTEFFORT, + ECN_OR_COST(BESTEFFORT), + TC_PRIO_BULK, + ECN_OR_COST(BULK), + TC_PRIO_BULK, + ECN_OR_COST(BULK), + TC_PRIO_INTERACTIVE, + ECN_OR_COST(INTERACTIVE), + TC_PRIO_INTERACTIVE, + ECN_OR_COST(INTERACTIVE), + TC_PRIO_INTERACTIVE_BULK, + ECN_OR_COST(INTERACTIVE_BULK), + TC_PRIO_INTERACTIVE_BULK, + ECN_OR_COST(INTERACTIVE_BULK) +}; + + +/* + * Route cache. + */ + +/* The locking scheme is rather straight forward: + * + * 1) Read-Copy Update protects the buckets of the central route hash. + * 2) Only writers remove entries, and they hold the lock + * as they look at rtable reference counts. + * 3) Only readers acquire references to rtable entries, + * they do so with atomic increments and with the + * lock held. + */ + +struct rt_hash_bucket { + struct rtable *chain; + spinlock_t lock; +} __attribute__((__aligned__(8))); + +static struct rt_hash_bucket *rt_hash_table; +static unsigned rt_hash_mask; +static int rt_hash_log; +static unsigned int rt_hash_rnd; + +struct rt_cache_stat *rt_cache_stat; + +static int rt_intern_hash(unsigned hash, struct rtable *rth, + struct rtable **res); + +static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos) +{ + return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd) + & rt_hash_mask); +} + +#ifdef CONFIG_PROC_FS +struct rt_cache_iter_state { + int bucket; +}; + +static struct rtable *rt_cache_get_first(struct seq_file *seq) +{ + struct rtable *r = NULL; + struct rt_cache_iter_state *st = seq->private; + + for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { + rcu_read_lock_bh(); + r = rt_hash_table[st->bucket].chain; + if (r) + break; + rcu_read_unlock_bh(); + } + return r; +} + +static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r) +{ + struct rt_cache_iter_state *st = rcu_dereference(seq->private); + + r = r->u.rt_next; + while (!r) { + rcu_read_unlock_bh(); + if (--st->bucket < 0) + break; + rcu_read_lock_bh(); + r = rt_hash_table[st->bucket].chain; + } + return r; +} + +static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) +{ + struct rtable *r = rt_cache_get_first(seq); + + if (r) + while (pos && (r = rt_cache_get_next(seq, r))) + --pos; + return pos ? NULL : r; +} + +static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct rtable *r = NULL; + + if (v == SEQ_START_TOKEN) + r = rt_cache_get_first(seq); + else + r = rt_cache_get_next(seq, v); + ++*pos; + return r; +} + +static void rt_cache_seq_stop(struct seq_file *seq, void *v) +{ + if (v && v != SEQ_START_TOKEN) + rcu_read_unlock_bh(); +} + +static int rt_cache_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-127s\n", + "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" + "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" + "HHUptod\tSpecDst"); + else { + struct rtable *r = v; + char temp[256]; + + sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" + "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X", + r->u.dst.dev ? r->u.dst.dev->name : "*", + (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, + r->rt_flags, atomic_read(&r->u.dst.__refcnt), + r->u.dst.__use, 0, (unsigned long)r->rt_src, + (dst_metric(&r->u.dst, RTAX_ADVMSS) ? + (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), + dst_metric(&r->u.dst, RTAX_WINDOW), + (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + + dst_metric(&r->u.dst, RTAX_RTTVAR)), + r->fl.fl4_tos, + r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, + r->u.dst.hh ? (r->u.dst.hh->hh_output == + dev_queue_xmit) : 0, + r->rt_spec_dst); + seq_printf(seq, "%-127s\n", temp); + } + return 0; +} + +static struct seq_operations rt_cache_seq_ops = { + .start = rt_cache_seq_start, + .next = rt_cache_seq_next, + .stop = rt_cache_seq_stop, + .show = rt_cache_seq_show, +}; + +static int rt_cache_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + rc = seq_open(file, &rt_cache_seq_ops); + if (rc) + goto out_kfree; + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations rt_cache_seq_fops = { + .owner = THIS_MODULE, + .open = rt_cache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + + +static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(rt_cache_stat, cpu); + } + return NULL; +} + +static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int cpu; + + for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(rt_cache_stat, cpu); + } + return NULL; + +} + +static void rt_cpu_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int rt_cpu_seq_show(struct seq_file *seq, void *v) +{ + struct rt_cache_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries in_hit in_slow_tot in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); + return 0; + } + + seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " + " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", + atomic_read(&ipv4_dst_ops.entries), + st->in_hit, + st->in_slow_tot, + st->in_slow_mc, + st->in_no_route, + st->in_brd, + st->in_martian_dst, + st->in_martian_src, + + st->out_hit, + st->out_slow_tot, + st->out_slow_mc, + + st->gc_total, + st->gc_ignored, + st->gc_goal_miss, + st->gc_dst_overflow, + st->in_hlist_search, + st->out_hlist_search + ); + return 0; +} + +static struct seq_operations rt_cpu_seq_ops = { + .start = rt_cpu_seq_start, + .next = rt_cpu_seq_next, + .stop = rt_cpu_seq_stop, + .show = rt_cpu_seq_show, +}; + + +static int rt_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rt_cpu_seq_ops); +} + +static struct file_operations rt_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = rt_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* CONFIG_PROC_FS */ + +static __inline__ void rt_free(struct rtable *rt) +{ + multipath_remove(rt); + call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); +} + +static __inline__ void rt_drop(struct rtable *rt) +{ + multipath_remove(rt); + ip_rt_put(rt); + call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); +} + +static __inline__ int rt_fast_clean(struct rtable *rth) +{ + /* Kill broadcast/multicast entries very aggresively, if they + collide in hash table with more useful entries */ + return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && + rth->fl.iif && rth->u.rt_next; +} + +static __inline__ int rt_valuable(struct rtable *rth) +{ + return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || + rth->u.dst.expires; +} + +static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) +{ + unsigned long age; + int ret = 0; + + if (atomic_read(&rth->u.dst.__refcnt)) + goto out; + + ret = 1; + if (rth->u.dst.expires && + time_after_eq(jiffies, rth->u.dst.expires)) + goto out; + + age = jiffies - rth->u.dst.lastuse; + ret = 0; + if ((age <= tmo1 && !rt_fast_clean(rth)) || + (age <= tmo2 && rt_valuable(rth))) + goto out; + ret = 1; +out: return ret; +} + +/* Bits of score are: + * 31: very valuable + * 30: not quite useless + * 29..0: usage counter + */ +static inline u32 rt_score(struct rtable *rt) +{ + u32 score = jiffies - rt->u.dst.lastuse; + + score = ~score & ~(3<<30); + + if (rt_valuable(rt)) + score |= (1<<31); + + if (!rt->fl.iif || + !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) + score |= (1<<30); + + return score; +} + +static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) +{ + return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 && + fl1->oif == fl2->oif && + fl1->iif == fl2->iif; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED +static struct rtable **rt_remove_balanced_route(struct rtable **chain_head, + struct rtable *expentry, + int *removed_count) +{ + int passedexpired = 0; + struct rtable **nextstep = NULL; + struct rtable **rthp = chain_head; + struct rtable *rth; + + if (removed_count) + *removed_count = 0; + + while ((rth = *rthp) != NULL) { + if (rth == expentry) + passedexpired = 1; + + if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 && + compare_keys(&(*rthp)->fl, &expentry->fl)) { + if (*rthp == expentry) { + *rthp = rth->u.rt_next; + continue; + } else { + *rthp = rth->u.rt_next; + rt_free(rth); + if (removed_count) + ++(*removed_count); + } + } else { + if (!((*rthp)->u.dst.flags & DST_BALANCED) && + passedexpired && !nextstep) + nextstep = &rth->u.rt_next; + + rthp = &rth->u.rt_next; + } + } + + rt_free(expentry); + if (removed_count) + ++(*removed_count); + + return nextstep; +} +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + + +/* This runs via a timer and thus is always in BH context. */ +static void rt_check_expire(unsigned long dummy) +{ + static int rover; + int i = rover, t; + struct rtable *rth, **rthp; + unsigned long now = jiffies; + + for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; + t -= ip_rt_gc_timeout) { + unsigned long tmo = ip_rt_gc_timeout; + + i = (i + 1) & rt_hash_mask; + rthp = &rt_hash_table[i].chain; + + spin_lock(&rt_hash_table[i].lock); + while ((rth = *rthp) != NULL) { + if (rth->u.dst.expires) { + /* Entry is expired even if it is in use */ + if (time_before_eq(now, rth->u.dst.expires)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } + + /* Cleanup aged off entries. */ +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + /* remove all related balanced entries if necessary */ + if (rth->u.dst.flags & DST_BALANCED) { + rthp = rt_remove_balanced_route( + &rt_hash_table[i].chain, + rth, NULL); + if (!rthp) + break; + } else { + *rthp = rth->u.rt_next; + rt_free(rth); + } +#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + *rthp = rth->u.rt_next; + rt_free(rth); +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + } + spin_unlock(&rt_hash_table[i].lock); + + /* Fallback loop breaker. */ + if (time_after(jiffies, now)) + break; + } + rover = i; + mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); +} + +/* This can run from both BH and non-BH contexts, the latter + * in the case of a forced flush event. + */ +static void rt_run_flush(unsigned long dummy) +{ + int i; + struct rtable *rth, *next; + + rt_deadline = 0; + + get_random_bytes(&rt_hash_rnd, 4); + + for (i = rt_hash_mask; i >= 0; i--) { + spin_lock_bh(&rt_hash_table[i].lock); + rth = rt_hash_table[i].chain; + if (rth) + rt_hash_table[i].chain = NULL; + spin_unlock_bh(&rt_hash_table[i].lock); + + for (; rth; rth = next) { + next = rth->u.rt_next; + rt_free(rth); + } + } +} + +static DEFINE_SPINLOCK(rt_flush_lock); + +void rt_cache_flush(int delay) +{ + unsigned long now = jiffies; + int user_mode = !in_softirq(); + + if (delay < 0) + delay = ip_rt_min_delay; + + /* flush existing multipath state*/ + multipath_flush(); + + spin_lock_bh(&rt_flush_lock); + + if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { + long tmo = (long)(rt_deadline - now); + + /* If flush timer is already running + and flush request is not immediate (delay > 0): + + if deadline is not achieved, prolongate timer to "delay", + otherwise fire it at deadline time. + */ + + if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay) + tmo = 0; + + if (delay > tmo) + delay = tmo; + } + + if (delay <= 0) { + spin_unlock_bh(&rt_flush_lock); + rt_run_flush(0); + return; + } + + if (rt_deadline == 0) + rt_deadline = now + ip_rt_max_delay; + + mod_timer(&rt_flush_timer, now+delay); + spin_unlock_bh(&rt_flush_lock); +} + +static void rt_secret_rebuild(unsigned long dummy) +{ + unsigned long now = jiffies; + + rt_cache_flush(0); + mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); +} + +/* + Short description of GC goals. + + We want to build algorithm, which will keep routing cache + at some equilibrium point, when number of aged off entries + is kept approximately equal to newly generated ones. + + Current expiration strength is variable "expire". + We try to adjust it dynamically, so that if networking + is idle expires is large enough to keep enough of warm entries, + and when load increases it reduces to limit cache size. + */ + +static int rt_garbage_collect(void) +{ + static unsigned long expire = RT_GC_TIMEOUT; + static unsigned long last_gc; + static int rover; + static int equilibrium; + struct rtable *rth, **rthp; + unsigned long now = jiffies; + int goal; + + /* + * Garbage collection is pretty expensive, + * do not make it too frequently. + */ + + RT_CACHE_STAT_INC(gc_total); + + if (now - last_gc < ip_rt_gc_min_interval && + atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { + RT_CACHE_STAT_INC(gc_ignored); + goto out; + } + + /* Calculate number of entries, which we want to expire now. */ + goal = atomic_read(&ipv4_dst_ops.entries) - + (ip_rt_gc_elasticity << rt_hash_log); + if (goal <= 0) { + if (equilibrium < ipv4_dst_ops.gc_thresh) + equilibrium = ipv4_dst_ops.gc_thresh; + goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; + if (goal > 0) { + equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1); + goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; + } + } else { + /* We are in dangerous area. Try to reduce cache really + * aggressively. + */ + goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1); + equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; + } + + if (now - last_gc >= ip_rt_gc_min_interval) + last_gc = now; + + if (goal <= 0) { + equilibrium += goal; + goto work_done; + } + + do { + int i, k; + + for (i = rt_hash_mask, k = rover; i >= 0; i--) { + unsigned long tmo = expire; + + k = (k + 1) & rt_hash_mask; + rthp = &rt_hash_table[k].chain; + spin_lock_bh(&rt_hash_table[k].lock); + while ((rth = *rthp) != NULL) { + if (!rt_may_expire(rth, tmo, expire)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + /* remove all related balanced entries + * if necessary + */ + if (rth->u.dst.flags & DST_BALANCED) { + int r; + + rthp = rt_remove_balanced_route( + &rt_hash_table[i].chain, + rth, + &r); + goal -= r; + if (!rthp) + break; + } else { + *rthp = rth->u.rt_next; + rt_free(rth); + goal--; + } +#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + *rthp = rth->u.rt_next; + rt_free(rth); + goal--; +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + } + spin_unlock_bh(&rt_hash_table[k].lock); + if (goal <= 0) + break; + } + rover = k; + + if (goal <= 0) + goto work_done; + + /* Goal is not achieved. We stop process if: + + - if expire reduced to zero. Otherwise, expire is halfed. + - if table is not full. + - if we are called from interrupt. + - jiffies check is just fallback/debug loop breaker. + We will not spin here for long time in any case. + */ + + RT_CACHE_STAT_INC(gc_goal_miss); + + if (expire == 0) + break; + + expire >>= 1; +#if RT_CACHE_DEBUG >= 2 + printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, + atomic_read(&ipv4_dst_ops.entries), goal, i); +#endif + + if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + goto out; + } while (!in_softirq() && time_before_eq(jiffies, now)); + + if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + goto out; + if (net_ratelimit()) + printk(KERN_WARNING "dst cache overflow\n"); + RT_CACHE_STAT_INC(gc_dst_overflow); + return 1; + +work_done: + expire += ip_rt_gc_min_interval; + if (expire > ip_rt_gc_timeout || + atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) + expire = ip_rt_gc_timeout; +#if RT_CACHE_DEBUG >= 2 + printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, + atomic_read(&ipv4_dst_ops.entries), goal, rover); +#endif +out: return 0; +} + +static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) +{ + struct rtable *rth, **rthp; + unsigned long now; + struct rtable *cand, **candp; + u32 min_score; + int chain_length; + int attempts = !in_softirq(); + +restart: + chain_length = 0; + min_score = ~(u32)0; + cand = NULL; + candp = NULL; + now = jiffies; + + rthp = &rt_hash_table[hash].chain; + + spin_lock_bh(&rt_hash_table[hash].lock); + while ((rth = *rthp) != NULL) { +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (!(rth->u.dst.flags & DST_BALANCED) && + compare_keys(&rth->fl, &rt->fl)) { +#else + if (compare_keys(&rth->fl, &rt->fl)) { +#endif + /* Put it first */ + *rthp = rth->u.rt_next; + /* + * Since lookup is lockfree, the deletion + * must be visible to another weakly ordered CPU before + * the insertion at the start of the hash chain. + */ + rcu_assign_pointer(rth->u.rt_next, + rt_hash_table[hash].chain); + /* + * Since lookup is lockfree, the update writes + * must be ordered for consistency on SMP. + */ + rcu_assign_pointer(rt_hash_table[hash].chain, rth); + + rth->u.dst.__use++; + dst_hold(&rth->u.dst); + rth->u.dst.lastuse = now; + spin_unlock_bh(&rt_hash_table[hash].lock); + + rt_drop(rt); + *rp = rth; + return 0; + } + + if (!atomic_read(&rth->u.dst.__refcnt)) { + u32 score = rt_score(rth); + + if (score <= min_score) { + cand = rth; + candp = rthp; + min_score = score; + } + } + + chain_length++; + + rthp = &rth->u.rt_next; + } + + if (cand) { + /* ip_rt_gc_elasticity used to be average length of chain + * length, when exceeded gc becomes really aggressive. + * + * The second limit is less certain. At the moment it allows + * only 2 entries per bucket. We will see. + */ + if (chain_length > ip_rt_gc_elasticity) { + *candp = cand->u.rt_next; + rt_free(cand); + } + } + + /* Try to bind route to arp only if it is output + route or unicast forwarding path. + */ + if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { + int err = arp_bind_neighbour(&rt->u.dst); + if (err) { + spin_unlock_bh(&rt_hash_table[hash].lock); + + if (err != -ENOBUFS) { + rt_drop(rt); + return err; + } + + /* Neighbour tables are full and nothing + can be released. Try to shrink route cache, + it is most likely it holds some neighbour records. + */ + if (attempts-- > 0) { + int saved_elasticity = ip_rt_gc_elasticity; + int saved_int = ip_rt_gc_min_interval; + ip_rt_gc_elasticity = 1; + ip_rt_gc_min_interval = 0; + rt_garbage_collect(); + ip_rt_gc_min_interval = saved_int; + ip_rt_gc_elasticity = saved_elasticity; + goto restart; + } + + if (net_ratelimit()) + printk(KERN_WARNING "Neighbour table overflow.\n"); + rt_drop(rt); + return -ENOBUFS; + } + } + + rt->u.rt_next = rt_hash_table[hash].chain; +#if RT_CACHE_DEBUG >= 2 + if (rt->u.rt_next) { + struct rtable *trt; + printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash, + NIPQUAD(rt->rt_dst)); + for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next) + printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst)); + printk("\n"); + } +#endif + rt_hash_table[hash].chain = rt; + spin_unlock_bh(&rt_hash_table[hash].lock); + *rp = rt; + return 0; +} + +void rt_bind_peer(struct rtable *rt, int create) +{ + static DEFINE_SPINLOCK(rt_peer_lock); + struct inet_peer *peer; + + peer = inet_getpeer(rt->rt_dst, create); + + spin_lock_bh(&rt_peer_lock); + if (rt->peer == NULL) { + rt->peer = peer; + peer = NULL; + } + spin_unlock_bh(&rt_peer_lock); + if (peer) + inet_putpeer(peer); +} + +/* + * Peer allocation may fail only in serious out-of-memory conditions. However + * we still can generate some output. + * Random ID selection looks a bit dangerous because we have no chances to + * select ID being unique in a reasonable period of time. + * But broken packet identifier may be better than no packet at all. + */ +static void ip_select_fb_ident(struct iphdr *iph) +{ + static DEFINE_SPINLOCK(ip_fb_id_lock); + static u32 ip_fallback_id; + u32 salt; + + spin_lock_bh(&ip_fb_id_lock); + salt = secure_ip_id(ip_fallback_id ^ iph->daddr); + iph->id = htons(salt & 0xFFFF); + ip_fallback_id = salt; + spin_unlock_bh(&ip_fb_id_lock); +} + +void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +{ + struct rtable *rt = (struct rtable *) dst; + + if (rt) { + if (rt->peer == NULL) + rt_bind_peer(rt, 1); + + /* If peer is attached to destination, it is never detached, + so that we need not to grab a lock to dereference it. + */ + if (rt->peer) { + iph->id = htons(inet_getid(rt->peer, more)); + return; + } + } else + printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph)); + + ip_select_fb_ident(iph); +} + +static void rt_del(unsigned hash, struct rtable *rt) +{ + struct rtable **rthp; + + spin_lock_bh(&rt_hash_table[hash].lock); + ip_rt_put(rt); + for (rthp = &rt_hash_table[hash].chain; *rthp; + rthp = &(*rthp)->u.rt_next) + if (*rthp == rt) { + *rthp = rt->u.rt_next; + rt_free(rt); + break; + } + spin_unlock_bh(&rt_hash_table[hash].lock); +} + +void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, + u32 saddr, u8 tos, struct net_device *dev) +{ + int i, k; + struct in_device *in_dev = in_dev_get(dev); + struct rtable *rth, **rthp; + u32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; + + tos &= IPTOS_RT_MASK; + + if (!in_dev) + return; + + if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) + || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) + goto reject_redirect; + + if (!IN_DEV_SHARED_MEDIA(in_dev)) { + if (!inet_addr_onlink(in_dev, new_gw, old_gw)) + goto reject_redirect; + if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) + goto reject_redirect; + } else { + if (inet_addr_type(new_gw) != RTN_UNICAST) + goto reject_redirect; + } + + for (i = 0; i < 2; i++) { + for (k = 0; k < 2; k++) { + unsigned hash = rt_hash_code(daddr, + skeys[i] ^ (ikeys[k] << 5), + tos); + + rthp=&rt_hash_table[hash].chain; + + rcu_read_lock(); + while ((rth = rcu_dereference(*rthp)) != NULL) { + struct rtable *rt; + + if (rth->fl.fl4_dst != daddr || + rth->fl.fl4_src != skeys[i] || + rth->fl.fl4_tos != tos || + rth->fl.oif != ikeys[k] || + rth->fl.iif != 0) { + rthp = &rth->u.rt_next; + continue; + } + + if (rth->rt_dst != daddr || + rth->rt_src != saddr || + rth->u.dst.error || + rth->rt_gateway != old_gw || + rth->u.dst.dev != dev) + break; + + dst_hold(&rth->u.dst); + rcu_read_unlock(); + + rt = dst_alloc(&ipv4_dst_ops); + if (rt == NULL) { + ip_rt_put(rth); + in_dev_put(in_dev); + return; + } + + /* Copy all the information. */ + *rt = *rth; + INIT_RCU_HEAD(&rt->u.dst.rcu_head); + rt->u.dst.__use = 1; + atomic_set(&rt->u.dst.__refcnt, 1); + rt->u.dst.child = NULL; + if (rt->u.dst.dev) + dev_hold(rt->u.dst.dev); + if (rt->idev) + in_dev_hold(rt->idev); + rt->u.dst.obsolete = 0; + rt->u.dst.lastuse = jiffies; + rt->u.dst.path = &rt->u.dst; + rt->u.dst.neighbour = NULL; + rt->u.dst.hh = NULL; + rt->u.dst.xfrm = NULL; + + rt->rt_flags |= RTCF_REDIRECTED; + + /* Gateway is different ... */ + rt->rt_gateway = new_gw; + + /* Redirect received -> path was valid */ + dst_confirm(&rth->u.dst); + + if (rt->peer) + atomic_inc(&rt->peer->refcnt); + + if (arp_bind_neighbour(&rt->u.dst) || + !(rt->u.dst.neighbour->nud_state & + NUD_VALID)) { + if (rt->u.dst.neighbour) + neigh_event_send(rt->u.dst.neighbour, NULL); + ip_rt_put(rth); + rt_drop(rt); + goto do_next; + } + + rt_del(hash, rth); + if (!rt_intern_hash(hash, rt, &rt)) + ip_rt_put(rt); + goto do_next; + } + rcu_read_unlock(); + do_next: + ; + } + } + in_dev_put(in_dev); + return; + +reject_redirect: +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) + printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about " + "%u.%u.%u.%u ignored.\n" + " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, " + "tos %02x\n", + NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw), + NIPQUAD(saddr), NIPQUAD(daddr), tos); +#endif + in_dev_put(in_dev); +} + +static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable*)dst; + struct dst_entry *ret = dst; + + if (rt) { + if (dst->obsolete) { + ip_rt_put(rt); + ret = NULL; + } else if ((rt->rt_flags & RTCF_REDIRECTED) || + rt->u.dst.expires) { + unsigned hash = rt_hash_code(rt->fl.fl4_dst, + rt->fl.fl4_src ^ + (rt->fl.oif << 5), + rt->fl.fl4_tos); +#if RT_CACHE_DEBUG >= 1 + printk(KERN_DEBUG "ip_rt_advice: redirect to " + "%u.%u.%u.%u/%02x dropped\n", + NIPQUAD(rt->rt_dst), rt->fl.fl4_tos); +#endif + rt_del(hash, rt); + ret = NULL; + } + } + return ret; +} + +/* + * Algorithm: + * 1. The first ip_rt_redirect_number redirects are sent + * with exponential backoff, then we stop sending them at all, + * assuming that the host ignores our redirects. + * 2. If we did not see packets requiring redirects + * during ip_rt_redirect_silence, we assume that the host + * forgot redirected route and start to send redirects again. + * + * This algorithm is much cheaper and more intelligent than dumb load limiting + * in icmp.c. + * + * NOTE. Do not forget to inhibit load limiting for redirects (redundant) + * and "frag. need" (breaks PMTU discovery) in icmp.c. + */ + +void ip_rt_send_redirect(struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable*)skb->dst; + struct in_device *in_dev = in_dev_get(rt->u.dst.dev); + + if (!in_dev) + return; + + if (!IN_DEV_TX_REDIRECTS(in_dev)) + goto out; + + /* No redirected packets during ip_rt_redirect_silence; + * reset the algorithm. + */ + if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence)) + rt->u.dst.rate_tokens = 0; + + /* Too many ignored redirects; do not send anything + * set u.dst.rate_last to the last seen redirected packet. + */ + if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { + rt->u.dst.rate_last = jiffies; + goto out; + } + + /* Check for load limit; set rate_last to the latest sent + * redirect. + */ + if (time_after(jiffies, + (rt->u.dst.rate_last + + (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) { + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + rt->u.dst.rate_last = jiffies; + ++rt->u.dst.rate_tokens; +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && + rt->u.dst.rate_tokens == ip_rt_redirect_number && + net_ratelimit()) + printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores " + "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n", + NIPQUAD(rt->rt_src), rt->rt_iif, + NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway)); +#endif + } +out: + in_dev_put(in_dev); +} + +static int ip_error(struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable*)skb->dst; + unsigned long now; + int code; + + switch (rt->u.dst.error) { + case EINVAL: + default: + goto out; + case EHOSTUNREACH: + code = ICMP_HOST_UNREACH; + break; + case ENETUNREACH: + code = ICMP_NET_UNREACH; + break; + case EACCES: + code = ICMP_PKT_FILTERED; + break; + } + + now = jiffies; + rt->u.dst.rate_tokens += now - rt->u.dst.rate_last; + if (rt->u.dst.rate_tokens > ip_rt_error_burst) + rt->u.dst.rate_tokens = ip_rt_error_burst; + rt->u.dst.rate_last = now; + if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { + rt->u.dst.rate_tokens -= ip_rt_error_cost; + icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + } + +out: kfree_skb(skb); + return 0; +} + +/* + * The last two values are not from the RFC but + * are needed for AMPRnet AX.25 paths. + */ + +static unsigned short mtu_plateau[] = +{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; + +static __inline__ unsigned short guess_mtu(unsigned short old_mtu) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) + if (old_mtu > mtu_plateau[i]) + return mtu_plateau[i]; + return 68; +} + +unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) +{ + int i; + unsigned short old_mtu = ntohs(iph->tot_len); + struct rtable *rth; + u32 skeys[2] = { iph->saddr, 0, }; + u32 daddr = iph->daddr; + u8 tos = iph->tos & IPTOS_RT_MASK; + unsigned short est_mtu = 0; + + if (ipv4_config.no_pmtu_disc) + return 0; + + for (i = 0; i < 2; i++) { + unsigned hash = rt_hash_code(daddr, skeys[i], tos); + + rcu_read_lock(); + for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; + rth = rcu_dereference(rth->u.rt_next)) { + if (rth->fl.fl4_dst == daddr && + rth->fl.fl4_src == skeys[i] && + rth->rt_dst == daddr && + rth->rt_src == iph->saddr && + rth->fl.fl4_tos == tos && + rth->fl.iif == 0 && + !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) { + unsigned short mtu = new_mtu; + + if (new_mtu < 68 || new_mtu >= old_mtu) { + + /* BSD 4.2 compatibility hack :-( */ + if (mtu == 0 && + old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] && + old_mtu >= 68 + (iph->ihl << 2)) + old_mtu -= iph->ihl << 2; + + mtu = guess_mtu(old_mtu); + } + if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) { + if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { + dst_confirm(&rth->u.dst); + if (mtu < ip_rt_min_pmtu) { + mtu = ip_rt_min_pmtu; + rth->u.dst.metrics[RTAX_LOCK-1] |= + (1 << RTAX_MTU); + } + rth->u.dst.metrics[RTAX_MTU-1] = mtu; + dst_set_expires(&rth->u.dst, + ip_rt_mtu_expires); + } + est_mtu = mtu; + } + } + } + rcu_read_unlock(); + } + return est_mtu ? : new_mtu; +} + +static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 && + !(dst_metric_locked(dst, RTAX_MTU))) { + if (mtu < ip_rt_min_pmtu) { + mtu = ip_rt_min_pmtu; + dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); + } + dst->metrics[RTAX_MTU-1] = mtu; + dst_set_expires(dst, ip_rt_mtu_expires); + } +} + +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +{ + return NULL; +} + +static void ipv4_dst_destroy(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer = rt->peer; + struct in_device *idev = rt->idev; + + if (peer) { + rt->peer = NULL; + inet_putpeer(peer); + } + + if (idev) { + rt->idev = NULL; + in_dev_put(idev); + } +} + +static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int how) +{ + struct rtable *rt = (struct rtable *) dst; + struct in_device *idev = rt->idev; + if (dev != &loopback_dev && idev && idev->dev == dev) { + struct in_device *loopback_idev = in_dev_get(&loopback_dev); + if (loopback_idev) { + rt->idev = loopback_idev; + in_dev_put(idev); + } + } +} + +static void ipv4_link_failure(struct sk_buff *skb) +{ + struct rtable *rt; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + + rt = (struct rtable *) skb->dst; + if (rt) + dst_set_expires(&rt->u.dst, 0); +} + +static int ip_rt_bug(struct sk_buff *skb) +{ + printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n", + NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr), + skb->dev ? skb->dev->name : "?"); + kfree_skb(skb); + return 0; +} + +/* + We do not cache source address of outgoing interface, + because it is used only by IP RR, TS and SRR options, + so that it out of fast path. + + BTW remember: "addr" is allowed to be not aligned + in IP options! + */ + +void ip_rt_get_source(u8 *addr, struct rtable *rt) +{ + u32 src; + struct fib_result res; + + if (rt->fl.iif == 0) + src = rt->rt_src; + else if (fib_lookup(&rt->fl, &res) == 0) { + src = FIB_RES_PREFSRC(res); + fib_res_put(&res); + } else + src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, + RT_SCOPE_UNIVERSE); + memcpy(addr, &src, 4); +} + +#ifdef CONFIG_NET_CLS_ROUTE +static void set_class_tag(struct rtable *rt, u32 tag) +{ + if (!(rt->u.dst.tclassid & 0xFFFF)) + rt->u.dst.tclassid |= tag & 0xFFFF; + if (!(rt->u.dst.tclassid & 0xFFFF0000)) + rt->u.dst.tclassid |= tag & 0xFFFF0000; +} +#endif + +static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) +{ + struct fib_info *fi = res->fi; + + if (fi) { + if (FIB_RES_GW(*res) && + FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + rt->rt_gateway = FIB_RES_GW(*res); + memcpy(rt->u.dst.metrics, fi->fib_metrics, + sizeof(rt->u.dst.metrics)); + if (fi->fib_mtu == 0) { + rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; + if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) && + rt->rt_gateway != rt->rt_dst && + rt->u.dst.dev->mtu > 576) + rt->u.dst.metrics[RTAX_MTU-1] = 576; + } +#ifdef CONFIG_NET_CLS_ROUTE + rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; +#endif + } else + rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; + + if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; + if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU) + rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; + if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0) + rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, + ip_rt_min_advmss); + if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40) + rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; + +#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_MULTIPLE_TABLES + set_class_tag(rt, fib_rules_tclass(res)); +#endif + set_class_tag(rt, itag); +#endif + rt->rt_type = res->type; +} + +static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, + u8 tos, struct net_device *dev, int our) +{ + unsigned hash; + struct rtable *rth; + u32 spec_dst; + struct in_device *in_dev = in_dev_get(dev); + u32 itag = 0; + + /* Primary sanity checks. */ + + if (in_dev == NULL) + return -EINVAL; + + if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || + skb->protocol != htons(ETH_P_IP)) + goto e_inval; + + if (ZERONET(saddr)) { + if (!LOCAL_MCAST(daddr)) + goto e_inval; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); + } else if (fib_validate_source(saddr, 0, tos, 0, + dev, &spec_dst, &itag) < 0) + goto e_inval; + + rth = dst_alloc(&ipv4_dst_ops); + if (!rth) + goto e_nobufs; + + rth->u.dst.output= ip_rt_bug; + + atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; + if (in_dev->cnf.no_policy) + rth->u.dst.flags |= DST_NOPOLICY; + rth->fl.fl4_dst = daddr; + rth->rt_dst = daddr; + rth->fl.fl4_tos = tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= skb->nfmark; +#endif + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; +#ifdef CONFIG_NET_CLS_ROUTE + rth->u.dst.tclassid = itag; +#endif + rth->rt_iif = + rth->fl.iif = dev->ifindex; + rth->u.dst.dev = &loopback_dev; + dev_hold(rth->u.dst.dev); + rth->idev = in_dev_get(rth->u.dst.dev); + rth->fl.oif = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->rt_type = RTN_MULTICAST; + rth->rt_flags = RTCF_MULTICAST; + if (our) { + rth->u.dst.input= ip_local_deliver; + rth->rt_flags |= RTCF_LOCAL; + } + +#ifdef CONFIG_IP_MROUTE + if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) + rth->u.dst.input = ip_mr_input; +#endif + RT_CACHE_STAT_INC(in_slow_mc); + + in_dev_put(in_dev); + hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos); + return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst); + +e_nobufs: + in_dev_put(in_dev); + return -ENOBUFS; + +e_inval: + in_dev_put(in_dev); + return -EINVAL; +} + + +static void ip_handle_martian_source(struct net_device *dev, + struct in_device *in_dev, + struct sk_buff *skb, + u32 daddr, + u32 saddr) +{ + RT_CACHE_STAT_INC(in_martian_src); +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { + /* + * RFC1812 recommendation, if source is martian, + * the only hint is MAC header. + */ + printk(KERN_WARNING "martian source %u.%u.%u.%u from " + "%u.%u.%u.%u, on dev %s\n", + NIPQUAD(daddr), NIPQUAD(saddr), dev->name); + if (dev->hard_header_len) { + int i; + unsigned char *p = skb->mac.raw; + printk(KERN_WARNING "ll header: "); + for (i = 0; i < dev->hard_header_len; i++, p++) { + printk("%02x", *p); + if (i < (dev->hard_header_len - 1)) + printk(":"); + } + printk("\n"); + } + } +#endif +} + +static inline int __mkroute_input(struct sk_buff *skb, + struct fib_result* res, + struct in_device *in_dev, + u32 daddr, u32 saddr, u32 tos, + struct rtable **result) +{ + + struct rtable *rth; + int err; + struct in_device *out_dev; + unsigned flags = 0; + u32 spec_dst, itag; + + /* get a working reference to the output device */ + out_dev = in_dev_get(FIB_RES_DEV(*res)); + if (out_dev == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "Bug in ip_route_input" \ + "_slow(). Please, report\n"); + return -EINVAL; + } + + + err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), + in_dev->dev, &spec_dst, &itag); + if (err < 0) { + ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, + saddr); + + err = -EINVAL; + goto cleanup; + } + + if (err) + flags |= RTCF_DIRECTSRC; + + if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) && + (IN_DEV_SHARED_MEDIA(out_dev) || + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + flags |= RTCF_DOREDIRECT; + + if (skb->protocol != htons(ETH_P_IP)) { + /* Not IP (i.e. ARP). Do not create route, if it is + * invalid for proxy arp. DNAT routes are always valid. + */ + if (out_dev == in_dev && !(flags & RTCF_DNAT)) { + err = -EINVAL; + goto cleanup; + } + } + + + rth = dst_alloc(&ipv4_dst_ops); + if (!rth) { + err = -ENOBUFS; + goto cleanup; + } + + rth->u.dst.flags= DST_HOST; +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (res->fi->fib_nhs > 1) + rth->u.dst.flags |= DST_BALANCED; +#endif + if (in_dev->cnf.no_policy) + rth->u.dst.flags |= DST_NOPOLICY; + if (in_dev->cnf.no_xfrm) + rth->u.dst.flags |= DST_NOXFRM; + rth->fl.fl4_dst = daddr; + rth->rt_dst = daddr; + rth->fl.fl4_tos = tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= skb->nfmark; +#endif + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; + rth->rt_gateway = daddr; + rth->rt_iif = + rth->fl.iif = in_dev->dev->ifindex; + rth->u.dst.dev = (out_dev)->dev; + dev_hold(rth->u.dst.dev); + rth->idev = in_dev_get(rth->u.dst.dev); + rth->fl.oif = 0; + rth->rt_spec_dst= spec_dst; + + rth->u.dst.input = ip_forward; + rth->u.dst.output = ip_output; + + rt_set_nexthop(rth, res, itag); + + rth->rt_flags = flags; + + *result = rth; + err = 0; + cleanup: + /* release the working reference to the output device */ + in_dev_put(out_dev); + return err; +} + +static inline int ip_mkroute_input_def(struct sk_buff *skb, + struct fib_result* res, + const struct flowi *fl, + struct in_device *in_dev, + u32 daddr, u32 saddr, u32 tos) +{ + struct rtable* rth; + int err; + unsigned hash; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) + fib_select_multipath(fl, res); +#endif + + /* create a routing cache entry */ + err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); + if (err) + return err; + atomic_set(&rth->u.dst.__refcnt, 1); + + /* put it into the cache */ + hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos); + return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); +} + +static inline int ip_mkroute_input(struct sk_buff *skb, + struct fib_result* res, + const struct flowi *fl, + struct in_device *in_dev, + u32 daddr, u32 saddr, u32 tos) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + struct rtable* rth; + unsigned char hop, hopcount, lasthop; + int err = -EINVAL; + unsigned int hash; + + if (res->fi) + hopcount = res->fi->fib_nhs; + else + hopcount = 1; + + lasthop = hopcount - 1; + + /* distinguish between multipath and singlepath */ + if (hopcount < 2) + return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, + saddr, tos); + + /* add all alternatives to the routing cache */ + for (hop = 0; hop < hopcount; hop++) { + res->nh_sel = hop; + + /* create a routing cache entry */ + err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, + &rth); + if (err) + return err; + + /* put it into the cache */ + hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos); + err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); + if (err) + return err; + + /* forward hop information to multipath impl. */ + multipath_set_nhinfo(rth, + FIB_RES_NETWORK(*res), + FIB_RES_NETMASK(*res), + res->prefixlen, + &FIB_RES_NH(*res)); + + /* only for the last hop the reference count is handled + * outside + */ + if (hop == lasthop) + atomic_set(&(skb->dst->__refcnt), 1); + } + return err; +#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos); +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ +} + + +/* + * NOTE. We drop all the packets that has local source + * addresses, because every properly looped back packet + * must have correct destination already attached by output routine. + * + * Such approach solves two big problems: + * 1. Not simplex devices are handled properly. + * 2. IP spoofing attempts are filtered with 100% of guarantee. + */ + +static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, + u8 tos, struct net_device *dev) +{ + struct fib_result res; + struct in_device *in_dev = in_dev_get(dev); + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = saddr, + .tos = tos, + .scope = RT_SCOPE_UNIVERSE, +#ifdef CONFIG_IP_ROUTE_FWMARK + .fwmark = skb->nfmark +#endif + } }, + .iif = dev->ifindex }; + unsigned flags = 0; + u32 itag = 0; + struct rtable * rth; + unsigned hash; + u32 spec_dst; + int err = -EINVAL; + int free_res = 0; + + /* IP on this device is disabled. */ + + if (!in_dev) + goto out; + + /* Check for the most weird martians, which can be not detected + by fib_lookup. + */ + + if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) + goto martian_source; + + if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0)) + goto brd_input; + + /* Accept zero addresses only to limited broadcast; + * I even do not know to fix it or not. Waiting for complains :-) + */ + if (ZERONET(saddr)) + goto martian_source; + + if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) + goto martian_destination; + + /* + * Now we are ready to route packet. + */ + if ((err = fib_lookup(&fl, &res)) != 0) { + if (!IN_DEV_FORWARD(in_dev)) + goto e_inval; + goto no_route; + } + free_res = 1; + + RT_CACHE_STAT_INC(in_slow_tot); + + if (res.type == RTN_BROADCAST) + goto brd_input; + + if (res.type == RTN_LOCAL) { + int result; + result = fib_validate_source(saddr, daddr, tos, + loopback_dev.ifindex, + dev, &spec_dst, &itag); + if (result < 0) + goto martian_source; + if (result) + flags |= RTCF_DIRECTSRC; + spec_dst = daddr; + goto local_input; + } + + if (!IN_DEV_FORWARD(in_dev)) + goto e_inval; + if (res.type != RTN_UNICAST) + goto martian_destination; + + err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); + if (err == -ENOBUFS) + goto e_nobufs; + if (err == -EINVAL) + goto e_inval; + +done: + in_dev_put(in_dev); + if (free_res) + fib_res_put(&res); +out: return err; + +brd_input: + if (skb->protocol != htons(ETH_P_IP)) + goto e_inval; + + if (ZERONET(saddr)) + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); + else { + err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, + &itag); + if (err < 0) + goto martian_source; + if (err) + flags |= RTCF_DIRECTSRC; + } + flags |= RTCF_BROADCAST; + res.type = RTN_BROADCAST; + RT_CACHE_STAT_INC(in_brd); + +local_input: + rth = dst_alloc(&ipv4_dst_ops); + if (!rth) + goto e_nobufs; + + rth->u.dst.output= ip_rt_bug; + + atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; + if (in_dev->cnf.no_policy) + rth->u.dst.flags |= DST_NOPOLICY; + rth->fl.fl4_dst = daddr; + rth->rt_dst = daddr; + rth->fl.fl4_tos = tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= skb->nfmark; +#endif + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; +#ifdef CONFIG_NET_CLS_ROUTE + rth->u.dst.tclassid = itag; +#endif + rth->rt_iif = + rth->fl.iif = dev->ifindex; + rth->u.dst.dev = &loopback_dev; + dev_hold(rth->u.dst.dev); + rth->idev = in_dev_get(rth->u.dst.dev); + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->u.dst.input= ip_local_deliver; + rth->rt_flags = flags|RTCF_LOCAL; + if (res.type == RTN_UNREACHABLE) { + rth->u.dst.input= ip_error; + rth->u.dst.error= -err; + rth->rt_flags &= ~RTCF_LOCAL; + } + rth->rt_type = res.type; + hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos); + err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); + goto done; + +no_route: + RT_CACHE_STAT_INC(in_no_route); + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + res.type = RTN_UNREACHABLE; + goto local_input; + + /* + * Do not cache martian addresses: they should be logged (RFC1812) + */ +martian_destination: + RT_CACHE_STAT_INC(in_martian_dst); +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) + printk(KERN_WARNING "martian destination %u.%u.%u.%u from " + "%u.%u.%u.%u, dev %s\n", + NIPQUAD(daddr), NIPQUAD(saddr), dev->name); +#endif +e_inval: + err = -EINVAL; + goto done; + +e_nobufs: + err = -ENOBUFS; + goto done; + +martian_source: + ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); + goto e_inval; +} + +int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, + u8 tos, struct net_device *dev) +{ + struct rtable * rth; + unsigned hash; + int iif = dev->ifindex; + + tos &= IPTOS_RT_MASK; + hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos); + + rcu_read_lock(); + for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; + rth = rcu_dereference(rth->u.rt_next)) { + if (rth->fl.fl4_dst == daddr && + rth->fl.fl4_src == saddr && + rth->fl.iif == iif && + rth->fl.oif == 0 && +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark == skb->nfmark && +#endif + rth->fl.fl4_tos == tos) { + rth->u.dst.lastuse = jiffies; + dst_hold(&rth->u.dst); + rth->u.dst.__use++; + RT_CACHE_STAT_INC(in_hit); + rcu_read_unlock(); + skb->dst = (struct dst_entry*)rth; + return 0; + } + RT_CACHE_STAT_INC(in_hlist_search); + } + rcu_read_unlock(); + + /* Multicast recognition logic is moved from route cache to here. + The problem was that too many Ethernet cards have broken/missing + hardware multicast filters :-( As result the host on multicasting + network acquires a lot of useless route cache entries, sort of + SDR messages from all the world. Now we try to get rid of them. + Really, provided software IP multicast filter is organized + reasonably (at least, hashed), it does not result in a slowdown + comparing with route cache reject entries. + Note, that multicast routers are not affected, because + route cache entry is created eventually. + */ + if (MULTICAST(daddr)) { + struct in_device *in_dev; + + rcu_read_lock(); + if ((in_dev = __in_dev_get(dev)) != NULL) { + int our = ip_check_mc(in_dev, daddr, saddr, + skb->nh.iph->protocol); + if (our +#ifdef CONFIG_IP_MROUTE + || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) +#endif + ) { + rcu_read_unlock(); + return ip_route_input_mc(skb, daddr, saddr, + tos, dev, our); + } + } + rcu_read_unlock(); + return -EINVAL; + } + return ip_route_input_slow(skb, daddr, saddr, tos, dev); +} + +static inline int __mkroute_output(struct rtable **result, + struct fib_result* res, + const struct flowi *fl, + const struct flowi *oldflp, + struct net_device *dev_out, + unsigned flags) +{ + struct rtable *rth; + struct in_device *in_dev; + u32 tos = RT_FL_TOS(oldflp); + int err = 0; + + if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) + return -EINVAL; + + if (fl->fl4_dst == 0xFFFFFFFF) + res->type = RTN_BROADCAST; + else if (MULTICAST(fl->fl4_dst)) + res->type = RTN_MULTICAST; + else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst)) + return -EINVAL; + + if (dev_out->flags & IFF_LOOPBACK) + flags |= RTCF_LOCAL; + + /* get work reference to inet device */ + in_dev = in_dev_get(dev_out); + if (!in_dev) + return -EINVAL; + + if (res->type == RTN_BROADCAST) { + flags |= RTCF_BROADCAST | RTCF_LOCAL; + if (res->fi) { + fib_info_put(res->fi); + res->fi = NULL; + } + } else if (res->type == RTN_MULTICAST) { + flags |= RTCF_MULTICAST|RTCF_LOCAL; + if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, + oldflp->proto)) + flags &= ~RTCF_LOCAL; + /* If multicast route do not exist use + default one, but do not gateway in this case. + Yes, it is hack. + */ + if (res->fi && res->prefixlen < 4) { + fib_info_put(res->fi); + res->fi = NULL; + } + } + + + rth = dst_alloc(&ipv4_dst_ops); + if (!rth) { + err = -ENOBUFS; + goto cleanup; + } + + rth->u.dst.flags= DST_HOST; +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (res->fi) { + rth->rt_multipath_alg = res->fi->fib_mp_alg; + if (res->fi->fib_nhs > 1) + rth->u.dst.flags |= DST_BALANCED; + } +#endif + if (in_dev->cnf.no_xfrm) + rth->u.dst.flags |= DST_NOXFRM; + if (in_dev->cnf.no_policy) + rth->u.dst.flags |= DST_NOPOLICY; + + rth->fl.fl4_dst = oldflp->fl4_dst; + rth->fl.fl4_tos = tos; + rth->fl.fl4_src = oldflp->fl4_src; + rth->fl.oif = oldflp->oif; +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= oldflp->fl4_fwmark; +#endif + rth->rt_dst = fl->fl4_dst; + rth->rt_src = fl->fl4_src; + rth->rt_iif = oldflp->oif ? : dev_out->ifindex; + /* get references to the devices that are to be hold by the routing + cache entry */ + rth->u.dst.dev = dev_out; + dev_hold(dev_out); + rth->idev = in_dev_get(dev_out); + rth->rt_gateway = fl->fl4_dst; + rth->rt_spec_dst= fl->fl4_src; + + rth->u.dst.output=ip_output; + + RT_CACHE_STAT_INC(out_slow_tot); + + if (flags & RTCF_LOCAL) { + rth->u.dst.input = ip_local_deliver; + rth->rt_spec_dst = fl->fl4_dst; + } + if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { + rth->rt_spec_dst = fl->fl4_src; + if (flags & RTCF_LOCAL && + !(dev_out->flags & IFF_LOOPBACK)) { + rth->u.dst.output = ip_mc_output; + RT_CACHE_STAT_INC(out_slow_mc); + } +#ifdef CONFIG_IP_MROUTE + if (res->type == RTN_MULTICAST) { + if (IN_DEV_MFORWARD(in_dev) && + !LOCAL_MCAST(oldflp->fl4_dst)) { + rth->u.dst.input = ip_mr_input; + rth->u.dst.output = ip_mc_output; + } + } +#endif + } + + rt_set_nexthop(rth, res, 0); + + rth->rt_flags = flags; + + *result = rth; + cleanup: + /* release work reference to inet device */ + in_dev_put(in_dev); + + return err; +} + +static inline int ip_mkroute_output_def(struct rtable **rp, + struct fib_result* res, + const struct flowi *fl, + const struct flowi *oldflp, + struct net_device *dev_out, + unsigned flags) +{ + struct rtable *rth; + int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); + unsigned hash; + if (err == 0) { + u32 tos = RT_FL_TOS(oldflp); + + atomic_set(&rth->u.dst.__refcnt, 1); + + hash = rt_hash_code(oldflp->fl4_dst, + oldflp->fl4_src ^ (oldflp->oif << 5), tos); + err = rt_intern_hash(hash, rth, rp); + } + + return err; +} + +static inline int ip_mkroute_output(struct rtable** rp, + struct fib_result* res, + const struct flowi *fl, + const struct flowi *oldflp, + struct net_device *dev_out, + unsigned flags) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + u32 tos = RT_FL_TOS(oldflp); + unsigned char hop; + unsigned hash; + int err = -EINVAL; + struct rtable *rth; + + if (res->fi && res->fi->fib_nhs > 1) { + unsigned char hopcount = res->fi->fib_nhs; + + for (hop = 0; hop < hopcount; hop++) { + struct net_device *dev2nexthop; + + res->nh_sel = hop; + + /* hold a work reference to the output device */ + dev2nexthop = FIB_RES_DEV(*res); + dev_hold(dev2nexthop); + + err = __mkroute_output(&rth, res, fl, oldflp, + dev2nexthop, flags); + + if (err != 0) + goto cleanup; + + hash = rt_hash_code(oldflp->fl4_dst, + oldflp->fl4_src ^ + (oldflp->oif << 5), tos); + err = rt_intern_hash(hash, rth, rp); + + /* forward hop information to multipath impl. */ + multipath_set_nhinfo(rth, + FIB_RES_NETWORK(*res), + FIB_RES_NETMASK(*res), + res->prefixlen, + &FIB_RES_NH(*res)); + cleanup: + /* release work reference to output device */ + dev_put(dev2nexthop); + + if (err != 0) + return err; + } + atomic_set(&(*rp)->u.dst.__refcnt, 1); + return err; + } else { + return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, + flags); + } +#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags); +#endif +} + +/* + * Major route resolver routine. + */ + +static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) +{ + u32 tos = RT_FL_TOS(oldflp); + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = oldflp->fl4_dst, + .saddr = oldflp->fl4_src, + .tos = tos & IPTOS_RT_MASK, + .scope = ((tos & RTO_ONLINK) ? + RT_SCOPE_LINK : + RT_SCOPE_UNIVERSE), +#ifdef CONFIG_IP_ROUTE_FWMARK + .fwmark = oldflp->fl4_fwmark +#endif + } }, + .iif = loopback_dev.ifindex, + .oif = oldflp->oif }; + struct fib_result res; + unsigned flags = 0; + struct net_device *dev_out = NULL; + int free_res = 0; + int err; + + + res.fi = NULL; +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + if (oldflp->fl4_src) { + err = -EINVAL; + if (MULTICAST(oldflp->fl4_src) || + BADCLASS(oldflp->fl4_src) || + ZERONET(oldflp->fl4_src)) + goto out; + + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(oldflp->fl4_src); + if (dev_out == NULL) + goto out; + + /* I removed check for oif == dev_out->oif here. + It was wrong for two reasons: + 1. ip_dev_find(saddr) can return wrong iface, if saddr is + assigned to multiple interfaces. + 2. Moreover, we are allowed to send packets with saddr + of another iface. --ANK + */ + + if (oldflp->oif == 0 + && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) { + /* Special hack: user can direct multicasts + and limited broadcast via necessary interface + without fiddling with IP_MULTICAST_IF or IP_PKTINFO. + This hack is not just for fun, it allows + vic,vat and friends to work. + They bind socket to loopback, set ttl to zero + and expect that it will work. + From the viewpoint of routing cache they are broken, + because we are not allowed to build multicast path + with loopback source addr (look, routing cache + cannot know, that ttl is zero, so that packet + will not leave this host and route is valid). + Luckily, this hack is good workaround. + */ + + fl.oif = dev_out->ifindex; + goto make_route; + } + if (dev_out) + dev_put(dev_out); + dev_out = NULL; + } + + + if (oldflp->oif) { + dev_out = dev_get_by_index(oldflp->oif); + err = -ENODEV; + if (dev_out == NULL) + goto out; + if (__in_dev_get(dev_out) == NULL) { + dev_put(dev_out); + goto out; /* Wrong error code */ + } + + if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) { + if (!fl.fl4_src) + fl.fl4_src = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); + goto make_route; + } + if (!fl.fl4_src) { + if (MULTICAST(oldflp->fl4_dst)) + fl.fl4_src = inet_select_addr(dev_out, 0, + fl.fl4_scope); + else if (!oldflp->fl4_dst) + fl.fl4_src = inet_select_addr(dev_out, 0, + RT_SCOPE_HOST); + } + } + + if (!fl.fl4_dst) { + fl.fl4_dst = fl.fl4_src; + if (!fl.fl4_dst) + fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); + if (dev_out) + dev_put(dev_out); + dev_out = &loopback_dev; + dev_hold(dev_out); + fl.oif = loopback_dev.ifindex; + res.type = RTN_LOCAL; + flags |= RTCF_LOCAL; + goto make_route; + } + + if (fib_lookup(&fl, &res)) { + res.fi = NULL; + if (oldflp->oif) { + /* Apparently, routing tables are wrong. Assume, + that the destination is on link. + + WHY? DW. + Because we are allowed to send to iface + even if it has NO routes and NO assigned + addresses. When oif is specified, routing + tables are looked up with only one purpose: + to catch if destination is gatewayed, rather than + direct. Moreover, if MSG_DONTROUTE is set, + we send packet, ignoring both routing tables + and ifaddr state. --ANK + + + We could make it even if oif is unknown, + likely IPv6, but we do not. + */ + + if (fl.fl4_src == 0) + fl.fl4_src = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); + res.type = RTN_UNICAST; + goto make_route; + } + if (dev_out) + dev_put(dev_out); + err = -ENETUNREACH; + goto out; + } + free_res = 1; + + if (res.type == RTN_LOCAL) { + if (!fl.fl4_src) + fl.fl4_src = fl.fl4_dst; + if (dev_out) + dev_put(dev_out); + dev_out = &loopback_dev; + dev_hold(dev_out); + fl.oif = dev_out->ifindex; + if (res.fi) + fib_info_put(res.fi); + res.fi = NULL; + flags |= RTCF_LOCAL; + goto make_route; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res.fi->fib_nhs > 1 && fl.oif == 0) + fib_select_multipath(&fl, &res); + else +#endif + if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) + fib_select_default(&fl, &res); + + if (!fl.fl4_src) + fl.fl4_src = FIB_RES_PREFSRC(res); + + if (dev_out) + dev_put(dev_out); + dev_out = FIB_RES_DEV(res); + dev_hold(dev_out); + fl.oif = dev_out->ifindex; + + +make_route: + err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); + + + if (free_res) + fib_res_put(&res); + if (dev_out) + dev_put(dev_out); +out: return err; +} + +int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) +{ + unsigned hash; + struct rtable *rth; + + hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos); + + rcu_read_lock_bh(); + for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; + rth = rcu_dereference(rth->u.rt_next)) { + if (rth->fl.fl4_dst == flp->fl4_dst && + rth->fl.fl4_src == flp->fl4_src && + rth->fl.iif == 0 && + rth->fl.oif == flp->oif && +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark == flp->fl4_fwmark && +#endif + !((rth->fl.fl4_tos ^ flp->fl4_tos) & + (IPTOS_RT_MASK | RTO_ONLINK))) { + + /* check for multipath routes and choose one if + * necessary + */ + if (multipath_select_route(flp, rth, rp)) { + dst_hold(&(*rp)->u.dst); + RT_CACHE_STAT_INC(out_hit); + rcu_read_unlock_bh(); + return 0; + } + + rth->u.dst.lastuse = jiffies; + dst_hold(&rth->u.dst); + rth->u.dst.__use++; + RT_CACHE_STAT_INC(out_hit); + rcu_read_unlock_bh(); + *rp = rth; + return 0; + } + RT_CACHE_STAT_INC(out_hlist_search); + } + rcu_read_unlock_bh(); + + return ip_route_output_slow(rp, flp); +} + +int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) +{ + int err; + + if ((err = __ip_route_output_key(rp, flp)) != 0) + return err; + + if (flp->proto) { + if (!flp->fl4_src) + flp->fl4_src = (*rp)->rt_src; + if (!flp->fl4_dst) + flp->fl4_dst = (*rp)->rt_dst; + return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags); + } + + return 0; +} + +int ip_route_output_key(struct rtable **rp, struct flowi *flp) +{ + return ip_route_output_flow(rp, flp, NULL, 0); +} + +static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + int nowait) +{ + struct rtable *rt = (struct rtable*)skb->dst; + struct rtmsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rta_cacheinfo ci; +#ifdef CONFIG_IP_MROUTE + struct rtattr *eptr; +#endif + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); + r = NLMSG_DATA(nlh); + nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; + r->rtm_family = AF_INET; + r->rtm_dst_len = 32; + r->rtm_src_len = 0; + r->rtm_tos = rt->fl.fl4_tos; + r->rtm_table = RT_TABLE_MAIN; + r->rtm_type = rt->rt_type; + r->rtm_scope = RT_SCOPE_UNIVERSE; + r->rtm_protocol = RTPROT_UNSPEC; + r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; + if (rt->rt_flags & RTCF_NOTIFY) + r->rtm_flags |= RTM_F_NOTIFY; + RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); + if (rt->fl.fl4_src) { + r->rtm_src_len = 32; + RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src); + } + if (rt->u.dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); +#ifdef CONFIG_NET_CLS_ROUTE + if (rt->u.dst.tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid); +#endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (rt->rt_multipath_alg != IP_MP_ALG_NONE) { + __u32 alg = rt->rt_multipath_alg; + + RTA_PUT(skb, RTA_MP_ALGO, 4, &alg); + } +#endif + if (rt->fl.iif) + RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); + else if (rt->rt_src != rt->fl.fl4_src) + RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src); + if (rt->rt_dst != rt->rt_gateway) + RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); + if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + goto rtattr_failure; + ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse); + ci.rta_used = rt->u.dst.__use; + ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt); + if (rt->u.dst.expires) + ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies); + else + ci.rta_expires = 0; + ci.rta_error = rt->u.dst.error; + ci.rta_id = ci.rta_ts = ci.rta_tsage = 0; + if (rt->peer) { + ci.rta_id = rt->peer->ip_id_count; + if (rt->peer->tcp_ts_stamp) { + ci.rta_ts = rt->peer->tcp_ts; + ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp; + } + } +#ifdef CONFIG_IP_MROUTE + eptr = (struct rtattr*)skb->tail; +#endif + RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); + if (rt->fl.iif) { +#ifdef CONFIG_IP_MROUTE + u32 dst = rt->rt_dst; + + if (MULTICAST(dst) && !LOCAL_MCAST(dst) && + ipv4_devconf.mc_forwarding) { + int err = ipmr_get_route(skb, r, nowait); + if (err <= 0) { + if (!nowait) { + if (err == 0) + return 0; + goto nlmsg_failure; + } else { + if (err == -EMSGSIZE) + goto nlmsg_failure; + ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err; + } + } + } else +#endif + RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif); + } + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct rtable *rt = NULL; + u32 dst = 0; + u32 src = 0; + int iif = 0; + int err = -ENOBUFS; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + goto out; + + /* Reserve room for dummy headers, this skb can pass + through good chunk of routing engine. + */ + skb->mac.raw = skb->data; + skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); + + if (rta[RTA_SRC - 1]) + memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4); + if (rta[RTA_DST - 1]) + memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4); + if (rta[RTA_IIF - 1]) + memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int)); + + if (iif) { + struct net_device *dev = __dev_get_by_index(iif); + err = -ENODEV; + if (!dev) + goto out_free; + skb->protocol = htons(ETH_P_IP); + skb->dev = dev; + local_bh_disable(); + err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); + local_bh_enable(); + rt = (struct rtable*)skb->dst; + if (!err && rt->u.dst.error) + err = -rt->u.dst.error; + } else { + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst, + .saddr = src, + .tos = rtm->rtm_tos } } }; + int oif = 0; + if (rta[RTA_OIF - 1]) + memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int)); + fl.oif = oif; + err = ip_route_output_key(&rt, &fl); + } + if (err) + goto out_free; + + skb->dst = &rt->u.dst; + if (rtm->rtm_flags & RTM_F_NOTIFY) + rt->rt_flags |= RTCF_NOTIFY; + + NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; + + err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + RTM_NEWROUTE, 0); + if (!err) + goto out_free; + if (err < 0) { + err = -EMSGSIZE; + goto out_free; + } + + err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + if (err > 0) + err = 0; +out: return err; + +out_free: + kfree_skb(skb); + goto out; +} + +int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rtable *rt; + int h, s_h; + int idx, s_idx; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + for (h = 0; h <= rt_hash_mask; h++) { + if (h < s_h) continue; + if (h > s_h) + s_idx = 0; + rcu_read_lock_bh(); + for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; + rt = rcu_dereference(rt->u.rt_next), idx++) { + if (idx < s_idx) + continue; + skb->dst = dst_clone(&rt->u.dst); + if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, 1) <= 0) { + dst_release(xchg(&skb->dst, NULL)); + rcu_read_unlock_bh(); + goto done; + } + dst_release(xchg(&skb->dst, NULL)); + } + rcu_read_unlock_bh(); + } + +done: + cb->args[0] = h; + cb->args[1] = idx; + return skb->len; +} + +void ip_rt_multicast_event(struct in_device *in_dev) +{ + rt_cache_flush(0); +} + +#ifdef CONFIG_SYSCTL +static int flush_delay; + +static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + if (write) { + proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + rt_cache_flush(flush_delay); + return 0; + } + + return -EINVAL; +} + +static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, + int __user *name, + int nlen, + void __user *oldval, + size_t __user *oldlenp, + void __user *newval, + size_t newlen, + void **context) +{ + int delay; + if (newlen != sizeof(int)) + return -EINVAL; + if (get_user(delay, (int __user *)newval)) + return -EFAULT; + rt_cache_flush(delay); + return 0; +} + +ctl_table ipv4_route_table[] = { + { + .ctl_name = NET_IPV4_ROUTE_FLUSH, + .procname = "flush", + .data = &flush_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_sysctl_rtcache_flush, + .strategy = &ipv4_sysctl_rtcache_flush_strategy, + }, + { + .ctl_name = NET_IPV4_ROUTE_MIN_DELAY, + .procname = "min_delay", + .data = &ip_rt_min_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_MAX_DELAY, + .procname = "max_delay", + .data = &ip_rt_max_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_GC_THRESH, + .procname = "gc_thresh", + .data = &ipv4_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_MAX_SIZE, + .procname = "max_size", + .data = &ip_rt_max_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + /* Deprecated. Use gc_min_interval_ms */ + + .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL, + .procname = "gc_min_interval", + .data = &ip_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, + .procname = "gc_min_interval_ms", + .data = &ip_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT, + .procname = "gc_timeout", + .data = &ip_rt_gc_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL, + .procname = "gc_interval", + .data = &ip_rt_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD, + .procname = "redirect_load", + .data = &ip_rt_redirect_load, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER, + .procname = "redirect_number", + .data = &ip_rt_redirect_number, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE, + .procname = "redirect_silence", + .data = &ip_rt_redirect_silence, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_ERROR_COST, + .procname = "error_cost", + .data = &ip_rt_error_cost, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_ERROR_BURST, + .procname = "error_burst", + .data = &ip_rt_error_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY, + .procname = "gc_elasticity", + .data = &ip_rt_gc_elasticity, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES, + .procname = "mtu_expires", + .data = &ip_rt_mtu_expires, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_MIN_PMTU, + .procname = "min_pmtu", + .data = &ip_rt_min_pmtu, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS, + .procname = "min_adv_mss", + .data = &ip_rt_min_advmss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL, + .procname = "secret_interval", + .data = &ip_rt_secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { .ctl_name = 0 } +}; +#endif + +#ifdef CONFIG_NET_CLS_ROUTE +struct ip_rt_acct *ip_rt_acct; + +/* This code sucks. But you should have seen it before! --RR */ + +/* IP route accounting ptr for this logical cpu number. */ +#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256) + +#ifdef CONFIG_PROC_FS +static int ip_rt_acct_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + unsigned int i; + + if ((offset & 3) || (length & 3)) + return -EIO; + + if (offset >= sizeof(struct ip_rt_acct) * 256) { + *eof = 1; + return 0; + } + + if (offset + length >= sizeof(struct ip_rt_acct) * 256) { + length = sizeof(struct ip_rt_acct) * 256 - offset; + *eof = 1; + } + + offset /= sizeof(u32); + + if (length > 0) { + u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset; + u32 *dst = (u32 *) buffer; + + /* Copy first cpu. */ + *start = buffer; + memcpy(dst, src, length); + + /* Add the other cpus in, one int at a time */ + for_each_cpu(i) { + unsigned int j; + + src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset; + + for (j = 0; j < length/4; j++) + dst[j] += src[j]; + } + } + return length; +} +#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_NET_CLS_ROUTE */ + +static __initdata unsigned long rhash_entries; +static int __init set_rhash_entries(char *str) +{ + if (!str) + return 0; + rhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("rhash_entries=", set_rhash_entries); + +int __init ip_rt_init(void) +{ + int i, order, goal, rc = 0; + + rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ + (jiffies ^ (jiffies >> 7))); + +#ifdef CONFIG_NET_CLS_ROUTE + for (order = 0; + (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) + /* NOTHING */; + ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order); + if (!ip_rt_acct) + panic("IP: failed to allocate ip_rt_acct\n"); + memset(ip_rt_acct, 0, PAGE_SIZE << order); +#endif + + ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", + sizeof(struct rtable), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (!ipv4_dst_ops.kmem_cachep) + panic("IP: failed to allocate ip_dst_cache\n"); + + goal = num_physpages >> (26 - PAGE_SHIFT); + if (rhash_entries) + goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; + for (order = 0; (1UL << order) < goal; order++) + /* NOTHING */; + + do { + rt_hash_mask = (1UL << order) * PAGE_SIZE / + sizeof(struct rt_hash_bucket); + while (rt_hash_mask & (rt_hash_mask - 1)) + rt_hash_mask--; + rt_hash_table = (struct rt_hash_bucket *) + __get_free_pages(GFP_ATOMIC, order); + } while (rt_hash_table == NULL && --order > 0); + + if (!rt_hash_table) + panic("Failed to allocate IP route cache hash table\n"); + + printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n", + rt_hash_mask, + (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024); + + for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++) + /* NOTHING */; + + rt_hash_mask--; + for (i = 0; i <= rt_hash_mask; i++) { + spin_lock_init(&rt_hash_table[i].lock); + rt_hash_table[i].chain = NULL; + } + + ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); + ip_rt_max_size = (rt_hash_mask + 1) * 16; + + rt_cache_stat = alloc_percpu(struct rt_cache_stat); + if (!rt_cache_stat) + return -ENOMEM; + + devinet_init(); + ip_fib_init(); + + init_timer(&rt_flush_timer); + rt_flush_timer.function = rt_run_flush; + init_timer(&rt_periodic_timer); + rt_periodic_timer.function = rt_check_expire; + init_timer(&rt_secret_timer); + rt_secret_timer.function = rt_secret_rebuild; + + /* All the timers, started at system startup tend + to synchronize. Perturb it a bit. + */ + rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval + + ip_rt_gc_interval; + add_timer(&rt_periodic_timer); + + rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + + ip_rt_secret_interval; + add_timer(&rt_secret_timer); + +#ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ + if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) || + !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, + proc_net_stat))) { + free_percpu(rt_cache_stat); + return -ENOMEM; + } + rtstat_pde->proc_fops = &rt_cpu_seq_fops; + } +#ifdef CONFIG_NET_CLS_ROUTE + create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL); +#endif +#endif +#ifdef CONFIG_XFRM + xfrm_init(); + xfrm4_init(); +#endif + return rc; +} + +EXPORT_SYMBOL(__ip_select_ident); +EXPORT_SYMBOL(ip_route_input); +EXPORT_SYMBOL(ip_route_output_key); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c new file mode 100644 index 000000000000..e923d2f021aa --- /dev/null +++ b/net/ipv4/syncookies.c @@ -0,0 +1,279 @@ +/* + * Syncookies implementation for the Linux kernel + * + * Copyright (C) 1997 Andi Kleen + * Based on ideas by D.J.Bernstein and Eric Schenk. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * $Id: syncookies.c,v 1.18 2002/02/01 22:01:04 davem Exp $ + * + * Missing: IPv6 support. + */ + +#include <linux/tcp.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/cryptohash.h> +#include <linux/kernel.h> +#include <net/tcp.h> + +extern int sysctl_tcp_syncookies; + +static __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS]; + +static __init int init_syncookies(void) +{ + get_random_bytes(syncookie_secret, sizeof(syncookie_secret)); + return 0; +} +module_init(init_syncookies); + +#define COOKIEBITS 24 /* Upper bits store count */ +#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) + +static u32 cookie_hash(u32 saddr, u32 daddr, u32 sport, u32 dport, + u32 count, int c) +{ + __u32 tmp[16 + 5 + SHA_WORKSPACE_WORDS]; + + memcpy(tmp + 3, syncookie_secret[c], sizeof(syncookie_secret[c])); + tmp[0] = saddr; + tmp[1] = daddr; + tmp[2] = (sport << 16) + dport; + tmp[3] = count; + sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5); + + return tmp[17]; +} + +static __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr, __u16 sport, + __u16 dport, __u32 sseq, __u32 count, + __u32 data) +{ + /* + * Compute the secure sequence number. + * The output should be: + * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24) + * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24). + * Where sseq is their sequence number and count increases every + * minute by 1. + * As an extra hack, we add a small "data" value that encodes the + * MSS into the second hash value. + */ + + return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + + sseq + (count << COOKIEBITS) + + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) + & COOKIEMASK)); +} + +/* + * This retrieves the small "data" value from the syncookie. + * If the syncookie is bad, the data returned will be out of + * range. This must be checked by the caller. + * + * The count value used to generate the cookie must be within + * "maxdiff" if the current (passed-in) "count". The return value + * is (__u32)-1 if this test fails. + */ +static __u32 check_tcp_syn_cookie(__u32 cookie, __u32 saddr, __u32 daddr, + __u16 sport, __u16 dport, __u32 sseq, + __u32 count, __u32 maxdiff) +{ + __u32 diff; + + /* Strip away the layers from the cookie */ + cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; + + /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ + diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); + if (diff >= maxdiff) + return (__u32)-1; + + return (cookie - + cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) + & COOKIEMASK; /* Leaving the data behind */ +} + +/* + * This table has to be sorted and terminated with (__u16)-1. + * XXX generate a better table. + * Unresolved Issues: HIPPI with a 64k MSS is not well supported. + */ +static __u16 const msstab[] = { + 64 - 1, + 256 - 1, + 512 - 1, + 536 - 1, + 1024 - 1, + 1440 - 1, + 1460 - 1, + 4312 - 1, + (__u16)-1 +}; +/* The number doesn't include the -1 terminator */ +#define NUM_MSS (ARRAY_SIZE(msstab) - 1) + +/* + * Generate a syncookie. mssp points to the mss, which is returned + * rounded down to the value encoded in the cookie. + */ +__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +{ + struct tcp_sock *tp = tcp_sk(sk); + int mssind; + const __u16 mss = *mssp; + + + tp->last_synq_overflow = jiffies; + + /* XXX sort msstab[] by probability? Binary search? */ + for (mssind = 0; mss > msstab[mssind + 1]; mssind++) + ; + *mssp = msstab[mssind] + 1; + + NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT); + + return secure_tcp_syn_cookie(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->h.th->source, skb->h.th->dest, + ntohl(skb->h.th->seq), + jiffies / (HZ * 60), mssind); +} + +/* + * This (misnamed) value is the age of syncookie which is permitted. + * Its ideal value should be dependent on TCP_TIMEOUT_INIT and + * sysctl_tcp_retries1. It's a rather complicated formula (exponential + * backoff) to compute at runtime so it's currently hardcoded here. + */ +#define COUNTER_TRIES 4 +/* + * Check if a ack sequence number is a valid syncookie. + * Return the decoded mss if it is, or 0 if not. + */ +static inline int cookie_check(struct sk_buff *skb, __u32 cookie) +{ + __u32 seq; + __u32 mssind; + + seq = ntohl(skb->h.th->seq)-1; + mssind = check_tcp_syn_cookie(cookie, + skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->h.th->source, skb->h.th->dest, + seq, jiffies / (HZ * 60), COUNTER_TRIES); + + return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; +} + +extern struct or_calltable or_ipv4; + +static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct open_request *req, + struct dst_entry *dst) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sock *child; + + child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); + if (child) + tcp_acceptq_queue(sk, req, child); + else + tcp_openreq_free(req); + + return child; +} + +struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, + struct ip_options *opt) +{ + struct tcp_sock *tp = tcp_sk(sk); + __u32 cookie = ntohl(skb->h.th->ack_seq) - 1; + struct sock *ret = sk; + struct open_request *req; + int mss; + struct rtable *rt; + __u8 rcv_wscale; + + if (!sysctl_tcp_syncookies || !skb->h.th->ack) + goto out; + + if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) || + (mss = cookie_check(skb, cookie)) == 0) { + NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESFAILED); + goto out; + } + + NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV); + + req = tcp_openreq_alloc(); + ret = NULL; + if (!req) + goto out; + + req->rcv_isn = htonl(skb->h.th->seq) - 1; + req->snt_isn = cookie; + req->mss = mss; + req->rmt_port = skb->h.th->source; + req->af.v4_req.loc_addr = skb->nh.iph->daddr; + req->af.v4_req.rmt_addr = skb->nh.iph->saddr; + req->class = &or_ipv4; /* for savety */ + req->af.v4_req.opt = NULL; + + /* We throwed the options of the initial SYN away, so we hope + * the ACK carries the same options again (see RFC1122 4.2.3.8) + */ + if (opt && opt->optlen) { + int opt_size = sizeof(struct ip_options) + opt->optlen; + + req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC); + if (req->af.v4_req.opt) { + if (ip_options_echo(req->af.v4_req.opt, skb)) { + kfree(req->af.v4_req.opt); + req->af.v4_req.opt = NULL; + } + } + } + + req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0; + req->wscale_ok = req->sack_ok = 0; + req->expires = 0UL; + req->retrans = 0; + + /* + * We need to lookup the route here to get at the correct + * window size. We should better make sure that the window size + * hasn't changed since we received the original syn, but I see + * no easy way to do this. + */ + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = ((opt && opt->srr) ? + opt->faddr : + req->af.v4_req.rmt_addr), + .saddr = req->af.v4_req.loc_addr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = IPPROTO_TCP, + .uli_u = { .ports = + { .sport = skb->h.th->dest, + .dport = skb->h.th->source } } }; + if (ip_route_output_key(&rt, &fl)) { + tcp_openreq_free(req); + goto out; + } + } + + /* Try to redo what tcp_v4_send_synack did. */ + req->window_clamp = dst_metric(&rt->u.dst, RTAX_WINDOW); + tcp_select_initial_window(tcp_full_space(sk), req->mss, + &req->rcv_wnd, &req->window_clamp, + 0, &rcv_wscale); + /* BTW win scale with syncookies is 0 by definition */ + req->rcv_wscale = rcv_wscale; + + ret = get_cookie_sock(sk, skb, req, &rt->u.dst); +out: return ret; +} diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c new file mode 100644 index 000000000000..3aafb298c1c1 --- /dev/null +++ b/net/ipv4/sysctl_net_ipv4.c @@ -0,0 +1,698 @@ +/* + * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. + * + * $Id: sysctl_net_ipv4.c,v 1.50 2001/10/20 00:00:11 davem Exp $ + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sysctl.h> +#include <linux/config.h> +#include <net/snmp.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/tcp.h> + +/* From af_inet.c */ +extern int sysctl_ip_nonlocal_bind; + +/* From icmp.c */ +extern int sysctl_icmp_echo_ignore_all; +extern int sysctl_icmp_echo_ignore_broadcasts; +extern int sysctl_icmp_ignore_bogus_error_responses; + +/* From ip_fragment.c */ +extern int sysctl_ipfrag_low_thresh; +extern int sysctl_ipfrag_high_thresh; +extern int sysctl_ipfrag_time; +extern int sysctl_ipfrag_secret_interval; + +/* From ip_output.c */ +extern int sysctl_ip_dynaddr; + +/* From icmp.c */ +extern int sysctl_icmp_ratelimit; +extern int sysctl_icmp_ratemask; + +/* From igmp.c */ +extern int sysctl_igmp_max_memberships; +extern int sysctl_igmp_max_msf; + +/* From inetpeer.c */ +extern int inet_peer_threshold; +extern int inet_peer_minttl; +extern int inet_peer_maxttl; +extern int inet_peer_gc_mintime; +extern int inet_peer_gc_maxtime; + +#ifdef CONFIG_SYSCTL +static int tcp_retr1_max = 255; +static int ip_local_port_range_min[] = { 1, 1 }; +static int ip_local_port_range_max[] = { 65535, 65535 }; +#endif + +struct ipv4_config ipv4_config; + +extern ctl_table ipv4_route_table[]; + +#ifdef CONFIG_SYSCTL + +static +int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int val = ipv4_devconf.forwarding; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write && ipv4_devconf.forwarding != val) + inet_forward_change(); + + return ret; +} + +static int ipv4_sysctl_forward_strategy(ctl_table *table, + int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + int *valp = table->data; + int new; + + if (!newval || !newlen) + return 0; + + if (newlen != sizeof(int)) + return -EINVAL; + + if (get_user(new, (int __user *)newval)) + return -EFAULT; + + if (new == *valp) + return 0; + + if (oldval && oldlenp) { + size_t len; + + if (get_user(len, oldlenp)) + return -EFAULT; + + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, valp, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + *valp = new; + inet_forward_change(); + return 1; +} + +ctl_table ipv4_table[] = { + { + .ctl_name = NET_IPV4_TCP_TIMESTAMPS, + .procname = "tcp_timestamps", + .data = &sysctl_tcp_timestamps, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_WINDOW_SCALING, + .procname = "tcp_window_scaling", + .data = &sysctl_tcp_window_scaling, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_SACK, + .procname = "tcp_sack", + .data = &sysctl_tcp_sack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE, + .procname = "tcp_retrans_collapse", + .data = &sysctl_tcp_retrans_collapse, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_FORWARD, + .procname = "ip_forward", + .data = &ipv4_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_sysctl_forward, + .strategy = &ipv4_sysctl_forward_strategy + }, + { + .ctl_name = NET_IPV4_DEFAULT_TTL, + .procname = "ip_default_ttl", + .data = &sysctl_ip_default_ttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_doint_and_flush, + .strategy = &ipv4_doint_and_flush_strategy, + }, + { + .ctl_name = NET_IPV4_AUTOCONFIG, + .procname = "ip_autoconfig", + .data = &ipv4_config.autoconfig, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_NO_PMTU_DISC, + .procname = "ip_no_pmtu_disc", + .data = &ipv4_config.no_pmtu_disc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_NONLOCAL_BIND, + .procname = "ip_nonlocal_bind", + .data = &sysctl_ip_nonlocal_bind, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_SYN_RETRIES, + .procname = "tcp_syn_retries", + .data = &sysctl_tcp_syn_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_SYNACK_RETRIES, + .procname = "tcp_synack_retries", + .data = &sysctl_tcp_synack_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_MAX_ORPHANS, + .procname = "tcp_max_orphans", + .data = &sysctl_tcp_max_orphans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_MAX_TW_BUCKETS, + .procname = "tcp_max_tw_buckets", + .data = &sysctl_tcp_max_tw_buckets, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH, + .procname = "ipfrag_high_thresh", + .data = &sysctl_ipfrag_high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH, + .procname = "ipfrag_low_thresh", + .data = &sysctl_ipfrag_low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_DYNADDR, + .procname = "ip_dynaddr", + .data = &sysctl_ip_dynaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_TIME, + .procname = "ipfrag_time", + .data = &sysctl_ipfrag_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME, + .procname = "tcp_keepalive_time", + .data = &sysctl_tcp_keepalive_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES, + .procname = "tcp_keepalive_probes", + .data = &sysctl_tcp_keepalive_probes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL, + .procname = "tcp_keepalive_intvl", + .data = &sysctl_tcp_keepalive_intvl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_TCP_RETRIES1, + .procname = "tcp_retries1", + .data = &sysctl_tcp_retries1, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra2 = &tcp_retr1_max + }, + { + .ctl_name = NET_IPV4_TCP_RETRIES2, + .procname = "tcp_retries2", + .data = &sysctl_tcp_retries2, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT, + .procname = "tcp_fin_timeout", + .data = &sysctl_tcp_fin_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, +#ifdef CONFIG_SYN_COOKIES + { + .ctl_name = NET_TCP_SYNCOOKIES, + .procname = "tcp_syncookies", + .data = &sysctl_tcp_syncookies, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif + { + .ctl_name = NET_TCP_TW_RECYCLE, + .procname = "tcp_tw_recycle", + .data = &sysctl_tcp_tw_recycle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_ABORT_ON_OVERFLOW, + .procname = "tcp_abort_on_overflow", + .data = &sysctl_tcp_abort_on_overflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_STDURG, + .procname = "tcp_stdurg", + .data = &sysctl_tcp_stdurg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_RFC1337, + .procname = "tcp_rfc1337", + .data = &sysctl_tcp_rfc1337, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_MAX_SYN_BACKLOG, + .procname = "tcp_max_syn_backlog", + .data = &sysctl_max_syn_backlog, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_LOCAL_PORT_RANGE, + .procname = "ip_local_port_range", + .data = &sysctl_local_port_range, + .maxlen = sizeof(sysctl_local_port_range), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = ip_local_port_range_min, + .extra2 = ip_local_port_range_max + }, + { + .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL, + .procname = "icmp_echo_ignore_all", + .data = &sysctl_icmp_echo_ignore_all, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, + .procname = "icmp_echo_ignore_broadcasts", + .data = &sysctl_icmp_echo_ignore_broadcasts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, + .procname = "icmp_ignore_bogus_error_responses", + .data = &sysctl_icmp_ignore_bogus_error_responses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ROUTE, + .procname = "route", + .maxlen = 0, + .mode = 0555, + .child = ipv4_route_table + }, +#ifdef CONFIG_IP_MULTICAST + { + .ctl_name = NET_IPV4_IGMP_MAX_MEMBERSHIPS, + .procname = "igmp_max_memberships", + .data = &sysctl_igmp_max_memberships, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + +#endif + { + .ctl_name = NET_IPV4_IGMP_MAX_MSF, + .procname = "igmp_max_msf", + .data = &sysctl_igmp_max_msf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_INET_PEER_THRESHOLD, + .procname = "inet_peer_threshold", + .data = &inet_peer_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_INET_PEER_MINTTL, + .procname = "inet_peer_minttl", + .data = &inet_peer_minttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_INET_PEER_MAXTTL, + .procname = "inet_peer_maxttl", + .data = &inet_peer_maxttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME, + .procname = "inet_peer_gc_mintime", + .data = &inet_peer_gc_mintime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME, + .procname = "inet_peer_gc_maxtime", + .data = &inet_peer_gc_maxtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_TCP_ORPHAN_RETRIES, + .procname = "tcp_orphan_retries", + .data = &sysctl_tcp_orphan_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_FACK, + .procname = "tcp_fack", + .data = &sysctl_tcp_fack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_REORDERING, + .procname = "tcp_reordering", + .data = &sysctl_tcp_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_ECN, + .procname = "tcp_ecn", + .data = &sysctl_tcp_ecn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_DSACK, + .procname = "tcp_dsack", + .data = &sysctl_tcp_dsack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_MEM, + .procname = "tcp_mem", + .data = &sysctl_tcp_mem, + .maxlen = sizeof(sysctl_tcp_mem), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_WMEM, + .procname = "tcp_wmem", + .data = &sysctl_tcp_wmem, + .maxlen = sizeof(sysctl_tcp_wmem), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_RMEM, + .procname = "tcp_rmem", + .data = &sysctl_tcp_rmem, + .maxlen = sizeof(sysctl_tcp_rmem), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_APP_WIN, + .procname = "tcp_app_win", + .data = &sysctl_tcp_app_win, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_ADV_WIN_SCALE, + .procname = "tcp_adv_win_scale", + .data = &sysctl_tcp_adv_win_scale, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_RATELIMIT, + .procname = "icmp_ratelimit", + .data = &sysctl_icmp_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_RATEMASK, + .procname = "icmp_ratemask", + .data = &sysctl_icmp_ratemask, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_TW_REUSE, + .procname = "tcp_tw_reuse", + .data = &sysctl_tcp_tw_reuse, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_FRTO, + .procname = "tcp_frto", + .data = &sysctl_tcp_frto, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_LOW_LATENCY, + .procname = "tcp_low_latency", + .data = &sysctl_tcp_low_latency, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL, + .procname = "ipfrag_secret_interval", + .data = &sysctl_ipfrag_secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_TCP_NO_METRICS_SAVE, + .procname = "tcp_no_metrics_save", + .data = &sysctl_tcp_nometrics_save, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_WESTWOOD, + .procname = "tcp_westwood", + .data = &sysctl_tcp_westwood, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_VEGAS, + .procname = "tcp_vegas_cong_avoid", + .data = &sysctl_tcp_vegas_cong_avoid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_VEGAS_ALPHA, + .procname = "tcp_vegas_alpha", + .data = &sysctl_tcp_vegas_alpha, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_VEGAS_BETA, + .procname = "tcp_vegas_beta", + .data = &sysctl_tcp_vegas_beta, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_VEGAS_GAMMA, + .procname = "tcp_vegas_gamma", + .data = &sysctl_tcp_vegas_gamma, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BIC, + .procname = "tcp_bic", + .data = &sysctl_tcp_bic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE, + .procname = "tcp_bic_fast_convergence", + .data = &sysctl_tcp_bic_fast_convergence, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BIC_LOW_WINDOW, + .procname = "tcp_bic_low_window", + .data = &sysctl_tcp_bic_low_window, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_MODERATE_RCVBUF, + .procname = "tcp_moderate_rcvbuf", + .data = &sysctl_tcp_moderate_rcvbuf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_TSO_WIN_DIVISOR, + .procname = "tcp_tso_win_divisor", + .data = &sysctl_tcp_tso_win_divisor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BIC_BETA, + .procname = "tcp_bic_beta", + .data = &sysctl_tcp_bic_beta, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +#endif /* CONFIG_SYSCTL */ + +EXPORT_SYMBOL(ipv4_config); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c new file mode 100644 index 000000000000..5cff56af7855 --- /dev/null +++ b/net/ipv4/tcp.c @@ -0,0 +1,2386 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + * + * Fixes: + * Alan Cox : Numerous verify_area() calls + * Alan Cox : Set the ACK bit on a reset + * Alan Cox : Stopped it crashing if it closed while + * sk->inuse=1 and was trying to connect + * (tcp_err()). + * Alan Cox : All icmp error handling was broken + * pointers passed where wrong and the + * socket was looked up backwards. Nobody + * tested any icmp error code obviously. + * Alan Cox : tcp_err() now handled properly. It + * wakes people on errors. poll + * behaves and the icmp error race + * has gone by moving it into sock.c + * Alan Cox : tcp_send_reset() fixed to work for + * everything not just packets for + * unknown sockets. + * Alan Cox : tcp option processing. + * Alan Cox : Reset tweaked (still not 100%) [Had + * syn rule wrong] + * Herp Rosmanith : More reset fixes + * Alan Cox : No longer acks invalid rst frames. + * Acking any kind of RST is right out. + * Alan Cox : Sets an ignore me flag on an rst + * receive otherwise odd bits of prattle + * escape still + * Alan Cox : Fixed another acking RST frame bug. + * Should stop LAN workplace lockups. + * Alan Cox : Some tidyups using the new skb list + * facilities + * Alan Cox : sk->keepopen now seems to work + * Alan Cox : Pulls options out correctly on accepts + * Alan Cox : Fixed assorted sk->rqueue->next errors + * Alan Cox : PSH doesn't end a TCP read. Switched a + * bit to skb ops. + * Alan Cox : Tidied tcp_data to avoid a potential + * nasty. + * Alan Cox : Added some better commenting, as the + * tcp is hard to follow + * Alan Cox : Removed incorrect check for 20 * psh + * Michael O'Reilly : ack < copied bug fix. + * Johannes Stille : Misc tcp fixes (not all in yet). + * Alan Cox : FIN with no memory -> CRASH + * Alan Cox : Added socket option proto entries. + * Also added awareness of them to accept. + * Alan Cox : Added TCP options (SOL_TCP) + * Alan Cox : Switched wakeup calls to callbacks, + * so the kernel can layer network + * sockets. + * Alan Cox : Use ip_tos/ip_ttl settings. + * Alan Cox : Handle FIN (more) properly (we hope). + * Alan Cox : RST frames sent on unsynchronised + * state ack error. + * Alan Cox : Put in missing check for SYN bit. + * Alan Cox : Added tcp_select_window() aka NET2E + * window non shrink trick. + * Alan Cox : Added a couple of small NET2E timer + * fixes + * Charles Hedrick : TCP fixes + * Toomas Tamm : TCP window fixes + * Alan Cox : Small URG fix to rlogin ^C ack fight + * Charles Hedrick : Rewrote most of it to actually work + * Linus : Rewrote tcp_read() and URG handling + * completely + * Gerhard Koerting: Fixed some missing timer handling + * Matthew Dillon : Reworked TCP machine states as per RFC + * Gerhard Koerting: PC/TCP workarounds + * Adam Caldwell : Assorted timer/timing errors + * Matthew Dillon : Fixed another RST bug + * Alan Cox : Move to kernel side addressing changes. + * Alan Cox : Beginning work on TCP fastpathing + * (not yet usable) + * Arnt Gulbrandsen: Turbocharged tcp_check() routine. + * Alan Cox : TCP fast path debugging + * Alan Cox : Window clamping + * Michael Riepe : Bug in tcp_check() + * Matt Dillon : More TCP improvements and RST bug fixes + * Matt Dillon : Yet more small nasties remove from the + * TCP code (Be very nice to this man if + * tcp finally works 100%) 8) + * Alan Cox : BSD accept semantics. + * Alan Cox : Reset on closedown bug. + * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). + * Michael Pall : Handle poll() after URG properly in + * all cases. + * Michael Pall : Undo the last fix in tcp_read_urg() + * (multi URG PUSH broke rlogin). + * Michael Pall : Fix the multi URG PUSH problem in + * tcp_readable(), poll() after URG + * works now. + * Michael Pall : recv(...,MSG_OOB) never blocks in the + * BSD api. + * Alan Cox : Changed the semantics of sk->socket to + * fix a race and a signal problem with + * accept() and async I/O. + * Alan Cox : Relaxed the rules on tcp_sendto(). + * Yury Shevchuk : Really fixed accept() blocking problem. + * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for + * clients/servers which listen in on + * fixed ports. + * Alan Cox : Cleaned the above up and shrank it to + * a sensible code size. + * Alan Cox : Self connect lockup fix. + * Alan Cox : No connect to multicast. + * Ross Biro : Close unaccepted children on master + * socket close. + * Alan Cox : Reset tracing code. + * Alan Cox : Spurious resets on shutdown. + * Alan Cox : Giant 15 minute/60 second timer error + * Alan Cox : Small whoops in polling before an + * accept. + * Alan Cox : Kept the state trace facility since + * it's handy for debugging. + * Alan Cox : More reset handler fixes. + * Alan Cox : Started rewriting the code based on + * the RFC's for other useful protocol + * references see: Comer, KA9Q NOS, and + * for a reference on the difference + * between specifications and how BSD + * works see the 4.4lite source. + * A.N.Kuznetsov : Don't time wait on completion of tidy + * close. + * Linus Torvalds : Fin/Shutdown & copied_seq changes. + * Linus Torvalds : Fixed BSD port reuse to work first syn + * Alan Cox : Reimplemented timers as per the RFC + * and using multiple timers for sanity. + * Alan Cox : Small bug fixes, and a lot of new + * comments. + * Alan Cox : Fixed dual reader crash by locking + * the buffers (much like datagram.c) + * Alan Cox : Fixed stuck sockets in probe. A probe + * now gets fed up of retrying without + * (even a no space) answer. + * Alan Cox : Extracted closing code better + * Alan Cox : Fixed the closing state machine to + * resemble the RFC. + * Alan Cox : More 'per spec' fixes. + * Jorge Cwik : Even faster checksumming. + * Alan Cox : tcp_data() doesn't ack illegal PSH + * only frames. At least one pc tcp stack + * generates them. + * Alan Cox : Cache last socket. + * Alan Cox : Per route irtt. + * Matt Day : poll()->select() match BSD precisely on error + * Alan Cox : New buffers + * Marc Tamsky : Various sk->prot->retransmits and + * sk->retransmits misupdating fixed. + * Fixed tcp_write_timeout: stuck close, + * and TCP syn retries gets used now. + * Mark Yarvis : In tcp_read_wakeup(), don't send an + * ack if state is TCP_CLOSED. + * Alan Cox : Look up device on a retransmit - routes may + * change. Doesn't yet cope with MSS shrink right + * but it's a start! + * Marc Tamsky : Closing in closing fixes. + * Mike Shaver : RFC1122 verifications. + * Alan Cox : rcv_saddr errors. + * Alan Cox : Block double connect(). + * Alan Cox : Small hooks for enSKIP. + * Alexey Kuznetsov: Path MTU discovery. + * Alan Cox : Support soft errors. + * Alan Cox : Fix MTU discovery pathological case + * when the remote claims no mtu! + * Marc Tamsky : TCP_CLOSE fix. + * Colin (G3TNE) : Send a reset on syn ack replies in + * window but wrong (fixes NT lpd problems) + * Pedro Roque : Better TCP window handling, delayed ack. + * Joerg Reuter : No modification of locked buffers in + * tcp_do_retransmit() + * Eric Schenk : Changed receiver side silly window + * avoidance algorithm to BSD style + * algorithm. This doubles throughput + * against machines running Solaris, + * and seems to result in general + * improvement. + * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD + * Willy Konynenberg : Transparent proxying support. + * Mike McLagan : Routing by source + * Keith Owens : Do proper merging with partial SKB's in + * tcp_do_sendmsg to avoid burstiness. + * Eric Schenk : Fix fast close down bug with + * shutdown() followed by close(). + * Andi Kleen : Make poll agree with SIGIO + * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and + * lingertime == 0 (RFC 793 ABORT Call) + * Hirokazu Takahashi : Use copy_from_user() instead of + * csum_and_copy_from_user() if possible. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + * + * Description of States: + * + * TCP_SYN_SENT sent a connection request, waiting for ack + * + * TCP_SYN_RECV received a connection request, sent ack, + * waiting for final ack in three-way handshake. + * + * TCP_ESTABLISHED connection established + * + * TCP_FIN_WAIT1 our side has shutdown, waiting to complete + * transmission of remaining buffered data + * + * TCP_FIN_WAIT2 all buffered data sent, waiting for remote + * to shutdown + * + * TCP_CLOSING both sides have shutdown but we still have + * data we have to finish sending + * + * TCP_TIME_WAIT timeout to catch resent junk before entering + * closed, can only be entered from FIN_WAIT2 + * or CLOSING. Required because the other end + * may not have gotten our last ACK causing it + * to retransmit the data packet (which we ignore) + * + * TCP_CLOSE_WAIT remote side has shutdown and is waiting for + * us to finish writing our data and to shutdown + * (we have to close() to move on to LAST_ACK) + * + * TCP_LAST_ACK out side has shutdown after remote has + * shutdown. There may still be data in our + * buffer that we have to finish sending + * + * TCP_CLOSE socket is finished + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/poll.h> +#include <linux/init.h> +#include <linux/smp_lock.h> +#include <linux/fs.h> +#include <linux/random.h> +#include <linux/bootmem.h> + +#include <net/icmp.h> +#include <net/tcp.h> +#include <net/xfrm.h> +#include <net/ip.h> + + +#include <asm/uaccess.h> +#include <asm/ioctls.h> + +int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; + +DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); + +kmem_cache_t *tcp_openreq_cachep; +kmem_cache_t *tcp_bucket_cachep; +kmem_cache_t *tcp_timewait_cachep; + +atomic_t tcp_orphan_count = ATOMIC_INIT(0); + +int sysctl_tcp_mem[3]; +int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; +int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; + +EXPORT_SYMBOL(sysctl_tcp_mem); +EXPORT_SYMBOL(sysctl_tcp_rmem); +EXPORT_SYMBOL(sysctl_tcp_wmem); + +atomic_t tcp_memory_allocated; /* Current allocated memory. */ +atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */ + +EXPORT_SYMBOL(tcp_memory_allocated); +EXPORT_SYMBOL(tcp_sockets_allocated); + +/* + * Pressure flag: try to collapse. + * Technical note: it is used by multiple contexts non atomically. + * All the sk_stream_mem_schedule() is of this nature: accounting + * is strict, actions are advisory and have some latency. + */ +int tcp_memory_pressure; + +EXPORT_SYMBOL(tcp_memory_pressure); + +void tcp_enter_memory_pressure(void) +{ + if (!tcp_memory_pressure) { + NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES); + tcp_memory_pressure = 1; + } +} + +EXPORT_SYMBOL(tcp_enter_memory_pressure); + +/* + * LISTEN is a special case for poll.. + */ +static __inline__ unsigned int tcp_listen_poll(struct sock *sk, + poll_table *wait) +{ + return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0; +} + +/* + * Wait for a TCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. + */ +unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + unsigned int mask; + struct sock *sk = sock->sk; + struct tcp_sock *tp = tcp_sk(sk); + + poll_wait(file, sk->sk_sleep, wait); + if (sk->sk_state == TCP_LISTEN) + return tcp_listen_poll(sk, wait); + + /* Socket is not locked. We are protected from async events + by poll logic and correct handling of state changes + made by another threads is impossible in any case. + */ + + mask = 0; + if (sk->sk_err) + mask = POLLERR; + + /* + * POLLHUP is certainly not done right. But poll() doesn't + * have a notion of HUP in just one direction, and for a + * socket the read side is more interesting. + * + * Some poll() documentation says that POLLHUP is incompatible + * with the POLLOUT/POLLWR flags, so somebody should check this + * all. But careful, it tends to be safer to return too many + * bits than too few, and you can easily break real applications + * if you don't tell them that something has hung up! + * + * Check-me. + * + * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and + * our fs/select.c). It means that after we received EOF, + * poll always returns immediately, making impossible poll() on write() + * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP + * if and only if shutdown has been made in both directions. + * Actually, it is interesting to look how Solaris and DUX + * solve this dilemma. I would prefer, if PULLHUP were maskable, + * then we could set it on SND_SHUTDOWN. BTW examples given + * in Stevens' books assume exactly this behaviour, it explains + * why PULLHUP is incompatible with POLLOUT. --ANK + * + * NOTE. Check for TCP_CLOSE is added. The goal is to prevent + * blocking on fresh not-connected or disconnected socket. --ANK + */ + if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) + mask |= POLLHUP; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM; + + /* Connected? */ + if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { + /* Potential race condition. If read of tp below will + * escape above sk->sk_state, we can be illegally awaken + * in SYN_* states. */ + if ((tp->rcv_nxt != tp->copied_seq) && + (tp->urg_seq != tp->copied_seq || + tp->rcv_nxt != tp->copied_seq + 1 || + sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) + mask |= POLLIN | POLLRDNORM; + + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ + set_bit(SOCK_ASYNC_NOSPACE, + &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + /* Race breaker. If space is freed after + * wspace test but before the flags are set, + * IO signal will be lost. + */ + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + mask |= POLLOUT | POLLWRNORM; + } + } + + if (tp->urg_data & TCP_URG_VALID) + mask |= POLLPRI; + } + return mask; +} + +int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + struct tcp_sock *tp = tcp_sk(sk); + int answ; + + switch (cmd) { + case SIOCINQ: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + lock_sock(sk); + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + answ = 0; + else if (sock_flag(sk, SOCK_URGINLINE) || + !tp->urg_data || + before(tp->urg_seq, tp->copied_seq) || + !before(tp->urg_seq, tp->rcv_nxt)) { + answ = tp->rcv_nxt - tp->copied_seq; + + /* Subtract 1, if FIN is in queue. */ + if (answ && !skb_queue_empty(&sk->sk_receive_queue)) + answ -= + ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin; + } else + answ = tp->urg_seq - tp->copied_seq; + release_sock(sk); + break; + case SIOCATMARK: + answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + break; + case SIOCOUTQ: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + answ = 0; + else + answ = tp->write_seq - tp->snd_una; + break; + default: + return -ENOIOCTLCMD; + }; + + return put_user(answ, (int __user *)arg); +} + + +int tcp_listen_start(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_listen_opt *lopt; + + sk->sk_max_ack_backlog = 0; + sk->sk_ack_backlog = 0; + tp->accept_queue = tp->accept_queue_tail = NULL; + rwlock_init(&tp->syn_wait_lock); + tcp_delack_init(tp); + + lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL); + if (!lopt) + return -ENOMEM; + + memset(lopt, 0, sizeof(struct tcp_listen_opt)); + for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++) + if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog) + break; + get_random_bytes(&lopt->hash_rnd, 4); + + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = lopt; + write_unlock_bh(&tp->syn_wait_lock); + + /* There is race window here: we announce ourselves listening, + * but this transition is still not validated by get_port(). + * It is OK, because this socket enters to hash table only + * after validation is complete. + */ + sk->sk_state = TCP_LISTEN; + if (!sk->sk_prot->get_port(sk, inet->num)) { + inet->sport = htons(inet->num); + + sk_dst_reset(sk); + sk->sk_prot->hash(sk); + + return 0; + } + + sk->sk_state = TCP_CLOSE; + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = NULL; + write_unlock_bh(&tp->syn_wait_lock); + kfree(lopt); + return -EADDRINUSE; +} + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. + */ + +static void tcp_listen_stop (struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *acc_req = tp->accept_queue; + struct open_request *req; + int i; + + tcp_delete_keepalive_timer(sk); + + /* make all the listen_opt local to us */ + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = NULL; + write_unlock_bh(&tp->syn_wait_lock); + tp->accept_queue = tp->accept_queue_tail = NULL; + + if (lopt->qlen) { + for (i = 0; i < TCP_SYNQ_HSIZE; i++) { + while ((req = lopt->syn_table[i]) != NULL) { + lopt->syn_table[i] = req->dl_next; + lopt->qlen--; + tcp_openreq_free(req); + + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. --ANK + */ + } + } + } + BUG_TRAP(!lopt->qlen); + + kfree(lopt); + + while ((req = acc_req) != NULL) { + struct sock *child = req->sk; + + acc_req = req->dl_next; + + local_bh_disable(); + bh_lock_sock(child); + BUG_TRAP(!sock_owned_by_user(child)); + sock_hold(child); + + tcp_disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + atomic_inc(&tcp_orphan_count); + + tcp_destroy_sock(child); + + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + + sk_acceptq_removed(sk); + tcp_openreq_fastfree(req); + } + BUG_TRAP(!sk->sk_ack_backlog); +} + +static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + tp->pushed_seq = tp->write_seq; +} + +static inline int forced_push(struct tcp_sock *tp) +{ + return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); +} + +static inline void skb_entail(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) +{ + skb->csum = 0; + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = tp->write_seq; + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(skb)->sacked = 0; + skb_header_release(skb); + __skb_queue_tail(&sk->sk_write_queue, skb); + sk_charge_skb(sk, skb); + if (!sk->sk_send_head) + sk->sk_send_head = skb; + else if (tp->nonagle&TCP_NAGLE_PUSH) + tp->nonagle &= ~TCP_NAGLE_PUSH; +} + +static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, + struct sk_buff *skb) +{ + if (flags & MSG_OOB) { + tp->urg_mode = 1; + tp->snd_up = tp->write_seq; + TCP_SKB_CB(skb)->sacked |= TCPCB_URG; + } +} + +static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags, + int mss_now, int nonagle) +{ + if (sk->sk_send_head) { + struct sk_buff *skb = sk->sk_write_queue.prev; + if (!(flags & MSG_MORE) || forced_push(tp)) + tcp_mark_push(tp, skb); + tcp_mark_urg(tp, flags, skb); + __tcp_push_pending_frames(sk, tp, mss_now, + (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); + } +} + +static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, + size_t psize, int flags) +{ + struct tcp_sock *tp = tcp_sk(sk); + int mss_now; + int err; + ssize_t copied; + long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + /* Wait for a connection to finish. */ + if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + goto out_err; + + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + copied = 0; + + err = -EPIPE; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + + while (psize > 0) { + struct sk_buff *skb = sk->sk_write_queue.prev; + struct page *page = pages[poffset / PAGE_SIZE]; + int copy, i, can_coalesce; + int offset = poffset % PAGE_SIZE; + int size = min_t(size_t, psize, PAGE_SIZE - offset); + + if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { +new_segment: + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + skb = sk_stream_alloc_pskb(sk, 0, 0, + sk->sk_allocation); + if (!skb) + goto wait_for_memory; + + skb_entail(sk, tp, skb); + copy = mss_now; + } + + if (copy > size) + copy = size; + + i = skb_shinfo(skb)->nr_frags; + can_coalesce = skb_can_coalesce(skb, i, page, offset); + if (!can_coalesce && i >= MAX_SKB_FRAGS) { + tcp_mark_push(tp, skb); + goto new_segment; + } + if (sk->sk_forward_alloc < copy && + !sk_stream_mem_schedule(sk, copy, 0)) + goto wait_for_memory; + + if (can_coalesce) { + skb_shinfo(skb)->frags[i - 1].size += copy; + } else { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, copy); + } + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk->sk_wmem_queued += copy; + sk->sk_forward_alloc -= copy; + skb->ip_summed = CHECKSUM_HW; + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + skb_shinfo(skb)->tso_segs = 0; + + if (!copied) + TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; + + copied += copy; + poffset += copy; + if (!(psize -= copy)) + goto out; + + if (skb->len != mss_now || (flags & MSG_OOB)) + continue; + + if (forced_push(tp)) { + tcp_mark_push(tp, skb); + __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); + } else if (skb == sk->sk_send_head) + tcp_push_one(sk, mss_now); + continue; + +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + if (copied) + tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + + if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + goto do_error; + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + } + +out: + if (copied) + tcp_push(sk, tp, flags, mss_now, tp->nonagle); + return copied; + +do_error: + if (copied) + goto out; +out_err: + return sk_stream_error(sk, flags, err); +} + +ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, + size_t size, int flags) +{ + ssize_t res; + struct sock *sk = sock->sk; + +#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) + + if (!(sk->sk_route_caps & NETIF_F_SG) || + !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) + return sock_no_sendpage(sock, page, offset, size, flags); + +#undef TCP_ZC_CSUM_FLAGS + + lock_sock(sk); + TCP_CHECK_TIMER(sk); + res = do_tcp_sendpages(sk, &page, offset, size, flags); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; +} + +#define TCP_PAGE(sk) (sk->sk_sndmsg_page) +#define TCP_OFF(sk) (sk->sk_sndmsg_off) + +static inline int select_size(struct sock *sk, struct tcp_sock *tp) +{ + int tmp = tp->mss_cache_std; + + if (sk->sk_route_caps & NETIF_F_SG) { + int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + + if (tmp >= pgbreak && + tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) + tmp = pgbreak; + } + return tmp; +} + +int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t size) +{ + struct iovec *iov; + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int iovlen, flags; + int mss_now; + int err, copied; + long timeo; + + lock_sock(sk); + TCP_CHECK_TIMER(sk); + + flags = msg->msg_flags; + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + /* Wait for a connection to finish. */ + if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + goto out_err; + + /* This should be in poll */ + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + + /* Ok commence sending. */ + iovlen = msg->msg_iovlen; + iov = msg->msg_iov; + copied = 0; + + err = -EPIPE; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + + while (--iovlen >= 0) { + int seglen = iov->iov_len; + unsigned char __user *from = iov->iov_base; + + iov++; + + while (seglen > 0) { + int copy; + + skb = sk->sk_write_queue.prev; + + if (!sk->sk_send_head || + (copy = mss_now - skb->len) <= 0) { + +new_segment: + /* Allocate new segment. If the interface is SG, + * allocate skb fitting to single page. + */ + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + skb = sk_stream_alloc_pskb(sk, select_size(sk, tp), + 0, sk->sk_allocation); + if (!skb) + goto wait_for_memory; + + /* + * Check whether we can use HW checksum. + */ + if (sk->sk_route_caps & + (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | + NETIF_F_HW_CSUM)) + skb->ip_summed = CHECKSUM_HW; + + skb_entail(sk, tp, skb); + copy = mss_now; + } + + /* Try to append data to the end of skb. */ + if (copy > seglen) + copy = seglen; + + /* Where to copy to? */ + if (skb_tailroom(skb) > 0) { + /* We have some space in skb head. Superb! */ + if (copy > skb_tailroom(skb)) + copy = skb_tailroom(skb); + if ((err = skb_add_data(skb, from, copy)) != 0) + goto do_fault; + } else { + int merge = 0; + int i = skb_shinfo(skb)->nr_frags; + struct page *page = TCP_PAGE(sk); + int off = TCP_OFF(sk); + + if (skb_can_coalesce(skb, i, page, off) && + off != PAGE_SIZE) { + /* We can extend the last page + * fragment. */ + merge = 1; + } else if (i == MAX_SKB_FRAGS || + (!i && + !(sk->sk_route_caps & NETIF_F_SG))) { + /* Need to add new fragment and cannot + * do this because interface is non-SG, + * or because all the page slots are + * busy. */ + tcp_mark_push(tp, skb); + goto new_segment; + } else if (page) { + /* If page is cached, align + * offset to L1 cache boundary + */ + off = (off + L1_CACHE_BYTES - 1) & + ~(L1_CACHE_BYTES - 1); + if (off == PAGE_SIZE) { + put_page(page); + TCP_PAGE(sk) = page = NULL; + } + } + + if (!page) { + /* Allocate new cache page. */ + if (!(page = sk_stream_alloc_page(sk))) + goto wait_for_memory; + off = 0; + } + + if (copy > PAGE_SIZE - off) + copy = PAGE_SIZE - off; + + /* Time to copy data. We are close to + * the end! */ + err = skb_copy_to_page(sk, from, skb, page, + off, copy); + if (err) { + /* If this page was new, give it to the + * socket so it does not get leaked. + */ + if (!TCP_PAGE(sk)) { + TCP_PAGE(sk) = page; + TCP_OFF(sk) = 0; + } + goto do_error; + } + + /* Update the skb. */ + if (merge) { + skb_shinfo(skb)->frags[i - 1].size += + copy; + } else { + skb_fill_page_desc(skb, i, page, off, copy); + if (TCP_PAGE(sk)) { + get_page(page); + } else if (off + copy < PAGE_SIZE) { + get_page(page); + TCP_PAGE(sk) = page; + } + } + + TCP_OFF(sk) = off + copy; + } + + if (!copied) + TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; + + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + skb_shinfo(skb)->tso_segs = 0; + + from += copy; + copied += copy; + if ((seglen -= copy) == 0 && iovlen == 0) + goto out; + + if (skb->len != mss_now || (flags & MSG_OOB)) + continue; + + if (forced_push(tp)) { + tcp_mark_push(tp, skb); + __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); + } else if (skb == sk->sk_send_head) + tcp_push_one(sk, mss_now); + continue; + +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + if (copied) + tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + + if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + goto do_error; + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + } + } + +out: + if (copied) + tcp_push(sk, tp, flags, mss_now, tp->nonagle); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return copied; + +do_fault: + if (!skb->len) { + if (sk->sk_send_head == skb) + sk->sk_send_head = NULL; + __skb_unlink(skb, skb->list); + sk_stream_free_skb(sk, skb); + } + +do_error: + if (copied) + goto out; +out_err: + err = sk_stream_error(sk, flags, err); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return err; +} + +/* + * Handle reading urgent data. BSD has very simple semantics for + * this, no blocking and very strange errors 8) + */ + +static int tcp_recv_urg(struct sock *sk, long timeo, + struct msghdr *msg, int len, int flags, + int *addr_len) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* No URG data to read. */ + if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || + tp->urg_data == TCP_URG_READ) + return -EINVAL; /* Yes this is right ! */ + + if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) + return -ENOTCONN; + + if (tp->urg_data & TCP_URG_VALID) { + int err = 0; + char c = tp->urg_data; + + if (!(flags & MSG_PEEK)) + tp->urg_data = TCP_URG_READ; + + /* Read urgent data. */ + msg->msg_flags |= MSG_OOB; + + if (len > 0) { + if (!(flags & MSG_TRUNC)) + err = memcpy_toiovec(msg->msg_iov, &c, 1); + len = 1; + } else + msg->msg_flags |= MSG_TRUNC; + + return err ? -EFAULT : len; + } + + if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) + return 0; + + /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and + * the available implementations agree in this case: + * this call should never block, independent of the + * blocking state of the socket. + * Mike <pall@rz.uni-karlsruhe.de> + */ + return -EAGAIN; +} + +/* Clean up the receive buffer for full frames taken by the user, + * then send an ACK if necessary. COPIED is the number of bytes + * tcp_recvmsg has given to the user so far, it speeds up the + * calculation of whether or not we must ACK for the sake of + * a window update. + */ +static void cleanup_rbuf(struct sock *sk, int copied) +{ + struct tcp_sock *tp = tcp_sk(sk); + int time_to_ack = 0; + +#if TCP_DEBUG + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + + BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); +#endif + + if (tcp_ack_scheduled(tp)) { + /* Delayed ACKs frequently hit locked sockets during bulk + * receive. */ + if (tp->ack.blocked || + /* Once-per-two-segments ACK was not sent by tcp_input.c */ + tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss || + /* + * If this read emptied read buffer, we send ACK, if + * connection is not bidirectional, user drained + * receive buffer and there was a small segment + * in queue. + */ + (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) && + !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) + time_to_ack = 1; + } + + /* We send an ACK if we can now advertise a non-zero window + * which has been raised "significantly". + * + * Even if window raised up to infinity, do not send window open ACK + * in states, where we will not receive more. It is useless. + */ + if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + __u32 rcv_window_now = tcp_receive_window(tp); + + /* Optimize, __tcp_select_window() is not cheap. */ + if (2*rcv_window_now <= tp->window_clamp) { + __u32 new_window = __tcp_select_window(sk); + + /* Send ACK now, if this read freed lots of space + * in our buffer. Certainly, new_window is new window. + * We can advertise it now, if it is not less than current one. + * "Lots" means "at least twice" here. + */ + if (new_window && new_window >= 2 * rcv_window_now) + time_to_ack = 1; + } + } + if (time_to_ack) + tcp_send_ack(sk); +} + +static void tcp_prequeue_process(struct sock *sk) +{ + struct sk_buff *skb; + struct tcp_sock *tp = tcp_sk(sk); + + NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue)); + + /* RX process wants to run with disabled BHs, though it is not + * necessary */ + local_bh_disable(); + while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) + sk->sk_backlog_rcv(sk, skb); + local_bh_enable(); + + /* Clear memory counter. */ + tp->ucopy.memory = 0; +} + +static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) +{ + struct sk_buff *skb; + u32 offset; + + skb_queue_walk(&sk->sk_receive_queue, skb) { + offset = seq - TCP_SKB_CB(skb)->seq; + if (skb->h.th->syn) + offset--; + if (offset < skb->len || skb->h.th->fin) { + *off = offset; + return skb; + } + } + return NULL; +} + +/* + * This routine provides an alternative to tcp_recvmsg() for routines + * that would like to handle copying from skbuffs directly in 'sendfile' + * fashion. + * Note: + * - It is assumed that the socket was locked by the caller. + * - The routine does not block. + * - At present, there is no support for reading OOB data + * or for 'peeking' the socket using this routine + * (although both would be easy to implement). + */ +int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + struct sk_buff *skb; + struct tcp_sock *tp = tcp_sk(sk); + u32 seq = tp->copied_seq; + u32 offset; + int copied = 0; + + if (sk->sk_state == TCP_LISTEN) + return -ENOTCONN; + while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + if (offset < skb->len) { + size_t used, len; + + len = skb->len - offset; + /* Stop reading if we hit a patch of urgent data */ + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - seq; + if (urg_offset < len) + len = urg_offset; + if (!len) + break; + } + used = recv_actor(desc, skb, offset, len); + if (used <= len) { + seq += used; + copied += used; + offset += used; + } + if (offset != skb->len) + break; + } + if (skb->h.th->fin) { + sk_eat_skb(sk, skb); + ++seq; + break; + } + sk_eat_skb(sk, skb); + if (!desc->count) + break; + } + tp->copied_seq = seq; + + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + if (copied) + cleanup_rbuf(sk, copied); + return copied; +} + +/* + * This routine copies from a sock struct into the user buffer. + * + * Technical note: in 2.3 we work on _locked_ socket, so that + * tricks with *seq access order and skb->users are not required. + * Probably, code can be easily improved even more. + */ + +int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int nonblock, int flags, int *addr_len) +{ + struct tcp_sock *tp = tcp_sk(sk); + int copied = 0; + u32 peek_seq; + u32 *seq; + unsigned long used; + int err; + int target; /* Read at least this many bytes */ + long timeo; + struct task_struct *user_recv = NULL; + + lock_sock(sk); + + TCP_CHECK_TIMER(sk); + + err = -ENOTCONN; + if (sk->sk_state == TCP_LISTEN) + goto out; + + timeo = sock_rcvtimeo(sk, nonblock); + + /* Urgent data needs to be handled specially. */ + if (flags & MSG_OOB) + goto recv_urg; + + seq = &tp->copied_seq; + if (flags & MSG_PEEK) { + peek_seq = tp->copied_seq; + seq = &peek_seq; + } + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + do { + struct sk_buff *skb; + u32 offset; + + /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ + if (tp->urg_data && tp->urg_seq == *seq) { + if (copied) + break; + if (signal_pending(current)) { + copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; + break; + } + } + + /* Next get a buffer. */ + + skb = skb_peek(&sk->sk_receive_queue); + do { + if (!skb) + break; + + /* Now that we have two receive queues this + * shouldn't happen. + */ + if (before(*seq, TCP_SKB_CB(skb)->seq)) { + printk(KERN_INFO "recvmsg bug: copied %X " + "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); + break; + } + offset = *seq - TCP_SKB_CB(skb)->seq; + if (skb->h.th->syn) + offset--; + if (offset < skb->len) + goto found_ok_skb; + if (skb->h.th->fin) + goto found_fin_ok; + BUG_TRAP(flags & MSG_PEEK); + skb = skb->next; + } while (skb != (struct sk_buff *)&sk->sk_receive_queue); + + /* Well, if we have backlog, try to process it now yet. */ + + if (copied >= target && !sk->sk_backlog.tail) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current) || + (flags & MSG_PEEK)) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == TCP_CLOSE) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; + break; + } + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + + cleanup_rbuf(sk, copied); + + if (tp->ucopy.task == user_recv) { + /* Install new reader */ + if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { + user_recv = current; + tp->ucopy.task = user_recv; + tp->ucopy.iov = msg->msg_iov; + } + + tp->ucopy.len = len; + + BUG_TRAP(tp->copied_seq == tp->rcv_nxt || + (flags & (MSG_PEEK | MSG_TRUNC))); + + /* Ugly... If prequeue is not empty, we have to + * process it before releasing socket, otherwise + * order will be broken at second iteration. + * More elegant solution is required!!! + * + * Look: we have the following (pseudo)queues: + * + * 1. packets in flight + * 2. backlog + * 3. prequeue + * 4. receive_queue + * + * Each queue can be processed only if the next ones + * are empty. At this point we have empty receive_queue. + * But prequeue _can_ be not empty after 2nd iteration, + * when we jumped to start of loop because backlog + * processing added something to receive_queue. + * We cannot release_sock(), because backlog contains + * packets arrived _after_ prequeued ones. + * + * Shortly, algorithm is clear --- to process all + * the queues in order. We could make it more directly, + * requeueing packets from backlog to prequeue, if + * is not empty. It is more elegant, but eats cycles, + * unfortunately. + */ + if (skb_queue_len(&tp->ucopy.prequeue)) + goto do_prequeue; + + /* __ Set realtime policy in scheduler __ */ + } + + if (copied >= target) { + /* Do not sleep, just process backlog. */ + release_sock(sk); + lock_sock(sk); + } else + sk_wait_data(sk, &timeo); + + if (user_recv) { + int chunk; + + /* __ Restore normal policy in scheduler __ */ + + if ((chunk = len - tp->ucopy.len) != 0) { + NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); + len -= chunk; + copied += chunk; + } + + if (tp->rcv_nxt == tp->copied_seq && + skb_queue_len(&tp->ucopy.prequeue)) { +do_prequeue: + tcp_prequeue_process(sk); + + if ((chunk = len - tp->ucopy.len) != 0) { + NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + len -= chunk; + copied += chunk; + } + } + } + if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { + if (net_ratelimit()) + printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", + current->comm, current->pid); + peek_seq = tp->copied_seq; + } + continue; + + found_ok_skb: + /* Ok so how much can we use? */ + used = skb->len - offset; + if (len < used) + used = len; + + /* Do we have urgent data here? */ + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - *seq; + if (urg_offset < used) { + if (!urg_offset) { + if (!sock_flag(sk, SOCK_URGINLINE)) { + ++*seq; + offset++; + used--; + if (!used) + goto skip_copy; + } + } else + used = urg_offset; + } + } + + if (!(flags & MSG_TRUNC)) { + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + } + + *seq += used; + copied += used; + len -= used; + + tcp_rcv_space_adjust(sk); + +skip_copy: + if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { + tp->urg_data = 0; + tcp_fast_path_check(sk, tp); + } + if (used + offset < skb->len) + continue; + + if (skb->h.th->fin) + goto found_fin_ok; + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + continue; + + found_fin_ok: + /* Process the FIN. */ + ++*seq; + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + break; + } while (len > 0); + + if (user_recv) { + if (skb_queue_len(&tp->ucopy.prequeue)) { + int chunk; + + tp->ucopy.len = copied > 0 ? len : 0; + + tcp_prequeue_process(sk); + + if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { + NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + len -= chunk; + copied += chunk; + } + } + + tp->ucopy.task = NULL; + tp->ucopy.len = 0; + } + + /* According to UNIX98, msg_name/msg_namelen are ignored + * on connected socket. I was just happy when found this 8) --ANK + */ + + /* Clean up data we have read: This will do ACK frames. */ + cleanup_rbuf(sk, copied); + + TCP_CHECK_TIMER(sk); + release_sock(sk); + return copied; + +out: + TCP_CHECK_TIMER(sk); + release_sock(sk); + return err; + +recv_urg: + err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); + goto out; +} + +/* + * State processing on a close. This implements the state shift for + * sending our FIN frame. Note that we only send a FIN for some + * states. A shutdown() may have already sent the FIN, or we may be + * closed. + */ + +static unsigned char new_state[16] = { + /* current state: new state: action: */ + /* (Invalid) */ TCP_CLOSE, + /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_SYN_SENT */ TCP_CLOSE, + /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, + /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, + /* TCP_TIME_WAIT */ TCP_CLOSE, + /* TCP_CLOSE */ TCP_CLOSE, + /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, + /* TCP_LAST_ACK */ TCP_LAST_ACK, + /* TCP_LISTEN */ TCP_CLOSE, + /* TCP_CLOSING */ TCP_CLOSING, +}; + +static int tcp_close_state(struct sock *sk) +{ + int next = (int)new_state[sk->sk_state]; + int ns = next & TCP_STATE_MASK; + + tcp_set_state(sk, ns); + + return next & TCP_ACTION_FIN; +} + +/* + * Shutdown the sending side of a connection. Much like close except + * that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD). + */ + +void tcp_shutdown(struct sock *sk, int how) +{ + /* We need to grab some memory, and put together a FIN, + * and then put it into the queue to be sent. + * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. + */ + if (!(how & SEND_SHUTDOWN)) + return; + + /* If we've already sent a FIN, or it's a closed state, skip this. */ + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { + /* Clear out any half completed packets. FIN if needed. */ + if (tcp_close_state(sk)) + tcp_send_fin(sk); + } +} + +/* + * At this point, there should be no process reference to this + * socket, and thus no user references at all. Therefore we + * can assume the socket waitqueue is inactive and nobody will + * try to jump onto it. + */ +void tcp_destroy_sock(struct sock *sk) +{ + BUG_TRAP(sk->sk_state == TCP_CLOSE); + BUG_TRAP(sock_flag(sk, SOCK_DEAD)); + + /* It cannot be in hash table! */ + BUG_TRAP(sk_unhashed(sk)); + + /* If it has not 0 inet_sk(sk)->num, it must be bound */ + BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash); + + sk->sk_prot->destroy(sk); + + sk_stream_kill_queues(sk); + + xfrm_sk_free_policy(sk); + +#ifdef INET_REFCNT_DEBUG + if (atomic_read(&sk->sk_refcnt) != 1) { + printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", + sk, atomic_read(&sk->sk_refcnt)); + } +#endif + + atomic_dec(&tcp_orphan_count); + sock_put(sk); +} + +void tcp_close(struct sock *sk, long timeout) +{ + struct sk_buff *skb; + int data_was_unread = 0; + + lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + if (sk->sk_state == TCP_LISTEN) { + tcp_set_state(sk, TCP_CLOSE); + + /* Special case. */ + tcp_listen_stop(sk); + + goto adjudge_to_death; + } + + /* We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + * reader process may not have drained the data yet! + */ + while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - + skb->h.th->fin; + data_was_unread += len; + __kfree_skb(skb); + } + + sk_stream_mem_reclaim(sk); + + /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section + * 3.10, we send a RST here because data was lost. To + * witness the awful effects of the old behavior of always + * doing a FIN, run an older 2.1.x kernel or 2.0.x, start + * a bulk GET in an FTP client, suspend the process, wait + * for the client to advertise a zero window, then kill -9 + * the FTP client, wheee... Note: timeout is always zero + * in such a case. + */ + if (data_was_unread) { + /* Unread data was tossed, zap the connection. */ + NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE); + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_KERNEL); + } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { + /* Check zero linger _after_ checking for unread data. */ + sk->sk_prot->disconnect(sk, 0); + NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA); + } else if (tcp_close_state(sk)) { + /* We FIN if the application ate all the data before + * zapping the connection. + */ + + /* RED-PEN. Formally speaking, we have broken TCP state + * machine. State transitions: + * + * TCP_ESTABLISHED -> TCP_FIN_WAIT1 + * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) + * TCP_CLOSE_WAIT -> TCP_LAST_ACK + * + * are legal only when FIN has been sent (i.e. in window), + * rather than queued out of window. Purists blame. + * + * F.e. "RFC state" is ESTABLISHED, + * if Linux state is FIN-WAIT-1, but FIN is still not sent. + * + * The visible declinations are that sometimes + * we enter time-wait state, when it is not required really + * (harmless), do not send active resets, when they are + * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when + * they look as CLOSING or LAST_ACK for Linux) + * Probably, I missed some more holelets. + * --ANK + */ + tcp_send_fin(sk); + } + + sk_stream_wait_close(sk, timeout); + +adjudge_to_death: + /* It is the last release_sock in its life. It will remove backlog. */ + release_sock(sk); + + + /* Now socket is owned by kernel and we acquire BH lock + to finish close. No need to check for user refs. + */ + local_bh_disable(); + bh_lock_sock(sk); + BUG_TRAP(!sock_owned_by_user(sk)); + + sock_hold(sk); + sock_orphan(sk); + + /* This is a (useful) BSD violating of the RFC. There is a + * problem with TCP as specified in that the other end could + * keep a socket open forever with no application left this end. + * We use a 3 minute timeout (about the same as BSD) then kill + * our end. If they send after that then tough - BUT: long enough + * that we won't make the old 4*rto = almost no time - whoops + * reset mistake. + * + * Nope, it was not mistake. It is really desired behaviour + * f.e. on http servers, when such sockets are useless, but + * consume significant resources. Let's do it with special + * linger2 option. --ANK + */ + + if (sk->sk_state == TCP_FIN_WAIT2) { + struct tcp_sock *tp = tcp_sk(sk); + if (tp->linger2 < 0) { + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); + } else { + int tmo = tcp_fin_time(tp); + + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); + } else { + atomic_inc(&tcp_orphan_count); + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + } + if (sk->sk_state != TCP_CLOSE) { + sk_stream_mem_reclaim(sk); + if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || + (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { + if (net_ratelimit()) + printk(KERN_INFO "TCP: too many of orphaned " + "sockets\n"); + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + } + } + atomic_inc(&tcp_orphan_count); + + if (sk->sk_state == TCP_CLOSE) + tcp_destroy_sock(sk); + /* Otherwise, socket is reprieved until protocol close. */ + +out: + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); +} + +/* These states need RST on ABORT according to RFC793 */ + +static inline int tcp_need_reset(int state) +{ + return (1 << state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | + TCPF_FIN_WAIT2 | TCPF_SYN_RECV); +} + +int tcp_disconnect(struct sock *sk, int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int err = 0; + int old_state = sk->sk_state; + + if (old_state != TCP_CLOSE) + tcp_set_state(sk, TCP_CLOSE); + + /* ABORT function of RFC793 */ + if (old_state == TCP_LISTEN) { + tcp_listen_stop(sk); + } else if (tcp_need_reset(old_state) || + (tp->snd_nxt != tp->write_seq && + (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { + /* The last check adjusts for discrepance of Linux wrt. RFC + * states + */ + tcp_send_active_reset(sk, gfp_any()); + sk->sk_err = ECONNRESET; + } else if (old_state == TCP_SYN_SENT) + sk->sk_err = ECONNRESET; + + tcp_clear_xmit_timers(sk); + __skb_queue_purge(&sk->sk_receive_queue); + sk_stream_writequeue_purge(sk); + __skb_queue_purge(&tp->out_of_order_queue); + + inet->dport = 0; + + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); + + sk->sk_shutdown = 0; + sock_reset_flag(sk, SOCK_DONE); + tp->srtt = 0; + if ((tp->write_seq += tp->max_window + 2) == 0) + tp->write_seq = 1; + tp->backoff = 0; + tp->snd_cwnd = 2; + tp->probes_out = 0; + tp->packets_out = 0; + tp->snd_ssthresh = 0x7fffffff; + tp->snd_cwnd_cnt = 0; + tcp_set_ca_state(tp, TCP_CA_Open); + tcp_clear_retrans(tp); + tcp_delack_init(tp); + sk->sk_send_head = NULL; + tp->rx_opt.saw_tstamp = 0; + tcp_sack_reset(&tp->rx_opt); + __sk_dst_reset(sk); + + BUG_TRAP(!inet->num || tp->bind_hash); + + sk->sk_error_report(sk); + return err; +} + +/* + * Wait for an incoming connection, avoid race + * conditions. This must be called with the socket locked. + */ +static int wait_for_connect(struct sock *sk, long timeo) +{ + struct tcp_sock *tp = tcp_sk(sk); + DEFINE_WAIT(wait); + int err; + + /* + * True wake-one mechanism for incoming connections: only + * one process gets woken up, not the 'whole herd'. + * Since we do not 'race & poll' for established sockets + * anymore, the common case will execute the loop only once. + * + * Subtle issue: "add_wait_queue_exclusive()" will be added + * after any current non-exclusive waiters, and we know that + * it will always _stay_ after any new non-exclusive waiters + * because all non-exclusive waiters are added at the + * beginning of the wait-queue. As such, it's ok to "drop" + * our exclusiveness temporarily when we get woken up without + * having to remove and re-insert us on the wait queue. + */ + for (;;) { + prepare_to_wait_exclusive(sk->sk_sleep, &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + if (!tp->accept_queue) + timeo = schedule_timeout(timeo); + lock_sock(sk); + err = 0; + if (tp->accept_queue) + break; + err = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!timeo) + break; + } + finish_wait(sk->sk_sleep, &wait); + return err; +} + +/* + * This will accept the next outstanding connection. + */ + +struct sock *tcp_accept(struct sock *sk, int flags, int *err) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct open_request *req; + struct sock *newsk; + int error; + + lock_sock(sk); + + /* We need to make sure that this socket is listening, + * and that it has something pending. + */ + error = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out; + + /* Find already established connection */ + if (!tp->accept_queue) { + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + /* If this is a non blocking socket don't sleep */ + error = -EAGAIN; + if (!timeo) + goto out; + + error = wait_for_connect(sk, timeo); + if (error) + goto out; + } + + req = tp->accept_queue; + if ((tp->accept_queue = req->dl_next) == NULL) + tp->accept_queue_tail = NULL; + + newsk = req->sk; + sk_acceptq_removed(sk); + tcp_openreq_fastfree(req); + BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); + release_sock(sk); + return newsk; + +out: + release_sock(sk); + *err = error; + return NULL; +} + +/* + * Socket option code for TCP. + */ +int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, + int optlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int val; + int err = 0; + + if (level != SOL_TCP) + return tp->af_specific->setsockopt(sk, level, optname, + optval, optlen); + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case TCP_MAXSEG: + /* Values greater than interface MTU won't take effect. However + * at the point when this call is done we typically don't yet + * know which interface is going to be used */ + if (val < 8 || val > MAX_TCP_WINDOW) { + err = -EINVAL; + break; + } + tp->rx_opt.user_mss = val; + break; + + case TCP_NODELAY: + if (val) { + /* TCP_NODELAY is weaker than TCP_CORK, so that + * this option on corked socket is remembered, but + * it is not activated until cork is cleared. + * + * However, when TCP_NODELAY is set we make + * an explicit push, which overrides even TCP_CORK + * for currently queued segments. + */ + tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk, tp); + } else { + tp->nonagle &= ~TCP_NAGLE_OFF; + } + break; + + case TCP_CORK: + /* When set indicates to always queue non-full frames. + * Later the user clears this option and we transmit + * any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly + * filled frames when the user (for example) must write + * out headers with a write() call first and then use + * sendfile to send out the data parts. + * + * TCP_CORK can be set together with TCP_NODELAY and it is + * stronger than TCP_NODELAY. + */ + if (val) { + tp->nonagle |= TCP_NAGLE_CORK; + } else { + tp->nonagle &= ~TCP_NAGLE_CORK; + if (tp->nonagle&TCP_NAGLE_OFF) + tp->nonagle |= TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk, tp); + } + break; + + case TCP_KEEPIDLE: + if (val < 1 || val > MAX_TCP_KEEPIDLE) + err = -EINVAL; + else { + tp->keepalive_time = val * HZ; + if (sock_flag(sk, SOCK_KEEPOPEN) && + !((1 << sk->sk_state) & + (TCPF_CLOSE | TCPF_LISTEN))) { + __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; + if (tp->keepalive_time > elapsed) + elapsed = tp->keepalive_time - elapsed; + else + elapsed = 0; + tcp_reset_keepalive_timer(sk, elapsed); + } + } + break; + case TCP_KEEPINTVL: + if (val < 1 || val > MAX_TCP_KEEPINTVL) + err = -EINVAL; + else + tp->keepalive_intvl = val * HZ; + break; + case TCP_KEEPCNT: + if (val < 1 || val > MAX_TCP_KEEPCNT) + err = -EINVAL; + else + tp->keepalive_probes = val; + break; + case TCP_SYNCNT: + if (val < 1 || val > MAX_TCP_SYNCNT) + err = -EINVAL; + else + tp->syn_retries = val; + break; + + case TCP_LINGER2: + if (val < 0) + tp->linger2 = -1; + else if (val > sysctl_tcp_fin_timeout / HZ) + tp->linger2 = 0; + else + tp->linger2 = val * HZ; + break; + + case TCP_DEFER_ACCEPT: + tp->defer_accept = 0; + if (val > 0) { + /* Translate value in seconds to number of + * retransmits */ + while (tp->defer_accept < 32 && + val > ((TCP_TIMEOUT_INIT / HZ) << + tp->defer_accept)) + tp->defer_accept++; + tp->defer_accept++; + } + break; + + case TCP_WINDOW_CLAMP: + if (!val) { + if (sk->sk_state != TCP_CLOSE) { + err = -EINVAL; + break; + } + tp->window_clamp = 0; + } else + tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? + SOCK_MIN_RCVBUF / 2 : val; + break; + + case TCP_QUICKACK: + if (!val) { + tp->ack.pingpong = 1; + } else { + tp->ack.pingpong = 0; + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + tcp_ack_scheduled(tp)) { + tp->ack.pending |= TCP_ACK_PUSHED; + cleanup_rbuf(sk, 1); + if (!(val & 1)) + tp->ack.pingpong = 1; + } + } + break; + + default: + err = -ENOPROTOOPT; + break; + }; + release_sock(sk); + return err; +} + +/* Return information about state of tcp endpoint in API format. */ +void tcp_get_info(struct sock *sk, struct tcp_info *info) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 now = tcp_time_stamp; + + memset(info, 0, sizeof(*info)); + + info->tcpi_state = sk->sk_state; + info->tcpi_ca_state = tp->ca_state; + info->tcpi_retransmits = tp->retransmits; + info->tcpi_probes = tp->probes_out; + info->tcpi_backoff = tp->backoff; + + if (tp->rx_opt.tstamp_ok) + info->tcpi_options |= TCPI_OPT_TIMESTAMPS; + if (tp->rx_opt.sack_ok) + info->tcpi_options |= TCPI_OPT_SACK; + if (tp->rx_opt.wscale_ok) { + info->tcpi_options |= TCPI_OPT_WSCALE; + info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; + info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; + } + + if (tp->ecn_flags&TCP_ECN_OK) + info->tcpi_options |= TCPI_OPT_ECN; + + info->tcpi_rto = jiffies_to_usecs(tp->rto); + info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); + info->tcpi_snd_mss = tp->mss_cache_std; + info->tcpi_rcv_mss = tp->ack.rcv_mss; + + info->tcpi_unacked = tp->packets_out; + info->tcpi_sacked = tp->sacked_out; + info->tcpi_lost = tp->lost_out; + info->tcpi_retrans = tp->retrans_out; + info->tcpi_fackets = tp->fackets_out; + + info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); + info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime); + info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); + + info->tcpi_pmtu = tp->pmtu_cookie; + info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; + info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; + info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; + info->tcpi_snd_ssthresh = tp->snd_ssthresh; + info->tcpi_snd_cwnd = tp->snd_cwnd; + info->tcpi_advmss = tp->advmss; + info->tcpi_reordering = tp->reordering; + + info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; + info->tcpi_rcv_space = tp->rcvq_space.space; + + info->tcpi_total_retrans = tp->total_retrans; +} + +EXPORT_SYMBOL_GPL(tcp_get_info); + +int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, + int __user *optlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int val, len; + + if (level != SOL_TCP) + return tp->af_specific->getsockopt(sk, level, optname, + optval, optlen); + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + if (len < 0) + return -EINVAL; + + switch (optname) { + case TCP_MAXSEG: + val = tp->mss_cache_std; + if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + val = tp->rx_opt.user_mss; + break; + case TCP_NODELAY: + val = !!(tp->nonagle&TCP_NAGLE_OFF); + break; + case TCP_CORK: + val = !!(tp->nonagle&TCP_NAGLE_CORK); + break; + case TCP_KEEPIDLE: + val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ; + break; + case TCP_KEEPINTVL: + val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ; + break; + case TCP_KEEPCNT: + val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; + break; + case TCP_SYNCNT: + val = tp->syn_retries ? : sysctl_tcp_syn_retries; + break; + case TCP_LINGER2: + val = tp->linger2; + if (val >= 0) + val = (val ? : sysctl_tcp_fin_timeout) / HZ; + break; + case TCP_DEFER_ACCEPT: + val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) << + (tp->defer_accept - 1)); + break; + case TCP_WINDOW_CLAMP: + val = tp->window_clamp; + break; + case TCP_INFO: { + struct tcp_info info; + + if (get_user(len, optlen)) + return -EFAULT; + + tcp_get_info(sk, &info); + + len = min_t(unsigned int, len, sizeof(info)); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + return 0; + } + case TCP_QUICKACK: + val = !tp->ack.pingpong; + break; + default: + return -ENOPROTOOPT; + }; + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + + +extern void __skb_cb_too_small_for_tcp(int, int); +extern void tcpdiag_init(void); + +static __initdata unsigned long thash_entries; +static int __init set_thash_entries(char *str) +{ + if (!str) + return 0; + thash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("thash_entries=", set_thash_entries); + +void __init tcp_init(void) +{ + struct sk_buff *skb = NULL; + int order, i; + + if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) + __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), + sizeof(skb->cb)); + + tcp_openreq_cachep = kmem_cache_create("tcp_open_request", + sizeof(struct open_request), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!tcp_openreq_cachep) + panic("tcp_init: Cannot alloc open_request cache."); + + tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", + sizeof(struct tcp_bind_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!tcp_bucket_cachep) + panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); + + tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", + sizeof(struct tcp_tw_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!tcp_timewait_cachep) + panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); + + /* Size and allocate the main established and bind bucket + * hash tables. + * + * The methodology is similar to that of the buffer cache. + */ + tcp_ehash = (struct tcp_ehash_bucket *) + alloc_large_system_hash("TCP established", + sizeof(struct tcp_ehash_bucket), + thash_entries, + (num_physpages >= 128 * 1024) ? + (25 - PAGE_SHIFT) : + (27 - PAGE_SHIFT), + HASH_HIGHMEM, + &tcp_ehash_size, + NULL, + 0); + tcp_ehash_size = (1 << tcp_ehash_size) >> 1; + for (i = 0; i < (tcp_ehash_size << 1); i++) { + rwlock_init(&tcp_ehash[i].lock); + INIT_HLIST_HEAD(&tcp_ehash[i].chain); + } + + tcp_bhash = (struct tcp_bind_hashbucket *) + alloc_large_system_hash("TCP bind", + sizeof(struct tcp_bind_hashbucket), + tcp_ehash_size, + (num_physpages >= 128 * 1024) ? + (25 - PAGE_SHIFT) : + (27 - PAGE_SHIFT), + HASH_HIGHMEM, + &tcp_bhash_size, + NULL, + 64 * 1024); + tcp_bhash_size = 1 << tcp_bhash_size; + for (i = 0; i < tcp_bhash_size; i++) { + spin_lock_init(&tcp_bhash[i].lock); + INIT_HLIST_HEAD(&tcp_bhash[i].chain); + } + + /* Try to be a bit smarter and adjust defaults depending + * on available memory. + */ + for (order = 0; ((1 << order) << PAGE_SHIFT) < + (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); + order++) + ; + if (order > 4) { + sysctl_local_port_range[0] = 32768; + sysctl_local_port_range[1] = 61000; + sysctl_tcp_max_tw_buckets = 180000; + sysctl_tcp_max_orphans = 4096 << (order - 4); + sysctl_max_syn_backlog = 1024; + } else if (order < 3) { + sysctl_local_port_range[0] = 1024 * (3 - order); + sysctl_tcp_max_tw_buckets >>= (3 - order); + sysctl_tcp_max_orphans >>= (3 - order); + sysctl_max_syn_backlog = 128; + } + tcp_port_rover = sysctl_local_port_range[0] - 1; + + sysctl_tcp_mem[0] = 768 << order; + sysctl_tcp_mem[1] = 1024 << order; + sysctl_tcp_mem[2] = 1536 << order; + + if (order < 3) { + sysctl_tcp_wmem[2] = 64 * 1024; + sysctl_tcp_rmem[0] = PAGE_SIZE; + sysctl_tcp_rmem[1] = 43689; + sysctl_tcp_rmem[2] = 2 * 43689; + } + + printk(KERN_INFO "TCP: Hash tables configured " + "(established %d bind %d)\n", + tcp_ehash_size << 1, tcp_bhash_size); +} + +EXPORT_SYMBOL(tcp_accept); +EXPORT_SYMBOL(tcp_close); +EXPORT_SYMBOL(tcp_destroy_sock); +EXPORT_SYMBOL(tcp_disconnect); +EXPORT_SYMBOL(tcp_getsockopt); +EXPORT_SYMBOL(tcp_ioctl); +EXPORT_SYMBOL(tcp_openreq_cachep); +EXPORT_SYMBOL(tcp_poll); +EXPORT_SYMBOL(tcp_read_sock); +EXPORT_SYMBOL(tcp_recvmsg); +EXPORT_SYMBOL(tcp_sendmsg); +EXPORT_SYMBOL(tcp_sendpage); +EXPORT_SYMBOL(tcp_setsockopt); +EXPORT_SYMBOL(tcp_shutdown); +EXPORT_SYMBOL(tcp_statistics); +EXPORT_SYMBOL(tcp_timewait_cachep); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c new file mode 100644 index 000000000000..313c1408da33 --- /dev/null +++ b/net/ipv4/tcp_diag.c @@ -0,0 +1,802 @@ +/* + * tcp_diag.c Module for monitoring TCP sockets. + * + * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/random.h> +#include <linux/cache.h> +#include <linux/init.h> +#include <linux/time.h> + +#include <net/icmp.h> +#include <net/tcp.h> +#include <net/ipv6.h> +#include <net/inet_common.h> + +#include <linux/inet.h> +#include <linux/stddef.h> + +#include <linux/tcp_diag.h> + +struct tcpdiag_entry +{ + u32 *saddr; + u32 *daddr; + u16 sport; + u16 dport; + u16 family; + u16 userlocks; +}; + +static struct sock *tcpnl; + + +#define TCPDIAG_PUT(skb, attrtype, attrlen) \ +({ int rtalen = RTA_LENGTH(attrlen); \ + struct rtattr *rta; \ + if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \ + rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \ + rta->rta_type = attrtype; \ + rta->rta_len = rtalen; \ + RTA_DATA(rta); }) + +static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, + int ext, u32 pid, u32 seq, u16 nlmsg_flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcpdiagmsg *r; + struct nlmsghdr *nlh; + struct tcp_info *info = NULL; + struct tcpdiag_meminfo *minfo = NULL; + struct tcpvegas_info *vinfo = NULL; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); + nlh->nlmsg_flags = nlmsg_flags; + r = NLMSG_DATA(nlh); + if (sk->sk_state != TCP_TIME_WAIT) { + if (ext & (1<<(TCPDIAG_MEMINFO-1))) + minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo)); + if (ext & (1<<(TCPDIAG_INFO-1))) + info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); + + if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) + && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) + vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); + } + r->tcpdiag_family = sk->sk_family; + r->tcpdiag_state = sk->sk_state; + r->tcpdiag_timer = 0; + r->tcpdiag_retrans = 0; + + r->id.tcpdiag_if = sk->sk_bound_dev_if; + r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk; + r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); + + if (r->tcpdiag_state == TCP_TIME_WAIT) { + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk; + long tmo = tw->tw_ttd - jiffies; + if (tmo < 0) + tmo = 0; + + r->id.tcpdiag_sport = tw->tw_sport; + r->id.tcpdiag_dport = tw->tw_dport; + r->id.tcpdiag_src[0] = tw->tw_rcv_saddr; + r->id.tcpdiag_dst[0] = tw->tw_daddr; + r->tcpdiag_state = tw->tw_substate; + r->tcpdiag_timer = 3; + r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ; + r->tcpdiag_rqueue = 0; + r->tcpdiag_wqueue = 0; + r->tcpdiag_uid = 0; + r->tcpdiag_inode = 0; +#ifdef CONFIG_IP_TCPDIAG_IPV6 + if (r->tcpdiag_family == AF_INET6) { + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, + &tw->tw_v6_rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, + &tw->tw_v6_daddr); + } +#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + } + + r->id.tcpdiag_sport = inet->sport; + r->id.tcpdiag_dport = inet->dport; + r->id.tcpdiag_src[0] = inet->rcv_saddr; + r->id.tcpdiag_dst[0] = inet->daddr; + +#ifdef CONFIG_IP_TCPDIAG_IPV6 + if (r->tcpdiag_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, + &np->rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, + &np->daddr); + } +#endif + +#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ + + if (tp->pending == TCP_TIME_RETRANS) { + r->tcpdiag_timer = 1; + r->tcpdiag_retrans = tp->retransmits; + r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); + } else if (tp->pending == TCP_TIME_PROBE0) { + r->tcpdiag_timer = 4; + r->tcpdiag_retrans = tp->probes_out; + r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); + } else if (timer_pending(&sk->sk_timer)) { + r->tcpdiag_timer = 2; + r->tcpdiag_retrans = tp->probes_out; + r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); + } else { + r->tcpdiag_timer = 0; + r->tcpdiag_expires = 0; + } +#undef EXPIRES_IN_MS + + r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; + r->tcpdiag_uid = sock_i_uid(sk); + r->tcpdiag_inode = sock_i_ino(sk); + + if (minfo) { + minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc); + minfo->tcpdiag_wmem = sk->sk_wmem_queued; + minfo->tcpdiag_fmem = sk->sk_forward_alloc; + minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc); + } + + if (info) + tcp_get_info(sk, info); + + if (vinfo) { + if (tcp_is_vegas(tp)) { + vinfo->tcpv_enabled = tp->vegas.doing_vegas_now; + vinfo->tcpv_rttcnt = tp->vegas.cntRTT; + vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT); + vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT); + } else { + vinfo->tcpv_enabled = 0; + vinfo->tcpv_rttcnt = 0; + vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt); + vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min); + } + } + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, + int dif); +#ifdef CONFIG_IP_TCPDIAG_IPV6 +extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 dport, + int dif); +#else +static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 dport, + int dif) +{ + return NULL; +} +#endif + +static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) +{ + int err; + struct sock *sk; + struct tcpdiagreq *req = NLMSG_DATA(nlh); + struct sk_buff *rep; + + if (req->tcpdiag_family == AF_INET) { + sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, + req->id.tcpdiag_src[0], req->id.tcpdiag_sport, + req->id.tcpdiag_if); + } +#ifdef CONFIG_IP_TCPDIAG_IPV6 + else if (req->tcpdiag_family == AF_INET6) { + sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport, + (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport, + req->id.tcpdiag_if); + } +#endif + else { + return -EINVAL; + } + + if (sk == NULL) + return -ENOENT; + + err = -ESTALE; + if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE || + req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) && + ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] || + (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1])) + goto out; + + err = -ENOMEM; + rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+ + sizeof(struct tcpdiag_meminfo)+ + sizeof(struct tcp_info)+64), GFP_KERNEL); + if (!rep) + goto out; + + if (tcpdiag_fill(rep, sk, req->tcpdiag_ext, + NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, 0) <= 0) + BUG(); + + err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + if (err > 0) + err = 0; + +out: + if (sk) { + if (sk->sk_state == TCP_TIME_WAIT) + tcp_tw_put((struct tcp_tw_bucket*)sk); + else + sock_put(sk); + } + return err; +} + +static int bitstring_match(const u32 *a1, const u32 *a2, int bits) +{ + int words = bits >> 5; + + bits &= 0x1f; + + if (words) { + if (memcmp(a1, a2, words << 2)) + return 0; + } + if (bits) { + __u32 w1, w2; + __u32 mask; + + w1 = a1[words]; + w2 = a2[words]; + + mask = htonl((0xffffffff) << (32 - bits)); + + if ((w1 ^ w2) & mask) + return 0; + } + + return 1; +} + + +static int tcpdiag_bc_run(const void *bc, int len, + const struct tcpdiag_entry *entry) +{ + while (len > 0) { + int yes = 1; + const struct tcpdiag_bc_op *op = bc; + + switch (op->code) { + case TCPDIAG_BC_NOP: + break; + case TCPDIAG_BC_JMP: + yes = 0; + break; + case TCPDIAG_BC_S_GE: + yes = entry->sport >= op[1].no; + break; + case TCPDIAG_BC_S_LE: + yes = entry->dport <= op[1].no; + break; + case TCPDIAG_BC_D_GE: + yes = entry->dport >= op[1].no; + break; + case TCPDIAG_BC_D_LE: + yes = entry->dport <= op[1].no; + break; + case TCPDIAG_BC_AUTO: + yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); + break; + case TCPDIAG_BC_S_COND: + case TCPDIAG_BC_D_COND: + { + struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1); + u32 *addr; + + if (cond->port != -1 && + cond->port != (op->code == TCPDIAG_BC_S_COND ? + entry->sport : entry->dport)) { + yes = 0; + break; + } + + if (cond->prefix_len == 0) + break; + + if (op->code == TCPDIAG_BC_S_COND) + addr = entry->saddr; + else + addr = entry->daddr; + + if (bitstring_match(addr, cond->addr, cond->prefix_len)) + break; + if (entry->family == AF_INET6 && + cond->family == AF_INET) { + if (addr[0] == 0 && addr[1] == 0 && + addr[2] == htonl(0xffff) && + bitstring_match(addr+3, cond->addr, cond->prefix_len)) + break; + } + yes = 0; + break; + } + } + + if (yes) { + len -= op->yes; + bc += op->yes; + } else { + len -= op->no; + bc += op->no; + } + } + return (len == 0); +} + +static int valid_cc(const void *bc, int len, int cc) +{ + while (len >= 0) { + const struct tcpdiag_bc_op *op = bc; + + if (cc > len) + return 0; + if (cc == len) + return 1; + if (op->yes < 4) + return 0; + len -= op->yes; + bc += op->yes; + } + return 0; +} + +static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len) +{ + const unsigned char *bc = bytecode; + int len = bytecode_len; + + while (len > 0) { + struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc; + +//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); + switch (op->code) { + case TCPDIAG_BC_AUTO: + case TCPDIAG_BC_S_COND: + case TCPDIAG_BC_D_COND: + case TCPDIAG_BC_S_GE: + case TCPDIAG_BC_S_LE: + case TCPDIAG_BC_D_GE: + case TCPDIAG_BC_D_LE: + if (op->yes < 4 || op->yes > len+4) + return -EINVAL; + case TCPDIAG_BC_JMP: + if (op->no < 4 || op->no > len+4) + return -EINVAL; + if (op->no < len && + !valid_cc(bytecode, bytecode_len, len-op->no)) + return -EINVAL; + break; + case TCPDIAG_BC_NOP: + if (op->yes < 4 || op->yes > len+4) + return -EINVAL; + break; + default: + return -EINVAL; + } + bc += op->yes; + len -= op->yes; + } + return len == 0 ? 0 : -EINVAL; +} + +static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, + struct netlink_callback *cb) +{ + struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + + if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + struct tcpdiag_entry entry; + struct rtattr *bc = (struct rtattr *)(r + 1); + struct inet_sock *inet = inet_sk(sk); + + entry.family = sk->sk_family; +#ifdef CONFIG_IP_TCPDIAG_IPV6 + if (entry.family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + entry.saddr = np->rcv_saddr.s6_addr32; + entry.daddr = np->daddr.s6_addr32; + } else +#endif + { + entry.saddr = &inet->rcv_saddr; + entry.daddr = &inet->daddr; + } + entry.sport = inet->num; + entry.dport = ntohs(inet->dport); + entry.userlocks = sk->sk_userlocks; + + if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + return 0; + } + + return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); +} + +static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, + struct open_request *req, + u32 pid, u32 seq) +{ + struct inet_sock *inet = inet_sk(sk); + unsigned char *b = skb->tail; + struct tcpdiagmsg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); + nlh->nlmsg_flags = NLM_F_MULTI; + r = NLMSG_DATA(nlh); + + r->tcpdiag_family = sk->sk_family; + r->tcpdiag_state = TCP_SYN_RECV; + r->tcpdiag_timer = 1; + r->tcpdiag_retrans = req->retrans; + + r->id.tcpdiag_if = sk->sk_bound_dev_if; + r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req; + r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); + + tmo = req->expires - jiffies; + if (tmo < 0) + tmo = 0; + + r->id.tcpdiag_sport = inet->sport; + r->id.tcpdiag_dport = req->rmt_port; + r->id.tcpdiag_src[0] = req->af.v4_req.loc_addr; + r->id.tcpdiag_dst[0] = req->af.v4_req.rmt_addr; + r->tcpdiag_expires = jiffies_to_msecs(tmo), + r->tcpdiag_rqueue = 0; + r->tcpdiag_wqueue = 0; + r->tcpdiag_uid = sock_i_uid(sk); + r->tcpdiag_inode = 0; +#ifdef CONFIG_IP_TCPDIAG_IPV6 + if (r->tcpdiag_family == AF_INET6) { + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, + &req->af.v6_req.loc_addr); + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, + &req->af.v6_req.rmt_addr); + } +#endif + nlh->nlmsg_len = skb->tail - b; + + return skb->len; + +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, + struct netlink_callback *cb) +{ + struct tcpdiag_entry entry; + struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_listen_opt *lopt; + struct rtattr *bc = NULL; + struct inet_sock *inet = inet_sk(sk); + int j, s_j; + int reqnum, s_reqnum; + int err = 0; + + s_j = cb->args[3]; + s_reqnum = cb->args[4]; + + if (s_j > 0) + s_j--; + + entry.family = sk->sk_family; + + read_lock_bh(&tp->syn_wait_lock); + + lopt = tp->listen_opt; + if (!lopt || !lopt->qlen) + goto out; + + if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + bc = (struct rtattr *)(r + 1); + entry.sport = inet->num; + entry.userlocks = sk->sk_userlocks; + } + + for (j = s_j; j < TCP_SYNQ_HSIZE; j++) { + struct open_request *req, *head = lopt->syn_table[j]; + + reqnum = 0; + for (req = head; req; reqnum++, req = req->dl_next) { + if (reqnum < s_reqnum) + continue; + if (r->id.tcpdiag_dport != req->rmt_port && + r->id.tcpdiag_dport) + continue; + + if (bc) { + entry.saddr = +#ifdef CONFIG_IP_TCPDIAG_IPV6 + (entry.family == AF_INET6) ? + req->af.v6_req.loc_addr.s6_addr32 : +#endif + &req->af.v4_req.loc_addr; + entry.daddr = +#ifdef CONFIG_IP_TCPDIAG_IPV6 + (entry.family == AF_INET6) ? + req->af.v6_req.rmt_addr.s6_addr32 : +#endif + &req->af.v4_req.rmt_addr; + entry.dport = ntohs(req->rmt_port); + + if (!tcpdiag_bc_run(RTA_DATA(bc), + RTA_PAYLOAD(bc), &entry)) + continue; + } + + err = tcpdiag_fill_req(skb, sk, req, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq); + if (err < 0) { + cb->args[3] = j + 1; + cb->args[4] = reqnum; + goto out; + } + } + + s_reqnum = 0; + } + +out: + read_unlock_bh(&tp->syn_wait_lock); + + return err; +} + +static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int i, num; + int s_i, s_num; + struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + + s_i = cb->args[1]; + s_num = num = cb->args[2]; + + if (cb->args[0] == 0) { + if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) + goto skip_listen_ht; + tcp_listen_lock(); + for (i = s_i; i < TCP_LHTABLE_SIZE; i++) { + struct sock *sk; + struct hlist_node *node; + + num = 0; + sk_for_each(sk, node, &tcp_listening_hash[i]) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) { + num++; + continue; + } + + if (r->id.tcpdiag_sport != inet->sport && + r->id.tcpdiag_sport) + goto next_listen; + + if (!(r->tcpdiag_states&TCPF_LISTEN) || + r->id.tcpdiag_dport || + cb->args[3] > 0) + goto syn_recv; + + if (tcpdiag_dump_sock(skb, sk, cb) < 0) { + tcp_listen_unlock(); + goto done; + } + +syn_recv: + if (!(r->tcpdiag_states&TCPF_SYN_RECV)) + goto next_listen; + + if (tcpdiag_dump_reqs(skb, sk, cb) < 0) { + tcp_listen_unlock(); + goto done; + } + +next_listen: + cb->args[3] = 0; + cb->args[4] = 0; + ++num; + } + + s_num = 0; + cb->args[3] = 0; + cb->args[4] = 0; + } + tcp_listen_unlock(); +skip_listen_ht: + cb->args[0] = 1; + s_i = num = s_num = 0; + } + + if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) + return skb->len; + + for (i = s_i; i < tcp_ehash_size; i++) { + struct tcp_ehash_bucket *head = &tcp_ehash[i]; + struct sock *sk; + struct hlist_node *node; + + if (i > s_i) + s_num = 0; + + read_lock_bh(&head->lock); + + num = 0; + sk_for_each(sk, node, &head->chain) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_normal; + if (!(r->tcpdiag_states & (1 << sk->sk_state))) + goto next_normal; + if (r->id.tcpdiag_sport != inet->sport && + r->id.tcpdiag_sport) + goto next_normal; + if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport) + goto next_normal; + if (tcpdiag_dump_sock(skb, sk, cb) < 0) { + read_unlock_bh(&head->lock); + goto done; + } +next_normal: + ++num; + } + + if (r->tcpdiag_states&TCPF_TIME_WAIT) { + sk_for_each(sk, node, + &tcp_ehash[i + tcp_ehash_size].chain) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_dying; + if (r->id.tcpdiag_sport != inet->sport && + r->id.tcpdiag_sport) + goto next_dying; + if (r->id.tcpdiag_dport != inet->dport && + r->id.tcpdiag_dport) + goto next_dying; + if (tcpdiag_dump_sock(skb, sk, cb) < 0) { + read_unlock_bh(&head->lock); + goto done; + } +next_dying: + ++num; + } + } + read_unlock_bh(&head->lock); + } + +done: + cb->args[1] = i; + cb->args[2] = num; + return skb->len; +} + +static int tcpdiag_dump_done(struct netlink_callback *cb) +{ + return 0; +} + + +static __inline__ int +tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) + return 0; + + if (nlh->nlmsg_type != TCPDIAG_GETSOCK) + goto err_inval; + + if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) + goto err_inval; + + if (nlh->nlmsg_flags&NLM_F_DUMP) { + if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) { + struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq)); + if (rta->rta_type != TCPDIAG_REQ_BYTECODE || + rta->rta_len < 8 || + rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq))) + goto err_inval; + if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) + goto err_inval; + } + return netlink_dump_start(tcpnl, skb, nlh, + tcpdiag_dump, + tcpdiag_dump_done); + } else { + return tcpdiag_get_exact(skb, nlh); + } + +err_inval: + return -EINVAL; +} + + +static inline void tcpdiag_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr * nlh; + + if (skb->len >= NLMSG_SPACE(0)) { + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return; + err = tcpdiag_rcv_msg(skb, nlh); + if (err || nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, err); + } +} + +static void tcpdiag_rcv(struct sock *sk, int len) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + tcpdiag_rcv_skb(skb); + kfree_skb(skb); + } +} + +static int __init tcpdiag_init(void) +{ + tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv); + if (tcpnl == NULL) + return -ENOMEM; + return 0; +} + +static void __exit tcpdiag_exit(void) +{ + sock_release(tcpnl->sk_socket); +} + +module_init(tcpdiag_init); +module_exit(tcpdiag_exit); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c new file mode 100644 index 000000000000..250492735902 --- /dev/null +++ b/net/ipv4/tcp_input.c @@ -0,0 +1,4959 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + */ + +/* + * Changes: + * Pedro Roque : Fast Retransmit/Recovery. + * Two receive queues. + * Retransmit queue handled by TCP. + * Better retransmit timer handling. + * New congestion avoidance. + * Header prediction. + * Variable renaming. + * + * Eric : Fast Retransmit. + * Randy Scott : MSS option defines. + * Eric Schenk : Fixes to slow start algorithm. + * Eric Schenk : Yet another double ACK bug. + * Eric Schenk : Delayed ACK bug fixes. + * Eric Schenk : Floyd style fast retrans war avoidance. + * David S. Miller : Don't allow zero congestion window. + * Eric Schenk : Fix retransmitter so that it sends + * next packet on ack of previous packet. + * Andi Kleen : Moved open_request checking here + * and process RSTs for open_requests. + * Andi Kleen : Better prune_queue, and other fixes. + * Andrey Savochkin: Fix RTT measurements in the presnce of + * timestamps. + * Andrey Savochkin: Check sequence numbers correctly when + * removing SACKs due to in sequence incoming + * data segments. + * Andi Kleen: Make sure we never ack data there is not + * enough room for. Also make this condition + * a fatal error if it might still happen. + * Andi Kleen: Add tcp_measure_rcv_mss to make + * connections with MSS<min(MTU,ann. MSS) + * work without delayed acks. + * Andi Kleen: Process packets with PSH set in the + * fast path. + * J Hadi Salim: ECN support + * Andrei Gurtov, + * Pasi Sarolahti, + * Panu Kuhlberg: Experimental audit of TCP (re)transmission + * engine. Lots of bugs are found. + * Pasi Sarolahti: F-RTO for dealing with spurious RTOs + * Angelo Dell'Aera: TCP Westwood+ support + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sysctl.h> +#include <net/tcp.h> +#include <net/inet_common.h> +#include <linux/ipsec.h> +#include <asm/unaligned.h> + +int sysctl_tcp_timestamps = 1; +int sysctl_tcp_window_scaling = 1; +int sysctl_tcp_sack = 1; +int sysctl_tcp_fack = 1; +int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; +int sysctl_tcp_ecn; +int sysctl_tcp_dsack = 1; +int sysctl_tcp_app_win = 31; +int sysctl_tcp_adv_win_scale = 2; + +int sysctl_tcp_stdurg; +int sysctl_tcp_rfc1337; +int sysctl_tcp_max_orphans = NR_FILE; +int sysctl_tcp_frto; +int sysctl_tcp_nometrics_save; +int sysctl_tcp_westwood; +int sysctl_tcp_vegas_cong_avoid; + +int sysctl_tcp_moderate_rcvbuf = 1; + +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 +int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT; +int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT; +int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT; +int sysctl_tcp_bic = 1; +int sysctl_tcp_bic_fast_convergence = 1; +int sysctl_tcp_bic_low_window = 14; +int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ + +#define FLAG_DATA 0x01 /* Incoming frame contained data. */ +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK. */ +#define FLAG_ECE 0x40 /* ECE in this ACK */ +#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ +#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ + +#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) +#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) +#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) + +#define IsReno(tp) ((tp)->rx_opt.sack_ok == 0) +#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2) +#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4) + +#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) + +/* Adapt the MSS value used to make delayed ack decision to the + * real world. + */ +static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, + struct sk_buff *skb) +{ + unsigned int len, lss; + + lss = tp->ack.last_seg_size; + tp->ack.last_seg_size = 0; + + /* skb->len may jitter because of SACKs, even if peer + * sends good full-sized frames. + */ + len = skb->len; + if (len >= tp->ack.rcv_mss) { + tp->ack.rcv_mss = len; + } else { + /* Otherwise, we make more careful check taking into account, + * that SACKs block is variable. + * + * "len" is invariant segment length, including TCP header. + */ + len += skb->data - skb->h.raw; + if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || + /* If PSH is not set, packet should be + * full sized, provided peer TCP is not badly broken. + * This observation (if it is correct 8)) allows + * to handle super-low mtu links fairly. + */ + (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && + !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) { + /* Subtract also invariant (if peer is RFC compliant), + * tcp header plus fixed timestamp option length. + * Resulting "len" is MSS free of SACK jitter. + */ + len -= tp->tcp_header_len; + tp->ack.last_seg_size = len; + if (len == lss) { + tp->ack.rcv_mss = len; + return; + } + } + tp->ack.pending |= TCP_ACK_PUSHED; + } +} + +static void tcp_incr_quickack(struct tcp_sock *tp) +{ + unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss); + + if (quickacks==0) + quickacks=2; + if (quickacks > tp->ack.quick) + tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS); +} + +void tcp_enter_quickack_mode(struct tcp_sock *tp) +{ + tcp_incr_quickack(tp); + tp->ack.pingpong = 0; + tp->ack.ato = TCP_ATO_MIN; +} + +/* Send ACKs quickly, if "quick" count is not exhausted + * and the session is not interactive. + */ + +static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp) +{ + return (tp->ack.quick && !tp->ack.pingpong); +} + +/* Buffer size and advertised window tuning. + * + * 1. Tuning sk->sk_sndbuf, when connection enters established state. + */ + +static void tcp_fixup_sndbuf(struct sock *sk) +{ + int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + + sizeof(struct sk_buff); + + if (sk->sk_sndbuf < 3 * sndmem) + sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); +} + +/* 2. Tuning advertised window (window_clamp, rcv_ssthresh) + * + * All tcp_full_space() is split to two parts: "network" buffer, allocated + * forward and advertised in receiver window (tp->rcv_wnd) and + * "application buffer", required to isolate scheduling/application + * latencies from network. + * window_clamp is maximal advertised window. It can be less than + * tcp_full_space(), in this case tcp_full_space() - window_clamp + * is reserved for "application" buffer. The less window_clamp is + * the smoother our behaviour from viewpoint of network, but the lower + * throughput and the higher sensitivity of the connection to losses. 8) + * + * rcv_ssthresh is more strict window_clamp used at "slow start" + * phase to predict further behaviour of this connection. + * It is used for two goals: + * - to enforce header prediction at sender, even when application + * requires some significant "application buffer". It is check #1. + * - to prevent pruning of receive queue because of misprediction + * of receiver window. Check #2. + * + * The scheme does not work when sender sends good segments opening + * window and then starts to feed us spagetti. But it should work + * in common situations. Otherwise, we have to rely on queue collapsing. + */ + +/* Slow part of check#2. */ +static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) +{ + /* Optimize this! */ + int truesize = tcp_win_from_space(skb->truesize)/2; + int window = tcp_full_space(sk)/2; + + while (tp->rcv_ssthresh <= window) { + if (truesize <= skb->len) + return 2*tp->ack.rcv_mss; + + truesize >>= 1; + window >>= 1; + } + return 0; +} + +static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) +{ + /* Check #1 */ + if (tp->rcv_ssthresh < tp->window_clamp && + (int)tp->rcv_ssthresh < tcp_space(sk) && + !tcp_memory_pressure) { + int incr; + + /* Check #2. Increase window, if skb with such overhead + * will fit to rcvbuf in future. + */ + if (tcp_win_from_space(skb->truesize) <= skb->len) + incr = 2*tp->advmss; + else + incr = __tcp_grow_window(sk, tp, skb); + + if (incr) { + tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); + tp->ack.quick |= 1; + } + } +} + +/* 3. Tuning rcvbuf, when connection enters established state. */ + +static void tcp_fixup_rcvbuf(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); + + /* Try to select rcvbuf so that 4 mss-sized segments + * will fit to window and correspoding skbs will fit to our rcvbuf. + * (was 3; 4 is minimum to allow fast retransmit to work.) + */ + while (tcp_win_from_space(rcvmem) < tp->advmss) + rcvmem += 128; + if (sk->sk_rcvbuf < 4 * rcvmem) + sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); +} + +/* 4. Try to fixup all. It is made iimediately after connection enters + * established state. + */ +static void tcp_init_buffer_space(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int maxwin; + + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + tcp_fixup_rcvbuf(sk); + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) + tcp_fixup_sndbuf(sk); + + tp->rcvq_space.space = tp->rcv_wnd; + + maxwin = tcp_full_space(sk); + + if (tp->window_clamp >= maxwin) { + tp->window_clamp = maxwin; + + if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) + tp->window_clamp = max(maxwin - + (maxwin >> sysctl_tcp_app_win), + 4 * tp->advmss); + } + + /* Force reservation of one segment. */ + if (sysctl_tcp_app_win && + tp->window_clamp > 2 * tp->advmss && + tp->window_clamp + tp->advmss > maxwin) + tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); + + tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static void init_bictcp(struct tcp_sock *tp) +{ + tp->bictcp.cnt = 0; + + tp->bictcp.last_max_cwnd = 0; + tp->bictcp.last_cwnd = 0; + tp->bictcp.last_stamp = 0; +} + +/* 5. Recalculate window clamp after socket hit its memory bounds. */ +static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb; + unsigned int app_win = tp->rcv_nxt - tp->copied_seq; + int ofo_win = 0; + + tp->ack.quick = 0; + + skb_queue_walk(&tp->out_of_order_queue, skb) { + ofo_win += skb->len; + } + + /* If overcommit is due to out of order segments, + * do not clamp window. Try to expand rcvbuf instead. + */ + if (ofo_win) { + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); + } + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { + app_win += ofo_win; + if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) + app_win >>= 1; + if (app_win > tp->ack.rcv_mss) + app_win -= tp->ack.rcv_mss; + app_win = max(app_win, 2U*tp->advmss); + + if (!ofo_win) + tp->window_clamp = min(tp->window_clamp, app_win); + tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); + } +} + +/* Receiver "autotuning" code. + * + * The algorithm for RTT estimation w/o timestamps is based on + * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. + * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps> + * + * More detail on this code can be found at + * <http://www.psc.edu/~jheffner/senior_thesis.ps>, + * though this reference is out of date. A new paper + * is pending. + */ +static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) +{ + u32 new_sample = tp->rcv_rtt_est.rtt; + long m = sample; + + if (m == 0) + m = 1; + + if (new_sample != 0) { + /* If we sample in larger samples in the non-timestamp + * case, we could grossly overestimate the RTT especially + * with chatty applications or bulk transfer apps which + * are stalled on filesystem I/O. + * + * Also, since we are only going for a minimum in the + * non-timestamp case, we do not smoothe things out + * else with timestamps disabled convergance takes too + * long. + */ + if (!win_dep) { + m -= (new_sample >> 3); + new_sample += m; + } else if (m < new_sample) + new_sample = m << 3; + } else { + /* No previous mesaure. */ + new_sample = m << 3; + } + + if (tp->rcv_rtt_est.rtt != new_sample) + tp->rcv_rtt_est.rtt = new_sample; +} + +static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) +{ + if (tp->rcv_rtt_est.time == 0) + goto new_measure; + if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) + return; + tcp_rcv_rtt_update(tp, + jiffies - tp->rcv_rtt_est.time, + 1); + +new_measure: + tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; + tp->rcv_rtt_est.time = tcp_time_stamp; +} + +static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (tp->rx_opt.rcv_tsecr && + (TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss)) + tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); +} + +/* + * This function should be called every time data is copied to user space. + * It calculates the appropriate TCP receive buffer space. + */ +void tcp_rcv_space_adjust(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int time; + int space; + + if (tp->rcvq_space.time == 0) + goto new_measure; + + time = tcp_time_stamp - tp->rcvq_space.time; + if (time < (tp->rcv_rtt_est.rtt >> 3) || + tp->rcv_rtt_est.rtt == 0) + return; + + space = 2 * (tp->copied_seq - tp->rcvq_space.seq); + + space = max(tp->rcvq_space.space, space); + + if (tp->rcvq_space.space != space) { + int rcvmem; + + tp->rcvq_space.space = space; + + if (sysctl_tcp_moderate_rcvbuf) { + int new_clamp = space; + + /* Receive space grows, normalize in order to + * take into account packet headers and sk_buff + * structure overhead. + */ + space /= tp->advmss; + if (!space) + space = 1; + rcvmem = (tp->advmss + MAX_TCP_HEADER + + 16 + sizeof(struct sk_buff)); + while (tcp_win_from_space(rcvmem) < tp->advmss) + rcvmem += 128; + space *= rcvmem; + space = min(space, sysctl_tcp_rmem[2]); + if (space > sk->sk_rcvbuf) { + sk->sk_rcvbuf = space; + + /* Make the window clamp follow along. */ + tp->window_clamp = new_clamp; + } + } + } + +new_measure: + tp->rcvq_space.seq = tp->copied_seq; + tp->rcvq_space.time = tcp_time_stamp; +} + +/* There is something which you must keep in mind when you analyze the + * behavior of the tp->ato delayed ack timeout interval. When a + * connection starts up, we want to ack as quickly as possible. The + * problem is that "good" TCP's do slow start at the beginning of data + * transmission. The means that until we send the first few ACK's the + * sender will sit on his end and only queue most of his data, because + * he can only send snd_cwnd unacked packets at any given time. For + * each ACK we send, he increments snd_cwnd and transmits more of his + * queue. -DaveM + */ +static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 now; + + tcp_schedule_ack(tp); + + tcp_measure_rcv_mss(tp, skb); + + tcp_rcv_rtt_measure(tp); + + now = tcp_time_stamp; + + if (!tp->ack.ato) { + /* The _first_ data packet received, initialize + * delayed ACK engine. + */ + tcp_incr_quickack(tp); + tp->ack.ato = TCP_ATO_MIN; + } else { + int m = now - tp->ack.lrcvtime; + + if (m <= TCP_ATO_MIN/2) { + /* The fastest case is the first. */ + tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; + } else if (m < tp->ack.ato) { + tp->ack.ato = (tp->ack.ato>>1) + m; + if (tp->ack.ato > tp->rto) + tp->ack.ato = tp->rto; + } else if (m > tp->rto) { + /* Too long gap. Apparently sender falled to + * restart window, so that we send ACKs quickly. + */ + tcp_incr_quickack(tp); + sk_stream_mem_reclaim(sk); + } + } + tp->ack.lrcvtime = now; + + TCP_ECN_check_ce(tp, skb); + + if (skb->len >= 128) + tcp_grow_window(sk, tp, skb); +} + +/* When starting a new connection, pin down the current choice of + * congestion algorithm. + */ +void tcp_ca_init(struct tcp_sock *tp) +{ + if (sysctl_tcp_westwood) + tp->adv_cong = TCP_WESTWOOD; + else if (sysctl_tcp_bic) + tp->adv_cong = TCP_BIC; + else if (sysctl_tcp_vegas_cong_avoid) { + tp->adv_cong = TCP_VEGAS; + tp->vegas.baseRTT = 0x7fffffff; + tcp_vegas_enable(tp); + } +} + +/* Do RTT sampling needed for Vegas. + * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) +{ + __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < tp->vegas.baseRTT) + tp->vegas.baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); + tp->vegas.cntRTT++; +} + +/* Called to compute a smoothed rtt estimate. The data fed to this + * routine either comes from timestamps, or from segments that were + * known _not_ to have been retransmitted [see Karn/Partridge + * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 + * piece by Van Jacobson. + * NOTE: the next three routines used to be one big routine. + * To save cycles in the RFC 1323 implementation it was better to break + * it up into three procedures. -- erics + */ +static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) +{ + long m = mrtt; /* RTT */ + + if (tcp_vegas_enabled(tp)) + vegas_rtt_calc(tp, mrtt); + + /* The following amusing code comes from Jacobson's + * article in SIGCOMM '88. Note that rtt and mdev + * are scaled versions of rtt and mean deviation. + * This is designed to be as fast as possible + * m stands for "measurement". + * + * On a 1990 paper the rto value is changed to: + * RTO = rtt + 4 * mdev + * + * Funny. This algorithm seems to be very broken. + * These formulae increase RTO, when it should be decreased, increase + * too slowly, when it should be incresed fastly, decrease too fastly + * etc. I guess in BSD RTO takes ONE value, so that it is absolutely + * does not matter how to _calculate_ it. Seems, it was trap + * that VJ failed to avoid. 8) + */ + if(m == 0) + m = 1; + if (tp->srtt != 0) { + m -= (tp->srtt >> 3); /* m is now error in rtt est */ + tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (m < 0) { + m = -m; /* m is now abs(error) */ + m -= (tp->mdev >> 2); /* similar update on mdev */ + /* This is similar to one of Eifel findings. + * Eifel blocks mdev updates when rtt decreases. + * This solution is a bit different: we use finer gain + * for mdev in this case (alpha*beta). + * Like Eifel it also prevents growth of rto, + * but also it limits too fast rto decreases, + * happening in pure Eifel. + */ + if (m > 0) + m >>= 3; + } else { + m -= (tp->mdev >> 2); /* similar update on mdev */ + } + tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ + if (tp->mdev > tp->mdev_max) { + tp->mdev_max = tp->mdev; + if (tp->mdev_max > tp->rttvar) + tp->rttvar = tp->mdev_max; + } + if (after(tp->snd_una, tp->rtt_seq)) { + if (tp->mdev_max < tp->rttvar) + tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2; + tp->rtt_seq = tp->snd_nxt; + tp->mdev_max = TCP_RTO_MIN; + } + } else { + /* no previous measure. */ + tp->srtt = m<<3; /* take the measured time to be rtt */ + tp->mdev = m<<1; /* make sure rto = 3*rtt */ + tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); + tp->rtt_seq = tp->snd_nxt; + } + + tcp_westwood_update_rtt(tp, tp->srtt >> 3); +} + +/* Calculate rto without backoff. This is the second half of Van Jacobson's + * routine referred to above. + */ +static inline void tcp_set_rto(struct tcp_sock *tp) +{ + /* Old crap is replaced with new one. 8) + * + * More seriously: + * 1. If rtt variance happened to be less 50msec, it is hallucination. + * It cannot be less due to utterly erratic ACK generation made + * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ + * to do with delayed acks, because at cwnd>2 true delack timeout + * is invisible. Actually, Linux-2.4 also generates erratic + * ACKs in some curcumstances. + */ + tp->rto = (tp->srtt >> 3) + tp->rttvar; + + /* 2. Fixups made earlier cannot be right. + * If we do not estimate RTO correctly without them, + * all the algo is pure shit and should be replaced + * with correct one. It is exaclty, which we pretend to do. + */ +} + +/* NOTE: clamping at TCP_RTO_MIN is not required, current algo + * guarantees that rto is higher. + */ +static inline void tcp_bound_rto(struct tcp_sock *tp) +{ + if (tp->rto > TCP_RTO_MAX) + tp->rto = TCP_RTO_MAX; +} + +/* Save metrics learned by this TCP session. + This function is called only, when TCP finishes successfully + i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. + */ +void tcp_update_metrics(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + + if (sysctl_tcp_nometrics_save) + return; + + dst_confirm(dst); + + if (dst && (dst->flags&DST_HOST)) { + int m; + + if (tp->backoff || !tp->srtt) { + /* This session failed to estimate rtt. Why? + * Probably, no packets returned in time. + * Reset our results. + */ + if (!(dst_metric_locked(dst, RTAX_RTT))) + dst->metrics[RTAX_RTT-1] = 0; + return; + } + + m = dst_metric(dst, RTAX_RTT) - tp->srtt; + + /* If newly calculated rtt larger than stored one, + * store new one. Otherwise, use EWMA. Remember, + * rtt overestimation is always better than underestimation. + */ + if (!(dst_metric_locked(dst, RTAX_RTT))) { + if (m <= 0) + dst->metrics[RTAX_RTT-1] = tp->srtt; + else + dst->metrics[RTAX_RTT-1] -= (m>>3); + } + + if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { + if (m < 0) + m = -m; + + /* Scale deviation to rttvar fixed point */ + m >>= 1; + if (m < tp->mdev) + m = tp->mdev; + + if (m >= dst_metric(dst, RTAX_RTTVAR)) + dst->metrics[RTAX_RTTVAR-1] = m; + else + dst->metrics[RTAX_RTTVAR-1] -= + (dst->metrics[RTAX_RTTVAR-1] - m)>>2; + } + + if (tp->snd_ssthresh >= 0xFFFF) { + /* Slow start still did not finish. */ + if (dst_metric(dst, RTAX_SSTHRESH) && + !dst_metric_locked(dst, RTAX_SSTHRESH) && + (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) + dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; + if (!dst_metric_locked(dst, RTAX_CWND) && + tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) + dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; + } else if (tp->snd_cwnd > tp->snd_ssthresh && + tp->ca_state == TCP_CA_Open) { + /* Cong. avoidance phase, cwnd is reliable. */ + if (!dst_metric_locked(dst, RTAX_SSTHRESH)) + dst->metrics[RTAX_SSTHRESH-1] = + max(tp->snd_cwnd >> 1, tp->snd_ssthresh); + if (!dst_metric_locked(dst, RTAX_CWND)) + dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1; + } else { + /* Else slow start did not finish, cwnd is non-sense, + ssthresh may be also invalid. + */ + if (!dst_metric_locked(dst, RTAX_CWND)) + dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1; + if (dst->metrics[RTAX_SSTHRESH-1] && + !dst_metric_locked(dst, RTAX_SSTHRESH) && + tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1]) + dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; + } + + if (!dst_metric_locked(dst, RTAX_REORDERING)) { + if (dst->metrics[RTAX_REORDERING-1] < tp->reordering && + tp->reordering != sysctl_tcp_reordering) + dst->metrics[RTAX_REORDERING-1] = tp->reordering; + } + } +} + +/* Numbers are taken from RFC2414. */ +__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) +{ + __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); + + if (!cwnd) { + if (tp->mss_cache_std > 1460) + cwnd = 2; + else + cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; + } + return min_t(__u32, cwnd, tp->snd_cwnd_clamp); +} + +/* Initialize metrics on socket. */ + +static void tcp_init_metrics(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + + if (dst == NULL) + goto reset; + + dst_confirm(dst); + + if (dst_metric_locked(dst, RTAX_CWND)) + tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); + if (dst_metric(dst, RTAX_SSTHRESH)) { + tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + } + if (dst_metric(dst, RTAX_REORDERING) && + tp->reordering != dst_metric(dst, RTAX_REORDERING)) { + tp->rx_opt.sack_ok &= ~2; + tp->reordering = dst_metric(dst, RTAX_REORDERING); + } + + if (dst_metric(dst, RTAX_RTT) == 0) + goto reset; + + if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) + goto reset; + + /* Initial rtt is determined from SYN,SYN-ACK. + * The segment is small and rtt may appear much + * less than real one. Use per-dst memory + * to make it more realistic. + * + * A bit of theory. RTT is time passed after "normal" sized packet + * is sent until it is ACKed. In normal curcumstances sending small + * packets force peer to delay ACKs and calculation is correct too. + * The algorithm is adaptive and, provided we follow specs, it + * NEVER underestimate RTT. BUT! If peer tries to make some clever + * tricks sort of "quick acks" for time long enough to decrease RTT + * to low value, and then abruptly stops to do it and starts to delay + * ACKs, wait for troubles. + */ + if (dst_metric(dst, RTAX_RTT) > tp->srtt) { + tp->srtt = dst_metric(dst, RTAX_RTT); + tp->rtt_seq = tp->snd_nxt; + } + if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) { + tp->mdev = dst_metric(dst, RTAX_RTTVAR); + tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); + } + tcp_set_rto(tp); + tcp_bound_rto(tp); + if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) + goto reset; + tp->snd_cwnd = tcp_init_cwnd(tp, dst); + tp->snd_cwnd_stamp = tcp_time_stamp; + return; + +reset: + /* Play conservative. If timestamps are not + * supported, TCP will fail to recalculate correct + * rtt, if initial rto is too small. FORGET ALL AND RESET! + */ + if (!tp->rx_opt.saw_tstamp && tp->srtt) { + tp->srtt = 0; + tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; + tp->rto = TCP_TIMEOUT_INIT; + } +} + +static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) +{ + if (metric > tp->reordering) { + tp->reordering = min(TCP_MAX_REORDERING, metric); + + /* This exciting event is worth to be remembered. 8) */ + if (ts) + NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); + else if (IsReno(tp)) + NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); + else if (IsFack(tp)) + NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); + else + NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); +#if FASTRETRANS_DEBUG > 1 + printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", + tp->rx_opt.sack_ok, tp->ca_state, + tp->reordering, + tp->fackets_out, + tp->sacked_out, + tp->undo_marker ? tp->undo_retrans : 0); +#endif + /* Disable FACK yet. */ + tp->rx_opt.sack_ok &= ~2; + } +} + +/* This procedure tags the retransmission queue when SACKs arrive. + * + * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). + * Packets in queue with these bits set are counted in variables + * sacked_out, retrans_out and lost_out, correspondingly. + * + * Valid combinations are: + * Tag InFlight Description + * 0 1 - orig segment is in flight. + * S 0 - nothing flies, orig reached receiver. + * L 0 - nothing flies, orig lost by net. + * R 2 - both orig and retransmit are in flight. + * L|R 1 - orig is lost, retransmit is in flight. + * S|R 1 - orig reached receiver, retrans is still in flight. + * (L|S|R is logically valid, it could occur when L|R is sacked, + * but it is equivalent to plain S and code short-curcuits it to S. + * L|S is logically invalid, it would mean -1 packet in flight 8)) + * + * These 6 states form finite state machine, controlled by the following events: + * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) + * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) + * 3. Loss detection event of one of three flavors: + * A. Scoreboard estimator decided the packet is lost. + * A'. Reno "three dupacks" marks head of queue lost. + * A''. Its FACK modfication, head until snd.fack is lost. + * B. SACK arrives sacking data transmitted after never retransmitted + * hole was sent out. + * C. SACK arrives sacking SND.NXT at the moment, when the + * segment was retransmitted. + * 4. D-SACK added new rule: D-SACK changes any tag to S. + * + * It is pleasant to note, that state diagram turns out to be commutative, + * so that we are allowed not to be bothered by order of our actions, + * when multiple events arrive simultaneously. (see the function below). + * + * Reordering detection. + * -------------------- + * Reordering metric is maximal distance, which a packet can be displaced + * in packet stream. With SACKs we can estimate it: + * + * 1. SACK fills old hole and the corresponding segment was not + * ever retransmitted -> reordering. Alas, we cannot use it + * when segment was retransmitted. + * 2. The last flaw is solved with D-SACK. D-SACK arrives + * for retransmitted and already SACKed segment -> reordering.. + * Both of these heuristics are not used in Loss state, when we cannot + * account for retransmits accurately. + */ +static int +tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; + struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); + int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; + int reord = tp->packets_out; + int prior_fackets; + u32 lost_retrans = 0; + int flag = 0; + int i; + + /* So, SACKs for already sent large segments will be lost. + * Not good, but alternative is to resegment the queue. */ + if (sk->sk_route_caps & NETIF_F_TSO) { + sk->sk_route_caps &= ~NETIF_F_TSO; + sock_set_flag(sk, SOCK_NO_LARGESEND); + tp->mss_cache = tp->mss_cache_std; + } + + if (!tp->sacked_out) + tp->fackets_out = 0; + prior_fackets = tp->fackets_out; + + for (i=0; i<num_sacks; i++, sp++) { + struct sk_buff *skb; + __u32 start_seq = ntohl(sp->start_seq); + __u32 end_seq = ntohl(sp->end_seq); + int fack_count = 0; + int dup_sack = 0; + + /* Check for D-SACK. */ + if (i == 0) { + u32 ack = TCP_SKB_CB(ack_skb)->ack_seq; + + if (before(start_seq, ack)) { + dup_sack = 1; + tp->rx_opt.sack_ok |= 4; + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); + } else if (num_sacks > 1 && + !after(end_seq, ntohl(sp[1].end_seq)) && + !before(start_seq, ntohl(sp[1].start_seq))) { + dup_sack = 1; + tp->rx_opt.sack_ok |= 4; + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); + } + + /* D-SACK for already forgotten data... + * Do dumb counting. */ + if (dup_sack && + !after(end_seq, prior_snd_una) && + after(end_seq, tp->undo_marker)) + tp->undo_retrans--; + + /* Eliminate too old ACKs, but take into + * account more or less fresh ones, they can + * contain valid SACK info. + */ + if (before(ack, prior_snd_una - tp->max_window)) + return 0; + } + + /* Event "B" in the comment above. */ + if (after(end_seq, tp->high_seq)) + flag |= FLAG_DATA_LOST; + + sk_stream_for_retrans_queue(skb, sk) { + u8 sacked = TCP_SKB_CB(skb)->sacked; + int in_sack; + + /* The retransmission queue is always in order, so + * we can short-circuit the walk early. + */ + if(!before(TCP_SKB_CB(skb)->seq, end_seq)) + break; + + fack_count += tcp_skb_pcount(skb); + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && + !before(end_seq, TCP_SKB_CB(skb)->end_seq); + + /* Account D-SACK for retransmitted packet. */ + if ((dup_sack && in_sack) && + (sacked & TCPCB_RETRANS) && + after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + tp->undo_retrans--; + + /* The frame is ACKed. */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) { + if (sacked&TCPCB_RETRANS) { + if ((dup_sack && in_sack) && + (sacked&TCPCB_SACKED_ACKED)) + reord = min(fack_count, reord); + } else { + /* If it was in a hole, we detected reordering. */ + if (fack_count < prior_fackets && + !(sacked&TCPCB_SACKED_ACKED)) + reord = min(fack_count, reord); + } + + /* Nothing to do; acked frame is about to be dropped. */ + continue; + } + + if ((sacked&TCPCB_SACKED_RETRANS) && + after(end_seq, TCP_SKB_CB(skb)->ack_seq) && + (!lost_retrans || after(end_seq, lost_retrans))) + lost_retrans = end_seq; + + if (!in_sack) + continue; + + if (!(sacked&TCPCB_SACKED_ACKED)) { + if (sacked & TCPCB_SACKED_RETRANS) { + /* If the segment is not tagged as lost, + * we do not clear RETRANS, believing + * that retransmission is still in flight. + */ + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp->lost_out -= tcp_skb_pcount(skb); + tp->retrans_out -= tcp_skb_pcount(skb); + } + } else { + /* New sack for not retransmitted frame, + * which was in hole. It is reordering. + */ + if (!(sacked & TCPCB_RETRANS) && + fack_count < prior_fackets) + reord = min(fack_count, reord); + + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + tp->lost_out -= tcp_skb_pcount(skb); + } + } + + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + flag |= FLAG_DATA_SACKED; + tp->sacked_out += tcp_skb_pcount(skb); + + if (fack_count > tp->fackets_out) + tp->fackets_out = fack_count; + } else { + if (dup_sack && (sacked&TCPCB_RETRANS)) + reord = min(fack_count, reord); + } + + /* D-SACK. We can detect redundant retransmission + * in S|R and plain R frames and clear it. + * undo_retrans is decreased above, L|R frames + * are accounted above as well. + */ + if (dup_sack && + (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + } + } + } + + /* Check for lost retransmit. This superb idea is + * borrowed from "ratehalving". Event "C". + * Later note: FACK people cheated me again 8), + * we have to account for reordering! Ugly, + * but should help. + */ + if (lost_retrans && tp->ca_state == TCP_CA_Recovery) { + struct sk_buff *skb; + + sk_stream_for_retrans_queue(skb, sk) { + if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) + break; + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + continue; + if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) && + after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) && + (IsFack(tp) || + !before(lost_retrans, + TCP_SKB_CB(skb)->ack_seq + tp->reordering * + tp->mss_cache_std))) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + + if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tp->lost_out += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + flag |= FLAG_DATA_SACKED; + NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); + } + } + } + } + + tp->left_out = tp->sacked_out + tp->lost_out; + + if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss) + tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0); + +#if FASTRETRANS_DEBUG > 0 + BUG_TRAP((int)tp->sacked_out >= 0); + BUG_TRAP((int)tp->lost_out >= 0); + BUG_TRAP((int)tp->retrans_out >= 0); + BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0); +#endif + return flag; +} + +/* RTO occurred, but do not yet enter loss state. Instead, transmit two new + * segments to see from the next ACKs whether any data was really missing. + * If the RTO was spurious, new ACKs should arrive. + */ +void tcp_enter_frto(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + tp->frto_counter = 1; + + if (tp->ca_state <= TCP_CA_Disorder || + tp->snd_una == tp->high_seq || + (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(tp); + if (!tcp_westwood_ssthresh(tp)) + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + } + + /* Have to clear retransmission markers here to keep the bookkeeping + * in shape, even though we are not yet in Loss state. + * If something was really lost, it is eventually caught up + * in tcp_enter_frto_loss. + */ + tp->retrans_out = 0; + tp->undo_marker = tp->snd_una; + tp->undo_retrans = 0; + + sk_stream_for_retrans_queue(skb, sk) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS; + } + tcp_sync_left_out(tp); + + tcp_set_ca_state(tp, TCP_CA_Open); + tp->frto_highmark = tp->snd_nxt; +} + +/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, + * which indicates that we should follow the traditional RTO recovery, + * i.e. mark everything lost and do go-back-N retransmission. + */ +static void tcp_enter_frto_loss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int cnt = 0; + + tp->sacked_out = 0; + tp->lost_out = 0; + tp->fackets_out = 0; + + sk_stream_for_retrans_queue(skb, sk) { + cnt += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { + + /* Do not mark those segments lost that were + * forward transmitted after RTO + */ + if (!after(TCP_SKB_CB(skb)->end_seq, + tp->frto_highmark)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + } + } else { + tp->sacked_out += tcp_skb_pcount(skb); + tp->fackets_out = cnt; + } + } + tcp_sync_left_out(tp); + + tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1; + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->undo_marker = 0; + tp->frto_counter = 0; + + tp->reordering = min_t(unsigned int, tp->reordering, + sysctl_tcp_reordering); + tcp_set_ca_state(tp, TCP_CA_Loss); + tp->high_seq = tp->frto_highmark; + TCP_ECN_queue_cwr(tp); + + init_bictcp(tp); +} + +void tcp_clear_retrans(struct tcp_sock *tp) +{ + tp->left_out = 0; + tp->retrans_out = 0; + + tp->fackets_out = 0; + tp->sacked_out = 0; + tp->lost_out = 0; + + tp->undo_marker = 0; + tp->undo_retrans = 0; +} + +/* Enter Loss state. If "how" is not zero, forget all SACK information + * and reset tags completely, otherwise preserve SACKs. If receiver + * dropped its ofo queue, we will know this due to reneging detection. + */ +void tcp_enter_loss(struct sock *sk, int how) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int cnt = 0; + + /* Reduce ssthresh if it has not yet been made inside this window. */ + if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || + (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + } + tp->snd_cwnd = 1; + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + + tcp_clear_retrans(tp); + + /* Push undo marker, if it was plain RTO and nothing + * was retransmitted. */ + if (!how) + tp->undo_marker = tp->snd_una; + + sk_stream_for_retrans_queue(skb, sk) { + cnt += tcp_skb_pcount(skb); + if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + tp->undo_marker = 0; + TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + } else { + tp->sacked_out += tcp_skb_pcount(skb); + tp->fackets_out = cnt; + } + } + tcp_sync_left_out(tp); + + tp->reordering = min_t(unsigned int, tp->reordering, + sysctl_tcp_reordering); + tcp_set_ca_state(tp, TCP_CA_Loss); + tp->high_seq = tp->snd_nxt; + TCP_ECN_queue_cwr(tp); +} + +static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb; + + /* If ACK arrived pointing to a remembered SACK, + * it means that our remembered SACKs do not reflect + * real state of receiver i.e. + * receiver _host_ is heavily congested (or buggy). + * Do processing similar to RTO timeout. + */ + if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); + + tcp_enter_loss(sk, 1); + tp->retransmits++; + tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return 1; + } + return 0; +} + +static inline int tcp_fackets_out(struct tcp_sock *tp) +{ + return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; +} + +static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb) +{ + return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto); +} + +static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) +{ + return tp->packets_out && + tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue)); +} + +/* Linux NewReno/SACK/FACK/ECN state machine. + * -------------------------------------- + * + * "Open" Normal state, no dubious events, fast path. + * "Disorder" In all the respects it is "Open", + * but requires a bit more attention. It is entered when + * we see some SACKs or dupacks. It is split of "Open" + * mainly to move some processing from fast path to slow one. + * "CWR" CWND was reduced due to some Congestion Notification event. + * It can be ECN, ICMP source quench, local device congestion. + * "Recovery" CWND was reduced, we are fast-retransmitting. + * "Loss" CWND was reduced due to RTO timeout or SACK reneging. + * + * tcp_fastretrans_alert() is entered: + * - each incoming ACK, if state is not "Open" + * - when arrived ACK is unusual, namely: + * * SACK + * * Duplicate ACK. + * * ECN ECE. + * + * Counting packets in flight is pretty simple. + * + * in_flight = packets_out - left_out + retrans_out + * + * packets_out is SND.NXT-SND.UNA counted in packets. + * + * retrans_out is number of retransmitted segments. + * + * left_out is number of segments left network, but not ACKed yet. + * + * left_out = sacked_out + lost_out + * + * sacked_out: Packets, which arrived to receiver out of order + * and hence not ACKed. With SACKs this number is simply + * amount of SACKed data. Even without SACKs + * it is easy to give pretty reliable estimate of this number, + * counting duplicate ACKs. + * + * lost_out: Packets lost by network. TCP has no explicit + * "loss notification" feedback from network (for now). + * It means that this number can be only _guessed_. + * Actually, it is the heuristics to predict lossage that + * distinguishes different algorithms. + * + * F.e. after RTO, when all the queue is considered as lost, + * lost_out = packets_out and in_flight = retrans_out. + * + * Essentially, we have now two algorithms counting + * lost packets. + * + * FACK: It is the simplest heuristics. As soon as we decided + * that something is lost, we decide that _all_ not SACKed + * packets until the most forward SACK are lost. I.e. + * lost_out = fackets_out - sacked_out and left_out = fackets_out. + * It is absolutely correct estimate, if network does not reorder + * packets. And it loses any connection to reality when reordering + * takes place. We use FACK by default until reordering + * is suspected on the path to this destination. + * + * NewReno: when Recovery is entered, we assume that one segment + * is lost (classic Reno). While we are in Recovery and + * a partial ACK arrives, we assume that one more packet + * is lost (NewReno). This heuristics are the same in NewReno + * and SACK. + * + * Imagine, that's all! Forget about all this shamanism about CWND inflation + * deflation etc. CWND is real congestion window, never inflated, changes + * only according to classic VJ rules. + * + * Really tricky (and requiring careful tuning) part of algorithm + * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). + * The first determines the moment _when_ we should reduce CWND and, + * hence, slow down forward transmission. In fact, it determines the moment + * when we decide that hole is caused by loss, rather than by a reorder. + * + * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill + * holes, caused by lost packets. + * + * And the most logically complicated part of algorithm is undo + * heuristics. We detect false retransmits due to both too early + * fast retransmit (reordering) and underestimated RTO, analyzing + * timestamps and D-SACKs. When we detect that some segments were + * retransmitted by mistake and CWND reduction was wrong, we undo + * window reduction and abort recovery phase. This logic is hidden + * inside several functions named tcp_try_undo_<something>. + */ + +/* This function decides, when we should leave Disordered state + * and enter Recovery phase, reducing congestion window. + * + * Main question: may we further continue forward transmission + * with the same cwnd? + */ +static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) +{ + __u32 packets_out; + + /* Trick#1: The loss is proven. */ + if (tp->lost_out) + return 1; + + /* Not-A-Trick#2 : Classic rule... */ + if (tcp_fackets_out(tp) > tp->reordering) + return 1; + + /* Trick#3 : when we use RFC2988 timer restart, fast + * retransmit can be triggered by timeout of queue head. + */ + if (tcp_head_timedout(sk, tp)) + return 1; + + /* Trick#4: It is still not OK... But will it be useful to delay + * recovery more? + */ + packets_out = tp->packets_out; + if (packets_out <= tp->reordering && + tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && + !tcp_may_send_now(sk, tp)) { + /* We have nothing to send. This connection is limited + * either by receiver window or by application. + */ + return 1; + } + + return 0; +} + +/* If we receive more dupacks than we expected counting segments + * in assumption of absent reordering, interpret this as reordering. + * The only another reason could be bug in receiver TCP. + */ +static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) +{ + u32 holes; + + holes = max(tp->lost_out, 1U); + holes = min(holes, tp->packets_out); + + if ((tp->sacked_out + holes) > tp->packets_out) { + tp->sacked_out = tp->packets_out - holes; + tcp_update_reordering(tp, tp->packets_out+addend, 0); + } +} + +/* Emulate SACKs for SACKless connection: account for a new dupack. */ + +static void tcp_add_reno_sack(struct tcp_sock *tp) +{ + tp->sacked_out++; + tcp_check_reno_reordering(tp, 0); + tcp_sync_left_out(tp); +} + +/* Account for ACK, ACKing some data in Reno Recovery phase. */ + +static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked) +{ + if (acked > 0) { + /* One ACK acked hole. The rest eat duplicate ACKs. */ + if (acked-1 >= tp->sacked_out) + tp->sacked_out = 0; + else + tp->sacked_out -= acked-1; + } + tcp_check_reno_reordering(tp, acked); + tcp_sync_left_out(tp); +} + +static inline void tcp_reset_reno_sack(struct tcp_sock *tp) +{ + tp->sacked_out = 0; + tp->left_out = tp->lost_out; +} + +/* Mark head of queue up as lost. */ +static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, + int packets, u32 high_seq) +{ + struct sk_buff *skb; + int cnt = packets; + + BUG_TRAP(cnt <= tp->packets_out); + + sk_stream_for_retrans_queue(skb, sk) { + cnt -= tcp_skb_pcount(skb); + if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + break; + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + } + } + tcp_sync_left_out(tp); +} + +/* Account newly detected lost packet(s) */ + +static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) +{ + if (IsFack(tp)) { + int lost = tp->fackets_out - tp->reordering; + if (lost <= 0) + lost = 1; + tcp_mark_head_lost(sk, tp, lost, tp->high_seq); + } else { + tcp_mark_head_lost(sk, tp, 1, tp->high_seq); + } + + /* New heuristics: it is possible only after we switched + * to restart timer each time when something is ACKed. + * Hence, we can detect timed out packets during fast + * retransmit without falling to slow start. + */ + if (tcp_head_timedout(sk, tp)) { + struct sk_buff *skb; + + sk_stream_for_retrans_queue(skb, sk) { + if (tcp_skb_timedout(tp, skb) && + !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + } + } + tcp_sync_left_out(tp); + } +} + +/* CWND moderation, preventing bursts due to too big ACKs + * in dubious situations. + */ +static inline void tcp_moderate_cwnd(struct tcp_sock *tp) +{ + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)+tcp_max_burst(tp)); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +/* Decrease cwnd each second ack. */ + +static void tcp_cwnd_down(struct tcp_sock *tp) +{ + int decr = tp->snd_cwnd_cnt + 1; + __u32 limit; + + /* + * TCP Westwood + * Here limit is evaluated as BWestimation*RTTmin (for obtaining it + * in packets we use mss_cache). If sysctl_tcp_westwood is off + * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is + * still used as usual. It prevents other strange cases in which + * BWE*RTTmin could assume value 0. It should not happen but... + */ + + if (!(limit = tcp_westwood_bw_rttmin(tp))) + limit = tp->snd_ssthresh/2; + + tp->snd_cwnd_cnt = decr&1; + decr >>= 1; + + if (decr && tp->snd_cwnd > limit) + tp->snd_cwnd -= decr; + + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +/* Nothing was retransmitted or returned timestamp is less + * than timestamp of the first retransmission. + */ +static inline int tcp_packet_delayed(struct tcp_sock *tp) +{ + return !tp->retrans_stamp || + (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + (__s32)(tp->rx_opt.rcv_tsecr - tp->retrans_stamp) < 0); +} + +/* Undo procedures. */ + +#if FASTRETRANS_DEBUG > 1 +static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) +{ + struct inet_sock *inet = inet_sk(sk); + printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", + msg, + NIPQUAD(inet->daddr), ntohs(inet->dport), + tp->snd_cwnd, tp->left_out, + tp->snd_ssthresh, tp->prior_ssthresh, + tp->packets_out); +} +#else +#define DBGUNDO(x...) do { } while (0) +#endif + +static void tcp_undo_cwr(struct tcp_sock *tp, int undo) +{ + if (tp->prior_ssthresh) { + if (tcp_is_bic(tp)) + tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); + else + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); + + if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { + tp->snd_ssthresh = tp->prior_ssthresh; + TCP_ECN_withdraw_cwr(tp); + } + } else { + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); + } + tcp_moderate_cwnd(tp); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static inline int tcp_may_undo(struct tcp_sock *tp) +{ + return tp->undo_marker && + (!tp->undo_retrans || tcp_packet_delayed(tp)); +} + +/* People celebrate: "We love our President!" */ +static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) +{ + if (tcp_may_undo(tp)) { + /* Happy end! We did not retransmit anything + * or our original transmission succeeded. + */ + DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans"); + tcp_undo_cwr(tp, 1); + if (tp->ca_state == TCP_CA_Loss) + NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); + else + NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); + tp->undo_marker = 0; + } + if (tp->snd_una == tp->high_seq && IsReno(tp)) { + /* Hold old state until something *above* high_seq + * is ACKed. For Reno it is MUST to prevent false + * fast retransmits (RFC2582). SACK TCP is safe. */ + tcp_moderate_cwnd(tp); + return 1; + } + tcp_set_ca_state(tp, TCP_CA_Open); + return 0; +} + +/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ +static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) +{ + if (tp->undo_marker && !tp->undo_retrans) { + DBGUNDO(sk, tp, "D-SACK"); + tcp_undo_cwr(tp, 1); + tp->undo_marker = 0; + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); + } +} + +/* Undo during fast recovery after partial ACK. */ + +static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, + int acked) +{ + /* Partial ACK arrived. Force Hoe's retransmit. */ + int failed = IsReno(tp) || tp->fackets_out>tp->reordering; + + if (tcp_may_undo(tp)) { + /* Plain luck! Hole if filled with delayed + * packet, rather than with a retransmit. + */ + if (tp->retrans_out == 0) + tp->retrans_stamp = 0; + + tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); + + DBGUNDO(sk, tp, "Hoe"); + tcp_undo_cwr(tp, 0); + NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); + + /* So... Do not make Hoe's retransmit yet. + * If the first packet was delayed, the rest + * ones are most probably delayed as well. + */ + failed = 0; + } + return failed; +} + +/* Undo during loss recovery after partial ACK. */ +static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) +{ + if (tcp_may_undo(tp)) { + struct sk_buff *skb; + sk_stream_for_retrans_queue(skb, sk) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + } + DBGUNDO(sk, tp, "partial loss"); + tp->lost_out = 0; + tp->left_out = tp->sacked_out; + tcp_undo_cwr(tp, 1); + NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); + tp->retransmits = 0; + tp->undo_marker = 0; + if (!IsReno(tp)) + tcp_set_ca_state(tp, TCP_CA_Open); + return 1; + } + return 0; +} + +static inline void tcp_complete_cwr(struct tcp_sock *tp) +{ + if (tcp_westwood_cwnd(tp)) + tp->snd_ssthresh = tp->snd_cwnd; + else + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) +{ + tp->left_out = tp->sacked_out; + + if (tp->retrans_out == 0) + tp->retrans_stamp = 0; + + if (flag&FLAG_ECE) + tcp_enter_cwr(tp); + + if (tp->ca_state != TCP_CA_CWR) { + int state = TCP_CA_Open; + + if (tp->left_out || tp->retrans_out || tp->undo_marker) + state = TCP_CA_Disorder; + + if (tp->ca_state != state) { + tcp_set_ca_state(tp, state); + tp->high_seq = tp->snd_nxt; + } + tcp_moderate_cwnd(tp); + } else { + tcp_cwnd_down(tp); + } +} + +/* Process an event, which can update packets-in-flight not trivially. + * Main goal of this function is to calculate new estimate for left_out, + * taking into account both packets sitting in receiver's buffer and + * packets lost by network. + * + * Besides that it does CWND reduction, when packet loss is detected + * and changes state of machine. + * + * It does _not_ decide what to send, it is made in function + * tcp_xmit_retransmit_queue(). + */ +static void +tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, + int prior_packets, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); + + /* Some technical things: + * 1. Reno does not count dupacks (sacked_out) automatically. */ + if (!tp->packets_out) + tp->sacked_out = 0; + /* 2. SACK counts snd_fack in packets inaccurately. */ + if (tp->sacked_out == 0) + tp->fackets_out = 0; + + /* Now state machine starts. + * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ + if (flag&FLAG_ECE) + tp->prior_ssthresh = 0; + + /* B. In all the states check for reneging SACKs. */ + if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) + return; + + /* C. Process data loss notification, provided it is valid. */ + if ((flag&FLAG_DATA_LOST) && + before(tp->snd_una, tp->high_seq) && + tp->ca_state != TCP_CA_Open && + tp->fackets_out > tp->reordering) { + tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); + NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); + } + + /* D. Synchronize left_out to current state. */ + tcp_sync_left_out(tp); + + /* E. Check state exit conditions. State can be terminated + * when high_seq is ACKed. */ + if (tp->ca_state == TCP_CA_Open) { + if (!sysctl_tcp_frto) + BUG_TRAP(tp->retrans_out == 0); + tp->retrans_stamp = 0; + } else if (!before(tp->snd_una, tp->high_seq)) { + switch (tp->ca_state) { + case TCP_CA_Loss: + tp->retransmits = 0; + if (tcp_try_undo_recovery(sk, tp)) + return; + break; + + case TCP_CA_CWR: + /* CWR is to be held something *above* high_seq + * is ACKed for CWR bit to reach receiver. */ + if (tp->snd_una != tp->high_seq) { + tcp_complete_cwr(tp); + tcp_set_ca_state(tp, TCP_CA_Open); + } + break; + + case TCP_CA_Disorder: + tcp_try_undo_dsack(sk, tp); + if (!tp->undo_marker || + /* For SACK case do not Open to allow to undo + * catching for all duplicate ACKs. */ + IsReno(tp) || tp->snd_una != tp->high_seq) { + tp->undo_marker = 0; + tcp_set_ca_state(tp, TCP_CA_Open); + } + break; + + case TCP_CA_Recovery: + if (IsReno(tp)) + tcp_reset_reno_sack(tp); + if (tcp_try_undo_recovery(sk, tp)) + return; + tcp_complete_cwr(tp); + break; + } + } + + /* F. Process state. */ + switch (tp->ca_state) { + case TCP_CA_Recovery: + if (prior_snd_una == tp->snd_una) { + if (IsReno(tp) && is_dupack) + tcp_add_reno_sack(tp); + } else { + int acked = prior_packets - tp->packets_out; + if (IsReno(tp)) + tcp_remove_reno_sacks(sk, tp, acked); + is_dupack = tcp_try_undo_partial(sk, tp, acked); + } + break; + case TCP_CA_Loss: + if (flag&FLAG_DATA_ACKED) + tp->retransmits = 0; + if (!tcp_try_undo_loss(sk, tp)) { + tcp_moderate_cwnd(tp); + tcp_xmit_retransmit_queue(sk); + return; + } + if (tp->ca_state != TCP_CA_Open) + return; + /* Loss is undone; fall through to processing in Open state. */ + default: + if (IsReno(tp)) { + if (tp->snd_una != prior_snd_una) + tcp_reset_reno_sack(tp); + if (is_dupack) + tcp_add_reno_sack(tp); + } + + if (tp->ca_state == TCP_CA_Disorder) + tcp_try_undo_dsack(sk, tp); + + if (!tcp_time_to_recover(sk, tp)) { + tcp_try_to_open(sk, tp, flag); + return; + } + + /* Otherwise enter Recovery state */ + + if (IsReno(tp)) + NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY); + else + NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY); + + tp->high_seq = tp->snd_nxt; + tp->prior_ssthresh = 0; + tp->undo_marker = tp->snd_una; + tp->undo_retrans = tp->retrans_out; + + if (tp->ca_state < TCP_CA_CWR) { + if (!(flag&FLAG_ECE)) + tp->prior_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + TCP_ECN_queue_cwr(tp); + } + + tp->snd_cwnd_cnt = 0; + tcp_set_ca_state(tp, TCP_CA_Recovery); + } + + if (is_dupack || tcp_head_timedout(sk, tp)) + tcp_update_scoreboard(sk, tp); + tcp_cwnd_down(tp); + tcp_xmit_retransmit_queue(sk); +} + +/* Read draft-ietf-tcplw-high-performance before mucking + * with this code. (Superceeds RFC1323) + */ +static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) +{ + __u32 seq_rtt; + + /* RTTM Rule: A TSecr value received in a segment is used to + * update the averaged RTT measurement only if the segment + * acknowledges some new data, i.e., only if it advances the + * left edge of the send window. + * + * See draft-ietf-tcplw-high-performance-00, section 3.3. + * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> + * + * Changed: reset backoff as soon as we see the first valid sample. + * If we do not, we get strongly overstimated rto. With timestamps + * samples are accepted even from very old segments: f.e., when rtt=1 + * increases to 8, we retransmit 5 times and after 8 seconds delayed + * answer arrives rto becomes 120 seconds! If at least one of segments + * in window is lost... Voila. --ANK (010210) + */ + seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + tcp_rtt_estimator(tp, seq_rtt); + tcp_set_rto(tp); + tp->backoff = 0; + tcp_bound_rto(tp); +} + +static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) +{ + /* We don't have a timestamp. Can only use + * packets that are not retransmitted to determine + * rtt estimates. Also, we must not reset the + * backoff for rto until we get a non-retransmitted + * packet. This allows us to deal with a situation + * where the network delay has increased suddenly. + * I.e. Karn's algorithm. (SIGCOMM '87, p5.) + */ + + if (flag & FLAG_RETRANS_DATA_ACKED) + return; + + tcp_rtt_estimator(tp, seq_rtt); + tcp_set_rto(tp); + tp->backoff = 0; + tcp_bound_rto(tp); +} + +static inline void tcp_ack_update_rtt(struct tcp_sock *tp, + int flag, s32 seq_rtt) +{ + /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + tcp_ack_saw_tstamp(tp, flag); + else if (seq_rtt >= 0) + tcp_ack_no_tstamp(tp, seq_rtt, flag); +} + +/* + * Compute congestion window to use. + * + * This is from the implementation of BICTCP in + * Lison-Xu, Kahaled Harfoush, and Injog Rhee. + * "Binary Increase Congestion Control for Fast, Long Distance + * Networks" in InfoComm 2004 + * Available from: + * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf + * + * Unless BIC is enabled and congestion window is large + * this behaves the same as the original Reno. + */ +static inline __u32 bictcp_cwnd(struct tcp_sock *tp) +{ + /* orignal Reno behaviour */ + if (!tcp_is_bic(tp)) + return tp->snd_cwnd; + + if (tp->bictcp.last_cwnd == tp->snd_cwnd && + (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5)) + return tp->bictcp.cnt; + + tp->bictcp.last_cwnd = tp->snd_cwnd; + tp->bictcp.last_stamp = tcp_time_stamp; + + /* start off normal */ + if (tp->snd_cwnd <= sysctl_tcp_bic_low_window) + tp->bictcp.cnt = tp->snd_cwnd; + + /* binary increase */ + else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) { + __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd) + / BICTCP_B; + + if (dist > BICTCP_MAX_INCREMENT) + /* linear increase */ + tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; + else if (dist <= 1U) + /* binary search increase */ + tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR + / BICTCP_B; + else + /* binary search increase */ + tp->bictcp.cnt = tp->snd_cwnd / dist; + } else { + /* slow start amd linear increase */ + if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B) + /* slow start */ + tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR + / BICTCP_B; + else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + + BICTCP_MAX_INCREMENT*(BICTCP_B-1)) + /* slow start */ + tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1) + / (tp->snd_cwnd-tp->bictcp.last_max_cwnd); + else + /* linear increase */ + tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; + } + return tp->bictcp.cnt; +} + +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. + */ +static inline void reno_cong_avoid(struct tcp_sock *tp) +{ + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt=0; + } else + tp->snd_cwnd_cnt++; + } + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +/* This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + */ +static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) +{ + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, tp->vegas.beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / + tp->mss_cache_std; + old_snd_cwnd = tp->vegas.beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; + tp->vegas.beg_snd_nxt = tp->snd_nxt; + tp->vegas.beg_snd_cwnd = tp->snd_cwnd; + + /* Take into account the current RTT sample too, to + * decrease the impact of delayed acks. This double counts + * this sample since we count it for the next window as well, + * but that's not too awful, since we're taking the min, + * rather than averaging. + */ + vegas_rtt_calc(tp, seq_rtt); + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (tp->vegas.cntRTT <= 2) { + /* We don't have enough RTT samples to do the Vegas + * calculation, so we'll behave like Reno. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd++; + } else { + u32 rtt, target_cwnd, diff; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = tp->vegas.minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + target_cwnd = ((old_wnd * tp->vegas.baseRTT) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* Slow start. */ + if (diff > sysctl_tcp_vegas_gamma) { + /* Going too fast. Time to slow down + * and switch to congestion avoidance. + */ + tp->snd_ssthresh = 2; + + /* Set cwnd to match the actual rate + * exactly: + * cwnd = (actual rate) * baseRTT + * Then we add 1 because the integer + * truncation robs us of full link + * utilization. + */ + tp->snd_cwnd = min(tp->snd_cwnd, + (target_cwnd >> + V_PARAM_SHIFT)+1); + + } + } else { + /* Congestion avoidance. */ + u32 next_snd_cwnd; + + /* Figure out where we would like cwnd + * to be. + */ + if (diff > sysctl_tcp_vegas_beta) { + /* The old window was too fast, so + * we slow down. + */ + next_snd_cwnd = old_snd_cwnd - 1; + } else if (diff < sysctl_tcp_vegas_alpha) { + /* We don't have enough extra packets + * in the network, so speed up. + */ + next_snd_cwnd = old_snd_cwnd + 1; + } else { + /* Sending just as fast as we + * should be. + */ + next_snd_cwnd = old_snd_cwnd; + } + + /* Adjust cwnd upward or downward, toward the + * desired value. + */ + if (next_snd_cwnd > tp->snd_cwnd) + tp->snd_cwnd++; + else if (next_snd_cwnd < tp->snd_cwnd) + tp->snd_cwnd--; + } + } + + /* Wipe the slate clean for the next RTT. */ + tp->vegas.cntRTT = 0; + tp->vegas.minRTT = 0x7fffffff; + } + + /* The following code is executed for every ack we receive, + * except for conditions checked in should_advance_cwnd() + * before the call to tcp_cong_avoid(). Mainly this means that + * we only execute this code if the ack actually acked some + * data. + */ + + /* If we are in slow start, increase our cwnd in response to this ACK. + * (If we are not in slow start then we are in congestion avoidance, + * and adjust our congestion window only once per RTT. See the code + * above.) + */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tp->snd_cwnd++; + + /* to keep cwnd from growing without bound */ + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); + + /* Make sure that we are never so timid as to reduce our cwnd below + * 2 MSS. + * + * Going below 2 MSS would risk huge delayed ACKs from our receiver. + */ + tp->snd_cwnd = max(tp->snd_cwnd, 2U); + + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) +{ + if (tcp_vegas_enabled(tp)) + vegas_cong_avoid(tp, ack, seq_rtt); + else + reno_cong_avoid(tp); +} + +/* Restart timer after forward progress on connection. + * RFC2988 recommends to restart timer to now+rto. + */ + +static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) +{ + if (!tp->packets_out) { + tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); + } else { + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } +} + +/* There is one downside to this scheme. Although we keep the + * ACK clock ticking, adjusting packet counters and advancing + * congestion window, we do not liberate socket send buffer + * space. + * + * Mucking with skb->truesize and sk->sk_wmem_alloc et al. + * then making a write space wakeup callback is a possible + * future enhancement. WARNING: it is not trivial to make. + */ +static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, + __u32 now, __s32 *seq_rtt) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + __u32 seq = tp->snd_una; + __u32 packets_acked; + int acked = 0; + + /* If we get here, the whole TSO packet has not been + * acked. + */ + BUG_ON(!after(scb->end_seq, seq)); + + packets_acked = tcp_skb_pcount(skb); + if (tcp_trim_head(sk, skb, seq - scb->seq)) + return 0; + packets_acked -= tcp_skb_pcount(skb); + + if (packets_acked) { + __u8 sacked = scb->sacked; + + acked |= FLAG_DATA_ACKED; + if (sacked) { + if (sacked & TCPCB_RETRANS) { + if (sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= packets_acked; + acked |= FLAG_RETRANS_DATA_ACKED; + *seq_rtt = -1; + } else if (*seq_rtt < 0) + *seq_rtt = now - scb->when; + if (sacked & TCPCB_SACKED_ACKED) + tp->sacked_out -= packets_acked; + if (sacked & TCPCB_LOST) + tp->lost_out -= packets_acked; + if (sacked & TCPCB_URG) { + if (tp->urg_mode && + !before(seq, tp->snd_up)) + tp->urg_mode = 0; + } + } else if (*seq_rtt < 0) + *seq_rtt = now - scb->when; + + if (tp->fackets_out) { + __u32 dval = min(tp->fackets_out, packets_acked); + tp->fackets_out -= dval; + } + tp->packets_out -= packets_acked; + + BUG_ON(tcp_skb_pcount(skb) == 0); + BUG_ON(!before(scb->seq, scb->end_seq)); + } + + return acked; +} + + +/* Remove acknowledged frames from the retransmission queue. */ +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + __u32 now = tcp_time_stamp; + int acked = 0; + __s32 seq_rtt = -1; + + while ((skb = skb_peek(&sk->sk_write_queue)) && + skb != sk->sk_send_head) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + __u8 sacked = scb->sacked; + + /* If our packet is before the ack sequence we can + * discard it as it's confirmed to have arrived at + * the other end. + */ + if (after(scb->end_seq, tp->snd_una)) { + if (tcp_skb_pcount(skb) > 1) + acked |= tcp_tso_acked(sk, skb, + now, &seq_rtt); + break; + } + + /* Initial outgoing SYN's get put onto the write_queue + * just like anything else we transmit. It is not + * true data, and if we misinform our callers that + * this ACK acks real data, we will erroneously exit + * connection startup slow start one packet too + * quickly. This is severely frowned upon behavior. + */ + if (!(scb->flags & TCPCB_FLAG_SYN)) { + acked |= FLAG_DATA_ACKED; + } else { + acked |= FLAG_SYN_ACKED; + tp->retrans_stamp = 0; + } + + if (sacked) { + if (sacked & TCPCB_RETRANS) { + if(sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= tcp_skb_pcount(skb); + acked |= FLAG_RETRANS_DATA_ACKED; + seq_rtt = -1; + } else if (seq_rtt < 0) + seq_rtt = now - scb->when; + if (sacked & TCPCB_SACKED_ACKED) + tp->sacked_out -= tcp_skb_pcount(skb); + if (sacked & TCPCB_LOST) + tp->lost_out -= tcp_skb_pcount(skb); + if (sacked & TCPCB_URG) { + if (tp->urg_mode && + !before(scb->end_seq, tp->snd_up)) + tp->urg_mode = 0; + } + } else if (seq_rtt < 0) + seq_rtt = now - scb->when; + tcp_dec_pcount_approx(&tp->fackets_out, skb); + tcp_packets_out_dec(tp, skb); + __skb_unlink(skb, skb->list); + sk_stream_free_skb(sk, skb); + } + + if (acked&FLAG_ACKED) { + tcp_ack_update_rtt(tp, acked, seq_rtt); + tcp_ack_packets_out(sk, tp); + } + +#if FASTRETRANS_DEBUG > 0 + BUG_TRAP((int)tp->sacked_out >= 0); + BUG_TRAP((int)tp->lost_out >= 0); + BUG_TRAP((int)tp->retrans_out >= 0); + if (!tp->packets_out && tp->rx_opt.sack_ok) { + if (tp->lost_out) { + printk(KERN_DEBUG "Leak l=%u %d\n", + tp->lost_out, tp->ca_state); + tp->lost_out = 0; + } + if (tp->sacked_out) { + printk(KERN_DEBUG "Leak s=%u %d\n", + tp->sacked_out, tp->ca_state); + tp->sacked_out = 0; + } + if (tp->retrans_out) { + printk(KERN_DEBUG "Leak r=%u %d\n", + tp->retrans_out, tp->ca_state); + tp->retrans_out = 0; + } + } +#endif + *seq_rtt_p = seq_rtt; + return acked; +} + +static void tcp_ack_probe(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Was it a usable window open? */ + + if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + tp->snd_una + tp->snd_wnd)) { + tp->backoff = 0; + tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + /* Socket must be waked up by subsequent tcp_data_snd_check(). + * This function is not for random using! + */ + } else { + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } +} + +static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag) +{ + return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || + tp->ca_state != TCP_CA_Open); +} + +static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag) +{ + return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && + !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR)); +} + +/* Check that window update is acceptable. + * The function assumes that snd_una<=ack<=snd_next. + */ +static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack, + u32 ack_seq, u32 nwin) +{ + return (after(ack, tp->snd_una) || + after(ack_seq, tp->snd_wl1) || + (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); +} + +/* Update our send window. + * + * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 + * and in FreeBSD. NetBSD's one is even worse.) is wrong. + */ +static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb, u32 ack, u32 ack_seq) +{ + int flag = 0; + u32 nwin = ntohs(skb->h.th->window); + + if (likely(!skb->h.th->syn)) + nwin <<= tp->rx_opt.snd_wscale; + + if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { + flag |= FLAG_WIN_UPDATE; + tcp_update_wl(tp, ack, ack_seq); + + if (tp->snd_wnd != nwin) { + tp->snd_wnd = nwin; + + /* Note, it is the only place, where + * fast path is recovered for sending TCP. + */ + tcp_fast_path_check(sk, tp); + + if (nwin > tp->max_window) { + tp->max_window = nwin; + tcp_sync_mss(sk, tp->pmtu_cookie); + } + } + } + + tp->snd_una = ack; + + return flag; +} + +static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_sync_left_out(tp); + + if (tp->snd_una == prior_snd_una || + !before(tp->snd_una, tp->frto_highmark)) { + /* RTO was caused by loss, start retransmitting in + * go-back-N slow start + */ + tcp_enter_frto_loss(sk); + return; + } + + if (tp->frto_counter == 1) { + /* First ACK after RTO advances the window: allow two new + * segments out. + */ + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; + } else { + /* Also the second ACK after RTO advances the window. + * The RTO was likely spurious. Reduce cwnd and continue + * in congestion avoidance + */ + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tcp_moderate_cwnd(tp); + } + + /* F-RTO affects on two new ACKs following RTO. + * At latest on third ACK the TCP behavor is back to normal. + */ + tp->frto_counter = (tp->frto_counter + 1) % 3; +} + +/* + * TCP Westwood+ + */ + +/* + * @init_westwood + * This function initializes fields used in TCP Westwood+. We can't + * get no information about RTTmin at this time so we simply set it to + * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative + * since in this way we're sure it will be updated in a consistent + * way as soon as possible. It will reasonably happen within the first + * RTT period of the connection lifetime. + */ + +static void init_westwood(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->westwood.bw_ns_est = 0; + tp->westwood.bw_est = 0; + tp->westwood.accounted = 0; + tp->westwood.cumul_ack = 0; + tp->westwood.rtt_win_sx = tcp_time_stamp; + tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; + tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; + tp->westwood.snd_una = tp->snd_una; +} + +/* + * @westwood_do_filter + * Low-pass filter. Implemented using constant coeffients. + */ + +static inline __u32 westwood_do_filter(__u32 a, __u32 b) +{ + return (((7 * a) + b) >> 3); +} + +static void westwood_filter(struct sock *sk, __u32 delta) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->westwood.bw_ns_est = + westwood_do_filter(tp->westwood.bw_ns_est, + tp->westwood.bk / delta); + tp->westwood.bw_est = + westwood_do_filter(tp->westwood.bw_est, + tp->westwood.bw_ns_est); +} + +/* + * @westwood_update_rttmin + * It is used to update RTTmin. In this case we MUST NOT use + * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! + */ + +static inline __u32 westwood_update_rttmin(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + __u32 rttmin = tp->westwood.rtt_min; + + if (tp->westwood.rtt != 0 && + (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) + rttmin = tp->westwood.rtt; + + return rttmin; +} + +/* + * @westwood_acked + * Evaluate increases for dk. + */ + +static inline __u32 westwood_acked(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + return tp->snd_una - tp->westwood.snd_una; +} + +/* + * @westwood_new_window + * It evaluates if we are receiving data inside the same RTT window as + * when we started. + * Return value: + * It returns 0 if we are still evaluating samples in the same RTT + * window, 1 if the sample has to be considered in the next window. + */ + +static int westwood_new_window(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + __u32 left_bound; + __u32 rtt; + int ret = 0; + + left_bound = tp->westwood.rtt_win_sx; + rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); + + /* + * A RTT-window has passed. Be careful since if RTT is less than + * 50ms we don't filter but we continue 'building the sample'. + * This minimum limit was choosen since an estimation on small + * time intervals is better to avoid... + * Obvioulsy on a LAN we reasonably will always have + * right_bound = left_bound + WESTWOOD_RTT_MIN + */ + + if ((left_bound + rtt) < tcp_time_stamp) + ret = 1; + + return ret; +} + +/* + * @westwood_update_window + * It updates RTT evaluation window if it is the right moment to do + * it. If so it calls filter for evaluating bandwidth. + */ + +static void __westwood_update_window(struct sock *sk, __u32 now) +{ + struct tcp_sock *tp = tcp_sk(sk); + __u32 delta = now - tp->westwood.rtt_win_sx; + + if (delta) { + if (tp->westwood.rtt) + westwood_filter(sk, delta); + + tp->westwood.bk = 0; + tp->westwood.rtt_win_sx = tcp_time_stamp; + } +} + + +static void westwood_update_window(struct sock *sk, __u32 now) +{ + if (westwood_new_window(sk)) + __westwood_update_window(sk, now); +} + +/* + * @__tcp_westwood_fast_bw + * It is called when we are in fast path. In particular it is called when + * header prediction is successfull. In such case infact update is + * straight forward and doesn't need any particular care. + */ + +static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + westwood_update_window(sk, tcp_time_stamp); + + tp->westwood.bk += westwood_acked(sk); + tp->westwood.snd_una = tp->snd_una; + tp->westwood.rtt_min = westwood_update_rttmin(sk); +} + +static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) +{ + if (tcp_is_westwood(tcp_sk(sk))) + __tcp_westwood_fast_bw(sk, skb); +} + + +/* + * @westwood_dupack_update + * It updates accounted and cumul_ack when receiving a dupack. + */ + +static void westwood_dupack_update(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->westwood.accounted += tp->mss_cache_std; + tp->westwood.cumul_ack = tp->mss_cache_std; +} + +static inline int westwood_may_change_cumul(struct tcp_sock *tp) +{ + return (tp->westwood.cumul_ack > tp->mss_cache_std); +} + +static inline void westwood_partial_update(struct tcp_sock *tp) +{ + tp->westwood.accounted -= tp->westwood.cumul_ack; + tp->westwood.cumul_ack = tp->mss_cache_std; +} + +static inline void westwood_complete_update(struct tcp_sock *tp) +{ + tp->westwood.cumul_ack -= tp->westwood.accounted; + tp->westwood.accounted = 0; +} + +/* + * @westwood_acked_count + * This function evaluates cumul_ack for evaluating dk in case of + * delayed or partial acks. + */ + +static inline __u32 westwood_acked_count(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->westwood.cumul_ack = westwood_acked(sk); + + /* If cumul_ack is 0 this is a dupack since it's not moving + * tp->snd_una. + */ + if (!(tp->westwood.cumul_ack)) + westwood_dupack_update(sk); + + if (westwood_may_change_cumul(tp)) { + /* Partial or delayed ack */ + if (tp->westwood.accounted >= tp->westwood.cumul_ack) + westwood_partial_update(tp); + else + westwood_complete_update(tp); + } + + tp->westwood.snd_una = tp->snd_una; + + return tp->westwood.cumul_ack; +} + + +/* + * @__tcp_westwood_slow_bw + * It is called when something is going wrong..even if there could + * be no problems! Infact a simple delayed packet may trigger a + * dupack. But we need to be careful in such case. + */ + +static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + westwood_update_window(sk, tcp_time_stamp); + + tp->westwood.bk += westwood_acked_count(sk); + tp->westwood.rtt_min = westwood_update_rttmin(sk); +} + +static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) +{ + if (tcp_is_westwood(tcp_sk(sk))) + __tcp_westwood_slow_bw(sk, skb); +} + +/* This routine deals with incoming acks, but not outgoing ones. */ +static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_snd_una = tp->snd_una; + u32 ack_seq = TCP_SKB_CB(skb)->seq; + u32 ack = TCP_SKB_CB(skb)->ack_seq; + u32 prior_in_flight; + s32 seq_rtt; + int prior_packets; + + /* If the ack is newer than sent or older than previous acks + * then we can probably ignore it. + */ + if (after(ack, tp->snd_nxt)) + goto uninteresting_ack; + + if (before(ack, prior_snd_una)) + goto old_ack; + + if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + /* Window is constant, pure forward advance. + * No more checks are required. + * Note, we use the fact that SND.UNA>=SND.WL2. + */ + tcp_update_wl(tp, ack, ack_seq); + tp->snd_una = ack; + tcp_westwood_fast_bw(sk, skb); + flag |= FLAG_WIN_UPDATE; + + NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); + } else { + if (ack_seq != TCP_SKB_CB(skb)->end_seq) + flag |= FLAG_DATA; + else + NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS); + + flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq); + + if (TCP_SKB_CB(skb)->sacked) + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); + + if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) + flag |= FLAG_ECE; + + tcp_westwood_slow_bw(sk,skb); + } + + /* We passed data and got it acked, remove any soft error + * log. Something worked... + */ + sk->sk_err_soft = 0; + tp->rcv_tstamp = tcp_time_stamp; + prior_packets = tp->packets_out; + if (!prior_packets) + goto no_queue; + + prior_in_flight = tcp_packets_in_flight(tp); + + /* See if we can take anything off of the retransmit queue. */ + flag |= tcp_clean_rtx_queue(sk, &seq_rtt); + + if (tp->frto_counter) + tcp_process_frto(sk, prior_snd_una); + + if (tcp_ack_is_dubious(tp, flag)) { + /* Advanve CWND, if state allows this. */ + if ((flag & FLAG_DATA_ACKED) && + (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && + tcp_may_raise_cwnd(tp, flag)) + tcp_cong_avoid(tp, ack, seq_rtt); + tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); + } else { + if ((flag & FLAG_DATA_ACKED) && + (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) + tcp_cong_avoid(tp, ack, seq_rtt); + } + + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) + dst_confirm(sk->sk_dst_cache); + + return 1; + +no_queue: + tp->probes_out = 0; + + /* If this ack opens up a zero window, clear backoff. It was + * being used to time the probes, and is probably far higher than + * it needs to be for normal retransmission. + */ + if (sk->sk_send_head) + tcp_ack_probe(sk); + return 1; + +old_ack: + if (TCP_SKB_CB(skb)->sacked) + tcp_sacktag_write_queue(sk, skb, prior_snd_una); + +uninteresting_ack: + SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt); + return 0; +} + + +/* Look for tcp options. Normally only called on SYN and SYNACK packets. + * But, this can also be called on packets in the established flow when + * the fast version below fails. + */ +void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab) +{ + unsigned char *ptr; + struct tcphdr *th = skb->h.th; + int length=(th->doff*4)-sizeof(struct tcphdr); + + ptr = (unsigned char *)(th + 1); + opt_rx->saw_tstamp = 0; + + while(length>0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + switch(opcode) { + case TCPOPT_MSS: + if(opsize==TCPOLEN_MSS && th->syn && !estab) { + u16 in_mss = ntohs(get_unaligned((__u16 *)ptr)); + if (in_mss) { + if (opt_rx->user_mss && opt_rx->user_mss < in_mss) + in_mss = opt_rx->user_mss; + opt_rx->mss_clamp = in_mss; + } + } + break; + case TCPOPT_WINDOW: + if(opsize==TCPOLEN_WINDOW && th->syn && !estab) + if (sysctl_tcp_window_scaling) { + __u8 snd_wscale = *(__u8 *) ptr; + opt_rx->wscale_ok = 1; + if (snd_wscale > 14) { + if(net_ratelimit()) + printk(KERN_INFO "tcp_parse_options: Illegal window " + "scaling value %d >14 received.\n", + snd_wscale); + snd_wscale = 14; + } + opt_rx->snd_wscale = snd_wscale; + } + break; + case TCPOPT_TIMESTAMP: + if(opsize==TCPOLEN_TIMESTAMP) { + if ((estab && opt_rx->tstamp_ok) || + (!estab && sysctl_tcp_timestamps)) { + opt_rx->saw_tstamp = 1; + opt_rx->rcv_tsval = ntohl(get_unaligned((__u32 *)ptr)); + opt_rx->rcv_tsecr = ntohl(get_unaligned((__u32 *)(ptr+4))); + } + } + break; + case TCPOPT_SACK_PERM: + if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { + if (sysctl_tcp_sack) { + opt_rx->sack_ok = 1; + tcp_sack_reset(opt_rx); + } + } + break; + + case TCPOPT_SACK: + if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && + opt_rx->sack_ok) { + TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; + } + }; + ptr+=opsize-2; + length-=opsize; + }; + } +} + +/* Fast parse options. This hopes to only see timestamps. + * If it is wrong it falls back on tcp_parse_options(). + */ +static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, + struct tcp_sock *tp) +{ + if (th->doff == sizeof(struct tcphdr)>>2) { + tp->rx_opt.saw_tstamp = 0; + return 0; + } else if (tp->rx_opt.tstamp_ok && + th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { + __u32 *ptr = (__u32 *)(th + 1); + if (*ptr == ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { + tp->rx_opt.saw_tstamp = 1; + ++ptr; + tp->rx_opt.rcv_tsval = ntohl(*ptr); + ++ptr; + tp->rx_opt.rcv_tsecr = ntohl(*ptr); + return 1; + } + } + tcp_parse_options(skb, &tp->rx_opt, 1); + return 1; +} + +static inline void tcp_store_ts_recent(struct tcp_sock *tp) +{ + tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; + tp->rx_opt.ts_recent_stamp = xtime.tv_sec; +} + +static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) +{ + if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { + /* PAWS bug workaround wrt. ACK frames, the PAWS discard + * extra check below makes sure this can only happen + * for pure ACK frames. -DaveM + * + * Not only, also it occurs for expired timestamps. + */ + + if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || + xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) + tcp_store_ts_recent(tp); + } +} + +/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM + * + * It is not fatal. If this ACK does _not_ change critical state (seqs, window) + * it can pass through stack. So, the following predicate verifies that + * this segment is not used for anything but congestion avoidance or + * fast retransmit. Moreover, we even are able to eliminate most of such + * second order effects, if we apply some small "replay" window (~RTO) + * to timestamp space. + * + * All these measures still do not guarantee that we reject wrapped ACKs + * on networks with high bandwidth, when sequence space is recycled fastly, + * but it guarantees that such events will be very rare and do not affect + * connection seriously. This doesn't look nice, but alas, PAWS is really + * buggy extension. + * + * [ Later note. Even worse! It is buggy for segments _with_ data. RFC + * states that events when retransmit arrives after original data are rare. + * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is + * the biggest problem on large power networks even with minor reordering. + * OK, let's give it small replay window. If peer clock is even 1hz, it is safe + * up to bandwidth of 18Gigabit/sec. 8) ] + */ + +static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th; + u32 seq = TCP_SKB_CB(skb)->seq; + u32 ack = TCP_SKB_CB(skb)->ack_seq; + + return (/* 1. Pure ACK with correct sequence number. */ + (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) && + + /* 2. ... and duplicate ACK. */ + ack == tp->snd_una && + + /* 3. ... and does not update window. */ + !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && + + /* 4. ... and sits in replay window. */ + (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ); +} + +static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb) +{ + return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && + xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && + !tcp_disordered_ack(tp, skb)); +} + +/* Check segment sequence number for validity. + * + * Segment controls are considered valid, if the segment + * fits to the window after truncation to the window. Acceptability + * of data (and SYN, FIN, of course) is checked separately. + * See tcp_data_queue(), for example. + * + * Also, controls (RST is main one) are accepted using RCV.WUP instead + * of RCV.NXT. Peer still did not advance his SND.UNA when we + * delayed ACK, so that hisSND.UNA<=ourRCV.WUP. + * (borrowed from freebsd) + */ + +static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq) +{ + return !before(end_seq, tp->rcv_wup) && + !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); +} + +/* When we get a reset we do this. */ +static void tcp_reset(struct sock *sk) +{ + /* We want the right error as BSD sees it (and indeed as we do). */ + switch (sk->sk_state) { + case TCP_SYN_SENT: + sk->sk_err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->sk_err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->sk_err = ECONNRESET; + } + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + tcp_done(sk); +} + +/* + * Process the FIN bit. This now behaves as it is supposed to work + * and the FIN takes effect when it is validly part of sequence + * space. Not before when we get holes. + * + * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT + * (and thence onto LAST-ACK and finally, CLOSE, we never enter + * TIME-WAIT) + * + * If we are in FINWAIT-1, a received FIN indicates simultaneous + * close and we go into CLOSING (and later onto TIME-WAIT) + * + * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. + */ +static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_schedule_ack(tp); + + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(sk, SOCK_DONE); + + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + /* Move to CLOSE_WAIT */ + tcp_set_state(sk, TCP_CLOSE_WAIT); + tp->ack.pingpong = 1; + break; + + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* Received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; + + case TCP_FIN_WAIT1: + /* This case occurs when a simultaneous close + * happens, we must ack the received FIN and + * enter the CLOSING state. + */ + tcp_send_ack(sk); + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* Received a FIN -- send ACK and enter TIME_WAIT. */ + tcp_send_ack(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + break; + default: + /* Only TCP_LISTEN and TCP_CLOSE are left, in these + * cases we should never reach this piece of code. + */ + printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", + __FUNCTION__, sk->sk_state); + break; + }; + + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ + __skb_queue_purge(&tp->out_of_order_queue); + if (tp->rx_opt.sack_ok) + tcp_sack_reset(&tp->rx_opt); + sk_stream_mem_reclaim(sk); + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + + /* Do not send POLL_HUP for half duplex close. */ + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, 1, POLL_HUP); + else + sk_wake_async(sk, 1, POLL_IN); + } +} + +static __inline__ int +tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) +{ + if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { + if (before(seq, sp->start_seq)) + sp->start_seq = seq; + if (after(end_seq, sp->end_seq)) + sp->end_seq = end_seq; + return 1; + } + return 0; +} + +static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) +{ + if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { + if (before(seq, tp->rcv_nxt)) + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT); + else + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT); + + tp->rx_opt.dsack = 1; + tp->duplicate_sack[0].start_seq = seq; + tp->duplicate_sack[0].end_seq = end_seq; + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok); + } +} + +static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) +{ + if (!tp->rx_opt.dsack) + tcp_dsack_set(tp, seq, end_seq); + else + tcp_sack_extend(tp->duplicate_sack, seq, end_seq); +} + +static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); + tcp_enter_quickack_mode(tp); + + if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) + end_seq = tp->rcv_nxt; + tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq); + } + } + + tcp_send_ack(sk); +} + +/* These routines update the SACK block as out-of-order packets arrive or + * in-order packets close up the sequence space. + */ +static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) +{ + int this_sack; + struct tcp_sack_block *sp = &tp->selective_acks[0]; + struct tcp_sack_block *swalk = sp+1; + + /* See if the recent change to the first SACK eats into + * or hits the sequence space of other SACK blocks, if so coalesce. + */ + for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ) { + if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) { + int i; + + /* Zap SWALK, by moving every further SACK up by one slot. + * Decrease num_sacks. + */ + tp->rx_opt.num_sacks--; + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); + for(i=this_sack; i < tp->rx_opt.num_sacks; i++) + sp[i] = sp[i+1]; + continue; + } + this_sack++, swalk++; + } +} + +static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) +{ + __u32 tmp; + + tmp = sack1->start_seq; + sack1->start_seq = sack2->start_seq; + sack2->start_seq = tmp; + + tmp = sack1->end_seq; + sack1->end_seq = sack2->end_seq; + sack2->end_seq = tmp; +} + +static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int cur_sacks = tp->rx_opt.num_sacks; + int this_sack; + + if (!cur_sacks) + goto new_sack; + + for (this_sack=0; this_sack<cur_sacks; this_sack++, sp++) { + if (tcp_sack_extend(sp, seq, end_seq)) { + /* Rotate this_sack to the first one. */ + for (; this_sack>0; this_sack--, sp--) + tcp_sack_swap(sp, sp-1); + if (cur_sacks > 1) + tcp_sack_maybe_coalesce(tp); + return; + } + } + + /* Could not find an adjacent existing SACK, build a new one, + * put it at the front, and shift everyone else down. We + * always know there is at least one SACK present already here. + * + * If the sack array is full, forget about the last one. + */ + if (this_sack >= 4) { + this_sack--; + tp->rx_opt.num_sacks--; + sp--; + } + for(; this_sack > 0; this_sack--, sp--) + *sp = *(sp-1); + +new_sack: + /* Build the new head SACK, and we're done. */ + sp->start_seq = seq; + sp->end_seq = end_seq; + tp->rx_opt.num_sacks++; + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); +} + +/* RCV.NXT advances, some SACKs should be eaten. */ + +static void tcp_sack_remove(struct tcp_sock *tp) +{ + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int num_sacks = tp->rx_opt.num_sacks; + int this_sack; + + /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ + if (skb_queue_len(&tp->out_of_order_queue) == 0) { + tp->rx_opt.num_sacks = 0; + tp->rx_opt.eff_sacks = tp->rx_opt.dsack; + return; + } + + for(this_sack = 0; this_sack < num_sacks; ) { + /* Check if the start of the sack is covered by RCV.NXT. */ + if (!before(tp->rcv_nxt, sp->start_seq)) { + int i; + + /* RCV.NXT must cover all the block! */ + BUG_TRAP(!before(tp->rcv_nxt, sp->end_seq)); + + /* Zap this SACK, by moving forward any other SACKS. */ + for (i=this_sack+1; i < num_sacks; i++) + tp->selective_acks[i-1] = tp->selective_acks[i]; + num_sacks--; + continue; + } + this_sack++; + sp++; + } + if (num_sacks != tp->rx_opt.num_sacks) { + tp->rx_opt.num_sacks = num_sacks; + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); + } +} + +/* This one checks to see if we can put data from the + * out_of_order queue into the receive_queue. + */ +static void tcp_ofo_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + __u32 dsack_high = tp->rcv_nxt; + struct sk_buff *skb; + + while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { + if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + break; + + if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { + __u32 dsack = dsack_high; + if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) + dsack_high = TCP_SKB_CB(skb)->end_seq; + tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack); + } + + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + SOCK_DEBUG(sk, "ofo packet was already received \n"); + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + continue; + } + SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + + __skb_unlink(skb, skb->list); + __skb_queue_tail(&sk->sk_receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if(skb->h.th->fin) + tcp_fin(skb, sk, skb->h.th); + } +} + +static int tcp_prune_queue(struct sock *sk); + +static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th; + struct tcp_sock *tp = tcp_sk(sk); + int eaten = -1; + + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) + goto drop; + + th = skb->h.th; + __skb_pull(skb, th->doff*4); + + TCP_ECN_accept_cwr(tp, skb); + + if (tp->rx_opt.dsack) { + tp->rx_opt.dsack = 0; + tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks, + 4 - tp->rx_opt.tstamp_ok); + } + + /* Queue data for delivery to the user. + * Packets in sequence go to the receive queue. + * Out of sequence packets to the out_of_order_queue. + */ + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + if (tcp_receive_window(tp) == 0) + goto out_of_window; + + /* Ok. In sequence. In window. */ + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && + sock_owned_by_user(sk) && !tp->urg_data) { + int chunk = min_t(unsigned int, skb->len, + tp->ucopy.len); + + __set_current_state(TASK_RUNNING); + + local_bh_enable(); + if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + eaten = (chunk == skb->len && !th->fin); + tcp_rcv_space_adjust(sk); + } + local_bh_disable(); + } + + if (eaten <= 0) { +queue_and_out: + if (eaten < 0 && + (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + !sk_stream_rmem_schedule(sk, skb))) { + if (tcp_prune_queue(sk) < 0 || + !sk_stream_rmem_schedule(sk, skb)) + goto drop; + } + sk_stream_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); + } + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if(skb->len) + tcp_event_data_recv(sk, tp, skb); + if(th->fin) + tcp_fin(skb, sk, th); + + if (skb_queue_len(&tp->out_of_order_queue)) { + tcp_ofo_queue(sk); + + /* RFC2581. 4.2. SHOULD send immediate ACK, when + * gap in queue is filled. + */ + if (!skb_queue_len(&tp->out_of_order_queue)) + tp->ack.pingpong = 0; + } + + if (tp->rx_opt.num_sacks) + tcp_sack_remove(tp); + + tcp_fast_path_check(sk, tp); + + if (eaten > 0) + __kfree_skb(skb); + else if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, 0); + return; + } + + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + /* A retransmit, 2nd most common case. Force an immediate ack. */ + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); + tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + +out_of_window: + tcp_enter_quickack_mode(tp); + tcp_schedule_ack(tp); +drop: + __kfree_skb(skb); + return; + } + + /* Out of window. F.e. zero window probe. */ + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) + goto out_of_window; + + tcp_enter_quickack_mode(tp); + + if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + /* Partial packet, seq < rcv_next < end_seq */ + SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + + tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); + + /* If window is closed, drop tail of packet. But after + * remembering D-SACK for its head made in previous line. + */ + if (!tcp_receive_window(tp)) + goto out_of_window; + goto queue_and_out; + } + + TCP_ECN_check_ce(tp, skb); + + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + !sk_stream_rmem_schedule(sk, skb)) { + if (tcp_prune_queue(sk) < 0 || + !sk_stream_rmem_schedule(sk, skb)) + goto drop; + } + + /* Disable header prediction. */ + tp->pred_flags = 0; + tcp_schedule_ack(tp); + + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + + sk_stream_set_owner_r(skb, sk); + + if (!skb_peek(&tp->out_of_order_queue)) { + /* Initial out of order segment, build 1 SACK. */ + if (tp->rx_opt.sack_ok) { + tp->rx_opt.num_sacks = 1; + tp->rx_opt.dsack = 0; + tp->rx_opt.eff_sacks = 1; + tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; + tp->selective_acks[0].end_seq = + TCP_SKB_CB(skb)->end_seq; + } + __skb_queue_head(&tp->out_of_order_queue,skb); + } else { + struct sk_buff *skb1 = tp->out_of_order_queue.prev; + u32 seq = TCP_SKB_CB(skb)->seq; + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (seq == TCP_SKB_CB(skb1)->end_seq) { + __skb_append(skb1, skb); + + if (!tp->rx_opt.num_sacks || + tp->selective_acks[0].end_seq != seq) + goto add_sack; + + /* Common case: data arrive in order after hole. */ + tp->selective_acks[0].end_seq = end_seq; + return; + } + + /* Find place to insert this segment. */ + do { + if (!after(TCP_SKB_CB(skb1)->seq, seq)) + break; + } while ((skb1 = skb1->prev) != + (struct sk_buff*)&tp->out_of_order_queue); + + /* Do skb overlap to previous one? */ + if (skb1 != (struct sk_buff*)&tp->out_of_order_queue && + before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + __kfree_skb(skb); + tcp_dsack_set(tp, seq, end_seq); + goto add_sack; + } + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq); + } else { + skb1 = skb1->prev; + } + } + __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue); + + /* And clean segments covered by new one as whole. */ + while ((skb1 = skb->next) != + (struct sk_buff*)&tp->out_of_order_queue && + after(end_seq, TCP_SKB_CB(skb1)->seq)) { + if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); + break; + } + __skb_unlink(skb1, skb1->list); + tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); + __kfree_skb(skb1); + } + +add_sack: + if (tp->rx_opt.sack_ok) + tcp_sack_new_ofo_skb(sk, seq, end_seq); + } +} + +/* Collapse contiguous sequence of skbs head..tail with + * sequence numbers start..end. + * Segments with FIN/SYN are not collapsed (only because this + * simplifies code) + */ +static void +tcp_collapse(struct sock *sk, struct sk_buff *head, + struct sk_buff *tail, u32 start, u32 end) +{ + struct sk_buff *skb; + + /* First, check that queue is collapsable and find + * the point where collapsing can be useful. */ + for (skb = head; skb != tail; ) { + /* No new bits? It is possible on ofo queue. */ + if (!before(start, TCP_SKB_CB(skb)->end_seq)) { + struct sk_buff *next = skb->next; + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); + skb = next; + continue; + } + + /* The first skb to collapse is: + * - not SYN/FIN and + * - bloated or contains data before "start" or + * overlaps to the next one. + */ + if (!skb->h.th->syn && !skb->h.th->fin && + (tcp_win_from_space(skb->truesize) > skb->len || + before(TCP_SKB_CB(skb)->seq, start) || + (skb->next != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq))) + break; + + /* Decided to skip this, advance start seq. */ + start = TCP_SKB_CB(skb)->end_seq; + skb = skb->next; + } + if (skb == tail || skb->h.th->syn || skb->h.th->fin) + return; + + while (before(start, end)) { + struct sk_buff *nskb; + int header = skb_headroom(skb); + int copy = SKB_MAX_ORDER(header, 0); + + /* Too big header? This can happen with IPv6. */ + if (copy < 0) + return; + if (end-start < copy) + copy = end-start; + nskb = alloc_skb(copy+header, GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, header); + memcpy(nskb->head, skb->head, header); + nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head); + nskb->h.raw = nskb->head + (skb->h.raw-skb->head); + nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); + memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); + TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; + __skb_insert(nskb, skb->prev, skb, skb->list); + sk_stream_set_owner_r(nskb, sk); + + /* Copy data, releasing collapsed skbs. */ + while (copy > 0) { + int offset = start - TCP_SKB_CB(skb)->seq; + int size = TCP_SKB_CB(skb)->end_seq - start; + + if (offset < 0) BUG(); + if (size > 0) { + size = min(copy, size); + if (skb_copy_bits(skb, offset, skb_put(nskb, size), size)) + BUG(); + TCP_SKB_CB(nskb)->end_seq += size; + copy -= size; + start += size; + } + if (!before(start, TCP_SKB_CB(skb)->end_seq)) { + struct sk_buff *next = skb->next; + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); + skb = next; + if (skb == tail || skb->h.th->syn || skb->h.th->fin) + return; + } + } + } +} + +/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs + * and tcp_collapse() them until all the queue is collapsed. + */ +static void tcp_collapse_ofo_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); + struct sk_buff *head; + u32 start, end; + + if (skb == NULL) + return; + + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; + head = skb; + + for (;;) { + skb = skb->next; + + /* Segment is terminated when we see gap or when + * we are at the end of all the queue. */ + if (skb == (struct sk_buff *)&tp->out_of_order_queue || + after(TCP_SKB_CB(skb)->seq, end) || + before(TCP_SKB_CB(skb)->end_seq, start)) { + tcp_collapse(sk, head, skb, start, end); + head = skb; + if (skb == (struct sk_buff *)&tp->out_of_order_queue) + break; + /* Start new segment */ + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; + } else { + if (before(TCP_SKB_CB(skb)->seq, start)) + start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) + end = TCP_SKB_CB(skb)->end_seq; + } + } +} + +/* Reduce allocated memory if we can, trying to get + * the socket within its memory limits again. + * + * Return less than zero if we should start dropping frames + * until the socket owning process reads some of the data + * to stabilize the situation. + */ +static int tcp_prune_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); + + NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED); + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + tcp_clamp_window(sk, tp); + else if (tcp_memory_pressure) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + + tcp_collapse_ofo_queue(sk); + tcp_collapse(sk, sk->sk_receive_queue.next, + (struct sk_buff*)&sk->sk_receive_queue, + tp->copied_seq, tp->rcv_nxt); + sk_stream_mem_reclaim(sk); + + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + + /* Collapsing did not help, destructive actions follow. + * This must not ever occur. */ + + /* First, purge the out_of_order queue. */ + if (skb_queue_len(&tp->out_of_order_queue)) { + NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, + skb_queue_len(&tp->out_of_order_queue)); + __skb_queue_purge(&tp->out_of_order_queue); + + /* Reset SACK state. A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if (tp->rx_opt.sack_ok) + tcp_sack_reset(&tp->rx_opt); + sk_stream_mem_reclaim(sk); + } + + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + + /* If we are really being abused, tell the caller to silently + * drop receive data on the floor. It will get retransmitted + * and hopefully then we'll have sufficient space. + */ + NET_INC_STATS_BH(LINUX_MIB_RCVPRUNED); + + /* Massive buffer overcommit. */ + tp->pred_flags = 0; + return -1; +} + + +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +void tcp_cwnd_application_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->ca_state == TCP_CA_Open && + sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + /* Limited by application or receiver window. */ + u32 win_used = max(tp->snd_cwnd_used, 2U); + if (win_used < tp->snd_cwnd) { + tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; + } + tp->snd_cwnd_used = 0; + } + tp->snd_cwnd_stamp = tcp_time_stamp; +} + + +/* When incoming ACK allowed to free some skb from write_queue, + * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket + * on the exit from tcp input handler. + * + * PROBLEM: sndbuf expansion does not work well with largesend. + */ +static void tcp_new_space(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->packets_out < tp->snd_cwnd && + !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) + + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), + demanded = max_t(unsigned int, tp->snd_cwnd, + tp->reordering + 1); + sndmem *= 2*demanded; + if (sndmem > sk->sk_sndbuf) + sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + tp->snd_cwnd_stamp = tcp_time_stamp; + } + + sk->sk_write_space(sk); +} + +static inline void tcp_check_space(struct sock *sk) +{ + if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { + sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); + if (sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_new_space(sk); + } +} + +static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk, tp->nonagle)) + tcp_check_probe_timer(sk, tp); +} + +static __inline__ void tcp_data_snd_check(struct sock *sk) +{ + struct sk_buff *skb = sk->sk_send_head; + + if (skb != NULL) + __tcp_data_snd_check(sk, skb); + tcp_check_space(sk); +} + +/* + * Check if sending an ack is needed. + */ +static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). Or... + */ + && __tcp_select_window(sk) >= tp->rcv_wnd) || + /* We ACK each frame or... */ + tcp_in_quickack_mode(tp) || + /* We have out of order data. */ + (ofo_possible && + skb_peek(&tp->out_of_order_queue))) { + /* Then ack it now */ + tcp_send_ack(sk); + } else { + /* Else, send delayed ack. */ + tcp_send_delayed_ack(sk); + } +} + +static __inline__ void tcp_ack_snd_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + if (!tcp_ack_scheduled(tp)) { + /* We sent a data segment already. */ + return; + } + __tcp_ack_snd_check(sk, 1); +} + +/* + * This routine is only called when we have urgent data + * signalled. Its the 'slow' part of tcp_urg. It could be + * moved inline now as tcp_urg is only called from one + * place. We handle URGent data wrong. We have to - as + * BSD still doesn't use the correction from RFC961. + * For 1003.1g we should support a new option TCP_STDURG to permit + * either form (or just set the sysctl tcp_stdurg). + */ + +static void tcp_check_urg(struct sock * sk, struct tcphdr * th) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 ptr = ntohs(th->urg_ptr); + + if (ptr && !sysctl_tcp_stdurg) + ptr--; + ptr += ntohl(th->seq); + + /* Ignore urgent data that we've already seen and read. */ + if (after(tp->copied_seq, ptr)) + return; + + /* Do not replay urg ptr. + * + * NOTE: interesting situation not covered by specs. + * Misbehaving sender may send urg ptr, pointing to segment, + * which we already have in ofo queue. We are not able to fetch + * such data and will stay in TCP_URG_NOTYET until will be eaten + * by recvmsg(). Seems, we are not obliged to handle such wicked + * situations. But it is worth to think about possibility of some + * DoSes using some hypothetical application level deadlock. + */ + if (before(ptr, tp->rcv_nxt)) + return; + + /* Do we already have a newer (or duplicate) urgent pointer? */ + if (tp->urg_data && !after(ptr, tp->urg_seq)) + return; + + /* Tell the world about our new urgent pointer. */ + sk_send_sigurg(sk); + + /* We may be adding urgent data when the last byte read was + * urgent. To do this requires some care. We cannot just ignore + * tp->copied_seq since we would read the last urgent byte again + * as data, nor can we alter copied_seq until this data arrives + * or we break the sematics of SIOCATMARK (and thus sockatmark()) + * + * NOTE. Double Dutch. Rendering to plain English: author of comment + * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); + * and expect that both A and B disappear from stream. This is _wrong_. + * Though this happens in BSD with high probability, this is occasional. + * Any application relying on this is buggy. Note also, that fix "works" + * only in this artificial test. Insert some normal data between A and B and we will + * decline of BSD again. Verdict: it is better to remove to trap + * buggy users. + */ + if (tp->urg_seq == tp->copied_seq && tp->urg_data && + !sock_flag(sk, SOCK_URGINLINE) && + tp->copied_seq != tp->rcv_nxt) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + tp->copied_seq++; + if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + } + } + + tp->urg_data = TCP_URG_NOTYET; + tp->urg_seq = ptr; + + /* Disable header prediction. */ + tp->pred_flags = 0; +} + +/* This is the 'fast' part of urgent handling. */ +static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Check if we get a new urgent pointer - normally not. */ + if (th->urg) + tcp_check_urg(sk,th); + + /* Do we wait for any urgent data? - normally not... */ + if (tp->urg_data == TCP_URG_NOTYET) { + u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) - + th->syn; + + /* Is the urgent pointer pointing into this packet? */ + if (ptr < skb->len) { + u8 tmp; + if (skb_copy_bits(skb, ptr, &tmp, 1)) + BUG(); + tp->urg_data = TCP_URG_VALID | tmp; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, 0); + } + } +} + +static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int chunk = skb->len - hlen; + int err; + + local_bh_enable(); + if (skb->ip_summed==CHECKSUM_UNNECESSARY) + err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); + else + err = skb_copy_and_csum_datagram_iovec(skb, hlen, + tp->ucopy.iov); + + if (!err) { + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + tcp_rcv_space_adjust(sk); + } + + local_bh_disable(); + return err; +} + +static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + int result; + + if (sock_owned_by_user(sk)) { + local_bh_enable(); + result = __tcp_checksum_complete(skb); + local_bh_disable(); + } else { + result = __tcp_checksum_complete(skb); + } + return result; +} + +static __inline__ int +tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __tcp_checksum_complete_user(sk, skb); +} + +/* + * TCP receive function for the ESTABLISHED state. + * + * It is split into a fast path and a slow path. The fast path is + * disabled when: + * - A zero window was announced from us - zero window probing + * is only handled properly in the slow path. + * - Out of order segments arrived. + * - Urgent data is expected. + * - There is no buffer space left + * - Unexpected TCP flags/window values/header lengths are received + * (detected by checking the TCP header against pred_flags) + * - Data is sent in both directions. Fast path only supports pure senders + * or pure receivers (this means either the sequence number or the ack + * value must stay constant) + * - Unexpected TCP option. + * + * When these conditions are not satisfied it drops into a standard + * receive procedure patterned after RFC793 to handle all cases. + * The first three cases are guaranteed by proper pred_flags setting, + * the rest is checked inline. Fast processing is turned on in + * tcp_data_queue when everything is OK. + */ +int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* + * Header prediction. + * The code loosely follows the one in the famous + * "30 instruction TCP receive" Van Jacobson mail. + * + * Van's trick is to deposit buffers into socket queue + * on a device interrupt, to call tcp_recv function + * on the receive process context and checksum and copy + * the buffer to user space. smart... + * + * Our current scheme is not silly either but we take the + * extra cost of the net_bh soft interrupt processing... + * We do checksum and copy also but from device to kernel. + */ + + tp->rx_opt.saw_tstamp = 0; + + /* pred_flags is 0xS?10 << 16 + snd_wnd + * if header_predition is to be made + * 'S' will always be tp->tcp_header_len >> 2 + * '?' will be 0 for the fast path, otherwise pred_flags is 0 to + * turn it off (when there are holes in the receive + * space for instance) + * PSH flag is ignored. + */ + + if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && + TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + int tcp_header_len = tp->tcp_header_len; + + /* Timestamp header prediction: tcp_header_len + * is automatically equal to th->doff*4 due to pred_flags + * match. + */ + + /* Check timestamp */ + if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { + __u32 *ptr = (__u32 *)(th + 1); + + /* No? Slow path! */ + if (*ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) + goto slow_path; + + tp->rx_opt.saw_tstamp = 1; + ++ptr; + tp->rx_opt.rcv_tsval = ntohl(*ptr); + ++ptr; + tp->rx_opt.rcv_tsecr = ntohl(*ptr); + + /* If PAWS failed, check it more carefully in slow path */ + if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) + goto slow_path; + + /* DO NOT update ts_recent here, if checksum fails + * and timestamp was corrupted part, it will result + * in a hung connection since we will drop all + * future packets due to the PAWS test. + */ + } + + if (len <= tcp_header_len) { + /* Bulk data transfer: sender */ + if (len == tcp_header_len) { + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + tcp_rcv_rtt_measure_ts(tp, skb); + + /* We know that such packets are checksummed + * on entry. + */ + tcp_ack(sk, skb, 0); + __kfree_skb(skb); + tcp_data_snd_check(sk); + return 0; + } else { /* Header too small */ + TCP_INC_STATS_BH(TCP_MIB_INERRS); + goto discard; + } + } else { + int eaten = 0; + + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len && + sock_owned_by_user(sk)) { + __set_current_state(TASK_RUNNING); + + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + tcp_rcv_rtt_measure_ts(tp, skb); + + __skb_pull(skb, tcp_header_len); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); + eaten = 1; + } + } + if (!eaten) { + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + tcp_rcv_rtt_measure_ts(tp, skb); + + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; + + NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); + + /* Bulk data transfer: receiver */ + __skb_pull(skb,tcp_header_len); + __skb_queue_tail(&sk->sk_receive_queue, skb); + sk_stream_set_owner_r(skb, sk); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + } + + tcp_event_data_recv(sk, tp, skb); + + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { + /* Well, only one small jumplet in fast path... */ + tcp_ack(sk, skb, FLAG_DATA); + tcp_data_snd_check(sk); + if (!tcp_ack_scheduled(tp)) + goto no_ack; + } + + if (eaten) { + if (tcp_in_quickack_mode(tp)) { + tcp_send_ack(sk); + } else { + tcp_send_delayed_ack(sk); + } + } else { + __tcp_ack_snd_check(sk, 0); + } + +no_ack: + if (eaten) + __kfree_skb(skb); + else + sk->sk_data_ready(sk, 0); + return 0; + } + } + +slow_path: + if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + /* + * RFC1323: H1. Apply PAWS check first. + */ + if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && + tcp_paws_discard(tp, skb)) { + if (!th->rst) { + NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + tcp_send_dupack(sk, skb); + goto discard; + } + /* Resets are accepted even if PAWS failed. + + ts_recent update must be made after we are sure + that the packet is in window. + */ + } + + /* + * Standard slow path. + */ + + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + /* RFC793, page 37: "In all states except SYN-SENT, all reset + * (RST) segments are validated by checking their SEQ-fields." + * And page 69: "If an incoming segment is not acceptable, + * an acknowledgment should be sent in reply (unless the RST bit + * is set, if so drop the segment and return)". + */ + if (!th->rst) + tcp_send_dupack(sk, skb); + goto discard; + } + + if(th->rst) { + tcp_reset(sk); + goto discard; + } + + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + TCP_INC_STATS_BH(TCP_MIB_INERRS); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); + tcp_reset(sk); + return 1; + } + +step5: + if(th->ack) + tcp_ack(sk, skb, FLAG_SLOWPATH); + + tcp_rcv_rtt_measure_ts(tp, skb); + + /* Process urgent data. */ + tcp_urg(sk, skb, th); + + /* step 7: process the segment text */ + tcp_data_queue(sk, skb); + + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + return 0; + +csum_error: + TCP_INC_STATS_BH(TCP_MIB_INERRS); + +discard: + __kfree_skb(skb); + return 0; +} + +static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_sock *tp = tcp_sk(sk); + int saved_clamp = tp->rx_opt.mss_clamp; + + tcp_parse_options(skb, &tp->rx_opt, 0); + + if (th->ack) { + /* rfc793: + * "If the state is SYN-SENT then + * first check the ACK bit + * If the ACK bit is set + * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send + * a reset (unless the RST bit is set, if so drop + * the segment and return)" + * + * We do not send data with SYN, so that RFC-correct + * test reduces to: + */ + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + goto reset_and_undo; + + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, + tcp_time_stamp)) { + NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED); + goto reset_and_undo; + } + + /* Now ACK is acceptable. + * + * "If the RST bit is set + * If the ACK was acceptable then signal the user "error: + * connection reset", drop the segment, enter CLOSED state, + * delete TCB, and return." + */ + + if (th->rst) { + tcp_reset(sk); + goto discard; + } + + /* rfc793: + * "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." + * + * See note below! + * --ANK(990513) + */ + if (!th->syn) + goto discard_and_undo; + + /* rfc793: + * "If the SYN bit is on ... + * are acceptable then ... + * (our SYN has been ACKed), change the connection + * state to ESTABLISHED..." + */ + + TCP_ECN_rcv_synack(tp, th); + if (tp->ecn_flags&TCP_ECN_OK) + sock_set_flag(sk, SOCK_NO_LARGESEND); + + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tcp_ack(sk, skb, FLAG_SLOWPATH); + + /* Ok.. it's good. Set up sequence numbers and + * move to established. + */ + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. + */ + tp->snd_wnd = ntohs(th->window); + tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq); + + if (!tp->rx_opt.wscale_ok) { + tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; + tp->window_clamp = min(tp->window_clamp, 65535U); + } + + if (tp->rx_opt.saw_tstamp) { + tp->rx_opt.tstamp_ok = 1; + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + tcp_store_ts_recent(tp); + } else { + tp->tcp_header_len = sizeof(struct tcphdr); + } + + if (tp->rx_opt.sack_ok && sysctl_tcp_fack) + tp->rx_opt.sack_ok |= 2; + + tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_initialize_rcv_mss(sk); + + /* Remember, tcp_poll() does not lock socket! + * Change state from SYN-SENT only after copied_seq + * is initialized. */ + tp->copied_seq = tp->rcv_nxt; + mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + + /* Make sure socket is routed, for correct metrics. */ + tp->af_specific->rebuild_header(sk); + + tcp_init_metrics(sk); + + /* Prevent spurious tcp_cwnd_restart() on first data + * packet. + */ + tp->lsndtime = tcp_time_stamp; + + tcp_init_buffer_space(sk); + + if (sock_flag(sk, SOCK_KEEPOPEN)) + tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + if (!tp->rx_opt.snd_wscale) + __tcp_fast_path_on(tp, tp->snd_wnd); + else + tp->pred_flags = 0; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, 0, POLL_OUT); + } + + if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) { + /* Save one ACK. Data will be ready after + * several ticks, if write_pending is set. + * + * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK + */ + tcp_schedule_ack(tp); + tp->ack.lrcvtime = tcp_time_stamp; + tp->ack.ato = TCP_ATO_MIN; + tcp_incr_quickack(tp); + tcp_enter_quickack_mode(tp); + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + +discard: + __kfree_skb(skb); + return 0; + } else { + tcp_send_ack(sk); + } + return -1; + } + + /* No ACK in the segment */ + + if (th->rst) { + /* rfc793: + * "If the RST bit is set + * + * Otherwise (no ACK) drop the segment and return." + */ + + goto discard_and_undo; + } + + /* PAWS check. */ + if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_check(&tp->rx_opt, 0)) + goto discard_and_undo; + + if (th->syn) { + /* We see SYN without ACK. It is attempt of + * simultaneous connect with crossed SYNs. + * Particularly, it can be connect to self. + */ + tcp_set_state(sk, TCP_SYN_RECV); + + if (tp->rx_opt.saw_tstamp) { + tp->rx_opt.tstamp_ok = 1; + tcp_store_ts_recent(tp); + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + tp->tcp_header_len = sizeof(struct tcphdr); + } + + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. + */ + tp->snd_wnd = ntohs(th->window); + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->max_window = tp->snd_wnd; + + TCP_ECN_rcv_syn(tp, th); + if (tp->ecn_flags&TCP_ECN_OK) + sock_set_flag(sk, SOCK_NO_LARGESEND); + + tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_initialize_rcv_mss(sk); + + + tcp_send_synack(sk); +#if 0 + /* Note, we could accept data and URG from this segment. + * There are no obstacles to make this. + * + * However, if we ignore data in ACKless segments sometimes, + * we have no reasons to accept it sometimes. + * Also, seems the code doing it in step6 of tcp_rcv_state_process + * is not flawless. So, discard packet for sanity. + * Uncomment this return to process the data. + */ + return -1; +#else + goto discard; +#endif + } + /* "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." + */ + +discard_and_undo: + tcp_clear_options(&tp->rx_opt); + tp->rx_opt.mss_clamp = saved_clamp; + goto discard; + +reset_and_undo: + tcp_clear_options(&tp->rx_opt); + tp->rx_opt.mss_clamp = saved_clamp; + return 1; +} + + +/* + * This function implements the receiving procedure of RFC 793 for + * all states except ESTABLISHED and TIME_WAIT. + * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be + * address independent. + */ + +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_sock *tp = tcp_sk(sk); + int queued = 0; + + tp->rx_opt.saw_tstamp = 0; + + switch (sk->sk_state) { + case TCP_CLOSE: + goto discard; + + case TCP_LISTEN: + if(th->ack) + return 1; + + if(th->rst) + goto discard; + + if(th->syn) { + if(tp->af_specific->conn_request(sk, skb) < 0) + return 1; + + init_westwood(sk); + init_bictcp(tp); + + /* Now we have several options: In theory there is + * nothing else in the frame. KA9Q has an option to + * send data with the syn, BSD accepts data with the + * syn up to the [to be] advertised window and + * Solaris 2.1 gives you a protocol error. For now + * we just ignore it, that fits the spec precisely + * and avoids incompatibilities. It would be nice in + * future to drop through and process the data. + * + * Now that TTCP is starting to be used we ought to + * queue this data. + * But, this leaves one open to an easy denial of + * service attack, and SYN cookies can't defend + * against this problem. So, we drop the data + * in the interest of security over speed. + */ + goto discard; + } + goto discard; + + case TCP_SYN_SENT: + init_westwood(sk); + init_bictcp(tp); + + queued = tcp_rcv_synsent_state_process(sk, skb, th, len); + if (queued >= 0) + return queued; + + /* Do step6 onward by hand. */ + tcp_urg(sk, skb, th); + __kfree_skb(skb); + tcp_data_snd_check(sk); + return 0; + } + + if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && + tcp_paws_discard(tp, skb)) { + if (!th->rst) { + NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + tcp_send_dupack(sk, skb); + goto discard; + } + /* Reset is accepted even if it did not pass PAWS. */ + } + + /* step 1: check sequence number */ + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + if (!th->rst) + tcp_send_dupack(sk, skb); + goto discard; + } + + /* step 2: check RST bit */ + if(th->rst) { + tcp_reset(sk); + goto discard; + } + + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + + /* step 3: check security and precedence [ignored] */ + + /* step 4: + * + * Check for a SYN in window. + */ + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); + tcp_reset(sk); + return 1; + } + + /* step 5: check the ACK field */ + if (th->ack) { + int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); + + switch(sk->sk_state) { + case TCP_SYN_RECV: + if (acceptable) { + tp->copied_seq = tp->rcv_nxt; + mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + sk->sk_state_change(sk); + + /* Note, that this wakeup is only for marginal + * crossed SYN case. Passively open sockets + * are not waked up, because sk->sk_sleep == + * NULL and sk->sk_socket == NULL. + */ + if (sk->sk_socket) { + sk_wake_async(sk,0,POLL_OUT); + } + + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + tp->snd_wnd = ntohs(th->window) << + tp->rx_opt.snd_wscale; + tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, + TCP_SKB_CB(skb)->seq); + + /* tcp_ack considers this ACK as duplicate + * and does not calculate rtt. + * Fix it at least with timestamps. + */ + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + !tp->srtt) + tcp_ack_saw_tstamp(tp, 0); + + if (tp->rx_opt.tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + + /* Make sure socket is routed, for + * correct metrics. + */ + tp->af_specific->rebuild_header(sk); + + tcp_init_metrics(sk); + + /* Prevent spurious tcp_cwnd_restart() on + * first data packet. + */ + tp->lsndtime = tcp_time_stamp; + + tcp_initialize_rcv_mss(sk); + tcp_init_buffer_space(sk); + tcp_fast_path_on(tp); + } else { + return 1; + } + break; + + case TCP_FIN_WAIT1: + if (tp->snd_una == tp->write_seq) { + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->sk_shutdown |= SEND_SHUTDOWN; + dst_confirm(sk->sk_dst_cache); + + if (!sock_flag(sk, SOCK_DEAD)) + /* Wake up lingering close() */ + sk->sk_state_change(sk); + else { + int tmo; + + if (tp->linger2 < 0 || + (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + tcp_done(sk); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA); + return 1; + } + + tmo = tcp_fin_time(tp); + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sock_owned_by_user(sk)) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + tcp_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + } + } + break; + + case TCP_CLOSING: + if (tp->snd_una == tp->write_seq) { + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + goto discard; + } + break; + + case TCP_LAST_ACK: + if (tp->snd_una == tp->write_seq) { + tcp_update_metrics(sk); + tcp_done(sk); + goto discard; + } + break; + } + } else + goto discard; + + /* step 6: check the URG bit */ + tcp_urg(sk, skb, th); + + /* step 7: process the segment text */ + switch (sk->sk_state) { + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + case TCP_LAST_ACK: + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + break; + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + /* RFC 793 says to queue data in these states, + * RFC 1122 says we MUST send a reset. + * BSD 4.4 also does reset. + */ + if (sk->sk_shutdown & RCV_SHUTDOWN) { + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA); + tcp_reset(sk); + return 1; + } + } + /* Fall through */ + case TCP_ESTABLISHED: + tcp_data_queue(sk, skb); + queued = 1; + break; + } + + /* tcp_data could move socket to TIME-WAIT */ + if (sk->sk_state != TCP_CLOSE) { + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + } + + if (!queued) { +discard: + __kfree_skb(skb); + } + return 0; +} + +EXPORT_SYMBOL(sysctl_tcp_ecn); +EXPORT_SYMBOL(sysctl_tcp_reordering); +EXPORT_SYMBOL(tcp_parse_options); +EXPORT_SYMBOL(tcp_rcv_established); +EXPORT_SYMBOL(tcp_rcv_state_process); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c new file mode 100644 index 000000000000..3ac6659869c4 --- /dev/null +++ b/net/ipv4/tcp_ipv4.c @@ -0,0 +1,2663 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $ + * + * IPv4 specific functions + * + * + * code split from: + * linux/ipv4/tcp.c + * linux/ipv4/tcp_input.c + * linux/ipv4/tcp_output.c + * + * See tcp.c for author information + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * David S. Miller : New socket lookup architecture. + * This code is dedicated to John Dyson. + * David S. Miller : Change semantics of established hash, + * half is devoted to TIME_WAIT sockets + * and the rest go in the other half. + * Andi Kleen : Add support for syncookies and fixed + * some bugs: ip options weren't passed to + * the TCP layer, missed a check for an + * ACK bit. + * Andi Kleen : Implemented fast path mtu discovery. + * Fixed many serious bugs in the + * open_request handling and moved + * most of it into the af independent code. + * Added tail drop and some other bugfixes. + * Added new listen sematics. + * Mike McLagan : Routing by source + * Juan Jose Ciarlante: ip_dynaddr bits + * Andi Kleen: various fixes. + * Vitaly E. Lavrov : Transparent proxy revived after year + * coma. + * Andi Kleen : Fix new listen. + * Andi Kleen : Fix accept error reporting. + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + */ + +#include <linux/config.h> + +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/cache.h> +#include <linux/jhash.h> +#include <linux/init.h> +#include <linux/times.h> + +#include <net/icmp.h> +#include <net/tcp.h> +#include <net/ipv6.h> +#include <net/inet_common.h> +#include <net/xfrm.h> + +#include <linux/inet.h> +#include <linux/ipv6.h> +#include <linux/stddef.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +extern int sysctl_ip_dynaddr; +int sysctl_tcp_tw_reuse; +int sysctl_tcp_low_latency; + +/* Check TCP sequence numbers in ICMP packets. */ +#define ICMP_MIN_LENGTH 8 + +/* Socket used for sending RSTs */ +static struct socket *tcp_socket; + +void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb); + +struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { + .__tcp_lhash_lock = RW_LOCK_UNLOCKED, + .__tcp_lhash_users = ATOMIC_INIT(0), + .__tcp_lhash_wait + = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait), + .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED +}; + +/* + * This array holds the first and last local port number. + * For high-usage systems, use sysctl to change this to + * 32768-61000 + */ +int sysctl_local_port_range[2] = { 1024, 4999 }; +int tcp_port_rover = 1024 - 1; + +static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, + __u32 faddr, __u16 fport) +{ + int h = (laddr ^ lport) ^ (faddr ^ fport); + h ^= h >> 16; + h ^= h >> 8; + return h & (tcp_ehash_size - 1); +} + +static __inline__ int tcp_sk_hashfn(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + __u32 laddr = inet->rcv_saddr; + __u16 lport = inet->num; + __u32 faddr = inet->daddr; + __u16 fport = inet->dport; + + return tcp_hashfn(laddr, lport, faddr, fport); +} + +/* Allocate and initialize a new TCP local port bind bucket. + * The bindhash mutex for snum's hash chain must be held here. + */ +struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, + unsigned short snum) +{ + struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep, + SLAB_ATOMIC); + if (tb) { + tb->port = snum; + tb->fastreuse = 0; + INIT_HLIST_HEAD(&tb->owners); + hlist_add_head(&tb->node, &head->chain); + } + return tb; +} + +/* Caller must hold hashbucket lock for this tb with local BH disabled */ +void tcp_bucket_destroy(struct tcp_bind_bucket *tb) +{ + if (hlist_empty(&tb->owners)) { + __hlist_del(&tb->node); + kmem_cache_free(tcp_bucket_cachep, tb); + } +} + +/* Caller must disable local BH processing. */ +static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) +{ + struct tcp_bind_hashbucket *head = + &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)]; + struct tcp_bind_bucket *tb; + + spin_lock(&head->lock); + tb = tcp_sk(sk)->bind_hash; + sk_add_bind_node(child, &tb->owners); + tcp_sk(child)->bind_hash = tb; + spin_unlock(&head->lock); +} + +inline void tcp_inherit_port(struct sock *sk, struct sock *child) +{ + local_bh_disable(); + __tcp_inherit_port(sk, child); + local_bh_enable(); +} + +void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, + unsigned short snum) +{ + inet_sk(sk)->num = snum; + sk_add_bind_node(sk, &tb->owners); + tcp_sk(sk)->bind_hash = tb; +} + +static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) +{ + const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); + struct sock *sk2; + struct hlist_node *node; + int reuse = sk->sk_reuse; + + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + !tcp_v6_ipv6only(sk2) && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + if (!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) { + const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); + if (!sk2_rcv_saddr || !sk_rcv_saddr || + sk2_rcv_saddr == sk_rcv_saddr) + break; + } + } + } + return node != NULL; +} + +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. + */ +static int tcp_v4_get_port(struct sock *sk, unsigned short snum) +{ + struct tcp_bind_hashbucket *head; + struct hlist_node *node; + struct tcp_bind_bucket *tb; + int ret; + + local_bh_disable(); + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + int rover; + + spin_lock(&tcp_portalloc_lock); + rover = tcp_port_rover; + do { + rover++; + if (rover < low || rover > high) + rover = low; + head = &tcp_bhash[tcp_bhashfn(rover)]; + spin_lock(&head->lock); + tb_for_each(tb, node, &head->chain) + if (tb->port == rover) + goto next; + break; + next: + spin_unlock(&head->lock); + } while (--remaining > 0); + tcp_port_rover = rover; + spin_unlock(&tcp_portalloc_lock); + + /* Exhausted local port range during search? */ + ret = 1; + if (remaining <= 0) + goto fail; + + /* OK, here is the one we will use. HEAD is + * non-NULL and we hold it's mutex. + */ + snum = rover; + } else { + head = &tcp_bhash[tcp_bhashfn(snum)]; + spin_lock(&head->lock); + tb_for_each(tb, node, &head->chain) + if (tb->port == snum) + goto tb_found; + } + tb = NULL; + goto tb_not_found; +tb_found: + if (!hlist_empty(&tb->owners)) { + if (sk->sk_reuse > 1) + goto success; + if (tb->fastreuse > 0 && + sk->sk_reuse && sk->sk_state != TCP_LISTEN) { + goto success; + } else { + ret = 1; + if (tcp_bind_conflict(sk, tb)) + goto fail_unlock; + } + } +tb_not_found: + ret = 1; + if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) + goto fail_unlock; + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) + tb->fastreuse = 1; + else + tb->fastreuse = 0; + } else if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; +success: + if (!tcp_sk(sk)->bind_hash) + tcp_bind_hash(sk, tb, snum); + BUG_TRAP(tcp_sk(sk)->bind_hash == tb); + ret = 0; + +fail_unlock: + spin_unlock(&head->lock); +fail: + local_bh_enable(); + return ret; +} + +/* Get rid of any references to a local port held by the + * given sock. + */ +static void __tcp_put_port(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)]; + struct tcp_bind_bucket *tb; + + spin_lock(&head->lock); + tb = tcp_sk(sk)->bind_hash; + __sk_del_bind_node(sk); + tcp_sk(sk)->bind_hash = NULL; + inet->num = 0; + tcp_bucket_destroy(tb); + spin_unlock(&head->lock); +} + +void tcp_put_port(struct sock *sk) +{ + local_bh_disable(); + __tcp_put_port(sk); + local_bh_enable(); +} + +/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. + * Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines (wake up each + * exclusive lock release). It should be ifdefed really. + */ + +void tcp_listen_wlock(void) +{ + write_lock(&tcp_lhash_lock); + + if (atomic_read(&tcp_lhash_users)) { + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait_exclusive(&tcp_lhash_wait, + &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&tcp_lhash_users)) + break; + write_unlock_bh(&tcp_lhash_lock); + schedule(); + write_lock_bh(&tcp_lhash_lock); + } + + finish_wait(&tcp_lhash_wait, &wait); + } +} + +static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) +{ + struct hlist_head *list; + rwlock_t *lock; + + BUG_TRAP(sk_unhashed(sk)); + if (listen_possible && sk->sk_state == TCP_LISTEN) { + list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; + lock = &tcp_lhash_lock; + tcp_listen_wlock(); + } else { + list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain; + lock = &tcp_ehash[sk->sk_hashent].lock; + write_lock(lock); + } + __sk_add_node(sk, list); + sock_prot_inc_use(sk->sk_prot); + write_unlock(lock); + if (listen_possible && sk->sk_state == TCP_LISTEN) + wake_up(&tcp_lhash_wait); +} + +static void tcp_v4_hash(struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + local_bh_disable(); + __tcp_v4_hash(sk, 1); + local_bh_enable(); + } +} + +void tcp_unhash(struct sock *sk) +{ + rwlock_t *lock; + + if (sk_unhashed(sk)) + goto ende; + + if (sk->sk_state == TCP_LISTEN) { + local_bh_disable(); + tcp_listen_wlock(); + lock = &tcp_lhash_lock; + } else { + struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent]; + lock = &head->lock; + write_lock_bh(&head->lock); + } + + if (__sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + write_unlock_bh(lock); + + ende: + if (sk->sk_state == TCP_LISTEN) + wake_up(&tcp_lhash_wait); +} + +/* Don't inline this cruft. Here are some nice properties to + * exploit here. The BSD API does not allow a listening TCP + * to specify the remote port nor the remote address for the + * connection. So always assume those are both wildcarded + * during the search since they can never be otherwise. + */ +static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr, + unsigned short hnum, int dif) +{ + struct sock *result = NULL, *sk; + struct hlist_node *node; + int score, hiscore; + + hiscore=-1; + sk_for_each(sk, node, head) { + struct inet_sock *inet = inet_sk(sk); + + if (inet->num == hnum && !ipv6_only_sock(sk)) { + __u32 rcv_saddr = inet->rcv_saddr; + + score = (sk->sk_family == PF_INET ? 1 : 0); + if (rcv_saddr) { + if (rcv_saddr != daddr) + continue; + score+=2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score+=2; + } + if (score == 5) + return sk; + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + } + return result; +} + +/* Optimize the common listener case. */ +static inline struct sock *tcp_v4_lookup_listener(u32 daddr, + unsigned short hnum, int dif) +{ + struct sock *sk = NULL; + struct hlist_head *head; + + read_lock(&tcp_lhash_lock); + head = &tcp_listening_hash[tcp_lhashfn(hnum)]; + if (!hlist_empty(head)) { + struct inet_sock *inet = inet_sk((sk = __sk_head(head))); + + if (inet->num == hnum && !sk->sk_node.next && + (!inet->rcv_saddr || inet->rcv_saddr == daddr) && + (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && + !sk->sk_bound_dev_if) + goto sherry_cache; + sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif); + } + if (sk) { +sherry_cache: + sock_hold(sk); + } + read_unlock(&tcp_lhash_lock); + return sk; +} + +/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so + * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM + * + * Local BH must be disabled here. + */ + +static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, + u32 daddr, u16 hnum, + int dif) +{ + struct tcp_ehash_bucket *head; + TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) + __u32 ports = TCP_COMBINED_PORTS(sport, hnum); + struct sock *sk; + struct hlist_node *node; + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + int hash = tcp_hashfn(daddr, hnum, saddr, sport); + head = &tcp_ehash[hash]; + read_lock(&head->lock); + sk_for_each(sk, node, &head->chain) { + if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; /* You sunk my battleship! */ + } + + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { + if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; + } + sk = NULL; +out: + read_unlock(&head->lock); + return sk; +hit: + sock_hold(sk); + goto out; +} + +static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, + u32 daddr, u16 hnum, int dif) +{ + struct sock *sk = __tcp_v4_lookup_established(saddr, sport, + daddr, hnum, dif); + + return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif); +} + +inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, + u16 dport, int dif) +{ + struct sock *sk; + + local_bh_disable(); + sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif); + local_bh_enable(); + + return sk; +} + +EXPORT_SYMBOL_GPL(tcp_v4_lookup); + +static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) +{ + return secure_tcp_sequence_number(skb->nh.iph->daddr, + skb->nh.iph->saddr, + skb->h.th->dest, + skb->h.th->source); +} + +/* called with local bh disabled */ +static int __tcp_v4_check_established(struct sock *sk, __u16 lport, + struct tcp_tw_bucket **twp) +{ + struct inet_sock *inet = inet_sk(sk); + u32 daddr = inet->rcv_saddr; + u32 saddr = inet->daddr; + int dif = sk->sk_bound_dev_if; + TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) + __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); + int hash = tcp_hashfn(daddr, lport, saddr, inet->dport); + struct tcp_ehash_bucket *head = &tcp_ehash[hash]; + struct sock *sk2; + struct hlist_node *node; + struct tcp_tw_bucket *tw; + + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { + tw = (struct tcp_tw_bucket *)sk2; + + if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { + struct tcp_sock *tp = tcp_sk(sk); + + /* With PAWS, it is safe from the viewpoint + of data integrity. Even without PAWS it + is safe provided sequence spaces do not + overlap i.e. at data rates <= 80Mbit/sec. + + Actually, the idea is close to VJ's one, + only timestamp cache is held not per host, + but per port pair and TW bucket is used + as state holder. + + If TW bucket has been already destroyed we + fall back to VJ's scheme and use initial + timestamp retrieved from peer table. + */ + if (tw->tw_ts_recent_stamp && + (!twp || (sysctl_tcp_tw_reuse && + xtime.tv_sec - + tw->tw_ts_recent_stamp > 1))) { + if ((tp->write_seq = + tw->tw_snd_nxt + 65535 + 2) == 0) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + sock_hold(sk2); + goto unique; + } else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->num = lport; + inet->sport = htons(lport); + sk->sk_hashent = hash; + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw) { + /* Silly. Should hash-dance instead... */ + tcp_tw_deschedule(tw); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + tcp_tw_put(tw); + } + + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 connect_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + + return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, + inet->dport); +} + +/* + * Bind a port for a connect operation and hash it. + */ +static inline int tcp_v4_hash_connect(struct sock *sk) +{ + unsigned short snum = inet_sk(sk)->num; + struct tcp_bind_hashbucket *head; + struct tcp_bind_bucket *tb; + int ret; + + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int range = high - low; + int i; + int port; + static u32 hint; + u32 offset = hint + connect_port_offset(sk); + struct hlist_node *node; + struct tcp_tw_bucket *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &tcp_bhash[tcp_bhashfn(port)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + tb_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__tcp_v4_check_established(sk, + port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = tcp_bucket_create(head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + tcp_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __tcp_v4_hash(sk, 0); + } + spin_unlock(&head->lock); + + if (tw) { + tcp_tw_deschedule(tw); + tcp_tw_put(tw); + } + + ret = 0; + goto out; + } + + head = &tcp_bhash[tcp_bhashfn(snum)]; + tb = tcp_sk(sk)->bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + __tcp_v4_hash(sk, 0); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __tcp_v4_check_established(sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +/* This will initiate an outgoing connection. */ +int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; + struct rtable *rt; + u32 daddr, nexthop; + int tmp; + int err; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + if (usin->sin_family != AF_INET) + return -EAFNOSUPPORT; + + nexthop = daddr = usin->sin_addr.s_addr; + if (inet->opt && inet->opt->srr) { + if (!daddr) + return -EINVAL; + nexthop = inet->opt->faddr; + } + + tmp = ip_route_connect(&rt, nexthop, inet->saddr, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, + IPPROTO_TCP, + inet->sport, usin->sin_port, sk); + if (tmp < 0) + return tmp; + + if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { + ip_rt_put(rt); + return -ENETUNREACH; + } + + if (!inet->opt || !inet->opt->srr) + daddr = rt->rt_dst; + + if (!inet->saddr) + inet->saddr = rt->rt_src; + inet->rcv_saddr = inet->saddr; + + if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) { + /* Reset inherited state */ + tp->rx_opt.ts_recent = 0; + tp->rx_opt.ts_recent_stamp = 0; + tp->write_seq = 0; + } + + if (sysctl_tcp_tw_recycle && + !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { + struct inet_peer *peer = rt_get_peer(rt); + + /* VJ's idea. We save last timestamp seen from + * the destination in peer table, when entering state TIME-WAIT + * and initialize rx_opt.ts_recent from it, when trying new connection. + */ + + if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) { + tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; + tp->rx_opt.ts_recent = peer->tcp_ts; + } + } + + inet->dport = usin->sin_port; + inet->daddr = daddr; + + tp->ext_header_len = 0; + if (inet->opt) + tp->ext_header_len = inet->opt->optlen; + + tp->rx_opt.mss_clamp = 536; + + /* Socket identity is still unknown (sport may be zero). + * However we set state to SYN-SENT and not releasing socket + * lock select source port, enter ourselves into the hash tables and + * complete initialization after this. + */ + tcp_set_state(sk, TCP_SYN_SENT); + err = tcp_v4_hash_connect(sk); + if (err) + goto failure; + + err = ip_route_newports(&rt, inet->sport, inet->dport, sk); + if (err) + goto failure; + + /* OK, now commit destination to socket. */ + __sk_dst_set(sk, &rt->u.dst); + tcp_v4_setup_caps(sk, &rt->u.dst); + + if (!tp->write_seq) + tp->write_seq = secure_tcp_sequence_number(inet->saddr, + inet->daddr, + inet->sport, + usin->sin_port); + + inet->id = tp->write_seq ^ jiffies; + + err = tcp_connect(sk); + rt = NULL; + if (err) + goto failure; + + return 0; + +failure: + /* This unhashes the socket and releases the local port, if necessary. */ + tcp_set_state(sk, TCP_CLOSE); + ip_rt_put(rt); + sk->sk_route_caps = 0; + inet->dport = 0; + return err; +} + +static __inline__ int tcp_v4_iif(struct sk_buff *skb) +{ + return ((struct rtable *)skb->dst)->rt_iif; +} + +static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd) +{ + return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); +} + +static struct open_request *tcp_v4_search_req(struct tcp_sock *tp, + struct open_request ***prevp, + __u16 rport, + __u32 raddr, __u32 laddr) +{ + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *req, **prev; + + for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + if (req->rmt_port == rport && + req->af.v4_req.rmt_addr == raddr && + req->af.v4_req.loc_addr == laddr && + TCP_INET_FAMILY(req->class->family)) { + BUG_TRAP(!req->sk); + *prevp = prev; + break; + } + } + + return req; +} + +static void tcp_v4_synq_add(struct sock *sk, struct open_request *req) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_listen_opt *lopt = tp->listen_opt; + u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd); + + req->expires = jiffies + TCP_TIMEOUT_INIT; + req->retrans = 0; + req->sk = NULL; + req->dl_next = lopt->syn_table[h]; + + write_lock(&tp->syn_wait_lock); + lopt->syn_table[h] = req; + write_unlock(&tp->syn_wait_lock); + + tcp_synq_added(sk); +} + + +/* + * This routine does path mtu discovery as defined in RFC1191. + */ +static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, + u32 mtu) +{ + struct dst_entry *dst; + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs + * send out by Linux are always <576bytes so they should go through + * unfragmented). + */ + if (sk->sk_state == TCP_LISTEN) + return; + + /* We don't check in the destentry if pmtu discovery is forbidden + * on this route. We just assume that no packet_to_big packets + * are send back when pmtu discovery is not active. + * There is a small race when the user changes this flag in the + * route, but I think that's acceptable. + */ + if ((dst = __sk_dst_check(sk, 0)) == NULL) + return; + + dst->ops->update_pmtu(dst, mtu); + + /* Something is about to be wrong... Remember soft error + * for the case, if this connection will not able to recover. + */ + if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) + sk->sk_err_soft = EMSGSIZE; + + mtu = dst_mtu(dst); + + if (inet->pmtudisc != IP_PMTUDISC_DONT && + tp->pmtu_cookie > mtu) { + tcp_sync_mss(sk, mtu); + + /* Resend the TCP packet because it's + * clear that the old packet has been + * dropped. This is the new "fast" path mtu + * discovery. + */ + tcp_simple_retransmit(sk); + } /* else let the usual retransmit timer handle it */ +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should + * be closed and the error returned to the user. If err > 0 + * it's just the icmp type << 8 | icmp code. After adjustment + * header points to the first 8 bytes of the tcp header. We need + * to find the appropriate port. + * + * The locking strategy used here is very "optimistic". When + * someone else accesses the socket the ICMP is just dropped + * and for some paths there is no check at all. + * A more general error queue to queue errors for later handling + * is probably better. + * + */ + +void tcp_v4_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); + struct tcp_sock *tp; + struct inet_sock *inet; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct sock *sk; + __u32 seq; + int err; + + if (skb->len < (iph->ihl << 2) + 8) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; + } + + sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, + th->source, tcp_v4_iif(skb)); + if (!sk) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; + } + if (sk->sk_state == TCP_TIME_WAIT) { + tcp_tw_put((struct tcp_tw_bucket *)sk); + return; + } + + bh_lock_sock(sk); + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == TCP_CLOSE) + goto out; + + tp = tcp_sk(sk); + seq = ntohl(th->seq); + if (sk->sk_state != TCP_LISTEN && + !between(seq, tp->snd_una, tp->snd_nxt)) { + NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + switch (type) { + case ICMP_SOURCE_QUENCH: + /* Just silently ignore these. */ + goto out; + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + goto out; + + if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + if (!sock_owned_by_user(sk)) + do_pmtu_discovery(sk, iph, info); + goto out; + } + + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + goto out; + } + + switch (sk->sk_state) { + struct open_request *req, **prev; + case TCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + + req = tcp_v4_search_req(tp, &prev, th->dest, + iph->daddr, iph->saddr); + if (!req) + goto out; + + /* ICMPs are not backlogged, hence we cannot get + an established socket here. + */ + BUG_TRAP(!req->sk); + + if (seq != req->snt_isn) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + /* + * Still in SYN_RECV, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + tcp_synq_drop(sk, req, prev); + goto out; + + case TCP_SYN_SENT: + case TCP_SYN_RECV: /* Cannot happen. + It can f.e. if SYNs crossed. + */ + if (!sock_owned_by_user(sk)) { + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + + sk->sk_error_report(sk); + + tcp_done(sk); + } else { + sk->sk_err_soft = err; + } + goto out; + } + + /* If we've already connected we will keep trying + * until we time out, or the user gives up. + * + * rfc1122 4.2.3.9 allows to consider as hard errors + * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, + * but it is obsoleted by pmtu discovery). + * + * Note, that in modern internet, where routing is unreliable + * and in each dark corner broken firewalls sit, sending random + * errors ordered by their masters even this two messages finally lose + * their original sense (even Linux sends invalid PORT_UNREACHs) + * + * Now we are in compliance with RFCs. + * --ANK (980905) + */ + + inet = inet_sk(sk); + if (!sock_owned_by_user(sk) && inet->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else { /* Only an error on timeout */ + sk->sk_err_soft = err; + } + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* This routine computes an IPv4 TCP checksum. */ +void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb) +{ + struct inet_sock *inet = inet_sk(sk); + + if (skb->ip_summed == CHECKSUM_HW) { + th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0); + skb->csum = offsetof(struct tcphdr, check); + } else { + th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr, + csum_partial((char *)th, + th->doff << 2, + skb->csum)); + } +} + +/* + * This routine will send an RST to the other tcp. + * + * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) + * for reset. + * Answer: if a packet caused RST, it is not for a socket + * existing in our system, if it is matched to a socket, + * it is just duplicate segment or bug in other side's TCP. + * So that we build reply only basing on parameters + * arrived with segment. + * Exception: precedence violation. We do not implement it in any case. + */ + +static void tcp_v4_send_reset(struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th; + struct tcphdr rth; + struct ip_reply_arg arg; + + /* Never send a reset in response to a reset. */ + if (th->rst) + return; + + if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL) + return; + + /* Swap the send and the receive. */ + memset(&rth, 0, sizeof(struct tcphdr)); + rth.dest = th->source; + rth.source = th->dest; + rth.doff = sizeof(struct tcphdr) / 4; + rth.rst = 1; + + if (th->ack) { + rth.seq = th->ack_seq; + } else { + rth.ack = 1; + rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + + skb->len - (th->doff << 2)); + } + + memset(&arg, 0, sizeof arg); + arg.iov[0].iov_base = (unsigned char *)&rth; + arg.iov[0].iov_len = sizeof rth; + arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, + skb->nh.iph->saddr, /*XXX*/ + sizeof(struct tcphdr), IPPROTO_TCP, 0); + arg.csumoffset = offsetof(struct tcphdr, check) / 2; + + ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth); + + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); +} + +/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states + outside socket context is ugly, certainly. What can I do? + */ + +static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, + u32 win, u32 ts) +{ + struct tcphdr *th = skb->h.th; + struct { + struct tcphdr th; + u32 tsopt[3]; + } rep; + struct ip_reply_arg arg; + + memset(&rep.th, 0, sizeof(struct tcphdr)); + memset(&arg, 0, sizeof arg); + + arg.iov[0].iov_base = (unsigned char *)&rep; + arg.iov[0].iov_len = sizeof(rep.th); + if (ts) { + rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + rep.tsopt[1] = htonl(tcp_time_stamp); + rep.tsopt[2] = htonl(ts); + arg.iov[0].iov_len = sizeof(rep); + } + + /* Swap the send and the receive. */ + rep.th.dest = th->source; + rep.th.source = th->dest; + rep.th.doff = arg.iov[0].iov_len / 4; + rep.th.seq = htonl(seq); + rep.th.ack_seq = htonl(ack); + rep.th.ack = 1; + rep.th.window = htons(win); + + arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, + skb->nh.iph->saddr, /*XXX*/ + arg.iov[0].iov_len, IPPROTO_TCP, 0); + arg.csumoffset = offsetof(struct tcphdr, check) / 2; + + ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len); + + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); +} + +static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + + tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, + tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); + + tcp_tw_put(tw); +} + +static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req) +{ + tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd, + req->ts_recent); +} + +static struct dst_entry* tcp_v4_route_req(struct sock *sk, + struct open_request *req) +{ + struct rtable *rt; + struct ip_options *opt = req->af.v4_req.opt; + struct flowi fl = { .oif = sk->sk_bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = ((opt && opt->srr) ? + opt->faddr : + req->af.v4_req.rmt_addr), + .saddr = req->af.v4_req.loc_addr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = IPPROTO_TCP, + .uli_u = { .ports = + { .sport = inet_sk(sk)->sport, + .dport = req->rmt_port } } }; + + if (ip_route_output_flow(&rt, &fl, sk, 0)) { + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + ip_rt_put(rt); + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + return &rt->u.dst; +} + +/* + * Send a SYN-ACK after having received an ACK. + * This still operates on a open_request only, not on a big + * socket. + */ +static int tcp_v4_send_synack(struct sock *sk, struct open_request *req, + struct dst_entry *dst) +{ + int err = -1; + struct sk_buff * skb; + + /* First, grab a route. */ + if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) + goto out; + + skb = tcp_make_synack(sk, dst, req); + + if (skb) { + struct tcphdr *th = skb->h.th; + + th->check = tcp_v4_check(th, skb->len, + req->af.v4_req.loc_addr, + req->af.v4_req.rmt_addr, + csum_partial((char *)th, skb->len, + skb->csum)); + + err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, + req->af.v4_req.rmt_addr, + req->af.v4_req.opt); + if (err == NET_XMIT_CN) + err = 0; + } + +out: + dst_release(dst); + return err; +} + +/* + * IPv4 open_request destructor. + */ +static void tcp_v4_or_free(struct open_request *req) +{ + if (req->af.v4_req.opt) + kfree(req->af.v4_req.opt); +} + +static inline void syn_flood_warning(struct sk_buff *skb) +{ + static unsigned long warntime; + + if (time_after(jiffies, (warntime + HZ * 60))) { + warntime = jiffies; + printk(KERN_INFO + "possible SYN flooding on port %d. Sending cookies.\n", + ntohs(skb->h.th->dest)); + } +} + +/* + * Save and compile IPv4 options into the open_request if needed. + */ +static inline struct ip_options *tcp_v4_save_options(struct sock *sk, + struct sk_buff *skb) +{ + struct ip_options *opt = &(IPCB(skb)->opt); + struct ip_options *dopt = NULL; + + if (opt && opt->optlen) { + int opt_size = optlength(opt); + dopt = kmalloc(opt_size, GFP_ATOMIC); + if (dopt) { + if (ip_options_echo(dopt, skb)) { + kfree(dopt); + dopt = NULL; + } + } + } + return dopt; +} + +/* + * Maximum number of SYN_RECV sockets in queue per LISTEN socket. + * One SYN_RECV socket costs about 80bytes on a 32bit machine. + * It would be better to replace it with a global counter for all sockets + * but then some measure against one socket starving all other sockets + * would be needed. + * + * It was 128 by default. Experiments with real servers show, that + * it is absolutely not enough even at 100conn/sec. 256 cures most + * of problems. This value is adjusted to 128 for very small machines + * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). + * Further increasing requires to change hash table size. + */ +int sysctl_max_syn_backlog = 256; + +struct or_calltable or_ipv4 = { + .family = PF_INET, + .rtx_syn_ack = tcp_v4_send_synack, + .send_ack = tcp_v4_or_send_ack, + .destructor = tcp_v4_or_free, + .send_reset = tcp_v4_send_reset, +}; + +int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_options_received tmp_opt; + struct open_request *req; + __u32 saddr = skb->nh.iph->saddr; + __u32 daddr = skb->nh.iph->daddr; + __u32 isn = TCP_SKB_CB(skb)->when; + struct dst_entry *dst = NULL; +#ifdef CONFIG_SYN_COOKIES + int want_cookie = 0; +#else +#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */ +#endif + + /* Never answer to SYNs send to broadcast or multicast */ + if (((struct rtable *)skb->dst)->rt_flags & + (RTCF_BROADCAST | RTCF_MULTICAST)) + goto drop; + + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if (tcp_synq_is_full(sk) && !isn) { +#ifdef CONFIG_SYN_COOKIES + if (sysctl_tcp_syncookies) { + want_cookie = 1; + } else +#endif + goto drop; + } + + /* Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + goto drop; + + req = tcp_openreq_alloc(); + if (!req) + goto drop; + + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = 536; + tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss; + + tcp_parse_options(skb, &tmp_opt, 0); + + if (want_cookie) { + tcp_clear_options(&tmp_opt); + tmp_opt.saw_tstamp = 0; + } + + if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) { + /* Some OSes (unknown ones, but I see them on web server, which + * contains information interesting only for windows' + * users) do not send their stamp in SYN. It is easy case. + * We simply do not advertise TS support. + */ + tmp_opt.saw_tstamp = 0; + tmp_opt.tstamp_ok = 0; + } + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + + tcp_openreq_init(req, &tmp_opt, skb); + + req->af.v4_req.loc_addr = daddr; + req->af.v4_req.rmt_addr = saddr; + req->af.v4_req.opt = tcp_v4_save_options(sk, skb); + req->class = &or_ipv4; + if (!want_cookie) + TCP_ECN_create_request(req, skb->h.th); + + if (want_cookie) { +#ifdef CONFIG_SYN_COOKIES + syn_flood_warning(skb); +#endif + isn = cookie_v4_init_sequence(sk, skb, &req->mss); + } else if (!isn) { + struct inet_peer *peer = NULL; + + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tmp_opt.saw_tstamp && + sysctl_tcp_tw_recycle && + (dst = tcp_v4_route_req(sk, req)) != NULL && + (peer = rt_get_peer((struct rtable *)dst)) != NULL && + peer->v4daddr == saddr) { + if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && + (s32)(peer->tcp_ts - req->ts_recent) > + TCP_PAWS_WINDOW) { + NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED); + dst_release(dst); + goto drop_and_free; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - tcp_synq_len(sk) < + (sysctl_max_syn_backlog >> 2)) && + (!peer || !peer->tcp_ts_stamp) && + (!dst || !dst_metric(dst, RTAX_RTT))) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. + */ + NETDEBUG(if (net_ratelimit()) \ + printk(KERN_DEBUG "TCP: drop open " + "request from %u.%u." + "%u.%u/%u\n", \ + NIPQUAD(saddr), + ntohs(skb->h.th->source))); + dst_release(dst); + goto drop_and_free; + } + + isn = tcp_v4_init_sequence(sk, skb); + } + req->snt_isn = isn; + + if (tcp_v4_send_synack(sk, req, dst)) + goto drop_and_free; + + if (want_cookie) { + tcp_openreq_free(req); + } else { + tcp_v4_synq_add(sk, req); + } + return 0; + +drop_and_free: + tcp_openreq_free(req); +drop: + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + return 0; +} + + +/* + * The three way handshake has completed - we got a valid synack - + * now create the new socket. + */ +struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct open_request *req, + struct dst_entry *dst) +{ + struct inet_sock *newinet; + struct tcp_sock *newtp; + struct sock *newsk; + + if (sk_acceptq_is_full(sk)) + goto exit_overflow; + + if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) + goto exit; + + newsk = tcp_create_openreq_child(sk, req, skb); + if (!newsk) + goto exit; + + newsk->sk_dst_cache = dst; + tcp_v4_setup_caps(newsk, dst); + + newtp = tcp_sk(newsk); + newinet = inet_sk(newsk); + newinet->daddr = req->af.v4_req.rmt_addr; + newinet->rcv_saddr = req->af.v4_req.loc_addr; + newinet->saddr = req->af.v4_req.loc_addr; + newinet->opt = req->af.v4_req.opt; + req->af.v4_req.opt = NULL; + newinet->mc_index = tcp_v4_iif(skb); + newinet->mc_ttl = skb->nh.iph->ttl; + newtp->ext_header_len = 0; + if (newinet->opt) + newtp->ext_header_len = newinet->opt->optlen; + newinet->id = newtp->write_seq ^ jiffies; + + tcp_sync_mss(newsk, dst_mtu(dst)); + newtp->advmss = dst_metric(dst, RTAX_ADVMSS); + tcp_initialize_rcv_mss(newsk); + + __tcp_v4_hash(newsk, 0); + __tcp_inherit_port(sk, newsk); + + return newsk; + +exit_overflow: + NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); +exit: + NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + dst_release(dst); + return NULL; +} + +static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th; + struct iphdr *iph = skb->nh.iph; + struct tcp_sock *tp = tcp_sk(sk); + struct sock *nsk; + struct open_request **prev; + /* Find possible connection requests. */ + struct open_request *req = tcp_v4_search_req(tp, &prev, th->source, + iph->saddr, iph->daddr); + if (req) + return tcp_check_req(sk, skb, req, prev); + + nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, + th->source, + skb->nh.iph->daddr, + ntohs(th->dest), + tcp_v4_iif(skb)); + + if (nsk) { + if (nsk->sk_state != TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + tcp_tw_put((struct tcp_tw_bucket *)nsk); + return NULL; + } + +#ifdef CONFIG_SYN_COOKIES + if (!th->rst && !th->syn && th->ack) + sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); +#endif + return sk; +} + +static int tcp_v4_checksum_init(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, + skb->nh.iph->daddr, skb->csum)) + return 0; + + NETDEBUG(if (net_ratelimit()) + printk(KERN_DEBUG "hw tcp v4 csum failed\n")); + skb->ip_summed = CHECKSUM_NONE; + } + if (skb->len <= 76) { + if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, + skb->nh.iph->daddr, + skb_checksum(skb, 0, skb->len, 0))) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + skb->csum = ~tcp_v4_check(skb->h.th, skb->len, + skb->nh.iph->saddr, + skb->nh.iph->daddr, 0); + } + return 0; +} + + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ +int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + TCP_CHECK_TIMER(sk); + if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); + return 0; + } + + if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb)) + goto csum_err; + + if (sk->sk_state == TCP_LISTEN) { + struct sock *nsk = tcp_v4_hnd_req(sk, skb); + if (!nsk) + goto discard; + + if (nsk != sk) { + if (tcp_child_process(sk, nsk, skb)) + goto reset; + return 0; + } + } + + TCP_CHECK_TIMER(sk); + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); + return 0; + +reset: + tcp_v4_send_reset(skb); +discard: + kfree_skb(skb); + /* Be careful here. If this function gets more complicated and + * gcc suffers from register pressure on the x86, sk (in %ebx) + * might be destroyed here. This current version compiles correctly, + * but you have been warned. + */ + return 0; + +csum_err: + TCP_INC_STATS_BH(TCP_MIB_INERRS); + goto discard; +} + +/* + * From tcp_input.c + */ + +int tcp_v4_rcv(struct sk_buff *skb) +{ + struct tcphdr *th; + struct sock *sk; + int ret; + + if (skb->pkt_type != PACKET_HOST) + goto discard_it; + + /* Count it even if it's bad */ + TCP_INC_STATS_BH(TCP_MIB_INSEGS); + + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) + goto discard_it; + + th = skb->h.th; + + if (th->doff < sizeof(struct tcphdr) / 4) + goto bad_packet; + if (!pskb_may_pull(skb, th->doff * 4)) + goto discard_it; + + /* An explanation is required here, I think. + * Packet length and doff are validated by header prediction, + * provided case of th->doff==0 is elimineted. + * So, we defer the checks. */ + if ((skb->ip_summed != CHECKSUM_UNNECESSARY && + tcp_v4_checksum_init(skb) < 0)) + goto bad_packet; + + th = skb->h.th; + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff * 4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; + TCP_SKB_CB(skb)->sacked = 0; + + sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, ntohs(th->dest), + tcp_v4_iif(skb)); + + if (!sk) + goto no_tcp_socket; + +process: + if (sk->sk_state == TCP_TIME_WAIT) + goto do_time_wait; + + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + + if (sk_filter(sk, skb, 0)) + goto discard_and_relse; + + skb->dev = NULL; + + bh_lock_sock(sk); + ret = 0; + if (!sock_owned_by_user(sk)) { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v4_do_rcv(sk, skb); + } else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + sock_put(sk); + + return ret; + +no_tcp_socket: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + + if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { +bad_packet: + TCP_INC_STATS_BH(TCP_MIB_INERRS); + } else { + tcp_v4_send_reset(skb); + } + +discard_it: + /* Discard frame. */ + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + tcp_tw_put((struct tcp_tw_bucket *) sk); + goto discard_it; + } + + if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { + TCP_INC_STATS_BH(TCP_MIB_INERRS); + tcp_tw_put((struct tcp_tw_bucket *) sk); + goto discard_it; + } + switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, + skb, th, skb->len)) { + case TCP_TW_SYN: { + struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, + ntohs(th->dest), + tcp_v4_iif(skb)); + if (sk2) { + tcp_tw_deschedule((struct tcp_tw_bucket *)sk); + tcp_tw_put((struct tcp_tw_bucket *)sk); + sk = sk2; + goto process; + } + /* Fall through to ACK */ + } + case TCP_TW_ACK: + tcp_v4_timewait_ack(sk, skb); + break; + case TCP_TW_RST: + goto no_tcp_socket; + case TCP_TW_SUCCESS:; + } + goto discard_it; +} + +/* With per-bucket locks this operation is not-atomic, so that + * this version is not worse. + */ +static void __tcp_v4_rehash(struct sock *sk) +{ + sk->sk_prot->unhash(sk); + sk->sk_prot->hash(sk); +} + +static int tcp_v4_reselect_saddr(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + int err; + struct rtable *rt; + __u32 old_saddr = inet->saddr; + __u32 new_saddr; + __u32 daddr = inet->daddr; + + if (inet->opt && inet->opt->srr) + daddr = inet->opt->faddr; + + /* Query new route. */ + err = ip_route_connect(&rt, daddr, 0, + RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if, + IPPROTO_TCP, + inet->sport, inet->dport, sk); + if (err) + return err; + + __sk_dst_set(sk, &rt->u.dst); + tcp_v4_setup_caps(sk, &rt->u.dst); + + new_saddr = rt->rt_src; + + if (new_saddr == old_saddr) + return 0; + + if (sysctl_ip_dynaddr > 1) { + printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->" + "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", + NIPQUAD(old_saddr), + NIPQUAD(new_saddr)); + } + + inet->saddr = new_saddr; + inet->rcv_saddr = new_saddr; + + /* XXX The only one ugly spot where we need to + * XXX really change the sockets identity after + * XXX it has entered the hashes. -DaveM + * + * Besides that, it does not check for connection + * uniqueness. Wait for troubles. + */ + __tcp_v4_rehash(sk); + return 0; +} + +int tcp_v4_rebuild_header(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); + u32 daddr; + int err; + + /* Route is OK, nothing to do. */ + if (rt) + return 0; + + /* Reroute. */ + daddr = inet->daddr; + if (inet->opt && inet->opt->srr) + daddr = inet->opt->faddr; + + { + struct flowi fl = { .oif = sk->sk_bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = inet->saddr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = IPPROTO_TCP, + .uli_u = { .ports = + { .sport = inet->sport, + .dport = inet->dport } } }; + + err = ip_route_output_flow(&rt, &fl, sk, 0); + } + if (!err) { + __sk_dst_set(sk, &rt->u.dst); + tcp_v4_setup_caps(sk, &rt->u.dst); + return 0; + } + + /* Routing failed... */ + sk->sk_route_caps = 0; + + if (!sysctl_ip_dynaddr || + sk->sk_state != TCP_SYN_SENT || + (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || + (err = tcp_v4_reselect_saddr(sk)) != 0) + sk->sk_err_soft = -err; + + return err; +} + +static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; + struct inet_sock *inet = inet_sk(sk); + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = inet->daddr; + sin->sin_port = inet->dport; +} + +/* VJ's idea. Save last timestamp seen from this destination + * and hold it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter synchronized + * state. + */ + +int tcp_v4_remember_stamp(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct rtable *rt = (struct rtable *)__sk_dst_get(sk); + struct inet_peer *peer = NULL; + int release_it = 0; + + if (!rt || rt->rt_dst != inet->daddr) { + peer = inet_getpeer(inet->daddr, 1); + release_it = 1; + } else { + if (!rt->peer) + rt_bind_peer(rt, 1); + peer = rt->peer; + } + + if (peer) { + if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || + (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) { + peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp; + peer->tcp_ts = tp->rx_opt.ts_recent; + } + if (release_it) + inet_putpeer(peer); + return 1; + } + + return 0; +} + +int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) +{ + struct inet_peer *peer = NULL; + + peer = inet_getpeer(tw->tw_daddr, 1); + + if (peer) { + if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 || + (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) { + peer->tcp_ts_stamp = tw->tw_ts_recent_stamp; + peer->tcp_ts = tw->tw_ts_recent; + } + inet_putpeer(peer); + return 1; + } + + return 0; +} + +struct tcp_func ipv4_specific = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = tcp_v4_rebuild_header, + .conn_request = tcp_v4_conn_request, + .syn_recv_sock = tcp_v4_syn_recv_sock, + .remember_stamp = tcp_v4_remember_stamp, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .addr2sockaddr = v4_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in), +}; + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +static int tcp_v4_init_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + skb_queue_head_init(&tp->out_of_order_queue); + tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); + + tp->rto = TCP_TIMEOUT_INIT; + tp->mdev = TCP_TIMEOUT_INIT; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + tp->snd_cwnd = 2; + + /* See draft-stevens-tcpca-spec-01 for discussion of the + * initialization of these values. + */ + tp->snd_ssthresh = 0x7fffffff; /* Infinity */ + tp->snd_cwnd_clamp = ~0; + tp->mss_cache_std = tp->mss_cache = 536; + + tp->reordering = sysctl_tcp_reordering; + + sk->sk_state = TCP_CLOSE; + + sk->sk_write_space = sk_stream_write_space; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + + tp->af_specific = &ipv4_specific; + + sk->sk_sndbuf = sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + + atomic_inc(&tcp_sockets_allocated); + + return 0; +} + +int tcp_v4_destroy_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_clear_xmit_timers(sk); + + /* Cleanup up the write buffer. */ + sk_stream_writequeue_purge(sk); + + /* Cleans up our, hopefully empty, out_of_order_queue. */ + __skb_queue_purge(&tp->out_of_order_queue); + + /* Clean prequeue, it must be empty really */ + __skb_queue_purge(&tp->ucopy.prequeue); + + /* Clean up a referenced TCP bind bucket. */ + if (tp->bind_hash) + tcp_put_port(sk); + + /* + * If sendmsg cached page exists, toss it. + */ + if (sk->sk_sndmsg_page) { + __free_page(sk->sk_sndmsg_page); + sk->sk_sndmsg_page = NULL; + } + + atomic_dec(&tcp_sockets_allocated); + + return 0; +} + +EXPORT_SYMBOL(tcp_v4_destroy_sock); + +#ifdef CONFIG_PROC_FS +/* Proc filesystem TCP sock list dumping. */ + +static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head) +{ + return hlist_empty(head) ? NULL : + list_entry(head->first, struct tcp_tw_bucket, tw_node); +} + +static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) +{ + return tw->tw_node.next ? + hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; +} + +static void *listening_get_next(struct seq_file *seq, void *cur) +{ + struct tcp_sock *tp; + struct hlist_node *node; + struct sock *sk = cur; + struct tcp_iter_state* st = seq->private; + + if (!sk) { + st->bucket = 0; + sk = sk_head(&tcp_listening_hash[0]); + goto get_sk; + } + + ++st->num; + + if (st->state == TCP_SEQ_STATE_OPENREQ) { + struct open_request *req = cur; + + tp = tcp_sk(st->syn_wait_sk); + req = req->dl_next; + while (1) { + while (req) { + if (req->class->family == st->family) { + cur = req; + goto out; + } + req = req->dl_next; + } + if (++st->sbucket >= TCP_SYNQ_HSIZE) + break; +get_req: + req = tp->listen_opt->syn_table[st->sbucket]; + } + sk = sk_next(st->syn_wait_sk); + st->state = TCP_SEQ_STATE_LISTENING; + read_unlock_bh(&tp->syn_wait_lock); + } else { + tp = tcp_sk(sk); + read_lock_bh(&tp->syn_wait_lock); + if (tp->listen_opt && tp->listen_opt->qlen) + goto start_req; + read_unlock_bh(&tp->syn_wait_lock); + sk = sk_next(sk); + } +get_sk: + sk_for_each_from(sk, node) { + if (sk->sk_family == st->family) { + cur = sk; + goto out; + } + tp = tcp_sk(sk); + read_lock_bh(&tp->syn_wait_lock); + if (tp->listen_opt && tp->listen_opt->qlen) { +start_req: + st->uid = sock_i_uid(sk); + st->syn_wait_sk = sk; + st->state = TCP_SEQ_STATE_OPENREQ; + st->sbucket = 0; + goto get_req; + } + read_unlock_bh(&tp->syn_wait_lock); + } + if (++st->bucket < TCP_LHTABLE_SIZE) { + sk = sk_head(&tcp_listening_hash[st->bucket]); + goto get_sk; + } + cur = NULL; +out: + return cur; +} + +static void *listening_get_idx(struct seq_file *seq, loff_t *pos) +{ + void *rc = listening_get_next(seq, NULL); + + while (rc && *pos) { + rc = listening_get_next(seq, rc); + --*pos; + } + return rc; +} + +static void *established_get_first(struct seq_file *seq) +{ + struct tcp_iter_state* st = seq->private; + void *rc = NULL; + + for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) { + struct sock *sk; + struct hlist_node *node; + struct tcp_tw_bucket *tw; + + /* We can reschedule _before_ having picked the target: */ + cond_resched_softirq(); + + read_lock(&tcp_ehash[st->bucket].lock); + sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) { + if (sk->sk_family != st->family) { + continue; + } + rc = sk; + goto out; + } + st->state = TCP_SEQ_STATE_TIME_WAIT; + tw_for_each(tw, node, + &tcp_ehash[st->bucket + tcp_ehash_size].chain) { + if (tw->tw_family != st->family) { + continue; + } + rc = tw; + goto out; + } + read_unlock(&tcp_ehash[st->bucket].lock); + st->state = TCP_SEQ_STATE_ESTABLISHED; + } +out: + return rc; +} + +static void *established_get_next(struct seq_file *seq, void *cur) +{ + struct sock *sk = cur; + struct tcp_tw_bucket *tw; + struct hlist_node *node; + struct tcp_iter_state* st = seq->private; + + ++st->num; + + if (st->state == TCP_SEQ_STATE_TIME_WAIT) { + tw = cur; + tw = tw_next(tw); +get_tw: + while (tw && tw->tw_family != st->family) { + tw = tw_next(tw); + } + if (tw) { + cur = tw; + goto out; + } + read_unlock(&tcp_ehash[st->bucket].lock); + st->state = TCP_SEQ_STATE_ESTABLISHED; + + /* We can reschedule between buckets: */ + cond_resched_softirq(); + + if (++st->bucket < tcp_ehash_size) { + read_lock(&tcp_ehash[st->bucket].lock); + sk = sk_head(&tcp_ehash[st->bucket].chain); + } else { + cur = NULL; + goto out; + } + } else + sk = sk_next(sk); + + sk_for_each_from(sk, node) { + if (sk->sk_family == st->family) + goto found; + } + + st->state = TCP_SEQ_STATE_TIME_WAIT; + tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain); + goto get_tw; +found: + cur = sk; +out: + return cur; +} + +static void *established_get_idx(struct seq_file *seq, loff_t pos) +{ + void *rc = established_get_first(seq); + + while (rc && pos) { + rc = established_get_next(seq, rc); + --pos; + } + return rc; +} + +static void *tcp_get_idx(struct seq_file *seq, loff_t pos) +{ + void *rc; + struct tcp_iter_state* st = seq->private; + + tcp_listen_lock(); + st->state = TCP_SEQ_STATE_LISTENING; + rc = listening_get_idx(seq, &pos); + + if (!rc) { + tcp_listen_unlock(); + local_bh_disable(); + st->state = TCP_SEQ_STATE_ESTABLISHED; + rc = established_get_idx(seq, pos); + } + + return rc; +} + +static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct tcp_iter_state* st = seq->private; + st->state = TCP_SEQ_STATE_LISTENING; + st->num = 0; + return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + void *rc = NULL; + struct tcp_iter_state* st; + + if (v == SEQ_START_TOKEN) { + rc = tcp_get_idx(seq, 0); + goto out; + } + st = seq->private; + + switch (st->state) { + case TCP_SEQ_STATE_OPENREQ: + case TCP_SEQ_STATE_LISTENING: + rc = listening_get_next(seq, v); + if (!rc) { + tcp_listen_unlock(); + local_bh_disable(); + st->state = TCP_SEQ_STATE_ESTABLISHED; + rc = established_get_first(seq); + } + break; + case TCP_SEQ_STATE_ESTABLISHED: + case TCP_SEQ_STATE_TIME_WAIT: + rc = established_get_next(seq, v); + break; + } +out: + ++*pos; + return rc; +} + +static void tcp_seq_stop(struct seq_file *seq, void *v) +{ + struct tcp_iter_state* st = seq->private; + + switch (st->state) { + case TCP_SEQ_STATE_OPENREQ: + if (v) { + struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); + read_unlock_bh(&tp->syn_wait_lock); + } + case TCP_SEQ_STATE_LISTENING: + if (v != SEQ_START_TOKEN) + tcp_listen_unlock(); + break; + case TCP_SEQ_STATE_TIME_WAIT: + case TCP_SEQ_STATE_ESTABLISHED: + if (v) + read_unlock(&tcp_ehash[st->bucket].lock); + local_bh_enable(); + break; + } +} + +static int tcp_seq_open(struct inode *inode, struct file *file) +{ + struct tcp_seq_afinfo *afinfo = PDE(inode)->data; + struct seq_file *seq; + struct tcp_iter_state *s; + int rc; + + if (unlikely(afinfo == NULL)) + return -EINVAL; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + memset(s, 0, sizeof(*s)); + s->family = afinfo->family; + s->seq_ops.start = tcp_seq_start; + s->seq_ops.next = tcp_seq_next; + s->seq_ops.show = afinfo->seq_show; + s->seq_ops.stop = tcp_seq_stop; + + rc = seq_open(file, &s->seq_ops); + if (rc) + goto out_kfree; + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +int tcp_proc_register(struct tcp_seq_afinfo *afinfo) +{ + int rc = 0; + struct proc_dir_entry *p; + + if (!afinfo) + return -EINVAL; + afinfo->seq_fops->owner = afinfo->owner; + afinfo->seq_fops->open = tcp_seq_open; + afinfo->seq_fops->read = seq_read; + afinfo->seq_fops->llseek = seq_lseek; + afinfo->seq_fops->release = seq_release_private; + + p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); + if (p) + p->data = afinfo; + else + rc = -ENOMEM; + return rc; +} + +void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo) +{ + if (!afinfo) + return; + proc_net_remove(afinfo->name); + memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); +} + +static void get_openreq4(struct sock *sk, struct open_request *req, + char *tmpbuf, int i, int uid) +{ + int ttd = req->expires - jiffies; + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p", + i, + req->af.v4_req.loc_addr, + ntohs(inet_sk(sk)->sport), + req->af.v4_req.rmt_addr, + ntohs(req->rmt_port), + TCP_SYN_RECV, + 0, 0, /* could print option size, but that is af dependent. */ + 1, /* timers active (only the expire timer) */ + jiffies_to_clock_t(ttd), + req->retrans, + uid, + 0, /* non standard timer */ + 0, /* open_requests have no inode */ + atomic_read(&sk->sk_refcnt), + req); +} + +static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) +{ + int timer_active; + unsigned long timer_expires; + struct tcp_sock *tp = tcp_sk(sp); + struct inet_sock *inet = inet_sk(sp); + unsigned int dest = inet->daddr; + unsigned int src = inet->rcv_saddr; + __u16 destp = ntohs(inet->dport); + __u16 srcp = ntohs(inet->sport); + + if (tp->pending == TCP_TIME_RETRANS) { + timer_active = 1; + timer_expires = tp->timeout; + } else if (tp->pending == TCP_TIME_PROBE0) { + timer_active = 4; + timer_expires = tp->timeout; + } else if (timer_pending(&sp->sk_timer)) { + timer_active = 2; + timer_expires = sp->sk_timer.expires; + } else { + timer_active = 0; + timer_expires = jiffies; + } + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " + "%08X %5d %8d %lu %d %p %u %u %u %u %d", + i, src, srcp, dest, destp, sp->sk_state, + tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq, + timer_active, + jiffies_to_clock_t(timer_expires - jiffies), + tp->retransmits, + sock_i_uid(sp), + tp->probes_out, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong, + tp->snd_cwnd, + tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); +} + +static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) +{ + unsigned int dest, src; + __u16 destp, srcp; + int ttd = tw->tw_ttd - jiffies; + + if (ttd < 0) + ttd = 0; + + dest = tw->tw_daddr; + src = tw->tw_rcv_saddr; + destp = ntohs(tw->tw_dport); + srcp = ntohs(tw->tw_sport); + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p", + i, src, srcp, dest, destp, tw->tw_substate, 0, 0, + 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, + atomic_read(&tw->tw_refcnt), tw); +} + +#define TMPSZ 150 + +static int tcp4_seq_show(struct seq_file *seq, void *v) +{ + struct tcp_iter_state* st; + char tmpbuf[TMPSZ + 1]; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "%-*s\n", TMPSZ - 1, + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout " + "inode"); + goto out; + } + st = seq->private; + + switch (st->state) { + case TCP_SEQ_STATE_LISTENING: + case TCP_SEQ_STATE_ESTABLISHED: + get_tcp4_sock(v, tmpbuf, st->num); + break; + case TCP_SEQ_STATE_OPENREQ: + get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid); + break; + case TCP_SEQ_STATE_TIME_WAIT: + get_timewait4_sock(v, tmpbuf, st->num); + break; + } + seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf); +out: + return 0; +} + +static struct file_operations tcp4_seq_fops; +static struct tcp_seq_afinfo tcp4_seq_afinfo = { + .owner = THIS_MODULE, + .name = "tcp", + .family = AF_INET, + .seq_show = tcp4_seq_show, + .seq_fops = &tcp4_seq_fops, +}; + +int __init tcp4_proc_init(void) +{ + return tcp_proc_register(&tcp4_seq_afinfo); +} + +void tcp4_proc_exit(void) +{ + tcp_proc_unregister(&tcp4_seq_afinfo); +} +#endif /* CONFIG_PROC_FS */ + +struct proto tcp_prot = { + .name = "TCP", + .owner = THIS_MODULE, + .close = tcp_close, + .connect = tcp_v4_connect, + .disconnect = tcp_disconnect, + .accept = tcp_accept, + .ioctl = tcp_ioctl, + .init = tcp_v4_init_sock, + .destroy = tcp_v4_destroy_sock, + .shutdown = tcp_shutdown, + .setsockopt = tcp_setsockopt, + .getsockopt = tcp_getsockopt, + .sendmsg = tcp_sendmsg, + .recvmsg = tcp_recvmsg, + .backlog_rcv = tcp_v4_do_rcv, + .hash = tcp_v4_hash, + .unhash = tcp_unhash, + .get_port = tcp_v4_get_port, + .enter_memory_pressure = tcp_enter_memory_pressure, + .sockets_allocated = &tcp_sockets_allocated, + .memory_allocated = &tcp_memory_allocated, + .memory_pressure = &tcp_memory_pressure, + .sysctl_mem = sysctl_tcp_mem, + .sysctl_wmem = sysctl_tcp_wmem, + .sysctl_rmem = sysctl_tcp_rmem, + .max_header = MAX_TCP_HEADER, + .obj_size = sizeof(struct tcp_sock), +}; + + + +void __init tcp_v4_init(struct net_proto_family *ops) +{ + int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket); + if (err < 0) + panic("Failed to create the TCP control socket.\n"); + tcp_socket->sk->sk_allocation = GFP_ATOMIC; + inet_sk(tcp_socket->sk)->uc_ttl = -1; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this socket to see incoming + * packets. + */ + tcp_socket->sk->sk_prot->unhash(tcp_socket->sk); +} + +EXPORT_SYMBOL(ipv4_specific); +EXPORT_SYMBOL(tcp_bind_hash); +EXPORT_SYMBOL(tcp_bucket_create); +EXPORT_SYMBOL(tcp_hashinfo); +EXPORT_SYMBOL(tcp_inherit_port); +EXPORT_SYMBOL(tcp_listen_wlock); +EXPORT_SYMBOL(tcp_port_rover); +EXPORT_SYMBOL(tcp_prot); +EXPORT_SYMBOL(tcp_put_port); +EXPORT_SYMBOL(tcp_unhash); +EXPORT_SYMBOL(tcp_v4_conn_request); +EXPORT_SYMBOL(tcp_v4_connect); +EXPORT_SYMBOL(tcp_v4_do_rcv); +EXPORT_SYMBOL(tcp_v4_rebuild_header); +EXPORT_SYMBOL(tcp_v4_remember_stamp); +EXPORT_SYMBOL(tcp_v4_send_check); +EXPORT_SYMBOL(tcp_v4_syn_recv_sock); + +#ifdef CONFIG_PROC_FS +EXPORT_SYMBOL(tcp_proc_register); +EXPORT_SYMBOL(tcp_proc_unregister); +#endif +EXPORT_SYMBOL(sysctl_local_port_range); +EXPORT_SYMBOL(sysctl_max_syn_backlog); +EXPORT_SYMBOL(sysctl_tcp_low_latency); +EXPORT_SYMBOL(sysctl_tcp_tw_reuse); + diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c new file mode 100644 index 000000000000..fd70509f0d53 --- /dev/null +++ b/net/ipv4/tcp_minisocks.c @@ -0,0 +1,1077 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sysctl.h> +#include <linux/workqueue.h> +#include <net/tcp.h> +#include <net/inet_common.h> +#include <net/xfrm.h> + +#ifdef CONFIG_SYSCTL +#define SYNC_INIT 0 /* let the user enable it */ +#else +#define SYNC_INIT 1 +#endif + +int sysctl_tcp_tw_recycle; +int sysctl_tcp_max_tw_buckets = NR_FILE*2; + +int sysctl_tcp_syncookies = SYNC_INIT; +int sysctl_tcp_abort_on_overflow; + +static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo); + +static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +{ + if (seq == s_win) + return 1; + if (after(end_seq, s_win) && before(seq, e_win)) + return 1; + return (seq == e_win && seq == end_seq); +} + +/* New-style handling of TIME_WAIT sockets. */ + +int tcp_tw_count; + + +/* Must be called with locally disabled BHs. */ +static void tcp_timewait_kill(struct tcp_tw_bucket *tw) +{ + struct tcp_ehash_bucket *ehead; + struct tcp_bind_hashbucket *bhead; + struct tcp_bind_bucket *tb; + + /* Unlink from established hashes. */ + ehead = &tcp_ehash[tw->tw_hashent]; + write_lock(&ehead->lock); + if (hlist_unhashed(&tw->tw_node)) { + write_unlock(&ehead->lock); + return; + } + __hlist_del(&tw->tw_node); + sk_node_init(&tw->tw_node); + write_unlock(&ehead->lock); + + /* Disassociate with bind bucket. */ + bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)]; + spin_lock(&bhead->lock); + tb = tw->tw_tb; + __hlist_del(&tw->tw_bind_node); + tw->tw_tb = NULL; + tcp_bucket_destroy(tb); + spin_unlock(&bhead->lock); + +#ifdef INET_REFCNT_DEBUG + if (atomic_read(&tw->tw_refcnt) != 1) { + printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, + atomic_read(&tw->tw_refcnt)); + } +#endif + tcp_tw_put(tw); +} + +/* + * * Main purpose of TIME-WAIT state is to close connection gracefully, + * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN + * (and, probably, tail of data) and one or more our ACKs are lost. + * * What is TIME-WAIT timeout? It is associated with maximal packet + * lifetime in the internet, which results in wrong conclusion, that + * it is set to catch "old duplicate segments" wandering out of their path. + * It is not quite correct. This timeout is calculated so that it exceeds + * maximal retransmission timeout enough to allow to lose one (or more) + * segments sent by peer and our ACKs. This time may be calculated from RTO. + * * When TIME-WAIT socket receives RST, it means that another end + * finally closed and we are allowed to kill TIME-WAIT too. + * * Second purpose of TIME-WAIT is catching old duplicate segments. + * Well, certainly it is pure paranoia, but if we load TIME-WAIT + * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. + * * If we invented some more clever way to catch duplicates + * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. + * + * The algorithm below is based on FORMAL INTERPRETATION of RFCs. + * When you compare it to RFCs, please, read section SEGMENT ARRIVES + * from the very beginning. + * + * NOTE. With recycling (and later with fin-wait-2) TW bucket + * is _not_ stateless. It means, that strictly speaking we must + * spinlock it. I do not want! Well, probability of misbehaviour + * is ridiculously low and, seems, we could use some mb() tricks + * to avoid misread sequence numbers, states etc. --ANK + */ +enum tcp_tw_status +tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_options_received tmp_opt; + int paws_reject = 0; + + tmp_opt.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) { + tcp_parse_options(skb, &tmp_opt, 0); + + if (tmp_opt.saw_tstamp) { + tmp_opt.ts_recent = tw->tw_ts_recent; + tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + paws_reject = tcp_paws_check(&tmp_opt, th->rst); + } + } + + if (tw->tw_substate == TCP_FIN_WAIT2) { + /* Just repeat all the checks of tcp_rcv_state_process() */ + + /* Out of window, send ACK */ + if (paws_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tw->tw_rcv_nxt, + tw->tw_rcv_nxt + tw->tw_rcv_wnd)) + return TCP_TW_ACK; + + if (th->rst) + goto kill; + + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt)) + goto kill_with_rst; + + /* Dup ACK? */ + if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) || + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + + /* New data or FIN. If new data arrive after half-duplex close, + * reset. + */ + if (!th->fin || + TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) { +kill_with_rst: + tcp_tw_deschedule(tw); + tcp_tw_put(tw); + return TCP_TW_RST; + } + + /* FIN arrived, enter true time-wait state. */ + tw->tw_substate = TCP_TIME_WAIT; + tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (tmp_opt.saw_tstamp) { + tw->tw_ts_recent_stamp = xtime.tv_sec; + tw->tw_ts_recent = tmp_opt.rcv_tsval; + } + + /* I am shamed, but failed to make it more elegant. + * Yes, it is direct reference to IP, which is impossible + * to generalize to IPv6. Taking into account that IPv6 + * do not undertsnad recycling in any case, it not + * a big problem in practice. --ANK */ + if (tw->tw_family == AF_INET && + sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp && + tcp_v4_tw_remember_stamp(tw)) + tcp_tw_schedule(tw, tw->tw_timeout); + else + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + return TCP_TW_ACK; + } + + /* + * Now real TIME-WAIT state. + * + * RFC 1122: + * "When a connection is [...] on TIME-WAIT state [...] + * [a TCP] MAY accept a new SYN from the remote TCP to + * reopen the connection directly, if it: + * + * (1) assigns its initial sequence number for the new + * connection to be larger than the largest sequence + * number it used on the previous connection incarnation, + * and + * + * (2) returns to TIME-WAIT state if the SYN turns out + * to be an old duplicate". + */ + + if (!paws_reject && + (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt && + (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { + /* In window segment, it may be only reset or bare ack. */ + + if (th->rst) { + /* This is TIME_WAIT assasination, in two flavors. + * Oh well... nobody has a sufficient solution to this + * protocol bug yet. + */ + if (sysctl_tcp_rfc1337 == 0) { +kill: + tcp_tw_deschedule(tw); + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + } + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + + if (tmp_opt.saw_tstamp) { + tw->tw_ts_recent = tmp_opt.rcv_tsval; + tw->tw_ts_recent_stamp = xtime.tv_sec; + } + + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + + /* Out of window segment. + + All the segments are ACKed immediately. + + The only exception is new SYN. We accept it, if it is + not old duplicate and we are not in danger to be killed + by delayed old duplicates. RFC check is that it has + newer sequence number works at rates <40Mbit/sec. + However, if paws works, it is reliable AND even more, + we even may relax silly seq space cutoff. + + RED-PEN: we violate main RFC requirement, if this SYN will appear + old duplicate (i.e. we receive RST in reply to SYN-ACK), + we must return socket to time-wait state. It is not good, + but not fatal yet. + */ + + if (th->syn && !th->rst && !th->ack && !paws_reject && + (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) || + (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { + u32 isn = tw->tw_snd_nxt + 65535 + 2; + if (isn == 0) + isn++; + TCP_SKB_CB(skb)->when = isn; + return TCP_TW_SYN; + } + + if (paws_reject) + NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + + if(!th->rst) { + /* In this case we must reset the TIMEWAIT timer. + * + * If it is ACKless SYN it may be both old duplicate + * and new good SYN with random sequence number <rcv_nxt. + * Do not reschedule in the last case. + */ + if (paws_reject || th->ack) + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + + /* Send ACK. Note, we do not put the bucket, + * it will be released by caller. + */ + return TCP_TW_ACK; + } + tcp_tw_put(tw); + return TCP_TW_SUCCESS; +} + +/* Enter the time wait state. This is called with locally disabled BH. + * Essentially we whip up a timewait bucket, copy the + * relevant info into it from the SK, and mess with hash chains + * and list linkage. + */ +static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) +{ + struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; + struct tcp_bind_hashbucket *bhead; + + /* Step 1: Put TW into bind hash. Original socket stays there too. + Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in + binding cache, even if it is closed. + */ + bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)]; + spin_lock(&bhead->lock); + tw->tw_tb = tcp_sk(sk)->bind_hash; + BUG_TRAP(tcp_sk(sk)->bind_hash); + tw_add_bind_node(tw, &tw->tw_tb->owners); + spin_unlock(&bhead->lock); + + write_lock(&ehead->lock); + + /* Step 2: Remove SK from established hash. */ + if (__sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + + /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ + tw_add_node(tw, &(ehead + tcp_ehash_size)->chain); + atomic_inc(&tw->tw_refcnt); + + write_unlock(&ehead->lock); +} + +/* + * Move a socket to time-wait or dead fin-wait-2 state. + */ +void tcp_time_wait(struct sock *sk, int state, int timeo) +{ + struct tcp_tw_bucket *tw = NULL; + struct tcp_sock *tp = tcp_sk(sk); + int recycle_ok = 0; + + if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) + recycle_ok = tp->af_specific->remember_stamp(sk); + + if (tcp_tw_count < sysctl_tcp_max_tw_buckets) + tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); + + if(tw != NULL) { + struct inet_sock *inet = inet_sk(sk); + int rto = (tp->rto<<2) - (tp->rto>>1); + + /* Give us an identity. */ + tw->tw_daddr = inet->daddr; + tw->tw_rcv_saddr = inet->rcv_saddr; + tw->tw_bound_dev_if = sk->sk_bound_dev_if; + tw->tw_num = inet->num; + tw->tw_state = TCP_TIME_WAIT; + tw->tw_substate = state; + tw->tw_sport = inet->sport; + tw->tw_dport = inet->dport; + tw->tw_family = sk->sk_family; + tw->tw_reuse = sk->sk_reuse; + tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; + atomic_set(&tw->tw_refcnt, 1); + + tw->tw_hashent = sk->sk_hashent; + tw->tw_rcv_nxt = tp->rcv_nxt; + tw->tw_snd_nxt = tp->snd_nxt; + tw->tw_rcv_wnd = tcp_receive_window(tp); + tw->tw_ts_recent = tp->rx_opt.ts_recent; + tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + tw_dead_node_init(tw); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (tw->tw_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_v6_ipv6only = np->ipv6only; + } else { + memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr)); + memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr)); + tw->tw_v6_ipv6only = 0; + } +#endif + /* Linkage updates. */ + __tcp_tw_hashdance(sk, tw); + + /* Get the TIME_WAIT timeout firing. */ + if (timeo < rto) + timeo = rto; + + if (recycle_ok) { + tw->tw_timeout = rto; + } else { + tw->tw_timeout = TCP_TIMEWAIT_LEN; + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; + } + + tcp_tw_schedule(tw, timeo); + tcp_tw_put(tw); + } else { + /* Sorry, if we're out of memory, just CLOSE this + * socket up. We've got bigger problems than + * non-graceful socket closings. + */ + if (net_ratelimit()) + printk(KERN_INFO "TCP: time wait bucket table overflow\n"); + } + + tcp_update_metrics(sk); + tcp_done(sk); +} + +/* Kill off TIME_WAIT sockets once their lifetime has expired. */ +static int tcp_tw_death_row_slot; + +static void tcp_twkill(unsigned long); + +/* TIME_WAIT reaping mechanism. */ +#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ +#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS) + +#define TCP_TWKILL_QUOTA 100 + +static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS]; +static DEFINE_SPINLOCK(tw_death_lock); +static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0); +static void twkill_work(void *); +static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL); +static u32 twkill_thread_slots; + +/* Returns non-zero if quota exceeded. */ +static int tcp_do_twkill_work(int slot, unsigned int quota) +{ + struct tcp_tw_bucket *tw; + struct hlist_node *node; + unsigned int killed; + int ret; + + /* NOTE: compare this to previous version where lock + * was released after detaching chain. It was racy, + * because tw buckets are scheduled in not serialized context + * in 2.3 (with netfilter), and with softnet it is common, because + * soft irqs are not sequenced. + */ + killed = 0; + ret = 0; +rescan: + tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { + __tw_del_dead_node(tw); + spin_unlock(&tw_death_lock); + tcp_timewait_kill(tw); + tcp_tw_put(tw); + killed++; + spin_lock(&tw_death_lock); + if (killed > quota) { + ret = 1; + break; + } + + /* While we dropped tw_death_lock, another cpu may have + * killed off the next TW bucket in the list, therefore + * do a fresh re-read of the hlist head node with the + * lock reacquired. We still use the hlist traversal + * macro in order to get the prefetches. + */ + goto rescan; + } + + tcp_tw_count -= killed; + NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); + + return ret; +} + +static void tcp_twkill(unsigned long dummy) +{ + int need_timer, ret; + + spin_lock(&tw_death_lock); + + if (tcp_tw_count == 0) + goto out; + + need_timer = 0; + ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA); + if (ret) { + twkill_thread_slots |= (1 << tcp_tw_death_row_slot); + mb(); + schedule_work(&tcp_twkill_work); + need_timer = 1; + } else { + /* We purged the entire slot, anything left? */ + if (tcp_tw_count) + need_timer = 1; + } + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + if (need_timer) + mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD); +out: + spin_unlock(&tw_death_lock); +} + +extern void twkill_slots_invalid(void); + +static void twkill_work(void *dummy) +{ + int i; + + if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8)) + twkill_slots_invalid(); + + while (twkill_thread_slots) { + spin_lock_bh(&tw_death_lock); + for (i = 0; i < TCP_TWKILL_SLOTS; i++) { + if (!(twkill_thread_slots & (1 << i))) + continue; + + while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) { + if (need_resched()) { + spin_unlock_bh(&tw_death_lock); + schedule(); + spin_lock_bh(&tw_death_lock); + } + } + + twkill_thread_slots &= ~(1 << i); + } + spin_unlock_bh(&tw_death_lock); + } +} + +/* These are always called from BH context. See callers in + * tcp_input.c to verify this. + */ + +/* This is for handling early-kills of TIME_WAIT sockets. */ +void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +{ + spin_lock(&tw_death_lock); + if (tw_del_dead_node(tw)) { + tcp_tw_put(tw); + if (--tcp_tw_count == 0) + del_timer(&tcp_tw_timer); + } + spin_unlock(&tw_death_lock); + tcp_timewait_kill(tw); +} + +/* Short-time timewait calendar */ + +static int tcp_twcal_hand = -1; +static int tcp_twcal_jiffie; +static void tcp_twcal_tick(unsigned long); +static struct timer_list tcp_twcal_timer = + TIMER_INITIALIZER(tcp_twcal_tick, 0, 0); +static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; + +static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) +{ + struct hlist_head *list; + int slot; + + /* timeout := RTO * 3.5 + * + * 3.5 = 1+2+0.5 to wait for two retransmits. + * + * RATIONALE: if FIN arrived and we entered TIME-WAIT state, + * our ACK acking that FIN can be lost. If N subsequent retransmitted + * FINs (or previous seqments) are lost (probability of such event + * is p^(N+1), where p is probability to lose single packet and + * time to detect the loss is about RTO*(2^N - 1) with exponential + * backoff). Normal timewait length is calculated so, that we + * waited at least for one retransmitted FIN (maximal RTO is 120sec). + * [ BTW Linux. following BSD, violates this requirement waiting + * only for 60sec, we should wait at least for 240 secs. + * Well, 240 consumes too much of resources 8) + * ] + * This interval is not reduced to catch old duplicate and + * responces to our wandering segments living for two MSLs. + * However, if we use PAWS to detect + * old duplicates, we can reduce the interval to bounds required + * by RTO, rather than MSL. So, if peer understands PAWS, we + * kill tw bucket after 3.5*RTO (it is important that this number + * is greater than TS tick!) and detect old duplicates with help + * of PAWS. + */ + slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK; + + spin_lock(&tw_death_lock); + + /* Unlink it, if it was scheduled */ + if (tw_del_dead_node(tw)) + tcp_tw_count--; + else + atomic_inc(&tw->tw_refcnt); + + if (slot >= TCP_TW_RECYCLE_SLOTS) { + /* Schedule to slow timer */ + if (timeo >= TCP_TIMEWAIT_LEN) { + slot = TCP_TWKILL_SLOTS-1; + } else { + slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; + if (slot >= TCP_TWKILL_SLOTS) + slot = TCP_TWKILL_SLOTS-1; + } + tw->tw_ttd = jiffies + timeo; + slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); + list = &tcp_tw_death_row[slot]; + } else { + tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK); + + if (tcp_twcal_hand < 0) { + tcp_twcal_hand = 0; + tcp_twcal_jiffie = jiffies; + tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK); + add_timer(&tcp_twcal_timer); + } else { + if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK))) + mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK)); + slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1); + } + list = &tcp_twcal_row[slot]; + } + + hlist_add_head(&tw->tw_death_node, list); + + if (tcp_tw_count++ == 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); + spin_unlock(&tw_death_lock); +} + +void tcp_twcal_tick(unsigned long dummy) +{ + int n, slot; + unsigned long j; + unsigned long now = jiffies; + int killed = 0; + int adv = 0; + + spin_lock(&tw_death_lock); + if (tcp_twcal_hand < 0) + goto out; + + slot = tcp_twcal_hand; + j = tcp_twcal_jiffie; + + for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) { + if (time_before_eq(j, now)) { + struct hlist_node *node, *safe; + struct tcp_tw_bucket *tw; + + tw_for_each_inmate_safe(tw, node, safe, + &tcp_twcal_row[slot]) { + __tw_del_dead_node(tw); + tcp_timewait_kill(tw); + tcp_tw_put(tw); + killed++; + } + } else { + if (!adv) { + adv = 1; + tcp_twcal_jiffie = j; + tcp_twcal_hand = slot; + } + + if (!hlist_empty(&tcp_twcal_row[slot])) { + mod_timer(&tcp_twcal_timer, j); + goto out; + } + } + j += (1<<TCP_TW_RECYCLE_TICK); + slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1); + } + tcp_twcal_hand = -1; + +out: + if ((tcp_tw_count -= killed) == 0) + del_timer(&tcp_tw_timer); + NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); + spin_unlock(&tw_death_lock); +} + +/* This is not only more efficient than what we used to do, it eliminates + * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM + * + * Actually, we could lots of memory writes here. tp of listening + * socket contains all necessary default parameters. + */ +struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) +{ + /* allocate the newsk from the same slab of the master sock, + * if not, at sk_free time we'll try to free it from the wrong + * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */ + struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0); + + if(newsk != NULL) { + struct tcp_sock *newtp; + struct sk_filter *filter; + + memcpy(newsk, sk, sizeof(struct tcp_sock)); + newsk->sk_state = TCP_SYN_RECV; + + /* SANITY */ + sk_node_init(&newsk->sk_node); + tcp_sk(newsk)->bind_hash = NULL; + + /* Clone the TCP header template */ + inet_sk(newsk)->dport = req->rmt_port; + + sock_lock_init(newsk); + bh_lock_sock(newsk); + + rwlock_init(&newsk->sk_dst_lock); + atomic_set(&newsk->sk_rmem_alloc, 0); + skb_queue_head_init(&newsk->sk_receive_queue); + atomic_set(&newsk->sk_wmem_alloc, 0); + skb_queue_head_init(&newsk->sk_write_queue); + atomic_set(&newsk->sk_omem_alloc, 0); + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + + sock_reset_flag(newsk, SOCK_DONE); + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_send_head = NULL; + rwlock_init(&newsk->sk_callback_lock); + skb_queue_head_init(&newsk->sk_error_queue); + newsk->sk_write_space = sk_stream_write_space; + + if ((filter = newsk->sk_filter) != NULL) + sk_filter_charge(newsk, filter); + + if (unlikely(xfrm_sk_clone_policy(newsk))) { + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; + } + + /* Now setup tcp_sock */ + newtp = tcp_sk(newsk); + newtp->pred_flags = 0; + newtp->rcv_nxt = req->rcv_isn + 1; + newtp->snd_nxt = req->snt_isn + 1; + newtp->snd_una = req->snt_isn + 1; + newtp->snd_sml = req->snt_isn + 1; + + tcp_prequeue_init(newtp); + + tcp_init_wl(newtp, req->snt_isn, req->rcv_isn); + + newtp->retransmits = 0; + newtp->backoff = 0; + newtp->srtt = 0; + newtp->mdev = TCP_TIMEOUT_INIT; + newtp->rto = TCP_TIMEOUT_INIT; + + newtp->packets_out = 0; + newtp->left_out = 0; + newtp->retrans_out = 0; + newtp->sacked_out = 0; + newtp->fackets_out = 0; + newtp->snd_ssthresh = 0x7fffffff; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + newtp->snd_cwnd = 2; + newtp->snd_cwnd_cnt = 0; + + newtp->frto_counter = 0; + newtp->frto_highmark = 0; + + tcp_set_ca_state(newtp, TCP_CA_Open); + tcp_init_xmit_timers(newsk); + skb_queue_head_init(&newtp->out_of_order_queue); + newtp->rcv_wup = req->rcv_isn + 1; + newtp->write_seq = req->snt_isn + 1; + newtp->pushed_seq = newtp->write_seq; + newtp->copied_seq = req->rcv_isn + 1; + + newtp->rx_opt.saw_tstamp = 0; + + newtp->rx_opt.dsack = 0; + newtp->rx_opt.eff_sacks = 0; + + newtp->probes_out = 0; + newtp->rx_opt.num_sacks = 0; + newtp->urg_data = 0; + newtp->listen_opt = NULL; + newtp->accept_queue = newtp->accept_queue_tail = NULL; + /* Deinitialize syn_wait_lock to trap illegal accesses. */ + memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock)); + + /* Back to base struct sock members. */ + newsk->sk_err = 0; + newsk->sk_priority = 0; + atomic_set(&newsk->sk_refcnt, 2); +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet_sock_nr); +#endif + atomic_inc(&tcp_sockets_allocated); + + if (sock_flag(newsk, SOCK_KEEPOPEN)) + tcp_reset_keepalive_timer(newsk, + keepalive_time_when(newtp)); + newsk->sk_socket = NULL; + newsk->sk_sleep = NULL; + + newtp->rx_opt.tstamp_ok = req->tstamp_ok; + if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) { + if (sysctl_tcp_fack) + newtp->rx_opt.sack_ok |= 2; + } + newtp->window_clamp = req->window_clamp; + newtp->rcv_ssthresh = req->rcv_wnd; + newtp->rcv_wnd = req->rcv_wnd; + newtp->rx_opt.wscale_ok = req->wscale_ok; + if (newtp->rx_opt.wscale_ok) { + newtp->rx_opt.snd_wscale = req->snd_wscale; + newtp->rx_opt.rcv_wscale = req->rcv_wscale; + } else { + newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp, 65535U); + } + newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale; + newtp->max_window = newtp->snd_wnd; + + if (newtp->rx_opt.tstamp_ok) { + newtp->rx_opt.ts_recent = req->ts_recent; + newtp->rx_opt.ts_recent_stamp = xtime.tv_sec; + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->rx_opt.ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } + if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) + newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len; + newtp->rx_opt.mss_clamp = req->mss; + TCP_ECN_openreq_child(newtp, req); + if (newtp->ecn_flags&TCP_ECN_OK) + sock_set_flag(newsk, SOCK_NO_LARGESEND); + + tcp_ca_init(newtp); + + TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); + } + return newsk; +} + +/* + * Process an incoming packet for SYN_RECV sockets represented + * as an open_request. + */ + +struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, + struct open_request *req, + struct open_request **prev) +{ + struct tcphdr *th = skb->h.th; + struct tcp_sock *tp = tcp_sk(sk); + u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); + int paws_reject = 0; + struct tcp_options_received tmp_opt; + struct sock *child; + + tmp_opt.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2)) { + tcp_parse_options(skb, &tmp_opt, 0); + + if (tmp_opt.saw_tstamp) { + tmp_opt.ts_recent = req->ts_recent; + /* We do not store true stamp, but it is not required, + * it can be estimated (approximately) + * from another data. + */ + tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); + paws_reject = tcp_paws_check(&tmp_opt, th->rst); + } + } + + /* Check for pure retransmitted SYN. */ + if (TCP_SKB_CB(skb)->seq == req->rcv_isn && + flg == TCP_FLAG_SYN && + !paws_reject) { + /* + * RFC793 draws (Incorrectly! It was fixed in RFC1122) + * this case on figure 6 and figure 8, but formal + * protocol description says NOTHING. + * To be more exact, it says that we should send ACK, + * because this segment (at least, if it has no data) + * is out of window. + * + * CONCLUSION: RFC793 (even with RFC1122) DOES NOT + * describe SYN-RECV state. All the description + * is wrong, we cannot believe to it and should + * rely only on common sense and implementation + * experience. + * + * Enforce "SYN-ACK" according to figure 8, figure 6 + * of RFC793, fixed by RFC1122. + */ + req->class->rtx_syn_ack(sk, req, NULL); + return NULL; + } + + /* Further reproduces section "SEGMENT ARRIVES" + for state SYN-RECEIVED of RFC793. + It is broken, however, it does not work only + when SYNs are crossed. + + You would think that SYN crossing is impossible here, since + we should have a SYN_SENT socket (from connect()) on our end, + but this is not true if the crossed SYNs were sent to both + ends by a malicious third party. We must defend against this, + and to do that we first verify the ACK (as per RFC793, page + 36) and reset if it is invalid. Is this a true full defense? + To convince ourselves, let us consider a way in which the ACK + test can still pass in this 'malicious crossed SYNs' case. + Malicious sender sends identical SYNs (and thus identical sequence + numbers) to both A and B: + + A: gets SYN, seq=7 + B: gets SYN, seq=7 + + By our good fortune, both A and B select the same initial + send sequence number of seven :-) + + A: sends SYN|ACK, seq=7, ack_seq=8 + B: sends SYN|ACK, seq=7, ack_seq=8 + + So we are now A eating this SYN|ACK, ACK test passes. So + does sequence test, SYN is truncated, and thus we consider + it a bare ACK. + + If tp->defer_accept, we silently drop this bare ACK. Otherwise, + we create an established connection. Both ends (listening sockets) + accept the new incoming connection and try to talk to each other. 8-) + + Note: This case is both harmless, and rare. Possibility is about the + same as us discovering intelligent life on another plant tomorrow. + + But generally, we should (RFC lies!) to accept ACK + from SYNACK both here and in tcp_rcv_state_process(). + tcp_rcv_state_process() does not, hence, we do not too. + + Note that the case is absolutely generic: + we cannot optimize anything here without + violating protocol. All the checks must be made + before attempt to create socket. + */ + + /* RFC793 page 36: "If the connection is in any non-synchronized state ... + * and the incoming segment acknowledges something not yet + * sent (the segment carries an unaccaptable ACK) ... + * a reset is sent." + * + * Invalid ACK: reset will be sent by listening socket + */ + if ((flg & TCP_FLAG_ACK) && + (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)) + return sk; + + /* Also, it would be not so bad idea to check rcv_tsecr, which + * is essentially ACK extension and too early or too late values + * should cause reset in unsynchronized states. + */ + + /* RFC793: "first check sequence number". */ + + if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) { + /* Out of window: send ACK and drop. */ + if (!(flg & TCP_FLAG_RST)) + req->class->send_ack(skb, req); + if (paws_reject) + NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + return NULL; + } + + /* In sequence, PAWS is OK. */ + + if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1)) + req->ts_recent = tmp_opt.rcv_tsval; + + if (TCP_SKB_CB(skb)->seq == req->rcv_isn) { + /* Truncate SYN, it is out of window starting + at req->rcv_isn+1. */ + flg &= ~TCP_FLAG_SYN; + } + + /* RFC793: "second check the RST bit" and + * "fourth, check the SYN bit" + */ + if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) + goto embryonic_reset; + + /* ACK sequence verified above, just make sure ACK is + * set. If ACK not set, just silently drop the packet. + */ + if (!(flg & TCP_FLAG_ACK)) + return NULL; + + /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ + if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) { + req->acked = 1; + return NULL; + } + + /* OK, ACK is valid, create big socket and + * feed this segment to it. It will repeat all + * the tests. THIS SEGMENT MUST MOVE SOCKET TO + * ESTABLISHED STATE. If it will be dropped after + * socket is created, wait for troubles. + */ + child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; + + tcp_synq_unlink(tp, req, prev); + tcp_synq_removed(sk, req); + + tcp_acceptq_queue(sk, req, child); + return child; + + listen_overflow: + if (!sysctl_tcp_abort_on_overflow) { + req->acked = 1; + return NULL; + } + + embryonic_reset: + NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS); + if (!(flg & TCP_FLAG_RST)) + req->class->send_reset(skb); + + tcp_synq_drop(sk, req, prev); + return NULL; +} + +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ + +int tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + int state = child->sk_state; + + if (!sock_owned_by_user(child)) { + ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); + + /* Wakeup parent, send SIGIO */ + if (state == TCP_SYN_RECV && child->sk_state != state) + parent->sk_data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. + */ + sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + sock_put(child); + return ret; +} + +EXPORT_SYMBOL(tcp_check_req); +EXPORT_SYMBOL(tcp_child_process); +EXPORT_SYMBOL(tcp_create_openreq_child); +EXPORT_SYMBOL(tcp_timewait_state_process); +EXPORT_SYMBOL(tcp_tw_deschedule); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c new file mode 100644 index 000000000000..13c14cb6dee4 --- /dev/null +++ b/net/ipv4/tcp_output.c @@ -0,0 +1,1739 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + */ + +/* + * Changes: Pedro Roque : Retransmit queue handled by TCP. + * : Fragmentation on mtu decrease + * : Segment collapse on retransmit + * : AF independence + * + * Linus Torvalds : send_delayed_ack + * David S. Miller : Charge memory using the right skb + * during syn/ack processing. + * David S. Miller : Output engine completely rewritten. + * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. + * Cacophonix Gaul : draft-minshall-nagle-01 + * J Hadi Salim : ECN support + * + */ + +#include <net/tcp.h> + +#include <linux/compiler.h> +#include <linux/module.h> +#include <linux/smp_lock.h> + +/* People can turn this off for buggy TCP's found in printers etc. */ +int sysctl_tcp_retrans_collapse = 1; + +/* This limits the percentage of the congestion window which we + * will allow a single TSO frame to consume. Building TSO frames + * which are too large can cause TCP streams to be bursty. + */ +int sysctl_tcp_tso_win_divisor = 8; + +static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) +{ + sk->sk_send_head = skb->next; + if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) + sk->sk_send_head = NULL; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_packets_out_inc(sk, tp, skb); +} + +/* SND.NXT, if window was not shrunk. + * If window has been shrunk, what should we make? It is not clear at all. + * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( + * Anything in between SND.UNA...SND.UNA+SND.WND also can be already + * invalid. OK, let's make this for now: + */ +static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp) +{ + if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt)) + return tp->snd_nxt; + else + return tp->snd_una+tp->snd_wnd; +} + +/* Calculate mss to advertise in SYN segment. + * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: + * + * 1. It is independent of path mtu. + * 2. Ideally, it is maximal possible segment size i.e. 65535-40. + * 3. For IPv4 it is reasonable to calculate it from maximal MTU of + * attached devices, because some buggy hosts are confused by + * large MSS. + * 4. We do not make 3, we advertise MSS, calculated from first + * hop device mtu, but allow to raise it to ip_rt_min_advmss. + * This may be overridden via information stored in routing table. + * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, + * probably even Jumbo". + */ +static __u16 tcp_advertise_mss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + int mss = tp->advmss; + + if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { + mss = dst_metric(dst, RTAX_ADVMSS); + tp->advmss = mss; + } + + return (__u16)mss; +} + +/* RFC2861. Reset CWND after idle period longer RTO to "restart window". + * This is the first part of cwnd validation mechanism. */ +static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) +{ + s32 delta = tcp_time_stamp - tp->lsndtime; + u32 restart_cwnd = tcp_init_cwnd(tp, dst); + u32 cwnd = tp->snd_cwnd; + + if (tcp_is_vegas(tp)) + tcp_vegas_enable(tp); + + tp->snd_ssthresh = tcp_current_ssthresh(tp); + restart_cwnd = min(restart_cwnd, cwnd); + + while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) + cwnd >>= 1; + tp->snd_cwnd = max(cwnd, restart_cwnd); + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_used = 0; +} + +static inline void tcp_event_data_sent(struct tcp_sock *tp, + struct sk_buff *skb, struct sock *sk) +{ + u32 now = tcp_time_stamp; + + if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) + tcp_cwnd_restart(tp, __sk_dst_get(sk)); + + tp->lsndtime = now; + + /* If it is a reply for ato after last received + * packet, enter pingpong mode. + */ + if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato) + tp->ack.pingpong = 1; +} + +static __inline__ void tcp_event_ack_sent(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_dec_quickack_mode(tp); + tcp_clear_xmit_timer(sk, TCP_TIME_DACK); +} + +/* Determine a window scaling and initial window to offer. + * Based on the assumption that the given amount of space + * will be offered. Store the results in the tp structure. + * NOTE: for smooth operation initial space offering should + * be a multiple of mss if possible. We assume here that mss >= 1. + * This MUST be enforced by all callers. + */ +void tcp_select_initial_window(int __space, __u32 mss, + __u32 *rcv_wnd, __u32 *window_clamp, + int wscale_ok, __u8 *rcv_wscale) +{ + unsigned int space = (__space < 0 ? 0 : __space); + + /* If no clamp set the clamp to the max possible scaled window */ + if (*window_clamp == 0) + (*window_clamp) = (65535 << 14); + space = min(*window_clamp, space); + + /* Quantize space offering to a multiple of mss if possible. */ + if (space > mss) + space = (space / mss) * mss; + + /* NOTE: offering an initial window larger than 32767 + * will break some buggy TCP stacks. We try to be nice. + * If we are not window scaling, then this truncates + * our initial window offering to 32k. There should also + * be a sysctl option to stop being nice. + */ + (*rcv_wnd) = min(space, MAX_TCP_WINDOW); + (*rcv_wscale) = 0; + if (wscale_ok) { + /* Set window scaling on max possible window + * See RFC1323 for an explanation of the limit to 14 + */ + space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max); + while (space > 65535 && (*rcv_wscale) < 14) { + space >>= 1; + (*rcv_wscale)++; + } + } + + /* Set initial window to value enough for senders, + * following RFC1414. Senders, not following this RFC, + * will be satisfied with 2. + */ + if (mss > (1<<*rcv_wscale)) { + int init_cwnd = 4; + if (mss > 1460*3) + init_cwnd = 2; + else if (mss > 1460) + init_cwnd = 3; + if (*rcv_wnd > init_cwnd*mss) + *rcv_wnd = init_cwnd*mss; + } + + /* Set the clamp no higher than max representable value */ + (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); +} + +/* Chose a new window to advertise, update state in tcp_sock for the + * socket, and return result with RFC1323 scaling applied. The return + * value can be stuffed directly into th->window for an outgoing + * frame. + */ +static __inline__ u16 tcp_select_window(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 cur_win = tcp_receive_window(tp); + u32 new_win = __tcp_select_window(sk); + + /* Never shrink the offered window */ + if(new_win < cur_win) { + /* Danger Will Robinson! + * Don't update rcv_wup/rcv_wnd here or else + * we will not be able to advertise a zero + * window in time. --DaveM + * + * Relax Will Robinson. + */ + new_win = cur_win; + } + tp->rcv_wnd = new_win; + tp->rcv_wup = tp->rcv_nxt; + + /* Make sure we do not exceed the maximum possible + * scaled window. + */ + if (!tp->rx_opt.rcv_wscale) + new_win = min(new_win, MAX_TCP_WINDOW); + else + new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); + + /* RFC1323 scaling applied */ + new_win >>= tp->rx_opt.rcv_wscale; + + /* If we advertise zero window, disable fast path. */ + if (new_win == 0) + tp->pred_flags = 0; + + return new_win; +} + + +/* This routine actually transmits TCP packets queued in by + * tcp_do_sendmsg(). This is used by both the initial + * transmission and possible later retransmissions. + * All SKB's seen here are completely headerless. It is our + * job to build the TCP header, and pass the packet down to + * IP so it can do the same plus pass the packet off to the + * device. + * + * We are working here with either a clone of the original + * SKB, or a fresh unique copy made by the retransmit engine. + */ +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +{ + if (skb != NULL) { + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + int tcp_header_size = tp->tcp_header_len; + struct tcphdr *th; + int sysctl_flags; + int err; + + BUG_ON(!tcp_skb_pcount(skb)); + +#define SYSCTL_FLAG_TSTAMPS 0x1 +#define SYSCTL_FLAG_WSCALE 0x2 +#define SYSCTL_FLAG_SACK 0x4 + + sysctl_flags = 0; + if (tcb->flags & TCPCB_FLAG_SYN) { + tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; + if(sysctl_tcp_timestamps) { + tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_TSTAMPS; + } + if(sysctl_tcp_window_scaling) { + tcp_header_size += TCPOLEN_WSCALE_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_WSCALE; + } + if(sysctl_tcp_sack) { + sysctl_flags |= SYSCTL_FLAG_SACK; + if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) + tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; + } + } else if (tp->rx_opt.eff_sacks) { + /* A SACK is 2 pad bytes, a 2 byte header, plus + * 2 32-bit sequence numbers for each SACK block. + */ + tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); + } + + /* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ + if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0) + tcp_vegas_enable(tp); + + th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb->h.th = th; + skb_set_owner_w(skb, sk); + + /* Build TCP header and checksum it. */ + th->source = inet->sport; + th->dest = inet->dport; + th->seq = htonl(tcb->seq); + th->ack_seq = htonl(tp->rcv_nxt); + *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); + if (tcb->flags & TCPCB_FLAG_SYN) { + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ + th->window = htons(tp->rcv_wnd); + } else { + th->window = htons(tcp_select_window(sk)); + } + th->check = 0; + th->urg_ptr = 0; + + if (tp->urg_mode && + between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) { + th->urg_ptr = htons(tp->snd_up-tcb->seq); + th->urg = 1; + } + + if (tcb->flags & TCPCB_FLAG_SYN) { + tcp_syn_build_options((__u32 *)(th + 1), + tcp_advertise_mss(sk), + (sysctl_flags & SYSCTL_FLAG_TSTAMPS), + (sysctl_flags & SYSCTL_FLAG_SACK), + (sysctl_flags & SYSCTL_FLAG_WSCALE), + tp->rx_opt.rcv_wscale, + tcb->when, + tp->rx_opt.ts_recent); + } else { + tcp_build_and_update_options((__u32 *)(th + 1), + tp, tcb->when); + + TCP_ECN_send(sk, tp, skb, tcp_header_size); + } + tp->af_specific->send_check(sk, th, skb->len, skb); + + if (tcb->flags & TCPCB_FLAG_ACK) + tcp_event_ack_sent(sk); + + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, skb, sk); + + TCP_INC_STATS(TCP_MIB_OUTSEGS); + + err = tp->af_specific->queue_xmit(skb, 0); + if (err <= 0) + return err; + + tcp_enter_cwr(tp); + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; + } + return -ENOBUFS; +#undef SYSCTL_FLAG_TSTAMPS +#undef SYSCTL_FLAG_WSCALE +#undef SYSCTL_FLAG_SACK +} + + +/* This routine just queue's the buffer + * + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, + * otherwise socket can stall. + */ +static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Advance write_seq and place onto the write_queue. */ + tp->write_seq = TCP_SKB_CB(skb)->end_seq; + skb_header_release(skb); + __skb_queue_tail(&sk->sk_write_queue, skb); + sk_charge_skb(sk, skb); + + /* Queue it, remembering where we must start sending. */ + if (sk->sk_send_head == NULL) + sk->sk_send_head = skb; +} + +static inline void tcp_tso_set_push(struct sk_buff *skb) +{ + /* Force push to be on for any TSO frames to workaround + * problems with busted implementations like Mac OS-X that + * hold off socket receive wakeups until push is seen. + */ + if (tcp_skb_pcount(skb) > 1) + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; +} + +/* Send _single_ skb sitting at the send head. This function requires + * true push pending frames to setup probe timer etc. + */ +void tcp_push_one(struct sock *sk, unsigned cur_mss) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = sk->sk_send_head; + + if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) { + /* Send it out now. */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); + if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { + sk->sk_send_head = NULL; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_packets_out_inc(sk, tp, skb); + return; + } + } +} + +void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std) +{ + if (skb->len <= mss_std) { + /* Avoid the costly divide in the normal + * non-TSO case. + */ + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + } else { + unsigned int factor; + + factor = skb->len + (mss_std - 1); + factor /= mss_std; + skb_shinfo(skb)->tso_segs = factor; + skb_shinfo(skb)->tso_size = mss_std; + } +} + +/* Function to create two new TCP segments. Shrinks the given segment + * to the specified size and appends a new segment with the rest of the + * packet to the list. This won't be called frequently, I hope. + * Remember, these are still headerless SKBs at this point. + */ +static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; + int nsize; + u16 flags; + + nsize = skb_headlen(skb) - len; + if (nsize < 0) + nsize = 0; + + if (skb_cloned(skb) && + skb_is_nonlinear(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + + /* Get a new skb... force flag on. */ + buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); + if (buff == NULL) + return -ENOMEM; /* We'll just try again later. */ + sk_charge_skb(sk, buff); + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(buff)->flags = flags; + TCP_SKB_CB(buff)->sacked = + (TCP_SKB_CB(skb)->sacked & + (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL)); + TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL; + + if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) { + /* Copy and checksum data tail into the new buffer. */ + buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize), + nsize, 0); + + skb_trim(skb, len); + + skb->csum = csum_block_sub(skb->csum, buff->csum, len); + } else { + skb->ip_summed = CHECKSUM_HW; + skb_split(skb, buff, len); + } + + buff->ip_summed = skb->ip_summed; + + /* Looks stupid, but our code really uses when of + * skbs, which it never sent before. --ANK + */ + TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { + tp->lost_out -= tcp_skb_pcount(skb); + tp->left_out -= tcp_skb_pcount(skb); + } + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(skb, tp->mss_cache_std); + tcp_set_skb_tso_segs(buff, tp->mss_cache_std); + + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { + tp->lost_out += tcp_skb_pcount(skb); + tp->left_out += tcp_skb_pcount(skb); + } + + if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) { + tp->lost_out += tcp_skb_pcount(buff); + tp->left_out += tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ + __skb_append(skb, buff); + + return 0; +} + +/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c + * eventually). The difference is that pulled data not copied, but + * immediately discarded. + */ +static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len) +{ + int i, k, eat; + + eat = len; + k = 0; + for (i=0; i<skb_shinfo(skb)->nr_frags; i++) { + if (skb_shinfo(skb)->frags[i].size <= eat) { + put_page(skb_shinfo(skb)->frags[i].page); + eat -= skb_shinfo(skb)->frags[i].size; + } else { + skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + if (eat) { + skb_shinfo(skb)->frags[k].page_offset += eat; + skb_shinfo(skb)->frags[k].size -= eat; + eat = 0; + } + k++; + } + } + skb_shinfo(skb)->nr_frags = k; + + skb->tail = skb->data; + skb->data_len -= len; + skb->len = skb->data_len; + return skb->tail; +} + +int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) +{ + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + + if (len <= skb_headlen(skb)) { + __skb_pull(skb, len); + } else { + if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL) + return -ENOMEM; + } + + TCP_SKB_CB(skb)->seq += len; + skb->ip_summed = CHECKSUM_HW; + + skb->truesize -= len; + sk->sk_wmem_queued -= len; + sk->sk_forward_alloc += len; + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + + /* Any change of skb->len requires recalculation of tso + * factor and mss. + */ + if (tcp_skb_pcount(skb) > 1) + tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); + + return 0; +} + +/* This function synchronize snd mss to current pmtu/exthdr set. + + tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts + for TCP options, but includes only bare TCP header. + + tp->rx_opt.mss_clamp is mss negotiated at connection setup. + It is minumum of user_mss and mss received with SYN. + It also does not include TCP options. + + tp->pmtu_cookie is last pmtu, seen by this function. + + tp->mss_cache is current effective sending mss, including + all tcp options except for SACKs. It is evaluated, + taking into account current pmtu, but never exceeds + tp->rx_opt.mss_clamp. + + NOTE1. rfc1122 clearly states that advertised MSS + DOES NOT include either tcp or ip options. + + NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside + this function. --ANK (980731) + */ + +unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) +{ + struct tcp_sock *tp = tcp_sk(sk); + int mss_now; + + /* Calculate base mss without TCP options: + It is MMS_S - sizeof(tcphdr) of rfc1122 + */ + mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); + + /* Clamp it (mss_clamp does not include tcp options) */ + if (mss_now > tp->rx_opt.mss_clamp) + mss_now = tp->rx_opt.mss_clamp; + + /* Now subtract optional transport overhead */ + mss_now -= tp->ext_header_len; + + /* Then reserve room for full set of TCP options and 8 bytes of data */ + if (mss_now < 48) + mss_now = 48; + + /* Now subtract TCP options size, not including SACKs */ + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + + /* Bound mss with half of window */ + if (tp->max_window && mss_now > (tp->max_window>>1)) + mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); + + /* And store cached results */ + tp->pmtu_cookie = pmtu; + tp->mss_cache = tp->mss_cache_std = mss_now; + + return mss_now; +} + +/* Compute the current effective MSS, taking SACKs and IP options, + * and even PMTU discovery events into account. + * + * LARGESEND note: !urg_mode is overkill, only frames up to snd_up + * cannot be large. However, taking into account rare use of URG, this + * is not a big flaw. + */ + +unsigned int tcp_current_mss(struct sock *sk, int large) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + unsigned int do_large, mss_now; + + mss_now = tp->mss_cache_std; + if (dst) { + u32 mtu = dst_mtu(dst); + if (mtu != tp->pmtu_cookie) + mss_now = tcp_sync_mss(sk, mtu); + } + + do_large = (large && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode); + + if (do_large) { + unsigned int large_mss, factor, limit; + + large_mss = 65535 - tp->af_specific->net_header_len - + tp->ext_header_len - tp->tcp_header_len; + + if (tp->max_window && large_mss > (tp->max_window>>1)) + large_mss = max((tp->max_window>>1), + 68U - tp->tcp_header_len); + + factor = large_mss / mss_now; + + /* Always keep large mss multiple of real mss, but + * do not exceed 1/tso_win_divisor of the congestion window + * so we can keep the ACK clock ticking and minimize + * bursting. + */ + limit = tp->snd_cwnd; + if (sysctl_tcp_tso_win_divisor) + limit /= sysctl_tcp_tso_win_divisor; + limit = max(1U, limit); + if (factor > limit) + factor = limit; + + tp->mss_cache = mss_now * factor; + + mss_now = tp->mss_cache; + } + + if (tp->rx_opt.eff_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); + return mss_now; +} + +/* This routine writes packets to the network. It advances the + * send_head. This happens as incoming acks open up the remote + * window for us. + * + * Returns 1, if no segments are in flight and we have queued segments, but + * cannot send anything now because of SWS or another problem. + */ +int tcp_write_xmit(struct sock *sk, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int mss_now; + + /* If we are closed, the bytes will have to remain here. + * In time closedown will finish, we empty the write queue and all + * will be happy. + */ + if (sk->sk_state != TCP_CLOSE) { + struct sk_buff *skb; + int sent_pkts = 0; + + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. + */ + mss_now = tcp_current_mss(sk, 1); + + while ((skb = sk->sk_send_head) && + tcp_snd_test(tp, skb, mss_now, + tcp_skb_is_last(sk, skb) ? nonagle : + TCP_NAGLE_PUSH)) { + if (skb->len > mss_now) { + if (tcp_fragment(sk, skb, mss_now)) + break; + } + + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + break; + + /* Advance the send_head. This one is sent out. + * This call will increment packets_out. + */ + update_send_head(sk, tp, skb); + + tcp_minshall_update(tp, mss_now, skb); + sent_pkts = 1; + } + + if (sent_pkts) { + tcp_cwnd_validate(sk, tp); + return 0; + } + + return !tp->packets_out && sk->sk_send_head; + } + return 0; +} + +/* This function returns the amount that we can raise the + * usable window based on the following constraints + * + * 1. The window can never be shrunk once it is offered (RFC 793) + * 2. We limit memory per socket + * + * RFC 1122: + * "the suggested [SWS] avoidance algorithm for the receiver is to keep + * RECV.NEXT + RCV.WIN fixed until: + * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" + * + * i.e. don't raise the right edge of the window until you can raise + * it at least MSS bytes. + * + * Unfortunately, the recommended algorithm breaks header prediction, + * since header prediction assumes th->window stays fixed. + * + * Strictly speaking, keeping th->window fixed violates the receiver + * side SWS prevention criteria. The problem is that under this rule + * a stream of single byte packets will cause the right side of the + * window to always advance by a single byte. + * + * Of course, if the sender implements sender side SWS prevention + * then this will not be a problem. + * + * BSD seems to make the following compromise: + * + * If the free space is less than the 1/4 of the maximum + * space available and the free space is less than 1/2 mss, + * then set the window to 0. + * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ] + * Otherwise, just prevent the window from shrinking + * and from being larger than the largest representable value. + * + * This prevents incremental opening of the window in the regime + * where TCP is limited by the speed of the reader side taking + * data out of the TCP receive queue. It does nothing about + * those cases where the window is constrained on the sender side + * because the pipeline is full. + * + * BSD also seems to "accidentally" limit itself to windows that are a + * multiple of MSS, at least until the free space gets quite small. + * This would appear to be a side effect of the mbuf implementation. + * Combining these two algorithms results in the observed behavior + * of having a fixed window size at almost all times. + * + * Below we obtain similar behavior by forcing the offered window to + * a multiple of the mss when it is feasible to do so. + * + * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. + * Regular options like TIMESTAMP are taken into account. + */ +u32 __tcp_select_window(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + /* MSS for the peer's data. Previous verions used mss_clamp + * here. I don't know if the value based on our guesses + * of peer's MSS is better for the performance. It's more correct + * but may be worse for the performance because of rcv_mss + * fluctuations. --SAW 1998/11/1 + */ + int mss = tp->ack.rcv_mss; + int free_space = tcp_space(sk); + int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); + int window; + + if (mss > full_space) + mss = full_space; + + if (free_space < full_space/2) { + tp->ack.quick = 0; + + if (tcp_memory_pressure) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); + + if (free_space < mss) + return 0; + } + + if (free_space > tp->rcv_ssthresh) + free_space = tp->rcv_ssthresh; + + /* Don't do rounding if we are using window scaling, since the + * scaled window will not line up with the MSS boundary anyway. + */ + window = tp->rcv_wnd; + if (tp->rx_opt.rcv_wscale) { + window = free_space; + + /* Advertise enough space so that it won't get scaled away. + * Import case: prevent zero window announcement if + * 1<<rcv_wscale > mss. + */ + if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window) + window = (((window >> tp->rx_opt.rcv_wscale) + 1) + << tp->rx_opt.rcv_wscale); + } else { + /* Get the largest window that is a nice multiple of mss. + * Window clamp already applied above. + * If our current window offering is within 1 mss of the + * free space we just keep it. This prevents the divide + * and multiply from happening most of the time. + * We also don't do any window rounding when the free space + * is too small. + */ + if (window <= free_space - mss || window > free_space) + window = (free_space/mss)*mss; + } + + return window; +} + +/* Attempt to collapse two adjacent SKB's during retransmission. */ +static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *next_skb = skb->next; + + /* The first test we must make is that neither of these two + * SKB's are still referenced by someone else. + */ + if (!skb_cloned(skb) && !skb_cloned(next_skb)) { + int skb_size = skb->len, next_skb_size = next_skb->len; + u16 flags = TCP_SKB_CB(skb)->flags; + + /* Also punt if next skb has been SACK'd. */ + if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) + return; + + /* Next skb is out of window. */ + if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd)) + return; + + /* Punt if not enough space exists in the first SKB for + * the data in the second, or the total combined payload + * would exceed the MSS. + */ + if ((next_skb_size > skb_tailroom(skb)) || + ((skb_size + next_skb_size) > mss_now)) + return; + + BUG_ON(tcp_skb_pcount(skb) != 1 || + tcp_skb_pcount(next_skb) != 1); + + /* Ok. We will be able to collapse the packet. */ + __skb_unlink(next_skb, next_skb->list); + + memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); + + if (next_skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_HW; + + if (skb->ip_summed != CHECKSUM_HW) + skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); + + /* Update sequence range on original skb. */ + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + + /* Merge over control information. */ + flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ + TCP_SKB_CB(skb)->flags = flags; + + /* All done, get rid of second SKB and account for it so + * packet counting does not break. + */ + TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); + if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS) + tp->retrans_out -= tcp_skb_pcount(next_skb); + if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) { + tp->lost_out -= tcp_skb_pcount(next_skb); + tp->left_out -= tcp_skb_pcount(next_skb); + } + /* Reno case is special. Sigh... */ + if (!tp->rx_opt.sack_ok && tp->sacked_out) { + tcp_dec_pcount_approx(&tp->sacked_out, next_skb); + tp->left_out -= tcp_skb_pcount(next_skb); + } + + /* Not quite right: it can be > snd.fack, but + * it is better to underestimate fackets. + */ + tcp_dec_pcount_approx(&tp->fackets_out, next_skb); + tcp_packets_out_dec(tp, next_skb); + sk_stream_free_skb(sk, next_skb); + } +} + +/* Do a simple retransmit without using the backoff mechanisms in + * tcp_timer. This is used for path mtu discovery. + * The socket is already locked here. + */ +void tcp_simple_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + unsigned int mss = tcp_current_mss(sk, 0); + int lost = 0; + + sk_stream_for_retrans_queue(skb, sk) { + if (skb->len > mss && + !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { + if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + } + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + lost = 1; + } + } + } + + if (!lost) + return; + + tcp_sync_left_out(tp); + + /* Don't muck with the congestion window here. + * Reason is that we do not increase amount of _data_ + * in network, but units changed and effective + * cwnd/ssthresh really reduced now. + */ + if (tp->ca_state != TCP_CA_Loss) { + tp->high_seq = tp->snd_nxt; + tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->prior_ssthresh = 0; + tp->undo_marker = 0; + tcp_set_ca_state(tp, TCP_CA_Loss); + } + tcp_xmit_retransmit_queue(sk); +} + +/* This retransmits one SKB. Policy decisions and retransmit queue + * state updates are done by the caller. Returns non-zero if an + * error occurred which prevented the send. + */ +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cur_mss = tcp_current_mss(sk, 0); + int err; + + /* Do not sent more than we queued. 1/4 is reserved for possible + * copying overhead: frgagmentation, tunneling, mangling etc. + */ + if (atomic_read(&sk->sk_wmem_alloc) > + min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) + return -EAGAIN; + + if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { + if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + BUG(); + + if (sk->sk_route_caps & NETIF_F_TSO) { + sk->sk_route_caps &= ~NETIF_F_TSO; + sock_set_flag(sk, SOCK_NO_LARGESEND); + tp->mss_cache = tp->mss_cache_std; + } + + if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) + return -ENOMEM; + } + + /* If receiver has shrunk his window, and skb is out of + * new window, do not retransmit it. The exception is the + * case, when window is shrunk to zero. In this case + * our retransmit serves as a zero window probe. + */ + if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd) + && TCP_SKB_CB(skb)->seq != tp->snd_una) + return -EAGAIN; + + if (skb->len > cur_mss) { + int old_factor = tcp_skb_pcount(skb); + int new_factor; + + if (tcp_fragment(sk, skb, cur_mss)) + return -ENOMEM; /* We'll try again later. */ + + /* New SKB created, account for it. */ + new_factor = tcp_skb_pcount(skb); + tp->packets_out -= old_factor - new_factor; + tp->packets_out += tcp_skb_pcount(skb->next); + } + + /* Collapse two adjacent packets if worthwhile and we can. */ + if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && + (skb->len < (cur_mss >> 1)) && + (skb->next != sk->sk_send_head) && + (skb->next != (struct sk_buff *)&sk->sk_write_queue) && + (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) && + (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) && + (sysctl_tcp_retrans_collapse != 0)) + tcp_retrans_try_collapse(sk, skb, cur_mss); + + if(tp->af_specific->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + + /* Some Solaris stacks overoptimize and ignore the FIN on a + * retransmit when old data is attached. So strip it off + * since it is cheap to do so and saves bytes on the network. + */ + if(skb->len > 0 && + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { + if (!pskb_trim(skb, 0)) { + TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + skb->ip_summed = CHECKSUM_NONE; + skb->csum = 0; + } + } + + /* Make a copy, if the first transmission SKB clone we made + * is still in somebody's hands, else make a clone. + */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); + + err = tcp_transmit_skb(sk, (skb_cloned(skb) ? + pskb_copy(skb, GFP_ATOMIC): + skb_clone(skb, GFP_ATOMIC))); + + if (err == 0) { + /* Update global TCP statistics. */ + TCP_INC_STATS(TCP_MIB_RETRANSSEGS); + + tp->total_retrans++; + +#if FASTRETRANS_DEBUG > 0 + if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + if (net_ratelimit()) + printk(KERN_DEBUG "retrans_out leaked.\n"); + } +#endif + TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; + tp->retrans_out += tcp_skb_pcount(skb); + + /* Save stamp of the first retransmit. */ + if (!tp->retrans_stamp) + tp->retrans_stamp = TCP_SKB_CB(skb)->when; + + tp->undo_retrans++; + + /* snd_nxt is stored to detect loss of retransmitted segment, + * see tcp_input.c tcp_sacktag_write_queue(). + */ + TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; + } + return err; +} + +/* This gets called after a retransmit timeout, and the initially + * retransmitted data is acknowledged. It tries to continue + * resending the rest of the retransmit queue, until either + * we've sent it all or the congestion window limit is reached. + * If doing SACK, the first ACK which comes back for a timeout + * based retransmit packet might feed us FACK information again. + * If so, we use it to avoid unnecessarily retransmissions. + */ +void tcp_xmit_retransmit_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int packet_cnt = tp->lost_out; + + /* First pass: retransmit lost packets. */ + if (packet_cnt) { + sk_stream_for_retrans_queue(skb, sk) { + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + /* Assume this retransmit will generate + * only one packet for congestion window + * calculation purposes. This works because + * tcp_retransmit_skb() will chop up the + * packet to be MSS sized and all the + * packet counting works out. + */ + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + return; + + if (sacked&TCPCB_LOST) { + if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { + if (tcp_retransmit_skb(sk, skb)) + return; + if (tp->ca_state != TCP_CA_Loss) + NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); + else + NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); + + if (skb == + skb_peek(&sk->sk_write_queue)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } + + packet_cnt -= tcp_skb_pcount(skb); + if (packet_cnt <= 0) + break; + } + } + } + + /* OK, demanded retransmission is finished. */ + + /* Forward retransmissions are possible only during Recovery. */ + if (tp->ca_state != TCP_CA_Recovery) + return; + + /* No forward retransmissions in Reno are possible. */ + if (!tp->rx_opt.sack_ok) + return; + + /* Yeah, we have to make difficult choice between forward transmission + * and retransmission... Both ways have their merits... + * + * For now we do not retransmit anything, while we have some new + * segments to send. + */ + + if (tcp_may_send_now(sk, tp)) + return; + + packet_cnt = 0; + + sk_stream_for_retrans_queue(skb, sk) { + /* Similar to the retransmit loop above we + * can pretend that the retransmitted SKB + * we send out here will be composed of one + * real MSS sized packet because tcp_retransmit_skb() + * will fragment it if necessary. + */ + if (++packet_cnt > tp->fackets_out) + break; + + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + break; + + if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) + continue; + + /* Ok, retransmit it. */ + if (tcp_retransmit_skb(sk, skb)) + break; + + if (skb == skb_peek(&sk->sk_write_queue)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + + NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); + } +} + + +/* Send a fin. The caller locks the socket for us. This cannot be + * allowed to fail queueing a FIN frame under any circumstances. + */ +void tcp_send_fin(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue); + int mss_now; + + /* Optimization, tack on the FIN if we have a queue of + * unsent frames. But be careful about outgoing SACKS + * and IP options. + */ + mss_now = tcp_current_mss(sk, 1); + + if (sk->sk_send_head != NULL) { + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; + TCP_SKB_CB(skb)->end_seq++; + tp->write_seq++; + } else { + /* Socket is locked, keep trying until memory is available. */ + for (;;) { + skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL); + if (skb) + break; + yield(); + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); + TCP_SKB_CB(skb)->sacked = 0; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + + /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + tcp_queue_skb(sk, skb); + } + __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF); +} + +/* We get here when a process closes a file descriptor (either due to + * an explicit close() or as a byproduct of exit()'ing) and there + * was unread data in the receive queue. This behavior is recommended + * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM + */ +void tcp_send_active_reset(struct sock *sk, int priority) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + /* NOTE: No TCP options attached and we never retransmit this. */ + skb = alloc_skb(MAX_TCP_HEADER, priority); + if (!skb) { + NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + return; + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); + TCP_SKB_CB(skb)->sacked = 0; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + + /* Send it off. */ + TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_transmit_skb(sk, skb)) + NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); +} + +/* WARNING: This routine must only be called when we have already sent + * a SYN packet that crossed the incoming SYN that caused this routine + * to get called. If this assumption fails then the initial rcv_wnd + * and rcv_wscale values will not be correct. + */ +int tcp_send_synack(struct sock *sk) +{ + struct sk_buff* skb; + + skb = skb_peek(&sk->sk_write_queue); + if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) { + printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); + return -EFAULT; + } + if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) { + if (skb_cloned(skb)) { + struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + if (nskb == NULL) + return -ENOMEM; + __skb_unlink(skb, &sk->sk_write_queue); + skb_header_release(nskb); + __skb_queue_head(&sk->sk_write_queue, nskb); + sk_stream_free_skb(sk, skb); + sk_charge_skb(sk, nskb); + skb = nskb; + } + + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK; + TCP_ECN_send_synack(tcp_sk(sk), skb); + } + TCP_SKB_CB(skb)->when = tcp_time_stamp; + return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); +} + +/* + * Prepare a SYN-ACK. + */ +struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, + struct open_request *req) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcphdr *th; + int tcp_header_size; + struct sk_buff *skb; + + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_TCP_HEADER); + + skb->dst = dst_clone(dst); + + tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + + (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + + (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + + /* SACK_PERM is in the place of NOP NOP of TS */ + ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); + skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); + + memset(th, 0, sizeof(struct tcphdr)); + th->syn = 1; + th->ack = 1; + if (dst->dev->features&NETIF_F_TSO) + req->ecn_ok = 0; + TCP_ECN_make_synack(req, th); + th->source = inet_sk(sk)->sport; + th->dest = req->rmt_port; + TCP_SKB_CB(skb)->seq = req->snt_isn; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + TCP_SKB_CB(skb)->sacked = 0; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + th->seq = htonl(TCP_SKB_CB(skb)->seq); + th->ack_seq = htonl(req->rcv_isn + 1); + if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ + __u8 rcv_wscale; + /* Set this up on the first call only */ + req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(tcp_full_space(sk), + dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rcv_wnd, + &req->window_clamp, + req->wscale_ok, + &rcv_wscale); + req->rcv_wscale = rcv_wscale; + } + + /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ + th->window = htons(req->rcv_wnd); + + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok, + req->sack_ok, req->wscale_ok, req->rcv_wscale, + TCP_SKB_CB(skb)->when, + req->ts_recent); + + skb->csum = 0; + th->doff = (tcp_header_size >> 2); + TCP_INC_STATS(TCP_MIB_OUTSEGS); + return skb; +} + +/* + * Do all connect socket setups that can be done AF independent. + */ +static inline void tcp_connect_init(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_sock *tp = tcp_sk(sk); + __u8 rcv_wscale; + + /* We'll fix this up when we get a response from the other end. + * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. + */ + tp->tcp_header_len = sizeof(struct tcphdr) + + (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); + + /* If user gave his TCP_MAXSEG, record it to clamp */ + if (tp->rx_opt.user_mss) + tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + tp->max_window = 0; + tcp_sync_mss(sk, dst_mtu(dst)); + + if (!tp->window_clamp) + tp->window_clamp = dst_metric(dst, RTAX_WINDOW); + tp->advmss = dst_metric(dst, RTAX_ADVMSS); + tcp_initialize_rcv_mss(sk); + tcp_ca_init(tp); + + tcp_select_initial_window(tcp_full_space(sk), + tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), + &tp->rcv_wnd, + &tp->window_clamp, + sysctl_tcp_window_scaling, + &rcv_wscale); + + tp->rx_opt.rcv_wscale = rcv_wscale; + tp->rcv_ssthresh = tp->rcv_wnd; + + sk->sk_err = 0; + sock_reset_flag(sk, SOCK_DONE); + tp->snd_wnd = 0; + tcp_init_wl(tp, tp->write_seq, 0); + tp->snd_una = tp->write_seq; + tp->snd_sml = tp->write_seq; + tp->rcv_nxt = 0; + tp->rcv_wup = 0; + tp->copied_seq = 0; + + tp->rto = TCP_TIMEOUT_INIT; + tp->retransmits = 0; + tcp_clear_retrans(tp); +} + +/* + * Build a SYN and send it off. + */ +int tcp_connect(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; + + tcp_connect_init(sk); + + buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation); + if (unlikely(buff == NULL)) + return -ENOBUFS; + + /* Reserve space for headers. */ + skb_reserve(buff, MAX_TCP_HEADER); + + TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; + TCP_ECN_send_syn(sk, tp, buff); + TCP_SKB_CB(buff)->sacked = 0; + skb_shinfo(buff)->tso_segs = 1; + skb_shinfo(buff)->tso_size = 0; + buff->csum = 0; + TCP_SKB_CB(buff)->seq = tp->write_seq++; + TCP_SKB_CB(buff)->end_seq = tp->write_seq; + tp->snd_nxt = tp->write_seq; + tp->pushed_seq = tp->write_seq; + tcp_ca_init(tp); + + /* Send it off. */ + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->retrans_stamp = TCP_SKB_CB(buff)->when; + skb_header_release(buff); + __skb_queue_tail(&sk->sk_write_queue, buff); + sk_charge_skb(sk, buff); + tp->packets_out += tcp_skb_pcount(buff); + tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); + TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); + + /* Timer for repeating the SYN until an answer. */ + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return 0; +} + +/* Send out a delayed ack, the caller does the policy checking + * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() + * for details. + */ +void tcp_send_delayed_ack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int ato = tp->ack.ato; + unsigned long timeout; + + if (ato > TCP_DELACK_MIN) { + int max_ato = HZ/2; + + if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED)) + max_ato = TCP_DELACK_MAX; + + /* Slow path, intersegment interval is "high". */ + + /* If some rtt estimate is known, use it to bound delayed ack. + * Do not use tp->rto here, use results of rtt measurements + * directly. + */ + if (tp->srtt) { + int rtt = max(tp->srtt>>3, TCP_DELACK_MIN); + + if (rtt < max_ato) + max_ato = rtt; + } + + ato = min(ato, max_ato); + } + + /* Stay within the limit we were given */ + timeout = jiffies + ato; + + /* Use new timeout only if there wasn't a older one earlier. */ + if (tp->ack.pending&TCP_ACK_TIMER) { + /* If delack timer was blocked or is about to expire, + * send ACK now. + */ + if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) { + tcp_send_ack(sk); + return; + } + + if (!time_before(timeout, tp->ack.timeout)) + timeout = tp->ack.timeout; + } + tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER; + tp->ack.timeout = timeout; + sk_reset_timer(sk, &tp->delack_timer, timeout); +} + +/* This routine sends an ack and also updates the window. */ +void tcp_send_ack(struct sock *sk) +{ + /* If we have been reset, we may not send again. */ + if (sk->sk_state != TCP_CLOSE) { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; + + /* We are not putting this on the write queue, so + * tcp_transmit_skb() will set the ownership to this + * sock. + */ + buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (buff == NULL) { + tcp_schedule_ack(tp); + tp->ack.ato = TCP_ATO_MIN; + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + return; + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(buff, MAX_TCP_HEADER); + buff->csum = 0; + TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(buff)->sacked = 0; + skb_shinfo(buff)->tso_segs = 1; + skb_shinfo(buff)->tso_size = 0; + + /* Send it off, this clears delayed acks for us. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tcp_transmit_skb(sk, buff); + } +} + +/* This routine sends a packet with an out of date sequence + * number. It assumes the other end will try to ack it. + * + * Question: what should we make while urgent mode? + * 4.4BSD forces sending single byte of data. We cannot send + * out of window data, because we have SND.NXT==SND.MAX... + * + * Current solution: to send TWO zero-length segments in urgent mode: + * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is + * out-of-date with SND.UNA-1 to probe window. + */ +static int tcp_xmit_probe_skb(struct sock *sk, int urgent) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + /* We don't queue it, tcp_transmit_skb() sets ownership. */ + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (skb == NULL) + return -1; + + /* Reserve space for headers and set control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(skb)->sacked = urgent; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + + /* Use a previous sequence. This should cause the other + * end to send an ack. Don't queue or clone SKB, just + * send it. + */ + TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + return tcp_transmit_skb(sk, skb); +} + +int tcp_write_wakeup(struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if ((skb = sk->sk_send_head) != NULL && + before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { + int err; + unsigned int mss = tcp_current_mss(sk, 0); + unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; + + if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; + + /* We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) + */ + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + if (tcp_fragment(sk, skb, seg_size)) + return -1; + /* SWS override triggered forced fragmentation. + * Disable TSO, the connection is too sick. */ + if (sk->sk_route_caps & NETIF_F_TSO) { + sock_set_flag(sk, SOCK_NO_LARGESEND); + sk->sk_route_caps &= ~NETIF_F_TSO; + tp->mss_cache = tp->mss_cache_std; + } + } else if (!tcp_skb_pcount(skb)) + tcp_set_skb_tso_segs(skb, tp->mss_cache_std); + + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); + err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + if (!err) { + update_send_head(sk, tp, skb); + } + return err; + } else { + if (tp->urg_mode && + between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF)) + tcp_xmit_probe_skb(sk, TCPCB_URG); + return tcp_xmit_probe_skb(sk, 0); + } + } + return -1; +} + +/* A window probe timeout has occurred. If window is not closed send + * a partial packet else a zero probe. + */ +void tcp_send_probe0(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int err; + + err = tcp_write_wakeup(sk); + + if (tp->packets_out || !sk->sk_send_head) { + /* Cancel probe timer, if it is not required. */ + tp->probes_out = 0; + tp->backoff = 0; + return; + } + + if (err <= 0) { + if (tp->backoff < sysctl_tcp_retries2) + tp->backoff++; + tp->probes_out++; + tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } else { + /* If packet was not sent due to local congestion, + * do not backoff and do not remember probes_out. + * Let local senders to fight for local resources. + * + * Use accumulated backoff yet. + */ + if (!tp->probes_out) + tp->probes_out=1; + tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); + } +} + +EXPORT_SYMBOL(tcp_connect); +EXPORT_SYMBOL(tcp_make_synack); +EXPORT_SYMBOL(tcp_simple_retransmit); +EXPORT_SYMBOL(tcp_sync_mss); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c new file mode 100644 index 000000000000..85b279f1e935 --- /dev/null +++ b/net/ipv4/tcp_timer.c @@ -0,0 +1,656 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + */ + +#include <linux/module.h> +#include <net/tcp.h> + +int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; +int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; +int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; +int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; +int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; +int sysctl_tcp_retries1 = TCP_RETR1; +int sysctl_tcp_retries2 = TCP_RETR2; +int sysctl_tcp_orphan_retries; + +static void tcp_write_timer(unsigned long); +static void tcp_delack_timer(unsigned long); +static void tcp_keepalive_timer (unsigned long data); + +#ifdef TCP_DEBUG +const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; +EXPORT_SYMBOL(tcp_timer_bug_msg); +#endif + +/* + * Using different timers for retransmit, delayed acks and probes + * We may wish use just one timer maintaining a list of expire jiffies + * to optimize. + */ + +void tcp_init_xmit_timers(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + init_timer(&tp->retransmit_timer); + tp->retransmit_timer.function=&tcp_write_timer; + tp->retransmit_timer.data = (unsigned long) sk; + tp->pending = 0; + + init_timer(&tp->delack_timer); + tp->delack_timer.function=&tcp_delack_timer; + tp->delack_timer.data = (unsigned long) sk; + tp->ack.pending = 0; + + init_timer(&sk->sk_timer); + sk->sk_timer.function = &tcp_keepalive_timer; + sk->sk_timer.data = (unsigned long)sk; +} + +void tcp_clear_xmit_timers(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->pending = 0; + sk_stop_timer(sk, &tp->retransmit_timer); + + tp->ack.pending = 0; + tp->ack.blocked = 0; + sk_stop_timer(sk, &tp->delack_timer); + + sk_stop_timer(sk, &sk->sk_timer); +} + +static void tcp_write_err(struct sock *sk) +{ + sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; + sk->sk_error_report(sk); + + tcp_done(sk); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); +} + +/* Do not allow orphaned sockets to eat all our resources. + * This is direct violation of TCP specs, but it is required + * to prevent DoS attacks. It is called when a retransmission timeout + * or zero probe timeout occurs on orphaned socket. + * + * Criterium is still not confirmed experimentally and may change. + * We kill the socket, if: + * 1. If number of orphaned sockets exceeds an administratively configured + * limit. + * 2. If we have strong memory pressure. + */ +static int tcp_out_of_resources(struct sock *sk, int do_reset) +{ + struct tcp_sock *tp = tcp_sk(sk); + int orphans = atomic_read(&tcp_orphan_count); + + /* If peer does not open window for long time, or did not transmit + * anything for long time, penalize it. */ + if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + orphans <<= 1; + + /* If some dubious ICMP arrived, penalize even more. */ + if (sk->sk_err_soft) + orphans <<= 1; + + if (orphans >= sysctl_tcp_max_orphans || + (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { + if (net_ratelimit()) + printk(KERN_INFO "Out of socket memory\n"); + + /* Catch exceptional cases, when connection requires reset. + * 1. Last segment was sent recently. */ + if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + /* 2. Window is closed. */ + (!tp->snd_wnd && !tp->packets_out)) + do_reset = 1; + if (do_reset) + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_done(sk); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + return 1; + } + return 0; +} + +/* Calculate maximal number or retries on an orphaned socket. */ +static int tcp_orphan_retries(struct sock *sk, int alive) +{ + int retries = sysctl_tcp_orphan_retries; /* May be zero. */ + + /* We know from an ICMP that something is wrong. */ + if (sk->sk_err_soft && !alive) + retries = 0; + + /* However, if socket sent something recently, select some safe + * number of retries. 8 corresponds to >100 seconds with minimal + * RTO of 200msec. */ + if (retries == 0 && alive) + retries = 8; + return retries; +} + +/* A write timeout has occurred. Process the after effects. */ +static int tcp_write_timeout(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int retry_until; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + if (tp->retransmits) + dst_negative_advice(&sk->sk_dst_cache); + retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries; + } else { + if (tp->retransmits >= sysctl_tcp_retries1) { + /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black + hole detection. :-( + + It is place to make it. It is not made. I do not want + to make it. It is disguisting. It does not work in any + case. Let me to cite the same draft, which requires for + us to implement this: + + "The one security concern raised by this memo is that ICMP black holes + are often caused by over-zealous security administrators who block + all ICMP messages. It is vitally important that those who design and + deploy security systems understand the impact of strict filtering on + upper-layer protocols. The safest web site in the world is worthless + if most TCP implementations cannot transfer data from it. It would + be far nicer to have all of the black holes fixed rather than fixing + all of the TCP implementations." + + Golden words :-). + */ + + dst_negative_advice(&sk->sk_dst_cache); + } + + retry_until = sysctl_tcp_retries2; + if (sock_flag(sk, SOCK_DEAD)) { + int alive = (tp->rto < TCP_RTO_MAX); + + retry_until = tcp_orphan_retries(sk, alive); + + if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until)) + return 1; + } + } + + if (tp->retransmits >= retry_until) { + /* Has it gone just too far? */ + tcp_write_err(sk); + return 1; + } + return 0; +} + +static void tcp_delack_timer(unsigned long data) +{ + struct sock *sk = (struct sock*)data; + struct tcp_sock *tp = tcp_sk(sk); + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + tp->ack.blocked = 1; + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); + sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN); + goto out_unlock; + } + + sk_stream_mem_reclaim(sk); + + if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER)) + goto out; + + if (time_after(tp->ack.timeout, jiffies)) { + sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout); + goto out; + } + tp->ack.pending &= ~TCP_ACK_TIMER; + + if (skb_queue_len(&tp->ucopy.prequeue)) { + struct sk_buff *skb; + + NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, + skb_queue_len(&tp->ucopy.prequeue)); + + while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) + sk->sk_backlog_rcv(sk, skb); + + tp->ucopy.memory = 0; + } + + if (tcp_ack_scheduled(tp)) { + if (!tp->ack.pingpong) { + /* Delayed ACK missed: inflate ATO. */ + tp->ack.ato = min(tp->ack.ato << 1, tp->rto); + } else { + /* Delayed ACK missed: leave pingpong mode and + * deflate ATO. + */ + tp->ack.pingpong = 0; + tp->ack.ato = TCP_ATO_MIN; + } + tcp_send_ack(sk); + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); + } + TCP_CHECK_TIMER(sk); + +out: + if (tcp_memory_pressure) + sk_stream_mem_reclaim(sk); +out_unlock: + bh_unlock_sock(sk); + sock_put(sk); +} + +static void tcp_probe_timer(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int max_probes; + + if (tp->packets_out || !sk->sk_send_head) { + tp->probes_out = 0; + return; + } + + /* *WARNING* RFC 1122 forbids this + * + * It doesn't AFAIK, because we kill the retransmit timer -AK + * + * FIXME: We ought not to do it, Solaris 2.5 actually has fixing + * this behaviour in Solaris down as a bug fix. [AC] + * + * Let me to explain. probes_out is zeroed by incoming ACKs + * even if they advertise zero window. Hence, connection is killed only + * if we received no ACKs for normal connection timeout. It is not killed + * only because window stays zero for some time, window may be zero + * until armageddon and even later. We are in full accordance + * with RFCs, only probe timer combines both retransmission timeout + * and probe timeout in one bottle. --ANK + */ + max_probes = sysctl_tcp_retries2; + + if (sock_flag(sk, SOCK_DEAD)) { + int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX); + + max_probes = tcp_orphan_retries(sk, alive); + + if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes)) + return; + } + + if (tp->probes_out > max_probes) { + tcp_write_err(sk); + } else { + /* Only send another probe if we didn't close things up. */ + tcp_send_probe0(sk); + } +} + +/* + * The TCP retransmit timer. + */ + +static void tcp_retransmit_timer(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->packets_out) + goto out; + + BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue)); + + if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && + !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { + /* Receiver dastardly shrinks window. Our retransmits + * become zero probes, but we should not timeout this + * connection. If the socket is an orphan, time it out, + * we cannot allow such beasts to hang infinitely. + */ +#ifdef TCP_DEBUG + if (net_ratelimit()) { + struct inet_sock *inet = inet_sk(sk); + printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n", + NIPQUAD(inet->daddr), htons(inet->dport), + inet->num, tp->snd_una, tp->snd_nxt); + } +#endif + if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { + tcp_write_err(sk); + goto out; + } + tcp_enter_loss(sk, 0); + tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); + __sk_dst_reset(sk); + goto out_reset_timer; + } + + if (tcp_write_timeout(sk)) + goto out; + + if (tp->retransmits == 0) { + if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { + if (tp->rx_opt.sack_ok) { + if (tp->ca_state == TCP_CA_Recovery) + NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); + else + NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); + } else { + if (tp->ca_state == TCP_CA_Recovery) + NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); + else + NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); + } + } else if (tp->ca_state == TCP_CA_Loss) { + NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); + } else { + NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); + } + } + + if (tcp_use_frto(sk)) { + tcp_enter_frto(sk); + } else { + tcp_enter_loss(sk, 0); + } + + if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) { + /* Retransmission failed because of local congestion, + * do not backoff. + */ + if (!tp->retransmits) + tp->retransmits=1; + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, + min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); + goto out; + } + + /* Increase the timeout each time we retransmit. Note that + * we do not increase the rtt estimate. rto is initialized + * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests + * that doubling rto each time is the least we can get away with. + * In KA9Q, Karn uses this for the first few times, and then + * goes to quadratic. netBSD doubles, but only goes up to *64, + * and clamps at 1 to 64 sec afterwards. Note that 120 sec is + * defined in the protocol as the maximum possible RTT. I guess + * we'll have to use something other than TCP to talk to the + * University of Mars. + * + * PAWS allows us longer timeouts and large windows, so once + * implemented ftp to mars will work nicely. We will have to fix + * the 120 second clamps though! + */ + tp->backoff++; + tp->retransmits++; + +out_reset_timer: + tp->rto = min(tp->rto << 1, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + if (tp->retransmits > sysctl_tcp_retries1) + __sk_dst_reset(sk); + +out:; +} + +static void tcp_write_timer(unsigned long data) +{ + struct sock *sk = (struct sock*)data; + struct tcp_sock *tp = tcp_sk(sk); + int event; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later */ + sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20)); + goto out_unlock; + } + + if (sk->sk_state == TCP_CLOSE || !tp->pending) + goto out; + + if (time_after(tp->timeout, jiffies)) { + sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout); + goto out; + } + + event = tp->pending; + tp->pending = 0; + + switch (event) { + case TCP_TIME_RETRANS: + tcp_retransmit_timer(sk); + break; + case TCP_TIME_PROBE0: + tcp_probe_timer(sk); + break; + } + TCP_CHECK_TIMER(sk); + +out: + sk_stream_mem_reclaim(sk); +out_unlock: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * Timer for listening sockets + */ + +static void tcp_synack_timer(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_listen_opt *lopt = tp->listen_opt; + int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries; + int thresh = max_retries; + unsigned long now = jiffies; + struct open_request **reqp, *req; + int i, budget; + + if (lopt == NULL || lopt->qlen == 0) + return; + + /* Normally all the openreqs are young and become mature + * (i.e. converted to established socket) for first timeout. + * If synack was not acknowledged for 3 seconds, it means + * one of the following things: synack was lost, ack was lost, + * rtt is high or nobody planned to ack (i.e. synflood). + * When server is a bit loaded, queue is populated with old + * open requests, reducing effective size of queue. + * When server is well loaded, queue size reduces to zero + * after several minutes of work. It is not synflood, + * it is normal operation. The solution is pruning + * too old entries overriding normal timeout, when + * situation becomes dangerous. + * + * Essentially, we reserve half of room for young + * embrions; and abort old ones without pity, if old + * ones are about to clog our table. + */ + if (lopt->qlen>>(lopt->max_qlen_log-1)) { + int young = (lopt->qlen_young<<1); + + while (thresh > 2) { + if (lopt->qlen < young) + break; + thresh--; + young <<= 1; + } + } + + if (tp->defer_accept) + max_retries = tp->defer_accept; + + budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL)); + i = lopt->clock_hand; + + do { + reqp=&lopt->syn_table[i]; + while ((req = *reqp) != NULL) { + if (time_after_eq(now, req->expires)) { + if ((req->retrans < thresh || + (req->acked && req->retrans < max_retries)) + && !req->class->rtx_syn_ack(sk, req, NULL)) { + unsigned long timeo; + + if (req->retrans++ == 0) + lopt->qlen_young--; + timeo = min((TCP_TIMEOUT_INIT << req->retrans), + TCP_RTO_MAX); + req->expires = now + timeo; + reqp = &req->dl_next; + continue; + } + + /* Drop this request */ + write_lock(&tp->syn_wait_lock); + *reqp = req->dl_next; + write_unlock(&tp->syn_wait_lock); + lopt->qlen--; + if (req->retrans == 0) + lopt->qlen_young--; + tcp_openreq_free(req); + continue; + } + reqp = &req->dl_next; + } + + i = (i+1)&(TCP_SYNQ_HSIZE-1); + + } while (--budget > 0); + + lopt->clock_hand = i; + + if (lopt->qlen) + tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); +} + +void tcp_delete_keepalive_timer (struct sock *sk) +{ + sk_stop_timer(sk, &sk->sk_timer); +} + +void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len) +{ + sk_reset_timer(sk, &sk->sk_timer, jiffies + len); +} + +void tcp_set_keepalive(struct sock *sk, int val) +{ + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) + return; + + if (val && !sock_flag(sk, SOCK_KEEPOPEN)) + tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); + else if (!val) + tcp_delete_keepalive_timer(sk); +} + + +static void tcp_keepalive_timer (unsigned long data) +{ + struct sock *sk = (struct sock *) data; + struct tcp_sock *tp = tcp_sk(sk); + __u32 elapsed; + + /* Only process if socket is not in use. */ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + tcp_reset_keepalive_timer (sk, HZ/20); + goto out; + } + + if (sk->sk_state == TCP_LISTEN) { + tcp_synack_timer(sk); + goto out; + } + + if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { + if (tp->linger2 >= 0) { + int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; + + if (tmo > 0) { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + tcp_send_active_reset(sk, GFP_ATOMIC); + goto death; + } + + if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) + goto out; + + elapsed = keepalive_time_when(tp); + + /* It is alive without keepalive 8) */ + if (tp->packets_out || sk->sk_send_head) + goto resched; + + elapsed = tcp_time_stamp - tp->rcv_tstamp; + + if (elapsed >= keepalive_time_when(tp)) { + if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) || + (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) { + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_write_err(sk); + goto out; + } + if (tcp_write_wakeup(sk) <= 0) { + tp->probes_out++; + elapsed = keepalive_intvl_when(tp); + } else { + /* If keepalive was lost due to local congestion, + * try harder. + */ + elapsed = TCP_RESOURCE_PROBE_INTERVAL; + } + } else { + /* It is tp->rcv_tstamp + keepalive_time_when(tp) */ + elapsed = keepalive_time_when(tp) - elapsed; + } + + TCP_CHECK_TIMER(sk); + sk_stream_mem_reclaim(sk); + +resched: + tcp_reset_keepalive_timer (sk, elapsed); + goto out; + +death: + tcp_done(sk); + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +EXPORT_SYMBOL(tcp_clear_xmit_timers); +EXPORT_SYMBOL(tcp_delete_keepalive_timer); +EXPORT_SYMBOL(tcp_init_xmit_timers); +EXPORT_SYMBOL(tcp_reset_keepalive_timer); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c new file mode 100644 index 000000000000..6baddfbedca3 --- /dev/null +++ b/net/ipv4/udp.c @@ -0,0 +1,1575 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The User Datagram Protocol (UDP). + * + * Version: $Id: udp.c,v 1.102 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Alan Cox, <Alan.Cox@linux.org> + * Hirokazu Takahashi, <taka@valinux.co.jp> + * + * Fixes: + * Alan Cox : verify_area() calls + * Alan Cox : stopped close while in use off icmp + * messages. Not a fix but a botch that + * for udp at least is 'valid'. + * Alan Cox : Fixed icmp handling properly + * Alan Cox : Correct error for oversized datagrams + * Alan Cox : Tidied select() semantics. + * Alan Cox : udp_err() fixed properly, also now + * select and read wake correctly on errors + * Alan Cox : udp_send verify_area moved to avoid mem leak + * Alan Cox : UDP can count its memory + * Alan Cox : send to an unknown connection causes + * an ECONNREFUSED off the icmp, but + * does NOT close. + * Alan Cox : Switched to new sk_buff handlers. No more backlog! + * Alan Cox : Using generic datagram code. Even smaller and the PEEK + * bug no longer crashes it. + * Fred Van Kempen : Net2e support for sk->broadcast. + * Alan Cox : Uses skb_free_datagram + * Alan Cox : Added get/set sockopt support. + * Alan Cox : Broadcasting without option set returns EACCES. + * Alan Cox : No wakeup calls. Instead we now use the callbacks. + * Alan Cox : Use ip_tos and ip_ttl + * Alan Cox : SNMP Mibs + * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support. + * Matt Dillon : UDP length checks. + * Alan Cox : Smarter af_inet used properly. + * Alan Cox : Use new kernel side addressing. + * Alan Cox : Incorrect return on truncated datagram receive. + * Arnt Gulbrandsen : New udp_send and stuff + * Alan Cox : Cache last socket + * Alan Cox : Route cache + * Jon Peatfield : Minor efficiency fix to sendto(). + * Mike Shaver : RFC1122 checks. + * Alan Cox : Nonblocking error fix. + * Willy Konynenberg : Transparent proxying support. + * Mike McLagan : Routing by source + * David S. Miller : New socket lookup architecture. + * Last socket cache retained as it + * does have a high hit rate. + * Olaf Kirch : Don't linearise iovec on sendmsg. + * Andi Kleen : Some cleanups, cache destination entry + * for connect. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Melvin Smith : Check msg_name not msg_namelen in sendto(), + * return ENOTCONN for unconnected sockets (POSIX) + * Janos Farkas : don't deliver multi/broadcasts to a different + * bound-to-device socket + * Hirokazu Takahashi : HW checksumming for outgoing UDP + * datagrams. + * Hirokazu Takahashi : sendfile() on UDP works now. + * Arnaldo C. Melo : convert /proc/net/udp to seq_file + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/ioctls.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/config.h> +#include <linux/inet.h> +#include <linux/ipv6.h> +#include <linux/netdevice.h> +#include <net/snmp.h> +#include <net/tcp.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <net/sock.h> +#include <net/udp.h> +#include <net/icmp.h> +#include <net/route.h> +#include <net/inet_common.h> +#include <net/checksum.h> +#include <net/xfrm.h> + +/* + * Snmp MIB for the UDP layer + */ + +DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); + +struct hlist_head udp_hash[UDP_HTABLE_SIZE]; +DEFINE_RWLOCK(udp_hash_lock); + +/* Shared by v4/v6 udp. */ +int udp_port_rover; + +static int udp_v4_get_port(struct sock *sk, unsigned short snum) +{ + struct hlist_node *node; + struct sock *sk2; + struct inet_sock *inet = inet_sk(sk); + + write_lock_bh(&udp_hash_lock); + if (snum == 0) { + int best_size_so_far, best, result, i; + + if (udp_port_rover > sysctl_local_port_range[1] || + udp_port_rover < sysctl_local_port_range[0]) + udp_port_rover = sysctl_local_port_range[0]; + best_size_so_far = 32767; + best = result = udp_port_rover; + for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { + struct hlist_head *list; + int size; + + list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; + if (hlist_empty(list)) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); + goto gotit; + } + size = 0; + sk_for_each(sk2, node, list) + if (++size >= best_size_so_far) + goto next; + best_size_so_far = size; + best = result; + next:; + } + result = best; + for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); + if (!udp_lport_inuse(result)) + break; + } + if (i >= (1 << 16) / UDP_HTABLE_SIZE) + goto fail; +gotit: + udp_port_rover = snum = result; + } else { + sk_for_each(sk2, node, + &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { + struct inet_sock *inet2 = inet_sk(sk2); + + if (inet2->num == snum && + sk2 != sk && + !ipv6_only_sock(sk2) && + (!sk2->sk_bound_dev_if || + !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (!inet2->rcv_saddr || + !inet->rcv_saddr || + inet2->rcv_saddr == inet->rcv_saddr) && + (!sk2->sk_reuse || !sk->sk_reuse)) + goto fail; + } + } + inet->num = snum; + if (sk_unhashed(sk)) { + struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]; + + sk_add_node(sk, h); + sock_prot_inc_use(sk->sk_prot); + } + write_unlock_bh(&udp_hash_lock); + return 0; + +fail: + write_unlock_bh(&udp_hash_lock); + return 1; +} + +static void udp_v4_hash(struct sock *sk) +{ + BUG(); +} + +static void udp_v4_unhash(struct sock *sk) +{ + write_lock_bh(&udp_hash_lock); + if (sk_del_node_init(sk)) { + inet_sk(sk)->num = 0; + sock_prot_dec_use(sk->sk_prot); + } + write_unlock_bh(&udp_hash_lock); +} + +/* UDP is nearly always wildcards out the wazoo, it makes no sense to try + * harder than this. -DaveM + */ +static struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, + u32 daddr, u16 dport, int dif) +{ + struct sock *sk, *result = NULL; + struct hlist_node *node; + unsigned short hnum = ntohs(dport); + int badness = -1; + + sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) { + struct inet_sock *inet = inet_sk(sk); + + if (inet->num == hnum && !ipv6_only_sock(sk)) { + int score = (sk->sk_family == PF_INET ? 1 : 0); + if (inet->rcv_saddr) { + if (inet->rcv_saddr != daddr) + continue; + score+=2; + } + if (inet->daddr) { + if (inet->daddr != saddr) + continue; + score+=2; + } + if (inet->dport) { + if (inet->dport != sport) + continue; + score+=2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score+=2; + } + if(score == 9) { + result = sk; + break; + } else if(score > badness) { + result = sk; + badness = score; + } + } + } + return result; +} + +static __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, + u32 daddr, u16 dport, int dif) +{ + struct sock *sk; + + read_lock(&udp_hash_lock); + sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif); + if (sk) + sock_hold(sk); + read_unlock(&udp_hash_lock); + return sk; +} + +static inline struct sock *udp_v4_mcast_next(struct sock *sk, + u16 loc_port, u32 loc_addr, + u16 rmt_port, u32 rmt_addr, + int dif) +{ + struct hlist_node *node; + struct sock *s = sk; + unsigned short hnum = ntohs(loc_port); + + sk_for_each_from(s, node) { + struct inet_sock *inet = inet_sk(s); + + if (inet->num != hnum || + (inet->daddr && inet->daddr != rmt_addr) || + (inet->dport != rmt_port && inet->dport) || + (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || + ipv6_only_sock(s) || + (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) + continue; + if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) + continue; + goto found; + } + s = NULL; +found: + return s; +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should + * be closed and the error returned to the user. If err > 0 + * it's just the icmp type << 8 | icmp code. + * Header points to the ip header of the error packet. We move + * on past this. Then (as it used to claim before adjustment) + * header points to the first 8 bytes of the udp header. We need + * to find the appropriate port. + */ + +void udp_err(struct sk_buff *skb, u32 info) +{ + struct inet_sock *inet; + struct iphdr *iph = (struct iphdr*)skb->data; + struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2)); + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct sock *sk; + int harderr; + int err; + + sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex); + if (sk == NULL) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; /* No socket for error */ + } + + err = 0; + harderr = 0; + inet = inet_sk(sk); + + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + goto out; + case ICMP_PARAMETERPROB: + err = EPROTO; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ + if (inet->pmtudisc != IP_PMTUDISC_DONT) { + err = EMSGSIZE; + harderr = 1; + break; + } + goto out; + } + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + } + + /* + * RFC1122: OK. Passes ICMP errors back to application, as per + * 4.1.3.3. + */ + if (!inet->recverr) { + if (!harderr || sk->sk_state != TCP_ESTABLISHED) + goto out; + } else { + ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1)); + } + sk->sk_err = err; + sk->sk_error_report(sk); +out: + sock_put(sk); +} + +/* + * Throw away all pending data and cancel the corking. Socket is locked. + */ +static void udp_flush_pending_frames(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + + if (up->pending) { + up->len = 0; + up->pending = 0; + ip_flush_pending_frames(sk); + } +} + +/* + * Push out all pending data as one UDP datagram. Socket is locked. + */ +static int udp_push_pending_frames(struct sock *sk, struct udp_sock *up) +{ + struct inet_sock *inet = inet_sk(sk); + struct flowi *fl = &inet->cork.fl; + struct sk_buff *skb; + struct udphdr *uh; + int err = 0; + + /* Grab the skbuff where UDP header space exists. */ + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + /* + * Create a UDP header + */ + uh = skb->h.uh; + uh->source = fl->fl_ip_sport; + uh->dest = fl->fl_ip_dport; + uh->len = htons(up->len); + uh->check = 0; + + if (sk->sk_no_check == UDP_CSUM_NOXMIT) { + skb->ip_summed = CHECKSUM_NONE; + goto send; + } + + if (skb_queue_len(&sk->sk_write_queue) == 1) { + /* + * Only one fragment on the socket. + */ + if (skb->ip_summed == CHECKSUM_HW) { + skb->csum = offsetof(struct udphdr, check); + uh->check = ~csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, + up->len, IPPROTO_UDP, 0); + } else { + skb->csum = csum_partial((char *)uh, + sizeof(struct udphdr), skb->csum); + uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, + up->len, IPPROTO_UDP, skb->csum); + if (uh->check == 0) + uh->check = -1; + } + } else { + unsigned int csum = 0; + /* + * HW-checksum won't work as there are two or more + * fragments on the socket so that all csums of sk_buffs + * should be together. + */ + if (skb->ip_summed == CHECKSUM_HW) { + int offset = (unsigned char *)uh - skb->data; + skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + + skb->ip_summed = CHECKSUM_NONE; + } else { + skb->csum = csum_partial((char *)uh, + sizeof(struct udphdr), skb->csum); + } + + skb_queue_walk(&sk->sk_write_queue, skb) { + csum = csum_add(csum, skb->csum); + } + uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, + up->len, IPPROTO_UDP, csum); + if (uh->check == 0) + uh->check = -1; + } +send: + err = ip_push_pending_frames(sk); +out: + up->len = 0; + up->pending = 0; + return err; +} + + +static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base) +{ + return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base)); +} + +int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct inet_sock *inet = inet_sk(sk); + struct udp_sock *up = udp_sk(sk); + int ulen = len; + struct ipcm_cookie ipc; + struct rtable *rt = NULL; + int free = 0; + int connected = 0; + u32 daddr, faddr, saddr; + u16 dport; + u8 tos; + int err; + int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + + if (len > 0xFFFF) + return -EMSGSIZE; + + /* + * Check the flags. + */ + + if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */ + return -EOPNOTSUPP; + + ipc.opt = NULL; + + if (up->pending) { + /* + * There are pending frames. + * The socket lock must be held while it's corked. + */ + lock_sock(sk); + if (likely(up->pending)) { + if (unlikely(up->pending != AF_INET)) { + release_sock(sk); + return -EINVAL; + } + goto do_append_data; + } + release_sock(sk); + } + ulen += sizeof(struct udphdr); + + /* + * Get and verify the address. + */ + if (msg->msg_name) { + struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name; + if (msg->msg_namelen < sizeof(*usin)) + return -EINVAL; + if (usin->sin_family != AF_INET) { + if (usin->sin_family != AF_UNSPEC) + return -EAFNOSUPPORT; + } + + daddr = usin->sin_addr.s_addr; + dport = usin->sin_port; + if (dport == 0) + return -EINVAL; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = inet->daddr; + dport = inet->dport; + /* Open fast path for connected socket. + Route will not be used, if at least one option is set. + */ + connected = 1; + } + ipc.addr = inet->saddr; + + ipc.oif = sk->sk_bound_dev_if; + if (msg->msg_controllen) { + err = ip_cmsg_send(msg, &ipc); + if (err) + return err; + if (ipc.opt) + free = 1; + connected = 0; + } + if (!ipc.opt) + ipc.opt = inet->opt; + + saddr = ipc.addr; + ipc.addr = faddr = daddr; + + if (ipc.opt && ipc.opt->srr) { + if (!daddr) + return -EINVAL; + faddr = ipc.opt->faddr; + connected = 0; + } + tos = RT_TOS(inet->tos); + if (sock_flag(sk, SOCK_LOCALROUTE) || + (msg->msg_flags & MSG_DONTROUTE) || + (ipc.opt && ipc.opt->is_strictroute)) { + tos |= RTO_ONLINK; + connected = 0; + } + + if (MULTICAST(daddr)) { + if (!ipc.oif) + ipc.oif = inet->mc_index; + if (!saddr) + saddr = inet->mc_addr; + connected = 0; + } + + if (connected) + rt = (struct rtable*)sk_dst_check(sk, 0); + + if (rt == NULL) { + struct flowi fl = { .oif = ipc.oif, + .nl_u = { .ip4_u = + { .daddr = faddr, + .saddr = saddr, + .tos = tos } }, + .proto = IPPROTO_UDP, + .uli_u = { .ports = + { .sport = inet->sport, + .dport = dport } } }; + err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); + if (err) + goto out; + + err = -EACCES; + if ((rt->rt_flags & RTCF_BROADCAST) && + !sock_flag(sk, SOCK_BROADCAST)) + goto out; + if (connected) + sk_dst_set(sk, dst_clone(&rt->u.dst)); + } + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + saddr = rt->rt_src; + if (!ipc.addr) + daddr = ipc.addr = rt->rt_dst; + + lock_sock(sk); + if (unlikely(up->pending)) { + /* The socket is already corked while preparing it. */ + /* ... which is an evident application bug. --ANK */ + release_sock(sk); + + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n")); + err = -EINVAL; + goto out; + } + /* + * Now cork the socket to pend data. + */ + inet->cork.fl.fl4_dst = daddr; + inet->cork.fl.fl_ip_dport = dport; + inet->cork.fl.fl4_src = saddr; + inet->cork.fl.fl_ip_sport = inet->sport; + up->pending = AF_INET; + +do_append_data: + up->len += ulen; + err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen, + sizeof(struct udphdr), &ipc, rt, + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); + if (err) + udp_flush_pending_frames(sk); + else if (!corkreq) + err = udp_push_pending_frames(sk, up); + release_sock(sk); + +out: + ip_rt_put(rt); + if (free) + kfree(ipc.opt); + if (!err) { + UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS); + return len; + } + return err; + +do_confirm: + dst_confirm(&rt->u.dst); + if (!(msg->msg_flags&MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto out; +} + +static int udp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct udp_sock *up = udp_sk(sk); + int ret; + + if (!up->pending) { + struct msghdr msg = { .msg_flags = flags|MSG_MORE }; + + /* Call udp_sendmsg to specify destination address which + * sendpage interface can't pass. + * This will succeed only when the socket is connected. + */ + ret = udp_sendmsg(NULL, sk, &msg, 0); + if (ret < 0) + return ret; + } + + lock_sock(sk); + + if (unlikely(!up->pending)) { + release_sock(sk); + + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n")); + return -EINVAL; + } + + ret = ip_append_page(sk, page, offset, size, flags); + if (ret == -EOPNOTSUPP) { + release_sock(sk); + return sock_no_sendpage(sk->sk_socket, page, offset, + size, flags); + } + if (ret < 0) { + udp_flush_pending_frames(sk); + goto out; + } + + up->len += size; + if (!(up->corkflag || (flags&MSG_MORE))) + ret = udp_push_pending_frames(sk, up); + if (!ret) + ret = size; +out: + release_sock(sk); + return ret; +} + +/* + * IOCTL requests applicable to the UDP protocol + */ + +int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch(cmd) + { + case SIOCOUTQ: + { + int amount = atomic_read(&sk->sk_wmem_alloc); + return put_user(amount, (int __user *)arg); + } + + case SIOCINQ: + { + struct sk_buff *skb; + unsigned long amount; + + amount = 0; + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) { + /* + * We will only return the amount + * of this packet since that is all + * that will be read. + */ + amount = skb->len - sizeof(struct udphdr); + } + spin_unlock_irq(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + + default: + return -ENOIOCTLCMD; + } + return(0); +} + +static __inline__ int __udp_checksum_complete(struct sk_buff *skb) +{ + return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); +} + +static __inline__ int udp_checksum_complete(struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __udp_checksum_complete(skb); +} + +/* + * This should be easy, if there is something there we + * return it, otherwise we block. + */ + +static int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + struct sk_buff *skb; + int copied, err; + + /* + * Check any passed addresses + */ + if (addr_len) + *addr_len=sizeof(*sin); + + if (flags & MSG_ERRQUEUE) + return ip_recv_error(sk, msg, len); + +try_again: + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len - sizeof(struct udphdr); + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; + } + + if (skb->ip_summed==CHECKSUM_UNNECESSARY) { + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else if (msg->msg_flags&MSG_TRUNC) { + if (__udp_checksum_complete(skb)) + goto csum_copy_err; + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else { + err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); + + if (err == -EINVAL) + goto csum_copy_err; + } + + if (err) + goto out_free; + + sock_recv_timestamp(msg, sk, skb); + + /* Copy the address. */ + if (sin) + { + sin->sin_family = AF_INET; + sin->sin_port = skb->h.uh->source; + sin->sin_addr.s_addr = skb->nh.iph->saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + + err = copied; + if (flags & MSG_TRUNC) + err = skb->len - sizeof(struct udphdr); + +out_free: + skb_free_datagram(sk, skb); +out: + return err; + +csum_copy_err: + UDP_INC_STATS_BH(UDP_MIB_INERRORS); + + /* Clear queue. */ + if (flags&MSG_PEEK) { + int clear = 0; + spin_lock_irq(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + clear = 1; + } + spin_unlock_irq(&sk->sk_receive_queue.lock); + if (clear) + kfree_skb(skb); + } + + skb_free_datagram(sk, skb); + + if (noblock) + return -EAGAIN; + goto try_again; +} + + +int udp_disconnect(struct sock *sk, int flags) +{ + struct inet_sock *inet = inet_sk(sk); + /* + * 1003.1g - break association. + */ + + sk->sk_state = TCP_CLOSE; + inet->daddr = 0; + inet->dport = 0; + sk->sk_bound_dev_if = 0; + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); + + if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { + sk->sk_prot->unhash(sk); + inet->sport = 0; + } + sk_dst_reset(sk); + return 0; +} + +static void udp_close(struct sock *sk, long timeout) +{ + sk_common_release(sk); +} + +/* return: + * 1 if the the UDP system should process it + * 0 if we should drop this packet + * -1 if it should get processed by xfrm4_rcv_encap + */ +static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) +{ +#ifndef CONFIG_XFRM + return 1; +#else + struct udp_sock *up = udp_sk(sk); + struct udphdr *uh = skb->h.uh; + struct iphdr *iph; + int iphlen, len; + + __u8 *udpdata = (__u8 *)uh + sizeof(struct udphdr); + __u32 *udpdata32 = (__u32 *)udpdata; + __u16 encap_type = up->encap_type; + + /* if we're overly short, let UDP handle it */ + if (udpdata > skb->tail) + return 1; + + /* if this is not encapsulated socket, then just return now */ + if (!encap_type) + return 1; + + len = skb->tail - udpdata; + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + /* Check if this is a keepalive packet. If so, eat it. */ + if (len == 1 && udpdata[0] == 0xff) { + return 0; + } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0 ) { + /* ESP Packet without Non-ESP header */ + len = sizeof(struct udphdr); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + /* Check if this is a keepalive packet. If so, eat it. */ + if (len == 1 && udpdata[0] == 0xff) { + return 0; + } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && + udpdata32[0] == 0 && udpdata32[1] == 0) { + + /* ESP Packet with Non-IKE marker */ + len = sizeof(struct udphdr) + 2 * sizeof(u32); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + } + + /* At this point we are sure that this is an ESPinUDP packet, + * so we need to remove 'len' bytes from the packet (the UDP + * header and optional ESP marker bytes) and then modify the + * protocol to ESP, and then call into the transform receiver. + */ + + /* Now we can update and verify the packet length... */ + iph = skb->nh.iph; + iphlen = iph->ihl << 2; + iph->tot_len = htons(ntohs(iph->tot_len) - len); + if (skb->len < iphlen + len) { + /* packet is too small!?! */ + return 0; + } + + /* pull the data buffer up to the ESP header and set the + * transport header to point to ESP. Keep UDP on the stack + * for later. + */ + skb->h.raw = skb_pull(skb, len); + + /* modify the protocol (it's ESP!) */ + iph->protocol = IPPROTO_ESP; + + /* and let the caller know to send this into the ESP processor... */ + return -1; +#endif +} + +/* returns: + * -1: error + * 0: success + * >0: "udp encap" protocol resubmission + * + * Note that in the success and error cases, the skb is assumed to + * have either been requeued or freed. + */ +static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +{ + struct udp_sock *up = udp_sk(sk); + + /* + * Charge it to the socket, dropping if the queue is full. + */ + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return -1; + } + + if (up->encap_type) { + /* + * This is an encapsulation socket, so let's see if this is + * an encapsulated packet. + * If it's a keepalive packet, then just eat it. + * If it's an encapsulateed packet, then pass it to the + * IPsec xfrm input and return the response + * appropriately. Otherwise, just fall through and + * pass this up the UDP socket. + */ + int ret; + + ret = udp_encap_rcv(sk, skb); + if (ret == 0) { + /* Eat the packet .. */ + kfree_skb(skb); + return 0; + } + if (ret < 0) { + /* process the ESP packet */ + ret = xfrm4_rcv_encap(skb, up->encap_type); + UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS); + return -ret; + } + /* FALLTHROUGH -- it's a UDP Packet */ + } + + if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) { + if (__udp_checksum_complete(skb)) { + UDP_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return -1; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + if (sock_queue_rcv_skb(sk,skb)<0) { + UDP_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return -1; + } + UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS); + return 0; +} + +/* + * Multicasts and broadcasts go to each listener. + * + * Note: called only from the BH handler context, + * so we don't need to lock the hashes. + */ +static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, + u32 saddr, u32 daddr) +{ + struct sock *sk; + int dif; + + read_lock(&udp_hash_lock); + sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); + dif = skb->dev->ifindex; + sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); + if (sk) { + struct sock *sknext = NULL; + + do { + struct sk_buff *skb1 = skb; + + sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, + uh->source, saddr, dif); + if(sknext) + skb1 = skb_clone(skb, GFP_ATOMIC); + + if(skb1) { + int ret = udp_queue_rcv_skb(sk, skb1); + if (ret > 0) + /* we should probably re-process instead + * of dropping packets here. */ + kfree_skb(skb1); + } + sk = sknext; + } while(sknext); + } else + kfree_skb(skb); + read_unlock(&udp_hash_lock); + return 0; +} + +/* Initialize UDP checksum. If exited with zero value (success), + * CHECKSUM_UNNECESSARY means, that no more checks are required. + * Otherwise, csum completion requires chacksumming packet body, + * including udp header and folding it to skb->csum. + */ +static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, u32 saddr, u32 daddr) +{ + if (uh->check == 0) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) + return 0; + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n")); + skb->ip_summed = CHECKSUM_NONE; + } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + /* Probably, we should checksum udp header (it should be in cache + * in any case) and data in tiny packets (< rx copybreak). + */ + return 0; +} + +/* + * All we need to do is get the socket, and then do a checksum. + */ + +int udp_rcv(struct sk_buff *skb) +{ + struct sock *sk; + struct udphdr *uh; + unsigned short ulen; + struct rtable *rt = (struct rtable*)skb->dst; + u32 saddr = skb->nh.iph->saddr; + u32 daddr = skb->nh.iph->daddr; + int len = skb->len; + + /* + * Validate the packet and the UDP length. + */ + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto no_header; + + uh = skb->h.uh; + + ulen = ntohs(uh->len); + + if (ulen > len || ulen < sizeof(*uh)) + goto short_packet; + + if (pskb_trim(skb, ulen)) + goto short_packet; + + if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) + goto csum_error; + + if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + return udp_v4_mcast_deliver(skb, uh, saddr, daddr); + + sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); + + if (sk != NULL) { + int ret = udp_queue_rcv_skb(sk, skb); + sock_put(sk); + + /* a return value > 0 means to resubmit the input, but + * it it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; + return 0; + } + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + /* No socket. Drop packet silently, if checksum is wrong */ + if (udp_checksum_complete(skb)) + goto csum_error; + + UDP_INC_STATS_BH(UDP_MIB_NOPORTS); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + /* + * Hmm. We got an UDP packet to a port to which we + * don't wanna listen. Ignore it. + */ + kfree_skb(skb); + return(0); + +short_packet: + NETDEBUG(if (net_ratelimit()) + printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", + NIPQUAD(saddr), + ntohs(uh->source), + ulen, + len, + NIPQUAD(daddr), + ntohs(uh->dest))); +no_header: + UDP_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return(0); + +csum_error: + /* + * RFC1122: OK. Discards the bad packet silently (as far as + * the network is concerned, anyway) as per 4.1.3.4 (MUST). + */ + NETDEBUG(if (net_ratelimit()) + printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", + NIPQUAD(saddr), + ntohs(uh->source), + NIPQUAD(daddr), + ntohs(uh->dest), + ulen)); +drop: + UDP_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return(0); +} + +static int udp_destroy_sock(struct sock *sk) +{ + lock_sock(sk); + udp_flush_pending_frames(sk); + release_sock(sk); + return 0; +} + +/* + * Socket option code for UDP + */ +static int udp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + struct udp_sock *up = udp_sk(sk); + int val; + int err = 0; + + if (level != SOL_UDP) + return ip_setsockopt(sk, level, optname, optval, optlen); + + if(optlen<sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + switch(optname) { + case UDP_CORK: + if (val != 0) { + up->corkflag = 1; + } else { + up->corkflag = 0; + lock_sock(sk); + udp_push_pending_frames(sk, up); + release_sock(sk); + } + break; + + case UDP_ENCAP: + switch (val) { + case 0: + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + up->encap_type = val; + break; + default: + err = -ENOPROTOOPT; + break; + } + break; + + default: + err = -ENOPROTOOPT; + break; + }; + + return err; +} + +static int udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct udp_sock *up = udp_sk(sk); + int val, len; + + if (level != SOL_UDP) + return ip_getsockopt(sk, level, optname, optval, optlen); + + if(get_user(len,optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + if(len < 0) + return -EINVAL; + + switch(optname) { + case UDP_CORK: + val = up->corkflag; + break; + + case UDP_ENCAP: + val = up->encap_type; + break; + + default: + return -ENOPROTOOPT; + }; + + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, &val,len)) + return -EFAULT; + return 0; +} + +/** + * udp_poll - wait for a UDP event. + * @file - file struct + * @sock - socket + * @wait - poll table + * + * This is same as datagram poll, except for the special case of + * blocking sockets. If application is using a blocking fd + * and a packet with checksum error is in the queue; + * then it could get return from select indicating data available + * but then block when reading it. Add special case code + * to work around these arguably broken applications. + */ +unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + unsigned int mask = datagram_poll(file, sock, wait); + struct sock *sk = sock->sk; + + /* Check for false positives due to checksum errors */ + if ( (mask & POLLRDNORM) && + !(file->f_flags & O_NONBLOCK) && + !(sk->sk_shutdown & RCV_SHUTDOWN)){ + struct sk_buff_head *rcvq = &sk->sk_receive_queue; + struct sk_buff *skb; + + spin_lock_irq(&rcvq->lock); + while ((skb = skb_peek(rcvq)) != NULL) { + if (udp_checksum_complete(skb)) { + UDP_INC_STATS_BH(UDP_MIB_INERRORS); + __skb_unlink(skb, rcvq); + kfree_skb(skb); + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + } + spin_unlock_irq(&rcvq->lock); + + /* nothing to see, move along */ + if (skb == NULL) + mask &= ~(POLLIN | POLLRDNORM); + } + + return mask; + +} + +struct proto udp_prot = { + .name = "UDP", + .owner = THIS_MODULE, + .close = udp_close, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .destroy = udp_destroy_sock, + .setsockopt = udp_setsockopt, + .getsockopt = udp_getsockopt, + .sendmsg = udp_sendmsg, + .recvmsg = udp_recvmsg, + .sendpage = udp_sendpage, + .backlog_rcv = udp_queue_rcv_skb, + .hash = udp_v4_hash, + .unhash = udp_v4_unhash, + .get_port = udp_v4_get_port, + .obj_size = sizeof(struct udp_sock), +}; + +/* ------------------------------------------------------------------------ */ +#ifdef CONFIG_PROC_FS + +static struct sock *udp_get_first(struct seq_file *seq) +{ + struct sock *sk; + struct udp_iter_state *state = seq->private; + + for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { + struct hlist_node *node; + sk_for_each(sk, node, &udp_hash[state->bucket]) { + if (sk->sk_family == state->family) + goto found; + } + } + sk = NULL; +found: + return sk; +} + +static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) +{ + struct udp_iter_state *state = seq->private; + + do { + sk = sk_next(sk); +try_again: + ; + } while (sk && sk->sk_family != state->family); + + if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { + sk = sk_head(&udp_hash[state->bucket]); + goto try_again; + } + return sk; +} + +static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) +{ + struct sock *sk = udp_get_first(seq); + + if (sk) + while(pos && (sk = udp_get_next(seq, sk)) != NULL) + --pos; + return pos ? NULL : sk; +} + +static void *udp_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&udp_hash_lock); + return *pos ? udp_get_idx(seq, *pos-1) : (void *)1; +} + +static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *sk; + + if (v == (void *)1) + sk = udp_get_idx(seq, 0); + else + sk = udp_get_next(seq, v); + + ++*pos; + return sk; +} + +static void udp_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&udp_hash_lock); +} + +static int udp_seq_open(struct inode *inode, struct file *file) +{ + struct udp_seq_afinfo *afinfo = PDE(inode)->data; + struct seq_file *seq; + int rc = -ENOMEM; + struct udp_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + memset(s, 0, sizeof(*s)); + s->family = afinfo->family; + s->seq_ops.start = udp_seq_start; + s->seq_ops.next = udp_seq_next; + s->seq_ops.show = afinfo->seq_show; + s->seq_ops.stop = udp_seq_stop; + + rc = seq_open(file, &s->seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +/* ------------------------------------------------------------------------ */ +int udp_proc_register(struct udp_seq_afinfo *afinfo) +{ + struct proc_dir_entry *p; + int rc = 0; + + if (!afinfo) + return -EINVAL; + afinfo->seq_fops->owner = afinfo->owner; + afinfo->seq_fops->open = udp_seq_open; + afinfo->seq_fops->read = seq_read; + afinfo->seq_fops->llseek = seq_lseek; + afinfo->seq_fops->release = seq_release_private; + + p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); + if (p) + p->data = afinfo; + else + rc = -ENOMEM; + return rc; +} + +void udp_proc_unregister(struct udp_seq_afinfo *afinfo) +{ + if (!afinfo) + return; + proc_net_remove(afinfo->name); + memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); +} + +/* ------------------------------------------------------------------------ */ +static void udp4_format_sock(struct sock *sp, char *tmpbuf, int bucket) +{ + struct inet_sock *inet = inet_sk(sp); + unsigned int dest = inet->daddr; + unsigned int src = inet->rcv_saddr; + __u16 destp = ntohs(inet->dport); + __u16 srcp = ntohs(inet->sport); + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p", + bucket, src, srcp, dest, destp, sp->sk_state, + atomic_read(&sp->sk_wmem_alloc), + atomic_read(&sp->sk_rmem_alloc), + 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp); +} + +static int udp4_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-127s\n", + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout " + "inode"); + else { + char tmpbuf[129]; + struct udp_iter_state *state = seq->private; + + udp4_format_sock(v, tmpbuf, state->bucket); + seq_printf(seq, "%-127s\n", tmpbuf); + } + return 0; +} + +/* ------------------------------------------------------------------------ */ +static struct file_operations udp4_seq_fops; +static struct udp_seq_afinfo udp4_seq_afinfo = { + .owner = THIS_MODULE, + .name = "udp", + .family = AF_INET, + .seq_show = udp4_seq_show, + .seq_fops = &udp4_seq_fops, +}; + +int __init udp4_proc_init(void) +{ + return udp_proc_register(&udp4_seq_afinfo); +} + +void udp4_proc_exit(void) +{ + udp_proc_unregister(&udp4_seq_afinfo); +} +#endif /* CONFIG_PROC_FS */ + +EXPORT_SYMBOL(udp_disconnect); +EXPORT_SYMBOL(udp_hash); +EXPORT_SYMBOL(udp_hash_lock); +EXPORT_SYMBOL(udp_ioctl); +EXPORT_SYMBOL(udp_port_rover); +EXPORT_SYMBOL(udp_prot); +EXPORT_SYMBOL(udp_sendmsg); +EXPORT_SYMBOL(udp_poll); + +#ifdef CONFIG_PROC_FS +EXPORT_SYMBOL(udp_proc_register); +EXPORT_SYMBOL(udp_proc_unregister); +#endif diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c new file mode 100644 index 000000000000..6aecd7a43534 --- /dev/null +++ b/net/ipv4/utils.c @@ -0,0 +1,59 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Various kernel-resident INET utility functions; mainly + * for format conversion and debugging output. + * + * Version: $Id: utils.c,v 1.8 2000/10/03 07:29:01 anton Exp $ + * + * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * + * Fixes: + * Alan Cox : verify_area check. + * Alan Cox : removed old debugging. + * Andi Kleen : add net_ratelimit() + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <asm/byteorder.h> + +/* + * Convert an ASCII string to binary IP. + */ + +__u32 in_aton(const char *str) +{ + unsigned long l; + unsigned int val; + int i; + + l = 0; + for (i = 0; i < 4; i++) + { + l <<= 8; + if (*str != '\0') + { + val = 0; + while (*str != '\0' && *str != '.') + { + val *= 10; + val += *str - '0'; + str++; + } + l |= val; + if (*str != '\0') + str++; + } + } + return(htonl(l)); +} + +EXPORT_SYMBOL(in_aton); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c new file mode 100644 index 000000000000..2d3849c38a0f --- /dev/null +++ b/net/ipv4/xfrm4_input.c @@ -0,0 +1,160 @@ +/* + * xfrm4_input.c + * + * Changes: + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * Derek Atkins <derek@ihtfp.com> + * Add Encapsulation support + * + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <net/inet_ecn.h> +#include <net/ip.h> +#include <net/xfrm.h> + +int xfrm4_rcv(struct sk_buff *skb) +{ + return xfrm4_rcv_encap(skb, 0); +} + +EXPORT_SYMBOL(xfrm4_rcv); + +static inline void ipip_ecn_decapsulate(struct sk_buff *skb) +{ + struct iphdr *outer_iph = skb->nh.iph; + struct iphdr *inner_iph = skb->h.ipiph; + + if (INET_ECN_is_ce(outer_iph->tos)) + IP_ECN_set_ce(inner_iph); +} + +static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) +{ + switch (nexthdr) { + case IPPROTO_IPIP: + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return -EINVAL; + *spi = skb->nh.iph->saddr; + *seq = 0; + return 0; + } + + return xfrm_parse_spi(skb, nexthdr, spi, seq); +} + +int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) +{ + int err; + u32 spi, seq; + struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH]; + struct xfrm_state *x; + int xfrm_nr = 0; + int decaps = 0; + + if ((err = xfrm4_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) != 0) + goto drop; + + do { + struct iphdr *iph = skb->nh.iph; + + if (xfrm_nr == XFRM_MAX_DEPTH) + goto drop; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, iph->protocol, AF_INET); + if (x == NULL) + goto drop; + + spin_lock(&x->lock); + if (unlikely(x->km.state != XFRM_STATE_VALID)) + goto drop_unlock; + + if (x->props.replay_window && xfrm_replay_check(x, seq)) + goto drop_unlock; + + if (xfrm_state_check_expire(x)) + goto drop_unlock; + + xfrm_vec[xfrm_nr].decap.decap_type = encap_type; + if (x->type->input(x, &(xfrm_vec[xfrm_nr].decap), skb)) + goto drop_unlock; + + /* only the first xfrm gets the encap type */ + encap_type = 0; + + if (x->props.replay_window) + xfrm_replay_advance(x, seq); + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock(&x->lock); + + xfrm_vec[xfrm_nr++].xvec = x; + + iph = skb->nh.iph; + + if (x->props.mode) { + if (iph->protocol != IPPROTO_IPIP) + goto drop; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto drop; + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(iph, skb->h.ipiph); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + decaps = 1; + break; + } + + if ((err = xfrm_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) < 0) + goto drop; + } while (!err); + + /* Allocate new secpath or COW existing one. */ + + if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { + struct sec_path *sp; + sp = secpath_dup(skb->sp); + if (!sp) + goto drop; + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + } + if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH) + goto drop; + + memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state)); + skb->sp->len += xfrm_nr; + + if (decaps) { + if (!(skb->dev->flags&IFF_LOOPBACK)) { + dst_release(skb->dst); + skb->dst = NULL; + } + netif_rx(skb); + return 0; + } else { + return -skb->nh.iph->protocol; + } + +drop_unlock: + spin_unlock(&x->lock); + xfrm_state_put(x); +drop: + while (--xfrm_nr >= 0) + xfrm_state_put(xfrm_vec[xfrm_nr].xvec); + + kfree_skb(skb); + return 0; +} diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c new file mode 100644 index 000000000000..af2392ae5769 --- /dev/null +++ b/net/ipv4/xfrm4_output.c @@ -0,0 +1,141 @@ +/* + * xfrm4_output.c - Common IPsec encapsulation code for IPv4. + * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <net/inet_ecn.h> +#include <net/ip.h> +#include <net/xfrm.h> +#include <net/icmp.h> + +/* Add encapsulation header. + * + * In transport mode, the IP header will be moved forward to make space + * for the encapsulation header. + * + * In tunnel mode, the top IP header will be constructed per RFC 2401. + * The following fields in it shall be filled in by x->type->output: + * tot_len + * check + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static void xfrm4_encap(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct iphdr *iph, *top_iph; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + skb->nh.raw = skb_push(skb, x->props.header_len); + top_iph = skb->nh.iph; + + if (!x->props.mode) { + skb->h.raw += iph->ihl*4; + memmove(top_iph, iph, iph->ihl*4); + return; + } + + top_iph->ihl = 5; + top_iph->version = 4; + + /* DS disclosed */ + top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); + if (x->props.flags & XFRM_STATE_NOECN) + IP_ECN_clear(top_iph); + + top_iph->frag_off = iph->frag_off & htons(IP_DF); + if (!top_iph->frag_off) + __ip_select_ident(top_iph, dst, 0); + + top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); + + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + top_iph->protocol = IPPROTO_IPIP; + + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); +} + +static int xfrm4_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + struct dst_entry *dst; + struct iphdr *iph = skb->nh.iph; + + if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) + goto out; + + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; + + if (!(iph->frag_off & htons(IP_DF)) || skb->local_df) + goto out; + + dst = skb->dst; + mtu = dst_mtu(dst); + if (skb->len > mtu) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ret = -EMSGSIZE; + } +out: + return ret; +} + +int xfrm4_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + int err; + + if (skb->ip_summed == CHECKSUM_HW) { + err = skb_checksum_help(skb, 0); + if (err) + goto error_nolock; + } + + if (x->props.mode) { + err = xfrm4_tunnel_check_size(skb); + if (err) + goto error_nolock; + } + + spin_lock_bh(&x->lock); + err = xfrm_state_check(x, skb); + if (err) + goto error; + + xfrm4_encap(skb); + + err = x->type->output(x, skb); + if (err) + goto error; + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock_bh(&x->lock); + + if (!(skb->dst = dst_pop(dst))) { + err = -EHOSTUNREACH; + goto error_nolock; + } + err = NET_XMIT_BYPASS; + +out_exit: + return err; +error: + spin_unlock_bh(&x->lock); +error_nolock: + kfree_skb(skb); + goto out_exit; +} diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c new file mode 100644 index 000000000000..7fe2afd2e669 --- /dev/null +++ b/net/ipv4/xfrm4_policy.c @@ -0,0 +1,281 @@ +/* + * xfrm4_policy.c + * + * Changes: + * Kazunori MIYAZAWA @USAGI + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include <linux/config.h> +#include <net/xfrm.h> +#include <net/ip.h> + +static struct dst_ops xfrm4_dst_ops; +static struct xfrm_policy_afinfo xfrm4_policy_afinfo; + +static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED }; + +static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) +{ + return __ip_route_output_key((struct rtable**)dst, fl); +} + +static struct dst_entry * +__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) +{ + struct dst_entry *dst; + + read_lock_bh(&policy->lock); + for (dst = policy->bundles; dst; dst = dst->next) { + struct xfrm_dst *xdst = (struct xfrm_dst*)dst; + if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/ + xdst->u.rt.fl.fl4_dst == fl->fl4_dst && + xdst->u.rt.fl.fl4_src == fl->fl4_src && + xfrm_bundle_ok(xdst, fl, AF_INET)) { + dst_clone(dst); + break; + } + } + read_unlock_bh(&policy->lock); + return dst; +} + +/* Allocate chain of dst_entry's, attach known xfrm's, calculate + * all the metrics... Shortly, bundle a bundle. + */ + +static int +__xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, + struct flowi *fl, struct dst_entry **dst_p) +{ + struct dst_entry *dst, *dst_prev; + struct rtable *rt0 = (struct rtable*)(*dst_p); + struct rtable *rt = rt0; + u32 remote = fl->fl4_dst; + u32 local = fl->fl4_src; + struct flowi fl_tunnel = { + .nl_u = { + .ip4_u = { + .saddr = local, + .daddr = remote + } + } + }; + int i; + int err; + int header_len = 0; + int trailer_len = 0; + + dst = dst_prev = NULL; + dst_hold(&rt->u.dst); + + for (i = 0; i < nx; i++) { + struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops); + struct xfrm_dst *xdst; + int tunnel = 0; + + if (unlikely(dst1 == NULL)) { + err = -ENOBUFS; + dst_release(&rt->u.dst); + goto error; + } + + if (!dst) + dst = dst1; + else { + dst_prev->child = dst1; + dst1->flags |= DST_NOHASH; + dst_clone(dst1); + } + + xdst = (struct xfrm_dst *)dst1; + xdst->route = &rt->u.dst; + + dst1->next = dst_prev; + dst_prev = dst1; + if (xfrm[i]->props.mode) { + remote = xfrm[i]->id.daddr.a4; + local = xfrm[i]->props.saddr.a4; + tunnel = 1; + } + header_len += xfrm[i]->props.header_len; + trailer_len += xfrm[i]->props.trailer_len; + + if (tunnel) { + fl_tunnel.fl4_src = local; + fl_tunnel.fl4_dst = remote; + err = xfrm_dst_lookup((struct xfrm_dst **)&rt, + &fl_tunnel, AF_INET); + if (err) + goto error; + } else + dst_hold(&rt->u.dst); + } + + dst_prev->child = &rt->u.dst; + dst->path = &rt->u.dst; + + *dst_p = dst; + dst = dst_prev; + + dst_prev = *dst_p; + i = 0; + for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { + struct xfrm_dst *x = (struct xfrm_dst*)dst_prev; + x->u.rt.fl = *fl; + + dst_prev->xfrm = xfrm[i++]; + dst_prev->dev = rt->u.dst.dev; + if (rt->u.dst.dev) + dev_hold(rt->u.dst.dev); + dst_prev->obsolete = -1; + dst_prev->flags |= DST_HOST; + dst_prev->lastuse = jiffies; + dst_prev->header_len = header_len; + dst_prev->trailer_len = trailer_len; + memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics)); + + /* Copy neighbout for reachability confirmation */ + dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); + dst_prev->input = rt->u.dst.input; + dst_prev->output = xfrm4_output; + if (rt->peer) + atomic_inc(&rt->peer->refcnt); + x->u.rt.peer = rt->peer; + /* Sheit... I remember I did this right. Apparently, + * it was magically lost, so this code needs audit */ + x->u.rt.rt_flags = rt0->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL); + x->u.rt.rt_type = rt->rt_type; + x->u.rt.rt_src = rt0->rt_src; + x->u.rt.rt_dst = rt0->rt_dst; + x->u.rt.rt_gateway = rt->rt_gateway; + x->u.rt.rt_spec_dst = rt0->rt_spec_dst; + header_len -= x->u.dst.xfrm->props.header_len; + trailer_len -= x->u.dst.xfrm->props.trailer_len; + } + + xfrm_init_pmtu(dst); + return 0; + +error: + if (dst) + dst_free(dst); + return err; +} + +static void +_decode_session4(struct sk_buff *skb, struct flowi *fl) +{ + struct iphdr *iph = skb->nh.iph; + u8 *xprth = skb->nh.raw + iph->ihl*4; + + memset(fl, 0, sizeof(struct flowi)); + if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { + switch (iph->protocol) { + case IPPROTO_UDP: + case IPPROTO_TCP: + case IPPROTO_SCTP: + if (pskb_may_pull(skb, xprth + 4 - skb->data)) { + u16 *ports = (u16 *)xprth; + + fl->fl_ip_sport = ports[0]; + fl->fl_ip_dport = ports[1]; + } + break; + + case IPPROTO_ICMP: + if (pskb_may_pull(skb, xprth + 2 - skb->data)) { + u8 *icmp = xprth; + + fl->fl_icmp_type = icmp[0]; + fl->fl_icmp_code = icmp[1]; + } + break; + + case IPPROTO_ESP: + if (pskb_may_pull(skb, xprth + 4 - skb->data)) { + u32 *ehdr = (u32 *)xprth; + + fl->fl_ipsec_spi = ehdr[0]; + } + break; + + case IPPROTO_AH: + if (pskb_may_pull(skb, xprth + 8 - skb->data)) { + u32 *ah_hdr = (u32*)xprth; + + fl->fl_ipsec_spi = ah_hdr[1]; + } + break; + + case IPPROTO_COMP: + if (pskb_may_pull(skb, xprth + 4 - skb->data)) { + u16 *ipcomp_hdr = (u16 *)xprth; + + fl->fl_ipsec_spi = ntohl(ntohs(ipcomp_hdr[1])); + } + break; + default: + fl->fl_ipsec_spi = 0; + break; + }; + } + fl->proto = iph->protocol; + fl->fl4_dst = iph->daddr; + fl->fl4_src = iph->saddr; +} + +static inline int xfrm4_garbage_collect(void) +{ + read_lock(&xfrm4_policy_afinfo.lock); + xfrm4_policy_afinfo.garbage_collect(); + read_unlock(&xfrm4_policy_afinfo.lock); + return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); +} + +static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->update_pmtu(path, mtu); +} + +static struct dst_ops xfrm4_dst_ops = { + .family = AF_INET, + .protocol = __constant_htons(ETH_P_IP), + .gc = xfrm4_garbage_collect, + .update_pmtu = xfrm4_update_pmtu, + .gc_thresh = 1024, + .entry_size = sizeof(struct xfrm_dst), +}; + +static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { + .family = AF_INET, + .lock = RW_LOCK_UNLOCKED, + .type_map = &xfrm4_type_map, + .dst_ops = &xfrm4_dst_ops, + .dst_lookup = xfrm4_dst_lookup, + .find_bundle = __xfrm4_find_bundle, + .bundle_create = __xfrm4_bundle_create, + .decode_session = _decode_session4, +}; + +static void __init xfrm4_policy_init(void) +{ + xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); +} + +static void __exit xfrm4_policy_fini(void) +{ + xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); +} + +void __init xfrm4_init(void) +{ + xfrm4_state_init(); + xfrm4_policy_init(); +} + diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c new file mode 100644 index 000000000000..223a2e83853f --- /dev/null +++ b/net/ipv4/xfrm4_state.c @@ -0,0 +1,126 @@ +/* + * xfrm4_state.c + * + * Changes: + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include <net/xfrm.h> +#include <linux/pfkeyv2.h> +#include <linux/ipsec.h> + +static struct xfrm_state_afinfo xfrm4_state_afinfo; + +static void +__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, + struct xfrm_tmpl *tmpl, + xfrm_address_t *daddr, xfrm_address_t *saddr) +{ + x->sel.daddr.a4 = fl->fl4_dst; + x->sel.saddr.a4 = fl->fl4_src; + x->sel.dport = xfrm_flowi_dport(fl); + x->sel.dport_mask = ~0; + x->sel.sport = xfrm_flowi_sport(fl); + x->sel.sport_mask = ~0; + x->sel.prefixlen_d = 32; + x->sel.prefixlen_s = 32; + x->sel.proto = fl->proto; + x->sel.ifindex = fl->oif; + x->id = tmpl->id; + if (x->id.daddr.a4 == 0) + x->id.daddr.a4 = daddr->a4; + x->props.saddr = tmpl->saddr; + if (x->props.saddr.a4 == 0) + x->props.saddr.a4 = saddr->a4; + x->props.mode = tmpl->mode; + x->props.reqid = tmpl->reqid; + x->props.family = AF_INET; +} + +static struct xfrm_state * +__xfrm4_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto) +{ + unsigned h = __xfrm4_spi_hash(daddr, spi, proto); + struct xfrm_state *x; + + list_for_each_entry(x, xfrm4_state_afinfo.state_byspi+h, byspi) { + if (x->props.family == AF_INET && + spi == x->id.spi && + daddr->a4 == x->id.daddr.a4 && + proto == x->id.proto) { + xfrm_state_hold(x); + return x; + } + } + return NULL; +} + +static struct xfrm_state * +__xfrm4_find_acq(u8 mode, u32 reqid, u8 proto, + xfrm_address_t *daddr, xfrm_address_t *saddr, + int create) +{ + struct xfrm_state *x, *x0; + unsigned h = __xfrm4_dst_hash(daddr); + + x0 = NULL; + + list_for_each_entry(x, xfrm4_state_afinfo.state_bydst+h, bydst) { + if (x->props.family == AF_INET && + daddr->a4 == x->id.daddr.a4 && + mode == x->props.mode && + proto == x->id.proto && + saddr->a4 == x->props.saddr.a4 && + reqid == x->props.reqid && + x->km.state == XFRM_STATE_ACQ && + !x->id.spi) { + x0 = x; + break; + } + } + if (!x0 && create && (x0 = xfrm_state_alloc()) != NULL) { + x0->sel.daddr.a4 = daddr->a4; + x0->sel.saddr.a4 = saddr->a4; + x0->sel.prefixlen_d = 32; + x0->sel.prefixlen_s = 32; + x0->props.saddr.a4 = saddr->a4; + x0->km.state = XFRM_STATE_ACQ; + x0->id.daddr.a4 = daddr->a4; + x0->id.proto = proto; + x0->props.family = AF_INET; + x0->props.mode = mode; + x0->props.reqid = reqid; + x0->props.family = AF_INET; + x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES; + xfrm_state_hold(x0); + x0->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ; + add_timer(&x0->timer); + xfrm_state_hold(x0); + list_add_tail(&x0->bydst, xfrm4_state_afinfo.state_bydst+h); + wake_up(&km_waitq); + } + if (x0) + xfrm_state_hold(x0); + return x0; +} + +static struct xfrm_state_afinfo xfrm4_state_afinfo = { + .family = AF_INET, + .lock = RW_LOCK_UNLOCKED, + .init_tempsel = __xfrm4_init_tempsel, + .state_lookup = __xfrm4_state_lookup, + .find_acq = __xfrm4_find_acq, +}; + +void __init xfrm4_state_init(void) +{ + xfrm_state_register_afinfo(&xfrm4_state_afinfo); +} + +void __exit xfrm4_state_fini(void) +{ + xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); +} + diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c new file mode 100644 index 000000000000..413191f585f6 --- /dev/null +++ b/net/ipv4/xfrm4_tunnel.c @@ -0,0 +1,144 @@ +/* xfrm4_tunnel.c: Generic IP tunnel transformer. + * + * Copyright (C) 2003 David S. Miller (davem@redhat.com) + */ + +#include <linux/skbuff.h> +#include <linux/module.h> +#include <net/xfrm.h> +#include <net/ip.h> +#include <net/protocol.h> + +static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph; + + iph = skb->nh.iph; + iph->tot_len = htons(skb->len); + ip_send_check(iph); + + return 0; +} + +static int ipip_xfrm_rcv(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + return 0; +} + +static struct xfrm_tunnel *ipip_handler; +static DECLARE_MUTEX(xfrm4_tunnel_sem); + +int xfrm4_tunnel_register(struct xfrm_tunnel *handler) +{ + int ret; + + down(&xfrm4_tunnel_sem); + ret = 0; + if (ipip_handler != NULL) + ret = -EINVAL; + if (!ret) + ipip_handler = handler; + up(&xfrm4_tunnel_sem); + + return ret; +} + +EXPORT_SYMBOL(xfrm4_tunnel_register); + +int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler) +{ + int ret; + + down(&xfrm4_tunnel_sem); + ret = 0; + if (ipip_handler != handler) + ret = -EINVAL; + if (!ret) + ipip_handler = NULL; + up(&xfrm4_tunnel_sem); + + synchronize_net(); + + return ret; +} + +EXPORT_SYMBOL(xfrm4_tunnel_deregister); + +static int ipip_rcv(struct sk_buff *skb) +{ + struct xfrm_tunnel *handler = ipip_handler; + + /* Tunnel devices take precedence. */ + if (handler && handler->handler(skb) == 0) + return 0; + + return xfrm4_rcv(skb); +} + +static void ipip_err(struct sk_buff *skb, u32 info) +{ + struct xfrm_tunnel *handler = ipip_handler; + u32 arg = info; + + if (handler) + handler->err_handler(skb, &arg); +} + +static int ipip_init_state(struct xfrm_state *x, void *args) +{ + if (!x->props.mode) + return -EINVAL; + + if (x->encap) + return -EINVAL; + + x->props.header_len = sizeof(struct iphdr); + + return 0; +} + +static void ipip_destroy(struct xfrm_state *x) +{ +} + +static struct xfrm_type ipip_type = { + .description = "IPIP", + .owner = THIS_MODULE, + .proto = IPPROTO_IPIP, + .init_state = ipip_init_state, + .destructor = ipip_destroy, + .input = ipip_xfrm_rcv, + .output = ipip_output +}; + +static struct net_protocol ipip_protocol = { + .handler = ipip_rcv, + .err_handler = ipip_err, + .no_policy = 1, +}; + +static int __init ipip_init(void) +{ + if (xfrm_register_type(&ipip_type, AF_INET) < 0) { + printk(KERN_INFO "ipip init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&ipip_protocol, IPPROTO_IPIP) < 0) { + printk(KERN_INFO "ipip init: can't add protocol\n"); + xfrm_unregister_type(&ipip_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit ipip_fini(void) +{ + if (inet_del_protocol(&ipip_protocol, IPPROTO_IPIP) < 0) + printk(KERN_INFO "ipip close: can't remove protocol\n"); + if (xfrm_unregister_type(&ipip_type, AF_INET) < 0) + printk(KERN_INFO "ipip close: can't remove xfrm type\n"); +} + +module_init(ipip_init); +module_exit(ipip_fini); +MODULE_LICENSE("GPL"); |