diff options
Diffstat (limited to 'net/ipv4')
115 files changed, 3959 insertions, 15645 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 4670683b4688..691268f3a359 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -356,10 +356,8 @@ config INET_ESP config INET_IPCOMP tristate "IP: IPComp transformation" - select XFRM select INET_XFRM_TUNNEL - select CRYPTO - select CRYPTO_DEFLATE + select XFRM_IPCOMP ---help--- Support for IP Payload Compression Protocol (IPComp) (RFC3173), typically needed for IPsec. @@ -632,5 +630,3 @@ config TCP_MD5SIG If unsure, say N. -source "net/ipv4/ipvs/Kconfig" - diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ad40ef3f9ebc..80ff87ce43aa 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ -obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 24eca23c2db3..1fbff5fa4241 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,8 +5,6 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> @@ -112,12 +110,11 @@ #include <net/ipip.h> #include <net/inet_common.h> #include <net/xfrm.h> +#include <net/net_namespace.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif -DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; - extern void ip_mc_drop_socket(struct sock *sk); /* The inetsw table contains everything that inet_create needs to @@ -151,10 +148,10 @@ void inet_sock_destruct(struct sock *sk) return; } - BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); - BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); - BUG_TRAP(!sk->sk_wmem_queued); - BUG_TRAP(!sk->sk_forward_alloc); + WARN_ON(atomic_read(&sk->sk_rmem_alloc)); + WARN_ON(atomic_read(&sk->sk_wmem_alloc)); + WARN_ON(sk->sk_wmem_queued); + WARN_ON(sk->sk_forward_alloc); kfree(inet->opt); dst_release(sk->sk_dst_cache); @@ -267,7 +264,6 @@ static inline int inet_netns_ok(struct net *net, int protocol) static int inet_create(struct net *net, struct socket *sock, int protocol) { struct sock *sk; - struct list_head *p; struct inet_protosw *answer; struct inet_sock *inet; struct proto *answer_prot; @@ -284,13 +280,12 @@ static int inet_create(struct net *net, struct socket *sock, int protocol) sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ - answer = NULL; lookup_protocol: err = -ESOCKTNOSUPPORT; rcu_read_lock(); - list_for_each_rcu(p, &inetsw[sock->type]) { - answer = list_entry(p, struct inet_protosw, list); + list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { + err = 0; /* Check the non-wild match. */ if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) @@ -305,10 +300,9 @@ lookup_protocol: break; } err = -EPROTONOSUPPORT; - answer = NULL; } - if (unlikely(answer == NULL)) { + if (unlikely(err)) { if (try_loading_module < 2) { rcu_read_unlock(); /* @@ -344,7 +338,7 @@ lookup_protocol: answer_flags = answer->flags; rcu_read_unlock(); - BUG_TRAP(answer_prot->slab != NULL); + WARN_ON(answer_prot->slab == NULL); err = -ENOBUFS; sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); @@ -475,7 +469,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ err = -EADDRNOTAVAIL; if (!sysctl_ip_nonlocal_bind && - !inet->freebind && + !(inet->freebind || inet->transparent) && addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && @@ -664,8 +658,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) lock_sock(sk2); - BUG_TRAP((1 << sk2->sk_state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)); + WARN_ON(!((1 << sk2->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); sock_graft(sk2, newsock); @@ -1341,50 +1335,70 @@ static struct net_protocol icmp_protocol = { .netns_ok = 1, }; -static int __init init_ipv4_mibs(void) +static __net_init int ipv4_mib_init_net(struct net *net) { - if (snmp_mib_init((void **)net_statistics, - sizeof(struct linux_mib)) < 0) - goto err_net_mib; - if (snmp_mib_init((void **)ip_statistics, - sizeof(struct ipstats_mib)) < 0) - goto err_ip_mib; - if (snmp_mib_init((void **)icmp_statistics, - sizeof(struct icmp_mib)) < 0) - goto err_icmp_mib; - if (snmp_mib_init((void **)icmpmsg_statistics, - sizeof(struct icmpmsg_mib)) < 0) - goto err_icmpmsg_mib; - if (snmp_mib_init((void **)tcp_statistics, + if (snmp_mib_init((void **)net->mib.tcp_statistics, sizeof(struct tcp_mib)) < 0) goto err_tcp_mib; - if (snmp_mib_init((void **)udp_statistics, + if (snmp_mib_init((void **)net->mib.ip_statistics, + sizeof(struct ipstats_mib)) < 0) + goto err_ip_mib; + if (snmp_mib_init((void **)net->mib.net_statistics, + sizeof(struct linux_mib)) < 0) + goto err_net_mib; + if (snmp_mib_init((void **)net->mib.udp_statistics, sizeof(struct udp_mib)) < 0) goto err_udp_mib; - if (snmp_mib_init((void **)udplite_statistics, + if (snmp_mib_init((void **)net->mib.udplite_statistics, sizeof(struct udp_mib)) < 0) goto err_udplite_mib; + if (snmp_mib_init((void **)net->mib.icmp_statistics, + sizeof(struct icmp_mib)) < 0) + goto err_icmp_mib; + if (snmp_mib_init((void **)net->mib.icmpmsg_statistics, + sizeof(struct icmpmsg_mib)) < 0) + goto err_icmpmsg_mib; - tcp_mib_init(); - + tcp_mib_init(net); return 0; -err_udplite_mib: - snmp_mib_free((void **)udp_statistics); -err_udp_mib: - snmp_mib_free((void **)tcp_statistics); -err_tcp_mib: - snmp_mib_free((void **)icmpmsg_statistics); err_icmpmsg_mib: - snmp_mib_free((void **)icmp_statistics); + snmp_mib_free((void **)net->mib.icmp_statistics); err_icmp_mib: - snmp_mib_free((void **)ip_statistics); -err_ip_mib: - snmp_mib_free((void **)net_statistics); + snmp_mib_free((void **)net->mib.udplite_statistics); +err_udplite_mib: + snmp_mib_free((void **)net->mib.udp_statistics); +err_udp_mib: + snmp_mib_free((void **)net->mib.net_statistics); err_net_mib: + snmp_mib_free((void **)net->mib.ip_statistics); +err_ip_mib: + snmp_mib_free((void **)net->mib.tcp_statistics); +err_tcp_mib: return -ENOMEM; } +static __net_exit void ipv4_mib_exit_net(struct net *net) +{ + snmp_mib_free((void **)net->mib.icmpmsg_statistics); + snmp_mib_free((void **)net->mib.icmp_statistics); + snmp_mib_free((void **)net->mib.udplite_statistics); + snmp_mib_free((void **)net->mib.udp_statistics); + snmp_mib_free((void **)net->mib.net_statistics); + snmp_mib_free((void **)net->mib.ip_statistics); + snmp_mib_free((void **)net->mib.tcp_statistics); +} + +static __net_initdata struct pernet_operations ipv4_mib_ops = { + .init = ipv4_mib_init_net, + .exit = ipv4_mib_exit_net, +}; + +static int __init init_ipv4_mibs(void) +{ + return register_pernet_subsys(&ipv4_mib_ops); +} + static int ipv4_proc_init(void); /* @@ -1425,6 +1439,10 @@ static int __init inet_init(void) (void)sock_register(&inet_family_ops); +#ifdef CONFIG_SYSCTL + ip_static_sysctl_init(); +#endif + /* * Add all the base protocols. */ @@ -1481,14 +1499,15 @@ static int __init inet_init(void) * Initialise the multicast router */ #if defined(CONFIG_IP_MROUTE) - ip_mr_init(); + if (ip_mr_init()) + printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n"); #endif /* * Initialise per-cpu ipv4 mibs */ if (init_ipv4_mibs()) - printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ; + printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ipv4_proc_init(); @@ -1560,5 +1579,4 @@ EXPORT_SYMBOL(inet_sock_destruct); EXPORT_SYMBOL(inet_stream_connect); EXPORT_SYMBOL(inet_stream_ops); EXPORT_SYMBOL(inet_unregister_protosw); -EXPORT_SYMBOL(net_statistics); EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 9b539fa9fe18..1a9dd66511fc 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,7 +1,5 @@ /* linux/net/ipv4/arp.c * - * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $ - * * Copyright (C) 1994 by Florian La Roche * * This module implements the Address Resolution Protocol ARP (RFC 826), @@ -423,11 +421,12 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) struct rtable *rt; int flag = 0; /*unsigned long now; */ + struct net *net = dev_net(dev); - if (ip_route_output_key(dev_net(dev), &rt, &fl) < 0) + if (ip_route_output_key(net, &rt, &fl) < 0) return 1; if (rt->u.dst.dev != dev) { - NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); + NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); @@ -664,7 +663,7 @@ out: void arp_xmit(struct sk_buff *skb) { /* Send it off, maybe filter it using firewalling first. */ - NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); + NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); } /* @@ -929,7 +928,7 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); + return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); freeskb: kfree_skb(skb); @@ -1199,7 +1198,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, vo switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); - rt_cache_flush(0); + rt_cache_flush(dev_net(dev), 0); break; default: break; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 2c0e4572cc90..490e035c6d90 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -13,7 +13,7 @@ */ /* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -47,17 +47,7 @@ #include <asm/bug.h> #include <asm/unaligned.h> -struct cipso_v4_domhsh_entry { - char *domain; - u32 valid; - struct list_head list; - struct rcu_head rcu; -}; - /* List of available DOI definitions */ -/* XXX - Updates should be minimal so having a single lock for the - * cipso_v4_doi_list and the cipso_v4_doi_list->dom_list should be - * okay. */ /* XXX - This currently assumes a minimal number of different DOIs in use, * if in practice there are a lot of different DOIs this list should * probably be turned into a hash table or something similar so we @@ -119,6 +109,19 @@ int cipso_v4_rbm_strictvalid = 1; * be omitted. */ #define CIPSO_V4_TAG_RNG_CAT_MAX 8 +/* Base length of the local tag (non-standard tag). + * Tag definition (may change between kernel versions) + * + * 0 8 16 24 32 + * +----------+----------+----------+----------+ + * | 10000000 | 00000110 | 32-bit secid value | + * +----------+----------+----------+----------+ + * | in (host byte order)| + * +----------+----------+ + * + */ +#define CIPSO_V4_TAG_LOC_BLEN 6 + /* * Helper Functions */ @@ -194,25 +197,6 @@ static void cipso_v4_bitmap_setbit(unsigned char *bitmap, } /** - * cipso_v4_doi_domhsh_free - Frees a domain list entry - * @entry: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that the memory allocated to a domain list entry can be released - * safely. - * - */ -static void cipso_v4_doi_domhsh_free(struct rcu_head *entry) -{ - struct cipso_v4_domhsh_entry *ptr; - - ptr = container_of(entry, struct cipso_v4_domhsh_entry, rcu); - kfree(ptr->domain); - kfree(ptr); -} - -/** * cipso_v4_cache_entry_free - Frees a cache entry * @entry: the entry to free * @@ -457,7 +441,7 @@ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi) struct cipso_v4_doi *iter; list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list) - if (iter->doi == doi && iter->valid) + if (iter->doi == doi && atomic_read(&iter->refcount)) return iter; return NULL; } @@ -496,14 +480,17 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def) if (doi_def->type != CIPSO_V4_MAP_PASS) return -EINVAL; break; + case CIPSO_V4_TAG_LOCAL: + if (doi_def->type != CIPSO_V4_MAP_LOCAL) + return -EINVAL; + break; default: return -EINVAL; } } - doi_def->valid = 1; + atomic_set(&doi_def->refcount, 1); INIT_RCU_HEAD(&doi_def->rcu); - INIT_LIST_HEAD(&doi_def->dom_list); spin_lock(&cipso_v4_doi_list_lock); if (cipso_v4_doi_search(doi_def->doi) != NULL) @@ -519,59 +506,129 @@ doi_add_failure: } /** + * cipso_v4_doi_free - Frees a DOI definition + * @entry: the entry's RCU field + * + * Description: + * This function frees all of the memory associated with a DOI definition. + * + */ +void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) +{ + if (doi_def == NULL) + return; + + switch (doi_def->type) { + case CIPSO_V4_MAP_TRANS: + kfree(doi_def->map.std->lvl.cipso); + kfree(doi_def->map.std->lvl.local); + kfree(doi_def->map.std->cat.cipso); + kfree(doi_def->map.std->cat.local); + break; + } + kfree(doi_def); +} + +/** + * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer + * @entry: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that the memory allocated to the DOI definition can be released + * safely. + * + */ +static void cipso_v4_doi_free_rcu(struct rcu_head *entry) +{ + struct cipso_v4_doi *doi_def; + + doi_def = container_of(entry, struct cipso_v4_doi, rcu); + cipso_v4_doi_free(doi_def); +} + +/** * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine * @doi: the DOI value * @audit_secid: the LSM secid to use in the audit message - * @callback: the DOI cleanup/free callback * * Description: - * Removes a DOI definition from the CIPSO engine, @callback is called to - * free any memory. The NetLabel routines will be called to release their own - * LSM domain mappings as well as our own domain list. Returns zero on - * success and negative values on failure. + * Removes a DOI definition from the CIPSO engine. The NetLabel routines will + * be called to release their own LSM domain mappings as well as our own + * domain list. Returns zero on success and negative values on failure. * */ -int cipso_v4_doi_remove(u32 doi, - struct netlbl_audit *audit_info, - void (*callback) (struct rcu_head * head)) +int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info) { struct cipso_v4_doi *doi_def; - struct cipso_v4_domhsh_entry *dom_iter; spin_lock(&cipso_v4_doi_list_lock); doi_def = cipso_v4_doi_search(doi); - if (doi_def != NULL) { - doi_def->valid = 0; - list_del_rcu(&doi_def->list); + if (doi_def == NULL) { spin_unlock(&cipso_v4_doi_list_lock); - rcu_read_lock(); - list_for_each_entry_rcu(dom_iter, &doi_def->dom_list, list) - if (dom_iter->valid) - netlbl_cfg_map_del(dom_iter->domain, - audit_info); - rcu_read_unlock(); - cipso_v4_cache_invalidate(); - call_rcu(&doi_def->rcu, callback); - return 0; + return -ENOENT; + } + if (!atomic_dec_and_test(&doi_def->refcount)) { + spin_unlock(&cipso_v4_doi_list_lock); + return -EBUSY; } + list_del_rcu(&doi_def->list); spin_unlock(&cipso_v4_doi_list_lock); - return -ENOENT; + cipso_v4_cache_invalidate(); + call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); + + return 0; } /** - * cipso_v4_doi_getdef - Returns a pointer to a valid DOI definition + * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that - * rcu_read_lock() is held while accessing the returned definition. + * rcu_read_lock() is held while accessing the returned definition and the DOI + * definition reference count is decremented when the caller is done. * */ struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi) { - return cipso_v4_doi_search(doi); + struct cipso_v4_doi *doi_def; + + rcu_read_lock(); + doi_def = cipso_v4_doi_search(doi); + if (doi_def == NULL) + goto doi_getdef_return; + if (!atomic_inc_not_zero(&doi_def->refcount)) + doi_def = NULL; + +doi_getdef_return: + rcu_read_unlock(); + return doi_def; +} + +/** + * cipso_v4_doi_putdef - Releases a reference for the given DOI definition + * @doi_def: the DOI definition + * + * Description: + * Releases a DOI definition reference obtained from cipso_v4_doi_getdef(). + * + */ +void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def) +{ + if (doi_def == NULL) + return; + + if (!atomic_dec_and_test(&doi_def->refcount)) + return; + spin_lock(&cipso_v4_doi_list_lock); + list_del_rcu(&doi_def->list); + spin_unlock(&cipso_v4_doi_list_lock); + + cipso_v4_cache_invalidate(); + call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); } /** @@ -597,7 +654,7 @@ int cipso_v4_doi_walk(u32 *skip_cnt, rcu_read_lock(); list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list) - if (iter_doi->valid) { + if (atomic_read(&iter_doi->refcount) > 0) { if (doi_cnt++ < *skip_cnt) continue; ret_val = callback(iter_doi, cb_arg); @@ -613,85 +670,6 @@ doi_walk_return: return ret_val; } -/** - * cipso_v4_doi_domhsh_add - Adds a domain entry to a DOI definition - * @doi_def: the DOI definition - * @domain: the domain to add - * - * Description: - * Adds the @domain to the DOI specified by @doi_def, this function - * should only be called by external functions (i.e. NetLabel). This function - * does allocate memory. Returns zero on success, negative values on failure. - * - */ -int cipso_v4_doi_domhsh_add(struct cipso_v4_doi *doi_def, const char *domain) -{ - struct cipso_v4_domhsh_entry *iter; - struct cipso_v4_domhsh_entry *new_dom; - - new_dom = kzalloc(sizeof(*new_dom), GFP_KERNEL); - if (new_dom == NULL) - return -ENOMEM; - if (domain) { - new_dom->domain = kstrdup(domain, GFP_KERNEL); - if (new_dom->domain == NULL) { - kfree(new_dom); - return -ENOMEM; - } - } - new_dom->valid = 1; - INIT_RCU_HEAD(&new_dom->rcu); - - spin_lock(&cipso_v4_doi_list_lock); - list_for_each_entry(iter, &doi_def->dom_list, list) - if (iter->valid && - ((domain != NULL && iter->domain != NULL && - strcmp(iter->domain, domain) == 0) || - (domain == NULL && iter->domain == NULL))) { - spin_unlock(&cipso_v4_doi_list_lock); - kfree(new_dom->domain); - kfree(new_dom); - return -EEXIST; - } - list_add_tail_rcu(&new_dom->list, &doi_def->dom_list); - spin_unlock(&cipso_v4_doi_list_lock); - - return 0; -} - -/** - * cipso_v4_doi_domhsh_remove - Removes a domain entry from a DOI definition - * @doi_def: the DOI definition - * @domain: the domain to remove - * - * Description: - * Removes the @domain from the DOI specified by @doi_def, this function - * should only be called by external functions (i.e. NetLabel). Returns zero - * on success and negative values on error. - * - */ -int cipso_v4_doi_domhsh_remove(struct cipso_v4_doi *doi_def, - const char *domain) -{ - struct cipso_v4_domhsh_entry *iter; - - spin_lock(&cipso_v4_doi_list_lock); - list_for_each_entry(iter, &doi_def->dom_list, list) - if (iter->valid && - ((domain != NULL && iter->domain != NULL && - strcmp(iter->domain, domain) == 0) || - (domain == NULL && iter->domain == NULL))) { - iter->valid = 0; - list_del_rcu(&iter->list); - spin_unlock(&cipso_v4_doi_list_lock); - call_rcu(&iter->rcu, cipso_v4_doi_domhsh_free); - return 0; - } - spin_unlock(&cipso_v4_doi_list_lock); - - return -ENOENT; -} - /* * Label Mapping Functions */ @@ -712,7 +690,7 @@ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level) switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL) return 0; break; @@ -741,7 +719,7 @@ static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def, case CIPSO_V4_MAP_PASS: *net_lvl = host_lvl; return 0; - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: if (host_lvl < doi_def->map.std->lvl.local_size && doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) { *net_lvl = doi_def->map.std->lvl.local[host_lvl]; @@ -775,7 +753,7 @@ static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def, case CIPSO_V4_MAP_PASS: *host_lvl = net_lvl; return 0; - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: map_tbl = doi_def->map.std; if (net_lvl < map_tbl->lvl.cipso_size && map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) { @@ -812,7 +790,7 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def, switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: cipso_cat_size = doi_def->map.std->cat.cipso_size; cipso_array = doi_def->map.std->cat.cipso; for (;;) { @@ -860,7 +838,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, u32 host_cat_size = 0; u32 *host_cat_array = NULL; - if (doi_def->type == CIPSO_V4_MAP_STD) { + if (doi_def->type == CIPSO_V4_MAP_TRANS) { host_cat_size = doi_def->map.std->cat.local_size; host_cat_array = doi_def->map.std->cat.local; } @@ -875,7 +853,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, case CIPSO_V4_MAP_PASS: net_spot = host_spot; break; - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: if (host_spot >= host_cat_size) return -EPERM; net_spot = host_cat_array[host_spot]; @@ -921,7 +899,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, u32 net_cat_size = 0; u32 *net_cat_array = NULL; - if (doi_def->type == CIPSO_V4_MAP_STD) { + if (doi_def->type == CIPSO_V4_MAP_TRANS) { net_cat_size = doi_def->map.std->cat.cipso_size; net_cat_array = doi_def->map.std->cat.cipso; } @@ -941,7 +919,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, case CIPSO_V4_MAP_PASS: host_spot = net_spot; break; - case CIPSO_V4_MAP_STD: + case CIPSO_V4_MAP_TRANS: if (net_spot >= net_cat_size) return -EPERM; host_spot = net_cat_array[net_spot]; @@ -1277,7 +1255,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, } else tag_len = 4; - buffer[0] = 0x01; + buffer[0] = CIPSO_V4_TAG_RBITMAP; buffer[1] = tag_len; buffer[3] = level; @@ -1373,7 +1351,7 @@ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def, } else tag_len = 4; - buffer[0] = 0x02; + buffer[0] = CIPSO_V4_TAG_ENUM; buffer[1] = tag_len; buffer[3] = level; @@ -1469,7 +1447,7 @@ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def, } else tag_len = 4; - buffer[0] = 0x05; + buffer[0] = CIPSO_V4_TAG_RANGE; buffer[1] = tag_len; buffer[3] = level; @@ -1523,6 +1501,54 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, } /** + * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard) + * @doi_def: the DOI definition + * @secattr: the security attributes + * @buffer: the option buffer + * @buffer_len: length of buffer in bytes + * + * Description: + * Generate a CIPSO option using the local tag. Returns the size of the tag + * on success, negative values on failure. + * + */ +static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr, + unsigned char *buffer, + u32 buffer_len) +{ + if (!(secattr->flags & NETLBL_SECATTR_SECID)) + return -EPERM; + + buffer[0] = CIPSO_V4_TAG_LOCAL; + buffer[1] = CIPSO_V4_TAG_LOC_BLEN; + *(u32 *)&buffer[2] = secattr->attr.secid; + + return CIPSO_V4_TAG_LOC_BLEN; +} + +/** + * cipso_v4_parsetag_loc - Parse a CIPSO local tag + * @doi_def: the DOI definition + * @tag: the CIPSO tag + * @secattr: the security attributes + * + * Description: + * Parse a CIPSO local tag and return the security attributes in @secattr. + * Return zero on success, negatives values on failure. + * + */ +static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def, + const unsigned char *tag, + struct netlbl_lsm_secattr *secattr) +{ + secattr->attr.secid = *(u32 *)&tag[2]; + secattr->flags |= NETLBL_SECATTR_SECID; + + return 0; +} + +/** * cipso_v4_validate - Validate a CIPSO option * @option: the start of the option, on error it is set to point to the error * @@ -1541,7 +1567,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, * that is unrecognized." * */ -int cipso_v4_validate(unsigned char **option) +int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) { unsigned char *opt = *option; unsigned char *tag; @@ -1566,7 +1592,7 @@ int cipso_v4_validate(unsigned char **option) goto validate_return_locked; } - opt_iter = 6; + opt_iter = CIPSO_V4_HDR_LEN; tag = opt + opt_iter; while (opt_iter < opt_len) { for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];) @@ -1584,7 +1610,7 @@ int cipso_v4_validate(unsigned char **option) switch (tag[0]) { case CIPSO_V4_TAG_RBITMAP: - if (tag_len < 4) { + if (tag_len < CIPSO_V4_TAG_RBM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } @@ -1602,7 +1628,7 @@ int cipso_v4_validate(unsigned char **option) err_offset = opt_iter + 3; goto validate_return_locked; } - if (tag_len > 4 && + if (tag_len > CIPSO_V4_TAG_RBM_BLEN && cipso_v4_map_cat_rbm_valid(doi_def, &tag[4], tag_len - 4) < 0) { @@ -1612,7 +1638,7 @@ int cipso_v4_validate(unsigned char **option) } break; case CIPSO_V4_TAG_ENUM: - if (tag_len < 4) { + if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } @@ -1622,7 +1648,7 @@ int cipso_v4_validate(unsigned char **option) err_offset = opt_iter + 3; goto validate_return_locked; } - if (tag_len > 4 && + if (tag_len > CIPSO_V4_TAG_ENUM_BLEN && cipso_v4_map_cat_enum_valid(doi_def, &tag[4], tag_len - 4) < 0) { @@ -1631,7 +1657,7 @@ int cipso_v4_validate(unsigned char **option) } break; case CIPSO_V4_TAG_RANGE: - if (tag_len < 4) { + if (tag_len < CIPSO_V4_TAG_RNG_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } @@ -1641,7 +1667,7 @@ int cipso_v4_validate(unsigned char **option) err_offset = opt_iter + 3; goto validate_return_locked; } - if (tag_len > 4 && + if (tag_len > CIPSO_V4_TAG_RNG_BLEN && cipso_v4_map_cat_rng_valid(doi_def, &tag[4], tag_len - 4) < 0) { @@ -1649,6 +1675,19 @@ int cipso_v4_validate(unsigned char **option) goto validate_return_locked; } break; + case CIPSO_V4_TAG_LOCAL: + /* This is a non-standard tag that we only allow for + * local connections, so if the incoming interface is + * not the loopback device drop the packet. */ + if (!(skb->dev->flags & IFF_LOOPBACK)) { + err_offset = opt_iter; + goto validate_return_locked; + } + if (tag_len != CIPSO_V4_TAG_LOC_BLEN) { + err_offset = opt_iter + 1; + goto validate_return_locked; + } + break; default: err_offset = opt_iter; goto validate_return_locked; @@ -1704,48 +1743,27 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) } /** - * cipso_v4_sock_setattr - Add a CIPSO option to a socket - * @sk: the socket + * cipso_v4_genopt - Generate a CIPSO option + * @buf: the option buffer + * @buf_len: the size of opt_buf * @doi_def: the CIPSO DOI to use - * @secattr: the specific security attributes of the socket + * @secattr: the security attributes * * Description: - * Set the CIPSO option on the given socket using the DOI definition and - * security attributes passed to the function. This function requires - * exclusive access to @sk, which means it either needs to be in the - * process of being created or locked. Returns zero on success and negative - * values on failure. + * Generate a CIPSO option using the DOI definition and security attributes + * passed to the function. Returns the length of the option on success and + * negative values on failure. * */ -int cipso_v4_sock_setattr(struct sock *sk, - const struct cipso_v4_doi *doi_def, - const struct netlbl_lsm_secattr *secattr) +static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, + const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) { - int ret_val = -EPERM; + int ret_val; u32 iter; - unsigned char *buf; - u32 buf_len = 0; - u32 opt_len; - struct ip_options *opt = NULL; - struct inet_sock *sk_inet; - struct inet_connection_sock *sk_conn; - /* In the case of sock_create_lite(), the sock->sk field is not - * defined yet but it is not a problem as the only users of these - * "lite" PF_INET sockets are functions which do an accept() call - * afterwards so we will label the socket as part of the accept(). */ - if (sk == NULL) - return 0; - - /* We allocate the maximum CIPSO option size here so we are probably - * being a little wasteful, but it makes our life _much_ easier later - * on and after all we are only talking about 40 bytes. */ - buf_len = CIPSO_V4_OPT_LEN_MAX; - buf = kmalloc(buf_len, GFP_ATOMIC); - if (buf == NULL) { - ret_val = -ENOMEM; - goto socket_setattr_failure; - } + if (buf_len <= CIPSO_V4_HDR_LEN) + return -ENOSPC; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC @@ -1772,9 +1790,14 @@ int cipso_v4_sock_setattr(struct sock *sk, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; + case CIPSO_V4_TAG_LOCAL: + ret_val = cipso_v4_gentag_loc(doi_def, + secattr, + &buf[CIPSO_V4_HDR_LEN], + buf_len - CIPSO_V4_HDR_LEN); + break; default: - ret_val = -EPERM; - goto socket_setattr_failure; + return -EPERM; } iter++; @@ -1782,9 +1805,58 @@ int cipso_v4_sock_setattr(struct sock *sk, iter < CIPSO_V4_TAG_MAXCNT && doi_def->tags[iter] != CIPSO_V4_TAG_INVALID); if (ret_val < 0) - goto socket_setattr_failure; + return ret_val; cipso_v4_gentag_hdr(doi_def, buf, ret_val); - buf_len = CIPSO_V4_HDR_LEN + ret_val; + return CIPSO_V4_HDR_LEN + ret_val; +} + +/** + * cipso_v4_sock_setattr - Add a CIPSO option to a socket + * @sk: the socket + * @doi_def: the CIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. This function requires + * exclusive access to @sk, which means it either needs to be in the + * process of being created or locked. Returns zero on success and negative + * values on failure. + * + */ +int cipso_v4_sock_setattr(struct sock *sk, + const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -EPERM; + unsigned char *buf = NULL; + u32 buf_len; + u32 opt_len; + struct ip_options *opt = NULL; + struct inet_sock *sk_inet; + struct inet_connection_sock *sk_conn; + + /* In the case of sock_create_lite(), the sock->sk field is not + * defined yet but it is not a problem as the only users of these + * "lite" PF_INET sockets are functions which do an accept() call + * afterwards so we will label the socket as part of the accept(). */ + if (sk == NULL) + return 0; + + /* We allocate the maximum CIPSO option size here so we are probably + * being a little wasteful, but it makes our life _much_ easier later + * on and after all we are only talking about 40 bytes. */ + buf_len = CIPSO_V4_OPT_LEN_MAX; + buf = kmalloc(buf_len, GFP_ATOMIC); + if (buf == NULL) { + ret_val = -ENOMEM; + goto socket_setattr_failure; + } + + ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); + if (ret_val < 0) + goto socket_setattr_failure; + buf_len = ret_val; /* We can't use ip_options_get() directly because it makes a call to * ip_options_get_alloc() which allocates memory with GFP_KERNEL and @@ -1822,6 +1894,80 @@ socket_setattr_failure: } /** + * cipso_v4_sock_delattr - Delete the CIPSO option from a socket + * @sk: the socket + * + * Description: + * Removes the CIPSO option from a socket, if present. + * + */ +void cipso_v4_sock_delattr(struct sock *sk) +{ + u8 hdr_delta; + struct ip_options *opt; + struct inet_sock *sk_inet; + + sk_inet = inet_sk(sk); + opt = sk_inet->opt; + if (opt == NULL || opt->cipso == 0) + return; + + if (opt->srr || opt->rr || opt->ts || opt->router_alert) { + u8 cipso_len; + u8 cipso_off; + unsigned char *cipso_ptr; + int iter; + int optlen_new; + + cipso_off = opt->cipso - sizeof(struct iphdr); + cipso_ptr = &opt->__data[cipso_off]; + cipso_len = cipso_ptr[1]; + + if (opt->srr > opt->cipso) + opt->srr -= cipso_len; + if (opt->rr > opt->cipso) + opt->rr -= cipso_len; + if (opt->ts > opt->cipso) + opt->ts -= cipso_len; + if (opt->router_alert > opt->cipso) + opt->router_alert -= cipso_len; + opt->cipso = 0; + + memmove(cipso_ptr, cipso_ptr + cipso_len, + opt->optlen - cipso_off - cipso_len); + + /* determining the new total option length is tricky because of + * the padding necessary, the only thing i can think to do at + * this point is walk the options one-by-one, skipping the + * padding at the end to determine the actual option size and + * from there we can determine the new total option length */ + iter = 0; + optlen_new = 0; + while (iter < opt->optlen) + if (opt->__data[iter] != IPOPT_NOP) { + iter += opt->__data[iter + 1]; + optlen_new = iter; + } else + iter++; + hdr_delta = opt->optlen; + opt->optlen = (optlen_new + 3) & ~3; + hdr_delta -= opt->optlen; + } else { + /* only the cipso option was present on the socket so we can + * remove the entire option struct */ + sk_inet->opt = NULL; + hdr_delta = opt->optlen; + kfree(opt); + } + + if (sk_inet->is_icsk && hdr_delta > 0) { + struct inet_connection_sock *sk_conn = inet_csk(sk); + sk_conn->icsk_ext_hdr_len -= hdr_delta; + sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); + } +} + +/** * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions * @cipso: the CIPSO v4 option * @secattr: the security attributes @@ -1859,6 +2005,9 @@ static int cipso_v4_getattr(const unsigned char *cipso, case CIPSO_V4_TAG_RANGE: ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); break; + case CIPSO_V4_TAG_LOCAL: + ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr); + break; } if (ret_val == 0) secattr->type = NETLBL_NLTYPE_CIPSOV4; @@ -1893,6 +2042,123 @@ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) } /** + * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet + * @skb: the packet + * @secattr: the security attributes + * + * Description: + * Set the CIPSO option on the given packet based on the security attributes. + * Returns a pointer to the IP header on success and NULL on failure. + * + */ +int cipso_v4_skbuff_setattr(struct sk_buff *skb, + const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct iphdr *iph; + struct ip_options *opt = &IPCB(skb)->opt; + unsigned char buf[CIPSO_V4_OPT_LEN_MAX]; + u32 buf_len = CIPSO_V4_OPT_LEN_MAX; + u32 opt_len; + int len_delta; + + buf_len = cipso_v4_genopt(buf, buf_len, doi_def, secattr); + if (buf_len < 0) + return buf_len; + opt_len = (buf_len + 3) & ~3; + + /* we overwrite any existing options to ensure that we have enough + * room for the CIPSO option, the reason is that we _need_ to guarantee + * that the security label is applied to the packet - we do the same + * thing when using the socket options and it hasn't caused a problem, + * if we need to we can always revisit this choice later */ + + len_delta = opt_len - opt->optlen; + /* if we don't ensure enough headroom we could panic on the skb_push() + * call below so make sure we have enough, we are also "mangling" the + * packet so we should probably do a copy-on-write call anyway */ + ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); + if (ret_val < 0) + return ret_val; + + if (len_delta > 0) { + /* we assume that the header + opt->optlen have already been + * "pushed" in ip_options_build() or similar */ + iph = ip_hdr(skb); + skb_push(skb, len_delta); + memmove((char *)iph - len_delta, iph, iph->ihl << 2); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + } else if (len_delta < 0) { + iph = ip_hdr(skb); + memset(iph + 1, IPOPT_NOP, opt->optlen); + } else + iph = ip_hdr(skb); + + if (opt->optlen > 0) + memset(opt, 0, sizeof(*opt)); + opt->optlen = opt_len; + opt->cipso = sizeof(struct iphdr); + opt->is_changed = 1; + + /* we have to do the following because we are being called from a + * netfilter hook which means the packet already has had the header + * fields populated and the checksum calculated - yes this means we + * are doing more work than needed but we do it to keep the core + * stack clean and tidy */ + memcpy(iph + 1, buf, buf_len); + if (opt_len > buf_len) + memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len); + if (len_delta != 0) { + iph->ihl = 5 + (opt_len >> 2); + iph->tot_len = htons(skb->len); + } + ip_send_check(iph); + + return 0; +} + +/** + * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet + * @skb: the packet + * + * Description: + * Removes any and all CIPSO options from the given packet. Returns zero on + * success, negative values on failure. + * + */ +int cipso_v4_skbuff_delattr(struct sk_buff *skb) +{ + int ret_val; + struct iphdr *iph; + struct ip_options *opt = &IPCB(skb)->opt; + unsigned char *cipso_ptr; + + if (opt->cipso == 0) + return 0; + + /* since we are changing the packet we should make a copy */ + ret_val = skb_cow(skb, skb_headroom(skb)); + if (ret_val < 0) + return ret_val; + + /* the easiest thing to do is just replace the cipso option with noop + * options since we don't change the size of the packet, although we + * still need to recalculate the checksum */ + + iph = ip_hdr(skb); + cipso_ptr = (unsigned char *)iph + opt->cipso; + memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]); + opt->cipso = 0; + opt->is_changed = 1; + + ip_send_check(iph); + + return 0; +} + +/** * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option * @skb: the packet * @secattr: the security attributes diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 0c0c73f368ce..5e6c5a0f3fde 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -52,7 +52,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->sport, usin->sin_port, sk, 1); if (err) { if (err == -ENETUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return err; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 79a7ef6209ff..56fce3ab6c55 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,8 +1,6 @@ /* * NET3 IP device support routines. * - * Version: $Id: devinet.c,v 1.44 2001/10/31 21:55:54 davem Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -140,8 +138,8 @@ void in_dev_finish_destroy(struct in_device *idev) { struct net_device *dev = idev->dev; - BUG_TRAP(!idev->ifa_list); - BUG_TRAP(!idev->mc_list); + WARN_ON(idev->ifa_list); + WARN_ON(idev->mc_list); #ifdef NET_REFCNT_DEBUG printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n", idev, dev ? dev->name : "NIL"); @@ -170,6 +168,8 @@ static struct in_device *inetdev_init(struct net_device *dev) in_dev->dev = dev; if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) goto out_kfree; + if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) + dev_disable_lro(dev); /* Reference in_dev->dev */ dev_hold(dev); /* Account for reference dev->ip_ptr (below) */ @@ -399,7 +399,7 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) } ipv4_devconf_setall(in_dev); if (ifa->ifa_dev != in_dev) { - BUG_TRAP(!ifa->ifa_dev); + WARN_ON(ifa->ifa_dev); in_dev_hold(in_dev); ifa->ifa_dev = in_dev; } @@ -613,9 +613,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (colon) *colon = 0; -#ifdef CONFIG_KMOD dev_load(net, ifr.ifr_name); -#endif switch (cmd) { case SIOCGIFADDR: /* Get interface address */ @@ -1013,7 +1011,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) memcpy(old, ifa->ifa_label, IFNAMSIZ); memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (named++ == 0) - continue; + goto skip; dot = strchr(old, ':'); if (dot == NULL) { sprintf(old, ":%d", named); @@ -1024,9 +1022,16 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) } else { strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); } +skip: + rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } +static inline bool inetdev_valid_mtu(unsigned mtu) +{ + return mtu >= 68; +} + /* Called only under RTNL semaphore */ static int inetdev_event(struct notifier_block *this, unsigned long event, @@ -1046,6 +1051,10 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); } + } else if (event == NETDEV_CHANGEMTU) { + /* Re-enabling IP */ + if (inetdev_valid_mtu(dev->mtu)) + in_dev = inetdev_init(dev); } goto out; } @@ -1056,7 +1065,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, dev->ip_ptr = NULL; break; case NETDEV_UP: - if (dev->mtu < 68) + if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { struct in_ifaddr *ifa; @@ -1078,9 +1087,9 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, ip_mc_down(in_dev); break; case NETDEV_CHANGEMTU: - if (dev->mtu >= 68) + if (inetdev_valid_mtu(dev->mtu)) break; - /* MTU falled under 68, disable IP */ + /* disable IP when MTU is not enough */ case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; @@ -1241,6 +1250,8 @@ static void inet_forward_change(struct net *net) read_lock(&dev_base_lock); for_each_netdev(net, dev) { struct in_device *in_dev; + if (on) + dev_disable_lro(dev); rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) @@ -1248,8 +1259,6 @@ static void inet_forward_change(struct net *net) rcu_read_unlock(); } read_unlock(&dev_base_lock); - - rt_cache_flush(0); } static int devinet_conf_proc(ctl_table *ctl, int write, @@ -1272,7 +1281,7 @@ static int devinet_conf_proc(ctl_table *ctl, int write, return ret; } -static int devinet_conf_sysctl(ctl_table *table, int __user *name, int nlen, +static int devinet_conf_sysctl(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1335,10 +1344,19 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, if (write && *valp != val) { struct net *net = ctl->extra2; - if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) - inet_forward_change(net); - else if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) - rt_cache_flush(0); + if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { + rtnl_lock(); + if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { + inet_forward_change(net); + } else if (*valp) { + struct ipv4_devconf *cnf = ctl->extra1; + struct in_device *idev = + container_of(cnf, struct in_device, cnf); + dev_disable_lro(idev->dev); + } + rtnl_unlock(); + rt_cache_flush(net, 0); + } } return ret; @@ -1351,22 +1369,23 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write, int *valp = ctl->data; int val = *valp; int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + struct net *net = ctl->extra2; if (write && *valp != val) - rt_cache_flush(0); + rt_cache_flush(net, 0); return ret; } -int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, +int ipv4_doint_and_flush_strategy(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { - int ret = devinet_conf_sysctl(table, name, nlen, oldval, oldlenp, - newval, newlen); + int ret = devinet_conf_sysctl(table, oldval, oldlenp, newval, newlen); + struct net *net = table->extra2; if (ret == 1) - rt_cache_flush(0); + rt_cache_flush(net, 0); return ret; } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 4e73e5708e70..21515d4c49eb 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -575,7 +575,7 @@ static int esp_init_state(struct xfrm_state *x) crypto_aead_ivsize(aead); if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); - else if (x->props.mode == XFRM_MODE_BEET) + else if (x->props.mode == XFRM_MODE_BEET && x->sel.family != AF_INET6) x->props.header_len += IPV4_BEET_PHMAXLEN; if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 0b2ac6a3d903..65c1503f8cc8 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -5,8 +5,6 @@ * * IPv4 Forwarding Information Base: FIB frontend. * - * Version: $Id: fib_frontend.c,v 1.26 2001/10/31 21:55:54 davem Exp $ - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or @@ -146,7 +144,7 @@ static void fib_flush(struct net *net) } if (flushed) - rt_cache_flush(-1); + rt_cache_flush(net, -1); } /* @@ -899,21 +897,22 @@ static void fib_disable_ip(struct net_device *dev, int force) { if (fib_sync_down_dev(dev, force)) fib_flush(dev_net(dev)); - rt_cache_flush(0); + rt_cache_flush(dev_net(dev), 0); arp_ifdown(dev); } static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr*)ptr; + struct net_device *dev = ifa->ifa_dev->dev; switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(ifa->ifa_dev->dev); + fib_sync_up(dev); #endif - rt_cache_flush(-1); + rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: fib_del_ifaddr(ifa); @@ -921,9 +920,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. Disable IP. */ - fib_disable_ip(ifa->ifa_dev->dev, 1); + fib_disable_ip(dev, 1); } else { - rt_cache_flush(-1); + rt_cache_flush(dev_net(dev), -1); } break; } @@ -951,14 +950,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif - rt_cache_flush(-1); + rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: fib_disable_ip(dev, 0); break; case NETDEV_CHANGEMTU: case NETDEV_CHANGE: - rt_cache_flush(0); + rt_cache_flush(dev_net(dev), 0); break; } return NOTIFY_DONE; diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 2e2fc3376ac9..c8cac6c7f881 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -5,8 +5,6 @@ * * IPv4 FIB: lookup engine and maintenance routines. * - * Version: $Id: fib_hash.c,v 1.13 2001/10/31 21:55:54 davem Exp $ - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or @@ -474,7 +472,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); return 0; @@ -534,7 +532,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) if (new_f) fz->fz_nent++; - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -616,7 +614,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg) write_unlock_bh(&fib_hash_lock); if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); fn_free_alias(fa, f); if (kill_fn) { fn_free_node(f); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 1fb56876be54..6080d7120821 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -258,9 +258,9 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) + nla_total_size(4); /* flow */ } -static void fib4_rule_flush_cache(void) +static void fib4_rule_flush_cache(struct fib_rules_ops *ops) { - rt_cache_flush(-1); + rt_cache_flush(ops->fro_net, -1); } static struct fib_rules_ops fib4_rules_ops_template = { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 0d4d72827e4b..ded2ae34eab1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -5,8 +5,6 @@ * * IPv4 Forwarding Information Base: semantics. * - * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $ - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e1600ad8fb0e..5cb72786a8af 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -22,8 +22,6 @@ * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 * - * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $ - * * * Code from fib_hash has been reused which includes the following header: * @@ -1273,7 +1271,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); @@ -1318,7 +1316,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) list_add_tail_rcu(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head)); - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, 0); succeeded: @@ -1661,7 +1659,7 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) trie_leaf_remove(t, l); if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -2253,25 +2251,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) static int fib_triestat_seq_open(struct inode *inode, struct file *file) { - int err; - struct net *net; - - net = get_proc_net(inode); - if (net == NULL) - return -ENXIO; - err = single_open(file, fib_triestat_seq_show, net); - if (err < 0) { - put_net(net); - return err; - } - return 0; -} - -static int fib_triestat_seq_release(struct inode *ino, struct file *f) -{ - struct seq_file *seq = f->private_data; - put_net(seq->private); - return single_release(ino, f); + return single_open_net(inode, file, fib_triestat_seq_show); } static const struct file_operations fib_triestat_fops = { @@ -2279,7 +2259,7 @@ static const struct file_operations fib_triestat_fops = { .open = fib_triestat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = fib_triestat_seq_release, + .release = single_release_net, }; static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 87397351ddac..72b2de76f1cd 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1,9 +1,7 @@ /* * NET3: Implementation of the ICMP protocol layer. * - * Alan Cox, <alan@redhat.com> - * - * Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $ + * Alan Cox, <alan@lxorguk.ukuu.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -113,12 +111,6 @@ struct icmp_bxm { unsigned char optbuf[40]; }; -/* - * Statistics - */ -DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly; -DEFINE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics) __read_mostly; - /* An array of errno for error messages from dest unreach. */ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ @@ -212,18 +204,22 @@ static struct sock *icmp_sk(struct net *net) return net->ipv4.icmp_sk[smp_processor_id()]; } -static inline int icmp_xmit_lock(struct sock *sk) +static inline struct sock *icmp_xmit_lock(struct net *net) { + struct sock *sk; + local_bh_disable(); + sk = icmp_sk(net); + if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path signals a * dst_link_failure() for an outgoing ICMP packet. */ local_bh_enable(); - return 1; + return NULL; } - return 0; + return sk; } static inline void icmp_xmit_unlock(struct sock *sk) @@ -298,10 +294,10 @@ out: /* * Maintain the counters used in the SNMP statistics for outgoing ICMP */ -void icmp_out_count(unsigned char type) +void icmp_out_count(struct net *net, unsigned char type) { - ICMPMSGOUT_INC_STATS(type); - ICMP_INC_STATS(ICMP_MIB_OUTMSGS); + ICMPMSGOUT_INC_STATS(net, type); + ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS); } /* @@ -362,15 +358,17 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct ipcm_cookie ipc; struct rtable *rt = skb->rtable; struct net *net = dev_net(rt->u.dst.dev); - struct sock *sk = icmp_sk(net); - struct inet_sock *inet = inet_sk(sk); + struct sock *sk; + struct inet_sock *inet; __be32 daddr; if (ip_options_echo(&icmp_param->replyopts, skb)) return; - if (icmp_xmit_lock(sk)) + sk = icmp_xmit_lock(net); + if (sk == NULL) return; + inet = inet_sk(sk); icmp_param->data.icmph.checksum = 0; @@ -427,7 +425,6 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) if (!rt) goto out; net = dev_net(rt->u.dst.dev); - sk = icmp_sk(net); /* * Find the original header. It is expected to be valid, of course. @@ -491,7 +488,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } } - if (icmp_xmit_lock(sk)) + sk = icmp_xmit_lock(net); + if (sk == NULL) return; /* @@ -765,7 +763,7 @@ static void icmp_unreach(struct sk_buff *skb) out: return; out_err: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); goto out; } @@ -805,7 +803,7 @@ static void icmp_redirect(struct sk_buff *skb) out: return; out_err: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); goto out; } @@ -876,7 +874,7 @@ static void icmp_timestamp(struct sk_buff *skb) out: return; out_err: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(dev_net(skb->dst->dev), ICMP_MIB_INERRORS); goto out; } @@ -975,6 +973,7 @@ int icmp_rcv(struct sk_buff *skb) { struct icmphdr *icmph; struct rtable *rt = skb->rtable; + struct net *net = dev_net(rt->u.dst.dev); if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { int nh; @@ -995,7 +994,7 @@ int icmp_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - ICMP_INC_STATS_BH(ICMP_MIB_INMSGS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); switch (skb->ip_summed) { case CHECKSUM_COMPLETE: @@ -1013,7 +1012,7 @@ int icmp_rcv(struct sk_buff *skb) icmph = icmp_hdr(skb); - ICMPMSGIN_INC_STATS_BH(icmph->type); + ICMPMSGIN_INC_STATS_BH(net, icmph->type); /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * @@ -1029,9 +1028,6 @@ int icmp_rcv(struct sk_buff *skb) */ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - struct net *net; - - net = dev_net(rt->u.dst.dev); /* * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be * silently ignored (we let user decide with a sysctl). @@ -1057,7 +1053,7 @@ drop: kfree_skb(skb); return 0; error: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); goto drop; } @@ -1217,5 +1213,4 @@ int __init icmp_init(void) EXPORT_SYMBOL(icmp_err_convert); EXPORT_SYMBOL(icmp_send); -EXPORT_SYMBOL(icmp_statistics); EXPORT_SYMBOL(xrlim_allow); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 2769dc4a4c84..a0d86455c53e 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -8,10 +8,8 @@ * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * - * Version: $Id: igmp.c,v 1.47 2002/02/01 22:01:03 davem Exp $ - * * Authors: - * Alan Cox <Alan.Cox@linux.org> + * Alan Cox <alan@lxorguk.ukuu.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -291,6 +289,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) struct rtable *rt; struct iphdr *pip; struct igmpv3_report *pig; + struct net *net = dev_net(dev); skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); if (skb == NULL) @@ -301,7 +300,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) .nl_u = { .ip4_u = { .daddr = IGMPV3_ALL_MCR } }, .proto = IPPROTO_IGMP }; - if (ip_route_output_key(&init_net, &rt, &fl)) { + if (ip_route_output_key(net, &rt, &fl)) { kfree_skb(skb); return NULL; } @@ -631,6 +630,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, struct igmphdr *ih; struct rtable *rt; struct net_device *dev = in_dev->dev; + struct net *net = dev_net(dev); __be32 group = pmc ? pmc->multiaddr : 0; __be32 dst; @@ -645,7 +645,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, struct flowi fl = { .oif = dev->ifindex, .nl_u = { .ip4_u = { .daddr = dst } }, .proto = IPPROTO_IGMP }; - if (ip_route_output_key(&init_net, &rt, &fl)) + if (ip_route_output_key(net, &rt, &fl)) return -1; } if (rt->rt_src == 0) { @@ -1198,9 +1198,6 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) ASSERT_RTNL(); - if (dev_net(in_dev->dev) != &init_net) - return; - for (im=in_dev->mc_list; im; im=im->next) { if (im->multiaddr == addr) { im->users++; @@ -1237,6 +1234,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) write_lock_bh(&in_dev->mc_list_lock); im->next=in_dev->mc_list; in_dev->mc_list=im; + in_dev->mc_count++; write_unlock_bh(&in_dev->mc_list_lock); #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im->multiaddr); @@ -1280,14 +1278,12 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) ASSERT_RTNL(); - if (dev_net(in_dev->dev) != &init_net) - return; - for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { if (i->multiaddr==addr) { if (--i->users == 0) { write_lock_bh(&in_dev->mc_list_lock); *ip = i->next; + in_dev->mc_count--; write_unlock_bh(&in_dev->mc_list_lock); igmp_group_dropped(i); @@ -1310,9 +1306,6 @@ void ip_mc_down(struct in_device *in_dev) ASSERT_RTNL(); - if (dev_net(in_dev->dev) != &init_net) - return; - for (i=in_dev->mc_list; i; i=i->next) igmp_group_dropped(i); @@ -1333,15 +1326,13 @@ void ip_mc_init_dev(struct in_device *in_dev) { ASSERT_RTNL(); - if (dev_net(in_dev->dev) != &init_net) - return; - in_dev->mc_tomb = NULL; #ifdef CONFIG_IP_MULTICAST in_dev->mr_gq_running = 0; setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire, (unsigned long)in_dev); in_dev->mr_ifc_count = 0; + in_dev->mc_count = 0; setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, (unsigned long)in_dev); in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; @@ -1359,9 +1350,6 @@ void ip_mc_up(struct in_device *in_dev) ASSERT_RTNL(); - if (dev_net(in_dev->dev) != &init_net) - return; - ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for (i=in_dev->mc_list; i; i=i->next) @@ -1378,17 +1366,14 @@ void ip_mc_destroy_dev(struct in_device *in_dev) ASSERT_RTNL(); - if (dev_net(in_dev->dev) != &init_net) - return; - /* Deactivate timers */ ip_mc_down(in_dev); write_lock_bh(&in_dev->mc_list_lock); while ((i = in_dev->mc_list) != NULL) { in_dev->mc_list = i->next; + in_dev->mc_count--; write_unlock_bh(&in_dev->mc_list_lock); - igmp_group_dropped(i); ip_ma_put(i); @@ -1397,7 +1382,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev) write_unlock_bh(&in_dev->mc_list_lock); } -static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) +static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = imr->imr_multiaddr.s_addr } } }; @@ -1406,19 +1391,19 @@ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) struct in_device *idev = NULL; if (imr->imr_ifindex) { - idev = inetdev_by_index(&init_net, imr->imr_ifindex); + idev = inetdev_by_index(net, imr->imr_ifindex); if (idev) __in_dev_put(idev); return idev; } if (imr->imr_address.s_addr) { - dev = ip_dev_find(&init_net, imr->imr_address.s_addr); + dev = ip_dev_find(net, imr->imr_address.s_addr); if (!dev) return NULL; dev_put(dev); } - if (!dev && !ip_route_output_key(&init_net, &rt, &fl)) { + if (!dev && !ip_route_output_key(net, &rt, &fl)) { dev = rt->u.dst.dev; ip_rt_put(rt); } @@ -1756,18 +1741,16 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) struct ip_mc_socklist *iml=NULL, *i; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); int ifindex; int count = 0; if (!ipv4_is_multicast(addr)) return -EINVAL; - if (sock_net(sk) != &init_net) - return -EPROTONOSUPPORT; - rtnl_lock(); - in_dev = ip_mc_find_dev(imr); + in_dev = ip_mc_find_dev(net, imr); if (!in_dev) { iml = NULL; @@ -1829,15 +1812,13 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml, **imlp; struct in_device *in_dev; + struct net *net = sock_net(sk); __be32 group = imr->imr_multiaddr.s_addr; u32 ifindex; int ret = -EADDRNOTAVAIL; - if (sock_net(sk) != &init_net) - return -EPROTONOSUPPORT; - rtnl_lock(); - in_dev = ip_mc_find_dev(imr); + in_dev = ip_mc_find_dev(net, imr); ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { if (iml->multi.imr_multiaddr.s_addr != group) @@ -1875,21 +1856,19 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct struct in_device *in_dev = NULL; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; + struct net *net = sock_net(sk); int leavegroup = 0; int i, j, rv; if (!ipv4_is_multicast(addr)) return -EINVAL; - if (sock_net(sk) != &init_net) - return -EPROTONOSUPPORT; - rtnl_lock(); imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; imr.imr_address.s_addr = mreqs->imr_interface; imr.imr_ifindex = ifindex; - in_dev = ip_mc_find_dev(&imr); + in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; @@ -2009,6 +1988,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *newpsl, *psl; + struct net *net = sock_net(sk); int leavegroup = 0; if (!ipv4_is_multicast(addr)) @@ -2017,15 +1997,12 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) msf->imsf_fmode != MCAST_EXCLUDE) return -EINVAL; - if (sock_net(sk) != &init_net) - return -EPROTONOSUPPORT; - rtnl_lock(); imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = ifindex; - in_dev = ip_mc_find_dev(&imr); + in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; @@ -2096,19 +2073,17 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; + struct net *net = sock_net(sk); if (!ipv4_is_multicast(addr)) return -EINVAL; - if (sock_net(sk) != &init_net) - return -EPROTONOSUPPORT; - rtnl_lock(); imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = 0; - in_dev = ip_mc_find_dev(&imr); + in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; @@ -2165,9 +2140,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, if (!ipv4_is_multicast(addr)) return -EINVAL; - if (sock_net(sk) != &init_net) - return -EPROTONOSUPPORT; - rtnl_lock(); err = -EADDRNOTAVAIL; @@ -2248,19 +2220,17 @@ void ip_mc_drop_socket(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; + struct net *net = sock_net(sk); if (inet->mc_list == NULL) return; - if (sock_net(sk) != &init_net) - return; - rtnl_lock(); while ((iml = inet->mc_list) != NULL) { struct in_device *in_dev; inet->mc_list = iml->next; - in_dev = inetdev_by_index(&init_net, iml->multi.imr_ifindex); + in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); if (in_dev != NULL) { ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); @@ -2416,7 +2386,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) if (state->in_dev->mc_list == im) { seq_printf(seq, "%d\t%-10s: %5d %7s\n", - state->dev->ifindex, state->dev->name, state->dev->mc_count, querier); + state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); } seq_printf(seq, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index ec834480abe7..bd1278a2d828 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -30,20 +30,22 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif /* - * This array holds the first and last local port number. + * This struct holds the first and last local port number. */ -int sysctl_local_port_range[2] = { 32768, 61000 }; -DEFINE_SEQLOCK(sysctl_port_range_lock); +struct local_ports sysctl_local_ports __read_mostly = { + .lock = SEQLOCK_UNLOCKED, + .range = { 32768, 61000 }, +}; void inet_get_local_port_range(int *low, int *high) { unsigned seq; do { - seq = read_seqbegin(&sysctl_port_range_lock); + seq = read_seqbegin(&sysctl_local_ports.lock); - *low = sysctl_local_port_range[0]; - *high = sysctl_local_port_range[1]; - } while (read_seqretry(&sysctl_port_range_lock, seq)); + *low = sysctl_local_ports.range[0]; + *high = sysctl_local_ports.range[1]; + } while (read_seqretry(&sysctl_local_ports.lock, seq)); } EXPORT_SYMBOL(inet_get_local_port_range); @@ -103,7 +105,8 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) rover = net_random() % remaining + low; do { - head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(net, rover, + hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->ib_net == net && tb->port == rover) @@ -130,7 +133,8 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) */ snum = rover; } else { - head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(net, snum, + hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->ib_net == net && tb->port == snum) @@ -165,7 +169,7 @@ tb_not_found: success: if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); + WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); ret = 0; fail_unlock: @@ -258,7 +262,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) } newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); - BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); + WARN_ON(newsk->sk_state == TCP_SYN_RECV); out: release_sock(sk); return newsk; @@ -333,18 +337,20 @@ struct dst_entry* inet_csk_route_req(struct sock *sk, .saddr = ireq->loc_addr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, + .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = { .sport = inet_sk(sk)->sport, .dport = ireq->rmt_port } } }; + struct net *net = sock_net(sk); security_req_classify_flow(req, &fl); - if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) { - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + if (ip_route_output_flow(net, &rt, &fl, sk, 0)) { + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { ip_rt_put(rt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } return &rt->u.dst; @@ -383,7 +389,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, ireq->rmt_addr == raddr && ireq->loc_addr == laddr && AF_INET_FAMILY(req->rsk_ops->family)) { - BUG_TRAP(!req->sk); + WARN_ON(req->sk); *prevp = prev; break; } @@ -512,6 +518,8 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, newicsk->icsk_bind_hash = NULL; inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; + inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port); + inet_sk(newsk)->sport = inet_rsk(req)->loc_port; newsk->sk_write_space = sk_stream_write_space; newicsk->icsk_retransmits = 0; @@ -536,14 +544,14 @@ EXPORT_SYMBOL_GPL(inet_csk_clone); */ void inet_csk_destroy_sock(struct sock *sk) { - BUG_TRAP(sk->sk_state == TCP_CLOSE); - BUG_TRAP(sock_flag(sk, SOCK_DEAD)); + WARN_ON(sk->sk_state != TCP_CLOSE); + WARN_ON(!sock_flag(sk, SOCK_DEAD)); /* It cannot be in hash table! */ - BUG_TRAP(sk_unhashed(sk)); + WARN_ON(!sk_unhashed(sk)); /* If it has not 0 inet_sk(sk)->num, it must be bound */ - BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); + WARN_ON(inet_sk(sk)->num && !inet_csk(sk)->icsk_bind_hash); sk->sk_prot->destroy(sk); @@ -626,7 +634,7 @@ void inet_csk_listen_stop(struct sock *sk) local_bh_disable(); bh_lock_sock(child); - BUG_TRAP(!sock_owned_by_user(child)); + WARN_ON(sock_owned_by_user(child)); sock_hold(child); sk->sk_prot->disconnect(child, O_NONBLOCK); @@ -644,7 +652,7 @@ void inet_csk_listen_stop(struct sock *sk) sk_acceptq_removed(sk); __reqsk_free(req); } - BUG_TRAP(!sk->sk_ack_backlog); + WARN_ON(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index da97695e7096..564230dabcb8 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1,8 +1,6 @@ /* * inet_diag.c Module for monitoring INET transport protocols sockets. * - * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or @@ -55,11 +53,9 @@ static DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int type) { -#ifdef CONFIG_KMOD if (!inet_diag_table[type]) request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, NETLINK_INET_DIAG, type); -#endif mutex_lock(&inet_diag_table_mutex); if (!inet_diag_table[type]) @@ -784,11 +780,15 @@ skip_listen_ht: struct sock *sk; struct hlist_node *node; + num = 0; + + if (hlist_empty(&head->chain) && hlist_empty(&head->twchain)) + continue; + if (i > s_i) s_num = 0; read_lock_bh(lock); - num = 0; sk_for_each(sk, node, &head->chain) { struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 0546a0bc97ea..6c52e08f786e 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -134,8 +134,8 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, struct sk_buff *fp; struct netns_frags *nf; - BUG_TRAP(q->last_in & INET_FRAG_COMPLETE); - BUG_TRAP(del_timer(&q->timer) == 0); + WARN_ON(!(q->last_in & INET_FRAG_COMPLETE)); + WARN_ON(del_timer(&q->timer) != 0); /* Release all fragment data. */ fp = q->fragments; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2023d37b2708..44981906fb91 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -70,7 +70,8 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, static void __inet_put_port(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); + const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->num, + hashinfo->bhash_size); struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; @@ -95,7 +96,8 @@ EXPORT_SYMBOL(inet_put_port); void __inet_inherit_port(struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); + const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->num, + table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; @@ -192,7 +194,7 @@ struct sock *__inet_lookup_listener(struct net *net, const struct hlist_head *head; read_lock(&hashinfo->lhash_lock); - head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; + head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)]; if (!hlist_empty(head)) { const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); @@ -225,7 +227,7 @@ struct sock * __inet_lookup_established(struct net *net, /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ - unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); + unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); @@ -265,13 +267,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, int dif = sk->sk_bound_dev_if; INET_ADDR_COOKIE(acookie, saddr, daddr) const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); + struct net *net = sock_net(sk); + unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); rwlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_node *node; struct inet_timewait_sock *tw; - struct net *net = sock_net(sk); prefetch(head->chain.first); write_lock(lock); @@ -303,18 +305,18 @@ unique: inet->num = lport; inet->sport = htons(lport); sk->sk_hash = hash; - BUG_TRAP(sk_unhashed(sk)); + WARN_ON(!sk_unhashed(sk)); __sk_add_node(sk, &head->chain); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock(lock); if (twp) { *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule(tw, death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); } @@ -340,7 +342,7 @@ void __inet_hash_nolisten(struct sock *sk) rwlock_t *lock; struct inet_ehash_bucket *head; - BUG_TRAP(sk_unhashed(sk)); + WARN_ON(!sk_unhashed(sk)); sk->sk_hash = inet_sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); @@ -365,7 +367,7 @@ static void __inet_hash(struct sock *sk) return; } - BUG_TRAP(sk_unhashed(sk)); + WARN_ON(!sk_unhashed(sk)); list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; lock = &hashinfo->lhash_lock; @@ -438,7 +440,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; - head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, @@ -447,7 +450,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, */ inet_bind_bucket_for_each(tb, node, &head->chain) { if (tb->ib_net == net && tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); + WARN_ON(hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) goto next_port; if (!check_established(death_row, sk, @@ -493,7 +496,7 @@ ok: goto out; } - head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)]; tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ce16e9ac24c1..1c5fd38f8824 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -32,7 +32,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, write_unlock(lock); /* Disassociate with bind bucket. */ - bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, + hashinfo->bhash_size)]; spin_lock(&bhead->lock); tb = tw->tw_tb; __hlist_del(&tw->tw_bind_node); @@ -81,10 +82,11 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->num, + hashinfo->bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; - BUG_TRAP(icsk->icsk_bind_hash); + WARN_ON(!icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); spin_unlock(&bhead->lock); @@ -124,6 +126,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat tw->tw_reuse = sk->sk_reuse; tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; + tw->tw_transparent = inet->transparent; tw->tw_prot = sk->sk_prot_creator; twsk_net_set(tw, hold_net(sock_net(sk))); atomic_set(&tw->tw_refcnt, 1); @@ -158,6 +161,9 @@ rescan: __inet_twsk_del_dead_node(tw); spin_unlock(&twdr->death_lock); __inet_twsk_kill(tw, twdr->hashinfo); +#ifdef CONFIG_NET_NS + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); +#endif inet_twsk_put(tw); killed++; spin_lock(&twdr->death_lock); @@ -176,8 +182,9 @@ rescan: } twdr->tw_count -= killed; - NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); - +#ifndef CONFIG_NET_NS + NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed); +#endif return ret; } @@ -370,6 +377,9 @@ void inet_twdr_twcal_tick(unsigned long data) &twdr->twcal_row[slot]) { __inet_twsk_del_dead_node(tw); __inet_twsk_kill(tw, twdr->hashinfo); +#ifdef CONFIG_NET_NS + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); +#endif inet_twsk_put(tw); killed++; } @@ -393,8 +403,45 @@ void inet_twdr_twcal_tick(unsigned long data) out: if ((twdr->tw_count -= killed) == 0) del_timer(&twdr->tw_timer); - NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); +#ifndef CONFIG_NET_NS + NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed); +#endif spin_unlock(&twdr->death_lock); } EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); + +void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, + struct inet_timewait_death_row *twdr, int family) +{ + struct inet_timewait_sock *tw; + struct sock *sk; + struct hlist_node *node; + int h; + + local_bh_disable(); + for (h = 0; h < (hashinfo->ehash_size); h++) { + struct inet_ehash_bucket *head = + inet_ehash_bucket(hashinfo, h); + rwlock_t *lock = inet_ehash_lockp(hashinfo, h); +restart: + write_lock(lock); + sk_for_each(sk, node, &head->twchain) { + + tw = inet_twsk(sk); + if (!net_eq(twsk_net(tw), net) || + tw->tw_family != family) + continue; + + atomic_inc(&tw->tw_refcnt); + write_unlock(lock); + inet_twsk_deschedule(tw, twdr); + inet_twsk_put(tw); + + goto restart; + } + write_unlock(lock); + } + local_bh_enable(); +} +EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index af995198f643..a456ceeac3f2 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -3,8 +3,6 @@ * * This source is covered by the GNU GPL, the same as all kernel sources. * - * Version: $Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $ - * * Authors: Andrey V. Savochkin <saw@msu.ru> */ diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 4813c39b438b..450016b89a18 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -5,8 +5,6 @@ * * The IP forwarding functionality. * - * Version: $Id: ip_forward.c,v 1.48 2000/12/13 18:31:48 davem Exp $ - * * Authors: see ip.c * * Fixes: @@ -44,7 +42,7 @@ static int ip_forward_finish(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -58,6 +56,9 @@ int ip_forward(struct sk_buff *skb) struct rtable *rt; /* Route we use */ struct ip_options * opt = &(IPCB(skb)->opt); + if (skb_warn_if_lro(skb)) + goto drop; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) goto drop; @@ -87,7 +88,7 @@ int ip_forward(struct sk_buff *skb) if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(dst_mtu(&rt->u.dst))); goto drop; @@ -122,7 +123,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... */ - IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 37221f659159..e4f81f54befe 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,10 +5,8 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $ - * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> - * Alan Cox <Alan.Cox@linux.org> + * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Fixes: * Alan Cox : Split from ip.c , see ip_input.c for history. @@ -180,7 +178,7 @@ static void ip_evictor(struct net *net) evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); if (evicted) - IP_ADD_STATS_BH(IPSTATS_MIB_REASMFAILS, evicted); + IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); } /* @@ -189,8 +187,10 @@ static void ip_evictor(struct net *net) static void ip_expire(unsigned long arg) { struct ipq *qp; + struct net *net; qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + net = container_of(qp->q.net, struct net, ipv4.frags); spin_lock(&qp->q.lock); @@ -199,14 +199,12 @@ static void ip_expire(unsigned long arg) ipq_kill(qp); - IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT); - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; - struct net *net; - net = container_of(qp->q.net, struct net, ipv4.frags); /* Send an ICMP "Fragment Reassembly Timeout" message. */ if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) { icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); @@ -263,7 +261,10 @@ static inline int ip_frag_too_far(struct ipq *qp) rc = qp->q.fragments && (end - start) > max; if (rc) { - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + struct net *net; + + net = container_of(qp->q.net, struct net, ipv4.frags); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); } return rc; @@ -487,8 +488,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, qp->q.fragments = head; } - BUG_TRAP(head != NULL); - BUG_TRAP(FRAG_CB(head)->offset == 0); + WARN_ON(head == NULL); + WARN_ON(FRAG_CB(head)->offset != 0); /* Allocate a new buffer for the datagram. */ ihlen = ip_hdrlen(head); @@ -547,7 +548,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, iph = ip_hdr(head); iph->frag_off = 0; iph->tot_len = htons(len); - IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; return 0; @@ -562,7 +563,7 @@ out_oversize: "Oversized IP packet from " NIPQUAD_FMT ".\n", NIPQUAD(qp->saddr)); out_fail: - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMFAILS); return err; } @@ -572,9 +573,9 @@ int ip_defrag(struct sk_buff *skb, u32 user) struct ipq *qp; struct net *net; - IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); - net = skb->dev ? dev_net(skb->dev) : dev_net(skb->dst->dev); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); + /* Start by cleaning up the memory. */ if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) ip_evictor(net); @@ -592,7 +593,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) return ret; } - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -ENOMEM; } @@ -600,7 +601,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) #ifdef CONFIG_SYSCTL static int zero; -static struct ctl_table ip4_frags_ctl_table[] = { +static struct ctl_table ip4_frags_ns_ctl_table[] = { { .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH, .procname = "ipfrag_high_thresh", @@ -626,6 +627,10 @@ static struct ctl_table ip4_frags_ctl_table[] = { .proc_handler = &proc_dointvec_jiffies, .strategy = &sysctl_jiffies }, + { } +}; + +static struct ctl_table ip4_frags_ctl_table[] = { { .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL, .procname = "ipfrag_secret_interval", @@ -646,22 +651,20 @@ static struct ctl_table ip4_frags_ctl_table[] = { { } }; -static int ip4_frags_ctl_register(struct net *net) +static int ip4_frags_ns_ctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; - table = ip4_frags_ctl_table; + table = ip4_frags_ns_ctl_table; if (net != &init_net) { - table = kmemdup(table, sizeof(ip4_frags_ctl_table), GFP_KERNEL); + table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; table[0].data = &net->ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; table[2].data = &net->ipv4.frags.timeout; - table[3].mode &= ~0222; - table[4].mode &= ~0222; } hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); @@ -678,7 +681,7 @@ err_alloc: return -ENOMEM; } -static void ip4_frags_ctl_unregister(struct net *net) +static void ip4_frags_ns_ctl_unregister(struct net *net) { struct ctl_table *table; @@ -686,13 +689,22 @@ static void ip4_frags_ctl_unregister(struct net *net) unregister_net_sysctl_table(net->ipv4.frags_hdr); kfree(table); } + +static void ip4_frags_ctl_register(void) +{ + register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table); +} #else -static inline int ip4_frags_ctl_register(struct net *net) +static inline int ip4_frags_ns_ctl_register(struct net *net) { return 0; } -static inline void ip4_frags_ctl_unregister(struct net *net) +static inline void ip4_frags_ns_ctl_unregister(struct net *net) +{ +} + +static inline void ip4_frags_ctl_register(void) { } #endif @@ -716,12 +728,12 @@ static int ipv4_frags_init_net(struct net *net) inet_frags_init_net(&net->ipv4.frags); - return ip4_frags_ctl_register(net); + return ip4_frags_ns_ctl_register(net); } static void ipv4_frags_exit_net(struct net *net) { - ip4_frags_ctl_unregister(net); + ip4_frags_ns_ctl_unregister(net); inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); } @@ -732,6 +744,7 @@ static struct pernet_operations ip4_frags_ops = { void __init ipfrag_init(void) { + ip4_frags_ctl_register(); register_pernet_subsys(&ip4_frags_ops); ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4342cba4ff82..85c487b8572b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -27,6 +27,7 @@ #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> +#include <linux/etherdevice.h> #include <linux/if_ether.h> #include <net/sock.h> @@ -41,6 +42,7 @@ #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/rtnetlink.h> #ifdef CONFIG_IPV6 #include <net/ipv6.h> @@ -117,8 +119,10 @@ Alexey Kuznetsov. */ +static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); static void ipgre_tunnel_setup(struct net_device *dev); +static int ipgre_tunnel_bind_dev(struct net_device *dev); /* Fallback tunnel: no source, no destination, no key, no options */ @@ -163,38 +167,64 @@ static DEFINE_RWLOCK(ipgre_lock); /* Given src, dst and key, find appropriate for input tunnel. */ static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net, - __be32 remote, __be32 local, __be32 key) + __be32 remote, __be32 local, + __be32 key, __be16 gre_proto) { unsigned h0 = HASH(remote); unsigned h1 = HASH(key); struct ip_tunnel *t; + struct ip_tunnel *t2 = NULL; struct ipgre_net *ign = net_generic(net, ipgre_net_id); + int dev_type = (gre_proto == htons(ETH_P_TEB)) ? + ARPHRD_ETHER : ARPHRD_IPGRE; for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { - if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) - return t; + if (t->parms.i_key == key && t->dev->flags & IFF_UP) { + if (t->dev->type == dev_type) + return t; + if (t->dev->type == ARPHRD_IPGRE && !t2) + t2 = t; + } } } + for (t = ign->tunnels_r[h0^h1]; t; t = t->next) { if (remote == t->parms.iph.daddr) { - if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) - return t; + if (t->parms.i_key == key && t->dev->flags & IFF_UP) { + if (t->dev->type == dev_type) + return t; + if (t->dev->type == ARPHRD_IPGRE && !t2) + t2 = t; + } } } + for (t = ign->tunnels_l[h1]; t; t = t->next) { if (local == t->parms.iph.saddr || (local == t->parms.iph.daddr && ipv4_is_multicast(local))) { - if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) - return t; + if (t->parms.i_key == key && t->dev->flags & IFF_UP) { + if (t->dev->type == dev_type) + return t; + if (t->dev->type == ARPHRD_IPGRE && !t2) + t2 = t; + } } } + for (t = ign->tunnels_wc[h1]; t; t = t->next) { - if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) - return t; + if (t->parms.i_key == key && t->dev->flags & IFF_UP) { + if (t->dev->type == dev_type) + return t; + if (t->dev->type == ARPHRD_IPGRE && !t2) + t2 = t; + } } + if (t2) + return t2; + if (ign->fb_tunnel_dev->flags&IFF_UP) return netdev_priv(ign->fb_tunnel_dev); return NULL; @@ -249,25 +279,37 @@ static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) } } -static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, int create) +static struct ip_tunnel *ipgre_tunnel_find(struct net *net, + struct ip_tunnel_parm *parms, + int type) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; __be32 key = parms->i_key; - struct ip_tunnel *t, **tp, *nt; + struct ip_tunnel *t, **tp; + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + + for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && + key == t->parms.i_key && + type == t->dev->type) + break; + + return t; +} + +static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, + struct ip_tunnel_parm *parms, int create) +{ + struct ip_tunnel *t, *nt; struct net_device *dev; char name[IFNAMSIZ]; struct ipgre_net *ign = net_generic(net, ipgre_net_id); - for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) { - if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { - if (key == t->parms.i_key) - return t; - } - } - if (!create) - return NULL; + t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE); + if (t || !create) + return t; if (parms->name[0]) strlcpy(name, parms->name, IFNAMSIZ); @@ -285,9 +327,11 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, goto failed_free; } - dev->init = ipgre_tunnel_init; nt = netdev_priv(dev); nt->parms = *parms; + dev->rtnl_link_ops = &ipgre_link_ops; + + dev->mtu = ipgre_tunnel_bind_dev(dev); if (register_netdevice(dev) < 0) goto failed_free; @@ -380,8 +424,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info) read_lock(&ipgre_lock); t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr, - (flags&GRE_KEY) ? - *(((__be32*)p) + (grehlen>>2) - 1) : 0); + flags & GRE_KEY ? + *(((__be32 *)p) + (grehlen / 4) - 1) : 0, + p[1]); if (t == NULL || t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) goto out; @@ -431,6 +476,8 @@ static int ipgre_rcv(struct sk_buff *skb) u32 seqno = 0; struct ip_tunnel *tunnel; int offset = 4; + __be16 gre_proto; + unsigned int len; if (!pskb_may_pull(skb, 16)) goto drop_nolock; @@ -470,18 +517,22 @@ static int ipgre_rcv(struct sk_buff *skb) } } + gre_proto = *(__be16 *)(h + 2); + read_lock(&ipgre_lock); if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev), - iph->saddr, iph->daddr, key)) != NULL) { + iph->saddr, iph->daddr, key, + gre_proto))) { + struct net_device_stats *stats = &tunnel->dev->stats; + secpath_reset(skb); - skb->protocol = *(__be16*)(h + 2); + skb->protocol = gre_proto; /* WCCP version 1 and 2 protocol decoding. * - Change protocol to IP * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header */ - if (flags == 0 && - skb->protocol == htons(ETH_P_WCCP)) { + if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) { skb->protocol = htons(ETH_P_IP); if ((*(h + offset) & 0xF0) != 0x40) offset += 4; @@ -489,7 +540,6 @@ static int ipgre_rcv(struct sk_buff *skb) skb->mac_header = skb->network_header; __pskb_pull(skb, offset); - skb_reset_network_header(skb); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb->pkt_type = PACKET_HOST; #ifdef CONFIG_NET_IPGRE_BROADCAST @@ -497,33 +547,52 @@ static int ipgre_rcv(struct sk_buff *skb) /* Looped back packet, drop it! */ if (skb->rtable->fl.iif == 0) goto drop; - tunnel->stat.multicast++; + stats->multicast++; skb->pkt_type = PACKET_BROADCAST; } #endif if (((flags&GRE_CSUM) && csum) || (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { - tunnel->stat.rx_crc_errors++; - tunnel->stat.rx_errors++; + stats->rx_crc_errors++; + stats->rx_errors++; goto drop; } if (tunnel->parms.i_flags&GRE_SEQ) { if (!(flags&GRE_SEQ) || (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { - tunnel->stat.rx_fifo_errors++; - tunnel->stat.rx_errors++; + stats->rx_fifo_errors++; + stats->rx_errors++; goto drop; } tunnel->i_seqno = seqno + 1; } - tunnel->stat.rx_packets++; - tunnel->stat.rx_bytes += skb->len; + + len = skb->len; + + /* Warning: All skb pointers will be invalidated! */ + if (tunnel->dev->type == ARPHRD_ETHER) { + if (!pskb_may_pull(skb, ETH_HLEN)) { + stats->rx_length_errors++; + stats->rx_errors++; + goto drop; + } + + iph = ip_hdr(skb); + skb->protocol = eth_type_trans(skb, tunnel->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } + + stats->rx_packets++; + stats->rx_bytes += len; skb->dev = tunnel->dev; dst_release(skb->dst); skb->dst = NULL; nf_reset(skb); + + skb_reset_network_header(skb); ipgre_ecn_decapsulate(iph, skb); + netif_rx(skb); read_unlock(&ipgre_lock); return(0); @@ -540,7 +609,7 @@ drop_nolock: static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &tunnel->stat; + struct net_device_stats *stats = &tunnel->dev->stats; struct iphdr *old_iph = ip_hdr(skb); struct iphdr *tiph; u8 tos; @@ -554,11 +623,14 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) int mtu; if (tunnel->recursion++) { - tunnel->stat.collisions++; + stats->collisions++; goto tx_error; } - if (dev->header_ops) { + if (dev->type == ARPHRD_ETHER) + IPCB(skb)->flags = 0; + + if (dev->header_ops && dev->type == ARPHRD_IPGRE) { gre_hlen = 0; tiph = (struct iphdr*)skb->data; } else { @@ -570,7 +642,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) /* NBMA tunnel */ if (skb->dst == NULL) { - tunnel->stat.tx_fifo_errors++; + stats->tx_fifo_errors++; goto tx_error; } @@ -621,7 +693,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .tos = RT_TOS(tos) } }, .proto = IPPROTO_GRE }; if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - tunnel->stat.tx_carrier_errors++; + stats->tx_carrier_errors++; goto tx_error; } } @@ -629,13 +701,13 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (tdev == dev) { ip_rt_put(rt); - tunnel->stat.collisions++; + stats->collisions++; goto tx_error; } df = tiph->frag_off; if (df) - mtu = dst_mtu(&rt->u.dst) - tunnel->hlen; + mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen; else mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; @@ -701,7 +773,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) old_iph = ip_hdr(skb); } - skb->transport_header = skb->network_header; + skb_reset_transport_header(skb); skb_push(skb, gre_hlen); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -734,8 +806,9 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); } - ((__be16*)(iph+1))[0] = tunnel->parms.o_flags; - ((__be16*)(iph+1))[1] = skb->protocol; + ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; + ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ? + htons(ETH_P_TEB) : skb->protocol; if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4); @@ -771,7 +844,7 @@ tx_error: return 0; } -static void ipgre_tunnel_bind_dev(struct net_device *dev) +static int ipgre_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel; @@ -783,7 +856,7 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev) tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; - /* Guess output device to choose reasonable mtu and hard_header_len */ + /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { struct flowi fl = { .oif = tunnel->parms.link, @@ -797,14 +870,16 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev) tdev = rt->u.dst.dev; ip_rt_put(rt); } - dev->flags |= IFF_POINTOPOINT; + + if (dev->type != ARPHRD_ETHER) + dev->flags |= IFF_POINTOPOINT; } if (!tdev && tunnel->parms.link) tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); if (tdev) { - hlen = tdev->hard_header_len; + hlen = tdev->hard_header_len + tdev->needed_headroom; mtu = tdev->mtu; } dev->iflink = tunnel->parms.link; @@ -818,10 +893,15 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev) if (tunnel->parms.o_flags&GRE_SEQ) addend += 4; } - dev->hard_header_len = hlen + addend; - dev->mtu = mtu - addend; + dev->needed_headroom = addend + hlen; + mtu -= dev->hard_header_len - addend; + + if (mtu < 68) + mtu = 68; + tunnel->hlen = addend; + return mtu; } static int @@ -915,7 +995,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) t->parms.iph.frag_off = p.iph.frag_off; if (t->parms.link != p.link) { t->parms.link = p.link; - ipgre_tunnel_bind_dev(dev); + dev->mtu = ipgre_tunnel_bind_dev(dev); netdev_state_change(dev); } } @@ -954,15 +1034,11 @@ done: return err; } -static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev) -{ - return &(((struct ip_tunnel*)netdev_priv(dev))->stat); -} - static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) { struct ip_tunnel *tunnel = netdev_priv(dev); - if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen) + if (new_mtu < 68 || + new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen) return -EINVAL; dev->mtu = new_mtu; return 0; @@ -1081,15 +1157,15 @@ static int ipgre_close(struct net_device *dev) static void ipgre_tunnel_setup(struct net_device *dev) { + dev->init = ipgre_tunnel_init; dev->uninit = ipgre_tunnel_uninit; dev->destructor = free_netdev; dev->hard_start_xmit = ipgre_tunnel_xmit; - dev->get_stats = ipgre_tunnel_get_stats; dev->do_ioctl = ipgre_tunnel_ioctl; dev->change_mtu = ipgre_tunnel_change_mtu; dev->type = ARPHRD_IPGRE; - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4; + dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; dev->flags = IFF_NOARP; dev->iflink = 0; @@ -1111,8 +1187,6 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); - ipgre_tunnel_bind_dev(dev); - if (iph->daddr) { #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { @@ -1193,6 +1267,7 @@ static int ipgre_init_net(struct net *net) ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init; dev_net_set(ign->fb_tunnel_dev, net); + ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops; if ((err = register_netdev(ign->fb_tunnel_dev))) goto err_reg_dev; @@ -1225,6 +1300,298 @@ static struct pernet_operations ipgre_net_ops = { .exit = ipgre_exit_net, }; +static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + __be16 flags; + + if (!data) + return 0; + + flags = 0; + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (flags & (GRE_VERSION|GRE_ROUTING)) + return -EINVAL; + + return 0; +} + +static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + __be32 daddr; + + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + + if (!data) + goto out; + + if (data[IFLA_GRE_REMOTE]) { + memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4); + if (!daddr) + return -EINVAL; + } + +out: + return ipgre_tunnel_validate(tb, data); +} + +static void ipgre_netlink_parms(struct nlattr *data[], + struct ip_tunnel_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + parms->iph.protocol = IPPROTO_GRE; + + if (!data) + return; + + if (data[IFLA_GRE_LINK]) + parms->link = nla_get_u32(data[IFLA_GRE_LINK]); + + if (data[IFLA_GRE_IFLAGS]) + parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); + + if (data[IFLA_GRE_OFLAGS]) + parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); + + if (data[IFLA_GRE_IKEY]) + parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); + + if (data[IFLA_GRE_OKEY]) + parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); + + if (data[IFLA_GRE_LOCAL]) + parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]); + + if (data[IFLA_GRE_REMOTE]) + parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]); + + if (data[IFLA_GRE_TTL]) + parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); + + if (data[IFLA_GRE_TOS]) + parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); + + if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) + parms->iph.frag_off = htons(IP_DF); +} + +static int ipgre_tap_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel; + + tunnel = netdev_priv(dev); + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + ipgre_tunnel_bind_dev(dev); + + return 0; +} + +static void ipgre_tap_setup(struct net_device *dev) +{ + + ether_setup(dev); + + dev->init = ipgre_tap_init; + dev->uninit = ipgre_tunnel_uninit; + dev->destructor = free_netdev; + dev->hard_start_xmit = ipgre_tunnel_xmit; + dev->change_mtu = ipgre_tunnel_change_mtu; + + dev->iflink = 0; + dev->features |= NETIF_F_NETNS_LOCAL; +} + +static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip_tunnel *nt; + struct net *net = dev_net(dev); + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + int mtu; + int err; + + nt = netdev_priv(dev); + ipgre_netlink_parms(data, &nt->parms); + + if (ipgre_tunnel_find(net, &nt->parms, dev->type)) + return -EEXIST; + + if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) + random_ether_addr(dev->dev_addr); + + mtu = ipgre_tunnel_bind_dev(dev); + if (!tb[IFLA_MTU]) + dev->mtu = mtu; + + err = register_netdevice(dev); + if (err) + goto out; + + dev_hold(dev); + ipgre_tunnel_link(ign, nt); + +out: + return err; +} + +static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip_tunnel *t, *nt; + struct net *net = dev_net(dev); + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + struct ip_tunnel_parm p; + int mtu; + + if (dev == ign->fb_tunnel_dev) + return -EINVAL; + + nt = netdev_priv(dev); + ipgre_netlink_parms(data, &p); + + t = ipgre_tunnel_locate(net, &p, 0); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else { + unsigned nflags = 0; + + t = nt; + + if (ipv4_is_multicast(p.iph.daddr)) + nflags = IFF_BROADCAST; + else if (p.iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags ^ nflags) & + (IFF_POINTOPOINT | IFF_BROADCAST)) + return -EINVAL; + + ipgre_tunnel_unlink(ign, t); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + t->parms.i_key = p.i_key; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipgre_tunnel_link(ign, t); + netdev_state_change(dev); + } + + t->parms.o_key = p.o_key; + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + + if (t->parms.link != p.link) { + t->parms.link = p.link; + mtu = ipgre_tunnel_bind_dev(dev); + if (!tb[IFLA_MTU]) + dev->mtu = mtu; + netdev_state_change(dev); + } + + return 0; +} + +static size_t ipgre_get_size(const struct net_device *dev) +{ + return + /* IFLA_GRE_LINK */ + nla_total_size(4) + + /* IFLA_GRE_IFLAGS */ + nla_total_size(2) + + /* IFLA_GRE_OFLAGS */ + nla_total_size(2) + + /* IFLA_GRE_IKEY */ + nla_total_size(4) + + /* IFLA_GRE_OKEY */ + nla_total_size(4) + + /* IFLA_GRE_LOCAL */ + nla_total_size(4) + + /* IFLA_GRE_REMOTE */ + nla_total_size(4) + + /* IFLA_GRE_TTL */ + nla_total_size(1) + + /* IFLA_GRE_TOS */ + nla_total_size(1) + + /* IFLA_GRE_PMTUDISC */ + nla_total_size(1) + + 0; +} + +static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm *p = &t->parms; + + NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link); + NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags); + NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags); + NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key); + NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key); + NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr); + NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr); + NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl); + NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos); + NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF))); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { + [IFLA_GRE_LINK] = { .type = NLA_U32 }, + [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_IKEY] = { .type = NLA_U32 }, + [IFLA_GRE_OKEY] = { .type = NLA_U32 }, + [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, + [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_GRE_TTL] = { .type = NLA_U8 }, + [IFLA_GRE_TOS] = { .type = NLA_U8 }, + [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, +}; + +static struct rtnl_link_ops ipgre_link_ops __read_mostly = { + .kind = "gre", + .maxtype = IFLA_GRE_MAX, + .policy = ipgre_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = ipgre_tunnel_setup, + .validate = ipgre_tunnel_validate, + .newlink = ipgre_newlink, + .changelink = ipgre_changelink, + .get_size = ipgre_get_size, + .fill_info = ipgre_fill_info, +}; + +static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { + .kind = "gretap", + .maxtype = IFLA_GRE_MAX, + .policy = ipgre_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = ipgre_tap_setup, + .validate = ipgre_tap_validate, + .newlink = ipgre_newlink, + .changelink = ipgre_changelink, + .get_size = ipgre_get_size, + .fill_info = ipgre_fill_info, +}; + /* * And now the modules code and kernel interface. */ @@ -1242,19 +1609,39 @@ static int __init ipgre_init(void) err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops); if (err < 0) - inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); + goto gen_device_failed; + err = rtnl_link_register(&ipgre_link_ops); + if (err < 0) + goto rtnl_link_failed; + + err = rtnl_link_register(&ipgre_tap_ops); + if (err < 0) + goto tap_ops_failed; + +out: return err; + +tap_ops_failed: + rtnl_link_unregister(&ipgre_link_ops); +rtnl_link_failed: + unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); +gen_device_failed: + inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); + goto out; } static void __exit ipgre_fini(void) { + rtnl_link_unregister(&ipgre_tap_ops); + rtnl_link_unregister(&ipgre_link_ops); + unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); - - unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); } module_init(ipgre_init); module_exit(ipgre_fini); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("gre"); +MODULE_ALIAS_RTNL_LINK("gretap"); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index ff77a4a7f9ec..861978a4f1a8 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,12 +5,10 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.55 2002/01/12 07:39:45 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> - * Alan Cox, <Alan.Cox@linux.org> + * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> @@ -147,12 +145,6 @@ #include <linux/netlink.h> /* - * SNMP management statistics - */ - -DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly; - -/* * Process Router Attention IP option */ int ip_call_ra_chain(struct sk_buff *skb) @@ -232,16 +224,16 @@ static int ip_local_deliver_finish(struct sk_buff *skb) protocol = -ret; goto resubmit; } - IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } } else - IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); kfree_skb(skb); } } @@ -283,7 +275,7 @@ static inline int ip_rcv_options(struct sk_buff *skb) --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } @@ -292,7 +284,7 @@ static inline int ip_rcv_options(struct sk_buff *skb) opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { - IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); goto drop; } @@ -336,9 +328,11 @@ static int ip_rcv_finish(struct sk_buff *skb) skb->dev); if (unlikely(err)) { if (err == -EHOSTUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INADDRERRORS); else if (err == -ENETUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES); + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INNOROUTES); goto drop; } } @@ -359,9 +353,9 @@ static int ip_rcv_finish(struct sk_buff *skb) rt = skb->rtable; if (rt->rt_type == RTN_MULTICAST) - IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS); + IP_INC_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCASTPKTS); else if (rt->rt_type == RTN_BROADCAST) - IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS); + IP_INC_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCASTPKTS); return dst_input(skb); @@ -384,10 +378,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (skb->pkt_type == PACKET_OTHERHOST) goto drop; - IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INRECEIVES); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto out; } @@ -420,7 +414,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, len = ntohs(iph->tot_len); if (skb->len < len) { - IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; @@ -430,7 +424,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } @@ -441,11 +435,9 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, ip_rcv_finish); inhdr_error: - IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); out: return NET_RX_DROP; } - -EXPORT_SYMBOL(ip_statistics); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 33126ad2cfdc..2c88da6e7862 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -5,8 +5,6 @@ * * The options processing module for ip.c * - * Version: $Id: ip_options.c,v 1.21 2001/09/01 00:31:50 davem Exp $ - * * Authors: A.N.Kuznetsov * */ @@ -440,7 +438,7 @@ int ip_options_compile(struct net *net, goto error; } opt->cipso = optptr - iph; - if (cipso_v4_validate(&optptr)) { + if (cipso_v4_validate(skb, &optptr)) { pp_ptr = optptr; goto error; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e527628f56cf..d2a8f8bb78a6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,8 +5,6 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> @@ -120,7 +118,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) __skb_pull(newskb, skb_network_offset(newskb)); newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; - BUG_TRAP(newskb->dst); + WARN_ON(!newskb->dst); netif_rx(newskb); return 0; } @@ -184,9 +182,9 @@ static inline int ip_finish_output2(struct sk_buff *skb) unsigned int hh_len = LL_RESERVED_SPACE(dev); if (rt->rt_type == RTN_MULTICAST) - IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTMCASTPKTS); else if (rt->rt_type == RTN_BROADCAST) - IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTBCASTPKTS); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -246,7 +244,7 @@ int ip_mc_output(struct sk_buff *skb) /* * If the indicated interface is up and running, send the packet. */ - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTREQUESTS); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -300,7 +298,7 @@ int ip_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTREQUESTS); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -342,6 +340,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) .saddr = inet->saddr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, + .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = { .sport = inet->sport, .dport = inet->dport } } }; @@ -391,7 +390,7 @@ packet_routed: return ip_local_out(skb); no_route: - IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } @@ -453,7 +452,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) iph = ip_hdr(skb); if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(ip_skb_dst_mtu(skb))); kfree_skb(skb); @@ -544,7 +543,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) err = output(skb); if (!err) - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -554,7 +553,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) } if (err == 0) { - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return 0; } @@ -563,7 +562,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) kfree_skb(frag); frag = skb; } - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } @@ -675,15 +674,15 @@ slow_path: if (err) goto fail; - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); } kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } @@ -1049,7 +1048,7 @@ alloc_new_skb: error: inet->cork.length -= length; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } @@ -1191,7 +1190,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, error: inet->cork.length -= size; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } @@ -1213,6 +1212,7 @@ int ip_push_pending_frames(struct sock *sk) struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options *opt = NULL; struct rtable *rt = (struct rtable *)inet->cork.dst; struct iphdr *iph; @@ -1282,7 +1282,7 @@ int ip_push_pending_frames(struct sock *sk) skb->dst = dst_clone(&rt->u.dst); if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(((struct icmphdr *) + icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); /* Netfilter gets whole the not fragmented skb. */ @@ -1299,7 +1299,7 @@ out: return err; error: - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); goto out; } @@ -1372,7 +1372,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar .uli_u = { .ports = { .sport = tcp_hdr(skb)->dest, .dport = tcp_hdr(skb)->source } }, - .proto = sk->sk_protocol }; + .proto = sk->sk_protocol, + .flags = ip_reply_arg_flowi_flags(arg) }; security_skb_classify_flow(skb, &fl); if (ip_route_output_key(sock_net(sk), &rt, &fl)) return; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index e0514e82308e..465abf0a9869 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,8 +5,6 @@ * * The IP to API glue. * - * Version: $Id: ip_sockglue.c,v 1.62 2002/02/01 22:01:04 davem Exp $ - * * Authors: see ip.c * * Fixes: @@ -421,7 +419,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, (1<<IP_TTL) | (1<<IP_HDRINCL) | (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | - (1<<IP_PASSSEC))) || + (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) || optname == IP_MULTICAST_TTL || optname == IP_MULTICAST_LOOP) { if (optlen >= sizeof(int)) { @@ -880,6 +878,16 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = xfrm_user_policy(sk, optname, optval, optlen); break; + case IP_TRANSPARENT: + if (!capable(CAP_NET_ADMIN)) { + err = -EPERM; + break; + } + if (optlen < 1) + goto e_inval; + inet->transparent = !!val; + break; + default: err = -ENOPROTOOPT; break; @@ -1132,6 +1140,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_FREEBIND: val = inet->freebind; break; + case IP_TRANSPARENT: + val = inet->transparent; + break; default: release_sock(sk); return -ENOPROTOOPT; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index a75807b971b3..38ccb6dfb02e 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -14,153 +14,14 @@ * - Adaptive compression. */ #include <linux/module.h> -#include <linux/crypto.h> #include <linux/err.h> -#include <linux/pfkeyv2.h> -#include <linux/percpu.h> -#include <linux/smp.h> -#include <linux/list.h> -#include <linux/vmalloc.h> #include <linux/rtnetlink.h> -#include <linux/mutex.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/icmp.h> #include <net/ipcomp.h> #include <net/protocol.h> - -struct ipcomp_tfms { - struct list_head list; - struct crypto_comp **tfms; - int users; -}; - -static DEFINE_MUTEX(ipcomp_resource_mutex); -static void **ipcomp_scratches; -static int ipcomp_scratch_users; -static LIST_HEAD(ipcomp_tfms_list); - -static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) -{ - struct ipcomp_data *ipcd = x->data; - const int plen = skb->len; - int dlen = IPCOMP_SCRATCH_SIZE; - const u8 *start = skb->data; - const int cpu = get_cpu(); - u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); - struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu); - int err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); - - if (err) - goto out; - - if (dlen < (plen + sizeof(struct ip_comp_hdr))) { - err = -EINVAL; - goto out; - } - - err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC); - if (err) - goto out; - - skb->truesize += dlen - plen; - __skb_put(skb, dlen - plen); - skb_copy_to_linear_data(skb, scratch, dlen); -out: - put_cpu(); - return err; -} - -static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) -{ - int nexthdr; - int err = -ENOMEM; - struct ip_comp_hdr *ipch; - - if (skb_linearize_cow(skb)) - goto out; - - skb->ip_summed = CHECKSUM_NONE; - - /* Remove ipcomp header and decompress original payload */ - ipch = (void *)skb->data; - nexthdr = ipch->nexthdr; - - skb->transport_header = skb->network_header + sizeof(*ipch); - __skb_pull(skb, sizeof(*ipch)); - err = ipcomp_decompress(x, skb); - if (err) - goto out; - - err = nexthdr; - -out: - return err; -} - -static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) -{ - struct ipcomp_data *ipcd = x->data; - const int plen = skb->len; - int dlen = IPCOMP_SCRATCH_SIZE; - u8 *start = skb->data; - const int cpu = get_cpu(); - u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); - struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu); - int err; - - local_bh_disable(); - err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); - local_bh_enable(); - if (err) - goto out; - - if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) { - err = -EMSGSIZE; - goto out; - } - - memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen); - put_cpu(); - - pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr)); - return 0; - -out: - put_cpu(); - return err; -} - -static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) -{ - int err; - struct ip_comp_hdr *ipch; - struct ipcomp_data *ipcd = x->data; - - if (skb->len < ipcd->threshold) { - /* Don't bother compressing */ - goto out_ok; - } - - if (skb_linearize_cow(skb)) - goto out_ok; - - err = ipcomp_compress(x, skb); - - if (err) { - goto out_ok; - } - - /* Install ipcomp header, convert into ipcomp datagram. */ - ipch = ip_comp_hdr(skb); - ipch->nexthdr = *skb_mac_header(skb); - ipch->flags = 0; - ipch->cpi = htons((u16 )ntohl(x->id.spi)); - *skb_mac_header(skb) = IPPROTO_COMP; -out_ok: - skb_push(skb, -skb_network_offset(skb)); - return 0; -} +#include <net/sock.h> static void ipcomp4_err(struct sk_buff *skb, u32 info) { @@ -241,155 +102,9 @@ out: return err; } -static void ipcomp_free_scratches(void) -{ - int i; - void **scratches; - - if (--ipcomp_scratch_users) - return; - - scratches = ipcomp_scratches; - if (!scratches) - return; - - for_each_possible_cpu(i) - vfree(*per_cpu_ptr(scratches, i)); - - free_percpu(scratches); -} - -static void **ipcomp_alloc_scratches(void) -{ - int i; - void **scratches; - - if (ipcomp_scratch_users++) - return ipcomp_scratches; - - scratches = alloc_percpu(void *); - if (!scratches) - return NULL; - - ipcomp_scratches = scratches; - - for_each_possible_cpu(i) { - void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE); - if (!scratch) - return NULL; - *per_cpu_ptr(scratches, i) = scratch; - } - - return scratches; -} - -static void ipcomp_free_tfms(struct crypto_comp **tfms) -{ - struct ipcomp_tfms *pos; - int cpu; - - list_for_each_entry(pos, &ipcomp_tfms_list, list) { - if (pos->tfms == tfms) - break; - } - - BUG_TRAP(pos); - - if (--pos->users) - return; - - list_del(&pos->list); - kfree(pos); - - if (!tfms) - return; - - for_each_possible_cpu(cpu) { - struct crypto_comp *tfm = *per_cpu_ptr(tfms, cpu); - crypto_free_comp(tfm); - } - free_percpu(tfms); -} - -static struct crypto_comp **ipcomp_alloc_tfms(const char *alg_name) -{ - struct ipcomp_tfms *pos; - struct crypto_comp **tfms; - int cpu; - - /* This can be any valid CPU ID so we don't need locking. */ - cpu = raw_smp_processor_id(); - - list_for_each_entry(pos, &ipcomp_tfms_list, list) { - struct crypto_comp *tfm; - - tfms = pos->tfms; - tfm = *per_cpu_ptr(tfms, cpu); - - if (!strcmp(crypto_comp_name(tfm), alg_name)) { - pos->users++; - return tfms; - } - } - - pos = kmalloc(sizeof(*pos), GFP_KERNEL); - if (!pos) - return NULL; - - pos->users = 1; - INIT_LIST_HEAD(&pos->list); - list_add(&pos->list, &ipcomp_tfms_list); - - pos->tfms = tfms = alloc_percpu(struct crypto_comp *); - if (!tfms) - goto error; - - for_each_possible_cpu(cpu) { - struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - goto error; - *per_cpu_ptr(tfms, cpu) = tfm; - } - - return tfms; - -error: - ipcomp_free_tfms(tfms); - return NULL; -} - -static void ipcomp_free_data(struct ipcomp_data *ipcd) +static int ipcomp4_init_state(struct xfrm_state *x) { - if (ipcd->tfms) - ipcomp_free_tfms(ipcd->tfms); - ipcomp_free_scratches(); -} - -static void ipcomp_destroy(struct xfrm_state *x) -{ - struct ipcomp_data *ipcd = x->data; - if (!ipcd) - return; - xfrm_state_delete_tunnel(x); - mutex_lock(&ipcomp_resource_mutex); - ipcomp_free_data(ipcd); - mutex_unlock(&ipcomp_resource_mutex); - kfree(ipcd); -} - -static int ipcomp_init_state(struct xfrm_state *x) -{ - int err; - struct ipcomp_data *ipcd; - struct xfrm_algo_desc *calg_desc; - - err = -EINVAL; - if (!x->calg) - goto out; - - if (x->encap) - goto out; + int err = -EINVAL; x->props.header_len = 0; switch (x->props.mode) { @@ -402,40 +117,22 @@ static int ipcomp_init_state(struct xfrm_state *x) goto out; } - err = -ENOMEM; - ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL); - if (!ipcd) + err = ipcomp_init_state(x); + if (err) goto out; - mutex_lock(&ipcomp_resource_mutex); - if (!ipcomp_alloc_scratches()) - goto error; - - ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name); - if (!ipcd->tfms) - goto error; - mutex_unlock(&ipcomp_resource_mutex); - if (x->props.mode == XFRM_MODE_TUNNEL) { err = ipcomp_tunnel_attach(x); if (err) goto error_tunnel; } - calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0); - BUG_ON(!calg_desc); - ipcd->threshold = calg_desc->uinfo.comp.threshold; - x->data = ipcd; err = 0; out: return err; error_tunnel: - mutex_lock(&ipcomp_resource_mutex); -error: - ipcomp_free_data(ipcd); - mutex_unlock(&ipcomp_resource_mutex); - kfree(ipcd); + ipcomp_destroy(x); goto out; } @@ -443,7 +140,7 @@ static const struct xfrm_type ipcomp_type = { .description = "IPCOMP4", .owner = THIS_MODULE, .proto = IPPROTO_COMP, - .init_state = ipcomp_init_state, + .init_state = ipcomp4_init_state, .destructor = ipcomp_destroy, .input = ipcomp_input, .output = ipcomp_output @@ -481,7 +178,7 @@ module_init(ipcomp4_init); module_exit(ipcomp4_fini); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173"); +MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp/IPv4) - RFC3173"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_COMP); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index ed45037ce9be..42065fff46c4 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1,6 +1,4 @@ /* - * $Id: ipconfig.c,v 1.46 2002/02/01 22:01:04 davem Exp $ - * * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or * user-supplied information to configure own IP address and routes. * @@ -434,7 +432,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt unsigned char *sha, *tha; /* s for "source", t for "target" */ struct ic_device *d; - if (dev_net(dev) != &init_net) + if (!net_eq(dev_net(dev), &init_net)) goto drop; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) @@ -854,7 +852,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str struct ic_device *d; int len, ext_len; - if (dev_net(dev) != &init_net) + if (!net_eq(dev_net(dev), &init_net)) goto drop; /* Perform verifications before taking the lock. */ diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index af5cb53da5cc..29609d29df76 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,8 +1,6 @@ /* * Linux NET3: IP/IP protocol decoder. * - * Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $ - * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 * @@ -43,7 +41,7 @@ Made the tunnels use dev->name not tunnel: when error reporting. Added tx_dropped stat - -Alan Cox (Alan.Cox@linux.org) 21 March 95 + -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95 Reworked: Changed to tunnel to destination gateway in addition to the @@ -368,8 +366,8 @@ static int ipip_rcv(struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); skb->pkt_type = PACKET_HOST; - tunnel->stat.rx_packets++; - tunnel->stat.rx_bytes += skb->len; + tunnel->dev->stats.rx_packets++; + tunnel->dev->stats.rx_bytes += skb->len; skb->dev = tunnel->dev; dst_release(skb->dst); skb->dst = NULL; @@ -392,7 +390,7 @@ static int ipip_rcv(struct sk_buff *skb) static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &tunnel->stat; + struct net_device_stats *stats = &tunnel->dev->stats; struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; @@ -405,7 +403,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) int mtu; if (tunnel->recursion++) { - tunnel->stat.collisions++; + stats->collisions++; goto tx_error; } @@ -418,7 +416,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (!dst) { /* NBMA tunnel */ if ((rt = skb->rtable) == NULL) { - tunnel->stat.tx_fifo_errors++; + stats->tx_fifo_errors++; goto tx_error; } if ((dst = rt->rt_gateway) == 0) @@ -433,7 +431,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .tos = RT_TOS(tos) } }, .proto = IPPROTO_IPIP }; if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - tunnel->stat.tx_carrier_errors++; + stats->tx_carrier_errors++; goto tx_error_icmp; } } @@ -441,7 +439,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (tdev == dev) { ip_rt_put(rt); - tunnel->stat.collisions++; + stats->collisions++; goto tx_error; } @@ -451,7 +449,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; if (mtu < 68) { - tunnel->stat.collisions++; + stats->collisions++; ip_rt_put(rt); goto tx_error; } @@ -685,11 +683,6 @@ done: return err; } -static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev) -{ - return &(((struct ip_tunnel*)netdev_priv(dev))->stat); -} - static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) { if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) @@ -702,7 +695,6 @@ static void ipip_tunnel_setup(struct net_device *dev) { dev->uninit = ipip_tunnel_uninit; dev->hard_start_xmit = ipip_tunnel_xmit; - dev->get_stats = ipip_tunnel_get_stats; dev->do_ioctl = ipip_tunnel_ioctl; dev->change_mtu = ipip_tunnel_change_mtu; dev->destructor = free_netdev; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 11700a4dcd95..b42e082cc170 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1,7 +1,7 @@ /* * IP multicast routing support for mrouted 3.6/3.8 * - * (c) 1995 Alan Cox, <alan@redhat.com> + * (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk> * Linux Consultancy and Custom Driver Development * * This program is free software; you can redistribute it and/or @@ -9,8 +9,6 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $ - * * Fixes: * Michael Chastain : Incorrect size of copying. * Alan Cox : Added the cache manager code @@ -120,6 +118,31 @@ static struct timer_list ipmr_expire_timer; /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ +static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) +{ + dev_close(dev); + + dev = __dev_get_by_name(&init_net, "tunl0"); + if (dev) { + struct ifreq ifr; + mm_segment_t oldfs; + struct ip_tunnel_parm p; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + sprintf(p.name, "dvmrp%d", v->vifc_vifi); + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; + + oldfs = get_fs(); set_fs(KERNEL_DS); + dev->do_ioctl(dev, &ifr, SIOCDELTUNNEL); + set_fs(oldfs); + } +} + static struct net_device *ipmr_new_tunnel(struct vifctl *v) { @@ -161,6 +184,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) if (dev_open(dev)) goto failure; + dev_hold(dev); } } return dev; @@ -181,26 +205,20 @@ static int reg_vif_num = -1; static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { read_lock(&mrt_lock); - ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len; - ((struct net_device_stats*)netdev_priv(dev))->tx_packets++; + dev->stats.tx_bytes += skb->len; + dev->stats.tx_packets++; ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); return 0; } -static struct net_device_stats *reg_vif_get_stats(struct net_device *dev) -{ - return (struct net_device_stats*)netdev_priv(dev); -} - static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; dev->flags = IFF_NOARP; dev->hard_start_xmit = reg_vif_xmit; - dev->get_stats = reg_vif_get_stats; dev->destructor = free_netdev; } @@ -209,8 +227,7 @@ static struct net_device *ipmr_reg_vif(void) struct net_device *dev; struct in_device *in_dev; - dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg", - reg_vif_setup); + dev = alloc_netdev(0, "pimreg", reg_vif_setup); if (dev == NULL) return NULL; @@ -234,6 +251,8 @@ static struct net_device *ipmr_reg_vif(void) if (dev_open(dev)) goto failure; + dev_hold(dev); + return dev; failure: @@ -248,9 +267,10 @@ failure: /* * Delete a VIF entry + * @notify: Set to 1, if the caller is a notifier_call */ -static int vif_delete(int vifi) +static int vif_delete(int vifi, int notify) { struct vif_device *v; struct net_device *dev; @@ -293,7 +313,7 @@ static int vif_delete(int vifi) ip_rt_multicast_event(in_dev); } - if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) + if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) unregister_netdevice(dev); dev_put(dev); @@ -398,6 +418,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) struct vif_device *v = &vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; + int err; /* Is vif busy ? */ if (VIF_EXISTS(vifi)) @@ -415,18 +436,34 @@ static int vif_add(struct vifctl *vifc, int mrtsock) dev = ipmr_reg_vif(); if (!dev) return -ENOBUFS; + err = dev_set_allmulti(dev, 1); + if (err) { + unregister_netdevice(dev); + dev_put(dev); + return err; + } break; #endif case VIFF_TUNNEL: dev = ipmr_new_tunnel(vifc); if (!dev) return -ENOBUFS; + err = dev_set_allmulti(dev, 1); + if (err) { + ipmr_del_tunnel(dev, vifc); + dev_put(dev); + return err; + } break; case 0: dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr); if (!dev) return -EADDRNOTAVAIL; - dev_put(dev); + err = dev_set_allmulti(dev, 1); + if (err) { + dev_put(dev); + return err; + } break; default: return -EINVAL; @@ -435,7 +472,6 @@ static int vif_add(struct vifctl *vifc, int mrtsock) if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) return -EADDRNOTAVAIL; IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; - dev_set_allmulti(dev, +1); ip_rt_multicast_event(in_dev); /* @@ -458,7 +494,6 @@ static int vif_add(struct vifctl *vifc, int mrtsock) /* And finish update writing critical data */ write_lock_bh(&mrt_lock); - dev_hold(dev); v->dev=dev; #ifdef CONFIG_IP_PIMSM if (v->flags&VIFF_REGISTER) @@ -805,7 +840,7 @@ static void mroute_clean_tables(struct sock *sk) */ for (i=0; i<maxvif; i++) { if (!(vif_table[i].flags&VIFF_STATIC)) - vif_delete(i); + vif_delete(i, 0); } /* @@ -918,7 +953,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt if (optname==MRT_ADD_VIF) { ret = vif_add(&vif, sk==mroute_socket); } else { - ret = vif_delete(vif.vifc_vifi); + ret = vif_delete(vif.vifc_vifi, 0); } rtnl_unlock(); return ret; @@ -1089,7 +1124,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v struct vif_device *v; int ct; - if (dev_net(dev) != &init_net) + if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (event != NETDEV_UNREGISTER) @@ -1097,7 +1132,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v v=&vif_table[0]; for (ct=0;ct<maxvif;ct++,v++) { if (v->dev==dev) - vif_delete(ct); + vif_delete(ct, 1); } return NOTIFY_DONE; } @@ -1143,7 +1178,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -1170,8 +1205,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) if (vif->flags & VIFF_REGISTER) { vif->pkt_out++; vif->bytes_out+=skb->len; - ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len; - ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++; + vif->dev->stats.tx_bytes += skb->len; + vif->dev->stats.tx_packets++; ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); kfree_skb(skb); return; @@ -1206,7 +1241,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) to blackhole. */ - IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); goto out_free; } @@ -1230,8 +1265,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) if (vif->flags & VIFF_TUNNEL) { ip_encap(skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ - ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++; - ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len; + vif->dev->stats.tx_packets++; + vif->dev->stats.tx_bytes += skb->len; } IPCB(skb)->flags |= IPSKB_FORWARDED; @@ -1487,8 +1522,8 @@ int pim_rcv_v1(struct sk_buff * skb) skb->pkt_type = PACKET_HOST; dst_release(skb->dst); skb->dst = NULL; - ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len; - ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++; + reg_dev->stats.rx_bytes += skb->len; + reg_dev->stats.rx_packets++; nf_reset(skb); netif_rx(skb); dev_put(reg_dev); @@ -1542,8 +1577,8 @@ static int pim_rcv(struct sk_buff * skb) skb->ip_summed = 0; skb->pkt_type = PACKET_HOST; dst_release(skb->dst); - ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len; - ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++; + reg_dev->stats.rx_bytes += skb->len; + reg_dev->stats.rx_packets++; skb->dst = NULL; nf_reset(skb); netif_rx(skb); @@ -1887,16 +1922,36 @@ static struct net_protocol pim_protocol = { * Setup for IP multicast routing */ -void __init ip_mr_init(void) +int __init ip_mr_init(void) { + int err; + mrt_cachep = kmem_cache_create("ip_mrt_cache", sizeof(struct mfc_cache), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + if (!mrt_cachep) + return -ENOMEM; + setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0); - register_netdevice_notifier(&ip_mr_notifier); + err = register_netdevice_notifier(&ip_mr_notifier); + if (err) + goto reg_notif_fail; #ifdef CONFIG_PROC_FS - proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops); - proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops); + err = -ENOMEM; + if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops)) + goto proc_vif_fail; + if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops)) + goto proc_cache_fail; #endif + return 0; +reg_notif_fail: + kmem_cache_destroy(mrt_cachep); +#ifdef CONFIG_PROC_FS +proc_vif_fail: + unregister_netdevice_notifier(&ip_mr_notifier); +proc_cache_fail: + proc_net_remove(&init_net, "ip_mr_vif"); +#endif + return err; } diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig deleted file mode 100644 index 09d0c3f35669..000000000000 --- a/net/ipv4/ipvs/Kconfig +++ /dev/null @@ -1,224 +0,0 @@ -# -# IP Virtual Server configuration -# -menuconfig IP_VS - tristate "IP virtual server support (EXPERIMENTAL)" - depends on NETFILTER - ---help--- - IP Virtual Server support will let you build a high-performance - virtual server based on cluster of two or more real servers. This - option must be enabled for at least one of the clustered computers - that will take care of intercepting incoming connections to a - single IP address and scheduling them to real servers. - - Three request dispatching techniques are implemented, they are - virtual server via NAT, virtual server via tunneling and virtual - server via direct routing. The several scheduling algorithms can - be used to choose which server the connection is directed to, - thus load balancing can be achieved among the servers. For more - information and its administration program, please visit the - following URL: <http://www.linuxvirtualserver.org/>. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -if IP_VS - -config IP_VS_DEBUG - bool "IP virtual server debugging" - ---help--- - Say Y here if you want to get additional messages useful in - debugging the IP virtual server code. You can change the debug - level in /proc/sys/net/ipv4/vs/debug_level - -config IP_VS_TAB_BITS - int "IPVS connection table size (the Nth power of 2)" - default "12" - ---help--- - The IPVS connection hash table uses the chaining scheme to handle - hash collisions. Using a big IPVS connection hash table will greatly - reduce conflicts when there are hundreds of thousands of connections - in the hash table. - - Note the table size must be power of 2. The table size will be the - value of 2 to the your input number power. The number to choose is - from 8 to 20, the default number is 12, which means the table size - is 4096. Don't input the number too small, otherwise you will lose - performance on it. You can adapt the table size yourself, according - to your virtual server application. It is good to set the table size - not far less than the number of connections per second multiplying - average lasting time of connection in the table. For example, your - virtual server gets 200 connections per second, the connection lasts - for 200 seconds in average in the connection table, the table size - should be not far less than 200x200, it is good to set the table - size 32768 (2**15). - - Another note that each connection occupies 128 bytes effectively and - each hash entry uses 8 bytes, so you can estimate how much memory is - needed for your box. - -comment "IPVS transport protocol load balancing support" - -config IP_VS_PROTO_TCP - bool "TCP load balancing support" - ---help--- - This option enables support for load balancing TCP transport - protocol. Say Y if unsure. - -config IP_VS_PROTO_UDP - bool "UDP load balancing support" - ---help--- - This option enables support for load balancing UDP transport - protocol. Say Y if unsure. - -config IP_VS_PROTO_ESP - bool "ESP load balancing support" - ---help--- - This option enables support for load balancing ESP (Encapsulation - Security Payload) transport protocol. Say Y if unsure. - -config IP_VS_PROTO_AH - bool "AH load balancing support" - ---help--- - This option enables support for load balancing AH (Authentication - Header) transport protocol. Say Y if unsure. - -comment "IPVS scheduler" - -config IP_VS_RR - tristate "round-robin scheduling" - ---help--- - The robin-robin scheduling algorithm simply directs network - connections to different real servers in a round-robin manner. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_WRR - tristate "weighted round-robin scheduling" - ---help--- - The weighted robin-robin scheduling algorithm directs network - connections to different real servers based on server weights - in a round-robin manner. Servers with higher weights receive - new connections first than those with less weights, and servers - with higher weights get more connections than those with less - weights and servers with equal weights get equal connections. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LC - tristate "least-connection scheduling" - ---help--- - The least-connection scheduling algorithm directs network - connections to the server with the least number of active - connections. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_WLC - tristate "weighted least-connection scheduling" - ---help--- - The weighted least-connection scheduling algorithm directs network - connections to the server with the least active connections - normalized by the server weight. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LBLC - tristate "locality-based least-connection scheduling" - ---help--- - The locality-based least-connection scheduling algorithm is for - destination IP load balancing. It is usually used in cache cluster. - This algorithm usually directs packet destined for an IP address to - its server if the server is alive and under load. If the server is - overloaded (its active connection numbers is larger than its weight) - and there is a server in its half load, then allocate the weighted - least-connection server to this IP address. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LBLCR - tristate "locality-based least-connection with replication scheduling" - ---help--- - The locality-based least-connection with replication scheduling - algorithm is also for destination IP load balancing. It is - usually used in cache cluster. It differs from the LBLC scheduling - as follows: the load balancer maintains mappings from a target - to a set of server nodes that can serve the target. Requests for - a target are assigned to the least-connection node in the target's - server set. If all the node in the server set are over loaded, - it picks up a least-connection node in the cluster and adds it - in the sever set for the target. If the server set has not been - modified for the specified time, the most loaded node is removed - from the server set, in order to avoid high degree of replication. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_DH - tristate "destination hashing scheduling" - ---help--- - The destination hashing scheduling algorithm assigns network - connections to the servers through looking up a statically assigned - hash table by their destination IP addresses. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_SH - tristate "source hashing scheduling" - ---help--- - The source hashing scheduling algorithm assigns network - connections to the servers through looking up a statically assigned - hash table by their source IP addresses. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_SED - tristate "shortest expected delay scheduling" - ---help--- - The shortest expected delay scheduling algorithm assigns network - connections to the server with the shortest expected delay. The - expected delay that the job will experience is (Ci + 1) / Ui if - sent to the ith server, in which Ci is the number of connections - on the ith server and Ui is the fixed service rate (weight) - of the ith server. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_NQ - tristate "never queue scheduling" - ---help--- - The never queue scheduling algorithm adopts a two-speed model. - When there is an idle server available, the job will be sent to - the idle server, instead of waiting for a fast one. When there - is no idle server available, the job will be sent to the server - that minimize its expected delay (The Shortest Expected Delay - scheduling algorithm). - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -comment 'IPVS application helper' - -config IP_VS_FTP - tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP - ---help--- - FTP is a protocol that transfers IP address and/or port number in - the payload. In the virtual server via Network Address Translation, - the IP address and port number of real servers cannot be sent to - clients in ftp connections directly, so FTP protocol helper is - required for tracking the connection and mangling it back to that of - virtual service. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -endif # IP_VS diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile deleted file mode 100644 index 30e85de9ffff..000000000000 --- a/net/ipv4/ipvs/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -# -# Makefile for the IPVS modules on top of IPv4. -# - -# IPVS transport protocol load balancing support -ip_vs_proto-objs-y := -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o - -ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ - ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ - ip_vs_est.o ip_vs_proto.o \ - $(ip_vs_proto-objs-y) - - -# IPVS core -obj-$(CONFIG_IP_VS) += ip_vs.o - -# IPVS schedulers -obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o -obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o -obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o -obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o -obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o -obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o -obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o -obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o -obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o -obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o - -# IPVS application helpers -obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c deleted file mode 100644 index 535abe0c45e7..000000000000 --- a/net/ipv4/ipvs/ip_vs_app.c +++ /dev/null @@ -1,624 +0,0 @@ -/* - * ip_vs_app.c: Application module support for IPVS - * - * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference - * is that ip_vs_app module handles the reverse direction (incoming requests - * and outgoing responses). - * - * IP_MASQ_APP application masquerading module - * - * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <net/net_namespace.h> -#include <net/protocol.h> -#include <net/tcp.h> -#include <asm/system.h> -#include <linux/stat.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/mutex.h> - -#include <net/ip_vs.h> - -EXPORT_SYMBOL(register_ip_vs_app); -EXPORT_SYMBOL(unregister_ip_vs_app); -EXPORT_SYMBOL(register_ip_vs_app_inc); - -/* ipvs application list head */ -static LIST_HEAD(ip_vs_app_list); -static DEFINE_MUTEX(__ip_vs_app_mutex); - - -/* - * Get an ip_vs_app object - */ -static inline int ip_vs_app_get(struct ip_vs_app *app) -{ - return try_module_get(app->module); -} - - -static inline void ip_vs_app_put(struct ip_vs_app *app) -{ - module_put(app->module); -} - - -/* - * Allocate/initialize app incarnation and register it in proto apps. - */ -static int -ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) -{ - struct ip_vs_protocol *pp; - struct ip_vs_app *inc; - int ret; - - if (!(pp = ip_vs_proto_get(proto))) - return -EPROTONOSUPPORT; - - if (!pp->unregister_app) - return -EOPNOTSUPP; - - inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); - if (!inc) - return -ENOMEM; - INIT_LIST_HEAD(&inc->p_list); - INIT_LIST_HEAD(&inc->incs_list); - inc->app = app; - inc->port = htons(port); - atomic_set(&inc->usecnt, 0); - - if (app->timeouts) { - inc->timeout_table = - ip_vs_create_timeout_table(app->timeouts, - app->timeouts_size); - if (!inc->timeout_table) { - ret = -ENOMEM; - goto out; - } - } - - ret = pp->register_app(inc); - if (ret) - goto out; - - list_add(&inc->a_list, &app->incs_list); - IP_VS_DBG(9, "%s application %s:%u registered\n", - pp->name, inc->name, inc->port); - - return 0; - - out: - kfree(inc->timeout_table); - kfree(inc); - return ret; -} - - -/* - * Release app incarnation - */ -static void -ip_vs_app_inc_release(struct ip_vs_app *inc) -{ - struct ip_vs_protocol *pp; - - if (!(pp = ip_vs_proto_get(inc->protocol))) - return; - - if (pp->unregister_app) - pp->unregister_app(inc); - - IP_VS_DBG(9, "%s App %s:%u unregistered\n", - pp->name, inc->name, inc->port); - - list_del(&inc->a_list); - - kfree(inc->timeout_table); - kfree(inc); -} - - -/* - * Get reference to app inc (only called from softirq) - * - */ -int ip_vs_app_inc_get(struct ip_vs_app *inc) -{ - int result; - - atomic_inc(&inc->usecnt); - if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) - atomic_dec(&inc->usecnt); - return result; -} - - -/* - * Put the app inc (only called from timer or net softirq) - */ -void ip_vs_app_inc_put(struct ip_vs_app *inc) -{ - ip_vs_app_put(inc->app); - atomic_dec(&inc->usecnt); -} - - -/* - * Register an application incarnation in protocol applications - */ -int -register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) -{ - int result; - - mutex_lock(&__ip_vs_app_mutex); - - result = ip_vs_app_inc_new(app, proto, port); - - mutex_unlock(&__ip_vs_app_mutex); - - return result; -} - - -/* - * ip_vs_app registration routine - */ -int register_ip_vs_app(struct ip_vs_app *app) -{ - /* increase the module use count */ - ip_vs_use_count_inc(); - - mutex_lock(&__ip_vs_app_mutex); - - list_add(&app->a_list, &ip_vs_app_list); - - mutex_unlock(&__ip_vs_app_mutex); - - return 0; -} - - -/* - * ip_vs_app unregistration routine - * We are sure there are no app incarnations attached to services - */ -void unregister_ip_vs_app(struct ip_vs_app *app) -{ - struct ip_vs_app *inc, *nxt; - - mutex_lock(&__ip_vs_app_mutex); - - list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { - ip_vs_app_inc_release(inc); - } - - list_del(&app->a_list); - - mutex_unlock(&__ip_vs_app_mutex); - - /* decrease the module use count */ - ip_vs_use_count_dec(); -} - - -/* - * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) - */ -int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) -{ - return pp->app_conn_bind(cp); -} - - -/* - * Unbind cp from application incarnation (called by cp destructor) - */ -void ip_vs_unbind_app(struct ip_vs_conn *cp) -{ - struct ip_vs_app *inc = cp->app; - - if (!inc) - return; - - if (inc->unbind_conn) - inc->unbind_conn(inc, cp); - if (inc->done_conn) - inc->done_conn(inc, cp); - ip_vs_app_inc_put(inc); - cp->app = NULL; -} - - -/* - * Fixes th->seq based on ip_vs_seq info. - */ -static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) -{ - __u32 seq = ntohl(th->seq); - - /* - * Adjust seq with delta-offset for all packets after - * the most recent resized pkt seq and with previous_delta offset - * for all packets before most recent resized pkt seq. - */ - if (vseq->delta || vseq->previous_delta) { - if(after(seq, vseq->init_seq)) { - th->seq = htonl(seq + vseq->delta); - IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", - vseq->delta); - } else { - th->seq = htonl(seq + vseq->previous_delta); - IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " - "(%d) to seq\n", vseq->previous_delta); - } - } -} - - -/* - * Fixes th->ack_seq based on ip_vs_seq info. - */ -static inline void -vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) -{ - __u32 ack_seq = ntohl(th->ack_seq); - - /* - * Adjust ack_seq with delta-offset for - * the packets AFTER most recent resized pkt has caused a shift - * for packets before most recent resized pkt, use previous_delta - */ - if (vseq->delta || vseq->previous_delta) { - /* since ack_seq is the number of octet that is expected - to receive next, so compare it with init_seq+delta */ - if(after(ack_seq, vseq->init_seq+vseq->delta)) { - th->ack_seq = htonl(ack_seq - vseq->delta); - IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " - "(%d) from ack_seq\n", vseq->delta); - - } else { - th->ack_seq = htonl(ack_seq - vseq->previous_delta); - IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " - "previous_delta (%d) from ack_seq\n", - vseq->previous_delta); - } - } -} - - -/* - * Updates ip_vs_seq if pkt has been resized - * Assumes already checked proto==IPPROTO_TCP and diff!=0. - */ -static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, - unsigned flag, __u32 seq, int diff) -{ - /* spinlock is to keep updating cp->flags atomic */ - spin_lock(&cp->lock); - if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { - vseq->previous_delta = vseq->delta; - vseq->delta += diff; - vseq->init_seq = seq; - cp->flags |= flag; - } - spin_unlock(&cp->lock); -} - -static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) -{ - int diff; - const unsigned int tcp_offset = ip_hdrlen(skb); - struct tcphdr *th; - __u32 seq; - - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) - return 0; - - th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); - - /* - * Remember seq number in case this pkt gets resized - */ - seq = ntohl(th->seq); - - /* - * Fix seq stuff if flagged as so. - */ - if (cp->flags & IP_VS_CONN_F_OUT_SEQ) - vs_fix_seq(&cp->out_seq, th); - if (cp->flags & IP_VS_CONN_F_IN_SEQ) - vs_fix_ack_seq(&cp->in_seq, th); - - /* - * Call private output hook function - */ - if (app->pkt_out == NULL) - return 1; - - if (!app->pkt_out(app, cp, skb, &diff)) - return 0; - - /* - * Update ip_vs seq stuff if len has changed. - */ - if (diff != 0) - vs_seq_update(cp, &cp->out_seq, - IP_VS_CONN_F_OUT_SEQ, seq, diff); - - return 1; -} - -/* - * Output pkt hook. Will call bound ip_vs_app specific function - * called by ipvs packet handler, assumes previously checked cp!=NULL - * returns false if it can't handle packet (oom) - */ -int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_app *app; - - /* - * check if application module is bound to - * this ip_vs_conn. - */ - if ((app = cp->app) == NULL) - return 1; - - /* TCP is complicated */ - if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_out(cp, skb, app); - - /* - * Call private output hook function - */ - if (app->pkt_out == NULL) - return 1; - - return app->pkt_out(app, cp, skb, NULL); -} - - -static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) -{ - int diff; - const unsigned int tcp_offset = ip_hdrlen(skb); - struct tcphdr *th; - __u32 seq; - - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) - return 0; - - th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); - - /* - * Remember seq number in case this pkt gets resized - */ - seq = ntohl(th->seq); - - /* - * Fix seq stuff if flagged as so. - */ - if (cp->flags & IP_VS_CONN_F_IN_SEQ) - vs_fix_seq(&cp->in_seq, th); - if (cp->flags & IP_VS_CONN_F_OUT_SEQ) - vs_fix_ack_seq(&cp->out_seq, th); - - /* - * Call private input hook function - */ - if (app->pkt_in == NULL) - return 1; - - if (!app->pkt_in(app, cp, skb, &diff)) - return 0; - - /* - * Update ip_vs seq stuff if len has changed. - */ - if (diff != 0) - vs_seq_update(cp, &cp->in_seq, - IP_VS_CONN_F_IN_SEQ, seq, diff); - - return 1; -} - -/* - * Input pkt hook. Will call bound ip_vs_app specific function - * called by ipvs packet handler, assumes previously checked cp!=NULL. - * returns false if can't handle packet (oom). - */ -int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_app *app; - - /* - * check if application module is bound to - * this ip_vs_conn. - */ - if ((app = cp->app) == NULL) - return 1; - - /* TCP is complicated */ - if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_in(cp, skb, app); - - /* - * Call private input hook function - */ - if (app->pkt_in == NULL) - return 1; - - return app->pkt_in(app, cp, skb, NULL); -} - - -#ifdef CONFIG_PROC_FS -/* - * /proc/net/ip_vs_app entry function - */ - -static struct ip_vs_app *ip_vs_app_idx(loff_t pos) -{ - struct ip_vs_app *app, *inc; - - list_for_each_entry(app, &ip_vs_app_list, a_list) { - list_for_each_entry(inc, &app->incs_list, a_list) { - if (pos-- == 0) - return inc; - } - } - return NULL; - -} - -static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) -{ - mutex_lock(&__ip_vs_app_mutex); - - return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; -} - -static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ip_vs_app *inc, *app; - struct list_head *e; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_app_idx(0); - - inc = v; - app = inc->app; - - if ((e = inc->a_list.next) != &app->incs_list) - return list_entry(e, struct ip_vs_app, a_list); - - /* go on to next application */ - for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { - app = list_entry(e, struct ip_vs_app, a_list); - list_for_each_entry(inc, &app->incs_list, a_list) { - return inc; - } - } - return NULL; -} - -static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) -{ - mutex_unlock(&__ip_vs_app_mutex); -} - -static int ip_vs_app_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "prot port usecnt name\n"); - else { - const struct ip_vs_app *inc = v; - - seq_printf(seq, "%-3s %-7u %-6d %-17s\n", - ip_vs_proto_name(inc->protocol), - ntohs(inc->port), - atomic_read(&inc->usecnt), - inc->name); - } - return 0; -} - -static const struct seq_operations ip_vs_app_seq_ops = { - .start = ip_vs_app_seq_start, - .next = ip_vs_app_seq_next, - .stop = ip_vs_app_seq_stop, - .show = ip_vs_app_seq_show, -}; - -static int ip_vs_app_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_app_seq_ops); -} - -static const struct file_operations ip_vs_app_fops = { - .owner = THIS_MODULE, - .open = ip_vs_app_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - - -/* - * Replace a segment of data with a new segment - */ -int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, - char *o_buf, int o_len, char *n_buf, int n_len) -{ - int diff; - int o_offset; - int o_left; - - EnterFunction(9); - - diff = n_len - o_len; - o_offset = o_buf - (char *)skb->data; - /* The length of left data after o_buf+o_len in the skb data */ - o_left = skb->len - (o_offset + o_len); - - if (diff <= 0) { - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - skb_trim(skb, skb->len + diff); - } else if (diff <= skb_tailroom(skb)) { - skb_put(skb, diff); - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - } else { - if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) - return -ENOMEM; - skb_put(skb, diff); - memmove(skb->data + o_offset + n_len, - skb->data + o_offset + o_len, o_left); - skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); - } - - /* must update the iph total length here */ - ip_hdr(skb)->tot_len = htons(skb->len); - - LeaveFunction(9); - return 0; -} - - -int ip_vs_app_init(void) -{ - /* we will replace it with proc_net_ipvs_create() soon */ - proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); - return 0; -} - - -void ip_vs_app_cleanup(void) -{ - proc_net_remove(&init_net, "ip_vs_app"); -} diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c deleted file mode 100644 index 65f1ba112752..000000000000 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ /dev/null @@ -1,1025 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Peter Kese <peter.kese@ijs.si> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, - * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms - * and others. Many code here is taken from IP MASQ code of kernel 2.2. - * - * Changes: - * - */ - -#include <linux/interrupt.h> -#include <linux/in.h> -#include <linux/net.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/vmalloc.h> -#include <linux/proc_fs.h> /* for proc_net_* */ -#include <linux/seq_file.h> -#include <linux/jhash.h> -#include <linux/random.h> - -#include <net/net_namespace.h> -#include <net/ip_vs.h> - - -/* - * Connection hash table: for input and output packets lookups of IPVS - */ -static struct list_head *ip_vs_conn_tab; - -/* SLAB cache for IPVS connections */ -static struct kmem_cache *ip_vs_conn_cachep __read_mostly; - -/* counter for current IPVS connections */ -static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); - -/* counter for no client port connections */ -static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); - -/* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd; - -/* - * Fine locking granularity for big connection hash table - */ -#define CT_LOCKARRAY_BITS 4 -#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) -#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) - -struct ip_vs_aligned_lock -{ - rwlock_t l; -} __attribute__((__aligned__(SMP_CACHE_BYTES))); - -/* lock array for conn table */ -static struct ip_vs_aligned_lock -__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; - -static inline void ct_read_lock(unsigned key) -{ - read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_unlock(unsigned key) -{ - read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_lock(unsigned key) -{ - write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_unlock(unsigned key) -{ - write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_lock_bh(unsigned key) -{ - read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_unlock_bh(unsigned key) -{ - read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_lock_bh(unsigned key) -{ - write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_unlock_bh(unsigned key) -{ - write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - - -/* - * Returns hash value for IPVS connection entry - */ -static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port) -{ - return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; -} - - -/* - * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. - * returns bool success. - */ -static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) -{ - unsigned hash; - int ret; - - /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); - - ct_write_lock(hash); - - if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - list_add(&cp->c_list, &ip_vs_conn_tab[hash]); - cp->flags |= IP_VS_CONN_F_HASHED; - atomic_inc(&cp->refcnt); - ret = 1; - } else { - IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - ret = 0; - } - - ct_write_unlock(hash); - - return ret; -} - - -/* - * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. - */ -static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) -{ - unsigned hash; - int ret; - - /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); - - ct_write_lock(hash); - - if (cp->flags & IP_VS_CONN_F_HASHED) { - list_del(&cp->c_list); - cp->flags &= ~IP_VS_CONN_F_HASHED; - atomic_dec(&cp->refcnt); - ret = 1; - } else - ret = 0; - - ct_write_unlock(hash); - - return ret; -} - - -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from OUTside-to-INside. - * s_addr, s_port: pkt source address (foreign host) - * d_addr, d_port: pkt dest address (load balancer) - */ -static inline struct ip_vs_conn *__ip_vs_conn_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (s_addr==cp->caddr && s_port==cp->cport && - d_port==cp->vport && d_addr==cp->vaddr && - ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - protocol==cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); - return cp; - } - } - - ct_read_unlock(hash); - - return NULL; -} - -struct ip_vs_conn *ip_vs_conn_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) -{ - struct ip_vs_conn *cp; - - cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); - if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) - cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); - - IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", - ip_vs_proto_name(protocol), - NIPQUAD(s_addr), ntohs(s_port), - NIPQUAD(d_addr), ntohs(d_port), - cp?"hit":"not hit"); - - return cp; -} - -/* Get reference to connection template */ -struct ip_vs_conn *ip_vs_ct_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (s_addr==cp->caddr && s_port==cp->cport && - d_port==cp->vport && d_addr==cp->vaddr && - cp->flags & IP_VS_CONN_F_TEMPLATE && - protocol==cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - goto out; - } - } - cp = NULL; - - out: - ct_read_unlock(hash); - - IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", - ip_vs_proto_name(protocol), - NIPQUAD(s_addr), ntohs(s_port), - NIPQUAD(d_addr), ntohs(d_port), - cp?"hit":"not hit"); - - return cp; -} - -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from inside-to-OUTside. - * s_addr, s_port: pkt source address (inside host) - * d_addr, d_port: pkt dest address (foreign host) - */ -struct ip_vs_conn *ip_vs_conn_out_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp, *ret=NULL; - - /* - * Check for "full" addressed entries - */ - hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (d_addr == cp->caddr && d_port == cp->cport && - s_port == cp->dport && s_addr == cp->daddr && - protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - ret = cp; - break; - } - } - - ct_read_unlock(hash); - - IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", - ip_vs_proto_name(protocol), - NIPQUAD(s_addr), ntohs(s_port), - NIPQUAD(d_addr), ntohs(d_port), - ret?"hit":"not hit"); - - return ret; -} - - -/* - * Put back the conn and restart its timer with its timeout - */ -void ip_vs_conn_put(struct ip_vs_conn *cp) -{ - /* reset it expire in its timeout */ - mod_timer(&cp->timer, jiffies+cp->timeout); - - __ip_vs_conn_put(cp); -} - - -/* - * Fill a no_client_port connection with a client port number - */ -void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) -{ - if (ip_vs_conn_unhash(cp)) { - spin_lock(&cp->lock); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) { - atomic_dec(&ip_vs_conn_no_cport_cnt); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; - } - spin_unlock(&cp->lock); - - /* hash on new dport */ - ip_vs_conn_hash(cp); - } -} - - -/* - * Bind a connection entry with the corresponding packet_xmit. - * Called by ip_vs_conn_new. - */ -static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) -{ - switch (IP_VS_FWD_METHOD(cp)) { - case IP_VS_CONN_F_MASQ: - cp->packet_xmit = ip_vs_nat_xmit; - break; - - case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit; - break; - - case IP_VS_CONN_F_DROUTE: - cp->packet_xmit = ip_vs_dr_xmit; - break; - - case IP_VS_CONN_F_LOCALNODE: - cp->packet_xmit = ip_vs_null_xmit; - break; - - case IP_VS_CONN_F_BYPASS: - cp->packet_xmit = ip_vs_bypass_xmit; - break; - } -} - - -static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) -{ - return atomic_read(&dest->activeconns) - + atomic_read(&dest->inactconns); -} - -/* - * Bind a connection entry with a virtual service destination - * Called just after a new connection entry is created. - */ -static inline void -ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) -{ - /* if dest is NULL, then return directly */ - if (!dest) - return; - - /* Increase the refcnt counter of the dest */ - atomic_inc(&dest->refcnt); - - /* Bind with the destination and its corresponding transmitter */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) - /* if the connection is not template and is created - * by sync, preserve the activity flag. - */ - cp->flags |= atomic_read(&dest->conn_flags) & - (~IP_VS_CONN_F_INACTIVE); - else - cp->flags |= atomic_read(&dest->conn_flags); - cp->dest = dest; - - IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - NIPQUAD(cp->daddr), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); - - /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so increase the inactive - connection counter because it is in TCP SYNRECV - state (inactive) or other protocol inacive state */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_INACTIVE))) - atomic_inc(&dest->activeconns); - else - atomic_inc(&dest->inactconns); - } else { - /* It is a persistent connection/template, so increase - the peristent connection counter */ - atomic_inc(&dest->persistconns); - } - - if (dest->u_threshold != 0 && - ip_vs_dest_totalconns(dest) >= dest->u_threshold) - dest->flags |= IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Check if there is a destination for the connection, if so - * bind the connection to the destination. - */ -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) -{ - struct ip_vs_dest *dest; - - if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(cp->daddr, cp->dport, - cp->vaddr, cp->vport, cp->protocol); - ip_vs_bind_dest(cp, dest); - return dest; - } else - return NULL; -} - - -/* - * Unbind a connection entry with its VS destination - * Called by the ip_vs_conn_expire function. - */ -static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) -{ - struct ip_vs_dest *dest = cp->dest; - - if (!dest) - return; - - IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - NIPQUAD(cp->daddr), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); - - /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so decrease the inactconns - or activeconns counter */ - if (cp->flags & IP_VS_CONN_F_INACTIVE) { - atomic_dec(&dest->inactconns); - } else { - atomic_dec(&dest->activeconns); - } - } else { - /* It is a persistent connection/template, so decrease - the peristent connection counter */ - atomic_dec(&dest->persistconns); - } - - if (dest->l_threshold != 0) { - if (ip_vs_dest_totalconns(dest) < dest->l_threshold) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } else if (dest->u_threshold != 0) { - if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } else { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } - - /* - * Simply decrease the refcnt of the dest, because the - * dest will be either in service's destination list - * or in the trash. - */ - atomic_dec(&dest->refcnt); -} - - -/* - * Checking if the destination of a connection template is available. - * If available, return 1, otherwise invalidate this connection - * template and return 0. - */ -int ip_vs_check_template(struct ip_vs_conn *ct) -{ - struct ip_vs_dest *dest = ct->dest; - - /* - * Checking the dest server status. - */ - if ((dest == NULL) || - !(dest->flags & IP_VS_DEST_F_AVAILABLE) || - (sysctl_ip_vs_expire_quiescent_template && - (atomic_read(&dest->weight) == 0))) { - IP_VS_DBG(9, "check_template: dest not available for " - "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "-> d:%u.%u.%u.%u:%d\n", - ip_vs_proto_name(ct->protocol), - NIPQUAD(ct->caddr), ntohs(ct->cport), - NIPQUAD(ct->vaddr), ntohs(ct->vport), - NIPQUAD(ct->daddr), ntohs(ct->dport)); - - /* - * Invalidate the connection template - */ - if (ct->vport != htons(0xffff)) { - if (ip_vs_conn_unhash(ct)) { - ct->dport = htons(0xffff); - ct->vport = htons(0xffff); - ct->cport = 0; - ip_vs_conn_hash(ct); - } - } - - /* - * Simply decrease the refcnt of the template, - * don't restart its timer. - */ - atomic_dec(&ct->refcnt); - return 0; - } - return 1; -} - -static void ip_vs_conn_expire(unsigned long data) -{ - struct ip_vs_conn *cp = (struct ip_vs_conn *)data; - - cp->timeout = 60*HZ; - - /* - * hey, I'm using it - */ - atomic_inc(&cp->refcnt); - - /* - * do I control anybody? - */ - if (atomic_read(&cp->n_control)) - goto expire_later; - - /* - * unhash it if it is hashed in the conn table - */ - if (!ip_vs_conn_unhash(cp)) - goto expire_later; - - /* - * refcnt==1 implies I'm the only one referrer - */ - if (likely(atomic_read(&cp->refcnt) == 1)) { - /* delete the timer if it is activated by other users */ - if (timer_pending(&cp->timer)) - del_timer(&cp->timer); - - /* does anybody control me? */ - if (cp->control) - ip_vs_control_del(cp); - - if (unlikely(cp->app != NULL)) - ip_vs_unbind_app(cp); - ip_vs_unbind_dest(cp); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) - atomic_dec(&ip_vs_conn_no_cport_cnt); - atomic_dec(&ip_vs_conn_count); - - kmem_cache_free(ip_vs_conn_cachep, cp); - return; - } - - /* hash it back to the table */ - ip_vs_conn_hash(cp); - - expire_later: - IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", - atomic_read(&cp->refcnt)-1, - atomic_read(&cp->n_control)); - - ip_vs_conn_put(cp); -} - - -void ip_vs_conn_expire_now(struct ip_vs_conn *cp) -{ - if (del_timer(&cp->timer)) - mod_timer(&cp->timer, jiffies); -} - - -/* - * Create a new connection entry and hash it into the ip_vs_conn_tab - */ -struct ip_vs_conn * -ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport, - __be32 daddr, __be16 dport, unsigned flags, - struct ip_vs_dest *dest) -{ - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); - - cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); - if (cp == NULL) { - IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); - return NULL; - } - - INIT_LIST_HEAD(&cp->c_list); - setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); - cp->protocol = proto; - cp->caddr = caddr; - cp->cport = cport; - cp->vaddr = vaddr; - cp->vport = vport; - cp->daddr = daddr; - cp->dport = dport; - cp->flags = flags; - spin_lock_init(&cp->lock); - - /* - * Set the entry is referenced by the current thread before hashing - * it in the table, so that other thread run ip_vs_random_dropentry - * but cannot drop this entry. - */ - atomic_set(&cp->refcnt, 1); - - atomic_set(&cp->n_control, 0); - atomic_set(&cp->in_pkts, 0); - - atomic_inc(&ip_vs_conn_count); - if (flags & IP_VS_CONN_F_NO_CPORT) - atomic_inc(&ip_vs_conn_no_cport_cnt); - - /* Bind the connection with a destination server */ - ip_vs_bind_dest(cp, dest); - - /* Set its state and timeout */ - cp->state = 0; - cp->timeout = 3*HZ; - - /* Bind its packet transmitter */ - ip_vs_bind_xmit(cp); - - if (unlikely(pp && atomic_read(&pp->appcnt))) - ip_vs_bind_app(cp, pp); - - /* Hash it in the ip_vs_conn_tab finally */ - ip_vs_conn_hash(cp); - - return cp; -} - - -/* - * /proc/net/ip_vs_conn entries - */ -#ifdef CONFIG_PROC_FS - -static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) -{ - int idx; - struct ip_vs_conn *cp; - - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - if (pos-- == 0) { - seq->private = &ip_vs_conn_tab[idx]; - return cp; - } - } - ct_read_unlock_bh(idx); - } - - return NULL; -} - -static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) -{ - seq->private = NULL; - return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; -} - -static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ip_vs_conn *cp = v; - struct list_head *e, *l = seq->private; - int idx; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_conn_array(seq, 0); - - /* more on same hash chain? */ - if ((e = cp->c_list.next) != l) - return list_entry(e, struct ip_vs_conn, c_list); - - idx = l - ip_vs_conn_tab; - ct_read_unlock_bh(idx); - - while (++idx < IP_VS_CONN_TAB_SIZE) { - ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - seq->private = &ip_vs_conn_tab[idx]; - return cp; - } - ct_read_unlock_bh(idx); - } - seq->private = NULL; - return NULL; -} - -static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) -{ - struct list_head *l = seq->private; - - if (l) - ct_read_unlock_bh(l - ip_vs_conn_tab); -} - -static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) -{ - - if (v == SEQ_START_TOKEN) - seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); - else { - const struct ip_vs_conn *cp = v; - - seq_printf(seq, - "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", - ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr), ntohs(cp->cport), - ntohl(cp->vaddr), ntohs(cp->vport), - ntohl(cp->daddr), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); - } - return 0; -} - -static const struct seq_operations ip_vs_conn_seq_ops = { - .start = ip_vs_conn_seq_start, - .next = ip_vs_conn_seq_next, - .stop = ip_vs_conn_seq_stop, - .show = ip_vs_conn_seq_show, -}; - -static int ip_vs_conn_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_conn_seq_ops); -} - -static const struct file_operations ip_vs_conn_fops = { - .owner = THIS_MODULE, - .open = ip_vs_conn_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const char *ip_vs_origin_name(unsigned flags) -{ - if (flags & IP_VS_CONN_F_SYNC) - return "SYNC"; - else - return "LOCAL"; -} - -static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) -{ - - if (v == SEQ_START_TOKEN) - seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); - else { - const struct ip_vs_conn *cp = v; - - seq_printf(seq, - "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n", - ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr), ntohs(cp->cport), - ntohl(cp->vaddr), ntohs(cp->vport), - ntohl(cp->daddr), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - ip_vs_origin_name(cp->flags), - (cp->timer.expires-jiffies)/HZ); - } - return 0; -} - -static const struct seq_operations ip_vs_conn_sync_seq_ops = { - .start = ip_vs_conn_seq_start, - .next = ip_vs_conn_seq_next, - .stop = ip_vs_conn_seq_stop, - .show = ip_vs_conn_sync_seq_show, -}; - -static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_conn_sync_seq_ops); -} - -static const struct file_operations ip_vs_conn_sync_fops = { - .owner = THIS_MODULE, - .open = ip_vs_conn_sync_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#endif - - -/* - * Randomly drop connection entries before running out of memory - */ -static inline int todrop_entry(struct ip_vs_conn *cp) -{ - /* - * The drop rate array needs tuning for real environments. - * Called from timer bh only => no locking - */ - static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - static char todrop_counter[9] = {0}; - int i; - - /* if the conn entry hasn't lasted for 60 seconds, don't drop it. - This will leave enough time for normal connection to get - through. */ - if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) - return 0; - - /* Don't drop the entry if its number of incoming packets is not - located in [0, 8] */ - i = atomic_read(&cp->in_pkts); - if (i > 8 || i < 0) return 0; - - if (!todrop_rate[i]) return 0; - if (--todrop_counter[i] > 0) return 0; - - todrop_counter[i] = todrop_rate[i]; - return 1; -} - -/* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(void) -{ - int idx; - struct ip_vs_conn *cp; - - /* - * Randomly scan 1/32 of the whole table every second - */ - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; - - /* - * Lock is actually needed in this loop. - */ - ct_write_lock_bh(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->flags & IP_VS_CONN_F_TEMPLATE) - /* connection template */ - continue; - - if (cp->protocol == IPPROTO_TCP) { - switch(cp->state) { - case IP_VS_TCP_S_SYN_RECV: - case IP_VS_TCP_S_SYNACK: - break; - - case IP_VS_TCP_S_ESTABLISHED: - if (todrop_entry(cp)) - break; - continue; - - default: - continue; - } - } else { - if (!todrop_entry(cp)) - continue; - } - - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); - if (cp->control) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); - } - } - ct_write_unlock_bh(hash); - } -} - - -/* - * Flush all the connection entries in the ip_vs_conn_tab - */ -static void ip_vs_conn_flush(void) -{ - int idx; - struct ip_vs_conn *cp; - - flush_again: - for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { - /* - * Lock is actually needed in this loop. - */ - ct_write_lock_bh(idx); - - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); - if (cp->control) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); - } - } - ct_write_unlock_bh(idx); - } - - /* the counter may be not NULL, because maybe some conn entries - are run by slow timer handler or unhashed but still referred */ - if (atomic_read(&ip_vs_conn_count) != 0) { - schedule(); - goto flush_again; - } -} - - -int ip_vs_conn_init(void) -{ - int idx; - - /* - * Allocate the connection hash table and initialize its list heads - */ - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); - if (!ip_vs_conn_tab) - return -ENOMEM; - - /* Allocate ip_vs_conn slab cache */ - ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", - sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!ip_vs_conn_cachep) { - vfree(ip_vs_conn_tab); - return -ENOMEM; - } - - IP_VS_INFO("Connection hash table configured " - "(size=%d, memory=%ldKbytes)\n", - IP_VS_CONN_TAB_SIZE, - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); - IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", - sizeof(struct ip_vs_conn)); - - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); - } - - for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); - } - - proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); - proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); - - /* calculate the random value for connection hash */ - get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - - return 0; -} - - -void ip_vs_conn_cleanup(void) -{ - /* flush all the connection entries first */ - ip_vs_conn_flush(); - - /* Release the empty cache */ - kmem_cache_destroy(ip_vs_conn_cachep); - proc_net_remove(&init_net, "ip_vs_conn"); - proc_net_remove(&init_net, "ip_vs_conn_sync"); - vfree(ip_vs_conn_tab); -} diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c deleted file mode 100644 index 963981a9d501..000000000000 --- a/net/ipv4/ipvs/ip_vs_core.c +++ /dev/null @@ -1,1126 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Peter Kese <peter.kese@ijs.si> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, - * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms - * and others. - * - * Changes: - * Paul `Rusty' Russell properly handle non-linear skbs - * Harald Welte don't use nfcache - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/icmp.h> - -#include <net/ip.h> -#include <net/tcp.h> -#include <net/udp.h> -#include <net/icmp.h> /* for icmp_send */ -#include <net/route.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> - -#include <net/ip_vs.h> - - -EXPORT_SYMBOL(register_ip_vs_scheduler); -EXPORT_SYMBOL(unregister_ip_vs_scheduler); -EXPORT_SYMBOL(ip_vs_skb_replace); -EXPORT_SYMBOL(ip_vs_proto_name); -EXPORT_SYMBOL(ip_vs_conn_new); -EXPORT_SYMBOL(ip_vs_conn_in_get); -EXPORT_SYMBOL(ip_vs_conn_out_get); -#ifdef CONFIG_IP_VS_PROTO_TCP -EXPORT_SYMBOL(ip_vs_tcp_conn_listen); -#endif -EXPORT_SYMBOL(ip_vs_conn_put); -#ifdef CONFIG_IP_VS_DEBUG -EXPORT_SYMBOL(ip_vs_get_debug_level); -#endif - - -/* ID used in ICMP lookups */ -#define icmp_id(icmph) (((icmph)->un).echo.id) - -const char *ip_vs_proto_name(unsigned proto) -{ - static char buf[20]; - - switch (proto) { - case IPPROTO_IP: - return "IP"; - case IPPROTO_UDP: - return "UDP"; - case IPPROTO_TCP: - return "TCP"; - case IPPROTO_ICMP: - return "ICMP"; - default: - sprintf(buf, "IP_%d", proto); - return buf; - } -} - -void ip_vs_init_hash_table(struct list_head *table, int rows) -{ - while (--rows >= 0) - INIT_LIST_HEAD(&table[rows]); -} - -static inline void -ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_dest *dest = cp->dest; - if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.inpkts++; - dest->stats.inbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.inpkts++; - dest->svc->stats.inbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.inpkts++; - ip_vs_stats.inbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); - } -} - - -static inline void -ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_dest *dest = cp->dest; - if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.outpkts++; - dest->stats.outbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.outpkts++; - dest->svc->stats.outbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.outpkts++; - ip_vs_stats.outbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); - } -} - - -static inline void -ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) -{ - spin_lock(&cp->dest->stats.lock); - cp->dest->stats.conns++; - spin_unlock(&cp->dest->stats.lock); - - spin_lock(&svc->stats.lock); - svc->stats.conns++; - spin_unlock(&svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.conns++; - spin_unlock(&ip_vs_stats.lock); -} - - -static inline int -ip_vs_set_state(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - if (unlikely(!pp->state_transition)) - return 0; - return pp->state_transition(cp, direction, skb, pp); -} - - -/* - * IPVS persistent scheduling function - * It creates a connection entry according to its template if exists, - * or selects a server and creates a connection entry plus a template. - * Locking: we are svc user (svc->refcnt), so we hold all dests too - * Protocols supported: TCP, UDP - */ -static struct ip_vs_conn * -ip_vs_sched_persist(struct ip_vs_service *svc, - const struct sk_buff *skb, - __be16 ports[2]) -{ - struct ip_vs_conn *cp = NULL; - struct iphdr *iph = ip_hdr(skb); - struct ip_vs_dest *dest; - struct ip_vs_conn *ct; - __be16 dport; /* destination port to forward */ - __be32 snet; /* source network of the client, after masking */ - - /* Mask saddr with the netmask to adjust template granularity */ - snet = iph->saddr & svc->netmask; - - IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " - "mnet %u.%u.%u.%u\n", - NIPQUAD(iph->saddr), ntohs(ports[0]), - NIPQUAD(iph->daddr), ntohs(ports[1]), - NIPQUAD(snet)); - - /* - * As far as we know, FTP is a very complicated network protocol, and - * it uses control connection and data connections. For active FTP, - * FTP server initialize data connection to the client, its source port - * is often 20. For passive FTP, FTP server tells the clients the port - * that it passively listens to, and the client issues the data - * connection. In the tunneling or direct routing mode, the load - * balancer is on the client-to-server half of connection, the port - * number is unknown to the load balancer. So, a conn template like - * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP - * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> - * is created for other persistent services. - */ - if (ports[1] == svc->port) { - /* Check if a template already exists */ - if (svc->port != FTPPORT) - ct = ip_vs_ct_in_get(iph->protocol, snet, 0, - iph->daddr, ports[1]); - else - ct = ip_vs_ct_in_get(iph->protocol, snet, 0, - iph->daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * No template found or the dest of the connection - * template is not available. - */ - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a template like <protocol,caddr,0, - * vaddr,vport,daddr,dport> for non-ftp service, - * and <protocol,caddr,0,vaddr,0,daddr,0> - * for ftp service. - */ - if (svc->port != FTPPORT) - ct = ip_vs_conn_new(iph->protocol, - snet, 0, - iph->daddr, - ports[1], - dest->addr, dest->port, - IP_VS_CONN_F_TEMPLATE, - dest); - else - ct = ip_vs_conn_new(iph->protocol, - snet, 0, - iph->daddr, 0, - dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; - } else { - /* set destination with the found template */ - dest = ct->dest; - } - dport = dest->port; - } else { - /* - * Note: persistent fwmark-based services and persistent - * port zero service are handled here. - * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> - * port zero template: <protocol,caddr,0,vaddr,0,daddr,0> - */ - if (svc->fwmark) - ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0, - htonl(svc->fwmark), 0); - else - ct = ip_vs_ct_in_get(iph->protocol, snet, 0, - iph->daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * If it is not persistent port zero, return NULL, - * otherwise create a connection template. - */ - if (svc->port) - return NULL; - - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a template according to the service - */ - if (svc->fwmark) - ct = ip_vs_conn_new(IPPROTO_IP, - snet, 0, - htonl(svc->fwmark), 0, - dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - else - ct = ip_vs_conn_new(iph->protocol, - snet, 0, - iph->daddr, 0, - dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; - } else { - /* set destination with the found template */ - dest = ct->dest; - } - dport = ports[1]; - } - - /* - * Create a new connection according to the template - */ - cp = ip_vs_conn_new(iph->protocol, - iph->saddr, ports[0], - iph->daddr, ports[1], - dest->addr, dport, - 0, - dest); - if (cp == NULL) { - ip_vs_conn_put(ct); - return NULL; - } - - /* - * Add its control - */ - ip_vs_control_add(cp, ct); - ip_vs_conn_put(ct); - - ip_vs_conn_stats(cp, svc); - return cp; -} - - -/* - * IPVS main scheduling function - * It selects a server according to the virtual service, and - * creates a connection entry. - * Protocols supported: TCP, UDP - */ -struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_conn *cp = NULL; - struct iphdr *iph = ip_hdr(skb); - struct ip_vs_dest *dest; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, iph->ihl*4, - sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - /* - * Persistent service - */ - if (svc->flags & IP_VS_SVC_F_PERSISTENT) - return ip_vs_sched_persist(svc, skb, pptr); - - /* - * Non-persistent service - */ - if (!svc->fwmark && pptr[1] != svc->port) { - if (!svc->port) - IP_VS_ERR("Schedule: port zero only supported " - "in persistent services, " - "check your ipvs configuration\n"); - return NULL; - } - - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "Schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a connection entry. - */ - cp = ip_vs_conn_new(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1], - dest->addr, dest->port?dest->port:pptr[1], - 0, - dest); - if (cp == NULL) - return NULL; - - IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " - "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", - ip_vs_fwd_tag(cp), - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - NIPQUAD(cp->daddr), ntohs(cp->dport), - cp->flags, atomic_read(&cp->refcnt)); - - ip_vs_conn_stats(cp, svc); - return cp; -} - - -/* - * Pass or drop the packet. - * Called by ip_vs_in, when the virtual service is available but - * no destination is available for a new connection. - */ -int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - __be16 _ports[2], *pptr; - struct iphdr *iph = ip_hdr(skb); - - pptr = skb_header_pointer(skb, iph->ihl*4, - sizeof(_ports), _ports); - if (pptr == NULL) { - ip_vs_service_put(svc); - return NF_DROP; - } - - /* if it is fwmark-based service, the cache_bypass sysctl is up - and the destination is RTN_UNICAST (and not local), then create - a cache_bypass connection entry */ - if (sysctl_ip_vs_cache_bypass && svc->fwmark - && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) { - int ret, cs; - struct ip_vs_conn *cp; - - ip_vs_service_put(svc); - - /* create a new connection entry */ - IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); - cp = ip_vs_conn_new(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1], - 0, 0, - IP_VS_CONN_F_BYPASS, - NULL); - if (cp == NULL) - return NF_DROP; - - /* statistics */ - ip_vs_in_stats(cp, skb); - - /* set state */ - cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); - - /* transmit the first SYN packet */ - ret = cp->packet_xmit(skb, cp, pp); - /* do not touch skb anymore */ - - atomic_inc(&cp->in_pkts); - ip_vs_conn_put(cp); - return ret; - } - - /* - * When the virtual ftp service is presented, packets destined - * for other services on the VIP may get here (except services - * listed in the ipvs table), pass the packets, because it is - * not ipvs job to decide to drop the packets. - */ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { - ip_vs_service_put(svc); - return NF_ACCEPT; - } - - ip_vs_service_put(svc); - - /* - * Notify the client that the destination is unreachable, and - * release the socket buffer. - * Since it is in IP layer, the TCP socket is not actually - * created, the TCP RST packet cannot be sent, instead that - * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ - */ - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - return NF_DROP; -} - - -/* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING - * chain, and is used for VS/NAT. - * It detects packets for VS/NAT connections and sends the packets - * immediately. This can avoid that iptable_nat mangles the packets - * for VS/NAT. - */ -static unsigned int ip_vs_post_routing(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - if (!skb->ipvs_property) - return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - return NF_STOP; -} - -__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) -{ - return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); -} - -static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) -{ - int err = ip_defrag(skb, user); - - if (!err) - ip_send_check(ip_hdr(skb)); - - return err; -} - -/* - * Packet has been made sufficiently writable in caller - * - inout: 1=in->out, 0=out->in - */ -void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int inout) -{ - struct iphdr *iph = ip_hdr(skb); - unsigned int icmp_offset = iph->ihl*4; - struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + - icmp_offset); - struct iphdr *ciph = (struct iphdr *)(icmph + 1); - - if (inout) { - iph->saddr = cp->vaddr; - ip_send_check(iph); - ciph->daddr = cp->vaddr; - ip_send_check(ciph); - } else { - iph->daddr = cp->daddr; - ip_send_check(iph); - ciph->saddr = cp->daddr; - ip_send_check(ciph); - } - - /* the TCP/UDP port */ - if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { - __be16 *ports = (void *)ciph + ciph->ihl*4; - - if (inout) - ports[1] = cp->vport; - else - ports[0] = cp->dport; - } - - /* And finally the ICMP checksum */ - icmph->checksum = 0; - icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); - skb->ip_summed = CHECKSUM_UNNECESSARY; - - if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMP"); - else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMP"); -} - -/* - * Handle ICMP messages in the inside-to-outside direction (outgoing). - * Find any that might be relevant, check against existing connections, - * forward to the right destination host if relevant. - * Currently handles error types - unreachable, quench, ttl exceeded. - * (Only used in VS/NAT) - */ -static int ip_vs_out_icmp(struct sk_buff *skb, int *related) -{ - struct iphdr *iph; - struct icmphdr _icmph, *ic; - struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, ihl, verdict; - - *related = 1; - - /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - } - - iph = ip_hdr(skb); - offset = ihl = iph->ihl * 4; - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", - ic->type, ntohs(icmp_id(ic)), - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->type != ICMP_DEST_UNREACH) && - (ic->type != ICMP_SOURCE_QUENCH) && - (ic->type != ICMP_TIME_EXCEEDED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->protocol); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & htons(IP_OFFSET) && - pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); - - offset += cih->ihl * 4; - - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(skb, pp, cih, offset, 1); - if (!cp) - return NF_ACCEPT; - - verdict = NF_DROP; - - if (IP_VS_FWD_METHOD(cp) != 0) { - IP_VS_ERR("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); - } - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { - /* Failed checksum! */ - IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); - goto out; - } - - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) - offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) - goto out; - - ip_vs_nat_icmp(skb, pp, cp, 1); - - /* do the statistics and put it back */ - ip_vs_out_stats(cp, skb); - - skb->ipvs_property = 1; - verdict = NF_ACCEPT; - - out: - __ip_vs_conn_put(cp); - - return verdict; -} - -static inline int is_tcp_reset(const struct sk_buff *skb) -{ - struct tcphdr _tcph, *th; - - th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); - if (th == NULL) - return 0; - return th->rst; -} - -/* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. - * Check if outgoing packet belongs to the established ip_vs_conn, - * rewrite addresses of the packet and send it on its way... - */ -static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct iphdr *iph; - struct ip_vs_protocol *pp; - struct ip_vs_conn *cp; - int ihl; - - EnterFunction(11); - - if (skb->ipvs_property) - return NF_ACCEPT; - - iph = ip_hdr(skb); - if (unlikely(iph->protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_out_icmp(skb, &related); - - if (related) - return verdict; - iph = ip_hdr(skb); - } - - pp = ip_vs_proto_get(iph->protocol); - if (unlikely(!pp)) - return NF_ACCEPT; - - /* reassemble IP fragments */ - if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) && - !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - iph = ip_hdr(skb); - } - - ihl = iph->ihl << 2; - - /* - * Check if the packet belongs to an existing entry - */ - cp = pp->conn_out_get(skb, pp, iph, ihl, 0); - - if (unlikely(!cp)) { - if (sysctl_ip_vs_nat_icmp_send && - (pp->protocol == IPPROTO_TCP || - pp->protocol == IPPROTO_UDP)) { - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, ihl, - sizeof(_ports), _ports); - if (pptr == NULL) - return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(iph->protocol, - iph->saddr, pptr[0])) { - /* - * Notify the real server: there is no - * existing entry if it is not RST - * packet or not TCP packet. - */ - if (iph->protocol != IPPROTO_TCP - || !is_tcp_reset(skb)) { - icmp_send(skb,ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); - return NF_DROP; - } - } - } - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; - } - - IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); - - if (!skb_make_writable(skb, ihl)) - goto drop; - - /* mangle the packet */ - if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) - goto drop; - ip_hdr(skb)->saddr = cp->vaddr; - ip_send_check(ip_hdr(skb)); - - /* For policy routing, packets originating from this - * machine itself may be routed differently to packets - * passing through. We want this packet to be routed as - * if it came from this machine itself. So re-compute - * the routing information. - */ - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; - - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); - - ip_vs_out_stats(cp, skb); - ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); - ip_vs_conn_put(cp); - - skb->ipvs_property = 1; - - LeaveFunction(11); - return NF_ACCEPT; - - drop: - ip_vs_conn_put(cp); - kfree_skb(skb); - return NF_STOLEN; -} - - -/* - * Handle ICMP messages in the outside-to-inside direction (incoming). - * Find any that might be relevant, check against existing connections, - * forward to the right destination host if relevant. - * Currently handles error types - unreachable, quench, ttl exceeded. - */ -static int -ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) -{ - struct iphdr *iph; - struct icmphdr _icmph, *ic; - struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, ihl, verdict; - - *related = 1; - - /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) - return NF_STOLEN; - } - - iph = ip_hdr(skb); - offset = ihl = iph->ihl * 4; - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", - ic->type, ntohs(icmp_id(ic)), - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->type != ICMP_DEST_UNREACH) && - (ic->type != ICMP_SOURCE_QUENCH) && - (ic->type != ICMP_TIME_EXCEEDED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->protocol); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & htons(IP_OFFSET) && - pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); - - offset += cih->ihl * 4; - - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(skb, pp, cih, offset, 1); - if (!cp) - return NF_ACCEPT; - - verdict = NF_DROP; - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { - /* Failed checksum! */ - IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); - goto out; - } - - /* do the statistics and put it back */ - ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) - offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); - /* do not touch skb anymore */ - - out: - __ip_vs_conn_put(cp); - - return verdict; -} - -/* - * Check if it's for virtual services, look it up, - * and send it on its way... - */ -static unsigned int -ip_vs_in(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct iphdr *iph; - struct ip_vs_protocol *pp; - struct ip_vs_conn *cp; - int ret, restart; - int ihl; - - /* - * Big tappo: only PACKET_HOST (neither loopback nor mcasts) - * ... don't know why 1st test DOES NOT include 2nd (?) - */ - if (unlikely(skb->pkt_type != PACKET_HOST - || skb->dev->flags & IFF_LOOPBACK || skb->sk)) { - IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", - skb->pkt_type, - ip_hdr(skb)->protocol, - NIPQUAD(ip_hdr(skb)->daddr)); - return NF_ACCEPT; - } - - iph = ip_hdr(skb); - if (unlikely(iph->protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); - - if (related) - return verdict; - iph = ip_hdr(skb); - } - - /* Protocol supported? */ - pp = ip_vs_proto_get(iph->protocol); - if (unlikely(!pp)) - return NF_ACCEPT; - - ihl = iph->ihl << 2; - - /* - * Check if the packet belongs to an existing connection entry - */ - cp = pp->conn_in_get(skb, pp, iph, ihl, 0); - - if (unlikely(!cp)) { - int v; - - if (!pp->conn_schedule(skb, pp, &v, &cp)) - return v; - } - - if (unlikely(!cp)) { - /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; - } - - IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); - - /* Check the server status */ - if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { - /* the destination server is not available */ - - if (sysctl_ip_vs_expire_nodest_conn) { - /* try to expire the connection immediately */ - ip_vs_conn_expire_now(cp); - } - /* don't restart its timer, and silently - drop the packet. */ - __ip_vs_conn_put(cp); - return NF_DROP; - } - - ip_vs_in_stats(cp, skb); - restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); - if (cp->packet_xmit) - ret = cp->packet_xmit(skb, cp, pp); - /* do not touch skb anymore */ - else { - IP_VS_DBG_RL("warning: packet_xmit is null"); - ret = NF_ACCEPT; - } - - /* Increase its packet counter and check if it is needed - * to be synchronized - * - * Sync connection if it is about to close to - * encorage the standby servers to update the connections timeout - */ - atomic_inc(&cp->in_pkts); - if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && - (((cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) || - ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && - ((cp->state == IP_VS_TCP_S_FIN_WAIT) || - (cp->state == IP_VS_TCP_S_CLOSE))))) - ip_vs_sync_conn(cp); - cp->old_state = cp->state; - - ip_vs_conn_put(cp); - return ret; -} - - -/* - * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP - * related packets destined for 0.0.0.0/0. - * When fwmark-based virtual service is used, such as transparent - * cache cluster, TCP packets can be marked and routed to ip_vs_in, - * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and - * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain - * and send them to ip_vs_in_icmp. - */ -static unsigned int -ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - int r; - - if (ip_hdr(skb)->protocol != IPPROTO_ICMP) - return NF_ACCEPT; - - return ip_vs_in_icmp(skb, &r, hooknum); -} - - -static struct nf_hook_ops ip_vs_ops[] __read_mostly = { - /* After packet filtering, forward packet through VS/DR, VS/TUN, - * or VS/NAT(change destination), so that filtering rules can be - * applied to IPVS. */ - { - .hook = ip_vs_in, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, - }, - /* After packet filtering, change source only for VS/NAT */ - { - .hook = ip_vs_out, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 100, - }, - /* After packet filtering (but before ip_vs_out_icmp), catch icmp - * destined for 0.0.0.0/0, which is for incoming IPVS connections */ - { - .hook = ip_vs_forward_icmp, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 99, - }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, - }, -}; - - -/* - * Initialize IP Virtual Server - */ -static int __init ip_vs_init(void) -{ - int ret; - - ret = ip_vs_control_init(); - if (ret < 0) { - IP_VS_ERR("can't setup control.\n"); - goto cleanup_nothing; - } - - ip_vs_protocol_init(); - - ret = ip_vs_app_init(); - if (ret < 0) { - IP_VS_ERR("can't setup application helper.\n"); - goto cleanup_protocol; - } - - ret = ip_vs_conn_init(); - if (ret < 0) { - IP_VS_ERR("can't setup connection table.\n"); - goto cleanup_app; - } - - ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - if (ret < 0) { - IP_VS_ERR("can't register hooks.\n"); - goto cleanup_conn; - } - - IP_VS_INFO("ipvs loaded.\n"); - return ret; - - cleanup_conn: - ip_vs_conn_cleanup(); - cleanup_app: - ip_vs_app_cleanup(); - cleanup_protocol: - ip_vs_protocol_cleanup(); - ip_vs_control_cleanup(); - cleanup_nothing: - return ret; -} - -static void __exit ip_vs_cleanup(void) -{ - nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - ip_vs_conn_cleanup(); - ip_vs_app_cleanup(); - ip_vs_protocol_cleanup(); - ip_vs_control_cleanup(); - IP_VS_INFO("ipvs unloaded.\n"); -} - -module_init(ip_vs_init); -module_exit(ip_vs_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c deleted file mode 100644 index 94c5767c8e01..000000000000 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ /dev/null @@ -1,2362 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the NetFilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Peter Kese <peter.kese@ijs.si> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/sysctl.h> -#include <linux/proc_fs.h> -#include <linux/workqueue.h> -#include <linux/swap.h> -#include <linux/seq_file.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/mutex.h> - -#include <net/net_namespace.h> -#include <net/ip.h> -#include <net/route.h> -#include <net/sock.h> - -#include <asm/uaccess.h> - -#include <net/ip_vs.h> - -/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ -static DEFINE_MUTEX(__ip_vs_mutex); - -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_svc_lock); - -/* lock for table with the real services */ -static DEFINE_RWLOCK(__ip_vs_rs_lock); - -/* lock for state and timeout tables */ -static DEFINE_RWLOCK(__ip_vs_securetcp_lock); - -/* lock for drop entry handling */ -static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); - -/* lock for drop packet handling */ -static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); - -/* 1/rate drop and drop-entry variables */ -int ip_vs_drop_rate = 0; -int ip_vs_drop_counter = 0; -static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); - -/* number of virtual services */ -static int ip_vs_num_services = 0; - -/* sysctl variables */ -static int sysctl_ip_vs_drop_entry = 0; -static int sysctl_ip_vs_drop_packet = 0; -static int sysctl_ip_vs_secure_tcp = 0; -static int sysctl_ip_vs_amemthresh = 1024; -static int sysctl_ip_vs_am_droprate = 10; -int sysctl_ip_vs_cache_bypass = 0; -int sysctl_ip_vs_expire_nodest_conn = 0; -int sysctl_ip_vs_expire_quiescent_template = 0; -int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; -int sysctl_ip_vs_nat_icmp_send = 0; - - -#ifdef CONFIG_IP_VS_DEBUG -static int sysctl_ip_vs_debug_level = 0; - -int ip_vs_get_debug_level(void) -{ - return sysctl_ip_vs_debug_level; -} -#endif - -/* - * update_defense_level is called from keventd and from sysctl, - * so it needs to protect itself from softirqs - */ -static void update_defense_level(void) -{ - struct sysinfo i; - static int old_secure_tcp = 0; - int availmem; - int nomem; - int to_change = -1; - - /* we only count free and buffered memory (in pages) */ - si_meminfo(&i); - availmem = i.freeram + i.bufferram; - /* however in linux 2.5 the i.bufferram is total page cache size, - we need adjust it */ - /* si_swapinfo(&i); */ - /* availmem = availmem - (i.totalswap - i.freeswap); */ - - nomem = (availmem < sysctl_ip_vs_amemthresh); - - local_bh_disable(); - - /* drop_entry */ - spin_lock(&__ip_vs_dropentry_lock); - switch (sysctl_ip_vs_drop_entry) { - case 0: - atomic_set(&ip_vs_dropentry, 0); - break; - case 1: - if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - sysctl_ip_vs_drop_entry = 2; - } else { - atomic_set(&ip_vs_dropentry, 0); - } - break; - case 2: - if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - } else { - atomic_set(&ip_vs_dropentry, 0); - sysctl_ip_vs_drop_entry = 1; - }; - break; - case 3: - atomic_set(&ip_vs_dropentry, 1); - break; - } - spin_unlock(&__ip_vs_dropentry_lock); - - /* drop_packet */ - spin_lock(&__ip_vs_droppacket_lock); - switch (sysctl_ip_vs_drop_packet) { - case 0: - ip_vs_drop_rate = 0; - break; - case 1: - if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - sysctl_ip_vs_drop_packet = 2; - } else { - ip_vs_drop_rate = 0; - } - break; - case 2: - if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - } else { - ip_vs_drop_rate = 0; - sysctl_ip_vs_drop_packet = 1; - } - break; - case 3: - ip_vs_drop_rate = sysctl_ip_vs_am_droprate; - break; - } - spin_unlock(&__ip_vs_droppacket_lock); - - /* secure_tcp */ - write_lock(&__ip_vs_securetcp_lock); - switch (sysctl_ip_vs_secure_tcp) { - case 0: - if (old_secure_tcp >= 2) - to_change = 0; - break; - case 1: - if (nomem) { - if (old_secure_tcp < 2) - to_change = 1; - sysctl_ip_vs_secure_tcp = 2; - } else { - if (old_secure_tcp >= 2) - to_change = 0; - } - break; - case 2: - if (nomem) { - if (old_secure_tcp < 2) - to_change = 1; - } else { - if (old_secure_tcp >= 2) - to_change = 0; - sysctl_ip_vs_secure_tcp = 1; - } - break; - case 3: - if (old_secure_tcp < 2) - to_change = 1; - break; - } - old_secure_tcp = sysctl_ip_vs_secure_tcp; - if (to_change >= 0) - ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); - write_unlock(&__ip_vs_securetcp_lock); - - local_bh_enable(); -} - - -/* - * Timer for checking the defense - */ -#define DEFENSE_TIMER_PERIOD 1*HZ -static void defense_work_handler(struct work_struct *work); -static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); - -static void defense_work_handler(struct work_struct *work) -{ - update_defense_level(); - if (atomic_read(&ip_vs_dropentry)) - ip_vs_random_dropentry(); - - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); -} - -int -ip_vs_use_count_inc(void) -{ - return try_module_get(THIS_MODULE); -} - -void -ip_vs_use_count_dec(void) -{ - module_put(THIS_MODULE); -} - - -/* - * Hash table: for virtual service lookups - */ -#define IP_VS_SVC_TAB_BITS 8 -#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) -#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) - -/* the service table hashed by <protocol, addr, port> */ -static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; -/* the service table hashed by fwmark */ -static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; - -/* - * Hash table: for real service lookups - */ -#define IP_VS_RTAB_BITS 4 -#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) -#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) - -static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; - -/* - * Trash for destinations - */ -static LIST_HEAD(ip_vs_dest_trash); - -/* - * FTP & NULL virtual service counters - */ -static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); -static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); - - -/* - * Returns hash value for virtual service - */ -static __inline__ unsigned -ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port) -{ - register unsigned porth = ntohs(port); - - return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) - & IP_VS_SVC_TAB_MASK; -} - -/* - * Returns hash value of fwmark for virtual service lookup - */ -static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) -{ - return fwmark & IP_VS_SVC_TAB_MASK; -} - -/* - * Hashes a service in the ip_vs_svc_table by <proto,addr,port> - * or in the ip_vs_svc_fwm_table by fwmark. - * Should be called with locked tables. - */ -static int ip_vs_svc_hash(struct ip_vs_service *svc) -{ - unsigned hash; - - if (svc->flags & IP_VS_SVC_F_HASHED) { - IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - if (svc->fwmark == 0) { - /* - * Hash it by <protocol,addr,port> in ip_vs_svc_table - */ - hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); - list_add(&svc->s_list, &ip_vs_svc_table[hash]); - } else { - /* - * Hash it by fwmark in ip_vs_svc_fwm_table - */ - hash = ip_vs_svc_fwm_hashkey(svc->fwmark); - list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); - } - - svc->flags |= IP_VS_SVC_F_HASHED; - /* increase its refcnt because it is referenced by the svc table */ - atomic_inc(&svc->refcnt); - return 1; -} - - -/* - * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. - * Should be called with locked tables. - */ -static int ip_vs_svc_unhash(struct ip_vs_service *svc) -{ - if (!(svc->flags & IP_VS_SVC_F_HASHED)) { - IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - if (svc->fwmark == 0) { - /* Remove it from the ip_vs_svc_table table */ - list_del(&svc->s_list); - } else { - /* Remove it from the ip_vs_svc_fwm_table table */ - list_del(&svc->f_list); - } - - svc->flags &= ~IP_VS_SVC_F_HASHED; - atomic_dec(&svc->refcnt); - return 1; -} - - -/* - * Get service by {proto,addr,port} in the service table. - */ -static __inline__ struct ip_vs_service * -__ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport) -{ - unsigned hash; - struct ip_vs_service *svc; - - /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(protocol, vaddr, vport); - - list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ - if ((svc->addr == vaddr) - && (svc->port == vport) - && (svc->protocol == protocol)) { - /* HIT */ - atomic_inc(&svc->usecnt); - return svc; - } - } - - return NULL; -} - - -/* - * Get service by {fwmark} in the service table. - */ -static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) -{ - unsigned hash; - struct ip_vs_service *svc; - - /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(fwmark); - - list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { - if (svc->fwmark == fwmark) { - /* HIT */ - atomic_inc(&svc->usecnt); - return svc; - } - } - - return NULL; -} - -struct ip_vs_service * -ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) -{ - struct ip_vs_service *svc; - - read_lock(&__ip_vs_svc_lock); - - /* - * Check the table hashed by fwmark first - */ - if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark))) - goto out; - - /* - * Check the table hashed by <protocol,addr,port> - * for "full" addressed entries - */ - svc = __ip_vs_service_get(protocol, vaddr, vport); - - if (svc == NULL - && protocol == IPPROTO_TCP - && atomic_read(&ip_vs_ftpsvc_counter) - && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { - /* - * Check if ftp service entry exists, the packet - * might belong to FTP data connections. - */ - svc = __ip_vs_service_get(protocol, vaddr, FTPPORT); - } - - if (svc == NULL - && atomic_read(&ip_vs_nullsvc_counter)) { - /* - * Check if the catch-all port (port zero) exists - */ - svc = __ip_vs_service_get(protocol, vaddr, 0); - } - - out: - read_unlock(&__ip_vs_svc_lock); - - IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", - fwmark, ip_vs_proto_name(protocol), - NIPQUAD(vaddr), ntohs(vport), - svc?"hit":"not hit"); - - return svc; -} - - -static inline void -__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - atomic_inc(&svc->refcnt); - dest->svc = svc; -} - -static inline void -__ip_vs_unbind_svc(struct ip_vs_dest *dest) -{ - struct ip_vs_service *svc = dest->svc; - - dest->svc = NULL; - if (atomic_dec_and_test(&svc->refcnt)) - kfree(svc); -} - - -/* - * Returns hash value for real service - */ -static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port) -{ - register unsigned porth = ntohs(port); - - return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) - & IP_VS_RTAB_MASK; -} - -/* - * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>. - * should be called with locked tables. - */ -static int ip_vs_rs_hash(struct ip_vs_dest *dest) -{ - unsigned hash; - - if (!list_empty(&dest->d_list)) { - return 0; - } - - /* - * Hash by proto,addr,port, - * which are the parameters of the real service. - */ - hash = ip_vs_rs_hashkey(dest->addr, dest->port); - list_add(&dest->d_list, &ip_vs_rtable[hash]); - - return 1; -} - -/* - * UNhashes ip_vs_dest from ip_vs_rtable. - * should be called with locked tables. - */ -static int ip_vs_rs_unhash(struct ip_vs_dest *dest) -{ - /* - * Remove it from the ip_vs_rtable table. - */ - if (!list_empty(&dest->d_list)) { - list_del(&dest->d_list); - INIT_LIST_HEAD(&dest->d_list); - } - - return 1; -} - -/* - * Lookup real service by <proto,addr,port> in the real service table. - */ -struct ip_vs_dest * -ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport) -{ - unsigned hash; - struct ip_vs_dest *dest; - - /* - * Check for "full" addressed entries - * Return the first found entry - */ - hash = ip_vs_rs_hashkey(daddr, dport); - - read_lock(&__ip_vs_rs_lock); - list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { - if ((dest->addr == daddr) - && (dest->port == dport) - && ((dest->protocol == protocol) || - dest->vfwmark)) { - /* HIT */ - read_unlock(&__ip_vs_rs_lock); - return dest; - } - } - read_unlock(&__ip_vs_rs_lock); - - return NULL; -} - -/* - * Lookup destination by {addr,port} in the given service - */ -static struct ip_vs_dest * -ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) -{ - struct ip_vs_dest *dest; - - /* - * Find the destination for the given service - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->addr == daddr) && (dest->port == dport)) { - /* HIT */ - return dest; - } - } - - return NULL; -} - -/* - * Find destination by {daddr,dport,vaddr,protocol} - * Cretaed to be used in ip_vs_process_message() in - * the backup synchronization daemon. It finds the - * destination to be bound to the received connection - * on the backup. - * - * ip_vs_lookup_real_service() looked promissing, but - * seems not working as expected. - */ -struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, - __be32 vaddr, __be16 vport, __u16 protocol) -{ - struct ip_vs_dest *dest; - struct ip_vs_service *svc; - - svc = ip_vs_service_get(0, protocol, vaddr, vport); - if (!svc) - return NULL; - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest) - atomic_inc(&dest->refcnt); - ip_vs_service_put(svc); - return dest; -} - -/* - * Lookup dest by {svc,addr,port} in the destination trash. - * The destination trash is used to hold the destinations that are removed - * from the service table but are still referenced by some conn entries. - * The reason to add the destination trash is when the dest is temporary - * down (either by administrator or by monitor program), the dest can be - * picked back from the trash, the remaining connections to the dest can - * continue, and the counting information of the dest is also useful for - * scheduling. - */ -static struct ip_vs_dest * -ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) -{ - struct ip_vs_dest *dest, *nxt; - - /* - * Find the destination in trash - */ - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " - "dest->refcnt=%d\n", - dest->vfwmark, - NIPQUAD(dest->addr), ntohs(dest->port), - atomic_read(&dest->refcnt)); - if (dest->addr == daddr && - dest->port == dport && - dest->vfwmark == svc->fwmark && - dest->protocol == svc->protocol && - (svc->fwmark || - (dest->vaddr == svc->addr && - dest->vport == svc->port))) { - /* HIT */ - return dest; - } - - /* - * Try to purge the destination from trash if not referenced - */ - if (atomic_read(&dest->refcnt) == 1) { - IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u " - "from trash\n", - dest->vfwmark, - NIPQUAD(dest->addr), ntohs(dest->port)); - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - kfree(dest); - } - } - - return NULL; -} - - -/* - * Clean up all the destinations in the trash - * Called by the ip_vs_control_cleanup() - * - * When the ip_vs_control_clearup is activated by ipvs module exit, - * the service tables must have been flushed and all the connections - * are expired, and the refcnt of each destination in the trash must - * be 1, so we simply release them here. - */ -static void ip_vs_trash_cleanup(void) -{ - struct ip_vs_dest *dest, *nxt; - - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - kfree(dest); - } -} - - -static void -ip_vs_zero_stats(struct ip_vs_stats *stats) -{ - spin_lock_bh(&stats->lock); - memset(stats, 0, (char *)&stats->lock - (char *)stats); - spin_unlock_bh(&stats->lock); - ip_vs_zero_estimator(stats); -} - -/* - * Update a destination in the given service - */ -static void -__ip_vs_update_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, struct ip_vs_dest_user *udest) -{ - int conn_flags; - - /* set the weight and the flags */ - atomic_set(&dest->weight, udest->weight); - conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; - - /* check if local node and update the flags */ - if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - - /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ - if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { - conn_flags |= IP_VS_CONN_F_NOOUTPUT; - } else { - /* - * Put the real service in ip_vs_rtable if not present. - * For now only for NAT! - */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_hash(dest); - write_unlock_bh(&__ip_vs_rs_lock); - } - atomic_set(&dest->conn_flags, conn_flags); - - /* bind the service */ - if (!dest->svc) { - __ip_vs_bind_svc(dest, svc); - } else { - if (dest->svc != svc) { - __ip_vs_unbind_svc(dest); - ip_vs_zero_stats(&dest->stats); - __ip_vs_bind_svc(dest, svc); - } - } - - /* set the dest status flags */ - dest->flags |= IP_VS_DEST_F_AVAILABLE; - - if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - dest->u_threshold = udest->u_threshold; - dest->l_threshold = udest->l_threshold; -} - - -/* - * Create a destination for the given service - */ -static int -ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, - struct ip_vs_dest **dest_p) -{ - struct ip_vs_dest *dest; - unsigned atype; - - EnterFunction(2); - - atype = inet_addr_type(&init_net, udest->addr); - if (atype != RTN_LOCAL && atype != RTN_UNICAST) - return -EINVAL; - - dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); - if (dest == NULL) { - IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); - return -ENOMEM; - } - - dest->protocol = svc->protocol; - dest->vaddr = svc->addr; - dest->vport = svc->port; - dest->vfwmark = svc->fwmark; - dest->addr = udest->addr; - dest->port = udest->port; - - atomic_set(&dest->activeconns, 0); - atomic_set(&dest->inactconns, 0); - atomic_set(&dest->persistconns, 0); - atomic_set(&dest->refcnt, 0); - - INIT_LIST_HEAD(&dest->d_list); - spin_lock_init(&dest->dst_lock); - spin_lock_init(&dest->stats.lock); - __ip_vs_update_dest(svc, dest, udest); - ip_vs_new_estimator(&dest->stats); - - *dest_p = dest; - - LeaveFunction(2); - return 0; -} - - -/* - * Add a destination into an existing service - */ -static int -ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) -{ - struct ip_vs_dest *dest; - __be32 daddr = udest->addr; - __be16 dport = udest->port; - int ret; - - EnterFunction(2); - - if (udest->weight < 0) { - IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); - return -ERANGE; - } - - if (udest->l_threshold > udest->u_threshold) { - IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " - "upper threshold\n"); - return -ERANGE; - } - - /* - * Check if the dest already exists in the list - */ - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest != NULL) { - IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); - return -EEXIST; - } - - /* - * Check if the dest already exists in the trash and - * is from the same service - */ - dest = ip_vs_trash_get_dest(svc, daddr, dport); - if (dest != NULL) { - IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " - "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", - NIPQUAD(daddr), ntohs(dport), - atomic_read(&dest->refcnt), - dest->vfwmark, - NIPQUAD(dest->vaddr), - ntohs(dest->vport)); - __ip_vs_update_dest(svc, dest, udest); - - /* - * Get the destination from the trash - */ - list_del(&dest->n_list); - - ip_vs_new_estimator(&dest->stats); - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - return 0; - } - - /* - * Allocate and initialize the dest structure - */ - ret = ip_vs_new_dest(svc, udest, &dest); - if (ret) { - return ret; - } - - /* - * Add the dest entry into the list - */ - atomic_inc(&dest->refcnt); - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - LeaveFunction(2); - - return 0; -} - - -/* - * Edit a destination in the given service - */ -static int -ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) -{ - struct ip_vs_dest *dest; - __be32 daddr = udest->addr; - __be16 dport = udest->port; - - EnterFunction(2); - - if (udest->weight < 0) { - IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); - return -ERANGE; - } - - if (udest->l_threshold > udest->u_threshold) { - IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " - "upper threshold\n"); - return -ERANGE; - } - - /* - * Lookup the destination list - */ - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest == NULL) { - IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); - return -ENOENT; - } - - __ip_vs_update_dest(svc, dest, udest); - - write_lock_bh(&__ip_vs_svc_lock); - - /* Wait until all other svc users go away */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* call the update_service, because server weight may be changed */ - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - LeaveFunction(2); - - return 0; -} - - -/* - * Delete a destination (must be already unlinked from the service) - */ -static void __ip_vs_del_dest(struct ip_vs_dest *dest) -{ - ip_vs_kill_estimator(&dest->stats); - - /* - * Remove it from the d-linked list with the real services. - */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_unhash(dest); - write_unlock_bh(&__ip_vs_rs_lock); - - /* - * Decrease the refcnt of the dest, and free the dest - * if nobody refers to it (refcnt=0). Otherwise, throw - * the destination into the trash. - */ - if (atomic_dec_and_test(&dest->refcnt)) { - ip_vs_dst_reset(dest); - /* simply decrease svc->refcnt here, let the caller check - and release the service if nobody refers to it. - Only user context can release destination and service, - and only one user context can update virtual service at a - time, so the operation here is OK */ - atomic_dec(&dest->svc->refcnt); - kfree(dest); - } else { - IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " - "dest->refcnt=%d\n", - NIPQUAD(dest->addr), ntohs(dest->port), - atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ip_vs_dest_trash); - atomic_inc(&dest->refcnt); - } -} - - -/* - * Unlink a destination from the given service - */ -static void __ip_vs_unlink_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, - int svcupd) -{ - dest->flags &= ~IP_VS_DEST_F_AVAILABLE; - - /* - * Remove it from the d-linked destination list. - */ - list_del(&dest->n_list); - svc->num_dests--; - if (svcupd) { - /* - * Call the update_service function of its scheduler - */ - svc->scheduler->update_service(svc); - } -} - - -/* - * Delete a destination server in the given service - */ -static int -ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest) -{ - struct ip_vs_dest *dest; - __be32 daddr = udest->addr; - __be16 dport = udest->port; - - EnterFunction(2); - - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest == NULL) { - IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); - return -ENOENT; - } - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* - * Unlink dest from the service - */ - __ip_vs_unlink_dest(svc, dest, 1); - - write_unlock_bh(&__ip_vs_svc_lock); - - /* - * Delete the destination - */ - __ip_vs_del_dest(dest); - - LeaveFunction(2); - - return 0; -} - - -/* - * Add a service into the service hash table - */ -static int -ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) -{ - int ret = 0; - struct ip_vs_scheduler *sched = NULL; - struct ip_vs_service *svc = NULL; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - /* Lookup the scheduler by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - IP_VS_INFO("Scheduler module ip_vs_%s not found\n", - u->sched_name); - ret = -ENOENT; - goto out_mod_dec; - } - - svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); - if (svc == NULL) { - IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); - ret = -ENOMEM; - goto out_err; - } - - /* I'm the first user of the service */ - atomic_set(&svc->usecnt, 1); - atomic_set(&svc->refcnt, 0); - - svc->protocol = u->protocol; - svc->addr = u->addr; - svc->port = u->port; - svc->fwmark = u->fwmark; - svc->flags = u->flags; - svc->timeout = u->timeout * HZ; - svc->netmask = u->netmask; - - INIT_LIST_HEAD(&svc->destinations); - rwlock_init(&svc->sched_lock); - spin_lock_init(&svc->stats.lock); - - /* Bind the scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) - goto out_err; - sched = NULL; - - /* Update the virtual service counters */ - if (svc->port == FTPPORT) - atomic_inc(&ip_vs_ftpsvc_counter); - else if (svc->port == 0) - atomic_inc(&ip_vs_nullsvc_counter); - - ip_vs_new_estimator(&svc->stats); - ip_vs_num_services++; - - /* Hash the service into the service table */ - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_hash(svc); - write_unlock_bh(&__ip_vs_svc_lock); - - *svc_p = svc; - return 0; - - out_err: - if (svc != NULL) { - if (svc->scheduler) - ip_vs_unbind_scheduler(svc); - if (svc->inc) { - local_bh_disable(); - ip_vs_app_inc_put(svc->inc); - local_bh_enable(); - } - kfree(svc); - } - ip_vs_scheduler_put(sched); - - out_mod_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return ret; -} - - -/* - * Edit a service and bind it with a new scheduler - */ -static int -ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u) -{ - struct ip_vs_scheduler *sched, *old_sched; - int ret = 0; - - /* - * Lookup the scheduler, by 'u->sched_name' - */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - IP_VS_INFO("Scheduler module ip_vs_%s not found\n", - u->sched_name); - return -ENOENT; - } - old_sched = sched; - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* - * Set the flags and timeout value - */ - svc->flags = u->flags | IP_VS_SVC_F_HASHED; - svc->timeout = u->timeout * HZ; - svc->netmask = u->netmask; - - old_sched = svc->scheduler; - if (sched != old_sched) { - /* - * Unbind the old scheduler - */ - if ((ret = ip_vs_unbind_scheduler(svc))) { - old_sched = sched; - goto out; - } - - /* - * Bind the new scheduler - */ - if ((ret = ip_vs_bind_scheduler(svc, sched))) { - /* - * If ip_vs_bind_scheduler fails, restore the old - * scheduler. - * The main reason of failure is out of memory. - * - * The question is if the old scheduler can be - * restored all the time. TODO: if it cannot be - * restored some time, we must delete the service, - * otherwise the system may crash. - */ - ip_vs_bind_scheduler(svc, old_sched); - old_sched = sched; - goto out; - } - } - - out: - write_unlock_bh(&__ip_vs_svc_lock); - - if (old_sched) - ip_vs_scheduler_put(old_sched); - - return ret; -} - - -/* - * Delete a service from the service list - * - The service must be unlinked, unlocked and not referenced! - * - We are called under _bh lock - */ -static void __ip_vs_del_service(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest, *nxt; - struct ip_vs_scheduler *old_sched; - - ip_vs_num_services--; - ip_vs_kill_estimator(&svc->stats); - - /* Unbind scheduler */ - old_sched = svc->scheduler; - ip_vs_unbind_scheduler(svc); - if (old_sched) - ip_vs_scheduler_put(old_sched); - - /* Unbind app inc */ - if (svc->inc) { - ip_vs_app_inc_put(svc->inc); - svc->inc = NULL; - } - - /* - * Unlink the whole destination list - */ - list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { - __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(dest); - } - - /* - * Update the virtual service counters - */ - if (svc->port == FTPPORT) - atomic_dec(&ip_vs_ftpsvc_counter); - else if (svc->port == 0) - atomic_dec(&ip_vs_nullsvc_counter); - - /* - * Free the service if nobody refers to it - */ - if (atomic_read(&svc->refcnt) == 0) - kfree(svc); - - /* decrease the module use count */ - ip_vs_use_count_dec(); -} - -/* - * Delete a service from the service list - */ -static int ip_vs_del_service(struct ip_vs_service *svc) -{ - if (svc == NULL) - return -EEXIST; - - /* - * Unhash it from the service table - */ - write_lock_bh(&__ip_vs_svc_lock); - - ip_vs_svc_unhash(svc); - - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - __ip_vs_del_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - return 0; -} - - -/* - * Flush all the virtual services - */ -static int ip_vs_flush(void) -{ - int idx; - struct ip_vs_service *svc, *nxt; - - /* - * Flush the service table hashed by <protocol,addr,port> - */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); - } - } - - /* - * Flush the service table hashed by fwmark - */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, - &ip_vs_svc_fwm_table[idx], f_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); - } - } - - return 0; -} - - -/* - * Zero counters in a service or all services - */ -static int ip_vs_zero_service(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - - write_lock_bh(&__ip_vs_svc_lock); - list_for_each_entry(dest, &svc->destinations, n_list) { - ip_vs_zero_stats(&dest->stats); - } - ip_vs_zero_stats(&svc->stats); - write_unlock_bh(&__ip_vs_svc_lock); - return 0; -} - -static int ip_vs_zero_all(void) -{ - int idx; - struct ip_vs_service *svc; - - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - ip_vs_zero_service(svc); - } - } - - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - ip_vs_zero_service(svc); - } - } - - ip_vs_zero_stats(&ip_vs_stats); - return 0; -} - - -static int -proc_do_defense_mode(ctl_table *table, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int *valp = table->data; - int val = *valp; - int rc; - - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); - if (write && (*valp != val)) { - if ((*valp < 0) || (*valp > 3)) { - /* Restore the correct value */ - *valp = val; - } else { - update_defense_level(); - } - } - return rc; -} - - -static int -proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int *valp = table->data; - int val[2]; - int rc; - - /* backup the value first */ - memcpy(val, valp, sizeof(val)); - - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); - if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { - /* Restore the correct value */ - memcpy(valp, val, sizeof(val)); - } - return rc; -} - - -/* - * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) - */ - -static struct ctl_table vs_vars[] = { - { - .procname = "amemthresh", - .data = &sysctl_ip_vs_amemthresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#ifdef CONFIG_IP_VS_DEBUG - { - .procname = "debug_level", - .data = &sysctl_ip_vs_debug_level, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .procname = "am_droprate", - .data = &sysctl_ip_vs_am_droprate, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "drop_entry", - .data = &sysctl_ip_vs_drop_entry, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, - { - .procname = "drop_packet", - .data = &sysctl_ip_vs_drop_packet, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, - { - .procname = "secure_tcp", - .data = &sysctl_ip_vs_secure_tcp, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, -#if 0 - { - .procname = "timeout_established", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synsent", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synrecv", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_finwait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_timewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_close", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_closewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_lastack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_listen", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_udp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_icmp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, -#endif - { - .procname = "cache_bypass", - .data = &sysctl_ip_vs_cache_bypass, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "expire_nodest_conn", - .data = &sysctl_ip_vs_expire_nodest_conn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "expire_quiescent_template", - .data = &sysctl_ip_vs_expire_quiescent_template, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "sync_threshold", - .data = &sysctl_ip_vs_sync_threshold, - .maxlen = sizeof(sysctl_ip_vs_sync_threshold), - .mode = 0644, - .proc_handler = &proc_do_sync_threshold, - }, - { - .procname = "nat_icmp_send", - .data = &sysctl_ip_vs_nat_icmp_send, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { .ctl_name = 0 } -}; - -struct ctl_path net_vs_ctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, - { .procname = "vs", }, - { } -}; -EXPORT_SYMBOL_GPL(net_vs_ctl_path); - -static struct ctl_table_header * sysctl_header; - -#ifdef CONFIG_PROC_FS - -struct ip_vs_iter { - struct list_head *table; - int bucket; -}; - -/* - * Write the contents of the VS rule table to a PROCfs file. - * (It is kept just for backward compatibility) - */ -static inline const char *ip_vs_fwd_name(unsigned flags) -{ - switch (flags & IP_VS_CONN_F_FWD_MASK) { - case IP_VS_CONN_F_LOCALNODE: - return "Local"; - case IP_VS_CONN_F_TUNNEL: - return "Tunnel"; - case IP_VS_CONN_F_DROUTE: - return "Route"; - default: - return "Masq"; - } -} - - -/* Get the Nth entry in the two lists */ -static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) -{ - struct ip_vs_iter *iter = seq->private; - int idx; - struct ip_vs_service *svc; - - /* look in hash by protocol */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (pos-- == 0){ - iter->table = ip_vs_svc_table; - iter->bucket = idx; - return svc; - } - } - } - - /* keep looking in fwmark */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (pos-- == 0) { - iter->table = ip_vs_svc_fwm_table; - iter->bucket = idx; - return svc; - } - } - } - - return NULL; -} - -static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) -{ - - read_lock_bh(&__ip_vs_svc_lock); - return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; -} - - -static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct list_head *e; - struct ip_vs_iter *iter; - struct ip_vs_service *svc; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_info_array(seq,0); - - svc = v; - iter = seq->private; - - if (iter->table == ip_vs_svc_table) { - /* next service in table hashed by protocol */ - if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, s_list); - - - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], - s_list) { - return svc; - } - } - - iter->table = ip_vs_svc_fwm_table; - iter->bucket = -1; - goto scan_fwmark; - } - - /* next service in hashed by fwmark */ - if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, f_list); - - scan_fwmark: - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], - f_list) - return svc; - } - - return NULL; -} - -static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) -{ - read_unlock_bh(&__ip_vs_svc_lock); -} - - -static int ip_vs_info_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) { - seq_printf(seq, - "IP Virtual Server version %d.%d.%d (size=%d)\n", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); - seq_puts(seq, - "Prot LocalAddress:Port Scheduler Flags\n"); - seq_puts(seq, - " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); - } else { - const struct ip_vs_service *svc = v; - const struct ip_vs_iter *iter = seq->private; - const struct ip_vs_dest *dest; - - if (iter->table == ip_vs_svc_table) - seq_printf(seq, "%s %08X:%04X %s ", - ip_vs_proto_name(svc->protocol), - ntohl(svc->addr), - ntohs(svc->port), - svc->scheduler->name); - else - seq_printf(seq, "FWM %08X %s ", - svc->fwmark, svc->scheduler->name); - - if (svc->flags & IP_VS_SVC_F_PERSISTENT) - seq_printf(seq, "persistent %d %08X\n", - svc->timeout, - ntohl(svc->netmask)); - else - seq_putc(seq, '\n'); - - list_for_each_entry(dest, &svc->destinations, n_list) { - seq_printf(seq, - " -> %08X:%04X %-7s %-6d %-10d %-10d\n", - ntohl(dest->addr), ntohs(dest->port), - ip_vs_fwd_name(atomic_read(&dest->conn_flags)), - atomic_read(&dest->weight), - atomic_read(&dest->activeconns), - atomic_read(&dest->inactconns)); - } - } - return 0; -} - -static const struct seq_operations ip_vs_info_seq_ops = { - .start = ip_vs_info_seq_start, - .next = ip_vs_info_seq_next, - .stop = ip_vs_info_seq_stop, - .show = ip_vs_info_seq_show, -}; - -static int ip_vs_info_open(struct inode *inode, struct file *file) -{ - return seq_open_private(file, &ip_vs_info_seq_ops, - sizeof(struct ip_vs_iter)); -} - -static const struct file_operations ip_vs_info_fops = { - .owner = THIS_MODULE, - .open = ip_vs_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -#endif - -struct ip_vs_stats ip_vs_stats; - -#ifdef CONFIG_PROC_FS -static int ip_vs_stats_show(struct seq_file *seq, void *v) -{ - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ - seq_puts(seq, - " Total Incoming Outgoing Incoming Outgoing\n"); - seq_printf(seq, - " Conns Packets Packets Bytes Bytes\n"); - - spin_lock_bh(&ip_vs_stats.lock); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns, - ip_vs_stats.inpkts, ip_vs_stats.outpkts, - (unsigned long long) ip_vs_stats.inbytes, - (unsigned long long) ip_vs_stats.outbytes); - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ - seq_puts(seq, - " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq,"%8X %8X %8X %16X %16X\n", - ip_vs_stats.cps, - ip_vs_stats.inpps, - ip_vs_stats.outpps, - ip_vs_stats.inbps, - ip_vs_stats.outbps); - spin_unlock_bh(&ip_vs_stats.lock); - - return 0; -} - -static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) -{ - return single_open(file, ip_vs_stats_show, NULL); -} - -static const struct file_operations ip_vs_stats_fops = { - .owner = THIS_MODULE, - .open = ip_vs_stats_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -#endif - -/* - * Set timeout values for tcp tcpfin udp in the timeout_table. - */ -static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) -{ - IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", - u->tcp_timeout, - u->tcp_fin_timeout, - u->udp_timeout); - -#ifdef CONFIG_IP_VS_PROTO_TCP - if (u->tcp_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] - = u->tcp_timeout * HZ; - } - - if (u->tcp_fin_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] - = u->tcp_fin_timeout * HZ; - } -#endif - -#ifdef CONFIG_IP_VS_PROTO_UDP - if (u->udp_timeout) { - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] - = u->udp_timeout * HZ; - } -#endif - return 0; -} - - -#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) -#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ - sizeof(struct ip_vs_dest_user)) -#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) -#define MAX_ARG_LEN SVCDEST_ARG_LEN - -static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { - [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, - [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, -}; - -static int -do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) -{ - int ret; - unsigned char arg[MAX_ARG_LEN]; - struct ip_vs_service_user *usvc; - struct ip_vs_service *svc; - struct ip_vs_dest_user *udest; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (len != set_arglen[SET_CMDID(cmd)]) { - IP_VS_ERR("set_ctl: len %u != %u\n", - len, set_arglen[SET_CMDID(cmd)]); - return -EINVAL; - } - - if (copy_from_user(arg, user, len) != 0) - return -EFAULT; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - if (mutex_lock_interruptible(&__ip_vs_mutex)) { - ret = -ERESTARTSYS; - goto out_dec; - } - - if (cmd == IP_VS_SO_SET_FLUSH) { - /* Flush the virtual service */ - ret = ip_vs_flush(); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_TIMEOUT) { - /* Set timeout values for (tcp tcpfin udp) */ - ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = stop_sync_thread(dm->state); - goto out_unlock; - } - - usvc = (struct ip_vs_service_user *)arg; - udest = (struct ip_vs_dest_user *)(usvc + 1); - - if (cmd == IP_VS_SO_SET_ZERO) { - /* if no service address is set, zero counters in all */ - if (!usvc->fwmark && !usvc->addr && !usvc->port) { - ret = ip_vs_zero_all(); - goto out_unlock; - } - } - - /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ - if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) { - IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", - usvc->protocol, NIPQUAD(usvc->addr), - ntohs(usvc->port), usvc->sched_name); - ret = -EFAULT; - goto out_unlock; - } - - /* Lookup the exact service by <protocol, addr, port> or fwmark */ - if (usvc->fwmark == 0) - svc = __ip_vs_service_get(usvc->protocol, - usvc->addr, usvc->port); - else - svc = __ip_vs_svc_fwm_get(usvc->fwmark); - - if (cmd != IP_VS_SO_SET_ADD - && (svc == NULL || svc->protocol != usvc->protocol)) { - ret = -ESRCH; - goto out_unlock; - } - - switch (cmd) { - case IP_VS_SO_SET_ADD: - if (svc != NULL) - ret = -EEXIST; - else - ret = ip_vs_add_service(usvc, &svc); - break; - case IP_VS_SO_SET_EDIT: - ret = ip_vs_edit_service(svc, usvc); - break; - case IP_VS_SO_SET_DEL: - ret = ip_vs_del_service(svc); - if (!ret) - goto out_unlock; - break; - case IP_VS_SO_SET_ZERO: - ret = ip_vs_zero_service(svc); - break; - case IP_VS_SO_SET_ADDDEST: - ret = ip_vs_add_dest(svc, udest); - break; - case IP_VS_SO_SET_EDITDEST: - ret = ip_vs_edit_dest(svc, udest); - break; - case IP_VS_SO_SET_DELDEST: - ret = ip_vs_del_dest(svc, udest); - break; - default: - ret = -EINVAL; - } - - if (svc) - ip_vs_service_put(svc); - - out_unlock: - mutex_unlock(&__ip_vs_mutex); - out_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return ret; -} - - -static void -ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) -{ - spin_lock_bh(&src->lock); - memcpy(dst, src, (char*)&src->lock - (char*)src); - spin_unlock_bh(&src->lock); -} - -static void -ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) -{ - dst->protocol = src->protocol; - dst->addr = src->addr; - dst->port = src->port; - dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); - dst->flags = src->flags; - dst->timeout = src->timeout / HZ; - dst->netmask = src->netmask; - dst->num_dests = src->num_dests; - ip_vs_copy_stats(&dst->stats, &src->stats); -} - -static inline int -__ip_vs_get_service_entries(const struct ip_vs_get_services *get, - struct ip_vs_get_services __user *uptr) -{ - int idx, count=0; - struct ip_vs_service *svc; - struct ip_vs_service_entry entry; - int ret = 0; - - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (count >= get->num_services) - goto out; - memset(&entry, 0, sizeof(entry)); - ip_vs_copy_service(&entry, svc); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - goto out; - } - count++; - } - } - - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (count >= get->num_services) - goto out; - memset(&entry, 0, sizeof(entry)); - ip_vs_copy_service(&entry, svc); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - goto out; - } - count++; - } - } - out: - return ret; -} - -static inline int -__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, - struct ip_vs_get_dests __user *uptr) -{ - struct ip_vs_service *svc; - int ret = 0; - - if (get->fwmark) - svc = __ip_vs_svc_fwm_get(get->fwmark); - else - svc = __ip_vs_service_get(get->protocol, - get->addr, get->port); - if (svc) { - int count = 0; - struct ip_vs_dest *dest; - struct ip_vs_dest_entry entry; - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (count >= get->num_dests) - break; - - entry.addr = dest->addr; - entry.port = dest->port; - entry.conn_flags = atomic_read(&dest->conn_flags); - entry.weight = atomic_read(&dest->weight); - entry.u_threshold = dest->u_threshold; - entry.l_threshold = dest->l_threshold; - entry.activeconns = atomic_read(&dest->activeconns); - entry.inactconns = atomic_read(&dest->inactconns); - entry.persistconns = atomic_read(&dest->persistconns); - ip_vs_copy_stats(&entry.stats, &dest->stats); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - break; - } - count++; - } - ip_vs_service_put(svc); - } else - ret = -ESRCH; - return ret; -} - -static inline void -__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) -{ -#ifdef CONFIG_IP_VS_PROTO_TCP - u->tcp_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; - u->tcp_fin_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - u->udp_timeout = - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; -#endif -} - - -#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) -#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) -#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) -#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) -#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) - -static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { - [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, - [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, -}; - -static int -do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -{ - unsigned char arg[128]; - int ret = 0; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (*len < get_arglen[GET_CMDID(cmd)]) { - IP_VS_ERR("get_ctl: len %u < %u\n", - *len, get_arglen[GET_CMDID(cmd)]); - return -EINVAL; - } - - if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) - return -EFAULT; - - if (mutex_lock_interruptible(&__ip_vs_mutex)) - return -ERESTARTSYS; - - switch (cmd) { - case IP_VS_SO_GET_VERSION: - { - char buf[64]; - - sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); - if (copy_to_user(user, buf, strlen(buf)+1) != 0) { - ret = -EFAULT; - goto out; - } - *len = strlen(buf)+1; - } - break; - - case IP_VS_SO_GET_INFO: - { - struct ip_vs_getinfo info; - info.version = IP_VS_VERSION_CODE; - info.size = IP_VS_CONN_TAB_SIZE; - info.num_services = ip_vs_num_services; - if (copy_to_user(user, &info, sizeof(info)) != 0) - ret = -EFAULT; - } - break; - - case IP_VS_SO_GET_SERVICES: - { - struct ip_vs_get_services *get; - int size; - - get = (struct ip_vs_get_services *)arg; - size = sizeof(*get) + - sizeof(struct ip_vs_service_entry) * get->num_services; - if (*len != size) { - IP_VS_ERR("length: %u != %u\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_service_entries(get, user); - } - break; - - case IP_VS_SO_GET_SERVICE: - { - struct ip_vs_service_entry *entry; - struct ip_vs_service *svc; - - entry = (struct ip_vs_service_entry *)arg; - if (entry->fwmark) - svc = __ip_vs_svc_fwm_get(entry->fwmark); - else - svc = __ip_vs_service_get(entry->protocol, - entry->addr, entry->port); - if (svc) { - ip_vs_copy_service(entry, svc); - if (copy_to_user(user, entry, sizeof(*entry)) != 0) - ret = -EFAULT; - ip_vs_service_put(svc); - } else - ret = -ESRCH; - } - break; - - case IP_VS_SO_GET_DESTS: - { - struct ip_vs_get_dests *get; - int size; - - get = (struct ip_vs_get_dests *)arg; - size = sizeof(*get) + - sizeof(struct ip_vs_dest_entry) * get->num_dests; - if (*len != size) { - IP_VS_ERR("length: %u != %u\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_dest_entries(get, user); - } - break; - - case IP_VS_SO_GET_TIMEOUT: - { - struct ip_vs_timeout_user t; - - __ip_vs_get_timeouts(&t); - if (copy_to_user(user, &t, sizeof(t)) != 0) - ret = -EFAULT; - } - break; - - case IP_VS_SO_GET_DAEMON: - { - struct ip_vs_daemon_user d[2]; - - memset(&d, 0, sizeof(d)); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) { - d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ip_vs_master_syncid; - } - if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { - d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ip_vs_backup_syncid; - } - if (copy_to_user(user, &d, sizeof(d)) != 0) - ret = -EFAULT; - } - break; - - default: - ret = -EINVAL; - } - - out: - mutex_unlock(&__ip_vs_mutex); - return ret; -} - - -static struct nf_sockopt_ops ip_vs_sockopts = { - .pf = PF_INET, - .set_optmin = IP_VS_BASE_CTL, - .set_optmax = IP_VS_SO_SET_MAX+1, - .set = do_ip_vs_set_ctl, - .get_optmin = IP_VS_BASE_CTL, - .get_optmax = IP_VS_SO_GET_MAX+1, - .get = do_ip_vs_get_ctl, - .owner = THIS_MODULE, -}; - - -int ip_vs_control_init(void) -{ - int ret; - int idx; - - EnterFunction(2); - - ret = nf_register_sockopt(&ip_vs_sockopts); - if (ret) { - IP_VS_ERR("cannot register sockopt.\n"); - return ret; - } - - proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); - - /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_rtable[idx]); - } - - memset(&ip_vs_stats, 0, sizeof(ip_vs_stats)); - spin_lock_init(&ip_vs_stats.lock); - ip_vs_new_estimator(&ip_vs_stats); - - /* Hook the defense timer */ - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); - - LeaveFunction(2); - return 0; -} - - -void ip_vs_control_cleanup(void) -{ - EnterFunction(2); - ip_vs_trash_cleanup(); - cancel_rearming_delayed_work(&defense_work); - cancel_work_sync(&defense_work.work); - ip_vs_kill_estimator(&ip_vs_stats); - unregister_sysctl_table(sysctl_header); - proc_net_remove(&init_net, "ip_vs_stats"); - proc_net_remove(&init_net, "ip_vs"); - nf_unregister_sockopt(&ip_vs_sockopts); - LeaveFunction(2); -} diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c deleted file mode 100644 index dcf5d46aaa5e..000000000000 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ /dev/null @@ -1,260 +0,0 @@ -/* - * IPVS: Destination Hashing scheduling module - * - * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@gnuchina.org> - * - * Inspired by the consistent hashing scheduler patch from - * Thomas Proell <proellt@gmx.de> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The dh algorithm is to select server by the hash key of destination IP - * address. The pseudo code is as follows: - * - * n <- servernode[dest_ip]; - * if (n is dead) OR - * (n is overloaded) OR (n.weight <= 0) then - * return NULL; - * - * return n; - * - * Notes that servernode is a 256-bucket hash table that maps the hash - * index derived from packet destination IP address to the current server - * array. If the dh scheduler is used in cache cluster, it is good to - * combine it with cache_bypass feature. When the statically assigned - * server is dead or overloaded, the load balancer can bypass the cache - * server and send requests to the original server directly. - * - */ - -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> - -#include <net/ip_vs.h> - - -/* - * IPVS DH bucket - */ -struct ip_vs_dh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ -}; - -/* - * for IPVS DH entry hash table - */ -#ifndef CONFIG_IP_VS_DH_TAB_BITS -#define CONFIG_IP_VS_DH_TAB_BITS 8 -#endif -#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS -#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) -#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) - - -/* - * Returns hash value for IPVS DH entry - */ -static inline unsigned ip_vs_dh_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK; -} - - -/* - * Get ip_vs_dest associated with supplied parameters. - */ -static inline struct ip_vs_dest * -ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr) -{ - return (tbl[ip_vs_dh_hashkey(addr)]).dest; -} - - -/* - * Assign all the hash buckets of the specified table with the service. - */ -static int -ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) -{ - int i; - struct ip_vs_dh_bucket *b; - struct list_head *p; - struct ip_vs_dest *dest; - - b = tbl; - p = &svc->destinations; - for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { - if (p == &svc->destinations) - p = p->next; - - dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; - - p = p->next; - } - b++; - } - return 0; -} - - -/* - * Flush all the hash buckets of the specified table. - */ -static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) -{ - int i; - struct ip_vs_dh_bucket *b; - - b = tbl; - for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; - } - b++; - } -} - - -static int ip_vs_dh_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_dh_bucket *tbl; - - /* allocate the DH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); - - return 0; -} - - -static int ip_vs_dh_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - return 0; -} - - -static int ip_vs_dh_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); - - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); - - return 0; -} - - -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. - */ -static inline int is_overloaded(struct ip_vs_dest *dest) -{ - return dest->flags & IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Destination hashing scheduling - */ -static struct ip_vs_dest * -ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_dh_bucket *tbl; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_dh_bucket *)svc->sched_data; - dest = ip_vs_dh_get(tbl, iph->daddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { - return NULL; - } - - IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->daddr), - NIPQUAD(dest->addr), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS DH Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_dh_scheduler = -{ - .name = "dh", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .init_service = ip_vs_dh_init_svc, - .done_service = ip_vs_dh_done_svc, - .update_service = ip_vs_dh_update_svc, - .schedule = ip_vs_dh_schedule, -}; - - -static int __init ip_vs_dh_init(void) -{ - INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list); - return register_ip_vs_scheduler(&ip_vs_dh_scheduler); -} - - -static void __exit ip_vs_dh_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); -} - - -module_init(ip_vs_dh_init); -module_exit(ip_vs_dh_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c deleted file mode 100644 index dfa0d713c801..000000000000 --- a/net/ipv4/ipvs/ip_vs_est.c +++ /dev/null @@ -1,202 +0,0 @@ -/* - * ip_vs_est.c: simple rate estimator for IPVS - * - * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ -#include <linux/kernel.h> -#include <linux/jiffies.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/interrupt.h> -#include <linux/sysctl.h> - -#include <net/ip_vs.h> - -/* - This code is to estimate rate in a shorter interval (such as 8 - seconds) for virtual services and real servers. For measure rate in a - long interval, it is easy to implement a user level daemon which - periodically reads those statistical counters and measure rate. - - Currently, the measurement is activated by slow timer handler. Hope - this measurement will not introduce too much load. - - We measure rate during the last 8 seconds every 2 seconds: - - avgrate = avgrate*(1-W) + rate*W - - where W = 2^(-2) - - NOTES. - - * The stored value for average bps is scaled by 2^5, so that maximal - rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. - - * A lot code is taken from net/sched/estimator.c - */ - - -struct ip_vs_estimator -{ - struct ip_vs_estimator *next; - struct ip_vs_stats *stats; - - u32 last_conns; - u32 last_inpkts; - u32 last_outpkts; - u64 last_inbytes; - u64 last_outbytes; - - u32 cps; - u32 inpps; - u32 outpps; - u32 inbps; - u32 outbps; -}; - - -static struct ip_vs_estimator *est_list = NULL; -static DEFINE_RWLOCK(est_lock); -static struct timer_list est_timer; - -static void estimation_timer(unsigned long arg) -{ - struct ip_vs_estimator *e; - struct ip_vs_stats *s; - u32 n_conns; - u32 n_inpkts, n_outpkts; - u64 n_inbytes, n_outbytes; - u32 rate; - - read_lock(&est_lock); - for (e = est_list; e; e = e->next) { - s = e->stats; - - spin_lock(&s->lock); - n_conns = s->conns; - n_inpkts = s->inpkts; - n_outpkts = s->outpkts; - n_inbytes = s->inbytes; - n_outbytes = s->outbytes; - - /* scaled by 2^10, but divided 2 seconds */ - rate = (n_conns - e->last_conns)<<9; - e->last_conns = n_conns; - e->cps += ((long)rate - (long)e->cps)>>2; - s->cps = (e->cps+0x1FF)>>10; - - rate = (n_inpkts - e->last_inpkts)<<9; - e->last_inpkts = n_inpkts; - e->inpps += ((long)rate - (long)e->inpps)>>2; - s->inpps = (e->inpps+0x1FF)>>10; - - rate = (n_outpkts - e->last_outpkts)<<9; - e->last_outpkts = n_outpkts; - e->outpps += ((long)rate - (long)e->outpps)>>2; - s->outpps = (e->outpps+0x1FF)>>10; - - rate = (n_inbytes - e->last_inbytes)<<4; - e->last_inbytes = n_inbytes; - e->inbps += ((long)rate - (long)e->inbps)>>2; - s->inbps = (e->inbps+0xF)>>5; - - rate = (n_outbytes - e->last_outbytes)<<4; - e->last_outbytes = n_outbytes; - e->outbps += ((long)rate - (long)e->outbps)>>2; - s->outbps = (e->outbps+0xF)>>5; - spin_unlock(&s->lock); - } - read_unlock(&est_lock); - mod_timer(&est_timer, jiffies + 2*HZ); -} - -int ip_vs_new_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est; - - est = kzalloc(sizeof(*est), GFP_KERNEL); - if (est == NULL) - return -ENOMEM; - - est->stats = stats; - est->last_conns = stats->conns; - est->cps = stats->cps<<10; - - est->last_inpkts = stats->inpkts; - est->inpps = stats->inpps<<10; - - est->last_outpkts = stats->outpkts; - est->outpps = stats->outpps<<10; - - est->last_inbytes = stats->inbytes; - est->inbps = stats->inbps<<5; - - est->last_outbytes = stats->outbytes; - est->outbps = stats->outbps<<5; - - write_lock_bh(&est_lock); - est->next = est_list; - if (est->next == NULL) { - setup_timer(&est_timer, estimation_timer, 0); - est_timer.expires = jiffies + 2*HZ; - add_timer(&est_timer); - } - est_list = est; - write_unlock_bh(&est_lock); - return 0; -} - -void ip_vs_kill_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est, **pest; - int killed = 0; - - write_lock_bh(&est_lock); - pest = &est_list; - while ((est=*pest) != NULL) { - if (est->stats != stats) { - pest = &est->next; - continue; - } - *pest = est->next; - kfree(est); - killed++; - } - if (killed && est_list == NULL) - del_timer_sync(&est_timer); - write_unlock_bh(&est_lock); -} - -void ip_vs_zero_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *e; - - write_lock_bh(&est_lock); - for (e = est_list; e; e = e->next) { - if (e->stats != stats) - continue; - - /* set counters zero */ - e->last_conns = 0; - e->last_inpkts = 0; - e->last_outpkts = 0; - e->last_inbytes = 0; - e->last_outbytes = 0; - e->cps = 0; - e->inpps = 0; - e->outpps = 0; - e->inbps = 0; - e->outbps = 0; - } - write_unlock_bh(&est_lock); -} diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c deleted file mode 100644 index 59aa166b7678..000000000000 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ /dev/null @@ -1,395 +0,0 @@ -/* - * ip_vs_ftp.c: IPVS ftp application module - * - * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * Changes: - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference - * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. - * - * IP_MASQ_FTP ftp masquerading module - * - * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 - * - * Author: Wouter Gadeyne - * - */ - -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <net/protocol.h> -#include <net/tcp.h> -#include <asm/unaligned.h> - -#include <net/ip_vs.h> - - -#define SERVER_STRING "227 Entering Passive Mode (" -#define CLIENT_STRING "PORT " - - -/* - * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper - * First port is set to the default port. - */ -static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; -module_param_array(ports, ushort, NULL, 0); -MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); - - -/* Dummy variable */ -static int ip_vs_ftp_pasv; - - -static int -ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) -{ - return 0; -} - - -static int -ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) -{ - return 0; -} - - -/* - * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started - * with the "pattern" and terminated with the "term" character. - * <addr,port> is in network order. - */ -static int ip_vs_ftp_get_addrport(char *data, char *data_limit, - const char *pattern, size_t plen, char term, - __be32 *addr, __be16 *port, - char **start, char **end) -{ - unsigned char p[6]; - int i = 0; - - if (data_limit - data < plen) { - /* check if there is partial match */ - if (strnicmp(data, pattern, data_limit - data) == 0) - return -1; - else - return 0; - } - - if (strnicmp(data, pattern, plen) != 0) { - return 0; - } - *start = data + plen; - - for (data = *start; *data != term; data++) { - if (data == data_limit) - return -1; - } - *end = data; - - memset(p, 0, sizeof(p)); - for (data = *start; data != *end; data++) { - if (*data >= '0' && *data <= '9') { - p[i] = p[i]*10 + *data - '0'; - } else if (*data == ',' && i < 5) { - i++; - } else { - /* unexpected character */ - return -1; - } - } - - if (i != 5) - return -1; - - *addr = get_unaligned((__be32 *)p); - *port = get_unaligned((__be16 *)(p + 4)); - return 1; -} - - -/* - * Look at outgoing ftp packets to catch the response to a PASV command - * from the server (inside-to-outside). - * When we see one, we build a connection entry with the client address, - * client port 0 (unknown at the moment), the server address and the - * server port. Mark the current connection entry as a control channel - * of the new entry. All this work is just to make the data connection - * can be scheduled to the right server later. - * - * The outgoing packet should be something like - * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". - * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. - */ -static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) -{ - struct iphdr *iph; - struct tcphdr *th; - char *data, *data_limit; - char *start, *end; - __be32 from; - __be16 port; - struct ip_vs_conn *n_cp; - char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ - unsigned buf_len; - int ret; - - *diff = 0; - - /* Only useful for established sessions */ - if (cp->state != IP_VS_TCP_S_ESTABLISHED) - return 1; - - /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) - return 0; - - if (cp->app_data == &ip_vs_ftp_pasv) { - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - data = (char *)th + (th->doff << 2); - data_limit = skb_tail_pointer(skb); - - if (ip_vs_ftp_get_addrport(data, data_limit, - SERVER_STRING, - sizeof(SERVER_STRING)-1, ')', - &from, &port, - &start, &end) != 1) - return 1; - - IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " - "%u.%u.%u.%u:%d detected\n", - NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0); - - /* - * Now update or create an connection entry for it - */ - n_cp = ip_vs_conn_out_get(iph->protocol, from, port, - cp->caddr, 0); - if (!n_cp) { - n_cp = ip_vs_conn_new(IPPROTO_TCP, - cp->caddr, 0, - cp->vaddr, port, - from, port, - IP_VS_CONN_F_NO_CPORT, - cp->dest); - if (!n_cp) - return 0; - - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } - - /* - * Replace the old passive address with the new one - */ - from = n_cp->vaddr; - port = n_cp->vport; - sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), - (ntohs(port)>>8)&255, ntohs(port)&255); - buf_len = strlen(buf); - - /* - * Calculate required delta-offset to keep TCP happy - */ - *diff = buf_len - (end-start); - - if (*diff == 0) { - /* simply replace it with new passive address */ - memcpy(start, buf, buf_len); - ret = 1; - } else { - ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, - end-start, buf, buf_len); - } - - cp->app_data = NULL; - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - return ret; - } - return 1; -} - - -/* - * Look at incoming ftp packets to catch the PASV/PORT command - * (outside-to-inside). - * - * The incoming packet having the PORT command should be something like - * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". - * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. - * In this case, we create a connection entry using the client address and - * port, so that the active ftp data connection from the server can reach - * the client. - */ -static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) -{ - struct iphdr *iph; - struct tcphdr *th; - char *data, *data_start, *data_limit; - char *start, *end; - __be32 to; - __be16 port; - struct ip_vs_conn *n_cp; - - /* no diff required for incoming packets */ - *diff = 0; - - /* Only useful for established sessions */ - if (cp->state != IP_VS_TCP_S_ESTABLISHED) - return 1; - - /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) - return 0; - - /* - * Detecting whether it is passive - */ - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - - /* Since there may be OPTIONS in the TCP packet and the HLEN is - the length of the header in 32-bit multiples, it is accurate - to calculate data address by th+HLEN*4 */ - data = data_start = (char *)th + (th->doff << 2); - data_limit = skb_tail_pointer(skb); - - while (data <= data_limit - 6) { - if (strnicmp(data, "PASV\r\n", 6) == 0) { - /* Passive mode on */ - IP_VS_DBG(7, "got PASV at %td of %td\n", - data - data_start, - data_limit - data_start); - cp->app_data = &ip_vs_ftp_pasv; - return 1; - } - data++; - } - - /* - * To support virtual FTP server, the scenerio is as follows: - * FTP client ----> Load Balancer ----> FTP server - * First detect the port number in the application data, - * then create a new connection entry for the coming data - * connection. - */ - if (ip_vs_ftp_get_addrport(data_start, data_limit, - CLIENT_STRING, sizeof(CLIENT_STRING)-1, - '\r', &to, &port, - &start, &end) != 1) - return 1; - - IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", - NIPQUAD(to), ntohs(port)); - - /* Passive mode off */ - cp->app_data = NULL; - - /* - * Now update or create a connection entry for it - */ - IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", - ip_vs_proto_name(iph->protocol), - NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0); - - n_cp = ip_vs_conn_in_get(iph->protocol, - to, port, - cp->vaddr, htons(ntohs(cp->vport)-1)); - if (!n_cp) { - n_cp = ip_vs_conn_new(IPPROTO_TCP, - to, port, - cp->vaddr, htons(ntohs(cp->vport)-1), - cp->daddr, htons(ntohs(cp->dport)-1), - 0, - cp->dest); - if (!n_cp) - return 0; - - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } - - /* - * Move tunnel to listen state - */ - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - - return 1; -} - - -static struct ip_vs_app ip_vs_ftp = { - .name = "ftp", - .type = IP_VS_APP_TYPE_FTP, - .protocol = IPPROTO_TCP, - .module = THIS_MODULE, - .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), - .init_conn = ip_vs_ftp_init_conn, - .done_conn = ip_vs_ftp_done_conn, - .bind_conn = NULL, - .unbind_conn = NULL, - .pkt_out = ip_vs_ftp_out, - .pkt_in = ip_vs_ftp_in, -}; - - -/* - * ip_vs_ftp initialization - */ -static int __init ip_vs_ftp_init(void) -{ - int i, ret; - struct ip_vs_app *app = &ip_vs_ftp; - - ret = register_ip_vs_app(app); - if (ret) - return ret; - - for (i=0; i<IP_VS_APP_MAX_PORTS; i++) { - if (!ports[i]) - continue; - ret = register_ip_vs_app_inc(app, app->protocol, ports[i]); - if (ret) - break; - IP_VS_INFO("%s: loaded support on port[%d] = %d\n", - app->name, i, ports[i]); - } - - if (ret) - unregister_ip_vs_app(app); - - return ret; -} - - -/* - * ip_vs_ftp finish. - */ -static void __exit ip_vs_ftp_exit(void) -{ - unregister_ip_vs_app(&ip_vs_ftp); -} - - -module_init(ip_vs_ftp_init); -module_exit(ip_vs_ftp_exit); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c deleted file mode 100644 index 3888642706ad..000000000000 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ /dev/null @@ -1,573 +0,0 @@ -/* - * IPVS: Locality-Based Least-Connection scheduling module - * - * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@gnuchina.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Martin Hamilton : fixed the terrible locking bugs - * *lock(tbl->lock) ==> *lock(&tbl->lock) - * Wensong Zhang : fixed the uninitilized tbl->lock bug - * Wensong Zhang : added doing full expiration check to - * collect stale entries of 24+ hours when - * no partial expire check in a half hour - * Julian Anastasov : replaced del_timer call with del_timer_sync - * to avoid the possible race between timer - * handler and del_timer thread in SMP - * - */ - -/* - * The lblc algorithm is as follows (pseudo code): - * - * if cachenode[dest_ip] is null then - * n, cachenode[dest_ip] <- {weighted least-conn node}; - * else - * n <- cachenode[dest_ip]; - * if (n is dead) OR - * (n.conns>n.weight AND - * there is a node m with m.conns<m.weight/2) then - * n, cachenode[dest_ip] <- {weighted least-conn node}; - * - * return n; - * - * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing - * me to write this module. - */ - -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/jiffies.h> - -/* for sysctl */ -#include <linux/fs.h> -#include <linux/sysctl.h> - -#include <net/ip_vs.h> - - -/* - * It is for garbage collection of stale IPVS lblc entries, - * when the table is full. - */ -#define CHECK_EXPIRE_INTERVAL (60*HZ) -#define ENTRY_TIMEOUT (6*60*HZ) - -/* - * It is for full expiration check. - * When there is no partial expiration check (garbage collection) - * in a half hour, do a full expiration check to collect stale - * entries that haven't been touched for a day. - */ -#define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; - - -/* - * for IPVS lblc entry hash table - */ -#ifndef CONFIG_IP_VS_LBLC_TAB_BITS -#define CONFIG_IP_VS_LBLC_TAB_BITS 10 -#endif -#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS -#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) -#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) - - -/* - * IPVS lblc entry represents an association between destination - * IP address and its destination server - */ -struct ip_vs_lblc_entry { - struct list_head list; - __be32 addr; /* destination IP address */ - struct ip_vs_dest *dest; /* real server (cache) */ - unsigned long lastuse; /* last used time */ -}; - - -/* - * IPVS lblc hash table - */ -struct ip_vs_lblc_table { - rwlock_t lock; /* lock for this table */ - struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ - atomic_t entries; /* number of entries */ - int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ - int rover; /* rover for expire check */ - int counter; /* counter for no expire */ -}; - - -/* - * IPVS LBLC sysctl table - */ - -static ctl_table vs_vars_table[] = { - { - .procname = "lblc_expiration", - .data = &sysctl_ip_vs_lblc_expiration, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header * sysctl_header; - -/* - * new/free a ip_vs_lblc_entry, which is a mapping of a destionation - * IP address to a server. - */ -static inline struct ip_vs_lblc_entry * -ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest) -{ - struct ip_vs_lblc_entry *en; - - en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); - if (en == NULL) { - IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); - return NULL; - } - - INIT_LIST_HEAD(&en->list); - en->addr = daddr; - - atomic_inc(&dest->refcnt); - en->dest = dest; - - return en; -} - - -static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) -{ - list_del(&en->list); - /* - * We don't kfree dest because it is refered either by its service - * or the trash dest list. - */ - atomic_dec(&en->dest->refcnt); - kfree(en); -} - - -/* - * Returns hash value for IPVS LBLC entry - */ -static inline unsigned ip_vs_lblc_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; -} - - -/* - * Hash an entry in the ip_vs_lblc_table. - * returns bool success. - */ -static int -ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) -{ - unsigned hash; - - if (!list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - /* - * Hash by destination IP address - */ - hash = ip_vs_lblc_hashkey(en->addr); - - write_lock(&tbl->lock); - list_add(&en->list, &tbl->bucket[hash]); - atomic_inc(&tbl->entries); - write_unlock(&tbl->lock); - - return 1; -} - - -/* - * Get ip_vs_lblc_entry associated with supplied parameters. - */ -static inline struct ip_vs_lblc_entry * -ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) -{ - unsigned hash; - struct ip_vs_lblc_entry *en; - - hash = ip_vs_lblc_hashkey(addr); - - read_lock(&tbl->lock); - - list_for_each_entry(en, &tbl->bucket[hash], list) { - if (en->addr == addr) { - /* HIT */ - read_unlock(&tbl->lock); - return en; - } - } - - read_unlock(&tbl->lock); - - return NULL; -} - - -/* - * Flush all the entries of the specified table. - */ -static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) -{ - int i; - struct ip_vs_lblc_entry *en, *nxt; - - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - write_lock(&tbl->lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&tbl->lock); - } -} - - -static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) -{ - unsigned long now = jiffies; - int i, j; - struct ip_vs_lblc_entry *en, *nxt; - - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { - j = (j + 1) & IP_VS_LBLC_TAB_MASK; - - write_lock(&tbl->lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, - en->lastuse + sysctl_ip_vs_lblc_expiration)) - continue; - - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&tbl->lock); - } - tbl->rover = j; -} - - -/* - * Periodical timer handler for IPVS lblc table - * It is used to collect stale entries when the number of entries - * exceeds the maximum size of the table. - * - * Fixme: we probably need more complicated algorithm to collect - * entries that have not been used for a long time even - * if the number of entries doesn't exceed the maximum size - * of the table. - * The full expiration check is for this purpose now. - */ -static void ip_vs_lblc_check_expire(unsigned long data) -{ - struct ip_vs_lblc_table *tbl; - unsigned long now = jiffies; - int goal; - int i, j; - struct ip_vs_lblc_entry *en, *nxt; - - tbl = (struct ip_vs_lblc_table *)data; - - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { - /* do full expiration check */ - ip_vs_lblc_full_check(tbl); - tbl->counter = 1; - goto out; - } - - if (atomic_read(&tbl->entries) <= tbl->max_size) { - tbl->counter++; - goto out; - } - - goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; - if (goal > tbl->max_size/2) - goal = tbl->max_size/2; - - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { - j = (j + 1) & IP_VS_LBLC_TAB_MASK; - - write_lock(&tbl->lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) - continue; - - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - goal--; - } - write_unlock(&tbl->lock); - if (goal <= 0) - break; - } - tbl->rover = j; - - out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); -} - - -static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) -{ - int i; - struct ip_vs_lblc_table *tbl; - - /* - * Allocate the ip_vs_lblc_table for this service - */ - tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_lblc_table)); - - /* - * Initialize the hash buckets - */ - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); - } - rwlock_init(&tbl->lock); - tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; - tbl->rover = 0; - tbl->counter = 1; - - /* - * Hook periodic timer for garbage collection - */ - setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, - (unsigned long)tbl); - tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; - add_timer(&tbl->periodic_timer); - - return 0; -} - - -static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_lblc_table *tbl = svc->sched_data; - - /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); - - /* got to clean up table entries here */ - ip_vs_lblc_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_lblc_table)); - - return 0; -} - - -static int ip_vs_lblc_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static inline struct ip_vs_dest * -__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) -{ - struct ip_vs_dest *dest, *least; - int loh, doh; - - /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connection. - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - if (atomic_read(&dest->weight) > 0) { - least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -/* - * If this destination server is overloaded and there is a less loaded - * server, then return true. - */ -static inline int -is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { - struct ip_vs_dest *d; - - list_for_each_entry(d, &svc->destinations, n_list) { - if (atomic_read(&d->activeconns)*2 - < atomic_read(&d->weight)) { - return 1; - } - } - } - return 0; -} - - -/* - * Locality-Based (weighted) Least-Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_lblc_table *tbl; - struct ip_vs_lblc_entry *en; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_lblc_table *)svc->sched_data; - en = ip_vs_lblc_get(tbl, iph->daddr); - if (en == NULL) { - dest = __ip_vs_wlc_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - en = ip_vs_lblc_new(iph->daddr, dest); - if (en == NULL) { - return NULL; - } - ip_vs_lblc_hash(tbl, en); - } else { - dest = en->dest; - if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest, svc)) { - dest = __ip_vs_wlc_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - atomic_dec(&en->dest->refcnt); - atomic_inc(&dest->refcnt); - en->dest = dest; - } - } - en->lastuse = jiffies; - - IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(en->addr), - NIPQUAD(dest->addr), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS LBLC Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_lblc_scheduler = -{ - .name = "lblc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .init_service = ip_vs_lblc_init_svc, - .done_service = ip_vs_lblc_done_svc, - .update_service = ip_vs_lblc_update_svc, - .schedule = ip_vs_lblc_schedule, -}; - - -static int __init ip_vs_lblc_init(void) -{ - int ret; - - INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); - ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); - if (ret) - unregister_sysctl_table(sysctl_header); - return ret; -} - - -static void __exit ip_vs_lblc_cleanup(void) -{ - unregister_sysctl_table(sysctl_header); - unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); -} - - -module_init(ip_vs_lblc_init); -module_exit(ip_vs_lblc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c deleted file mode 100644 index daa260eb21cf..000000000000 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ /dev/null @@ -1,762 +0,0 @@ -/* - * IPVS: Locality-Based Least-Connection with Replication scheduler - * - * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@gnuchina.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Julian Anastasov : Added the missing (dest->weight>0) - * condition in the ip_vs_dest_set_max. - * - */ - -/* - * The lblc/r algorithm is as follows (pseudo code): - * - * if serverSet[dest_ip] is null then - * n, serverSet[dest_ip] <- {weighted least-conn node}; - * else - * n <- {least-conn (alive) node in serverSet[dest_ip]}; - * if (n is null) OR - * (n.conns>n.weight AND - * there is a node m with m.conns<m.weight/2) then - * n <- {weighted least-conn node}; - * add n to serverSet[dest_ip]; - * if |serverSet[dest_ip]| > 1 AND - * now - serverSet[dest_ip].lastMod > T then - * m <- {most conn node in serverSet[dest_ip]}; - * remove m from serverSet[dest_ip]; - * if serverSet[dest_ip] changed then - * serverSet[dest_ip].lastMod <- now; - * - * return n; - * - */ - -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/jiffies.h> - -/* for sysctl */ -#include <linux/fs.h> -#include <linux/sysctl.h> -#include <net/net_namespace.h> - -#include <net/ip_vs.h> - - -/* - * It is for garbage collection of stale IPVS lblcr entries, - * when the table is full. - */ -#define CHECK_EXPIRE_INTERVAL (60*HZ) -#define ENTRY_TIMEOUT (6*60*HZ) - -/* - * It is for full expiration check. - * When there is no partial expiration check (garbage collection) - * in a half hour, do a full expiration check to collect stale - * entries that haven't been touched for a day. - */ -#define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; - - -/* - * for IPVS lblcr entry hash table - */ -#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS -#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 -#endif -#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS -#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) -#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) - - -/* - * IPVS destination set structure and operations - */ -struct ip_vs_dest_list { - struct ip_vs_dest_list *next; /* list link */ - struct ip_vs_dest *dest; /* destination server */ -}; - -struct ip_vs_dest_set { - atomic_t size; /* set size */ - unsigned long lastmod; /* last modified time */ - struct ip_vs_dest_list *list; /* destination list */ - rwlock_t lock; /* lock for this list */ -}; - - -static struct ip_vs_dest_list * -ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) -{ - struct ip_vs_dest_list *e; - - for (e=set->list; e!=NULL; e=e->next) { - if (e->dest == dest) - /* already existed */ - return NULL; - } - - e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC); - if (e == NULL) { - IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); - return NULL; - } - - atomic_inc(&dest->refcnt); - e->dest = dest; - - /* link it to the list */ - write_lock(&set->lock); - e->next = set->list; - set->list = e; - atomic_inc(&set->size); - write_unlock(&set->lock); - - set->lastmod = jiffies; - return e; -} - -static void -ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) -{ - struct ip_vs_dest_list *e, **ep; - - write_lock(&set->lock); - for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { - if (e->dest == dest) { - /* HIT */ - *ep = e->next; - atomic_dec(&set->size); - set->lastmod = jiffies; - atomic_dec(&e->dest->refcnt); - kfree(e); - break; - } - ep = &e->next; - } - write_unlock(&set->lock); -} - -static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) -{ - struct ip_vs_dest_list *e, **ep; - - write_lock(&set->lock); - for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { - *ep = e->next; - /* - * We don't kfree dest because it is refered either - * by its service or by the trash dest list. - */ - atomic_dec(&e->dest->refcnt); - kfree(e); - } - write_unlock(&set->lock); -} - -/* get weighted least-connection node in the destination set */ -static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) -{ - register struct ip_vs_dest_list *e; - struct ip_vs_dest *dest, *least; - int loh, doh; - - if (set == NULL) - return NULL; - - read_lock(&set->lock); - /* select the first destination server, whose weight > 0 */ - for (e=set->list; e!=NULL; e=e->next) { - least = e->dest; - if (least->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - if ((atomic_read(&least->weight) > 0) - && (least->flags & IP_VS_DEST_F_AVAILABLE)) { - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - read_unlock(&set->lock); - return NULL; - - /* find the destination with the weighted least load */ - nextstage: - for (e=e->next; e!=NULL; e=e->next) { - dest = e->dest; - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if ((loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) - && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - least = dest; - loh = doh; - } - } - read_unlock(&set->lock); - - IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - return least; -} - - -/* get weighted most-connection node in the destination set */ -static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) -{ - register struct ip_vs_dest_list *e; - struct ip_vs_dest *dest, *most; - int moh, doh; - - if (set == NULL) - return NULL; - - read_lock(&set->lock); - /* select the first destination server, whose weight > 0 */ - for (e=set->list; e!=NULL; e=e->next) { - most = e->dest; - if (atomic_read(&most->weight) > 0) { - moh = atomic_read(&most->activeconns) * 50 - + atomic_read(&most->inactconns); - goto nextstage; - } - } - read_unlock(&set->lock); - return NULL; - - /* find the destination with the weighted most load */ - nextstage: - for (e=e->next; e!=NULL; e=e->next) { - dest = e->dest; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ - if ((moh * atomic_read(&dest->weight) < - doh * atomic_read(&most->weight)) - && (atomic_read(&dest->weight) > 0)) { - most = dest; - moh = doh; - } - } - read_unlock(&set->lock); - - IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(most->addr), ntohs(most->port), - atomic_read(&most->activeconns), - atomic_read(&most->refcnt), - atomic_read(&most->weight), moh); - return most; -} - - -/* - * IPVS lblcr entry represents an association between destination - * IP address and its destination server set - */ -struct ip_vs_lblcr_entry { - struct list_head list; - __be32 addr; /* destination IP address */ - struct ip_vs_dest_set set; /* destination server set */ - unsigned long lastuse; /* last used time */ -}; - - -/* - * IPVS lblcr hash table - */ -struct ip_vs_lblcr_table { - rwlock_t lock; /* lock for this table */ - struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ - atomic_t entries; /* number of entries */ - int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ - int rover; /* rover for expire check */ - int counter; /* counter for no expire */ -}; - - -/* - * IPVS LBLCR sysctl table - */ - -static ctl_table vs_vars_table[] = { - { - .procname = "lblcr_expiration", - .data = &sysctl_ip_vs_lblcr_expiration, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header * sysctl_header; - -/* - * new/free a ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. - */ -static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr) -{ - struct ip_vs_lblcr_entry *en; - - en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC); - if (en == NULL) { - IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); - return NULL; - } - - INIT_LIST_HEAD(&en->list); - en->addr = daddr; - - /* initilize its dest set */ - atomic_set(&(en->set.size), 0); - en->set.list = NULL; - rwlock_init(&en->set.lock); - - return en; -} - - -static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) -{ - list_del(&en->list); - ip_vs_dest_set_eraseall(&en->set); - kfree(en); -} - - -/* - * Returns hash value for IPVS LBLCR entry - */ -static inline unsigned ip_vs_lblcr_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; -} - - -/* - * Hash an entry in the ip_vs_lblcr_table. - * returns bool success. - */ -static int -ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) -{ - unsigned hash; - - if (!list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - /* - * Hash by destination IP address - */ - hash = ip_vs_lblcr_hashkey(en->addr); - - write_lock(&tbl->lock); - list_add(&en->list, &tbl->bucket[hash]); - atomic_inc(&tbl->entries); - write_unlock(&tbl->lock); - - return 1; -} - - -/* - * Get ip_vs_lblcr_entry associated with supplied parameters. - */ -static inline struct ip_vs_lblcr_entry * -ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) -{ - unsigned hash; - struct ip_vs_lblcr_entry *en; - - hash = ip_vs_lblcr_hashkey(addr); - - read_lock(&tbl->lock); - - list_for_each_entry(en, &tbl->bucket[hash], list) { - if (en->addr == addr) { - /* HIT */ - read_unlock(&tbl->lock); - return en; - } - } - - read_unlock(&tbl->lock); - - return NULL; -} - - -/* - * Flush all the entries of the specified table. - */ -static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) -{ - int i; - struct ip_vs_lblcr_entry *en, *nxt; - - for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - write_lock(&tbl->lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { - ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&tbl->lock); - } -} - - -static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) -{ - unsigned long now = jiffies; - int i, j; - struct ip_vs_lblcr_entry *en, *nxt; - - for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { - j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - - write_lock(&tbl->lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, - now)) - continue; - - ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&tbl->lock); - } - tbl->rover = j; -} - - -/* - * Periodical timer handler for IPVS lblcr table - * It is used to collect stale entries when the number of entries - * exceeds the maximum size of the table. - * - * Fixme: we probably need more complicated algorithm to collect - * entries that have not been used for a long time even - * if the number of entries doesn't exceed the maximum size - * of the table. - * The full expiration check is for this purpose now. - */ -static void ip_vs_lblcr_check_expire(unsigned long data) -{ - struct ip_vs_lblcr_table *tbl; - unsigned long now = jiffies; - int goal; - int i, j; - struct ip_vs_lblcr_entry *en, *nxt; - - tbl = (struct ip_vs_lblcr_table *)data; - - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { - /* do full expiration check */ - ip_vs_lblcr_full_check(tbl); - tbl->counter = 1; - goto out; - } - - if (atomic_read(&tbl->entries) <= tbl->max_size) { - tbl->counter++; - goto out; - } - - goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; - if (goal > tbl->max_size/2) - goal = tbl->max_size/2; - - for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { - j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - - write_lock(&tbl->lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) - continue; - - ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); - goal--; - } - write_unlock(&tbl->lock); - if (goal <= 0) - break; - } - tbl->rover = j; - - out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); -} - -static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) -{ - int i; - struct ip_vs_lblcr_table *tbl; - - /* - * Allocate the ip_vs_lblcr_table for this service - */ - tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_lblcr_table)); - - /* - * Initialize the hash buckets - */ - for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); - } - rwlock_init(&tbl->lock); - tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; - tbl->rover = 0; - tbl->counter = 1; - - /* - * Hook periodic timer for garbage collection - */ - setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, - (unsigned long)tbl); - tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; - add_timer(&tbl->periodic_timer); - - return 0; -} - - -static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_lblcr_table *tbl = svc->sched_data; - - /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); - - /* got to clean up table entries here */ - ip_vs_lblcr_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_lblcr_table)); - - return 0; -} - - -static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static inline struct ip_vs_dest * -__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) -{ - struct ip_vs_dest *dest, *least; - int loh, doh; - - /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connection. - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - if (atomic_read(&dest->weight) > 0) { - least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -/* - * If this destination server is overloaded and there is a less loaded - * server, then return true. - */ -static inline int -is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { - struct ip_vs_dest *d; - - list_for_each_entry(d, &svc->destinations, n_list) { - if (atomic_read(&d->activeconns)*2 - < atomic_read(&d->weight)) { - return 1; - } - } - } - return 0; -} - - -/* - * Locality-Based (weighted) Least-Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_lblcr_table *tbl; - struct ip_vs_lblcr_entry *en; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_lblcr_table *)svc->sched_data; - en = ip_vs_lblcr_get(tbl, iph->daddr); - if (en == NULL) { - dest = __ip_vs_wlc_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - en = ip_vs_lblcr_new(iph->daddr); - if (en == NULL) { - return NULL; - } - ip_vs_dest_set_insert(&en->set, dest); - ip_vs_lblcr_hash(tbl, en); - } else { - dest = ip_vs_dest_set_min(&en->set); - if (!dest || is_overloaded(dest, svc)) { - dest = __ip_vs_wlc_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - ip_vs_dest_set_insert(&en->set, dest); - } - if (atomic_read(&en->set.size) > 1 && - jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { - struct ip_vs_dest *m; - m = ip_vs_dest_set_max(&en->set); - if (m) - ip_vs_dest_set_erase(&en->set, m); - } - } - en->lastuse = jiffies; - - IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(en->addr), - NIPQUAD(dest->addr), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS LBLCR Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_lblcr_scheduler = -{ - .name = "lblcr", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .init_service = ip_vs_lblcr_init_svc, - .done_service = ip_vs_lblcr_done_svc, - .update_service = ip_vs_lblcr_update_svc, - .schedule = ip_vs_lblcr_schedule, -}; - - -static int __init ip_vs_lblcr_init(void) -{ - int ret; - - INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); - ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); - if (ret) - unregister_sysctl_table(sysctl_header); - return ret; -} - - -static void __exit ip_vs_lblcr_cleanup(void) -{ - unregister_sysctl_table(sysctl_header); - unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); -} - - -module_init(ip_vs_lblcr_init); -module_exit(ip_vs_lblcr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c deleted file mode 100644 index d88fef90a641..000000000000 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ /dev/null @@ -1,123 +0,0 @@ -/* - * IPVS: Least-Connection Scheduling module - * - * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : added the ip_vs_lc_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include <net/ip_vs.h> - - -static int ip_vs_lc_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int ip_vs_lc_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int ip_vs_lc_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static inline unsigned int -ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. (This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - -/* - * Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; - - IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); - - /* - * Simply select the server with the least number of - * (activeconns<<5) + inactconns - * Except whose weight is equal to zero. - * If the weight is equal to zero, it means that the server is - * quiesced, the existing connections to the server still get - * served, but no new connection is assigned to the server. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || - atomic_read(&dest->weight) == 0) - continue; - doh = ip_vs_lc_dest_overhead(dest); - if (!least || doh < loh) { - least = dest; - loh = doh; - } - } - - if (least) - IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->inactconns)); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_lc_scheduler = { - .name = "lc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .init_service = ip_vs_lc_init_svc, - .done_service = ip_vs_lc_done_svc, - .update_service = ip_vs_lc_update_svc, - .schedule = ip_vs_lc_schedule, -}; - - -static int __init ip_vs_lc_init(void) -{ - INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list); - return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; -} - -static void __exit ip_vs_lc_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); -} - -module_init(ip_vs_lc_init); -module_exit(ip_vs_lc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c deleted file mode 100644 index bc2a9e5f2a7b..000000000000 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ /dev/null @@ -1,161 +0,0 @@ -/* - * IPVS: Never Queue scheduling module - * - * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The NQ algorithm adopts a two-speed model. When there is an idle server - * available, the job will be sent to the idle server, instead of waiting - * for a fast one. When there is no idle server available, the job will be - * sent to the server that minimize its expected delay (The Shortest - * Expected Delay scheduling algorithm). - * - * See the following paper for more information: - * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing - * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, - * pages 986-994, 1988. - * - * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me. - * - * The difference between NQ and SED is that NQ can improve overall - * system utilization. - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include <net/ip_vs.h> - - -static int -ip_vs_nq_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_nq_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_nq_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static inline unsigned int -ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We only use the active connection number in the cost - * calculation here. - */ - return atomic_read(&dest->activeconns) + 1; -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; - - IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (server expected overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - - if (dest->flags & IP_VS_DEST_F_OVERLOAD || - !atomic_read(&dest->weight)) - continue; - - doh = ip_vs_nq_dest_overhead(dest); - - /* return the server directly if it is idle */ - if (atomic_read(&dest->activeconns) == 0) { - least = dest; - loh = doh; - goto out; - } - - if (!least || - (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight))) { - least = dest; - loh = doh; - } - } - - if (!least) - return NULL; - - out: - IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_nq_scheduler = -{ - .name = "nq", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .init_service = ip_vs_nq_init_svc, - .done_service = ip_vs_nq_done_svc, - .update_service = ip_vs_nq_update_svc, - .schedule = ip_vs_nq_schedule, -}; - - -static int __init ip_vs_nq_init(void) -{ - INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list); - return register_ip_vs_scheduler(&ip_vs_nq_scheduler); -} - -static void __exit ip_vs_nq_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); -} - -module_init(ip_vs_nq_init); -module_exit(ip_vs_nq_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c deleted file mode 100644 index 4b1c16cbb16b..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * ip_vs_proto.c: transport protocol load balancing support for IPVS - * - * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <net/protocol.h> -#include <net/tcp.h> -#include <net/udp.h> -#include <asm/system.h> -#include <linux/stat.h> -#include <linux/proc_fs.h> - -#include <net/ip_vs.h> - - -/* - * IPVS protocols can only be registered/unregistered when the ipvs - * module is loaded/unloaded, so no lock is needed in accessing the - * ipvs protocol table. - */ - -#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ -#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) - -static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; - - -/* - * register an ipvs protocol - */ -static int __used register_ip_vs_protocol(struct ip_vs_protocol *pp) -{ - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); - - pp->next = ip_vs_proto_table[hash]; - ip_vs_proto_table[hash] = pp; - - if (pp->init != NULL) - pp->init(pp); - - return 0; -} - - -/* - * unregister an ipvs protocol - */ -static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) -{ - struct ip_vs_protocol **pp_p; - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); - - pp_p = &ip_vs_proto_table[hash]; - for (; *pp_p; pp_p = &(*pp_p)->next) { - if (*pp_p == pp) { - *pp_p = pp->next; - if (pp->exit != NULL) - pp->exit(pp); - return 0; - } - } - - return -ESRCH; -} - - -/* - * get ip_vs_protocol object by its proto. - */ -struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) -{ - struct ip_vs_protocol *pp; - unsigned hash = IP_VS_PROTO_HASH(proto); - - for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { - if (pp->protocol == proto) - return pp; - } - - return NULL; -} - - -/* - * Propagate event for state change to all protocols - */ -void ip_vs_protocol_timeout_change(int flags) -{ - struct ip_vs_protocol *pp; - int i; - - for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { - if (pp->timeout_change) - pp->timeout_change(pp, flags); - } - } -} - - -int * -ip_vs_create_timeout_table(int *table, int size) -{ - return kmemdup(table, size, GFP_ATOMIC); -} - - -/* - * Set timeout value for state specified by name - */ -int -ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) -{ - int i; - - if (!table || !name || !to) - return -EINVAL; - - for (i = 0; i < num; i++) { - if (strcmp(names[i], name)) - continue; - table[i] = to * HZ; - return 0; - } - return -ENOENT; -} - - -const char * ip_vs_state_name(__u16 proto, int state) -{ - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); - - if (pp == NULL || pp->state_name == NULL) - return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; - return pp->state_name(state); -} - - -void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) -{ - char buf[128]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else if (ih->frag_off & htons(IP_OFFSET)) - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - else { - __be16 _ports[2], *pptr -; - pptr = skb_header_pointer(skb, offset + ih->ihl*4, - sizeof(_ports), _ports); - if (pptr == NULL) - sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, - NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - else - sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", - pp->name, - NIPQUAD(ih->saddr), - ntohs(pptr[0]), - NIPQUAD(ih->daddr), - ntohs(pptr[1])); - } - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - - -int ip_vs_protocol_init(void) -{ - char protocols[64]; -#define REGISTER_PROTOCOL(p) \ - do { \ - register_ip_vs_protocol(p); \ - strcat(protocols, ", "); \ - strcat(protocols, (p)->name); \ - } while (0) - - protocols[0] = '\0'; - protocols[2] = '\0'; -#ifdef CONFIG_IP_VS_PROTO_TCP - REGISTER_PROTOCOL(&ip_vs_protocol_tcp); -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - REGISTER_PROTOCOL(&ip_vs_protocol_udp); -#endif -#ifdef CONFIG_IP_VS_PROTO_AH - REGISTER_PROTOCOL(&ip_vs_protocol_ah); -#endif -#ifdef CONFIG_IP_VS_PROTO_ESP - REGISTER_PROTOCOL(&ip_vs_protocol_esp); -#endif - IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); - - return 0; -} - - -void ip_vs_protocol_cleanup(void) -{ - struct ip_vs_protocol *pp; - int i; - - /* unregister all the ipvs protocols */ - for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - while ((pp = ip_vs_proto_table[i]) != NULL) - unregister_ip_vs_protocol(pp); - } -} diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c deleted file mode 100644 index 4bf835e1d86d..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS - * - * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ - * - * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 - * Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation; - * - */ - -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> - -#include <net/ip_vs.h> - - -/* TODO: - -struct isakmp_hdr { - __u8 icookie[8]; - __u8 rcookie[8]; - __u8 np; - __u8 version; - __u8 xchgtype; - __u8 flags; - __u32 msgid; - __u32 length; -}; - -*/ - -#define PORT_ISAKMP 500 - - -static struct ip_vs_conn * -ah_conn_in_get(const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct iphdr *iph, - unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - /* - * We are not sure if the packet is from our - * service, so our conn_schedule hook should return NF_ACCEPT - */ - IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static struct ip_vs_conn * -ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static int -ah_conn_schedule(struct sk_buff *skb, - struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - /* - * AH is only related traffic. Pass the packet to IP stack. - */ - *verdict = NF_ACCEPT; - return 0; -} - - -static void -ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - - -static void ah_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void ah_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -struct ip_vs_protocol ip_vs_protocol_ah = { - .name = "AH", - .protocol = IPPROTO_AH, - .num_states = 1, - .dont_defrag = 1, - .init = ah_init, - .exit = ah_exit, - .conn_schedule = ah_conn_schedule, - .conn_in_get = ah_conn_in_get, - .conn_out_get = ah_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = ah_debug_packet, - .timeout_change = NULL, /* ISAKMP */ - .set_state_timeout = NULL, -}; diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c deleted file mode 100644 index db6a6b7b1a0b..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ /dev/null @@ -1,178 +0,0 @@ -/* - * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS - * - * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ - * - * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 - * Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation; - * - */ - -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> - -#include <net/ip_vs.h> - - -/* TODO: - -struct isakmp_hdr { - __u8 icookie[8]; - __u8 rcookie[8]; - __u8 np; - __u8 version; - __u8 xchgtype; - __u8 flags; - __u32 msgid; - __u32 length; -}; - -*/ - -#define PORT_ISAKMP 500 - - -static struct ip_vs_conn * -esp_conn_in_get(const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct iphdr *iph, - unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - /* - * We are not sure if the packet is from our - * service, so our conn_schedule hook should return NF_ACCEPT - */ - IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static struct ip_vs_conn * -esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static int -esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - /* - * ESP is only related traffic. Pass the packet to IP stack. - */ - *verdict = NF_ACCEPT; - return 0; -} - - -static void -esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - - -static void esp_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void esp_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -struct ip_vs_protocol ip_vs_protocol_esp = { - .name = "ESP", - .protocol = IPPROTO_ESP, - .num_states = 1, - .dont_defrag = 1, - .init = esp_init, - .exit = esp_exit, - .conn_schedule = esp_conn_schedule, - .conn_in_get = esp_conn_in_get, - .conn_out_get = esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ -}; diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c deleted file mode 100644 index b83dc14b0a4d..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ /dev/null @@ -1,616 +0,0 @@ -/* - * ip_vs_proto_tcp.c: TCP load balancing support for IPVS - * - * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include <linux/kernel.h> -#include <linux/ip.h> -#include <linux/tcp.h> /* for tcphdr */ -#include <net/ip.h> -#include <net/tcp.h> /* for csum_tcpudp_magic */ -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> - -#include <net/ip_vs.h> - - -static struct ip_vs_conn * -tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_in_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); - } else { - return ip_vs_conn_in_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); - } -} - -static struct ip_vs_conn * -tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_out_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); - } else { - return ip_vs_conn_out_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); - } -} - - -static int -tcp_conn_schedule(struct sk_buff *skb, - struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - struct ip_vs_service *svc; - struct tcphdr _tcph, *th; - - th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); - if (th == NULL) { - *verdict = NF_DROP; - return 0; - } - - if (th->syn && - (svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, - ip_hdr(skb)->daddr, th->dest))) { - if (ip_vs_todrop()) { - /* - * It seems that we are very loaded. - * We have to drop this packet :( - */ - ip_vs_service_put(svc); - *verdict = NF_DROP; - return 0; - } - - /* - * Let the virtual server select a real server for the - * incoming connection, and create a connection entry. - */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { - *verdict = ip_vs_leave(svc, skb, pp); - return 0; - } - ip_vs_service_put(svc); - } - return 1; -} - - -static inline void -tcp_fast_csum_update(struct tcphdr *tcph, __be32 oldip, __be32 newip, - __be16 oldport, __be16 newport) -{ - tcph->check = - csum_fold(ip_vs_check_diff4(oldip, newip, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(tcph->check)))); -} - - -static int -tcp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct tcphdr *tcph; - const unsigned int tcphoff = ip_hdrlen(skb); - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) - return 0; - - /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) - return 0; - } - - tcph = (void *)ip_hdr(skb) + tcphoff; - tcph->source = cp->vport; - - /* Adjust TCP checksums */ - if (!cp->app) { - /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr, - cp->dport, cp->vport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - tcph->check = 0; - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, - skb->len - tcphoff, - cp->protocol, skb->csum); - IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", - pp->name, tcph->check, - (char*)&(tcph->check) - (char*)tcph); - } - return 1; -} - - -static int -tcp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct tcphdr *tcph; - const unsigned int tcphoff = ip_hdrlen(skb); - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) - return 0; - - /* - * Attempt ip_vs_app call. - * It will fix ip_vs_conn and iph ack_seq stuff - */ - if (!ip_vs_app_pkt_in(cp, skb)) - return 0; - } - - tcph = (void *)ip_hdr(skb) + tcphoff; - tcph->dest = cp->dport; - - /* - * Adjust TCP checksums - */ - if (!cp->app) { - /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr, - cp->vport, cp->dport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - tcph->check = 0; - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, - skb->len - tcphoff, - cp->protocol, skb->csum); - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - return 1; -} - - -static int -tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) -{ - const unsigned int tcphoff = ip_hdrlen(skb); - - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - case CHECKSUM_COMPLETE: - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, - skb->len - tcphoff, - ip_hdr(skb)->protocol, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - break; - default: - /* No need to checksum. */ - break; - } - - return 1; -} - - -#define TCP_DIR_INPUT 0 -#define TCP_DIR_OUTPUT 4 -#define TCP_DIR_INPUT_ONLY 8 - -static const int tcp_state_off[IP_VS_DIR_LAST] = { - [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, - [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, - [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, -}; - -/* - * Timeout table[state] - */ -static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = 2*HZ, - [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, - [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, - [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, - [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, - [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, - [IP_VS_TCP_S_CLOSE] = 10*HZ, - [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, - [IP_VS_TCP_S_LAST_ACK] = 30*HZ, - [IP_VS_TCP_S_LISTEN] = 2*60*HZ, - [IP_VS_TCP_S_SYNACK] = 120*HZ, - [IP_VS_TCP_S_LAST] = 2*HZ, -}; - -static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = "NONE", - [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", - [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", - [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", - [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", - [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", - [IP_VS_TCP_S_CLOSE] = "CLOSE", - [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", - [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", - [IP_VS_TCP_S_LISTEN] = "LISTEN", - [IP_VS_TCP_S_SYNACK] = "SYNACK", - [IP_VS_TCP_S_LAST] = "BUG!", -}; - -#define sNO IP_VS_TCP_S_NONE -#define sES IP_VS_TCP_S_ESTABLISHED -#define sSS IP_VS_TCP_S_SYN_SENT -#define sSR IP_VS_TCP_S_SYN_RECV -#define sFW IP_VS_TCP_S_FIN_WAIT -#define sTW IP_VS_TCP_S_TIME_WAIT -#define sCL IP_VS_TCP_S_CLOSE -#define sCW IP_VS_TCP_S_CLOSE_WAIT -#define sLA IP_VS_TCP_S_LAST_ACK -#define sLI IP_VS_TCP_S_LISTEN -#define sSA IP_VS_TCP_S_SYNACK - -struct tcp_states_t { - int next_state[IP_VS_TCP_S_LAST]; -}; - -static const char * tcp_state_name(int state) -{ - if (state >= IP_VS_TCP_S_LAST) - return "ERR!"; - return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; -} - -static struct tcp_states_t tcp_states [] = { -/* INPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, -/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, - -/* OUTPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, -/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, -/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, -/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, - -/* INPUT-ONLY */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, -/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, -}; - -static struct tcp_states_t tcp_states_dos [] = { -/* INPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, -/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, -/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, - -/* OUTPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, -/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, -/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, -/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, - -/* INPUT-ONLY */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, -/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, -}; - -static struct tcp_states_t *tcp_state_table = tcp_states; - - -static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) -{ - int on = (flags & 1); /* secure_tcp */ - - /* - ** FIXME: change secure_tcp to independent sysctl var - ** or make it per-service or per-app because it is valid - ** for most if not for all of the applications. Something - ** like "capabilities" (flags) for each object. - */ - tcp_state_table = (on? tcp_states_dos : tcp_states); -} - -static int -tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, - tcp_state_name_table, sname, to); -} - -static inline int tcp_state_idx(struct tcphdr *th) -{ - if (th->rst) - return 3; - if (th->syn) - return 0; - if (th->fin) - return 1; - if (th->ack) - return 2; - return -1; -} - -static inline void -set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, - int direction, struct tcphdr *th) -{ - int state_idx; - int new_state = IP_VS_TCP_S_CLOSE; - int state_off = tcp_state_off[direction]; - - /* - * Update state offset to INPUT_ONLY if necessary - * or delete NO_OUTPUT flag if output packet detected - */ - if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { - if (state_off == TCP_DIR_OUTPUT) - cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; - else - state_off = TCP_DIR_INPUT_ONLY; - } - - if ((state_idx = tcp_state_idx(th)) < 0) { - IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); - goto tcp_state_out; - } - - new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; - - tcp_state_out: - if (new_state != cp->state) { - struct ip_vs_dest *dest = cp->dest; - - IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" - "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n", - pp->name, - (state_off==TCP_DIR_OUTPUT)?"output ":"input ", - th->syn? 'S' : '.', - th->fin? 'F' : '.', - th->ack? 'A' : '.', - th->rst? 'R' : '.', - NIPQUAD(cp->daddr), ntohs(cp->dport), - NIPQUAD(cp->caddr), ntohs(cp->cport), - tcp_state_name(cp->state), - tcp_state_name(new_state), - atomic_read(&cp->refcnt)); - if (dest) { - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } - } - - cp->timeout = pp->timeout_table[cp->state = new_state]; -} - - -/* - * Handle state transitions - */ -static int -tcp_state_transition(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - struct tcphdr _tcph, *th; - - th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); - if (th == NULL) - return 0; - - spin_lock(&cp->lock); - set_tcp_state(pp, cp, direction, th); - spin_unlock(&cp->lock); - - return 1; -} - - -/* - * Hash table for TCP application incarnations - */ -#define TCP_APP_TAB_BITS 4 -#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) -#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) - -static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(tcp_app_lock); - -static inline __u16 tcp_app_hashkey(__be16 port) -{ - return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) - & TCP_APP_TAB_MASK; -} - - -static int tcp_register_app(struct ip_vs_app *inc) -{ - struct ip_vs_app *i; - __u16 hash; - __be16 port = inc->port; - int ret = 0; - - hash = tcp_app_hashkey(port); - - spin_lock_bh(&tcp_app_lock); - list_for_each_entry(i, &tcp_apps[hash], p_list) { - if (i->port == port) { - ret = -EEXIST; - goto out; - } - } - list_add(&inc->p_list, &tcp_apps[hash]); - atomic_inc(&ip_vs_protocol_tcp.appcnt); - - out: - spin_unlock_bh(&tcp_app_lock); - return ret; -} - - -static void -tcp_unregister_app(struct ip_vs_app *inc) -{ - spin_lock_bh(&tcp_app_lock); - atomic_dec(&ip_vs_protocol_tcp.appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&tcp_app_lock); -} - - -static int -tcp_app_conn_bind(struct ip_vs_conn *cp) -{ - int hash; - struct ip_vs_app *inc; - int result = 0; - - /* Default binding: bind app only for NAT */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) - return 0; - - /* Lookup application incarnations and bind the right one */ - hash = tcp_app_hashkey(cp->vport); - - spin_lock(&tcp_app_lock); - list_for_each_entry(inc, &tcp_apps[hash], p_list) { - if (inc->port == cp->vport) { - if (unlikely(!ip_vs_app_inc_get(inc))) - break; - spin_unlock(&tcp_app_lock); - - IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" - "%u.%u.%u.%u:%u to app %s on port %u\n", - __func__, - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - inc->name, ntohs(inc->port)); - cp->app = inc; - if (inc->init_conn) - result = inc->init_conn(inc, cp); - goto out; - } - } - spin_unlock(&tcp_app_lock); - - out: - return result; -} - - -/* - * Set LISTEN timeout. (ip_vs_conn_put will setup timer) - */ -void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) -{ - spin_lock(&cp->lock); - cp->state = IP_VS_TCP_S_LISTEN; - cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; - spin_unlock(&cp->lock); -} - - -static void ip_vs_tcp_init(struct ip_vs_protocol *pp) -{ - IP_VS_INIT_HASH_TABLE(tcp_apps); - pp->timeout_table = tcp_timeouts; -} - - -static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) -{ -} - - -struct ip_vs_protocol ip_vs_protocol_tcp = { - .name = "TCP", - .protocol = IPPROTO_TCP, - .num_states = IP_VS_TCP_S_LAST, - .dont_defrag = 0, - .appcnt = ATOMIC_INIT(0), - .init = ip_vs_tcp_init, - .exit = ip_vs_tcp_exit, - .register_app = tcp_register_app, - .unregister_app = tcp_unregister_app, - .conn_schedule = tcp_conn_schedule, - .conn_in_get = tcp_conn_in_get, - .conn_out_get = tcp_conn_out_get, - .snat_handler = tcp_snat_handler, - .dnat_handler = tcp_dnat_handler, - .csum_check = tcp_csum_check, - .state_name = tcp_state_name, - .state_transition = tcp_state_transition, - .app_conn_bind = tcp_app_conn_bind, - .debug_packet = ip_vs_tcpudp_debug_packet, - .timeout_change = tcp_timeout_change, - .set_state_timeout = tcp_set_state_timeout, -}; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c deleted file mode 100644 index 75771cb3cd6f..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - * ip_vs_proto_udp.c: UDP load balancing support for IPVS - * - * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/kernel.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/udp.h> - -#include <net/ip_vs.h> -#include <net/ip.h> - -static struct ip_vs_conn * -udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_in_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); - } - - return cp; -} - - -static struct ip_vs_conn * -udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, ip_hdrlen(skb), - sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_out_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); - } - - return cp; -} - - -static int -udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - struct ip_vs_service *svc; - struct udphdr _udph, *uh; - - uh = skb_header_pointer(skb, ip_hdrlen(skb), - sizeof(_udph), &_udph); - if (uh == NULL) { - *verdict = NF_DROP; - return 0; - } - - if ((svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, - ip_hdr(skb)->daddr, uh->dest))) { - if (ip_vs_todrop()) { - /* - * It seems that we are very loaded. - * We have to drop this packet :( - */ - ip_vs_service_put(svc); - *verdict = NF_DROP; - return 0; - } - - /* - * Let the virtual server select a real server for the - * incoming connection, and create a connection entry. - */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { - *verdict = ip_vs_leave(svc, skb, pp); - return 0; - } - ip_vs_service_put(svc); - } - return 1; -} - - -static inline void -udp_fast_csum_update(struct udphdr *uhdr, __be32 oldip, __be32 newip, - __be16 oldport, __be16 newport) -{ - uhdr->check = - csum_fold(ip_vs_check_diff4(oldip, newip, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(uhdr->check)))); - if (!uhdr->check) - uhdr->check = CSUM_MANGLED_0; -} - -static int -udp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct udphdr *udph; - const unsigned int udphoff = ip_hdrlen(skb); - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) - return 0; - - /* - * Call application helper if needed - */ - if (!ip_vs_app_pkt_out(cp, skb)) - return 0; - } - - udph = (void *)ip_hdr(skb) + udphoff; - udph->source = cp->vport; - - /* - * Adjust UDP checksums - */ - if (!cp->app && (udph->check != 0)) { - /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(udph, cp->daddr, cp->vaddr, - cp->dport, cp->vport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - udph->check = 0; - skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, - skb->len - udphoff, - cp->protocol, skb->csum); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", - pp->name, udph->check, - (char*)&(udph->check) - (char*)udph); - } - return 1; -} - - -static int -udp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct udphdr *udph; - unsigned int udphoff = ip_hdrlen(skb); - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) - return 0; - - /* - * Attempt ip_vs_app call. - * It will fix ip_vs_conn - */ - if (!ip_vs_app_pkt_in(cp, skb)) - return 0; - } - - udph = (void *)ip_hdr(skb) + udphoff; - udph->dest = cp->dport; - - /* - * Adjust UDP checksums - */ - if (!cp->app && (udph->check != 0)) { - /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(udph, cp->vaddr, cp->daddr, - cp->vport, cp->dport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - udph->check = 0; - skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, - skb->len - udphoff, - cp->protocol, skb->csum); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - return 1; -} - - -static int -udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) -{ - struct udphdr _udph, *uh; - const unsigned int udphoff = ip_hdrlen(skb); - - uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); - if (uh == NULL) - return 0; - - if (uh->check != 0) { - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = skb_checksum(skb, udphoff, - skb->len - udphoff, 0); - case CHECKSUM_COMPLETE: - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, - skb->len - udphoff, - ip_hdr(skb)->protocol, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - break; - default: - /* No need to checksum. */ - break; - } - } - return 1; -} - - -/* - * Note: the caller guarantees that only one of register_app, - * unregister_app or app_conn_bind is called each time. - */ - -#define UDP_APP_TAB_BITS 4 -#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) -#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) - -static struct list_head udp_apps[UDP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(udp_app_lock); - -static inline __u16 udp_app_hashkey(__be16 port) -{ - return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) - & UDP_APP_TAB_MASK; -} - - -static int udp_register_app(struct ip_vs_app *inc) -{ - struct ip_vs_app *i; - __u16 hash; - __be16 port = inc->port; - int ret = 0; - - hash = udp_app_hashkey(port); - - - spin_lock_bh(&udp_app_lock); - list_for_each_entry(i, &udp_apps[hash], p_list) { - if (i->port == port) { - ret = -EEXIST; - goto out; - } - } - list_add(&inc->p_list, &udp_apps[hash]); - atomic_inc(&ip_vs_protocol_udp.appcnt); - - out: - spin_unlock_bh(&udp_app_lock); - return ret; -} - - -static void -udp_unregister_app(struct ip_vs_app *inc) -{ - spin_lock_bh(&udp_app_lock); - atomic_dec(&ip_vs_protocol_udp.appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&udp_app_lock); -} - - -static int udp_app_conn_bind(struct ip_vs_conn *cp) -{ - int hash; - struct ip_vs_app *inc; - int result = 0; - - /* Default binding: bind app only for NAT */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) - return 0; - - /* Lookup application incarnations and bind the right one */ - hash = udp_app_hashkey(cp->vport); - - spin_lock(&udp_app_lock); - list_for_each_entry(inc, &udp_apps[hash], p_list) { - if (inc->port == cp->vport) { - if (unlikely(!ip_vs_app_inc_get(inc))) - break; - spin_unlock(&udp_app_lock); - - IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" - "%u.%u.%u.%u:%u to app %s on port %u\n", - __func__, - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - inc->name, ntohs(inc->port)); - cp->app = inc; - if (inc->init_conn) - result = inc->init_conn(inc, cp); - goto out; - } - } - spin_unlock(&udp_app_lock); - - out: - return result; -} - - -static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { - [IP_VS_UDP_S_NORMAL] = 5*60*HZ, - [IP_VS_UDP_S_LAST] = 2*HZ, -}; - -static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { - [IP_VS_UDP_S_NORMAL] = "UDP", - [IP_VS_UDP_S_LAST] = "BUG!", -}; - - -static int -udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, - udp_state_name_table, sname, to); -} - -static const char * udp_state_name(int state) -{ - if (state >= IP_VS_UDP_S_LAST) - return "ERR!"; - return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; -} - -static int -udp_state_transition(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; - return 1; -} - -static void udp_init(struct ip_vs_protocol *pp) -{ - IP_VS_INIT_HASH_TABLE(udp_apps); - pp->timeout_table = udp_timeouts; -} - -static void udp_exit(struct ip_vs_protocol *pp) -{ -} - - -struct ip_vs_protocol ip_vs_protocol_udp = { - .name = "UDP", - .protocol = IPPROTO_UDP, - .num_states = IP_VS_UDP_S_LAST, - .dont_defrag = 0, - .init = udp_init, - .exit = udp_exit, - .conn_schedule = udp_conn_schedule, - .conn_in_get = udp_conn_in_get, - .conn_out_get = udp_conn_out_get, - .snat_handler = udp_snat_handler, - .dnat_handler = udp_dnat_handler, - .csum_check = udp_csum_check, - .state_transition = udp_state_transition, - .state_name = udp_state_name, - .register_app = udp_register_app, - .unregister_app = udp_unregister_app, - .app_conn_bind = udp_app_conn_bind, - .debug_packet = ip_vs_tcpudp_debug_packet, - .timeout_change = NULL, - .set_state_timeout = udp_set_state_timeout, -}; diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c deleted file mode 100644 index 433f8a947924..000000000000 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * IPVS: Round-Robin Scheduling module - * - * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Peter Kese <peter.kese@ijs.si> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Fixes/Changes: - * Wensong Zhang : changed the ip_vs_rr_schedule to return dest - * Julian Anastasov : fixed the NULL pointer access bug in debugging - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_rr_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include <net/ip_vs.h> - - -static int ip_vs_rr_init_svc(struct ip_vs_service *svc) -{ - svc->sched_data = &svc->destinations; - return 0; -} - - -static int ip_vs_rr_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int ip_vs_rr_update_svc(struct ip_vs_service *svc) -{ - svc->sched_data = &svc->destinations; - return 0; -} - - -/* - * Round-Robin Scheduling - */ -static struct ip_vs_dest * -ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct list_head *p, *q; - struct ip_vs_dest *dest; - - IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); - - write_lock(&svc->sched_lock); - p = (struct list_head *)svc->sched_data; - p = p->next; - q = p; - do { - /* skip list head */ - if (q == &svc->destinations) { - q = q->next; - continue; - } - - dest = list_entry(q, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) - /* HIT */ - goto out; - q = q->next; - } while (q != p); - write_unlock(&svc->sched_lock); - return NULL; - - out: - svc->sched_data = q; - write_unlock(&svc->sched_lock); - IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d\n", - NIPQUAD(dest->addr), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), atomic_read(&dest->weight)); - - return dest; -} - - -static struct ip_vs_scheduler ip_vs_rr_scheduler = { - .name = "rr", /* name */ - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .init_service = ip_vs_rr_init_svc, - .done_service = ip_vs_rr_done_svc, - .update_service = ip_vs_rr_update_svc, - .schedule = ip_vs_rr_schedule, -}; - -static int __init ip_vs_rr_init(void) -{ - INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list); - return register_ip_vs_scheduler(&ip_vs_rr_scheduler); -} - -static void __exit ip_vs_rr_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); -} - -module_init(ip_vs_rr_init); -module_exit(ip_vs_rr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c deleted file mode 100644 index 121a32b1b756..000000000000 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Peter Kese <peter.kese@ijs.si> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <asm/string.h> -#include <linux/kmod.h> -#include <linux/sysctl.h> - -#include <net/ip_vs.h> - -/* - * IPVS scheduler list - */ -static LIST_HEAD(ip_vs_schedulers); - -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_sched_lock); - - -/* - * Bind a service with a scheduler - */ -int ip_vs_bind_scheduler(struct ip_vs_service *svc, - struct ip_vs_scheduler *scheduler) -{ - int ret; - - if (svc == NULL) { - IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); - return -EINVAL; - } - if (scheduler == NULL) { - IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); - return -EINVAL; - } - - svc->scheduler = scheduler; - - if (scheduler->init_service) { - ret = scheduler->init_service(svc); - if (ret) { - IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); - return ret; - } - } - - return 0; -} - - -/* - * Unbind a service with its scheduler - */ -int ip_vs_unbind_scheduler(struct ip_vs_service *svc) -{ - struct ip_vs_scheduler *sched; - - if (svc == NULL) { - IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); - return -EINVAL; - } - - sched = svc->scheduler; - if (sched == NULL) { - IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); - return -EINVAL; - } - - if (sched->done_service) { - if (sched->done_service(svc) != 0) { - IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); - return -EINVAL; - } - } - - svc->scheduler = NULL; - return 0; -} - - -/* - * Get scheduler in the scheduler list by name - */ -static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) -{ - struct ip_vs_scheduler *sched; - - IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", - sched_name); - - read_lock_bh(&__ip_vs_sched_lock); - - list_for_each_entry(sched, &ip_vs_schedulers, n_list) { - /* - * Test and get the modules atomically - */ - if (sched->module && !try_module_get(sched->module)) { - /* - * This scheduler is just deleted - */ - continue; - } - if (strcmp(sched_name, sched->name)==0) { - /* HIT */ - read_unlock_bh(&__ip_vs_sched_lock); - return sched; - } - if (sched->module) - module_put(sched->module); - } - - read_unlock_bh(&__ip_vs_sched_lock); - return NULL; -} - - -/* - * Lookup scheduler and try to load it if it doesn't exist - */ -struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) -{ - struct ip_vs_scheduler *sched; - - /* - * Search for the scheduler by sched_name - */ - sched = ip_vs_sched_getbyname(sched_name); - - /* - * If scheduler not found, load the module and search again - */ - if (sched == NULL) { - request_module("ip_vs_%s", sched_name); - sched = ip_vs_sched_getbyname(sched_name); - } - - return sched; -} - -void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) -{ - if (scheduler->module) - module_put(scheduler->module); -} - - -/* - * Register a scheduler in the scheduler list - */ -int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) -{ - struct ip_vs_scheduler *sched; - - if (!scheduler) { - IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); - return -EINVAL; - } - - if (!scheduler->name) { - IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); - return -EINVAL; - } - - /* increase the module use count */ - ip_vs_use_count_inc(); - - write_lock_bh(&__ip_vs_sched_lock); - - if (scheduler->n_list.next != &scheduler->n_list) { - write_unlock_bh(&__ip_vs_sched_lock); - ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already linked\n", scheduler->name); - return -EINVAL; - } - - /* - * Make sure that the scheduler with this name doesn't exist - * in the scheduler list. - */ - list_for_each_entry(sched, &ip_vs_schedulers, n_list) { - if (strcmp(scheduler->name, sched->name) == 0) { - write_unlock_bh(&__ip_vs_sched_lock); - ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already existed in the system\n", - scheduler->name); - return -EINVAL; - } - } - /* - * Add it into the d-linked scheduler list - */ - list_add(&scheduler->n_list, &ip_vs_schedulers); - write_unlock_bh(&__ip_vs_sched_lock); - - IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); - - return 0; -} - - -/* - * Unregister a scheduler from the scheduler list - */ -int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) -{ - if (!scheduler) { - IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); - return -EINVAL; - } - - write_lock_bh(&__ip_vs_sched_lock); - if (scheduler->n_list.next == &scheduler->n_list) { - write_unlock_bh(&__ip_vs_sched_lock); - IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " - "is not in the list. failed\n", scheduler->name); - return -EINVAL; - } - - /* - * Remove it from the d-linked scheduler list - */ - list_del(&scheduler->n_list); - write_unlock_bh(&__ip_vs_sched_lock); - - /* decrease the module use count */ - ip_vs_use_count_dec(); - - IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); - - return 0; -} diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c deleted file mode 100644 index dd7c128f9db3..000000000000 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * IPVS: Shortest Expected Delay scheduling module - * - * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The SED algorithm attempts to minimize each job's expected delay until - * completion. The expected delay that the job will experience is - * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of - * jobs on the ith server and Ui is the fixed service rate (weight) of - * the ith server. The SED algorithm adopts a greedy policy that each does - * what is in its own best interest, i.e. to join the queue which would - * minimize its expected delay of completion. - * - * See the following paper for more information: - * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing - * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, - * pages 986-994, 1988. - * - * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me. - * - * The difference between SED and WLC is that SED includes the incoming - * job in the cost function (the increment of 1). SED may outperform - * WLC, while scheduling big jobs under larger heterogeneous systems - * (the server weight varies a lot). - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include <net/ip_vs.h> - - -static int -ip_vs_sed_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_sed_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_sed_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static inline unsigned int -ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We only use the active connection number in the cost - * calculation here. - */ - return atomic_read(&dest->activeconns) + 1; -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least; - unsigned int loh, doh; - - IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (server expected overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) { - least = dest; - loh = ip_vs_sed_dest_overhead(least); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - doh = ip_vs_sed_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_sed_scheduler = -{ - .name = "sed", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .init_service = ip_vs_sed_init_svc, - .done_service = ip_vs_sed_done_svc, - .update_service = ip_vs_sed_update_svc, - .schedule = ip_vs_sed_schedule, -}; - - -static int __init ip_vs_sed_init(void) -{ - INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list); - return register_ip_vs_scheduler(&ip_vs_sed_scheduler); -} - -static void __exit ip_vs_sed_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); -} - -module_init(ip_vs_sed_init); -module_exit(ip_vs_sed_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c deleted file mode 100644 index 1b25b00ef1e1..000000000000 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ /dev/null @@ -1,257 +0,0 @@ -/* - * IPVS: Source Hashing scheduling module - * - * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@gnuchina.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The sh algorithm is to select server by the hash key of source IP - * address. The pseudo code is as follows: - * - * n <- servernode[src_ip]; - * if (n is dead) OR - * (n is overloaded) or (n.weight <= 0) then - * return NULL; - * - * return n; - * - * Notes that servernode is a 256-bucket hash table that maps the hash - * index derived from packet source IP address to the current server - * array. If the sh scheduler is used in cache cluster, it is good to - * combine it with cache_bypass feature. When the statically assigned - * server is dead or overloaded, the load balancer can bypass the cache - * server and send requests to the original server directly. - * - */ - -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> - -#include <net/ip_vs.h> - - -/* - * IPVS SH bucket - */ -struct ip_vs_sh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ -}; - -/* - * for IPVS SH entry hash table - */ -#ifndef CONFIG_IP_VS_SH_TAB_BITS -#define CONFIG_IP_VS_SH_TAB_BITS 8 -#endif -#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS -#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) -#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) - - -/* - * Returns hash value for IPVS SH entry - */ -static inline unsigned ip_vs_sh_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; -} - - -/* - * Get ip_vs_dest associated with supplied parameters. - */ -static inline struct ip_vs_dest * -ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr) -{ - return (tbl[ip_vs_sh_hashkey(addr)]).dest; -} - - -/* - * Assign all the hash buckets of the specified table with the service. - */ -static int -ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) -{ - int i; - struct ip_vs_sh_bucket *b; - struct list_head *p; - struct ip_vs_dest *dest; - - b = tbl; - p = &svc->destinations; - for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { - if (p == &svc->destinations) - p = p->next; - - dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; - - p = p->next; - } - b++; - } - return 0; -} - - -/* - * Flush all the hash buckets of the specified table. - */ -static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) -{ - int i; - struct ip_vs_sh_bucket *b; - - b = tbl; - for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; - } - b++; - } -} - - -static int ip_vs_sh_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl; - - /* allocate the SH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); - - return 0; -} - - -static int ip_vs_sh_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - return 0; -} - - -static int ip_vs_sh_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); - - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); - - return 0; -} - - -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. - */ -static inline int is_overloaded(struct ip_vs_dest *dest) -{ - return dest->flags & IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Source Hashing scheduling - */ -static struct ip_vs_dest * -ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_sh_bucket *tbl; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_sh_bucket *)svc->sched_data; - dest = ip_vs_sh_get(tbl, iph->saddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { - return NULL; - } - - IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->saddr), - NIPQUAD(dest->addr), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS SH Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_sh_scheduler = -{ - .name = "sh", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .init_service = ip_vs_sh_init_svc, - .done_service = ip_vs_sh_done_svc, - .update_service = ip_vs_sh_update_svc, - .schedule = ip_vs_sh_schedule, -}; - - -static int __init ip_vs_sh_init(void) -{ - INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list); - return register_ip_vs_scheduler(&ip_vs_sh_scheduler); -} - - -static void __exit ip_vs_sh_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); -} - - -module_init(ip_vs_sh_init); -module_exit(ip_vs_sh_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c deleted file mode 100644 index eff54efe0351..000000000000 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ /dev/null @@ -1,1019 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the NetFilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * ip_vs_sync: sync connection info from master load balancer to backups - * through multicast - * - * Changes: - * Alexandre Cassen : Added master & backup support at a time. - * Alexandre Cassen : Added SyncID support for incoming sync - * messages filtering. - * Justin Ossevoort : Fix endian problem on sync message size. - */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/inetdevice.h> -#include <linux/net.h> -#include <linux/completion.h> -#include <linux/delay.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/igmp.h> /* for ip_mc_join_group */ -#include <linux/udp.h> - -#include <net/ip.h> -#include <net/sock.h> -#include <asm/uaccess.h> /* for get_fs and set_fs */ - -#include <net/ip_vs.h> - -#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ -#define IP_VS_SYNC_PORT 8848 /* multicast port */ - - -/* - * IPVS sync connection entry - */ -struct ip_vs_sync_conn { - __u8 reserved; - - /* Protocol, addresses and port numbers */ - __u8 protocol; /* Which protocol (TCP/UDP) */ - __be16 cport; - __be16 vport; - __be16 dport; - __be32 caddr; /* client address */ - __be32 vaddr; /* virtual address */ - __be32 daddr; /* destination address */ - - /* Flags and state transition */ - __be16 flags; /* status flags */ - __be16 state; /* state info */ - - /* The sequence options start here */ -}; - -struct ip_vs_sync_conn_options { - struct ip_vs_seq in_seq; /* incoming seq. struct */ - struct ip_vs_seq out_seq; /* outgoing seq. struct */ -}; - -struct ip_vs_sync_thread_data { - struct completion *startup; - int state; -}; - -#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) -#define FULL_CONN_SIZE \ -(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) - - -/* - The master mulitcasts messages to the backup load balancers in the - following format. - - 0 1 2 3 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Count Conns | SyncID | Size | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - | IPVS Sync Connection (1) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | . | - | . | - | . | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - | IPVS Sync Connection (n) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -*/ - -#define SYNC_MESG_HEADER_LEN 4 - -struct ip_vs_sync_mesg { - __u8 nr_conns; - __u8 syncid; - __u16 size; - - /* ip_vs_sync_conn entries start here */ -}; - -/* the maximum length of sync (sending/receiving) message */ -static int sync_send_mesg_maxlen; -static int sync_recv_mesg_maxlen; - -struct ip_vs_sync_buff { - struct list_head list; - unsigned long firstuse; - - /* pointers for the message data */ - struct ip_vs_sync_mesg *mesg; - unsigned char *head; - unsigned char *end; -}; - - -/* the sync_buff list head and the lock */ -static LIST_HEAD(ip_vs_sync_queue); -static DEFINE_SPINLOCK(ip_vs_sync_lock); - -/* current sync_buff for accepting new conn entries */ -static struct ip_vs_sync_buff *curr_sb = NULL; -static DEFINE_SPINLOCK(curr_sb_lock); - -/* ipvs sync daemon state */ -volatile int ip_vs_sync_state = IP_VS_STATE_NONE; -volatile int ip_vs_master_syncid = 0; -volatile int ip_vs_backup_syncid = 0; - -/* multicast interface name */ -char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - -/* multicast addr */ -static struct sockaddr_in mcast_addr; - - -static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) -{ - spin_lock(&ip_vs_sync_lock); - list_add_tail(&sb->list, &ip_vs_sync_queue); - spin_unlock(&ip_vs_sync_lock); -} - -static inline struct ip_vs_sync_buff * sb_dequeue(void) -{ - struct ip_vs_sync_buff *sb; - - spin_lock_bh(&ip_vs_sync_lock); - if (list_empty(&ip_vs_sync_queue)) { - sb = NULL; - } else { - sb = list_entry(ip_vs_sync_queue.next, - struct ip_vs_sync_buff, - list); - list_del(&sb->list); - } - spin_unlock_bh(&ip_vs_sync_lock); - - return sb; -} - -static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) -{ - struct ip_vs_sync_buff *sb; - - if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) - return NULL; - - if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { - kfree(sb); - return NULL; - } - sb->mesg->nr_conns = 0; - sb->mesg->syncid = ip_vs_master_syncid; - sb->mesg->size = 4; - sb->head = (unsigned char *)sb->mesg + 4; - sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; - sb->firstuse = jiffies; - return sb; -} - -static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) -{ - kfree(sb->mesg); - kfree(sb); -} - -/* - * Get the current sync buffer if it has been created for more - * than the specified time or the specified time is zero. - */ -static inline struct ip_vs_sync_buff * -get_curr_sync_buff(unsigned long time) -{ - struct ip_vs_sync_buff *sb; - - spin_lock_bh(&curr_sb_lock); - if (curr_sb && (time == 0 || - time_before(jiffies - curr_sb->firstuse, time))) { - sb = curr_sb; - curr_sb = NULL; - } else - sb = NULL; - spin_unlock_bh(&curr_sb_lock); - return sb; -} - - -/* - * Add an ip_vs_conn information into the current sync_buff. - * Called by ip_vs_in. - */ -void ip_vs_sync_conn(struct ip_vs_conn *cp) -{ - struct ip_vs_sync_mesg *m; - struct ip_vs_sync_conn *s; - int len; - - spin_lock(&curr_sb_lock); - if (!curr_sb) { - if (!(curr_sb=ip_vs_sync_buff_create())) { - spin_unlock(&curr_sb_lock); - IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); - return; - } - } - - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : - SIMPLE_CONN_SIZE; - m = curr_sb->mesg; - s = (struct ip_vs_sync_conn *)curr_sb->head; - - /* copy members */ - s->protocol = cp->protocol; - s->cport = cp->cport; - s->vport = cp->vport; - s->dport = cp->dport; - s->caddr = cp->caddr; - s->vaddr = cp->vaddr; - s->daddr = cp->daddr; - s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); - s->state = htons(cp->state); - if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { - struct ip_vs_sync_conn_options *opt = - (struct ip_vs_sync_conn_options *)&s[1]; - memcpy(opt, &cp->in_seq, sizeof(*opt)); - } - - m->nr_conns++; - m->size += len; - curr_sb->head += len; - - /* check if there is a space for next one */ - if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { - sb_queue_tail(curr_sb); - curr_sb = NULL; - } - spin_unlock(&curr_sb_lock); - - /* synchronize its controller if it has */ - if (cp->control) - ip_vs_sync_conn(cp->control); -} - - -/* - * Process received multicast message and create the corresponding - * ip_vs_conn entries. - */ -static void ip_vs_process_message(const char *buffer, const size_t buflen) -{ - struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; - struct ip_vs_sync_conn *s; - struct ip_vs_sync_conn_options *opt; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - struct ip_vs_dest *dest; - char *p; - int i; - - if (buflen < sizeof(struct ip_vs_sync_mesg)) { - IP_VS_ERR_RL("sync message header too short\n"); - return; - } - - /* Convert size back to host byte order */ - m->size = ntohs(m->size); - - if (buflen != m->size) { - IP_VS_ERR_RL("bogus sync message size\n"); - return; - } - - /* SyncID sanity check */ - if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { - IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", - m->syncid); - return; - } - - p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); - for (i=0; i<m->nr_conns; i++) { - unsigned flags, state; - - if (p + SIMPLE_CONN_SIZE > buffer+buflen) { - IP_VS_ERR_RL("bogus conn in sync message\n"); - return; - } - s = (struct ip_vs_sync_conn *) p; - flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; - flags &= ~IP_VS_CONN_F_HASHED; - if (flags & IP_VS_CONN_F_SEQ_MASK) { - opt = (struct ip_vs_sync_conn_options *)&s[1]; - p += FULL_CONN_SIZE; - if (p > buffer+buflen) { - IP_VS_ERR_RL("bogus conn options in sync message\n"); - return; - } - } else { - opt = NULL; - p += SIMPLE_CONN_SIZE; - } - - state = ntohs(s->state); - if (!(flags & IP_VS_CONN_F_TEMPLATE)) { - pp = ip_vs_proto_get(s->protocol); - if (!pp) { - IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", - s->protocol); - continue; - } - if (state >= pp->num_states) { - IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", - pp->name, state); - continue; - } - } else { - /* protocol in templates is not used for state/timeout */ - pp = NULL; - if (state > 0) { - IP_VS_DBG(2, "Invalid template state %u in sync msg\n", - state); - state = 0; - } - } - - if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(s->protocol, - s->caddr, s->cport, - s->vaddr, s->vport); - else - cp = ip_vs_ct_in_get(s->protocol, - s->caddr, s->cport, - s->vaddr, s->vport); - if (!cp) { - /* - * Find the appropriate destination for the connection. - * If it is not found the connection will remain unbound - * but still handled. - */ - dest = ip_vs_find_dest(s->daddr, s->dport, - s->vaddr, s->vport, - s->protocol); - /* Set the approprite ativity flag */ - if (s->protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } - cp = ip_vs_conn_new(s->protocol, - s->caddr, s->cport, - s->vaddr, s->vport, - s->daddr, s->dport, - flags, dest); - if (dest) - atomic_dec(&dest->refcnt); - if (!cp) { - IP_VS_ERR("ip_vs_conn_new failed\n"); - return; - } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } - - if (opt) - memcpy(&cp->in_seq, opt, sizeof(*opt)); - atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); - cp->state = state; - cp->old_state = cp->state; - /* - * We can not recover the right timeout for templates - * in all cases, we can not find the right fwmark - * virtual service. If needed, we can do it for - * non-fwmark persistent services. - */ - if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) - cp->timeout = pp->timeout_table[state]; - else - cp->timeout = (3*60*HZ); - ip_vs_conn_put(cp); - } -} - - -/* - * Setup loopback of outgoing multicasts on a sending socket - */ -static void set_mcast_loop(struct sock *sk, u_char loop) -{ - struct inet_sock *inet = inet_sk(sk); - - /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ - lock_sock(sk); - inet->mc_loop = loop ? 1 : 0; - release_sock(sk); -} - -/* - * Specify TTL for outgoing multicasts on a sending socket - */ -static void set_mcast_ttl(struct sock *sk, u_char ttl) -{ - struct inet_sock *inet = inet_sk(sk); - - /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ - lock_sock(sk); - inet->mc_ttl = ttl; - release_sock(sk); -} - -/* - * Specifiy default interface for outgoing multicasts - */ -static int set_mcast_if(struct sock *sk, char *ifname) -{ - struct net_device *dev; - struct inet_sock *inet = inet_sk(sk); - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - - if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) - return -EINVAL; - - lock_sock(sk); - inet->mc_index = dev->ifindex; - /* inet->mc_addr = 0; */ - release_sock(sk); - - return 0; -} - - -/* - * Set the maximum length of sync message according to the - * specified interface's MTU. - */ -static int set_sync_mesg_maxlen(int sync_state) -{ - struct net_device *dev; - int num; - - if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) - return -ENODEV; - - num = (dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr) - - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - sync_send_mesg_maxlen = - SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num; - IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", sync_send_mesg_maxlen); - } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) - return -ENODEV; - - sync_recv_mesg_maxlen = dev->mtu - - sizeof(struct iphdr) - sizeof(struct udphdr); - IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", sync_recv_mesg_maxlen); - } - - return 0; -} - - -/* - * Join a multicast group. - * the group is specified by a class D multicast address 224.0.0.0/8 - * in the in_addr structure passed in as a parameter. - */ -static int -join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) -{ - struct ip_mreqn mreq; - struct net_device *dev; - int ret; - - memset(&mreq, 0, sizeof(mreq)); - memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) - return -EINVAL; - - mreq.imr_ifindex = dev->ifindex; - - lock_sock(sk); - ret = ip_mc_join_group(sk, &mreq); - release_sock(sk); - - return ret; -} - - -static int bind_mcastif_addr(struct socket *sock, char *ifname) -{ - struct net_device *dev; - __be32 addr; - struct sockaddr_in sin; - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - - addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); - if (!addr) - IP_VS_ERR("You probably need to specify IP address on " - "multicast interface.\n"); - - IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", - ifname, NIPQUAD(addr)); - - /* Now bind the socket with the address of multicast interface */ - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr; - sin.sin_port = 0; - - return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); -} - -/* - * Set up sending multicast socket over UDP - */ -static struct socket * make_send_sock(void) -{ - struct socket *sock; - - /* First create a socket */ - if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { - IP_VS_ERR("Error during creation of socket; terminating\n"); - return NULL; - } - - if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) { - IP_VS_ERR("Error setting outbound mcast interface\n"); - goto error; - } - - set_mcast_loop(sock->sk, 0); - set_mcast_ttl(sock->sk, 1); - - if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) { - IP_VS_ERR("Error binding address of the mcast interface\n"); - goto error; - } - - if (sock->ops->connect(sock, - (struct sockaddr*)&mcast_addr, - sizeof(struct sockaddr), 0) < 0) { - IP_VS_ERR("Error connecting to the multicast addr\n"); - goto error; - } - - return sock; - - error: - sock_release(sock); - return NULL; -} - - -/* - * Set up receiving multicast socket over UDP - */ -static struct socket * make_receive_sock(void) -{ - struct socket *sock; - - /* First create a socket */ - if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { - IP_VS_ERR("Error during creation of socket; terminating\n"); - return NULL; - } - - /* it is equivalent to the REUSEADDR option in user-space */ - sock->sk->sk_reuse = 1; - - if (sock->ops->bind(sock, - (struct sockaddr*)&mcast_addr, - sizeof(struct sockaddr)) < 0) { - IP_VS_ERR("Error binding to the multicast addr\n"); - goto error; - } - - /* join the multicast group */ - if (join_mcast_group(sock->sk, - (struct in_addr*)&mcast_addr.sin_addr, - ip_vs_backup_mcast_ifn) < 0) { - IP_VS_ERR("Error joining to the multicast group\n"); - goto error; - } - - return sock; - - error: - sock_release(sock); - return NULL; -} - - -static int -ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) -{ - struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; - struct kvec iov; - int len; - - EnterFunction(7); - iov.iov_base = (void *)buffer; - iov.iov_len = length; - - len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); - - LeaveFunction(7); - return len; -} - -static void -ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) -{ - int msize; - - msize = msg->size; - - /* Put size in network byte order */ - msg->size = htons(msg->size); - - if (ip_vs_send_async(sock, (char *)msg, msize) != msize) - IP_VS_ERR("ip_vs_send_async error\n"); -} - -static int -ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) -{ - struct msghdr msg = {NULL,}; - struct kvec iov; - int len; - - EnterFunction(7); - - /* Receive a packet */ - iov.iov_base = buffer; - iov.iov_len = (size_t)buflen; - - len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); - - if (len < 0) - return -1; - - LeaveFunction(7); - return len; -} - - -static DECLARE_WAIT_QUEUE_HEAD(sync_wait); -static pid_t sync_master_pid = 0; -static pid_t sync_backup_pid = 0; - -static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait); -static int stop_master_sync = 0; -static int stop_backup_sync = 0; - -static void sync_master_loop(void) -{ - struct socket *sock; - struct ip_vs_sync_buff *sb; - - /* create the sending multicast socket */ - sock = make_send_sock(); - if (!sock) - return; - - IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " - "syncid = %d\n", - ip_vs_master_mcast_ifn, ip_vs_master_syncid); - - for (;;) { - while ((sb=sb_dequeue())) { - ip_vs_send_sync_msg(sock, sb->mesg); - ip_vs_sync_buff_release(sb); - } - - /* check if entries stay in curr_sb for 2 seconds */ - if ((sb = get_curr_sync_buff(2*HZ))) { - ip_vs_send_sync_msg(sock, sb->mesg); - ip_vs_sync_buff_release(sb); - } - - if (stop_master_sync) - break; - - msleep_interruptible(1000); - } - - /* clean up the sync_buff queue */ - while ((sb=sb_dequeue())) { - ip_vs_sync_buff_release(sb); - } - - /* clean up the current sync_buff */ - if ((sb = get_curr_sync_buff(0))) { - ip_vs_sync_buff_release(sb); - } - - /* release the sending multicast socket */ - sock_release(sock); -} - - -static void sync_backup_loop(void) -{ - struct socket *sock; - char *buf; - int len; - - if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) { - IP_VS_ERR("sync_backup_loop: kmalloc error\n"); - return; - } - - /* create the receiving multicast socket */ - sock = make_receive_sock(); - if (!sock) - goto out; - - IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " - "syncid = %d\n", - ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); - - for (;;) { - /* do you have data now? */ - while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) { - if ((len = - ip_vs_receive(sock, buf, - sync_recv_mesg_maxlen)) <= 0) { - IP_VS_ERR("receiving message error\n"); - break; - } - /* disable bottom half, because it accessed the data - shared by softirq while getting/creating conns */ - local_bh_disable(); - ip_vs_process_message(buf, len); - local_bh_enable(); - } - - if (stop_backup_sync) - break; - - msleep_interruptible(1000); - } - - /* release the sending multicast socket */ - sock_release(sock); - - out: - kfree(buf); -} - - -static void set_sync_pid(int sync_state, pid_t sync_pid) -{ - if (sync_state == IP_VS_STATE_MASTER) - sync_master_pid = sync_pid; - else if (sync_state == IP_VS_STATE_BACKUP) - sync_backup_pid = sync_pid; -} - -static void set_stop_sync(int sync_state, int set) -{ - if (sync_state == IP_VS_STATE_MASTER) - stop_master_sync = set; - else if (sync_state == IP_VS_STATE_BACKUP) - stop_backup_sync = set; - else { - stop_master_sync = set; - stop_backup_sync = set; - } -} - -static int sync_thread(void *startup) -{ - DECLARE_WAITQUEUE(wait, current); - mm_segment_t oldmm; - int state; - const char *name; - struct ip_vs_sync_thread_data *tinfo = startup; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) { - state = IP_VS_STATE_MASTER; - name = "ipvs_syncmaster"; - } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) { - state = IP_VS_STATE_BACKUP; - name = "ipvs_syncbackup"; - } else { - IP_VS_BUG(); - ip_vs_use_count_dec(); - return -EINVAL; - } - - daemonize(name); - - oldmm = get_fs(); - set_fs(KERNEL_DS); - - /* Block all signals */ - spin_lock_irq(¤t->sighand->siglock); - siginitsetinv(¤t->blocked, 0); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - /* set the maximum length of sync message */ - set_sync_mesg_maxlen(state); - - /* set up multicast address */ - mcast_addr.sin_family = AF_INET; - mcast_addr.sin_port = htons(IP_VS_SYNC_PORT); - mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP); - - add_wait_queue(&sync_wait, &wait); - - set_sync_pid(state, task_pid_nr(current)); - complete(tinfo->startup); - - /* - * once we call the completion queue above, we should - * null out that reference, since its allocated on the - * stack of the creating kernel thread - */ - tinfo->startup = NULL; - - /* processing master/backup loop here */ - if (state == IP_VS_STATE_MASTER) - sync_master_loop(); - else if (state == IP_VS_STATE_BACKUP) - sync_backup_loop(); - else IP_VS_BUG(); - - remove_wait_queue(&sync_wait, &wait); - - /* thread exits */ - - /* - * If we weren't explicitly stopped, then we - * exited in error, and should undo our state - */ - if ((!stop_master_sync) && (!stop_backup_sync)) - ip_vs_sync_state -= tinfo->state; - - set_sync_pid(state, 0); - IP_VS_INFO("sync thread stopped!\n"); - - set_fs(oldmm); - - /* decrease the module use count */ - ip_vs_use_count_dec(); - - set_stop_sync(state, 0); - wake_up(&stop_sync_wait); - - /* - * we need to free the structure that was allocated - * for us in start_sync_thread - */ - kfree(tinfo); - return 0; -} - - -static int fork_sync_thread(void *startup) -{ - pid_t pid; - - /* fork the sync thread here, then the parent process of the - sync thread is the init process after this thread exits. */ - repeat: - if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) { - IP_VS_ERR("could not create sync_thread due to %d... " - "retrying.\n", pid); - msleep_interruptible(1000); - goto repeat; - } - - return 0; -} - - -int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) -{ - DECLARE_COMPLETION_ONSTACK(startup); - pid_t pid; - struct ip_vs_sync_thread_data *tinfo; - - if ((state == IP_VS_STATE_MASTER && sync_master_pid) || - (state == IP_VS_STATE_BACKUP && sync_backup_pid)) - return -EEXIST; - - /* - * Note that tinfo will be freed in sync_thread on exit - */ - tinfo = kmalloc(sizeof(struct ip_vs_sync_thread_data), GFP_KERNEL); - if (!tinfo) - return -ENOMEM; - - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n", - sizeof(struct ip_vs_sync_conn)); - - ip_vs_sync_state |= state; - if (state == IP_VS_STATE_MASTER) { - strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, - sizeof(ip_vs_master_mcast_ifn)); - ip_vs_master_syncid = syncid; - } else { - strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, - sizeof(ip_vs_backup_mcast_ifn)); - ip_vs_backup_syncid = syncid; - } - - tinfo->state = state; - tinfo->startup = &startup; - - repeat: - if ((pid = kernel_thread(fork_sync_thread, tinfo, 0)) < 0) { - IP_VS_ERR("could not create fork_sync_thread due to %d... " - "retrying.\n", pid); - msleep_interruptible(1000); - goto repeat; - } - - wait_for_completion(&startup); - - return 0; -} - - -int stop_sync_thread(int state) -{ - DECLARE_WAITQUEUE(wait, current); - - if ((state == IP_VS_STATE_MASTER && !sync_master_pid) || - (state == IP_VS_STATE_BACKUP && !sync_backup_pid)) - return -ESRCH; - - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - IP_VS_INFO("stopping sync thread %d ...\n", - (state == IP_VS_STATE_MASTER) ? - sync_master_pid : sync_backup_pid); - - __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&stop_sync_wait, &wait); - set_stop_sync(state, 1); - ip_vs_sync_state -= state; - wake_up(&sync_wait); - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&stop_sync_wait, &wait); - - /* Note: no need to reap the sync thread, because its parent - process is the init process */ - - if ((state == IP_VS_STATE_MASTER && stop_master_sync) || - (state == IP_VS_STATE_BACKUP && stop_backup_sync)) - IP_VS_BUG(); - - return 0; -} diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c deleted file mode 100644 index 8a9d913261d8..000000000000 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ /dev/null @@ -1,151 +0,0 @@ -/* - * IPVS: Weighted Least-Connection Scheduling module - * - * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Peter Kese <peter.kese@ijs.si> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest - * Wensong Zhang : changed to use the inactconns in scheduling - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_wlc_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include <net/ip_vs.h> - - -static int -ip_vs_wlc_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_wlc_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_wlc_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static inline unsigned int -ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. (This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least; - unsigned int loh, doh; - - IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) { - least = dest; - loh = ip_vs_wlc_dest_overhead(least); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - doh = ip_vs_wlc_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_wlc_scheduler = -{ - .name = "wlc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .init_service = ip_vs_wlc_init_svc, - .done_service = ip_vs_wlc_done_svc, - .update_service = ip_vs_wlc_update_svc, - .schedule = ip_vs_wlc_schedule, -}; - - -static int __init ip_vs_wlc_init(void) -{ - INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list); - return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); -} - -static void __exit ip_vs_wlc_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); -} - -module_init(ip_vs_wlc_init); -module_exit(ip_vs_wlc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c deleted file mode 100644 index 85c680add6df..000000000000 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ /dev/null @@ -1,236 +0,0 @@ -/* - * IPVS: Weighted Round-Robin Scheduling module - * - * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_wrr_update_svc - * Julian Anastasov : fixed the bug of returning destination - * with weight 0 when all weights are zero - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/net.h> - -#include <net/ip_vs.h> - -/* - * current destination pointer for weighted round-robin scheduling - */ -struct ip_vs_wrr_mark { - struct list_head *cl; /* current list head */ - int cw; /* current weight */ - int mw; /* maximum weight */ - int di; /* decreasing interval */ -}; - - -/* - * Get the gcd of server weights - */ -static int gcd(int a, int b) -{ - int c; - - while ((c = a % b)) { - a = b; - b = c; - } - return b; -} - -static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - int weight; - int g = 0; - - list_for_each_entry(dest, &svc->destinations, n_list) { - weight = atomic_read(&dest->weight); - if (weight > 0) { - if (g > 0) - g = gcd(weight, g); - else - g = weight; - } - } - return g ? g : 1; -} - - -/* - * Get the maximum weight of the service destinations. - */ -static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - int weight = 0; - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (atomic_read(&dest->weight) > weight) - weight = atomic_read(&dest->weight); - } - - return weight; -} - - -static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_wrr_mark *mark; - - /* - * Allocate the mark variable for WRR scheduling - */ - mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); - if (mark == NULL) { - IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); - return -ENOMEM; - } - mark->cl = &svc->destinations; - mark->cw = 0; - mark->mw = ip_vs_wrr_max_weight(svc); - mark->di = ip_vs_wrr_gcd_weight(svc); - svc->sched_data = mark; - - return 0; -} - - -static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) -{ - /* - * Release the mark variable - */ - kfree(svc->sched_data); - - return 0; -} - - -static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_wrr_mark *mark = svc->sched_data; - - mark->cl = &svc->destinations; - mark->mw = ip_vs_wrr_max_weight(svc); - mark->di = ip_vs_wrr_gcd_weight(svc); - if (mark->cw > mark->mw) - mark->cw = 0; - return 0; -} - - -/* - * Weighted Round-Robin Scheduling - */ -static struct ip_vs_dest * -ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_wrr_mark *mark = svc->sched_data; - struct list_head *p; - - IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); - - /* - * This loop will always terminate, because mark->cw in (0, max_weight] - * and at least one server has its weight equal to max_weight. - */ - write_lock(&svc->sched_lock); - p = mark->cl; - while (1) { - if (mark->cl == &svc->destinations) { - /* it is at the head of the destination list */ - - if (mark->cl == mark->cl->next) { - /* no dest entry */ - dest = NULL; - goto out; - } - - mark->cl = svc->destinations.next; - mark->cw -= mark->di; - if (mark->cw <= 0) { - mark->cw = mark->mw; - /* - * Still zero, which means no available servers. - */ - if (mark->cw == 0) { - mark->cl = &svc->destinations; - IP_VS_ERR_RL("ip_vs_wrr_schedule(): " - "no available servers\n"); - dest = NULL; - goto out; - } - } - } else - mark->cl = mark->cl->next; - - if (mark->cl != &svc->destinations) { - /* not at the head of the list */ - dest = list_entry(mark->cl, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) >= mark->cw) { - /* got it */ - break; - } - } - - if (mark->cl == p && mark->cw == mark->di) { - /* back to the start, and no dest is found. - It is only possible when all dests are OVERLOADED */ - dest = NULL; - goto out; - } - } - - IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d\n", - NIPQUAD(dest->addr), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), - atomic_read(&dest->weight)); - - out: - write_unlock(&svc->sched_lock); - return dest; -} - - -static struct ip_vs_scheduler ip_vs_wrr_scheduler = { - .name = "wrr", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .init_service = ip_vs_wrr_init_svc, - .done_service = ip_vs_wrr_done_svc, - .update_service = ip_vs_wrr_update_svc, - .schedule = ip_vs_wrr_schedule, -}; - -static int __init ip_vs_wrr_init(void) -{ - INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list); - return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; -} - -static void __exit ip_vs_wrr_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); -} - -module_init(ip_vs_wrr_init); -module_exit(ip_vs_wrr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c deleted file mode 100644 index f63006caea03..000000000000 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ /dev/null @@ -1,561 +0,0 @@ -/* - * ip_vs_xmit.c: various packet transmitters for IPVS - * - * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $ - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include <linux/kernel.h> -#include <linux/tcp.h> /* for tcphdr */ -#include <net/ip.h> -#include <net/tcp.h> /* for csum_tcpudp_magic */ -#include <net/udp.h> -#include <net/icmp.h> /* for icmp_send */ -#include <net/route.h> /* for ip_route_output */ -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> - -#include <net/ip_vs.h> - - -/* - * Destination cache to speed up outgoing route lookup - */ -static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) -{ - struct dst_entry *old_dst; - - old_dst = dest->dst_cache; - dest->dst_cache = dst; - dest->dst_rtos = rtos; - dst_release(old_dst); -} - -static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) -{ - struct dst_entry *dst = dest->dst_cache; - - if (!dst) - return NULL; - if ((dst->obsolete || rtos != dest->dst_rtos) && - dst->ops->check(dst, cookie) == NULL) { - dest->dst_cache = NULL; - dst_release(dst); - return NULL; - } - dst_hold(dst); - return dst; -} - -static struct rtable * -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) -{ - struct rtable *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; - - if (dest) { - spin_lock(&dest->dst_lock); - if (!(rt = (struct rtable *) - __ip_vs_dst_check(dest, rtos, 0))) { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = dest->addr, - .saddr = 0, - .tos = rtos, } }, - }; - - if (ip_route_output_key(&init_net, &rt, &fl)) { - spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip_route_output error, " - "dest: %u.%u.%u.%u\n", - NIPQUAD(dest->addr)); - return NULL; - } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); - IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", - NIPQUAD(dest->addr), - atomic_read(&rt->u.dst.__refcnt), rtos); - } - spin_unlock(&dest->dst_lock); - } else { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = cp->daddr, - .saddr = 0, - .tos = rtos, } }, - }; - - if (ip_route_output_key(&init_net, &rt, &fl)) { - IP_VS_DBG_RL("ip_route_output error, dest: " - "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); - return NULL; - } - } - - return rt; -} - - -/* - * Release dest->dst_cache before a dest is removed - */ -void -ip_vs_dst_reset(struct ip_vs_dest *dest) -{ - struct dst_entry *old_dst; - - old_dst = dest->dst_cache; - dest->dst_cache = NULL; - dst_release(old_dst); -} - -#define IP_VS_XMIT(skb, rt) \ -do { \ - (skb)->ipvs_property = 1; \ - skb_forward_csum(skb); \ - NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL, \ - (rt)->u.dst.dev, dst_output); \ -} while (0) - - -/* - * NULL transmitter (do nothing except return NF_ACCEPT) - */ -int -ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - /* we do not touch skb and do not need pskb ptr */ - return NF_ACCEPT; -} - - -/* - * Bypass transmitter - * Let packets bypass the destination when the destination is not - * available, it may be only used in transparent cache cluster. - */ -int -ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - u8 tos = iph->tos; - int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = iph->daddr, - .saddr = 0, - .tos = RT_TOS(tos), } }, - }; - - EnterFunction(10); - - if (ip_route_output_key(&init_net, &rt, &fl)) { - IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " - "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); - goto tx_error_icmp; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - - -/* - * NAT transmitter (only for outside-to-inside nat forwarding) - * Not used for related ICMP - */ -int -ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - int mtu; - struct iphdr *iph = ip_hdr(skb); - - EnterFunction(10); - - /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { - __be16 _pt, *p; - p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); - if (p == NULL) - goto tx_error; - ip_vs_conn_fill_cport(cp, *p); - IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); - } - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error; - ip_hdr(skb)->daddr = cp->daddr; - ip_send_check(ip_hdr(skb)); - - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - - /* FIXME: when application helper enlarges the packet and the length - is larger than the MTU of outgoing device, there will be still - MTU problem. */ - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - LeaveFunction(10); - kfree_skb(skb); - return NF_STOLEN; - tx_error_put: - ip_rt_put(rt); - goto tx_error; -} - - -/* - * IP Tunneling transmitter - * - * This function encapsulates the packet in a new IP packet, its - * destination will be set to cp->daddr. Most code of this function - * is taken from ipip.c. - * - * It is used in VS/TUN cluster. The load balancer selects a real - * server from a cluster based on a scheduling algorithm, - * encapsulates the request packet and forwards it to the selected - * server. For example, all real servers are configured with - * "ifconfig tunl0 <Virtual IP Address> up". When the server receives - * the encapsulated packet, it will decapsulate the packet, processe - * the request and return the response packets directly to the client - * without passing the load balancer. This can greatly increase the - * scalability of virtual server. - * - * Used for ANY protocol - */ -int -ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = ip_hdr(skb); - u8 tos = old_iph->tos; - __be16 df = old_iph->frag_off; - sk_buff_data_t old_transport_header = skb->transport_header; - struct iphdr *iph; /* Our new IP header */ - unsigned int max_headroom; /* The extra header space needed */ - int mtu; - - EnterFunction(10); - - if (skb->protocol != htons(ETH_P_IP)) { - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " - "ETH_P_IP: %d, skb protocol: %d\n", - htons(ETH_P_IP), skb->protocol); - goto tx_error; - } - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) - goto tx_error_icmp; - - tdev = rt->u.dst.dev; - - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); - if (mtu < 68) { - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); - goto tx_error; - } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); - - df |= (old_iph->frag_off & htons(IP_DF)); - - if ((old_iph->frag_off & htons(IP_DF)) - && mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Okay, now see if we can stuff it in the buffer as-is. - */ - max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - ip_rt_put(rt); - kfree_skb(skb); - IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); - return NF_STOLEN; - } - kfree_skb(skb); - skb = new_skb; - old_iph = ip_hdr(skb); - } - - skb->transport_header = old_transport_header; - - /* fix old IP header checksum */ - ip_send_check(old_iph); - - skb_push(skb, sizeof(struct iphdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* - * Push down and install the IPIP header. - */ - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr)>>2; - iph->frag_off = df; - iph->protocol = IPPROTO_IPIP; - iph->tos = tos; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; - iph->ttl = old_iph->ttl; - ip_select_ident(iph, &rt->u.dst, NULL); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - ip_local_out(skb); - - LeaveFunction(10); - - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - - -/* - * Direct Routing transmitter - * Used for ANY protocol - */ -int -ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - int mtu; - - EnterFunction(10); - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - - -/* - * ICMP packet transmitter - * called by the ip_vs_in_icmp - */ -int -ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset) -{ - struct rtable *rt; /* Route to the other host */ - int mtu; - int rc; - - EnterFunction(10); - - /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be - forwarded directly here, because there is no need to - translate address/port back */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { - if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); - else - rc = NF_ACCEPT; - /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); - goto out; - } - - /* - * mangle and send the packet here (only for VS/NAT) - */ - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop the old route when skb is not shared */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - ip_vs_nat_icmp(skb, pp, cp, 0); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(skb, rt); - - rc = NF_STOLEN; - goto out; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - dev_kfree_skb(skb); - rc = NF_STOLEN; - out: - LeaveFunction(10); - return rc; - tx_error_put: - ip_rt_put(rt); - goto tx_error; -} diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index f8edacdf991d..6efdb70b3eb2 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -12,6 +12,7 @@ /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) { + struct net *net = dev_net(skb->dst->dev); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi fl = {}; @@ -19,7 +20,9 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) unsigned int hh_len; unsigned int type; - type = inet_addr_type(&init_net, iph->saddr); + type = inet_addr_type(net, iph->saddr); + if (skb->sk && inet_sk(skb->sk)->transparent) + type = RTN_LOCAL; if (addr_type == RTN_UNSPEC) addr_type = type; @@ -33,7 +36,8 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; fl.mark = skb->mark; - if (ip_route_output_key(&init_net, &rt, &fl) != 0) + fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; + if (ip_route_output_key(net, &rt, &fl) != 0) return -1; /* Drop old route. */ @@ -43,7 +47,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ fl.nl_u.ip4_u.daddr = iph->saddr; - if (ip_route_output_key(&init_net, &rt, &fl) != 0) + if (ip_route_output_key(net, &rt, &fl) != 0) return -1; odst = skb->dst; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 2767841a8cef..3816e1dc9295 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -5,10 +5,15 @@ menu "IP: Netfilter Configuration" depends on INET && NETFILTER +config NF_DEFRAG_IPV4 + tristate + default n + config NF_CONNTRACK_IPV4 tristate "IPv4 connection tracking support (required for NAT)" depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n + select NF_DEFRAG_IPV4 ---help--- Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related @@ -56,23 +61,30 @@ config IP_NF_IPTABLES To compile it as a module, choose M here. If unsure, say N. +if IP_NF_IPTABLES + # The matches. -config IP_NF_MATCH_RECENT - tristate '"recent" match support' - depends on IP_NF_IPTABLES +config IP_NF_MATCH_ADDRTYPE + tristate '"addrtype" address type match support' depends on NETFILTER_ADVANCED help - This match is used for creating one or many lists of recently - used addresses and then matching against that/those list(s). + This option allows you to match what routing thinks of an address, + eg. UNICAST, LOCAL, BROADCAST, ... - Short options are available by using 'iptables -m recent -h' - Official Website: <http://snowman.net/projects/ipt_recent/> + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config IP_NF_MATCH_AH + tristate '"ah" match support' + depends on NETFILTER_ADVANCED + help + This match extension allows you to match a range of SPIs + inside AH header of IPSec packets. To compile it as a module, choose M here. If unsure, say N. config IP_NF_MATCH_ECN tristate '"ecn" match support' - depends on IP_NF_IPTABLES depends on NETFILTER_ADVANCED help This option adds a `ECN' match, which allows you to match against @@ -80,19 +92,8 @@ config IP_NF_MATCH_ECN To compile it as a module, choose M here. If unsure, say N. -config IP_NF_MATCH_AH - tristate '"ah" match support' - depends on IP_NF_IPTABLES - depends on NETFILTER_ADVANCED - help - This match extension allows you to match a range of SPIs - inside AH header of IPSec packets. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_MATCH_TTL tristate '"ttl" match support' - depends on IP_NF_IPTABLES depends on NETFILTER_ADVANCED help This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user @@ -100,21 +101,9 @@ config IP_NF_MATCH_TTL To compile it as a module, choose M here. If unsure, say N. -config IP_NF_MATCH_ADDRTYPE - tristate '"addrtype" address type match support' - depends on IP_NF_IPTABLES - depends on NETFILTER_ADVANCED - help - This option allows you to match what routing thinks of an address, - eg. UNICAST, LOCAL, BROADCAST, ... - - If you want to compile it as a module, say M here and read - <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. - # `filter', generic and specific targets config IP_NF_FILTER tristate "Packet filtering" - depends on IP_NF_IPTABLES default m if NETFILTER_ADVANCED=n help Packet filtering defines a table `filter', which has a series of @@ -136,7 +125,6 @@ config IP_NF_TARGET_REJECT config IP_NF_TARGET_LOG tristate "LOG target support" - depends on IP_NF_IPTABLES default m if NETFILTER_ADVANCED=n help This option adds a `LOG' target, which allows you to create rules in @@ -146,7 +134,6 @@ config IP_NF_TARGET_LOG config IP_NF_TARGET_ULOG tristate "ULOG target support" - depends on IP_NF_IPTABLES default m if NETFILTER_ADVANCED=n ---help--- @@ -167,7 +154,7 @@ config IP_NF_TARGET_ULOG # NAT + specific targets: nf_conntrack config NF_NAT tristate "Full NAT" - depends on IP_NF_IPTABLES && NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK_IPV4 default m if NETFILTER_ADVANCED=n help The Full NAT option allows masquerading, port forwarding and other @@ -194,27 +181,26 @@ config IP_NF_TARGET_MASQUERADE To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_REDIRECT - tristate "REDIRECT target support" +config IP_NF_TARGET_NETMAP + tristate "NETMAP target support" depends on NF_NAT depends on NETFILTER_ADVANCED help - REDIRECT is a special case of NAT: all incoming connections are - mapped onto the incoming interface's address, causing the packets to - come to the local machine instead of passing through. This is - useful for transparent proxies. + NETMAP is an implementation of static 1:1 NAT mapping of network + addresses. It maps the network address part, while keeping the host + address part intact. To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_NETMAP - tristate "NETMAP target support" +config IP_NF_TARGET_REDIRECT + tristate "REDIRECT target support" depends on NF_NAT depends on NETFILTER_ADVANCED help - NETMAP is an implementation of static 1:1 NAT mapping of network - addresses. It maps the network address part, while keeping the host - address part intact. It is similar to Fast NAT, except that - Netfilter's connection tracking doesn't work well with Fast NAT. + REDIRECT is a special case of NAT: all incoming connections are + mapped onto the incoming interface's address, causing the packets to + come to the local machine instead of passing through. This is + useful for transparent proxies. To compile it as a module, choose M here. If unsure, say N. @@ -263,44 +249,43 @@ config NF_NAT_PROTO_SCTP config NF_NAT_FTP tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_FTP config NF_NAT_IRC tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_IRC config NF_NAT_TFTP tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_TFTP config NF_NAT_AMANDA tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_AMANDA config NF_NAT_PPTP tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_PPTP select NF_NAT_PROTO_GRE config NF_NAT_H323 tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_H323 config NF_NAT_SIP tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_SIP # mangle + specific targets config IP_NF_MANGLE tristate "Packet mangling" - depends on IP_NF_IPTABLES default m if NETFILTER_ADVANCED=n help This option adds a `mangle' table to iptables: see the man page for @@ -309,6 +294,19 @@ config IP_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. +config IP_NF_TARGET_CLUSTERIP + tristate "CLUSTERIP target support (EXPERIMENTAL)" + depends on IP_NF_MANGLE && EXPERIMENTAL + depends on NF_CONNTRACK_IPV4 + depends on NETFILTER_ADVANCED + select NF_CONNTRACK_MARK + help + The CLUSTERIP target allows you to build load-balancing clusters of + network servers without having a dedicated load-balancing + router/server/switch. + + To compile it as a module, choose M here. If unsure, say N. + config IP_NF_TARGET_ECN tristate "ECN target support" depends on IP_NF_MANGLE @@ -339,23 +337,9 @@ config IP_NF_TARGET_TTL To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_CLUSTERIP - tristate "CLUSTERIP target support (EXPERIMENTAL)" - depends on IP_NF_MANGLE && EXPERIMENTAL - depends on NF_CONNTRACK_IPV4 - depends on NETFILTER_ADVANCED - select NF_CONNTRACK_MARK - help - The CLUSTERIP target allows you to build load-balancing clusters of - network servers without having a dedicated load-balancing - router/server/switch. - - To compile it as a module, choose M here. If unsure, say N. - # raw + specific targets config IP_NF_RAW tristate 'raw table support (required for NOTRACK/TRACE)' - depends on IP_NF_IPTABLES depends on NETFILTER_ADVANCED help This option adds a `raw' table to iptables. This table is the very @@ -365,6 +349,19 @@ config IP_NF_RAW If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +# security table for MAC policy +config IP_NF_SECURITY + tristate "Security table" + depends on SECURITY + depends on NETFILTER_ADVANCED + help + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. + + If unsure, say N. + +endif # IP_NF_IPTABLES + # ARP tables config IP_NF_ARPTABLES tristate "ARP tables support" @@ -377,9 +374,10 @@ config IP_NF_ARPTABLES To compile it as a module, choose M here. If unsure, say N. +if IP_NF_ARPTABLES + config IP_NF_ARPFILTER tristate "ARP packet filtering" - depends on IP_NF_ARPTABLES help ARP packet filtering defines a table `filter', which has a series of rules for simple ARP packet filtering at local input and @@ -390,10 +388,11 @@ config IP_NF_ARPFILTER config IP_NF_ARP_MANGLE tristate "ARP payload mangling" - depends on IP_NF_ARPTABLES help Allows altering the ARP packet payload: source and destination hardware and network addresses. +endif # IP_NF_ARPTABLES + endmenu diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index d9b92fbf5579..5f9b650d90fc 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -18,6 +18,9 @@ obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o obj-$(CONFIG_NF_NAT) += nf_nat.o +# defrag +obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o + # NAT helpers (nf_conntrack) obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o @@ -42,12 +45,12 @@ obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o obj-$(CONFIG_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o +obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o # matches obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o -obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o # targets diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 03e83a65aec5..8d70d29f1ccf 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -200,15 +200,12 @@ static inline int arp_checkentry(const struct arpt_arp *arp) return 1; } -static unsigned int arpt_error(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +arpt_error(struct sk_buff *skb, const struct xt_target_param *par) { if (net_ratelimit()) - printk("arp_tables: error: '%s'\n", (char *)targinfo); + printk("arp_tables: error: '%s'\n", + (const char *)par->targinfo); return NF_DROP; } @@ -232,6 +229,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, const char *indev, *outdev; void *table_base; const struct xt_table_info *private; + struct xt_target_param tgpar; if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) return NF_DROP; @@ -245,6 +243,11 @@ unsigned int arpt_do_table(struct sk_buff *skb, e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); + tgpar.in = in; + tgpar.out = out; + tgpar.hooknum = hook; + tgpar.family = NFPROTO_ARP; + arp = arp_hdr(skb); do { if (arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { @@ -290,11 +293,10 @@ unsigned int arpt_do_table(struct sk_buff *skb, /* Targets which reenter must return * abs. verdicts */ + tgpar.target = t->u.kernel.target; + tgpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, - in, out, - hook, - t->u.kernel.target, - t->data); + &tgpar); /* Target might have changed stuff. */ arp = arp_hdr(skb); @@ -456,23 +458,24 @@ static inline int check_entry(struct arpt_entry *e, const char *name) static inline int check_target(struct arpt_entry *e, const char *name) { - struct arpt_entry_target *t; - struct xt_target *target; + struct arpt_entry_target *t = arpt_get_target(e); int ret; - - t = arpt_get_target(e); - target = t->u.kernel.target; - - ret = xt_check_target(target, NF_ARP, t->u.target_size - sizeof(*t), - name, e->comefrom, 0, 0); - if (!ret && t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, e, target, t->data, - e->comefrom)) { + struct xt_tgchk_param par = { + .table = name, + .entryinfo = e, + .target = t->u.kernel.target, + .targinfo = t->data, + .hook_mask = e->comefrom, + .family = NFPROTO_ARP, + }; + + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); + if (ret < 0) { duprintf("arp_tables: check failed for `%s'.\n", t->u.kernel.target->name); - ret = -EINVAL; + return ret; } - return ret; + return 0; } static inline int @@ -488,7 +491,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, return ret; t = arpt_get_target(e); - target = try_then_request_module(xt_find_target(NF_ARP, t->u.user.name, + target = try_then_request_module(xt_find_target(NFPROTO_ARP, + t->u.user.name, t->u.user.revision), "arpt_%s", t->u.user.name); if (IS_ERR(target) || !target) { @@ -554,15 +558,19 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) { + struct xt_tgdtor_param par; struct arpt_entry_target *t; if (i && (*i)-- == 0) return 1; t = arpt_get_target(e); - if (t->u.kernel.target->destroy) - t->u.kernel.target->destroy(t->u.kernel.target, t->data); - module_put(t->u.kernel.target->me); + par.target = t->u.kernel.target; + par.targinfo = t->data; + par.family = NFPROTO_ARP; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); return 0; } @@ -788,7 +796,7 @@ static void compat_standard_from_user(void *dst, void *src) int v = *(compat_int_t *)src; if (v > 0) - v += xt_compat_calc_jump(NF_ARP, v); + v += xt_compat_calc_jump(NFPROTO_ARP, v); memcpy(dst, &v, sizeof(v)); } @@ -797,7 +805,7 @@ static int compat_standard_to_user(void __user *dst, void *src) compat_int_t cv = *(int *)src; if (cv > 0) - cv -= xt_compat_calc_jump(NF_ARP, cv); + cv -= xt_compat_calc_jump(NFPROTO_ARP, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } @@ -815,7 +823,7 @@ static int compat_calc_entry(struct arpt_entry *e, t = arpt_get_target(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; - ret = xt_compat_add_offset(NF_ARP, entry_offset, off); + ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); if (ret) return ret; @@ -866,9 +874,9 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT if (compat) - xt_compat_lock(NF_ARP); + xt_compat_lock(NFPROTO_ARP); #endif - t = try_then_request_module(xt_find_table_lock(net, NF_ARP, name), + t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), "arptable_%s", name); if (t && !IS_ERR(t)) { struct arpt_getinfo info; @@ -878,7 +886,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (compat) { struct xt_table_info tmp; ret = compat_table_info(private, &tmp); - xt_compat_flush_offsets(NF_ARP); + xt_compat_flush_offsets(NFPROTO_ARP); private = &tmp; } #endif @@ -901,7 +909,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) ret = t ? PTR_ERR(t) : -ENOENT; #ifdef CONFIG_COMPAT if (compat) - xt_compat_unlock(NF_ARP); + xt_compat_unlock(NFPROTO_ARP); #endif return ret; } @@ -925,7 +933,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, return -EINVAL; } - t = xt_find_table_lock(net, NF_ARP, get.name); + t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (t && !IS_ERR(t)) { const struct xt_table_info *private = t->private; @@ -967,7 +975,7 @@ static int __do_replace(struct net *net, const char *name, goto out; } - t = try_then_request_module(xt_find_table_lock(net, NF_ARP, name), + t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), "arptable_%s", name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; @@ -1134,7 +1142,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - t = xt_find_table_lock(net, NF_ARP, name); + t = xt_find_table_lock(net, NFPROTO_ARP, name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; @@ -1218,7 +1226,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, entry_offset = (void *)e - (void *)base; t = compat_arpt_get_target(e); - target = try_then_request_module(xt_find_target(NF_ARP, + target = try_then_request_module(xt_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision), "arpt_%s", t->u.user.name); @@ -1232,7 +1240,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, off += xt_compat_target_offset(target); *size += off; - ret = xt_compat_add_offset(NF_ARP, entry_offset, off); + ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); if (ret) goto release_target; @@ -1333,7 +1341,7 @@ static int translate_compat_table(const char *name, duprintf("translate_compat_table: size %u\n", info->size); j = 0; - xt_compat_lock(NF_ARP); + xt_compat_lock(NFPROTO_ARP); /* Walk through entries, checking offsets. */ ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, check_compat_entry_size_and_hooks, @@ -1383,8 +1391,8 @@ static int translate_compat_table(const char *name, ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_copy_entry_from_user, &pos, &size, name, newinfo, entry1); - xt_compat_flush_offsets(NF_ARP); - xt_compat_unlock(NF_ARP); + xt_compat_flush_offsets(NFPROTO_ARP); + xt_compat_unlock(NFPROTO_ARP); if (ret) goto free_newinfo; @@ -1420,8 +1428,8 @@ out: COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); return ret; out_unlock: - xt_compat_flush_offsets(NF_ARP); - xt_compat_unlock(NF_ARP); + xt_compat_flush_offsets(NFPROTO_ARP); + xt_compat_unlock(NFPROTO_ARP); goto out; } @@ -1607,8 +1615,8 @@ static int compat_get_entries(struct net *net, return -EINVAL; } - xt_compat_lock(NF_ARP); - t = xt_find_table_lock(net, NF_ARP, get.name); + xt_compat_lock(NFPROTO_ARP); + t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (t && !IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; @@ -1623,13 +1631,13 @@ static int compat_get_entries(struct net *net, private->size, get.size); ret = -EAGAIN; } - xt_compat_flush_offsets(NF_ARP); + xt_compat_flush_offsets(NFPROTO_ARP); module_put(t->me); xt_table_unlock(t); } else ret = t ? PTR_ERR(t) : -ENOENT; - xt_compat_unlock(NF_ARP); + xt_compat_unlock(NFPROTO_ARP); return ret; } @@ -1709,7 +1717,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len break; } - try_then_request_module(xt_find_revision(NF_ARP, rev.name, + try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, rev.revision, 1, &ret), "arpt_%s", rev.name); break; @@ -1787,7 +1795,7 @@ void arpt_unregister_table(struct xt_table *table) static struct xt_target arpt_standard_target __read_mostly = { .name = ARPT_STANDARD_TARGET, .targetsize = sizeof(int), - .family = NF_ARP, + .family = NFPROTO_ARP, #ifdef CONFIG_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, @@ -1799,7 +1807,7 @@ static struct xt_target arpt_error_target __read_mostly = { .name = ARPT_ERROR_TARGET, .target = arpt_error, .targetsize = ARPT_FUNCTION_MAXNAMELEN, - .family = NF_ARP, + .family = NFPROTO_ARP, }; static struct nf_sockopt_ops arpt_sockopts = { @@ -1821,12 +1829,12 @@ static struct nf_sockopt_ops arpt_sockopts = { static int __net_init arp_tables_net_init(struct net *net) { - return xt_proto_init(net, NF_ARP); + return xt_proto_init(net, NFPROTO_ARP); } static void __net_exit arp_tables_net_exit(struct net *net) { - xt_proto_fini(net, NF_ARP); + xt_proto_fini(net, NFPROTO_ARP); } static struct pernet_operations arp_tables_net_ops = { diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index a385959d2655..b0d5b1d0a769 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -9,12 +9,9 @@ MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); MODULE_DESCRIPTION("arptables arp payload mangle target"); static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - unsigned int hooknum, const struct xt_target *target, - const void *targinfo) +target(struct sk_buff *skb, const struct xt_target_param *par) { - const struct arpt_mangle *mangle = targinfo; + const struct arpt_mangle *mangle = par->targinfo; const struct arphdr *arp; unsigned char *arpptr; int pln, hln; @@ -57,11 +54,9 @@ target(struct sk_buff *skb, return mangle->target; } -static bool -checkentry(const char *tablename, const void *e, const struct xt_target *target, - void *targinfo, unsigned int hook_mask) +static bool checkentry(const struct xt_tgchk_param *par) { - const struct arpt_mangle *mangle = targinfo; + const struct arpt_mangle *mangle = par->targinfo; if (mangle->flags & ~ARPT_MANGLE_MASK || !(mangle->flags & ARPT_MANGLE_MASK)) @@ -75,7 +70,7 @@ checkentry(const char *tablename, const void *e, const struct xt_target *target, static struct xt_target arpt_mangle_reg __read_mostly = { .name = "mangle", - .family = NF_ARP, + .family = NFPROTO_ARP, .target = target, .targetsize = sizeof(struct arpt_mangle), .checkentry = checkentry, diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 3be4d07e7ed9..bee3d117661a 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -51,38 +51,59 @@ static struct xt_table packet_filter = { .lock = __RW_LOCK_UNLOCKED(packet_filter.lock), .private = NULL, .me = THIS_MODULE, - .af = NF_ARP, + .af = NFPROTO_ARP, }; /* The work comes in here from netfilter.c */ -static unsigned int arpt_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int arpt_in_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return arpt_do_table(skb, hook, in, out, init_net.ipv4.arptable_filter); + return arpt_do_table(skb, hook, in, out, + dev_net(in)->ipv4.arptable_filter); +} + +static unsigned int arpt_out_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return arpt_do_table(skb, hook, in, out, + dev_net(out)->ipv4.arptable_filter); +} + +static unsigned int arpt_forward_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return arpt_do_table(skb, hook, in, out, + dev_net(in)->ipv4.arptable_filter); } static struct nf_hook_ops arpt_ops[] __read_mostly = { { - .hook = arpt_hook, + .hook = arpt_in_hook, .owner = THIS_MODULE, - .pf = NF_ARP, + .pf = NFPROTO_ARP, .hooknum = NF_ARP_IN, .priority = NF_IP_PRI_FILTER, }, { - .hook = arpt_hook, + .hook = arpt_out_hook, .owner = THIS_MODULE, - .pf = NF_ARP, + .pf = NFPROTO_ARP, .hooknum = NF_ARP_OUT, .priority = NF_IP_PRI_FILTER, }, { - .hook = arpt_hook, + .hook = arpt_forward_hook, .owner = THIS_MODULE, - .pf = NF_ARP, + .pf = NFPROTO_ARP, .hooknum = NF_ARP_FORWARD, .priority = NF_IP_PRI_FILTER, }, diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 26a37cedcf2e..432ce9d1c11c 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -156,7 +156,6 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) case IPQ_COPY_META: case IPQ_COPY_NONE: size = NLMSG_SPACE(sizeof(*pmsg)); - data_len = 0; break; case IPQ_COPY_PACKET: @@ -224,8 +223,6 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) return skb; nlmsg_failure: - if (skb) - kfree_skb(skb); *errp = -EINVAL; printk(KERN_ERR "ip_queue: error creating packet message\n"); return NULL; @@ -480,7 +477,7 @@ ipq_rcv_dev_event(struct notifier_block *this, { struct net_device *dev = ptr; - if (dev_net(dev) != &init_net) + if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; /* Drop any packets associated with the downed device */ diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4e7c719445c2..213fb27debc1 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -171,31 +171,25 @@ ip_checkentry(const struct ipt_ip *ip) } static unsigned int -ipt_error(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +ipt_error(struct sk_buff *skb, const struct xt_target_param *par) { if (net_ratelimit()) - printk("ip_tables: error: `%s'\n", (char *)targinfo); + printk("ip_tables: error: `%s'\n", + (const char *)par->targinfo); return NF_DROP; } /* Performance critical - called for every packet */ static inline bool -do_match(struct ipt_entry_match *m, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int offset, - bool *hotdrop) +do_match(struct ipt_entry_match *m, const struct sk_buff *skb, + struct xt_match_param *par) { + par->match = m->u.kernel.match; + par->matchinfo = m->data; + /* Stop iteration if it doesn't match */ - if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data, - offset, ip_hdrlen(skb), hotdrop)) + if (!m->u.kernel.match->match(skb, par)) return true; else return false; @@ -326,7 +320,6 @@ ipt_do_table(struct sk_buff *skb, struct xt_table *table) { static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); - u_int16_t offset; const struct iphdr *ip; u_int16_t datalen; bool hotdrop = false; @@ -336,6 +329,8 @@ ipt_do_table(struct sk_buff *skb, void *table_base; struct ipt_entry *e, *back; struct xt_table_info *private; + struct xt_match_param mtpar; + struct xt_target_param tgpar; /* Initialization */ ip = ip_hdr(skb); @@ -348,7 +343,13 @@ ipt_do_table(struct sk_buff *skb, * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ - offset = ntohs(ip->frag_off) & IP_OFFSET; + mtpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; + mtpar.thoff = ip_hdrlen(skb); + mtpar.hotdrop = &hotdrop; + mtpar.in = tgpar.in = in; + mtpar.out = tgpar.out = out; + mtpar.family = tgpar.family = NFPROTO_IPV4; + tgpar.hooknum = hook; read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); @@ -362,12 +363,11 @@ ipt_do_table(struct sk_buff *skb, do { IP_NF_ASSERT(e); IP_NF_ASSERT(back); - if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { + if (ip_packet_match(ip, indev, outdev, + &e->ip, mtpar.fragoff)) { struct ipt_entry_target *t; - if (IPT_MATCH_ITERATE(e, do_match, - skb, in, out, - offset, &hotdrop) != 0) + if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0) goto no_match; ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); @@ -413,16 +413,14 @@ ipt_do_table(struct sk_buff *skb, } else { /* Targets which reenter must return abs. verdicts */ + tgpar.target = t->u.kernel.target; + tgpar.targinfo = t->data; #ifdef CONFIG_NETFILTER_DEBUG ((struct ipt_entry *)table_base)->comefrom = 0xeeeeeeec; #endif verdict = t->u.kernel.target->target(skb, - in, out, - hook, - t->u.kernel.target, - t->data); - + &tgpar); #ifdef CONFIG_NETFILTER_DEBUG if (((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec @@ -575,12 +573,17 @@ mark_source_chains(struct xt_table_info *newinfo, static int cleanup_match(struct ipt_entry_match *m, unsigned int *i) { + struct xt_mtdtor_param par; + if (i && (*i)-- == 0) return 1; - if (m->u.kernel.match->destroy) - m->u.kernel.match->destroy(m->u.kernel.match, m->data); - module_put(m->u.kernel.match->me); + par.match = m->u.kernel.match; + par.matchinfo = m->data; + par.family = NFPROTO_IPV4; + if (par.match->destroy != NULL) + par.match->destroy(&par); + module_put(par.match->me); return 0; } @@ -606,34 +609,28 @@ check_entry(struct ipt_entry *e, const char *name) } static int -check_match(struct ipt_entry_match *m, const char *name, - const struct ipt_ip *ip, - unsigned int hookmask, unsigned int *i) +check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, + unsigned int *i) { - struct xt_match *match; + const struct ipt_ip *ip = par->entryinfo; int ret; - match = m->u.kernel.match; - ret = xt_check_match(match, AF_INET, m->u.match_size - sizeof(*m), - name, hookmask, ip->proto, - ip->invflags & IPT_INV_PROTO); - if (!ret && m->u.kernel.match->checkentry - && !m->u.kernel.match->checkentry(name, ip, match, m->data, - hookmask)) { + par->match = m->u.kernel.match; + par->matchinfo = m->data; + + ret = xt_check_match(par, m->u.match_size - sizeof(*m), + ip->proto, ip->invflags & IPT_INV_PROTO); + if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", - m->u.kernel.match->name); - ret = -EINVAL; + par.match->name); + return ret; } - if (!ret) - (*i)++; - return ret; + ++*i; + return 0; } static int -find_check_match(struct ipt_entry_match *m, - const char *name, - const struct ipt_ip *ip, - unsigned int hookmask, +find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, unsigned int *i) { struct xt_match *match; @@ -648,7 +645,7 @@ find_check_match(struct ipt_entry_match *m, } m->u.kernel.match = match; - ret = check_match(m, name, ip, hookmask, i); + ret = check_match(m, par, i); if (ret) goto err; @@ -660,23 +657,25 @@ err: static int check_target(struct ipt_entry *e, const char *name) { - struct ipt_entry_target *t; - struct xt_target *target; + struct ipt_entry_target *t = ipt_get_target(e); + struct xt_tgchk_param par = { + .table = name, + .entryinfo = e, + .target = t->u.kernel.target, + .targinfo = t->data, + .hook_mask = e->comefrom, + .family = NFPROTO_IPV4, + }; int ret; - t = ipt_get_target(e); - target = t->u.kernel.target; - ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t), - name, e->comefrom, e->ip.proto, - e->ip.invflags & IPT_INV_PROTO); - if (!ret && t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, e, target, t->data, - e->comefrom)) { + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ip.proto, e->ip.invflags & IPT_INV_PROTO); + if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", t->u.kernel.target->name); - ret = -EINVAL; + return ret; } - return ret; + return 0; } static int @@ -687,14 +686,18 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, struct xt_target *target; int ret; unsigned int j; + struct xt_mtchk_param mtpar; ret = check_entry(e, name); if (ret) return ret; j = 0; - ret = IPT_MATCH_ITERATE(e, find_check_match, name, &e->ip, - e->comefrom, &j); + mtpar.table = name; + mtpar.entryinfo = &e->ip; + mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV4; + ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j); if (ret != 0) goto cleanup_matches; @@ -769,6 +772,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, static int cleanup_entry(struct ipt_entry *e, unsigned int *i) { + struct xt_tgdtor_param par; struct ipt_entry_target *t; if (i && (*i)-- == 0) @@ -777,9 +781,13 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i) /* Cleanup all matches */ IPT_MATCH_ITERATE(e, cleanup_match, NULL); t = ipt_get_target(e); - if (t->u.kernel.target->destroy) - t->u.kernel.target->destroy(t->u.kernel.target, t->data); - module_put(t->u.kernel.target->me); + + par.target = t->u.kernel.target; + par.targinfo = t->data; + par.family = NFPROTO_IPV4; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); return 0; } @@ -1648,12 +1656,16 @@ static int compat_check_entry(struct ipt_entry *e, const char *name, unsigned int *i) { + struct xt_mtchk_param mtpar; unsigned int j; int ret; j = 0; - ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, - e->comefrom, &j); + mtpar.table = name; + mtpar.entryinfo = &e->ip; + mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV4; + ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j); if (ret) goto cleanup_matches; @@ -2121,30 +2133,23 @@ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, } static bool -icmp_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +icmp_match(const struct sk_buff *skb, const struct xt_match_param *par) { const struct icmphdr *ic; struct icmphdr _icmph; - const struct ipt_icmp *icmpinfo = matchinfo; + const struct ipt_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) + if (par->fragoff != 0) return false; - ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph); + ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (ic == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ duprintf("Dropping evil ICMP tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } @@ -2155,15 +2160,9 @@ icmp_match(const struct sk_buff *skb, !!(icmpinfo->invflags&IPT_ICMP_INV)); } -/* Called when user tries to insert an entry of this type. */ -static bool -icmp_checkentry(const char *tablename, - const void *entry, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static bool icmp_checkentry(const struct xt_mtchk_param *par) { - const struct ipt_icmp *icmpinfo = matchinfo; + const struct ipt_icmp *icmpinfo = par->matchinfo; /* Must specify no unknown invflags */ return !(icmpinfo->invflags & ~IPT_ICMP_INV); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 1819ad7ab910..7ac1677419a9 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -281,11 +281,9 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) ***********************************************************************/ static unsigned int -clusterip_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ipt_clusterip_tgt_info *cipinfo = targinfo; + const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int32_t hash; @@ -349,13 +347,10 @@ clusterip_tg(struct sk_buff *skb, const struct net_device *in, return XT_CONTINUE; } -static bool -clusterip_tg_check(const char *tablename, const void *e_void, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool clusterip_tg_check(const struct xt_tgchk_param *par) { - struct ipt_clusterip_tgt_info *cipinfo = targinfo; - const struct ipt_entry *e = e_void; + struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; + const struct ipt_entry *e = par->entryinfo; struct clusterip_config *config; @@ -406,9 +401,9 @@ clusterip_tg_check(const char *tablename, const void *e_void, } cipinfo->config = config; - if (nf_ct_l3proto_try_module_get(target->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", target->family); + "proto=%u\n", par->target->family); return false; } @@ -416,9 +411,9 @@ clusterip_tg_check(const char *tablename, const void *e_void, } /* drop reference count of cluster config when rule is deleted */ -static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo) +static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) { - const struct ipt_clusterip_tgt_info *cipinfo = targinfo; + const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; /* if no more entries are referencing the config, remove it * from the list and destroy the proc entry */ @@ -426,7 +421,7 @@ static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo) clusterip_config_put(cipinfo->config); - nf_ct_l3proto_module_put(target->family); + nf_ct_l3proto_module_put(par->target->family); } #ifdef CONFIG_COMPAT @@ -445,7 +440,7 @@ struct compat_ipt_clusterip_tgt_info static struct xt_target clusterip_tg_reg __read_mostly = { .name = "CLUSTERIP", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = clusterip_tg, .checkentry = clusterip_tg_check, .destroy = clusterip_tg_destroy, @@ -475,11 +470,10 @@ static void arp_print(struct arp_payload *payload) #define HBUFFERLEN 30 char hbuffer[HBUFFERLEN]; int j,k; - const char hexbuf[]= "0123456789abcdef"; for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) { - hbuffer[k++]=hexbuf[(payload->src_hw[j]>>4)&15]; - hbuffer[k++]=hexbuf[payload->src_hw[j]&15]; + hbuffer[k++] = hex_asc_hi(payload->src_hw[j]); + hbuffer[k++] = hex_asc_lo(payload->src_hw[j]); hbuffer[k++]=':'; } hbuffer[--k]='\0'; @@ -547,7 +541,7 @@ arp_mangle(unsigned int hook, static struct nf_hook_ops cip_arp_ops __read_mostly = { .hook = arp_mangle, - .pf = NF_ARP, + .pf = NFPROTO_ARP, .hooknum = NF_ARP_OUT, .priority = -1 }; diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index d60139c134ca..f7e2fa0974dc 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -77,11 +77,9 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) } static unsigned int -ecn_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +ecn_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ipt_ECN_info *einfo = targinfo; + const struct ipt_ECN_info *einfo = par->targinfo; if (einfo->operation & IPT_ECN_OP_SET_IP) if (!set_ect_ip(skb, einfo)) @@ -95,13 +93,10 @@ ecn_tg(struct sk_buff *skb, const struct net_device *in, return XT_CONTINUE; } -static bool -ecn_tg_check(const char *tablename, const void *e_void, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool ecn_tg_check(const struct xt_tgchk_param *par) { - const struct ipt_ECN_info *einfo = targinfo; - const struct ipt_entry *e = e_void; + const struct ipt_ECN_info *einfo = par->targinfo; + const struct ipt_entry *e = par->entryinfo; if (einfo->operation & IPT_ECN_OP_MASK) { printk(KERN_WARNING "ECN: unsupported ECN operation %x\n", @@ -124,7 +119,7 @@ ecn_tg_check(const char *tablename, const void *e_void, static struct xt_target ecn_tg_reg __read_mostly = { .name = "ECN", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = ecn_tg, .targetsize = sizeof(struct ipt_ECN_info), .table = "mangle", diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 0af14137137b..fc6ce04a3e35 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -375,7 +375,7 @@ static struct nf_loginfo default_loginfo = { }; static void -ipt_log_packet(unsigned int pf, +ipt_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -426,28 +426,23 @@ ipt_log_packet(unsigned int pf, } static unsigned int -log_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +log_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ipt_log_info *loginfo = targinfo; + const struct ipt_log_info *loginfo = par->targinfo; struct nf_loginfo li; li.type = NF_LOG_TYPE_LOG; li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; - ipt_log_packet(PF_INET, hooknum, skb, in, out, &li, + ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, par->out, &li, loginfo->prefix); return XT_CONTINUE; } -static bool -log_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool log_tg_check(const struct xt_tgchk_param *par) { - const struct ipt_log_info *loginfo = targinfo; + const struct ipt_log_info *loginfo = par->targinfo; if (loginfo->level >= 8) { pr_debug("LOG: level %u >= 8\n", loginfo->level); @@ -463,7 +458,7 @@ log_tg_check(const char *tablename, const void *e, static struct xt_target log_tg_reg __read_mostly = { .name = "LOG", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = log_tg, .targetsize = sizeof(struct ipt_log_info), .checkentry = log_tg_check, @@ -483,7 +478,7 @@ static int __init log_tg_init(void) ret = xt_register_target(&log_tg_reg); if (ret < 0) return ret; - nf_log_register(PF_INET, &ipt_log_logger); + nf_log_register(NFPROTO_IPV4, &ipt_log_logger); return 0; } diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 84c26dd27d81..f389f60cb105 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -31,12 +31,9 @@ MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); static DEFINE_RWLOCK(masq_lock); /* FIXME: Multiple targets. --RR */ -static bool -masquerade_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool masquerade_tg_check(const struct xt_tgchk_param *par) { - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { pr_debug("masquerade_check: bad MAP_IPS.\n"); @@ -50,9 +47,7 @@ masquerade_tg_check(const char *tablename, const void *e, } static unsigned int -masquerade_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par) { struct nf_conn *ct; struct nf_conn_nat *nat; @@ -62,7 +57,7 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in, const struct rtable *rt; __be32 newsrc; - NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); + NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); nat = nfct_nat(ct); @@ -76,16 +71,16 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in, if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) return NF_ACCEPT; - mr = targinfo; + mr = par->targinfo; rt = skb->rtable; - newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE); + newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); if (!newsrc) { - printk("MASQUERADE: %s ate my IP address\n", out->name); + printk("MASQUERADE: %s ate my IP address\n", par->out->name); return NF_DROP; } write_lock_bh(&masq_lock); - nat->masq_index = out->ifindex; + nat->masq_index = par->out->ifindex; write_unlock_bh(&masq_lock); /* Transfer from original range. */ @@ -119,9 +114,7 @@ static int masq_device_event(struct notifier_block *this, void *ptr) { const struct net_device *dev = ptr; - - if (dev_net(dev) != &init_net) - return NOTIFY_DONE; + struct net *net = dev_net(dev); if (event == NETDEV_DOWN) { /* Device was downed. Search entire table for @@ -129,7 +122,8 @@ static int masq_device_event(struct notifier_block *this, and forget them. */ NF_CT_ASSERT(dev->ifindex != 0); - nf_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); + nf_ct_iterate_cleanup(net, device_cmp, + (void *)(long)dev->ifindex); } return NOTIFY_DONE; @@ -153,7 +147,7 @@ static struct notifier_block masq_inet_notifier = { static struct xt_target masquerade_tg_reg __read_mostly = { .name = "MASQUERADE", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = masquerade_tg, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index 6739abfd1521..7c29582d4ec8 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -22,12 +22,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>"); MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); -static bool -netmap_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool netmap_tg_check(const struct xt_tgchk_param *par) { - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { pr_debug("NETMAP:check: bad MAP_IPS.\n"); @@ -41,24 +38,23 @@ netmap_tg_check(const char *tablename, const void *e, } static unsigned int -netmap_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +netmap_tg(struct sk_buff *skb, const struct xt_target_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 new_ip, netmask; - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; struct nf_nat_range newrange; - NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING - || hooknum == NF_INET_POST_ROUTING - || hooknum == NF_INET_LOCAL_OUT); + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_POST_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); - if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_OUT) + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT) new_ip = ip_hdr(skb)->daddr & ~netmask; else new_ip = ip_hdr(skb)->saddr & ~netmask; @@ -70,12 +66,12 @@ netmap_tg(struct sk_buff *skb, const struct net_device *in, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(hooknum)); + return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); } static struct xt_target netmap_tg_reg __read_mostly = { .name = "NETMAP", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = netmap_tg, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index 5c6292449d13..698e5e78685b 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -26,12 +26,9 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); /* FIXME: Take multiple ranges --RR */ -static bool -redirect_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool redirect_tg_check(const struct xt_tgchk_param *par) { - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { pr_debug("redirect_check: bad MAP_IPS.\n"); @@ -45,24 +42,22 @@ redirect_tg_check(const char *tablename, const void *e, } static unsigned int -redirect_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +redirect_tg(struct sk_buff *skb, const struct xt_target_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 newdst; - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; struct nf_nat_range newrange; - NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING - || hooknum == NF_INET_LOCAL_OUT); + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); /* Local packets: make them go to loopback */ - if (hooknum == NF_INET_LOCAL_OUT) + if (par->hooknum == NF_INET_LOCAL_OUT) newdst = htonl(0x7F000001); else { struct in_device *indev; @@ -92,7 +87,7 @@ redirect_tg(struct sk_buff *skb, const struct net_device *in, static struct xt_target redirect_tg_reg __read_mostly = { .name = "REDIRECT", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = redirect_tg, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 2639872849da..0b4b6e0ff2b9 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -136,11 +136,9 @@ static inline void send_unreach(struct sk_buff *skb_in, int code) } static unsigned int -reject_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +reject_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ipt_reject_info *reject = targinfo; + const struct ipt_reject_info *reject = par->targinfo; /* WARNING: This code causes reentry within iptables. This means that the iptables jump stack is now crap. We @@ -168,7 +166,7 @@ reject_tg(struct sk_buff *skb, const struct net_device *in, send_unreach(skb, ICMP_PKT_FILTERED); break; case IPT_TCP_RESET: - send_reset(skb, hooknum); + send_reset(skb, par->hooknum); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; @@ -177,13 +175,10 @@ reject_tg(struct sk_buff *skb, const struct net_device *in, return NF_DROP; } -static bool -reject_tg_check(const char *tablename, const void *e_void, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool reject_tg_check(const struct xt_tgchk_param *par) { - const struct ipt_reject_info *rejinfo = targinfo; - const struct ipt_entry *e = e_void; + const struct ipt_reject_info *rejinfo = par->targinfo; + const struct ipt_entry *e = par->entryinfo; if (rejinfo->with == IPT_ICMP_ECHOREPLY) { printk("ipt_REJECT: ECHOREPLY no longer supported.\n"); @@ -201,7 +196,7 @@ reject_tg_check(const char *tablename, const void *e_void, static struct xt_target reject_tg_reg __read_mostly = { .name = "REJECT", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = reject_tg, .targetsize = sizeof(struct ipt_reject_info), .table = "filter", diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c index 30eed65e7338..6d76aae90cc0 100644 --- a/net/ipv4/netfilter/ipt_TTL.c +++ b/net/ipv4/netfilter/ipt_TTL.c @@ -20,12 +20,10 @@ MODULE_DESCRIPTION("Xtables: IPv4 TTL field modification target"); MODULE_LICENSE("GPL"); static unsigned int -ttl_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +ttl_tg(struct sk_buff *skb, const struct xt_target_param *par) { struct iphdr *iph; - const struct ipt_TTL_info *info = targinfo; + const struct ipt_TTL_info *info = par->targinfo; int new_ttl; if (!skb_make_writable(skb, skb->len)) @@ -61,12 +59,9 @@ ttl_tg(struct sk_buff *skb, const struct net_device *in, return XT_CONTINUE; } -static bool -ttl_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool ttl_tg_check(const struct xt_tgchk_param *par) { - const struct ipt_TTL_info *info = targinfo; + const struct ipt_TTL_info *info = par->targinfo; if (info->mode > IPT_TTL_MAXMODE) { printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", @@ -80,7 +75,7 @@ ttl_tg_check(const char *tablename, const void *e, static struct xt_target ttl_tg_reg __read_mostly = { .name = "TTL", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = ttl_tg, .targetsize = sizeof(struct ipt_TTL_info), .table = "mangle", diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index b192756c6d0d..18a2826b57c6 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -281,18 +281,14 @@ alloc_failure: } static unsigned int -ulog_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +ulog_tg(struct sk_buff *skb, const struct xt_target_param *par) { - struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo; - - ipt_ulog_packet(hooknum, skb, in, out, loginfo, NULL); - + ipt_ulog_packet(par->hooknum, skb, par->in, par->out, + par->targinfo, NULL); return XT_CONTINUE; } -static void ipt_logfn(unsigned int pf, +static void ipt_logfn(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -317,12 +313,9 @@ static void ipt_logfn(unsigned int pf, ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); } -static bool -ulog_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *targinfo, - unsigned int hookmask) +static bool ulog_tg_check(const struct xt_tgchk_param *par) { - const struct ipt_ulog_info *loginfo = targinfo; + const struct ipt_ulog_info *loginfo = par->targinfo; if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { pr_debug("ipt_ULOG: prefix term %i\n", @@ -374,7 +367,7 @@ static int ulog_tg_compat_to_user(void __user *dst, void *src) static struct xt_target ulog_tg_reg __read_mostly = { .name = "ULOG", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = ulog_tg, .targetsize = sizeof(struct ipt_ulog_info), .checkentry = ulog_tg_check, @@ -419,7 +412,7 @@ static int __init ulog_tg_init(void) return ret; } if (nflog) - nf_log_register(PF_INET, &ipt_ulog_logger); + nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); return 0; } diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c index 49587a497229..88762f02779d 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -30,12 +30,9 @@ static inline bool match_type(const struct net_device *dev, __be32 addr, } static bool -addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_addrtype_info *info = matchinfo; + const struct ipt_addrtype_info *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); bool ret = true; @@ -50,36 +47,30 @@ addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in, } static bool -addrtype_mt_v1(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_addrtype_info_v1 *info = matchinfo; + const struct ipt_addrtype_info_v1 *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); const struct net_device *dev = NULL; bool ret = true; if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) - dev = in; + dev = par->in; else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) - dev = out; + dev = par->out; if (info->source) ret &= match_type(dev, iph->saddr, info->source) ^ (info->flags & IPT_ADDRTYPE_INVERT_SOURCE); if (ret && info->dest) ret &= match_type(dev, iph->daddr, info->dest) ^ - (info->flags & IPT_ADDRTYPE_INVERT_DEST); + !!(info->flags & IPT_ADDRTYPE_INVERT_DEST); return ret; } -static bool -addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) { - struct ipt_addrtype_info_v1 *info = matchinfo; + struct ipt_addrtype_info_v1 *info = par->matchinfo; if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { @@ -88,14 +79,16 @@ addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void, return false; } - if (hook_mask & (1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN) && + if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN)) && info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { printk(KERN_ERR "ipt_addrtype: output interface limitation " "not valid in PRE_ROUTING and INPUT\n"); return false; } - if (hook_mask & (1 << NF_INET_POST_ROUTING | 1 << NF_INET_LOCAL_OUT) && + if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT)) && info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { printk(KERN_ERR "ipt_addrtype: input interface limitation " "not valid in POST_ROUTING and OUTPUT\n"); @@ -108,14 +101,14 @@ addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void, static struct xt_match addrtype_mt_reg[] __read_mostly = { { .name = "addrtype", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = addrtype_mt_v0, .matchsize = sizeof(struct ipt_addrtype_info), .me = THIS_MODULE }, { .name = "addrtype", - .family = AF_INET, + .family = NFPROTO_IPV4, .revision = 1, .match = addrtype_mt_v1, .checkentry = addrtype_mt_checkentry_v1, diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index e977989629c7..0104c0b399de 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -36,27 +36,23 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) return r; } -static bool -ah_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par) { struct ip_auth_hdr _ahdr; const struct ip_auth_hdr *ah; - const struct ipt_ah *ahinfo = matchinfo; + const struct ipt_ah *ahinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) + if (par->fragoff != 0) return false; - ah = skb_header_pointer(skb, protoff, - sizeof(_ahdr), &_ahdr); + ah = skb_header_pointer(skb, par->thoff, sizeof(_ahdr), &_ahdr); if (ah == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ duprintf("Dropping evil AH tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return 0; } @@ -65,13 +61,9 @@ ah_mt(const struct sk_buff *skb, const struct net_device *in, !!(ahinfo->invflags & IPT_AH_INV_SPI)); } -/* Called when user tries to insert an entry of this type. */ -static bool -ah_mt_check(const char *tablename, const void *ip_void, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool ah_mt_check(const struct xt_mtchk_param *par) { - const struct ipt_ah *ahinfo = matchinfo; + const struct ipt_ah *ahinfo = par->matchinfo; /* Must specify no unknown invflags */ if (ahinfo->invflags & ~IPT_AH_INV_MASK) { @@ -83,7 +75,7 @@ ah_mt_check(const char *tablename, const void *ip_void, static struct xt_match ah_mt_reg __read_mostly = { .name = "ah", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = ah_mt, .matchsize = sizeof(struct ipt_ah), .proto = IPPROTO_AH, diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index 749de8284ce5..6289b64144c6 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -67,12 +67,9 @@ static inline bool match_tcp(const struct sk_buff *skb, return true; } -static bool -ecn_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_ecn_info *info = matchinfo; + const struct ipt_ecn_info *info = par->matchinfo; if (info->operation & IPT_ECN_OP_MATCH_IP) if (!match_ip(skb, info)) @@ -81,20 +78,17 @@ ecn_mt(const struct sk_buff *skb, const struct net_device *in, if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { if (ip_hdr(skb)->protocol != IPPROTO_TCP) return false; - if (!match_tcp(skb, info, hotdrop)) + if (!match_tcp(skb, info, par->hotdrop)) return false; } return true; } -static bool -ecn_mt_check(const char *tablename, const void *ip_void, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool ecn_mt_check(const struct xt_mtchk_param *par) { - const struct ipt_ecn_info *info = matchinfo; - const struct ipt_ip *ip = ip_void; + const struct ipt_ecn_info *info = par->matchinfo; + const struct ipt_ip *ip = par->entryinfo; if (info->operation & IPT_ECN_OP_MATCH_MASK) return false; @@ -114,7 +108,7 @@ ecn_mt_check(const char *tablename, const void *ip_void, static struct xt_match ecn_mt_reg __read_mostly = { .name = "ecn", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = ecn_mt, .matchsize = sizeof(struct ipt_ecn_info), .checkentry = ecn_mt_check, diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c deleted file mode 100644 index 21cb053f5d7d..000000000000 --- a/net/ipv4/netfilter/ipt_recent.c +++ /dev/null @@ -1,501 +0,0 @@ -/* - * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This is a replacement of the old ipt_recent module, which carried the - * following copyright notice: - * - * Author: Stephen Frost <sfrost@snowman.net> - * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org - */ -#include <linux/init.h> -#include <linux/ip.h> -#include <linux/moduleparam.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/string.h> -#include <linux/ctype.h> -#include <linux/list.h> -#include <linux/random.h> -#include <linux/jhash.h> -#include <linux/bitops.h> -#include <linux/skbuff.h> -#include <linux/inet.h> -#include <net/net_namespace.h> - -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ipt_recent.h> - -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4"); -MODULE_LICENSE("GPL"); - -static unsigned int ip_list_tot = 100; -static unsigned int ip_pkt_list_tot = 20; -static unsigned int ip_list_hash_size = 0; -static unsigned int ip_list_perms = 0644; -static unsigned int ip_list_uid = 0; -static unsigned int ip_list_gid = 0; -module_param(ip_list_tot, uint, 0400); -module_param(ip_pkt_list_tot, uint, 0400); -module_param(ip_list_hash_size, uint, 0400); -module_param(ip_list_perms, uint, 0400); -module_param(ip_list_uid, uint, 0400); -module_param(ip_list_gid, uint, 0400); -MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); -MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)"); -MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); -MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); -MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files"); -MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files"); - -struct recent_entry { - struct list_head list; - struct list_head lru_list; - __be32 addr; - u_int8_t ttl; - u_int8_t index; - u_int16_t nstamps; - unsigned long stamps[0]; -}; - -struct recent_table { - struct list_head list; - char name[IPT_RECENT_NAME_LEN]; -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *proc; -#endif - unsigned int refcnt; - unsigned int entries; - struct list_head lru_list; - struct list_head iphash[0]; -}; - -static LIST_HEAD(tables); -static DEFINE_SPINLOCK(recent_lock); -static DEFINE_MUTEX(recent_mutex); - -#ifdef CONFIG_PROC_FS -static struct proc_dir_entry *proc_dir; -static const struct file_operations recent_fops; -#endif - -static u_int32_t hash_rnd; -static int hash_rnd_initted; - -static unsigned int recent_entry_hash(__be32 addr) -{ - if (!hash_rnd_initted) { - get_random_bytes(&hash_rnd, 4); - hash_rnd_initted = 1; - } - return jhash_1word((__force u32)addr, hash_rnd) & (ip_list_hash_size - 1); -} - -static struct recent_entry * -recent_entry_lookup(const struct recent_table *table, __be32 addr, u_int8_t ttl) -{ - struct recent_entry *e; - unsigned int h; - - h = recent_entry_hash(addr); - list_for_each_entry(e, &table->iphash[h], list) - if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) - return e; - return NULL; -} - -static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) -{ - list_del(&e->list); - list_del(&e->lru_list); - kfree(e); - t->entries--; -} - -static struct recent_entry * -recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl) -{ - struct recent_entry *e; - - if (t->entries >= ip_list_tot) { - e = list_entry(t->lru_list.next, struct recent_entry, lru_list); - recent_entry_remove(t, e); - } - e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, - GFP_ATOMIC); - if (e == NULL) - return NULL; - e->addr = addr; - e->ttl = ttl; - e->stamps[0] = jiffies; - e->nstamps = 1; - e->index = 1; - list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); - list_add_tail(&e->lru_list, &t->lru_list); - t->entries++; - return e; -} - -static void recent_entry_update(struct recent_table *t, struct recent_entry *e) -{ - e->stamps[e->index++] = jiffies; - if (e->index > e->nstamps) - e->nstamps = e->index; - e->index %= ip_pkt_list_tot; - list_move_tail(&e->lru_list, &t->lru_list); -} - -static struct recent_table *recent_table_lookup(const char *name) -{ - struct recent_table *t; - - list_for_each_entry(t, &tables, list) - if (!strcmp(t->name, name)) - return t; - return NULL; -} - -static void recent_table_flush(struct recent_table *t) -{ - struct recent_entry *e, *next; - unsigned int i; - - for (i = 0; i < ip_list_hash_size; i++) - list_for_each_entry_safe(e, next, &t->iphash[i], list) - recent_entry_remove(t, e); -} - -static bool -recent_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) -{ - const struct ipt_recent_info *info = matchinfo; - struct recent_table *t; - struct recent_entry *e; - __be32 addr; - u_int8_t ttl; - bool ret = info->invert; - - if (info->side == IPT_RECENT_DEST) - addr = ip_hdr(skb)->daddr; - else - addr = ip_hdr(skb)->saddr; - - ttl = ip_hdr(skb)->ttl; - /* use TTL as seen before forwarding */ - if (out && !skb->sk) - ttl++; - - spin_lock_bh(&recent_lock); - t = recent_table_lookup(info->name); - e = recent_entry_lookup(t, addr, - info->check_set & IPT_RECENT_TTL ? ttl : 0); - if (e == NULL) { - if (!(info->check_set & IPT_RECENT_SET)) - goto out; - e = recent_entry_init(t, addr, ttl); - if (e == NULL) - *hotdrop = true; - ret = !ret; - goto out; - } - - if (info->check_set & IPT_RECENT_SET) - ret = !ret; - else if (info->check_set & IPT_RECENT_REMOVE) { - recent_entry_remove(t, e); - ret = !ret; - } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) { - unsigned long time = jiffies - info->seconds * HZ; - unsigned int i, hits = 0; - - for (i = 0; i < e->nstamps; i++) { - if (info->seconds && time_after(time, e->stamps[i])) - continue; - if (++hits >= info->hit_count) { - ret = !ret; - break; - } - } - } - - if (info->check_set & IPT_RECENT_SET || - (info->check_set & IPT_RECENT_UPDATE && ret)) { - recent_entry_update(t, e); - e->ttl = ttl; - } -out: - spin_unlock_bh(&recent_lock); - return ret; -} - -static bool -recent_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) -{ - const struct ipt_recent_info *info = matchinfo; - struct recent_table *t; - unsigned i; - bool ret = false; - - if (hweight8(info->check_set & - (IPT_RECENT_SET | IPT_RECENT_REMOVE | - IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1) - return false; - if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) && - (info->seconds || info->hit_count)) - return false; - if (info->hit_count > ip_pkt_list_tot) - return false; - if (info->name[0] == '\0' || - strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) - return false; - - mutex_lock(&recent_mutex); - t = recent_table_lookup(info->name); - if (t != NULL) { - t->refcnt++; - ret = true; - goto out; - } - - t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, - GFP_KERNEL); - if (t == NULL) - goto out; - t->refcnt = 1; - strcpy(t->name, info->name); - INIT_LIST_HEAD(&t->lru_list); - for (i = 0; i < ip_list_hash_size; i++) - INIT_LIST_HEAD(&t->iphash[i]); -#ifdef CONFIG_PROC_FS - t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops); - if (t->proc == NULL) { - kfree(t); - goto out; - } - t->proc->uid = ip_list_uid; - t->proc->gid = ip_list_gid; - t->proc->data = t; -#endif - spin_lock_bh(&recent_lock); - list_add_tail(&t->list, &tables); - spin_unlock_bh(&recent_lock); - ret = true; -out: - mutex_unlock(&recent_mutex); - return ret; -} - -static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) -{ - const struct ipt_recent_info *info = matchinfo; - struct recent_table *t; - - mutex_lock(&recent_mutex); - t = recent_table_lookup(info->name); - if (--t->refcnt == 0) { - spin_lock_bh(&recent_lock); - list_del(&t->list); - spin_unlock_bh(&recent_lock); - recent_table_flush(t); -#ifdef CONFIG_PROC_FS - remove_proc_entry(t->name, proc_dir); -#endif - kfree(t); - } - mutex_unlock(&recent_mutex); -} - -#ifdef CONFIG_PROC_FS -struct recent_iter_state { - struct recent_table *table; - unsigned int bucket; -}; - -static void *recent_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(recent_lock) -{ - struct recent_iter_state *st = seq->private; - const struct recent_table *t = st->table; - struct recent_entry *e; - loff_t p = *pos; - - spin_lock_bh(&recent_lock); - - for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) - list_for_each_entry(e, &t->iphash[st->bucket], list) - if (p-- == 0) - return e; - return NULL; -} - -static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct recent_iter_state *st = seq->private; - const struct recent_table *t = st->table; - struct recent_entry *e = v; - struct list_head *head = e->list.next; - - while (head == &t->iphash[st->bucket]) { - if (++st->bucket >= ip_list_hash_size) - return NULL; - head = t->iphash[st->bucket].next; - } - (*pos)++; - return list_entry(head, struct recent_entry, list); -} - -static void recent_seq_stop(struct seq_file *s, void *v) - __releases(recent_lock) -{ - spin_unlock_bh(&recent_lock); -} - -static int recent_seq_show(struct seq_file *seq, void *v) -{ - const struct recent_entry *e = v; - unsigned int i; - - i = (e->index - 1) % ip_pkt_list_tot; - seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", - NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); - for (i = 0; i < e->nstamps; i++) - seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); - seq_printf(seq, "\n"); - return 0; -} - -static const struct seq_operations recent_seq_ops = { - .start = recent_seq_start, - .next = recent_seq_next, - .stop = recent_seq_stop, - .show = recent_seq_show, -}; - -static int recent_seq_open(struct inode *inode, struct file *file) -{ - struct proc_dir_entry *pde = PDE(inode); - struct recent_iter_state *st; - - st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); - if (st == NULL) - return -ENOMEM; - - st->table = pde->data; - return 0; -} - -static ssize_t recent_proc_write(struct file *file, const char __user *input, - size_t size, loff_t *loff) -{ - const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); - struct recent_table *t = pde->data; - struct recent_entry *e; - char buf[sizeof("+255.255.255.255")], *c = buf; - __be32 addr; - int add; - - if (size > sizeof(buf)) - size = sizeof(buf); - if (copy_from_user(buf, input, size)) - return -EFAULT; - while (isspace(*c)) - c++; - - if (size - (c - buf) < 5) - return c - buf; - if (!strncmp(c, "clear", 5)) { - c += 5; - spin_lock_bh(&recent_lock); - recent_table_flush(t); - spin_unlock_bh(&recent_lock); - return c - buf; - } - - switch (*c) { - case '-': - add = 0; - c++; - break; - case '+': - c++; - default: - add = 1; - break; - } - addr = in_aton(c); - - spin_lock_bh(&recent_lock); - e = recent_entry_lookup(t, addr, 0); - if (e == NULL) { - if (add) - recent_entry_init(t, addr, 0); - } else { - if (add) - recent_entry_update(t, e); - else - recent_entry_remove(t, e); - } - spin_unlock_bh(&recent_lock); - return size; -} - -static const struct file_operations recent_fops = { - .open = recent_seq_open, - .read = seq_read, - .write = recent_proc_write, - .release = seq_release_private, - .owner = THIS_MODULE, -}; -#endif /* CONFIG_PROC_FS */ - -static struct xt_match recent_mt_reg __read_mostly = { - .name = "recent", - .family = AF_INET, - .match = recent_mt, - .matchsize = sizeof(struct ipt_recent_info), - .checkentry = recent_mt_check, - .destroy = recent_mt_destroy, - .me = THIS_MODULE, -}; - -static int __init recent_mt_init(void) -{ - int err; - - if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) - return -EINVAL; - ip_list_hash_size = 1 << fls(ip_list_tot); - - err = xt_register_match(&recent_mt_reg); -#ifdef CONFIG_PROC_FS - if (err) - return err; - proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); - if (proc_dir == NULL) { - xt_unregister_match(&recent_mt_reg); - err = -ENOMEM; - } -#endif - return err; -} - -static void __exit recent_mt_exit(void) -{ - BUG_ON(!list_empty(&tables)); - xt_unregister_match(&recent_mt_reg); -#ifdef CONFIG_PROC_FS - remove_proc_entry("ipt_recent", init_net.proc_net); -#endif -} - -module_init(recent_mt_init); -module_exit(recent_mt_exit); diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c index e0b8caeb710c..297f1cbf4ff5 100644 --- a/net/ipv4/netfilter/ipt_ttl.c +++ b/net/ipv4/netfilter/ipt_ttl.c @@ -18,12 +18,9 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: IPv4 TTL field match"); MODULE_LICENSE("GPL"); -static bool -ttl_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool ttl_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_ttl_info *info = matchinfo; + const struct ipt_ttl_info *info = par->matchinfo; const u8 ttl = ip_hdr(skb)->ttl; switch (info->mode) { @@ -46,7 +43,7 @@ ttl_mt(const struct sk_buff *skb, const struct net_device *in, static struct xt_match ttl_mt_reg __read_mostly = { .name = "ttl", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = ttl_mt, .matchsize = sizeof(struct ipt_ttl_info), .me = THIS_MODULE, diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 1ea677dcf845..c9224310ebae 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -70,7 +70,7 @@ ipt_local_in_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_local_in_net(in, out)->ipv4.iptable_filter); + dev_net(in)->ipv4.iptable_filter); } static unsigned int @@ -81,7 +81,7 @@ ipt_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_forward_net(in, out)->ipv4.iptable_filter); + dev_net(in)->ipv4.iptable_filter); } static unsigned int @@ -101,7 +101,7 @@ ipt_local_out_hook(unsigned int hook, } return ipt_do_table(skb, hook, in, out, - nf_local_out_net(in, out)->ipv4.iptable_filter); + dev_net(out)->ipv4.iptable_filter); } static struct nf_hook_ops ipt_ops[] __read_mostly = { diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index da59182f2226..69f2c4287146 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -81,7 +81,7 @@ ipt_pre_routing_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_pre_routing_net(in, out)->ipv4.iptable_mangle); + dev_net(in)->ipv4.iptable_mangle); } static unsigned int @@ -92,7 +92,7 @@ ipt_post_routing_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_post_routing_net(in, out)->ipv4.iptable_mangle); + dev_net(out)->ipv4.iptable_mangle); } static unsigned int @@ -103,7 +103,7 @@ ipt_local_in_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_local_in_net(in, out)->ipv4.iptable_mangle); + dev_net(in)->ipv4.iptable_mangle); } static unsigned int @@ -114,7 +114,7 @@ ipt_forward_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_forward_net(in, out)->ipv4.iptable_mangle); + dev_net(in)->ipv4.iptable_mangle); } static unsigned int @@ -147,7 +147,7 @@ ipt_local_hook(unsigned int hook, tos = iph->tos; ret = ipt_do_table(skb, hook, in, out, - nf_local_out_net(in, out)->ipv4.iptable_mangle); + dev_net(out)->ipv4.iptable_mangle); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { iph = ip_hdr(skb); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index fddce7754b72..8faebfe638f1 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -53,7 +53,7 @@ ipt_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_pre_routing_net(in, out)->ipv4.iptable_raw); + dev_net(in)->ipv4.iptable_raw); } static unsigned int @@ -72,7 +72,7 @@ ipt_local_hook(unsigned int hook, return NF_ACCEPT; } return ipt_do_table(skb, hook, in, out, - nf_local_out_net(in, out)->ipv4.iptable_raw); + dev_net(out)->ipv4.iptable_raw); } /* 'raw' is the very first table. */ diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c new file mode 100644 index 000000000000..36f3be3cc428 --- /dev/null +++ b/net/ipv4/netfilter/iptable_security.c @@ -0,0 +1,180 @@ +/* + * "security" table + * + * This is for use by Mandatory Access Control (MAC) security models, + * which need to be able to manage security policy in separate context + * to DAC. + * + * Based on iptable_mangle.c + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org> + * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <net/ip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>"); +MODULE_DESCRIPTION("iptables security table, for MAC rules"); + +#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +} initial_table __net_initdata = { + .repl = { + .name = "security", + .valid_hooks = SECURITY_VALID_HOOKS, + .num_entries = 4, + .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + .hook_entry = { + [NF_INET_LOCAL_IN] = 0, + [NF_INET_FORWARD] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, + }, + .underflow = { + [NF_INET_LOCAL_IN] = 0, + [NF_INET_FORWARD] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, + }, + }, + .entries = { + IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ + IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ + IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ + }, + .term = IPT_ERROR_INIT, /* ERROR */ +}; + +static struct xt_table security_table = { + .name = "security", + .valid_hooks = SECURITY_VALID_HOOKS, + .lock = __RW_LOCK_UNLOCKED(security_table.lock), + .me = THIS_MODULE, + .af = AF_INET, +}; + +static unsigned int +ipt_local_in_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(skb, hook, in, out, + dev_net(in)->ipv4.iptable_security); +} + +static unsigned int +ipt_forward_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(skb, hook, in, out, + dev_net(in)->ipv4.iptable_security); +} + +static unsigned int +ipt_local_out_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* Somebody is playing with raw sockets. */ + if (skb->len < sizeof(struct iphdr) + || ip_hdrlen(skb) < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk(KERN_INFO "iptable_security: ignoring short " + "SOCK_RAW packet.\n"); + return NF_ACCEPT; + } + return ipt_do_table(skb, hook, in, out, + dev_net(out)->ipv4.iptable_security); +} + +static struct nf_hook_ops ipt_ops[] __read_mostly = { + { + .hook = ipt_local_in_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_SECURITY, + }, + { + .hook = ipt_forward_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = NF_IP_PRI_SECURITY, + }, + { + .hook = ipt_local_out_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_SECURITY, + }, +}; + +static int __net_init iptable_security_net_init(struct net *net) +{ + net->ipv4.iptable_security = + ipt_register_table(net, &security_table, &initial_table.repl); + + if (IS_ERR(net->ipv4.iptable_security)) + return PTR_ERR(net->ipv4.iptable_security); + + return 0; +} + +static void __net_exit iptable_security_net_exit(struct net *net) +{ + ipt_unregister_table(net->ipv4.iptable_security); +} + +static struct pernet_operations iptable_security_net_ops = { + .init = iptable_security_net_init, + .exit = iptable_security_net_exit, +}; + +static int __init iptable_security_init(void) +{ + int ret; + + ret = register_pernet_subsys(&iptable_security_net_ops); + if (ret < 0) + return ret; + + ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + if (ret < 0) + goto cleanup_table; + + return ret; + +cleanup_table: + unregister_pernet_subsys(&iptable_security_net_ops); + return ret; +} + +static void __exit iptable_security_fini(void) +{ + nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + unregister_pernet_subsys(&iptable_security_net_ops); +} + +module_init(iptable_security_init); +module_exit(iptable_security_fini); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 5a955c440364..4a7c35275396 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -1,3 +1,4 @@ + /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * @@ -24,6 +25,7 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/ipv4/nf_defrag_ipv4.h> int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, struct nf_conn *ct, @@ -63,23 +65,6 @@ static int ipv4_print_tuple(struct seq_file *s, NIPQUAD(tuple->dst.u3.ip)); } -/* Returns new sk_buff, or NULL */ -static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) -{ - int err; - - skb_orphan(skb); - - local_bh_disable(); - err = ip_defrag(skb, user); - local_bh_enable(); - - if (!err) - ip_send_check(ip_hdr(skb)); - - return err; -} - static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -144,35 +129,13 @@ out: return nf_conntrack_confirm(skb); } -static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* Previously seen (loopback)? Ignore. Do this before - fragment check. */ - if (skb->nfct) - return NF_ACCEPT; - - /* Gather fragments. */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (nf_ct_ipv4_gather_frags(skb, - hooknum == NF_INET_PRE_ROUTING ? - IP_DEFRAG_CONNTRACK_IN : - IP_DEFRAG_CONNTRACK_OUT)) - return NF_STOLEN; - } - return NF_ACCEPT; -} - static unsigned int ipv4_conntrack_in(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return nf_conntrack_in(PF_INET, hooknum, skb); + return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb); } static unsigned int ipv4_conntrack_local(unsigned int hooknum, @@ -188,20 +151,13 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum, printk("ipt_hook: happy cracking.\n"); return NF_ACCEPT; } - return nf_conntrack_in(PF_INET, hooknum, skb); + return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb); } /* Connection tracking may drop packets, but never alters them, so make it the first hook. */ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { { - .hook = ipv4_conntrack_defrag, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_DEFRAG, - }, - { .hook = ipv4_conntrack_in, .owner = THIS_MODULE, .pf = PF_INET, @@ -209,13 +165,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { .priority = NF_IP_PRI_CONNTRACK, }, { - .hook = ipv4_conntrack_defrag, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_CONNTRACK_DEFRAG, - }, - { .hook = ipv4_conntrack_local, .owner = THIS_MODULE, .pf = PF_INET, @@ -254,7 +203,7 @@ static ctl_table ip_ct_sysctl_table[] = { { .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, .procname = "ip_conntrack_count", - .data = &nf_conntrack_count, + .data = &init_net.ct.count, .maxlen = sizeof(int), .mode = 0444, .proc_handler = &proc_dointvec, @@ -270,7 +219,7 @@ static ctl_table ip_ct_sysctl_table[] = { { .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, .procname = "ip_conntrack_checksum", - .data = &nf_conntrack_checksum, + .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -278,7 +227,7 @@ static ctl_table ip_ct_sysctl_table[] = { { .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, .procname = "ip_conntrack_log_invalid", - .data = &nf_ct_log_invalid, + .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, @@ -323,7 +272,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -EINVAL; } - h = nf_conntrack_find_get(&tuple); + h = nf_conntrack_find_get(sock_net(sk), &tuple); if (h) { struct sockaddr_in sin; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -422,6 +371,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) int ret = 0; need_conntrack(); + nf_defrag_ipv4_enable(); ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) { diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 40a46d482490..313ebf00ee36 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -18,33 +18,23 @@ #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> - -#ifdef CONFIG_NF_CT_ACCT -static unsigned int -seq_print_counters(struct seq_file *s, - const struct ip_conntrack_counter *counter) -{ - return seq_printf(s, "packets=%llu bytes=%llu ", - (unsigned long long)counter->packets, - (unsigned long long)counter->bytes); -} -#else -#define seq_print_counters(x, y) 0 -#endif +#include <net/netfilter/nf_conntrack_acct.h> struct ct_iter_state { + struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *ct_get_first(struct seq_file *seq) { + struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(nf_conntrack_hash[st->bucket].first); + n = rcu_dereference(net->ct.hash[st->bucket].first); if (n) return n; } @@ -54,13 +44,14 @@ static struct hlist_node *ct_get_first(struct seq_file *seq) static struct hlist_node *ct_get_next(struct seq_file *seq, struct hlist_node *head) { + struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(head->next); while (head == NULL) { if (++st->bucket >= nf_conntrack_htable_size) return NULL; - head = rcu_dereference(nf_conntrack_hash[st->bucket].first); + head = rcu_dereference(net->ct.hash[st->bucket].first); } return head; } @@ -127,7 +118,7 @@ static int ct_seq_show(struct seq_file *s, void *v) l3proto, l4proto)) return -ENOSPC; - if (seq_print_counters(s, &ct->counters[IP_CT_DIR_ORIGINAL])) + if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) return -ENOSPC; if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) @@ -138,7 +129,7 @@ static int ct_seq_show(struct seq_file *s, void *v) l3proto, l4proto)) return -ENOSPC; - if (seq_print_counters(s, &ct->counters[IP_CT_DIR_REPLY])) + if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) return -ENOSPC; if (test_bit(IPS_ASSURED_BIT, &ct->status)) @@ -170,8 +161,8 @@ static const struct seq_operations ct_seq_ops = { static int ct_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &ct_seq_ops, - sizeof(struct ct_iter_state)); + return seq_open_net(inode, file, &ct_seq_ops, + sizeof(struct ct_iter_state)); } static const struct file_operations ct_file_ops = { @@ -179,21 +170,23 @@ static const struct file_operations ct_file_ops = { .open = ct_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; /* expects */ struct ct_expect_iter_state { + struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { + struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(nf_ct_expect_hash[st->bucket].first); + n = rcu_dereference(net->ct.expect_hash[st->bucket].first); if (n) return n; } @@ -203,13 +196,14 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { + struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(head->next); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(nf_ct_expect_hash[st->bucket].first); + head = rcu_dereference(net->ct.expect_hash[st->bucket].first); } return head; } @@ -277,8 +271,8 @@ static const struct seq_operations exp_seq_ops = { static int exp_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &exp_seq_ops, - sizeof(struct ct_expect_iter_state)); + return seq_open_net(inode, file, &exp_seq_ops, + sizeof(struct ct_expect_iter_state)); } static const struct file_operations ip_exp_file_ops = { @@ -286,11 +280,12 @@ static const struct file_operations ip_exp_file_ops = { .open = exp_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) { + struct net *net = seq_file_net(seq); int cpu; if (*pos == 0) @@ -300,7 +295,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) if (!cpu_possible(cpu)) continue; *pos = cpu+1; - return &per_cpu(nf_conntrack_stat, cpu); + return per_cpu_ptr(net->ct.stat, cpu); } return NULL; @@ -308,13 +303,14 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net *net = seq_file_net(seq); int cpu; for (cpu = *pos; cpu < NR_CPUS; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; - return &per_cpu(nf_conntrack_stat, cpu); + return per_cpu_ptr(net->ct.stat, cpu); } return NULL; @@ -326,7 +322,8 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); + struct net *net = seq_file_net(seq); + unsigned int nr_conntracks = atomic_read(&net->ct.count); const struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { @@ -366,7 +363,8 @@ static const struct seq_operations ct_cpu_seq_ops = { static int ct_cpu_seq_open(struct inode *inode, struct file *file) { - return seq_open(file, &ct_cpu_seq_ops); + return seq_open_net(inode, file, &ct_cpu_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations ct_cpu_seq_fops = { @@ -374,39 +372,54 @@ static const struct file_operations ct_cpu_seq_fops = { .open = ct_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; -int __init nf_conntrack_ipv4_compat_init(void) +static int __net_init ip_conntrack_net_init(struct net *net) { struct proc_dir_entry *proc, *proc_exp, *proc_stat; - proc = proc_net_fops_create(&init_net, "ip_conntrack", 0440, &ct_file_ops); + proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops); if (!proc) goto err1; - proc_exp = proc_net_fops_create(&init_net, "ip_conntrack_expect", 0440, + proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440, &ip_exp_file_ops); if (!proc_exp) goto err2; proc_stat = proc_create("ip_conntrack", S_IRUGO, - init_net.proc_net_stat, &ct_cpu_seq_fops); + net->proc_net_stat, &ct_cpu_seq_fops); if (!proc_stat) goto err3; return 0; err3: - proc_net_remove(&init_net, "ip_conntrack_expect"); + proc_net_remove(net, "ip_conntrack_expect"); err2: - proc_net_remove(&init_net, "ip_conntrack"); + proc_net_remove(net, "ip_conntrack"); err1: return -ENOMEM; } +static void __net_exit ip_conntrack_net_exit(struct net *net) +{ + remove_proc_entry("ip_conntrack", net->proc_net_stat); + proc_net_remove(net, "ip_conntrack_expect"); + proc_net_remove(net, "ip_conntrack"); +} + +static struct pernet_operations ip_conntrack_net_ops = { + .init = ip_conntrack_net_init, + .exit = ip_conntrack_net_exit, +}; + +int __init nf_conntrack_ipv4_compat_init(void) +{ + return register_pernet_subsys(&ip_conntrack_net_ops); +} + void __exit nf_conntrack_ipv4_compat_fini(void) { - remove_proc_entry("ip_conntrack", init_net.proc_net_stat); - proc_net_remove(&init_net, "ip_conntrack_expect"); - proc_net_remove(&init_net, "ip_conntrack"); + unregister_pernet_subsys(&ip_conntrack_net_ops); } diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 78ab19accace..4e8879220222 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -79,7 +79,7 @@ static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { /* Try to delete connection immediately after all replies: @@ -87,12 +87,11 @@ static int icmp_packet(struct nf_conn *ct, means this will only run once even if count hits zero twice (theoretically possible with SMP) */ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { - if (atomic_dec_and_test(&ct->proto.icmp.count) - && del_timer(&ct->timeout)) - ct->timeout.function((unsigned long)ct); + if (atomic_dec_and_test(&ct->proto.icmp.count)) + nf_ct_kill_acct(ct, ctinfo, skb); } else { atomic_inc(&ct->proto.icmp.count); - nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct); nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); } @@ -124,7 +123,7 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ static int -icmp_error_message(struct sk_buff *skb, +icmp_error_message(struct net *net, struct sk_buff *skb, enum ip_conntrack_info *ctinfo, unsigned int hooknum) { @@ -156,7 +155,7 @@ icmp_error_message(struct sk_buff *skb, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(&innertuple); + h = nf_conntrack_find_get(net, &innertuple); if (!h) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; @@ -173,8 +172,8 @@ icmp_error_message(struct sk_buff *skb, /* Small and modified version of icmp_rcv */ static int -icmp_error(struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) +icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { const struct icmphdr *icmph; struct icmphdr _ih; @@ -182,16 +181,16 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, /* Not enough header? */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { - if (LOG_INVALID(IPPROTO_ICMP)) + if (LOG_INVALID(net, IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "nf_ct_icmp: short packet "); return -NF_ACCEPT; } /* See ip_conntrack_proto_tcp.c */ - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { - if (LOG_INVALID(IPPROTO_ICMP)) + if (LOG_INVALID(net, IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "nf_ct_icmp: bad HW ICMP checksum "); return -NF_ACCEPT; @@ -204,7 +203,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, * discarded. */ if (icmph->type > NR_ICMP_TYPES) { - if (LOG_INVALID(IPPROTO_ICMP)) + if (LOG_INVALID(net, IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "nf_ct_icmp: invalid ICMP type "); return -NF_ACCEPT; @@ -218,7 +217,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, && icmph->type != ICMP_REDIRECT) return NF_ACCEPT; - return icmp_error_message(skb, ctinfo, hooknum); + return icmp_error_message(net, skb, ctinfo, hooknum); } #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c new file mode 100644 index 000000000000..fa2d6b6fc3e5 --- /dev/null +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -0,0 +1,97 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/route.h> +#include <net/ip.h> + +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/ipv4/nf_defrag_ipv4.h> + +/* Returns new sk_buff, or NULL */ +static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + int err; + + skb_orphan(skb); + + local_bh_disable(); + err = ip_defrag(skb, user); + local_bh_enable(); + + if (!err) + ip_send_check(ip_hdr(skb)); + + return err; +} + +static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) + /* Previously seen (loopback)? Ignore. Do this before + fragment check. */ + if (skb->nfct) + return NF_ACCEPT; +#endif +#endif + /* Gather fragments. */ + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (nf_ct_ipv4_gather_frags(skb, + hooknum == NF_INET_PRE_ROUTING ? + IP_DEFRAG_CONNTRACK_IN : + IP_DEFRAG_CONNTRACK_OUT)) + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static struct nf_hook_ops ipv4_defrag_ops[] = { + { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, + }, + { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, + }, +}; + +static int __init nf_defrag_init(void) +{ + return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); +} + +static void __exit nf_defrag_fini(void) +{ + nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); +} + +void nf_defrag_ipv4_enable(void) +{ +} +EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); + +module_init(nf_defrag_init); +module_exit(nf_defrag_fini); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index d2a887fc8d9b..a65cf692359f 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -37,9 +37,6 @@ static struct nf_conntrack_l3proto *l3proto __read_mostly; /* Calculated at init based on memory size */ static unsigned int nf_nat_htable_size __read_mostly; -static int nf_nat_vmalloced; - -static struct hlist_head *bysource __read_mostly; #define MAX_IP_NAT_PROTO 256 static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] @@ -145,7 +142,8 @@ same_src(const struct nf_conn *ct, /* Only called for SRC manip */ static int -find_appropriate_src(const struct nf_conntrack_tuple *tuple, +find_appropriate_src(struct net *net, + const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { @@ -155,7 +153,7 @@ find_appropriate_src(const struct nf_conntrack_tuple *tuple, const struct hlist_node *n; rcu_read_lock(); - hlist_for_each_entry_rcu(nat, n, &bysource[h], bysource) { + hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) { ct = nat->ct; if (same_src(ct, tuple)) { /* Copy source part from reply tuple. */ @@ -231,6 +229,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { + struct net *net = nf_ct_net(ct); const struct nf_nat_protocol *proto; /* 1) If this srcip/proto/src-proto-part is currently mapped, @@ -240,12 +239,12 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, This is only required for source (ie. NAT/masq) mappings. So far, we don't do local source mappings, so multiple manips not an issue. */ - if (maniptype == IP_NAT_MANIP_SRC) { - if (find_appropriate_src(orig_tuple, tuple, range)) { + if (maniptype == IP_NAT_MANIP_SRC && + !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { + if (find_appropriate_src(net, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); - if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) - if (!nf_nat_used_tuple(tuple, ct)) - return; + if (!nf_nat_used_tuple(tuple, ct)) + return; } } @@ -283,6 +282,7 @@ nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); @@ -334,7 +334,8 @@ nf_nat_setup_info(struct nf_conn *ct, /* nf_conntrack_alter_reply might re-allocate exntension aera */ nat = nfct_nat(ct); nat->ct = ct; - hlist_add_head_rcu(&nat->bysource, &bysource[srchash]); + hlist_add_head_rcu(&nat->bysource, + &net->ipv4.nat_bysource[srchash]); spin_unlock_bh(&nf_nat_lock); } @@ -583,6 +584,132 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { .flags = NF_CT_EXT_F_PREALLOC, }; +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { + [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, + [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, +}; + +static int nfnetlink_parse_nat_proto(struct nlattr *attr, + const struct nf_conn *ct, + struct nf_nat_range *range) +{ + struct nlattr *tb[CTA_PROTONAT_MAX+1]; + const struct nf_nat_protocol *npt; + int err; + + err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy); + if (err < 0) + return err; + + npt = nf_nat_proto_find_get(nf_ct_protonum(ct)); + if (npt->nlattr_to_range) + err = npt->nlattr_to_range(tb, range); + nf_nat_proto_put(npt); + return err; +} + +static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { + [CTA_NAT_MINIP] = { .type = NLA_U32 }, + [CTA_NAT_MAXIP] = { .type = NLA_U32 }, +}; + +static int +nfnetlink_parse_nat(struct nlattr *nat, + const struct nf_conn *ct, struct nf_nat_range *range) +{ + struct nlattr *tb[CTA_NAT_MAX+1]; + int err; + + memset(range, 0, sizeof(*range)); + + err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy); + if (err < 0) + return err; + + if (tb[CTA_NAT_MINIP]) + range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]); + + if (!tb[CTA_NAT_MAXIP]) + range->max_ip = range->min_ip; + else + range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]); + + if (range->min_ip) + range->flags |= IP_NAT_RANGE_MAP_IPS; + + if (!tb[CTA_NAT_PROTO]) + return 0; + + err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); + if (err < 0) + return err; + + return 0; +} + +static int +nfnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + struct nlattr *attr) +{ + struct nf_nat_range range; + + if (nfnetlink_parse_nat(attr, ct, &range) < 0) + return -EINVAL; + if (nf_nat_initialized(ct, manip)) + return -EEXIST; + + return nf_nat_setup_info(ct, &range, manip); +} +#else +static int +nfnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + struct nlattr *attr) +{ + return -EOPNOTSUPP; +} +#endif + +static int __net_init nf_nat_net_init(struct net *net) +{ + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, + &net->ipv4.nat_vmalloced); + if (!net->ipv4.nat_bysource) + return -ENOMEM; + return 0; +} + +/* Clear NAT section of all conntracks, in case we're loaded again. */ +static int clean_nat(struct nf_conn *i, void *data) +{ + struct nf_conn_nat *nat = nfct_nat(i); + + if (!nat) + return 0; + memset(nat, 0, sizeof(*nat)); + i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); + return 0; +} + +static void __net_exit nf_nat_net_exit(struct net *net) +{ + nf_ct_iterate_cleanup(net, &clean_nat, NULL); + synchronize_rcu(); + nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, + nf_nat_htable_size); +} + +static struct pernet_operations nf_nat_net_ops = { + .init = nf_nat_net_init, + .exit = nf_nat_net_exit, +}; + static int __init nf_nat_init(void) { size_t i; @@ -599,12 +726,9 @@ static int __init nf_nat_init(void) /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; - bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, - &nf_nat_vmalloced); - if (!bysource) { - ret = -ENOMEM; + ret = register_pernet_subsys(&nf_nat_net_ops); + if (ret < 0) goto cleanup_extend; - } /* Sew in builtin protocols. */ spin_lock_bh(&nf_nat_lock); @@ -622,6 +746,9 @@ static int __init nf_nat_init(void) BUG_ON(nf_nat_seq_adjust_hook != NULL); rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); + BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); + rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, + nfnetlink_parse_nat_setup); return 0; cleanup_extend: @@ -629,30 +756,18 @@ static int __init nf_nat_init(void) return ret; } -/* Clear NAT section of all conntracks, in case we're loaded again. */ -static int clean_nat(struct nf_conn *i, void *data) -{ - struct nf_conn_nat *nat = nfct_nat(i); - - if (!nat) - return 0; - memset(nat, 0, sizeof(*nat)); - i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); - return 0; -} - static void __exit nf_nat_cleanup(void) { - nf_ct_iterate_cleanup(&clean_nat, NULL); - synchronize_rcu(); - nf_ct_free_hashtable(bysource, nf_nat_vmalloced, nf_nat_htable_size); + unregister_pernet_subsys(&nf_nat_net_ops); nf_ct_l3proto_put(l3proto); nf_ct_extend_unregister(&nat_extend); rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); + rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL); synchronize_net(); } MODULE_LICENSE("GPL"); +MODULE_ALIAS("nf-nat-ipv4"); module_init(nf_nat_init); module_exit(nf_nat_cleanup); diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 11976ea29884..cf7a42bf9820 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -16,6 +16,7 @@ #include <linux/udp.h> #include <net/checksum.h> #include <net/tcp.h> +#include <net/route.h> #include <linux/netfilter_ipv4.h> #include <net/netfilter/nf_conntrack.h> @@ -192,7 +193,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, nf_conntrack_tcp_update(skb, ip_hdrlen(skb), ct, CTINFO2DIR(ctinfo)); - nf_conntrack_event_cache(IPCT_NATSEQADJ, skb); + nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); } return 1; } diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index da3d91a5ef5c..9eb171056c63 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -40,6 +40,7 @@ MODULE_ALIAS("ip_nat_pptp"); static void pptp_nat_expected(struct nf_conn *ct, struct nf_conntrack_expect *exp) { + struct net *net = nf_ct_net(ct); const struct nf_conn *master = ct->master; struct nf_conntrack_expect *other_exp; struct nf_conntrack_tuple t; @@ -73,7 +74,7 @@ static void pptp_nat_expected(struct nf_conn *ct, pr_debug("trying to unexpect other dir: "); nf_ct_dump_tuple_ip(&t); - other_exp = nf_ct_expect_find_get(&t); + other_exp = nf_ct_expect_find_get(net, &t); if (other_exp) { nf_ct_unexpect_related(other_exp); nf_ct_expect_put(other_exp); diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c index 91537f11273f..6c4f11f51446 100644 --- a/net/ipv4/netfilter/nf_nat_proto_common.c +++ b/net/ipv4/netfilter/nf_nat_proto_common.c @@ -73,9 +73,13 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, range_size = ntohs(range->max.all) - min + 1; } - off = *rover; if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) - off = net_random(); + off = secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip, + maniptype == IP_NAT_MANIP_SRC + ? tuple->dst.u.all + : tuple->src.u.all); + else + off = *rover; for (i = 0; i < range_size; i++, off++) { *portptr = htons(min + off % range_size); diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c index 82e4c0e286b8..65e470bc6123 100644 --- a/net/ipv4/netfilter/nf_nat_proto_sctp.c +++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c @@ -36,7 +36,7 @@ sctp_manip_pkt(struct sk_buff *skb, sctp_sctphdr_t *hdr; unsigned int hdroff = iphdroff + iph->ihl*4; __be32 oldip, newip; - u32 crc32; + __be32 crc32; if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) return false; @@ -61,7 +61,7 @@ sctp_manip_pkt(struct sk_buff *skb, crc32 = sctp_update_cksum((u8 *)skb->data, skb_headlen(skb), crc32); crc32 = sctp_end_cksum(crc32); - hdr->checksum = htonl(crc32); + hdr->checksum = crc32; return true; } diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index e8b4d0d4439e..bea54a685109 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -33,7 +33,7 @@ static struct struct ipt_replace repl; struct ipt_standard entries[3]; struct ipt_error term; -} nat_initial_table __initdata = { +} nat_initial_table __net_initdata = { .repl = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, @@ -58,47 +58,42 @@ static struct .term = IPT_ERROR_INIT, /* ERROR */ }; -static struct xt_table __nat_table = { +static struct xt_table nat_table = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, .lock = __RW_LOCK_UNLOCKED(__nat_table.lock), .me = THIS_MODULE, .af = AF_INET, }; -static struct xt_table *nat_table; /* Source NAT */ -static unsigned int ipt_snat_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; - NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); + NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); /* Connection must be valid and new. */ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); - NF_CT_ASSERT(out); + NF_CT_ASSERT(par->out != NULL); return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); } /* Before 2.6.11 we did implicit source NAT if required. Warn about change. */ -static void warn_if_extra_mangle(__be32 dstip, __be32 srcip) +static void warn_if_extra_mangle(struct net *net, __be32 dstip, __be32 srcip) { static int warned = 0; struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } }; struct rtable *rt; - if (ip_route_output_key(&init_net, &rt, &fl) != 0) + if (ip_route_output_key(net, &rt, &fl) != 0) return; if (rt->rt_src != srcip && !warned) { @@ -110,40 +105,32 @@ static void warn_if_extra_mangle(__be32 dstip, __be32 srcip) ip_rt_put(rt); } -static unsigned int ipt_dnat_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; - NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING || - hooknum == NF_INET_LOCAL_OUT); + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); /* Connection must be valid and new. */ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); - if (hooknum == NF_INET_LOCAL_OUT && + if (par->hooknum == NF_INET_LOCAL_OUT && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) - warn_if_extra_mangle(ip_hdr(skb)->daddr, + warn_if_extra_mangle(dev_net(par->out), ip_hdr(skb)->daddr, mr->range[0].min_ip); return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); } -static bool ipt_snat_checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool ipt_snat_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; /* Must be a valid range */ if (mr->rangesize != 1) { @@ -153,13 +140,9 @@ static bool ipt_snat_checkentry(const char *tablename, return true; } -static bool ipt_dnat_checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool ipt_dnat_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; /* Must be a valid range */ if (mr->rangesize != 1) { @@ -194,9 +177,10 @@ int nf_nat_rule_find(struct sk_buff *skb, const struct net_device *out, struct nf_conn *ct) { + struct net *net = nf_ct_net(ct); int ret; - ret = ipt_do_table(skb, hooknum, in, out, nat_table); + ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); if (ret == NF_ACCEPT) { if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) @@ -226,14 +210,32 @@ static struct xt_target ipt_dnat_reg __read_mostly = { .family = AF_INET, }; +static int __net_init nf_nat_rule_net_init(struct net *net) +{ + net->ipv4.nat_table = ipt_register_table(net, &nat_table, + &nat_initial_table.repl); + if (IS_ERR(net->ipv4.nat_table)) + return PTR_ERR(net->ipv4.nat_table); + return 0; +} + +static void __net_exit nf_nat_rule_net_exit(struct net *net) +{ + ipt_unregister_table(net->ipv4.nat_table); +} + +static struct pernet_operations nf_nat_rule_net_ops = { + .init = nf_nat_rule_net_init, + .exit = nf_nat_rule_net_exit, +}; + int __init nf_nat_rule_init(void) { int ret; - nat_table = ipt_register_table(&init_net, &__nat_table, - &nat_initial_table.repl); - if (IS_ERR(nat_table)) - return PTR_ERR(nat_table); + ret = register_pernet_subsys(&nf_nat_rule_net_ops); + if (ret != 0) + goto out; ret = xt_register_target(&ipt_snat_reg); if (ret != 0) goto unregister_table; @@ -247,8 +249,8 @@ int __init nf_nat_rule_init(void) unregister_snat: xt_unregister_target(&ipt_snat_reg); unregister_table: - ipt_unregister_table(nat_table); - + unregister_pernet_subsys(&nf_nat_rule_net_ops); + out: return ret; } @@ -256,5 +258,5 @@ void nf_nat_rule_cleanup(void) { xt_unregister_target(&ipt_dnat_reg); xt_unregister_target(&ipt_snat_reg); - ipt_unregister_table(nat_table); + unregister_pernet_subsys(&nf_nat_rule_net_ops); } diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index 4334d5cabc5b..14544320c545 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -318,11 +318,11 @@ static int mangle_content_len(struct sk_buff *skb, buffer, buflen); } -static unsigned mangle_sdp_packet(struct sk_buff *skb, const char **dptr, - unsigned int dataoff, unsigned int *datalen, - enum sdp_header_types type, - enum sdp_header_types term, - char *buffer, int buflen) +static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr, + unsigned int dataoff, unsigned int *datalen, + enum sdp_header_types type, + enum sdp_header_types term, + char *buffer, int buflen) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); @@ -330,9 +330,9 @@ static unsigned mangle_sdp_packet(struct sk_buff *skb, const char **dptr, if (ct_sip_get_sdp_header(ct, *dptr, dataoff, *datalen, type, term, &matchoff, &matchlen) <= 0) - return 0; + return -ENOENT; return mangle_packet(skb, dptr, datalen, matchoff, matchlen, - buffer, buflen); + buffer, buflen) ? 0 : -EINVAL; } static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr, @@ -346,8 +346,8 @@ static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr, unsigned int buflen; buflen = sprintf(buffer, NIPQUAD_FMT, NIPQUAD(addr->ip)); - if (!mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term, - buffer, buflen)) + if (mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term, + buffer, buflen)) return 0; return mangle_content_len(skb, dptr, datalen); @@ -381,15 +381,27 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr, /* Mangle session description owner and contact addresses */ buflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(addr->ip)); - if (!mangle_sdp_packet(skb, dptr, dataoff, datalen, + if (mangle_sdp_packet(skb, dptr, dataoff, datalen, SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA, buffer, buflen)) return 0; - if (!mangle_sdp_packet(skb, dptr, dataoff, datalen, - SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA, - buffer, buflen)) + switch (mangle_sdp_packet(skb, dptr, dataoff, datalen, + SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA, + buffer, buflen)) { + case 0: + /* + * RFC 2327: + * + * Session description + * + * c=* (connection information - not required if included in all media) + */ + case -ENOENT: + break; + default: return 0; + } return mangle_content_len(skb, dptr, datalen); } diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ffeaffc3fffe..8303e4b406c0 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -742,6 +742,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); if (*obj == NULL) { + kfree(p); kfree(id); if (net_ratelimit()) printk("OOM in bsalg (%d)\n", __LINE__); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 552169b41b16..8f5a403f6f6b 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,8 +7,6 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.45 2001/05/16 16:45:35 davem Exp $ - * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de> @@ -73,32 +71,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) static int sockstat_seq_open(struct inode *inode, struct file *file) { - int err; - struct net *net; - - err = -ENXIO; - net = get_proc_net(inode); - if (net == NULL) - goto err_net; - - err = single_open(file, sockstat_seq_show, net); - if (err < 0) - goto err_open; - - return 0; - -err_open: - put_net(net); -err_net: - return err; -} - -static int sockstat_seq_release(struct inode *inode, struct file *file) -{ - struct net *net = ((struct seq_file *)file->private_data)->private; - - put_net(net); - return single_release(inode, file); + return single_open_net(inode, file, sockstat_seq_show); } static const struct file_operations sockstat_seq_fops = { @@ -106,7 +79,7 @@ static const struct file_operations sockstat_seq_fops = { .open = sockstat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = sockstat_seq_release, + .release = single_release_net, }; /* snmp items */ @@ -259,6 +232,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD), SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO), SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), + SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), + SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), SNMP_MIB_SENTINEL }; @@ -268,11 +243,12 @@ static void icmpmsg_put(struct seq_file *seq) int j, i, count; static int out[PERLINE]; + struct net *net = seq->private; count = 0; for (i = 0; i < ICMPMSG_MIB_MAX; i++) { - if (snmp_fold_field((void **) icmpmsg_statistics, i)) + if (snmp_fold_field((void **) net->mib.icmpmsg_statistics, i)) out[count++] = i; if (count < PERLINE) continue; @@ -284,7 +260,7 @@ static void icmpmsg_put(struct seq_file *seq) seq_printf(seq, "\nIcmpMsg: "); for (j = 0; j < PERLINE; ++j) seq_printf(seq, " %lu", - snmp_fold_field((void **) icmpmsg_statistics, + snmp_fold_field((void **) net->mib.icmpmsg_statistics, out[j])); seq_putc(seq, '\n'); } @@ -296,7 +272,7 @@ static void icmpmsg_put(struct seq_file *seq) seq_printf(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %lu", snmp_fold_field((void **) - icmpmsg_statistics, out[j])); + net->mib.icmpmsg_statistics, out[j])); } #undef PERLINE @@ -305,6 +281,7 @@ static void icmpmsg_put(struct seq_file *seq) static void icmp_put(struct seq_file *seq) { int i; + struct net *net = seq->private; seq_puts(seq, "\nIcmp: InMsgs InErrors"); for (i=0; icmpmibmap[i].name != NULL; i++) @@ -313,18 +290,18 @@ static void icmp_put(struct seq_file *seq) for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu", - snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INMSGS), - snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INERRORS)); + snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **) icmpmsg_statistics, + snmp_fold_field((void **) net->mib.icmpmsg_statistics, icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", - snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTERRORS)); + snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **) icmpmsg_statistics, + snmp_fold_field((void **) net->mib.icmpmsg_statistics, icmpmibmap[i].index | 0x100)); } @@ -334,6 +311,7 @@ static void icmp_put(struct seq_file *seq) static int snmp_seq_show(struct seq_file *seq, void *v) { int i; + struct net *net = seq->private; seq_puts(seq, "Ip: Forwarding DefaultTTL"); @@ -341,12 +319,12 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", - IPV4_DEVCONF_ALL(&init_net, FORWARDING) ? 1 : 2, + IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, sysctl_ip_default_ttl); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)ip_statistics, + snmp_fold_field((void **)net->mib.ip_statistics, snmp4_ipstats_list[i].entry)); icmp_put(seq); /* RFC 2011 compatibility */ @@ -361,11 +339,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v) /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - snmp_fold_field((void **)tcp_statistics, + snmp_fold_field((void **)net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - snmp_fold_field((void **)tcp_statistics, + snmp_fold_field((void **)net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); } @@ -376,7 +354,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)udp_statistics, + snmp_fold_field((void **)net->mib.udp_statistics, snmp4_udp_list[i].entry)); /* the UDP and UDP-Lite MIBs are the same */ @@ -387,7 +365,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)udplite_statistics, + snmp_fold_field((void **)net->mib.udplite_statistics, snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); @@ -396,7 +374,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) static int snmp_seq_open(struct inode *inode, struct file *file) { - return single_open(file, snmp_seq_show, NULL); + return single_open_net(inode, file, snmp_seq_show); } static const struct file_operations snmp_seq_fops = { @@ -404,7 +382,7 @@ static const struct file_operations snmp_seq_fops = { .open = snmp_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = single_release_net, }; @@ -415,6 +393,7 @@ static const struct file_operations snmp_seq_fops = { static int netstat_seq_show(struct seq_file *seq, void *v) { int i; + struct net *net = seq->private; seq_puts(seq, "TcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) @@ -423,7 +402,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net_statistics, + snmp_fold_field((void **)net->mib.net_statistics, snmp4_net_list[i].entry)); seq_puts(seq, "\nIpExt:"); @@ -433,7 +412,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nIpExt:"); for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)ip_statistics, + snmp_fold_field((void **)net->mib.ip_statistics, snmp4_ipextstats_list[i].entry)); seq_putc(seq, '\n'); @@ -442,7 +421,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) static int netstat_seq_open(struct inode *inode, struct file *file) { - return single_open(file, netstat_seq_show, NULL); + return single_open_net(inode, file, netstat_seq_show); } static const struct file_operations netstat_seq_fops = { @@ -450,18 +429,32 @@ static const struct file_operations netstat_seq_fops = { .open = netstat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = single_release_net, }; static __net_init int ip_proc_init_net(struct net *net) { if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops)) - return -ENOMEM; + goto out_sockstat; + if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops)) + goto out_netstat; + if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops)) + goto out_snmp; + return 0; + +out_snmp: + proc_net_remove(net, "netstat"); +out_netstat: + proc_net_remove(net, "sockstat"); +out_sockstat: + return -ENOMEM; } static __net_exit void ip_proc_exit_net(struct net *net) { + proc_net_remove(net, "snmp"); + proc_net_remove(net, "netstat"); proc_net_remove(net, "sockstat"); } @@ -472,24 +465,6 @@ static __net_initdata struct pernet_operations ip_proc_ops = { int __init ip_misc_proc_init(void) { - int rc = 0; - - if (register_pernet_subsys(&ip_proc_ops)) - goto out_pernet; - - if (!proc_net_fops_create(&init_net, "netstat", S_IRUGO, &netstat_seq_fops)) - goto out_netstat; - - if (!proc_net_fops_create(&init_net, "snmp", S_IRUGO, &snmp_seq_fops)) - goto out_snmp; -out: - return rc; -out_snmp: - proc_net_remove(&init_net, "netstat"); -out_netstat: - unregister_pernet_subsys(&ip_proc_ops); -out_pernet: - rc = -ENOMEM; - goto out; + return register_pernet_subsys(&ip_proc_ops); } diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 971ab9356e51..ea50da0649fd 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -5,8 +5,6 @@ * * INET protocol dispatch tables. * - * Version: $Id: protocol.c,v 1.14 2001/05/18 02:25:49 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 37a1ecd9d600..cd975743bcd2 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,8 +5,6 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.64 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * @@ -322,6 +320,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct iphdr *iph; struct sk_buff *skb; unsigned int iphlen; @@ -370,7 +369,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(((struct icmphdr *) + icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, @@ -386,7 +385,7 @@ error_fault: err = -EFAULT; kfree_skb(skb); error: - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); return err; } @@ -608,12 +607,11 @@ static void raw_close(struct sock *sk, long timeout) sk_common_release(sk); } -static int raw_destroy(struct sock *sk) +static void raw_destroy(struct sock *sk) { lock_sock(sk); ip_flush_pending_frames(sk); release_sock(sk); - return 0; } /* This gets rid of all the nasties in af_inet. -DaveM */ @@ -947,7 +945,7 @@ static int raw_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_printf(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " - "inode drops\n"); + "inode ref pointer drops\n"); else raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); return 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 96be336064fb..2ea6dcc3e2cc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,8 +5,6 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> @@ -134,7 +132,6 @@ static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; static void rt_worker_func(struct work_struct *work); static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); -static struct timer_list rt_secret_timer; /* * Interface to generic destination cache. @@ -253,20 +250,25 @@ static inline void rt_hash_lock_init(void) static struct rt_hash_bucket *rt_hash_table __read_mostly; static unsigned rt_hash_mask __read_mostly; static unsigned int rt_hash_log __read_mostly; -static atomic_t rt_genid __read_mostly; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) \ (__raw_get_cpu_var(rt_cache_stat).field++) -static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx) +static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, + int genid) { return jhash_3words((__force u32)(__be32)(daddr), (__force u32)(__be32)(saddr), - idx, atomic_read(&rt_genid)) + idx, genid) & rt_hash_mask; } +static inline int rt_genid(struct net *net) +{ + return atomic_read(&net->ipv4.rt_genid); +} + #ifdef CONFIG_PROC_FS struct rt_cache_iter_state { struct seq_net_private p; @@ -280,6 +282,8 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) struct rtable *r = NULL; for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { + if (!rt_hash_table[st->bucket].chain) + continue; rcu_read_lock_bh(); r = rcu_dereference(rt_hash_table[st->bucket].chain); while (r) { @@ -297,11 +301,14 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, struct rtable *r) { struct rt_cache_iter_state *st = seq->private; + r = r->u.dst.rt_next; while (!r) { rcu_read_unlock_bh(); - if (--st->bucket < 0) - break; + do { + if (--st->bucket < 0) + return NULL; + } while (!rt_hash_table[st->bucket].chain); rcu_read_lock_bh(); r = rt_hash_table[st->bucket].chain; } @@ -336,7 +343,7 @@ static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) struct rt_cache_iter_state *st = seq->private; if (*pos) return rt_cache_get_idx(seq, *pos - 1); - st->genid = atomic_read(&rt_genid); + st->genid = rt_genid(seq_file_net(seq)); return SEQ_START_TOKEN; } @@ -683,6 +690,11 @@ static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev); } +static inline int rt_is_expired(struct rtable *rth) +{ + return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev)); +} + /* * Perform a full scan of hash table and free all entries. * Can be called by a softirq or a process. @@ -692,6 +704,7 @@ static void rt_do_flush(int process_context) { unsigned int i; struct rtable *rth, *next; + struct rtable * tail; for (i = 0; i <= rt_hash_mask; i++) { if (process_context && need_resched()) @@ -701,11 +714,39 @@ static void rt_do_flush(int process_context) continue; spin_lock_bh(rt_hash_lock_addr(i)); +#ifdef CONFIG_NET_NS + { + struct rtable ** prev, * p; + + rth = rt_hash_table[i].chain; + + /* defer releasing the head of the list after spin_unlock */ + for (tail = rth; tail; tail = tail->u.dst.rt_next) + if (!rt_is_expired(tail)) + break; + if (rth != tail) + rt_hash_table[i].chain = tail; + + /* call rt_free on entries after the tail requiring flush */ + prev = &rt_hash_table[i].chain; + for (p = *prev; p; p = next) { + next = p->u.dst.rt_next; + if (!rt_is_expired(p)) { + prev = &p->u.dst.rt_next; + } else { + *prev = next; + rt_free(p); + } + } + } +#else rth = rt_hash_table[i].chain; rt_hash_table[i].chain = NULL; + tail = NULL; +#endif spin_unlock_bh(rt_hash_lock_addr(i)); - for (; rth; rth = next) { + for (; rth != tail; rth = next) { next = rth->u.dst.rt_next; rt_free(rth); } @@ -738,7 +779,7 @@ static void rt_check_expire(void) continue; spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { - if (rth->rt_genid != atomic_read(&rt_genid)) { + if (rt_is_expired(rth)) { *rthp = rth->u.dst.rt_next; rt_free(rth); continue; @@ -781,21 +822,21 @@ static void rt_worker_func(struct work_struct *work) * many times (2^24) without giving recent rt_genid. * Jenkins hash is strong enough that litle changes of rt_genid are OK. */ -static void rt_cache_invalidate(void) +static void rt_cache_invalidate(struct net *net) { unsigned char shuffle; get_random_bytes(&shuffle, sizeof(shuffle)); - atomic_add(shuffle + 1U, &rt_genid); + atomic_add(shuffle + 1U, &net->ipv4.rt_genid); } /* * delay < 0 : invalidate cache (fast : entries will be deleted later) * delay >= 0 : invalidate & flush cache (can be long) */ -void rt_cache_flush(int delay) +void rt_cache_flush(struct net *net, int delay) { - rt_cache_invalidate(); + rt_cache_invalidate(net); if (delay >= 0) rt_do_flush(!in_softirq()); } @@ -803,10 +844,11 @@ void rt_cache_flush(int delay) /* * We change rt_genid and let gc do the cleanup */ -static void rt_secret_rebuild(unsigned long dummy) +static void rt_secret_rebuild(unsigned long __net) { - rt_cache_invalidate(); - mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval); + struct net *net = (struct net *)__net; + rt_cache_invalidate(net); + mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); } /* @@ -882,7 +924,7 @@ static int rt_garbage_collect(struct dst_ops *ops) rthp = &rt_hash_table[k].chain; spin_lock_bh(rt_hash_lock_addr(k)); while ((rth = *rthp) != NULL) { - if (rth->rt_genid == atomic_read(&rt_genid) && + if (!rt_is_expired(rth) && !rt_may_expire(rth, tmo, expire)) { tmo >>= 1; rthp = &rth->u.dst.rt_next; @@ -964,7 +1006,7 @@ restart: spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { - if (rth->rt_genid != atomic_read(&rt_genid)) { + if (rt_is_expired(rth)) { *rthp = rth->u.dst.rt_next; rt_free(rth); continue; @@ -1067,7 +1109,12 @@ restart: printk("\n"); } #endif - rt_hash_table[hash].chain = rt; + /* + * Since lookup is lockfree, we must make sure + * previous writes to rt are comitted to memory + * before making rt visible to other CPUS. + */ + rcu_assign_pointer(rt_hash_table[hash].chain, rt); spin_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; return 0; @@ -1140,7 +1187,7 @@ static void rt_del(unsigned hash, struct rtable *rt) spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); while ((aux = *rthp) != NULL) { - if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) { + if (aux == rt || rt_is_expired(aux)) { *rthp = aux->u.dst.rt_next; rt_free(aux); continue; @@ -1182,7 +1229,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, for (i = 0; i < 2; i++) { for (k = 0; k < 2; k++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]); + unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], + rt_genid(net)); rthp=&rt_hash_table[hash].chain; @@ -1194,7 +1242,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rth->fl.fl4_src != skeys[i] || rth->fl.oif != ikeys[k] || rth->fl.iif != 0 || - rth->rt_genid != atomic_read(&rt_genid) || + rt_is_expired(rth) || !net_eq(dev_net(rth->u.dst.dev), net)) { rthp = &rth->u.dst.rt_next; continue; @@ -1233,7 +1281,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rt->u.dst.neighbour = NULL; rt->u.dst.hh = NULL; rt->u.dst.xfrm = NULL; - rt->rt_genid = atomic_read(&rt_genid); + rt->rt_genid = rt_genid(net); rt->rt_flags |= RTCF_REDIRECTED; /* Gateway is different ... */ @@ -1297,7 +1345,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) } else if ((rt->rt_flags & RTCF_REDIRECTED) || rt->u.dst.expires) { unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, - rt->fl.oif); + rt->fl.oif, + rt_genid(dev_net(dst->dev))); #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ipv4_negative_advice: redirect to " NIPQUAD_FMT "/%02x dropped\n", @@ -1390,7 +1439,8 @@ static int ip_error(struct sk_buff *skb) break; case ENETUNREACH: code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES); + IP_INC_STATS_BH(dev_net(rt->u.dst.dev), + IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; @@ -1446,7 +1496,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, for (k = 0; k < 2; k++) { for (i = 0; i < 2; i++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]); + unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], + rt_genid(net)); rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; @@ -1461,21 +1512,21 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, rth->fl.iif != 0 || dst_metric_locked(&rth->u.dst, RTAX_MTU) || !net_eq(dev_net(rth->u.dst.dev), net) || - rth->rt_genid != atomic_read(&rt_genid)) + rt_is_expired(rth)) continue; if (new_mtu < 68 || new_mtu >= old_mtu) { /* BSD 4.2 compatibility hack :-( */ if (mtu == 0 && - old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) && + old_mtu >= dst_mtu(&rth->u.dst) && old_mtu >= 68 + (iph->ihl << 2)) old_mtu -= iph->ihl << 2; mtu = guess_mtu(old_mtu); } - if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) { - if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) { + if (mtu <= dst_mtu(&rth->u.dst)) { + if (mtu < dst_mtu(&rth->u.dst)) { dst_confirm(&rth->u.dst); if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; @@ -1497,7 +1548,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) { - if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 && + if (dst_mtu(dst) > mtu && mtu >= 68 && !(dst_metric_locked(dst, RTAX_MTU))) { if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; @@ -1626,7 +1677,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0) rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; - if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU) + if (dst_mtu(&rt->u.dst) > IP_MAX_MTU) rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0) rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, @@ -1696,7 +1747,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; - rth->rt_genid = atomic_read(&rt_genid); + rth->rt_genid = rt_genid(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; if (our) { @@ -1711,7 +1762,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, RT_CACHE_STAT_INC(in_slow_mc); in_dev_put(in_dev); - hash = rt_hash(daddr, saddr, dev->ifindex); + hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); return rt_intern_hash(hash, rth, &skb->rtable); e_nobufs: @@ -1837,7 +1888,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; - rth->rt_genid = atomic_read(&rt_genid); + rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); rt_set_nexthop(rth, res, itag); @@ -1872,7 +1923,8 @@ static int ip_mkroute_input(struct sk_buff *skb, return err; /* put it into the cache */ - hash = rt_hash(daddr, saddr, fl->iif); + hash = rt_hash(daddr, saddr, fl->iif, + rt_genid(dev_net(rth->u.dst.dev))); return rt_intern_hash(hash, rth, &skb->rtable); } @@ -1998,7 +2050,7 @@ local_input: goto e_nobufs; rth->u.dst.output= ip_rt_bug; - rth->rt_genid = atomic_read(&rt_genid); + rth->rt_genid = rt_genid(net); atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; @@ -2028,7 +2080,7 @@ local_input: rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; - hash = rt_hash(daddr, saddr, fl.iif); + hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); err = rt_intern_hash(hash, rth, &skb->rtable); goto done; @@ -2079,7 +2131,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, net = dev_net(dev); tos &= IPTOS_RT_MASK; - hash = rt_hash(daddr, saddr, iif); + hash = rt_hash(daddr, saddr, iif, rt_genid(net)); rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; @@ -2091,7 +2143,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, (rth->fl.fl4_tos ^ tos)) == 0 && rth->fl.mark == skb->mark && net_eq(dev_net(rth->u.dst.dev), net) && - rth->rt_genid == atomic_read(&rt_genid)) { + !rt_is_expired(rth)) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); @@ -2219,7 +2271,7 @@ static int __mkroute_output(struct rtable **result, rth->rt_spec_dst= fl->fl4_src; rth->u.dst.output=ip_output; - rth->rt_genid = atomic_read(&rt_genid); + rth->rt_genid = rt_genid(dev_net(dev_out)); RT_CACHE_STAT_INC(out_slow_tot); @@ -2268,7 +2320,8 @@ static int ip_mkroute_output(struct rtable **rp, int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); unsigned hash; if (err == 0) { - hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif); + hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, + rt_genid(dev_net(dev_out))); err = rt_intern_hash(hash, rth, rp); } @@ -2313,11 +2366,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, ipv4_is_zeronet(oldflp->fl4_src)) goto out; - /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(net, oldflp->fl4_src); - if (dev_out == NULL) - goto out; - /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: 1. ip_dev_find(net, saddr) can return wrong iface, if saddr @@ -2329,6 +2377,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, if (oldflp->oif == 0 && (ipv4_is_multicast(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) { + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(net, oldflp->fl4_src); + if (dev_out == NULL) + goto out; + /* Special hack: user can direct multicasts and limited broadcast via necessary interface without fiddling with IP_MULTICAST_IF or IP_PKTINFO. @@ -2347,9 +2400,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, fl.oif = dev_out->ifindex; goto make_route; } - if (dev_out) + + if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(net, oldflp->fl4_src); + if (dev_out == NULL) + goto out; dev_put(dev_out); - dev_out = NULL; + dev_out = NULL; + } } @@ -2480,7 +2539,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, unsigned hash; struct rtable *rth; - hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif); + hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); rcu_read_lock_bh(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; @@ -2493,7 +2552,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && net_eq(dev_net(rth->u.dst.dev), net) && - rth->rt_genid == atomic_read(&rt_genid)) { + !rt_is_expired(rth)) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); @@ -2524,7 +2583,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { }; -static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp) +static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp) { struct rtable *ort = *rp; struct rtable *rt = (struct rtable *) @@ -2548,7 +2607,7 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp) rt->idev = ort->idev; if (rt->idev) in_dev_hold(rt->idev); - rt->rt_genid = atomic_read(&rt_genid); + rt->rt_genid = rt_genid(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_dst = ort->rt_dst; @@ -2584,7 +2643,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags ? XFRM_LOOKUP_WAIT : 0); if (err == -EREMOTE) - err = ipv4_dst_blackhole(rp, flp); + err = ipv4_dst_blackhole(net, rp, flp); return err; } @@ -2797,13 +2856,15 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) if (s_h < 0) s_h = 0; s_idx = idx = cb->args[1]; - for (h = s_h; h <= rt_hash_mask; h++) { + for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { + if (!rt_hash_table[h].chain) + continue; rcu_read_lock_bh(); for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; rt = rcu_dereference(rt->u.dst.rt_next), idx++) { if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) continue; - if (rt->rt_genid != atomic_read(&rt_genid)) + if (rt_is_expired(rt)) continue; skb->dst = dst_clone(&rt->u.dst); if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, @@ -2816,7 +2877,6 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) dst_release(xchg(&skb->dst, NULL)); } rcu_read_unlock_bh(); - s_idx = 0; } done: @@ -2827,19 +2887,25 @@ done: void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(0); + rt_cache_flush(dev_net(in_dev->dev), 0); } #ifdef CONFIG_SYSCTL -static int flush_delay; - -static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, +static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { if (write) { - proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - rt_cache_flush(flush_delay); + int flush_delay; + ctl_table ctl; + struct net *net; + + memcpy(&ctl, __ctl, sizeof(ctl)); + ctl.data = &flush_delay; + proc_dointvec(&ctl, write, filp, buffer, lenp, ppos); + + net = (struct net *)__ctl->extra1; + rt_cache_flush(net, flush_delay); return 0; } @@ -2847,32 +2913,82 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, } static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, - int __user *name, - int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { int delay; + struct net *net; if (newlen != sizeof(int)) return -EINVAL; if (get_user(delay, (int __user *)newval)) return -EFAULT; - rt_cache_flush(delay); + net = (struct net *)table->extra1; + rt_cache_flush(net, delay); return 0; } -ctl_table ipv4_route_table[] = { - { - .ctl_name = NET_IPV4_ROUTE_FLUSH, - .procname = "flush", - .data = &flush_delay, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = &ipv4_sysctl_rtcache_flush, - .strategy = &ipv4_sysctl_rtcache_flush_strategy, - }, +static void rt_secret_reschedule(int old) +{ + struct net *net; + int new = ip_rt_secret_interval; + int diff = new - old; + + if (!diff) + return; + + rtnl_lock(); + for_each_net(net) { + int deleted = del_timer_sync(&net->ipv4.rt_secret_timer); + + if (!new) + continue; + + if (deleted) { + long time = net->ipv4.rt_secret_timer.expires - jiffies; + + if (time <= 0 || (time += diff) <= 0) + time = 0; + + net->ipv4.rt_secret_timer.expires = time; + } else + net->ipv4.rt_secret_timer.expires = new; + + net->ipv4.rt_secret_timer.expires += jiffies; + add_timer(&net->ipv4.rt_secret_timer); + } + rtnl_unlock(); +} + +static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, + struct file *filp, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old = ip_rt_secret_interval; + int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos); + + rt_secret_reschedule(old); + + return ret; +} + +static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table, + void __user *oldval, + size_t __user *oldlenp, + void __user *newval, + size_t newlen) +{ + int old = ip_rt_secret_interval; + int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen); + + rt_secret_reschedule(old); + + return ret; +} + +static ctl_table ipv4_route_table[] = { { .ctl_name = NET_IPV4_ROUTE_GC_THRESH, .procname = "gc_thresh", @@ -3006,13 +3122,120 @@ ctl_table ipv4_route_table[] = { .data = &ip_rt_secret_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = &ipv4_sysctl_rt_secret_interval, + .strategy = &ipv4_sysctl_rt_secret_interval_strategy, }, { .ctl_name = 0 } }; + +static struct ctl_table empty[1]; + +static struct ctl_table ipv4_skeleton[] = +{ + { .procname = "route", .ctl_name = NET_IPV4_ROUTE, + .mode = 0555, .child = ipv4_route_table}, + { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH, + .mode = 0555, .child = empty}, + { } +}; + +static __net_initdata struct ctl_path ipv4_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { }, +}; + +static struct ctl_table ipv4_route_flush_table[] = { + { + .ctl_name = NET_IPV4_ROUTE_FLUSH, + .procname = "flush", + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = &ipv4_sysctl_rtcache_flush, + .strategy = &ipv4_sysctl_rtcache_flush_strategy, + }, + { .ctl_name = 0 }, +}; + +static __net_initdata struct ctl_path ipv4_route_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "route", .ctl_name = NET_IPV4_ROUTE, }, + { }, +}; + +static __net_init int sysctl_route_net_init(struct net *net) +{ + struct ctl_table *tbl; + + tbl = ipv4_route_flush_table; + if (net != &init_net) { + tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); + if (tbl == NULL) + goto err_dup; + } + tbl[0].extra1 = net; + + net->ipv4.route_hdr = + register_net_sysctl_table(net, ipv4_route_path, tbl); + if (net->ipv4.route_hdr == NULL) + goto err_reg; + return 0; + +err_reg: + if (tbl != ipv4_route_flush_table) + kfree(tbl); +err_dup: + return -ENOMEM; +} + +static __net_exit void sysctl_route_net_exit(struct net *net) +{ + struct ctl_table *tbl; + + tbl = net->ipv4.route_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.route_hdr); + BUG_ON(tbl == ipv4_route_flush_table); + kfree(tbl); +} + +static __net_initdata struct pernet_operations sysctl_route_ops = { + .init = sysctl_route_net_init, + .exit = sysctl_route_net_exit, +}; #endif + +static __net_init int rt_secret_timer_init(struct net *net) +{ + atomic_set(&net->ipv4.rt_genid, + (int) ((num_physpages ^ (num_physpages>>8)) ^ + (jiffies ^ (jiffies >> 7)))); + + net->ipv4.rt_secret_timer.function = rt_secret_rebuild; + net->ipv4.rt_secret_timer.data = (unsigned long)net; + init_timer_deferrable(&net->ipv4.rt_secret_timer); + + if (ip_rt_secret_interval) { + net->ipv4.rt_secret_timer.expires = + jiffies + net_random() % ip_rt_secret_interval + + ip_rt_secret_interval; + add_timer(&net->ipv4.rt_secret_timer); + } + return 0; +} + +static __net_exit void rt_secret_timer_exit(struct net *net) +{ + del_timer_sync(&net->ipv4.rt_secret_timer); +} + +static __net_initdata struct pernet_operations rt_secret_timer_ops = { + .init = rt_secret_timer_init, + .exit = rt_secret_timer_exit, +}; + + #ifdef CONFIG_NET_CLS_ROUTE struct ip_rt_acct *ip_rt_acct __read_mostly; #endif /* CONFIG_NET_CLS_ROUTE */ @@ -3031,9 +3254,6 @@ int __init ip_rt_init(void) { int rc = 0; - atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^ - (jiffies ^ (jiffies >> 7)))); - #ifdef CONFIG_NET_CLS_ROUTE ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); if (!ip_rt_acct) @@ -3065,19 +3285,14 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); - rt_secret_timer.function = rt_secret_rebuild; - rt_secret_timer.data = 0; - init_timer_deferrable(&rt_secret_timer); - /* All the timers, started at system startup tend to synchronize. Perturb it a bit. */ schedule_delayed_work(&expires_work, net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + - ip_rt_secret_interval; - add_timer(&rt_secret_timer); + if (register_pernet_subsys(&rt_secret_timer_ops)) + printk(KERN_ERR "Unable to setup rt_secret_timer\n"); if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); @@ -3087,9 +3302,23 @@ int __init ip_rt_init(void) #endif rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); +#ifdef CONFIG_SYSCTL + register_pernet_subsys(&sysctl_route_ops); +#endif return rc; } +#ifdef CONFIG_SYSCTL +/* + * We really need to sanitize the damn ipv4 init order, then all + * this nonsense will go away. + */ +void __init ip_static_sysctl_init(void) +{ + register_sysctl_paths(ipv4_path, ipv4_skeleton); +} +#endif + EXPORT_SYMBOL(__ip_select_ident); EXPORT_SYMBOL(ip_route_input); EXPORT_SYMBOL(ip_route_output_key); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index d182a2a26291..d346c22aa6ae 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -8,8 +8,6 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. - * - * $Id: syncookies.c,v 1.18 2002/02/01 22:01:04 davem Exp $ */ #include <linux/tcp.h> @@ -18,6 +16,7 @@ #include <linux/cryptohash.h> #include <linux/kernel.h> #include <net/tcp.h> +#include <net/route.h> /* Timestamps: lowest 9 bits store TCP options */ #define TSBITS 9 @@ -175,7 +174,7 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) ; *mssp = msstab[mssind] + 1; - NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), @@ -271,11 +270,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) || (mss = cookie_check(skb, cookie)) == 0) { - NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESFAILED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } - NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); @@ -298,9 +297,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; req->mss = mss; + ireq->loc_port = th->dest; ireq->rmt_port = th->source; ireq->loc_addr = ip_hdr(skb)->daddr; ireq->rmt_addr = ip_hdr(skb)->saddr; + ireq->ecn_ok = 0; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->rcv_wscale = tcp_opt.rcv_wscale; ireq->sack_ok = tcp_opt.sack_ok; @@ -338,6 +339,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, .saddr = ireq->loc_addr, .tos = RT_CONN_FLAGS(sk) } }, .proto = IPPROTO_TCP, + .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = { .sport = th->dest, .dport = th->source } } }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index c437f804ee38..1bb10df8ce7d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,8 +1,6 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * - * $Id: sysctl_net_ipv4.c,v 1.50 2001/10/20 00:00:11 davem Exp $ - * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] */ @@ -28,16 +26,13 @@ static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; -extern seqlock_t sysctl_port_range_lock; -extern int sysctl_local_port_range[2]; - /* Update system visible IP port range */ static void set_local_port_range(int range[2]) { - write_seqlock(&sysctl_port_range_lock); - sysctl_local_port_range[0] = range[0]; - sysctl_local_port_range[1] = range[1]; - write_sequnlock(&sysctl_port_range_lock); + write_seqlock(&sysctl_local_ports.lock); + sysctl_local_ports.range[0] = range[0]; + sysctl_local_ports.range[1] = range[1]; + write_sequnlock(&sysctl_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -46,8 +41,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, size_t *lenp, loff_t *ppos) { int ret; - int range[2] = { sysctl_local_port_range[0], - sysctl_local_port_range[1] }; + int range[2]; ctl_table tmp = { .data = &range, .maxlen = sizeof(range), @@ -56,6 +50,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, .extra2 = &ip_local_port_range_max, }; + inet_get_local_port_range(range, range + 1); ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); if (write && ret == 0) { @@ -69,14 +64,13 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, } /* Validate changes from sysctl interface. */ -static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, - int nlen, void __user *oldval, +static int ipv4_sysctl_local_port_range(ctl_table *table, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { int ret; - int range[2] = { sysctl_local_port_range[0], - sysctl_local_port_range[1] }; + int range[2]; ctl_table tmp = { .data = &range, .maxlen = sizeof(range), @@ -85,7 +79,8 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, .extra2 = &ip_local_port_range_max, }; - ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen); + inet_get_local_port_range(range, range + 1); + ret = sysctl_intvec(&tmp, oldval, oldlenp, newval, newlen); if (ret == 0 && newval && newlen) { if (range[1] < range[0]) ret = -EINVAL; @@ -114,8 +109,8 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * return ret; } -static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, - int nlen, void __user *oldval, +static int sysctl_tcp_congestion_control(ctl_table *table, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -127,7 +122,7 @@ static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int ret; tcp_get_default_congestion_control(val); - ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen); + ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen); if (ret == 1 && newval && newlen) ret = tcp_set_default_congestion_control(val); return ret; @@ -170,8 +165,8 @@ static int proc_allowed_congestion_control(ctl_table *ctl, return ret; } -static int strategy_allowed_congestion_control(ctl_table *table, int __user *name, - int nlen, void __user *oldval, +static int strategy_allowed_congestion_control(ctl_table *table, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) @@ -184,7 +179,7 @@ static int strategy_allowed_congestion_control(ctl_table *table, int __user *nam return -ENOMEM; tcp_get_available_congestion_control(tbl.data, tbl.maxlen); - ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen); + ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen); if (ret == 1 && newval && newlen) ret = tcp_set_allowed_congestion_control(tbl.data); kfree(tbl.data); @@ -234,6 +229,7 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &ipv4_doint_and_flush, .strategy = &ipv4_doint_and_flush_strategy, + .extra2 = &init_net, }, { .ctl_name = NET_IPV4_NO_PMTU_DISC, @@ -397,19 +393,12 @@ static struct ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_LOCAL_PORT_RANGE, .procname = "ip_local_port_range", - .data = &sysctl_local_port_range, - .maxlen = sizeof(sysctl_local_port_range), + .data = &sysctl_local_ports.range, + .maxlen = sizeof(sysctl_local_ports.range), .mode = 0644, .proc_handler = &ipv4_local_port_range, .strategy = &ipv4_sysctl_local_port_range, }, - { - .ctl_name = NET_IPV4_ROUTE, - .procname = "route", - .maxlen = 0, - .mode = 0555, - .child = ipv4_route_table - }, #ifdef CONFIG_IP_MULTICAST { .ctl_name = NET_IPV4_IGMP_MAX_MEMBERSHIPS, @@ -795,7 +784,8 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies }, { .ctl_name = NET_IPV4_ICMP_RATEMASK, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1d723de18686..eccb7165a80c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> @@ -279,8 +277,6 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; -DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly; - atomic_t tcp_orphan_count = ATOMIC_INIT(0); EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -318,10 +314,10 @@ int tcp_memory_pressure __read_mostly; EXPORT_SYMBOL(tcp_memory_pressure); -void tcp_enter_memory_pressure(void) +void tcp_enter_memory_pressure(struct sock *sk) { if (!tcp_memory_pressure) { - NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); tcp_memory_pressure = 1; } } @@ -346,8 +342,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events - by poll logic and correct handling of state changes - made by another threads is impossible in any case. + * by poll logic and correct handling of state changes + * made by other threads is impossible in any case. */ mask = 0; @@ -373,10 +369,10 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP * if and only if shutdown has been made in both directions. * Actually, it is interesting to look how Solaris and DUX - * solve this dilemma. I would prefer, if PULLHUP were maskable, + * solve this dilemma. I would prefer, if POLLHUP were maskable, * then we could set it on SND_SHUTDOWN. BTW examples given * in Stevens' books assume exactly this behaviour, it explains - * why PULLHUP is incompatible with POLLOUT. --ANK + * why POLLHUP is incompatible with POLLOUT. --ANK * * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK @@ -388,13 +384,17 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Connected? */ if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { + int target = sock_rcvlowat(sk, 0, INT_MAX); + + if (tp->urg_seq == tp->copied_seq && + !sock_flag(sk, SOCK_URGINLINE) && + tp->urg_data) + target--; + /* Potential race condition. If read of tp below will * escape above sk->sk_state, we can be illegally awaken * in SYN_* states. */ - if ((tp->rcv_nxt != tp->copied_seq) && - (tp->urg_seq != tp->copied_seq || - tp->rcv_nxt != tp->copied_seq + 1 || - sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) + if (tp->rcv_nxt - tp->copied_seq >= target) mask |= POLLIN | POLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { @@ -497,10 +497,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, struct sk_buff *skb) { - if (flags & MSG_OOB) { - tp->urg_mode = 1; + if (flags & MSG_OOB) tp->snd_up = tp->write_seq; - } } static inline void tcp_push(struct sock *sk, int flags, int mss_now, @@ -651,7 +649,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) } __kfree_skb(skb); } else { - sk->sk_prot->enter_memory_pressure(); + sk->sk_prot->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; @@ -1100,7 +1098,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) #if TCP_DEBUG struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); + WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); #endif if (inet_csk_ack_scheduled(sk)) { @@ -1155,13 +1153,13 @@ static void tcp_prequeue_process(struct sock *sk) struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); /* RX process wants to run with disabled BHs, though it is not * necessary */ local_bh_disable(); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk->sk_backlog_rcv(sk, skb); + sk_backlog_rcv(sk, skb); local_bh_enable(); /* Clear memory counter. */ @@ -1362,7 +1360,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto found_ok_skb; if (tcp_hdr(skb)->fin) goto found_fin_ok; - BUG_TRAP(flags & MSG_PEEK); + WARN_ON(!(flags & MSG_PEEK)); skb = skb->next; } while (skb != (struct sk_buff *)&sk->sk_receive_queue); @@ -1425,8 +1423,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, tp->ucopy.len = len; - BUG_TRAP(tp->copied_seq == tp->rcv_nxt || - (flags & (MSG_PEEK | MSG_TRUNC))); + WARN_ON(tp->copied_seq != tp->rcv_nxt && + !(flags & (MSG_PEEK | MSG_TRUNC))); /* Ugly... If prequeue is not empty, we have to * process it before releasing socket, otherwise @@ -1477,7 +1475,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* __ Restore normal policy in scheduler __ */ if ((chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); len -= chunk; copied += chunk; } @@ -1488,7 +1486,7 @@ do_prequeue: tcp_prequeue_process(sk); if ((chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; } @@ -1603,7 +1601,7 @@ skip_copy: tcp_prequeue_process(sk); if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; } @@ -1670,12 +1668,12 @@ void tcp_set_state(struct sock *sk, int state) switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) - TCP_INC_STATS(TCP_MIB_CURRESTAB); + TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; case TCP_CLOSE: if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) - TCP_INC_STATS(TCP_MIB_ESTABRESETS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash && @@ -1684,7 +1682,7 @@ void tcp_set_state(struct sock *sk, int state) /* fall through */ default: if (oldstate==TCP_ESTABLISHED) - TCP_DEC_STATS(TCP_MIB_CURRESTAB); + TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed @@ -1795,13 +1793,13 @@ void tcp_close(struct sock *sk, long timeout) */ if (data_was_unread) { /* Unread data was tossed, zap the connection. */ - NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_KERNEL); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); - NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. @@ -1848,7 +1846,7 @@ adjudge_to_death: */ local_bh_disable(); bh_lock_sock(sk); - BUG_TRAP(!sock_owned_by_user(sk)); + WARN_ON(sock_owned_by_user(sk)); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) @@ -1873,7 +1871,8 @@ adjudge_to_death: if (tp->linger2 < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); @@ -1895,7 +1894,8 @@ adjudge_to_death: "sockets\n"); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPABORTONMEMORY); } } @@ -1975,7 +1975,7 @@ int tcp_disconnect(struct sock *sk, int flags) memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); - BUG_TRAP(!inet->num || icsk->icsk_bind_hash); + WARN_ON(inet->num && !icsk->icsk_bind_hash); sk->sk_error_report(sk); return err; @@ -2590,12 +2590,69 @@ void __tcp_put_md5sig_pool(void) } EXPORT_SYMBOL(__tcp_put_md5sig_pool); + +int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, + struct tcphdr *th) +{ + struct scatterlist sg; + int err; + + __sum16 old_checksum = th->check; + th->check = 0; + /* options aren't included in the hash */ + sg_init_one(&sg, th, sizeof(struct tcphdr)); + err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); + th->check = old_checksum; + return err; +} + +EXPORT_SYMBOL(tcp_md5_hash_header); + +int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, + struct sk_buff *skb, unsigned header_len) +{ + struct scatterlist sg; + const struct tcphdr *tp = tcp_hdr(skb); + struct hash_desc *desc = &hp->md5_desc; + unsigned i; + const unsigned head_data_len = skb_headlen(skb) > header_len ? + skb_headlen(skb) - header_len : 0; + const struct skb_shared_info *shi = skb_shinfo(skb); + + sg_init_table(&sg, 1); + + sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); + if (crypto_hash_update(desc, &sg, head_data_len)) + return 1; + + for (i = 0; i < shi->nr_frags; ++i) { + const struct skb_frag_struct *f = &shi->frags[i]; + sg_set_page(&sg, f->page, f->size, f->page_offset); + if (crypto_hash_update(desc, &sg, f->size)) + return 1; + } + + return 0; +} + +EXPORT_SYMBOL(tcp_md5_hash_skb_data); + +int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) +{ + struct scatterlist sg; + + sg_init_one(&sg, key->key, key->keylen); + return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); +} + +EXPORT_SYMBOL(tcp_md5_hash_key); + #endif void tcp_done(struct sock *sk) { if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); @@ -2732,4 +2789,3 @@ EXPORT_SYMBOL(tcp_splice_read); EXPORT_SYMBOL(tcp_sendpage); EXPORT_SYMBOL(tcp_setsockopt); EXPORT_SYMBOL(tcp_shutdown); -EXPORT_SYMBOL(tcp_statistics); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 6a250828b767..4ec5b4e97c4e 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -115,7 +115,7 @@ int tcp_set_default_congestion_control(const char *name) spin_lock(&tcp_cong_list_lock); ca = tcp_ca_find(name); -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES if (!ca && capable(CAP_SYS_MODULE)) { spin_unlock(&tcp_cong_list_lock); @@ -244,7 +244,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) if (ca == icsk->icsk_ca_ops) goto out; -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES /* not found attempt to autoload module */ if (!ca && capable(CAP_SYS_MODULE)) { rcu_read_unlock(); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 2fbcc7d1b1a0..838d491dfda7 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -1,8 +1,6 @@ /* * tcp_diag.c Module for monitoring TCP transport protocols sockets. * - * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index bfcbd148a89d..c209e054a634 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -150,7 +150,11 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) ca->snd_cwnd_cents -= 128; tp->snd_cwnd_cnt = 0; } - + /* check when cwnd has not been incremented for a while */ + if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } /* clamp down slowstart cwnd to ssthresh value. */ if (is_slowstart) tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cad73b7dfef0..d77c0d29e239 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> @@ -604,7 +602,7 @@ static u32 tcp_rto_min(struct sock *sk) u32 rto_min = TCP_RTO_MIN; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) - rto_min = dst_metric(dst, RTAX_RTO_MIN); + rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); return rto_min; } @@ -731,6 +729,7 @@ void tcp_update_metrics(struct sock *sk) if (dst && (dst->flags & DST_HOST)) { const struct inet_connection_sock *icsk = inet_csk(sk); int m; + unsigned long rtt; if (icsk->icsk_backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? @@ -742,7 +741,8 @@ void tcp_update_metrics(struct sock *sk) return; } - m = dst_metric(dst, RTAX_RTT) - tp->srtt; + rtt = dst_metric_rtt(dst, RTAX_RTT); + m = rtt - tp->srtt; /* If newly calculated rtt larger than stored one, * store new one. Otherwise, use EWMA. Remember, @@ -750,12 +750,13 @@ void tcp_update_metrics(struct sock *sk) */ if (!(dst_metric_locked(dst, RTAX_RTT))) { if (m <= 0) - dst->metrics[RTAX_RTT - 1] = tp->srtt; + set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); else - dst->metrics[RTAX_RTT - 1] -= (m >> 3); + set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); } if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { + unsigned long var; if (m < 0) m = -m; @@ -764,11 +765,13 @@ void tcp_update_metrics(struct sock *sk) if (m < tp->mdev) m = tp->mdev; - if (m >= dst_metric(dst, RTAX_RTTVAR)) - dst->metrics[RTAX_RTTVAR - 1] = m; + var = dst_metric_rtt(dst, RTAX_RTTVAR); + if (m >= var) + var = m; else - dst->metrics[RTAX_RTTVAR-1] -= - (dst_metric(dst, RTAX_RTTVAR) - m)>>2; + var -= (var - m) >> 2; + + set_dst_metric_rtt(dst, RTAX_RTTVAR, var); } if (tp->snd_ssthresh >= 0xFFFF) { @@ -899,7 +902,7 @@ static void tcp_init_metrics(struct sock *sk) if (dst_metric(dst, RTAX_RTT) == 0) goto reset; - if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) + if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. @@ -916,12 +919,12 @@ static void tcp_init_metrics(struct sock *sk) * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ - if (dst_metric(dst, RTAX_RTT) > tp->srtt) { - tp->srtt = dst_metric(dst, RTAX_RTT); + if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { + tp->srtt = dst_metric_rtt(dst, RTAX_RTT); tp->rtt_seq = tp->snd_nxt; } - if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) { - tp->mdev = dst_metric(dst, RTAX_RTTVAR); + if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { + tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); } tcp_set_rto(sk); @@ -949,17 +952,21 @@ static void tcp_update_reordering(struct sock *sk, const int metric, { struct tcp_sock *tp = tcp_sk(sk); if (metric > tp->reordering) { + int mib_idx; + tp->reordering = min(TCP_MAX_REORDERING, metric); /* This exciting event is worth to be remembered. 8) */ if (ts) - NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); + mib_idx = LINUX_MIB_TCPTSREORDER; else if (tcp_is_reno(tp)) - NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); + mib_idx = LINUX_MIB_TCPRENOREORDER; else if (tcp_is_fack(tp)) - NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); + mib_idx = LINUX_MIB_TCPFACKREORDER; else - NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); + mib_idx = LINUX_MIB_TCPSACKREORDER; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); #if FASTRETRANS_DEBUG > 1 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, @@ -972,6 +979,39 @@ static void tcp_update_reordering(struct sock *sk, const int metric, } } +/* This must be called before lost_out is incremented */ +static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) +{ + if ((tp->retransmit_skb_hint == NULL) || + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + tp->retransmit_skb_hint = skb; + + if (!tp->lost_out || + after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; +} + +static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tcp_verify_retransmit_hint(tp, skb); + + tp->lost_out += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + } +} + +void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) +{ + tcp_verify_retransmit_hint(tp, skb); + + if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tp->lost_out += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + } +} + /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -1148,14 +1188,8 @@ static void tcp_mark_lost_retrans(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { - tp->lost_out += tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - } - NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); + tcp_skb_mark_lost_uncond_verify(tp, skb); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); } else { if (before(ack_seq, new_low_seq)) new_low_seq = ack_seq; @@ -1167,10 +1201,11 @@ static void tcp_mark_lost_retrans(struct sock *sk) tp->lost_retrans_low = new_low_seq; } -static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, +static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) { + struct tcp_sock *tp = tcp_sk(sk); u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); int dup_sack = 0; @@ -1178,7 +1213,7 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { dup_sack = 1; tcp_dsack_seen(tp); - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); @@ -1187,7 +1222,8 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, !before(start_seq_0, start_seq_1)) { dup_sack = 1; tcp_dsack_seen(tp); - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPDSACKOFORECV); } } @@ -1262,9 +1298,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb); - - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; } } else { if (!(sacked & TCPCB_RETRANS)) { @@ -1283,9 +1316,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; tp->lost_out -= tcp_skb_pcount(skb); - - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; } } @@ -1315,7 +1345,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); - tp->retransmit_skb_hint = NULL; } return flag; @@ -1414,10 +1443,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); - struct tcp_sack_block sp[4]; + struct tcp_sack_block sp[TCP_NUM_SACKS]; struct tcp_sack_block *cache; struct sk_buff *skb; - int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE) >> 3; + int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; int reord = tp->packets_out; int flag = 0; @@ -1432,7 +1461,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, tcp_highest_sack_reset(sk); } - found_dup_sack = tcp_check_dsack(tp, ack_skb, sp_wire, + found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) flag |= FLAG_DSACKING_ACK; @@ -1458,18 +1487,22 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, if (!tcp_is_sackblock_valid(tp, dup_sack, sp[used_sacks].start_seq, sp[used_sacks].end_seq)) { + int mib_idx; + if (dup_sack) { if (!tp->undo_marker) - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); + mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO; else - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD); + mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD; } else { /* Don't count olds caused by ACK reordering */ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && !after(sp[used_sacks].end_seq, tp->snd_una)) continue; - NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); + mib_idx = LINUX_MIB_TCPSACKDISCARD; } + + NET_INC_STATS_BH(sock_net(sk), mib_idx); if (i == 0) first_sack_index = -1; continue; @@ -1616,10 +1649,10 @@ advance_sp: out: #if FASTRETRANS_DEBUG > 0 - BUG_TRAP((int)tp->sacked_out >= 0); - BUG_TRAP((int)tp->lost_out >= 0); - BUG_TRAP((int)tp->retrans_out >= 0); - BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0); + WARN_ON((int)tp->sacked_out < 0); + WARN_ON((int)tp->lost_out < 0); + WARN_ON((int)tp->retrans_out < 0); + WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif return flag; } @@ -1713,6 +1746,8 @@ int tcp_use_frto(struct sock *sk) return 0; skb = tcp_write_queue_head(sk); + if (tcp_skb_is_last(sk, skb)) + return 1; skb = tcp_write_queue_next(sk, skb); /* Skips head */ tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) @@ -1854,6 +1889,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } } tcp_verify_left_out(tp); @@ -1870,7 +1906,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); - tcp_clear_retrans_hints_partial(tp); + tcp_clear_all_retrans_hints(tp); } static void tcp_clear_retrans_partial(struct tcp_sock *tp) @@ -1921,12 +1957,11 @@ void tcp_enter_loss(struct sock *sk, int how) /* Push undo marker, if it was plain RTO and nothing * was retransmitted. */ tp->undo_marker = tp->snd_una; - tcp_clear_retrans_hints_partial(tp); } else { tp->sacked_out = 0; tp->fackets_out = 0; - tcp_clear_all_retrans_hints(tp); } + tcp_clear_all_retrans_hints(tp); tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) @@ -1939,6 +1974,7 @@ void tcp_enter_loss(struct sock *sk, int how) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } } tcp_verify_left_out(tp); @@ -1962,7 +1998,7 @@ static int tcp_check_sack_reneging(struct sock *sk, int flag) { if (flag & FLAG_SACK_RENEGING) { struct inet_connection_sock *icsk = inet_csk(sk); - NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); icsk->icsk_retransmits++; @@ -2144,19 +2180,6 @@ static int tcp_time_to_recover(struct sock *sk) return 0; } -/* RFC: This is from the original, I doubt that this is necessary at all: - * clear xmit_retrans hint if seq of this skb is beyond hint. How could we - * retransmitted past LOST markings in the first place? I'm not fully sure - * about undo and end of connection cases, which can cause R without L? - */ -static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) -{ - if ((tp->retransmit_skb_hint != NULL) && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) - tp->retransmit_skb_hint = NULL; -} - /* Mark head of queue up as lost. With RFC3517 SACK, the packets is * is against sacked "cnt", otherwise it's against facked "cnt" */ @@ -2168,7 +2191,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) int err; unsigned int mss; - BUG_TRAP(packets <= tp->packets_out); + WARN_ON(packets > tp->packets_out); if (tp->lost_skb_hint) { skb = tp->lost_skb_hint; cnt = tp->lost_cnt_hint; @@ -2204,11 +2227,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) cnt = packets; } - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - tcp_verify_retransmit_hint(tp, skb); - } + tcp_skb_mark_lost(tp, skb); } tcp_verify_left_out(tp); } @@ -2250,11 +2269,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) if (!tcp_skb_timedout(sk, skb)) break; - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - tcp_verify_retransmit_hint(tp, skb); - } + tcp_skb_mark_lost(tp, skb); } tp->scoreboard_skb_hint = skb; @@ -2365,10 +2380,6 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) } tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; - - /* There is something screwy going on with the retrans hints after - an undo */ - tcp_clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -2382,15 +2393,19 @@ static int tcp_try_undo_recovery(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tcp_may_undo(tp)) { + int mib_idx; + /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); tcp_undo_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) - NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); + mib_idx = LINUX_MIB_TCPLOSSUNDO; else - NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); + mib_idx = LINUX_MIB_TCPFULLUNDO; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->undo_marker = 0; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { @@ -2413,7 +2428,7 @@ static void tcp_try_undo_dsack(struct sock *sk) DBGUNDO(sk, "D-SACK"); tcp_undo_cwr(sk, 1); tp->undo_marker = 0; - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); } } @@ -2436,7 +2451,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) DBGUNDO(sk, "Hoe"); tcp_undo_cwr(sk, 0); - NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); /* So... Do not make Hoe's retransmit yet. * If the first packet was delayed, the rest @@ -2465,7 +2480,7 @@ static int tcp_try_undo_loss(struct sock *sk) DBGUNDO(sk, "partial loss"); tp->lost_out = 0; tcp_undo_cwr(sk, 1); - NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; if (tcp_is_sack(tp)) @@ -2562,7 +2577,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - int fast_rexmit = 0; + int fast_rexmit = 0, mib_idx; if (WARN_ON(!tp->packets_out && tp->sacked_out)) tp->sacked_out = 0; @@ -2584,7 +2599,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); - NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); } /* D. Check consistency of the current state. */ @@ -2593,7 +2608,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { - BUG_TRAP(tp->retrans_out == 0); + WARN_ON(tp->retrans_out != 0); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { @@ -2685,9 +2700,11 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) /* Otherwise enter Recovery state */ if (tcp_is_reno(tp)) - NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY); + mib_idx = LINUX_MIB_TCPRENORECOVERY; else - NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY); + mib_idx = LINUX_MIB_TCPSACKRECOVERY; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->high_seq = tp->snd_nxt; tp->prior_ssthresh = 0; @@ -2819,7 +2836,8 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) +static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, + u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2829,6 +2847,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) int flag = 0; u32 pkts_acked = 0; u32 reord = tp->packets_out; + u32 prior_sacked = tp->sacked_out; s32 seq_rtt = -1; s32 ca_seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); @@ -2885,9 +2904,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; - if (unlikely(tp->urg_mode && !before(end_seq, tp->snd_up))) - tp->urg_mode = 0; - tp->packets_out -= acked_pcount; pkts_acked += acked_pcount; @@ -2910,9 +2926,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - tcp_clear_all_retrans_hints(tp); + tp->scoreboard_skb_hint = NULL; + if (skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = NULL; + if (skb == tp->lost_skb_hint) + tp->lost_skb_hint = NULL; } + if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) + tp->snd_up = tp->snd_una; + if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; @@ -2929,6 +2952,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) /* Non-retransmitted hole got filled? That's reordering */ if (reord < prior_fackets) tcp_update_reordering(sk, tp->fackets_out - reord, 0); + + /* No need to care for underflows here because + * the lost_skb_hint gets NULLed if we're past it + * (or something non-trivial happened) + */ + if (tcp_is_fack(tp)) + tp->lost_cnt_hint -= pkts_acked; + else + tp->lost_cnt_hint -= prior_sacked - tp->sacked_out; } tp->fackets_out -= min(pkts_acked, tp->fackets_out); @@ -2953,9 +2985,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) } #if FASTRETRANS_DEBUG > 0 - BUG_TRAP((int)tp->sacked_out >= 0); - BUG_TRAP((int)tp->lost_out >= 0); - BUG_TRAP((int)tp->retrans_out >= 0); + WARN_ON((int)tp->sacked_out < 0); + WARN_ON((int)tp->lost_out < 0); + WARN_ON((int)tp->retrans_out < 0); if (!tp->packets_out && tcp_is_sack(tp)) { icsk = inet_csk(sk); if (tp->lost_out) { @@ -3198,7 +3230,7 @@ static int tcp_process_frto(struct sock *sk, int flag) } tp->frto_counter = 0; tp->undo_marker = 0; - NET_INC_STATS_BH(LINUX_MIB_TCPSPURIOUSRTOS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); } return 0; } @@ -3251,12 +3283,12 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tcp_ca_event(sk, CA_EVENT_FAST_ACK); - NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else - NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS); flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); @@ -3273,13 +3305,14 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) * log. Something worked... */ sk->sk_err_soft = 0; + icsk->icsk_probes_out = 0; tp->rcv_tstamp = tcp_time_stamp; prior_packets = tp->packets_out; if (!prior_packets) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fackets); + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); if (tp->frto_counter) frto_cwnd = tcp_process_frto(sk, flag); @@ -3305,8 +3338,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) return 1; no_queue: - icsk->icsk_probes_out = 0; - /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3424,6 +3455,22 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } } +static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) +{ + __be32 *ptr = (__be32 *)(th + 1); + + if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { + tp->rx_opt.saw_tstamp = 1; + ++ptr; + tp->rx_opt.rcv_tsval = ntohl(*ptr); + ++ptr; + tp->rx_opt.rcv_tsecr = ntohl(*ptr); + return 1; + } + return 0; +} + /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ @@ -3435,21 +3482,50 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, return 0; } else if (tp->rx_opt.tstamp_ok && th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { - __be32 *ptr = (__be32 *)(th + 1); - if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { - tp->rx_opt.saw_tstamp = 1; - ++ptr; - tp->rx_opt.rcv_tsval = ntohl(*ptr); - ++ptr; - tp->rx_opt.rcv_tsecr = ntohl(*ptr); + if (tcp_parse_aligned_timestamp(tp, th)) return 1; - } } tcp_parse_options(skb, &tp->rx_opt, 1); return 1; } +#ifdef CONFIG_TCP_MD5SIG +/* + * Parse MD5 Signature option + */ +u8 *tcp_parse_md5sig_option(struct tcphdr *th) +{ + int length = (th->doff << 2) - sizeof (*th); + u8 *ptr = (u8*)(th + 1); + + /* If the TCP option is too short, we can short cut */ + if (length < TCPOLEN_MD5SIG) + return NULL; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch(opcode) { + case TCPOPT_EOL: + return NULL; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2 || opsize > length) + return NULL; + if (opcode == TCPOPT_MD5SIG) + return ptr; + } + ptr += opsize - 2; + length -= opsize; + } + return NULL; +} +#endif + static inline void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; @@ -3662,26 +3738,33 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, return 0; } -static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + int mib_idx; + if (before(seq, tp->rcv_nxt)) - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT); + mib_idx = LINUX_MIB_TCPDSACKOLDSENT; else - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT); + mib_idx = LINUX_MIB_TCPDSACKOFOSENT; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; tp->duplicate_sack[0].end_seq = end_seq; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, - 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + 1; } } -static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) { + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->rx_opt.dsack) - tcp_dsack_set(tp, seq, end_seq); + tcp_dsack_set(sk, seq, end_seq); else tcp_sack_extend(tp->duplicate_sack, seq, end_seq); } @@ -3692,7 +3775,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); if (tcp_is_sack(tp) && sysctl_tcp_dsack) { @@ -3700,7 +3783,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) end_seq = tp->rcv_nxt; - tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq); + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); } } @@ -3727,9 +3810,8 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) * Decrease num_sacks. */ tp->rx_opt.num_sacks--; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + - tp->rx_opt.dsack, - 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + + tp->rx_opt.dsack; for (i = this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i + 1]; continue; @@ -3779,7 +3861,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * * If the sack array is full, forget about the last one. */ - if (this_sack >= 4) { + if (this_sack >= TCP_NUM_SACKS) { this_sack--; tp->rx_opt.num_sacks--; sp--; @@ -3792,8 +3874,7 @@ new_sack: sp->start_seq = seq; sp->end_seq = end_seq; tp->rx_opt.num_sacks++; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, - 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; } /* RCV.NXT advances, some SACKs should be eaten. */ @@ -3817,7 +3898,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) int i; /* RCV.NXT must cover all the block! */ - BUG_TRAP(!before(tp->rcv_nxt, sp->end_seq)); + WARN_ON(before(tp->rcv_nxt, sp->end_seq)); /* Zap this SACK, by moving forward any other SACKS. */ for (i=this_sack+1; i < num_sacks; i++) @@ -3830,9 +3911,8 @@ static void tcp_sack_remove(struct tcp_sock *tp) } if (num_sacks != tp->rx_opt.num_sacks) { tp->rx_opt.num_sacks = num_sacks; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + - tp->rx_opt.dsack, - 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + + tp->rx_opt.dsack; } } @@ -3853,7 +3933,7 @@ static void tcp_ofo_queue(struct sock *sk) __u32 dsack = dsack_high; if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) dsack_high = TCP_SKB_CB(skb)->end_seq; - tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack); + tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { @@ -3911,8 +3991,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (tp->rx_opt.dsack) { tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks, - 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; } /* Queue data for delivery to the user. @@ -3981,8 +4060,8 @@ queue_and_out: if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an immediate ack. */ - NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); - tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: tcp_enter_quickack_mode(sk); @@ -4004,7 +4083,7 @@ drop: tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. @@ -4069,30 +4148,30 @@ drop: if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ __kfree_skb(skb); - tcp_dsack_set(tp, seq, end_seq); + tcp_dsack_set(sk, seq, end_seq); goto add_sack; } if (after(seq, TCP_SKB_CB(skb1)->seq)) { /* Partial overlap. */ - tcp_dsack_set(tp, seq, + tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); } else { skb1 = skb1->prev; } } - __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue); + __skb_queue_after(&tp->out_of_order_queue, skb1, skb); /* And clean segments covered by new one as whole. */ while ((skb1 = skb->next) != (struct sk_buff *)&tp->out_of_order_queue && after(end_seq, TCP_SKB_CB(skb1)->seq)) { if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, end_seq); break; } __skb_unlink(skb1, &tp->out_of_order_queue); - tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); __kfree_skb(skb1); } @@ -4103,6 +4182,18 @@ add_sack: } } +static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, + struct sk_buff_head *list) +{ + struct sk_buff *next = skb->next; + + __skb_unlink(skb, list); + __kfree_skb(skb); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); + + return next; +} + /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * Segments with FIN/SYN are not collapsed (only because this @@ -4120,11 +4211,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, for (skb = head; skb != tail;) { /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - struct sk_buff *next = skb->next; - __skb_unlink(skb, list); - __kfree_skb(skb); - NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); - skb = next; + skb = tcp_collapse_one(sk, skb, list); continue; } @@ -4170,7 +4257,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, memcpy(nskb->head, skb->head, header); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_insert(nskb, skb->prev, skb, list); + __skb_queue_before(list, skb, nskb); skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. */ @@ -4188,11 +4275,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, start += size; } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - struct sk_buff *next = skb->next; - __skb_unlink(skb, list); - __kfree_skb(skb); - NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); - skb = next; + skb = tcp_collapse_one(sk, skb, list); if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) @@ -4254,7 +4337,7 @@ static int tcp_prune_ofo_queue(struct sock *sk) int res = 0; if (!skb_queue_empty(&tp->out_of_order_queue)) { - NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); __skb_queue_purge(&tp->out_of_order_queue); /* Reset SACK state. A conforming SACK implementation will @@ -4283,7 +4366,7 @@ static int tcp_prune_queue(struct sock *sk) SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); - NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); @@ -4312,7 +4395,7 @@ static int tcp_prune_queue(struct sock *sk) * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. */ - NET_INC_STATS_BH(LINUX_MIB_RCVPRUNED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED); /* Massive buffer overcommit. */ tp->pred_flags = 0; @@ -4378,8 +4461,8 @@ static void tcp_new_space(struct sock *sk) if (tcp_should_expand_sndbuf(sk)) { int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + - MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), - demanded = max_t(unsigned int, tp->snd_cwnd, + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); + int demanded = max_t(unsigned int, tp->snd_cwnd, tp->reordering + 1); sndmem *= 2 * demanded; if (sndmem > sk->sk_sndbuf) @@ -4633,6 +4716,67 @@ out: } #endif /* CONFIG_NET_DMA */ +/* Does PAWS and seqno based validation of an incoming segment, flags will + * play significant role here. + */ +static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, int syn_inerr) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* RFC1323: H1. Apply PAWS check first. */ + if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && + tcp_paws_discard(sk, skb)) { + if (!th->rst) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + tcp_send_dupack(sk, skb); + goto discard; + } + /* Reset is accepted even if it did not pass PAWS. */ + } + + /* Step 1: check sequence number */ + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + /* RFC793, page 37: "In all states except SYN-SENT, all reset + * (RST) segments are validated by checking their SEQ-fields." + * And page 69: "If an incoming segment is not acceptable, + * an acknowledgment should be sent in reply (unless the RST + * bit is set, if so drop the segment and return)". + */ + if (!th->rst) + tcp_send_dupack(sk, skb); + goto discard; + } + + /* Step 2: check RST bit */ + if (th->rst) { + tcp_reset(sk); + goto discard; + } + + /* ts_recent update must be made after we are sure that the packet + * is in window. + */ + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + + /* step 3: check security and precedence [ignored] */ + + /* step 4: Check for a SYN in window. */ + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + if (syn_inerr) + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); + tcp_reset(sk); + return -1; + } + + return 1; + +discard: + __kfree_skb(skb); + return 0; +} + /* * TCP receive function for the ESTABLISHED state. * @@ -4660,6 +4804,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_sock *tp = tcp_sk(sk); + int res; /* * Header prediction. @@ -4698,19 +4843,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* Check timestamp */ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { - __be32 *ptr = (__be32 *)(th + 1); - /* No? Slow path! */ - if (*ptr != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) + if (!tcp_parse_aligned_timestamp(tp, th)) goto slow_path; - tp->rx_opt.saw_tstamp = 1; - ++ptr; - tp->rx_opt.rcv_tsval = ntohl(*ptr); - ++ptr; - tp->rx_opt.rcv_tsecr = ntohl(*ptr); - /* If PAWS failed, check it more carefully in slow path */ if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) goto slow_path; @@ -4742,7 +4878,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_data_snd_check(sk); return 0; } else { /* Header too small */ - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; } } else { @@ -4779,7 +4915,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); } if (copied_early) tcp_cleanup_rbuf(sk, skb->len); @@ -4802,7 +4938,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; - NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ __skb_pull(skb, tcp_header_len); @@ -4821,7 +4957,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto no_ack; } - __tcp_ack_snd_check(sk, 0); + if (!copied_early || tp->rcv_nxt != tp->rcv_wup) + __tcp_ack_snd_check(sk, 0); no_ack: #ifdef CONFIG_NET_DMA if (copied_early) @@ -4841,51 +4978,12 @@ slow_path: goto csum_error; /* - * RFC1323: H1. Apply PAWS check first. - */ - if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && - tcp_paws_discard(sk, skb)) { - if (!th->rst) { - NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); - tcp_send_dupack(sk, skb); - goto discard; - } - /* Resets are accepted even if PAWS failed. - - ts_recent update must be made after we are sure - that the packet is in window. - */ - } - - /* * Standard slow path. */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { - /* RFC793, page 37: "In all states except SYN-SENT, all reset - * (RST) segments are validated by checking their SEQ-fields." - * And page 69: "If an incoming segment is not acceptable, - * an acknowledgment should be sent in reply (unless the RST bit - * is set, if so drop the segment and return)". - */ - if (!th->rst) - tcp_send_dupack(sk, skb); - goto discard; - } - - if (th->rst) { - tcp_reset(sk); - goto discard; - } - - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - TCP_INC_STATS_BH(TCP_MIB_INERRS); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); - tcp_reset(sk); - return 1; - } + res = tcp_validate_incoming(sk, skb, th, 1); + if (res <= 0) + return -res; step5: if (th->ack) @@ -4904,7 +5002,7 @@ step5: return 0; csum_error: - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); discard: __kfree_skb(skb); @@ -4938,7 +5036,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, tcp_time_stamp)) { - NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; } @@ -5167,6 +5265,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int queued = 0; + int res; tp->rx_opt.saw_tstamp = 0; @@ -5219,42 +5318,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; } - if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && - tcp_paws_discard(sk, skb)) { - if (!th->rst) { - NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); - tcp_send_dupack(sk, skb); - goto discard; - } - /* Reset is accepted even if it did not pass PAWS. */ - } - - /* step 1: check sequence number */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { - if (!th->rst) - tcp_send_dupack(sk, skb); - goto discard; - } - - /* step 2: check RST bit */ - if (th->rst) { - tcp_reset(sk); - goto discard; - } - - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - - /* step 3: check security and precedence [ignored] */ - - /* step 4: - * - * Check for a SYN in window. - */ - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); - tcp_reset(sk); - return 1; - } + res = tcp_validate_incoming(sk, skb, th, 0); + if (res <= 0) + return -res; /* step 5: check the ACK field */ if (th->ack) { @@ -5333,7 +5399,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { tcp_done(sk); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; } @@ -5393,7 +5459,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (sk->sk_shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); tcp_reset(sk); return 1; } @@ -5422,6 +5488,9 @@ EXPORT_SYMBOL(sysctl_tcp_ecn); EXPORT_SYMBOL(sysctl_tcp_reordering); EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); EXPORT_SYMBOL(tcp_parse_options); +#ifdef CONFIG_TCP_MD5SIG +EXPORT_SYMBOL(tcp_parse_md5sig_option); +#endif EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); EXPORT_SYMBOL(tcp_initialize_rcv_mss); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ffe869ac1bcf..5c8fa7f1e327 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $ - * * IPv4 specific functions * * @@ -89,10 +87,14 @@ int sysctl_tcp_low_latency __read_mostly; #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr); -static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, - __be32 saddr, __be32 daddr, - struct tcphdr *th, int protocol, - unsigned int tcplen); +static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, + __be32 daddr, __be32 saddr, struct tcphdr *th); +#else +static inline +struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) +{ + return NULL; +} #endif struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { @@ -172,7 +174,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->sport, usin->sin_port, sk, 1); if (tmp < 0) { if (tmp == -ENETUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return tmp; } @@ -340,16 +342,17 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) struct sock *sk; __u32 seq; int err; + struct net *net = dev_net(skb->dev); if (skb->len < (iph->ihl << 2) + 8) { - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } - sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest, + sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, th->source, inet_iif(skb)); if (!sk) { - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } if (sk->sk_state == TCP_TIME_WAIT) { @@ -362,7 +365,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) * servers this needs to be solved differently. */ if (sock_owned_by_user(sk)) - NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; @@ -371,7 +374,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -415,10 +418,10 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) /* ICMPs are not backlogged, hence we cannot get an established socket here. */ - BUG_TRAP(!req->sk); + WARN_ON(req->sk); if (seq != tcp_rsk(req)->snt_isn) { - NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -540,6 +543,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; #endif + struct net *net; /* Never send a reset in response to a reset. */ if (th->rst) @@ -578,33 +582,33 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; rep.th.doff = arg.iov[0].iov_len / 4; - tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1], - key, - ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - &rep.th, IPPROTO_TCP, - arg.iov[0].iov_len); + tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], + key, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &rep.th); } #endif arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ - sizeof(struct tcphdr), IPPROTO_TCP, 0); + arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; + arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; - ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb, + net = dev_net(skb->dst->dev); + ip_send_reply(net->ipv4.tcp_sock, skb, &arg, arg.iov[0].iov_len); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); - TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); + TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); } /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states outside socket context is ugly, certainly. What can I do? */ -static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, - struct sk_buff *skb, u32 seq, u32 ack, - u32 win, u32 ts) +static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, + u32 win, u32 ts, int oif, + struct tcp_md5sig_key *key, + int reply_flags) { struct tcphdr *th = tcp_hdr(skb); struct { @@ -616,10 +620,7 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, ]; } rep; struct ip_reply_arg arg; -#ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *key; - struct tcp_md5sig_key tw_key; -#endif + struct net *net = dev_net(skb->dst->dev); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -645,23 +646,6 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, rep.th.window = htons(win); #ifdef CONFIG_TCP_MD5SIG - /* - * The SKB holds an imcoming packet, but may not have a valid ->sk - * pointer. This is especially the case when we're dealing with a - * TIME_WAIT ack, because the sk structure is long gone, and only - * the tcp_timewait_sock remains. So the md5 key is stashed in that - * structure, and we use it in preference. I believe that (twsk || - * skb->sk) holds true, but we program defensively. - */ - if (!twsk && skb->sk) { - key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr); - } else if (twsk && twsk->tw_md5_keylen) { - tw_key.key = twsk->tw_md5_key; - tw_key.keylen = twsk->tw_md5_keylen; - key = &tw_key; - } else - key = NULL; - if (key) { int offset = (ts) ? 3 : 0; @@ -672,25 +656,23 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; rep.th.doff = arg.iov[0].iov_len/4; - tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset], - key, - ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - &rep.th, IPPROTO_TCP, - arg.iov[0].iov_len); + tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], + key, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &rep.th); } #endif + arg.flags = reply_flags; arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; - if (twsk) - arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if; + if (oif) + arg.bound_dev_if = oif; - ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb, + ip_send_reply(net->ipv4.tcp_sock, skb, &arg, arg.iov[0].iov_len); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); } static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -698,19 +680,26 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcptw->tw_ts_recent); + tcptw->tw_ts_recent, + tw->tw_bound_dev_if, + tcp_twsk_md5_key(tcptw), + tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0 + ); inet_twsk_put(tw); } -static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, +static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { - tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1, + tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, - req->ts_recent); + req->ts_recent, + 0, + tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr), + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0); } /* @@ -1000,32 +989,13 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, newkey, cmd.tcpm_keylen); } -static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, - __be32 saddr, __be32 daddr, - struct tcphdr *th, int protocol, - unsigned int tcplen) +static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, + __be32 daddr, __be32 saddr, int nbytes) { - struct scatterlist sg[4]; - __u16 data_len; - int block = 0; - __sum16 old_checksum; - struct tcp_md5sig_pool *hp; struct tcp4_pseudohdr *bp; - struct hash_desc *desc; - int err; - unsigned int nbytes = 0; - - /* - * Okay, so RFC2385 is turned on for this connection, - * so we need to generate the MD5 hash for the packet now. - */ - - hp = tcp_get_md5sig_pool(); - if (!hp) - goto clear_hash_noput; + struct scatterlist sg; bp = &hp->md5_blk.ip4; - desc = &hp->md5_desc; /* * 1. the TCP pseudo-header (in the order: source IP address, @@ -1035,86 +1005,96 @@ static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, bp->saddr = saddr; bp->daddr = daddr; bp->pad = 0; - bp->protocol = protocol; - bp->len = htons(tcplen); - - sg_init_table(sg, 4); + bp->protocol = IPPROTO_TCP; + bp->len = cpu_to_be16(nbytes); - sg_set_buf(&sg[block++], bp, sizeof(*bp)); - nbytes += sizeof(*bp); - - /* 2. the TCP header, excluding options, and assuming a - * checksum of zero/ - */ - old_checksum = th->check; - th->check = 0; - sg_set_buf(&sg[block++], th, sizeof(struct tcphdr)); - nbytes += sizeof(struct tcphdr); - - /* 3. the TCP segment data (if any) */ - data_len = tcplen - (th->doff << 2); - if (data_len > 0) { - unsigned char *data = (unsigned char *)th + (th->doff << 2); - sg_set_buf(&sg[block++], data, data_len); - nbytes += data_len; - } + sg_init_one(&sg, bp, sizeof(*bp)); + return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); +} - /* 4. an independently-specified key or password, known to both - * TCPs and presumably connection-specific - */ - sg_set_buf(&sg[block++], key->key, key->keylen); - nbytes += key->keylen; +static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, + __be32 daddr, __be32 saddr, struct tcphdr *th) +{ + struct tcp_md5sig_pool *hp; + struct hash_desc *desc; - sg_mark_end(&sg[block - 1]); + hp = tcp_get_md5sig_pool(); + if (!hp) + goto clear_hash_noput; + desc = &hp->md5_desc; - /* Now store the Hash into the packet */ - err = crypto_hash_init(desc); - if (err) + if (crypto_hash_init(desc)) goto clear_hash; - err = crypto_hash_update(desc, sg, nbytes); - if (err) + if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) goto clear_hash; - err = crypto_hash_final(desc, md5_hash); - if (err) + if (tcp_md5_hash_header(hp, th)) + goto clear_hash; + if (tcp_md5_hash_key(hp, key)) + goto clear_hash; + if (crypto_hash_final(desc, md5_hash)) goto clear_hash; - /* Reset header, and free up the crypto */ tcp_put_md5sig_pool(); - th->check = old_checksum; - -out: return 0; + clear_hash: tcp_put_md5sig_pool(); clear_hash_noput: memset(md5_hash, 0, 16); - goto out; + return 1; } -int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, - struct sock *sk, - struct dst_entry *dst, - struct request_sock *req, - struct tcphdr *th, int protocol, - unsigned int tcplen) +int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, + struct sock *sk, struct request_sock *req, + struct sk_buff *skb) { + struct tcp_md5sig_pool *hp; + struct hash_desc *desc; + struct tcphdr *th = tcp_hdr(skb); __be32 saddr, daddr; if (sk) { saddr = inet_sk(sk)->saddr; daddr = inet_sk(sk)->daddr; + } else if (req) { + saddr = inet_rsk(req)->loc_addr; + daddr = inet_rsk(req)->rmt_addr; } else { - struct rtable *rt = (struct rtable *)dst; - BUG_ON(!rt); - saddr = rt->rt_src; - daddr = rt->rt_dst; + const struct iphdr *iph = ip_hdr(skb); + saddr = iph->saddr; + daddr = iph->daddr; } - return tcp_v4_do_calc_md5_hash(md5_hash, key, - saddr, daddr, - th, protocol, tcplen); + + hp = tcp_get_md5sig_pool(); + if (!hp) + goto clear_hash_noput; + desc = &hp->md5_desc; + + if (crypto_hash_init(desc)) + goto clear_hash; + + if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) + goto clear_hash; + if (tcp_md5_hash_header(hp, th)) + goto clear_hash; + if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) + goto clear_hash; + if (tcp_md5_hash_key(hp, key)) + goto clear_hash; + if (crypto_hash_final(desc, md5_hash)) + goto clear_hash; + + tcp_put_md5sig_pool(); + return 0; + +clear_hash: + tcp_put_md5sig_pool(); +clear_hash_noput: + memset(md5_hash, 0, 16); + return 1; } -EXPORT_SYMBOL(tcp_v4_calc_md5_hash); +EXPORT_SYMBOL(tcp_v4_md5_hash_skb); static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) { @@ -1130,80 +1110,32 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) struct tcp_md5sig_key *hash_expected; const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); - int length = (th->doff << 2) - sizeof(struct tcphdr); int genhash; - unsigned char *ptr; unsigned char newhash[16]; hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr); + hash_location = tcp_parse_md5sig_option(th); - /* - * If the TCP option length is less than the TCP_MD5SIG - * option length, then we can shortcut - */ - if (length < TCPOLEN_MD5SIG) { - if (hash_expected) - return 1; - else - return 0; - } - - /* Okay, we can't shortcut - we have to grub through the options */ - ptr = (unsigned char *)(th + 1); - while (length > 0) { - int opcode = *ptr++; - int opsize; - - switch (opcode) { - case TCPOPT_EOL: - goto done_opts; - case TCPOPT_NOP: - length--; - continue; - default: - opsize = *ptr++; - if (opsize < 2) - goto done_opts; - if (opsize > length) - goto done_opts; - - if (opcode == TCPOPT_MD5SIG) { - hash_location = ptr; - goto done_opts; - } - } - ptr += opsize-2; - length -= opsize; - } -done_opts: /* We've parsed the options - do we have a hash? */ if (!hash_expected && !hash_location) return 0; if (hash_expected && !hash_location) { - LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found " - "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n", - NIPQUAD(iph->saddr), ntohs(th->source), - NIPQUAD(iph->daddr), ntohs(th->dest)); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); return 1; } if (!hash_expected && hash_location) { - LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found " - "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n", - NIPQUAD(iph->saddr), ntohs(th->source), - NIPQUAD(iph->daddr), ntohs(th->dest)); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); return 1; } /* Okay, so this is hash_expected and hash_location - * so we need to calculate the checksum. */ - genhash = tcp_v4_do_calc_md5_hash(newhash, - hash_expected, - iph->saddr, iph->daddr, - th, sk->sk_protocol, - skb->len); + genhash = tcp_v4_md5_hash_skb(newhash, + hash_expected, + NULL, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { if (net_ratelimit()) { @@ -1317,6 +1249,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->loc_addr = daddr; ireq->rmt_addr = saddr; + ireq->no_srccheck = inet_sk(sk)->transparent; ireq->opt = tcp_v4_save_options(sk, skb); if (!want_cookie) TCP_ECN_create_request(req, tcp_hdr(skb)); @@ -1347,7 +1280,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { - NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } } @@ -1437,6 +1370,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG @@ -1452,6 +1389,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (newkey != NULL) tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr, newkey, key->keylen); + newsk->sk_route_caps &= ~NETIF_F_GSO_MASK; } #endif @@ -1461,9 +1399,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, return newsk; exit_overflow: - NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); exit: - NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); dst_release(dst); return NULL; } @@ -1590,7 +1528,7 @@ discard: return 0; csum_err: - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; } @@ -1604,12 +1542,13 @@ int tcp_v4_rcv(struct sk_buff *skb) struct tcphdr *th; struct sock *sk; int ret; + struct net *net = dev_net(skb->dev); if (skb->pkt_type != PACKET_HOST) goto discard_it; /* Count it even if it's bad */ - TCP_INC_STATS_BH(TCP_MIB_INSEGS); + TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; @@ -1638,8 +1577,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->flags = iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, - th->source, iph->daddr, th->dest, inet_iif(skb)); + sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); if (!sk) goto no_tcp_socket; @@ -1685,7 +1623,7 @@ no_tcp_socket: if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { bad_packet: - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); } else { tcp_v4_send_reset(NULL, skb); } @@ -1706,7 +1644,7 @@ do_time_wait: } if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); inet_twsk_put(inet_twsk(sk)); goto discard_it; } @@ -1814,7 +1752,7 @@ struct inet_connection_sock_af_ops ipv4_specific = { #ifdef CONFIG_TCP_MD5SIG static struct tcp_sock_af_ops tcp_sock_ipv4_specific = { .md5_lookup = tcp_v4_md5_lookup, - .calc_md5_hash = tcp_v4_calc_md5_hash, + .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_add = tcp_v4_md5_add_func, .md5_parse = tcp_v4_parse_md5_keys, }; @@ -1871,7 +1809,7 @@ static int tcp_v4_init_sock(struct sock *sk) return 0; } -int tcp_v4_destroy_sock(struct sock *sk) +void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1915,8 +1853,6 @@ int tcp_v4_destroy_sock(struct sock *sk) } atomic_dec(&tcp_sockets_allocated); - - return 0; } EXPORT_SYMBOL(tcp_v4_destroy_sock); @@ -1959,8 +1895,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) req = req->dl_next; while (1) { while (req) { - if (req->rsk_ops->family == st->family && - net_eq(sock_net(req->sk), net)) { + if (req->rsk_ops->family == st->family) { cur = req; goto out; } @@ -2020,6 +1955,12 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) return rc; } +static inline int empty_bucket(struct tcp_iter_state *st) +{ + return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) && + hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain); +} + static void *established_get_first(struct seq_file *seq) { struct tcp_iter_state* st = seq->private; @@ -2032,6 +1973,10 @@ static void *established_get_first(struct seq_file *seq) struct inet_timewait_sock *tw; rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); + /* Lockless fast path for the common case of empty buckets */ + if (empty_bucket(st)) + continue; + read_lock_bh(lock); sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { if (sk->sk_family != st->family || @@ -2082,13 +2027,15 @@ get_tw: read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); st->state = TCP_SEQ_STATE_ESTABLISHED; - if (++st->bucket < tcp_hashinfo.ehash_size) { - read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); - sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); - } else { - cur = NULL; - goto out; - } + /* Look for next non empty bucket */ + while (++st->bucket < tcp_hashinfo.ehash_size && + empty_bucket(st)) + ; + if (st->bucket >= tcp_hashinfo.ehash_size) + return NULL; + + read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); } else sk = sk_next(sk); @@ -2450,6 +2397,7 @@ static int __net_init tcp_sk_init(struct net *net) static void __net_exit tcp_sk_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv4.tcp_sock); + inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET); } static struct pernet_operations __net_initdata tcp_sk_ops = { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8245247a6ceb..779f2e9d0689 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> @@ -246,7 +244,7 @@ kill: } if (paws_reject) - NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. @@ -397,6 +395,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->pred_flags = 0; newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; + newtp->snd_up = treq->snt_isn + 1; tcp_prequeue_init(newtp); @@ -482,7 +481,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); - TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk; } @@ -611,98 +610,96 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) - req->rsk_ops->send_ack(skb, req); + req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) - NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); return NULL; } /* In sequence, PAWS is OK. */ if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) - req->ts_recent = tmp_opt.rcv_tsval; + req->ts_recent = tmp_opt.rcv_tsval; - if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { - /* Truncate SYN, it is out of window starting - at tcp_rsk(req)->rcv_isn + 1. */ - flg &= ~TCP_FLAG_SYN; - } + if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { + /* Truncate SYN, it is out of window starting + at tcp_rsk(req)->rcv_isn + 1. */ + flg &= ~TCP_FLAG_SYN; + } - /* RFC793: "second check the RST bit" and - * "fourth, check the SYN bit" - */ - if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); - goto embryonic_reset; - } + /* RFC793: "second check the RST bit" and + * "fourth, check the SYN bit" + */ + if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + goto embryonic_reset; + } - /* ACK sequence verified above, just make sure ACK is - * set. If ACK not set, just silently drop the packet. - */ - if (!(flg & TCP_FLAG_ACK)) - return NULL; - - /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ - if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && - TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { - inet_rsk(req)->acked = 1; - return NULL; - } + /* ACK sequence verified above, just make sure ACK is + * set. If ACK not set, just silently drop the packet. + */ + if (!(flg & TCP_FLAG_ACK)) + return NULL; - /* OK, ACK is valid, create big socket and - * feed this segment to it. It will repeat all - * the tests. THIS SEGMENT MUST MOVE SOCKET TO - * ESTABLISHED STATE. If it will be dropped after - * socket is created, wait for troubles. - */ - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, - req, NULL); - if (child == NULL) - goto listen_overflow; + /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ + if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + inet_rsk(req)->acked = 1; + return NULL; + } + + /* OK, ACK is valid, create big socket and + * feed this segment to it. It will repeat all + * the tests. THIS SEGMENT MUST MOVE SOCKET TO + * ESTABLISHED STATE. If it will be dropped after + * socket is created, wait for troubles. + */ + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; #ifdef CONFIG_TCP_MD5SIG - else { - /* Copy over the MD5 key from the original socket */ - struct tcp_md5sig_key *key; - struct tcp_sock *tp = tcp_sk(sk); - key = tp->af_specific->md5_lookup(sk, child); - if (key != NULL) { - /* - * We're using one, so create a matching key on the - * newsk structure. If we fail to get memory then we - * end up not copying the key across. Shucks. - */ - char *newkey = kmemdup(key->key, key->keylen, - GFP_ATOMIC); - if (newkey) { - if (!tcp_alloc_md5sig_pool()) - BUG(); - tp->af_specific->md5_add(child, child, - newkey, - key->keylen); - } + else { + /* Copy over the MD5 key from the original socket */ + struct tcp_md5sig_key *key; + struct tcp_sock *tp = tcp_sk(sk); + key = tp->af_specific->md5_lookup(sk, child); + if (key != NULL) { + /* + * We're using one, so create a matching key on the + * newsk structure. If we fail to get memory then we + * end up not copying the key across. Shucks. + */ + char *newkey = kmemdup(key->key, key->keylen, + GFP_ATOMIC); + if (newkey) { + if (!tcp_alloc_md5sig_pool()) + BUG(); + tp->af_specific->md5_add(child, child, newkey, + key->keylen); } } + } #endif - inet_csk_reqsk_queue_unlink(sk, req, prev); - inet_csk_reqsk_queue_removed(sk, req); + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); - inet_csk_reqsk_queue_add(sk, req, child); - return child; + inet_csk_reqsk_queue_add(sk, req, child); + return child; - listen_overflow: - if (!sysctl_tcp_abort_on_overflow) { - inet_rsk(req)->acked = 1; - return NULL; - } +listen_overflow: + if (!sysctl_tcp_abort_on_overflow) { + inet_rsk(req)->acked = 1; + return NULL; + } - embryonic_reset: - NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS); - if (!(flg & TCP_FLAG_RST)) - req->rsk_ops->send_reset(sk, skb); +embryonic_reset: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); + if (!(flg & TCP_FLAG_RST)) + req->rsk_ops->send_reset(sk, skb); - inet_csk_reqsk_queue_drop(sk, req, prev); - return NULL; + inet_csk_reqsk_queue_drop(sk, req, prev); + return NULL; } /* diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ad993ecb4810..e4c5ac9fe89b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> @@ -347,112 +345,240 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->end_seq = seq; } -static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, - __u32 tstamp, __u8 **md5_hash) +static inline int tcp_urg_mode(const struct tcp_sock *tp) { - if (tp->rx_opt.tstamp_ok) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | - TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); - *ptr++ = htonl(tp->rx_opt.ts_recent); - } - if (tp->rx_opt.eff_sacks) { - struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; - int this_sack; - - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_SACK << 8) | - (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * - TCPOLEN_SACK_PERBLOCK))); - - for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { - *ptr++ = htonl(sp[this_sack].start_seq); - *ptr++ = htonl(sp[this_sack].end_seq); - } + return tp->snd_una != tp->snd_up; +} - if (tp->rx_opt.dsack) { - tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks--; - } - } -#ifdef CONFIG_TCP_MD5SIG - if (md5_hash) { +#define OPTION_SACK_ADVERTISE (1 << 0) +#define OPTION_TS (1 << 1) +#define OPTION_MD5 (1 << 2) + +struct tcp_out_options { + u8 options; /* bit field of OPTION_* */ + u8 ws; /* window scale, 0 to disable */ + u8 num_sack_blocks; /* number of SACK blocks to include */ + u16 mss; /* 0 to disable */ + __u32 tsval, tsecr; /* need to include OPTION_TS */ +}; + +/* Beware: Something in the Internet is very sensitive to the ordering of + * TCP options, we learned this through the hard way, so be careful here. + * Luckily we can at least blame others for their non-compliance but from + * inter-operatibility perspective it seems that we're somewhat stuck with + * the ordering which we have been using if we want to keep working with + * those broken things (not that it currently hurts anybody as there isn't + * particular reason why the ordering would need to be changed). + * + * At least SACK_PERM as the first option is known to lead to a disaster + * (but it may well be that other scenarios fail similarly). + */ +static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, + const struct tcp_out_options *opts, + __u8 **md5_hash) { + if (unlikely(OPTION_MD5 & opts->options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); *md5_hash = (__u8 *)ptr; + ptr += 4; + } else { + *md5_hash = NULL; } -#endif -} -/* Construct a tcp options header for a SYN or SYN_ACK packet. - * If this is every changed make sure to change the definition of - * MAX_SYN_SIZE to match the new maximum number of options that you - * can generate. - * - * Note - that with the RFC2385 TCP option, we make room for the - * 16 byte MD5 hash. This will be filled in later, so the pointer for the - * location to be filled is passed back up. - */ -static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, - int offer_wscale, int wscale, __u32 tstamp, - __u32 ts_recent, __u8 **md5_hash) -{ - /* We always get an MSS option. - * The option bytes which will be seen in normal data - * packets should timestamps be used, must be in the MSS - * advertised. But we subtract them from tp->mss_cache so - * that calculations in tcp_sendmsg are simpler etc. - * So account for this fact here if necessary. If we - * don't do this correctly, as a receiver we won't - * recognize data packets as being full sized when we - * should, and thus we won't abide by the delayed ACK - * rules correctly. - * SACKs don't matter, we never delay an ACK when we - * have any of those going out. - */ - *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); - if (ts) { - if (sack) + if (unlikely(opts->mss)) { + *ptr++ = htonl((TCPOPT_MSS << 24) | + (TCPOLEN_MSS << 16) | + opts->mss); + } + + if (likely(OPTION_TS & opts->options)) { + if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - else + } else { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); /* TSVAL */ - *ptr++ = htonl(ts_recent); /* TSECR */ - } else if (sack) + } + *ptr++ = htonl(opts->tsval); + *ptr++ = htonl(opts->tsecr); + } + + if (unlikely(OPTION_SACK_ADVERTISE & opts->options && + !(OPTION_TS & opts->options))) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); - if (offer_wscale) + } + + if (unlikely(opts->ws)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | - (wscale)); + opts->ws); + } + + if (unlikely(opts->num_sack_blocks)) { + struct tcp_sack_block *sp = tp->rx_opt.dsack ? + tp->duplicate_sack : tp->selective_acks; + int this_sack; + + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK << 8) | + (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * + TCPOLEN_SACK_PERBLOCK))); + + for (this_sack = 0; this_sack < opts->num_sack_blocks; + ++this_sack) { + *ptr++ = htonl(sp[this_sack].start_seq); + *ptr++ = htonl(sp[this_sack].end_seq); + } + + if (tp->rx_opt.dsack) { + tp->rx_opt.dsack = 0; + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; + } + } +} + +static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + struct tcp_sock *tp = tcp_sk(sk); + unsigned size = 0; + #ifdef CONFIG_TCP_MD5SIG - /* - * If MD5 is enabled, then we set the option, and include the size - * (always 18). The actual MD5 hash is added just before the - * packet is sent. - */ - if (md5_hash) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - *md5_hash = (__u8 *)ptr; + *md5 = tp->af_specific->md5_lookup(sk, sk); + if (*md5) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; + } +#else + *md5 = NULL; +#endif + + /* We always get an MSS option. The option bytes which will be seen in + * normal data packets should timestamps be used, must be in the MSS + * advertised. But we subtract them from tp->mss_cache so that + * calculations in tcp_sendmsg are simpler etc. So account for this + * fact here if necessary. If we don't do this correctly, as a + * receiver we won't recognize data packets as being full sized when we + * should, and thus we won't abide by the delayed ACK rules correctly. + * SACKs don't matter, we never delay an ACK when we have any of those + * going out. */ + opts->mss = tcp_advertise_mss(sk); + size += TCPOLEN_MSS_ALIGNED; + + if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { + opts->options |= OPTION_TS; + opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsecr = tp->rx_opt.ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + if (likely(sysctl_tcp_window_scaling)) { + opts->ws = tp->rx_opt.rcv_wscale; + if(likely(opts->ws)) + size += TCPOLEN_WSCALE_ALIGNED; + } + if (likely(sysctl_tcp_sack)) { + opts->options |= OPTION_SACK_ADVERTISE; + if (unlikely(!(OPTION_TS & opts->options))) + size += TCPOLEN_SACKPERM_ALIGNED; + } + + return size; +} + +static unsigned tcp_synack_options(struct sock *sk, + struct request_sock *req, + unsigned mss, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + unsigned size = 0; + struct inet_request_sock *ireq = inet_rsk(req); + char doing_ts; + +#ifdef CONFIG_TCP_MD5SIG + *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); + if (*md5) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; + } +#else + *md5 = NULL; +#endif + + /* we can't fit any SACK blocks in a packet with MD5 + TS + options. There was discussion about disabling SACK rather than TS in + order to fit in better with old, buggy kernels, but that was deemed + to be unnecessary. */ + doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok); + + opts->mss = mss; + size += TCPOLEN_MSS_ALIGNED; + + if (likely(ireq->wscale_ok)) { + opts->ws = ireq->rcv_wscale; + if(likely(opts->ws)) + size += TCPOLEN_WSCALE_ALIGNED; + } + if (likely(doing_ts)) { + opts->options |= OPTION_TS; + opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsecr = req->ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + if (likely(ireq->sack_ok)) { + opts->options |= OPTION_SACK_ADVERTISE; + if (unlikely(!doing_ts)) + size += TCPOLEN_SACKPERM_ALIGNED; + } + + return size; +} + +static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; + struct tcp_sock *tp = tcp_sk(sk); + unsigned size = 0; + +#ifdef CONFIG_TCP_MD5SIG + *md5 = tp->af_specific->md5_lookup(sk, sk); + if (unlikely(*md5)) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; } +#else + *md5 = NULL; #endif + + if (likely(tp->rx_opt.tstamp_ok)) { + opts->options |= OPTION_TS; + opts->tsval = tcb ? tcb->when : 0; + opts->tsecr = tp->rx_opt.ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + + if (unlikely(tp->rx_opt.eff_sacks)) { + const unsigned remaining = MAX_TCP_OPTION_SPACE - size; + opts->num_sack_blocks = + min_t(unsigned, tp->rx_opt.eff_sacks, + (remaining - TCPOLEN_SACK_BASE_ALIGNED) / + TCPOLEN_SACK_PERBLOCK); + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + } + + return size; } /* This routine actually transmits TCP packets queued in by @@ -473,13 +599,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; - int tcp_header_size; -#ifdef CONFIG_TCP_MD5SIG + struct tcp_out_options opts; + unsigned tcp_options_size, tcp_header_size; struct tcp_md5sig_key *md5; __u8 *md5_hash_location; -#endif struct tcphdr *th; - int sysctl_flags; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); @@ -502,50 +626,18 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, inet = inet_sk(sk); tp = tcp_sk(sk); tcb = TCP_SKB_CB(skb); - tcp_header_size = tp->tcp_header_len; - -#define SYSCTL_FLAG_TSTAMPS 0x1 -#define SYSCTL_FLAG_WSCALE 0x2 -#define SYSCTL_FLAG_SACK 0x4 + memset(&opts, 0, sizeof(opts)); - sysctl_flags = 0; - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { - tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if (sysctl_tcp_timestamps) { - tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_TSTAMPS; - } - if (sysctl_tcp_window_scaling) { - tcp_header_size += TCPOLEN_WSCALE_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_WSCALE; - } - if (sysctl_tcp_sack) { - sysctl_flags |= SYSCTL_FLAG_SACK; - if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) - tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; - } - } else if (unlikely(tp->rx_opt.eff_sacks)) { - /* A SACK is 2 pad bytes, a 2 byte header, plus - * 2 32-bit sequence numbers for each SACK block. - */ - tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * - TCPOLEN_SACK_PERBLOCK)); - } + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) + tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); + else + tcp_options_size = tcp_established_options(sk, skb, &opts, + &md5); + tcp_header_size = tcp_options_size + sizeof(struct tcphdr); if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); -#ifdef CONFIG_TCP_MD5SIG - /* - * Are we doing MD5 on this segment? If so - make - * room for it. - */ - md5 = tp->af_specific->md5_lookup(sk, sk); - if (md5) - tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; -#endif - skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_set_owner_w(skb, sk); @@ -570,45 +662,23 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->check = 0; th->urg_ptr = 0; - if (unlikely(tp->urg_mode && + /* The urg_mode check is necessary during a below snd_una win probe */ + if (unlikely(tcp_urg_mode(tp) && between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { - tcp_syn_build_options((__be32 *)(th + 1), - tcp_advertise_mss(sk), - (sysctl_flags & SYSCTL_FLAG_TSTAMPS), - (sysctl_flags & SYSCTL_FLAG_SACK), - (sysctl_flags & SYSCTL_FLAG_WSCALE), - tp->rx_opt.rcv_wscale, - tcb->when, - tp->rx_opt.ts_recent, - -#ifdef CONFIG_TCP_MD5SIG - md5 ? &md5_hash_location : -#endif - NULL); - } else { - tcp_build_and_update_options((__be32 *)(th + 1), - tp, tcb->when, -#ifdef CONFIG_TCP_MD5SIG - md5 ? &md5_hash_location : -#endif - NULL); + tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); + if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) TCP_ECN_send(sk, skb, tcp_header_size); - } #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; tp->af_specific->calc_md5_hash(md5_hash_location, - md5, - sk, NULL, NULL, - tcp_hdr(skb), - sk->sk_protocol, - skb->len); + md5, sk, NULL, skb); } #endif @@ -621,7 +691,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_event_data_sent(tp, skb, sk); if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) - TCP_INC_STATS(TCP_MIB_OUTSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (likely(err <= 0)) @@ -630,10 +700,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_enter_cwr(sk, 1); return net_xmit_eval(err); - -#undef SYSCTL_FLAG_TSTAMPS -#undef SYSCTL_FLAG_WSCALE -#undef SYSCTL_FLAG_SACK } /* This routine just queue's the buffer @@ -963,7 +1029,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. * - * LARGESEND note: !urg_mode is overkill, only frames up to snd_up + * LARGESEND note: !tcp_urg_mode is overkill, only frames up to snd_up * cannot be large. However, taking into account rare use of URG, this * is not a big flaw. */ @@ -974,10 +1040,13 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) u32 mss_now; u16 xmit_size_goal; int doing_tso = 0; + unsigned header_len; + struct tcp_out_options opts; + struct tcp_md5sig_key *md5; mss_now = tp->mss_cache; - if (large_allowed && sk_can_gso(sk) && !tp->urg_mode) + if (large_allowed && sk_can_gso(sk) && !tcp_urg_mode(tp)) doing_tso = 1; if (dst) { @@ -986,14 +1055,16 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) mss_now = tcp_sync_mss(sk, mtu); } - if (tp->rx_opt.eff_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); - -#ifdef CONFIG_TCP_MD5SIG - if (tp->af_specific->md5_lookup(sk, sk)) - mss_now -= TCPOLEN_MD5SIG_ALIGNED; -#endif + header_len = tcp_established_options(sk, NULL, &opts, &md5) + + sizeof(struct tcphdr); + /* The mss_cache is sized based on tp->tcp_header_len, which assumes + * some common options. If this is an odd packet (because we have SACK + * blocks etc) then our calculated header_len will be different, and + * we have to adjust mss_now correspondingly */ + if (header_len != tp->tcp_header_len) { + int delta = (int) header_len - tp->tcp_header_len; + mss_now -= delta; + } xmit_size_goal = mss_now; @@ -1139,7 +1210,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, /* Don't use the nagle rule for urgent data (or for the final FIN). * Nagle can be ignored during F-RTO too (see RFC4138). */ - if (tp->urg_mode || (tp->frto_counter == 2) || + if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; @@ -1770,6 +1841,8 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, /* changed transmit queue under us so clear hints */ tcp_clear_retrans_hints_partial(tp); + if (next_skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = skb; sk_wmem_free_skb(sk, next_skb); } @@ -1784,7 +1857,7 @@ void tcp_simple_retransmit(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int mss = tcp_current_mss(sk, 0); - int lost = 0; + u32 prior_lost = tp->lost_out; tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) @@ -1795,17 +1868,13 @@ void tcp_simple_retransmit(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); } - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - lost = 1; - } + tcp_skb_mark_lost_uncond_verify(tp, skb); } } - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); - if (!lost) + if (prior_lost == tp->lost_out) return; if (tcp_is_reno(tp)) @@ -1880,8 +1949,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Collapse two adjacent packets if worthwhile and we can. */ if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && (skb->len < (cur_mss >> 1)) && - (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && (!tcp_skb_is_last(sk, skb)) && + (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && (tcp_skb_pcount(skb) == 1 && @@ -1913,7 +1982,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (err == 0) { /* Update global TCP statistics. */ - TCP_INC_STATS(TCP_MIB_RETRANSSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); tp->total_retrans++; @@ -1942,83 +2011,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return err; } -/* This gets called after a retransmit timeout, and the initially - * retransmitted data is acknowledged. It tries to continue - * resending the rest of the retransmit queue, until either - * we've sent it all or the congestion window limit is reached. - * If doing SACK, the first ACK which comes back for a timeout - * based retransmit packet might feed us FACK information again. - * If so, we use it to avoid unnecessarily retransmissions. - */ -void tcp_xmit_retransmit_queue(struct sock *sk) +static int tcp_can_forward_retransmit(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - int packet_cnt; - - if (tp->retransmit_skb_hint) { - skb = tp->retransmit_skb_hint; - packet_cnt = tp->retransmit_cnt_hint; - } else { - skb = tcp_write_queue_head(sk); - packet_cnt = 0; - } - - /* First pass: retransmit lost packets. */ - if (tp->lost_out) { - tcp_for_write_queue_from(skb, sk) { - __u8 sacked = TCP_SKB_CB(skb)->sacked; - - if (skb == tcp_send_head(sk)) - break; - /* we could do better than to assign each time */ - tp->retransmit_skb_hint = skb; - tp->retransmit_cnt_hint = packet_cnt; - - /* Assume this retransmit will generate - * only one packet for congestion window - * calculation purposes. This works because - * tcp_retransmit_skb() will chop up the - * packet to be MSS sized and all the - * packet counting works out. - */ - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) - return; - - if (sacked & TCPCB_LOST) { - if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { - if (tcp_retransmit_skb(sk, skb)) { - tp->retransmit_skb_hint = NULL; - return; - } - if (icsk->icsk_ca_state != TCP_CA_Loss) - NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); - else - NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); - - if (skb == tcp_write_queue_head(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); - } - - packet_cnt += tcp_skb_pcount(skb); - if (packet_cnt >= tp->lost_out) - break; - } - } - } - - /* OK, demanded retransmission is finished. */ /* Forward retransmissions are possible only during Recovery. */ if (icsk->icsk_ca_state != TCP_CA_Recovery) - return; + return 0; /* No forward retransmissions in Reno are possible. */ if (tcp_is_reno(tp)) - return; + return 0; /* Yeah, we have to make difficult choice between forward transmission * and retransmission... Both ways have their merits... @@ -2029,43 +2033,104 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ if (tcp_may_send_now(sk)) - return; + return 0; - /* If nothing is SACKed, highest_sack in the loop won't be valid */ - if (!tp->sacked_out) - return; + return 1; +} - if (tp->forward_skb_hint) - skb = tp->forward_skb_hint; - else +/* This gets called after a retransmit timeout, and the initially + * retransmitted data is acknowledged. It tries to continue + * resending the rest of the retransmit queue, until either + * we've sent it all or the congestion window limit is reached. + * If doing SACK, the first ACK which comes back for a timeout + * based retransmit packet might feed us FACK information again. + * If so, we use it to avoid unnecessarily retransmissions. + */ +void tcp_xmit_retransmit_queue(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + struct sk_buff *hole = NULL; + u32 last_lost; + int mib_idx; + int fwd_rexmitting = 0; + + if (!tp->lost_out) + tp->retransmit_high = tp->snd_una; + + if (tp->retransmit_skb_hint) { + skb = tp->retransmit_skb_hint; + last_lost = TCP_SKB_CB(skb)->end_seq; + if (after(last_lost, tp->retransmit_high)) + last_lost = tp->retransmit_high; + } else { skb = tcp_write_queue_head(sk); + last_lost = tp->snd_una; + } + /* First pass: retransmit lost packets. */ tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - tp->forward_skb_hint = skb; + __u8 sacked = TCP_SKB_CB(skb)->sacked; - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + if (skb == tcp_send_head(sk)) break; + /* we could do better than to assign each time */ + if (hole == NULL) + tp->retransmit_skb_hint = skb; + /* Assume this retransmit will generate + * only one packet for congestion window + * calculation purposes. This works because + * tcp_retransmit_skb() will chop up the + * packet to be MSS sized and all the + * packet counting works out. + */ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) - break; + return; + + if (fwd_rexmitting) { +begin_fwd: + if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + break; + mib_idx = LINUX_MIB_TCPFORWARDRETRANS; + + } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { + tp->retransmit_high = last_lost; + if (!tcp_can_forward_retransmit(sk)) + break; + /* Backtrack if necessary to non-L'ed skb */ + if (hole != NULL) { + skb = hole; + hole = NULL; + } + fwd_rexmitting = 1; + goto begin_fwd; - if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) + } else if (!(sacked & TCPCB_LOST)) { + if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS)) + hole = skb; continue; - /* Ok, retransmit it. */ - if (tcp_retransmit_skb(sk, skb)) { - tp->forward_skb_hint = NULL; - break; + } else { + last_lost = TCP_SKB_CB(skb)->end_seq; + if (icsk->icsk_ca_state != TCP_CA_Loss) + mib_idx = LINUX_MIB_TCPFASTRETRANS; + else + mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; } + if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) + continue; + + if (tcp_retransmit_skb(sk, skb)) + return; + NET_INC_STATS_BH(sock_net(sk), mib_idx); + if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); - - NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); } } @@ -2119,7 +2184,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* NOTE: No TCP options attached and we never retransmit this. */ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { - NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); return; } @@ -2130,9 +2195,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Send it off. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) - NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); - TCP_INC_STATS(TCP_MIB_OUTRSTS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); } /* WARNING: This routine must only be called when we have already sent @@ -2180,11 +2245,11 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th; int tcp_header_size; + struct tcp_out_options opts; struct sk_buff *skb; -#ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *md5; __u8 *md5_hash_location; -#endif + int mss; skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) @@ -2195,18 +2260,30 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb->dst = dst_clone(dst); - tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + - (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + - /* SACK_PERM is in the place of NOP NOP of TS */ - ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); + mss = dst_metric(dst, RTAX_ADVMSS); + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) + mss = tp->rx_opt.user_mss; + + if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ + __u8 rcv_wscale; + /* Set this up on the first call only */ + req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(tcp_full_space(sk), + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rcv_wnd, + &req->window_clamp, + ireq->wscale_ok, + &rcv_wscale); + ireq->rcv_wscale = rcv_wscale; + } + + memset(&opts, 0, sizeof(opts)); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_header_size = tcp_synack_options(sk, req, mss, + skb, &opts, &md5) + + sizeof(struct tcphdr); -#ifdef CONFIG_TCP_MD5SIG - /* Are we doing MD5 on this segment? If so - make room for it */ - md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); - if (md5) - tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; -#endif skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2215,7 +2292,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->syn = 1; th->ack = 1; TCP_ECN_make_synack(req, th); - th->source = inet_sk(sk)->sport; + th->source = ireq->loc_port; th->dest = ireq->rmt_port; /* Setting of flags are superfluous here for callers (and ECE is * not even correctly set) @@ -2224,19 +2301,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); - if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ - __u8 rcv_wscale; - /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); - /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(tcp_full_space(sk), - dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, - ireq->wscale_ok, - &rcv_wscale); - ireq->rcv_wscale = rcv_wscale; - } /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rcv_wnd, 65535U)); @@ -2245,29 +2309,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); else #endif - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok, - ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale, - TCP_SKB_CB(skb)->when, - req->ts_recent, - ( -#ifdef CONFIG_TCP_MD5SIG - md5 ? &md5_hash_location : -#endif - NULL) - ); - + tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); th->doff = (tcp_header_size >> 2); - TCP_INC_STATS(TCP_MIB_OUTSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ if (md5) { tp->af_specific->calc_md5_hash(md5_hash_location, - md5, - NULL, dst, req, - tcp_hdr(skb), sk->sk_protocol, - skb->len); + md5, NULL, req, skb); } #endif @@ -2304,6 +2354,9 @@ static void tcp_connect_init(struct sock *sk) if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) + tp->advmss = tp->rx_opt.user_mss; + tcp_initialize_rcv_mss(sk); tcp_select_initial_window(tcp_full_space(sk), @@ -2322,6 +2375,7 @@ static void tcp_connect_init(struct sock *sk) tcp_init_wl(tp, tp->write_seq, 0); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; + tp->snd_up = tp->write_seq; tp->rcv_nxt = 0; tp->rcv_wup = 0; tp->copied_seq = 0; @@ -2367,7 +2421,7 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, @@ -2531,8 +2585,7 @@ int tcp_write_wakeup(struct sock *sk) tcp_event_new_data_sent(sk, skb); return err; } else { - if (tp->urg_mode && - between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) + if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) tcp_xmit_probe_skb(sk, 1); return tcp_xmit_probe_skb(sk, 0); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 63ed9d6830e7..6b6dff1164b9 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> @@ -50,7 +48,7 @@ static void tcp_write_err(struct sock *sk) sk->sk_error_report(sk); tcp_done(sk); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } /* Do not allow orphaned sockets to eat all our resources. @@ -91,7 +89,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } return 0; @@ -181,7 +179,7 @@ static void tcp_delack_timer(unsigned long data) if (sock_owned_by_user(sk)) { /* Try again later. */ icsk->icsk_ack.blocked = 1; - NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); goto out_unlock; } @@ -200,10 +198,10 @@ static void tcp_delack_timer(unsigned long data) if (!skb_queue_empty(&tp->ucopy.prequeue)) { struct sk_buff *skb; - NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk->sk_backlog_rcv(sk, skb); + sk_backlog_rcv(sk, skb); tp->ucopy.memory = 0; } @@ -220,7 +218,7 @@ static void tcp_delack_timer(unsigned long data) icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_send_ack(sk); - NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); } TCP_CHECK_TIMER(sk); @@ -289,7 +287,7 @@ static void tcp_retransmit_timer(struct sock *sk) if (!tp->packets_out) goto out; - BUG_TRAP(!tcp_write_queue_empty(sk)); + WARN_ON(tcp_write_queue_empty(sk)); if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { @@ -328,24 +326,27 @@ static void tcp_retransmit_timer(struct sock *sk) goto out; if (icsk->icsk_retransmits == 0) { + int mib_idx; + if (icsk->icsk_ca_state == TCP_CA_Disorder || icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) { if (icsk->icsk_ca_state == TCP_CA_Recovery) - NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); + mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; else - NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); + mib_idx = LINUX_MIB_TCPSACKFAILURES; } else { if (icsk->icsk_ca_state == TCP_CA_Recovery) - NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); + mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; else - NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); + mib_idx = LINUX_MIB_TCPRENOFAILURES; } } else if (icsk->icsk_ca_state == TCP_CA_Loss) { - NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); + mib_idx = LINUX_MIB_TCPLOSSFAILURES; } else { - NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); + mib_idx = LINUX_MIB_TCPTIMEOUTS; } + NET_INC_STATS_BH(sock_net(sk), mib_idx); } if (tcp_use_frto(sk)) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 56fcda3694ba..2095abc3caba 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,12 +5,10 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.102 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> - * Alan Cox, <Alan.Cox@linux.org> + * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Hirokazu Takahashi, <taka@valinux.co.jp> * * Fixes: @@ -110,12 +108,6 @@ * Snmp MIB for the UDP layer */ -DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; -EXPORT_SYMBOL(udp_statistics); - -DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly; -EXPORT_SYMBOL(udp_stats_in6); - struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); @@ -130,14 +122,23 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min); atomic_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); -static inline int __udp_lib_lport_inuse(struct net *net, __u16 num, - const struct hlist_head udptable[]) +static int udp_lib_lport_inuse(struct net *net, __u16 num, + const struct hlist_head udptable[], + struct sock *sk, + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2)) { - struct sock *sk; + struct sock *sk2; struct hlist_node *node; - sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) - if (net_eq(sock_net(sk), net) && sk->sk_hash == num) + sk_for_each(sk2, node, &udptable[udp_hashfn(net, num)]) + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + sk2->sk_hash == num && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if + || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2)) return 1; return 0; } @@ -154,83 +155,37 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, const struct sock *sk2 ) ) { struct hlist_head *udptable = sk->sk_prot->h.udp_hash; - struct hlist_node *node; - struct hlist_head *head; - struct sock *sk2; int error = 1; struct net *net = sock_net(sk); write_lock_bh(&udp_hash_lock); if (!snum) { - int i, low, high, remaining; - unsigned rover, best, best_size_so_far; + int low, high, remaining; + unsigned rand; + unsigned short first; inet_get_local_port_range(&low, &high); remaining = (high - low) + 1; - best_size_so_far = UINT_MAX; - best = rover = net_random() % remaining + low; - - /* 1st pass: look for empty (or shortest) hash chain */ - for (i = 0; i < UDP_HTABLE_SIZE; i++) { - int size = 0; - - head = &udptable[rover & (UDP_HTABLE_SIZE - 1)]; - if (hlist_empty(head)) - goto gotit; - - sk_for_each(sk2, node, head) { - if (++size >= best_size_so_far) - goto next; - } - best_size_so_far = size; - best = rover; - next: - /* fold back if end of range */ - if (++rover > high) - rover = low + ((rover - low) - & (UDP_HTABLE_SIZE - 1)); - - - } - - /* 2nd pass: find hole in shortest hash chain */ - rover = best; - for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) { - if (! __udp_lib_lport_inuse(net, rover, udptable)) - goto gotit; - rover += UDP_HTABLE_SIZE; - if (rover > high) - rover = low + ((rover - low) - & (UDP_HTABLE_SIZE - 1)); + rand = net_random(); + snum = first = rand % remaining + low; + rand |= 1; + while (udp_lib_lport_inuse(net, snum, udptable, sk, + saddr_comp)) { + do { + snum = snum + rand; + } while (snum < low || snum > high); + if (snum == first) + goto fail; } - - - /* All ports in use! */ + } else if (udp_lib_lport_inuse(net, snum, udptable, sk, saddr_comp)) goto fail; -gotit: - snum = rover; - } else { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; - - sk_for_each(sk2, node, head) - if (sk2->sk_hash == snum && - sk2 != sk && - net_eq(sock_net(sk2), net) && - (!sk2->sk_reuse || !sk->sk_reuse) && - (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if - || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - (*saddr_comp)(sk, sk2) ) - goto fail; - } - inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; - sk_add_node(sk, head); + sk_add_node(sk, &udptable[udp_hashfn(net, snum)]); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } error = 0; @@ -266,7 +221,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int badness = -1; read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { + sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) { struct inet_sock *inet = inet_sk(sk); if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && @@ -307,6 +262,28 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, return result; } +static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport, + struct hlist_head udptable[]) +{ + struct sock *sk; + const struct iphdr *iph = ip_hdr(skb); + + if (unlikely(sk = skb_steal_sock(skb))) + return sk; + else + return __udp4_lib_lookup(dev_net(skb->dst->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + udptable); +} + +struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, + __be32 daddr, __be16 dport, int dif) +{ + return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udp_hash); +} +EXPORT_SYMBOL_GPL(udp4_lib_lookup); + static inline struct sock *udp_v4_mcast_next(struct sock *sk, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, @@ -356,11 +333,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[]) struct sock *sk; int harderr; int err; + struct net *net = dev_net(skb->dev); - sk = __udp4_lib_lookup(dev_net(skb->dev), iph->daddr, uh->dest, + sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, udptable); if (sk == NULL) { - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; /* No socket for error */ } @@ -528,7 +506,8 @@ out: up->len = 0; up->pending = 0; if (!err) - UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } @@ -656,11 +635,13 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, .uli_u = { .ports = { .sport = inet->sport, .dport = dport } } }; + struct net *net = sock_net(sk); + security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); + err = ip_route_output_flow(net, &rt, &fl, sk, 1); if (err) { if (err == -ENETUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); goto out; } @@ -727,7 +708,8 @@ out: * seems like overkill. */ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - UDP_INC_STATS_USER(UDP_MIB_SNDBUFERRORS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); } return err; @@ -890,7 +872,8 @@ try_again: goto out_free; if (!peeked) - UDP_INC_STATS_USER(UDP_MIB_INDATAGRAMS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); sock_recv_timestamp(msg, sk, skb); @@ -919,7 +902,7 @@ out: csum_copy_err: lock_sock(sk); if (!skb_kill_datagram(sk, skb, flags)) - UDP_INC_STATS_USER(UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); release_sock(sk); if (noblock) @@ -950,6 +933,27 @@ int udp_disconnect(struct sock *sk, int flags) return 0; } +static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int is_udplite = IS_UDPLITE(sk); + int rc; + + if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) { + /* Note that an ENOMEM error is charged twice */ + if (rc == -ENOMEM) + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); + goto drop; + } + + return 0; + +drop: + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + kfree_skb(skb); + return -1; +} + /* returns: * -1: error * 0: success @@ -990,7 +994,8 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) ret = (*up->encap_rcv)(sk, skb); if (ret <= 0) { - UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, + UDP_INC_STATS_BH(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); return -ret; } @@ -1040,17 +1045,19 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) goto drop; } - if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { - /* Note that an ENOMEM error is charged twice */ - if (rc == -ENOMEM) - UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, is_udplite); - goto drop; - } + rc = 0; - return 0; + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) + rc = __udp_queue_rcv_skb(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + return rc; drop: - UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); return -1; } @@ -1061,7 +1068,7 @@ drop: * Note: called only from the BH handler context, * so we don't need to lock the hashes. */ -static int __udp4_lib_mcast_deliver(struct sk_buff *skb, +static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, struct hlist_head udptable[]) @@ -1070,7 +1077,7 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, int dif; read_lock(&udp_hash_lock); - sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); + sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]); dif = skb->dev->ifindex; sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (sk) { @@ -1085,15 +1092,7 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) { - int ret = 0; - - bh_lock_sock_nested(sk); - if (!sock_owned_by_user(sk)) - ret = udp_queue_rcv_skb(sk, skb1); - else - sk_add_backlog(sk, skb1); - bh_unlock_sock(sk); - + int ret = udp_queue_rcv_skb(sk, skb1); if (ret > 0) /* we should probably re-process instead * of dropping packets here. */ @@ -1158,6 +1157,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], struct rtable *rt = (struct rtable*)skb->dst; __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; + struct net *net = dev_net(skb->dev); /* * Validate the packet. @@ -1180,19 +1180,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], goto csum_error; if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) - return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable); + return __udp4_lib_mcast_deliver(net, skb, uh, + saddr, daddr, udptable); - sk = __udp4_lib_lookup(dev_net(skb->dev), saddr, uh->source, daddr, - uh->dest, inet_iif(skb), udptable); + sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { - int ret = 0; - bh_lock_sock_nested(sk); - if (!sock_owned_by_user(sk)) - ret = udp_queue_rcv_skb(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); + int ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); /* a return value > 0 means to resubmit the input, but @@ -1211,7 +1205,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], if (udp_lib_checksum_complete(skb)) goto csum_error; - UDP_INC_STATS_BH(UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); /* @@ -1245,7 +1239,7 @@ csum_error: ntohs(uh->dest), ulen); drop: - UDP_INC_STATS_BH(UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); return 0; } @@ -1255,12 +1249,11 @@ int udp_rcv(struct sk_buff *skb) return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP); } -int udp_destroy_sock(struct sock *sk) +void udp_destroy_sock(struct sock *sk) { lock_sock(sk); udp_flush_pending_frames(sk); release_sock(sk); - return 0; } /* @@ -1319,6 +1312,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ val = 8; + else if (val > USHORT_MAX) + val = USHORT_MAX; up->pcslen = val; up->pcflag |= UDPLITE_SEND_CC; break; @@ -1331,6 +1326,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; if (val != 0 && val < 8) /* Avoid silly minimal values. */ val = 8; + else if (val > USHORT_MAX) + val = USHORT_MAX; up->pcrlen = val; up->pcflag |= UDPLITE_RECV_CC; break; @@ -1453,7 +1450,8 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL && udp_lib_checksum_complete(skb)) { - UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_lite); + UDP_INC_STATS_BH(sock_net(sk), + UDP_MIB_INERRORS, is_lite); __skb_unlink(skb, rcvq); kfree_skb(skb); } @@ -1481,7 +1479,7 @@ struct proto udp_prot = { .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .sendpage = udp_sendpage, - .backlog_rcv = udp_queue_rcv_skb, + .backlog_rcv = __udp_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, @@ -1629,12 +1627,13 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->sport); seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p%n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", bucket, src, srcp, dest, destp, sp->sk_state, atomic_read(&sp->sk_wmem_alloc), atomic_read(&sp->sk_rmem_alloc), 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp, len); + atomic_read(&sp->sk_refcnt), sp, + atomic_read(&sp->sk_drops), len); } int udp4_seq_show(struct seq_file *seq, void *v) @@ -1643,7 +1642,7 @@ int udp4_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-127s\n", " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " - "inode"); + "inode ref pointer drops"); else { struct udp_iter_state *state = seq->private; int len; diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 7288bf7977fb..2e9bad2fa1bc 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -26,7 +26,7 @@ extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, extern int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); -extern int udp_destroy_sock(struct sock *sk); +extern void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS extern int udp4_seq_show(struct seq_file *seq, void *v); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 72ce26b6c4d3..3c807964da96 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -1,8 +1,6 @@ /* * UDPLITE An implementation of the UDP-Lite protocol (RFC 3828). * - * Version: $Id: udplite.c,v 1.25 2006/10/19 07:22:36 gerrit Exp $ - * * Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk> * * Changes: @@ -13,7 +11,6 @@ * 2 of the License, or (at your option) any later version. */ #include "udp_impl.h" -DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics) __read_mostly; struct hlist_head udplite_hash[UDP_HTABLE_SIZE]; diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 9c798abce736..63418185f524 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -47,8 +47,10 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) if (unlikely(optlen)) hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); - skb_set_network_header(skb, IPV4_BEET_PHMAXLEN - x->props.header_len - - hdrlen); + skb_set_network_header(skb, -x->props.header_len - + hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); + if (x->sel.family != AF_INET6) + skb->network_header += IPV4_BEET_PHMAXLEN; skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); skb->transport_header = skb->network_header + sizeof(*top_iph); |