diff options
Diffstat (limited to 'net/netfilter')
106 files changed, 6467 insertions, 1274 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0d65f4d39494..91efae88e8c2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -20,7 +20,7 @@ config NETFILTER_FAMILY_ARP bool config NETFILTER_NETLINK_ACCT -tristate "Netfilter NFACCT over NFNETLINK interface" + tristate "Netfilter NFACCT over NFNETLINK interface" depends on NETFILTER_ADVANCED select NETFILTER_NETLINK help @@ -34,7 +34,7 @@ config NETFILTER_NETLINK_QUEUE help If this option is enabled, the kernel will include support for queueing packets via NFNETLINK. - + config NETFILTER_NETLINK_LOG tristate "Netfilter LOG over NFNETLINK interface" default m if NETFILTER_ADVANCED=n @@ -697,7 +697,7 @@ config NF_FLOW_TABLE_INET tristate "Netfilter flow table mixed IPv4/IPv6 module" depends on NF_FLOW_TABLE help - This option adds the flow table mixed IPv4/IPv6 support. + This option adds the flow table mixed IPv4/IPv6 support. To compile it as a module, choose M here. @@ -1502,7 +1502,7 @@ config NETFILTER_XT_MATCH_REALM This option adds a `realm' match, which allows you to use the realm key from the routing subsystem inside iptables. - This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option + This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option in tc world. If you want to compile it as a module, say M here and read @@ -1523,7 +1523,7 @@ config NETFILTER_XT_MATCH_SCTP depends on NETFILTER_ADVANCED default IP_SCTP help - With this option enabled, you will be able to use the + With this option enabled, you will be able to use the `sctp' match in order to match on SCTP source/destination ports and SCTP chunk types. 
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 9270a7fae484..3f572e5a975e 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -81,7 +81,8 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nft_chain_route.o nf_tables_offload.o nf_tables_set-objs := nf_tables_set_core.o \ - nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o + nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ + nft_set_pipapo.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o @@ -120,11 +121,12 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o -nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o +nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ + nf_flow_table_offload.o obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o -# generic X tables +# generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o # combos diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 5d5bdf450091..78f046ec506f 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -536,6 +536,26 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, } EXPORT_SYMBOL(nf_hook_slow); +void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, + const struct nf_hook_entries *e) +{ + struct sk_buff *skb, *next; + struct list_head sublist; + int ret; + + INIT_LIST_HEAD(&sublist); + + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); + ret = nf_hook_slow(skb, state, e, 0); + if (ret == 1) + list_add_tail(&skb->list, &sublist); + } + /* Put passed packets back on main list */ + list_splice(&sublist, head); +} +EXPORT_SYMBOL(nf_hook_slow_list); + /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. 
*/ diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 063df74b4647..26ab0e9612d8 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -60,9 +60,9 @@ mtype_destroy(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); - ip_set_free(map->members); if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); + ip_set_free(map->members); ip_set_free(map); set->data = NULL; @@ -75,7 +75,7 @@ mtype_flush(struct ip_set *set) if (set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); - memset(map->members, 0, map->memsize); + bitmap_zero(map->members, map->elements); set->elements = 0; set->ext_size = 0; } @@ -192,7 +192,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, } #ifndef IP_SET_BITMAP_STORED_TIMEOUT -static inline bool +static bool mtype_is_filled(const struct mtype_elem *x) { return true; diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 11ff9d4a7006..0a2196f59106 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -37,7 +37,7 @@ MODULE_ALIAS("ip_set_bitmap:ip"); /* Type structure */ struct bitmap_ip { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -55,7 +55,7 @@ struct bitmap_ip_adt_elem { u16 id; }; -static inline u32 +static u32 ip_to_id(const struct bitmap_ip *m, u32 ip) { return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts; @@ -63,33 +63,33 @@ ip_to_id(const struct bitmap_ip *m, u32 ip) /* Common functions */ -static inline int +static int bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, size_t dsize) { return !!test_bit(e->id, map->members); } 
-static inline int +static int bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize) { return !!test_bit(id, map->members); } -static inline int +static int bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, u32 flags, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, size_t dsize) { @@ -97,7 +97,7 @@ bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id, htonl(map->first_ip + id * map->hosts)); } -static inline int +static int bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || @@ -220,7 +220,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, u32 first_ip, u32 last_ip, u32 elements, u32 hosts, u8 netmask) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; @@ -237,6 +237,18 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, return true; } +static u32 +range_to_mask(u32 from, u32 to, u8 *bits) +{ + u32 mask = 0xFFFFFFFE; + + *bits = 32; + while (--(*bits) > 0 && mask && (to & mask) != from) + mask <<= 1; + + return mask; +} + static int bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) @@ -310,7 +322,7 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - map->memsize = bitmap_bytes(0, elements - 1); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ip; if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c 
b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 1d4e63326e68..739e343efaf6 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -42,7 +42,7 @@ enum { /* Type structure */ struct bitmap_ipmac { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -65,7 +65,7 @@ struct bitmap_ipmac_elem { unsigned char filled; } __aligned(__alignof__(u64)); -static inline u32 +static u32 ip_to_id(const struct bitmap_ipmac *m, u32 ip) { return ip - m->first_ip; @@ -79,7 +79,7 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip) /* Common functions */ -static inline int +static int bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, const struct bitmap_ipmac *map, size_t dsize) { @@ -94,7 +94,7 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, return -EAGAIN; } -static inline int +static int bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) { const struct bitmap_ipmac_elem *elem; @@ -106,13 +106,13 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) return elem->filled == MAC_FILLED; } -static inline int +static int bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) { return elem->filled == MAC_FILLED; } -static inline int +static int bitmap_ipmac_add_timeout(unsigned long *timeout, const struct bitmap_ipmac_adt_elem *e, const struct ip_set_ext *ext, struct ip_set *set, @@ -139,7 +139,7 @@ bitmap_ipmac_add_timeout(unsigned long *timeout, return 0; } -static inline int +static int bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map, u32 flags, size_t dsize) { @@ -177,14 +177,14 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, return IPSET_ADD_STORE_PLAIN_TIMEOUT; } -static inline int +static int 
bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, u32 id, size_t dsize) { @@ -197,7 +197,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); } -static inline int +static int bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || @@ -299,7 +299,7 @@ static bool init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, u32 first_ip, u32 last_ip, u32 elements) { - map->members = ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; @@ -360,7 +360,7 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (!map) return -ENOMEM; - map->memsize = bitmap_bytes(0, elements - 1); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ipmac; if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 704a0dda1609..b49978dd810d 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -30,7 +30,7 @@ MODULE_ALIAS("ip_set_bitmap:port"); /* Type structure */ struct bitmap_port { - void *members; /* the set members */ + unsigned long *members; /* the set members */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -46,7 +46,7 @@ struct bitmap_port_adt_elem { u16 id; }; -static inline u16 +static u16 port_to_id(const struct bitmap_port *m, u16 port) { return port - m->first_port; @@ 
-54,34 +54,34 @@ port_to_id(const struct bitmap_port *m, u16 port) /* Common functions */ -static inline int +static int bitmap_port_do_test(const struct bitmap_port_adt_elem *e, const struct bitmap_port *map, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize) { return !!test_bit(id, map->members); } -static inline int +static int bitmap_port_do_add(const struct bitmap_port_adt_elem *e, struct bitmap_port *map, u32 flags, size_t dsize) { return !!test_bit(e->id, map->members); } -static inline int +static int bitmap_port_do_del(const struct bitmap_port_adt_elem *e, struct bitmap_port *map) { return !test_and_clear_bit(e->id, map->members); } -static inline int +static int bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, size_t dsize) { @@ -89,13 +89,40 @@ bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, htons(map->first_port + id)); } -static inline int +static int bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map) { return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); } +static bool +ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) +{ + bool ret; + u8 proto; + + switch (pf) { + case NFPROTO_IPV4: + ret = ip_set_get_ip4_port(skb, src, port, &proto); + break; + case NFPROTO_IPV6: + ret = ip_set_get_ip6_port(skb, src, port, &proto); + break; + default: + return false; + } + if (!ret) + return ret; + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + return true; + default: + return false; + } +} + static int bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -204,7 +231,7 @@ static bool init_map_port(struct ip_set *set, struct bitmap_port *map, u16 first_port, u16 last_port) { - map->members = 
ip_set_alloc(map->memsize); + map->members = bitmap_zalloc(map->elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_port = first_port; @@ -244,7 +271,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], return -ENOMEM; map->elements = elements; - map->memsize = bitmap_bytes(0, map->elements); + map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_port; if (!init_map_port(set, map, first_port, last_port)) { kfree(map); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e64d5f9a89dd..69c107f9ba8d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -35,7 +35,7 @@ struct ip_set_net { static unsigned int ip_set_net_id __read_mostly; -static inline struct ip_set_net *ip_set_pernet(struct net *net) +static struct ip_set_net *ip_set_pernet(struct net *net) { return net_generic(net, ip_set_net_id); } @@ -67,13 +67,13 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); * serialized by ip_set_type_mutex. 
*/ -static inline void +static void ip_set_type_lock(void) { mutex_lock(&ip_set_type_mutex); } -static inline void +static void ip_set_type_unlock(void) { mutex_unlock(&ip_set_type_mutex); @@ -277,7 +277,7 @@ ip_set_free(void *members) } EXPORT_SYMBOL_GPL(ip_set_free); -static inline bool +static bool flag_nested(const struct nlattr *nla) { return nla->nla_type & NLA_F_NESTED; @@ -296,7 +296,8 @@ ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, + ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) return -IPSET_ERR_PROTOCOL; @@ -314,7 +315,8 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, + ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) return -IPSET_ERR_PROTOCOL; @@ -325,6 +327,83 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); +static u32 +ip_set_timeout_get(const unsigned long *timeout) +{ + u32 t; + + if (*timeout == IPSET_ELEM_PERMANENT) + return 0; + + t = jiffies_to_msecs(*timeout - jiffies) / MSEC_PER_SEC; + /* Zero value in userspace means no timeout */ + return t == 0 ? 1 : t; +} + +static char * +ip_set_comment_uget(struct nlattr *tb) +{ + return nla_data(tb); +} + +/* Called from uadd only, protected by the set spinlock. + * The kadt functions don't use the comment extensions in any way. 
+ */ +void +ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, + const struct ip_set_ext *ext) +{ + struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1); + size_t len = ext->comment ? strlen(ext->comment) : 0; + + if (unlikely(c)) { + set->ext_size -= sizeof(*c) + strlen(c->str) + 1; + kfree_rcu(c, rcu); + rcu_assign_pointer(comment->c, NULL); + } + if (!len) + return; + if (unlikely(len > IPSET_MAX_COMMENT_SIZE)) + len = IPSET_MAX_COMMENT_SIZE; + c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC); + if (unlikely(!c)) + return; + strlcpy(c->str, ext->comment, len + 1); + set->ext_size += sizeof(*c) + strlen(c->str) + 1; + rcu_assign_pointer(comment->c, c); +} +EXPORT_SYMBOL_GPL(ip_set_init_comment); + +/* Used only when dumping a set, protected by rcu_read_lock() */ +static int +ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment) +{ + struct ip_set_comment_rcu *c = rcu_dereference(comment->c); + + if (!c) + return 0; + return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str); +} + +/* Called from uadd/udel, flush or the garbage collectors protected + * by the set spinlock. + * Called when the set is destroyed and when there can't be any user + * of the set data anymore. 
+ */ +static void +ip_set_comment_free(struct ip_set *set, void *ptr) +{ + struct ip_set_comment *comment = ptr; + struct ip_set_comment_rcu *c; + + c = rcu_dereference_protected(comment->c, 1); + if (unlikely(!c)) + return; + set->ext_size -= sizeof(*c) + strlen(c->str) + 1; + kfree_rcu(c, rcu); + rcu_assign_pointer(comment->c, NULL); +} + typedef void (*destroyer)(struct ip_set *, void *); /* ipset data extension types, in size order */ @@ -351,12 +430,12 @@ const struct ip_set_ext_type ip_set_extensions[] = { .flag = IPSET_FLAG_WITH_COMMENT, .len = sizeof(struct ip_set_comment), .align = __alignof__(struct ip_set_comment), - .destroy = (destroyer) ip_set_comment_free, + .destroy = ip_set_comment_free, }, }; EXPORT_SYMBOL_GPL(ip_set_extensions); -static inline bool +static bool add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) { return ip_set_extensions[id].flag ? @@ -446,6 +525,46 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(ip_set_get_extensions); +static u64 +ip_set_get_bytes(const struct ip_set_counter *counter) +{ + return (u64)atomic64_read(&(counter)->bytes); +} + +static u64 +ip_set_get_packets(const struct ip_set_counter *counter) +{ + return (u64)atomic64_read(&(counter)->packets); +} + +static bool +ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter) +{ + return nla_put_net64(skb, IPSET_ATTR_BYTES, + cpu_to_be64(ip_set_get_bytes(counter)), + IPSET_ATTR_PAD) || + nla_put_net64(skb, IPSET_ATTR_PACKETS, + cpu_to_be64(ip_set_get_packets(counter)), + IPSET_ATTR_PAD); +} + +static bool +ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo) +{ + /* Send nonzero parameters only */ + return ((skbinfo->skbmark || skbinfo->skbmarkmask) && + nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask), + IPSET_ATTR_PAD)) || + (skbinfo->skbprio && + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + 
cpu_to_be32(skbinfo->skbprio))) || + (skbinfo->skbqueue && + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue))); +} + int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, const void *e, bool active) @@ -471,6 +590,55 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, } EXPORT_SYMBOL_GPL(ip_set_put_extensions); +static bool +ip_set_match_counter(u64 counter, u64 match, u8 op) +{ + switch (op) { + case IPSET_COUNTER_NONE: + return true; + case IPSET_COUNTER_EQ: + return counter == match; + case IPSET_COUNTER_NE: + return counter != match; + case IPSET_COUNTER_LT: + return counter < match; + case IPSET_COUNTER_GT: + return counter > match; + } + return false; +} + +static void +ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter) +{ + atomic64_add((long long)bytes, &(counter)->bytes); +} + +static void +ip_set_add_packets(u64 packets, struct ip_set_counter *counter) +{ + atomic64_add((long long)packets, &(counter)->packets); +} + +static void +ip_set_update_counter(struct ip_set_counter *counter, + const struct ip_set_ext *ext, u32 flags) +{ + if (ext->packets != ULLONG_MAX && + !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) { + ip_set_add_bytes(ext->bytes, counter); + ip_set_add_packets(ext->packets, counter); + } +} + +static void +ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + mext->skbinfo = *skbinfo; +} + bool ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags, void *data) @@ -506,7 +674,7 @@ EXPORT_SYMBOL_GPL(ip_set_match_extensions); * The set behind an index may change by swapping only, from userspace. 
*/ -static inline void +static void __ip_set_get(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -514,7 +682,7 @@ __ip_set_get(struct ip_set *set) write_unlock_bh(&ip_set_ref_lock); } -static inline void +static void __ip_set_put(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -526,7 +694,7 @@ __ip_set_put(struct ip_set *set) /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need * a separate reference counter */ -static inline void +static void __ip_set_put_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); @@ -541,7 +709,7 @@ __ip_set_put_netlink(struct ip_set *set) * so it can't be destroyed (or changed) under our foot. */ -static inline struct ip_set * +static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { struct ip_set *set; @@ -670,7 +838,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname); * */ -static inline void +static void __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) { struct ip_set *set; @@ -934,7 +1102,8 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Without holding any locks, create private part. 
*/ if (attr[IPSET_ATTR_DATA] && - nla_parse_nested_deprecated(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy, NULL)) { + nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], + set->type->create_policy, NULL)) { ret = -IPSET_ERR_PROTOCOL; goto put_out; } @@ -1252,6 +1421,30 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, #define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF) #define DUMP_FLAGS(arg) (((u32)(arg)) >> 16) +int +ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) +{ + u32 cadt_flags = 0; + + if (SET_WITH_TIMEOUT(set)) + if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + htonl(set->timeout)))) + return -EMSGSIZE; + if (SET_WITH_COUNTER(set)) + cadt_flags |= IPSET_FLAG_WITH_COUNTERS; + if (SET_WITH_COMMENT(set)) + cadt_flags |= IPSET_FLAG_WITH_COMMENT; + if (SET_WITH_SKBINFO(set)) + cadt_flags |= IPSET_FLAG_WITH_SKBINFO; + if (SET_WITH_FORCEADD(set)) + cadt_flags |= IPSET_FLAG_WITH_FORCEADD; + + if (!cadt_flags) + return 0; + return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags)); +} +EXPORT_SYMBOL_GPL(ip_set_put_flags); + static int ip_set_dump_done(struct netlink_callback *cb) { @@ -1281,32 +1474,43 @@ dump_attrs(struct nlmsghdr *nlh) } } +static const struct nla_policy +ip_set_dump_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_FLAGS] = { .type = NLA_U32 }, +}; + static int -dump_init(struct netlink_callback *cb, struct ip_set_net *inst) +ip_set_dump_start(struct netlink_callback *cb) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *attr = (void *)nlh + min_len; + struct sk_buff *skb = cb->skb; + struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type; - ip_set_id_t index; int ret; - ret = 
nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, - nlh->nlmsg_len - min_len, - ip_set_setname_policy, NULL); + ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, + nlh->nlmsg_len - min_len, + ip_set_dump_policy, NULL); if (ret) - return ret; + goto error; cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]); if (cda[IPSET_ATTR_SETNAME]) { + ip_set_id_t index; struct ip_set *set; set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), &index); - if (!set) - return -ENOENT; - + if (!set) { + ret = -ENOENT; + goto error; + } dump_type = DUMP_ONE; cb->args[IPSET_CB_INDEX] = index; } else { @@ -1322,10 +1526,17 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) cb->args[IPSET_CB_DUMP] = dump_type; return 0; + +error: + /* We have to create and send the error message manually :-( */ + if (nlh->nlmsg_flags & NLM_F_ACK) { + netlink_ack(cb->skb, nlh, ret, NULL); + } + return ret; } static int -ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) +ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb) { ip_set_id_t index = IPSET_INVALID_ID, max; struct ip_set *set = NULL; @@ -1336,18 +1547,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) bool is_destroyed; int ret = 0; - if (!cb->args[IPSET_CB_DUMP]) { - ret = dump_init(cb, inst); - if (ret < 0) { - nlh = nlmsg_hdr(cb->skb); - /* We have to create and send the error message - * manually :-( - */ - if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(cb->skb, nlh, ret, NULL); - return ret; - } - } + if (!cb->args[IPSET_CB_DUMP]) + return -EINVAL; if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max) goto out; @@ -1483,7 +1684,8 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb, { struct netlink_dump_control c = { - .dump = ip_set_dump_start, + .start = ip_set_dump_start, + .dump = ip_set_dump_do, .done = ip_set_dump_done, }; return netlink_dump_start(ctnl, skb, nlh, &c); @@ -1543,9 +1745,9 @@ call_ad(struct sock *ctnl, 
struct sk_buff *skb, struct ip_set *set, memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); cmdattr = (void *)&errmsg->msg + min_len; - ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, - nlh->nlmsg_len - min_len, - ip_set_adt_policy, NULL); + ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr, + nlh->nlmsg_len - min_len, ip_set_adt_policy, + NULL); if (ret) { nlmsg_free(skb2); @@ -1596,7 +1798,9 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, use_lineno = !!attr[IPSET_ATTR_LINENO]; if (attr[IPSET_ATTR_DATA]) { - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, + attr[IPSET_ATTR_DATA], + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); @@ -1606,7 +1810,8 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || - nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) + nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); @@ -1644,6 +1849,7 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; + u32 lineno; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || @@ -1655,11 +1861,12 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (!set) return -ENOENT; - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; rcu_read_lock_bh(); - ret = set->variant->uadt(set, tb, IPSET_TEST, 
NULL, 0, 0); + ret = set->variant->uadt(set, tb, IPSET_TEST, &lineno, 0, 0); rcu_read_unlock_bh(); /* Userspace can't trigger element to be re-added */ if (ret == -EAGAIN) @@ -1961,7 +2168,7 @@ static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { [IPSET_CMD_LIST] = { .call = ip_set_dump, .attr_count = IPSET_ATTR_CMD_MAX, - .policy = ip_set_setname_policy, + .policy = ip_set_dump_policy, }, [IPSET_CMD_SAVE] = { .call = ip_set_dump, @@ -2069,8 +2276,9 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } req_version->version = IPSET_PROTOCOL; - ret = copy_to_user(user, req_version, - sizeof(struct ip_set_req_version)); + if (copy_to_user(user, req_version, + sizeof(struct ip_set_req_version))) + ret = -EFAULT; goto done; } case IP_SET_OP_GET_BYNAME: { @@ -2129,7 +2337,8 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) } /* end of switch(op) */ copy: - ret = copy_to_user(user, data, copylen); + if (copy_to_user(user, data, copylen)) + ret = -EFAULT; done: vfree(data); diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 2b8f959574b4..36615eb3eae1 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -148,31 +148,3 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src, } EXPORT_SYMBOL_GPL(ip_set_get_ip6_port); #endif - -bool -ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) -{ - bool ret; - u8 proto; - - switch (pf) { - case NFPROTO_IPV4: - ret = ip_set_get_ip4_port(skb, src, port, &proto); - break; - case NFPROTO_IPV6: - ret = ip_set_get_ip6_port(skb, src, port, &proto); - break; - default: - return false; - } - if (!ret) - return ret; - switch (proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - return true; - default: - return false; - } -} -EXPORT_SYMBOL_GPL(ip_set_get_ip_port); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h 
index 0feb77fa9edc..7480ce55b5c8 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -7,7 +7,7 @@ #include <linux/rcupdate.h> #include <linux/jhash.h> #include <linux/types.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> +#include <linux/netfilter/ipset/ip_set.h> #define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) #define ipset_dereference_protected(p, set) \ @@ -39,7 +39,7 @@ #ifdef IP_SET_HASH_WITH_MULTI #define AHASH_MAX(h) ((h)->ahash_max) -static inline u8 +static u8 tune_ahash_max(u8 curr, u32 multi) { u32 n; @@ -909,7 +909,7 @@ out: return ret; } -static inline int +static int mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, struct ip_set_ext *mext, struct ip_set *set, u32 flags) { @@ -953,7 +953,7 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0])); #endif key = HKEY(d, h->initval, t->htable_bits); - n = rcu_dereference_bh(hbucket(t, key)); + n = rcu_dereference_bh(hbucket(t, key)); if (!n) continue; for (i = 0; i < n->pos; i++) { diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index f4432d9fcad0..5d6d68eaf6a9 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -44,7 +44,7 @@ struct hash_ip4_elem { /* Common functions */ -static inline bool +static bool hash_ip4_data_equal(const struct hash_ip4_elem *e1, const struct hash_ip4_elem *e2, u32 *multi) @@ -63,7 +63,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) { next->ip = e->ip; @@ -171,7 +171,7 @@ struct hash_ip6_elem { /* Common functions */ -static inline bool +static bool hash_ip6_data_equal(const struct hash_ip6_elem *ip1, const struct hash_ip6_elem *ip2, u32 *multi) @@ -179,7 +179,7 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1, return 
ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } -static inline void +static void hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) { ip6_netmask(ip, prefix); @@ -196,7 +196,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ip6_data_next(struct hash_ip6_elem *next, const struct hash_ip6_elem *e) { } diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index 24d8f4df4230..eceb7bc4a93a 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -47,7 +47,7 @@ struct hash_ipmac4_elem { /* Common functions */ -static inline bool +static bool hash_ipmac4_data_equal(const struct hash_ipmac4_elem *e1, const struct hash_ipmac4_elem *e2, u32 *multi) @@ -67,7 +67,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmac4_data_next(struct hash_ipmac4_elem *next, const struct hash_ipmac4_elem *e) { @@ -154,7 +154,7 @@ struct hash_ipmac6_elem { /* Common functions */ -static inline bool +static bool hash_ipmac6_data_equal(const struct hash_ipmac6_elem *e1, const struct hash_ipmac6_elem *e2, u32 *multi) @@ -175,7 +175,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmac6_data_next(struct hash_ipmac6_elem *next, const struct hash_ipmac6_elem *e) { @@ -209,7 +209,7 @@ hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb, (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - if (opt->flags & IPSET_DIM_ONE_SRC) + if (opt->flags & IPSET_DIM_TWO_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 7a1734aad0c5..aba1df617d6e 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -42,7 +42,7 @@ struct hash_ipmark4_elem { /* Common functions */ -static inline bool +static bool hash_ipmark4_data_equal(const 
struct hash_ipmark4_elem *ip1, const struct hash_ipmark4_elem *ip2, u32 *multi) @@ -64,7 +64,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmark4_data_next(struct hash_ipmark4_elem *next, const struct hash_ipmark4_elem *d) { @@ -165,7 +165,7 @@ struct hash_ipmark6_elem { /* Common functions */ -static inline bool +static bool hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, const struct hash_ipmark6_elem *ip2, u32 *multi) @@ -187,7 +187,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipmark6_data_next(struct hash_ipmark6_elem *next, const struct hash_ipmark6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 32e240658334..1ff228717e29 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -47,7 +47,7 @@ struct hash_ipport4_elem { /* Common functions */ -static inline bool +static bool hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, const struct hash_ipport4_elem *ip2, u32 *multi) @@ -71,7 +71,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipport4_data_next(struct hash_ipport4_elem *next, const struct hash_ipport4_elem *d) { @@ -202,7 +202,7 @@ struct hash_ipport6_elem { /* Common functions */ -static inline bool +static bool hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, const struct hash_ipport6_elem *ip2, u32 *multi) @@ -226,7 +226,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipport6_data_next(struct hash_ipport6_elem *next, const struct hash_ipport6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 15d419353179..fa88afd812fa 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -46,7 +46,7 @@ struct hash_ipportip4_elem { u8 padding; }; -static inline bool +static bool 
hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, const struct hash_ipportip4_elem *ip2, u32 *multi) @@ -72,7 +72,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportip4_data_next(struct hash_ipportip4_elem *next, const struct hash_ipportip4_elem *d) { @@ -210,7 +210,7 @@ struct hash_ipportip6_elem { /* Common functions */ -static inline bool +static bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, u32 *multi) @@ -236,7 +236,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportip6_data_next(struct hash_ipportip6_elem *next, const struct hash_ipportip6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 7a4d7afd4121..eef6ecfcb409 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -59,7 +59,7 @@ struct hash_ipportnet4_elem { /* Common functions */ -static inline bool +static bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, const struct hash_ipportnet4_elem *ip2, u32 *multi) @@ -71,25 +71,25 @@ hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } -static inline void +static void hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) { elem->ip2 &= ip_set_netmask(cidr); @@ -116,7 +116,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, const struct hash_ipportnet4_elem *d) { @@ -308,7 +308,7 @@ struct hash_ipportnet6_elem { /* Common functions */ -static inline bool +static bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, u32 *multi) @@ -320,25 +320,25 @@ hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } -static inline void +static void hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip2, cidr); @@ -365,7 +365,7 @@ nla_put_failure: return true; } -static inline void +static void hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next, const struct hash_ipportnet6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index d94c585d33c5..0b61593165ef 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -37,7 +37,7 @@ struct hash_mac4_elem { /* Common functions */ -static inline bool +static bool hash_mac4_data_equal(const struct hash_mac4_elem *e1, const struct hash_mac4_elem *e2, u32 *multi) @@ -45,7 +45,7 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1, return ether_addr_equal(e1->ether, e2->ether); } -static inline bool +static bool hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) { if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) @@ -56,7 +56,7 @@ nla_put_failure: return true; } -static inline void +static void hash_mac4_data_next(struct hash_mac4_elem *next, const struct hash_mac4_elem *e) { diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index c259cbc3ef45..136cf0781d3a 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -47,7 +47,7 @@ struct hash_net4_elem { /* Common functions */ -static inline bool +static bool hash_net4_data_equal(const struct hash_net4_elem *ip1, const struct hash_net4_elem *ip2, u32 *multi) @@ -56,25 +56,25 @@ hash_net4_data_equal(const struct 
hash_net4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_net4_do_data_match(const struct hash_net4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -97,7 +97,7 @@ nla_put_failure: return true; } -static inline void +static void hash_net4_data_next(struct hash_net4_elem *next, const struct hash_net4_elem *d) { @@ -212,7 +212,7 @@ struct hash_net6_elem { /* Common functions */ -static inline bool +static bool hash_net6_data_equal(const struct hash_net6_elem *ip1, const struct hash_net6_elem *ip2, u32 *multi) @@ -221,25 +221,25 @@ hash_net6_data_equal(const struct hash_net6_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_net6_do_data_match(const struct hash_net6_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } -static inline void +static void hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -262,7 +262,7 @@ nla_put_failure: return true; } -static inline void +static void hash_net6_data_next(struct hash_net6_elem *next, const struct hash_net6_elem *d) { @@ -368,6 +368,7 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 87b29f971226..be5e95a0d876 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -25,7 +25,8 @@ /* 3 Counters support added */ /* 4 Comments support added */ /* 5 Forceadd support added */ -#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */ +/* 6 skbinfo support added */ +#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); @@ -57,12 +58,13 @@ struct hash_netiface4_elem { u8 cidr; u8 nomatch; u8 elem; + u8 wildcard; char iface[IFNAMSIZ]; }; /* Common functions */ -static inline bool +static bool hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, const struct hash_netiface4_elem *ip2, u32 *multi) @@ -71,28 +73,30 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && 
ip1->physdev == ip2->physdev && - strcmp(ip1->iface, ip2->iface) == 0; + (ip1->wildcard ? + strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 : + strcmp(ip1->iface, ip2->iface) == 0); } -static inline int +static int hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -103,7 +107,8 @@ static bool hash_netiface4_data_list(struct sk_buff *skb, const struct hash_netiface4_elem *data) { - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) | + (data->wildcard ? 
IPSET_FLAG_IFACE_WILDCARD : 0); if (data->nomatch) flags |= IPSET_FLAG_NOMATCH; @@ -119,7 +124,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netiface4_data_next(struct hash_netiface4_elem *next, const struct hash_netiface4_elem *d) { @@ -229,6 +234,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); + if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD) + e.wildcard = 1; } if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip & ip_set_hostmask(e.cidr)); @@ -280,12 +287,13 @@ struct hash_netiface6_elem { u8 cidr; u8 nomatch; u8 elem; + u8 wildcard; char iface[IFNAMSIZ]; }; /* Common functions */ -static inline bool +static bool hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, const struct hash_netiface6_elem *ip2, u32 *multi) @@ -294,28 +302,30 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - strcmp(ip1->iface, ip2->iface) == 0; + (ip1->wildcard ? + strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 : + strcmp(ip1->iface, ip2->iface) == 0); } -static inline int +static int hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } -static inline void +static void hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -326,7 +336,8 @@ static bool hash_netiface6_data_list(struct sk_buff *skb, const struct hash_netiface6_elem *data) { - u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + u32 flags = (data->physdev ? 
IPSET_FLAG_PHYSDEV : 0) | + (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0); if (data->nomatch) flags |= IPSET_FLAG_NOMATCH; @@ -342,7 +353,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netiface6_data_next(struct hash_netiface6_elem *next, const struct hash_netiface6_elem *d) { @@ -440,6 +451,8 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); + if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD) + e.wildcard = 1; } ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index a3ae69bfee66..da4ef910b12d 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -52,7 +52,7 @@ struct hash_netnet4_elem { /* Common functions */ -static inline bool +static bool hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, const struct hash_netnet4_elem *ip2, u32 *multi) @@ -61,32 +61,32 @@ hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, ip1->ccmp == ip2->ccmp; } -static inline int +static int hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } -static inline void +static void hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, struct hash_netnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner) { if (inner) { @@ -117,7 +117,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netnet4_data_next(struct hash_netnet4_elem *next, const struct hash_netnet4_elem *d) { @@ -282,7 +282,7 @@ struct hash_netnet6_elem { /* Common functions */ -static inline bool +static bool hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, const struct hash_netnet6_elem *ip2, u32 *multi) @@ -292,32 +292,32 @@ hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, ip1->ccmp == ip2->ccmp; } -static inline int +static int hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } -static inline void +static void hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } -static inline void +static void hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, struct hash_netnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) { if (inner) { @@ -348,7 +348,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netnet6_data_next(struct hash_netnet6_elem *next, const struct hash_netnet6_elem *d) { @@ -476,6 +476,7 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 799f2272cc65..34448df80fb9 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -57,7 +57,7 @@ struct hash_netport4_elem { /* Common functions */ -static inline bool +static bool hash_netport4_data_equal(const struct hash_netport4_elem *ip1, const struct hash_netport4_elem *ip2, u32 *multi) @@ -68,25 +68,25 @@ hash_netport4_data_equal(const struct hash_netport4_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_netport4_do_data_match(const struct hash_netport4_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } -static inline void +static void hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); @@ -112,7 +112,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netport4_data_next(struct hash_netport4_elem *next, const struct hash_netport4_elem *d) { @@ -270,7 +270,7 @@ struct hash_netport6_elem { /* Common functions */ -static inline bool +static bool hash_netport6_data_equal(const struct hash_netport6_elem *ip1, const struct hash_netport6_elem *ip2, u32 *multi) @@ -281,25 +281,25 @@ hash_netport6_data_equal(const struct hash_netport6_elem *ip1, ip1->cidr == ip2->cidr; } -static inline int +static int hash_netport6_do_data_match(const struct hash_netport6_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } -static inline void +static void hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); @@ -325,7 +325,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netport6_data_next(struct hash_netport6_elem *next, const struct hash_netport6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index a82b70e8b9a6..934c1712cba8 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -56,7 +56,7 @@ struct hash_netportnet4_elem { /* Common functions */ -static inline bool +static bool hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, const struct hash_netportnet4_elem *ip2, u32 *multi) @@ -67,32 +67,32 @@ hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } -static inline void +static void hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, struct hash_netportnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, u8 cidr, bool inner) { @@ -126,7 +126,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netportnet4_data_next(struct hash_netportnet4_elem *next, const struct hash_netportnet4_elem *d) { @@ -331,7 +331,7 @@ struct hash_netportnet6_elem { /* Common functions */ -static inline bool +static bool hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, const struct hash_netportnet6_elem *ip2, u32 *multi) @@ -343,32 +343,32 @@ hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, ip1->proto == ip2->proto; } -static inline int +static int hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } -static inline void +static void hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } -static inline void +static void hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } -static inline void +static void hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, struct hash_netportnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } -static inline void +static void hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, u8 cidr, bool inner) { @@ -402,7 +402,7 @@ nla_put_failure: return true; } -static inline void +static void hash_netportnet6_data_next(struct hash_netportnet6_elem *next, const struct hash_netportnet6_elem *d) { diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 6f9ead6319e0..cd747c0962fd 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -149,7 +149,7 @@ __list_set_del_rcu(struct rcu_head * rcu) kfree(e); } -static inline void +static void list_set_del(struct ip_set *set, struct set_elem *e) { struct list_set *map = set->data; @@ -160,7 +160,7 @@ list_set_del(struct ip_set *set, struct set_elem *e) call_rcu(&e->rcu, __list_set_del_rcu); } -static inline void +static void list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) { struct list_set *map = set->data; @@ -288,7 +288,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (n && !(SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(n, set)))) - n = NULL; + n = NULL; e = kzalloc(set->dsize, GFP_ATOMIC); if (!e) diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index f6f1a0d5c47d..5b672e05d758 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -135,7 +135,7 @@ config IP_VS_WRR module, choose M here. If unsure, say N. 
config IP_VS_LC - tristate "least-connection scheduling" + tristate "least-connection scheduling" ---help--- The least-connection scheduling algorithm directs network connections to the server with the least number of active @@ -145,7 +145,7 @@ config IP_VS_LC module, choose M here. If unsure, say N. config IP_VS_WLC - tristate "weighted least-connection scheduling" + tristate "weighted least-connection scheduling" ---help--- The weighted least-connection scheduling algorithm directs network connections to the server with the least active connections @@ -333,7 +333,7 @@ config IP_VS_NFCT config IP_VS_PE_SIP tristate "SIP persistence engine" - depends on IP_VS_PROTO_UDP + depends on IP_VS_PROTO_UDP depends on NF_CONNTRACK_SIP ---help--- Allow persistence based on the SIP Call-ID diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 4515056ef1c2..f9b16f2b2219 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -193,21 +193,29 @@ struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app * mutex_lock(&__ip_vs_app_mutex); + /* increase the module use count */ + if (!ip_vs_use_count_inc()) { + err = -ENOENT; + goto out_unlock; + } + list_for_each_entry(a, &ipvs->app_list, a_list) { if (!strcmp(app->name, a->name)) { err = -EEXIST; + /* decrease the module use count */ + ip_vs_use_count_dec(); goto out_unlock; } } a = kmemdup(app, sizeof(*app), GFP_KERNEL); if (!a) { err = -ENOMEM; + /* decrease the module use count */ + ip_vs_use_count_dec(); goto out_unlock; } INIT_LIST_HEAD(&a->incs_list); list_add(&a->a_list, &ipvs->app_list); - /* increase the module use count */ - ip_vs_use_count_inc(); out_unlock: mutex_unlock(&__ip_vs_app_mutex); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 46f06f92ab8f..512259f579d7 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -617,7 +617,7 @@ int ip_vs_leave(struct ip_vs_service *svc, 
struct sk_buff *skb, unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; - union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; + union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; /* create a new connection entry */ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); @@ -2402,18 +2402,22 @@ estimator_fail: return -ENOMEM; } -static void __net_exit __ip_vs_cleanup(struct net *net) +static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) { - struct netns_ipvs *ipvs = net_ipvs(net); - - ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ - ip_vs_conn_net_cleanup(ipvs); - ip_vs_app_net_cleanup(ipvs); - ip_vs_protocol_net_cleanup(ipvs); - ip_vs_control_net_cleanup(ipvs); - ip_vs_estimator_net_cleanup(ipvs); - IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); - net->ipvs = NULL; + struct netns_ipvs *ipvs; + struct net *net; + + ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */ + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_conn_net_cleanup(ipvs); + ip_vs_app_net_cleanup(ipvs); + ip_vs_protocol_net_cleanup(ipvs); + ip_vs_control_net_cleanup(ipvs); + ip_vs_estimator_net_cleanup(ipvs); + IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); + net->ipvs = NULL; + } } static int __net_init __ip_vs_dev_init(struct net *net) @@ -2429,27 +2433,32 @@ hook_fail: return ret; } -static void __net_exit __ip_vs_dev_cleanup(struct net *net) +static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) { - struct netns_ipvs *ipvs = net_ipvs(net); + struct netns_ipvs *ipvs; + struct net *net; + EnterFunction(2); - nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - ipvs->enable = 0; /* Disable packet reception */ - smp_wmb(); - ip_vs_sync_net_cleanup(ipvs); + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + nf_unregister_net_hooks(net, ip_vs_ops, 
ARRAY_SIZE(ip_vs_ops)); + ipvs->enable = 0; /* Disable packet reception */ + smp_wmb(); + ip_vs_sync_net_cleanup(ipvs); + } LeaveFunction(2); } static struct pernet_operations ipvs_core_ops = { .init = __ip_vs_init, - .exit = __ip_vs_cleanup, + .exit_batch = __ip_vs_cleanup_batch, .id = &ip_vs_net_id, .size = sizeof(struct netns_ipvs), }; static struct pernet_operations ipvs_core_dev_ops = { .init = __ip_vs_dev_init, - .exit = __ip_vs_dev_cleanup, + .exit_batch = __ip_vs_dev_cleanup_batch, }; /* diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 060565e7d227..8d14a1acbc37 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -93,7 +93,6 @@ static bool __ip_vs_addr_is_local_v6(struct net *net, static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; - static int old_secure_tcp = 0; int availmem; int nomem; int to_change = -1; @@ -174,35 +173,35 @@ static void update_defense_level(struct netns_ipvs *ipvs) spin_lock(&ipvs->securetcp_lock); switch (ipvs->sysctl_secure_tcp) { case 0: - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; break; case 1: if (nomem) { - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; ipvs->sysctl_secure_tcp = 2; } else { - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; } break; case 2: if (nomem) { - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; } else { - if (old_secure_tcp >= 2) + if (ipvs->old_secure_tcp >= 2) to_change = 0; ipvs->sysctl_secure_tcp = 1; } break; case 3: - if (old_secure_tcp < 2) + if (ipvs->old_secure_tcp < 2) to_change = 1; break; } - old_secure_tcp = ipvs->sysctl_secure_tcp; + ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(ipvs, ipvs->sysctl_secure_tcp > 1); @@ -262,7 +261,7 @@ static inline unsigned int ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, const union 
nf_inet_addr *addr, __be16 port) { - register unsigned int porth = ntohs(port); + unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; __u32 ahash; @@ -424,7 +423,7 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol if (!svc && protocol == IPPROTO_TCP && atomic_read(&ipvs->ftpsvc_counter) && - (vport == FTPDATA || ntohs(vport) >= inet_prot_sock(ipvs->net))) { + (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) { /* * Check if ftp service entry exists, the packet * might belong to FTP data connections. @@ -493,7 +492,7 @@ static inline unsigned int ip_vs_rs_hashkey(int af, const union nf_inet_addr *addr, __be16 port) { - register unsigned int porth = ntohs(port); + unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 @@ -1275,7 +1274,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service *svc = NULL; /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOPROTOOPT; /* Lookup the scheduler by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { @@ -1607,14 +1607,20 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) /* * Delete service by {netns} in the service table. 
- * Called by __ip_vs_cleanup() + * Called by __ip_vs_batch_cleanup() */ -void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs) +void ip_vs_service_nets_cleanup(struct list_head *net_list) { + struct netns_ipvs *ipvs; + struct net *net; + EnterFunction(2); /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); - ip_vs_flush(ipvs, true); + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_flush(ipvs, true); + } mutex_unlock(&__ip_vs_mutex); LeaveFunction(2); } @@ -1737,12 +1743,18 @@ proc_do_defense_mode(struct ctl_table *table, int write, int val = *valp; int rc; - rc = proc_dointvec(table, write, buffer, lenp, ppos); + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write && (*valp != val)) { - if ((*valp < 0) || (*valp > 3)) { - /* Restore the correct value */ - *valp = val; + if (val < 0 || val > 3) { + rc = -EINVAL; } else { + *valp = val; update_defense_level(ipvs); } } @@ -1756,33 +1768,20 @@ proc_do_sync_threshold(struct ctl_table *table, int write, int *valp = table->data; int val[2]; int rc; + struct ctl_table tmp = { + .data = &val, + .maxlen = table->maxlen, + .mode = table->mode, + }; - /* backup the value first */ memcpy(val, valp, sizeof(val)); - - rc = proc_dointvec(table, write, buffer, lenp, ppos); - if (write && (valp[0] < 0 || valp[1] < 0 || - (valp[0] >= valp[1] && valp[1]))) { - /* Restore the correct value */ - memcpy(valp, val, sizeof(val)); - } - return rc; -} - -static int -proc_do_sync_mode(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int *valp = table->data; - int val = *valp; - int rc; - - rc = proc_dointvec(table, write, buffer, lenp, ppos); - if (write && (*valp != val)) { - if ((*valp < 0) || (*valp > 1)) { - /* Restore the correct value */ - *valp = val; - } + rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (write) { + if 
(val[0] < 0 || val[1] < 0 || + (val[0] >= val[1] && val[1])) + rc = -EINVAL; + else + memcpy(valp, val, sizeof(val)); } return rc; } @@ -1795,12 +1794,18 @@ proc_do_sync_ports(struct ctl_table *table, int write, int val = *valp; int rc; - rc = proc_dointvec(table, write, buffer, lenp, ppos); + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write && (*valp != val)) { - if (*valp < 1 || !is_power_of_2(*valp)) { - /* Restore the correct value */ + if (val < 1 || !is_power_of_2(val)) + rc = -EINVAL; + else *valp = val; - } } return rc; } @@ -1860,7 +1865,9 @@ static struct ctl_table vs_vars[] = { .procname = "sync_version", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_do_sync_mode, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "sync_ports", @@ -2434,9 +2441,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (copy_from_user(arg, user, len) != 0) return -EFAULT; - /* increase the module use count */ - ip_vs_use_count_inc(); - /* Handle daemons since they have another lock */ if (cmd == IP_VS_SO_SET_STARTDAEMON || cmd == IP_VS_SO_SET_STOPDAEMON) { @@ -2449,13 +2453,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = -EINVAL; if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, sizeof(cfg.mcast_ifn)) <= 0) - goto out_dec; + return ret; cfg.syncid = dm->syncid; ret = start_sync_thread(ipvs, &cfg, dm->state); } else { ret = stop_sync_thread(ipvs, dm->state); } - goto out_dec; + return ret; } mutex_lock(&__ip_vs_mutex); @@ -2550,10 +2554,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) out_unlock: mutex_unlock(&__ip_vs_mutex); - out_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - return ret; } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c 
index c8b5a504476c..77c323c36a88 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -160,7 +160,7 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) /* get weighted least-connection node in the destination set */ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) { - register struct ip_vs_dest_set_elem *e; + struct ip_vs_dest_set_elem *e; struct ip_vs_dest *dest, *least; int loh, doh; @@ -209,7 +209,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) /* get weighted most-connection node in the destination set */ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) { - register struct ip_vs_dest_set_elem *e; + struct ip_vs_dest_set_elem *e; struct ip_vs_dest *dest, *most; int moh, doh; diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c index 94d9d349ebb0..da0280cec506 100644 --- a/net/netfilter/ipvs/ip_vs_mh.c +++ b/net/netfilter/ipvs/ip_vs_mh.c @@ -174,8 +174,8 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s, return 0; } - table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE), - sizeof(unsigned long), GFP_KERNEL); + table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE), + sizeof(unsigned long), GFP_KERNEL); if (!table) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c index 78b074cd5464..c03066fdd5ca 100644 --- a/net/netfilter/ipvs/ip_vs_ovf.c +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -5,7 +5,7 @@ * Authors: Raducu Deaconu <rhadoo_io@yahoo.com> * * Scheduler implements "overflow" loadbalancing according to number of active - * connections , will keep all conections to the node with the highest weight + * connections , will keep all connections to the node with the highest weight * and overflow to the next node if the number of connections exceeds the node's * weight. 
* Note that this scheduler might not be suitable for UDP because it only uses diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 8e104dff7abc..166c669f0763 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -68,7 +68,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) struct ip_vs_pe *tmp; /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOENT; mutex_lock(&ip_vs_pe_mutex); /* Make sure that the pe with this name doesn't exist diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 000d961b97e4..32b028853a7c 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -710,7 +710,7 @@ static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd sizeof(tcp_timeouts)); if (!pd->timeout_table) return -ENOMEM; - pd->tcp_state_table = tcp_states; + pd->tcp_state_table = tcp_states; return 0; } diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 2f9d5cd5daee..d4903723be7e 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -179,7 +179,8 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) } /* increase the module use count */ - ip_vs_use_count_inc(); + if (!ip_vs_use_count_inc()) + return -ENOENT; mutex_lock(&ip_vs_sched_mutex); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index a4a78c4b06de..605e0f68f8bd 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1239,7 +1239,7 @@ static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, p = msg_end; if (p + sizeof(s->v4) > buffer+buflen) { - IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); + IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n"); return; } s = (union ip_vs_sync_conn *)p; @@ -1762,6 +1762,10 @@ int start_sync_thread(struct netns_ipvs *ipvs, 
struct ipvs_sync_daemon_cfg *c, IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", sizeof(struct ip_vs_sync_conn_v0)); + /* increase the module use count */ + if (!ip_vs_use_count_inc()) + return -ENOPROTOOPT; + /* Do not hold one mutex and then to block on another */ for (;;) { rtnl_lock(); @@ -1892,9 +1896,6 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); - /* increase the module use count */ - ip_vs_use_count_inc(); - return 0; out: @@ -1924,11 +1925,17 @@ out: } kfree(ti); } + + /* decrease the module use count */ + ip_vs_use_count_dec(); return result; out_early: mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); + + /* decrease the module use count */ + ip_vs_use_count_dec(); return result; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 9c464d24beec..b00866d777fe 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -208,7 +208,7 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) struct rtable *ort = skb_rtable(skb); if (!skb->dev && sk && sk_fullsock(sk)) - ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true); } static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, @@ -407,12 +407,9 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, goto err_put; skb_dst_drop(skb); - if (noref) { - if (!local) - skb_dst_set_noref(skb, &rt->dst); - else - skb_dst_set(skb, dst_clone(&rt->dst)); - } else + if (noref) + skb_dst_set_noref(skb, &rt->dst); + else skb_dst_set(skb, &rt->dst); return local; @@ -574,12 +571,9 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, goto err_put; skb_dst_drop(skb); - if (noref) { - if (!local) - skb_dst_set_noref(skb, &rt->dst); - else - skb_dst_set(skb, dst_clone(&rt->dst)); - } else + if (noref) + 
skb_dst_set_noref(skb, &rt->dst); + else skb_dst_set(skb, &rt->dst); return local; @@ -613,7 +607,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) ret = ip_vs_confirm_conntrack(skb); if (ret == NF_ACCEPT) { - nf_reset(skb); + nf_reset_ct(skb); skb_forward_csum(skb); } return ret; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 81a8ef42b88d..d1305423640f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -73,8 +73,7 @@ struct conntrack_gc_work { }; static __read_mostly struct kmem_cache *nf_conntrack_cachep; -static __read_mostly spinlock_t nf_conntrack_locks_all_lock; -static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); +static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; /* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ @@ -574,7 +573,6 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); void nf_ct_tmpl_free(struct nf_conn *tmpl) { nf_ct_ext_destroy(tmpl); - nf_ct_ext_free(tmpl); if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) kfree((char *)tmpl - tmpl->proto.tmpl_padto); @@ -897,9 +895,10 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, } /* Resolve race on insertion if this protocol allows this. */ -static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conntrack_tuple_hash *h) +static __cold noinline int +nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_tuple_hash *h) { /* This is the conntrack entry already in hashes that won race. 
*/ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -1418,7 +1417,6 @@ void nf_conntrack_free(struct nf_conn *ct) WARN_ON(atomic_read(&ct->ct_general.use) != 0); nf_ct_ext_destroy(ct); - nf_ct_ext_free(ct); kmem_cache_free(nf_conntrack_cachep, ct); smp_mb__before_atomic(); atomic_dec(&net->ct.count); @@ -1793,8 +1791,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, if (nf_ct_is_confirmed(ct)) extra_jiffies += nfct_time_stamp; - if (ct->timeout != extra_jiffies) - ct->timeout = extra_jiffies; + if (READ_ONCE(ct->timeout) != extra_jiffies) + WRITE_ONCE(ct->timeout, extra_jiffies); acct: if (do_acct) nf_ct_acct_update(ct, ctinfo, skb->len); @@ -2250,8 +2248,7 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); - hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head), - GFP_KERNEL | __GFP_ZERO); + hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); if (hash && nulls) for (i = 0; i < nr_slots; i++) @@ -2336,7 +2333,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) return nf_conntrack_hash_resize(hashsize); } -EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); static __always_inline unsigned int total_extension_size(void) { diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 5e2812ee2149..7956c9f19899 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -24,11 +24,13 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> static DEFINE_MUTEX(nf_ct_ecache_mutex); #define ECACHE_RETRY_WAIT (HZ/10) +#define ECACHE_STACK_ALLOC (256 / sizeof(void *)) enum retry_state { STATE_CONGESTED, @@ -38,11 +40,11 @@ enum retry_state { static enum 
retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) { - struct nf_conn *refs[16]; + struct nf_conn *refs[ECACHE_STACK_ALLOC]; + enum retry_state ret = STATE_DONE; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int evicted = 0; - enum retry_state ret = STATE_DONE; spin_lock(&pcpu->lock); @@ -53,10 +55,22 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) if (!nf_ct_is_confirmed(ct)) continue; + /* This ecache access is safe because the ct is on the + * pcpu dying list and we hold the spinlock -- the entry + * cannot be free'd until after the lock is released. + * + * This is true even if ct has a refcount of 0: the + * cpu that is about to free the entry must remove it + * from the dying list and needs the lock to do so. + */ e = nf_ct_ecache_find(ct); if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL) continue; + /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means + * the worker owns this entry: the ct will remain valid + * until the worker puts its ct reference. + */ if (nf_conntrack_event(IPCT_DESTROY, ct)) { ret = STATE_CONGESTED; break; @@ -188,15 +202,15 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct) if (notify == NULL) goto out_unlock; + if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) + goto out_unlock; + e = nf_ct_ecache_find(ct); if (e == NULL) goto out_unlock; events = xchg(&e->cache, 0); - if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) - goto out_unlock; - /* We make a copy of the missed event cache without taking * the lock, thus we may send missed events twice. However, * this does not harm and it happens very rarely. 
*/ diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 65364de915d1..42557d2b6a90 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -25,8 +25,10 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index d4ed1e197921..3dbe2329c3f1 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -34,21 +34,23 @@ void nf_ct_ext_destroy(struct nf_conn *ct) t->destroy(ct); rcu_read_unlock(); } + + kfree(ct->ext); } -EXPORT_SYMBOL(nf_ct_ext_destroy); void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { unsigned int newlen, newoff, oldlen, alloc; - struct nf_ct_ext *old, *new; struct nf_ct_ext_type *t; + struct nf_ct_ext *new; /* Conntrack must not be confirmed to avoid races on reallocation. 
*/ WARN_ON(nf_ct_is_confirmed(ct)); - old = ct->ext; - if (old) { + if (ct->ext) { + const struct nf_ct_ext *old = ct->ext; + if (__nf_ct_ext_exist(old, id)) return NULL; oldlen = old->len; @@ -68,22 +70,18 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) rcu_read_unlock(); alloc = max(newlen, NF_CT_EXT_PREALLOC); - kmemleak_not_leak(old); - new = __krealloc(old, alloc, gfp); + new = krealloc(ct->ext, alloc, gfp); if (!new) return NULL; - if (!old) { + if (!ct->ext) memset(new->offset, 0, sizeof(new->offset)); - ct->ext = new; - } else if (new != old) { - kfree_rcu(old, rcu); - rcu_assign_pointer(ct->ext, new); - } new->offset[id] = newoff; new->len = newlen; memset((void *)new + newoff, 0, newlen - newoff); + + ct->ext = new; return (void *)new + newoff; } EXPORT_SYMBOL(nf_ct_ext_add); diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 0ecb3e289ef2..9eca90414bb7 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -162,7 +162,7 @@ static int try_rfc959(const char *data, size_t dlen, if (length == 0) return 0; - cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | + cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]); cmd->u.tcp.port = htons((array[4] << 8) | array[5]); return length; @@ -322,7 +322,7 @@ static int find_pattern(const char *data, size_t dlen, i++; } - pr_debug("Skipped up to `%c'!\n", skip); + pr_debug("Skipped up to 0x%hhx delimiter!\n", skip); *numoff = i; *numlen = getnum(data + i, dlen - i, cmd, term, numoff); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 8d729e7c36ff..118f415928ae 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -21,10 +21,11 @@ #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_helper.h> #include 
<net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_log.h> static DEFINE_MUTEX(nf_ct_helper_mutex); diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 74b8113f7aeb..522792556632 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -11,7 +11,7 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> -static spinlock_t nf_connlabels_lock; +static DEFINE_SPINLOCK(nf_connlabels_lock); static int replace_u32(u32 *address, u32 mask, u32 new) { @@ -89,7 +89,6 @@ int nf_conntrack_labels_init(void) { BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX); - spin_lock_init(&nf_connlabels_lock); return nf_ct_extend_register(&labels_extend); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 6aa01eb6fe99..6a1c8f1f6171 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -506,9 +506,45 @@ nla_put_failure: return -1; } +/* all these functions access ct->ext. Caller must either hold a reference + * on ct or prevent its deletion by holding either the bucket spinlock or + * pcpu dying list lock. 
+ */ +static int ctnetlink_dump_extinfo(struct sk_buff *skb, + struct nf_conn *ct, u32 type) +{ + if (ctnetlink_dump_acct(skb, ct, type) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_labels(skb, ct) < 0 || + ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || + ctnetlink_dump_ct_synproxy(skb, ct) < 0) + return -1; + + return 0; +} + +static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) +{ + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_secctx(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0 || + ctnetlink_dump_master(skb, ct) < 0) + return -1; + + if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && + (ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0)) + return -1; + + return 0; +} + static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, - struct nf_conn *ct) + struct nf_conn *ct, bool extinfo) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; @@ -552,20 +588,9 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; - if (ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_timeout(skb, ct) < 0 || - ctnetlink_dump_acct(skb, ct, type) < 0 || - ctnetlink_dump_timestamp(skb, ct) < 0 || - ctnetlink_dump_protoinfo(skb, ct) < 0 || - ctnetlink_dump_helpinfo(skb, ct) < 0 || - ctnetlink_dump_mark(skb, ct) < 0 || - ctnetlink_dump_secctx(skb, ct) < 0 || - ctnetlink_dump_labels(skb, ct) < 0 || - ctnetlink_dump_id(skb, ct) < 0 || - ctnetlink_dump_use(skb, ct) < 0 || - ctnetlink_dump_master(skb, ct) < 0 || - ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || - ctnetlink_dump_ct_synproxy(skb, ct) < 0) + if (ctnetlink_dump_info(skb, ct) < 0) + goto nla_put_failure; + if (extinfo && ctnetlink_dump_extinfo(skb, ct, type) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -950,13 +975,11 @@ restart: if 
(!ctnetlink_filter_match(ct, cb->data)) continue; - rcu_read_lock(); res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct); - rcu_read_unlock(); + ct, true); if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; @@ -1361,10 +1384,8 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, return -ENOMEM; } - rcu_read_lock(); err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct); - rcu_read_unlock(); + NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true); nf_ct_put(ct); if (err <= 0) goto free; @@ -1426,12 +1447,18 @@ restart: continue; cb->args[1] = 0; } - rcu_read_lock(); + + /* We can't dump extension info for the unconfirmed + * list because unconfirmed conntracks can have + * ct->ext reallocated (and thus freed). + * + * In the dying list case ct->ext can't be free'd + * until after we drop pcpu->lock. + */ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct); - rcu_read_unlock(); + ct, dying ? true : false); if (res < 0) { if (!atomic_inc_not_zero(&ct->ct_general.use)) continue; @@ -3599,6 +3626,9 @@ static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) list_for_each_entry(net, net_exit_list, exit_list) ctnetlink_net_exit(net); + + /* wait for other cpus until they are done with ctnl_notifiers */ + synchronize_rcu(); } static struct pernet_operations ctnetlink_net_ops = { diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index b6b14db3955b..b3f4a334f9d7 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -677,6 +677,9 @@ static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeouts = data; int i; + if (!timeouts) + timeouts = dn->dccp_timeout; + /* set default DCCP timeouts. 
*/ for (i=0; i<CT_DCCP_MAX; i++) timeouts[i] = dn->dccp_timeout[i]; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 097deba7441a..c2e3dff773bc 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -235,11 +235,7 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, } /* Need to track icmp error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) + if (!icmp_is_err(icmph->type)) return NF_ACCEPT; memset(&outer_daddr, 0, sizeof(outer_daddr)); diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 7e317e6698ba..6f9144e1f1c1 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -22,7 +22,6 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> -#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> #include <net/netfilter/nf_log.h> static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index fce3d93f1541..4f897b14b606 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -114,7 +114,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, +/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, @@ -130,7 +130,7 @@ static const u8 
sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { /* REPLY */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, +/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, @@ -316,7 +316,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } - ct->proto.sctp.state = new_state; + ct->proto.sctp.state = SCTP_CONNTRACK_NONE; } return true; @@ -594,6 +594,9 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; + if (!timeouts) + timeouts = sn->timeouts; + /* set default SCTP timeouts. */ for (i=0; i<SCTP_CONNTRACK_MAX; i++) timeouts[i] = sn->timeouts[i]; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 85c1f8c213b0..1926fd56df56 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1227,7 +1227,7 @@ static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) }, - [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, + [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, }; #define TCP_NLATTR_SIZE ( \ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index e0d392cb3075..410809c669e1 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -511,8 +511,6 @@ static 
void nf_conntrack_standalone_fini_proc(struct net *net) /* Log invalid packets of a given protocol */ static int log_invalid_proto_min __read_mostly; static int log_invalid_proto_max __read_mostly = 255; -static int zero; -static int one = 1; /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; @@ -629,8 +627,8 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_LOG_INVALID] = { .procname = "nf_conntrack_log_invalid", @@ -654,8 +652,8 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_HELPER] = { .procname = "nf_conntrack_helper", @@ -663,8 +661,8 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_NF_CONNTRACK_EVENTS [NF_SYSCTL_CT_EVENTS] = { @@ -673,8 +671,8 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP @@ -684,8 +682,8 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif [NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC] = { @@ -759,16 +757,16 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, 
+ .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = { .procname = "nf_conntrack_tcp_be_liberal", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = { .procname = "nf_conntrack_tcp_max_retrans", @@ -904,8 +902,8 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_NF_CT_PROTO_GRE @@ -1037,9 +1035,14 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_COUNT].data = &net->ct.count; table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; + table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; + table[NF_SYSCTL_CT_HELPER].data = &net->ct.sysctl_auto_assign_helper; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + table[NF_SYSCTL_CT_TIMESTAMP].data = &net->ct.sysctl_tstamp; +#endif table[NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC].data = &nf_generic_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP].data = &nf_icmp_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout; @@ -1164,7 +1167,6 @@ static int __init nf_conntrack_standalone_init(void) if (ret < 0) goto out_start; - BUILD_BUG_ON(SKB_NFCT_PTRMASK != NFCT_PTRMASK); BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER); #ifdef CONFIG_SYSCTL diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index 13d0f4a92647..14387e0b8008 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -19,6 +19,7 @@ #include <net/netfilter/nf_conntrack.h> #include 
<net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_timeout.h> struct nf_ct_timeout * diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index 5a35ef08c3cb..f108a76925dd 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -10,6 +10,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_dup_netdev.h> static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev) @@ -50,5 +51,25 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif) } EXPORT_SYMBOL_GPL(nf_dup_netdev_egress); +int nft_fwd_dup_netdev_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + enum flow_action_id id, int oif) +{ + struct flow_action_entry *entry; + struct net_device *dev; + + /* nft_flow_rule_destroy() releases the reference on this device. 
*/ + dev = dev_get_by_index(ctx->net, oif); + if (!dev) + return -EOPNOTSUPP; + + entry = &flow->rule->action.entries[ctx->num_actions++]; + entry->id = id; + entry->dev = dev; + + return 0; +} +EXPORT_SYMBOL_GPL(nft_fwd_dup_netdev_offload); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 80a8f9ae4c93..8af28e10b4e6 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -11,26 +11,18 @@ #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> -struct flow_offload_entry { - struct flow_offload flow; - struct nf_conn *ct; - struct rcu_head rcu_head; -}; - static DEFINE_MUTEX(flowtable_lock); static LIST_HEAD(flowtables); static void -flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, - struct nf_flow_route *route, +flow_offload_fill_dir(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; - struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple; - struct dst_entry *other_dst = route->tuple[!dir].dst; - struct dst_entry *dst = route->tuple[dir].dst; + struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple; ft->dir = dir; @@ -38,12 +30,10 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, case NFPROTO_IPV4: ft->src_v4 = ctt->src.u3.in; ft->dst_v4 = ctt->dst.u3.in; - ft->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: ft->src_v6 = ctt->src.u3.in6; ft->dst_v6 = ctt->dst.u3.in6; - ft->mtu = ip6_dst_mtu_forward(dst); break; } @@ -51,49 +41,32 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, ft->l4proto = ctt->dst.protonum; ft->src_port = ctt->src.u.tcp.port; ft->dst_port = 
ctt->dst.u.tcp.port; - - ft->iifidx = other_dst->dev->ifindex; - ft->dst_cache = dst; } -struct flow_offload * -flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) +struct flow_offload *flow_offload_alloc(struct nf_conn *ct) { - struct flow_offload_entry *entry; struct flow_offload *flow; if (unlikely(nf_ct_is_dying(ct) || !atomic_inc_not_zero(&ct->ct_general.use))) return NULL; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); - if (!entry) + flow = kzalloc(sizeof(*flow), GFP_ATOMIC); + if (!flow) goto err_ct_refcnt; - flow = &entry->flow; + flow->ct = ct; - if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst)) - goto err_dst_cache_original; - - if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst)) - goto err_dst_cache_reply; - - entry->ct = ct; - - flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL); - flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY); + flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) - flow->flags |= FLOW_OFFLOAD_SNAT; + __set_bit(NF_FLOW_SNAT, &flow->flags); if (ct->status & IPS_DST_NAT) - flow->flags |= FLOW_OFFLOAD_DNAT; + __set_bit(NF_FLOW_DNAT, &flow->flags); return flow; -err_dst_cache_reply: - dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); -err_dst_cache_original: - kfree(entry); err_ct_refcnt: nf_ct_put(ct); @@ -101,6 +74,56 @@ err_ct_refcnt: } EXPORT_SYMBOL_GPL(flow_offload_alloc); +static int flow_offload_fill_route(struct flow_offload *flow, + const struct nf_flow_route *route, + enum flow_offload_tuple_dir dir) +{ + struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; + struct dst_entry *other_dst = route->tuple[!dir].dst; + struct dst_entry *dst = route->tuple[dir].dst; + + if (!dst_hold_safe(route->tuple[dir].dst)) + return -1; + + switch (flow_tuple->l3proto) { + case NFPROTO_IPV4: + flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); + break; + case 
NFPROTO_IPV6: + flow_tuple->mtu = ip6_dst_mtu_forward(dst); + break; + } + + flow_tuple->iifidx = other_dst->dev->ifindex; + flow_tuple->dst_cache = dst; + + return 0; +} + +int flow_offload_route_init(struct flow_offload *flow, + const struct nf_flow_route *route) +{ + int err; + + err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); + if (err < 0) + return err; + + err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); + if (err < 0) + goto err_route_reply; + + flow->type = NF_FLOW_OFFLOAD_ROUTE; + + return 0; + +err_route_reply: + dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); + + return err; +} +EXPORT_SYMBOL_GPL(flow_offload_route_init); + static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) { tcp->state = TCP_CONNTRACK_ESTABLISHED; @@ -111,11 +134,6 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) #define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) #define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) -static inline __s32 nf_flow_timeout_delta(unsigned int timeout) -{ - return (__s32)(timeout - (u32)jiffies); -} - static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; @@ -149,17 +167,23 @@ static void flow_offload_fixup_ct(struct nf_conn *ct) flow_offload_fixup_ct_timeout(ct); } -void flow_offload_free(struct flow_offload *flow) +static void flow_offload_route_release(struct flow_offload *flow) { - struct flow_offload_entry *e; - dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); - e = container_of(flow, struct flow_offload_entry, flow); - if (flow->flags & FLOW_OFFLOAD_DYING) - nf_ct_delete(e->ct, 0, 0); - nf_ct_put(e->ct); - kfree_rcu(e, rcu_head); +} + +void flow_offload_free(struct flow_offload *flow) +{ + switch (flow->type) { + case NF_FLOW_OFFLOAD_ROUTE: + flow_offload_route_release(flow); + break; + default: + break; + } + nf_ct_put(flow->ct); + 
kfree_rcu(flow, rcu_head); } EXPORT_SYMBOL_GPL(flow_offload_free); @@ -201,6 +225,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; + flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; + err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, nf_flow_offload_rhash_params); @@ -217,7 +243,11 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } - flow->timeout = (u32)jiffies; + if (nf_flowtable_hw_offload(flow_table)) { + __set_bit(NF_FLOW_HW, &flow->flags); + nf_flow_offload_add(flow_table, flow); + } + return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); @@ -230,8 +260,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow) static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { - struct flow_offload_entry *e; - rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, nf_flow_offload_rhash_params); @@ -239,25 +267,21 @@ static void flow_offload_del(struct nf_flowtable *flow_table, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, nf_flow_offload_rhash_params); - e = container_of(flow, struct flow_offload_entry, flow); - clear_bit(IPS_OFFLOAD_BIT, &e->ct->status); + clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); if (nf_flow_has_expired(flow)) - flow_offload_fixup_ct(e->ct); - else if (flow->flags & FLOW_OFFLOAD_TEARDOWN) - flow_offload_fixup_ct_timeout(e->ct); + flow_offload_fixup_ct(flow->ct); + else if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) + flow_offload_fixup_ct_timeout(flow->ct); flow_offload_free(flow); } void flow_offload_teardown(struct flow_offload *flow) { - struct flow_offload_entry *e; - - flow->flags |= FLOW_OFFLOAD_TEARDOWN; + set_bit(NF_FLOW_TEARDOWN, &flow->flags); - e = container_of(flow, struct flow_offload_entry, flow); - flow_offload_fixup_ct_state(e->ct); + flow_offload_fixup_ct_state(flow->ct); } 
EXPORT_SYMBOL_GPL(flow_offload_teardown); @@ -267,7 +291,6 @@ flow_offload_lookup(struct nf_flowtable *flow_table, { struct flow_offload_tuple_rhash *tuplehash; struct flow_offload *flow; - struct flow_offload_entry *e; int dir; tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, @@ -277,11 +300,10 @@ flow_offload_lookup(struct nf_flowtable *flow_table, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)) + if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) return NULL; - e = container_of(flow, struct flow_offload_entry, flow); - if (unlikely(nf_ct_is_dying(e->ct))) + if (unlikely(nf_ct_is_dying(flow->ct))) return NULL; return tuplehash; @@ -325,12 +347,20 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { struct nf_flowtable *flow_table = data; - struct flow_offload_entry *e; - e = container_of(flow, struct flow_offload_entry, flow); - if (nf_flow_has_expired(flow) || nf_ct_is_dying(e->ct) || - (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) - flow_offload_del(flow_table, flow); + if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || + test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { + if (test_bit(NF_FLOW_HW, &flow->flags)) { + if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) + nf_flow_offload_del(flow_table, flow); + else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags)) + flow_offload_del(flow_table, flow); + } else { + flow_offload_del(flow_table, flow); + } + } else if (test_bit(NF_FLOW_HW, &flow->flags)) { + nf_flow_offload_stats(flow_table, flow); + } } static void nf_flow_offload_work_gc(struct work_struct *work) @@ -463,6 +493,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable) int err; INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); + flow_block_init(&flowtable->flow_block); err = rhashtable_init(&flowtable->rhashtable, 
&nf_flow_offload_rhash_params); @@ -483,18 +514,16 @@ EXPORT_SYMBOL_GPL(nf_flow_table_init); static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) { struct net_device *dev = data; - struct flow_offload_entry *e; - - e = container_of(flow, struct flow_offload_entry, flow); if (!dev) { flow_offload_teardown(flow); return; } - if (net_eq(nf_ct_net(e->ct), dev_net(dev)) && + + if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) && (flow->tuplehash[0].tuple.iifidx == dev->ifindex || flow->tuplehash[1].tuple.iifidx == dev->ifindex)) - flow_offload_dead(flow); + flow_offload_teardown(flow); } static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, @@ -502,6 +531,7 @@ static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, { nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); + nf_flow_table_offload_flush(flowtable); } void nf_flow_table_cleanup(struct net_device *dev) @@ -523,9 +553,23 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); + nf_flow_table_offload_flush(flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); +static int __init nf_flow_table_module_init(void) +{ + return nf_flow_table_offload_init(); +} + +static void __exit nf_flow_table_module_exit(void) +{ + nf_flow_table_offload_exit(); +} + +module_init(nf_flow_table_module_init); +module_exit(nf_flow_table_module_exit); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 593357aedb36..88bedf1ff1ae 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -21,9 +21,34 @@ nf_flow_offload_inet_hook(void *priv, struct 
sk_buff *skb, return NF_ACCEPT; } +static int nf_flow_rule_route_inet(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + const struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; + int err; + + switch (flow_tuple->l3proto) { + case NFPROTO_IPV4: + err = nf_flow_rule_route_ipv4(net, flow, dir, flow_rule); + break; + case NFPROTO_IPV6: + err = nf_flow_rule_route_ipv6(net, flow, dir, flow_rule); + break; + default: + err = -1; + break; + } + + return err; +} + static struct nf_flowtable_type flowtable_inet = { .family = NFPROTO_INET, .init = nf_flow_table_init, + .setup = nf_flow_table_offload_setup, + .action = nf_flow_rule_route_inet, .free = nf_flow_table_free, .hook = nf_flow_offload_inet_hook, .owner = THIS_MODULE, diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index d68c801dd614..9e563fd3da0f 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -144,11 +144,11 @@ static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); - if (flow->flags & FLOW_OFFLOAD_SNAT && + if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) return -1; - if (flow->flags & FLOW_OFFLOAD_DNAT && + if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) return -1; @@ -228,11 +228,17 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, { skb_orphan(skb); skb_dst_set_noref(skb, dst); - skb->tstamp = 0; dst_output(state->net, state->sk, skb); return NF_STOLEN; } +static bool nf_flow_offload_refresh(struct nf_flowtable *flow_table, + struct flow_offload *flow) +{ + return nf_flowtable_hw_offload(flow_table) && + test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags); +} + 
unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -273,6 +279,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) return NF_ACCEPT; + if (unlikely(nf_flow_offload_refresh(flow_table, flow))) + nf_flow_offload_add(flow_table, flow); + if (nf_flow_offload_dst_check(&rt->dst)) { flow_offload_teardown(flow); return NF_ACCEPT; @@ -281,9 +290,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0) return NF_DROP; - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; iph = ip_hdr(skb); ip_decrease_ttl(iph); + skb->tstamp = 0; if (unlikely(dst_xfrm(&rt->dst))) { memset(skb->cb, 0, sizeof(struct inet_skb_parm)); @@ -414,11 +424,11 @@ static int nf_flow_nat_ipv6(const struct flow_offload *flow, struct ipv6hdr *ip6h = ipv6_hdr(skb); unsigned int thoff = sizeof(*ip6h); - if (flow->flags & FLOW_OFFLOAD_SNAT && + if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) return -1; - if (flow->flags & FLOW_OFFLOAD_DNAT && + if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) return -1; @@ -498,6 +508,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, sizeof(*ip6h))) return NF_ACCEPT; + if (unlikely(nf_flow_offload_refresh(flow_table, flow))) + nf_flow_offload_add(flow_table, flow); + if (nf_flow_offload_dst_check(&rt->dst)) { flow_offload_teardown(flow); return NF_ACCEPT; @@ -509,9 +522,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (nf_flow_nat_ipv6(flow, skb, dir) < 0) return NF_DROP; - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; 
ip6h = ipv6_hdr(skb); ip6h->hop_limit--; + skb->tstamp = 0; if (unlikely(dst_xfrm(&rt->dst))) { memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c new file mode 100644 index 000000000000..83e1db37c3b0 --- /dev/null +++ b/net/netfilter/nf_flow_table_offload.c @@ -0,0 +1,905 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/netdevice.h> +#include <linux/tc_act/tc_csum.h> +#include <net/flow_offload.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_tuple.h> + +static struct work_struct nf_flow_offload_work; +static DEFINE_SPINLOCK(flow_offload_pending_list_lock); +static LIST_HEAD(flow_offload_pending_list); + +struct flow_offload_work { + struct list_head list; + enum flow_cls_command cmd; + int priority; + struct nf_flowtable *flowtable; + struct flow_offload *flow; +}; + +struct nf_flow_key { + struct flow_dissector_key_meta meta; + struct flow_dissector_key_control control; + struct flow_dissector_key_basic basic; + union { + struct flow_dissector_key_ipv4_addrs ipv4; + struct flow_dissector_key_ipv6_addrs ipv6; + }; + struct flow_dissector_key_tcp tcp; + struct flow_dissector_key_ports tp; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ + +struct nf_flow_match { + struct flow_dissector dissector; + struct nf_flow_key key; + struct nf_flow_key mask; +}; + +struct nf_flow_rule { + struct nf_flow_match match; + struct flow_rule *rule; +}; + +#define NF_FLOW_DISSECTOR(__match, __type, __field) \ + (__match)->dissector.offset[__type] = \ + offsetof(struct nf_flow_key, __field) + +static int nf_flow_rule_match(struct nf_flow_match *match, + const struct flow_offload_tuple *tuple) +{ + struct nf_flow_key *mask = &match->mask; + struct nf_flow_key *key = &match->key; + + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_BASIC, basic); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp); + + key->meta.ingress_ifindex = tuple->iifidx; + mask->meta.ingress_ifindex = 0xffffffff; + + switch (tuple->l3proto) { + case AF_INET: + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + key->basic.n_proto = htons(ETH_P_IP); + key->ipv4.src = tuple->src_v4.s_addr; + mask->ipv4.src = 0xffffffff; + key->ipv4.dst = tuple->dst_v4.s_addr; + mask->ipv4.dst = 0xffffffff; + break; + case AF_INET6: + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + key->basic.n_proto = htons(ETH_P_IPV6); + key->ipv6.src = tuple->src_v6; + memset(&mask->ipv6.src, 0xff, sizeof(mask->ipv6.src)); + key->ipv6.dst = tuple->dst_v6; + memset(&mask->ipv6.dst, 0xff, sizeof(mask->ipv6.dst)); + break; + default: + return -EOPNOTSUPP; + } + match->dissector.used_keys |= BIT(key->control.addr_type); + mask->basic.n_proto = 0xffff; + + switch (tuple->l4proto) { + case IPPROTO_TCP: + key->tcp.flags = 0; + mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16); + match->dissector.used_keys |= 
BIT(FLOW_DISSECTOR_KEY_TCP); + break; + case IPPROTO_UDP: + break; + default: + return -EOPNOTSUPP; + } + + key->basic.ip_proto = tuple->l4proto; + mask->basic.ip_proto = 0xff; + + key->tp.src = tuple->src_port; + mask->tp.src = 0xffff; + key->tp.dst = tuple->dst_port; + mask->tp.dst = 0xffff; + + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) | + BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_PORTS); + return 0; +} + +static void flow_offload_mangle(struct flow_action_entry *entry, + enum flow_action_mangle_base htype, u32 offset, + const __be32 *value, const __be32 *mask) +{ + entry->id = FLOW_ACTION_MANGLE; + entry->mangle.htype = htype; + entry->mangle.offset = offset; + memcpy(&entry->mangle.mask, mask, sizeof(u32)); + memcpy(&entry->mangle.val, value, sizeof(u32)); +} + +static inline struct flow_action_entry * +flow_action_entry_next(struct nf_flow_rule *flow_rule) +{ + int i = flow_rule->rule->action.num_entries++; + + return &flow_rule->rule->action.entries[i]; +} + +static int flow_offload_eth_src(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple; + struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); + struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); + struct net_device *dev; + u32 mask, val; + u16 val16; + + dev = dev_get_by_index(net, tuple->iifidx); + if (!dev) + return -ENOENT; + + mask = ~0xffff0000; + memcpy(&val16, dev->dev_addr, 2); + val = val16 << 16; + flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, + &val, &mask); + + mask = ~0xffffffff; + memcpy(&val, dev->dev_addr + 2, 4); + flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8, + &val, &mask); + dev_put(dev); + + return 0; +} + +static int flow_offload_eth_dst(struct net *net, + const struct flow_offload *flow, + enum 
flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); + struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); + const void *daddr = &flow->tuplehash[!dir].tuple.src_v4; + const struct dst_entry *dst_cache; + unsigned char ha[ETH_ALEN]; + struct neighbour *n; + u32 mask, val; + u8 nud_state; + u16 val16; + + dst_cache = flow->tuplehash[dir].tuple.dst_cache; + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -ENOENT; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); + + if (!(nud_state & NUD_VALID)) { + neigh_release(n); + return -ENOENT; + } + + mask = ~0xffffffff; + memcpy(&val, ha, 4); + flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 0, + &val, &mask); + + mask = ~0x0000ffff; + memcpy(&val16, ha + 4, 2); + val = val16; + flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, + &val, &mask); + neigh_release(n); + + return 0; +} + +static void flow_offload_ipv4_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask = ~htonl(0xffffffff); + __be32 addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; + offset = offsetof(struct iphdr, saddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; + offset = offsetof(struct iphdr, daddr); + break; + default: + return; + } + + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, + &addr, &mask); +} + +static void flow_offload_ipv4_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = 
flow_action_entry_next(flow_rule); + u32 mask = ~htonl(0xffffffff); + __be32 addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; + offset = offsetof(struct iphdr, daddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; + offset = offsetof(struct iphdr, saddr); + break; + default: + return; + } + + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, + &addr, &mask); +} + +static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, + unsigned int offset, + const __be32 *addr, const __be32 *mask) +{ + struct flow_action_entry *entry; + int i; + + for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32)) { + entry = flow_action_entry_next(flow_rule); + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, + offset + i, &addr[i], mask); + } +} + +static void flow_offload_ipv6_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + u32 mask = ~htonl(0xffffffff); + const __be32 *addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6.s6_addr32; + offset = offsetof(struct ipv6hdr, saddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6.s6_addr32; + offset = offsetof(struct ipv6hdr, daddr); + break; + default: + return; + } + + flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); +} + +static void flow_offload_ipv6_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + u32 mask = ~htonl(0xffffffff); + const __be32 *addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6.s6_addr32; + offset = offsetof(struct ipv6hdr, 
daddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6.s6_addr32; + offset = offsetof(struct ipv6hdr, saddr); + break; + default: + return; + } + + flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); +} + +static int flow_offload_l4proto(const struct flow_offload *flow) +{ + u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; + u8 type = 0; + + switch (protonum) { + case IPPROTO_TCP: + type = FLOW_ACT_MANGLE_HDR_TYPE_TCP; + break; + case IPPROTO_UDP: + type = FLOW_ACT_MANGLE_HDR_TYPE_UDP; + break; + default: + break; + } + + return type; +} + +static void flow_offload_port_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask, port; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port); + offset = 0; /* offsetof(struct tcphdr, source); */ + port = htonl(port << 16); + mask = ~htonl(0xffff0000); + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port); + offset = 0; /* offsetof(struct tcphdr, dest); */ + port = htonl(port); + mask = ~htonl(0xffff); + break; + default: + return; + } + + flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, + &port, &mask); +} + +static void flow_offload_port_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask, port; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port); + offset = 0; /* offsetof(struct tcphdr, dest); */ + port = htonl(port); + mask = ~htonl(0xffff); + break; + case FLOW_OFFLOAD_DIR_REPLY: + port 
= ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port); + offset = 0; /* offsetof(struct tcphdr, source); */ + port = htonl(port << 16); + mask = ~htonl(0xffff0000); + break; + default: + return; + } + + flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, + &port, &mask); +} + +static void flow_offload_ipv4_checksum(struct net *net, + const struct flow_offload *flow, + struct nf_flow_rule *flow_rule) +{ + u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + + entry->id = FLOW_ACTION_CSUM; + entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR; + + switch (protonum) { + case IPPROTO_TCP: + entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_TCP; + break; + case IPPROTO_UDP: + entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_UDP; + break; + } +} + +static void flow_offload_redirect(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + struct rtable *rt; + + rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + entry->id = FLOW_ACTION_REDIRECT; + entry->dev = rt->dst.dev; + dev_hold(rt->dst.dev); +} + +int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || + flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) + return -1; + + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { + flow_offload_ipv4_snat(net, flow, dir, flow_rule); + flow_offload_port_snat(net, flow, dir, flow_rule); + } + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { + flow_offload_ipv4_dnat(net, flow, dir, flow_rule); + flow_offload_port_dnat(net, flow, dir, flow_rule); + } + if (test_bit(NF_FLOW_SNAT, &flow->flags) || + test_bit(NF_FLOW_DNAT, &flow->flags)) + flow_offload_ipv4_checksum(net, flow, flow_rule); + + 
flow_offload_redirect(flow, dir, flow_rule); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4); + +int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || + flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) + return -1; + + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { + flow_offload_ipv6_snat(net, flow, dir, flow_rule); + flow_offload_port_snat(net, flow, dir, flow_rule); + } + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { + flow_offload_ipv6_dnat(net, flow, dir, flow_rule); + flow_offload_port_dnat(net, flow, dir, flow_rule); + } + + flow_offload_redirect(flow, dir, flow_rule); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6); + +#define NF_FLOW_RULE_ACTION_MAX 16 + +static struct nf_flow_rule * +nf_flow_offload_rule_alloc(struct net *net, + const struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir) +{ + const struct nf_flowtable *flowtable = offload->flowtable; + const struct flow_offload *flow = offload->flow; + const struct flow_offload_tuple *tuple; + struct nf_flow_rule *flow_rule; + int err = -ENOMEM; + + flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL); + if (!flow_rule) + goto err_flow; + + flow_rule->rule = flow_rule_alloc(NF_FLOW_RULE_ACTION_MAX); + if (!flow_rule->rule) + goto err_flow_rule; + + flow_rule->rule->match.dissector = &flow_rule->match.dissector; + flow_rule->rule->match.mask = &flow_rule->match.mask; + flow_rule->rule->match.key = &flow_rule->match.key; + + tuple = &flow->tuplehash[dir].tuple; + err = nf_flow_rule_match(&flow_rule->match, tuple); + if (err < 0) + goto err_flow_match; + + flow_rule->rule->action.num_entries = 0; + if (flowtable->type->action(net, flow, dir, flow_rule) < 0) + goto err_flow_match; + + return flow_rule; + +err_flow_match: + kfree(flow_rule->rule); +err_flow_rule: + kfree(flow_rule); +err_flow: + return NULL; +} + 
+static void __nf_flow_offload_destroy(struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry; + int i; + + for (i = 0; i < flow_rule->rule->action.num_entries; i++) { + entry = &flow_rule->rule->action.entries[i]; + if (entry->id != FLOW_ACTION_REDIRECT) + continue; + + dev_put(entry->dev); + } + kfree(flow_rule->rule); + kfree(flow_rule); +} + +static void nf_flow_offload_destroy(struct nf_flow_rule *flow_rule[]) +{ + int i; + + for (i = 0; i < FLOW_OFFLOAD_DIR_MAX; i++) + __nf_flow_offload_destroy(flow_rule[i]); +} + +static int nf_flow_offload_alloc(const struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule[]) +{ + struct net *net = read_pnet(&offload->flowtable->net); + + flow_rule[0] = nf_flow_offload_rule_alloc(net, offload, + FLOW_OFFLOAD_DIR_ORIGINAL); + if (!flow_rule[0]) + return -ENOMEM; + + flow_rule[1] = nf_flow_offload_rule_alloc(net, offload, + FLOW_OFFLOAD_DIR_REPLY); + if (!flow_rule[1]) { + __nf_flow_offload_destroy(flow_rule[0]); + return -ENOMEM; + } + + return 0; +} + +static void nf_flow_offload_init(struct flow_cls_offload *cls_flow, + __be16 proto, int priority, + enum flow_cls_command cmd, + const struct flow_offload_tuple *tuple, + struct netlink_ext_ack *extack) +{ + cls_flow->common.protocol = proto; + cls_flow->common.prio = priority; + cls_flow->common.extack = extack; + cls_flow->command = cmd; + cls_flow->cookie = (unsigned long)tuple; +} + +static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, + struct flow_offload *flow, + struct nf_flow_rule *flow_rule, + enum flow_offload_tuple_dir dir, + int priority, int cmd, + struct list_head *block_cb_list) +{ + struct flow_cls_offload cls_flow = {}; + struct flow_block_cb *block_cb; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + int err, i = 0; + + nf_flow_offload_init(&cls_flow, proto, priority, cmd, + &flow->tuplehash[dir].tuple, &extack); + if (cmd == FLOW_CLS_REPLACE) + cls_flow.rule = flow_rule->rule; + + 
list_for_each_entry(block_cb, block_cb_list, list) { + err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, + block_cb->cb_priv); + if (err < 0) + continue; + + i++; + } + + return i; +} + +static int flow_offload_tuple_add(struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule, + enum flow_offload_tuple_dir dir) +{ + return nf_flow_offload_tuple(offload->flowtable, offload->flow, + flow_rule, dir, offload->priority, + FLOW_CLS_REPLACE, + &offload->flowtable->flow_block.cb_list); +} + +static void flow_offload_tuple_del(struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir) +{ + nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, + offload->priority, FLOW_CLS_DESTROY, + &offload->flowtable->flow_block.cb_list); +} + +static int flow_offload_rule_add(struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule[]) +{ + int ok_count = 0; + + ok_count += flow_offload_tuple_add(offload, flow_rule[0], + FLOW_OFFLOAD_DIR_ORIGINAL); + ok_count += flow_offload_tuple_add(offload, flow_rule[1], + FLOW_OFFLOAD_DIR_REPLY); + if (ok_count == 0) + return -ENOENT; + + return 0; +} + +static void flow_offload_work_add(struct flow_offload_work *offload) +{ + struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; + int err; + + err = nf_flow_offload_alloc(offload, flow_rule); + if (err < 0) + return; + + err = flow_offload_rule_add(offload, flow_rule); + if (err < 0) + set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags); + + nf_flow_offload_destroy(flow_rule); +} + +static void flow_offload_work_del(struct flow_offload_work *offload) +{ + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); + set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); +} + +static void flow_offload_tuple_stats(struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir, + struct flow_stats *stats) +{ + struct nf_flowtable *flowtable = offload->flowtable; + struct flow_cls_offload cls_flow = 
{}; + struct flow_block_cb *block_cb; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + + nf_flow_offload_init(&cls_flow, proto, offload->priority, + FLOW_CLS_STATS, + &offload->flow->tuplehash[dir].tuple, &extack); + + list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) + block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); + memcpy(stats, &cls_flow.stats, sizeof(*stats)); +} + +static void flow_offload_work_stats(struct flow_offload_work *offload) +{ + struct flow_stats stats[FLOW_OFFLOAD_DIR_MAX] = {}; + u64 lastused; + + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]); + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]); + + lastused = max_t(u64, stats[0].lastused, stats[1].lastused); + offload->flow->timeout = max_t(u64, offload->flow->timeout, + lastused + NF_FLOW_TIMEOUT); +} + +static void flow_offload_work_handler(struct work_struct *work) +{ + struct flow_offload_work *offload, *next; + LIST_HEAD(offload_pending_list); + + spin_lock_bh(&flow_offload_pending_list_lock); + list_replace_init(&flow_offload_pending_list, &offload_pending_list); + spin_unlock_bh(&flow_offload_pending_list_lock); + + list_for_each_entry_safe(offload, next, &offload_pending_list, list) { + switch (offload->cmd) { + case FLOW_CLS_REPLACE: + flow_offload_work_add(offload); + break; + case FLOW_CLS_DESTROY: + flow_offload_work_del(offload); + break; + case FLOW_CLS_STATS: + flow_offload_work_stats(offload); + break; + default: + WARN_ON_ONCE(1); + } + list_del(&offload->list); + kfree(offload); + } +} + +static void flow_offload_queue_work(struct flow_offload_work *offload) +{ + spin_lock_bh(&flow_offload_pending_list_lock); + list_add_tail(&offload->list, &flow_offload_pending_list); + spin_unlock_bh(&flow_offload_pending_list_lock); + + schedule_work(&nf_flow_offload_work); +} + +static struct flow_offload_work * +nf_flow_offload_work_alloc(struct nf_flowtable *flowtable, + struct flow_offload 
*flow, unsigned int cmd) +{ + struct flow_offload_work *offload; + + offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + if (!offload) + return NULL; + + offload->cmd = cmd; + offload->flow = flow; + offload->priority = flowtable->priority; + offload->flowtable = flowtable; + + return offload; +} + + +void nf_flow_offload_add(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_REPLACE); + if (!offload) + return; + + flow_offload_queue_work(offload); +} + +void nf_flow_offload_del(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_DESTROY); + if (!offload) + return; + + set_bit(NF_FLOW_HW_DYING, &flow->flags); + flow_offload_queue_work(offload); +} + +void nf_flow_offload_stats(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + __s32 delta; + + delta = nf_flow_timeout_delta(flow->timeout); + if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10)) + return; + + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS); + if (!offload) + return; + + flow_offload_queue_work(offload); +} + +void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) +{ + if (nf_flowtable_hw_offload(flowtable)) + flush_work(&nf_flow_offload_work); +} + +static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, + struct flow_block_offload *bo, + enum flow_block_command cmd) +{ + struct flow_block_cb *block_cb, *next; + int err = 0; + + switch (cmd) { + case FLOW_BLOCK_BIND: + list_splice(&bo->cb_list, &flowtable->flow_block.cb_list); + break; + case FLOW_BLOCK_UNBIND: + list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + } + break; + default: + WARN_ON_ONCE(1); + err = -EOPNOTSUPP; + } + + return err; +} + 
+static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, + struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd, + struct netlink_ext_ack *extack) +{ + int err; + + if (!nf_flowtable_hw_offload(flowtable)) + return 0; + + if (!dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + memset(bo, 0, sizeof(*bo)); + bo->net = dev_net(dev); + bo->block = &flowtable->flow_block; + bo->command = cmd; + bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo); + if (err < 0) + return err; + + return 0; +} + +int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; + int err; + + err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack); + if (err < 0) + return err; + + return nf_flow_table_block_setup(flowtable, &bo, cmd); +} +EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); + +int nf_flow_table_offload_init(void) +{ + INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler); + + return 0; +} + +void nf_flow_table_offload_exit(void) +{ + struct flow_offload_work *offload, *next; + LIST_HEAD(offload_pending_list); + + cancel_work_sync(&nf_flow_offload_work); + + list_for_each_entry_safe(offload, next, &offload_pending_list, list) { + list_del(&offload->list); + kfree(offload); + } +} diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 3f6023ed4966..bfc555fcbc72 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -18,12 +18,12 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include 
<net/netfilter/nf_conntrack_zones.h> -#include <linux/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_helper.h> +#include <uapi/linux/netfilter/nf_nat.h> #include "nf_internals.h" diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 7ac733ebd060..64eedc17037a 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -233,6 +233,19 @@ icmp_manip_pkt(struct sk_buff *skb, return false; hdr = (struct icmphdr *)(skb->data + hdroff); + switch (hdr->type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMP_INFO_REQUEST: + case ICMP_INFO_REPLY: + case ICMP_ADDRESS: + case ICMP_ADDRESSREPLY: + break; + default: + return true; + } inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; @@ -722,7 +735,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, return ret; } -const struct nf_hook_ops nf_nat_ipv4_ops[] = { +static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_in, @@ -961,7 +974,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, return ret; } -const struct nf_hook_ops nf_nat_ipv6_ops[] = { +static const struct nf_hook_ops nf_nat_ipv6_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_in, diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index a2b58de82600..f8f52ff99cfb 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -189,7 +189,7 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, goto err; } - if (!skb_dst_force(skb) && state->hook != NF_INET_PRE_ROUTING) { + if (skb_dst(skb) && !skb_dst_force(skb)) { status = -ENETDOWN; goto err; } diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index c769462a839e..b0930d4aba22 100644 --- 
a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -56,7 +56,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, switch (opcode) { case TCPOPT_MSS: if (opsize == TCPOLEN_MSS) { - opts->mss = get_unaligned_be16(ptr); + opts->mss_option = get_unaligned_be16(ptr); opts->options |= NF_SYNPROXY_OPT_MSS; } break; @@ -115,7 +115,7 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) if (options & NF_SYNPROXY_OPT_MSS) *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | - opts->mss); + opts->mss_option); if (options & NF_SYNPROXY_OPT_TIMESTAMP) { if (options & NF_SYNPROXY_OPT_SACK_PERM) @@ -642,7 +642,7 @@ synproxy_recv_client_ack(struct net *net, } this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; + opts->mss_option = mss; opts->options |= NF_SYNPROXY_OPT_MSS; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) @@ -1060,7 +1060,7 @@ synproxy_recv_client_ack_ipv6(struct net *net, } this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; + opts->mss_option = mss; opts->options |= NF_SYNPROXY_OPT_MSS; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d47469f824a1..d1318bdf49ca 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -22,6 +22,8 @@ #include <net/net_namespace.h> #include <net/sock.h> +#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) + static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); @@ -151,11 +153,64 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) } } +static int nft_netdev_register_hooks(struct net *net, + struct list_head *hook_list) +{ + struct nft_hook *hook; + int err, j; + + j = 0; + list_for_each_entry(hook, hook_list, list) { + err = nf_register_net_hook(net, &hook->ops); + if (err < 0) + goto err_register; + + j++; + } + return 
0; + +err_register: + list_for_each_entry(hook, hook_list, list) { + if (j-- <= 0) + break; + + nf_unregister_net_hook(net, &hook->ops); + } + return err; +} + +static void nft_netdev_unregister_hooks(struct net *net, + struct list_head *hook_list) +{ + struct nft_hook *hook; + + list_for_each_entry(hook, hook_list, list) + nf_unregister_net_hook(net, &hook->ops); +} + +static int nft_register_basechain_hooks(struct net *net, int family, + struct nft_base_chain *basechain) +{ + if (family == NFPROTO_NETDEV) + return nft_netdev_register_hooks(net, &basechain->hook_list); + + return nf_register_net_hook(net, &basechain->ops); +} + +static void nft_unregister_basechain_hooks(struct net *net, int family, + struct nft_base_chain *basechain) +{ + if (family == NFPROTO_NETDEV) + nft_netdev_unregister_hooks(net, &basechain->hook_list); + else + nf_unregister_net_hook(net, &basechain->ops); +} + static int nf_tables_register_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { - const struct nft_base_chain *basechain; + struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || @@ -168,14 +223,14 @@ static int nf_tables_register_hook(struct net *net, if (basechain->type->ops_register) return basechain->type->ops_register(net, ops); - return nf_register_net_hook(net, ops); + return nft_register_basechain_hooks(net, table->family, basechain); } static void nf_tables_unregister_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { - const struct nft_base_chain *basechain; + struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || @@ -187,7 +242,7 @@ static void nf_tables_unregister_hook(struct net *net, if (basechain->type->ops_unregister) return basechain->type->ops_unregister(net, ops); - nf_unregister_net_hook(net, ops); + nft_unregister_basechain_hooks(net, table->family, basechain); } static int nft_trans_table_add(struct 
nft_ctx *ctx, int msg_type) @@ -308,6 +363,7 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) { + struct nft_flow_rule *flow; struct nft_trans *trans; int err; @@ -315,6 +371,16 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) if (trans == NULL) return -ENOMEM; + if (ctx->chain->flags & NFT_CHAIN_HW_OFFLOAD) { + flow = nft_flow_rule_create(ctx->net, rule); + if (IS_ERR(flow)) { + nft_trans_destroy(trans); + return PTR_ERR(flow); + } + + nft_trans_flow_rule(trans) = flow; + } + err = nf_tables_delrule_deactivate(ctx, rule); if (err < 0) { nft_trans_destroy(trans); @@ -487,46 +553,70 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table) static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; static const struct nft_chain_type * +__nft_chain_type_get(u8 family, enum nft_chain_types type) +{ + if (family >= NFPROTO_NUMPROTO || + type >= NFT_CHAIN_T_MAX) + return NULL; + + return chain_type[family][type]; +} + +static const struct nft_chain_type * __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) { + const struct nft_chain_type *type; int i; for (i = 0; i < NFT_CHAIN_T_MAX; i++) { - if (chain_type[family][i] != NULL && - !nla_strcmp(nla, chain_type[family][i]->name)) - return chain_type[family][i]; + type = __nft_chain_type_get(family, i); + if (!type) + continue; + if (!nla_strcmp(nla, type->name)) + return type; } return NULL; } -/* - * Loading a module requires dropping mutex that guards the - * transaction. - * We first need to abort any pending transactions as once - * mutex is unlocked a different client could start a new - * transaction. It must not see any 'future generation' - * changes * as these changes will never happen. 
- */ -#ifdef CONFIG_MODULES -static int __nf_tables_abort(struct net *net); +struct nft_module_request { + struct list_head list; + char module[MODULE_NAME_LEN]; + bool done; +}; -static void nft_request_module(struct net *net, const char *fmt, ...) +#ifdef CONFIG_MODULES +static int nft_request_module(struct net *net, const char *fmt, ...) { char module_name[MODULE_NAME_LEN]; + struct nft_module_request *req; va_list args; int ret; - __nf_tables_abort(net); - va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); - if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret)) - return; + if (ret >= MODULE_NAME_LEN) + return 0; - mutex_unlock(&net->nft.commit_mutex); - request_module("%s", module_name); - mutex_lock(&net->nft.commit_mutex); + list_for_each_entry(req, &net->nft.module_list, list) { + if (!strcmp(req->module, module_name)) { + if (req->done) + return 0; + + /* A request to load this module already exists. */ + return -EAGAIN; + } + } + + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->done = false; + strlcpy(req->module, module_name, MODULE_NAME_LEN); + list_add_tail(&req->list, &net->nft.module_list); + + return -EAGAIN; } #endif @@ -550,10 +640,9 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (autoload) { - nft_request_module(net, "nft-chain-%u-%.*s", family, - nla_len(nla), (const char *)nla_data(nla)); - type = __nf_tables_chain_type_lookup(nla, family); - if (type != NULL) + if (nft_request_module(net, "nft-chain-%u-%.*s", family, + nla_len(nla), + (const char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -742,7 +831,8 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt) if (cnt && i++ == cnt) break; - nf_unregister_net_hook(net, &nft_base_chain(chain)->ops); + nft_unregister_basechain_hooks(net, table->family, + 
nft_base_chain(chain)); } } @@ -757,14 +847,16 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table) if (!nft_is_base_chain(chain)) continue; - err = nf_register_net_hook(net, &nft_base_chain(chain)->ops); + err = nft_register_basechain_hooks(net, table->family, + nft_base_chain(chain)); if (err < 0) - goto err; + goto err_register_hooks; i++; } return 0; -err: + +err_register_hooks: if (i) nft_table_disable(net, table, i); return err; @@ -978,12 +1070,18 @@ static int nft_flush_table(struct nft_ctx *ctx) } list_for_each_entry_safe(flowtable, nft, &ctx->table->flowtables, list) { + if (!nft_is_active_next(ctx->net, flowtable)) + continue; + err = nft_delflowtable(ctx, flowtable); if (err < 0) goto out; } list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) { + if (!nft_is_active_next(ctx->net, obj)) + continue; + err = nft_delobj(ctx, obj); if (err < 0) goto out; @@ -1086,11 +1184,8 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) void nft_register_chain_type(const struct nft_chain_type *ctype) { - if (WARN_ON(ctype->family >= NFPROTO_NUMPROTO)) - return; - nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (WARN_ON(chain_type[ctype->family][ctype->type] != NULL)) { + if (WARN_ON(__nft_chain_type_get(ctype->family, ctype->type))) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); return; } @@ -1174,7 +1269,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED }, [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, - [NFTA_CHAIN_TYPE] = { .type = NLA_STRING }, + [NFTA_CHAIN_TYPE] = { .type = NLA_STRING, + .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, }; @@ -1225,6 +1321,46 @@ nla_put_failure: return -ENOSPC; } +static int nft_dump_basechain_hook(struct sk_buff *skb, int family, + const struct nft_base_chain *basechain) +{ + const struct nf_hook_ops *ops = &basechain->ops; + struct 
nft_hook *hook, *first = NULL; + struct nlattr *nest, *nest_devs; + int n = 0; + + nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) + goto nla_put_failure; + + if (family == NFPROTO_NETDEV) { + nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS); + list_for_each_entry(hook, &basechain->hook_list, list) { + if (!first) + first = hook; + + if (nla_put_string(skb, NFTA_DEVICE_NAME, + hook->ops.dev->name)) + goto nla_put_failure; + n++; + } + nla_nest_end(skb, nest_devs); + + if (n == 1 && + nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + + return 0; +nla_put_failure: + return -1; +} + static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, @@ -1253,21 +1389,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); - const struct nf_hook_ops *ops = &basechain->ops; struct nft_stats __percpu *stats; - struct nlattr *nest; - nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); - if (nest == NULL) - goto nla_put_failure; - if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) - goto nla_put_failure; - if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) + if (nft_dump_basechain_hook(skb, family, basechain)) goto nla_put_failure; - if (basechain->dev_name[0] && - nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name)) - goto nla_put_failure; - nla_nest_end(skb, nest); if (nla_put_be32(skb, NFTA_CHAIN_POLICY, htonl(basechain->policy))) @@ -1461,8 +1586,9 @@ static void nft_chain_stats_replace(struct nft_trans *trans) if (!nft_trans_chain_stats(trans)) return; - 
rcu_swap_protected(chain->stats, nft_trans_chain_stats(trans), - lockdep_commit_lock_is_held(trans->ctx.net)); + nft_trans_chain_stats(trans) = + rcu_replace_pointer(chain->stats, nft_trans_chain_stats(trans), + lockdep_commit_lock_is_held(trans->ctx.net)); if (!nft_trans_chain_stats(trans)) static_branch_inc(&nft_counters_enabled); @@ -1485,6 +1611,7 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) static void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; + struct nft_hook *hook, *next; if (WARN_ON(chain->use > 0)) return; @@ -1495,6 +1622,13 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); + if (ctx->family == NFPROTO_NETDEV) { + list_for_each_entry_safe(hook, next, + &basechain->hook_list, list) { + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } + } module_put(basechain->type->owner); if (rcu_access_pointer(basechain->stats)) { static_branch_dec(&nft_counters_enabled); @@ -1508,13 +1642,126 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) } } +static struct nft_hook *nft_netdev_hook_alloc(struct net *net, + const struct nlattr *attr) +{ + struct net_device *dev; + char ifname[IFNAMSIZ]; + struct nft_hook *hook; + int err; + + hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL); + if (!hook) { + err = -ENOMEM; + goto err_hook_alloc; + } + + nla_strlcpy(ifname, attr, IFNAMSIZ); + dev = __dev_get_by_name(net, ifname); + if (!dev) { + err = -ENOENT; + goto err_hook_dev; + } + hook->ops.dev = dev; + + return hook; + +err_hook_dev: + kfree(hook); +err_hook_alloc: + return ERR_PTR(err); +} + +static bool nft_hook_list_find(struct list_head *hook_list, + const struct nft_hook *this) +{ + struct nft_hook *hook; + + list_for_each_entry(hook, hook_list, list) { + if (this->ops.dev == hook->ops.dev) + return true; + } + + return false; +} + +static int nf_tables_parse_netdev_hooks(struct net 
*net, + const struct nlattr *attr, + struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + const struct nlattr *tmp; + int rem, n = 0, err; + + nla_for_each_nested(tmp, attr, rem) { + if (nla_type(tmp) != NFTA_DEVICE_NAME) { + err = -EINVAL; + goto err_hook; + } + + hook = nft_netdev_hook_alloc(net, tmp); + if (IS_ERR(hook)) { + err = PTR_ERR(hook); + goto err_hook; + } + if (nft_hook_list_find(hook_list, hook)) { + kfree(hook); + err = -EEXIST; + goto err_hook; + } + list_add_tail(&hook->list, hook_list); + n++; + + if (n == NFT_NETDEVICE_MAX) { + err = -EFBIG; + goto err_hook; + } + } + if (!n) + return -EINVAL; + + return 0; + +err_hook: + list_for_each_entry_safe(hook, next, hook_list, list) { + list_del(&hook->list); + kfree(hook); + } + return err; +} + struct nft_chain_hook { u32 num; s32 priority; const struct nft_chain_type *type; - struct net_device *dev; + struct list_head list; }; +static int nft_chain_parse_netdev(struct net *net, + struct nlattr *tb[], + struct list_head *hook_list) +{ + struct nft_hook *hook; + int err; + + if (tb[NFTA_HOOK_DEV]) { + hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + list_add_tail(&hook->list, hook_list); + } else if (tb[NFTA_HOOK_DEVS]) { + err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS], + hook_list); + if (err < 0) + return err; + } else { + return -EINVAL; + } + + return 0; +} + static int nft_chain_parse_hook(struct net *net, const struct nlattr * const nla[], struct nft_chain_hook *hook, u8 family, @@ -1522,7 +1769,6 @@ static int nft_chain_parse_hook(struct net *net, { struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; - struct net_device *dev; int err; lockdep_assert_held(&net->nft.commit_mutex); @@ -1541,7 +1787,10 @@ static int nft_chain_parse_hook(struct net *net, hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); - type = 
chain_type[family][NFT_CHAIN_T_DEFAULT]; + type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT); + if (!type) + return -EOPNOTSUPP; + if (nla[NFTA_CHAIN_TYPE]) { type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], family, autoload); @@ -1560,23 +1809,14 @@ static int nft_chain_parse_hook(struct net *net, hook->type = type; - hook->dev = NULL; + INIT_LIST_HEAD(&hook->list); if (family == NFPROTO_NETDEV) { - char ifname[IFNAMSIZ]; - - if (!ha[NFTA_HOOK_DEV]) { - module_put(type->owner); - return -EOPNOTSUPP; - } - - nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ); - dev = __dev_get_by_name(net, ifname); - if (!dev) { + err = nft_chain_parse_netdev(net, ha, &hook->list); + if (err < 0) { module_put(type->owner); - return -ENOENT; + return err; } - hook->dev = dev; - } else if (ha[NFTA_HOOK_DEV]) { + } else if (ha[NFTA_HOOK_DEV] || ha[NFTA_HOOK_DEVS]) { module_put(type->owner); return -EOPNOTSUPP; } @@ -1586,6 +1826,12 @@ static int nft_chain_parse_hook(struct net *net, static void nft_chain_release_hook(struct nft_chain_hook *hook) { + struct nft_hook *h, *next; + + list_for_each_entry_safe(h, next, &hook->list, list) { + list_del(&h->list); + kfree(h); + } module_put(hook->type->owner); } @@ -1610,6 +1856,49 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha return kvmalloc(alloc, GFP_KERNEL); } +static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family, + const struct nft_chain_hook *hook, + struct nft_chain *chain) +{ + ops->pf = family; + ops->hooknum = hook->num; + ops->priority = hook->priority; + ops->priv = chain; + ops->hook = hook->type->hooks[ops->hooknum]; +} + +static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, + struct nft_chain_hook *hook, u32 flags) +{ + struct nft_chain *chain; + struct nft_hook *h; + + basechain->type = hook->type; + INIT_LIST_HEAD(&basechain->hook_list); + chain = &basechain->chain; + + if (family == NFPROTO_NETDEV) { + list_splice_init(&hook->list, 
&basechain->hook_list); + list_for_each_entry(h, &basechain->hook_list, list) + nft_basechain_hook_init(&h->ops, family, hook, chain); + + basechain->ops.hooknum = hook->num; + basechain->ops.priority = hook->priority; + } else { + nft_basechain_hook_init(&basechain->ops, family, hook, chain); + } + + chain->flags |= NFT_BASE_CHAIN | flags; + basechain->policy = NF_ACCEPT; + if (chain->flags & NFT_CHAIN_HW_OFFLOAD && + nft_chain_offload_priority(basechain) < 0) + return -EOPNOTSUPP; + + flow_block_init(&basechain->flow_block); + + return 0; +} + static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, u32 flags) { @@ -1628,7 +1917,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_HOOK]) { struct nft_chain_hook hook; - struct nf_hook_ops *ops; err = nft_chain_parse_hook(net, nla, &hook, family, true); if (err < 0) @@ -1639,9 +1927,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, nft_chain_release_hook(&hook); return -ENOMEM; } - - if (hook.dev != NULL) - strncpy(basechain->dev_name, hook.dev->name, IFNAMSIZ); + chain = &basechain->chain; if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); @@ -1654,24 +1940,12 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, static_branch_inc(&nft_counters_enabled); } - basechain->type = hook.type; - chain = &basechain->chain; - - ops = &basechain->ops; - ops->pf = family; - ops->hooknum = hook.num; - ops->priority = hook.priority; - ops->priv = chain; - ops->hook = hook.type->hooks[ops->hooknum]; - ops->dev = hook.dev; - - chain->flags |= NFT_BASE_CHAIN | flags; - basechain->policy = NF_ACCEPT; - if (chain->flags & NFT_CHAIN_HW_OFFLOAD && - nft_chain_offload_priority(basechain) < 0) - return -EOPNOTSUPP; - - flow_block_init(&basechain->flow_block); + err = nft_basechain_init(basechain, family, &hook, flags); + if (err < 0) { + nft_chain_release_hook(&hook); + 
kfree(basechain); + return err; + } } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) @@ -1715,7 +1989,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err2; } - nft_trans_chain_policy(trans) = -1; + nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; if (nft_is_base_chain(chain)) nft_trans_chain_policy(trans) = policy; @@ -1731,6 +2005,25 @@ err1: return err; } +static bool nft_hook_list_equal(struct list_head *hook_list1, + struct list_head *hook_list2) +{ + struct nft_hook *hook; + int n = 0, m = 0; + + n = 0; + list_for_each_entry(hook, hook_list2, list) { + if (!nft_hook_list_find(hook_list1, hook)) + return false; + + n++; + } + list_for_each_entry(hook, hook_list1, list) + m++; + + return n == m; +} + static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, u32 flags) { @@ -1762,12 +2055,19 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, return -EBUSY; } - ops = &basechain->ops; - if (ops->hooknum != hook.num || - ops->priority != hook.priority || - ops->dev != hook.dev) { - nft_chain_release_hook(&hook); - return -EBUSY; + if (ctx->family == NFPROTO_NETDEV) { + if (!nft_hook_list_equal(&basechain->hook_list, + &hook.list)) { + nft_chain_release_hook(&hook); + return -EBUSY; + } + } else { + ops = &basechain->ops; + if (ops->hooknum != hook.num || + ops->priority != hook.priority) { + nft_chain_release_hook(&hook); + return -EBUSY; + } } nft_chain_release_hook(&hook); } @@ -1922,6 +2222,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + flags |= chain->flags & NFT_BASE_CHAIN; return nf_tables_updchain(&ctx, genmask, policy, flags); } @@ -2049,9 +2350,8 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, static int nft_expr_type_request_module(struct net *net, u8 family, struct nlattr *nla) { - nft_request_module(net, "nft-expr-%u-%.*s", family, - 
nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_request_module(net, "nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)) == -EAGAIN) return -EAGAIN; return 0; @@ -2077,9 +2377,9 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nft_expr_type_request_module(net, family, nla) == -EAGAIN) return ERR_PTR(-EAGAIN); - nft_request_module(net, "nft-expr-%.*s", - nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_request_module(net, "nft-expr-%.*s", + nla_len(nla), + (char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -2087,7 +2387,8 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, } static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { - [NFTA_EXPR_NAME] = { .type = NLA_STRING }, + [NFTA_EXPR_NAME] = { .type = NLA_STRING, + .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_EXPR_DATA] = { .type = NLA_NESTED }, }; @@ -2169,9 +2470,10 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, err = PTR_ERR(ops); #ifdef CONFIG_MODULES if (err == -EAGAIN) - nft_expr_type_request_module(ctx->net, - ctx->family, - tb[NFTA_EXPR_NAME]); + if (nft_expr_type_request_module(ctx->net, + ctx->family, + tb[NFTA_EXPR_NAME]) != -EAGAIN) + err = -ENOENT; #endif goto err1; } @@ -2853,7 +3155,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return nft_table_validate(net, table); if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { - flow = nft_flow_rule_create(rule); + flow = nft_flow_rule_create(net, rule); if (IS_ERR(flow)) return PTR_ERR(flow); @@ -3008,8 +3310,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (list_empty(&nf_tables_set_types)) { - nft_request_module(ctx->net, "nft-set"); - if (!list_empty(&nf_tables_set_types)) + if (nft_request_module(ctx->net, "nft-set") == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif @@ -3090,6 +3391,7 @@ static 
const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, + [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED }, }; static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, @@ -3256,6 +3558,33 @@ static __be64 nf_jiffies64_to_msecs(u64 input) return cpu_to_be64(jiffies64_to_msecs(input)); } +static int nf_tables_fill_set_concat(struct sk_buff *skb, + const struct nft_set *set) +{ + struct nlattr *concat, *field; + int i; + + concat = nla_nest_start_noflag(skb, NFTA_SET_DESC_CONCAT); + if (!concat) + return -ENOMEM; + + for (i = 0; i < set->field_count; i++) { + field = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); + if (!field) + return -ENOMEM; + + if (nla_put_be32(skb, NFTA_SET_FIELD_LEN, + htonl(set->field_len[i]))) + return -ENOMEM; + + nla_nest_end(skb, field); + } + + nla_nest_end(skb, concat); + + return 0; +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -3319,11 +3648,17 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; desc = nla_nest_start_noflag(skb, NFTA_SET_DESC); + if (desc == NULL) goto nla_put_failure; if (set->size && nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) goto nla_put_failure; + + if (set->field_count > 1 && + nf_tables_fill_set_concat(skb, set)) + goto nla_put_failure; + nla_nest_end(skb, desc); nlmsg_end(skb, nlh); @@ -3496,6 +3831,53 @@ err: return err; } +static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { + [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 }, +}; + +static int nft_set_desc_concat_parse(const struct nlattr *attr, + struct nft_set_desc *desc) +{ + struct nlattr *tb[NFTA_SET_FIELD_MAX + 1]; + u32 len; + int err; + + err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr, + nft_concat_policy, NULL); + if (err < 0) 
+ return err; + + if (!tb[NFTA_SET_FIELD_LEN]) + return -EINVAL; + + len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN])); + + if (len * BITS_PER_BYTE / 32 > NFT_REG32_COUNT) + return -E2BIG; + + desc->field_len[desc->field_count++] = len; + + return 0; +} + +static int nft_set_desc_concat(struct nft_set_desc *desc, + const struct nlattr *nla) +{ + struct nlattr *attr; + int rem, err; + + nla_for_each_nested(attr, nla, rem) { + if (nla_type(attr) != NFTA_LIST_ELEM) + return -EINVAL; + + err = nft_set_desc_concat_parse(attr, desc); + if (err < 0) + return err; + } + + return 0; +} + static int nf_tables_set_desc_parse(struct nft_set_desc *desc, const struct nlattr *nla) { @@ -3509,8 +3891,10 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, if (da[NFTA_SET_DESC_SIZE] != NULL) desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE])); + if (da[NFTA_SET_DESC_CONCAT]) + err = nft_set_desc_concat(desc, da[NFTA_SET_DESC_CONCAT]); - return 0; + return err; } static int nf_tables_newset(struct net *net, struct sock *nlsk, @@ -3533,6 +3917,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, unsigned char *udata; u16 udlen; int err; + int i; if (nla[NFTA_SET_TABLE] == NULL || nla[NFTA_SET_NAME] == NULL || @@ -3562,8 +3947,11 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, NFT_SET_OBJECT)) return -EINVAL; /* Only one of these operations is supported */ - if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) == - (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) + if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == + (NFT_SET_MAP | NFT_SET_OBJECT)) + return -EOPNOTSUPP; + if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) == + (NFT_SET_EVAL | NFT_SET_OBJECT)) return -EOPNOTSUPP; } @@ -3708,6 +4096,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->gc_int = gc_int; set->handle = nf_tables_alloc_handle(table); + set->field_count = desc.field_count; + for (i = 0; i < desc.field_count; i++) + set->field_len[i] = 
desc.field_len[i]; + err = ops->init(set, &desc, nla); if (err < 0) goto err3; @@ -3911,6 +4303,9 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .len = sizeof(struct nft_userdata), .align = __alignof__(struct nft_userdata), }, + [NFT_SET_EXT_KEY_END] = { + .align = __alignof__(u32), + }, }; EXPORT_SYMBOL_GPL(nft_set_ext_types); @@ -3927,7 +4322,9 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED }, - [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING }, + [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING, + .len = NFT_OBJ_MAXNAMELEN - 1 }, + [NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -3977,6 +4374,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, NFT_DATA_VALUE, set->klen) < 0) goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) && + nft_data_dump(skb, NFTA_SET_ELEM_KEY_END, nft_set_ext_key_end(ext), + NFT_DATA_VALUE, set->klen) < 0) + goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext), set->dtype == NFT_DATA_VERDICT ? 
NFT_DATA_VERDICT : NFT_DATA_VALUE, @@ -4219,11 +4621,28 @@ static int nft_setelem_parse_flags(const struct nft_set *set, return 0; } +static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, + struct nft_data *key, struct nlattr *attr) +{ + struct nft_data_desc desc; + int err; + + err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr); + if (err < 0) + return err; + + if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { + nft_data_release(key, desc.type); + return -EINVAL; + } + + return 0; +} + static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; - struct nft_data_desc desc; struct nft_set_elem elem; struct sk_buff *skb; uint32_t flags = 0; @@ -4242,14 +4661,17 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) return err; - err = -EINVAL; - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) - return err; + if (nla[NFTA_SET_ELEM_KEY_END]) { + err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, + nla[NFTA_SET_ELEM_KEY_END]); + if (err < 0) + return err; + } priv = set->ops->get(ctx->net, set, &elem, flags); if (IS_ERR(priv)) @@ -4376,8 +4798,8 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *data, - u64 timeout, u64 expiration, gfp_t gfp) + const u32 *key, const u32 *key_end, + const u32 *data, u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; @@ -4390,6 +4812,8 @@ void *nft_set_elem_init(const struct nft_set *set, nft_set_ext_init(ext, tmpl); memcpy(nft_set_ext_key(ext), key, set->klen); + if (nft_set_ext_exists(ext, 
NFT_SET_EXT_KEY_END)) + memcpy(nft_set_ext_key_end(ext), key_end, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) memcpy(nft_set_ext_data(ext), data, set->dlen); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { @@ -4449,13 +4873,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; u8 genmask = nft_genmask_next(ctx->net); - struct nft_data_desc d1, d2; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_object *obj = NULL; struct nft_userdata *udata; + struct nft_data_desc desc; struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; @@ -4485,14 +4909,20 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_DATA] == NULL && !(flags & NFT_SET_ELEM_INTERVAL_END)) return -EINVAL; - if (nla[NFTA_SET_ELEM_DATA] != NULL && - flags & NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; } else { if (nla[NFTA_SET_ELEM_DATA] != NULL) return -EINVAL; } + if ((flags & NFT_SET_ELEM_INTERVAL_END) && + (nla[NFTA_SET_ELEM_DATA] || + nla[NFTA_SET_ELEM_OBJREF] || + nla[NFTA_SET_ELEM_TIMEOUT] || + nla[NFTA_SET_ELEM_EXPIRATION] || + nla[NFTA_SET_ELEM_USERDATA] || + nla[NFTA_SET_ELEM_EXPR])) + return -EINVAL; + timeout = 0; if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) @@ -4515,15 +4945,22 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) - goto err1; - err = -EINVAL; - if (d1.type != NFT_DATA_VALUE || d1.len != set->klen) - goto err2; + return err; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + + if (nla[NFTA_SET_ELEM_KEY_END]) { + err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, + 
nla[NFTA_SET_ELEM_KEY_END]); + if (err < 0) + goto err_parse_key; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); + } - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, d1.len); if (timeout > 0) { nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); if (timeout != set->timeout) @@ -4533,27 +4970,27 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { if (!(set->flags & NFT_SET_OBJECT)) { err = -EINVAL; - goto err2; + goto err_parse_key_end; } obj = nft_obj_lookup(ctx->net, ctx->table, nla[NFTA_SET_ELEM_OBJREF], set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); - goto err2; + goto err_parse_key_end; } nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); } if (nla[NFTA_SET_ELEM_DATA] != NULL) { - err = nft_data_init(ctx, &data, sizeof(data), &d2, + err = nft_data_init(ctx, &data, sizeof(data), &desc, nla[NFTA_SET_ELEM_DATA]); if (err < 0) - goto err2; + goto err_parse_key_end; err = -EINVAL; - if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen) - goto err3; + if (set->dtype != NFT_DATA_VERDICT && desc.len != set->dlen) + goto err_parse_data; dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { @@ -4569,18 +5006,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = nft_validate_register_store(&bind_ctx, dreg, &data, - d2.type, d2.len); + desc.type, desc.len); if (err < 0) - goto err3; + goto err_parse_data; - if (d2.type == NFT_DATA_VERDICT && + if (desc.type == NFT_DATA_VERDICT && (data.verdict.code == NFT_GOTO || data.verdict.code == NFT_JUMP)) nft_validate_state_update(ctx->net, NFT_VALIDATE_NEED); } - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len); } /* The full maximum length of userdata can exceed the maximum @@ -4596,10 +5033,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } err = -ENOMEM; - elem.priv = 
nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data, + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, + elem.key_end.val.data, data.data, timeout, expiration, GFP_KERNEL); if (elem.priv == NULL) - goto err3; + goto err_parse_data; ext = nft_set_elem_ext(set, elem.priv); if (flags) @@ -4616,7 +5054,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) - goto err4; + goto err_trans; ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; err = set->ops->insert(ctx->net, set, &elem, &ext2); @@ -4627,7 +5065,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) { err = -EBUSY; - goto err5; + goto err_element_clash; } if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && @@ -4640,33 +5078,35 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, else if (!(nlmsg_flags & NLM_F_EXCL)) err = 0; } - goto err5; + goto err_element_clash; } if (set->size && !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) { err = -ENFILE; - goto err6; + goto err_set_full; } nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err6: +err_set_full: set->ops->remove(ctx->net, set, &elem); -err5: +err_element_clash: kfree(trans); -err4: +err_trans: if (obj) obj->use--; kfree(elem.priv); -err3: +err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_release(&data, d2.type); -err2: - nft_data_release(&elem.key.val, d1.type); -err1: + nft_data_release(&data, desc.type); +err_parse_key_end: + nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); +err_parse_key: + nft_data_release(&elem.key.val, NFT_DATA_VALUE); + return err; } @@ -4761,7 +5201,6 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, { struct nlattr 
*nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_ext_tmpl tmpl; - struct nft_data_desc desc; struct nft_set_elem elem; struct nft_set_ext *ext; struct nft_trans *trans; @@ -4772,11 +5211,10 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); if (err < 0) - goto err1; + return err; - err = -EINVAL; if (nla[NFTA_SET_ELEM_KEY] == NULL) - goto err1; + return -EINVAL; nft_set_ext_prepare(&tmpl); @@ -4786,37 +5224,41 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (flags != 0) nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) - goto err1; + return err; - err = -EINVAL; - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) - goto err2; + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len); + if (nla[NFTA_SET_ELEM_KEY_END]) { + err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, + nla[NFTA_SET_ELEM_KEY_END]); + if (err < 0) + return err; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); + } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, - 0, GFP_KERNEL); + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, + elem.key_end.val.data, NULL, 0, 0, + GFP_KERNEL); if (elem.priv == NULL) - goto err2; + goto fail_elem; ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); - if (trans == NULL) { - err = -ENOMEM; - goto err3; - } + if (trans == NULL) + goto fail_trans; priv = set->ops->deactivate(ctx->net, set, &elem); if (priv == NULL) { err = -ENOENT; - goto err4; + goto fail_ops; } kfree(elem.priv); elem.priv = priv; @@ -4827,13 +5269,12 @@ 
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err4: +fail_ops: kfree(trans); -err3: +fail_trans: kfree(elem.priv); -err2: - nft_data_release(&elem.key.val, desc.type); -err1: +fail_elem: + nft_data_release(&elem.key.val, NFT_DATA_VALUE); return err; } @@ -5123,14 +5564,45 @@ nft_obj_type_get(struct net *net, u32 objtype) lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nft-obj-%u", objtype); - if (__nft_obj_type_get(objtype)) + if (nft_request_module(net, "nft-obj-%u", objtype) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif return ERR_PTR(-ENOENT); } +static int nf_tables_updobj(const struct nft_ctx *ctx, + const struct nft_object_type *type, + const struct nlattr *attr, + struct nft_object *obj) +{ + struct nft_object *newobj; + struct nft_trans *trans; + int err; + + trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ, + sizeof(struct nft_trans_obj)); + if (!trans) + return -ENOMEM; + + newobj = nft_obj_init(ctx, type, attr); + if (IS_ERR(newobj)) { + err = PTR_ERR(newobj); + goto err_free_trans; + } + + nft_trans_obj(trans) = obj; + nft_trans_obj_update(trans) = true; + nft_trans_obj_newobj(trans) = newobj; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; + +err_free_trans: + kfree(trans); + return err; +} + static int nf_tables_newobj(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -5170,7 +5642,13 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return -EEXIST; } - return 0; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + type = __nft_obj_type_get(objtype); + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + + return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj); } nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); @@ 
-5538,6 +6016,7 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { .len = NFT_NAME_MAXLEN - 1 }, [NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED }, [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, + [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 }, }; struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, @@ -5554,6 +6033,22 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nft_flowtable_lookup); +void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, + struct nft_flowtable *flowtable, + enum nft_trans_phase phase) +{ + switch (phase) { + case NFT_TRANS_PREPARE: + case NFT_TRANS_ABORT: + case NFT_TRANS_RELEASE: + flowtable->use--; + /* fall through */ + default: + return; + } +} +EXPORT_SYMBOL_GPL(nf_tables_deactivate_flowtable); + static struct nft_flowtable * nft_flowtable_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u8 genmask) @@ -5568,43 +6063,6 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table, return ERR_PTR(-ENOENT); } -static int nf_tables_parse_devices(const struct nft_ctx *ctx, - const struct nlattr *attr, - struct net_device *dev_array[], int *len) -{ - const struct nlattr *tmp; - struct net_device *dev; - char ifname[IFNAMSIZ]; - int rem, n = 0, err; - - nla_for_each_nested(tmp, attr, rem) { - if (nla_type(tmp) != NFTA_DEVICE_NAME) { - err = -EINVAL; - goto err1; - } - - nla_strlcpy(ifname, tmp, IFNAMSIZ); - dev = __dev_get_by_name(ctx->net, ifname); - if (!dev) { - err = -ENOENT; - goto err1; - } - - dev_array[n++] = dev; - if (n == NFT_FLOWTABLE_DEVICE_MAX) { - err = -EFBIG; - goto err1; - } - } - if (!len) - return -EINVAL; - - err = 0; -err1: - *len = n; - return err; -} - static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, @@ -5615,11 +6073,10 @@ static int 
nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, const struct nlattr *attr, struct nft_flowtable *flowtable) { - struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX]; struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; - struct nf_hook_ops *ops; + struct nft_hook *hook; int hooknum, priority; - int err, n = 0, i; + int err; err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, nft_flowtable_hook_policy, NULL); @@ -5637,27 +6094,21 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); - err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS], - dev_array, &n); + err = nf_tables_parse_netdev_hooks(ctx->net, + tb[NFTA_FLOWTABLE_HOOK_DEVS], + &flowtable->hook_list); if (err < 0) return err; - ops = kcalloc(n, sizeof(struct nf_hook_ops), GFP_KERNEL); - if (!ops) - return -ENOMEM; - - flowtable->hooknum = hooknum; - flowtable->priority = priority; - flowtable->ops = ops; - flowtable->ops_len = n; + flowtable->hooknum = hooknum; + flowtable->data.priority = priority; - for (i = 0; i < n; i++) { - flowtable->ops[i].pf = NFPROTO_NETDEV; - flowtable->ops[i].hooknum = hooknum; - flowtable->ops[i].priority = priority; - flowtable->ops[i].priv = &flowtable->data; - flowtable->ops[i].hook = flowtable->data.type->hook; - flowtable->ops[i].dev = dev_array[i]; + list_for_each_entry(hook, &flowtable->hook_list, list) { + hook->ops.pf = NFPROTO_NETDEV; + hook->ops.hooknum = hooknum; + hook->ops.priority = priority; + hook->ops.priv = &flowtable->data; + hook->ops.hook = flowtable->data.type->hook; } return err; @@ -5686,25 +6137,81 @@ nft_flowtable_type_get(struct net *net, u8 family) lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nf-flowtable-%u", family); - if (__nft_flowtable_type_get(family)) + if (nft_request_module(net, "nf-flowtable-%u", family) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif return 
ERR_PTR(-ENOENT); } +/* Only called from error and netdev event paths. */ +static void nft_unregister_flowtable_hook(struct net *net, + struct nft_flowtable *flowtable, + struct nft_hook *hook) +{ + nf_unregister_net_hook(net, &hook->ops); + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); +} + static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable) { - int i; + struct nft_hook *hook; - for (i = 0; i < flowtable->ops_len; i++) { - if (!flowtable->ops[i].dev) - continue; + list_for_each_entry(hook, &flowtable->hook_list, list) + nf_unregister_net_hook(net, &hook->ops); +} - nf_unregister_net_hook(net, &flowtable->ops[i]); +static int nft_register_flowtable_net_hooks(struct net *net, + struct nft_table *table, + struct nft_flowtable *flowtable) +{ + struct nft_hook *hook, *hook2, *next; + struct nft_flowtable *ft; + int err, i = 0; + + list_for_each_entry(hook, &flowtable->hook_list, list) { + list_for_each_entry(ft, &table->flowtables, list) { + list_for_each_entry(hook2, &ft->hook_list, list) { + if (hook->ops.dev == hook2->ops.dev && + hook->ops.pf == hook2->ops.pf) { + err = -EBUSY; + goto err_unregister_net_hooks; + } + } + } + + err = flowtable->data.type->setup(&flowtable->data, + hook->ops.dev, + FLOW_BLOCK_BIND); + if (err < 0) + goto err_unregister_net_hooks; + + err = nf_register_net_hook(net, &hook->ops); + if (err < 0) { + flowtable->data.type->setup(&flowtable->data, + hook->ops.dev, + FLOW_BLOCK_UNBIND); + goto err_unregister_net_hooks; + } + + i++; } + + return 0; + +err_unregister_net_hooks: + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + if (i-- <= 0) + break; + + nft_unregister_flowtable_hook(net, flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } + + return err; } static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, @@ -5715,12 +6222,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, { 
const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nf_flowtable_type *type; - struct nft_flowtable *flowtable, *ft; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + struct nft_flowtable *flowtable; + struct nft_hook *hook, *next; struct nft_table *table; struct nft_ctx ctx; - int err, i, k; + int err; if (!nla[NFTA_FLOWTABLE_TABLE] || !nla[NFTA_FLOWTABLE_NAME] || @@ -5759,6 +6267,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, flowtable->table = table; flowtable->handle = nf_tables_alloc_handle(table); + INIT_LIST_HEAD(&flowtable->hook_list); flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL); if (!flowtable->name) { @@ -5772,6 +6281,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, goto err2; } + if (nla[NFTA_FLOWTABLE_FLAGS]) { + flowtable->data.flags = + ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); + if (flowtable->data.flags & ~NF_FLOWTABLE_HW_OFFLOAD) + goto err3; + } + + write_pnet(&flowtable->data.net, net); flowtable->data.type = type; err = type->init(&flowtable->data); if (err < 0) @@ -5782,43 +6299,24 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (err < 0) goto err4; - for (i = 0; i < flowtable->ops_len; i++) { - if (!flowtable->ops[i].dev) - continue; - - list_for_each_entry(ft, &table->flowtables, list) { - for (k = 0; k < ft->ops_len; k++) { - if (!ft->ops[k].dev) - continue; - - if (flowtable->ops[i].dev == ft->ops[k].dev && - flowtable->ops[i].pf == ft->ops[k].pf) { - err = -EBUSY; - goto err5; - } - } - } - - err = nf_register_net_hook(net, &flowtable->ops[i]); - if (err < 0) - goto err5; - } + err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); + if (err < 0) + goto err4; err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err6; + goto err5; list_add_tail_rcu(&flowtable->list, &table->flowtables); table->use++; return 0; -err6: - i = flowtable->ops_len; err5: - 
for (k = i - 1; k >= 0; k--) - nf_unregister_net_hook(net, &flowtable->ops[k]); - - kfree(flowtable->ops); + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + nft_unregister_flowtable_hook(net, flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } err4: flowtable->data.type->free(&flowtable->data); err3: @@ -5885,8 +6383,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, { struct nlattr *nest, *nest_devs; struct nfgenmsg *nfmsg; + struct nft_hook *hook; struct nlmsghdr *nlh; - int i; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); @@ -5902,25 +6400,23 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle), - NFTA_FLOWTABLE_PAD)) + NFTA_FLOWTABLE_PAD) || + nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags))) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK); if (!nest) goto nla_put_failure; if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || - nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority))) + nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->data.priority))) goto nla_put_failure; nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS); if (!nest_devs) goto nla_put_failure; - for (i = 0; i < flowtable->ops_len; i++) { - const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev); - - if (dev && - nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) + list_for_each_entry_rcu(hook, &flowtable->hook_list, list) { + if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -6111,9 +6607,16 @@ err: static void 
nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { - kfree(flowtable->ops); - kfree(flowtable->name); + struct nft_hook *hook, *next; + flowtable->data.type->free(&flowtable->data); + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); + list_del_rcu(&hook->list); + kfree(hook); + } + kfree(flowtable->name); module_put(flowtable->data.type->owner); kfree(flowtable); } @@ -6151,14 +6654,16 @@ nla_put_failure: static void nft_flowtable_event(unsigned long event, struct net_device *dev, struct nft_flowtable *flowtable) { - int i; + struct nft_hook *hook; - for (i = 0; i < flowtable->ops_len; i++) { - if (flowtable->ops[i].dev != dev) + list_for_each_entry(hook, &flowtable->hook_list, list) { + if (hook->ops.dev != dev) continue; - nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]); - flowtable->ops[i].dev = NULL; + /* flow_offload_netdev_event() cleans up entries for us. */ + nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); break; } } @@ -6431,6 +6936,20 @@ static void nft_chain_commit_update(struct nft_trans *trans) } } +static void nft_obj_commit_update(struct nft_trans *trans) +{ + struct nft_object *newobj; + struct nft_object *obj; + + obj = nft_trans_obj(trans); + newobj = nft_trans_obj_newobj(trans); + + if (obj->ops->update) + obj->ops->update(obj, newobj); + + kfree(newobj); +} + static void nft_commit_release(struct nft_trans *trans) { switch (trans->msg_type) { @@ -6620,6 +7139,18 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nf_tables_module_autoload_cleanup(struct net *net) +{ + struct nft_module_request *req, *next; + + WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); + list_for_each_entry_safe(req, next, &net->nft.module_list, list) { + WARN_ON_ONCE(!req->done); + list_del(&req->list); + kfree(req); + } +} + static void 
nf_tables_commit_release(struct net *net) { struct nft_trans *trans; @@ -6632,6 +7163,7 @@ static void nf_tables_commit_release(struct net *net) * to prevent expensive synchronize_rcu() in commit phase. */ if (list_empty(&net->nft.commit_list)) { + nf_tables_module_autoload_cleanup(net); mutex_unlock(&net->nft.commit_mutex); return; } @@ -6646,6 +7178,7 @@ static void nf_tables_commit_release(struct net *net) list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list); spin_unlock(&nf_tables_destroy_list_lock); + nf_tables_module_autoload_cleanup(net); mutex_unlock(&net->nft.commit_mutex); schedule_work(&trans_destroy_work); @@ -6795,10 +7328,18 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) te->set->ndeact--; break; case NFT_MSG_NEWOBJ: - nft_clear(net, nft_trans_obj(trans)); - nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), - NFT_MSG_NEWOBJ); - nft_trans_destroy(trans); + if (nft_trans_obj_update(trans)) { + nft_obj_commit_update(trans); + nf_tables_obj_notify(&trans->ctx, + nft_trans_obj(trans), + NFT_MSG_NEWOBJ); + } else { + nft_clear(net, nft_trans_obj(trans)); + nf_tables_obj_notify(&trans->ctx, + nft_trans_obj(trans), + NFT_MSG_NEWOBJ); + nft_trans_destroy(trans); + } break; case NFT_MSG_DELOBJ: nft_obj_del(nft_trans_obj(trans)); @@ -6829,6 +7370,26 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return 0; } +static void nf_tables_module_autoload(struct net *net) +{ + struct nft_module_request *req, *next; + LIST_HEAD(module_list); + + list_splice_init(&net->nft.module_list, &module_list); + mutex_unlock(&net->nft.commit_mutex); + list_for_each_entry_safe(req, next, &module_list, list) { + if (req->done) { + list_del(&req->list); + kfree(req); + } else { + request_module("%s", req->module); + req->done = true; + } + } + mutex_lock(&net->nft.commit_mutex); + list_splice(&module_list, &net->nft.module_list); +} + static void nf_tables_abort_release(struct nft_trans *trans) { switch (trans->msg_type) 
{ @@ -6858,7 +7419,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } -static int __nf_tables_abort(struct net *net) +static int __nf_tables_abort(struct net *net, bool autoload) { struct nft_trans *trans, *next; struct nft_trans_elem *te; @@ -6945,8 +7506,13 @@ static int __nf_tables_abort(struct net *net) nft_trans_destroy(trans); break; case NFT_MSG_NEWOBJ: - trans->ctx.table->use--; - nft_obj_del(nft_trans_obj(trans)); + if (nft_trans_obj_update(trans)) { + kfree(nft_trans_obj_newobj(trans)); + nft_trans_destroy(trans); + } else { + trans->ctx.table->use--; + nft_obj_del(nft_trans_obj(trans)); + } break; case NFT_MSG_DELOBJ: trans->ctx.table->use++; @@ -6975,6 +7541,11 @@ static int __nf_tables_abort(struct net *net) nf_tables_abort_release(trans); } + if (autoload) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + return 0; } @@ -6983,9 +7554,9 @@ static void nf_tables_cleanup(struct net *net) nft_validate_state_update(net, NFT_VALIDATE_SKIP); } -static int nf_tables_abort(struct net *net, struct sk_buff *skb) +static int nf_tables_abort(struct net *net, struct sk_buff *skb, bool autoload) { - int ret = __nf_tables_abort(net); + int ret = __nf_tables_abort(net, autoload); mutex_unlock(&net->nft.commit_mutex); @@ -7235,7 +7806,7 @@ int nft_validate_register_load(enum nft_registers reg, unsigned int len) return -EINVAL; if (len == 0) return -EINVAL; - if (reg * NFT_REG32_SIZE + len > FIELD_SIZEOF(struct nft_regs, data)) + if (reg * NFT_REG32_SIZE + len > sizeof_field(struct nft_regs, data)) return -ERANGE; return 0; @@ -7283,7 +7854,7 @@ int nft_validate_register_store(const struct nft_ctx *ctx, if (len == 0) return -EINVAL; if (reg * NFT_REG32_SIZE + len > - FIELD_SIZEOF(struct nft_regs, data)) + sizeof_field(struct nft_regs, data)) return -ERANGE; if (data != NULL && type != NFT_DATA_VALUE) @@ -7580,6 +8151,7 @@ static int __net_init nf_tables_init_net(struct net *net) { 
INIT_LIST_HEAD(&net->nft.tables); INIT_LIST_HEAD(&net->nft.commit_list); + INIT_LIST_HEAD(&net->nft.module_list); mutex_init(&net->nft.commit_mutex); net->nft.base_seq = 1; net->nft.validate_state = NFT_VALIDATE_SKIP; @@ -7591,7 +8163,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) { mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) - __nf_tables_abort(net); + __nf_tables_abort(net, false); __nft_release_tables(net); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); @@ -7627,13 +8199,20 @@ static int __init nf_tables_module_init(void) if (err < 0) goto err4; + err = nft_offload_init(); + if (err < 0) + goto err5; + /* must be last */ err = nfnetlink_subsys_register(&nf_tables_subsys); if (err < 0) - goto err5; + goto err6; nft_chain_route_init(); + return err; +err6: + nft_offload_exit(); err5: rhltable_destroy(&nft_objname_ht); err4: @@ -7650,6 +8229,7 @@ err1: static void __exit nf_tables_module_exit(void) { nfnetlink_subsys_unregister(&nf_tables_subsys); + nft_offload_exit(); unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); nft_chain_route_fini(); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index c0d18c1d77ac..2bb28483af22 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -28,13 +28,10 @@ static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) return flow; } -struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule) +struct nft_flow_rule *nft_flow_rule_create(struct net *net, + const struct nft_rule *rule) { - struct nft_offload_ctx ctx = { - .dep = { - .type = NFT_OFFLOAD_DEP_UNSPEC, - }, - }; + struct nft_offload_ctx *ctx; struct nft_flow_rule *flow; int num_actions = 0, err; struct nft_expr *expr; @@ -47,26 +44,40 @@ struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule) expr = nft_expr_next(expr); } + if (num_actions == 0) + 
return ERR_PTR(-EOPNOTSUPP); + flow = nft_flow_rule_alloc(num_actions); if (!flow) return ERR_PTR(-ENOMEM); expr = nft_expr_first(rule); + + ctx = kzalloc(sizeof(struct nft_offload_ctx), GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + ctx->net = net; + ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; + while (expr->ops && expr != nft_expr_last(rule)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; } - err = expr->ops->offload(&ctx, flow, expr); + err = expr->ops->offload(ctx, flow, expr); if (err < 0) goto err_out; expr = nft_expr_next(expr); } - flow->proto = ctx.dep.l3num; + flow->proto = ctx->dep.l3num; + kfree(ctx); return flow; err_out: + kfree(ctx); nft_flow_rule_destroy(flow); return ERR_PTR(err); @@ -74,6 +85,19 @@ err_out: void nft_flow_rule_destroy(struct nft_flow_rule *flow) { + struct flow_action_entry *entry; + int i; + + flow_action_for_each(i, entry, &flow->rule->action) { + switch (entry->id) { + case FLOW_ACTION_REDIRECT: + case FLOW_ACTION_MIRRED: + dev_put(entry->dev); + break; + default: + break; + } + } kfree(flow->rule); kfree(flow); } @@ -111,13 +135,13 @@ static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, common->extack = extack; } -static int nft_setup_cb_call(struct nft_base_chain *basechain, - enum tc_setup_type type, void *type_data) +static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, + struct list_head *cb_list) { struct flow_block_cb *block_cb; int err; - list_for_each_entry(block_cb, &basechain->flow_block.cb_list, list) { + list_for_each_entry(block_cb, cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; @@ -134,32 +158,46 @@ int nft_chain_offload_priority(struct nft_base_chain *basechain) return 0; } -static int nft_flow_offload_rule(struct nft_trans *trans, - enum flow_cls_command command) +static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, + const struct nft_base_chain *basechain, + const 
struct nft_rule *rule, + const struct nft_flow_rule *flow, + struct netlink_ext_ack *extack, + enum flow_cls_command command) { - struct nft_flow_rule *flow = nft_trans_flow_rule(trans); - struct nft_rule *rule = nft_trans_rule(trans); - struct flow_cls_offload cls_flow = {}; - struct nft_base_chain *basechain; - struct netlink_ext_ack extack; __be16 proto = ETH_P_ALL; - if (!nft_is_base_chain(trans->ctx.chain)) - return -EOPNOTSUPP; - - basechain = nft_base_chain(trans->ctx.chain); + memset(cls_flow, 0, sizeof(*cls_flow)); if (flow) proto = flow->proto; - nft_flow_offload_common_init(&cls_flow.common, proto, - basechain->ops.priority, &extack); - cls_flow.command = command; - cls_flow.cookie = (unsigned long) rule; + nft_flow_offload_common_init(&cls_flow->common, proto, + basechain->ops.priority, extack); + cls_flow->command = command; + cls_flow->cookie = (unsigned long) rule; if (flow) - cls_flow.rule = flow->rule; + cls_flow->rule = flow->rule; +} + +static int nft_flow_offload_rule(struct nft_chain *chain, + struct nft_rule *rule, + struct nft_flow_rule *flow, + enum flow_cls_command command) +{ + struct netlink_ext_ack extack = {}; + struct flow_cls_offload cls_flow; + struct nft_base_chain *basechain; - return nft_setup_cb_call(basechain, TC_SETUP_CLSFLOWER, &cls_flow); + if (!nft_is_base_chain(chain)) + return -EOPNOTSUPP; + + basechain = nft_base_chain(chain); + nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, &extack, + command); + + return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, + &basechain->flow_block.cb_list); } static int nft_flow_offload_bind(struct flow_block_offload *bo, @@ -173,6 +211,18 @@ static int nft_flow_offload_unbind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { struct flow_block_cb *block_cb, *next; + struct flow_cls_offload cls_flow; + struct netlink_ext_ack extack; + struct nft_chain *chain; + struct nft_rule *rule; + + chain = &basechain->chain; + list_for_each_entry(rule, &chain->rules, 
list) { + memset(&extack, 0, sizeof(extack)); + nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, + &extack, FLOW_CLS_DESTROY); + nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); + } list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); @@ -182,58 +232,220 @@ static int nft_flow_offload_unbind(struct flow_block_offload *bo, return 0; } +static int nft_block_setup(struct nft_base_chain *basechain, + struct flow_block_offload *bo, + enum flow_block_command cmd) +{ + int err; + + switch (cmd) { + case FLOW_BLOCK_BIND: + err = nft_flow_offload_bind(bo, basechain); + break; + case FLOW_BLOCK_UNBIND: + err = nft_flow_offload_unbind(bo, basechain); + break; + default: + WARN_ON_ONCE(1); + err = -EOPNOTSUPP; + } + + return err; +} + +static void nft_flow_block_offload_init(struct flow_block_offload *bo, + struct net *net, + enum flow_block_command cmd, + struct nft_base_chain *basechain, + struct netlink_ext_ack *extack) +{ + memset(bo, 0, sizeof(*bo)); + bo->net = net; + bo->block = &basechain->flow_block; + bo->command = cmd; + bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); +} + +static int nft_block_offload_cmd(struct nft_base_chain *chain, + struct net_device *dev, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; + int err; + + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + if (err < 0) + return err; + + return nft_block_setup(chain, &bo, cmd); +} + +static void nft_indr_block_ing_cmd(struct net_device *dev, + struct nft_base_chain *chain, + flow_indr_block_bind_cb_t *cb, + void *cb_priv, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; + + if (!chain) + return; + + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); + + cb(dev, 
cb_priv, TC_SETUP_BLOCK, &bo); + + nft_block_setup(chain, &bo, cmd); +} + +static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, + struct net_device *dev, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; + + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); + + flow_indr_block_call(dev, &bo, cmd); + + if (list_empty(&bo.cb_list)) + return -EOPNOTSUPP; + + return nft_block_setup(chain, &bo, cmd); +} + #define FLOW_SETUP_BLOCK TC_SETUP_BLOCK -static int nft_flow_offload_chain(struct nft_trans *trans, +static int nft_chain_offload_cmd(struct nft_base_chain *basechain, + struct net_device *dev, + enum flow_block_command cmd) +{ + int err; + + if (dev->netdev_ops->ndo_setup_tc) + err = nft_block_offload_cmd(basechain, dev, cmd); + else + err = nft_indr_block_offload_cmd(basechain, dev, cmd); + + return err; +} + +static int nft_flow_block_chain(struct nft_base_chain *basechain, + const struct net_device *this_dev, + enum flow_block_command cmd) +{ + struct net_device *dev; + struct nft_hook *hook; + int err, i = 0; + + list_for_each_entry(hook, &basechain->hook_list, list) { + dev = hook->ops.dev; + if (this_dev && this_dev != dev) + continue; + + err = nft_chain_offload_cmd(basechain, dev, cmd); + if (err < 0 && cmd == FLOW_BLOCK_BIND) { + if (!this_dev) + goto err_flow_block; + + return err; + } + i++; + } + + return 0; + +err_flow_block: + list_for_each_entry(hook, &basechain->hook_list, list) { + if (i-- <= 0) + break; + + dev = hook->ops.dev; + nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); + } + return err; +} + +static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, enum flow_block_command cmd) { - struct nft_chain *chain = trans->ctx.chain; - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo = {}; struct nft_base_chain *basechain; - struct net_device *dev; - int err; + u8 policy; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; 
basechain = nft_base_chain(chain); - dev = basechain->ops.dev; - if (!dev || !dev->netdev_ops->ndo_setup_tc) - return -EOPNOTSUPP; + policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ - if (cmd == FLOW_BLOCK_BIND && - nft_trans_chain_policy(trans) != -1 && - nft_trans_chain_policy(trans) != NF_ACCEPT) + if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; - bo.command = cmd; - bo.block = &basechain->flow_block; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + return nft_flow_block_chain(basechain, NULL, cmd); +} - err = dev->netdev_ops->ndo_setup_tc(dev, FLOW_SETUP_BLOCK, &bo); - if (err < 0) - return err; +static void nft_flow_rule_offload_abort(struct net *net, + struct nft_trans *trans) +{ + int err = 0; - switch (cmd) { - case FLOW_BLOCK_BIND: - err = nft_flow_offload_bind(&bo, basechain); - break; - case FLOW_BLOCK_UNBIND: - err = nft_flow_offload_unbind(&bo, basechain); - break; - } + list_for_each_entry_continue_reverse(trans, &net->nft.commit_list, list) { + if (trans->ctx.family != NFPROTO_NETDEV) + continue; - return err; + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + nft_trans_chain_update(trans)) + continue; + + err = nft_flow_offload_chain(trans->ctx.chain, NULL, + FLOW_BLOCK_UNBIND); + break; + case NFT_MSG_DELCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_chain(trans->ctx.chain, NULL, + FLOW_BLOCK_BIND); + break; + case NFT_MSG_NEWRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_rule(trans->ctx.chain, + nft_trans_rule(trans), + NULL, FLOW_CLS_DESTROY); + break; + case NFT_MSG_DELRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_rule(trans->ctx.chain, + nft_trans_rule(trans), + 
nft_trans_flow_rule(trans), + FLOW_CLS_REPLACE); + break; + } + + if (WARN_ON_ONCE(err)) + break; + } } int nft_flow_rule_offload_commit(struct net *net) { struct nft_trans *trans; int err = 0; + u8 policy; list_for_each_entry(trans, &net->nft.commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) @@ -241,39 +453,171 @@ int nft_flow_rule_offload_commit(struct net *net) switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || + nft_trans_chain_update(trans)) continue; - err = nft_flow_offload_chain(trans, FLOW_BLOCK_BIND); + policy = nft_trans_chain_policy(trans); + err = nft_flow_offload_chain(trans->ctx.chain, &policy, + FLOW_BLOCK_BIND); break; case NFT_MSG_DELCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_chain(trans, FLOW_BLOCK_UNBIND); + policy = nft_trans_chain_policy(trans); + err = nft_flow_offload_chain(trans->ctx.chain, &policy, + FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; if (trans->ctx.flags & NLM_F_REPLACE || - !(trans->ctx.flags & NLM_F_APPEND)) - return -EOPNOTSUPP; - - err = nft_flow_offload_rule(trans, FLOW_CLS_REPLACE); - nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + !(trans->ctx.flags & NLM_F_APPEND)) { + err = -EOPNOTSUPP; + break; + } + err = nft_flow_offload_rule(trans->ctx.chain, + nft_trans_rule(trans), + nft_trans_flow_rule(trans), + FLOW_CLS_REPLACE); break; case NFT_MSG_DELRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; - err = nft_flow_offload_rule(trans, FLOW_CLS_DESTROY); + err = nft_flow_offload_rule(trans->ctx.chain, + nft_trans_rule(trans), + NULL, FLOW_CLS_DESTROY); break; } - if (err) - return err; + if (err) { + nft_flow_rule_offload_abort(net, trans); + break; + } + } + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->ctx.family != 
NFPROTO_NETDEV) + continue; + + switch (trans->msg_type) { + case NFT_MSG_NEWRULE: + case NFT_MSG_DELRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + break; + default: + break; + } } return err; } + +static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) +{ + struct nft_base_chain *basechain; + struct net *net = dev_net(dev); + struct nft_hook *hook, *found; + const struct nft_table *table; + struct nft_chain *chain; + + list_for_each_entry(table, &net->nft.tables, list) { + if (table->family != NFPROTO_NETDEV) + continue; + + list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_base_chain(chain) || + !(chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + found = NULL; + basechain = nft_base_chain(chain); + list_for_each_entry(hook, &basechain->hook_list, list) { + if (hook->ops.dev != dev) + continue; + + found = hook; + break; + } + if (!found) + continue; + + return chain; + } + } + + return NULL; +} + +static void nft_indr_block_cb(struct net_device *dev, + flow_indr_block_bind_cb_t *cb, void *cb_priv, + enum flow_block_command cmd) +{ + struct net *net = dev_net(dev); + struct nft_chain *chain; + + mutex_lock(&net->nft.commit_mutex); + chain = __nft_offload_get_chain(dev); + if (chain && chain->flags & NFT_CHAIN_HW_OFFLOAD) { + struct nft_base_chain *basechain; + + basechain = nft_base_chain(chain); + nft_indr_block_ing_cmd(dev, basechain, cb, cb_priv, cmd); + } + mutex_unlock(&net->nft.commit_mutex); +} + +static int nft_offload_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct nft_chain *chain; + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + + mutex_lock(&net->nft.commit_mutex); + chain = __nft_offload_get_chain(dev); + if (chain) + nft_flow_block_chain(nft_base_chain(chain), dev, + FLOW_BLOCK_UNBIND); + 
+ mutex_unlock(&net->nft.commit_mutex); + + return NOTIFY_DONE; +} + +static struct flow_indr_block_entry block_ing_entry = { + .cb = nft_indr_block_cb, + .list = LIST_HEAD_INIT(block_ing_entry.list), +}; + +static struct notifier_block nft_offload_netdev_notifier = { + .notifier_call = nft_offload_netdev_event, +}; + +int nft_offload_init(void) +{ + int err; + + err = register_netdevice_notifier(&nft_offload_netdev_notifier); + if (err < 0) + return err; + + flow_indr_add_block_cb(&block_ing_entry); + + return 0; +} + +void nft_offload_exit(void) +{ + flow_indr_del_block_cb(&block_ing_entry); + unregister_netdevice_notifier(&nft_offload_netdev_notifier); +} diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c index a9fce8d10051..586b621007eb 100644 --- a/net/netfilter/nf_tables_set_core.c +++ b/net/netfilter/nf_tables_set_core.c @@ -9,12 +9,14 @@ static int __init nf_tables_set_module_init(void) nft_register_set(&nft_set_rhash_type); nft_register_set(&nft_set_bitmap_type); nft_register_set(&nft_set_rbtree_type); + nft_register_set(&nft_set_pipapo_type); return 0; } static void __exit nf_tables_set_module_exit(void) { + nft_unregister_set(&nft_set_pipapo_type); nft_unregister_set(&nft_set_rbtree_type); nft_unregister_set(&nft_set_bitmap_type); nft_unregister_set(&nft_set_rhash_type); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4abbb452cf6c..99127e2d95a8 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -476,7 +476,7 @@ ack: } done: if (status & NFNL_BATCH_REPLAY) { - ss->abort(net, oskb); + ss->abort(net, oskb, true); nfnl_err_reset(&err_list); kfree_skb(skb); module_put(ss->owner); @@ -487,11 +487,11 @@ done: status |= NFNL_BATCH_REPLAY; goto done; } else if (err) { - ss->abort(net, oskb); + ss->abort(net, oskb, false); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); } } else { - ss->abort(net, oskb); + ss->abort(net, oskb, false); } if (ss->cleanup) ss->cleanup(net); diff --git 
a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 7525063c25f5..de3a9596b7f1 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -236,7 +236,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[], nla_strlcpy(helper->name, tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN); size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); - if (size > FIELD_SIZEOF(struct nf_conn_help, data)) { + if (size > sizeof_field(struct nf_conn_help, data)) { ret = -ENOMEM; goto err2; } diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 6dee4f9a944c..0ba020ca38e6 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -385,6 +385,57 @@ nfulnl_timer(struct timer_list *t) instance_put(inst); } +static u32 nfulnl_get_bridge_size(const struct sk_buff *skb) +{ + u32 size = 0; + + if (!skb_mac_header_was_set(skb)) + return 0; + + if (skb_vlan_tag_present(skb)) { + size += nla_total_size(0); /* nested */ + size += nla_total_size(sizeof(u16)); /* id */ + size += nla_total_size(sizeof(u16)); /* tag */ + } + + if (skb->network_header > skb->mac_header) + size += nla_total_size(skb->network_header - skb->mac_header); + + return size; +} + +static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff *skb) +{ + if (!skb_mac_header_was_set(skb)) + return 0; + + if (skb_vlan_tag_present(skb)) { + struct nlattr *nest; + + nest = nla_nest_start(inst->skb, NFULA_VLAN); + if (!nest) + goto nla_put_failure; + + if (nla_put_be16(inst->skb, NFULA_VLAN_TCI, htons(skb->vlan_tci)) || + nla_put_be16(inst->skb, NFULA_VLAN_PROTO, skb->vlan_proto)) + goto nla_put_failure; + + nla_nest_end(inst->skb, nest); + } + + if (skb->mac_header < skb->network_header) { + int len = (int)(skb->network_header - skb->mac_header); + + if (nla_put(inst->skb, NFULA_L2HDR, len, skb_mac_header(skb))) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -1; +} + /* This is an inline 
function, we don't really care about a long * list of arguments */ static inline int @@ -580,6 +631,10 @@ __build_packet_message(struct nfnl_log_net *log, NFULA_CT, NFULA_CT_INFO) < 0) goto nla_put_failure; + if ((pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) && + nfulnl_put_bridge(inst, skb) < 0) + goto nla_put_failure; + if (data_len) { struct nlattr *nla; int size = nla_attr_size(data_len); @@ -651,7 +706,7 @@ nfulnl_log_packet(struct net *net, /* FIXME: do we want to make the size calculation conditional based on * what is actually present? way more branches and checks, but more * memory efficient... */ - size = nlmsg_total_size(sizeof(struct nfgenmsg)) + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ @@ -668,7 +723,7 @@ nfulnl_log_packet(struct net *net, + nla_total_size(sizeof(struct nfgenmsg)); /* NLMSG_DONE */ if (in && skb_mac_header_was_set(skb)) { - size += nla_total_size(skb->dev->hard_header_len) + size += nla_total_size(skb->dev->hard_header_len) + nla_total_size(sizeof(u_int16_t)) /* hwtype */ + nla_total_size(sizeof(u_int16_t)); /* hwlen */ } @@ -687,6 +742,8 @@ nfulnl_log_packet(struct net *net, size += nfnl_ct->build_size(ct); } } + if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) + size += nfulnl_get_bridge_size(skb); qthreshold = inst->qthreshold; /* per-rule qthreshold overrides per-instance */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index b6a7ce622c72..76535fd9278c 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -394,7 +394,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, char *secdata = NULL; u32 seclen = 0; - size = nlmsg_total_size(sizeof(struct nfgenmsg)) + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + 
nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ @@ -453,7 +453,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } if (queue->flags & NFQA_CFG_F_UID_GID) { - size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + nla_total_size(sizeof(u_int32_t))); /* gid */ } @@ -778,7 +778,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { unsigned int queued; struct nfqnl_instance *queue; - struct sk_buff *skb, *segs; + struct sk_buff *skb, *segs, *nskb; int err = -ENOBUFS; struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); @@ -815,8 +815,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) goto out_err; queued = 0; err = 0; - do { - struct sk_buff *nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { if (err == 0) err = __nfqnl_enqueue_packet_gso(net, queue, segs, entry); @@ -824,8 +823,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) queued++; else kfree_skb(segs); - segs = nskb; - } while (segs); + } if (queued) { if (err) /* some segments are already queued */ diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index b310b637b550..0ed2281f03be 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -13,25 +13,71 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> struct nft_bitwise { enum nft_registers sreg:8; enum nft_registers dreg:8; + enum nft_bitwise_ops op:8; u8 len; struct nft_data mask; struct nft_data xor; + struct nft_data data; }; +static void nft_bitwise_eval_bool(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) +{ + unsigned int i; + + for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) + dst[i] = (src[i] & priv->mask.data[i]) ^ 
priv->xor.data[i]; +} + +static void nft_bitwise_eval_lshift(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) +{ + u32 shift = priv->data.data[0]; + unsigned int i; + u32 carry = 0; + + for (i = DIV_ROUND_UP(priv->len, sizeof(u32)); i > 0; i--) { + dst[i - 1] = (src[i - 1] << shift) | carry; + carry = src[i - 1] >> (BITS_PER_TYPE(u32) - shift); + } +} + +static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) +{ + u32 shift = priv->data.data[0]; + unsigned int i; + u32 carry = 0; + + for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++) { + dst[i] = carry | (src[i] >> shift); + carry = src[i] << (BITS_PER_TYPE(u32) - shift); + } +} + void nft_bitwise_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); const u32 *src = ®s->data[priv->sreg]; u32 *dst = ®s->data[priv->dreg]; - unsigned int i; - for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) - dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i]; + switch (priv->op) { + case NFT_BITWISE_BOOL: + nft_bitwise_eval_bool(dst, src, priv); + break; + case NFT_BITWISE_LSHIFT: + nft_bitwise_eval_lshift(dst, src, priv); + break; + case NFT_BITWISE_RSHIFT: + nft_bitwise_eval_rshift(dst, src, priv); + break; + } } static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { @@ -40,22 +86,86 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, [NFTA_BITWISE_XOR] = { .type = NLA_NESTED }, + [NFTA_BITWISE_OP] = { .type = NLA_U32 }, + [NFTA_BITWISE_DATA] = { .type = NLA_NESTED }, }; +static int nft_bitwise_init_bool(struct nft_bitwise *priv, + const struct nlattr *const tb[]) +{ + struct nft_data_desc d1, d2; + int err; + + if (tb[NFTA_BITWISE_DATA]) + return -EINVAL; + + if (!tb[NFTA_BITWISE_MASK] || + !tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + err = 
nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1, + tb[NFTA_BITWISE_MASK]); + if (err < 0) + return err; + if (d1.type != NFT_DATA_VALUE || d1.len != priv->len) { + err = -EINVAL; + goto err1; + } + + err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2, + tb[NFTA_BITWISE_XOR]); + if (err < 0) + goto err1; + if (d2.type != NFT_DATA_VALUE || d2.len != priv->len) { + err = -EINVAL; + goto err2; + } + + return 0; +err2: + nft_data_release(&priv->xor, d2.type); +err1: + nft_data_release(&priv->mask, d1.type); + return err; +} + +static int nft_bitwise_init_shift(struct nft_bitwise *priv, + const struct nlattr *const tb[]) +{ + struct nft_data_desc d; + int err; + + if (tb[NFTA_BITWISE_MASK] || + tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + if (!tb[NFTA_BITWISE_DATA]) + return -EINVAL; + + err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d, + tb[NFTA_BITWISE_DATA]); + if (err < 0) + return err; + if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) || + priv->data.data[0] >= BITS_PER_TYPE(u32)) { + nft_data_release(&priv->data, d.type); + return -EINVAL; + } + + return 0; +} + static int nft_bitwise_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_bitwise *priv = nft_expr_priv(expr); - struct nft_data_desc d1, d2; u32 len; int err; - if (tb[NFTA_BITWISE_SREG] == NULL || - tb[NFTA_BITWISE_DREG] == NULL || - tb[NFTA_BITWISE_LEN] == NULL || - tb[NFTA_BITWISE_MASK] == NULL || - tb[NFTA_BITWISE_XOR] == NULL) + if (!tb[NFTA_BITWISE_SREG] || + !tb[NFTA_BITWISE_DREG] || + !tb[NFTA_BITWISE_LEN]) return -EINVAL; err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len); @@ -75,55 +185,102 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1, - tb[NFTA_BITWISE_MASK]); - if (err < 0) - return err; - if (d1.len != priv->len) { - err = -EINVAL; - goto err1; + if (tb[NFTA_BITWISE_OP]) { + priv->op = 
ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])); + switch (priv->op) { + case NFT_BITWISE_BOOL: + case NFT_BITWISE_LSHIFT: + case NFT_BITWISE_RSHIFT: + break; + default: + return -EOPNOTSUPP; + } + } else { + priv->op = NFT_BITWISE_BOOL; } - err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2, - tb[NFTA_BITWISE_XOR]); - if (err < 0) - goto err1; - if (d2.len != priv->len) { - err = -EINVAL; - goto err2; + switch(priv->op) { + case NFT_BITWISE_BOOL: + err = nft_bitwise_init_bool(priv, tb); + break; + case NFT_BITWISE_LSHIFT: + case NFT_BITWISE_RSHIFT: + err = nft_bitwise_init_shift(priv, tb); + break; } - return 0; -err2: - nft_data_release(&priv->xor, d2.type); -err1: - nft_data_release(&priv->mask, d1.type); return err; } +static int nft_bitwise_dump_bool(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ + if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, + NFT_DATA_VALUE, priv->len) < 0) + return -1; + + if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor, + NFT_DATA_VALUE, priv->len) < 0) + return -1; + + return 0; +} + +static int nft_bitwise_dump_shift(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ + if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data, + NFT_DATA_VALUE, sizeof(u32)) < 0) + return -1; + return 0; +} + static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_bitwise *priv = nft_expr_priv(expr); + int err = 0; if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg)) - goto nla_put_failure; + return -1; if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg)) - goto nla_put_failure; + return -1; if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len))) - goto nla_put_failure; + return -1; + if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(priv->op))) + return -1; - if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, - NFT_DATA_VALUE, priv->len) < 0) - goto nla_put_failure; + switch (priv->op) { + case NFT_BITWISE_BOOL: + err = nft_bitwise_dump_bool(skb, priv); + break; + case 
NFT_BITWISE_LSHIFT: + case NFT_BITWISE_RSHIFT: + err = nft_bitwise_dump_shift(skb, priv); + break; + } - if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor, - NFT_DATA_VALUE, priv->len) < 0) - goto nla_put_failure; + return err; +} - return 0; +static struct nft_data zero; -nla_put_failure: - return -1; +static int nft_bitwise_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_bitwise *priv = nft_expr_priv(expr); + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + if (priv->op != NFT_BITWISE_BOOL) + return -EOPNOTSUPP; + + if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) || + priv->sreg != priv->dreg || priv->len != reg->len) + return -EOPNOTSUPP; + + memcpy(®->mask, &priv->mask, sizeof(priv->mask)); + + return 0; } static const struct nft_expr_ops nft_bitwise_ops = { @@ -132,6 +289,7 @@ static const struct nft_expr_ops nft_bitwise_ops = { .eval = nft_bitwise_eval, .init = nft_bitwise_init, .dump = nft_bitwise_dump, + .offload = nft_bitwise_offload, }; struct nft_expr_type nft_bitwise_type __read_mostly = { diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index e06318428ea0..12bed3f7bbc6 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -43,14 +43,15 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { - src64 = get_unaligned((u64 *)&src[i]); - put_unaligned_be64(src64, &dst[i]); + src64 = nft_reg_load64(&src[i]); + nft_reg_store64(&dst[i], be64_to_cpu(src64)); } break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 8; i++) { - src64 = get_unaligned_be64(&src[i]); - put_unaligned(src64, (u64 *)&dst[i]); + src64 = (__force __u64) + cpu_to_be64(nft_reg_load64(&src[i])); + nft_reg_store64(&dst[i], src64); } break; } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index b5d5d071d765..c78d01bc02e9 100644 --- 
a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -287,28 +287,35 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, struct nft_ctx *ctx) { struct nft_base_chain *basechain = nft_base_chain(ctx->chain); + struct nft_hook *hook, *found = NULL; + int n = 0; - switch (event) { - case NETDEV_UNREGISTER: - if (strcmp(basechain->dev_name, dev->name) != 0) - return; - - /* UNREGISTER events are also happpening on netns exit. - * - * Altough nf_tables core releases all tables/chains, only - * this event handler provides guarantee that - * basechain.ops->dev is still accessible, so we cannot - * skip exiting net namespaces. - */ - __nft_release_basechain(ctx); - break; - case NETDEV_CHANGENAME: - if (dev->ifindex != basechain->ops.dev->ifindex) - return; + if (event != NETDEV_UNREGISTER) + return; - strncpy(basechain->dev_name, dev->name, IFNAMSIZ); - break; + list_for_each_entry(hook, &basechain->hook_list, list) { + if (hook->ops.dev == dev) + found = hook; + + n++; } + if (!found) + return; + + if (n > 1) { + nf_unregister_net_hook(ctx->net, &found->ops); + list_del_rcu(&found->list); + kfree_rcu(found, rcu); + return; + } + + /* UNREGISTER events are also happening on netns exit. + * + * Although nf_tables core releases all tables/chains, only this event + * handler provides guarantee that hook->ops.dev is still accessible, + * so we cannot skip exiting net namespaces. 
+ */ + __nft_release_basechain(ctx); } static int nf_tables_netdev_event(struct notifier_block *this, diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index bd173b1824c6..8a28c127effc 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> +#include <linux/if_arp.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables_offload.h> @@ -80,6 +81,12 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (err < 0) return err; + if (desc.type != NFT_DATA_VALUE) { + err = -EINVAL; + nft_data_release(&priv->data, desc.type); + return err; + } + priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); err = nft_validate_register_load(priv->sreg, desc.len); if (err < 0) @@ -116,7 +123,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, u8 *mask = (u8 *)&flow->match.mask; u8 *key = (u8 *)&flow->match.key; - if (priv->op != NFT_CMP_EQ) + if (priv->op != NFT_CMP_EQ || reg->len != priv->len) return -EOPNOTSUPP; memcpy(key + reg->offset, &priv->data, priv->len); @@ -125,6 +132,11 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, flow->match.dissector.used_keys |= BIT(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; + if (reg->key == FLOW_DISSECTOR_KEY_META && + reg->offset == offsetof(struct nft_flow_key, meta.ingress_iftype) && + nft_reg_load16(priv->data.data) != ARPHRD_ETHER) + return -EOPNOTSUPP; + nft_offload_update_dependency(ctx, &priv->data, priv->len); return 0; diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index af1497ab9464..69d6173f91e2 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -218,8 +218,13 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct 
nft_connlimit *priv = nft_expr_priv(expr); + bool ret; - return nf_conncount_gc_list(net, &priv->list); + local_bh_disable(); + ret = nf_conncount_gc_list(net, &priv->list); + local_bh_enable(); + + return ret; } static struct nft_expr_type nft_connlimit_type; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 46ca8bcca1bd..faea72c2df32 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -440,12 +440,12 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, switch (ctx->family) { case NFPROTO_IPV4: - len = FIELD_SIZEOF(struct nf_conntrack_tuple, + len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip); break; case NFPROTO_IPV6: case NFPROTO_INET: - len = FIELD_SIZEOF(struct nf_conntrack_tuple, + len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6); break; default: @@ -457,20 +457,20 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; - len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip); + len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip); break; case NFT_CT_SRC_IP6: case NFT_CT_DST_IP6: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; - len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip6); + len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6); break; case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; - len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u.all); + len = sizeof_field(struct nf_conntrack_tuple, src.u.all); break; case NFT_CT_BYTES: case NFT_CT_PKTS: @@ -551,7 +551,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, case NFT_CT_MARK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; - len = FIELD_SIZEOF(struct nf_conn, mark); + len = sizeof_field(struct nf_conn, mark); break; #endif #ifdef CONFIG_NF_CONNTRACK_LABELS diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index c6052fdd2c40..c2e78c160fd7 100644 --- a/net/netfilter/nft_dup_netdev.c +++ 
b/net/netfilter/nft_dup_netdev.c @@ -10,6 +10,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_dup_netdev.h> struct nft_dup_netdev { @@ -56,6 +57,16 @@ nla_put_failure: return -1; } +static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_dup_netdev *priv = nft_expr_priv(expr); + int oif = ctx->regs[priv->sreg_dev].data.data[0]; + + return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif); +} + static struct nft_expr_type nft_dup_netdev_type; static const struct nft_expr_ops nft_dup_netdev_ops = { .type = &nft_dup_netdev_type, @@ -63,6 +74,7 @@ static const struct nft_expr_ops nft_dup_netdev_ops = { .eval = nft_dup_netdev_eval, .init = nft_dup_netdev_init, .dump = nft_dup_netdev_dump, + .offload = nft_dup_netdev_offload, }; static struct nft_expr_type nft_dup_netdev_type __read_mostly = { diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 33833a0cb989..683785225a3e 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -54,7 +54,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, timeout = priv->timeout ? 
: set->timeout; elem = nft_set_elem_init(set, &priv->tmpl, - &regs->data[priv->sreg_key], + &regs->data[priv->sreg_key], NULL, &regs->data[priv->sreg_data], timeout, 0, GFP_ATOMIC); if (elem == NULL) @@ -84,6 +84,11 @@ void nft_dynset_eval(const struct nft_expr *expr, const struct nft_expr *sexpr; u64 timeout; + if (priv->op == NFT_DYNSET_OP_DELETE) { + set->ops->delete(set, &regs->data[priv->sreg_key]); + return; + } + if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new, expr, regs, &ext)) { sexpr = NULL; @@ -161,6 +166,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP])); switch (priv->op) { case NFT_DYNSET_OP_ADD: + case NFT_DYNSET_OP_DELETE: break; case NFT_DYNSET_OP_UPDATE: if (!(set->flags & NFT_SET_TIMEOUT)) diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c index 2cf3f32fe6d2..a2e726ae7f07 100644 --- a/net/netfilter/nft_fib_netdev.c +++ b/net/netfilter/nft_fib_netdev.c @@ -14,6 +14,7 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> +#include <net/ipv6.h> #include <net/netfilter/nft_fib.h> @@ -34,6 +35,8 @@ static void nft_fib_netdev_eval(const struct nft_expr *expr, } break; case ETH_P_IPV6: + if (!ipv6_mod_enabled()) + break; switch (priv->result) { case NFT_FIB_RESULT_OIF: case NFT_FIB_RESULT_OIFNAME: diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 060a4ed46d5e..b70b48996801 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -6,12 +6,13 @@ #include <linux/netfilter.h> #include <linux/workqueue.h> #include <linux/spinlock.h> +#include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> #include <net/ip.h> /* for ipv4 options. 
*/ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> -#include <linux/netfilter/nf_conntrack_common.h> +#include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_flow_table.h> struct nft_flow_offload { @@ -114,10 +115,13 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (nft_flow_route(pkt, ct, &route, dir) < 0) goto err_flow_route; - flow = flow_offload_alloc(ct, &route); + flow = flow_offload_alloc(ct); if (!flow) goto err_flow_alloc; + if (flow_offload_route_init(flow, &route) < 0) + goto err_flow_add; + if (tcph) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; @@ -149,6 +153,11 @@ static int nft_flow_offload_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, hook_mask); } +static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = { + [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING, + .len = NFT_NAME_MAXLEN - 1 }, +}; + static int nft_flow_offload_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -171,12 +180,26 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } -static void nft_flow_offload_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + enum nft_trans_phase phase) +{ + struct nft_flow_offload *priv = nft_expr_priv(expr); + + nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); +} + +static void nft_flow_offload_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) { struct nft_flow_offload *priv = nft_expr_priv(expr); - priv->flowtable->use--; + priv->flowtable->use++; +} + +static void nft_flow_offload_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ nf_ct_netns_put(ctx->net, 
ctx->family); } @@ -199,6 +222,8 @@ static const struct nft_expr_ops nft_flow_offload_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), .eval = nft_flow_offload_eval, .init = nft_flow_offload_init, + .activate = nft_flow_offload_activate, + .deactivate = nft_flow_offload_deactivate, .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, @@ -207,6 +232,7 @@ static const struct nft_expr_ops nft_flow_offload_ops = { static struct nft_expr_type nft_flow_offload_type __read_mostly = { .name = "flow_offload", .ops = &nft_flow_offload_ops, + .policy = nft_flow_offload_policy, .maxattr = NFTA_FLOW_MAX, .owner = THIS_MODULE, }; diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 61b7f93ac681..aba11c2333f3 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -12,6 +12,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_dup_netdev.h> #include <net/neighbour.h> #include <net/ip.h> @@ -63,6 +64,16 @@ nla_put_failure: return -1; } +static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_fwd_netdev *priv = nft_expr_priv(expr); + int oif = ctx->regs[priv->sreg_dev].data.data[0]; + + return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif); +} + struct nft_fwd_neigh { enum nft_registers sreg_dev:8; enum nft_registers sreg_addr:8; @@ -194,6 +205,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .eval = nft_fwd_netdev_eval, .init = nft_fwd_netdev_init, .dump = nft_fwd_netdev_dump, + .offload = nft_fwd_netdev_offload, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index ca2ae4b95a8d..c7f0ef73d939 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c 
@@ -125,17 +125,13 @@ static int nft_immediate_validate(const struct nft_ctx *ctx, return 0; } -static int nft_immediate_offload(struct nft_offload_ctx *ctx, - struct nft_flow_rule *flow, - const struct nft_expr *expr) +static int nft_immediate_offload_verdict(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_immediate_expr *priv) { - const struct nft_immediate_expr *priv = nft_expr_priv(expr); struct flow_action_entry *entry; const struct nft_data *data; - if (priv->dreg != NFT_REG_VERDICT) - return -EOPNOTSUPP; - entry = &flow->rule->action.entries[ctx->num_actions++]; data = &priv->data; @@ -153,6 +149,20 @@ static int nft_immediate_offload(struct nft_offload_ctx *ctx, return 0; } +static int nft_immediate_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (priv->dreg == NFT_REG_VERDICT) + return nft_immediate_offload_verdict(ctx, flow, priv); + + memcpy(&ctx->regs[priv->dreg].data, &priv->data, sizeof(priv->data)); + + return 0; +} + static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index c0560bf3c31b..660bad688e2b 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -73,9 +73,6 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_EVAL) - return -EOPNOTSUPP; - priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]); err = nft_validate_register_load(priv->sreg, set->klen); if (err < 0) diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 39dc94f2491e..bc9fd98c5d6d 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -43,7 +43,7 @@ static int nft_masq_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { 
- u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); + u32 plen = sizeof_field(struct nf_nat_range, min_addr.all); struct nft_masq *priv = nft_expr_priv(expr); int err; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index f69afb9ff3cb..951b6e87ed5d 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -17,6 +17,7 @@ #include <linux/smp.h> #include <linux/static_key.h> #include <net/dst.h> +#include <net/ip.h> #include <net/sock.h> #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ #include <net/netfilter/nf_tables.h> @@ -26,16 +27,295 @@ #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ +#define NFT_META_SECS_PER_MINUTE 60 +#define NFT_META_SECS_PER_HOUR 3600 +#define NFT_META_SECS_PER_DAY 86400 +#define NFT_META_DAYS_PER_WEEK 7 + static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); +static u8 nft_meta_weekday(void) +{ + time64_t secs = ktime_get_real_seconds(); + unsigned int dse; + u8 wday; + + secs -= NFT_META_SECS_PER_MINUTE * sys_tz.tz_minuteswest; + dse = div_u64(secs, NFT_META_SECS_PER_DAY); + wday = (4 + dse) % NFT_META_DAYS_PER_WEEK; + + return wday; +} + +static u32 nft_meta_hour(time64_t secs) +{ + struct tm tm; + + time64_to_tm(secs, 0, &tm); + + return tm.tm_hour * NFT_META_SECS_PER_HOUR + + tm.tm_min * NFT_META_SECS_PER_MINUTE + + tm.tm_sec; +} + +static noinline_for_stack void +nft_meta_get_eval_time(enum nft_meta_keys key, + u32 *dest) +{ + switch (key) { + case NFT_META_TIME_NS: + nft_reg_store64(dest, ktime_get_real_ns()); + break; + case NFT_META_TIME_DAY: + nft_reg_store8(dest, nft_meta_weekday()); + break; + case NFT_META_TIME_HOUR: + *dest = nft_meta_hour(ktime_get_real_seconds()); + break; + default: + break; + } +} + +static noinline bool +nft_meta_get_eval_pkttype_lo(const struct nft_pktinfo *pkt, + u32 *dest) +{ + const struct sk_buff *skb = pkt->skb; + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) + nft_reg_store8(dest, 
PACKET_MULTICAST); + else + nft_reg_store8(dest, PACKET_BROADCAST); + break; + case NFPROTO_IPV6: + nft_reg_store8(dest, PACKET_MULTICAST); + break; + case NFPROTO_NETDEV: + switch (skb->protocol) { + case htons(ETH_P_IP): { + int noff = skb_network_offset(skb); + struct iphdr *iph, _iph; + + iph = skb_header_pointer(skb, noff, + sizeof(_iph), &_iph); + if (!iph) + return false; + + if (ipv4_is_multicast(iph->daddr)) + nft_reg_store8(dest, PACKET_MULTICAST); + else + nft_reg_store8(dest, PACKET_BROADCAST); + + break; + } + case htons(ETH_P_IPV6): + nft_reg_store8(dest, PACKET_MULTICAST); + break; + default: + WARN_ON_ONCE(1); + return false; + } + break; + default: + WARN_ON_ONCE(1); + return false; + } + + return true; +} + +static noinline bool +nft_meta_get_eval_skugid(enum nft_meta_keys key, + u32 *dest, + const struct nft_pktinfo *pkt) +{ + struct sock *sk = skb_to_full_sk(pkt->skb); + struct socket *sock; + + if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) + return false; + + read_lock_bh(&sk->sk_callback_lock); + sock = sk->sk_socket; + if (!sock || !sock->file) { + read_unlock_bh(&sk->sk_callback_lock); + return false; + } + + switch (key) { + case NFT_META_SKUID: + *dest = from_kuid_munged(&init_user_ns, + sock->file->f_cred->fsuid); + break; + case NFT_META_SKGID: + *dest = from_kgid_munged(&init_user_ns, + sock->file->f_cred->fsgid); + break; + default: + break; + } + + read_unlock_bh(&sk->sk_callback_lock); + return true; +} + +#ifdef CONFIG_CGROUP_NET_CLASSID +static noinline bool +nft_meta_get_eval_cgroup(u32 *dest, const struct nft_pktinfo *pkt) +{ + struct sock *sk = skb_to_full_sk(pkt->skb); + + if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) + return false; + + *dest = sock_cgroup_classid(&sk->sk_cgrp_data); + return true; +} +#endif + +static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key, + u32 *dest, + const struct nft_pktinfo *pkt) +{ + const struct net_device *in = nft_in(pkt), *out = 
nft_out(pkt); + + switch (key) { + case NFT_META_IIFKIND: + if (!in || !in->rtnl_link_ops) + return false; + strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); + break; + case NFT_META_OIFKIND: + if (!out || !out->rtnl_link_ops) + return false; + strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); + break; + default: + return false; + } + + return true; +} + +static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev) +{ + *dest = dev ? dev->ifindex : 0; +} + +static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev) +{ + strncpy((char *)dest, dev ? dev->name : "", IFNAMSIZ); +} + +static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev) +{ + if (!dev) + return false; + + nft_reg_store16(dest, dev->type); + return true; +} + +static bool nft_meta_store_ifgroup(u32 *dest, const struct net_device *dev) +{ + if (!dev) + return false; + + *dest = dev->group; + return true; +} + +static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, + const struct nft_pktinfo *pkt) +{ + switch (key) { + case NFT_META_IIFNAME: + nft_meta_store_ifname(dest, nft_in(pkt)); + break; + case NFT_META_OIFNAME: + nft_meta_store_ifname(dest, nft_out(pkt)); + break; + case NFT_META_IIF: + nft_meta_store_ifindex(dest, nft_in(pkt)); + break; + case NFT_META_OIF: + nft_meta_store_ifindex(dest, nft_out(pkt)); + break; + case NFT_META_IIFTYPE: + if (!nft_meta_store_iftype(dest, nft_in(pkt))) + return false; + break; + case NFT_META_OIFTYPE: + if (!nft_meta_store_iftype(dest, nft_out(pkt))) + return false; + break; + case NFT_META_IIFGROUP: + if (!nft_meta_store_ifgroup(dest, nft_out(pkt))) + return false; + break; + case NFT_META_OIFGROUP: + if (!nft_meta_store_ifgroup(dest, nft_out(pkt))) + return false; + break; + default: + return false; + } + + return true; +} + +static noinline u32 nft_prandom_u32(void) +{ + struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); + + return prandom_u32_state(state); +} + +#ifdef 
CONFIG_IP_ROUTE_CLASSID +static noinline bool +nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest) +{ + const struct dst_entry *dst = skb_dst(skb); + + if (!dst) + return false; + + *dest = dst->tclassid; + return true; +} +#endif + +static noinline u32 nft_meta_get_eval_sdif(const struct nft_pktinfo *pkt) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + return inet_sdif(pkt->skb); + case NFPROTO_IPV6: + return inet6_sdif(pkt->skb); + } + + return 0; +} + +static noinline void +nft_meta_get_eval_sdifname(u32 *dest, const struct nft_pktinfo *pkt) +{ + u32 sdif = nft_meta_get_eval_sdif(pkt); + const struct net_device *dev; + + dev = sdif ? dev_get_by_index_rcu(nft_net(pkt), sdif) : NULL; + nft_meta_store_ifname(dest, dev); +} + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; - const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); - struct sock *sk; u32 *dest = &regs->data[priv->dreg]; switch (priv->key) { @@ -60,69 +340,26 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = skb->mark; break; case NFT_META_IIF: - *dest = in ? in->ifindex : 0; - break; case NFT_META_OIF: - *dest = out ? out->ifindex : 0; - break; case NFT_META_IIFNAME: - strncpy((char *)dest, in ? in->name : "", IFNAMSIZ); - break; case NFT_META_OIFNAME: - strncpy((char *)dest, out ? 
out->name : "", IFNAMSIZ); - break; case NFT_META_IIFTYPE: - if (in == NULL) - goto err; - nft_reg_store16(dest, in->type); - break; case NFT_META_OIFTYPE: - if (out == NULL) + case NFT_META_IIFGROUP: + case NFT_META_OIFGROUP: + if (!nft_meta_get_eval_ifname(priv->key, dest, pkt)) goto err; - nft_reg_store16(dest, out->type); break; case NFT_META_SKUID: - sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk) || - !net_eq(nft_net(pkt), sock_net(sk))) - goto err; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket == NULL || - sk->sk_socket->file == NULL) { - read_unlock_bh(&sk->sk_callback_lock); - goto err; - } - - *dest = from_kuid_munged(&init_user_ns, - sk->sk_socket->file->f_cred->fsuid); - read_unlock_bh(&sk->sk_callback_lock); - break; case NFT_META_SKGID: - sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk) || - !net_eq(nft_net(pkt), sock_net(sk))) - goto err; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket == NULL || - sk->sk_socket->file == NULL) { - read_unlock_bh(&sk->sk_callback_lock); + if (!nft_meta_get_eval_skugid(priv->key, dest, pkt)) goto err; - } - *dest = from_kgid_munged(&init_user_ns, - sk->sk_socket->file->f_cred->fsgid); - read_unlock_bh(&sk->sk_callback_lock); break; #ifdef CONFIG_IP_ROUTE_CLASSID - case NFT_META_RTCLASSID: { - const struct dst_entry *dst = skb_dst(skb); - - if (dst == NULL) + case NFT_META_RTCLASSID: + if (!nft_meta_get_eval_rtclassid(skb, dest)) goto err; - *dest = dst->tclassid; break; - } #endif #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: @@ -135,88 +372,41 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; } - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - if (ipv4_is_multicast(ip_hdr(skb)->daddr)) - nft_reg_store8(dest, PACKET_MULTICAST); - else - nft_reg_store8(dest, PACKET_BROADCAST); - break; - case NFPROTO_IPV6: - nft_reg_store8(dest, PACKET_MULTICAST); - break; - case NFPROTO_NETDEV: - switch (skb->protocol) { - case htons(ETH_P_IP): { - int noff = 
skb_network_offset(skb); - struct iphdr *iph, _iph; - - iph = skb_header_pointer(skb, noff, - sizeof(_iph), &_iph); - if (!iph) - goto err; - - if (ipv4_is_multicast(iph->daddr)) - nft_reg_store8(dest, PACKET_MULTICAST); - else - nft_reg_store8(dest, PACKET_BROADCAST); - - break; - } - case htons(ETH_P_IPV6): - nft_reg_store8(dest, PACKET_MULTICAST); - break; - default: - WARN_ON_ONCE(1); - goto err; - } - break; - default: - WARN_ON_ONCE(1); + if (!nft_meta_get_eval_pkttype_lo(pkt, dest)) goto err; - } break; case NFT_META_CPU: *dest = raw_smp_processor_id(); break; - case NFT_META_IIFGROUP: - if (in == NULL) - goto err; - *dest = in->group; - break; - case NFT_META_OIFGROUP: - if (out == NULL) - goto err; - *dest = out->group; - break; #ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: - sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk) || - !net_eq(nft_net(pkt), sock_net(sk))) + if (!nft_meta_get_eval_cgroup(dest, pkt)) goto err; - *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; #endif - case NFT_META_PRANDOM: { - struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); - *dest = prandom_u32_state(state); + case NFT_META_PRANDOM: + *dest = nft_prandom_u32(); break; - } #ifdef CONFIG_XFRM case NFT_META_SECPATH: nft_reg_store8(dest, secpath_exists(skb)); break; #endif case NFT_META_IIFKIND: - if (in == NULL || in->rtnl_link_ops == NULL) - goto err; - strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); - break; case NFT_META_OIFKIND: - if (out == NULL || out->rtnl_link_ops == NULL) + if (!nft_meta_get_eval_kind(priv->key, dest, pkt)) goto err; - strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); + break; + case NFT_META_TIME_NS: + case NFT_META_TIME_DAY: + case NFT_META_TIME_HOUR: + nft_meta_get_eval_time(priv->key, dest); + break; + case NFT_META_SDIF: + *dest = nft_meta_get_eval_sdif(pkt); + break; + case NFT_META_SDIFNAME: + nft_meta_get_eval_sdifname(dest, pkt); break; default: WARN_ON(1); @@ -298,6 +488,7 @@ int 
nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_MARK: case NFT_META_IIF: case NFT_META_OIF: + case NFT_META_SDIF: case NFT_META_SKUID: case NFT_META_SKGID: #ifdef CONFIG_IP_ROUTE_CLASSID @@ -319,6 +510,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_OIFNAME: case NFT_META_IIFKIND: case NFT_META_OIFKIND: + case NFT_META_SDIFNAME: len = IFNAMSIZ; break; case NFT_META_PRANDOM: @@ -330,6 +522,15 @@ int nft_meta_get_init(const struct nft_ctx *ctx, len = sizeof(u8); break; #endif + case NFT_META_TIME_NS: + len = sizeof(u64); + break; + case NFT_META_TIME_DAY: + len = sizeof(u8); + break; + case NFT_META_TIME_HOUR: + len = sizeof(u32); + break; default: return -EOPNOTSUPP; } @@ -340,16 +541,28 @@ int nft_meta_get_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_get_init); -static int nft_meta_get_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) +static int nft_meta_get_validate_sdif(const struct nft_ctx *ctx) { -#ifdef CONFIG_XFRM - const struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; - if (priv->key != NFT_META_SECPATH) - return 0; + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); + break; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} + +static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx) +{ +#ifdef CONFIG_XFRM + unsigned int hooks; switch (ctx->family) { case NFPROTO_NETDEV: @@ -372,6 +585,25 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, #endif } +static int nft_meta_get_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + + switch (priv->key) { + case NFT_META_SECPATH: + return nft_meta_get_validate_xfrm(ctx); + case NFT_META_SDIF: + case NFT_META_SDIFNAME: + return 
nft_meta_get_validate_sdif(ctx); + default: + break; + } + + return 0; +} + int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) @@ -501,6 +733,14 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); break; + case NFT_META_IIF: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, + ingress_ifindex, sizeof(__u32), reg); + break; + case NFT_META_IIFTYPE: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta, + ingress_iftype, sizeof(__u16), reg); + break; default: return -EOPNOTSUPP; } diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index c3c93e95b46e..8b44a4de5329 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -141,10 +141,10 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, switch (family) { case NFPROTO_IPV4: - alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip); + alen = sizeof_field(struct nf_nat_range, min_addr.ip); break; case NFPROTO_IPV6: - alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip6); + alen = sizeof_field(struct nf_nat_range, min_addr.ip6); break; default: return -EAFNOSUPPORT; @@ -171,7 +171,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } } - plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); + plen = sizeof_field(struct nf_nat_range, min_addr.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { priv->sreg_proto_min = nft_parse_register(tb[NFTA_NAT_REG_PROTO_MIN]); diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index f54d6ae15bb1..b42247aa48a9 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -61,6 +61,9 @@ static int nft_osf_init(const struct nft_ctx *ctx, int err; u8 ttl; + if (!tb[NFTA_OSF_DREG]) + return -EINVAL; + if (tb[NFTA_OSF_TTL]) { ttl = nla_get_u8(tb[NFTA_OSF_TTL]); if (ttl > 2) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 
22a80eb60222..1993af3a2979 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -23,50 +23,58 @@ #include <linux/ip.h> #include <linux/ipv6.h> +static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, + struct vlan_ethhdr *veth) +{ + if (skb_copy_bits(skb, mac_off, veth, ETH_HLEN)) + return false; + + veth->h_vlan_proto = skb->vlan_proto; + veth->h_vlan_TCI = htons(skb_vlan_tag_get(skb)); + veth->h_vlan_encapsulated_proto = skb->protocol; + + return true; +} + /* add vlan header into the user buffer for if tag was removed by offloads */ static bool nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) { int mac_off = skb_mac_header(skb) - skb->data; - u8 vlan_len, *vlanh, *dst_u8 = (u8 *) d; + u8 *vlanh, *dst_u8 = (u8 *) d; struct vlan_ethhdr veth; + u8 vlan_hlen = 0; + + if ((skb->protocol == htons(ETH_P_8021AD) || + skb->protocol == htons(ETH_P_8021Q)) && + offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN) + vlan_hlen += VLAN_HLEN; vlanh = (u8 *) &veth; - if (offset < ETH_HLEN) { - u8 ethlen = min_t(u8, len, ETH_HLEN - offset); + if (offset < VLAN_ETH_HLEN + vlan_hlen) { + u8 ethlen = len; - if (skb_copy_bits(skb, mac_off, &veth, ETH_HLEN)) + if (vlan_hlen && + skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0) + return false; + else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) return false; - veth.h_vlan_proto = skb->vlan_proto; + if (offset + len > VLAN_ETH_HLEN + vlan_hlen) + ethlen -= offset + len - VLAN_ETH_HLEN + vlan_hlen; - memcpy(dst_u8, vlanh + offset, ethlen); + memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen); len -= ethlen; if (len == 0) return true; dst_u8 += ethlen; - offset = ETH_HLEN; - } else if (offset >= VLAN_ETH_HLEN) { - offset -= VLAN_HLEN; - goto skip; + offset = ETH_HLEN + vlan_hlen; + } else { + offset -= VLAN_HLEN + vlan_hlen; } - veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); - veth.h_vlan_encapsulated_proto = skb->protocol; - - vlanh 
+= offset; - - vlan_len = min_t(u8, len, VLAN_ETH_HLEN - offset); - memcpy(dst_u8, vlanh, vlan_len); - - len -= vlan_len; - if (!len) - return true; - - dst_u8 += vlan_len; - skip: return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; } @@ -161,13 +169,59 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct ethhdr, h_source): + if (priv->len != ETH_ALEN) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, src, ETH_ALEN, reg); break; case offsetof(struct ethhdr, h_dest): + if (priv->len != ETH_ALEN) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, dst, ETH_ALEN, reg); break; + case offsetof(struct ethhdr, h_proto): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, + n_proto, sizeof(__be16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, + vlan_tci, sizeof(__be16), reg); + break; + case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, + vlan_tpid, sizeof(__be16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, + vlan_tci, sizeof(__be16), reg); + break; + case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) + + sizeof(struct vlan_hdr): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, + vlan_tpid, sizeof(__be16), reg); + break; + default: + return -EOPNOTSUPP; } return 0; @@ -181,14 +235,23 @@ static 
int nft_payload_offload_ip(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct iphdr, saddr): + if (priv->len != sizeof(struct in_addr)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src, sizeof(struct in_addr), reg); break; case offsetof(struct iphdr, daddr): + if (priv->len != sizeof(struct in_addr)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst, sizeof(struct in_addr), reg); break; case offsetof(struct iphdr, protocol): + if (priv->len != sizeof(__u8)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); @@ -208,14 +271,23 @@ static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct ipv6hdr, saddr): + if (priv->len != sizeof(struct in6_addr)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src, sizeof(struct in6_addr), reg); break; case offsetof(struct ipv6hdr, daddr): + if (priv->len != sizeof(struct in6_addr)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst, sizeof(struct in6_addr), reg); break; case offsetof(struct ipv6hdr, nexthdr): + if (priv->len != sizeof(__u8)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); @@ -255,10 +327,16 @@ static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct tcphdr, source): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct tcphdr, dest): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, sizeof(__be16), reg); break; @@ -277,10 +355,16 @@ static int 
nft_payload_offload_udp(struct nft_offload_ctx *ctx, switch (priv->offset) { case offsetof(struct udphdr, source): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct udphdr, dest): + if (priv->len != sizeof(__be16)) + return -EOPNOTSUPP; + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, sizeof(__be16), reg); break; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index c8745d454bf8..4413690591f2 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -13,7 +13,7 @@ #include <net/netfilter/nf_tables.h> struct nft_quota { - u64 quota; + atomic64_t quota; unsigned long flags; atomic64_t consumed; }; @@ -21,7 +21,8 @@ struct nft_quota { static inline bool nft_overquota(struct nft_quota *priv, const struct sk_buff *skb) { - return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota; + return atomic64_add_return(skb->len, &priv->consumed) >= + atomic64_read(&priv->quota); } static inline bool nft_quota_invert(struct nft_quota *priv) @@ -89,7 +90,7 @@ static int nft_quota_do_init(const struct nlattr * const tb[], return -EOPNOTSUPP; } - priv->quota = quota; + atomic64_set(&priv->quota, quota); priv->flags = flags; atomic64_set(&priv->consumed, consumed); @@ -105,10 +106,22 @@ static int nft_quota_obj_init(const struct nft_ctx *ctx, return nft_quota_do_init(tb, priv); } +static void nft_quota_obj_update(struct nft_object *obj, + struct nft_object *newobj) +{ + struct nft_quota *newpriv = nft_obj_data(newobj); + struct nft_quota *priv = nft_obj_data(obj); + u64 newquota; + + newquota = atomic64_read(&newpriv->quota); + atomic64_set(&priv->quota, newquota); + priv->flags = newpriv->flags; +} + static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, bool reset) { - u64 consumed, consumed_cap; + u64 consumed, consumed_cap, quota; u32 flags = priv->flags; /* Since we inconditionally increment consumed 
quota for each packet @@ -116,14 +129,15 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, * userspace. */ consumed = atomic64_read(&priv->consumed); - if (consumed >= priv->quota) { - consumed_cap = priv->quota; + quota = atomic64_read(&priv->quota); + if (consumed >= quota) { + consumed_cap = quota; flags |= NFT_QUOTA_F_DEPLETED; } else { consumed_cap = consumed; } - if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota), + if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(quota), NFTA_QUOTA_PAD) || nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed_cap), NFTA_QUOTA_PAD) || @@ -155,6 +169,7 @@ static const struct nft_object_ops nft_quota_obj_ops = { .init = nft_quota_obj_init, .eval = nft_quota_obj_eval, .dump = nft_quota_obj_dump, + .update = nft_quota_obj_update, }; static struct nft_object_type nft_quota_obj_type __read_mostly = { diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 4701fa8a45e7..89efcc5a533d 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -66,11 +66,21 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr if (err < 0) return err; + if (desc_from.type != NFT_DATA_VALUE) { + err = -EINVAL; + goto err1; + } + err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to), &desc_to, tb[NFTA_RANGE_TO_DATA]); if (err < 0) goto err1; + if (desc_to.type != NFT_DATA_VALUE) { + err = -EINVAL; + goto err2; + } + if (desc_from.len != desc_to.len) { err = -EINVAL; goto err2; diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 43eeb1f609f1..5b779171565c 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -48,7 +48,7 @@ static int nft_redir_init(const struct nft_ctx *ctx, unsigned int plen; int err; - plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); + plen = sizeof_field(struct nf_nat_range, min_addr.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { priv->sreg_proto_min = 
nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MIN]); diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index b5aeccdddb22..87e8d9ba0c9b 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -10,7 +10,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> struct nft_bitmap_elem { struct list_head head; @@ -259,8 +259,8 @@ static u64 nft_bitmap_privsize(const struct nlattr * const nla[], } static int nft_bitmap_init(const struct nft_set *set, - const struct nft_set_desc *desc, - const struct nlattr * const nla[]) + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) { struct nft_bitmap *priv = nft_set_priv(set); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6e8d20c03e3d..d350a7cd3af0 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -16,7 +16,7 @@ #include <linux/rhashtable.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> /* We target a hash table size of 4, element hint is 75% of final size */ #define NFT_RHASH_ELEMENT_HINT 3 @@ -234,6 +234,24 @@ static void nft_rhash_remove(const struct net *net, rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); } +static bool nft_rhash_delete(const struct nft_set *set, + const u32 *key) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_cmp_arg arg = { + .genmask = NFT_GENMASK_ANY, + .set = set, + .key = key, + }; + struct nft_rhash_elem *he; + + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); + if (he == NULL) + return false; + + return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; +} + static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { @@ -627,7 +645,7 @@ 
static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, } static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features, - struct nft_set_estimate *est) + struct nft_set_estimate *est) { if (!desc->size) return false; @@ -662,6 +680,7 @@ struct nft_set_type nft_set_rhash_type __read_mostly = { .remove = nft_rhash_remove, .lookup = nft_rhash_lookup, .update = nft_rhash_update, + .delete = nft_rhash_delete, .walk = nft_rhash_walk, .get = nft_rhash_get, }, diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c new file mode 100644 index 000000000000..f0cb1e13af50 --- /dev/null +++ b/net/netfilter/nft_set_pipapo.c @@ -0,0 +1,2102 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges + * + * Copyright (c) 2019-2020 Red Hat GmbH + * + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +/** + * DOC: Theory of Operation + * + * + * Problem + * ------- + * + * Match packet bytes against entries composed of ranged or non-ranged packet + * field specifiers, mapping them to arbitrary references. For example: + * + * :: + * + * --- fields ---> + * | [net],[port],[net]... => [reference] + * entries [net],[port],[net]... => [reference] + * | [net],[port],[net]... => [reference] + * V ... + * + * where [net] fields can be IP ranges or netmasks, and [port] fields are port + * ranges. Arbitrary packet fields can be matched. + * + * + * Algorithm Overview + * ------------------ + * + * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally + * relies on the consideration that every contiguous range in a space of b bits + * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010], + * as also illustrated in Section 9 of [Kogan 2014]. 
+ * + * Classification against a number of entries, that require matching given bits + * of a packet field, is performed by grouping those bits in sets of arbitrary + * size, and classifying packet bits one group at a time. + * + * Example: + * to match the source port (16 bits) of a packet, we can divide those 16 bits + * in 4 groups of 4 bits each. Given the entry: + * 0000 0001 0101 1001 + * and a packet with source port: + * 0000 0001 1010 1001 + * first and second groups match, but the third doesn't. We conclude that the + * packet doesn't match the given entry. + * + * Translate the set to a sequence of lookup tables, one per field. Each table + * has two dimensions: bit groups to be matched for a single packet field, and + * all the possible values of said groups (buckets). Input entries are + * represented as one or more rules, depending on the number of composing + * netmasks for the given field specifier, and a group match is indicated as a + * set bit, with number corresponding to the rule index, in all the buckets + * whose value matches the entry for a given group. + * + * Rules are mapped between fields through an array of x, n pairs, with each + * item mapping a matched rule to one or more rules. The position of the pair in + * the array indicates the matched rule to be mapped to the next field, x + * indicates the first rule index in the next field, and n the amount of + * next-field rules the current rule maps to. + * + * The mapping array for the last field maps to the desired references. + * + * To match, we perform table lookups using the values of grouped packet bits, + * and use a sequence of bitwise operations to progressively evaluate rule + * matching. 
+ * + * A stand-alone, reference implementation, also including notes about possible + * future optimisations, is available at: + * https://pipapo.lameexcu.se/ + * + * Insertion + * --------- + * + * - For each packet field: + * + * - divide the b packet bits we want to classify into groups of size t, + * obtaining ceil(b / t) groups + * + * Example: match on destination IP address, with t = 4: 32 bits, 8 groups + * of 4 bits each + * + * - allocate a lookup table with one column ("bucket") for each possible + * value of a group, and with one row for each group + * + * Example: 8 groups, 2^4 buckets: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 + * 1 + * 2 + * 3 + * 4 + * 5 + * 6 + * 7 + * + * - map the bits we want to classify for the current field, for a given + * entry, to a single rule for non-ranged and netmask set items, and to one + * or multiple rules for ranges. Ranges are expanded to composing netmasks + * by pipapo_expand(). + * + * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048 + * - rule #0: 10.0.0.5 + * - rule #1: 192.168.1.0/24 + * - rule #2: 192.168.2.0/31 + * + * - insert references to the rules in the lookup table, selecting buckets + * according to bit values of a rule in the given group. This is done by + * pipapo_insert(). 
+ * + * Example: given: + * - rule #0: 10.0.0.5 mapping to buckets + * < 0 10 0 0 0 0 0 5 > + * - rule #1: 192.168.1.0/24 mapping to buckets + * < 12 0 10 8 0 1 < 0..15 > < 0..15 > > + * - rule #2: 192.168.2.0/31 mapping to buckets + * < 12 0 10 8 0 2 0 < 0..1 > > + * + * these bits are set in the lookup table: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * - if this is not the last field in the set, fill a mapping array that maps + * rules from the lookup table to rules belonging to the same entry in + * the next lookup table, done by pipapo_map(). + * + * Note that as rules map to contiguous ranges of rules, given how netmask + * expansion and insertion is performed, &union nft_pipapo_map_bucket stores + * this information as pairs of first rule index, rule count. + * + * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048, + * given lookup table #0 for field 0 (see example above): + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * and lookup table #1 for field 1 with: + * - rule #0: 1024 mapping to buckets + * < 0 0 4 0 > + * - rule #1: 2048 mapping to buckets + * < 0 0 5 0 > + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0,1 + * 1 0,1 + * 2 0 1 + * 3 0,1 + * + * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024 + * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1 + * (rules #1, #2) to 2048 in lookup table #1 (rule #1): + * + * :: + * + * rule indices in current field: 0 1 2 + * map to rules in next field: 0 1 1 + * + * - if this is the last field in the set, fill a mapping array that maps + * rules
from the last lookup table to element pointers, also done by + * pipapo_map(). + * + * Note that, in this implementation, we have two elements (start, end) for + * each entry. The pointer to the end element is stored in this array, and + * the pointer to the start element is linked from it. + * + * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem + * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42. + * From the rules of lookup table #1 as mapped above: + * + * :: + * + * rule indices in last field: 0 1 + * map to elements: 0x66 0x42 + * + * + * Matching + * -------- + * + * We use a result bitmap, with the size of a single lookup table bucket, to + * represent the matching state that applies at every algorithm step. This is + * done by pipapo_lookup(). + * + * - For each packet field: + * + * - start with an all-ones result bitmap (res_map in pipapo_lookup()) + * + * - perform a lookup into the table corresponding to the current field, + * for each group, and at every group, AND the current result bitmap with + * the value from the lookup table bucket + * + * :: + * + * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from + * insertion examples. + * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for + * convenience in this example.
Initial result bitmap is 0xff, the steps + * below show the value of the result bitmap after each group is processed: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6 + * + * 1 1,2 0 + * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6 + * + * 2 0 1,2 + * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6 + * + * 3 0 1,2 + * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6 + * + * 4 0,1,2 + * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6 + * + * 5 0 1 2 + * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2 + * + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2 + * + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2 + * + * - at the next field, start with a new, all-zeroes result bitmap. For each + * bit set in the previous result bitmap, fill the new result bitmap + * (fill_map in pipapo_lookup()) with the rule indices from the + * corresponding buckets of the mapping table for this field, done by + * pipapo_refill() + * + * Example: with mapping table from insertion examples, with the current + * result bitmap from the previous example, 0x02: + * + * :: + * + * rule indices in current field: 0 1 2 + * map to rules in next field: 0 1 1 + * + * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be + * set. + * + * We can now extend this example to cover the second iteration of the step + * above (lookup and AND bitmap): assuming the port field is + * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table + * for "port" field from insertion examples: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0,1 + * 1 0,1 + * 2 0 1 + * 3 0,1 + * + * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5] + * & 0x3 [bucket 0], resulting bitmap is 0x2.
+ * + * - if this is the last field in the set, look up the value from the mapping + * array corresponding to the final result bitmap + * + * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for + * last field from insertion example: + * + * :: + * + * rule indices in last field: 0 1 + * map to elements: 0x66 0x42 + * + * the matching element is at 0x42. + * + * + * References + * ---------- + * + * [Ligatti 2010] + * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with + * Automatic Time-space Tradeoffs + * Jay Ligatti, Josh Kuhn, and Chris Gage. + * Proceedings of the IEEE International Conference on Computer + * Communication Networks (ICCCN), August 2010. + * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf + * + * [Rottenstreich 2010] + * Worst-Case TCAM Rule Expansion + * Ori Rottenstreich and Isaac Keslassy. + * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010. + * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf + * + * [Kogan 2014] + * SAX-PAC (Scalable And eXpressive PAcket Classification) + * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, + * and Patrick Eugster. + * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014.
+ * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/log2.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <uapi/linux/netfilter/nf_tables.h> +#include <net/ipv6.h> /* For the maximum length of a field */ +#include <linux/bitmap.h> +#include <linux/bitops.h> + +/* Count of concatenated fields depends on count of 32-bit nftables registers */ +#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT + +/* Largest supported field size */ +#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr)) +#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE) + +/* Number of bits to be grouped together in lookup table buckets, arbitrary */ +#define NFT_PIPAPO_GROUP_BITS 4 +#define NFT_PIPAPO_GROUPS_PER_BYTE (BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS) + +/* Fields are padded to 32 bits in input registers */ +#define NFT_PIPAPO_GROUPS_PADDED_SIZE(x) \ + (round_up((x) / NFT_PIPAPO_GROUPS_PER_BYTE, sizeof(u32))) +#define NFT_PIPAPO_GROUPS_PADDING(x) \ + (NFT_PIPAPO_GROUPS_PADDED_SIZE((x)) - (x) / NFT_PIPAPO_GROUPS_PER_BYTE) + +/* Number of buckets, given by 2 ^ n, with n grouped bits */ +#define NFT_PIPAPO_BUCKETS (1 << NFT_PIPAPO_GROUP_BITS) + +/* Each n-bit range maps to up to n * 2 rules */ +#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2)) + +/* Use the rest of mapping table buckets for rule indices, but it makes no sense + * to exceed 32 bits + */ +#if BITS_PER_LONG == 64 +#define NFT_PIPAPO_MAP_TOBITS 32 +#else +#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS) +#endif + +/* ...which gives us the highest allowed index for a rule */ +#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \ + - (1UL << NFT_PIPAPO_MAP_NBITS)) + +#define nft_pipapo_for_each_field(field, index, match) \ + for 
((field) = (match)->f, (index) = 0; \ + (index) < (match)->field_count; \ + (index)++, (field)++) + +/** + * union nft_pipapo_map_bucket - Bucket of mapping table + * @to: First rule number (in next field) this rule maps to + * @n: Number of rules (in next field) this rule maps to + * @e: If there's no next field, pointer to element this rule maps to + */ +union nft_pipapo_map_bucket { + struct { +#if BITS_PER_LONG == 64 + static_assert(NFT_PIPAPO_MAP_TOBITS <= 32); + u32 to; + + static_assert(NFT_PIPAPO_MAP_NBITS <= 32); + u32 n; +#else + unsigned long to:NFT_PIPAPO_MAP_TOBITS; + unsigned long n:NFT_PIPAPO_MAP_NBITS; +#endif + }; + struct nft_pipapo_elem *e; +}; + +/** + * struct nft_pipapo_field - Lookup, mapping tables and related data for a field + * @groups: Amount of 4-bit groups + * @rules: Number of inserted rules + * @bsize: Size of each bucket in lookup table, in longs + * @lt: Lookup table: 'groups' rows of NFT_PIPAPO_BUCKETS buckets + * @mt: Mapping table: one bucket per rule + */ +struct nft_pipapo_field { + int groups; + unsigned long rules; + size_t bsize; + unsigned long *lt; + union nft_pipapo_map_bucket *mt; +}; + +/** + * struct nft_pipapo_match - Data used for lookup and matching + * @field_count Amount of fields in set + * @scratch: Preallocated per-CPU maps for partial matching results + * @bsize_max: Maximum lookup table bucket size of all fields, in longs + * @rcu Matching data is swapped on commits + * @f: Fields, with lookup and mapping tables + */ +struct nft_pipapo_match { + int field_count; + unsigned long * __percpu *scratch; + size_t bsize_max; + struct rcu_head rcu; + struct nft_pipapo_field f[0]; +}; + +/* Current working bitmap index, toggled between field matches */ +static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); + +/** + * struct nft_pipapo - Representation of a set + * @match: Currently in-use matching data + * @clone: Copy where pending insertions and deletions are kept + * @groups: Total amount of 4-bit groups for 
fields in this set + * @width: Total bytes to be matched for one packet, including padding + * @dirty: Working copy has pending insertions or deletions + * @last_gc: Timestamp of last garbage collection run, jiffies + */ +struct nft_pipapo { + struct nft_pipapo_match __rcu *match; + struct nft_pipapo_match *clone; + int groups; + int width; + bool dirty; + unsigned long last_gc; +}; + +struct nft_pipapo_elem; + +/** + * struct nft_pipapo_elem - API-facing representation of single set element + * @ext: nftables API extensions + */ +struct nft_pipapo_elem { + struct nft_set_ext ext; +}; + +/** + * pipapo_refill() - For each set bit, set bits from selected mapping table item + * @map: Bitmap to be scanned for set bits + * @len: Length of bitmap in longs + * @rules: Number of rules in field + * @dst: Destination bitmap + * @mt: Mapping table containing bit set specifiers + * @match_only: Find a single bit and return, don't fill + * + * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain. + * + * For each bit set in map, select the bucket from mapping table with index + * corresponding to the position of the bit set. Use start bit and amount of + * bits specified in bucket to fill region in dst. + * + * Return: -1 on no match, bit position on 'match_only', 0 otherwise. 
+ */ +static int pipapo_refill(unsigned long *map, int len, int rules, + unsigned long *dst, union nft_pipapo_map_bucket *mt, + bool match_only) +{ + unsigned long bitset; + int k, ret = -1; + + for (k = 0; k < len; k++) { + bitset = map[k]; + while (bitset) { + unsigned long t = bitset & -bitset; + int r = __builtin_ctzl(bitset); + int i = k * BITS_PER_LONG + r; + + if (unlikely(i >= rules)) { + map[k] = 0; + return -1; + } + + if (unlikely(match_only)) { + bitmap_clear(map, i, 1); + return i; + } + + ret = 0; + + bitmap_set(dst, mt[i].to, mt[i].n); + + bitset ^= t; + } + map[k] = 0; + } + + return ret; +} + +/** + * nft_pipapo_lookup() - Lookup function + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @ext: nftables API extension pointer, filled with matching reference + * + * For more details, see DOC: Theory of Operation. + * + * Return: true on match, false otherwise. + */ +static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + struct nft_pipapo *priv = nft_set_priv(set); + unsigned long *res_map, *fill_map; + u8 genmask = nft_genmask_cur(net); + const u8 *rp = (const u8 *)key; + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + bool map_index; + int i; + + local_bh_disable(); + + map_index = raw_cpu_read(nft_pipapo_scratch_index); + + m = rcu_dereference(priv->match); + + if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) + goto out; + + res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0); + fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 
0 : m->bsize_max); + + memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + + nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + unsigned long *lt = f->lt; + int b, group; + + /* For each 4-bit group: select lookup table bucket depending on + * packet bytes value, then AND bucket value + */ + for (group = 0; group < f->groups; group += 2) { + u8 v; + + v = *rp >> 4; + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS; + + v = *rp & 0x0f; + rp++; + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS; + } + + /* Now populate the bitmap for the next field, unless this is + * the last field, in which case return the matched 'ext' + * pointer if any. + * + * Now res_map contains the matching bitmap, and fill_map is the + * bitmap for the next field. + */ +next_match: + b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, + last); + if (b < 0) { + raw_cpu_write(nft_pipapo_scratch_index, map_index); + local_bh_enable(); + + return false; + } + + if (last) { + *ext = &f->mt[b].e->ext; + if (unlikely(nft_set_elem_expired(*ext) || + !nft_set_elem_active(*ext, genmask))) + goto next_match; + + /* Last field: we're just returning the key without + * filling the initial bitmap for the next field, so the + * current inactive bitmap is clean and can be reused as + * *next* bitmap (not initial) for the next packet. + */ + raw_cpu_write(nft_pipapo_scratch_index, map_index); + local_bh_enable(); + + return true; + } + + /* Swap bitmap indices: res_map is the initial bitmap for the + * next field, and fill_map is guaranteed to be all-zeroes at + * this point. 
+ */ + map_index = !map_index; + swap(res_map, fill_map); + + rp += NFT_PIPAPO_GROUPS_PADDING(f->groups); + } + +out: + local_bh_enable(); + return false; +} + +/** + * pipapo_get() - Get matching element reference given key data + * @net: Network namespace + * @set: nftables API set representation + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * + * This is essentially the same as the lookup function, except that it matches + * key data against the uncommitted copy and doesn't use preallocated maps for + * bitmap results. + * + * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. + */ +static struct nft_pipapo_elem *pipapo_get(const struct net *net, + const struct nft_set *set, + const u8 *data, u8 genmask) +{ + struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + unsigned long *res_map, *fill_map = NULL; + struct nft_pipapo_field *f; + int i; + + res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); + if (!res_map) { + ret = ERR_PTR(-ENOMEM); + goto out; + } + + fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); + if (!fill_map) { + ret = ERR_PTR(-ENOMEM); + goto out; + } + + memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + + nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + unsigned long *lt = f->lt; + int b, group; + + /* For each 4-bit group: select lookup table bucket depending on + * packet bytes value, then AND bucket value + */ + for (group = 0; group < f->groups; group++) { + u8 v; + + if (group % 2) { + v = *data & 0x0f; + data++; + } else { + v = *data >> 4; + } + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + + lt += f->bsize * NFT_PIPAPO_BUCKETS; + } + + /* Now populate the bitmap for the next field, unless this is + * the last field, in which case return the 
matched 'ext' + * pointer if any. + * + * Now res_map contains the matching bitmap, and fill_map is the + * bitmap for the next field. + */ +next_match: + b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, + last); + if (b < 0) + goto out; + + if (last) { + if (nft_set_elem_expired(&f->mt[b].e->ext) || + (genmask && + !nft_set_elem_active(&f->mt[b].e->ext, genmask))) + goto next_match; + + ret = f->mt[b].e; + goto out; + } + + data += NFT_PIPAPO_GROUPS_PADDING(f->groups); + + /* Swap bitmap indices: fill_map will be the initial bitmap for + * the next field (i.e. the new res_map), and res_map is + * guaranteed to be all-zeroes at this point, ready to be filled + * according to the next mapping table. + */ + swap(res_map, fill_map); + } + +out: + kfree(fill_map); + kfree(res_map); + return ret; +} + +/** + * nft_pipapo_get() - Get matching element reference given key data + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @flags: Unused + */ +void *nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + return pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); +} + +/** + * pipapo_resize() - Resize lookup or mapping table, or both + * @f: Field containing lookup and mapping tables + * @old_rules: Previous amount of rules in field + * @rules: New amount of rules + * + * Increase, decrease or maintain tables size depending on new amount of rules, + * and copy data over. In case the new size is smaller, throw away data for + * highest-numbered rules. + * + * Return: 0 on success, -ENOMEM on allocation failure. 
+ */ +static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) +{ + long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; + union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt; + size_t new_bucket_size, copy; + int group, bucket; + + new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); + + if (new_bucket_size == f->bsize) + goto mt; + + if (new_bucket_size > f->bsize) + copy = f->bsize; + else + copy = new_bucket_size; + + new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS * new_bucket_size * + sizeof(*new_lt), GFP_KERNEL); + if (!new_lt) + return -ENOMEM; + + new_p = new_lt; + old_p = old_lt; + for (group = 0; group < f->groups; group++) { + for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS; bucket++) { + memcpy(new_p, old_p, copy * sizeof(*new_p)); + new_p += copy; + old_p += copy; + + if (new_bucket_size > f->bsize) + new_p += new_bucket_size - f->bsize; + else + old_p += f->bsize - new_bucket_size; + } + } + +mt: + new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL); + if (!new_mt) { + kvfree(new_lt); + return -ENOMEM; + } + + memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt)); + if (rules > old_rules) { + memset(new_mt + old_rules, 0, + (rules - old_rules) * sizeof(*new_mt)); + } + + if (new_lt) { + f->bsize = new_bucket_size; + f->lt = new_lt; + kvfree(old_lt); + } + + f->mt = new_mt; + kvfree(old_mt); + + return 0; +} + +/** + * pipapo_bucket_set() - Set rule bit in bucket given group and group value + * @f: Field containing lookup table + * @rule: Rule index + * @group: Group index + * @v: Value of bit group + */ +static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, + int v) +{ + unsigned long *pos; + + pos = f->lt + f->bsize * NFT_PIPAPO_BUCKETS * group; + pos += f->bsize * v; + + __set_bit(rule, pos); +} + +/** + * pipapo_insert() - Insert new rule in field given input key and mask length + * @f: Field containing lookup table + * @k: Input key for classification, without nftables padding + * 
@mask_bits: Length of mask; matches field length for non-ranged entry
+ *
+ * Insert a new rule reference in lookup buckets corresponding to k and
+ * mask_bits. The rule count of the field is bumped only after the tables were
+ * resized successfully, so a failed insertion leaves the field unchanged.
+ *
+ * Return: 1 on success (one rule inserted), negative error code on failure.
+ */
+static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
+			 int mask_bits)
+{
+	int rule = f->rules, group, ret;
+
+	/* Make room for the new rule first: if this fails, f->rules must still
+	 * match the size of lookup and mapping tables.
+	 */
+	ret = pipapo_resize(f, f->rules, f->rules + 1);
+	if (ret)
+		return ret;
+
+	f->rules++;
+
+	for (group = 0; group < f->groups; group++) {
+		int i, v;
+		u8 mask;
+
+		/* Two 4-bit groups per key byte: even groups take the high
+		 * nibble, odd groups the low one.
+		 */
+		if (group % 2)
+			v = k[group / 2] & 0x0f;
+		else
+			v = k[group / 2] >> 4;
+
+		if (mask_bits >= (group + 1) * 4) {
+			/* Not masked */
+			pipapo_bucket_set(f, rule, group, v);
+		} else if (mask_bits <= group * 4) {
+			/* Completely masked */
+			for (i = 0; i < NFT_PIPAPO_BUCKETS; i++)
+				pipapo_bucket_set(f, rule, group, i);
+		} else {
+			/* The mask limit falls on this group */
+			mask = 0x0f >> (mask_bits - group * 4);
+			for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) {
+				if ((i & ~mask) == (v & ~mask))
+					pipapo_bucket_set(f, rule, group, i);
+			}
+		}
+	}
+
+	return 1;
+}
+
+/**
+ * pipapo_step_diff() - Check if setting @step bit in netmask would change it
+ * @base:	Mask we are expanding
+ * @step:	Step bit for given expansion step
+ * @len:	Total length of mask space (set and unset bits), bytes
+ *
+ * Convenience function for mask expansion.
+ *
+ * Return: true if step bit changes mask (i.e. isn't set), false otherwise.
+ */ +static bool pipapo_step_diff(u8 *base, int step, int len) +{ + /* Network order, byte-addressed */ +#ifdef __BIG_ENDIAN__ + return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]); +#else + return !(BIT(step % BITS_PER_BYTE) & + base[len - 1 - step / BITS_PER_BYTE]); +#endif +} + +/** + * pipapo_step_after_end() - Check if mask exceeds range end with given step + * @base: Mask we are expanding + * @end: End of range + * @step: Step bit for given expansion step, highest bit to be set + * @len: Total length of mask space (set and unset bits), bytes + * + * Convenience function for mask expansion. + * + * Return: true if mask exceeds range setting step bits, false otherwise. + */ +static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step, + int len) +{ + u8 tmp[NFT_PIPAPO_MAX_BYTES]; + int i; + + memcpy(tmp, base, len); + + /* Network order, byte-addressed */ + for (i = 0; i <= step; i++) +#ifdef __BIG_ENDIAN__ + tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); +#else + tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); +#endif + + return memcmp(tmp, end, len) > 0; +} + +/** + * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry + * @base: Netmask base + * @step: Step bit to sum + * @len: Netmask length, bytes + */ +static void pipapo_base_sum(u8 *base, int step, int len) +{ + bool carry = false; + int i; + + /* Network order, byte-addressed */ +#ifdef __BIG_ENDIAN__ + for (i = step / BITS_PER_BYTE; i < len; i++) { +#else + for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) { +#endif + if (carry) + base[i]++; + else + base[i] += 1 << (step % BITS_PER_BYTE); + + if (base[i]) + break; + + carry = true; + } +} + +/** + * pipapo_expand() - Expand to composing netmasks, insert into lookup table + * @f: Field containing lookup table + * @start: Start of range + * @end: End of range + * @len: Length of value in bits + * + * Expand range to composing netmasks and insert corresponding rule references + * in 
lookup buckets.
+ *
+ * Return: number of inserted rules on success, negative error code on failure.
+ */
+static int pipapo_expand(struct nft_pipapo_field *f,
+			 const u8 *start, const u8 *end, int len)
+{
+	int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE);
+	u8 base[NFT_PIPAPO_MAX_BYTES];
+
+	memcpy(base, start, bytes);
+	while (memcmp(base, end, bytes) <= 0) {
+		int err;
+
+		/* Find the biggest netmask starting from base that doesn't
+		 * exceed the end of the range.
+		 */
+		step = 0;
+		while (pipapo_step_diff(base, step, bytes)) {
+			if (pipapo_step_after_end(base, end, step, bytes))
+				break;
+
+			step++;
+			if (step >= len) {
+				/* A single rule with an empty mask covers the
+				 * whole field. Propagate insertion failures:
+				 * otherwise the caller would map a rule that
+				 * doesn't exist.
+				 */
+				if (!masks) {
+					err = pipapo_insert(f, base, 0);
+					if (err < 0)
+						return err;
+					masks = 1;
+				}
+				goto out;
+			}
+		}
+
+		err = pipapo_insert(f, base, len - step);
+
+		if (err < 0)
+			return err;
+
+		masks++;
+		pipapo_base_sum(base, step, bytes);
+	}
+out:
+	return masks;
+}
+
+/**
+ * pipapo_map() - Insert rules in mapping tables, mapping them between fields
+ * @m:		Matching data, including mapping table
+ * @map:	Table of rule maps: array of first rule and amount of rules
+ *		in next field a given rule maps to, for each field
+ * @e:		For last field, nft_pipapo_elem pointer matching rules map to
+ */
+static void pipapo_map(struct nft_pipapo_match *m,
+		       union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS],
+		       struct nft_pipapo_elem *e)
+{
+	struct nft_pipapo_field *f;
+	int i, j;
+
+	/* All fields but the last one: each rule of this entry points to the
+	 * set of rules the same entry expanded to in the next field.
+	 */
+	for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) {
+		for (j = 0; j < map[i].n; j++) {
+			f->mt[map[i].to + j].to = map[i + 1].to;
+			f->mt[map[i].to + j].n = map[i + 1].n;
+		}
+	}
+
+	/* Last field: map to element instead of mapping to next field */
+	for (j = 0; j < map[i].n; j++)
+		f->mt[map[i].to + j].e = e;
+}
+
+/**
+ * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results
+ * @clone:	Copy of matching data with pending insertions and deletions
+ * @bsize_max:	Maximum bucket size, scratch maps cover two buckets
+ *
+ * Return: 0 on success, -ENOMEM on failure.
+ */
+static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
+				  unsigned long bsize_max)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		unsigned long *scratch;
+
+		scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2,
+				       GFP_KERNEL, cpu_to_node(i));
+		if (!scratch) {
+			/* On failure, there's no need to undo previous
+			 * allocations: this means that some scratch maps have
+			 * a bigger allocated size now (this is only called on
+			 * insertion), but the extra space won't be used by any
+			 * CPU as new elements are not inserted and m->bsize_max
+			 * is not updated.
+			 */
+			return -ENOMEM;
+		}
+
+		kfree(*per_cpu_ptr(clone->scratch, i));
+
+		*per_cpu_ptr(clone->scratch, i) = scratch;
+	}
+
+	return 0;
+}
+
+/**
+ * nft_pipapo_insert() - Validate and insert ranged elements
+ * @net:	Network namespace
+ * @set:	nftables API set representation
+ * @elem:	nftables API element representation containing key data
+ * @ext2:	Filled with pointer to &struct nft_set_ext in inserted element,
+ *		or in the already existing element if -EEXIST is returned
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
+			     const struct nft_set_elem *elem,
+			     struct nft_set_ext **ext2)
+{
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
+	const u8 *start = (const u8 *)elem->key.val.data, *end;
+	struct nft_pipapo_elem *e = elem->priv, *dup;
+	struct nft_pipapo *priv = nft_set_priv(set);
+	struct nft_pipapo_match *m = priv->clone;
+	u8 genmask = nft_genmask_next(net);
+	const u8 *start_p, *end_p;
+	struct nft_pipapo_field *f;
+	int i, bsize_max, err = 0;
+
+	/* Look for an existing entry matching the start of the range first,
+	 * then its end, if any. Without an end key, the entry is a single
+	 * point: end == start.
+	 */
+	dup = pipapo_get(net, set, start, genmask);
+	if (PTR_ERR(dup) == -ENOENT) {
+		if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) {
+			end = (const u8 *)nft_set_ext_key_end(ext)->data;
+			dup = pipapo_get(net, set, end, nft_genmask_next(net));
+		} else {
+			end = start;
+		}
+	}
+
+	if (PTR_ERR(dup) != -ENOENT) {
+		if (IS_ERR(dup))
+			return PTR_ERR(dup);
+		*ext2 = &dup->ext;
+		return -EEXIST;
+	}
+
+	/* Validate: the cursors need to live across iterations, so that each
+	 * field is checked against its own slice of the key, not always
+	 * against the first one.
+	 */
+	start_p = start;
+	end_p = end;
+	nft_pipapo_for_each_field(f, i, m) {
+		if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX)
+			return -ENOSPC;
+
+		if (memcmp(start_p, end_p,
+			   f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) > 0)
+			return -EINVAL;
+
+		start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+		end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+	}
+
+	/* Insert */
+	priv->dirty = true;
+
+	bsize_max = m->bsize_max;
+
+	nft_pipapo_for_each_field(f, i, m) {
+		int ret;
+
+		rulemap[i].to = f->rules;
+
+		ret = memcmp(start, end,
+			     f->groups / NFT_PIPAPO_GROUPS_PER_BYTE);
+		if (!ret) {
+			ret = pipapo_insert(f, start,
+					    f->groups * NFT_PIPAPO_GROUP_BITS);
+		} else {
+			ret = pipapo_expand(f, start, end,
+					    f->groups * NFT_PIPAPO_GROUP_BITS);
+		}
+
+		/* A negative value is an error code, not an amount of rules:
+		 * it must not end up in the rule map.
+		 */
+		if (ret < 0)
+			return ret;
+
+		if (f->bsize > bsize_max)
+			bsize_max = f->bsize;
+
+		rulemap[i].n = ret;
+
+		start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+		end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+	}
+
+	/* Scratch maps need to cover the biggest bucket of any field */
+	if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) {
+		err = pipapo_realloc_scratch(m, bsize_max);
+		if (err)
+			return err;
+
+		this_cpu_write(nft_pipapo_scratch_index, false);
+
+		m->bsize_max = bsize_max;
+	}
+
+	*ext2 = &e->ext;
+
+	pipapo_map(m, rulemap, e);
+
+	return 0;
+}
+
+/**
+ * pipapo_clone() - Clone matching data to create new working copy
+ * @old:	Existing matching data
+ *
+ * Lookup and mapping tables of each field are duplicated, per-CPU scratch
+ * pointers are allocated anew.
+ *
+ * Return: copy of matching data passed as 'old', error pointer on failure
+ */
+static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
+{
+	struct nft_pipapo_field *dst, *src;
+	struct nft_pipapo_match *new;
+	int i;
+
+	new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count,
+		      GFP_KERNEL);
+	if (!new)
+		return ERR_PTR(-ENOMEM);
+
+	new->field_count = old->field_count;
+	new->bsize_max = old->bsize_max;
+
+	new->scratch = alloc_percpu(*new->scratch);
+	if (!new->scratch)
+		goto out_scratch;
+
+	rcu_head_init(&new->rcu);
+
+	src = old->f;
+	dst = new->f;
+
+	for (i = 0; i < old->field_count; i++) {
+		/* Copy scalar members only, table pointers are set below */
+		memcpy(dst, src, offsetof(struct nft_pipapo_field, lt));
+
+		dst->lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS *
+				   src->bsize * sizeof(*dst->lt),
+				   GFP_KERNEL);
+		if (!dst->lt)
+			goto out_lt;
+
+		memcpy(dst->lt, src->lt,
+		       src->bsize * sizeof(*dst->lt) *
+		       src->groups * NFT_PIPAPO_BUCKETS);
+
+		dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL);
+		if (!dst->mt)
+			goto out_mt;
+
+		memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
+		src++;
+		dst++;
+	}
+
+	return new;
+
+out_mt:
+	kvfree(dst->lt);
+out_lt:
+	/* Free tables of the fields that were completely cloned so far */
+	for (dst--; i > 0; i--) {
+		kvfree(dst->mt);
+		kvfree(dst->lt);
+		dst--;
+	}
+	free_percpu(new->scratch);
+out_scratch:
+	kfree(new);
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * pipapo_rules_same_key() - Get number of rules originated from the same entry
+ * @f:		Field containing mapping table
+ * @first:	Index of first rule in set of rules mapping to same entry
+ *
+ * Using the fact that all rules in a field that originated from the same entry
+ * will map to the same set
of rules in the next field, or to the same element + * reference, return the cardinality of the set of rules that originated from + * the same entry as the rule with index @first, @first rule included. + * + * In pictures: + * rules + * field #0 0 1 2 3 4 + * map to: 0 1 2-4 2-4 5-9 + * . . ....... . ... + * | | | | \ \ + * | | | | \ \ + * | | | | \ \ + * ' ' ' ' ' \ + * in field #1 0 1 2 3 4 5 ... + * + * if this is called for rule 2 on field #0, it will return 3, as also rules 2 + * and 3 in field 0 map to the same set of rules (2, 3, 4) in the next field. + * + * For the last field in a set, we can rely on associated entries to map to the + * same element references. + * + * Return: Number of rules that originated from the same entry as @first. + */ +static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) +{ + struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ + int r; + + for (r = first; r < f->rules; r++) { + if (r != first && e != f->mt[r].e) + return r - first; + + e = f->mt[r].e; + } + + if (r != first) + return r - first; + + return 0; +} + +/** + * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones + * @mt: Mapping array + * @rules: Original amount of rules in mapping table + * @start: First rule index to be removed + * @n: Amount of rules to be removed + * @to_offset: First rule index, in next field, this group of rules maps to + * @is_last: If this is the last field, delete reference from mapping array + * + * This is used to unmap rules from the mapping table for a single field, + * maintaining consistency and compactness for the existing ones. + * + * In pictures: let's assume that we want to delete rules 2 and 3 from the + * following mapping array: + * + * rules + * 0 1 2 3 4 + * map to: 4-10 4-10 11-15 11-15 16-18 + * + * the result will be: + * + * rules + * 0 1 2 + * map to: 4-10 4-10 11-13 + * + * for fields before the last one. 
In case this is the mapping table for the + * last field in a set, and rules map to pointers to &struct nft_pipapo_elem: + * + * rules + * 0 1 2 3 4 + * element pointers: 0x42 0x42 0x33 0x33 0x44 + * + * the result will be: + * + * rules + * 0 1 2 + * element pointers: 0x42 0x42 0x44 + */ +static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, + int start, int n, int to_offset, bool is_last) +{ + int i; + + memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt)); + memset(mt + rules - n, 0, n * sizeof(*mt)); + + if (is_last) + return; + + for (i = start; i < rules - n; i++) + mt[i].to -= to_offset; +} + +/** + * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map + * @m: Matching data + * @rulemap Table of rule maps, arrays of first rule and amount of rules + * in next field a given entry maps to, for each field + * + * For each rule in lookup table buckets mapping to this set of rules, drop + * all bits set in lookup table mapping. In pictures, assuming we want to drop + * rules 0 and 1 from this lookup table: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * rule 2 becomes rule 0, and the result will be: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 + * 1 0 + * 2 0 + * 3 0 + * 4 0 + * 5 0 + * 6 0 + * 7 0 0 + * + * once this is done, call unmap() to drop all the corresponding rule references + * from mapping tables. 
+ */
+static void pipapo_drop(struct nft_pipapo_match *m,
+			union nft_pipapo_map_bucket rulemap[])
+{
+	struct nft_pipapo_field *f;
+	int i;
+
+	nft_pipapo_for_each_field(f, i, m) {
+		bool last = i == m->field_count - 1;
+		int g;
+
+		/* Cut bits of dropped rules out of every bucket in every group */
+		for (g = 0; g < f->groups; g++) {
+			unsigned long *pos;
+			int b;
+
+			pos = f->lt + g * NFT_PIPAPO_BUCKETS * f->bsize;
+
+			for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) {
+				bitmap_cut(pos, pos, rulemap[i].to,
+					   rulemap[i].n,
+					   f->bsize * BITS_PER_LONG);
+
+				pos += f->bsize;
+			}
+		}
+
+		/* The last field has no next field: don't read the rule map
+		 * entry past it, callers only fill one entry per field.
+		 * pipapo_unmap() ignores the offset in that case.
+		 */
+		pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n,
+			     last ? 0 : rulemap[i + 1].n, last);
+		if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) {
+			/* We can ignore this, a failure to shrink tables down
+			 * doesn't make tables invalid.
+			 */
+			;
+		}
+		f->rules -= rulemap[i].n;
+	}
+}
+
+/**
+ * pipapo_gc() - Drop expired entries from set, destroy start and end elements
+ * @set:	nftables API set representation
+ * @m:		Matching data
+ */
+static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
+{
+	struct nft_pipapo *priv = nft_set_priv(set);
+	int rules_f0, first_rule = 0;
+
+	while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
+		union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
+		struct nft_pipapo_field *f;
+		struct nft_pipapo_elem *e;
+		int i, start, rules_fx;
+
+		start = first_rule;
+		rules_fx = rules_f0;
+
+		/* Follow mappings from the first field to build the rule map
+		 * of this entry.
+		 */
+		nft_pipapo_for_each_field(f, i, m) {
+			rulemap[i].to = start;
+			rulemap[i].n = rules_fx;
+
+			if (i < m->field_count - 1) {
+				rules_fx = f->mt[start].n;
+				start = f->mt[start].to;
+			}
+		}
+
+		/* Pick the last field, and its last index */
+		f--;
+		i--;
+		e = f->mt[rulemap[i].to].e;
+		if (nft_set_elem_expired(&e->ext) &&
+		    !nft_set_elem_mark_busy(&e->ext)) {
+			priv->dirty = true;
+			pipapo_drop(m, rulemap);
+
+			rcu_barrier();
+			nft_set_elem_destroy(set, e, true);
+
+			/* And check again current first rule, which is now the
+			 * first we haven't checked.
+			 */
+		} else {
+			first_rule += rules_f0;
+		}
+	}
+
+	priv->last_gc = jiffies;
+}
+
+/**
+ * pipapo_free_fields() - Free per-field tables contained in matching data
+ * @m:		Matching data
+ */
+static void pipapo_free_fields(struct nft_pipapo_match *m)
+{
+	struct nft_pipapo_field *f;
+	int i;
+
+	nft_pipapo_for_each_field(f, i, m) {
+		kvfree(f->lt);
+		kvfree(f->mt);
+	}
+}
+
+/**
+ * pipapo_reclaim_match - RCU callback to free fields from old matching data
+ * @rcu:	RCU head
+ */
+static void pipapo_reclaim_match(struct rcu_head *rcu)
+{
+	struct nft_pipapo_match *m;
+	int i;
+
+	m = container_of(rcu, struct nft_pipapo_match, rcu);
+
+	for_each_possible_cpu(i)
+		kfree(*per_cpu_ptr(m->scratch, i));
+
+	free_percpu(m->scratch);
+
+	pipapo_free_fields(m);
+
+	kfree(m);
+}
+
+/**
+ * pipapo_commit() - Replace lookup data with current working copy
+ * @set:	nftables API set representation
+ *
+ * While at it, check if we should perform garbage collection on the working
+ * copy before committing it for lookup, and don't replace the table if the
+ * working copy doesn't have pending changes.
+ *
+ * We also need to create a new working copy for subsequent insertions and
+ * deletions.
+ */ +static void pipapo_commit(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *new_clone, *old; + + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + pipapo_gc(set, priv->clone); + + if (!priv->dirty) + return; + + new_clone = pipapo_clone(priv->clone); + if (IS_ERR(new_clone)) + return; + + priv->dirty = false; + + old = rcu_access_pointer(priv->match); + rcu_assign_pointer(priv->match, priv->clone); + if (old) + call_rcu(&old->rcu, pipapo_reclaim_match); + + priv->clone = new_clone; +} + +/** + * nft_pipapo_activate() - Mark element reference as active given key, commit + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * On insertion, elements are added to a copy of the matching data currently + * in use for lookups, and not directly inserted into current lookup data, so + * we'll take care of that by calling pipapo_commit() here. Both + * nft_pipapo_insert() and nft_pipapo_activate() are called once for each + * element, hence we can't purpose either one as a real commit operation. + */ +static void nft_pipapo_activate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); + if (IS_ERR(e)) + return; + + nft_set_elem_change_active(net, set, &e->ext); + nft_set_elem_clear_busy(&e->ext); + + pipapo_commit(set); +} + +/** + * pipapo_deactivate() - Check that element is in set, mark as inactive + * @net: Network namespace + * @set: nftables API set representation + * @data: Input key data + * @ext: nftables API extension pointer, used to check for end element + * + * This is a convenience function that can be called from both + * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same + * operation. 
+ * + * Return: deactivated element if found, NULL otherwise. + */ +static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, + const u8 *data, const struct nft_set_ext *ext) +{ + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, data, nft_genmask_next(net)); + if (IS_ERR(e)) + return NULL; + + nft_set_elem_change_active(net, set, &e->ext); + + return e; +} + +/** + * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * Return: deactivated element if found, NULL otherwise. + */ +static void *nft_pipapo_deactivate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext); +} + +/** + * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * This is functionally the same as nft_pipapo_deactivate(), with a slightly + * different interface, and it's also called once for each element in a set + * being flushed, so we can't implement, strictly speaking, a flush operation, + * which would otherwise be as simple as allocating an empty copy of the + * matching data. + * + * Note that we could in theory do that, mark the set as flushed, and ignore + * subsequent calls, but we would leak all the elements after the first one, + * because they wouldn't then be freed as result of API calls. + * + * Return: true if element was found and deactivated. 
+ */ +static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, + void *elem) +{ + struct nft_pipapo_elem *e = elem; + + return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext), + &e->ext); +} + +/** + * pipapo_get_boundaries() - Get byte interval for associated rules + * @f: Field including lookup table + * @first_rule: First rule (lowest index) + * @rule_count: Number of associated rules + * @left: Byte expression for left boundary (start of range) + * @right: Byte expression for right boundary (end of range) + * + * Given the first rule and amount of rules that originated from the same entry, + * build the original range associated with the entry, and calculate the length + * of the originating netmask. + * + * In pictures: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 1,2 + * 1 1,2 + * 2 1,2 + * 3 1,2 + * 4 1,2 + * 5 1 2 + * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * + * this is the lookup table corresponding to the IPv4 range + * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks, + * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31. + * + * This function fills @left and @right with the byte values of the leftmost + * and rightmost bucket indices for the lowest and highest rule indices, + * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in + * nibbles: + * left: < 12, 0, 10, 8, 0, 1, 0, 0 > + * right: < 12, 0, 10, 8, 0, 2, 2, 1 > + * corresponding to bytes: + * left: < 192, 168, 1, 0 > + * right: < 192, 168, 2, 1 > + * with mask length irrelevant here, unused on return, as the range is already + * defined by its start and end points. 
The mask length is relevant for a single + * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore + * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes + * < 192, 168, 1, 255 >, and the mask length, calculated from the distances + * between leftmost and rightmost bucket indices for each group, would be 24. + * + * Return: mask length, in bits. + */ +static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, + int rule_count, u8 *left, u8 *right) +{ + u8 *l = left, *r = right; + int g, mask_len = 0; + + for (g = 0; g < f->groups; g++) { + int b, x0, x1; + + x0 = -1; + x1 = -1; + for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + unsigned long *pos; + + pos = f->lt + (g * NFT_PIPAPO_BUCKETS + b) * f->bsize; + if (test_bit(first_rule, pos) && x0 == -1) + x0 = b; + if (test_bit(first_rule + rule_count - 1, pos)) + x1 = b; + } + + if (g % 2) { + *(l++) |= x0 & 0x0f; + *(r++) |= x1 & 0x0f; + } else { + *l |= x0 << 4; + *r |= x1 << 4; + } + + if (x1 - x0 == 0) + mask_len += 4; + else if (x1 - x0 == 1) + mask_len += 3; + else if (x1 - x0 == 3) + mask_len += 2; + else if (x1 - x0 == 7) + mask_len += 1; + } + + return mask_len; +} + +/** + * pipapo_match_field() - Match rules against byte ranges + * @f: Field including the lookup table + * @first_rule: First of associated rules originating from same entry + * @rule_count: Amount of associated rules + * @start: Start of range to be matched + * @end: End of range to be matched + * + * Return: true on match, false otherwise. 
+ */
+static bool pipapo_match_field(struct nft_pipapo_field *f,
+			       int first_rule, int rule_count,
+			       const u8 *start, const u8 *end)
+{
+	u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 };
+	u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 };
+
+	/* Rebuild the range these rules were expanded from, then compare it
+	 * with the requested one.
+	 */
+	pipapo_get_boundaries(f, first_rule, rule_count, left, right);
+
+	return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) &&
+	       !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE);
+}
+
+/**
+ * nft_pipapo_remove() - Remove element given key, commit
+ * @net:	Network namespace
+ * @set:	nftables API set representation
+ * @elem:	nftables API element representation containing key data
+ *
+ * Similarly to nft_pipapo_activate(), this is used as commit operation by the
+ * API, but it's called once per element in the pending transaction, so we can't
+ * implement this as a single commit operation. Closest we can get is to remove
+ * the matched element here, if any, and commit the updated matching data.
+ */
+static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
+			      const struct nft_set_elem *elem)
+{
+	const u8 *data = (const u8 *)elem->key.val.data, *data_end;
+	struct nft_pipapo *priv = nft_set_priv(set);
+	struct nft_pipapo_match *m = priv->clone;
+	int rules_f0, first_rule = 0;
+	struct nft_pipapo_elem *e;
+
+	e = pipapo_get(net, set, data, 0);
+	if (IS_ERR(e))
+		return;
+
+	/* As on insertion, an element without end key is a single point:
+	 * the end of the range is the start itself. Don't fetch an extension
+	 * that isn't there.
+	 */
+	if (nft_set_ext_exists(&e->ext, NFT_SET_EXT_KEY_END))
+		data_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data;
+	else
+		data_end = data;
+
+	while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
+		union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
+		const u8 *match_start, *match_end;
+		struct nft_pipapo_field *f;
+		int i, start, rules_fx;
+
+		match_start = data;
+		match_end = data_end;
+
+		start = first_rule;
+		rules_fx = rules_f0;
+
+		nft_pipapo_for_each_field(f, i, m) {
+			if (!pipapo_match_field(f, start, rules_fx,
+						match_start, match_end))
+				break;
+
+			rulemap[i].to = start;
+			rulemap[i].n = rules_fx;
+
+			rules_fx = f->mt[start].n;
+			start = f->mt[start].to;
+
+			match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+			match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+		}
+
+		/* All fields matched: this is the entry to be dropped */
+		if (i == m->field_count) {
+			priv->dirty = true;
+			pipapo_drop(m, rulemap);
+			pipapo_commit(set);
+			return;
+		}
+
+		first_rule += rules_f0;
+	}
+}
+
+/**
+ * nft_pipapo_walk() - Walk over elements
+ * @ctx:	nftables API context
+ * @set:	nftables API set representation
+ * @iter:	Iterator
+ *
+ * As elements are referenced in the mapping array for the last field, directly
+ * scan that array: there's no need to follow rule mappings from the first
+ * field.
+ */
+static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
+			    struct nft_set_iter *iter)
+{
+	struct nft_pipapo *priv = nft_set_priv(set);
+	struct nft_pipapo_match *m;
+	struct nft_pipapo_field *f;
+	int i, r;
+
+	rcu_read_lock();
+	m = rcu_dereference(priv->match);
+
+	if (unlikely(!m))
+		goto out;
+
+	/* Skip to the last field */
+	for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
+		;
+
+	for (r = 0; r < f->rules; r++) {
+		struct nft_pipapo_elem *e;
+		struct nft_set_elem elem;
+
+		/* Consecutive rules mapping to the same element: report it
+		 * once, on the last one.
+		 */
+		if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
+			continue;
+
+		if (iter->count < iter->skip)
+			goto cont;
+
+		e = f->mt[r].e;
+		if (nft_set_elem_expired(&e->ext))
+			goto cont;
+
+		elem.priv = e;
+
+		iter->err = iter->fn(ctx, set, iter, &elem);
+		if (iter->err < 0)
+			goto out;
+
+cont:
+		iter->count++;
+	}
+
+out:
+	rcu_read_unlock();
+}
+
+/**
+ * nft_pipapo_privsize() - Return the size of private data for the set
+ * @nla:	netlink attributes, ignored as size doesn't depend on them
+ * @desc:	Set description, ignored as size doesn't depend on it
+ *
+ * Return: size of private data for this set implementation, in bytes
+ */
+static u64 nft_pipapo_privsize(const struct nlattr * const nla[],
+			       const struct nft_set_desc *desc)
+{
+	return sizeof(struct nft_pipapo);
+}
+
+/**
+ * nft_pipapo_estimate() - Estimate set size, space and lookup complexity
+ * @desc:	Set description, element count and field description used here
+ * @features:	Flags: NFT_SET_INTERVAL needs to
be there + * @est: Storage for estimation data + * + * The size for this set type can vary dramatically, as it depends on the number + * of rules (composing netmasks) the entries expand to. We compute the worst + * case here. + * + * In general, for a non-ranged entry or a single composing netmask, we need + * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that + * is, each input bit needs four bits of matching data), plus a bucket in the + * mapping table for each field. + * + * Return: true only for compatible range concatenations + */ +static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned long entry_size; + int i; + + if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1) + return false; + + for (i = 0, entry_size = 0; i < desc->field_count; i++) { + unsigned long rules; + + if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES) + return false; + + /* Worst-case ranges for each concatenated field: each n-bit + * field can expand to up to n * 2 rules in each bucket, and + * each rule also needs a mapping bucket. 
+ */ + rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2; + entry_size += rules * NFT_PIPAPO_BUCKETS / BITS_PER_BYTE; + entry_size += rules * sizeof(union nft_pipapo_map_bucket); + } + + /* Rules in lookup and mapping tables are needed for each entry */ + est->size = desc->size * entry_size; + if (est->size && div_u64(est->size, desc->size) != entry_size) + return false; + + est->size += sizeof(struct nft_pipapo) + + sizeof(struct nft_pipapo_match) * 2; + + est->size += sizeof(struct nft_pipapo_field) * desc->field_count; + + est->lookup = NFT_SET_CLASS_O_LOG_N; + + est->space = NFT_SET_CLASS_O_N; + + return true; +} + +/** + * nft_pipapo_init() - Initialise data for a set instance + * @set: nftables API set representation + * @desc: Set description + * @nla: netlink attributes + * + * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink + * attributes, initialise internal set parameters, current instance of matching + * data and a copy for subsequent insertions. + * + * Return: 0 on success, negative error code on failure. 
+ */ +static int nft_pipapo_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int err, i; + + if (desc->field_count > NFT_PIPAPO_MAX_FIELDS) + return -EINVAL; + + m = kmalloc(sizeof(*priv->match) + sizeof(*f) * desc->field_count, + GFP_KERNEL); + if (!m) + return -ENOMEM; + + m->field_count = desc->field_count; + m->bsize_max = 0; + + m->scratch = alloc_percpu(unsigned long *); + if (!m->scratch) { + err = -ENOMEM; + goto out_free; + } + for_each_possible_cpu(i) + *per_cpu_ptr(m->scratch, i) = NULL; + + rcu_head_init(&m->rcu); + + nft_pipapo_for_each_field(f, i, m) { + f->groups = desc->field_len[i] * NFT_PIPAPO_GROUPS_PER_BYTE; + priv->groups += f->groups; + + priv->width += round_up(desc->field_len[i], sizeof(u32)); + + f->bsize = 0; + f->rules = 0; + f->lt = NULL; + f->mt = NULL; + } + + /* Create an initial clone of matching data for next insertion */ + priv->clone = pipapo_clone(m); + if (IS_ERR(priv->clone)) { + err = PTR_ERR(priv->clone); + goto out_free; + } + + priv->dirty = false; + + rcu_assign_pointer(priv->match, m); + + return 0; + +out_free: + free_percpu(m->scratch); + kfree(m); + + return err; +} + +/** + * nft_pipapo_destroy() - Free private data for set and all committed elements + * @set: nftables API set representation + */ +static void nft_pipapo_destroy(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int i, r, cpu; + + m = rcu_dereference_protected(priv->match, true); + if (m) { + rcu_barrier(); + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) + ; + + for (r = 0; r < f->rules; r++) { + struct nft_pipapo_elem *e; + + if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) + continue; + + e = f->mt[r].e; + + nft_set_elem_destroy(set, e, true); + } + + for_each_possible_cpu(cpu) + 
kfree(*per_cpu_ptr(m->scratch, cpu)); + free_percpu(m->scratch); + + pipapo_free_fields(m); + kfree(m); + priv->match = NULL; + } + + if (priv->clone) { + for_each_possible_cpu(cpu) + kfree(*per_cpu_ptr(priv->clone->scratch, cpu)); + free_percpu(priv->clone->scratch); + + pipapo_free_fields(priv->clone); + kfree(priv->clone); + priv->clone = NULL; + } +} + +/** + * nft_pipapo_gc_init() - Initialise garbage collection + * @set: nftables API set representation + * + * Instead of actually setting up a periodic work for garbage collection, as + * this operation requires a swap of matching data with the working copy, we'll + * do that opportunistically with other commit operations if the interval is + * elapsed, so we just need to set the current jiffies timestamp here. + */ +static void nft_pipapo_gc_init(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + + priv->last_gc = jiffies; +} + +struct nft_set_type nft_set_pipapo_type __read_mostly = { + .owner = THIS_MODULE, + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | + NFT_SET_TIMEOUT, + .ops = { + .lookup = nft_pipapo_lookup, + .insert = nft_pipapo_insert, + .activate = nft_pipapo_activate, + .deactivate = nft_pipapo_deactivate, + .flush = nft_pipapo_flush, + .remove = nft_pipapo_remove, + .walk = nft_pipapo_walk, + .get = nft_pipapo_get, + .privsize = nft_pipapo_privsize, + .estimate = nft_pipapo_estimate, + .init = nft_pipapo_init, + .destroy = nft_pipapo_destroy, + .gc_init = nft_pipapo_gc_init, + .elemsize = offsetof(struct nft_pipapo_elem, ext), + }, +}; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 419d58ef802b..5000b938ab1e 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -13,7 +13,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> struct nft_rbtree { struct rb_root root; 
@@ -74,8 +74,13 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set parent = rcu_dereference_raw(parent->rb_left); continue; } - if (nft_rbtree_interval_end(rbe)) - goto out; + if (nft_rbtree_interval_end(rbe)) { + if (nft_set_is_anonymous(set)) + return false; + parent = rcu_dereference_raw(parent->rb_left); + interval = NULL; + continue; + } *ext = &rbe->ext; return true; @@ -88,7 +93,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set *ext = &interval->ext; return true; } -out: + return false; } @@ -139,8 +144,10 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, if (flags & NFT_SET_ELEM_INTERVAL_END) interval = rbe; } else { - if (!nft_set_elem_active(&rbe->ext, genmask)) + if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = rcu_dereference_raw(parent->rb_left); + continue; + } if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) || (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == @@ -148,7 +155,11 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, *elem = rbe; return true; } - return false; + + if (nft_rbtree_interval_end(rbe)) + interval = NULL; + + parent = rcu_dereference_raw(parent->rb_left); } } @@ -455,6 +466,9 @@ static void nft_rbtree_destroy(const struct nft_set *set) static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { + if (desc->field_count > 1) + return false; + if (desc->size) est->size = sizeof(struct nft_rbtree) + desc->size * sizeof(struct nft_rbtree_elem); diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index d7f3776dfd71..637ce3e8c575 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -47,9 +47,6 @@ static void nft_socket_eval(const struct nft_expr *expr, return; } - /* So that subsequent socket matching not to require other lookups. 
*/ - skb->sk = sk; - switch(priv->key) { case NFT_SOCKET_TRANSPARENT: nft_reg_store8(dest, inet_sk_transparent(sk)); @@ -66,6 +63,9 @@ static void nft_socket_eval(const struct nft_expr *expr, WARN_ON(1); regs->verdict.code = NFT_BREAK; } + + if (sk != skb->sk) + sock_gen_put(sk); } static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 928e661d1517..e2c1fc608841 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -24,15 +24,15 @@ static void nft_synproxy_tcp_options(struct synproxy_options *opts, const struct tcphdr *tcp, struct synproxy_net *snet, struct nf_synproxy_info *info, - struct nft_synproxy *priv) + const struct nft_synproxy *priv) { this_cpu_inc(snet->stats->syn_received); if (tcp->ece && tcp->cwr) opts->options |= NF_SYNPROXY_OPT_ECN; opts->options &= priv->info.options; - opts->mss_encode = opts->mss; - opts->mss = info->mss; + opts->mss_encode = opts->mss_option; + opts->mss_option = info->mss; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, opts); else @@ -41,14 +41,13 @@ static void nft_synproxy_tcp_options(struct synproxy_options *opts, NF_SYNPROXY_OPT_ECN); } -static void nft_synproxy_eval_v4(const struct nft_expr *expr, +static void nft_synproxy_eval_v4(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt, const struct tcphdr *tcp, struct tcphdr *_tcph, struct synproxy_options *opts) { - struct nft_synproxy *priv = nft_expr_priv(expr); struct nf_synproxy_info info = priv->info; struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); @@ -73,14 +72,13 @@ static void nft_synproxy_eval_v4(const struct nft_expr *expr, } #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) -static void nft_synproxy_eval_v6(const struct nft_expr *expr, +static void nft_synproxy_eval_v6(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo 
*pkt, const struct tcphdr *tcp, struct tcphdr *_tcph, struct synproxy_options *opts) { - struct nft_synproxy *priv = nft_expr_priv(expr); struct nf_synproxy_info info = priv->info; struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); @@ -105,9 +103,9 @@ static void nft_synproxy_eval_v6(const struct nft_expr *expr, } #endif /* CONFIG_NF_TABLES_IPV6*/ -static void nft_synproxy_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_synproxy_do_eval(const struct nft_synproxy *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct synproxy_options opts = {}; struct sk_buff *skb = pkt->skb; @@ -140,23 +138,22 @@ static void nft_synproxy_eval(const struct nft_expr *expr, switch (skb->protocol) { case htons(ETH_P_IP): - nft_synproxy_eval_v4(expr, regs, pkt, tcp, &_tcph, &opts); + nft_synproxy_eval_v4(priv, regs, pkt, tcp, &_tcph, &opts); return; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case htons(ETH_P_IPV6): - nft_synproxy_eval_v6(expr, regs, pkt, tcp, &_tcph, &opts); + nft_synproxy_eval_v6(priv, regs, pkt, tcp, &_tcph, &opts); return; #endif } regs->verdict.code = NFT_BREAK; } -static int nft_synproxy_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_synproxy_do_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_synproxy *priv) { struct synproxy_net *snet = synproxy_pernet(ctx->net); - struct nft_synproxy *priv = nft_expr_priv(expr); u32 flags; int err; @@ -206,8 +203,7 @@ nf_ct_failure: return err; } -static void nft_synproxy_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_synproxy_do_destroy(const struct nft_ctx *ctx) { struct synproxy_net *snet = synproxy_pernet(ctx->net); @@ -229,10 +225,8 @@ static void nft_synproxy_destroy(const struct nft_ctx *ctx, nf_ct_netns_put(ctx->net, ctx->family); } -static int nft_synproxy_dump(struct sk_buff *skb, 
const struct nft_expr *expr) +static int nft_synproxy_do_dump(struct sk_buff *skb, struct nft_synproxy *priv) { - const struct nft_synproxy *priv = nft_expr_priv(expr); - if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) || nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) || nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options))) @@ -244,6 +238,15 @@ nla_put_failure: return -1; } +static void nft_synproxy_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_synproxy *priv = nft_expr_priv(expr); + + nft_synproxy_do_eval(priv, regs, pkt); +} + static int nft_synproxy_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) @@ -252,6 +255,28 @@ static int nft_synproxy_validate(const struct nft_ctx *ctx, (1 << NF_INET_FORWARD)); } +static int nft_synproxy_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + + return nft_synproxy_do_init(ctx, tb, priv); +} + +static void nft_synproxy_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nft_synproxy_do_destroy(ctx); +} + +static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + + return nft_synproxy_do_dump(skb, priv); +} + static struct nft_expr_type nft_synproxy_type; static const struct nft_expr_ops nft_synproxy_ops = { .eval = nft_synproxy_eval, @@ -271,14 +296,89 @@ static struct nft_expr_type nft_synproxy_type __read_mostly = { .maxattr = NFTA_SYNPROXY_MAX, }; +static int nft_synproxy_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_synproxy *priv = nft_obj_data(obj); + + return nft_synproxy_do_init(ctx, tb, priv); +} + +static void nft_synproxy_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + 
nft_synproxy_do_destroy(ctx); +} + +static int nft_synproxy_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + struct nft_synproxy *priv = nft_obj_data(obj); + + return nft_synproxy_do_dump(skb, priv); +} + +static void nft_synproxy_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_synproxy *priv = nft_obj_data(obj); + + nft_synproxy_do_eval(priv, regs, pkt); +} + +static void nft_synproxy_obj_update(struct nft_object *obj, + struct nft_object *newobj) +{ + struct nft_synproxy *newpriv = nft_obj_data(newobj); + struct nft_synproxy *priv = nft_obj_data(obj); + + priv->info = newpriv->info; +} + +static struct nft_object_type nft_synproxy_obj_type; +static const struct nft_object_ops nft_synproxy_obj_ops = { + .type = &nft_synproxy_obj_type, + .size = sizeof(struct nft_synproxy), + .init = nft_synproxy_obj_init, + .destroy = nft_synproxy_obj_destroy, + .dump = nft_synproxy_obj_dump, + .eval = nft_synproxy_obj_eval, + .update = nft_synproxy_obj_update, +}; + +static struct nft_object_type nft_synproxy_obj_type __read_mostly = { + .type = NFT_OBJECT_SYNPROXY, + .ops = &nft_synproxy_obj_ops, + .maxattr = NFTA_SYNPROXY_MAX, + .policy = nft_synproxy_policy, + .owner = THIS_MODULE, +}; + static int __init nft_synproxy_module_init(void) { - return nft_register_expr(&nft_synproxy_type); + int err; + + err = nft_register_obj(&nft_synproxy_obj_type); + if (err < 0) + return err; + + err = nft_register_expr(&nft_synproxy_type); + if (err < 0) + goto err; + + return 0; + +err: + nft_unregister_obj(&nft_synproxy_obj_type); + return err; } static void __exit nft_synproxy_module_exit(void) { - return nft_unregister_expr(&nft_synproxy_type); + nft_unregister_expr(&nft_synproxy_type); + nft_unregister_obj(&nft_synproxy_obj_type); } module_init(nft_synproxy_module_init); @@ -287,3 +387,4 @@ module_exit(nft_synproxy_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fernando Fernandez 
<ffmancera@riseup.net>"); MODULE_ALIAS_NFT_EXPR("synproxy"); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_SYNPROXY); diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index f92a82c73880..d67f83a0958d 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -50,7 +50,7 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr); if (priv->sreg_port) - tport = regs->data[priv->sreg_port]; + tport = nft_reg_load16(&regs->data[priv->sreg_port]); if (!tport) tport = hp->dest; @@ -117,7 +117,7 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr); if (priv->sreg_port) - tport = regs->data[priv->sreg_port]; + tport = nft_reg_load16(&regs->data[priv->sreg_port]); if (!tport) tport = hp->dest; @@ -218,14 +218,14 @@ static int nft_tproxy_init(const struct nft_ctx *ctx, switch (priv->family) { case NFPROTO_IPV4: - alen = FIELD_SIZEOF(union nf_inet_addr, in); + alen = sizeof_field(union nf_inet_addr, in); err = nf_defrag_ipv4_enable(ctx->net); if (err) return err; break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: - alen = FIELD_SIZEOF(union nf_inet_addr, in6); + alen = sizeof_field(union nf_inet_addr, in6); err = nf_defrag_ipv6_enable(ctx->net); if (err) return err; diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 3d4c2ae605a8..4c3f2e24c7cb 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -76,7 +76,7 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, struct nft_tunnel *priv = nft_expr_priv(expr); u32 len; - if (!tb[NFTA_TUNNEL_KEY] && + if (!tb[NFTA_TUNNEL_KEY] || !tb[NFTA_TUNNEL_DREG]) return -EINVAL; @@ -248,8 +248,9 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, } static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = { + [NFTA_TUNNEL_KEY_ERSPAN_VERSION] = { .type = NLA_U32 }, 
[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 }, - [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 }, + [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 }, [NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 }, }; @@ -266,6 +267,9 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, if (err < 0) return err; + if (!tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION]) + return -EINVAL; + version = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION])); switch (version) { case ERSPAN_VERSION: @@ -442,10 +446,15 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info) if (!nest) return -1; - if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 || - nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 || - nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label)) + if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, + &info->key.u.ipv6.src) < 0 || + nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, + &info->key.u.ipv6.dst) < 0 || + nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, + info->key.label)) { + nla_nest_cancel(skb, nest); return -1; + } nla_nest_end(skb, nest); } else { @@ -453,9 +462,13 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info) if (!nest) return -1; - if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 || - nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0) + if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, + info->key.u.ipv4.src) < 0 || + nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, + info->key.u.ipv4.dst) < 0) { + nla_nest_cancel(skb, nest); return -1; + } nla_nest_end(skb, nest); } @@ -467,42 +480,58 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, struct nft_tunnel_obj *priv) { struct nft_tunnel_opts *opts = &priv->opts; - struct nlattr *nest; + struct nlattr *nest, *inner; nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS); if (!nest) return -1; if (opts->flags & 
TUNNEL_VXLAN_OPT) { + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN); + if (!inner) + goto failure; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP, htonl(opts->u.vxlan.gbp))) - return -1; + goto inner_failure; + nla_nest_end(skb, inner); } else if (opts->flags & TUNNEL_ERSPAN_OPT) { + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN); + if (!inner) + goto failure; + if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_VERSION, + htonl(opts->u.erspan.version))) + goto inner_failure; switch (opts->u.erspan.version) { case ERSPAN_VERSION: if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX, opts->u.erspan.u.index)) - return -1; + goto inner_failure; break; case ERSPAN_VERSION2: if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID, get_hwid(&opts->u.erspan.u.md2)) || nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR, opts->u.erspan.u.md2.dir)) - return -1; + goto inner_failure; break; } + nla_nest_end(skb, inner); } nla_nest_end(skb, nest); - return 0; + +inner_failure: + nla_nest_cancel(skb, inner); +failure: + nla_nest_cancel(skb, nest); + return -1; } static int nft_tunnel_ports_dump(struct sk_buff *skb, struct ip_tunnel_info *info) { - if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 || - nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0) + if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, info->key.tp_src) < 0 || + nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, info->key.tp_dst) < 0) return -1; return 0; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index ce70c2576bb2..e27c6c5ba9df 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -939,14 +939,14 @@ EXPORT_SYMBOL(xt_check_entry_offsets); * * @size: number of entries * - * Return: NULL or kmalloc'd or vmalloc'd array + * Return: NULL or zeroed kmalloc'd or vmalloc'd array */ unsigned int *xt_alloc_entry_offsets(unsigned int size) { if (size > XT_MAX_TABLE_SIZE / sizeof(unsigned int)) return NULL; - return 
kvmalloc_array(size, sizeof(unsigned int), GFP_KERNEL | __GFP_ZERO); + return kvcalloc(size, sizeof(unsigned int), GFP_KERNEL); } EXPORT_SYMBOL(xt_alloc_entry_offsets); diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index be7798a50546..713fb38541df 100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -239,11 +239,7 @@ static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff) return 0; /* Error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) + if (!icmp_is_err(icmph->type)) return 0; *nhoff += iphsz + sizeof(_ih); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 9cec9eae556a..f56d3ed93e56 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -283,7 +283,7 @@ static int __init idletimer_tg_init(void) idletimer_tg_kobj = &idletimer_tg_device->kobj; - err = xt_register_target(&idletimer_tg); + err = xt_register_target(&idletimer_tg); if (err < 0) { pr_debug("couldn't register xt target\n"); goto out_dev; diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 2236455b10a3..37253d399c6b 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -30,7 +30,7 @@ static unsigned int jhash_rnd __read_mostly; static unsigned int xt_rateest_hash(const char *name) { - return jhash(name, FIELD_SIZEOF(struct xt_rateest, name), jhash_rnd) & + return jhash(name, sizeof_field(struct xt_rateest, name), jhash_rnd) & (RATEEST_HSIZE - 1); } diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index bc6c8ab0fa62..46fcac75f726 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -13,6 +13,8 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/ip.h> +#include <linux/ipv6.h> #include <linux/module.h> #include <linux/skbuff.h> #include 
<linux/netfilter/x_tables.h> diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 2d2691dd51e0..bccd47cd7190 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -34,9 +34,14 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> -#include <linux/netfilter/xt_hashlimit.h> #include <linux/mutex.h> #include <linux/kernel.h> +#include <uapi/linux/netfilter/xt_hashlimit.h> + +#define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \ + XT_HASHLIMIT_HASH_SIP | XT_HASHLIMIT_HASH_SPT | \ + XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES |\ + XT_HASHLIMIT_RATE_MATCH) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); @@ -352,21 +357,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, return 0; } -static bool select_all(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he) -{ - return true; -} - -static bool select_gc(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he) -{ - return time_after_eq(jiffies, he->expires); -} - -static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, - bool (*select)(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he)) +static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select_all) { unsigned int i; @@ -376,7 +367,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, spin_lock_bh(&ht->lock); hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { - if ((*select)(ht, dh)) + if (time_after_eq(jiffies, dh->expires) || select_all) dsthash_free(ht, dh); } spin_unlock_bh(&ht->lock); @@ -390,7 +381,7 @@ static void htable_gc(struct work_struct *work) ht = container_of(work, struct xt_hashlimit_htable, gc_work.work); - htable_selective_cleanup(ht, select_gc); + htable_selective_cleanup(ht, false); queue_delayed_work(system_power_efficient_wq, &ht->gc_work, 
msecs_to_jiffies(ht->cfg.gc_interval)); @@ -414,7 +405,7 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo) { cancel_delayed_work_sync(&hinfo->gc_work); htable_remove_proc_entry(hinfo); - htable_selective_cleanup(hinfo, select_all); + htable_selective_cleanup(hinfo, true); kfree(hinfo->name); vfree(hinfo); } diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index d0ab1adf5bff..5aab6df74e0f 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -54,25 +54,39 @@ nfacct_mt_destroy(const struct xt_mtdtor_param *par) nfnl_acct_put(info->nfacct); } -static struct xt_match nfacct_mt_reg __read_mostly = { - .name = "nfacct", - .family = NFPROTO_UNSPEC, - .checkentry = nfacct_mt_checkentry, - .match = nfacct_mt, - .destroy = nfacct_mt_destroy, - .matchsize = sizeof(struct xt_nfacct_match_info), - .usersize = offsetof(struct xt_nfacct_match_info, nfacct), - .me = THIS_MODULE, +static struct xt_match nfacct_mt_reg[] __read_mostly = { + { + .name = "nfacct", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = nfacct_mt_checkentry, + .match = nfacct_mt, + .destroy = nfacct_mt_destroy, + .matchsize = sizeof(struct xt_nfacct_match_info), + .usersize = offsetof(struct xt_nfacct_match_info, nfacct), + .me = THIS_MODULE, + }, + { + .name = "nfacct", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = nfacct_mt_checkentry, + .match = nfacct_mt, + .destroy = nfacct_mt_destroy, + .matchsize = sizeof(struct xt_nfacct_match_info_v1), + .usersize = offsetof(struct xt_nfacct_match_info_v1, nfacct), + .me = THIS_MODULE, + }, }; static int __init nfacct_mt_init(void) { - return xt_register_match(&nfacct_mt_reg); + return xt_register_matches(nfacct_mt_reg, ARRAY_SIZE(nfacct_mt_reg)); } static void __exit nfacct_mt_exit(void) { - xt_unregister_match(&nfacct_mt_reg); + xt_unregister_matches(nfacct_mt_reg, ARRAY_SIZE(nfacct_mt_reg)); } module_init(nfacct_mt_init); diff --git a/net/netfilter/xt_physdev.c 
b/net/netfilter/xt_physdev.c index ead7c6022208..ec6ed6fda96c 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -5,12 +5,13 @@ /* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/if.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter_bridge.h> -#include <linux/netfilter/xt_physdev.h> #include <linux/netfilter/x_tables.h> -#include <net/netfilter/br_netfilter.h> +#include <uapi/linux/netfilter/xt_physdev.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); @@ -101,11 +102,9 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) && (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || info->invert & XT_PHYSDEV_OP_BRIDGED) && - par->hook_mask & ((1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) { + par->hook_mask & (1 << NF_INET_LOCAL_OUT)) { pr_info_ratelimited("--physdev-out and --physdev-is-out only supported in the FORWARD and POSTROUTING chains with bridged traffic\n"); - if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) - return -EINVAL; + return -EINVAL; } if (!brnf_probed) { diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 781e0b482189..0a9708004e20 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(recent_lock); static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS -static const struct file_operations recent_mt_fops; +static const struct proc_ops recent_mt_proc_ops; #endif static u_int32_t hash_rnd __read_mostly; @@ -405,7 +405,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par, goto out; } pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent, - &recent_mt_fops, t); + &recent_mt_proc_ops, t); if (pde == NULL) { recent_table_free(t); ret = -ENOMEM; @@ -616,13 +616,12 @@ recent_mt_proc_write(struct 
file *file, const char __user *input, return size + 1; } -static const struct file_operations recent_mt_fops = { - .open = recent_seq_open, - .read = seq_read, - .write = recent_mt_proc_write, - .release = seq_release_private, - .owner = THIS_MODULE, - .llseek = seq_lseek, +static const struct proc_ops recent_mt_proc_ops = { + .proc_open = recent_seq_open, + .proc_read = seq_read, + .proc_write = recent_mt_proc_write, + .proc_release = seq_release_private, + .proc_lseek = seq_lseek, }; static int __net_init recent_proc_net_init(struct net *net) diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index ecbfa291fb70..731bc2cafae4 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -14,7 +14,6 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/ipset/ip_set.h> -#include <linux/netfilter/ipset/ip_set_timeout.h> #include <uapi/linux/netfilter/xt_set.h> MODULE_LICENSE("GPL"); diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 8dbb4d48f2ed..67cb98489415 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -77,12 +77,12 @@ static inline bool is_leap(unsigned int y) * This is done in three separate functions so that the most expensive * calculations are done last, in case a "simple match" can be found earlier. */ -static inline unsigned int localtime_1(struct xtm *r, time_t time) +static inline unsigned int localtime_1(struct xtm *r, time64_t time) { unsigned int v, w; /* Each day has 86400s, so finding the hour/minute is actually easy. */ - v = time % SECONDS_PER_DAY; + div_u64_rem(time, SECONDS_PER_DAY, &v); r->second = v % 60; w = v / 60; r->minute = w % 60; @@ -90,13 +90,13 @@ static inline unsigned int localtime_1(struct xtm *r, time_t time) return v; } -static inline void localtime_2(struct xtm *r, time_t time) +static inline void localtime_2(struct xtm *r, time64_t time) { /* * Here comes the rest (weekday, monthday). 
First, divide the SSTE * by seconds-per-day to get the number of _days_ since the epoch. */ - r->dse = time / 86400; + r->dse = div_u64(time, SECONDS_PER_DAY); /* * 1970-01-01 (w=0) was a Thursday (4). @@ -105,7 +105,7 @@ static inline void localtime_2(struct xtm *r, time_t time) r->weekday = (4 + r->dse - 1) % 7 + 1; } -static void localtime_3(struct xtm *r, time_t time) +static void localtime_3(struct xtm *r, time64_t time) { unsigned int year, i, w = r->dse; @@ -160,7 +160,7 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_time_info *info = par->matchinfo; unsigned int packet_time; struct xtm current_time; - s64 stamp; + time64_t stamp; /* * We need real time here, but we can neither use skb->tstamp @@ -173,14 +173,14 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) * 1. match before 13:00 * 2. match after 13:00 * - * If you match against processing time (get_seconds) it + * If you match against processing time (ktime_get_real_seconds) it * may happen that the same packet matches both rules if * it arrived at the right moment before 13:00, so it would be * better to check skb->tstamp and set it via __net_timestamp() * if needed. This however breaks outgoing packets tx timestamp, * and causes them to get delayed forever by fq packet scheduler. */ - stamp = get_seconds(); + stamp = ktime_get_real_seconds(); if (info->flags & XT_TIME_LOCAL_TZ) /* Adjust for local timezone */ @@ -193,6 +193,9 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) * - 'now' is in the weekday mask * - 'now' is in the daytime range time_start..time_end * (and by default, libxt_time will set these so as to match) + * + * note: info->date_start/stop are unsigned 32-bit values that + * can hold values beyond y2038, but not after y2106. */ if (stamp < info->date_start || stamp > info->date_stop) |