From 4f031fa9f188b2b0641ac20087d9e16bcfb4e49d Mon Sep 17 00:00:00 2001 From: Ronald Wahl Date: Thu, 6 Nov 2014 11:52:13 +0100 Subject: mac80211: Fix regression that triggers a kernel BUG with CCMP Commit 7ec7c4a9a686c608315739ab6a2b0527a240883c (mac80211: port CCMP to cryptoapi's CCM driver) introduced a regression when decrypting empty packets (data_len == 0). This will lead to backtraces like: (scatterwalk_start) from [] (scatterwalk_map_and_copy+0x2c/0xa8) (scatterwalk_map_and_copy) from [] (crypto_ccm_decrypt+0x7c/0x25c) (crypto_ccm_decrypt) from [] (ieee80211_aes_ccm_decrypt+0x160/0x170) (ieee80211_aes_ccm_decrypt) from [] (ieee80211_crypto_ccmp_decrypt+0x1ac/0x238) (ieee80211_crypto_ccmp_decrypt) from [] (ieee80211_rx_handlers+0x870/0x1d24) (ieee80211_rx_handlers) from [] (ieee80211_prepare_and_rx_handle+0x8a0/0x91c) (ieee80211_prepare_and_rx_handle) from [] (ieee80211_rx+0x568/0x730) (ieee80211_rx) from [] (__carl9170_rx+0x94c/0xa20) (__carl9170_rx) from [] (carl9170_rx_stream+0x1fc/0x320) (carl9170_rx_stream) from [] (carl9170_usb_tasklet+0x80/0xc8) (carl9170_usb_tasklet) from [] (tasklet_hi_action+0x88/0xcc) (tasklet_hi_action) from [] (__do_softirq+0xcc/0x200) (__do_softirq) from [] (irq_exit+0x80/0xe0) (irq_exit) from [] (handle_IRQ+0x64/0x80) (handle_IRQ) from [] (__irq_svc+0x40/0x4c) (__irq_svc) from [] (arch_cpu_idle+0x2c/0x34) Such packets can appear for example when using the carl9170 wireless driver because hardware sometimes generates garbage when the internal FIFO overruns. This patch adds an additional length check. 
Cc: stable@vger.kernel.org Fixes: 7ec7c4a9a686 ("mac80211: port CCMP to cryptoapi's CCM driver") Acked-by: Ard Biesheuvel Signed-off-by: Ronald Wahl Signed-off-by: Johannes Berg --- net/mac80211/aes_ccm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c index ec24378caaaf..09d9caaec591 100644 --- a/net/mac80211/aes_ccm.c +++ b/net/mac80211/aes_ccm.c @@ -53,6 +53,9 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, __aligned(__alignof__(struct aead_request)); struct aead_request *aead_req = (void *) aead_req_data; + if (data_len == 0) + return -EINVAL; + memset(aead_req, 0, sizeof(aead_req_data)); sg_init_one(&pt, data, data_len); -- cgit v1.2.1 From 6b96686ecffcbea85dcb502e4584e4a20a2bfb29 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 7 Nov 2014 15:34:54 +0100 Subject: netfilter: nft_masq: fix uninitialized range in nft_masq_{ipv4, ipv6}_eval When transferring from the original range in nf_nat_masquerade_{ipv4,ipv6}() we copy over values from stack in from min_proto/max_proto due to uninitialized range variable in both, nft_masq_{ipv4,ipv6}_eval. As we only initialize flags at this time from nft_masq struct, just zero out the rest. 
Fixes: 9ba1f726bec09 ("netfilter: nf_tables: add new nft_masq expression") Signed-off-by: Daniel Borkmann Acked-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nft_masq_ipv4.c | 1 + net/ipv6/netfilter/nft_masq_ipv6.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index c1023c445920..665de06561cd 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -24,6 +24,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, struct nf_nat_range range; unsigned int verdict; + memset(&range, 0, sizeof(range)); range.flags = priv->flags; verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 8a7ac685076d..529c119cbb14 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -25,6 +25,7 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, struct nf_nat_range range; unsigned int verdict; + memset(&range, 0, sizeof(range)); range.flags = priv->flags; verdict = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out); -- cgit v1.2.1 From 2196937e12b1b4ba139806d132647e1651d655df Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 10 Nov 2014 17:11:21 +0100 Subject: netfilter: ipset: small potential read beyond the end of buffer We could be reading 8 bytes into a 4 byte buffer here. It seems harmless but adding a check is the right thing to do and it silences a static checker warning. 
Signed-off-by: Dan Carpenter Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 86f9d76b1464..d259da3ce67a 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1863,6 +1863,12 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) if (*op < IP_SET_OP_VERSION) { /* Check the version at the beginning of operations */ struct ip_set_req_version *req_version = data; + + if (*len < sizeof(struct ip_set_req_version)) { + ret = -EINVAL; + goto done; + } + if (req_version->version != IPSET_PROTOCOL) { ret = -EPROTO; goto done; -- cgit v1.2.1 From 50656d9df63d69ce399c8be62d4473b039dac36a Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Tue, 4 Nov 2014 16:37:40 -0800 Subject: ipvs: Keep skb->sk when allocating headroom on tunnel xmit ip_vs_prepare_tunneled_skb() ignores ->sk when allocating a new skb, either unconditionally setting ->sk to NULL or allowing the uninitialized ->sk from a newly allocated skb to leak through to the caller. This patch properly copies ->sk and increments its reference count. 
Signed-off-by: Calvin Owens Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 437a3663ad03..bd90bf8107da 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -846,6 +846,8 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) goto error; + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } -- cgit v1.2.1 From 2daf1b4d18e3add229d1a3b5c554331d99ac6c7e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 7 Nov 2014 18:48:33 +0100 Subject: netfilter: nft_compat: use current net namespace Instead of init_net when using xtables over nftables compat. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 9d6d6f60a80f..b92f129beade 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -117,7 +117,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, struct xt_target *target, void *info, union nft_entry *entry, u8 proto, bool inv) { - par->net = &init_net; + par->net = ctx->net; par->table = ctx->table->name; switch (ctx->afi->family) { case AF_INET: @@ -324,7 +324,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, struct xt_match *match, void *info, union nft_entry *entry, u8 proto, bool inv) { - par->net = &init_net; + par->net = ctx->net; par->table = ctx->table->name; switch (ctx->afi->family) { case AF_INET: -- cgit v1.2.1 From c918687f5e3962375a19de6ded3c1be85ebdbcd6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 10 Nov 2014 20:53:55 +0100 Subject: netfilter: nft_compat: relax chain type validation Check for 
nat chain dependency only, which is the one that can actually crash the kernel. Don't care if mangle, filter and security specific match and targets are used out of their scope, they are harmless. This restores iptables-compat with mangle specific match/target when used out of the OUTPUT chain, that are actually emulated through filter chains, which broke when performing strict validation. Fixes: f3f5dde ("netfilter: nft_compat: validate chain type in match/target") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index b92f129beade..70dc96516305 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -21,45 +21,17 @@ #include #include -static const struct { - const char *name; - u8 type; -} table_to_chaintype[] = { - { "filter", NFT_CHAIN_T_DEFAULT }, - { "raw", NFT_CHAIN_T_DEFAULT }, - { "security", NFT_CHAIN_T_DEFAULT }, - { "mangle", NFT_CHAIN_T_ROUTE }, - { "nat", NFT_CHAIN_T_NAT }, - { }, -}; - -static int nft_compat_table_to_chaintype(const char *table) -{ - int i; - - for (i = 0; table_to_chaintype[i].name != NULL; i++) { - if (strcmp(table_to_chaintype[i].name, table) == 0) - return table_to_chaintype[i].type; - } - - return -1; -} - static int nft_compat_chain_validate_dependency(const char *tablename, const struct nft_chain *chain) { - enum nft_chain_type type; const struct nft_base_chain *basechain; if (!tablename || !(chain->flags & NFT_BASE_CHAIN)) return 0; - type = nft_compat_table_to_chaintype(tablename); - if (type < 0) - return -EINVAL; - basechain = nft_base_chain(chain); - if (basechain->type->type != type) + if (strcmp(tablename, "nat") == 0 && + basechain->type->type != NFT_CHAIN_T_NAT) return -EINVAL; return 0; -- cgit v1.2.1 From afefb6f928ed42d5db452ee9251ce6de62673c67 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 10 
Nov 2014 19:08:21 +0100 Subject: netfilter: nft_compat: use the match->table to validate dependencies Instead of the match->name, which is of course not relevant. Fixes: f3f5dde ("netfilter: nft_compat: validate chain type in match/target") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 70dc96516305..265e190f2218 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -346,7 +346,7 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, union nft_entry e = {}; int ret; - ret = nft_compat_chain_validate_dependency(match->name, ctx->chain); + ret = nft_compat_chain_validate_dependency(match->table, ctx->chain); if (ret < 0) goto err; @@ -420,7 +420,7 @@ static int nft_match_validate(const struct nft_ctx *ctx, if (!(hook_mask & match->hooks)) return -EINVAL; - ret = nft_compat_chain_validate_dependency(match->name, + ret = nft_compat_chain_validate_dependency(match->table, ctx->chain); if (ret < 0) return ret; -- cgit v1.2.1 From b326dd37b94e29bf6a15940f4fa66aa21a678ab1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 10 Nov 2014 21:14:12 +0100 Subject: netfilter: nf_tables: restore synchronous object release from commit/abort The existing xtables matches and targets, when used from nft_compat, may sleep from the destroy path, ie. when removing rules. Since the objects are released via call_rcu from softirq context, this results in lockdep splats and possible lockups that may be hard to reproduce. Patrick also indicated that delayed object release via call_rcu can cause us problems in the ordering of event notifications when anonymous sets are in place. So, this patch restores the synchronous object release from the commit and abort paths. 
This includes a call to synchronize_rcu() to make sure that no packets are walking on the objects that are going to be released. This is slower though, but it's simple and it resolves the aforementioned problems. This is a partial revert of c7c32e7 ("netfilter: nf_tables: defer all object release via rcu") that was introduced in 3.16 to speed up interaction with userspace. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 11ab4b078f3b..66e8425dbfe7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3484,13 +3484,8 @@ static void nft_chain_commit_update(struct nft_trans *trans) } } -/* Schedule objects for release via rcu to make sure no packets are accesing - * removed rules. - */ -static void nf_tables_commit_release_rcu(struct rcu_head *rt) +static void nf_tables_commit_release(struct nft_trans *trans) { - struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); - switch (trans->msg_type) { case NFT_MSG_DELTABLE: nf_tables_table_destroy(&trans->ctx); @@ -3612,10 +3607,11 @@ static int nf_tables_commit(struct sk_buff *skb) } } + synchronize_rcu(); + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { list_del(&trans->list); - trans->ctx.nla = NULL; - call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu); + nf_tables_commit_release(trans); } nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); @@ -3623,13 +3619,8 @@ static int nf_tables_commit(struct sk_buff *skb) return 0; } -/* Schedule objects for release via rcu to make sure no packets are accesing - * aborted rules. 
- */ -static void nf_tables_abort_release_rcu(struct rcu_head *rt) +static void nf_tables_abort_release(struct nft_trans *trans) { - struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); - switch (trans->msg_type) { case NFT_MSG_NEWTABLE: nf_tables_table_destroy(&trans->ctx); @@ -3725,11 +3716,12 @@ static int nf_tables_abort(struct sk_buff *skb) } } + synchronize_rcu(); + list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, list) { list_del(&trans->list); - trans->ctx.nla = NULL; - call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu); + nf_tables_abort_release(trans); } return 0; -- cgit v1.2.1 From 5195c14c8b27cc0b18220ddbf0e5ad3328a04187 Mon Sep 17 00:00:00 2001 From: bill bonaparte Date: Thu, 6 Nov 2014 14:36:48 +0100 Subject: netfilter: conntrack: fix race in __nf_conntrack_confirm against get_next_corpse After removal of the central spinlock nf_conntrack_lock, in commit 93bb0ceb75be2 ("netfilter: conntrack: remove central spinlock nf_conntrack_lock"), it is possible to race against get_next_corpse(). The race is against the get_next_corpse() cleanup on the "unconfirmed" list (a per-cpu list with separate locking), which sets the DYING bit. Fix this race, in __nf_conntrack_confirm(), by removing the CT from unconfirmed list before checking the DYING bit. In case race occurred, re-add the CT to the dying list. While at this, fix coding style of the comment that has been updated. 
Fixes: 93bb0ceb75be2 ("netfilter: conntrack: remove central spinlock nf_conntrack_lock") Reported-by: bill bonaparte Signed-off-by: bill bonaparte Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5016a6929085..2c699757bccf 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -611,12 +611,16 @@ __nf_conntrack_confirm(struct sk_buff *skb) */ NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); pr_debug("Confirming conntrack %p\n", ct); - /* We have to check the DYING flag inside the lock to prevent - a race against nf_ct_get_next_corpse() possibly called from - user context, else we insert an already 'dead' hash, blocking - further use of that particular connection -JM */ + + /* We have to check the DYING flag after unlink to prevent + * a race against nf_ct_get_next_corpse() possibly called from + * user context, else we insert an already 'dead' hash, blocking + * further use of that particular connection -JM. + */ + nf_ct_del_from_dying_or_unconfirmed_list(ct); if (unlikely(nf_ct_is_dying(ct))) { + nf_ct_add_to_dying_list(ct); nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); return NF_ACCEPT; @@ -636,8 +640,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - nf_ct_del_from_dying_or_unconfirmed_list(ct); - /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ -- cgit v1.2.1 From ab64f16ff2e83371927c57a0380fd3c0fee5c1c1 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 11 Nov 2014 13:40:49 -0800 Subject: openvswitch: Fix memory leak. Need to free memory in case of sample action error. 
Introduced by commit 651887b0c22cffcfce7eb9c ("openvswitch: Sample action without side effects"). Signed-off-by: Pravin B Shelar --- net/openvswitch/actions.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 006886dbee36..00e447a17f64 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -722,8 +722,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, key, a); - if (unlikely(err)) /* skb already freed. */ - return err; break; } -- cgit v1.2.1 From 856447d0209c2214d806b30bd3b0d873db5998bd Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 11 Nov 2014 14:32:20 -0800 Subject: openvswitch: Fix checksum calculation when modifying ICMPv6 packets. The checksum of ICMPv6 packets uses the IP pseudoheader as part of the calculation, unlike ICMP in IPv4. This was not implemented, which means that modifying the IP addresses of an ICMPv6 packet would cause the checksum to no longer be correct as the pseudoheader did not match. Introduced by commit 3fdbd1ce11e5 ("openvswitch: add ipv6 'set' action"). 
Reported-by: Neal Shrader Signed-off-by: Jesse Gross Signed-off-by: Pravin B Shelar --- net/openvswitch/actions.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 00e447a17f64..8c4229b11c34 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -246,11 +246,11 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, { int transport_len = skb->len - skb_transport_offset(skb); - if (l4_proto == IPPROTO_TCP) { + if (l4_proto == NEXTHDR_TCP) { if (likely(transport_len >= sizeof(struct tcphdr))) inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, addr, new_addr, 1); - } else if (l4_proto == IPPROTO_UDP) { + } else if (l4_proto == NEXTHDR_UDP) { if (likely(transport_len >= sizeof(struct udphdr))) { struct udphdr *uh = udp_hdr(skb); @@ -261,6 +261,10 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, uh->check = CSUM_MANGLED_0; } } + } else if (l4_proto == NEXTHDR_ICMP) { + if (likely(transport_len >= sizeof(struct icmp6hdr))) + inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum, + skb, addr, new_addr, 1); } } -- cgit v1.2.1 From 19e7a3df7261c9b7ebced8163c383712d5b6ac6b Mon Sep 17 00:00:00 2001 From: Daniele Di Proietto Date: Tue, 11 Nov 2014 14:51:22 -0800 Subject: openvswitch: Fix NDP flow mask validation match_validate() enforce that a mask matching on NDP attributes has also an exact match on ICMPv6 type. The ICMPv6 type, which is 8-bit wide, is stored in the 'tp.src' field of 'struct sw_flow_key', which is 16-bit wide. Therefore, an exact match on ICMPv6 type should only check the first 8 bits. This commit fixes a bug that prevented flows with an exact match on NDP field from being installed Introduced by commit 03f0d916aa03 ("openvswitch: Mega flow implementation"). 
Signed-off-by: Daniele Di Proietto Signed-off-by: Pravin B Shelar --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 939bcb32100f..dda040e693a3 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -145,7 +145,7 @@ static bool match_validate(const struct sw_flow_match *match, if (match->key->eth.type == htons(ETH_P_ARP) || match->key->eth.type == htons(ETH_P_RARP)) { key_expected |= 1 << OVS_KEY_ATTR_ARP; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + if (match->mask && (match->mask->key.tp.src == htons(0xff))) mask_allowed |= 1 << OVS_KEY_ATTR_ARP; } -- cgit v1.2.1 From 8ec609d8b561468691b60347ff594bd443ea58c0 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 11 Nov 2014 15:55:16 -0800 Subject: openvswitch: Convert dp rcu read operation to locked operations dp read operations depend on ovs_dp_cmd_fill_info(). This API needs to look up vport to find dp name, but vport lookup can fail. Therefore to keep vport reference alive we need to take ovs lock. Introduced by commit 6093ae9abac1 ("openvswitch: Minimize dp and vport critical sections"). Signed-off-by: Pravin B Shelar Acked-by: Andy Zhou --- net/openvswitch/datapath.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index e6d7255183eb..f9e556b56086 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1265,7 +1265,7 @@ static size_t ovs_dp_cmd_msg_size(void) return msgsize; } -/* Called with ovs_mutex or RCU read lock. */ +/* Called with ovs_mutex. 
*/ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) { @@ -1555,7 +1555,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) if (!reply) return -ENOMEM; - rcu_read_lock(); + ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); if (IS_ERR(dp)) { err = PTR_ERR(dp); @@ -1564,12 +1564,12 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_NEW); BUG_ON(err < 0); - rcu_read_unlock(); + ovs_unlock(); return genlmsg_reply(reply, info); err_unlock_free: - rcu_read_unlock(); + ovs_unlock(); kfree_skb(reply); return err; } @@ -1581,8 +1581,8 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) int skip = cb->args[0]; int i = 0; - rcu_read_lock(); - list_for_each_entry_rcu(dp, &ovs_net->dps, list_node) { + ovs_lock(); + list_for_each_entry(dp, &ovs_net->dps, list_node) { if (i >= skip && ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, @@ -1590,7 +1590,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) break; i++; } - rcu_read_unlock(); + ovs_unlock(); cb->args[0] = i; -- cgit v1.2.1 From fecaef85f7188ad1822210e2c7a7625c9a32a8e4 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Tue, 11 Nov 2014 14:36:30 -0800 Subject: openvswitch: Validate IPv6 flow key and mask values. Reject flow label key and mask values with invalid bits set. Introduced by commit 3fdbd1ce11e5 ("openvswitch: add ipv6 'set' action"). 
Signed-off-by: Jarno Rajahalme Acked-by: Jesse Gross Signed-off-by: Pravin B Shelar --- net/openvswitch/flow_netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index dda040e693a3..fa4ec2e4a78b 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -689,6 +689,13 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); return -EINVAL; } + + if (ipv6_key->ipv6_label & htonl(0xFFF00000)) { + OVS_NLERR("IPv6 flow label %x is out of range (max=%x).\n", + ntohl(ipv6_key->ipv6_label), (1 << 20) - 1); + return -EINVAL; + } + SW_FLOW_KEY_PUT(match, ipv6.label, ipv6_key->ipv6_label, is_mask); SW_FLOW_KEY_PUT(match, ip.proto, -- cgit v1.2.1 From 49dd18ba4615eaa72f15c9087dea1c2ab4744cf5 Mon Sep 17 00:00:00 2001 From: Panu Matilainen Date: Fri, 14 Nov 2014 13:14:32 +0200 Subject: ipv4: Fix incorrect error code when adding an unreachable route Trying to add an unreachable route incorrectly returns -ESRCH if custom FIB rules are present: [root@localhost ~]# ip route add 74.125.31.199 dev eth0 via 1.2.3.4 RTNETLINK answers: Network is unreachable [root@localhost ~]# ip rule add to 55.66.77.88 table 200 [root@localhost ~]# ip route add 74.125.31.199 dev eth0 via 1.2.3.4 RTNETLINK answers: No such process [root@localhost ~]# Commit 83886b6b636173b206f475929e58fac75c6f2446 ("[NET]: Change "not found" return value for rule lookup") changed fib_rules_lookup() to use -ESRCH as a "not found" code internally, but for user space it should be translated into -ENETUNREACH. Handle the translation centrally in ipv4-specific fib_lookup(), leaving the DECnet case alone. On a related note, commit b7a71b51ee37d919e4098cd961d59a883fd272d8 ("ipv4: removed redundant conditional") removed a similar translation from ip_route_input_slow() prematurely AIUI. 
Fixes: b7a71b51ee37 ("ipv4: removed redundant conditional") Signed-off-by: Panu Matilainen Signed-off-by: David S. Miller --- net/ipv4/fib_rules.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f2e15738534d..8f7bd56955b0 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -62,6 +62,10 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) else res->tclassid = 0; #endif + + if (err == -ESRCH) + err = -ENETUNREACH; + return err; } EXPORT_SYMBOL_GPL(__fib_lookup); -- cgit v1.2.1 From 52cff74eef5dd7bdab759300e7d1ca36eba18254 Mon Sep 17 00:00:00 2001 From: Anish Bhatt Date: Fri, 14 Nov 2014 16:38:31 -0800 Subject: dcbnl : Disable software interrupts before taking dcb_lock Solves possible lockup issues that can be seen from firmware DCB agents calling into the DCB app api. DCB firmware event queues can be tied in with NAPI so that dcb events are generated in softIRQ context. This can result in calls to dcb_*app() functions which try to take the dcb_lock. If the event triggers while we also have the dcb_lock because lldpad or some other agent happened to be issuing a get/set command we could see a cpu lockup. This code was not originally written with firmware agents in mind, hence grabbing dcb_lock from softIRQ context was not considered. Signed-off-by: Anish Bhatt Signed-off-by: David S. 
Miller --- net/dcb/dcbnl.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index ca11d283bbeb..93ea80196f0e 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1080,13 +1080,13 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) if (!app) return -EMSGSIZE; - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { err = nla_put(skb, DCB_ATTR_IEEE_APP, sizeof(itr->app), &itr->app); if (err) { - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); return -EMSGSIZE; } } @@ -1097,7 +1097,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) else dcbx = -EOPNOTSUPP; - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); /* get peer info if available */ @@ -1234,7 +1234,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) } /* local app */ - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); app = nla_nest_start(skb, DCB_ATTR_CEE_APP_TABLE); if (!app) goto dcb_unlock; @@ -1271,7 +1271,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) else dcbx = -EOPNOTSUPP; - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); /* features flags */ if (ops->getfeatcfg) { @@ -1326,7 +1326,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) return 0; dcb_unlock: - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); nla_put_failure: return err; } @@ -1762,10 +1762,10 @@ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app) struct dcb_app_type *itr; u8 prio = 0; - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); if ((itr = dcb_app_lookup(app, dev->ifindex, 0))) prio = itr->app.priority; - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); return prio; } @@ -1789,7 +1789,7 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new) if 
(dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); /* Search for existing match and replace */ if ((itr = dcb_app_lookup(new, dev->ifindex, 0))) { if (new->priority) @@ -1804,7 +1804,7 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new) if (new->priority) err = dcb_app_add(new, dev->ifindex); out: - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; @@ -1823,10 +1823,10 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) struct dcb_app_type *itr; u8 prio = 0; - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); if ((itr = dcb_app_lookup(app, dev->ifindex, 0))) prio |= 1 << itr->app.priority; - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); return prio; } @@ -1850,7 +1850,7 @@ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found */ if (dcb_app_lookup(new, dev->ifindex, new->priority)) { err = -EEXIST; @@ -1859,7 +1859,7 @@ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) err = dcb_app_add(new, dev->ifindex); out: - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; @@ -1882,7 +1882,7 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. 
*/ if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) { list_del(&itr->list); @@ -1890,7 +1890,7 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) err = 0; } - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; @@ -1902,12 +1902,12 @@ static void dcb_flushapp(void) struct dcb_app_type *app; struct dcb_app_type *tmp; - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); list_for_each_entry_safe(app, tmp, &dcb_app_list, list) { list_del(&app->list); kfree(app); } - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); } static int __init dcbnl_init(void) -- cgit v1.2.1 From feb91a02ccb09661507f170b2a444aec94f307f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 5 Nov 2014 20:27:38 +0100 Subject: ipv6: mld: fix add_grhead skb_over_panic for devs with large MTUs It has been reported that generating an MLD listener report on devices with large MTUs (e.g. 9000) and a high number of IPv6 addresses can trigger a skb_over_panic(): skbuff: skb_over_panic: text:ffffffff80612a5d len:3776 put:20 head:ffff88046d751000 data:ffff88046d751010 tail:0xed0 end:0xec0 dev:port1 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:100! invalid opcode: 0000 [#1] SMP Modules linked in: ixgbe(O) CPU: 3 PID: 0 Comm: swapper/3 Tainted: G O 3.14.23+ #4 [...] Call Trace: [] ? skb_put+0x3a/0x3b [] ? add_grhead+0x45/0x8e [] ? add_grec+0x394/0x3d4 [] ? mld_ifc_timer_expire+0x195/0x20d [] ? mld_dad_timer_expire+0x45/0x45 [] ? call_timer_fn.isra.29+0x12/0x68 [] ? run_timer_softirq+0x163/0x182 [] ? __do_softirq+0xe0/0x21d [] ? irq_exit+0x4e/0xd3 [] ? smp_apic_timer_interrupt+0x3b/0x46 [] ? apic_timer_interrupt+0x6a/0x70 mld_newpack() skb allocations are usually requested with dev->mtu in size, since commit 72e09ad107e7 ("ipv6: avoid high order allocations") we have changed the limit in order to be less likely to fail. 
However, in MLD/IGMP code, we have some rather ugly AVAILABLE(skb) macros, which determine if we may end up doing an skb_put() for adding another record. To avoid possible fragmentation, we check the skb's tailroom as skb->dev->mtu - skb->len, which is a wrong assumption as the actual max allocation size can be much smaller. The IGMP case doesn't have this issue as commit 57e1ab6eaddc ("igmp: refine skb allocations") stores the allocation size in the cb[]. Set a reserved_tailroom to make it fit into the MTU and use the skb_availroom() helper instead. This also allows us to get rid of igmp_skb_size(). Reported-by: Wei Liu Fixes: 72e09ad107e7 ("ipv6: avoid high order allocations") Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Hannes Frederic Sowa Cc: David L Stevens Acked-by: Eric Dumazet Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 11 +++++------ net/ipv6/mcast.c | 9 +++++---- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index fb70e3ecc3e4..bb15d0e03d4f 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -318,9 +318,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) return scount; } -#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb)) - -static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) +static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; struct rtable *rt; @@ -330,6 +328,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; + unsigned int size = mtu; while (1) { skb = alloc_skb(size + hlen + tlen, @@ -341,7 +340,6 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) return NULL; } skb->priority = TC_PRIO_CONTROL; - igmp_skb_size(skb) = size; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, @@ -354,6
+352,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) skb_dst_set(skb, &rt->dst); skb->dev = dev; + skb->reserved_tailroom = skb_end_offset(skb) - + min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); skb_reset_network_header(skb); @@ -423,8 +423,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, return skb; } -#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \ - skb_tailroom(skb)) : 0) +#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 9648de2b6745..ed2c4e400b46 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1550,7 +1550,7 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, hdr->daddr = *daddr; } -static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) +static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) { struct net_device *dev = idev->dev; struct net *net = dev_net(dev); @@ -1561,13 +1561,13 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) const struct in6_addr *saddr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; + unsigned int size = mtu + hlen + tlen; int err; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; /* we assume size > sizeof(ra) here */ - size += hlen + tlen; /* limit our allocations to order-0 page */ size = min_t(int, size, SKB_MAX_ORDER(0, 0)); skb = sock_alloc_send_skb(sk, size, 1, &err); @@ -1576,6 +1576,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) return NULL; skb->priority = TC_PRIO_CONTROL; + skb->reserved_tailroom = skb_end_offset(skb) - + min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { @@ -1690,8 +1692,7 @@ static struct sk_buff 
*add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, return skb; } -#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ - skb_tailroom(skb)) : 0) +#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted, int crsend) -- cgit v1.2.1 From 97840cb67ff5ac8add836684f011fd838518d698 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 14 Nov 2014 18:14:33 +0100 Subject: netfilter: nfnetlink: fix insufficient validation in nfnetlink_bind Make sure the netlink group exists, otherwise you can trigger an out-of-bounds array memory access from the netlink_bind() path. This splat can only be triggered by the superuser. [ 180.203600] UBSan: Undefined behaviour in ../net/netfilter/nfnetlink.c:467:28 [ 180.204249] index 9 is out of range for type 'int [9]' [ 180.204697] CPU: 0 PID: 1771 Comm: trinity-main Not tainted 3.18.0-rc4-mm1+ #122 [ 180.205365] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org +04/01/2014 [ 180.206498] 0000000000000018 0000000000000000 0000000000000009 ffff88007bdf7da8 [ 180.207220] ffffffff82b0ef5f 0000000000000092 ffffffff845ae2e0 ffff88007bdf7db8 [ 180.207887] ffffffff8199e489 ffff88007bdf7e18 ffffffff8199ea22 0000003900000000 [ 180.208639] Call Trace: [ 180.208857] dump_stack (lib/dump_stack.c:52) [ 180.209370] ubsan_epilogue (lib/ubsan.c:174) [ 180.209849] __ubsan_handle_out_of_bounds (lib/ubsan.c:400) [ 180.210512] nfnetlink_bind (net/netfilter/nfnetlink.c:467) [ 180.210986] netlink_bind (net/netlink/af_netlink.c:1483) [ 180.211495] SYSC_bind (net/socket.c:1541) Moreover, define the missing nf_tables and nf_acct multicast groups too.
Reported-by: Andrey Ryabinin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 6c5a915cfa75..13c2e17bbe27 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -47,6 +47,8 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_CONNTRACK_EXP_NEW] = NFNL_SUBSYS_CTNETLINK_EXP, [NFNLGRP_CONNTRACK_EXP_UPDATE] = NFNL_SUBSYS_CTNETLINK_EXP, [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP, + [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES, + [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT, }; void nfnl_lock(__u8 subsys_id) @@ -464,7 +466,12 @@ static void nfnetlink_rcv(struct sk_buff *skb) static int nfnetlink_bind(int group) { const struct nfnetlink_subsystem *ss; - int type = nfnl_group2type[group]; + int type; + + if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX) + return -EINVAL; + + type = nfnl_group2type[group]; rcu_read_lock(); ss = nfnetlink_get_subsys(type); @@ -514,6 +521,9 @@ static int __init nfnetlink_init(void) { int i; + for (i = NFNLGRP_NONE + 1; i <= NFNLGRP_MAX; i++) + BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE); + for (i=0; i Date: Mon, 17 Nov 2014 12:20:28 +0100 Subject: bridge: fix netfilter/NF_BR_LOCAL_OUT for own, locally generated queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ebtables on the OUTPUT chain (NF_BR_LOCAL_OUT) would not work as expected for both locally generated IGMP and MLD queries. The IP header specific filter options are off by 14 Bytes for netfilter (actual output on interfaces is fine). NF_HOOK() expects the skb->data to point to the IP header, not the ethernet one (while dev_queue_xmit() does not). Luckily there is an br_dev_queue_push_xmit() helper function already - let's just use that. 
Introduced by eb1d16414339a6e113d89e2cca2556005d7ce919 ("bridge: Add core IGMP snooping support") Ebtables example: $ ebtables -I OUTPUT -p IPv6 -o eth1 --logical-out br0 \ --log --log-level 6 --log-ip6 --log-prefix="~EBT: " -j DROP before (broken): ~EBT: IN= OUT=eth1 MAC source = 02:04:64:a4:39:c2 \ MAC dest = 33:33:00:00:00:01 proto = 0x86dd IPv6 \ SRC=64a4:39c2:86dd:6000:0000:0020:0001:fe80 IPv6 \ DST=0000:0000:0000:0004:64ff:fea4:39c2:ff02, \ IPv6 priority=0x3, Next Header=2 after (working): ~EBT: IN= OUT=eth1 MAC source = 02:04:64:a4:39:c2 \ MAC dest = 33:33:00:00:00:01 proto = 0x86dd IPv6 \ SRC=fe80:0000:0000:0000:0004:64ff:fea4:39c2 IPv6 \ DST=ff02:0000:0000:0000:0000:0000:0000:0001, \ IPv6 priority=0x0, Next Header=0 Signed-off-by: Linus Lüssing Acked-by: Herbert Xu Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_multicast.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 648d79ccf462..c465876c7861 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -813,10 +813,9 @@ static void __br_multicast_send_query(struct net_bridge *br, return; if (port) { - __skb_push(skb, sizeof(struct ethhdr)); skb->dev = port->dev; NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, - dev_queue_xmit); + br_dev_queue_push_xmit); } else { br_multicast_select_own_querier(br, ip, skb); netif_rx(skb); -- cgit v1.2.1 From 280ba51d60be6f4ca3347eaa60783314f38df72e Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 18 Nov 2014 22:35:31 +0100 Subject: mac80211: minstrel_ht: fix a crash in rate sorting The commit 5935839ad73583781b8bbe8d91412f6826e218a4 "mac80211: improve minstrel_ht rate sorting by throughput & probability" introduced a crash on rate sorting that occurs when the rate added to the sorting array is faster than all the previous rates. 
Due to an off-by-one error, it reads the rate index from tp_list[-1], which contains uninitialized stack garbage, and then uses the resulting index for accessing the group rate stats, leading to a crash if the garbage value is big enough. Cc: Thomas Huehn Reported-by: Jouni Malinen Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel_ht.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index df90ce2db00c..408fd8ab4eef 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -252,19 +252,16 @@ minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u8 index, cur_thr = mi->groups[cur_group].rates[cur_idx].cur_tp; cur_prob = mi->groups[cur_group].rates[cur_idx].probability; - tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; - tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; - tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability; - - while (j > 0 && (cur_thr > tmp_thr || - (cur_thr == tmp_thr && cur_prob > tmp_prob))) { - j--; + do { tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp; tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability; - } + if (cur_thr < tmp_thr || + (cur_thr == tmp_thr && cur_prob <= tmp_prob)) + break; + j--; + } while (j > 0); if (j < MAX_THR_RATES - 1) { memmove(&tp_list[j + 1], &tp_list[j], (sizeof(*tp_list) * -- cgit v1.2.1 From ffb1388a364d135810337182d6800a0c7ee44f48 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Wed, 19 Nov 2014 09:35:39 +0800 Subject: ipv6: delete protocol and unregister rtnetlink when cleanup pim6_protocol is added during initialization, but it is never deleted on cleanup. Similarly, unregister the RTNL_FAMILY_IP6MR rtnetlink handler.
Signed-off-by: Duan Jiong Reviewed-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 0171f08325c3..1a01d79b8698 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1439,6 +1439,10 @@ reg_pernet_fail: void ip6_mr_cleanup(void) { + rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE); +#ifdef CONFIG_IPV6_PIMSM_V2 + inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); +#endif unregister_netdevice_notifier(&ip6_mr_notifier); unregister_pernet_subsys(&ip6mr_net_ops); kmem_cache_destroy(mrt_cachep); -- cgit v1.2.1 From d3052bb5d306b29c1e7d9e5998c5ac4ca1ff0ca9 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 19 Nov 2014 13:54:49 -0800 Subject: openvswitch: Don't validate IPv6 label masks. When userspace doesn't provide a mask, OVS datapath generates a fully unwildcarded mask for the flow by copying the flow and setting all bits in all fields. For IPv6 label, this creates a mask that matches on the upper 12 bits, causing the following error: openvswitch: netlink: Invalid IPv6 flow label value (value=ffffffff, max=fffff) This patch ignores the label validation check for masks, avoiding this error. Signed-off-by: Joe Stringer Acked-by: Pravin B Shelar Signed-off-by: David S. 
Miller --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index fa4ec2e4a78b..089b195c064a 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -690,7 +690,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, return -EINVAL; } - if (ipv6_key->ipv6_label & htonl(0xFFF00000)) { + if (!is_mask && ipv6_key->ipv6_label & htonl(0xFFF00000)) { OVS_NLERR("IPv6 flow label %x is out of range (max=%x).\n", ntohl(ipv6_key->ipv6_label), (1 << 20) - 1); return -EINVAL; -- cgit v1.2.1 From 01462405f0c093b2f8dfddafcadcda6c9e4c5cdf Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 19 Nov 2014 23:05:49 +0100 Subject: ipx: fix locking regression in ipx_sendmsg and ipx_recvmsg This fixes an old regression introduced by commit b0d0d915 (ipx: remove the BKL). When a recvmsg syscall blocks waiting for new data, no data can be sent on the same socket with sendmsg because ipx_recvmsg() sleeps with the socket locked. This breaks mars-nwe (NetWare emulator): - the ncpserv process reads the request using recvmsg - ncpserv forks and spawns nwconn - ncpserv calls a (blocking) recvmsg and waits for new requests - nwconn deadlocks in sendmsg on the same socket Commit b0d0d915 has simply replaced BKL locking with lock_sock/release_sock. Unlike now, BKL got unlocked while sleeping, so a blocking recvmsg did not block a concurrent sendmsg. Only keep the socket locked while actually working with the socket data and release it prior to calling skb_recv_datagram(). Signed-off-by: Jiri Bohac Reviewed-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- net/ipx/af_ipx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 91729b807c7d..1b095ca37aa4 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1764,6 +1764,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, struct ipxhdr *ipx = NULL; struct sk_buff *skb; int copied, rc; + bool locked = true; lock_sock(sk); /* put the autobinding in */ @@ -1790,6 +1791,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, if (sock_flag(sk, SOCK_ZAPPED)) goto out; + release_sock(sk); + locked = false; skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &rc); if (!skb) { @@ -1826,7 +1829,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, out_free: skb_free_datagram(sk, skb); out: - release_sock(sk); + if (locked) + release_sock(sk); return rc; } -- cgit v1.2.1 From e7820e39b7d19b9fe1928fc19de9361b44150ca6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2014 11:47:16 -0800 Subject: net: Revert "net: avoid one atomic operation in skb_clone()" Not sure what I was thinking, but doing anything after releasing a refcount is suicidal or/and embarrassing. By the time we set skb->fclone to SKB_FCLONE_FREE, another cpu could have released last reference and freed whole skb. We potentially corrupt memory or trap if CONFIG_DEBUG_PAGEALLOC is set. Reported-by: Chris Mason Fixes: ce1a4ea3f1258 ("net: avoid one atomic operation in skb_clone()") Signed-off-by: Eric Dumazet Cc: Sabrina Dubroca Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c16615bfb61e..32e31c299631 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -552,20 +552,13 @@ static void kfree_skbmem(struct sk_buff *skb) case SKB_FCLONE_CLONE: fclones = container_of(skb, struct sk_buff_fclones, skb2); - /* Warning : We must perform the atomic_dec_and_test() before - * setting skb->fclone back to SKB_FCLONE_FREE, otherwise - * skb_clone() could set clone_ref to 2 before our decrement. - * Anyway, if we are going to free the structure, no need to - * rewrite skb->fclone. + /* The clone portion is available for + * fast-cloning again. */ - if (atomic_dec_and_test(&fclones->fclone_ref)) { + skb->fclone = SKB_FCLONE_FREE; + + if (atomic_dec_and_test(&fclones->fclone_ref)) kmem_cache_free(skbuff_fclone_cache, fclones); - } else { - /* The clone portion is available for - * fast-cloning again. - */ - skb->fclone = SKB_FCLONE_FREE; - } break; } } @@ -887,11 +880,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (skb->fclone == SKB_FCLONE_ORIG && n->fclone == SKB_FCLONE_FREE) { n->fclone = SKB_FCLONE_CLONE; - /* As our fastclone was free, clone_ref must be 1 at this point. - * We could use atomic_inc() here, but it is faster - * to set the final value. 
- */ - atomic_set(&fclones->fclone_ref, 2); + atomic_inc(&fclones->fclone_ref); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; -- cgit v1.2.1 From 0c228e833c88e3aa029250f5db77d5968c5ce5b5 Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Thu, 20 Nov 2014 15:09:53 -0800 Subject: tcp: Restore RFC5961-compliant behavior for SYN packets Commit c3ae62af8e755 ("tcp: should drop incoming frames without ACK flag set") was created to mitigate a security vulnerability in which a local attacker is able to inject data into locally-opened sockets by using TCP protocol statistics in procfs to quickly find the correct sequence number. This broke the RFC5961 requirement to send a challenge ACK in response to spurious RST packets, which was subsequently fixed by commit 7b514a886ba50 ("tcp: accept RST without ACK flag"). Unfortunately, the RFC5961 requirement that spurious SYN packets be handled in a similar manner remains broken. RFC5961 section 4 states that: ... the handling of the SYN in the synchronized state SHOULD be performed as follows: 1) If the SYN bit is set, irrespective of the sequence number, TCP MUST send an ACK (also referred to as challenge ACK) to the remote peer: After sending the acknowledgment, TCP MUST drop the unacceptable segment and stop processing further. By sending an ACK, the remote peer is challenged to confirm the loss of the previous connection and the request to start a new connection. A legitimate peer, after restart, would not have a TCB in the synchronized state. Thus, when the ACK arrives, the peer should send a RST segment back with the sequence number derived from the ACK field that caused the RST. This RST will confirm that the remote peer has indeed closed the previous connection. Upon receipt of a valid RST, the local TCP endpoint MUST terminate its connection. The local TCP endpoint should then rely on SYN retransmission from the remote end to re-establish the connection. 
This patch lets SYN packets through the discard added in c3ae62af8e755, so that spurious SYN packets are properly dealt with as per the RFC. The challenge ACK is sent unconditionally and is rate-limited, so the original vulnerability is not reintroduced by this patch. Signed-off-by: Calvin Owens Acked-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 88fa2d160685..d107ee246a1d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5231,7 +5231,7 @@ slow_path: if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) goto csum_error; - if (!th->ack && !th->rst) + if (!th->ack && !th->rst && !th->syn) goto discard; /* @@ -5650,7 +5650,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } - if (!th->ack && !th->rst) + if (!th->ack && !th->rst && !th->syn) goto discard; if (!tcp_validate_incoming(sk, skb, th, 0)) -- cgit v1.2.1