From 004fa5ed08cc5d3188db42c05d6b80feaae004c2 Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Wed, 10 Dec 2014 14:19:53 +0200 Subject: Bluetooth: 6lowpan: Do not free skb when packet is dropped If we need to drop the message because of some error in the compression etc, then do not free the skb as that is done automatically in other part of networking stack. Signed-off-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 76617be1e797..c989253737f0 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -390,7 +390,6 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, drop: dev->stats.rx_dropped++; - kfree_skb(skb); return NET_RX_DROP; } -- cgit v1.2.1 From 51bda2bca53b265715ca1852528f38dc67429d9a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Dec 2014 06:20:57 +0000 Subject: Bluetooth: hidp_connection_add() unsafe use of l2cap_pi() it's OK after we'd verified the sockets, but not before that. Signed-off-by: Al Viro Signed-off-by: Marcel Holtmann --- net/bluetooth/hidp/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index cc25d0b74b36..07348e142f16 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1314,13 +1314,14 @@ int hidp_connection_add(struct hidp_connadd_req *req, { struct hidp_session *session; struct l2cap_conn *conn; - struct l2cap_chan *chan = l2cap_pi(ctrl_sock->sk)->chan; + struct l2cap_chan *chan; int ret; ret = hidp_verify_sockets(ctrl_sock, intr_sock); if (ret) return ret; + chan = l2cap_pi(ctrl_sock->sk)->chan; conn = NULL; l2cap_chan_lock(chan); if (chan->conn) -- cgit v1.2.1 From 96c26653ce65bf84f3212f8b00d4316c1efcbf4c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Dec 2014 06:20:58 +0000 Subject: Bluetooth: cmtp: cmtp_add_connection() should verify that it's dealing with l2cap socket ... rather than relying on ciptool(8) never passing it anything else. Give it e.g. an AF_UNIX connected socket (from socketpair(2)) and it'll oops, trying to evaluate &l2cap_pi(sock->sk)->chan->dst... Signed-off-by: Al Viro Signed-off-by: Marcel Holtmann --- net/bluetooth/cmtp/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 67fe5e84e68f..278a194e6af4 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -334,6 +334,9 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) BT_DBG(""); + if (!l2cap_is_socket(sock)) + return -EBADFD; + session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL); if (!session) return -ENOMEM; -- cgit v1.2.1 From 71bb99a02b32b4cc4265118e85f6035ca72923f0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Dec 2014 06:20:59 +0000 Subject: Bluetooth: bnep: bnep_add_connection() should verify that it's dealing with l2cap socket same story as cmtp Signed-off-by: Al Viro Signed-off-by: Marcel Holtmann --- net/bluetooth/bnep/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 85bcc21e84d2..ce82722d049b 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -533,6 +533,9 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) BT_DBG(""); + if (!l2cap_is_socket(sock)) + return -EBADFD; + baswap((void *) dst, &l2cap_pi(sock->sk)->chan->dst); baswap((void *) src, &l2cap_pi(sock->sk)->chan->src); -- cgit v1.2.1 From da413eec729dae5dcb150e2eb34c5e7e5e4e1b49 Mon Sep 17 00:00:00 2001 From: Dan Collins Date: Fri, 19 Dec 2014 16:49:25 +1300 Subject: packet: Fixed TPACKET V3 to signal poll when block is closed rather than every packet Make TPACKET_V3 signal poll when block is closed rather than for every packet. Side effect is that poll will be signaled when block retire timer expires which didn't previously happen. Issue was visible when sending packets at a very low frequency such that all blocks are retired before packets are received by TPACKET_V3. This caused avoidable packet loss. The fix ensures that the signal is sent when blocks are closed which covers the normal path where the block is filled as well as the path where the timer expires. The case where a block is filled without moving to the next block (ie. all blocks are full) will still cause poll to be signaled. Signed-off-by: Dan Collins Signed-off-by: David S. Miller --- net/packet/af_packet.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e52a44785681..6880f34a529a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -785,6 +785,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket3_hdr *last_pkt; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; + struct sock *sk = &po->sk; if (po->stats.stats3.tp_drops) status |= TP_STATUS_LOSING; @@ -809,6 +810,8 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, /* Flush the block */ prb_flush_block(pkc1, pbd1, status); + sk->sk_data_ready(sk); + pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); } @@ -2052,12 +2055,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, smp_wmb(); #endif - if (po->tp_version <= TPACKET_V2) + if (po->tp_version <= TPACKET_V2) { __packet_set_status(po, h.raw, status); - else + sk->sk_data_ready(sk); + } else { prb_clear_blk_fill_status(&po->rx_ring); - - sk->sk_data_ready(sk); + } drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { -- cgit v1.2.1 From 2dc49d1680b534877fd20cce52557ea542bb06b6 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 22 Dec 2014 18:22:48 +0100 Subject: tcp6: don't move IP6CB before xfrm6_policy_check() When xfrm6_policy_check() is used, _decode_session6() is called after some intermediate functions. This function uses IP6CB(), thus TCP_SKB_CB() must be prepared after the call of xfrm6_policy_check(). Before this patch, scenarii with IPv6 + TCP + IPsec Transport are broken. Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") Reported-by: Huaibin Wang Suggested-by: Eric Dumazet Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5ff87805258e..9c0b54e87b47 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1387,6 +1387,28 @@ ipv6_pktoptions: return 0; } +static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, + const struct tcphdr *th) +{ + /* This is tricky: we move IP6CB at its correct location into + * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because + * _decode_session6() uses IP6CB(). + * barrier() makes sure compiler won't play aliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), + sizeof(struct inet6_skb_parm)); + barrier(); + + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff*4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_tw_isn = 0; + TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); + TCP_SKB_CB(skb)->sacked = 0; +} + static int tcp_v6_rcv(struct sk_buff *skb) { const struct tcphdr *th; @@ -1418,24 +1440,9 @@ static int tcp_v6_rcv(struct sk_buff *skb) th = tcp_hdr(skb); hdr = ipv6_hdr(skb); - /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() - * barrier() makes sure compiler wont play fool^Waliasing games. - */ - memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), - sizeof(struct inet6_skb_parm)); - barrier(); - - TCP_SKB_CB(skb)->seq = ntohl(th->seq); - TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + - skb->len - th->doff*4); - TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; - TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); - TCP_SKB_CB(skb)->sacked = 0; sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest, - tcp_v6_iif(skb)); + inet6_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1451,6 +1458,8 @@ process: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; + tcp_v6_fill_cb(skb, hdr, th); + #ifdef CONFIG_TCP_MD5SIG if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard_and_relse; @@ -1482,6 +1491,8 @@ no_tcp_socket: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; + tcp_v6_fill_cb(skb, hdr, th); + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { csum_error: TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); @@ -1505,6 +1516,8 @@ do_time_wait: goto discard_it; } + tcp_v6_fill_cb(skb, hdr, th); + if (skb->len < (th->doff<<2)) { inet_twsk_put(inet_twsk(sk)); goto bad_packet; -- cgit v1.2.1 From af6dabc9c70ae3f307685b1f32f52d60b1bf0527 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 19 Dec 2014 11:09:13 +0800 Subject: net: drop the packet when fails to do software segmentation or header check Commit cecda693a969816bac5e470e1d9c9c0ef5567bca ("net: keep original skb which only needs header checking during software GSO") keeps the original skb for packets that only needs header check, but it doesn't drop the packet if software segmentation or header check were failed. Fixes cecda693a9 ("net: keep original skb which only needs header checking during software GSO") Cc: Eric Dumazet Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index f411c28d0a66..a989f8502412 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2673,7 +2673,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device segs = skb_gso_segment(skb, features); if (IS_ERR(segs)) { - segs = NULL; + goto out_kfree_skb; } else if (segs) { consume_skb(skb); skb = segs; -- cgit v1.2.1 From 5b6698b0e4a37053de35cc24ee695b98a7eb712b Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 20 Dec 2014 13:48:55 +0100 Subject: batman-adv: Calculate extra tail size based on queued fragments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fragmentation code was replaced in 610bfc6bc99bc83680d190ebc69359a05fc7f605 ("batman-adv: Receive fragmented packets and merge"). The new code provided a mostly unused parameter skb for the merging function. It is used inside the function to calculate the additionally needed skb tailroom. But instead of increasing its own tailroom, it is only increasing the tailroom of the first queued skb. This is not correct in some situations because the first queued entry can be a different one than the parameter. An observed problem was: 1. packet with size 104, total_size 1464, fragno 1 was received - packet is queued 2. packet with size 1400, total_size 1464, fragno 0 was received - packet is queued at the end of the list 3. enough data was received and can be given to the merge function (1464 == (1400 - 20) + (104 - 20)) - merge functions gets 1400 byte large packet as skb argument 4. merge function gets first entry in queue (104 byte) - stored as skb_out 5. merge function calculates the required extra tail as total_size - skb->len - pskb_expand_head tail of skb_out with 64 bytes 6. merge function tries to squeeze the extra 1380 bytes from the second queued skb (1400 byte aka skb parameter) in the 64 extra tail bytes of skb_out Instead calculate the extra required tail bytes for skb_out also using skb_out instead of using the parameter skb. The skb parameter is only used to get the total_size from the last received packet. This is also the total_size used to decide that all fragments were received. Reported-by: Philipp Psurek Signed-off-by: Sven Eckelmann Acked-by: Martin Hundebøll Signed-off-by: David S. Miller --- net/batman-adv/fragmentation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index fc1835c6bb40..8af3461d18d2 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -251,7 +251,7 @@ batadv_frag_merge_packets(struct hlist_head *chain, struct sk_buff *skb) kfree(entry); /* Make room for the rest of the fragments. */ - if (pskb_expand_head(skb_out, 0, size - skb->len, GFP_ATOMIC) < 0) { + if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) { kfree_skb(skb_out); skb_out = NULL; goto free; -- cgit v1.2.1 From 0402e444cd199389b7fe47be68a67b817e09e097 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 20 Dec 2014 13:48:56 +0100 Subject: batman-adv: Unify fragment size calculation The fragmentation code was replaced in 610bfc6bc99bc83680d190ebc69359a05fc7f605 ("batman-adv: Receive fragmented packets and merge") by an implementation which can handle up to 16 fragments of a packet. The packet is prepared for the split in fragments by the function batadv_frag_send_packet and the actual split is done by batadv_frag_create. Both functions calculate the size of a fragment themself. But their calculation differs because batadv_frag_send_packet also subtracts ETH_HLEN. Therefore, the check in batadv_frag_send_packet "can a full fragment can be created?" may return true even when batadv_frag_create cannot create a full fragment. The function batadv_frag_create doesn't check the size of the skb before splitting it and therefore might try to create a larger fragment than the remaining buffer. This creates an integer underflow and an invalid len is given to skb_split. Signed-off-by: Sven Eckelmann Signed-off-by: David S. Miller --- net/batman-adv/fragmentation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 8af3461d18d2..00f9e144cc97 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -434,7 +434,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb, * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE */ mtu = min_t(unsigned, mtu, BATADV_FRAG_MAX_FRAG_SIZE); - max_fragment_size = (mtu - header_size - ETH_HLEN); + max_fragment_size = mtu - header_size; max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS; /* Don't even try to fragment, if we need more than 16 fragments */ -- cgit v1.2.1 From 0d1644919578db525b9a7b6c8197ce02adbfce26 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Sat, 20 Dec 2014 13:48:57 +0100 Subject: batman-adv: avoid NULL dereferences and fix if check Gateway having bandwidth_down equal to zero are not accepted at all and so never added to the Gateway list. For this reason checking the bandwidth_down member in batadv_gw_out_of_range() is useless. This is probably a copy/paste error and this check was supposed to be "!gw_node" only. Moreover, the way the check is written now may also lead to a NULL dereference. Fix this by rewriting the if-condition properly. Introduced by 414254e342a0d58144de40c3da777521ebaeeb07 ("batman-adv: tvlv - gateway download/upload bandwidth container") Signed-off-by: Antonio Quartulli Reported-by: David Binderman Signed-off-by: Marek Lindner Signed-off-by: David S. Miller --- net/batman-adv/gateway_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 90cff585b37d..e0bcf9e84273 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -810,7 +810,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, goto out; gw_node = batadv_gw_node_get(bat_priv, orig_dst_node); - if (!gw_node->bandwidth_down == 0) + if (!gw_node) goto out; switch (atomic_read(&bat_priv->gw_mode)) { -- cgit v1.2.1 From 726ce70e9e4050409243f3a1d735dc86bc6e6e57 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 21 Dec 2014 07:16:21 +1100 Subject: net: Move napi polling code out of net_rx_action This patch creates a new function napi_poll and moves the napi polling code from net_rx_action into it. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 98 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 54 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index a989f8502412..493ae8ee569f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4557,6 +4557,59 @@ void netif_napi_del(struct napi_struct *napi) } EXPORT_SYMBOL(netif_napi_del); +static int napi_poll(struct napi_struct *n, struct list_head *repoll) +{ + void *have; + int work, weight; + + list_del_init(&n->poll_list); + + have = netpoll_poll_lock(n); + + weight = n->weight; + + /* This NAPI_STATE_SCHED test is for avoiding a race + * with netpoll's poll_napi(). Only the entity which + * obtains the lock and sees NAPI_STATE_SCHED set will + * actually make the ->poll() call. Therefore we avoid + * accidentally calling ->poll() when NAPI is not scheduled. + */ + work = 0; + if (test_bit(NAPI_STATE_SCHED, &n->state)) { + work = n->poll(n, weight); + trace_napi_poll(n); + } + + WARN_ON_ONCE(work > weight); + + if (likely(work < weight)) + goto out_unlock; + + /* Drivers must not modify the NAPI state if they + * consume the entire weight. In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(napi_disable_pending(n))) { + napi_complete(n); + goto out_unlock; + } + + if (n->gro_list) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + napi_gro_flush(n, HZ >= 1000); + } + + list_add_tail(&n->poll_list, repoll); + +out_unlock: + netpoll_poll_unlock(have); + + return work; +} + static void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -4564,7 +4617,6 @@ static void net_rx_action(struct softirq_action *h) int budget = netdev_budget; LIST_HEAD(list); LIST_HEAD(repoll); - void *have; local_irq_disable(); list_splice_init(&sd->poll_list, &list); @@ -4572,7 +4624,6 @@ static void net_rx_action(struct softirq_action *h) while (!list_empty(&list)) { struct napi_struct *n; - int work, weight; /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow @@ -4583,48 +4634,7 @@ static void net_rx_action(struct softirq_action *h) n = list_first_entry(&list, struct napi_struct, poll_list); - list_del_init(&n->poll_list); - - have = netpoll_poll_lock(n); - - weight = n->weight; - - /* This NAPI_STATE_SCHED test is for avoiding a race - * with netpoll's poll_napi(). Only the entity which - * obtains the lock and sees NAPI_STATE_SCHED set will - * actually make the ->poll() call. Therefore we avoid - * accidentally calling ->poll() when NAPI is not scheduled. - */ - work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) { - work = n->poll(n, weight); - trace_napi_poll(n); - } - - WARN_ON_ONCE(work > weight); - - budget -= work; - - /* Drivers must not modify the NAPI state if they - * consume the entire weight. In such cases this code - * still "owns" the NAPI instance and therefore can - * move the instance around on the list at-will. - */ - if (unlikely(work == weight)) { - if (unlikely(napi_disable_pending(n))) { - napi_complete(n); - } else { - if (n->gro_list) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(n, HZ >= 1000); - } - list_add_tail(&n->poll_list, &repoll); - } - } - - netpoll_poll_unlock(have); + budget -= napi_poll(n, &repoll); } if (!sd_has_rps_ipi_waiting(sd) && -- cgit v1.2.1 From 001ce546bb537bb5b7821f05633556a0c9787e32 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 21 Dec 2014 07:16:22 +1100 Subject: net: Detect drivers that reschedule NAPI and exhaust budget The commit d75b1ade567ffab085e8adbbdacf0092d10cd09c (net: less interrupt masking in NAPI) required drivers to leave poll_list empty if the entire budget is consumed. We have already had two broken drivers so let's add a check for this. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 493ae8ee569f..c0cf1293df06 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4602,6 +4602,15 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) napi_gro_flush(n, HZ >= 1000); } + /* Some drivers may have called napi_schedule + * prior to exhausting their budget. + */ + if (unlikely(!list_empty(&n->poll_list))) { + pr_warn_once("%s: Budget exhausted after napi rescheduled\n", + n->dev ? n->dev->name : "backlog"); + goto out_unlock; + } + list_add_tail(&n->poll_list, repoll); out_unlock: -- cgit v1.2.1 From 6bd373ebbac4b13ecd39ddc37a0dc5ad4c5e4585 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 21 Dec 2014 07:16:24 +1100 Subject: net: Always poll at least one device in net_rx_action We should only perform the softnet_break check after we have polled at least one device in net_rx_action. Otherwise a zero or negative setting of netdev_budget can lock up the whole system. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c0cf1293df06..b85eba9dfd88 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4634,16 +4634,15 @@ static void net_rx_action(struct softirq_action *h) while (!list_empty(&list)) { struct napi_struct *n; + n = list_first_entry(&list, struct napi_struct, poll_list); + budget -= napi_poll(n, &repoll); + /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) goto softnet_break; - - - n = list_first_entry(&list, struct napi_struct, poll_list); - budget -= napi_poll(n, &repoll); } if (!sd_has_rps_ipi_waiting(sd) && -- cgit v1.2.1 From ceb8d5bf17d366534f32d2f60f41d905a5bc864b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 21 Dec 2014 07:16:25 +1100 Subject: net: Rearrange loop in net_rx_action This patch rearranges the loop in net_rx_action to reduce the amount of jumping back and forth when reading the code. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index b85eba9dfd88..c97ae6fec040 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4631,9 +4631,15 @@ static void net_rx_action(struct softirq_action *h) list_splice_init(&sd->poll_list, &list); local_irq_enable(); - while (!list_empty(&list)) { + for (;;) { struct napi_struct *n; + if (list_empty(&list)) { + if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) + return; + break; + } + n = list_first_entry(&list, struct napi_struct, poll_list); budget -= napi_poll(n, &repoll); @@ -4641,15 +4647,13 @@ static void net_rx_action(struct softirq_action *h) * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ - if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) - goto softnet_break; + if (unlikely(budget <= 0 || + time_after_eq(jiffies, time_limit))) { + sd->time_squeeze++; + break; + } } - if (!sd_has_rps_ipi_waiting(sd) && - list_empty(&list) && - list_empty(&repoll)) - return; -out: local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); @@ -4659,12 +4663,6 @@ out: __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); - - return; - -softnet_break: - sd->time_squeeze++; - goto out; } struct netdev_adjacent { -- cgit v1.2.1 From d0edc7bf397a5e0f312bf8a1e87cfee0019dc07b Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 23 Dec 2014 16:20:11 -0800 Subject: mpls: Fix config check for mpls. Fixes MPLS GSO for case when mpls is compiled as kernel module. Fixes: 0d89d2035f ("MPLS: Add limited GSO support"). Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c97ae6fec040..67b6210a589a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2522,7 +2522,7 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. */ -#ifdef CONFIG_NET_MPLS_GSO +#if IS_ENABLED(CONFIG_NET_MPLS_GSO) static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) -- cgit v1.2.1 From 4cc1beca3096a6425543cb83a61665b9c8040f98 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 23 Dec 2014 16:20:16 -0800 Subject: mpls: Fix allowed protocols for mpls gso MPLS and Tunnel GSO does not work together. Reject packet which request such GSO. Fixes: 0d89d2035f ("MPLS: Add limited GSO support"). Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/mpls/mpls_gso.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index ca27837974fe..349295d21946 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -31,10 +31,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, SKB_GSO_TCPV6 | SKB_GSO_UDP | SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_GRE | - SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP))) + SKB_GSO_TCP_ECN))) goto out; /* Setup inner SKB. */ -- cgit v1.2.1 From ec449f40bb3e19c77f62ddabf7c1fe3ccefece6f Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 23 Dec 2014 16:20:20 -0800 Subject: openvswitch: Fix MPLS action validation. Linux stack does not implement GSO for packet with multiple encapsulations. Therefore there was check in MPLS action validation to detect such case, But this check introduced bug which deleted one or more actions from actions list. Following patch removes this check to fix the validation. Fixes: 25cd9ba0abc ("openvswitch: Add basic MPLS support to kernel"). Signed-off-by: Pravin B Shelar Reported-by: Srinivas Neginhal Acked-by: Jarno Rajahalme Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 9645a21d9eaa..d1eecf707613 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1753,7 +1753,6 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, __be16 eth_type, __be16 vlan_tci, bool log) { const struct nlattr *a; - bool out_tnl_port = false; int rem, err; if (depth >= SAMPLE_ACTION_DEPTH) @@ -1796,8 +1795,6 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, case OVS_ACTION_ATTR_OUTPUT: if (nla_get_u32(a) >= DP_MAX_PORTS) return -EINVAL; - out_tnl_port = false; - break; case OVS_ACTION_ATTR_HASH: { @@ -1832,12 +1829,6 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, case OVS_ACTION_ATTR_PUSH_MPLS: { const struct ovs_action_push_mpls *mpls = nla_data(a); - /* Networking stack do not allow simultaneous Tunnel - * and MPLS GSO. - */ - if (out_tnl_port) - return -EINVAL; - if (!eth_p_mpls(mpls->mpls_ethertype)) return -EINVAL; /* Prohibit push MPLS other than to a white list @@ -1873,11 +1864,9 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, - &out_tnl_port, eth_type, log); + &skip_copy, eth_type, log); if (err) return err; - - skip_copy = out_tnl_port; break; case OVS_ACTION_ATTR_SAMPLE: -- cgit v1.2.1 From cbe7e76d94f59e89302bd514e4b685e03d1ebbe4 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 23 Dec 2014 16:20:28 -0800 Subject: openvswitch: Fix GSO with multiple MPLS label. MPLS GSO needs to know inner most protocol to process GSO packets. Fixes: 25cd9ba0abc ("openvswitch: Add basic MPLS support to kernel"). Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 764fdc39c63b..770064c83711 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -147,7 +147,8 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, hdr = eth_hdr(skb); hdr->h_proto = mpls->mpls_ethertype; - skb_set_inner_protocol(skb, skb->protocol); + if (!skb->inner_protocol) + skb_set_inner_protocol(skb, skb->protocol); skb->protocol = mpls->mpls_ethertype; invalidate_flow_key(key); -- cgit v1.2.1 From 997e068ebc17d8d57e735578df44b6341cd5f2f3 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 23 Dec 2014 16:20:32 -0800 Subject: openvswitch: Fix vport_send double free Today vport-send has complex error handling because it involves freeing skb and updating stats depending on return value from vport send implementation. This can be simplified by delegating responsibility of freeing skb to the vport implementation for all cases. So that vport-send needs just update stats. Fixes: 91b7514cdf ("openvswitch: Unify vport error stats handling") Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/ipv4/geneve.c | 6 +++++- net/openvswitch/vport-geneve.c | 3 +++ net/openvswitch/vport-gre.c | 18 +++++++++++------- net/openvswitch/vport-vxlan.c | 2 ++ net/openvswitch/vport.c | 5 ++--- 5 files changed, 23 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 95e47c97585e..394a200f93c1 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -122,14 +122,18 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, int err; skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx); + if (IS_ERR(skb)) + return PTR_ERR(skb); min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) + if (unlikely(err)) { + kfree_skb(skb); return err; + } skb = vlan_hwaccel_push_inside(skb); if (unlikely(!skb)) diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 347fa2325b22..484864dd0e68 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -219,7 +219,10 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) false); if (err < 0) ip_rt_put(rt); + return err; + error: + kfree_skb(skb); return err; } diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 6b69df545b1d..28f54e9a6b80 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -73,7 +73,7 @@ static struct sk_buff *__build_header(struct sk_buff *skb, skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); if (IS_ERR(skb)) - return NULL; + return skb; tpi.flags = filter_tnl_flags(tun_key->tun_flags); tpi.proto = htons(ETH_P_TEB); @@ -144,7 +144,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; - goto error; + goto err_free_skb; } tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; @@ -157,8 +157,10 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) fl.flowi4_proto = IPPROTO_GRE; rt = ip_route_output_key(net, &fl); - if (IS_ERR(rt)) - return PTR_ERR(rt); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto err_free_skb; + } tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags); @@ -183,8 +185,9 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) /* Push Tunnel header. */ skb = __build_header(skb, tunnel_hlen); - if (unlikely(!skb)) { - err = 0; + if (IS_ERR(skb)) { + err = PTR_ERR(rt); + skb = NULL; goto err_free_rt; } @@ -198,7 +201,8 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false); err_free_rt: ip_rt_put(rt); -error: +err_free_skb: + kfree_skb(skb); return err; } diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 38f95a52241b..d7c46b301024 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -187,7 +187,9 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) false); if (err < 0) ip_rt_put(rt); + return err; error: + kfree_skb(skb); return err; } diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 9584526c0778..53f3ebbfceab 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -519,10 +519,9 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb) u64_stats_update_end(&stats->syncp); } else if (sent < 0) { ovs_vport_record_error(vport, VPORT_E_TX_ERROR); - kfree_skb(skb); - } else + } else { ovs_vport_record_error(vport, VPORT_E_TX_DROPPED); - + } return sent; } -- cgit v1.2.1 From 796f2da81bead71ffc91ef70912cd8d1827bf756 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Mon, 22 Dec 2014 19:04:14 +0900 Subject: net: Fix stacked vlan offload features computation When vlan tags are stacked, it is very likely that the outer tag is stored in skb->vlan_tci and skb->protocol shows the inner tag's vlan_proto. Currently netif_skb_features() first looks at skb->protocol even if there is the outer tag in vlan_tci, thus it incorrectly retrieves the protocol encapsulated by the inner vlan instead of the inner vlan protocol. This allows GSO packets to be passed to HW and they end up being corrupted. Fixes: 58e998c6d239 ("offloading: Force software GSO for multiple vlan tags.") Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- net/core/dev.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 67b6210a589a..bd44e28c735e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2570,11 +2570,14 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; - if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { - struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - protocol = veh->h_vlan_encapsulated_proto; - } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, features); + if (!vlan_tx_tag_present(skb)) { + if (unlikely(protocol == htons(ETH_P_8021Q) || + protocol == htons(ETH_P_8021AD))) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; + } else { + return harmonize_features(skb, features); + } } features = netdev_intersect_features(features, -- cgit v1.2.1 From b8fb4e0648a2ab3734140342002f68fb0c7d1602 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 23 Dec 2014 01:13:18 +0100 Subject: net: Reset secmark when scrubbing packet skb_scrub_packet() is called when a packet switches between a context such as between underlay and overlay, between namespaces, or between L3 subnets. While we already scrub the packet mark, connection tracking entry, and cached destination, the security mark/context is left intact. It seems wrong to inherit the security context of a packet when going from overlay to underlay or across forwarding paths. Signed-off-by: Thomas Graf Acked-by: Flavio Leitner Signed-off-by: David S. Miller --- net/core/skbuff.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ae13ef6b3ea7..395c15b82087 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4148,6 +4148,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb->mark = 0; + skb_init_secmark(skb); secpath_reset(skb); nf_reset(skb); nf_reset_trace(skb); -- cgit v1.2.1 From 8bfe8442ff20fdc2d965c197103d935a99bd3296 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 23 Dec 2014 23:10:48 +0100 Subject: Bluetooth: Fix controller configuration with HCI_QUIRK_INVALID_BDADDR When controllers set the HCI_QUIRK_INVALID_BDADDR flag, it is required by userspace to program a valid public Bluetooth device address into the controller before it can be used. After successful address configuration, the internal state changes and the controller runs the complete initialization procedure. However one small difference is that this is no longer the HCI_SETUP stage. The HCI_SETUP stage is only valid during initial controller setup. In this case the stack runs the initialization as part of the HCI_CONFIG stage. The controller version information, default name and supported commands are only stored during HCI_SETUP. While these information are static, they are not read initially when HCI_QUIRK_INVALID_BDADDR is set. So when running in HCI_CONFIG state, these information need to be updated as well. This especially impacts Bluetooth 4.1 and later controllers using extended feature pages and second event mask page. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg Cc: stable@vger.kernel.org # 3.17+ --- net/bluetooth/hci_event.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 39a5c8a01726..c03d4b09caa3 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -242,7 +242,8 @@ static void hci_cc_read_local_name(struct hci_dev *hdev, struct sk_buff *skb) if (rp->status) return; - if (test_bit(HCI_SETUP, &hdev->dev_flags)) + if (test_bit(HCI_SETUP, &hdev->dev_flags) || + test_bit(HCI_CONFIG, &hdev->dev_flags)) memcpy(hdev->dev_name, rp->name, HCI_MAX_NAME_LENGTH); } @@ -509,7 +510,8 @@ static void hci_cc_read_local_version(struct hci_dev *hdev, struct sk_buff *skb) if (rp->status) return; - if (test_bit(HCI_SETUP, &hdev->dev_flags)) { + if (test_bit(HCI_SETUP, &hdev->dev_flags) || + test_bit(HCI_CONFIG, &hdev->dev_flags)) { hdev->hci_ver = rp->hci_ver; hdev->hci_rev = __le16_to_cpu(rp->hci_rev); hdev->lmp_ver = rp->lmp_ver; @@ -528,7 +530,8 @@ static void hci_cc_read_local_commands(struct hci_dev *hdev, if (rp->status) return; - if (test_bit(HCI_SETUP, &hdev->dev_flags)) + if (test_bit(HCI_SETUP, &hdev->dev_flags) || + test_bit(HCI_CONFIG, &hdev->dev_flags)) memcpy(hdev->commands, rp->commands, sizeof(hdev->commands)); } -- cgit v1.2.1 From 6a8fc95c87110a466ee81675b41170b963f82bdb Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 24 Dec 2014 20:43:11 +0200 Subject: Bluetooth: Fix accepting connections when not using mgmt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When connectable mode is enabled (page scan on) through some non-mgmt method the HCI_CONNECTABLE flag will not be set. For backwards compatibility with user space versions not using mgmt we should not require HCI_CONNECTABLE to be set if HCI_MGMT is not set. Reported-by: Pali Rohár Tested-by: Pali Rohár Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org # 3.17+ --- net/bluetooth/hci_event.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index c03d4b09caa3..3f2e8b830cbd 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2197,7 +2197,12 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) return; } - if (!test_bit(HCI_CONNECTABLE, &hdev->dev_flags) && + /* Require HCI_CONNECTABLE or a whitelist entry to accept the + * connection. These features are only touched through mgmt so + * only do the checks if HCI_MGMT is set. + */ + if (test_bit(HCI_MGMT, &hdev->dev_flags) && + !test_bit(HCI_CONNECTABLE, &hdev->dev_flags) && !hci_bdaddr_list_lookup(&hdev->whitelist, &ev->bdaddr, BDADDR_BREDR)) { hci_reject_conn(hdev, &ev->bdaddr); -- cgit v1.2.1 From 4aa6118811c09c75f508f3f070328c71292f1ce4 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 24 Dec 2014 14:41:39 +0800 Subject: openvswitch: fix odd_ptr_err.cocci warnings net/openvswitch/vport-gre.c:188:5-11: inconsistent IS_ERR and PTR_ERR, PTR_ERR on line 189 PTR_ERR should access the value just tested by IS_ERR Semantic patch information: There can be false positives in the patch case, where it is the call IS_ERR that is wrong. Generated by: scripts/coccinelle/tests/odd_ptr_err.cocci CC: Pravin B Shelar Signed-off-by: Fengguang Wu Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/vport-gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 28f54e9a6b80..d4168c442db5 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -186,7 +186,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) /* Push Tunnel header. */ skb = __build_header(skb, tunnel_hlen); if (IS_ERR(skb)) { - err = PTR_ERR(rt); + err = PTR_ERR(skb); skb = NULL; goto err_free_rt; } -- cgit v1.2.1 From 2c26d34bbcc0b3f30385d5587aa232289e2eed8e Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Fri, 19 Dec 2014 15:32:00 -0800 Subject: net/core: Handle csum for CHECKSUM_COMPLETE VXLAN forwarding When using VXLAN tunnels and a sky2 device, I have experienced checksum failures of the following type: [ 4297.761899] eth0: hw csum failure [...] [ 4297.765223] Call Trace: [ 4297.765224] [] dump_stack+0x46/0x58 [ 4297.765235] [] netdev_rx_csum_fault+0x42/0x50 [ 4297.765238] [] ? skb_push+0x40/0x40 [ 4297.765240] [] __skb_checksum_complete+0xbc/0xd0 [ 4297.765243] [] tcp_v4_rcv+0x2e2/0x950 [ 4297.765246] [] ? ip_rcv_finish+0x360/0x360 These are reliably reproduced in a network topology of: container:eth0 == host(OVS VXLAN on VLAN) == bond0 == eth0 (sky2) -> switch When VXLAN encapsulated traffic is received from a similarly configured peer, the above warning is generated in the receive processing of the encapsulated packet. Note that the warning is associated with the container eth0. The skbs from sky2 have ip_summed set to CHECKSUM_COMPLETE, and because the packet is an encapsulated Ethernet frame, the checksum generated by the hardware includes the inner protocol and Ethernet headers. The receive code is careful to update the skb->csum, except in __dev_forward_skb, as called by dev_forward_skb. __dev_forward_skb calls eth_type_trans, which in turn calls skb_pull_inline(skb, ETH_HLEN) to skip over the Ethernet header, but does not update skb->csum when doing so. This patch resolves the problem by adding a call to skb_postpull_rcsum to update the skb->csum after the call to eth_type_trans. Signed-off-by: Jay Vosburgh Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index bd44e28c735e..0094562b732a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1694,6 +1694,7 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb_scrub_packet(skb, true); skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); return 0; } -- cgit v1.2.1 From 5f35227ea34bb616c436d9da47fc325866c428f3 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 23 Dec 2014 22:37:26 -0800 Subject: net: Generalize ndo_gso_check to ndo_features_check GSO isn't the only offload feature with restrictions that potentially can't be expressed with the current features mechanism. Checksum is another although it's a general issue that could in theory apply to anything. Even if it may be possible to implement these restrictions in other ways, it can result in duplicate code or inefficient per-packet behavior. This generalizes ndo_gso_check so that drivers can remove any features that don't make sense for a given packet, similar to netif_skb_features(). It also converts existing driver restrictions to the new format, completing the work that was done to support tunnel protocols since the issues apply to checksums as well. By actually removing features from the set that are used to do offloading, it solves another problem with the existing interface. In these cases, GSO would run with the original set of features and not do anything because it appears that segmentation is not required. CC: Tom Herbert CC: Joe Stringer CC: Eric Dumazet CC: Hayes Wang Signed-off-by: Jesse Gross Acked-by: Tom Herbert Fixes: 04ffcb255f22 ("net: Add ndo_gso_check") Tested-by: Hayes Wang Signed-off-by: David S. Miller --- net/core/dev.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 0094562b732a..683d493aa1bf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2563,7 +2563,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t netif_skb_features(struct sk_buff *skb) { - const struct net_device *dev = skb->dev; + struct net_device *dev = skb->dev; netdev_features_t features = dev->features; u16 gso_segs = skb_shinfo(skb)->gso_segs; __be16 protocol = skb->protocol; @@ -2571,13 +2571,20 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; + if (!vlan_tx_tag_present(skb)) { if (unlikely(protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else { - return harmonize_features(skb, features); + goto finalize; } } @@ -2595,6 +2602,11 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); +finalize: + if (dev->netdev_ops->ndo_features_check) + features &= dev->netdev_ops->ndo_features_check(skb, dev, + features); + return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); @@ -2665,13 +2677,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (unlikely(!skb)) goto out_null; - /* If encapsulation offload request, verify we are testing - * hardware encapsulation features instead of standard - * features for the netdev - */ - if (skb->encapsulation) - features &= dev->hw_enc_features; - if (netif_needs_gso(dev, skb, features)) { struct sk_buff *segs; -- cgit v1.2.1 From 02c81ab95d8718d75886d16227a10cc7774493ea Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 22 Dec 2014 18:56:35 +0100 Subject: netlink: rename netlink_unbind() to netlink_undo_bind() The new name is more expressive - this isn't a generic unbind function but rather only a little undo helper for use only in netlink_bind(). Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 074cf3e91c6f..b4cf8ee0e1b8 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1410,8 +1410,8 @@ static int netlink_realloc_groups(struct sock *sk) return err; } -static void netlink_unbind(int group, long unsigned int groups, - struct netlink_sock *nlk) +static void netlink_undo_bind(int group, long unsigned int groups, + struct netlink_sock *nlk) { int undo; @@ -1461,7 +1461,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, err = nlk->netlink_bind(group); if (!err) continue; - netlink_unbind(group, groups, nlk); + netlink_undo_bind(group, groups, nlk); return err; } } @@ -1471,7 +1471,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_insert(sk, net, nladdr->nl_pid) : netlink_autobind(sock); if (err) { - netlink_unbind(nlk->ngroups, groups, nlk); + netlink_undo_bind(nlk->ngroups, groups, nlk); return err; } } -- cgit v1.2.1 From f8403a2e47afb37bcd3b7e286996d138a116c39d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 22 Dec 2014 18:56:36 +0100 Subject: genetlink: pass only network namespace to genl_has_listeners() There's no point to force the caller to know about the internal genl_sock to use inside struct net, just have them pass the network namespace. This doesn't really change code generation since it's an inline, but makes the caller less magic - there's never any reason to pass another socket. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 332b5a031739..4e9a5f035cbc 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -83,8 +83,7 @@ static bool ovs_must_notify(struct genl_family *family, struct genl_info *info, unsigned int group) { return info->nlhdr->nlmsg_flags & NLM_F_ECHO || - genl_has_listeners(family, genl_info_net(info)->genl_sock, - group); + genl_has_listeners(family, genl_info_net(info), group); } static void ovs_notify(struct genl_family *family, -- cgit v1.2.1 From b10dcb3b94010e3ac3951f68789400b1665effb1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 22 Dec 2014 18:56:37 +0100 Subject: netlink: update listeners directly when removing socket The code is now confusing to read - first in one function down (netlink_remove) any group subscriptions are implicitly removed by calling __sk_del_bind_node(), but the subscriber database is only updated far later by calling netlink_update_listeners(). Move the latter call to just after removal from the list so it is easier to follow the code. This also enables moving the locking inside the kernel-socket conditional, which improves the normal socket destruction path. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b4cf8ee0e1b8..6a9fb7c489a8 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1091,8 +1091,10 @@ static void netlink_remove(struct sock *sk) mutex_unlock(&nl_sk_hash_lock); netlink_table_grab(); - if (nlk_sk(sk)->subscriptions) + if (nlk_sk(sk)->subscriptions) { __sk_del_bind_node(sk); + netlink_update_listeners(sk); + } netlink_table_ungrab(); } @@ -1226,8 +1228,8 @@ static int netlink_release(struct socket *sock) module_put(nlk->module); - netlink_table_grab(); if (netlink_is_kernel(sk)) { + netlink_table_grab(); BUG_ON(nl_table[sk->sk_protocol].registered == 0); if (--nl_table[sk->sk_protocol].registered == 0) { struct listeners *old; @@ -1241,10 +1243,8 @@ static int netlink_release(struct socket *sock) nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } - } else if (nlk->subscriptions) { - netlink_update_listeners(sk); + netlink_table_ungrab(); } - netlink_table_ungrab(); kfree(nlk->groups); nlk->groups = NULL; -- cgit v1.2.1 From 7d68536bed72b09de03b07479dd707c5831b3b94 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 22 Dec 2014 18:56:38 +0100 Subject: netlink: call unbind when releasing socket Currently, netlink_unbind() is only called when the socket explicitly unbinds, which limits its usefulness (luckily there are no users of it yet anyway.) Call netlink_unbind() also when a socket is released, so it becomes possible to track listeners with this callback and without also implementing a netlink notifier (and checking netlink_has_listeners() in there.) Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6a9fb7c489a8..f29b63fad932 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1246,6 +1246,13 @@ static int netlink_release(struct socket *sock) netlink_table_ungrab(); } + if (nlk->netlink_unbind) { + int i; + + for (i = 0; i < nlk->ngroups; i++) + if (test_bit(i, nlk->groups)) + nlk->netlink_unbind(i + 1); + } kfree(nlk->groups); nlk->groups = NULL; -- cgit v1.2.1 From c380d9a7afff0e4c2e5f3c1c2dc7d2f4214dd962 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 23 Dec 2014 20:54:40 +0100 Subject: genetlink: pass multicast bind/unbind to families In order to make the newly fixed multicast bind/unbind functionality in generic netlink, pass them down to the appropriate family. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 76393f2f4b22..05bf40bbd189 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -983,11 +983,70 @@ static struct genl_multicast_group genl_ctrl_groups[] = { { .name = "notify", }, }; +static int genl_bind(int group) +{ + int i, err; + bool found = false; + + down_read(&cb_lock); + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { + struct genl_family *f; + + list_for_each_entry(f, genl_family_chain(i), family_list) { + if (group >= f->mcgrp_offset && + group < f->mcgrp_offset + f->n_mcgrps) { + int fam_grp = group - f->mcgrp_offset; + + if (f->mcast_bind) + err = f->mcast_bind(fam_grp); + else + err = 0; + found = true; + break; + } + } + } + up_read(&cb_lock); + + if (WARN_ON(!found)) + err = 0; + + return err; +} + +static void genl_unbind(int group) +{ + int i; + bool found = false; + + down_read(&cb_lock); + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { + struct genl_family *f; + + list_for_each_entry(f, genl_family_chain(i), family_list) { + if (group >= f->mcgrp_offset && + group < f->mcgrp_offset + f->n_mcgrps) { + int fam_grp = group - f->mcgrp_offset; + + if (f->mcast_unbind) + f->mcast_unbind(fam_grp); + found = true; + break; + } + } + } + up_read(&cb_lock); + + WARN_ON(!found); +} + static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, + .bind = genl_bind, + .unbind = genl_unbind, }; /* we'll bump the group number right afterwards */ -- cgit v1.2.1 From 023e2cfa36c31b0ad28c159a1bb0d61ff57334c8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 23 Dec 2014 21:00:06 +0100 Subject: netlink/genetlink: pass network namespace to bind/unbind Netlink families can exist in multiple namespaces, and for the most part multicast subscriptions are per network namespace. Thus it only makes sense to have bind/unbind notifications per network namespace. To achieve this, pass the network namespace of a given client socket to the bind/unbind functions. Also do this in generic netlink, and there also make sure that any bind for multicast groups that only exist in init_net is rejected. This isn't really a problem if it is accepted since a client in a different namespace will never receive any notifications from such a group, but it can confuse the family if not rejected (it's also possible to silently (without telling the family) accept it, but it would also have to be ignored on unbind so families that take any kind of action on bind/unbind won't do unnecessary work for invalid clients like that. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/netfilter/nfnetlink.c | 2 +- net/netlink/af_netlink.c | 21 +++++++++++---------- net/netlink/af_netlink.h | 8 ++++---- net/netlink/genetlink.c | 12 +++++++----- 4 files changed, 23 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 13c2e17bbe27..cde4a6702fa3 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -463,7 +463,7 @@ static void nfnetlink_rcv(struct sk_buff *skb) } #ifdef CONFIG_MODULES -static int nfnetlink_bind(int group) +static int nfnetlink_bind(struct net *net, int group) { const struct nfnetlink_subsystem *ss; int type; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f29b63fad932..84ea76ca3f1f 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1141,8 +1141,8 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, struct module *module = NULL; struct mutex *cb_mutex; struct netlink_sock *nlk; - int (*bind)(int group); - void (*unbind)(int group); + int (*bind)(struct net *net, int group); + void (*unbind)(struct net *net, int group); int err = 0; sock->state = SS_UNCONNECTED; @@ -1251,7 +1251,7 @@ static int netlink_release(struct socket *sock) for (i = 0; i < nlk->ngroups; i++) if (test_bit(i, nlk->groups)) - nlk->netlink_unbind(i + 1); + nlk->netlink_unbind(sock_net(sk), i + 1); } kfree(nlk->groups); nlk->groups = NULL; @@ -1418,8 +1418,9 @@ static int netlink_realloc_groups(struct sock *sk) } static void netlink_undo_bind(int group, long unsigned int groups, - struct netlink_sock *nlk) + struct sock *sk) { + struct netlink_sock *nlk = nlk_sk(sk); int undo; if (!nlk->netlink_unbind) @@ -1427,7 +1428,7 @@ static void netlink_undo_bind(int group, long unsigned int groups, for (undo = 0; undo < group; undo++) if (test_bit(undo, &groups)) - nlk->netlink_unbind(undo); + nlk->netlink_unbind(sock_net(sk), undo); } static int netlink_bind(struct socket *sock, struct sockaddr *addr, @@ -1465,10 +1466,10 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, for (group = 0; group < nlk->ngroups; group++) { if (!test_bit(group, &groups)) continue; - err = nlk->netlink_bind(group); + err = nlk->netlink_bind(net, group); if (!err) continue; - netlink_undo_bind(group, groups, nlk); + netlink_undo_bind(group, groups, sk); return err; } } @@ -1478,7 +1479,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_insert(sk, net, nladdr->nl_pid) : netlink_autobind(sock); if (err) { - netlink_undo_bind(nlk->ngroups, groups, nlk); + netlink_undo_bind(nlk->ngroups, groups, sk); return err; } } @@ -2129,7 +2130,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, if (!val || val - 1 >= nlk->ngroups) return -EINVAL; if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) { - err = nlk->netlink_bind(val); + err = nlk->netlink_bind(sock_net(sk), val); if (err) return err; } @@ -2138,7 +2139,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) - nlk->netlink_unbind(val); + nlk->netlink_unbind(sock_net(sk), val); err = 0; break; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index b20a1731759b..f123a88496f8 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -39,8 +39,8 @@ struct netlink_sock { struct mutex *cb_mutex; struct mutex cb_def_mutex; void (*netlink_rcv)(struct sk_buff *skb); - int (*netlink_bind)(int group); - void (*netlink_unbind)(int group); + int (*netlink_bind)(struct net *net, int group); + void (*netlink_unbind)(struct net *net, int group); struct module *module; #ifdef CONFIG_NETLINK_MMAP struct mutex pg_vec_lock; @@ -65,8 +65,8 @@ struct netlink_table { unsigned int groups; struct mutex *cb_mutex; struct module *module; - int (*bind)(int group); - void (*unbind)(int group); + int (*bind)(struct net *net, int group); + void (*unbind)(struct net *net, int group); bool (*compare)(struct net *net, struct sock *sock); int registered; }; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 05bf40bbd189..91566ed36c43 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -983,7 +983,7 @@ static struct genl_multicast_group genl_ctrl_groups[] = { { .name = "notify", }, }; -static int genl_bind(int group) +static int genl_bind(struct net *net, int group) { int i, err; bool found = false; @@ -997,8 +997,10 @@ static int genl_bind(int group) group < f->mcgrp_offset + f->n_mcgrps) { int fam_grp = group - f->mcgrp_offset; - if (f->mcast_bind) - err = f->mcast_bind(fam_grp); + if (!f->netnsok && net != &init_net) + err = -ENOENT; + else if (f->mcast_bind) + err = f->mcast_bind(net, fam_grp); else err = 0; found = true; @@ -1014,7 +1016,7 @@ static int genl_bind(int group) return err; } -static void genl_unbind(int group) +static void genl_unbind(struct net *net, int group) { int i; bool found = false; @@ -1029,7 +1031,7 @@ static void genl_unbind(int group) int fam_grp = group - f->mcgrp_offset; if (f->mcast_unbind) - f->mcast_unbind(fam_grp); + f->mcast_unbind(net, fam_grp); found = true; break; } -- cgit v1.2.1 From dc97a1a9477f969e34b38ca9d9cd231cb93ebea2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 29 Dec 2014 16:31:49 -0500 Subject: genetlink: A genl_bind() to an out-of-range multicast group should not WARN(). Users can request to bind to arbitrary multicast groups, so warning when the requested group number is out of range is not appropriate. And with the warning removed, and the 'err' variable properly given an initial value, we can remove 'found' altogether. Reported-by: Sedat Dilek Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 91566ed36c43..2e11061ef885 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -985,8 +985,7 @@ static struct genl_multicast_group genl_ctrl_groups[] = { static int genl_bind(struct net *net, int group) { - int i, err; - bool found = false; + int i, err = 0; down_read(&cb_lock); for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { @@ -1003,16 +1002,12 @@ static int genl_bind(struct net *net, int group) err = f->mcast_bind(net, fam_grp); else err = 0; - found = true; break; } } } up_read(&cb_lock); - if (WARN_ON(!found)) - err = 0; - return err; } -- cgit v1.2.1