From 936523441bb64cdc9a5b263e8fd2782e70313a57 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 6 Aug 2016 15:50:52 +0200 Subject: batman-adv: Add missing refcnt for last_candidate batadv_find_router dereferences last_bonding_candidate from orig_node without making sure that it has a valid reference. This reference has to be retrieved by increasing the reference counter while holding neigh_list_lock. The lock is required to avoid that batadv_last_bonding_replace removes the current last_bonding_candidate, reduces the reference counter and maybe destroys the object in this process. Fixes: f3b3d9018975 ("batman-adv: add bonding again") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/routing.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 7602c001e92b..3d199478c405 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -469,6 +469,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, return 0; } +/** + * batadv_last_bonding_get - Get last_bonding_candidate of orig_node + * @orig_node: originator node whose last bonding candidate should be retrieved + * + * Return: last bonding candidate of router or NULL if not found + * + * The object is returned with refcounter increased by 1. + */ +static struct batadv_orig_ifinfo * +batadv_last_bonding_get(struct batadv_orig_node *orig_node) +{ + struct batadv_orig_ifinfo *last_bonding_candidate; + + spin_lock_bh(&orig_node->neigh_list_lock); + last_bonding_candidate = orig_node->last_bonding_candidate; + + if (last_bonding_candidate) + kref_get(&last_bonding_candidate->refcount); + spin_unlock_bh(&orig_node->neigh_list_lock); + + return last_bonding_candidate; +} + /** * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node * @orig_node: originator node whose bonding candidates should be replaced @@ -539,7 +562,7 @@ batadv_find_router(struct batadv_priv *bat_priv, * router - obviously there are no other candidates. */ rcu_read_lock(); - last_candidate = orig_node->last_bonding_candidate; + last_candidate = batadv_last_bonding_get(orig_node); if (last_candidate) last_cand_router = rcu_dereference(last_candidate->router); @@ -631,6 +654,9 @@ next: batadv_orig_ifinfo_put(next_candidate); } + if (last_candidate) + batadv_orig_ifinfo_put(last_candidate); + return router; } -- cgit v1.2.1 From 1e5d343b8f23770e8ac5d31f5c439826bdb35148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Tue, 23 Aug 2016 03:13:03 +0200 Subject: batman-adv: fix elp packet data reservation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The skb_reserve() call only reserved headroom for the mac header, but not the elp packet header itself. Fixing this by using skb_put()'ing towards the skb tail instead of skb_push()'ing towards the skb head. Fixes: d6f94d91f766 ("batman-adv: ELP - adding basic infrastructure") Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 7d170010beb9..ee08540ce503 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -335,7 +335,7 @@ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface) goto out; skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN); - elp_buff = skb_push(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); + elp_buff = skb_put(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN); elp_packet = (struct batadv_elp_packet *)elp_buff; memset(elp_packet, 0, BATADV_ELP_HLEN); -- cgit v1.2.1 From a41bd25ae67d3e4052c7f00ee9f2b4ba9219309e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 25 Aug 2016 18:42:35 +0200 Subject: sunrpc: fix UDP memory accounting The commit f9b2ee714c5c ("SUNRPC: Move UDP receive data path into a workqueue context"), as a side effect, moved the skb_free_datagram() call outside the scope of the related socket lock, but UDP sockets require such lock to be held for proper memory accounting. Fix it by replacing skb_free_datagram() with skb_free_datagram_locked(). Fixes: f9b2ee714c5c ("SUNRPC: Move UDP receive data path into a workqueue context") Reported-and-tested-by: Jan Stancek Signed-off-by: Paolo Abeni Cc: stable@vger.kernel.org # 4.4+ Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 8ede3bc52481..bf168838a029 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1074,7 +1074,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport) skb = skb_recv_datagram(sk, 0, 1, &err); if (skb != NULL) { xs_udp_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); + skb_free_datagram_locked(sk, skb); continue; } if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) -- cgit v1.2.1 From 5210d393ef84e5d2a4854671a9af2d97fd1b8dd4 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Fri, 2 Sep 2016 20:49:12 +0800 Subject: netfilter: nf_tables_trace: fix endiness when dump chain policy NFTA_TRACE_POLICY attribute is big endian, but we forget to call htonl to convert it. Fortunately, this attribute is parsed as big endian in libnftnl. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 39eb1cc62e91..fa24a5b398b1 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -237,7 +237,7 @@ void nft_trace_notify(struct nft_traceinfo *info) break; case NFT_TRACETYPE_POLICY: if (nla_put_be32(skb, NFTA_TRACE_POLICY, - info->basechain->policy)) + htonl(info->basechain->policy))) goto nla_put_failure; break; } -- cgit v1.2.1 From d1a6cba576fc7c43e476538fe5aa72fe04bd80e1 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Tue, 6 Sep 2016 22:31:02 +0800 Subject: netfilter: nft_chain_route: re-route before skb is queued to userspace Imagine such situation, user add the following nft rules, and queue the packets to userspace for further check: # ip rule add fwmark 0x0/0x1 lookup eth0 # ip rule add fwmark 0x1/0x1 lookup eth1 # nft add table filter # nft add chain filter output {type route hook output priority 0 \;} # nft add rule filter output mark set 0x1 # nft add rule filter output queue num 0 But after we reinject the skbuff, the packet will be sent via the wrong route, i.e. in this case, the packet will be routed via eth0 table, not eth1 table. Because we skip to do re-route when verdict is NF_QUEUE, even if the mark was changed. Acctually, we should not touch sk_buff if verdict is NF_DROP or NF_STOLEN, and when re-route fails, return NF_DROP with error code. This is consistent with the mangle table in iptables. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nft_chain_route_ipv4.c | 11 +++++++---- net/ipv6/netfilter/nft_chain_route_ipv6.c | 10 +++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 2375b0a8be46..30493beb611a 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv, __be32 saddr, daddr; u_int8_t tos; const struct iphdr *iph; + int err; /* root is playing with raw sockets. */ if (skb->len < sizeof(struct iphdr) || @@ -46,15 +47,17 @@ static unsigned int nf_route_table_hook(void *priv, tos = iph->tos; ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_QUEUE) { + if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || - iph->tos != tos) - if (ip_route_me_harder(state->net, skb, RTN_UNSPEC)) - ret = NF_DROP; + iph->tos != tos) { + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } } return ret; } diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 71d995ff3108..2535223ba956 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv, struct in6_addr saddr, daddr; u_int8_t hop_limit; u32 mark, flowlabel; + int err; /* malformed packet, drop it */ if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) @@ -46,13 +47,16 @@ static unsigned int nf_route_table_hook(void *priv, flowlabel = *((u32 *)ipv6_hdr(skb)); ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_QUEUE && + if (ret != NF_DROP && ret != NF_STOLEN && (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || - flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) - return ip6_route_me_harder(state->net, skb) == 0 ? ret : NF_DROP; + flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { + err = ip6_route_me_harder(state->net, skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } return ret; } -- cgit v1.2.1 From 78d506e1b7071b24850fd5ac22b896c459b0a04c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 6 Sep 2016 11:22:49 -0400 Subject: xprtrdma: Revert 3d4cf35bd4fa ("xprtrdma: Reply buffer exhaustion...") Receive buffer exhaustion, if it were to actually occur, would be catastrophic. However, when there are no reply buffers to post, that means all of them have already been posted and are waiting for incoming replies. By design, there can never be more RPCs in flight than there are available receive buffers. A receive buffer can be left posted after an RPC exits without a received reply; say, due to a credential problem or a soft timeout. This does not result in fewer posted receive buffers than there are pending RPCs, and there is already logic in xprtrdma to deal appropriately with this case. It also looks like the "+ 2" that was removed was accidentally accommodating the number of extra receive buffers needed for receiving backchannel requests. That will need to be addressed by another patch. Fixes: 3d4cf35bd4fa ("xprtrdma: Reply buffer exhaustion can be...") Signed-off-by: Chuck Lever Reviewed-by: Anna Schumaker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 536d0be3f61b..fefcba982415 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -923,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests; i++) { + for (i = 0; i < buf->rb_max_requests + 2; i++) { struct rpcrdma_rep *rep; rep = rpcrdma_create_rep(r_xprt); @@ -1076,6 +1076,8 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) /* * Get a set of request/reply buffers. + * + * Reply buffer (if available) is attached to send buffer upon return. */ struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) @@ -1094,13 +1096,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) out_reqbuf: spin_unlock(&buffers->rb_lock); - pr_warn("rpcrdma: out of request buffers (%p)\n", buffers); + pr_warn("RPC: %s: out of request buffers\n", __func__); return NULL; out_repbuf: - list_add(&req->rl_free, &buffers->rb_send_bufs); spin_unlock(&buffers->rb_lock); - pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers); - return NULL; + pr_warn("RPC: %s: out of reply buffers\n", __func__); + req->rl_reply = NULL; + return req; } /* -- cgit v1.2.1 From 05c974669ecec510a85d8534099bb75404e82c41 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 6 Sep 2016 11:22:58 -0400 Subject: xprtrdma: Fix receive buffer accounting An RPC can terminate before its reply arrives, if a credential problem or a soft timeout occurs. After this happens, xprtrdma reports it is out of Receive buffers. A Receive buffer is posted before each RPC is sent, and returned to the buffer pool when a reply is received. If no reply is received for an RPC, that Receive buffer remains posted. But xprtrdma tries to post another when the next RPC is sent. If this happens a few dozen times, there are no receive buffers left to be posted at send time. I don't see a way for a transport connection to recover at that point, and it will spit warnings and unnecessarily delay RPCs on occasion for its remaining lifetime. Commit 1e465fd4ff47 ("xprtrdma: Replace send and receive arrays") removed a little bit of logic to detect this case and not provide a Receive buffer so no more buffers are posted, and then transport operation continues correctly. We didn't understand what that logic did, and it wasn't commented, so it was removed as part of the overhaul to support backchannel requests. Restore it, but be wary of the need to keep extra Receives posted to deal with backchannel requests. Fixes: 1e465fd4ff47 ("xprtrdma: Replace send and receive arrays") Signed-off-by: Chuck Lever Reviewed-by: Anna Schumaker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 41 +++++++++++++++++++++++++++++------------ net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 30 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index fefcba982415..799cce6cbe45 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include /* try_module_get()/module_put() */ @@ -923,7 +924,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests + 2; i++) { + for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) { struct rpcrdma_rep *rep; rep = rpcrdma_create_rep(r_xprt); @@ -1018,6 +1019,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rep = rpcrdma_buffer_get_rep_locked(buf); rpcrdma_destroy_rep(ia, rep); } + buf->rb_send_count = 0; spin_lock(&buf->rb_reqslock); while (!list_empty(&buf->rb_allreqs)) { @@ -1032,6 +1034,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) spin_lock(&buf->rb_reqslock); } spin_unlock(&buf->rb_reqslock); + buf->rb_recv_count = 0; rpcrdma_destroy_mrs(buf); } @@ -1074,6 +1077,23 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) spin_unlock(&buf->rb_mwlock); } +static struct rpcrdma_rep * +rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers) +{ + /* If an RPC previously completed without a reply (say, a + * credential problem or a soft timeout occurs) then hold off + * on supplying more Receive buffers until the number of new + * pending RPCs catches up to the number of posted Receives. + */ + if (unlikely(buffers->rb_send_count < buffers->rb_recv_count)) + return NULL; + + if (unlikely(list_empty(&buffers->rb_recv_bufs))) + return NULL; + buffers->rb_recv_count++; + return rpcrdma_buffer_get_rep_locked(buffers); +} + /* * Get a set of request/reply buffers. * @@ -1087,10 +1107,9 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) spin_lock(&buffers->rb_lock); if (list_empty(&buffers->rb_send_bufs)) goto out_reqbuf; + buffers->rb_send_count++; req = rpcrdma_buffer_get_req_locked(buffers); - if (list_empty(&buffers->rb_recv_bufs)) - goto out_repbuf; - req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers); + req->rl_reply = rpcrdma_buffer_get_rep(buffers); spin_unlock(&buffers->rb_lock); return req; @@ -1098,11 +1117,6 @@ out_reqbuf: spin_unlock(&buffers->rb_lock); pr_warn("RPC: %s: out of request buffers\n", __func__); return NULL; -out_repbuf: - spin_unlock(&buffers->rb_lock); - pr_warn("RPC: %s: out of reply buffers\n", __func__); - req->rl_reply = NULL; - return req; } /* @@ -1119,9 +1133,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) req->rl_reply = NULL; spin_lock(&buffers->rb_lock); + buffers->rb_send_count--; list_add_tail(&req->rl_free, &buffers->rb_send_bufs); - if (rep) + if (rep) { + buffers->rb_recv_count--; list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); + } spin_unlock(&buffers->rb_lock); } @@ -1135,8 +1152,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req) struct rpcrdma_buffer *buffers = req->rl_buffer; spin_lock(&buffers->rb_lock); - if (!list_empty(&buffers->rb_recv_bufs)) - req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers); + req->rl_reply = rpcrdma_buffer_get_rep(buffers); spin_unlock(&buffers->rb_lock); } @@ -1150,6 +1166,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; spin_lock(&buffers->rb_lock); + buffers->rb_recv_count--; list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); spin_unlock(&buffers->rb_lock); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 670fad57153a..a71b0f5897d8 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -321,6 +321,7 @@ struct rpcrdma_buffer { char *rb_pool; spinlock_t rb_lock; /* protect buf lists */ + int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; struct list_head rb_recv_bufs; u32 rb_max_requests; -- cgit v1.2.1 From ecc6569f3503b39f45bc6b86197b5e0a8533fb72 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 25 Aug 2016 23:08:11 +0800 Subject: netfilter: gre: Use consistent GRE_* macros instead of ones defined by netfilter. There are already some GRE_* macros in kernel, so it is unnecessary to define these macros. And remove some useless macros Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_proto_gre.c | 4 ++-- net/netfilter/nf_conntrack_proto_gre.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 9414923f1e15..93198d71dbb6 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -104,11 +104,11 @@ gre_manip_pkt(struct sk_buff *skb, if (maniptype != NF_NAT_MANIP_DST) return true; switch (greh->version) { - case GRE_VERSION_1701: + case ntohs(GRE_VERSION_0): /* We do not currently NAT any GREv0 packets. * Try to behave like "nf_nat_proto_unknown" */ break; - case GRE_VERSION_PPTP: + case ntohs(GRE_VERSION_1): pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); pgreh->call_id = tuple->dst.u.gre.key; break; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index a96451a7af20..deb239a014e4 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -200,7 +200,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, /* first only delinearize old RFC1701 GRE header */ grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); - if (!grehdr || grehdr->version != GRE_VERSION_PPTP) { + if (!grehdr || grehdr->version != ntohs(GRE_VERSION_1)) { /* try to behave like "nf_conntrack_proto_generic" */ tuple->src.u.all = 0; tuple->dst.u.all = 0; @@ -212,7 +212,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, if (!pgrehdr) return true; - if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) { + if (grehdr->protocol != GRE_PROTO_PPP) { pr_debug("GRE_VERSION_PPTP but unknown proto\n"); return false; } -- cgit v1.2.1 From c579a9e7d58f66030a144c7a33cc9bdf827a4b6d Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 25 Aug 2016 23:08:47 +0800 Subject: netfilter: gre: Use consistent GRE and PTTP header structure instead of the ones defined by netfilter There are two existing strutures which defines the GRE and PPTP header. So use these two structures instead of the ones defined by netfilter to keep consitent with other codes. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_proto_gre.c | 13 +++++++------ net/netfilter/nf_conntrack_proto_gre.c | 12 ++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 93198d71dbb6..edf05002d674 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -88,8 +88,8 @@ gre_manip_pkt(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { - const struct gre_hdr *greh; - struct gre_hdr_pptp *pgreh; + const struct gre_base_hdr *greh; + struct pptp_gre_header *pgreh; /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ @@ -97,18 +97,19 @@ gre_manip_pkt(struct sk_buff *skb, return false; greh = (void *)skb->data + hdroff; - pgreh = (struct gre_hdr_pptp *)greh; + pgreh = (struct pptp_gre_header *)greh; /* we only have destination manip of a packet, since 'source key' * is not present in the packet itself */ if (maniptype != NF_NAT_MANIP_DST) return true; - switch (greh->version) { - case ntohs(GRE_VERSION_0): + + switch (greh->flags & GRE_VERSION) { + case GRE_VERSION_0: /* We do not currently NAT any GREv0 packets. * Try to behave like "nf_nat_proto_unknown" */ break; - case ntohs(GRE_VERSION_1): + case GRE_VERSION_1: pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); pgreh->call_id = tuple->dst.u.gre.key; break; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index deb239a014e4..9a715f88b2f1 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -192,15 +192,15 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { - const struct gre_hdr_pptp *pgrehdr; - struct gre_hdr_pptp _pgrehdr; + const struct pptp_gre_header *pgrehdr; + struct pptp_gre_header _pgrehdr; __be16 srckey; - const struct gre_hdr *grehdr; - struct gre_hdr _grehdr; + const struct gre_base_hdr *grehdr; + struct gre_base_hdr _grehdr; /* first only delinearize old RFC1701 GRE header */ grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); - if (!grehdr || grehdr->version != ntohs(GRE_VERSION_1)) { + if (!grehdr || (grehdr->flags & GRE_VERSION) != GRE_VERSION_1) { /* try to behave like "nf_conntrack_proto_generic" */ tuple->src.u.all = 0; tuple->dst.u.all = 0; @@ -213,7 +213,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, return true; if (grehdr->protocol != GRE_PROTO_PPP) { - pr_debug("GRE_VERSION_PPTP but unknown proto\n"); + pr_debug("Unsupported GRE proto(0x%x)\n", ntohs(grehdr->protocol)); return false; } -- cgit v1.2.1 From 68cb9fe47ea661bffd48c8ca35790be26935e1c5 Mon Sep 17 00:00:00 2001 From: Marco Angaroni Date: Tue, 30 Aug 2016 18:48:19 +0200 Subject: netfilter: nf_ct_sip: correct parsing of continuation lines in SIP headers Current parsing methods for SIP headers do not properly manage continuation lines: in case of Call-ID header the first character of Call-ID header value is truncated. As a result IPVS SIP persistence engine hashes over a call-id that is not exactly the one present in the originale message. Example: "Call-ID: \r\n abcdeABCDE1234" results in extracted call-id equal to "bcdeABCDE1234". In above example Call-ID is represented as a string in C language. Obviously in real message the first bytes after colon (":") are "20 0d 0a 20". Proposed fix is in nf_conntrack_sip module. Since sip_follow_continuation() function walks past the leading spaces or tabs of the continuation line, sip_skip_whitespace() should simply return the ouput of sip_follow_continuation(). Otherwise another iteration of the for loop is done and dptr is incremented by one pointing to the second character of the first word in the header. Below is an extract of relevant SIP ABNF syntax. Call-ID = ( "Call-ID" / "i" ) HCOLON callid callid = word [ "@" word ] HCOLON = *( SP / HTAB ) ":" SWS SWS = [LWS] ; sep whitespace LWS = [*WSP CRLF] 1*WSP ; linear whitespace WSP = SP / HTAB word = 1*(alphanum / "-" / "." / "!" / "%" / "*" / "_" / "+" / "`" / "'" / "~" / "(" / ")" / "<" / ">" / ":" / "\" / DQUOTE / "/" / "[" / "]" / "?" / "{" / "}" ) Signed-off-by: Marco Angaroni Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 7d77217de6a3..251a9a44d189 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -334,8 +334,7 @@ static const char *sip_skip_whitespace(const char *dptr, const char *limit) if (*dptr != '\r' && *dptr != '\n') break; dptr = sip_follow_continuation(dptr, limit); - if (dptr == NULL) - return NULL; + break; } return dptr; } -- cgit v1.2.1 From f0608ceaa79d99d24e97517f9a9a0fed2b9698b4 Mon Sep 17 00:00:00 2001 From: Marco Angaroni Date: Tue, 30 Aug 2016 18:48:24 +0200 Subject: netfilter: nf_ct_sip: correct allowed characters in Call-ID SIP header Current parsing methods for SIP header Call-ID do not check correctly all characters allowed by RFC 3261. In particular "," character is allowed instead of "'" character. As a result Call-ID headers like the following are discarded by IPVS SIP persistence engine. Call-ID: -.!%*_+`'~()<>:\"/[]?{} Above example is composed using all non-alphanumeric characters listed in RFC 3261 for Call-ID header syntax. Proposed fix is in nf_conntrack_sip module; function iswordc() checks this range: (c >= '(' && c <= '/') which includes these characters: ()*+,-./ They are all allowed except ",". Instead "'" is not included in the list. Below is an extract of relevant SIP ABNF syntax. Call-ID = ( "Call-ID" / "i" ) HCOLON callid callid = word [ "@" word ] HCOLON = *( SP / HTAB ) ":" SWS SWS = [LWS] ; sep whitespace LWS = [*WSP CRLF] 1*WSP ; linear whitespace WSP = SP / HTAB word = 1*(alphanum / "-" / "." / "!" / "%" / "*" / "_" / "+" / "`" / "'" / "~" / "(" / ")" / "<" / ">" / ":" / "\" / DQUOTE / "/" / "[" / "]" / "?" / "{" / "}" ) Signed-off-by: Marco Angaroni Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 251a9a44d189..d8035351aff5 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -83,9 +83,10 @@ static int digits_len(const struct nf_conn *ct, const char *dptr, static int iswordc(const char c) { if (isalnum(c) || c == '!' || c == '"' || c == '%' || - (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' || + (c >= '(' && c <= '+') || c == ':' || c == '<' || c == '>' || c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' || - c == '{' || c == '}' || c == '~') + c == '{' || c == '}' || c == '~' || (c >= '-' && c <= '/') || + c == '\'') return 1; return 0; } -- cgit v1.2.1 From 723eb299de62ce75dbd31e7ee25d45887c32a602 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 1 Sep 2016 18:58:29 +0800 Subject: netfilter: ftp: Remove the useless dlen==0 condition check in find_pattern The caller function "help" has already make sure the datalen could not be zero before invoke find_pattern as a parameter by the following codes if (dataoff >= skb->len) { pr_debug("ftp: dataoff(%u) >= skblen(%u)\n", dataoff, skb->len); return NF_ACCEPT; } datalen = skb->len - dataoff; And the latter codes "ends_in_nl = (fb_ptr[datalen - 1] == '\n');" use datalen directly without checking if it is zero. So it is unneccessary to check it in find_pattern too. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_ftp.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index b6934b5edf7a..d49a2d410813 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -301,8 +301,6 @@ static int find_pattern(const char *data, size_t dlen, size_t i = plen; pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); - if (dlen == 0) - return 0; if (dlen <= plen) { /* Short packet: try for partial? */ -- cgit v1.2.1 From ddb075b0cdbca140d6c6503db9cc9990d3461308 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 1 Sep 2016 18:59:02 +0800 Subject: netfilter: ftp: Remove the useless code There are some debug code which are commented out in find_pattern by #if 0. Now remove them. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_ftp.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index d49a2d410813..e3ed20060878 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -309,19 +309,8 @@ static int find_pattern(const char *data, size_t dlen, else return 0; } - if (strncasecmp(data, pattern, plen) != 0) { -#if 0 - size_t i; - - pr_debug("ftp: string mismatch\n"); - for (i = 0; i < plen; i++) { - pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n", - i, data[i], data[i], - pattern[i], pattern[i]); - } -#endif + if (strncasecmp(data, pattern, plen) != 0) return 0; - } pr_debug("Pattern matches!\n"); /* Now we've found the constant string, try to skip -- cgit v1.2.1 From 0d9932b2875f568d679f2af33ce610da3903ac11 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Fri, 2 Sep 2016 15:05:57 +0200 Subject: netfilter: nft_numgen: rename until attribute by modulus The _until_ attribute is renamed to _modulus_ as the behaviour is similar to other expresions with number limits (ex. nft_hash). Renaming is possible because there isn't a kernel release yet with these changes. Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_numgen.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 294745ecb0fc..f51a3ede3932 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -21,7 +21,7 @@ static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state); struct nft_ng_inc { enum nft_registers dreg:8; - u32 until; + u32 modulus; atomic_t counter; }; @@ -34,7 +34,7 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, do { oval = atomic_read(&priv->counter); - nval = (oval + 1 < priv->until) ? oval + 1 : 0; + nval = (oval + 1 < priv->modulus) ? oval + 1 : 0; } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval); memcpy(®s->data[priv->dreg], &priv->counter, sizeof(u32)); @@ -42,7 +42,7 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { [NFTA_NG_DREG] = { .type = NLA_U32 }, - [NFTA_NG_UNTIL] = { .type = NLA_U32 }, + [NFTA_NG_MODULUS] = { .type = NLA_U32 }, [NFTA_NG_TYPE] = { .type = NLA_U32 }, }; @@ -52,8 +52,8 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, { struct nft_ng_inc *priv = nft_expr_priv(expr); - priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL])); - if (priv->until == 0) + priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS])); + if (priv->modulus == 0) return -ERANGE; priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); @@ -64,11 +64,11 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, } static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, - u32 until, enum nft_ng_types type) + u32 modulus, enum nft_ng_types type) { if (nft_dump_register(skb, NFTA_NG_DREG, dreg)) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_NG_UNTIL, htonl(until))) + if (nla_put_be32(skb, NFTA_NG_MODULUS, htonl(modulus))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type))) goto nla_put_failure; @@ -83,12 +83,12 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_inc *priv = nft_expr_priv(expr); - return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_INCREMENTAL); + return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL); } struct nft_ng_random { enum nft_registers dreg:8; - u32 until; + u32 modulus; }; static void nft_ng_random_eval(const struct nft_expr *expr, @@ -99,7 +99,7 @@ static void nft_ng_random_eval(const struct nft_expr *expr, struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state); regs->data[priv->dreg] = reciprocal_scale(prandom_u32_state(state), - priv->until); + priv->modulus); } static int nft_ng_random_init(const struct nft_ctx *ctx, @@ -108,8 +108,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, { struct nft_ng_random *priv = nft_expr_priv(expr); - priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL])); - if (priv->until == 0) + priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS])); + if (priv->modulus == 0) return -ERANGE; prandom_init_once(&nft_numgen_prandom_state); @@ -124,7 +124,7 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_random *priv = nft_expr_priv(expr); - return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_RANDOM); + return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM); } static struct nft_expr_type nft_ng_type; @@ -149,8 +149,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { u32 type; - if (!tb[NFTA_NG_DREG] || - !tb[NFTA_NG_UNTIL] || + if (!tb[NFTA_NG_DREG] || + !tb[NFTA_NG_MODULUS] || !tb[NFTA_NG_TYPE]) return ERR_PTR(-EINVAL); -- cgit v1.2.1 From db6d857b819a00627a3bd911f49ee3156766bba8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 2 Sep 2016 21:00:58 +0200 Subject: netfilter: nft_quota: fix overquota logic Use xor to decide to break further rule evaluation or not, since the existing logic doesn't achieve the expected inversion. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_quota.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 6eafbf987ed9..92b6ff16dbb3 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -33,7 +33,7 @@ static void nft_quota_eval(const struct nft_expr *expr, { struct nft_quota *priv = nft_expr_priv(expr); - if (nft_quota(priv, pkt) < 0 && !priv->invert) + if ((nft_quota(priv, pkt) < 0) ^ priv->invert) regs->verdict.code = NFT_BREAK; } -- cgit v1.2.1 From 22609b43b194917dce2188ae9a78bc40a14e67b5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 2 Sep 2016 21:00:59 +0200 Subject: netfilter: nft_quota: introduce nft_overquota() This is patch renames the existing function to nft_overquota() and make it return a boolean that tells us if we have exceeded our byte quota. Just a cleanup. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_quota.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 92b6ff16dbb3..c00104c07095 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -21,10 +21,10 @@ struct nft_quota { atomic64_t remain; }; -static inline long nft_quota(struct nft_quota *priv, - const struct nft_pktinfo *pkt) +static inline bool nft_overquota(struct nft_quota *priv, + const struct nft_pktinfo *pkt) { - return atomic64_sub_return(pkt->skb->len, &priv->remain); + return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0; } static void nft_quota_eval(const struct nft_expr *expr, @@ -33,7 +33,7 @@ static void nft_quota_eval(const struct nft_expr *expr, { struct nft_quota *priv = nft_expr_priv(expr); - if ((nft_quota(priv, pkt) < 0) ^ priv->invert) + if (nft_overquota(priv, pkt) ^ priv->invert) regs->verdict.code = NFT_BREAK; } -- cgit v1.2.1 From 1bcabc81ee94c0a65989128258f8c1d3e1c1b0ea Mon Sep 17 00:00:00 2001 From: Marco Angaroni Date: Tue, 30 Aug 2016 18:52:22 +0200 Subject: netfilter: nf_ct_sip: allow tab character in SIP headers Current parsing methods for SIP headers do not allow the presence of tab characters between header name and header value. As a result Call-ID SIP headers like the following are discarded by IPVS SIP persistence engine: "Call-ID\t: mycallid@abcde" "Call-ID:\tmycallid@abcde" In above examples Call-IDs are represented as strings in C language. Obviously in real message we have byte "09" before/after colon (":"). Proposed fix is in nf_conntrack_sip module. Function sip_skip_whitespace() should skip tabs in addition to spaces, since in SIP grammar whitespace (WSP) corresponds to space or tab. Below is an extract of relevant SIP ABNF syntax. Call-ID = ( "Call-ID" / "i" ) HCOLON callid callid = word [ "@" word ] HCOLON = *( SP / HTAB ) ":" SWS SWS = [LWS] ; sep whitespace LWS = [*WSP CRLF] 1*WSP ; linear whitespace WSP = SP / HTAB word = 1*(alphanum / "-" / "." / "!" / "%" / "*" / "_" / "+" / "`" / "'" / "~" / "(" / ")" / "<" / ">" / ":" / "\" / DQUOTE / "/" / "[" / "]" / "?" / "{" / "}" ) Signed-off-by: Marco Angaroni Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index d8035351aff5..621b81c7bddc 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -330,7 +330,7 @@ static const char *sip_follow_continuation(const char *dptr, const char *limit) static const char *sip_skip_whitespace(const char *dptr, const char *limit) { for (; dptr < limit; dptr++) { - if (*dptr == ' ') + if (*dptr == ' ' || *dptr == '\t') continue; if (*dptr != '\r' && *dptr != '\n') break; -- cgit v1.2.1 From 2f30ea5090cbc57ea573cdc66421264b3de3fb0a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 8 Sep 2016 18:09:57 +0200 Subject: xfrm_user: propagate sec ctx allocation errors When we fail to attach the security context in xfrm_state_construct() we'll return 0 as error value which, in turn, will wrongly claim success to userland when, in fact, we won't be adding / updating the XFRM state. This is a regression introduced by commit fd21150a0fe1 ("[XFRM] netlink: Inline attach_encap_tmpl(), attach_sec_ctx(), and attach_one_addr()"). Fix it by propagating the error returned by security_xfrm_state_alloc() in this case. Fixes: fd21150a0fe1 ("[XFRM] netlink: Inline attach_encap_tmpl()...") Signed-off-by: Mathias Krause Cc: Thomas Graf Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index cb65d916a345..08892091cfe3 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -581,9 +581,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (err) goto error; - if (attrs[XFRMA_SEC_CTX] && - security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX]))) - goto error; + if (attrs[XFRMA_SEC_CTX]) { + err = security_xfrm_state_alloc(x, + nla_data(attrs[XFRMA_SEC_CTX])); + if (err) + goto error; + } if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) -- cgit v1.2.1 From 1fb81e09d487656aa23f2acb1232c7f56b4c2367 Mon Sep 17 00:00:00 2001 From: "thomas.zeitlhofer+lkml@ze-it.at" Date: Wed, 7 Sep 2016 20:40:38 +0200 Subject: vti: use right inner_mode for inbound inter address family policy checks In case of inter address family tunneling (IPv6 over vti4 or IPv4 over vti6), the inbound policy checks in vti_rcv_cb() and vti6_rcv_cb() are using the wrong address family. As a result, all inbound inter address family traffic is dropped. Use the xfrm_ip2inner_mode() helper, as done in xfrm_input() (i.e., also increment LINUX_MIB_XFRMINSTATEMODEERROR in case of error), to select the inner_mode that contains the right address family for the inbound policy checks. Signed-off-by: Thomas Zeitlhofer Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 15 ++++++++++++++- net/ipv6/ip6_vti.c | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index cc701fa70b12..5d7944f394d9 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -88,6 +88,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; + struct xfrm_mode *inner_mode; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; u32 orig_mark = skb->mark; int ret; @@ -105,7 +106,19 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) } x = xfrm_input_state(skb); - family = x->inner_mode->afinfo->family; + + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + family = inner_mode->afinfo->family; skb->mark = be32_to_cpu(tunnel->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index d90a11f14040..52a2f735881f 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -340,6 +340,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; + struct xfrm_mode *inner_mode; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; u32 orig_mark = skb->mark; int ret; @@ -357,7 +358,19 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) } x = xfrm_input_state(skb); - family = x->inner_mode->afinfo->family; + + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + family = inner_mode->afinfo->family; skb->mark = be32_to_cpu(t->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); -- cgit v1.2.1 From fe01111d23810c0cf6830ce5af1c14c6d3df6dc5 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Tue, 6 Sep 2016 22:33:37 +0800 Subject: netfilter: nft_queue: check the validation of queues_total and queuenum Although the validation of queues_total and queuenum is checked in nft utility, but user can add nft rules via nfnetlink, so it is necessary to check the validation at the nft_queue expr init routine too. Tested by run ./nft-test.py any/queue.t: any/queue.t: 6 unit tests, 0 error, 0 warning Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_queue.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 61d216eb7917..d16d59959ff6 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -65,6 +65,7 @@ static int nft_queue_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_queue *priv = nft_expr_priv(expr); + u32 maxid; if (tb[NFTA_QUEUE_NUM] == NULL) return -EINVAL; @@ -74,6 +75,16 @@ static int nft_queue_init(const struct nft_ctx *ctx, if (tb[NFTA_QUEUE_TOTAL] != NULL) priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL])); + else + priv->queues_total = 1; + + if (priv->queues_total == 0) + return -EINVAL; + + maxid = priv->queues_total - 1 + priv->queuenum; + if (maxid > U16_MAX) + return -ERANGE; + if (tb[NFTA_QUEUE_FLAGS] != NULL) { priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); if (priv->flags & ~NFT_QUEUE_FLAG_MASK) -- cgit v1.2.1 From 83843c80dcf11a78995d167255b03072a1e49c2c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 28 Aug 2016 13:10:37 +0200 Subject: mac80211: fix tim recalculation after PS response Handle the case where the mac80211 intermediate queues are empty and the driver has buffered frames Fixes: ba8c3d6f16a1 ("mac80211: add an intermediate software queue implementation") Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 76b737dcc36f..aa58df80ede0 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1616,7 +1616,6 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, sta_info_recalc_tim(sta); } else { - unsigned long tids = sta->txq_buffered_tids & driver_release_tids; int tid; /* @@ -1648,7 +1647,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); - if (!(tids & BIT(tid)) || txqi->tin.backlog_packets) + if (!(driver_release_tids & BIT(tid)) || + txqi->tin.backlog_packets) continue; sta_info_recalc_tim(sta); -- cgit v1.2.1 From df6ef5d8a87ace995d5c10a7bd684be05911a321 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 4 Sep 2016 18:00:59 +0200 Subject: mac80211: fix sequence number assignment for PS response frames When using intermediate queues, sequence number allocation is deferred until dequeue. This doesn't work for PS response frames, which bypass those queues. Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 65 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 502396694f47..cc8e95554b48 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -796,6 +796,36 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid) return ret; } +static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_txq *txq = NULL; + + if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || + (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) + return NULL; + + if (!ieee80211_is_data(hdr->frame_control)) + return NULL; + + if (pubsta) { + u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; + + txq = pubsta->txq[tid]; + } else if (vif) { + txq = vif->txq; + } + + if (!txq) + return NULL; + + return to_txq_info(txq); +} + static ieee80211_tx_result debug_noinline ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) { @@ -853,7 +883,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) tid = *qc & IEEE80211_QOS_CTL_TID_MASK; tx->sta->tx_stats.msdu[tid]++; - if (!tx->sta->sta.txq[0]) + if (!ieee80211_get_txq(tx->local, info->control.vif, &tx->sta->sta, + tx->skb)) hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); return TX_CONTINUE; @@ -1243,36 +1274,6 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, return TX_CONTINUE; } -static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, - struct ieee80211_vif *vif, - struct ieee80211_sta *pubsta, - struct sk_buff *skb) -{ - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_txq *txq = NULL; - - if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || - (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) - return NULL; - - if (!ieee80211_is_data(hdr->frame_control)) - return NULL; - - if (pubsta) { - u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; - - txq = pubsta->txq[tid]; - } else if (vif) { - txq = vif->txq; - } - - if (!txq) - return NULL; - - return to_txq_info(txq); -} - static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) { IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); @@ -3264,7 +3265,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { *ieee80211_get_qos_ctl(hdr) = tid; - if (!sta->sta.txq[0]) + if (!ieee80211_get_txq(local, &sdata->vif, &sta->sta, skb)) hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); } else { info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; -- cgit v1.2.1 From 5df20f2141eadb5430caaad20eceac61cfe0f139 Mon Sep 17 00:00:00 2001 From: "Pedersen, Thomas" Date: Tue, 6 Sep 2016 11:59:00 -0700 Subject: mac80211: make mpath path fixing more robust A fixed mpath was not quite being treated as such: 1) if a PERR frame was received, a fixed mpath was deactivated. 2) queued path discovery for fixed mpath was potentially being considered, changing mpath state. 3) other mpath flags were potentially being inherited when fixing the mpath. Just assign PATH_FIXED and SN_VALID. This solves several issues when fixing a mesh path in one direction. The reverse direction mpath should probably also be fixed, or root announcements at least be enabled. Signed-off-by: Thomas Pedersen Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 3 ++- net/mac80211/mesh_pathtbl.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 8f9c3bde835f..faccef977670 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -746,6 +746,7 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata, sta = next_hop_deref_protected(mpath); if (mpath->flags & MESH_PATH_ACTIVE && ether_addr_equal(ta, sta->sta.addr) && + !(mpath->flags & MESH_PATH_FIXED) && (!(mpath->flags & MESH_PATH_SN_VALID) || SN_GT(target_sn, mpath->sn) || target_sn == 0)) { mpath->flags &= ~MESH_PATH_ACTIVE; @@ -1012,7 +1013,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) goto enddiscovery; spin_lock_bh(&mpath->state_lock); - if (mpath->flags & MESH_PATH_DELETED) { + if (mpath->flags & (MESH_PATH_DELETED | MESH_PATH_FIXED)) { spin_unlock_bh(&mpath->state_lock); goto enddiscovery; } diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 6db2ddfa0695..f0e6175a9821 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -826,7 +826,7 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop) mpath->metric = 0; mpath->hop_count = 0; mpath->exp_time = 0; - mpath->flags |= MESH_PATH_FIXED; + mpath->flags = MESH_PATH_FIXED | MESH_PATH_SN_VALID; mesh_path_activate(mpath); spin_unlock_bh(&mpath->state_lock); mesh_path_tx_pending(mpath); -- cgit v1.2.1 From 70ca767ea1b2748f45e96192400e515dddbe517c Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Tue, 6 Sep 2016 08:44:19 +0200 Subject: netfilter: nft_hash: Add hash offset value Add support to pass through an offset to the hash value. With this feature, the sysadmin is able to generate a hash with a given offset value. Example: meta mark set jhash ip saddr mod 2 seed 0xabcd offset 100 This option generates marks according to the source address from 100 to 101. Signed-off-by: Laura Garcia Liebana --- net/netfilter/nft_hash.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 764251d31e46..bd12f7a801c2 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -23,6 +23,7 @@ struct nft_hash { u8 len; u32 modulus; u32 seed; + u32 offset; }; static void nft_hash_eval(const struct nft_expr *expr, @@ -31,10 +32,10 @@ static void nft_hash_eval(const struct nft_expr *expr, { struct nft_hash *priv = nft_expr_priv(expr); const void *data = ®s->data[priv->sreg]; + u32 h; - regs->data[priv->dreg] = - reciprocal_scale(jhash(data, priv->len, priv->seed), - priv->modulus); + h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus); + regs->data[priv->dreg] = h + priv->offset; } static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { @@ -59,6 +60,9 @@ static int nft_hash_init(const struct nft_ctx *ctx, !tb[NFTA_HASH_MODULUS]) return -EINVAL; + if (tb[NFTA_HASH_OFFSET]) + priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); + priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]); priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); @@ -72,6 +76,9 @@ static int nft_hash_init(const struct nft_ctx *ctx, if (priv->modulus <= 1) return -ERANGE; + if (priv->offset + priv->modulus - 1 < U32_MAX) + return -EOVERFLOW; + priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED])); return nft_validate_register_load(priv->sreg, len) && @@ -94,7 +101,9 @@ static int nft_hash_dump(struct sk_buff *skb, goto nla_put_failure; if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed))) goto nla_put_failure; - + if (priv->offset != 0) + if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset))) + goto nla_put_failure; return 0; nla_put_failure: -- cgit v1.2.1 From dbd2be0646e3239022630c426cbceefa15714bca Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 7 Sep 2016 12:22:18 +0200 Subject: netfilter: nft_dynset: allow to invert match criteria The dynset expression matches if we can fit a new entry into the set. If there is no room for it, then it breaks the rule evaluation. This patch introduces the inversion flag so you can add rules to explicitly drop packets that don't fit into the set. For example: # nft filter input flow table xyz size 4 { ip saddr timeout 120s counter } overflow drop This is useful to provide a replacement for connlimit. For the rule above, every new entry uses the IPv4 address as key in the set, this entry gets a timeout of 120 seconds that gets refresh on every packet seen. If we get new flow and our set already contains 4 entries already, then this packet is dropped. You can already express this in positive logic, assuming default policy to drop: # nft filter input flow table xyz size 4 { ip saddr timeout 10s counter } accept Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_dynset.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 0af26699bf04..e3b83c31da2e 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -22,6 +22,7 @@ struct nft_dynset { enum nft_dynset_ops op:8; enum nft_registers sreg_key:8; enum nft_registers sreg_data:8; + bool invert; u64 timeout; struct nft_expr *expr; struct nft_set_binding binding; @@ -82,10 +83,14 @@ static void nft_dynset_eval(const struct nft_expr *expr, if (sexpr != NULL) sexpr->ops->eval(sexpr, regs, pkt); + + if (priv->invert) + regs->verdict.code = NFT_BREAK; return; } out: - regs->verdict.code = NFT_BREAK; + if (!priv->invert) + regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { @@ -96,6 +101,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, [NFTA_DYNSET_EXPR] = { .type = NLA_NESTED }, + [NFTA_DYNSET_FLAGS] = { .type = NLA_U32 }, }; static int nft_dynset_init(const struct nft_ctx *ctx, @@ -113,6 +119,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx, tb[NFTA_DYNSET_SREG_KEY] == NULL) return -EINVAL; + if (tb[NFTA_DYNSET_FLAGS]) { + u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS])); + + if (flags & ~NFT_DYNSET_F_INV) + return -EINVAL; + if (flags & NFT_DYNSET_F_INV) + priv->invert = true; + } + set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME], genmask); if (IS_ERR(set)) { @@ -220,6 +235,7 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx, static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_dynset *priv = nft_expr_priv(expr); + u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0; if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key)) goto nla_put_failure; @@ -235,6 +251,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags))) + goto nla_put_failure; return 0; nla_put_failure: -- cgit v1.2.1 From beac5afa2d78605b70f40cf5ab5601ab10659c7f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Sep 2016 12:42:49 +0200 Subject: netfilter: nf_tables: ensure proper initialization of nft_pktinfo fields This patch introduces nft_set_pktinfo_unspec() that ensures proper initialization all of pktinfo fields for non-IP traffic. This is used by the bridge, netdev and arp families. This new function relies on nft_set_pktinfo_proto_unspec() to set a new tprot_set field that indicates if transport protocol information is available. Remain fields are zeroed. The meta expression has been also updated to check to tprot_set in first place given that zero is a valid tprot value. Even a handcrafted packet may come with the IPPROTO_RAW (255) protocol number so we can't rely on this value as tprot unset. Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nf_tables_bridge.c | 6 +++--- net/ipv4/netfilter/nf_tables_arp.c | 2 +- net/netfilter/nf_tables_netdev.c | 4 +++- net/netfilter/nft_meta.c | 2 ++ 4 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index a78c4e2826e5..29899887163e 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -71,7 +71,7 @@ static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt, if (nft_bridge_iphdr_validate(skb)) nft_set_pktinfo_ipv4(pkt, skb, state); else - nft_set_pktinfo(pkt, skb, state); + nft_set_pktinfo_unspec(pkt, skb, state); } static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt, @@ -83,7 +83,7 @@ static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt, nft_set_pktinfo_ipv6(pkt, skb, state) == 0) return; #endif - nft_set_pktinfo(pkt, skb, state); + nft_set_pktinfo_unspec(pkt, skb, state); } static unsigned int @@ -101,7 +101,7 @@ nft_do_chain_bridge(void *priv, nft_bridge_set_pktinfo_ipv6(&pkt, skb, state); break; default: - nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb, state); break; } diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index cd84d4295a20..058c034be376 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -21,7 +21,7 @@ nft_do_chain_arp(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb, state); return nft_do_chain(&pkt, priv); } diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 5eefe4a355c6..8de502b0c37b 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -41,6 +41,7 @@ nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, else if (len < thoff) return; + pkt->tprot_set = true; pkt->tprot = iph->protocol; pkt->xt.thoff = thoff; pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; @@ -74,6 +75,7 @@ __nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, if (protohdr < 0) return; + pkt->tprot_set = true; pkt->tprot = protohdr; pkt->xt.thoff = thoff; pkt->xt.fragoff = frag_off; @@ -102,7 +104,7 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb, nft_netdev_set_pktinfo_ipv6(&pkt, skb, state); break; default: - nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb, state); break; } diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 2863f3493038..14264edf2d77 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -52,6 +52,8 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = pkt->pf; break; case NFT_META_L4PROTO: + if (!pkt->tprot_set) + goto err; *dest = pkt->tprot; break; case NFT_META_PRIORITY: -- cgit v1.2.1 From ddc8b6027ad08d145a6d7a6a6abc00e43f315bd1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Sep 2016 12:42:51 +0200 Subject: netfilter: introduce nft_set_pktinfo_{ipv4, ipv6}_validate() These functions are extracted from the netdev family, they initialize the pktinfo structure and validate that the IPv4 and IPv6 headers are well-formed given that these functions are called from a path where layer 3 sanitization did not happen yet. These functions are placed in include/net/netfilter/nf_tables_ipv{4,6}.h so they can be reused by a follow up patch to use them from the bridge family too. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_netdev.c | 79 +--------------------------------------- 1 file changed, 2 insertions(+), 77 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 8de502b0c37b..3e5475a833a5 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -15,81 +15,6 @@ #include #include -static inline void -nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct iphdr *iph, _iph; - u32 len, thoff; - - nft_set_pktinfo(pkt, skb, state); - - iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), - &_iph); - if (!iph) - return; - - iph = ip_hdr(skb); - if (iph->ihl < 5 || iph->version != 4) - return; - - len = ntohs(iph->tot_len); - thoff = iph->ihl * 4; - if (skb->len < len) - return; - else if (len < thoff) - return; - - pkt->tprot_set = true; - pkt->tprot = iph->protocol; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; -} - -static inline void -__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ -#if IS_ENABLED(CONFIG_IPV6) - struct ipv6hdr *ip6h, _ip6h; - unsigned int thoff = 0; - unsigned short frag_off; - int protohdr; - u32 pkt_len; - - ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), - &_ip6h); - if (!ip6h) - return; - - if (ip6h->version != 6) - return; - - pkt_len = ntohs(ip6h->payload_len); - if (pkt_len + sizeof(*ip6h) > skb->len) - return; - - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); - if (protohdr < 0) - return; - - pkt->tprot_set = true; - pkt->tprot = protohdr; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = frag_off; -#endif -} - -static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - nft_set_pktinfo(pkt, skb, state); - __nft_netdev_set_pktinfo_ipv6(pkt, skb, state); -} - static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -98,10 +23,10 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb, switch (skb->protocol) { case htons(ETH_P_IP): - nft_netdev_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo_ipv4_validate(&pkt, skb, state); break; case htons(ETH_P_IPV6): - nft_netdev_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo_ipv6_validate(&pkt, skb, state); break; default: nft_set_pktinfo_unspec(&pkt, skb, state); -- cgit v1.2.1 From 10151d7b03e23afce76a59f717f2616a10ddef86 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Sep 2016 12:42:52 +0200 Subject: netfilter: nf_tables_bridge: use nft_set_pktinfo_ipv{4, 6}_validate Consolidate pktinfo setup and validation by using the new generic functions so we converge to the netdev family codebase. We only need a linear IPv4 and IPv6 header from the reject expression, so move nft_bridge_iphdr_validate() and nft_bridge_ip6hdr_validate() to net/bridge/netfilter/nft_reject_bridge.c. Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nf_tables_bridge.c | 72 +------------------------------- net/bridge/netfilter/nft_reject_bridge.c | 44 ++++++++++++++++++- 2 files changed, 45 insertions(+), 71 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index 29899887163e..06f0f81456a0 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -13,79 +13,11 @@ #include #include #include -#include #include #include #include #include -int nft_bridge_iphdr_validate(struct sk_buff *skb) -{ - struct iphdr *iph; - u32 len; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return 0; - - iph = ip_hdr(skb); - if (iph->ihl < 5 || iph->version != 4) - return 0; - - len = ntohs(iph->tot_len); - if (skb->len < len) - return 0; - else if (len < (iph->ihl*4)) - return 0; - - if (!pskb_may_pull(skb, iph->ihl*4)) - return 0; - - return 1; -} -EXPORT_SYMBOL_GPL(nft_bridge_iphdr_validate); - -int nft_bridge_ip6hdr_validate(struct sk_buff *skb) -{ - struct ipv6hdr *hdr; - u32 pkt_len; - - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - return 0; - - hdr = ipv6_hdr(skb); - if (hdr->version != 6) - return 0; - - pkt_len = ntohs(hdr->payload_len); - if (pkt_len + sizeof(struct ipv6hdr) > skb->len) - return 0; - - return 1; -} -EXPORT_SYMBOL_GPL(nft_bridge_ip6hdr_validate); - -static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (nft_bridge_iphdr_validate(skb)) - nft_set_pktinfo_ipv4(pkt, skb, state); - else - nft_set_pktinfo_unspec(pkt, skb, state); -} - -static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ -#if IS_ENABLED(CONFIG_IPV6) - if (nft_bridge_ip6hdr_validate(skb) && - nft_set_pktinfo_ipv6(pkt, skb, state) == 0) - return; -#endif - nft_set_pktinfo_unspec(pkt, skb, state); -} - static unsigned int nft_do_chain_bridge(void *priv, struct sk_buff *skb, @@ -95,10 +27,10 @@ nft_do_chain_bridge(void *priv, switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): - nft_bridge_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo_ipv4_validate(&pkt, skb, state); break; case htons(ETH_P_IPV6): - nft_bridge_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo_ipv6_validate(&pkt, skb, state); break; default: nft_set_pktinfo_unspec(&pkt, skb, state); diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 0b77ffbc27d6..4b3df6b0e3b9 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -37,6 +36,30 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb, skb_pull(nskb, ETH_HLEN); } +static int nft_bridge_iphdr_validate(struct sk_buff *skb) +{ + struct iphdr *iph; + u32 len; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return 0; + + iph = ip_hdr(skb); + if (iph->ihl < 5 || iph->version != 4) + return 0; + + len = ntohs(iph->tot_len); + if (skb->len < len) + return 0; + else if (len < (iph->ihl*4)) + return 0; + + if (!pskb_may_pull(skb, iph->ihl*4)) + return 0; + + return 1; +} + /* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT) * or the bridge port (NF_BRIDGE PREROUTING). */ @@ -143,6 +166,25 @@ static void nft_reject_br_send_v4_unreach(struct net *net, br_forward(br_port_get_rcu(dev), nskb, false, true); } +static int nft_bridge_ip6hdr_validate(struct sk_buff *skb) +{ + struct ipv6hdr *hdr; + u32 pkt_len; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + + hdr = ipv6_hdr(skb); + if (hdr->version != 6) + return 0; + + pkt_len = ntohs(hdr->payload_len); + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) + return 0; + + return 1; +} + static void nft_reject_br_send_v6_tcp_reset(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, -- cgit v1.2.1 From 71212c9b04eba76faa4dca26ccd1552d6bb300c1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Sep 2016 12:42:53 +0200 Subject: netfilter: nf_tables: don't drop IPv6 packets that cannot parse transport This is overly conservative and not flexible at all, so better let them go through and let the filtering policy decide what to do with them. We use skb_header_pointer() all over the place so we would just fail to match when trying to access fields from malformed traffic. Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_tables_ipv6.c | 4 +--- net/ipv6/netfilter/nft_chain_route_ipv6.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index 30b22f4dff55..05d05926962a 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -22,9 +22,7 @@ static unsigned int nft_do_chain_ipv6(void *priv, { struct nft_pktinfo pkt; - /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) - return NF_DROP; + nft_set_pktinfo_ipv6(&pkt, skb, state); return nft_do_chain(&pkt, priv); } diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 71d995ff3108..01eb0f658366 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -32,9 +32,7 @@ static unsigned int nf_route_table_hook(void *priv, u_int8_t hop_limit; u32 mark, flowlabel; - /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) - return NF_DROP; + nft_set_pktinfo_ipv6(&pkt, skb, state); /* save source/dest address, mark, hoplimit, flowlabel, priority */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); -- cgit v1.2.1 From cf71c03edf10076f05a0b678fc9c8f8e6c6e24e4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Fri, 9 Sep 2016 14:01:26 +0200 Subject: netfilter: nf_conntrack: simplify __nf_ct_try_assign_helper() return logic Instead of several goto's just to return the result, simply return it. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_helper.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index b989b81ac156..4ffe388a9a1e 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -189,7 +189,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; struct net *net = nf_ct_net(ct); - int ret = 0; /* We already got a helper explicitly attached. The function * nf_conntrack_alter_reply - in case NAT is in use - asks for looking @@ -223,15 +222,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, if (helper == NULL) { if (help) RCU_INIT_POINTER(help->helper, NULL); - goto out; + return 0; } if (help == NULL) { help = nf_ct_helper_ext_add(ct, helper, flags); - if (help == NULL) { - ret = -ENOMEM; - goto out; - } + if (help == NULL) + return -ENOMEM; } else { /* We only allow helper re-assignment of the same sort since * we cannot reallocate the helper extension area. @@ -240,13 +237,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, if (tmp && tmp->help != helper->help) { RCU_INIT_POINTER(help->helper, NULL); - goto out; + return 0; } } rcu_assign_pointer(help->helper, helper); -out: - return ret; + + return 0; } EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); -- cgit v1.2.1 From 4e6577de71803142d01e374cf15664af0388799a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Fri, 9 Sep 2016 23:25:09 +0800 Subject: netfilter: Add the missed return value check of register_netdevice_notifier There are some codes of netfilter module which did not check the return value of register_netdevice_notifier. Add the checks now. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_netdev.c | 18 +++++++++++++----- net/netfilter/nfnetlink_queue.c | 9 ++++++++- net/netfilter/xt_TEE.c | 8 +++++++- 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 3e5475a833a5..38a3e8385042 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -151,12 +151,20 @@ static int __init nf_tables_netdev_init(void) nft_register_chain_type(&nft_filter_chain_netdev); ret = register_pernet_subsys(&nf_tables_netdev_net_ops); - if (ret < 0) { - nft_unregister_chain_type(&nft_filter_chain_netdev); - return ret; - } - register_netdevice_notifier(&nf_tables_netdev_notifier); + if (ret) + goto err1; + + ret = register_netdevice_notifier(&nf_tables_netdev_notifier); + if (ret) + goto err2; + return 0; + +err2: + unregister_pernet_subsys(&nf_tables_netdev_net_ops); +err1: + nft_unregister_chain_type(&nft_filter_chain_netdev); + return ret; } static void __exit nf_tables_netdev_exit(void) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f49f45081acb..808da34f94cd 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1522,9 +1522,16 @@ static int __init nfnetlink_queue_init(void) goto cleanup_netlink_notifier; } - register_netdevice_notifier(&nfqnl_dev_notifier); + status = register_netdevice_notifier(&nfqnl_dev_notifier); + if (status < 0) { + pr_err("nf_queue: failed to register netdevice notifier\n"); + goto cleanup_netlink_subsys; + } + return status; +cleanup_netlink_subsys: + nfnetlink_subsys_unregister(&nfqnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 6e57a3966dc5..0471db4032c5 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -89,6 +89,8 @@ static int tee_tg_check(const struct xt_tgchk_param *par) return -EINVAL; if (info->oif[0]) { + int ret; + if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; @@ -101,7 +103,11 @@ static int tee_tg_check(const struct xt_tgchk_param *par) priv->notifier.notifier_call = tee_netdev_event; info->priv = priv; - register_netdevice_notifier(&priv->notifier); + ret = register_netdevice_notifier(&priv->notifier); + if (ret) { + kfree(priv); + return ret; + } } else info->priv = NULL; -- cgit v1.2.1 From 23d07508d25cea9984ee068538b4e86932b015c2 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Sat, 10 Sep 2016 10:04:30 +0800 Subject: netfilter: Add the missed return value check of nft_register_chain_type There are some codes of netfilter module which did not check the return value of nft_register_chain_type. Add the checks now. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nf_tables_bridge.c | 18 +++++++++++++----- net/ipv4/netfilter/nf_tables_arp.c | 5 ++++- net/ipv4/netfilter/nf_tables_ipv4.c | 5 ++++- net/ipv6/netfilter/nf_tables_ipv6.c | 5 ++++- net/netfilter/nf_tables_inet.c | 5 ++++- net/netfilter/nf_tables_netdev.c | 5 ++++- 6 files changed, 33 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index 06f0f81456a0..97afdc0744e6 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -139,12 +139,20 @@ static int __init nf_tables_bridge_init(void) int ret; nf_register_afinfo(&nf_br_afinfo); - nft_register_chain_type(&filter_bridge); + ret = nft_register_chain_type(&filter_bridge); + if (ret < 0) + goto err1; + ret = register_pernet_subsys(&nf_tables_bridge_net_ops); - if (ret < 0) { - nft_unregister_chain_type(&filter_bridge); - nf_unregister_afinfo(&nf_br_afinfo); - } + if (ret < 0) + goto err2; + + return ret; + +err2: + nft_unregister_chain_type(&filter_bridge); +err1: + nf_unregister_afinfo(&nf_br_afinfo); return ret; } diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 058c034be376..805c8ddfe860 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -80,7 +80,10 @@ static int __init nf_tables_arp_init(void) { int ret; - nft_register_chain_type(&filter_arp); + ret = nft_register_chain_type(&filter_arp); + if (ret < 0) + return ret; + ret = register_pernet_subsys(&nf_tables_arp_net_ops); if (ret < 0) nft_unregister_chain_type(&filter_arp); diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index e44ba3b12fbb..2840a29b2e04 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -103,7 +103,10 @@ static int __init nf_tables_ipv4_init(void) { int ret; - nft_register_chain_type(&filter_ipv4); + ret = nft_register_chain_type(&filter_ipv4); + if (ret < 0) + return ret; + ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); if (ret < 0) nft_unregister_chain_type(&filter_ipv4); diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index 05d05926962a..d6e4ba5de916 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -100,7 +100,10 @@ static int __init nf_tables_ipv6_init(void) { int ret; - nft_register_chain_type(&filter_ipv6); + ret = nft_register_chain_type(&filter_ipv6); + if (ret < 0) + return ret; + ret = register_pernet_subsys(&nf_tables_ipv6_net_ops); if (ret < 0) nft_unregister_chain_type(&filter_ipv6); diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c index 6b5f76295d3d..f713cc205669 100644 --- a/net/netfilter/nf_tables_inet.c +++ b/net/netfilter/nf_tables_inet.c @@ -82,7 +82,10 @@ static int __init nf_tables_inet_init(void) { int ret; - nft_register_chain_type(&filter_inet); + ret = nft_register_chain_type(&filter_inet); + if (ret < 0) + return ret; + ret = register_pernet_subsys(&nf_tables_inet_net_ops); if (ret < 0) nft_unregister_chain_type(&filter_inet); diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 38a3e8385042..9e2ae424b640 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -149,7 +149,10 @@ static int __init nf_tables_netdev_init(void) { int ret; - nft_register_chain_type(&nft_filter_chain_netdev); + ret = nft_register_chain_type(&nft_filter_chain_netdev); + if (ret) + return ret; + ret = register_pernet_subsys(&nf_tables_netdev_net_ops); if (ret) goto err1; -- cgit v1.2.1 From 8e8118f893138d4cc3d4dbf4163d7497fca54a9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 11 Sep 2016 22:55:53 +0200 Subject: netfilter: conntrack: remove packet hotpath stats These counters sit in hot path and do show up in perf, this is especially true for 'found' and 'searched' which get incremented for every packet processed. Information like searched=212030105 new=623431 found=333613 delete=623327 does not seem too helpful nowadays: - on busy systems found and searched will overflow every few hours (these are 32bit integers), other more busy ones every few days. - for debugging there are better methods, such as iptables' trace target, the conntrack log sysctls. Nowadays we also have perf tool. This removes packet path stat counters except those that are expected to be 0 (or close to 0) on a normal system, e.g. 'insert_failed' (race happened) or 'invalid' (proto tracker rejects). The insert stat is retained for the ctnetlink case. The found stat is retained for the tuple-is-taken check when NAT has to determine if it needs to pick a different source address. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 14 ++------------ net/netfilter/nf_conntrack_netlink.c | 6 +----- net/netfilter/nf_conntrack_standalone.c | 8 ++++---- 3 files changed, 7 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ac1db4019d5c..8d1ddb9b63ed 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -379,7 +379,6 @@ static void destroy_conntrack(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - struct net *net = nf_ct_net(ct); struct nf_conntrack_l4proto *l4proto; pr_debug("destroy_conntrack(%p)\n", ct); @@ -406,7 +405,6 @@ destroy_conntrack(struct nf_conntrack *nfct) nf_ct_del_from_dying_or_unconfirmed_list(ct); - NF_CT_STAT_INC(net, delete); local_bh_enable(); if (ct->master) @@ -438,7 +436,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) nf_ct_add_to_dying_list(ct); - NF_CT_STAT_INC(net, delete_list); local_bh_enable(); } @@ -529,11 +526,8 @@ begin: if (nf_ct_is_dying(ct)) continue; - if (nf_ct_key_equal(h, tuple, zone, net)) { - NF_CT_STAT_INC_ATOMIC(net, found); + if (nf_ct_key_equal(h, tuple, zone, net)) return h; - } - NF_CT_STAT_INC_ATOMIC(net, searched); } /* * if the nulls value we got at the end of this lookup is @@ -798,7 +792,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) */ __nf_conntrack_hash_insert(ct, hash, reply_hash); nf_conntrack_double_unlock(hash, reply_hash); - NF_CT_STAT_INC(net, insert); local_bh_enable(); help = nfct_help(ct); @@ -857,7 +850,6 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, rcu_read_unlock(); return 1; } - NF_CT_STAT_INC_ATOMIC(net, searched); } if (get_nulls_value(n) != hash) { @@ -1177,10 +1169,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } spin_unlock(&nf_conntrack_expect_lock); } - if (!exp) { + if (!exp) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); - NF_CT_STAT_INC(net, new); - } /* Now it is inserted into the unconfirmed list, bump refcount */ nf_conntrack_get(&ct->ct_general); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c052b712c49f..27540455dc62 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1984,13 +1984,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(cpu); - if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) || - nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || - nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) || + if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) || - nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) || - nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) || nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || nla_put_be32(skb, CTA_STATS_INSERT_FAILED, htonl(st->insert_failed)) || diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 3d9a316a3c77..7d52f8401afd 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -352,13 +352,13 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, - st->searched, + 0, st->found, - st->new, + 0, st->invalid, st->ignore, - st->delete, - st->delete_list, + 0, + 0, st->insert, st->insert_failed, st->drop, -- cgit v1.2.1 From 2e917d602acd9e3e8c6e4c43b213c8929d986503 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 12 Sep 2016 22:21:36 +0800 Subject: netfilter: nft_numgen: fix race between num generate and store it After we generate a new number, we still use the priv->counter and store it to the dreg. This is not correct, another cpu may already change it to a new number. So we must use the generated number, not the priv->counter itself. Fixes: 91dbc6be0a62 ("netfilter: nf_tables: add number generator expression") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_numgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index f51a3ede3932..f173ebec30a7 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -37,7 +37,7 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, nval = (oval + 1 < priv->modulus) ? oval + 1 : 0; } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval); - memcpy(®s->data[priv->dreg], &priv->counter, sizeof(u32)); + regs->data[priv->dreg] = nval; } static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { -- cgit v1.2.1 From ecfcdfec7e0cc64215a194044305f02a5a836e6d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Sep 2016 15:38:12 +0200 Subject: netfilter: nf_nat: handle NF_DROP from nfnetlink_parse_nat_setup() nf_nat_setup_info() returns NF_* verdicts, so convert them to error codes that is what ctnelink expects. This has passed overlook without having any impact since this nf_nat_setup_info() has always returned NF_ACCEPT so far. Since 870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable"), this is problem. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index de31818417b8..19c081e1b328 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -807,7 +807,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, if (err < 0) return err; - return nf_nat_setup_info(ct, &range, manip); + return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } #else static int -- cgit v1.2.1 From bf2c4b6f9b74c2ee1dd3c050b181e9b9c86fbcdb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Sep 2016 10:50:38 -0400 Subject: svcauth_gss: Revert 64c59a3726f2 ("Remove unnecessary allocation") rsc_lookup steals the passed-in memory to avoid doing an allocation of its own, so we can't just pass in a pointer to memory that someone else is using. If we really want to avoid allocation there then maybe we should preallocate somwhere, or reference count these handles. For now we should revert. On occasion I see this on my server: kernel: kernel BUG at /home/cel/src/linux/linux-2.6/mm/slub.c:3851! kernel: invalid opcode: 0000 [#1] SMP kernel: Modules linked in: cts rpcsec_gss_krb5 sb_edac edac_core x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd btrfs xor iTCO_wdt iTCO_vendor_support raid6_pq pcspkr i2c_i801 i2c_smbus lpc_ich mfd_core mei_me sg mei shpchp wmi ioatdma ipmi_si ipmi_msghandler acpi_pad acpi_power_meter rpcrdma ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm nfsd nfs_acl lockd grace auth_rpcgss sunrpc ip_tables xfs libcrc32c mlx4_ib mlx4_en ib_core sr_mod cdrom sd_mod ast drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm crc32c_intel igb mlx4_core ahci libahci libata ptp pps_core dca i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod kernel: CPU: 7 PID: 145 Comm: kworker/7:2 Not tainted 4.8.0-rc4-00006-g9d06b0b #15 kernel: Hardware name: Supermicro Super Server/X10SRL-F, BIOS 1.0c 09/09/2015 kernel: Workqueue: events do_cache_clean [sunrpc] kernel: task: ffff8808541d8000 task.stack: ffff880854344000 kernel: RIP: 0010:[] [] kfree+0x155/0x180 kernel: RSP: 0018:ffff880854347d70 EFLAGS: 00010246 kernel: RAX: ffffea0020fe7660 RBX: ffff88083f9db064 RCX: 146ff0f9d5ec5600 kernel: RDX: 000077ff80000000 RSI: ffff880853f01500 RDI: ffff88083f9db064 kernel: RBP: ffff880854347d88 R08: ffff8808594ee000 R09: ffff88087fdd8780 kernel: R10: 0000000000000000 R11: ffffea0020fe76c0 R12: ffff880853f01500 kernel: R13: ffffffffa013cf76 R14: ffffffffa013cff0 R15: ffffffffa04253a0 kernel: FS: 0000000000000000(0000) GS:ffff88087fdc0000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 00007fed60b020c3 CR3: 0000000001c06000 CR4: 00000000001406e0 kernel: Stack: kernel: ffff8808589f2f00 ffff880853f01500 0000000000000001 ffff880854347da0 kernel: ffffffffa013cf76 ffff8808589f2f00 ffff880854347db8 ffffffffa013d006 kernel: ffff8808589f2f20 ffff880854347e00 ffffffffa0406f60 0000000057c7044f kernel: Call Trace: kernel: [] rsc_free+0x16/0x90 [auth_rpcgss] kernel: [] rsc_put+0x16/0x30 [auth_rpcgss] kernel: [] cache_clean+0x2e0/0x300 [sunrpc] kernel: [] do_cache_clean+0xe/0x70 [sunrpc] kernel: [] process_one_work+0x1ff/0x3b0 kernel: [] worker_thread+0x2bc/0x4a0 kernel: [] ? rescuer_thread+0x3a0/0x3a0 kernel: [] kthread+0xe4/0xf0 kernel: [] ret_from_fork+0x1f/0x40 kernel: [] ? kthread_stop+0x110/0x110 kernel: Code: f7 ff ff eb 3b 65 8b 05 da 30 e2 7e 89 c0 48 0f a3 05 a0 38 b8 00 0f 92 c0 84 c0 0f 85 d1 fe ff ff 0f 1f 44 00 00 e9 f5 fe ff ff <0f> 0b 49 8b 03 31 f6 f6 c4 40 0f 85 62 ff ff ff e9 61 ff ff ff kernel: RIP [] kfree+0x155/0x180 kernel: RSP kernel: ---[ end trace 3fdec044969def26 ]--- It seems to be most common after a server reboot where a client has been using a Kerberos mount, and reconnects to continue its workload. Signed-off-by: Chuck Lever Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 1d281816f2bf..d8582028b346 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -569,9 +569,10 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle) struct rsc *found; memset(&rsci, 0, sizeof(rsci)); - rsci.handle.data = handle->data; - rsci.handle.len = handle->len; + if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) + return NULL; found = rsc_lookup(cd, &rsci); + rsc_free(&rsci); if (!found) return NULL; if (cache_check(cd, &found->h, NULL)) -- cgit v1.2.1 From 14e2dee0996f51e0ff0d868497c7e1b90f012665 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Tue, 13 Sep 2016 10:21:46 +0200 Subject: netfilter: nft_hash: fix hash overflow validation The overflow validation in the init() function establishes that the maximum value that the hash could reach is less than U32_MAX, which is likely to be true. The fix detects the overflow when the maximum hash value is less than the offset itself. Fixes: 70ca767ea1b2 ("netfilter: nft_hash: Add hash offset value") Reported-by: Liping Zhang Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_hash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index bd12f7a801c2..09473b415b95 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -76,7 +76,7 @@ static int nft_hash_init(const struct nft_ctx *ctx, if (priv->modulus <= 1) return -ERANGE; - if (priv->offset + priv->modulus - 1 < U32_MAX) + if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED])); -- cgit v1.2.1 From 4440a2ab3b9f40dddbe006331ef0659c76859296 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 13 Sep 2016 08:49:18 +0800 Subject: netfilter: synproxy: Check oom when adding synproxy and seqadj ct extensions When memory is exhausted, nfct_seqadj_ext_add may fail to add the synproxy and seqadj extensions. The function nf_ct_seqadj_init doesn't check if get valid seqadj pointer by the nfct_seqadj. Now drop the packet directly when fail to add seqadj extension to avoid dereference NULL pointer in nf_ct_seqadj_init from init_conntrack(). Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 6 +++--- net/netfilter/nf_nat_core.c | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index dd2c43abf9e2..9934b0c93c1e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1035,9 +1035,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (IS_ERR(ct)) return (struct nf_conntrack_tuple_hash *)ct; - if (tmpl && nfct_synproxy(tmpl)) { - nfct_seqadj_ext_add(ct); - nfct_synproxy_ext_add(ct); + if (!nf_ct_add_synproxy(ct, tmpl)) { + nf_conntrack_free(ct); + return ERR_PTR(-ENOMEM); } timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 19c081e1b328..ecee105bbada 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -441,7 +441,8 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_DST_NAT; if (nfct_help(ct)) - nfct_seqadj_ext_add(ct); + if (!nfct_seqadj_ext_add(ct)) + return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { -- cgit v1.2.1 From 715f5552b1e90ba3eecf6d1a6d044d0d5226663f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 10 Sep 2016 23:11:23 +0800 Subject: sctp: hold the transport before using it in sctp_hash_cmp Since commit 4f0087812648 ("sctp: apply rhashtable api to send/recv path"), sctp uses transport rhashtable with .obj_cmpfn sctp_hash_cmp, in which it compares the members of the transport with the rhashtable args to check if it's the right transport. But sctp uses the transport without holding it in sctp_hash_cmp, it can cause a use-after-free panic. As after it gets transport from hashtable, another CPU may close the sk and free the asoc. In sctp_association_free, it frees all the transports, meanwhile, the assoc's refcnt may be reduced to 0, assoc can be destroyed by sctp_association_destroy. So after that, transport->assoc is actually an unavailable memory address in sctp_hash_cmp. Although sctp_hash_cmp is under rcu_read_lock, it still can not avoid this, as assoc is not freed by RCU. This patch is to hold the transport before checking it's members with sctp_transport_hold, in which it checks the refcnt first, holds it if it's not 0. Fixes: 4f0087812648 ("sctp: apply rhashtable api to send/recv path") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 69444d32ecda..1555fb8c68e0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -796,27 +796,34 @@ struct sctp_hash_cmp_arg { static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { + struct sctp_transport *t = (struct sctp_transport *)ptr; const struct sctp_hash_cmp_arg *x = arg->key; - const struct sctp_transport *t = ptr; - struct sctp_association *asoc = t->asoc; - const struct net *net = x->net; + struct sctp_association *asoc; + int err = 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) - return 1; - if (!net_eq(sock_net(asoc->base.sk), net)) - return 1; + return err; + if (!sctp_transport_hold(t)) + return err; + + asoc = t->asoc; + if (!net_eq(sock_net(asoc->base.sk), x->net)) + goto out; if (x->ep) { if (x->ep != asoc->ep) - return 1; + goto out; } else { if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port)) - return 1; + goto out; if (!sctp_bind_addr_match(&asoc->base.bind_addr, x->laddr, sctp_sk(asoc->base.sk))) - return 1; + goto out; } - return 0; + err = 0; +out: + sctp_transport_put(t); + return err; } static inline u32 sctp_hash_obj(const void *data, u32 len, u32 seed) -- cgit v1.2.1 From ad5987b47e96a0fb6d13fea250e936aed000093c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 13 Sep 2016 15:53:55 +0200 Subject: nl80211: validate number of probe response CSA counters Due to an apparent copy/paste bug, the number of counters for the beacon configuration were checked twice, instead of checking the number of probe response counters. Fix this to check the number of probe response counters before parsing those. Cc: stable@vger.kernel.org Fixes: 9a774c78e211 ("cfg80211: Support multiple CSA counters") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f02653a08993..4809f4d2cdcc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6978,7 +6978,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) params.n_counter_offsets_presp = len / sizeof(u16); if (rdev->wiphy.max_num_csa_counters && - (params.n_counter_offsets_beacon > + (params.n_counter_offsets_presp > rdev->wiphy.max_num_csa_counters)) return -EINVAL; -- cgit v1.2.1 From 0b97a484e52cb423662eb98904aad82dafcc1f10 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 09:41:34 +0200 Subject: mac80211: check skb_linearize() return value The A-MSDU TX code (within TXQs) didn't always check the return value of skb_linearize() properly, resulting in potentially passing a frag- list SKB down to the driver even when it said it can't handle it. Fix that. Fixes: 6e0456b545456 ("mac80211: add A-MSDU tx support") Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index cc8e95554b48..18b285e06bc8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1515,8 +1515,12 @@ out: spin_unlock_bh(&fq->lock); if (skb && skb_has_frag_list(skb) && - !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) - skb_linearize(skb); + !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) { + if (skb_linearize(skb)) { + ieee80211_free_txskb(&local->hw, skb); + return NULL; + } + } return skb; } -- cgit v1.2.1 From 85d5313ed717ad60769491c7c072d23bc0a68e7a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Sep 2016 11:38:31 +0200 Subject: mac80211: reject TSPEC TIDs (TSIDs) for aggregation Since mac80211 doesn't currently support TSIDs 8-15 which can only be used after QoS TSPEC negotiation (and not even after WMM negotiation), reject attempts to set up aggregation sessions for them, which might confuse drivers. In mac80211 we do correctly handle that, but the TSIDs should never get used anyway, and drivers might not be able to handle it. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 8 +++++++- net/mac80211/agg-tx.c | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index a9aff6079c42..afa94687d5e1 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -261,10 +261,16 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, .timeout = timeout, .ssn = start_seq_num, }; - int i, ret = -EOPNOTSUPP; u16 status = WLAN_STATUS_REQUEST_DECLINED; + if (tid >= IEEE80211_FIRST_TSPEC_TSID) { + ht_dbg(sta->sdata, + "STA %pM requests BA session on unsupported tid %d\n", + sta->sta.addr, tid); + goto end_no_lock; + } + if (!sta->sta.ht_cap.ht_supported) { ht_dbg(sta->sdata, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 5650c46bf91a..45319cc01121 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -584,6 +584,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) return -EINVAL; + if (WARN_ON(tid >= IEEE80211_FIRST_TSPEC_TSID)) + return -EINVAL; + ht_dbg(sdata, "Open BA session requested for %pM tid %u\n", pubsta->addr, tid); -- cgit v1.2.1 From d6f64d725bac20df66b2eacd847fc41d7a1905e0 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Thu, 15 Sep 2016 11:40:05 +1200 Subject: net: VRF: Pass original iif to ip_route_input() The function ip_rcv_finish() calls l3mdev_ip_rcv(). On any VRF except the global VRF, this replaces skb->dev with the VRF master interface. When calling ip_route_input_noref() from here, the checks for forwarding look at this master device instead of the initial ingress interface. This will allow packets to be routed which normally would be dropped. For example, an interface that is not assigned an IP address should drop packets, but because the checking is against the master device, the packet will be forwarded. The fix here is to still call l3mdev_ip_rcv(), but remember the initial net_device. This is passed to the other functions within ip_rcv_finish, so they still see the original interface. Signed-off-by: Mark Tomlinson Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 4b351af3e67b..d6feabb03516 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -312,6 +312,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; + struct net_device *dev = skb->dev; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -341,7 +342,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (!skb_valid_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); + iph->tos, dev); if (unlikely(err)) { if (err == -EXDEV) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); @@ -370,7 +371,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * -- cgit v1.2.1 From fabf9201806255d70386d8bc9f6a2942c0940da2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:11 +0100 Subject: rxrpc: Remove some whitespace. Remove a tab that's on a line that should otherwise be blank. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 61432049869b..9367c3be31eb 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -31,7 +31,7 @@ static void rxrpc_set_timer(struct rxrpc_call *call) _enter("{%ld,%ld,%ld:%ld}", call->ack_at - now, call->resend_at - now, call->expire_at - now, call->timer.expires - now); - + read_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) { -- cgit v1.2.1 From 4b22457c06a3a950e14938c486283ad0f308c13d Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:11 +0100 Subject: rxrpc: Move the check of rx_pkt_offset from rxrpc_locate_data() to caller Move the check of rx_pkt_offset from rxrpc_locate_data() to the caller, rxrpc_recvmsg_data(), so that it's more clear what's going on there. Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a284205b8ecf..0d085f5cf1bf 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -240,9 +240,6 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, int ret; u8 annotation = *_annotation; - if (offset > 0) - return 0; - /* Locate the subpacket */ offset = sp->offset; len = skb->len - sp->offset; @@ -303,8 +300,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (msg) sock_recv_timestamp(msg, sock->sk, skb); - ret = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], - &rx_pkt_offset, &rx_pkt_len); + if (rx_pkt_offset == 0) + ret = rxrpc_locate_data(call, skb, + &call->rxtx_annotations[ix], + &rx_pkt_offset, &rx_pkt_len); _debug("recvmsg %x DATA #%u { %d, %d }", sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); -- cgit v1.2.1 From 2e2ea51dec2ab6a81950d4b436eb66ebf45dd507 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:11 +0100 Subject: rxrpc: Check the return value of rxrpc_locate_data() Check the return value of rxrpc_locate_data() in rxrpc_recvmsg_data(). Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 0d085f5cf1bf..1edf2cf62cc5 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -300,10 +300,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (msg) sock_recv_timestamp(msg, sock->sk, skb); - if (rx_pkt_offset == 0) + if (rx_pkt_offset == 0) { ret = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], &rx_pkt_offset, &rx_pkt_len); + if (ret < 0) + goto out; + } _debug("recvmsg %x DATA #%u { %d, %d }", sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); -- cgit v1.2.1 From 816c9fce12f3745abc959c0fca8ace1c2c51421c Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:11 +0100 Subject: rxrpc: Fix handling of the last packet in rxrpc_recvmsg_data() The code for determining the last packet in rxrpc_recvmsg_data() has been using the RXRPC_CALL_RX_LAST flag to determine if the rx_top pointer points to the last packet or not. This isn't a good idea, however, as the input code may be running simultaneously on another CPU and that sets the flag *before* updating the top pointer. Fix this by the following means: (1) Restrict the use of RXRPC_CALL_RX_LAST to the input routines only. There's otherwise a synchronisation problem between detecting the flag and checking tx_top. This could probably be dealt with by appropriate application of memory barriers, but there's a simpler way. (2) Set RXRPC_CALL_RX_LAST after setting rx_top. (3) Make rxrpc_rotate_rx_window() consult the flags header field of the DATA packet it's about to discard to see if that was the last packet. Use this as the basis for ending the Rx phase. This shouldn't be a problem because the recvmsg side of things is guaranteed to see the packets in order. (4) Make rxrpc_recvmsg_data() return 1 to indicate the end of the data if: (a) the packet it has just processed is marked as RXRPC_LAST_PACKET (b) the call's Rx phase has been ended. Signed-off-by: David Howells --- net/rxrpc/input.c | 4 +++- net/rxrpc/recvmsg.c | 49 +++++++++++++++++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 75af0bd316c7..f0d9115b9b7e 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -238,7 +238,7 @@ next_subpacket: len = RXRPC_JUMBO_DATALEN; if (flags & RXRPC_LAST_PACKET) { - if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) && + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && seq != call->rx_top) return rxrpc_proto_abort("LSN", call, seq); } else { @@ -282,6 +282,8 @@ next_subpacket: call->rxtx_buffer[ix] = skb; if (after(seq, call->rx_top)) smp_store_release(&call->rx_top, seq); + if (flags & RXRPC_LAST_PACKET) + set_bit(RXRPC_CALL_RX_LAST, &call->flags); queued = true; if (after_eq(seq, call->rx_expect_next)) { diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 1edf2cf62cc5..8b8d7e14f800 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -134,6 +134,8 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) { _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); + ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); + if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); @@ -163,8 +165,10 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) */ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) { + struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t hard_ack, top; + u8 flags; int ix; _enter("%d", call->debug_id); @@ -177,6 +181,8 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) ix = hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; rxrpc_see_skb(skb); + sp = rxrpc_skb(skb); + flags = sp->hdr.flags; call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; /* Barrier against rxrpc_input_data(). */ @@ -184,8 +190,8 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_free_skb(skb); - _debug("%u,%u,%lx", hard_ack, top, call->flags); - if (hard_ack == top && test_bit(RXRPC_CALL_RX_LAST, &call->flags)) + _debug("%u,%u,%02x", hard_ack, top, flags); + if (flags & RXRPC_LAST_PACKET) rxrpc_end_rx_phase(call); } @@ -278,13 +284,19 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, size_t remain; bool last; unsigned int rx_pkt_offset, rx_pkt_len; - int ix, copy, ret = 0; + int ix, copy, ret = -EAGAIN, ret2; _enter(""); rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; + if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) { + seq = call->rx_hard_ack; + ret = 1; + goto done; + } + /* Barriers against rxrpc_input_data(). */ hard_ack = call->rx_hard_ack; top = smp_load_acquire(&call->rx_top); @@ -301,11 +313,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, sock_recv_timestamp(msg, sock->sk, skb); if (rx_pkt_offset == 0) { - ret = rxrpc_locate_data(call, skb, - &call->rxtx_annotations[ix], - &rx_pkt_offset, &rx_pkt_len); - if (ret < 0) + ret2 = rxrpc_locate_data(call, skb, + &call->rxtx_annotations[ix], + &rx_pkt_offset, &rx_pkt_len); + if (ret2 < 0) { + ret = ret2; goto out; + } } _debug("recvmsg %x DATA #%u { %d, %d }", sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); @@ -316,10 +330,12 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (copy > remain) copy = remain; if (copy > 0) { - ret = skb_copy_datagram_iter(skb, rx_pkt_offset, iter, - copy); - if (ret < 0) + ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter, + copy); + if (ret2 < 0) { + ret = ret2; goto out; + } /* handle piecemeal consumption of data packets */ _debug("copied %d @%zu", copy, *_offset); @@ -332,6 +348,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (rx_pkt_len > 0) { _debug("buffer full"); ASSERTCMP(*_offset, ==, len); + ret = 0; break; } @@ -342,19 +359,19 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rx_pkt_offset = 0; rx_pkt_len = 0; - ASSERTIFCMP(last, seq, ==, top); - } - - if (after(seq, top)) { - ret = -EAGAIN; - if (test_bit(RXRPC_CALL_RX_LAST, &call->flags)) + if (last) { + ASSERTCMP(seq, ==, READ_ONCE(call->rx_top)); ret = 1; + goto out; + } } + out: if (!(flags & MSG_PEEK)) { call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; } +done: _leave(" = %d [%u/%u]", ret, seq, top); return ret; } -- cgit v1.2.1 From e6f3afb3fc058e17b407b6f7cac08058b19e641c Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:11 +0100 Subject: rxrpc: Record calls that need to be accepted Record calls that need to be accepted using sk_acceptq_added() otherwise the backlog counter goes negative because sk_acceptq_removed() is called. This causes the preallocator to malfunction. Calls that are preaccepted by AFS within the kernel aren't affected by this. Signed-off-by: David Howells --- net/rxrpc/call_accept.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 26c293ef98eb..323b8da50163 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -369,6 +369,8 @@ found_service: if (rx->notify_new_call) rx->notify_new_call(&rx->sk, call, call->user_call_ID); + else + sk_acceptq_added(&rx->sk); spin_lock(&conn->state_lock); switch (conn->state) { -- cgit v1.2.1 From 0360da6db7d6390e7bd2f6c93b01af29bcd36ad5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:11 +0100 Subject: rxrpc: Purge the to_be_accepted queue on socket release Purge the queue of to_be_accepted calls on socket release. Note that purging sock_calls doesn't release the ref owned by to_be_accepted. Probably the sock_calls list is redundant given a purges of the recvmsg_q, the to_be_accepted queue and the calls tree. Signed-off-by: David Howells --- net/rxrpc/call_object.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 22f9b0d1a138..b0ffbd9664e6 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -476,6 +476,16 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) _enter("%p", rx); + while (!list_empty(&rx->to_be_accepted)) { + call = list_entry(rx->to_be_accepted.next, + struct rxrpc_call, accept_link); + list_del(&call->accept_link); + rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + } + while (!list_empty(&rx->sock_calls)) { call = list_entry(rx->sock_calls.next, struct rxrpc_call, sock_link); -- cgit v1.2.1 From 66d58af7f4af53e8318e852efa31a7cb0e31bfb6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Fix the putting of client connections In rxrpc_put_one_client_conn(), if a connection has RXRPC_CONN_COUNTED set on it, then it's accounted for in rxrpc_nr_client_conns and may be on various lists - and this is cleaned up correctly. However, if the connection doesn't have RXRPC_CONN_COUNTED set on it, then the put routine returns rather than just skipping the extra bit of cleanup. Fix this by making the extra bit of clean up conditional instead and always killing off the connection. This manifests itself as connections with a zero usage count hanging around in /proc/net/rxrpc_conns because the connection allocated, but discarded, due to a race with another process that set up a parallel connection, which was then shared instead. Signed-off-by: David Howells --- net/rxrpc/conn_client.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 9344a8416ceb..5a675c43cace 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -818,7 +818,7 @@ idle_connection: static struct rxrpc_connection * rxrpc_put_one_client_conn(struct rxrpc_connection *conn) { - struct rxrpc_connection *next; + struct rxrpc_connection *next = NULL; struct rxrpc_local *local = conn->params.local; unsigned int nr_conns; @@ -834,24 +834,22 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE); - if (!test_bit(RXRPC_CONN_COUNTED, &conn->flags)) - return NULL; - - spin_lock(&rxrpc_client_conn_cache_lock); - nr_conns = --rxrpc_nr_client_conns; + if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) { + spin_lock(&rxrpc_client_conn_cache_lock); + nr_conns = --rxrpc_nr_client_conns; + + if (nr_conns < rxrpc_max_client_connections && + !list_empty(&rxrpc_waiting_client_conns)) { + next = list_entry(rxrpc_waiting_client_conns.next, + struct rxrpc_connection, cache_link); + rxrpc_get_connection(next); + rxrpc_activate_conn(next); + } - next = NULL; - if (nr_conns < rxrpc_max_client_connections && - !list_empty(&rxrpc_waiting_client_conns)) { - next = list_entry(rxrpc_waiting_client_conns.next, - struct rxrpc_connection, cache_link); - rxrpc_get_connection(next); - rxrpc_activate_conn(next); + spin_unlock(&rxrpc_client_conn_cache_lock); } - spin_unlock(&rxrpc_client_conn_cache_lock); rxrpc_kill_connection(conn); - if (next) rxrpc_activate_channels(next); -- cgit v1.2.1 From 357f5ef64628c2d6c532e7a6bfc0bc3830b4c221 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Call rxrpc_release_call() on error in rxrpc_new_client_call() Call rxrpc_release_call() on getting an error in rxrpc_new_client_call() rather than trying to do the cleanup ourselves. This isn't a problem, provided we set RXRPC_CALL_HAS_USERID only if we actually add the call to the calls tree as cleanup code fragments that would otherwise cause problems are conditional. Without this, we miss some of the cleanup. Signed-off-by: David Howells --- net/rxrpc/call_object.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index b0ffbd9664e6..23f5a5f58282 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -226,9 +226,6 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, (const void *)user_call_ID); /* Publish the call, even though it is incompletely set up as yet */ - call->user_call_ID = user_call_ID; - __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); - write_lock(&rx->call_lock); pp = &rx->calls.rb_node; @@ -242,10 +239,12 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, else if (user_call_ID > xcall->user_call_ID) pp = &(*pp)->rb_right; else - goto found_user_ID_now_present; + goto error_dup_user_ID; } rcu_assign_pointer(call->socket, rx); + call->user_call_ID = user_call_ID; + __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); @@ -276,33 +275,22 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, _leave(" = %p [new]", call); return call; -error: - write_lock(&rx->call_lock); - rb_erase(&call->sock_node, &rx->calls); - write_unlock(&rx->call_lock); - rxrpc_put_call(call, rxrpc_call_put_userid); - - write_lock(&rxrpc_call_lock); - list_del_init(&call->link); - write_unlock(&rxrpc_call_lock); - -error_out: - __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - RX_CALL_DEAD, ret); - set_bit(RXRPC_CALL_RELEASED, &call->flags); - rxrpc_put_call(call, rxrpc_call_put); - _leave(" = %d", ret); - return ERR_PTR(ret); - /* We unexpectedly found the user ID in the list after taking * the call_lock. This shouldn't happen unless the user races * with itself and tries to add the same user ID twice at the * same time in different threads. */ -found_user_ID_now_present: +error_dup_user_ID: write_unlock(&rx->call_lock); ret = -EEXIST; - goto error_out; + +error: + __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, + RX_CALL_DEAD, ret); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + _leave(" = %d", ret); + return ERR_PTR(ret); } /* -- cgit v1.2.1 From 78883793f8ac4bb3f97d48db7a8c71d8476bcf98 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Fix unexposed client conn release If the last call on a client connection is release after the connection has had a bunch of calls allocated but before any DATA packets are sent (so that it's not yet marked RXRPC_CONN_EXPOSED), an assertion will happen in rxrpc_disconnect_client_call(). af_rxrpc: Assertion failed - 1(0x1) >= 2(0x2) is false ------------[ cut here ]------------ kernel BUG at ../net/rxrpc/conn_client.c:753! This is because it's expecting the conn to have been exposed and to have 2 or more refs - but this isn't necessarily the case. Simply remove the assertion. This allows the conn to be moved into the inactive state and deleted if it isn't resurrected before the final put is called. Signed-off-by: David Howells --- net/rxrpc/conn_client.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 5a675c43cace..226bc910e556 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -721,7 +721,6 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) } ASSERTCMP(rcu_access_pointer(chan->call), ==, call); - ASSERTCMP(atomic_read(&conn->usage), >=, 2); /* If a client call was exposed to the world, we save the result for * retransmission. -- cgit v1.2.1 From d01dc4c3c1209e865368d5f8d3b5e08f97326ca9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Fix the parsing of soft-ACKs The soft-ACK parser doesn't increment the pointer into the soft-ACK list, resulting in the first ACK/NACK value being applied to all the relevant packets in the Tx queue. This has the potential to miss retransmissions and cause excessive retransmissions. Fix this by incrementing the pointer. Signed-off-by: David Howells --- net/rxrpc/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index f0d9115b9b7e..c1f83d22f9b7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -384,7 +384,7 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, for (; nr_acks > 0; nr_acks--, seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; - switch (*acks) { + switch (*acks++) { case RXRPC_ACK_TYPE_ACK: call->rxtx_annotations[ix] = RXRPC_TX_ANNO_ACK; break; -- cgit v1.2.1 From dfa7d9204054b091949d87270e55e0fd5800c3ae Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Fix retransmission algorithm Make the retransmission algorithm use for-loops instead of do-loops and move the counter increments into the for-statement increment slots. Though the do-loops are slighly more efficient since there will be at least one pass through the each loop, the counter increments are harder to get right as the continue-statements skip them. Without this, if there are any positive acks within the loop, the do-loop will cycle forever because the counter increment is never done. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 9367c3be31eb..f0cabc48a1b7 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -163,8 +163,7 @@ static void rxrpc_resend(struct rxrpc_call *call) */ now = jiffies; resend_at = now + rxrpc_resend_timeout; - seq = cursor + 1; - do { + for (seq = cursor + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; annotation = call->rxtx_annotations[ix]; if (annotation == RXRPC_TX_ANNO_ACK) @@ -184,8 +183,7 @@ static void rxrpc_resend(struct rxrpc_call *call) /* Okay, we need to retransmit a packet. */ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; - seq++; - } while (before_eq(seq, top)); + } call->resend_at = resend_at; @@ -194,8 +192,7 @@ static void rxrpc_resend(struct rxrpc_call *call) * lock is dropped, it may clear some of the retransmission markers for * packets that it soft-ACKs. */ - seq = cursor + 1; - do { + for (seq = cursor + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; annotation = call->rxtx_annotations[ix]; if (annotation != RXRPC_TX_ANNO_RETRANS) @@ -237,8 +234,7 @@ static void rxrpc_resend(struct rxrpc_call *call) if (after(call->tx_hard_ack, seq)) seq = call->tx_hard_ack; - seq++; - } while (before_eq(seq, top)); + } out_unlock: spin_unlock_bh(&call->lock); -- cgit v1.2.1 From 27d0fc431c6b4847231c1490fa541bc3f5a7a351 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Don't transmit an ACK if there's no reason set Don't transmit an ACK if call->ackr_reason in unset. There's the possibility of a race between recvmsg() sending an ACK and the background processing thread trying to send the same one. Signed-off-by: David Howells --- net/rxrpc/output.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 06a9aca739d1..aa0507214b31 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -137,6 +137,11 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) switch (type) { case RXRPC_PACKET_TYPE_ACK: spin_lock_bh(&call->lock); + if (!call->ackr_reason) { + spin_unlock_bh(&call->lock); + ret = 0; + goto out; + } n = rxrpc_fill_out_ack(call, pkt); call->ackr_reason = 0; -- cgit v1.2.1 From 2311e327cda015a24a201efc7655a9a983679e55 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Be consistent about switch value in rxrpc_send_call_packet() rxrpc_send_call_packet() should use type in both its switch-statements rather than using pkt->whdr.type. This might give the compiler an easier job of uninitialised variable checking. Signed-off-by: David Howells --- net/rxrpc/output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index aa0507214b31..0b21ed859de7 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -182,7 +182,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) &msg, iov, ioc, len); if (ret < 0 && call->state < RXRPC_CALL_COMPLETE) { - switch (pkt->whdr.type) { + switch (type) { case RXRPC_PACKET_TYPE_ACK: rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), -- cgit v1.2.1 From 182f50562490e5861afaa7a2e42dcc0dd9dcfcca Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:12 +0100 Subject: rxrpc: Fix the basic transmit DATA packet content size at 1412 bytes Fix the basic transmit DATA packet content size at 1412 bytes so that they can be arbitrarily assembled into jumbo packets. In the future, I'm thinking of moving to keeping a jumbo packet header at the beginning of each packet in the Tx queue and creating the packet header on the spot when kernel_sendmsg() is invoked. That way, jumbo packets can be assembled on the spur of the moment for (re-)transmission. Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index cba236575073..8bfddf4e338c 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -214,7 +214,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, goto maybe_error; } - max = call->conn->params.peer->maxdata; + max = RXRPC_JUMBO_DATALEN; max -= call->conn->security_size; max &= ~(call->conn->size_align - 1UL); -- cgit v1.2.1 From a3868bfc8d5b0f36c784deab644ee1d2b0e6974b Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Print the packet type name in the Rx packet trace Print a symbolic packet type name for each valid received packet in the trace output, not just a number. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e78c40b37db5..0f6fafa2c271 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -551,6 +551,9 @@ enum rxrpc_call_trace { extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; +extern const char *const rxrpc_pkts[]; +extern const char *rxrpc_acks(u8 reason); + #include /* @@ -851,11 +854,8 @@ extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; extern unsigned int rxrpc_resend_timeout; -extern const char *const rxrpc_pkts[]; extern const s8 rxrpc_ack_priority[]; -extern const char *rxrpc_acks(u8 reason); - /* * output.c */ -- cgit v1.2.1 From a84a46d73050f70fd8820c74840e2815c78d8690 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:14 +0100 Subject: rxrpc: Add some additional call tracing Add additional call tracepoint points for noting call-connected, call-released and connection-failed events. Also fix one tracepoint that was using an integer instead of the corresponding enum value as the point type. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 3 +++ net/rxrpc/call_object.c | 18 ++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 0f6fafa2c271..4a73c20d9436 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -539,6 +539,8 @@ enum rxrpc_call_trace { rxrpc_call_queued, rxrpc_call_queued_ref, rxrpc_call_seen, + rxrpc_call_connected, + rxrpc_call_release, rxrpc_call_got, rxrpc_call_got_userid, rxrpc_call_got_kernel, @@ -546,6 +548,7 @@ enum rxrpc_call_trace { rxrpc_call_put_userid, rxrpc_call_put_kernel, rxrpc_call_put_noqueue, + rxrpc_call_error, rxrpc_call__nr_trace }; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 23f5a5f58282..0df9d1af8edb 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -53,6 +53,8 @@ const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { [rxrpc_call_new_service] = "NWs", [rxrpc_call_queued] = "QUE", [rxrpc_call_queued_ref] = "QUR", + [rxrpc_call_connected] = "CON", + [rxrpc_call_release] = "RLS", [rxrpc_call_seen] = "SEE", [rxrpc_call_got] = "GOT", [rxrpc_call_got_userid] = "Gus", @@ -61,6 +63,7 @@ const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { [rxrpc_call_put_userid] = "Pus", [rxrpc_call_put_kernel] = "Pke", [rxrpc_call_put_noqueue] = "PNQ", + [rxrpc_call_error] = "*E*", }; struct kmem_cache *rxrpc_call_jar; @@ -222,8 +225,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } - trace_rxrpc_call(call, 0, atomic_read(&call->usage), here, - (const void *)user_call_ID); + trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), + here, (const void *)user_call_ID); /* Publish the call, even though it is incompletely set up as yet */ write_lock(&rx->call_lock); @@ -263,6 +266,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, if (ret < 0) goto error; + trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage), + here, ERR_PTR(ret)); + spin_lock_bh(&call->conn->params.peer->lock); hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets); @@ -287,6 +293,8 @@ error_dup_user_ID: error: __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_CALL_DEAD, ret); + trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), + here, ERR_PTR(ret)); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); @@ -396,15 +404,17 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) */ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { + const void *here = __builtin_return_address(0); struct rxrpc_connection *conn = call->conn; bool put = false; int i; _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); + trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage), + here, (const void *)call->flags); - rxrpc_see_call(call); + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); spin_lock_bh(&call->lock); if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) -- cgit v1.2.1 From 363deeab6d0f308d33d011323661ae9cf5f9f8d6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:14 +0100 Subject: rxrpc: Add connection tracepoint and client conn state tracepoint Add a pair of tracepoints, one to track rxrpc_connection struct ref counting and the other to track the client connection cache state. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 76 +++++++++++++++++++++++++++----------------- net/rxrpc/call_accept.c | 4 +++ net/rxrpc/call_object.c | 2 -- net/rxrpc/conn_client.c | 82 +++++++++++++++++++++++++++++++++--------------- net/rxrpc/conn_event.c | 2 +- net/rxrpc/conn_object.c | 72 ++++++++++++++++++++++++++++++++++++++++-- net/rxrpc/conn_service.c | 4 +++ net/rxrpc/misc.c | 31 ++++++++++++++++++ 8 files changed, 214 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 4a73c20d9436..6ca40eea3022 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -314,6 +314,7 @@ enum rxrpc_conn_cache_state { RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */ RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */ RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */ + RXRPC_CONN__NR_CACHE_STATES }; /* @@ -533,6 +534,44 @@ struct rxrpc_call { rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ }; +enum rxrpc_conn_trace { + rxrpc_conn_new_client, + rxrpc_conn_new_service, + rxrpc_conn_queued, + rxrpc_conn_seen, + rxrpc_conn_got, + rxrpc_conn_put_client, + rxrpc_conn_put_service, + rxrpc_conn__nr_trace +}; + +extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4]; + +enum rxrpc_client_trace { + rxrpc_client_activate_chans, + rxrpc_client_alloc, + rxrpc_client_chan_activate, + rxrpc_client_chan_disconnect, + rxrpc_client_chan_pass, + rxrpc_client_chan_unstarted, + rxrpc_client_cleanup, + rxrpc_client_count, + rxrpc_client_discard, + rxrpc_client_duplicate, + rxrpc_client_exposed, + rxrpc_client_replace, + rxrpc_client_to_active, + rxrpc_client_to_culled, + rxrpc_client_to_idle, + rxrpc_client_to_inactive, + rxrpc_client_to_waiting, + rxrpc_client_uncount, + rxrpc_client__nr_trace +}; + +extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7]; +extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5]; + enum rxrpc_call_trace { rxrpc_call_new_client, rxrpc_call_new_service, @@ -734,7 +773,11 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_connection(struct rxrpc_connection *); -void __rxrpc_put_connection(struct rxrpc_connection *); +bool rxrpc_queue_conn(struct rxrpc_connection *); +void rxrpc_see_connection(struct rxrpc_connection *); +void rxrpc_get_connection(struct rxrpc_connection *); +struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *); +void rxrpc_put_service_conn(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) @@ -747,38 +790,15 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) return !rxrpc_conn_is_client(conn); } -static inline void rxrpc_get_connection(struct rxrpc_connection *conn) -{ - atomic_inc(&conn->usage); -} - -static inline -struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn) -{ - return atomic_inc_not_zero(&conn->usage) ? conn : NULL; -} - static inline void rxrpc_put_connection(struct rxrpc_connection *conn) { if (!conn) return; - if (rxrpc_conn_is_client(conn)) { - if (atomic_dec_and_test(&conn->usage)) - rxrpc_put_client_conn(conn); - } else { - if (atomic_dec_return(&conn->usage) == 1) - __rxrpc_put_connection(conn); - } -} - -static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) -{ - if (!rxrpc_get_connection_maybe(conn)) - return false; - if (!rxrpc_queue_work(&conn->processor)) - rxrpc_put_connection(conn); - return true; + if (rxrpc_conn_is_client(conn)) + rxrpc_put_client_conn(conn); + else + rxrpc_put_service_conn(conn); } /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 323b8da50163..3e474508ba75 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -85,6 +85,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, b->conn_backlog[head] = conn; smp_store_release(&b->conn_backlog_head, (head + 1) & (size - 1)); + + trace_rxrpc_conn(conn, rxrpc_conn_new_service, + atomic_read(&conn->usage), here); } /* Now it gets complicated, because calls get registered with the @@ -290,6 +293,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, rxrpc_get_local(local); conn->params.local = local; conn->params.peer = peer; + rxrpc_see_connection(conn); rxrpc_new_incoming_connection(conn, skb); } else { rxrpc_get_connection(conn); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 0df9d1af8edb..54f30482a7fd 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -479,8 +479,6 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) struct rxrpc_call, accept_link); list_del(&call->accept_link); rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET); - rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); - rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 226bc910e556..c76a125df891 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -105,6 +105,14 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *); static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap, rxrpc_discard_expired_client_conns); +const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = { + [RXRPC_CONN_CLIENT_INACTIVE] = "Inac", + [RXRPC_CONN_CLIENT_WAITING] = "Wait", + [RXRPC_CONN_CLIENT_ACTIVE] = "Actv", + [RXRPC_CONN_CLIENT_CULLED] = "Cull", + [RXRPC_CONN_CLIENT_IDLE] = "Idle", +}; + /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. The @@ -220,6 +228,9 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) rxrpc_get_local(conn->params.local); key_get(conn->params.key); + trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage), + __builtin_return_address(0)); + trace_rxrpc_client(conn, -1, rxrpc_client_alloc); _leave(" = %p", conn); return conn; @@ -385,6 +396,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call, rb_replace_node(&conn->client_node, &candidate->client_node, &local->client_conns); + trace_rxrpc_client(conn, -1, rxrpc_client_replace); goto candidate_published; } } @@ -409,8 +421,11 @@ found_extant_conn: _debug("found conn"); spin_unlock(&local->client_conns_lock); - rxrpc_put_connection(candidate); - candidate = NULL; + if (candidate) { + trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate); + rxrpc_put_connection(candidate); + candidate = NULL; + } spin_lock(&conn->channel_lock); call->conn = conn; @@ -433,6 +448,7 @@ error: */ static void rxrpc_activate_conn(struct rxrpc_connection *conn) { + trace_rxrpc_client(conn, -1, rxrpc_client_to_active); conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE; rxrpc_nr_active_client_conns++; list_move_tail(&conn->cache_link, &rxrpc_active_client_conns); @@ -462,8 +478,10 @@ static void rxrpc_animate_client_conn(struct rxrpc_connection *conn) spin_lock(&rxrpc_client_conn_cache_lock); nr_conns = rxrpc_nr_client_conns; - if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) + if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) { + trace_rxrpc_client(conn, -1, rxrpc_client_count); rxrpc_nr_client_conns = nr_conns + 1; + } switch (conn->cache_state) { case RXRPC_CONN_CLIENT_ACTIVE: @@ -494,6 +512,7 @@ activate_conn: wait_for_capacity: _debug("wait"); + trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); conn->cache_state = RXRPC_CONN_CLIENT_WAITING; list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns); goto out_unlock; @@ -524,6 +543,8 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, struct rxrpc_call, chan_wait_link); u32 call_id = chan->call_counter + 1; + trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); + write_lock_bh(&call->state_lock); call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; write_unlock_bh(&call->state_lock); @@ -563,6 +584,8 @@ static void rxrpc_activate_channels(struct rxrpc_connection *conn) _enter("%d", conn->debug_id); + trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans); + if (conn->cache_state != RXRPC_CONN_CLIENT_ACTIVE || conn->active_chans == RXRPC_ACTIVE_CHANS_MASK) return; @@ -657,10 +680,13 @@ int rxrpc_connect_call(struct rxrpc_call *call, * had a chance at re-use (the per-connection security negotiation is * expensive). */ -static void rxrpc_expose_client_conn(struct rxrpc_connection *conn) +static void rxrpc_expose_client_conn(struct rxrpc_connection *conn, + unsigned int channel) { - if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) + if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { + trace_rxrpc_client(conn, channel, rxrpc_client_exposed); rxrpc_get_connection(conn); + } } /* @@ -669,9 +695,9 @@ static void rxrpc_expose_client_conn(struct rxrpc_connection *conn) */ void rxrpc_expose_client_call(struct rxrpc_call *call) { + unsigned int channel = call->cid & RXRPC_CHANNELMASK; struct rxrpc_connection *conn = call->conn; - struct rxrpc_channel *chan = - &conn->channels[call->cid & RXRPC_CHANNELMASK]; + struct rxrpc_channel *chan = &conn->channels[channel]; if (!test_and_set_bit(RXRPC_CALL_EXPOSED, &call->flags)) { /* Mark the call ID as being used. If the callNumber counter @@ -682,7 +708,7 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) chan->call_counter++; if (chan->call_counter >= INT_MAX) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - rxrpc_expose_client_conn(conn); + rxrpc_expose_client_conn(conn, channel); } } @@ -695,6 +721,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) struct rxrpc_connection *conn = call->conn; struct rxrpc_channel *chan = &conn->channels[channel]; + trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); call->conn = NULL; spin_lock(&conn->channel_lock); @@ -709,6 +736,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); list_del_init(&call->chan_wait_link); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_unstarted); + /* We must deactivate or idle the connection if it's now * waiting for nothing. */ @@ -739,7 +768,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) /* See if we can pass the channel directly to another call. */ if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE && !list_empty(&conn->waiting_calls)) { - _debug("pass chan"); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); goto out_2; } @@ -762,7 +791,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) goto out; } - _debug("pass chan 2"); + trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); goto out; @@ -794,7 +823,7 @@ idle_connection: * immediately or moved to the idle list for a short while. */ if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { - _debug("make idle"); + trace_rxrpc_client(conn, channel, rxrpc_client_to_idle); conn->idle_timestamp = jiffies; conn->cache_state = RXRPC_CONN_CLIENT_IDLE; list_move_tail(&conn->cache_link, &rxrpc_idle_client_conns); @@ -804,7 +833,7 @@ idle_connection: &rxrpc_client_conn_reap, rxrpc_conn_idle_client_expiry); } else { - _debug("make inactive"); + trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive); conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; list_del_init(&conn->cache_link); } @@ -821,6 +850,8 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) struct rxrpc_local *local = conn->params.local; unsigned int nr_conns; + trace_rxrpc_client(conn, -1, rxrpc_client_cleanup); + if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) { spin_lock(&local->client_conns_lock); if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, @@ -834,6 +865,7 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE); if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) { + trace_rxrpc_client(conn, -1, rxrpc_client_uncount); spin_lock(&rxrpc_client_conn_cache_lock); nr_conns = --rxrpc_nr_client_conns; @@ -863,20 +895,18 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) */ void rxrpc_put_client_conn(struct rxrpc_connection *conn) { - struct rxrpc_connection *next; + const void *here = __builtin_return_address(0); + int n; do { - _enter("%p{u=%d,d=%d}", - conn, atomic_read(&conn->usage), conn->debug_id); - - next = rxrpc_put_one_client_conn(conn); - - if (!next) - break; - conn = next; - } while (atomic_dec_and_test(&conn->usage)); - - _leave(""); + n = atomic_dec_return(&conn->usage); + trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here); + if (n > 0) + return; + ASSERTCMP(n, >=, 0); + + conn = rxrpc_put_one_client_conn(conn); + } while (conn); } /* @@ -907,9 +937,11 @@ static void rxrpc_cull_active_client_conns(void) ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_ACTIVE); if (list_empty(&conn->waiting_calls)) { + trace_rxrpc_client(conn, -1, rxrpc_client_to_culled); conn->cache_state = RXRPC_CONN_CLIENT_CULLED; list_del_init(&conn->cache_link); } else { + trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); conn->cache_state = RXRPC_CONN_CLIENT_WAITING; list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns); @@ -983,7 +1015,7 @@ next: goto not_yet_expired; } - _debug("discard conn %d", conn->debug_id); + trace_rxrpc_client(conn, -1, rxrpc_client_discard); if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags)) BUG(); conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 0691007cfc02..a43f4c94a88d 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -377,7 +377,7 @@ void rxrpc_process_connection(struct work_struct *work) u32 abort_code = RX_PROTOCOL_ERROR; int ret; - _enter("{%d}", conn->debug_id); + rxrpc_see_connection(conn); if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) rxrpc_secure_connection(conn); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index bb1f29280aea..3b55aee0c436 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -246,11 +246,77 @@ void rxrpc_kill_connection(struct rxrpc_connection *conn) } /* - * release a virtual connection + * Queue a connection's work processor, getting a ref to pass to the work + * queue. */ -void __rxrpc_put_connection(struct rxrpc_connection *conn) +bool rxrpc_queue_conn(struct rxrpc_connection *conn) { - rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + const void *here = __builtin_return_address(0); + int n = __atomic_add_unless(&conn->usage, 1, 0); + if (n == 0) + return false; + if (rxrpc_queue_work(&conn->processor)) + trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here); + else + rxrpc_put_connection(conn); + return true; +} + +/* + * Note the re-emergence of a connection. + */ +void rxrpc_see_connection(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + if (conn) { + int n = atomic_read(&conn->usage); + + trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here); + } +} + +/* + * Get a ref on a connection. + */ +void rxrpc_get_connection(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(&conn->usage); + + trace_rxrpc_conn(conn, rxrpc_conn_got, n, here); +} + +/* + * Try to get a ref on a connection. + */ +struct rxrpc_connection * +rxrpc_get_connection_maybe(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + + if (conn) { + int n = __atomic_add_unless(&conn->usage, 1, 0); + if (n > 0) + trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here); + else + conn = NULL; + } + return conn; +} + +/* + * Release a service connection + */ +void rxrpc_put_service_conn(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + int n; + + n = atomic_dec_return(&conn->usage); + trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); + ASSERTCMP(n, >=, 0); + if (n == 0) + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); } /* diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 83d54da4ce8b..eef551f40dc2 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -136,6 +136,10 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) list_add_tail(&conn->link, &rxrpc_connections); list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); write_unlock(&rxrpc_connection_lock); + + trace_rxrpc_conn(conn, rxrpc_conn_new_service, + atomic_read(&conn->usage), + __builtin_return_address(0)); } return conn; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 8b910780f1ac..598064d3bdd2 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -101,3 +101,34 @@ const char *rxrpc_acks(u8 reason) reason = ARRAY_SIZE(str) - 1; return str[reason]; } + +const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = { + [rxrpc_conn_new_client] = "NWc", + [rxrpc_conn_new_service] = "NWs", + [rxrpc_conn_queued] = "QUE", + [rxrpc_conn_seen] = "SEE", + [rxrpc_conn_got] = "GOT", + [rxrpc_conn_put_client] = "PTc", + [rxrpc_conn_put_service] = "PTs", +}; + +const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = { + [rxrpc_client_activate_chans] = "Activa", + [rxrpc_client_alloc] = "Alloc ", + [rxrpc_client_chan_activate] = "ChActv", + [rxrpc_client_chan_disconnect] = "ChDisc", + [rxrpc_client_chan_pass] = "ChPass", + [rxrpc_client_chan_unstarted] = "ChUnst", + [rxrpc_client_cleanup] = "Clean ", + [rxrpc_client_count] = "Count ", + [rxrpc_client_discard] = "Discar", + [rxrpc_client_duplicate] = "Duplic", + [rxrpc_client_exposed] = "Expose", + [rxrpc_client_replace] = "Replac", + [rxrpc_client_to_active] = "->Actv", + [rxrpc_client_to_culled] = "->Cull", + [rxrpc_client_to_idle] = "->Idle", + [rxrpc_client_to_inactive] = "->Inac", + [rxrpc_client_to_waiting] = "->Wait", + [rxrpc_client_uncount] = "Uncoun", +}; -- cgit v1.2.1 From a124fe3ee5d82f2c9a9b8818ed5cb9f61685f1d3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to follow the life of a packet in the Tx buffer Add a tracepoint to follow the insertion of a packet into the transmit buffer, its transmission and its rotation out of the buffer. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 12 ++++++++++++ net/rxrpc/input.c | 2 ++ net/rxrpc/misc.c | 9 +++++++++ net/rxrpc/sendmsg.c | 9 ++++++++- 4 files changed, 31 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6ca40eea3022..afa5dcc05fe0 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -593,6 +593,18 @@ enum rxrpc_call_trace { extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; +enum rxrpc_transmit_trace { + rxrpc_transmit_wait, + rxrpc_transmit_queue, + rxrpc_transmit_queue_reqack, + rxrpc_transmit_queue_last, + rxrpc_transmit_rotate, + rxrpc_transmit_end, + rxrpc_transmit__nr_trace +}; + +extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c1f83d22f9b7..c7eb5104e91a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -59,6 +59,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) spin_unlock(&call->lock); + trace_rxrpc_transmit(call, rxrpc_transmit_rotate); wake_up(&call->waitq); while (list) { @@ -107,6 +108,7 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) } write_unlock(&call->state_lock); + trace_rxrpc_transmit(call, rxrpc_transmit_end); _leave(" = ok"); return true; } diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 598064d3bdd2..dca89995f03e 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -132,3 +132,12 @@ const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = { [rxrpc_client_to_waiting] = "->Wait", [rxrpc_client_uncount] = "Uncoun", }; + +const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = { + [rxrpc_transmit_wait] = "WAI", + [rxrpc_transmit_queue] = "QUE", + [rxrpc_transmit_queue_reqack] = "QRA", + [rxrpc_transmit_queue_last] = "QLS", + [rxrpc_transmit_rotate] = "ROT", + [rxrpc_transmit_end] = "END", +}; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 8bfddf4e338c..28d8f73cf11d 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -56,6 +56,7 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, break; } + trace_rxrpc_transmit(call, rxrpc_transmit_wait); release_sock(&rx->sk); *timeo = schedule_timeout(*timeo); lock_sock(&rx->sk); @@ -104,8 +105,14 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, smp_wmb(); call->rxtx_buffer[ix] = skb; call->tx_top = seq; - if (last) + if (last) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); + trace_rxrpc_transmit(call, rxrpc_transmit_queue_last); + } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { + trace_rxrpc_transmit(call, rxrpc_transmit_queue_reqack); + } else { + trace_rxrpc_transmit(call, rxrpc_transmit_queue); + } if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { _debug("________awaiting reply/ACK__________"); -- cgit v1.2.1 From ec71eb9ada34f8d1a58b7c35d906c59411295445 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to log received ACK packets Add a tracepoint to log information from received ACK packets. Signed-off-by: David Howells --- net/rxrpc/input.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c7eb5104e91a..7b18ca124978 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -440,6 +440,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; + trace_rxrpc_rx_ack(call, first_soft_ack, buf.ack.reason, nr_acks); + _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", sp->hdr.serial, ntohs(buf.ack.maxSkew), -- cgit v1.2.1 From f3639df2d90bc919328c459b3c7c49ed5667a52f Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to log ACK transmission Add a tracepoint to log information about ACK transmission. Signed-off-by: David Howels --- net/rxrpc/conn_event.c | 3 +++ net/rxrpc/output.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a43f4c94a88d..9b19c51831aa 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -98,6 +98,9 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.info.rwind = htonl(rxrpc_rx_window_size); pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); len += sizeof(pkt.ack) + sizeof(pkt.info); + + trace_rxrpc_tx_ack(NULL, chan->last_seq, 0, + RXRPC_ACK_DUPLICATE, 0); break; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 0b21ed859de7..2c9daeadce87 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -38,12 +38,14 @@ struct rxrpc_pkt_buffer { static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, struct rxrpc_pkt_buffer *pkt) { + rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top, seq; int ix; u32 mtu, jmax; u8 *ackp = pkt->acks; /* Barrier against rxrpc_input_data(). */ + serial = call->ackr_serial; hard_ack = READ_ONCE(call->rx_hard_ack); top = smp_load_acquire(&call->rx_top); @@ -51,7 +53,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ack.maxSkew = htons(call->ackr_skew); pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_prev_seq); - pkt->ack.serial = htonl(call->ackr_serial); + pkt->ack.serial = htonl(serial); pkt->ack.reason = call->ackr_reason; pkt->ack.nAcks = top - hard_ack; @@ -75,6 +77,9 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ackinfo.rwind = htonl(call->rx_winsize); pkt->ackinfo.jumbo_max = htonl(jmax); + trace_rxrpc_tx_ack(call, hard_ack + 1, serial, call->ackr_reason, + top - hard_ack); + *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; -- cgit v1.2.1 From 58dc63c998ea3c5a27e2bf9251eddbf0977056a6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:13 +0100 Subject: rxrpc: Add a tracepoint to follow packets in the Rx buffer Add a tracepoint to follow the life of packets that get added to a call's receive buffer. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 12 ++++++++++++ net/rxrpc/call_accept.c | 3 +++ net/rxrpc/input.c | 6 +++++- net/rxrpc/misc.c | 9 +++++++++ net/rxrpc/recvmsg.c | 11 +++++++++++ 5 files changed, 40 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index afa5dcc05fe0..e5d2f2fb8e41 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -605,6 +605,18 @@ enum rxrpc_transmit_trace { extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4]; +enum rxrpc_receive_trace { + rxrpc_receive_incoming, + rxrpc_receive_queue, + rxrpc_receive_queue_last, + rxrpc_receive_front, + rxrpc_receive_rotate, + rxrpc_receive_end, + rxrpc_receive__nr_trace +}; + +extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 3e474508ba75..a8d39d7cf42c 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -367,6 +367,9 @@ found_service: goto out; } + trace_rxrpc_receive(call, rxrpc_receive_incoming, + sp->hdr.serial, sp->hdr.seq); + /* Make the call live. */ rxrpc_incoming_call(rx, call, skb); conn = call->conn; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 7b18ca124978..b690220533c6 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -284,8 +284,12 @@ next_subpacket: call->rxtx_buffer[ix] = skb; if (after(seq, call->rx_top)) smp_store_release(&call->rx_top, seq); - if (flags & RXRPC_LAST_PACKET) + if (flags & RXRPC_LAST_PACKET) { set_bit(RXRPC_CALL_RX_LAST, &call->flags); + trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); + } else { + trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq); + } queued = true; if (after_eq(seq, call->rx_expect_next)) { diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index dca89995f03e..db5f1d54fc90 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -141,3 +141,12 @@ const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = { [rxrpc_transmit_rotate] = "ROT", [rxrpc_transmit_end] = "END", }; + +const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = { + [rxrpc_receive_incoming] = "INC", + [rxrpc_receive_queue] = "QUE", + [rxrpc_receive_queue_last] = "QLS", + [rxrpc_receive_front] = "FRN", + [rxrpc_receive_rotate] = "ROT", + [rxrpc_receive_end] = "END", +}; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 8b8d7e14f800..22d51087c580 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -134,6 +134,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) { _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); + trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top); ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { @@ -167,6 +168,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; + rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; u8 flags; int ix; @@ -183,6 +185,10 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_see_skb(skb); sp = rxrpc_skb(skb); flags = sp->hdr.flags; + serial = sp->hdr.serial; + if (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) + serial += (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - 1; + call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; /* Barrier against rxrpc_input_data(). */ @@ -191,6 +197,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_free_skb(skb); _debug("%u,%u,%02x", hard_ack, top, flags); + trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); if (flags & RXRPC_LAST_PACKET) rxrpc_end_rx_phase(call); } @@ -309,6 +316,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rxrpc_see_skb(skb); sp = rxrpc_skb(skb); + if (!(flags & MSG_PEEK)) + trace_rxrpc_receive(call, rxrpc_receive_front, + sp->hdr.serial, seq); + if (msg) sock_recv_timestamp(msg, sock->sk, skb); -- cgit v1.2.1 From 849979051cbc9352857d8bb31895ae55afe19d96 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 11:13:31 +0100 Subject: rxrpc: Add a tracepoint to follow what recvmsg does Add a tracepoint to follow what recvmsg does within AF_RXRPC. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 17 +++++++++++++++++ net/rxrpc/misc.c | 14 ++++++++++++++ net/rxrpc/recvmsg.c | 34 ++++++++++++++++++++++++++-------- 3 files changed, 57 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e5d2f2fb8e41..a17341d2df3d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -617,6 +617,23 @@ enum rxrpc_receive_trace { extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4]; +enum rxrpc_recvmsg_trace { + rxrpc_recvmsg_enter, + rxrpc_recvmsg_wait, + rxrpc_recvmsg_dequeue, + rxrpc_recvmsg_hole, + rxrpc_recvmsg_next, + rxrpc_recvmsg_cont, + rxrpc_recvmsg_full, + rxrpc_recvmsg_data_return, + rxrpc_recvmsg_terminal, + rxrpc_recvmsg_to_be_accepted, + rxrpc_recvmsg_return, + rxrpc_recvmsg__nr_trace +}; + +extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index db5f1d54fc90..c7065d893d1e 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -150,3 +150,17 @@ const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = { [rxrpc_receive_rotate] = "ROT", [rxrpc_receive_end] = "END", }; + +const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = { + [rxrpc_recvmsg_enter] = "ENTR", + [rxrpc_recvmsg_wait] = "WAIT", + [rxrpc_recvmsg_dequeue] = "DEQU", + [rxrpc_recvmsg_hole] = "HOLE", + [rxrpc_recvmsg_next] = "NEXT", + [rxrpc_recvmsg_cont] = "CONT", + [rxrpc_recvmsg_full] = "FULL", + [rxrpc_recvmsg_data_return] = "DATA", + [rxrpc_recvmsg_terminal] = "TERM", + [rxrpc_recvmsg_to_be_accepted] = "TBAC", + [rxrpc_recvmsg_return] = "RETN", +}; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 22d51087c580..b62a08151895 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -94,6 +94,8 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) break; } + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_terminal, call->rx_hard_ack, + call->rx_pkt_offset, call->rx_pkt_len, ret); return ret; } @@ -124,6 +126,7 @@ static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); } + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_to_be_accepted, 1, 0, 0, ret); return ret; } @@ -310,8 +313,11 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, for (seq = hard_ack + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - if (!skb) + if (!skb) { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq, + rx_pkt_offset, rx_pkt_len, 0); break; + } smp_rmb(); rxrpc_see_skb(skb); sp = rxrpc_skb(skb); @@ -327,10 +333,15 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, ret2 = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], &rx_pkt_offset, &rx_pkt_len); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq, + rx_pkt_offset, rx_pkt_len, ret2); if (ret2 < 0) { ret = ret2; goto out; } + } else { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq, + rx_pkt_offset, rx_pkt_len, 0); } _debug("recvmsg %x DATA #%u { %d, %d }", sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); @@ -357,6 +368,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, } if (rx_pkt_len > 0) { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq, + rx_pkt_offset, rx_pkt_len, 0); _debug("buffer full"); ASSERTCMP(*_offset, ==, len); ret = 0; @@ -383,6 +396,8 @@ out: call->rx_pkt_len = rx_pkt_len; } done: + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, + rx_pkt_offset, rx_pkt_len, ret); _leave(" = %d [%u/%u]", ret, seq, top); return ret; } @@ -404,7 +419,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, DEFINE_WAIT(wait); - _enter(",,,%zu,%d", len, flags); + trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0, 0, 0, 0); if (flags & (MSG_OOB | MSG_TRUNC)) return -EOPNOTSUPP; @@ -424,8 +439,10 @@ try_again: if (list_empty(&rx->recvmsg_q)) { ret = -EWOULDBLOCK; - if (timeo == 0) + if (timeo == 0) { + call = NULL; goto error_no_call; + } release_sock(&rx->sk); @@ -439,6 +456,8 @@ try_again: if (list_empty(&rx->recvmsg_q)) { if (signal_pending(current)) goto wait_interrupted; + trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait, + 0, 0, 0, 0); timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(&rx->sk), &wait); @@ -457,7 +476,7 @@ try_again: rxrpc_get_call(call, rxrpc_call_got); write_unlock_bh(&rx->recvmsg_lock); - _debug("recvmsg call %p", call); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0); if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); @@ -527,16 +546,15 @@ error: rxrpc_put_call(call, rxrpc_call_put); error_no_call: release_sock(&rx->sk); - _leave(" = %d", ret); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); return ret; wait_interrupted: ret = sock_intr_errno(timeo); wait_error: finish_wait(sk_sleep(&rx->sk), &wait); - release_sock(&rx->sk); - _leave(" = %d [wait]", ret); - return ret; + call = NULL; + goto error_no_call; } /** -- cgit v1.2.1 From ba39f3a0ed756ccd882adf4a77916ec863db3ce4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:14 +0100 Subject: rxrpc: Remove printks from rxrpc_recvmsg_data() to fix uninit var Remove _enter/_debug/_leave calls from rxrpc_recvmsg_data() of which one uses an uninitialised variable. Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index b62a08151895..79e65668bc58 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -296,8 +296,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, unsigned int rx_pkt_offset, rx_pkt_len; int ix, copy, ret = -EAGAIN, ret2; - _enter(""); - rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; @@ -343,8 +341,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq, rx_pkt_offset, rx_pkt_len, 0); } - _debug("recvmsg %x DATA #%u { %d, %d }", - sp->hdr.callNumber, seq, rx_pkt_offset, rx_pkt_len); /* We have to handle short, empty and used-up DATA packets. */ remain = len - *_offset; @@ -360,8 +356,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, } /* handle piecemeal consumption of data packets */ - _debug("copied %d @%zu", copy, *_offset); - rx_pkt_offset += copy; rx_pkt_len -= copy; *_offset += copy; @@ -370,7 +364,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (rx_pkt_len > 0) { trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq, rx_pkt_offset, rx_pkt_len, 0); - _debug("buffer full"); ASSERTCMP(*_offset, ==, len); ret = 0; break; @@ -398,7 +391,6 @@ out: done: trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, rx_pkt_offset, rx_pkt_len, ret); - _leave(" = %d [%u/%u]", ret, seq, top); return ret; } -- cgit v1.2.1 From 71f3ca408fd43b586c02480768a503af075b247e Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:14 +0100 Subject: rxrpc: Improve skb tracing Improve sk_buff tracing within AF_RXRPC by the following means: (1) Use an enum to note the event type rather than plain integers and use an array of event names rather than a big multi ?: list. (2) Distinguish Rx from Tx packets and account them separately. This requires the call phase to be tracked so that we know what we might find in rxtx_buffer[]. (3) Add a parameter to rxrpc_{new,see,get,free}_skb() to indicate the event type. (4) A pair of 'rotate' events are added to indicate packets that are about to be rotated out of the Rx and Tx windows. (5) A pair of 'lost' events are added, along with rxrpc_lose_skb() for packet loss injection recording. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 5 +++-- net/rxrpc/ar-internal.h | 33 +++++++++++++++++++++++++----- net/rxrpc/call_event.c | 8 ++++---- net/rxrpc/call_object.c | 11 +++++++--- net/rxrpc/conn_event.c | 6 +++--- net/rxrpc/input.c | 13 ++++++------ net/rxrpc/local_event.c | 4 ++-- net/rxrpc/misc.c | 18 +++++++++++++++++ net/rxrpc/output.c | 4 ++-- net/rxrpc/peer_event.c | 10 +++++----- net/rxrpc/recvmsg.c | 7 ++++--- net/rxrpc/sendmsg.c | 10 +++++----- net/rxrpc/skbuff.c | 53 +++++++++++++++++++++++++++++++++++-------------- 13 files changed, 127 insertions(+), 55 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 09f81befc705..8dbf7bed2cc4 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -45,7 +45,7 @@ u32 rxrpc_epoch; atomic_t rxrpc_debug_id; /* count of skbs currently in use */ -atomic_t rxrpc_n_skbs; +atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; struct workqueue_struct *rxrpc_workqueue; @@ -867,7 +867,8 @@ static void __exit af_rxrpc_exit(void) proto_unregister(&rxrpc_proto); rxrpc_destroy_all_calls(); rxrpc_destroy_all_connections(); - ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); + ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0); + ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0); rxrpc_destroy_all_locals(); remove_proc_entry("rxrpc_conns", init_net.proc_net); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a17341d2df3d..034f525f2235 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -520,6 +520,7 @@ struct rxrpc_call { rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ u8 rx_winsize; /* Size of Rx window */ u8 tx_winsize; /* Maximum size of Tx window */ + bool tx_phase; /* T if transmission phase, F if receive phase */ u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */ /* receive-phase ACK management */ @@ -534,6 +535,27 @@ struct rxrpc_call { rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ }; +enum rxrpc_skb_trace { + rxrpc_skb_rx_cleaned, + rxrpc_skb_rx_freed, + rxrpc_skb_rx_got, + rxrpc_skb_rx_lost, + rxrpc_skb_rx_received, + rxrpc_skb_rx_rotated, + rxrpc_skb_rx_purged, + rxrpc_skb_rx_seen, + rxrpc_skb_tx_cleaned, + rxrpc_skb_tx_freed, + rxrpc_skb_tx_got, + rxrpc_skb_tx_lost, + rxrpc_skb_tx_new, + rxrpc_skb_tx_rotated, + rxrpc_skb_tx_seen, + rxrpc_skb__nr_trace +}; + +extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7]; + enum rxrpc_conn_trace { rxrpc_conn_new_client, rxrpc_conn_new_service, @@ -642,7 +664,7 @@ extern const char *rxrpc_acks(u8 reason); /* * af_rxrpc.c */ -extern atomic_t rxrpc_n_skbs; +extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; extern u32 rxrpc_epoch; extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; @@ -1000,10 +1022,11 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); */ void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_packet_destructor(struct sk_buff *); -void rxrpc_new_skb(struct sk_buff *); -void rxrpc_see_skb(struct sk_buff *); -void rxrpc_get_skb(struct sk_buff *); -void rxrpc_free_skb(struct sk_buff *); +void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_purge_queue(struct sk_buff_head *); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index f0cabc48a1b7..7d1b99824ed9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -170,7 +170,7 @@ static void rxrpc_resend(struct rxrpc_call *call) continue; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_tx_seen); sp = rxrpc_skb(skb); if (annotation == RXRPC_TX_ANNO_UNACK) { @@ -199,7 +199,7 @@ static void rxrpc_resend(struct rxrpc_call *call) continue; skb = call->rxtx_buffer[ix]; - rxrpc_get_skb(skb); + rxrpc_get_skb(skb, rxrpc_skb_tx_got); spin_unlock_bh(&call->lock); sp = rxrpc_skb(skb); @@ -211,7 +211,7 @@ static void rxrpc_resend(struct rxrpc_call *call) if (rxrpc_send_data_packet(call->conn, skb) < 0) { call->resend_at = now + 2; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); return; } @@ -219,7 +219,7 @@ static void rxrpc_resend(struct rxrpc_call *call) rxrpc_expose_client_call(call); sp->resend_at = now + rxrpc_resend_timeout; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); spin_lock_bh(&call->lock); /* We need to clear the retransmit state, but there are two diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 54f30482a7fd..f50a6094e198 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -182,6 +182,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; call->service_id = srx->srx_service; + call->tx_phase = true; _leave(" = %p", call); return call; @@ -458,7 +459,9 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_disconnect_call(call); for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { - rxrpc_free_skb(call->rxtx_buffer[i]); + rxrpc_free_skb(call->rxtx_buffer[i], + (call->tx_phase ? rxrpc_skb_tx_cleaned : + rxrpc_skb_rx_cleaned)); call->rxtx_buffer[i] = NULL; } @@ -552,9 +555,11 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) /* Clean up the Rx/Tx buffer */ for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) - rxrpc_free_skb(call->rxtx_buffer[i]); + rxrpc_free_skb(call->rxtx_buffer[i], + (call->tx_phase ? rxrpc_skb_tx_cleaned : + rxrpc_skb_rx_cleaned)); - rxrpc_free_skb(call->tx_pending); + rxrpc_free_skb(call->tx_pending, rxrpc_skb_tx_cleaned); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 9b19c51831aa..75a15a4c74c3 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -388,7 +388,7 @@ void rxrpc_process_connection(struct work_struct *work) /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); ret = rxrpc_process_event(conn, skb, &abort_code); switch (ret) { case -EPROTO: @@ -399,7 +399,7 @@ void rxrpc_process_connection(struct work_struct *work) goto requeue_and_leave; case -ECONNABORTED: default: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); break; } } @@ -416,7 +416,7 @@ requeue_and_leave: protocol_error: if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) goto requeue_and_leave; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _leave(" [EPROTO]"); goto out; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index b690220533c6..84bb16d47b85 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -50,7 +50,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) call->tx_hard_ack++; ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_tx_rotated); call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; skb->next = list; @@ -66,7 +66,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) skb = list; list = skb->next; skb->next = NULL; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); } } @@ -99,6 +99,7 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) default: break; case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->tx_phase = false; call->state = RXRPC_CALL_CLIENT_RECV_REPLY; break; case RXRPC_CALL_SERVER_AWAIT_ACK: @@ -278,7 +279,7 @@ next_subpacket: * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() * and also rxrpc_fill_out_ack(). */ - rxrpc_get_skb(skb); + rxrpc_get_skb(skb, rxrpc_skb_rx_got); call->rxtx_annotations[ix] = annotation; smp_wmb(); call->rxtx_buffer[ix] = skb; @@ -691,13 +692,13 @@ void rxrpc_data_ready(struct sock *udp_sk) return; } - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_rx_received); _net("recv skb %p", skb); /* we'll probably need to checksum it (didn't call sock_recvmsg) */ if (skb_checksum_complete(skb)) { - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); _leave(" [CSUM failed]"); return; @@ -821,7 +822,7 @@ void rxrpc_data_ready(struct sock *udp_sk) discard_unlock: rcu_read_unlock(); discard: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); out: trace_rxrpc_rx_done(0, 0); return; diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index f073e932500e..190f68bd9e27 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -90,7 +90,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) if (skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); _debug("{%d},{%u}", local->debug_id, sp->hdr.type); switch (sp->hdr.type) { @@ -107,7 +107,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) break; } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); } _leave(""); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index c7065d893d1e..026e1f2e83ff 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -102,6 +102,24 @@ const char *rxrpc_acks(u8 reason) return str[reason]; } +const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = { + [rxrpc_skb_rx_cleaned] = "Rx CLN", + [rxrpc_skb_rx_freed] = "Rx FRE", + [rxrpc_skb_rx_got] = "Rx GOT", + [rxrpc_skb_rx_lost] = "Rx *L*", + [rxrpc_skb_rx_received] = "Rx RCV", + [rxrpc_skb_rx_purged] = "Rx PUR", + [rxrpc_skb_rx_rotated] = "Rx ROT", + [rxrpc_skb_rx_seen] = "Rx SEE", + [rxrpc_skb_tx_cleaned] = "Tx CLN", + [rxrpc_skb_tx_freed] = "Tx FRE", + [rxrpc_skb_tx_got] = "Tx GOT", + [rxrpc_skb_tx_lost] = "Tx *L*", + [rxrpc_skb_tx_new] = "Tx NEW", + [rxrpc_skb_tx_rotated] = "Tx ROT", + [rxrpc_skb_tx_seen] = "Tx SEE", +}; + const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = { [rxrpc_conn_new_client] = "NWc", [rxrpc_conn_new_service] = "NWs", diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 2c9daeadce87..a2cad5ce7416 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -324,7 +324,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) { @@ -343,7 +343,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) kernel_sendmsg(local->socket, &msg, iov, 2, size); } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); } _leave(""); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 9e0725f5652b..18276e7cb9e0 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -155,11 +155,11 @@ void rxrpc_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_rx_received); serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); return; } @@ -169,7 +169,7 @@ void rxrpc_error_report(struct sock *sk) peer = NULL; if (!peer) { rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _leave(" [no peer]"); return; } @@ -179,7 +179,7 @@ void rxrpc_error_report(struct sock *sk) serr->ee.ee_code == ICMP_FRAG_NEEDED)) { rxrpc_adjust_mtu(peer, serr); rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); rxrpc_put_peer(peer); _leave(" [MTU update]"); return; @@ -187,7 +187,7 @@ void rxrpc_error_report(struct sock *sk) rxrpc_store_error(peer, serr); rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); /* The ref we obtained is passed off to the work item */ rxrpc_queue_work(&peer->error_distributor); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 79e65668bc58..6ba4af5a8d95 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -155,6 +155,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) break; case RXRPC_CALL_SERVER_RECV_REQUEST: + call->tx_phase = true; call->state = RXRPC_CALL_SERVER_ACK_REQUEST; break; default: @@ -185,7 +186,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) hard_ack++; ix = hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_rotated); sp = rxrpc_skb(skb); flags = sp->hdr.flags; serial = sp->hdr.serial; @@ -197,7 +198,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) /* Barrier against rxrpc_input_data(). */ smp_store_release(&call->rx_hard_ack, hard_ack); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _debug("%u,%u,%02x", hard_ack, top, flags); trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); @@ -317,7 +318,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, break; } smp_rmb(); - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); if (!(flags & MSG_PEEK)) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 28d8f73cf11d..6a39ee97a0b7 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -100,7 +100,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, ASSERTCMP(seq, ==, call->tx_top + 1); ix = seq & RXRPC_RXTX_BUFF_MASK; - rxrpc_get_skb(skb); + rxrpc_get_skb(skb, rxrpc_skb_tx_got); call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; smp_wmb(); call->rxtx_buffer[ix] = skb; @@ -146,7 +146,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_instant_resend(call, ix); } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); _leave(""); } @@ -201,7 +201,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, skb = call->tx_pending; call->tx_pending = NULL; - rxrpc_see_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_tx_seen); copied = 0; do { @@ -242,7 +242,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, if (!skb) goto maybe_error; - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_tx_new); _debug("ALLOC SEND %p", skb); @@ -352,7 +352,7 @@ out: return ret; call_terminated: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); _leave(" = %d", -call->error); return -call->error; diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 620d9ccaf3c1..5154cbf7e540 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -18,54 +18,76 @@ #include #include "ar-internal.h" +#define select_skb_count(op) (op >= rxrpc_skb_tx_cleaned ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs) + /* - * Note the existence of a new-to-us socket buffer (allocated or dequeued). + * Note the allocation or reception of a socket buffer. */ -void rxrpc_new_skb(struct sk_buff *skb) +void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); - int n = atomic_inc_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 0, atomic_read(&skb->users), n, here); + int n = atomic_inc_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); } /* * Note the re-emergence of a socket buffer from a queue or buffer. */ -void rxrpc_see_skb(struct sk_buff *skb) +void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); if (skb) { - int n = atomic_read(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 1, atomic_read(&skb->users), n, here); + int n = atomic_read(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); } } /* * Note the addition of a ref on a socket buffer. */ -void rxrpc_get_skb(struct sk_buff *skb) +void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); - int n = atomic_inc_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 2, atomic_read(&skb->users), n, here); + int n = atomic_inc_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); skb_get(skb); } /* * Note the destruction of a socket buffer. */ -void rxrpc_free_skb(struct sk_buff *skb) +void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); if (skb) { int n; CHECK_SLAB_OKAY(&skb->users); - n = atomic_dec_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 3, atomic_read(&skb->users), n, here); + n = atomic_dec_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); kfree_skb(skb); } } +/* + * Note the injected loss of a socket buffer. + */ +void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) +{ + const void *here = __builtin_return_address(0); + if (skb) { + int n; + CHECK_SLAB_OKAY(&skb->users); + if (op == rxrpc_skb_tx_lost) { + n = atomic_read(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + } else { + n = atomic_dec_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + kfree_skb(skb); + } + } +} + /* * Clear a queue of socket buffers. */ @@ -74,8 +96,9 @@ void rxrpc_purge_queue(struct sk_buff_head *list) const void *here = __builtin_return_address(0); struct sk_buff *skb; while ((skb = skb_dequeue((list))) != NULL) { - int n = atomic_dec_return(&rxrpc_n_skbs); - trace_rxrpc_skb(skb, 4, atomic_read(&skb->users), n, here); + int n = atomic_dec_return(select_skb_count(rxrpc_skb_rx_purged)); + trace_rxrpc_skb(skb, rxrpc_skb_rx_purged, + atomic_read(&skb->users), n, here); kfree_skb(skb); } } -- cgit v1.2.1 From 8a681c360559f75a80b37e6a6a9590457361ccb0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 17 Sep 2016 10:49:15 +0100 Subject: rxrpc: Add config to inject packet loss Add a configuration option to inject packet loss by discarding approximately every 8th packet received and approximately every 8th DATA packet transmitted. Note that no locking is used, but it shouldn't really matter. Signed-off-by: David Howells --- net/rxrpc/Kconfig | 7 +++++++ net/rxrpc/input.c | 8 ++++++++ net/rxrpc/output.c | 9 +++++++++ 3 files changed, 24 insertions(+) (limited to 'net') diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 13396c74b5c1..86f8853a038c 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -26,6 +26,13 @@ config AF_RXRPC_IPV6 Say Y here to allow AF_RXRPC to use IPV6 UDP as well as IPV4 UDP as its network transport. +config AF_RXRPC_INJECT_LOSS + bool "Inject packet loss into RxRPC packet stream" + depends on AF_RXRPC + help + Say Y here to inject packet loss by discarding some received and some + transmitted packets. + config AF_RXRPC_DEBUG bool "RxRPC dynamic debugging" diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 84bb16d47b85..7ac1edf3aac7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -712,6 +712,14 @@ void rxrpc_data_ready(struct sock *udp_sk) skb_orphan(skb); sp = rxrpc_skb(skb); + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { + static int lose; + if ((lose++ & 7) == 7) { + rxrpc_lose_skb(skb, rxrpc_skb_rx_lost); + return; + } + } + _net("Rx UDP packet from %08x:%04hu", ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index a2cad5ce7416..16e18a94ffa6 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -225,6 +225,15 @@ int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb) msg.msg_controllen = 0; msg.msg_flags = 0; + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { + static int lose; + if ((lose++ & 7) == 7) { + rxrpc_lose_skb(skb, rxrpc_skb_tx_lost); + _leave(" = 0 [lose]"); + return 0; + } + } + /* send the packet with the don't fragment bit set if we currently * think it's small enough */ if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) { -- cgit v1.2.1 From ffb4d6c8508657824bcef68a36b2a0f9d8c09d10 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Sep 2016 08:12:33 -0700 Subject: tcp: fix overflow in __tcp_retransmit_skb() If a TCP socket gets a large write queue, an overflow can happen in a test in __tcp_retransmit_skb() preventing all retransmits. The flow then stalls and resets after timeouts. Tested: sysctl -w net.core.wmem_max=1000000000 netperf -H dest -- -s 1000000000 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bdaef7fd6e47..f53d0cca5fa4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2605,7 +2605,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) * copying overhead: fragmentation, tunneling, mangling etc. */ if (atomic_read(&sk->sk_wmem_alloc) > - min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) + min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), + sk->sk_sndbuf)) return -EAGAIN; if (skb_still_in_host_queue(sk, skb)) -- cgit v1.2.1 From 8ab86c00e349cef9fb14719093a7f198bcc72629 Mon Sep 17 00:00:00 2001 From: "phil.turnbull@oracle.com" Date: Thu, 15 Sep 2016 12:41:44 -0400 Subject: irda: Free skb on irda_accept error path. skb is not freed if newsk is NULL. Rework the error path so free_skb is unconditionally called on function exit. Fixes: c3ea9fa27413 ("[IrDA] af_irda: IRDA_ASSERT cleanups") Signed-off-by: Phil Turnbull Signed-off-by: David S. Miller --- net/irda/af_irda.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 8d2f7c9b491d..ccc244406fb9 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -832,7 +832,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) struct sock *sk = sock->sk; struct irda_sock *new, *self = irda_sk(sk); struct sock *newsk; - struct sk_buff *skb; + struct sk_buff *skb = NULL; int err; err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0); @@ -900,7 +900,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) err = -EPERM; /* value does not seem to make sense. -arnd */ if (!new->tsap) { pr_debug("%s(), dup failed!\n", __func__); - kfree_skb(skb); goto out; } @@ -919,7 +918,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) /* Clean up the original one to keep it in listen state */ irttp_listen(self->tsap); - kfree_skb(skb); sk->sk_ack_backlog--; newsock->state = SS_CONNECTED; @@ -927,6 +925,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) irda_connect_response(new); err = 0; out: + kfree_skb(skb); release_sock(sk); return err; } -- cgit v1.2.1 From 69ae6ad2ff37911903a90256e216d7e7ae460002 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Fri, 16 Sep 2016 15:05:37 +0200 Subject: net: core: Add offload stats to if_stats_msg Add a nested attribute of offload stats to if_stats_msg named IFLA_STATS_LINK_OFFLOAD_XSTATS. Under it, add SW stats, meaning stats only per packets that went via slowpath to the cpu, named IFLA_OFFLOAD_XSTATS_CPU_HIT. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 937e459bdaa9..0dbae4244a89 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3577,6 +3577,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) (!idxattr || idxattr == attrid); } +#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1) +static int rtnl_get_offload_stats_attr_size(int attr_id) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return sizeof(struct rtnl_link_stats64); + } + + return 0; +} + +static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, + int *prividx) +{ + struct nlattr *attr = NULL; + int attr_id, size; + void *attr_data; + int err; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return -ENODATA; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (attr_id < *prividx) + continue; + + size = rtnl_get_offload_stats_attr_size(attr_id); + if (!size) + continue; + + if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + continue; + + attr = nla_reserve_64bit(skb, attr_id, size, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + goto nla_put_failure; + + attr_data = nla_data(attr); + memset(attr_data, 0, size); + err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, + attr_data); + if (err) + goto get_offload_stats_failure; + } + + if (!attr) + return -ENODATA; + + *prividx = 0; + return 0; + +nla_put_failure: + err = -EMSGSIZE; +get_offload_stats_failure: + *prividx = attr_id; + return err; +} + +static int rtnl_get_offload_stats_size(const struct net_device *dev) +{ + int nla_size = 0; + int attr_id; + int size; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return 0; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + continue; + size = rtnl_get_offload_stats_attr_size(attr_id); + nla_size += nla_total_size_64bit(size); + } + + if (nla_size != 0) + nla_size += nla_total_size(0); + + return nla_size; +} + static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, unsigned int filter_mask, @@ -3586,6 +3671,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; + int err; ASSERT_RTNL(); @@ -3614,8 +3700,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS); @@ -3639,8 +3723,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS_SLAVE); @@ -3655,6 +3737,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, + *idxattr)) { + *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; + attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); + if (!attr) + goto nla_put_failure; + + err = rtnl_get_offload_stats(skb, dev, prividx); + if (err == -ENODATA) + nla_nest_cancel(skb, attr); + else + nla_nest_end(skb, attr); + + if (err && err != -ENODATA) + goto nla_put_failure; + *idxattr = 0; + } + nlmsg_end(skb, nlh); return 0; @@ -3708,6 +3808,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) + size += rtnl_get_offload_stats_size(dev); + return size; } -- cgit v1.2.1 From d409b84768037ad03d1d73538d99fb902adf7365 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 16 Sep 2016 12:59:08 -0700 Subject: ipv6: Export p6_route_input_lookup symbol Make ip6_route_input_lookup available outside of ipv6 the module similar to ip_route_input_noref in the IPv4 world. Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- net/ipv6/route.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad4a7ff301fc..4dab585f7642 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table * return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); } -static struct dst_entry *ip6_route_input_lookup(struct net *net, - struct net_device *dev, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); } +EXPORT_SYMBOL_GPL(ip6_route_input_lookup); void ip6_route_input(struct sk_buff *skb) { -- cgit v1.2.1 From e8bffe0cf964f0330595bb376b74921cccdaac88 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 16 Sep 2016 12:59:13 -0700 Subject: net: Add _nf_(un)register_hooks symbols Add _nf_register_hooks() and _nf_unregister_hooks() calls which allow caller to hold RTNL mutex. Signed-off-by: Mahesh Bandewar CC: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/core.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index f39276d1c2d7..2c5327e43a88 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -188,19 +188,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks); static LIST_HEAD(nf_hook_list); -int nf_register_hook(struct nf_hook_ops *reg) +static int _nf_register_hook(struct nf_hook_ops *reg) { struct net *net, *last; int ret; - rtnl_lock(); for_each_net(net) { ret = nf_register_net_hook(net, reg); if (ret && ret != -ENOENT) goto rollback; } list_add_tail(®->list, &nf_hook_list); - rtnl_unlock(); return 0; rollback: @@ -210,19 +208,34 @@ rollback: break; nf_unregister_net_hook(net, reg); } + return ret; +} + +int nf_register_hook(struct nf_hook_ops *reg) +{ + int ret; + + rtnl_lock(); + ret = _nf_register_hook(reg); rtnl_unlock(); + return ret; } EXPORT_SYMBOL(nf_register_hook); -void nf_unregister_hook(struct nf_hook_ops *reg) +static void _nf_unregister_hook(struct nf_hook_ops *reg) { struct net *net; - rtnl_lock(); list_del(®->list); for_each_net(net) nf_unregister_net_hook(net, reg); +} + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + rtnl_lock(); + _nf_unregister_hook(reg); rtnl_unlock(); } EXPORT_SYMBOL(nf_unregister_hook); @@ -246,6 +259,26 @@ err: } EXPORT_SYMBOL(nf_register_hooks); +/* Caller MUST take rtnl_lock() */ +int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = _nf_register_hook(®[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + _nf_unregister_hooks(reg, i); + return err; +} +EXPORT_SYMBOL(_nf_register_hooks); + void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) { while (n-- > 0) @@ -253,6 +286,14 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) } EXPORT_SYMBOL(nf_unregister_hooks); +/* Caller MUST take rtnl_lock */ +void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + while (n-- > 0) + _nf_unregister_hook(®[n]); +} +EXPORT_SYMBOL(_nf_unregister_hooks); + unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, struct nf_hook_state *state, -- cgit v1.2.1 From e8bc8f9a670e26e91562e724a2114243898bd616 Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Fri, 16 Sep 2016 23:05:35 +0200 Subject: sctp: Remove some redundant code In commit 311b21774f13 ("sctp: simplify sk_receive_queue locking"), a call to 'skb_queue_splice_tail_init()' has been made explicit. Previously it was hidden in 'sctp_skb_list_tail()' Now, the code around it looks redundant. The '_init()' part of 'skb_queue_splice_tail_init()' should already do the same. Signed-off-by: Christophe JAILLET Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/ulpqueue.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 877e55066f89..84d0fdaf7de9 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -140,11 +140,8 @@ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc) * we can go ahead and clear out the lobby in one shot */ if (!skb_queue_empty(&sp->pd_lobby)) { - struct list_head *list; skb_queue_splice_tail_init(&sp->pd_lobby, &sk->sk_receive_queue); - list = (struct list_head *)&sctp_sk(sk)->pd_lobby; - INIT_LIST_HEAD(list); return 1; } } else { -- cgit v1.2.1 From 1486587b2fcda08dee7eab23784d504eed772c45 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:30 +0200 Subject: pie: use qdisc_dequeue_head wrapper Doesn't change generated code. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/sched/sch_pie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index a570b0bb254c..d976d74b22d7 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -511,7 +511,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) { struct sk_buff *skb; - skb = __qdisc_dequeue_head(sch, &sch->q); + skb = qdisc_dequeue_head(sch); if (!skb) return NULL; -- cgit v1.2.1 From 97d0678f913369af0dc8b510a682a641654ab743 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:31 +0200 Subject: sched: don't use skb queue helpers A followup change will replace the sk_buff_head in the qdisc struct with a slightly different list. Use of the sk_buff_head helpers will thus cause compiler warnings. Open-code these accesses in an extra change to ease review. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/sched/sch_fifo.c | 4 ++-- net/sched/sch_generic.c | 2 +- net/sched/sch_netem.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index baeed6a78d28..1e37247656f8 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -31,7 +31,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - if (likely(skb_queue_len(&sch->q) < sch->limit)) + if (likely(sch->q.qlen < sch->limit)) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); @@ -42,7 +42,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, { unsigned int prev_backlog; - if (likely(skb_queue_len(&sch->q) < sch->limit)) + if (likely(sch->q.qlen < sch->limit)) return qdisc_enqueue_tail(skb, sch); prev_backlog = sch->qstats.backlog; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 0d21b567ff27..5e63bf638350 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -486,7 +486,7 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { - if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { + if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) { int band = prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff_head *list = band2list(priv, band); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index aaaf02175338..1832d7732dbc 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -502,7 +502,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1<<(prandom_u32() % 8); } - if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) + if (unlikely(sch->q.qlen >= sch->limit)) return qdisc_drop(skb, sch, to_free); qdisc_qstats_backlog_inc(sch, skb); @@ -522,7 +522,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (q->rate) { struct sk_buff *last; - if (!skb_queue_empty(&sch->q)) + if (sch->q.qlen) last = skb_peek_tail(&sch->q); else last = netem_rb_to_skb(rb_last(&q->t_root)); -- cgit v1.2.1 From ec323368793b8570c02e723127611a8d906a9b3f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:32 +0200 Subject: sched: remove qdisc arg from __qdisc_dequeue_head Moves qdisc stat accouting to qdisc_dequeue_head. The only direct caller of the __qdisc_dequeue_head version open-codes this now. This allows us to later use __qdisc_dequeue_head as a replacement of __skb_dequeue() (which operates on sk_buff_head list). Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5e63bf638350..73877d9c2bcb 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -506,7 +506,12 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) if (likely(band >= 0)) { struct sk_buff_head *list = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); + struct sk_buff *skb = __qdisc_dequeue_head(list); + + if (likely(skb != NULL)) { + qdisc_qstats_backlog_dec(qdisc, skb); + qdisc_bstats_update(qdisc, skb); + } qdisc->q.qlen--; if (skb_queue_empty(list)) -- cgit v1.2.1 From ed760cb8aae7c2b84c193d4a7637b0c9e752f07e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:33 +0200 Subject: sched: replace __skb_dequeue with __qdisc_dequeue_head After previous patch these functions are identical. Replace __skb_dequeue in qdiscs with __qdisc_dequeue_head. Next patch will then make __qdisc_dequeue_head handle single-linked list instead of strcut sk_buff_head argument. Doesn't change generated code. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/sched/sch_codel.c | 4 ++-- net/sched/sch_netem.c | 2 +- net/sched/sch_pie.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 4002df3c7d9f..5bfa79ee657c 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -69,7 +69,7 @@ struct codel_sched_data { static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) { struct Qdisc *sch = ctx; - struct sk_buff *skb = __skb_dequeue(&sch->q); + struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); if (skb) sch->qstats.backlog -= qdisc_pkt_len(skb); @@ -172,7 +172,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __skb_dequeue(&sch->q); + struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 1832d7732dbc..0a964b35f8c7 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -587,7 +587,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) struct rb_node *p; tfifo_dequeue: - skb = __skb_dequeue(&sch->q); + skb = __qdisc_dequeue_head(&sch->q); if (skb) { qdisc_qstats_backlog_dec(sch, skb); deliver: diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index d976d74b22d7..5c3a99d6aa82 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -231,7 +231,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __skb_dequeue(&sch->q); + struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); -- cgit v1.2.1 From 48da34b7a74201f15315cb1fc40bb9a7bd2b4940 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Sep 2016 00:57:34 +0200 Subject: sched: add and use qdisc_skb_head helpers This change replaces sk_buff_head struct in Qdiscs with new qdisc_skb_head. Its similar to the skb_buff_head api, but does not use skb->prev pointers. Qdiscs will commonly enqueue at the tail of a list and dequeue at head. While skb_buff_head works fine for this, enqueue/dequeue needs to also adjust the prev pointer of next element. The ->prev pointer is not required for qdiscs so we can just leave it undefined and avoid one cacheline write access for en/dequeue. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 21 +++++++++++---------- net/sched/sch_htb.c | 24 ++++++++++++++++++++---- net/sched/sch_netem.c | 14 ++++++++++++-- 3 files changed, 43 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 73877d9c2bcb..6cfb6e9038c2 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -466,7 +466,7 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { */ struct pfifo_fast_priv { u32 bitmap; - struct sk_buff_head q[PFIFO_FAST_BANDS]; + struct qdisc_skb_head q[PFIFO_FAST_BANDS]; }; /* @@ -477,7 +477,7 @@ struct pfifo_fast_priv { */ static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; -static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, +static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv, int band) { return priv->q + band; @@ -489,7 +489,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) { int band = prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - struct sk_buff_head *list = band2list(priv, band); + struct qdisc_skb_head *list = band2list(priv, band); priv->bitmap |= (1 << band); qdisc->q.qlen++; @@ -505,8 +505,8 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) int band = bitmap2band[priv->bitmap]; if (likely(band >= 0)) { - struct sk_buff_head *list = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(list); + struct qdisc_skb_head *qh = band2list(priv, band); + struct sk_buff *skb = __qdisc_dequeue_head(qh); if (likely(skb != NULL)) { qdisc_qstats_backlog_dec(qdisc, skb); @@ -514,7 +514,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) } qdisc->q.qlen--; - if (skb_queue_empty(list)) + if (qh->qlen == 0) priv->bitmap &= ~(1 << band); return skb; @@ -529,9 +529,9 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) int band = bitmap2band[priv->bitmap]; if (band >= 0) { - struct sk_buff_head *list = band2list(priv, band); + struct qdisc_skb_head *qh = band2list(priv, band); - return skb_peek(list); + return qh->head; } return NULL; @@ -569,7 +569,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __skb_queue_head_init(band2list(priv, prio)); + qdisc_skb_head_init(band2list(priv, prio)); /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; @@ -617,7 +617,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); sch->padded = (char *) sch - (char *) p; } - skb_queue_head_init(&sch->q); + qdisc_skb_head_init(&sch->q); + spin_lock_init(&sch->q.lock); spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 53dbfa187870..c798d0de8a9d 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -162,7 +162,7 @@ struct htb_sched { struct work_struct work; /* non shaped skbs; let them go directly thru */ - struct sk_buff_head direct_queue; + struct qdisc_skb_head direct_queue; long direct_pkts; struct qdisc_watchdog watchdog; @@ -570,6 +570,22 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) list_del_init(&cl->un.leaf.drop_list); } +static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, + struct qdisc_skb_head *qh) +{ + struct sk_buff *last = qh->tail; + + if (last) { + skb->next = NULL; + last->next = skb; + qh->tail = skb; + } else { + qh->tail = skb; + qh->head = skb; + } + qh->qlen++; +} + static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -580,7 +596,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (cl == HTB_DIRECT) { /* enqueue to helper queue */ if (q->direct_queue.qlen < q->direct_qlen) { - __skb_queue_tail(&q->direct_queue, skb); + htb_enqueue_tail(skb, sch, &q->direct_queue); q->direct_pkts++; } else { return qdisc_drop(skb, sch, to_free); @@ -888,7 +904,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) unsigned long start_at; /* try to dequeue direct packets as high prio (!) to minimize cpu work */ - skb = __skb_dequeue(&q->direct_queue); + skb = __qdisc_dequeue_head(&q->direct_queue); if (skb != NULL) { ok: qdisc_bstats_update(sch, skb); @@ -1019,7 +1035,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); INIT_WORK(&q->work, htb_work_func); - __skb_queue_head_init(&q->direct_queue); + qdisc_skb_head_init(&q->direct_queue); if (tb[TCA_HTB_DIRECT_QLEN]) q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 0a964b35f8c7..9f7b380cf0a3 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -413,6 +413,16 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, return segs; } +static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb) +{ + skb->next = qh->head; + + if (!qh->head) + qh->tail = skb; + qh->head = skb; + qh->qlen++; +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -523,7 +533,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *last; if (sch->q.qlen) - last = skb_peek_tail(&sch->q); + last = sch->q.tail; else last = netem_rb_to_skb(rb_last(&q->t_root)); if (last) { @@ -552,7 +562,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, cb->time_to_send = psched_get_time(); q->counter = 0; - __skb_queue_head(&sch->q, skb); + netem_enqueue_skb_head(&sch->q, skb); sch->qstats.requeues++; } -- cgit v1.2.1 From b588479358ce26f32138e0f0a7ab0678f8e3e601 Mon Sep 17 00:00:00 2001 From: Ilan Tayari Date: Sun, 18 Sep 2016 07:42:53 +0000 Subject: xfrm: Fix memory leak of aead algorithm name commit 1a6509d99122 ("[IPSEC]: Add support for combined mode algorithms") introduced aead. The function attach_aead kmemdup()s the algorithm name during xfrm_state_construct(). However this memory is never freed. Implementation has since been slightly modified in commit ee5c23176fcc ("xfrm: Clone states properly on migration") without resolving this leak. This patch adds a kfree() call for the aead algorithm name. Fixes: 1a6509d99122 ("[IPSEC]: Add support for combined mode algorithms") Signed-off-by: Ilan Tayari Acked-by: Rami Rosen Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 9895a8c56d8c..a30f898dc1c5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -332,6 +332,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) { tasklet_hrtimer_cancel(&x->mtimer); del_timer_sync(&x->rtimer); + kfree(x->aead); kfree(x->aalg); kfree(x->ealg); kfree(x->calg); -- cgit v1.2.1 From e64c97b53bc6727aa4385535166aaa047281e02d Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 21 Jul 2016 14:12:41 +0200 Subject: Bluetooth: Add combined LED trigger for controller power Instead of just having a LED trigger for power on a specific controller, this adds the LED trigger "bluetooth-power" that combines the power states of all controllers into a single trigger. This simplifies the trigger selection and also supports multiple controllers per host system via a single LED. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/af_bluetooth.c | 5 +++++ net/bluetooth/leds.c | 27 +++++++++++++++++++++++++++ net/bluetooth/leds.h | 10 ++++++++++ 3 files changed, 42 insertions(+) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 0b5f729d08d2..1d96ff3a8d87 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -31,6 +31,7 @@ #include #include +#include "leds.h" #include "selftest.h" /* Bluetooth sockets */ @@ -726,6 +727,8 @@ static int __init bt_init(void) bt_debugfs = debugfs_create_dir("bluetooth", NULL); + bt_leds_init(); + err = bt_sysfs_init(); if (err < 0) return err; @@ -785,6 +788,8 @@ static void __exit bt_exit(void) bt_sysfs_cleanup(); + bt_leds_cleanup(); + debugfs_remove_recursive(bt_debugfs); } diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c index 8319c8440c89..cb670b5594eb 100644 --- a/net/bluetooth/leds.c +++ b/net/bluetooth/leds.c @@ -11,6 +11,8 @@ #include "leds.h" +DEFINE_LED_TRIGGER(bt_power_led_trigger); + struct hci_basic_led_trigger { struct led_trigger led_trigger; struct hci_dev *hdev; @@ -24,6 +26,21 @@ void hci_leds_update_powered(struct hci_dev *hdev, bool enabled) if (hdev->power_led) led_trigger_event(hdev->power_led, enabled ? LED_FULL : LED_OFF); + + if (!enabled) { + struct hci_dev *d; + + read_lock(&hci_dev_list_lock); + + list_for_each_entry(d, &hci_dev_list, list) { + if (test_bit(HCI_UP, &d->flags)) + enabled = true; + } + + read_unlock(&hci_dev_list_lock); + } + + led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF); } static void power_activate(struct led_classdev *led_cdev) @@ -72,3 +89,13 @@ void hci_leds_init(struct hci_dev *hdev) /* initialize power_led */ hdev->power_led = led_allocate_basic(hdev, power_activate, "power"); } + +void bt_leds_init(void) +{ + led_trigger_register_simple("bluetooth-power", &bt_power_led_trigger); +} + +void bt_leds_cleanup(void) +{ + led_trigger_unregister_simple(bt_power_led_trigger); +} diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h index a9c4d6ea01cf..08725a2fbd9b 100644 --- a/net/bluetooth/leds.h +++ b/net/bluetooth/leds.h @@ -7,10 +7,20 @@ */ #if IS_ENABLED(CONFIG_BT_LEDS) + void hci_leds_update_powered(struct hci_dev *hdev, bool enabled); void hci_leds_init(struct hci_dev *hdev); + +void bt_leds_init(void); +void bt_leds_cleanup(void); + #else + static inline void hci_leds_update_powered(struct hci_dev *hdev, bool enabled) {} static inline void hci_leds_init(struct hci_dev *hdev) {} + +static inline void bt_leds_init(void) {} +static inline void bt_leds_cleanup(void) {} + #endif -- cgit v1.2.1 From abbcc341adb16f68915cae7ef9a10e0d7b57e3c0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 24 Jul 2016 16:12:24 +0200 Subject: mac802154: set phy net namespace for new ifaces This patch sets the net namespace when creating SoftMAC interfaces. This is important if the namespace at phy layer was switched before. Currently we losing interfaces in some namespace and it's not possible to recover that. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 7079cd32a7ad..06019dba4b10 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -663,6 +663,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, /* TODO check this */ SET_NETDEV_DEV(ndev, &local->phy->dev); + dev_net_set(ndev, wpan_phy_net(local->hw.phy)); sdata = netdev_priv(ndev); ndev->ieee802154_ptr = &sdata->wpan_dev; memcpy(sdata->name, ndev->name, IFNAMSIZ); -- cgit v1.2.1 From 5ddedce3b7331959a6da217ed3189d020090873c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 24 Jul 2016 16:12:25 +0200 Subject: 6lowpan: ndisc: no overreact if no short address is available This patch removes handling to remove short address for a neigbour entry if RS/RA/NS/NA doesn't contain a short address. If these messages doesn't has any short address option, the existing short address from ndisc cache will be used. The current behaviour will set that the neigbour doesn't has a short address anymore. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/6lowpan/ndisc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c index 86450b7e2899..941df2fa4448 100644 --- a/net/6lowpan/ndisc.c +++ b/net/6lowpan/ndisc.c @@ -101,8 +101,6 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags, ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short); if (!lowpan_802154_is_valid_src_short_addr(neigh->short_addr)) neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); - } else { - neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); } write_unlock_bh(&n->lock); } -- cgit v1.2.1 From ca1de81aa262dcf48354a7c55f2558205517d06e Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Mon, 25 Jul 2016 11:46:40 -0400 Subject: mac802154: don't warn on unsupported frames Just because we don't support certain types of frames yet doesn't mean we have to flood the message log with warnings about "invalid" frames. Signed-off-by: Aristeu Rozanski Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/rx.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 446e1300383e..b978da018bf8 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -101,6 +101,11 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, sdata->dev->stats.rx_bytes += skb->len; switch (mac_cb(skb)->type) { + case IEEE802154_FC_TYPE_BEACON: + case IEEE802154_FC_TYPE_ACK: + case IEEE802154_FC_TYPE_MAC_CMD: + goto fail; + case IEEE802154_FC_TYPE_DATA: return ieee802154_deliver_skb(skb); default: -- cgit v1.2.1 From bd89bb6daaca3e4a7c509bdacb53a610f432fa2c Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Mon, 25 Jul 2016 11:46:41 -0400 Subject: mac802154: use rate limited warnings for malformed frames Signed-off-by: Aristeu Rozanski Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index b978da018bf8..4dcf6e18563a 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -109,8 +109,8 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, case IEEE802154_FC_TYPE_DATA: return ieee802154_deliver_skb(skb); default: - pr_warn("ieee802154: bad frame received (type = %d)\n", - mac_cb(skb)->type); + pr_warn_ratelimited("ieee802154: bad frame received " + "(type = %d)\n", mac_cb(skb)->type); goto fail; } -- cgit v1.2.1 From 47b0f573f2fa7634860e16ea31f2bc3057a1022a Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:37 +0200 Subject: Bluetooth: Check SOL_HCI for raw socket options The SOL_HCI level should be enforced when using socket options on the HCI raw socket interface. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 96f04b7b9556..99dd1503ef56 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1440,6 +1440,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, BT_DBG("sk %p, opt %d", sk, optname); + if (level != SOL_HCI) + return -ENOPROTOOPT; + lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { @@ -1523,6 +1526,9 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, BT_DBG("sk %p, opt %d", sk, optname); + if (level != SOL_HCI) + return -ENOPROTOOPT; + if (get_user(len, optlen)) return -EFAULT; -- cgit v1.2.1 From 70ecce91e3a2d7e332fe56fd065c67d404b8fccf Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:38 +0200 Subject: Bluetooth: Store control socket cookie and comm information To further allow unique identification and tracking of control socket, store cookie and comm information when binding the socket. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 99dd1503ef56..4dce6dfdb0f2 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -38,6 +39,8 @@ static LIST_HEAD(mgmt_chan_list); static DEFINE_MUTEX(mgmt_chan_list_lock); +static DEFINE_IDA(sock_cookie_ida); + static atomic_t monitor_promisc = ATOMIC_INIT(0); /* ----- HCI socket interface ----- */ @@ -52,6 +55,8 @@ struct hci_pinfo { __u32 cmsg_mask; unsigned short channel; unsigned long flags; + __u32 cookie; + char comm[TASK_COMM_LEN]; }; void hci_sock_set_flag(struct sock *sk, int nr) @@ -74,6 +79,11 @@ unsigned short hci_sock_get_channel(struct sock *sk) return hci_pi(sk)->channel; } +u32 hci_sock_get_cookie(struct sock *sk) +{ + return hci_pi(sk)->cookie; +} + static inline int hci_test_bit(int nr, const void *addr) { return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); @@ -585,6 +595,7 @@ static int hci_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct hci_dev *hdev; + int id; BT_DBG("sock %p sk %p", sock, sk); @@ -593,8 +604,17 @@ static int hci_sock_release(struct socket *sock) hdev = hci_pi(sk)->hdev; - if (hci_pi(sk)->channel == HCI_CHANNEL_MONITOR) + switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); + break; + case HCI_CHANNEL_CONTROL: + id = hci_pi(sk)->cookie; + + hci_pi(sk)->cookie = 0xffffffff; + ida_simple_remove(&sock_cookie_ida, id); + break; + } bt_sock_unlink(&hci_sk_list, sk); @@ -957,6 +977,15 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, * are changes to settings, class of device, name etc. */ if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { + int id; + + id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); + if (id < 0) + id = 0xffffffff; + + hci_pi(sk)->cookie = id; + get_task_comm(hci_pi(sk)->comm, current); + hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); -- cgit v1.2.1 From 03c979c4717c7fa0c058fafe76ac4d6acdd1fb0d Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:39 +0200 Subject: Bluetooth: Introduce helper to pack mgmt version information The mgmt version information will be also needed for the control changell tracing feature. This provides a helper to pack them. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7639290b6de3..9071886df194 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -278,6 +278,14 @@ static u8 le_addr_type(u8 mgmt_addr_type) return ADDR_LE_DEV_RANDOM; } +void mgmt_fill_version_info(void *ver) +{ + struct mgmt_rp_read_version *rp = ver; + + rp->version = MGMT_VERSION; + rp->revision = cpu_to_le16(MGMT_REVISION); +} + static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -285,8 +293,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("sock %p", sk); - rp.version = MGMT_VERSION; - rp.revision = cpu_to_le16(MGMT_REVISION); + mgmt_fill_version_info(&rp); return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0, &rp, sizeof(rp)); -- cgit v1.2.1 From 249fa1699f8642c73eb43e61b321969f0549ab2c Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:40 +0200 Subject: Bluetooth: Add support for sending MGMT open and close to monitor This sends new notifications to the monitor support whenever a management channel has been opened or closed. This allows tracing of control channels really easily. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 4dce6dfdb0f2..2d8725006838 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -394,6 +394,59 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) return skb; } +static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + u16 format = 0x0002; + u8 ver[3]; + u32 flags; + + skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC); + if (!skb) + return NULL; + + mgmt_fill_version_info(ver); + flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(format, skb_put(skb, 2)); + memcpy(skb_put(skb, sizeof(ver)), ver, sizeof(ver)); + put_unaligned_le32(flags, skb_put(skb, 4)); + *skb_put(skb, 1) = TASK_COMM_LEN; + memcpy(skb_put(skb, TASK_COMM_LEN), hci_pi(sk)->comm, TASK_COMM_LEN); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN); + hdr->index = cpu_to_le16(HCI_DEV_NONE); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + +static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(4, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE); + hdr->index = cpu_to_le16(HCI_DEV_NONE); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + static void __printf(2, 3) send_monitor_note(struct sock *sk, const char *fmt, ...) { @@ -468,6 +521,29 @@ static void send_monitor_replay(struct sock *sk) read_unlock(&hci_dev_list_lock); } +static void send_monitor_control_replay(struct sock *mon_sk) +{ + struct sock *sk; + + read_lock(&hci_sk_list.lock); + + sk_for_each(sk, &hci_sk_list.head) { + struct sk_buff *skb; + + if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) + continue; + + skb = create_monitor_ctrl_open(sk); + if (!skb) + continue; + + if (sock_queue_rcv_skb(mon_sk, skb)) + kfree_skb(skb); + } + + read_unlock(&hci_sk_list.lock); +} + /* Generate internal stack event */ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) { @@ -595,6 +671,7 @@ static int hci_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct hci_dev *hdev; + struct sk_buff *skb; int id; BT_DBG("sock %p sk %p", sock, sk); @@ -611,6 +688,14 @@ static int hci_sock_release(struct socket *sock) case HCI_CHANNEL_CONTROL: id = hci_pi(sk)->cookie; + /* Send event to monitor */ + skb = create_monitor_ctrl_close(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + hci_pi(sk)->cookie = 0xffffffff; ida_simple_remove(&sock_cookie_ida, id); break; @@ -931,6 +1016,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, send_monitor_note(sk, "Bluetooth subsystem version %s", BT_SUBSYS_VERSION); send_monitor_replay(sk); + send_monitor_control_replay(sk); atomic_inc(&monitor_promisc); break; @@ -977,6 +1063,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, * are changes to settings, class of device, name etc. */ if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { + struct sk_buff *skb; int id; id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); @@ -986,6 +1073,14 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, hci_pi(sk)->cookie = id; get_task_comm(hci_pi(sk)->comm, current); + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); -- cgit v1.2.1 From 38ceaa00d02dceb22c6bdd5268f5a44d5c00e123 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 27 Aug 2016 20:23:41 +0200 Subject: Bluetooth: Add support for sending MGMT commands and events to monitor This adds support for tracing all management commands and events via the monitor interface. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++ net/bluetooth/mgmt_util.c | 66 +++++++++++++++++++++++++++++++-- 2 files changed, 157 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 2d8725006838..576ea48631b9 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -315,6 +315,60 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(skb_copy); } +void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, + void *data, u16 data_len, ktime_t tstamp, + int flag, struct sock *skip_sk) +{ + struct sock *sk; + __le16 index; + + if (hdev) + index = cpu_to_le16(hdev->id); + else + index = cpu_to_le16(MGMT_INDEX_NONE); + + read_lock(&hci_sk_list.lock); + + sk_for_each(sk, &hci_sk_list.head) { + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) + continue; + + /* Ignore socket without the flag set */ + if (!hci_sock_test_flag(sk, flag)) + continue; + + /* Skip the original socket */ + if (sk == skip_sk) + continue; + + skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC); + if (!skb) + continue; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(event, skb_put(skb, 2)); + + if (data) + memcpy(skb_put(skb, data_len), data, data_len); + + skb->tstamp = tstamp; + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); + hdr->index = index; + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + + read_unlock(&hci_sk_list.lock); +} + static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) { struct hci_mon_hdr *hdr; @@ -447,6 +501,33 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) return skb; } +static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index, + u16 opcode, u16 len, + const void *buf) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(6 + len, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(opcode, skb_put(skb, 2)); + + if (buf) + memcpy(skb_put(skb, len), buf, len); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND); + hdr->index = cpu_to_le16(index); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + static void __printf(2, 3) send_monitor_note(struct sock *sk, const char *fmt, ...) { @@ -1257,6 +1338,19 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, goto done; } + if (chan->channel == HCI_CHANNEL_CONTROL) { + struct sk_buff *skb; + + /* Send event to monitor */ + skb = create_monitor_ctrl_command(sk, index, opcode, len, + buf + sizeof(*hdr)); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + if (opcode >= chan->handler_count || chan->handlers[opcode].func == NULL) { BT_DBG("Unknown op %u", opcode); diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index 8c30c7eb8bef..c933bd08c1fe 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -21,12 +21,41 @@ SOFTWARE IS DISCLAIMED. */ +#include + #include #include +#include #include #include "mgmt_util.h" +static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie, + u16 opcode, u16 len, void *buf) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(6 + len, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(cookie, skb_put(skb, 4)); + put_unaligned_le16(opcode, skb_put(skb, 2)); + + if (buf) + memcpy(skb_put(skb, len), buf, len); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); + hdr->index = index; + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, void *data, u16 data_len, int flag, struct sock *skip_sk) { @@ -52,14 +81,18 @@ int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, __net_timestamp(skb); hci_send_to_channel(channel, skb, flag, skip_sk); - kfree_skb(skb); + if (channel == HCI_CHANNEL_CONTROL) + hci_send_monitor_ctrl_event(hdev, event, data, data_len, + skb_get_ktime(skb), flag, skip_sk); + + kfree_skb(skb); return 0; } int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) { - struct sk_buff *skb; + struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_status *ev; int err; @@ -80,17 +113,30 @@ int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) ev->status = status; ev->opcode = cpu_to_le16(cmd); + mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), + MGMT_EV_CMD_STATUS, sizeof(*ev), ev); + if (mskb) + skb->tstamp = mskb->tstamp; + else + __net_timestamp(skb); + err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); + if (mskb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(mskb); + } + return err; } int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, void *rp, size_t rp_len) { - struct sk_buff *skb; + struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_complete *ev; int err; @@ -114,10 +160,24 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, if (rp) memcpy(ev->data, rp, rp_len); + mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), + MGMT_EV_CMD_COMPLETE, + sizeof(*ev) + rp_len, ev); + if (mskb) + skb->tstamp = mskb->tstamp; + else + __net_timestamp(skb); + err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); + if (mskb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(mskb); + } + return err; } -- cgit v1.2.1 From 37d3a1fab50fa07ac706787646e61c60e7c520e0 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Sun, 28 Aug 2016 20:53:34 +0300 Subject: Bluetooth: mgmt: Fix sending redundant event for Advertising Instance When an Advertising Instance is removed, the Advertising Removed event shouldn't be sent to the same socket that issued the Remove Advertising command (it gets a command complete event instead). The mgmt_advertising_removed() function already has a parameter for skipping a specific socket, but there was no code to propagate the right value to this parameter. This patch fixes the issue by making sure the intermediate hci_req_clear_adv_instance() function gets the socket pointer. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 11 ++++++----- net/bluetooth/hci_request.h | 5 +++-- net/bluetooth/mgmt.c | 6 +++--- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index b0e23dfc5c34..9968b1c7c03a 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1194,7 +1194,7 @@ static void adv_timeout_expire(struct work_struct *work) hci_req_init(&req, hdev); - hci_req_clear_adv_instance(hdev, &req, instance, false); + hci_req_clear_adv_instance(hdev, NULL, &req, instance, false); if (list_empty(&hdev->adv_instances)) __hci_req_disable_advertising(&req); @@ -1284,8 +1284,9 @@ static void cancel_adv_timeout(struct hci_dev *hdev) * setting. * - force == false: Only instances that have a timeout will be removed. */ -void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, - u8 instance, bool force) +void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, + struct hci_request *req, u8 instance, + bool force) { struct adv_info *adv_instance, *n, *next_instance = NULL; int err; @@ -1311,7 +1312,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, rem_inst = adv_instance->instance; err = hci_remove_adv_instance(hdev, rem_inst); if (!err) - mgmt_advertising_removed(NULL, hdev, rem_inst); + mgmt_advertising_removed(sk, hdev, rem_inst); } } else { adv_instance = hci_find_adv_instance(hdev, instance); @@ -1325,7 +1326,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, err = hci_remove_adv_instance(hdev, instance); if (!err) - mgmt_advertising_removed(NULL, hdev, instance); + mgmt_advertising_removed(sk, hdev, instance); } } diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index b2d044bdc732..ac1e11006f38 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -73,8 +73,9 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance); int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, bool force); -void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, - u8 instance, bool force); +void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, + struct hci_request *req, u8 instance, + bool force); void __hci_req_update_class(struct hci_request *req); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 9071886df194..f9af5f7c2ea2 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -929,7 +929,7 @@ static int clean_up_hci_state(struct hci_dev *hdev) hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } - hci_req_clear_adv_instance(hdev, NULL, 0x00, false); + hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, false); if (hci_dev_test_flag(hdev, HCI_LE_ADV)) __hci_req_disable_advertising(&req); @@ -1697,7 +1697,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) enabled = lmp_host_le_capable(hdev); if (!val) - hci_req_clear_adv_instance(hdev, NULL, 0x00, true); + hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, true); if (!hdev_is_powered(hdev) || val == enabled) { bool changed = false; @@ -6182,7 +6182,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); - hci_req_clear_adv_instance(hdev, &req, cp->instance, true); + hci_req_clear_adv_instance(hdev, sk, &req, cp->instance, true); if (list_empty(&hdev->adv_instances)) __hci_req_disable_advertising(&req); -- cgit v1.2.1 From 5504c3a31061704512707bb23bd7835e8a5281e4 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 29 Aug 2016 06:19:46 +0200 Subject: Bluetooth: Use individual flags for certain management events Instead of hiding everything behind a general managment events flag, introduce indivdual flags that allow fine control over which events are send to a given management channel. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 5 ++++- net/bluetooth/mgmt.c | 32 +++++++++++++------------------- 2 files changed, 17 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 576ea48631b9..d37c2243157b 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1164,7 +1164,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); - hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); } break; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f9af5f7c2ea2..469f5cc3109b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -256,13 +256,6 @@ static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data, flag, skip_sk); } -static int mgmt_generic_event(u16 event, struct hci_dev *hdev, void *data, - u16 len, struct sock *skip_sk) -{ - return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len, - HCI_MGMT_GENERIC_EVENTS, skip_sk); -} - static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len, struct sock *skip_sk) { @@ -579,8 +572,8 @@ static int new_options(struct hci_dev *hdev, struct sock *skip) { __le32 options = get_missing_options(hdev); - return mgmt_generic_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, - sizeof(options), skip); + return mgmt_limited_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, + sizeof(options), HCI_MGMT_OPTION_EVENTS, skip); } static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) @@ -1007,8 +1000,8 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip) { __le32 ev = cpu_to_le32(get_current_settings(hdev)); - return mgmt_generic_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, - sizeof(ev), skip); + return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, + sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip); } int mgmt_new_settings(struct hci_dev *hdev) @@ -3000,8 +2993,8 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, if (err < 0) goto failed; - err = mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, - data, len, sk); + err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data, + len, HCI_MGMT_LOCAL_NAME_EVENTS, sk); goto failed; } @@ -6502,8 +6495,9 @@ void __mgmt_power_off(struct hci_dev *hdev) mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) - mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, - zero_cod, sizeof(zero_cod), NULL); + mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, + zero_cod, sizeof(zero_cod), + HCI_MGMT_DEV_CLASS_EVENTS, NULL); new_settings(hdev, match.sk); @@ -7100,8 +7094,8 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); if (!status) - mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, - dev_class, 3, NULL); + mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, + 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL); if (match.sk) sock_put(match.sk); @@ -7130,8 +7124,8 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) return; } - mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), - cmd ? cmd->sk : NULL); + mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), + HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL); } static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16]) -- cgit v1.2.1 From 56f787c5024de7829f8cccce7569feb520829baf Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 29 Aug 2016 06:19:47 +0200 Subject: Bluetooth: Fix wrong Get Clock Information return parameters The address information of the Get Clock Information return parameters is copying from a different memory location. It uses &cmd->param while it actually needs to be cmd->param. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 469f5cc3109b..0c83dd36b7e3 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4869,7 +4869,7 @@ static int clock_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) int err; memset(&rp, 0, sizeof(rp)); - memcpy(&rp.addr, &cmd->param, sizeof(rp.addr)); + memcpy(&rp.addr, cmd->param, sizeof(rp.addr)); if (status) goto complete; -- cgit v1.2.1 From 9db5c62951871c33e4443fe433e234419cf574d2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 29 Aug 2016 06:31:57 +0200 Subject: Bluetooth: Use command status event for Set IO Capability errors In case of failure, the Set IO Capability command is suppose to return command status and not command complete. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 0c83dd36b7e3..47efdb4a669a 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2513,8 +2513,8 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG(""); if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY) - return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, - MGMT_STATUS_INVALID_PARAMS, NULL, 0); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); -- cgit v1.2.1 From df1cb87af9f24527a8932e4d195d49ffab1168d2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:34 +0200 Subject: Bluetooth: Introduce helper functions for socket cookie handling Instead of manually allocating cookie information each time, use helper functions for generating and releasing cookies. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index d37c2243157b..804208d48368 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -84,6 +84,33 @@ u32 hci_sock_get_cookie(struct sock *sk) return hci_pi(sk)->cookie; } +static bool hci_sock_gen_cookie(struct sock *sk) +{ + int id = hci_pi(sk)->cookie; + + if (!id) { + id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); + if (id < 0) + id = 0xffffffff; + + hci_pi(sk)->cookie = id; + get_task_comm(hci_pi(sk)->comm, current); + return true; + } + + return false; +} + +static void hci_sock_free_cookie(struct sock *sk) +{ + int id = hci_pi(sk)->cookie; + + if (id) { + hci_pi(sk)->cookie = 0xffffffff; + ida_simple_remove(&sock_cookie_ida, id); + } +} + static inline int hci_test_bit(int nr, const void *addr) { return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); @@ -753,7 +780,6 @@ static int hci_sock_release(struct socket *sock) struct sock *sk = sock->sk; struct hci_dev *hdev; struct sk_buff *skb; - int id; BT_DBG("sock %p sk %p", sock, sk); @@ -767,8 +793,6 @@ static int hci_sock_release(struct socket *sock) atomic_dec(&monitor_promisc); break; case HCI_CHANNEL_CONTROL: - id = hci_pi(sk)->cookie; - /* Send event to monitor */ skb = create_monitor_ctrl_close(sk); if (skb) { @@ -777,8 +801,7 @@ static int hci_sock_release(struct socket *sock) kfree_skb(skb); } - hci_pi(sk)->cookie = 0xffffffff; - ida_simple_remove(&sock_cookie_ida, id); + hci_sock_free_cookie(sk); break; } @@ -1145,14 +1168,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, */ if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { struct sk_buff *skb; - int id; - - id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); - if (id < 0) - id = 0xffffffff; - hci_pi(sk)->cookie = id; - get_task_comm(hci_pi(sk)->comm, current); + hci_sock_gen_cookie(sk); /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); -- cgit v1.2.1 From 9e8305b39bfa23a83b932007654097f4676c2ba2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:35 +0200 Subject: Bluetooth: Use numbers for subsystem version string Instead of keeping a version string around, use version and revision numbers and then stringify them for use as module parameter. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/af_bluetooth.c | 10 +++++++--- net/bluetooth/hci_sock.c | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 1d96ff3a8d87..1aff2da9bc74 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -713,13 +714,16 @@ static struct net_proto_family bt_sock_family_ops = { struct dentry *bt_debugfs; EXPORT_SYMBOL_GPL(bt_debugfs); +#define VERSION __stringify(BT_SUBSYS_VERSION) "." \ + __stringify(BT_SUBSYS_REVISION) + static int __init bt_init(void) { int err; sock_skb_cb_check_size(sizeof(struct bt_skb_cb)); - BT_INFO("Core ver %s", BT_SUBSYS_VERSION); + BT_INFO("Core ver %s", VERSION); err = bt_selftest(); if (err < 0) @@ -797,7 +801,7 @@ subsys_initcall(bt_init); module_exit(bt_exit); MODULE_AUTHOR("Marcel Holtmann "); -MODULE_DESCRIPTION("Bluetooth Core ver " BT_SUBSYS_VERSION); -MODULE_VERSION(BT_SUBSYS_VERSION); +MODULE_DESCRIPTION("Bluetooth Core ver " VERSION); +MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_BLUETOOTH); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 804208d48368..a4227c777d16 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1117,8 +1117,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, send_monitor_note(sk, "Linux version %s (%s)", init_utsname()->release, init_utsname()->machine); - send_monitor_note(sk, "Bluetooth subsystem version %s", - BT_SUBSYS_VERSION); + send_monitor_note(sk, "Bluetooth subsystem version %u.%u", + BT_SUBSYS_VERSION, BT_SUBSYS_REVISION); send_monitor_replay(sk); send_monitor_control_replay(sk); -- cgit v1.2.1 From 0ef2c42f8c4e372bad16f67dc0f4b15b9be910f6 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:36 +0200 Subject: Bluetooth: Send control open and close only when cookie is present Only when the cookie has been assigned, then send the open and close monitor messages. Also if the socket is bound to a device, then include the index into the message. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index a4227c777d16..0deca758fd9e 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -483,6 +483,10 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) u8 ver[3]; u32 flags; + /* No message needed when cookie is not present */ + if (!hci_pi(sk)->cookie) + return NULL; + skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC); if (!skb) return NULL; @@ -501,7 +505,10 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN); - hdr->index = cpu_to_le16(HCI_DEV_NONE); + if (hci_pi(sk)->hdev) + hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id); + else + hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; @@ -512,6 +519,10 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) struct hci_mon_hdr *hdr; struct sk_buff *skb; + /* No message needed when cookie is not present */ + if (!hci_pi(sk)->cookie) + return NULL; + skb = bt_skb_alloc(4, GFP_ATOMIC); if (!skb) return NULL; @@ -522,7 +533,10 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE); - hdr->index = cpu_to_le16(HCI_DEV_NONE); + if (hci_pi(sk)->hdev) + hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id); + else + hdr->index = cpu_to_le16(HCI_DEV_NONE); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; -- cgit v1.2.1 From 5a6d2cf5f18b5afbae0b1b450070bbba50f1e3e0 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:37 +0200 Subject: Bluetooth: Assign the channel early when binding HCI sockets Assignment of the hci_pi(sk)->channel should be done early when binding the HCI socket. This avoids confusion with the RAW channel that is used for legacy access. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 0deca758fd9e..ca13fac1c132 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1045,6 +1045,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, atomic_inc(&hdev->promisc); } + hci_pi(sk)->channel = haddr.hci_channel; hci_pi(sk)->hdev = hdev; break; @@ -1107,9 +1108,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, } } - atomic_inc(&hdev->promisc); - + hci_pi(sk)->channel = haddr.hci_channel; hci_pi(sk)->hdev = hdev; + + atomic_inc(&hdev->promisc); break; case HCI_CHANNEL_MONITOR: @@ -1123,6 +1125,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, goto done; } + hci_pi(sk)->channel = haddr.hci_channel; + /* The monitor interface is restricted to CAP_NET_RAW * capabilities and with that implicitly trusted. */ @@ -1149,6 +1153,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, err = -EPERM; goto done; } + + hci_pi(sk)->channel = haddr.hci_channel; break; default: @@ -1170,6 +1176,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, if (capable(CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + hci_pi(sk)->channel = haddr.hci_channel; + /* At the moment the index and unconfigured index events * are enabled unconditionally. Setting them on each * socket when binding keeps this functionality. They @@ -1180,7 +1188,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, * received by untrusted users. Example for such events * are changes to settings, class of device, name etc. */ - if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { + if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) { struct sk_buff *skb; hci_sock_gen_cookie(sk); @@ -1203,8 +1211,6 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, break; } - - hci_pi(sk)->channel = haddr.hci_channel; sk->sk_state = BT_BOUND; done: -- cgit v1.2.1 From d0bef1d26fb6fdad818f3d15a178d51e2a8478ae Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:38 +0200 Subject: Bluetooth: Add extra channel checks for control open/close messages The control open and close monitoring events require special channel checks to ensure messages are only send when the right events happen. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index ca13fac1c132..b22efe272f7e 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -479,7 +479,7 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) { struct hci_mon_hdr *hdr; struct sk_buff *skb; - u16 format = 0x0002; + u16 format; u8 ver[3]; u32 flags; @@ -487,11 +487,20 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) if (!hci_pi(sk)->cookie) return NULL; + switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_CONTROL: + format = 0x0002; + mgmt_fill_version_info(ver); + break; + default: + /* No message for unsupported format */ + return NULL; + } + skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC); if (!skb) return NULL; - mgmt_fill_version_info(ver); flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0; put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); @@ -523,6 +532,14 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) if (!hci_pi(sk)->cookie) return NULL; + switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_CONTROL: + break; + default: + /* No message for unsupported format */ + return NULL; + } + skb = bt_skb_alloc(4, GFP_ATOMIC); if (!skb) return NULL; @@ -652,9 +669,6 @@ static void send_monitor_control_replay(struct sock *mon_sk) sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *skb; - if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) - continue; - skb = create_monitor_ctrl_open(sk); if (!skb) continue; -- cgit v1.2.1 From f81f5b2db8692ff1d2d5f4db1fde58e67aa976a3 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:39 +0200 Subject: Bluetooth: Send control open and close messages for HCI raw sockets When opening and closing HCI raw sockets their main usage is for legacy userspace. To track interaction with the modern mgmt interface, send open and close monitoring messages for these action. The HCI raw sockets is special since it supports unbound ioctl operation and for that special case delay the notification message until at least one ioctl has been executed. The difference between a bound and unbound socket will be detailed by the fact the HCI index is present or not. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index b22efe272f7e..c7772436f508 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -488,6 +488,11 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) return NULL; switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_RAW: + format = 0x0000; + ver[0] = BT_SUBSYS_VERSION; + put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); + break; case HCI_CHANNEL_CONTROL: format = 0x0002; mgmt_fill_version_info(ver); @@ -533,6 +538,7 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) return NULL; switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_RAW: case HCI_CHANNEL_CONTROL: break; default: @@ -820,6 +826,7 @@ static int hci_sock_release(struct socket *sock) case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); break; + case HCI_CHANNEL_RAW: case HCI_CHANNEL_CONTROL: /* Send event to monitor */ skb = create_monitor_ctrl_close(sk); @@ -958,6 +965,27 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, goto done; } + /* When calling an ioctl on an unbound raw socket, then ensure + * that the monitor gets informed. Ensure that the resulting event + * is only send once by checking if the cookie exists or not. The + * socket cookie will be only ever generated once for the lifetime + * of a given socket. + */ + if (hci_sock_gen_cookie(sk)) { + struct sk_buff *skb; + + if (capable(CAP_NET_ADMIN)) + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + release_sock(sk); switch (cmd) { @@ -1061,6 +1089,26 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, hci_pi(sk)->channel = haddr.hci_channel; hci_pi(sk)->hdev = hdev; + + /* Only send the event to monitor when a new cookie has + * been generated. An existing cookie means that an unbound + * socket has seen an ioctl and that triggered the cookie + * generation and sending of the monitor event. + */ + if (hci_sock_gen_cookie(sk)) { + struct sk_buff *skb; + + if (capable(CAP_NET_ADMIN)) + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } break; case HCI_CHANNEL_USER: -- cgit v1.2.1 From f4cdbb3f25c15c17a952deae1f2e0db6df8f1948 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 30 Aug 2016 05:00:40 +0200 Subject: Bluetooth: Handle HCI raw socket transition from unbound to bound In case an unbound HCI raw socket is later on bound, ensure that the monitor notification messages indicate a close and re-open. None of the userspace tools use the socket this, but it is actually possible to use an ioctl on an unbound socket and then later bind it. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 53 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index c7772436f508..83e9fdb712e5 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1049,6 +1049,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, struct sockaddr_hci haddr; struct sock *sk = sock->sk; struct hci_dev *hdev = NULL; + struct sk_buff *skb; int len, err = 0; BT_DBG("sock %p sk %p", sock, sk); @@ -1088,27 +1089,34 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, } hci_pi(sk)->channel = haddr.hci_channel; - hci_pi(sk)->hdev = hdev; - - /* Only send the event to monitor when a new cookie has - * been generated. An existing cookie means that an unbound - * socket has seen an ioctl and that triggered the cookie - * generation and sending of the monitor event. - */ - if (hci_sock_gen_cookie(sk)) { - struct sk_buff *skb; - - if (capable(CAP_NET_ADMIN)) - hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); - /* Send event to monitor */ - skb = create_monitor_ctrl_open(sk); + if (!hci_sock_gen_cookie(sk)) { + /* In the case when a cookie has already been assigned, + * then there has been already an ioctl issued against + * an unbound socket and with that triggerd an open + * notification. Send a close notification first to + * allow the state transition to bounded. + */ + skb = create_monitor_ctrl_close(sk); if (skb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } + + if (capable(CAP_NET_ADMIN)) + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + + hci_pi(sk)->hdev = hdev; + + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } break; case HCI_CHANNEL_USER: @@ -1251,9 +1259,20 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, * are changes to settings, class of device, name etc. */ if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) { - struct sk_buff *skb; - - hci_sock_gen_cookie(sk); + if (!hci_sock_gen_cookie(sk)) { + /* In the case when a cookie has already been + * assigned, this socket will transtion from + * a raw socket into a control socket. To + * allow for a clean transtion, send the + * close notification first. + */ + skb = create_monitor_ctrl_close(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } /* Send event to monitor */ skb = create_monitor_ctrl_open(sk); -- cgit v1.2.1 From 321c6feed2519a2691f65e41c4d62332d6ee3d52 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 1 Sep 2016 16:46:23 +0200 Subject: Bluetooth: Add framework for Extended Controller Information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This command is used to retrieve the current state and basic information of a controller. It is typically used right after getting the response to the Read Controller Index List command or an Index Added event (or its extended counterparts). When any of the values in the EIR_Data field changes, the event Extended Controller Information Changed will be used to inform clients about the updated information. Signed-off-by: Marcel Holtmann Signed-off-by: Michał Narajowski --- net/bluetooth/mgmt.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 47efdb4a669a..69001f415efa 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -104,6 +104,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_REMOVE_ADVERTISING, MGMT_OP_GET_ADV_SIZE_INFO, MGMT_OP_START_LIMITED_DISCOVERY, + MGMT_OP_READ_EXT_INFO, }; static const u16 mgmt_events[] = { @@ -141,6 +142,7 @@ static const u16 mgmt_events[] = { MGMT_EV_LOCAL_OOB_DATA_UPDATED, MGMT_EV_ADVERTISING_ADDED, MGMT_EV_ADVERTISING_REMOVED, + MGMT_EV_EXT_INFO_CHANGED, }; static const u16 mgmt_untrusted_commands[] = { @@ -149,6 +151,7 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_UNCONF_INDEX_LIST, MGMT_OP_READ_CONFIG_INFO, MGMT_OP_READ_EXT_INDEX_LIST, + MGMT_OP_READ_EXT_INFO, }; static const u16 mgmt_untrusted_events[] = { @@ -162,6 +165,7 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_NEW_CONFIG_OPTIONS, MGMT_EV_EXT_INDEX_ADDED, MGMT_EV_EXT_INDEX_REMOVED, + MGMT_EV_EXT_INFO_CHANGED, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -862,6 +866,52 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev, sizeof(rp)); } +static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_rp_read_ext_info rp; + + BT_DBG("sock %p %s", sk, hdev->name); + + hci_dev_lock(hdev); + + memset(&rp, 0, sizeof(rp)); + + bacpy(&rp.bdaddr, &hdev->bdaddr); + + rp.version = hdev->hci_ver; + rp.manufacturer = cpu_to_le16(hdev->manufacturer); + + rp.supported_settings = cpu_to_le32(get_supported_settings(hdev)); + rp.current_settings = cpu_to_le32(get_current_settings(hdev)); + + rp.eir_len = cpu_to_le16(0); + + hci_dev_unlock(hdev); + + /* If this command is called at least once, then the events + * for class of device and local name changes are disabled + * and only the new extended controller information event + * is used. + */ + hci_sock_set_flag(sk, HCI_MGMT_EXT_INFO_EVENTS); + hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); + hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, &rp, + sizeof(rp)); +} + +static int ext_info_changed(struct hci_dev *hdev, struct sock *skip) +{ + struct mgmt_ev_ext_info_changed ev; + + ev.eir_len = cpu_to_le16(0); + + return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, &ev, + sizeof(ev), HCI_MGMT_EXT_INFO_EVENTS, skip); +} + static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) { __le32 settings = cpu_to_le32(get_current_settings(hdev)); @@ -2995,6 +3045,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data, len, HCI_MGMT_LOCAL_NAME_EVENTS, sk); + ext_info_changed(hdev, sk); goto failed; } @@ -6356,6 +6407,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE }, { get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE }, { start_limited_discovery, MGMT_START_DISCOVERY_SIZE }, + { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, + HCI_MGMT_UNTRUSTED }, }; void mgmt_index_added(struct hci_dev *hdev) @@ -6494,10 +6547,12 @@ void __mgmt_power_off(struct hci_dev *hdev) mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); - if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) + if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, zero_cod, sizeof(zero_cod), HCI_MGMT_DEV_CLASS_EVENTS, NULL); + ext_info_changed(hdev, NULL); + } new_settings(hdev, match.sk); @@ -7093,9 +7148,11 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match); mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); - if (!status) + if (!status) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL); + ext_info_changed(hdev, NULL); + } if (match.sk) sock_put(match.sk); @@ -7126,6 +7183,7 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL); + ext_info_changed(hdev, cmd ? cmd->sk : NULL); } static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16]) -- cgit v1.2.1 From 8a0c9f49090fe8ae122fd1bbf7260c8492289386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Narajowski?= Date: Thu, 1 Sep 2016 16:46:24 +0200 Subject: Bluetooth: Append local name and CoD to Extended Controller Info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds device class, complete local name and short local name to EIR data in Extended Controller Info as specified in docs. Signed-off-by: Michał Narajowski Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 63 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 69001f415efa..74179b92ef22 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -866,26 +866,58 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev, sizeof(rp)); } +static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, + u8 data_len) +{ + eir[eir_len++] = sizeof(type) + data_len; + eir[eir_len++] = type; + memcpy(&eir[eir_len], data, data_len); + eir_len += data_len; + + return eir_len; +} + static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { - struct mgmt_rp_read_ext_info rp; + struct mgmt_rp_read_ext_info *rp; + char buff[512]; + u16 eir_len = 0; + u8 name_len; BT_DBG("sock %p %s", sk, hdev->name); hci_dev_lock(hdev); - memset(&rp, 0, sizeof(rp)); + if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + eir_len = eir_append_data(buff, eir_len, + EIR_CLASS_OF_DEV, + hdev->dev_class, 3); - bacpy(&rp.bdaddr, &hdev->bdaddr); + name_len = strlen(hdev->dev_name); + eir_len = eir_append_data(buff, eir_len, EIR_NAME_COMPLETE, + hdev->dev_name, name_len); - rp.version = hdev->hci_ver; - rp.manufacturer = cpu_to_le16(hdev->manufacturer); + name_len = strlen(hdev->short_name); + eir_len = eir_append_data(buff, eir_len, EIR_NAME_SHORT, + hdev->short_name, name_len); - rp.supported_settings = cpu_to_le32(get_supported_settings(hdev)); - rp.current_settings = cpu_to_le32(get_current_settings(hdev)); + rp = kmalloc(sizeof(*rp) + eir_len, GFP_KERNEL); + if (!rp) + return -ENOMEM; + + memset(rp, 0, sizeof(*rp) + eir_len); + + rp->eir_len = cpu_to_le16(eir_len); + memcpy(rp->eir, buff, eir_len); - rp.eir_len = cpu_to_le16(0); + bacpy(&rp->bdaddr, &hdev->bdaddr); + + rp->version = hdev->hci_ver; + rp->manufacturer = cpu_to_le16(hdev->manufacturer); + + rp->supported_settings = cpu_to_le32(get_supported_settings(hdev)); + rp->current_settings = cpu_to_le32(get_current_settings(hdev)); hci_dev_unlock(hdev); @@ -898,8 +930,8 @@ static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); - return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, &rp, - sizeof(rp)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, rp, + sizeof(*rp) + eir_len); } static int ext_info_changed(struct hci_dev *hdev, struct sock *skip) @@ -5552,17 +5584,6 @@ unlock: return err; } -static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, - u8 data_len) -{ - eir[eir_len++] = sizeof(type) + data_len; - eir[eir_len++] = type; - memcpy(&eir[eir_len], data, data_len); - eir_len += data_len; - - return eir_len; -} - static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { -- cgit v1.2.1 From aa1638dde75d00e4f549902017d0df48b77e86ff Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 1 Sep 2016 19:48:28 +0200 Subject: Bluetooth: Send control open and close messages for HCI user channels When opening and closing HCI user channel, send monitoring messages to be able to trace its behavior. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 83e9fdb712e5..48f9471e7c85 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -493,6 +493,11 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) ver[0] = BT_SUBSYS_VERSION; put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); break; + case HCI_CHANNEL_USER: + format = 0x0001; + ver[0] = BT_SUBSYS_VERSION; + put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); + break; case HCI_CHANNEL_CONTROL: format = 0x0002; mgmt_fill_version_info(ver); @@ -539,6 +544,7 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: + case HCI_CHANNEL_USER: case HCI_CHANNEL_CONTROL: break; default: @@ -827,6 +833,7 @@ static int hci_sock_release(struct socket *sock) atomic_dec(&monitor_promisc); break; case HCI_CHANNEL_RAW: + case HCI_CHANNEL_USER: case HCI_CHANNEL_CONTROL: /* Send event to monitor */ skb = create_monitor_ctrl_close(sk); @@ -1179,8 +1186,36 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, } hci_pi(sk)->channel = haddr.hci_channel; + + if (!hci_sock_gen_cookie(sk)) { + /* In the case when a cookie has already been assigned, + * this socket will transition from a raw socket into + * an user channel socket. For a clean transition, send + * the close notification first. + */ + skb = create_monitor_ctrl_close(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + + /* The user channel is restricted to CAP_NET_ADMIN + * capabilities and with that implicitly trusted. + */ + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + hci_pi(sk)->hdev = hdev; + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + atomic_inc(&hdev->promisc); break; -- cgit v1.2.1 From baab793225c9badf46309f56982eb1012dbaac80 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 4 Sep 2016 05:13:46 +0200 Subject: Bluetooth: Fix wrong New Settings event when closing HCI User Channel When closing HCI User Channel, the New Settings event was send out to inform about changed settings. However such event is wrong since the exclusive HCI User Channel access is active until the Index Added event has been sent. @ USER Close: test @ MGMT Event: New Settings (0x0006) plen 4 Current settings: 0x00000ad0 Bondable Secure Simple Pairing BR/EDR Low Energy Secure Connections = Close Index: 00:14:EF:22:04:12 @ MGMT Event: Index Added (0x0004) plen 0 Calling __mgmt_power_off from hci_dev_do_close requires an extra check for an active HCI User Channel. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ddf8432fe8fb..3ac89e9ace71 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1562,6 +1562,7 @@ int hci_dev_do_close(struct hci_dev *hdev) auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); if (!auto_off && hdev->dev_type == HCI_PRIMARY && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT)) __mgmt_power_off(hdev); -- cgit v1.2.1 From 3c0975a7a1087add3bf873601f0270aa695d7616 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Dalleau?= Date: Thu, 8 Sep 2016 12:00:11 +0200 Subject: Bluetooth: Fix reason code used for rejecting SCO connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A comment in the code states that SCO connection should be rejected with the proper error value between 0xd-0xf. The code uses HCI_ERROR_REMOTE_LOW_RESOURCES which is 0x14. This led to following error: < HCI Command: Reject Synchronous Co.. (0x01|0x002a) plen 7 Address: 34:51:C9:EF:02:CA (Apple, Inc.) Reason: Remote Device Terminated due to Low Resources (0x14) > HCI Event: Command Status (0x0f) plen 4 Reject Synchronous Connection Request (0x01|0x002a) ncmd 1 Status: Invalid HCI Command Parameters (0x12) Instead make use of HCI_ERROR_REJ_LIMITED_RESOURCES which is 0xd. Signed-off-by: Frédéric Dalleau Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 9968b1c7c03a..9566ff8e3223 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1717,7 +1717,7 @@ void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn, * function. To be safe hard-code one of the * values that's suitable for SCO. */ - rej.reason = HCI_ERROR_REMOTE_LOW_RESOURCES; + rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(rej), &rej); -- cgit v1.2.1 From 3e36ca483a642f441b8e29b4e98091f2c62bfb38 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 10 Sep 2016 12:21:22 +0000 Subject: Bluetooth: Use kzalloc instead of kmalloc/memset Use kzalloc rather than kmalloc followed by memset with 0. Generated by: scripts/coccinelle/api/alloc/kzalloc-simple.cocci Signed-off-by: Wei Yongjun Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 74179b92ef22..0ac881cfc646 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -902,12 +902,10 @@ static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, eir_len = eir_append_data(buff, eir_len, EIR_NAME_SHORT, hdev->short_name, name_len); - rp = kmalloc(sizeof(*rp) + eir_len, GFP_KERNEL); + rp = kzalloc(sizeof(*rp) + eir_len, GFP_KERNEL); if (!rp) return -ENOMEM; - memset(rp, 0, sizeof(*rp) + eir_len); - rp->eir_len = cpu_to_le16(eir_len); memcpy(rp->eir, buff, eir_len); -- cgit v1.2.1 From 83ebb9ec734e9e768a9fae469e4a7ed1762ef43a Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Fri, 9 Sep 2016 20:24:40 +0200 Subject: Bluetooth: Fix not registering BR/EDR SMP channel with force_bredr flag If force_bredr is set SMP BR/EDR channel should also be for non-SC capable controllers. Since hcidev flag is persistent wrt power toggle it can be already set when calling smp_register(). This resulted in SMP BR/EDR channel not being registered even if HCI_FORCE_BREDR_SMP flag was set. This also fix NULL pointer dereference when trying to disable force_bredr after power cycle. BUG: unable to handle kernel NULL pointer dereference at 0000000000000388 IP: [] smp_del_chan+0x18/0x80 [bluetooth] Call Trace: [] force_bredr_smp_write+0xba/0x100 [bluetooth] [] full_proxy_write+0x54/0x90 [] __vfs_write+0x37/0x160 [] ? selinux_file_permission+0xd7/0x110 [] ? security_file_permission+0x3d/0xc0 [] ? percpu_down_read+0x12/0x50 [] vfs_write+0xb5/0x1a0 [] SyS_write+0x55/0xc0 [] entry_SYSCALL_64_fastpath+0x1a/0xa4 Code: 48 8b 45 f0 eb c1 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 f6 05 c6 3b 02 00 04 55 48 89 e5 41 54 53 49 89 fc 75 4b <49> 8b 9c 24 88 03 00 00 48 85 db 74 31 49 c7 84 24 88 03 00 00 RIP [] smp_del_chan+0x18/0x80 [bluetooth] RSP CR2: 0000000000000388 Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/smp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 4c1a16a96ae5..43faf2aea2ab 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -3387,7 +3387,10 @@ int smp_register(struct hci_dev *hdev) if (!lmp_sc_capable(hdev)) { debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs, hdev, &force_bredr_smp_fops); - return 0; + + /* Flag can be already set here (due to power toggle) */ + if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) + return 0; } if (WARN_ON(hdev->smp_bredr_data)) { -- cgit v1.2.1 From 7c295c4801b2de24fc25687eb0cb73cf0c99d114 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Narajowski?= Date: Sun, 18 Sep 2016 12:50:02 +0200 Subject: Bluetooth: Add support for local name in scan rsp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch enables appending local name to scan response data. If currently advertised instance has name flag set it is expired immediately. Signed-off-by: Michał Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 28 +++++++++++++++++++-------- net/bluetooth/mgmt.c | 46 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 9566ff8e3223..0ce6cdd278b2 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -971,14 +971,14 @@ void __hci_req_enable_advertising(struct hci_request *req) hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); } -static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) +static u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len) { - u8 ad_len = 0; size_t name_len; + int max_len; + max_len = HCI_MAX_AD_LENGTH - ad_len - 2; name_len = strlen(hdev->dev_name); - if (name_len > 0) { - size_t max_len = HCI_MAX_AD_LENGTH - ad_len - 2; + if (name_len > 0 && max_len > 0) { if (name_len > max_len) { name_len = max_len; @@ -997,22 +997,34 @@ static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) return ad_len; } +static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) +{ + return append_local_name(hdev, ptr, 0); +} + static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, u8 *ptr) { struct adv_info *adv_instance; + u32 instance_flags; + u8 scan_rsp_len = 0; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) return 0; - /* TODO: Set the appropriate entries based on advertising instance flags - * here once flags other than 0 are supported. - */ + instance_flags = adv_instance->flags; + memcpy(ptr, adv_instance->scan_rsp_data, adv_instance->scan_rsp_len); - return adv_instance->scan_rsp_len; + scan_rsp_len += adv_instance->scan_rsp_len; + ptr += adv_instance->scan_rsp_len; + + if (instance_flags & MGMT_ADV_FLAG_LOCAL_NAME) + scan_rsp_len = append_local_name(hdev, ptr, scan_rsp_len); + + return scan_rsp_len; } void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 0ac881cfc646..89954bb19222 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3012,6 +3012,35 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev, HCI_OP_USER_PASSKEY_NEG_REPLY, 0); } +static void adv_expire(struct hci_dev *hdev, u32 flags) +{ + struct adv_info *adv_instance; + struct hci_request req; + int err; + + adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance); + if (!adv_instance) + return; + + /* stop if current instance doesn't need to be changed */ + if (!(adv_instance->flags & flags)) + return; + + cancel_adv_timeout(hdev); + + adv_instance = hci_get_next_instance(hdev, adv_instance->instance); + if (!adv_instance) + return; + + hci_req_init(&req, hdev); + err = __hci_req_schedule_adv_instance(&req, adv_instance->instance, + true); + if (err) + return; + + hci_req_run(&req, NULL); +} + static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct mgmt_cp_set_local_name *cp; @@ -3027,13 +3056,17 @@ static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode) cp = cmd->param; - if (status) + if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, mgmt_status(status)); - else + } else { mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, cp, sizeof(*cp)); + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + adv_expire(hdev, MGMT_ADV_FLAG_LOCAL_NAME); + } + mgmt_pending_remove(cmd); unlock: @@ -5885,6 +5918,7 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) flags |= MGMT_ADV_FLAG_DISCOV; flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; flags |= MGMT_ADV_FLAG_MANAGED_FLAGS; + flags |= MGMT_ADV_FLAG_LOCAL_NAME; if (hdev->adv_tx_power != HCI_TX_POWER_INVALID) flags |= MGMT_ADV_FLAG_TX_POWER; @@ -5961,6 +5995,10 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, tx_power_managed = true; max_len -= 3; } + } else { + /* at least 1 byte of name should fit in */ + if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) + max_len -= 3; } if (len > max_len) @@ -6293,6 +6331,10 @@ static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) if (adv_flags & MGMT_ADV_FLAG_TX_POWER) max_len -= 3; + } else { + /* at least 1 byte of name should fit in */ + if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) + max_len -= 3; } return max_len; -- cgit v1.2.1 From c4960ecf2b09210930964ef2c05ce2590802ccf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Narajowski?= Date: Sun, 18 Sep 2016 12:50:03 +0200 Subject: Bluetooth: Add support for appearance in scan rsp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch enables prepending appearance value to scan response data. It also adds support for setting appearance value through mgmt command. If currently advertised instance has apperance flag set it is expired immediately. Signed-off-by: Michał Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 8 ++++++++ net/bluetooth/mgmt.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 0ce6cdd278b2..c8135680c43e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1015,6 +1015,14 @@ static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, instance_flags = adv_instance->flags; + if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) { + ptr[0] = 3; + ptr[1] = EIR_APPEARANCE; + put_unaligned_le16(hdev->appearance, ptr + 2); + scan_rsp_len += 4; + ptr += 4; + } + memcpy(ptr, adv_instance->scan_rsp_data, adv_instance->scan_rsp_len); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 89954bb19222..78d708851208 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -105,6 +105,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_GET_ADV_SIZE_INFO, MGMT_OP_START_LIMITED_DISCOVERY, MGMT_OP_READ_EXT_INFO, + MGMT_OP_SET_APPEARANCE, }; static const u16 mgmt_events[] = { @@ -3143,6 +3144,34 @@ failed: return err; } +static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, + u16 len) +{ + struct mgmt_cp_set_appearance *cp = data; + u16 apperance; + int err; + + BT_DBG(""); + + apperance = le16_to_cpu(cp->appearance); + + hci_dev_lock(hdev); + + if (hdev->appearance != apperance) { + hdev->appearance = apperance; + + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE); + } + + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL, + 0); + + hci_dev_unlock(hdev); + + return err; +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -5918,6 +5947,7 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) flags |= MGMT_ADV_FLAG_DISCOV; flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; flags |= MGMT_ADV_FLAG_MANAGED_FLAGS; + flags |= MGMT_ADV_FLAG_APPEARANCE; flags |= MGMT_ADV_FLAG_LOCAL_NAME; if (hdev->adv_tx_power != HCI_TX_POWER_INVALID) @@ -5999,6 +6029,9 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, /* at least 1 byte of name should fit in */ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) max_len -= 3; + + if (adv_flags & MGMT_ADV_FLAG_APPEARANCE) + max_len -= 4; } if (len > max_len) @@ -6335,6 +6368,9 @@ static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) /* at least 1 byte of name should fit in */ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) max_len -= 3; + + if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE)) + max_len -= 4; } return max_len; @@ -6470,6 +6506,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { start_limited_discovery, MGMT_START_DISCOVERY_SIZE }, { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, HCI_MGMT_UNTRUSTED }, + { set_appearance, MGMT_SET_APPEARANCE_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.1 From 5e2c59e84b633e4f7719fdc6a2930f2a311da83a Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Sun, 18 Sep 2016 12:50:04 +0200 Subject: Bluetooth: Remove unused parameter from tlv_data_is_valid function hdev parameter is not used in function. Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 78d708851208..97f70b7fb7b1 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6005,8 +6005,7 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, return err; } -static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, - u8 len, bool is_adv_data) +static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data) { u8 max_len = HCI_MAX_AD_LENGTH; int i, cur_len; @@ -6168,8 +6167,8 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (!tlv_data_is_valid(hdev, flags, cp->data, cp->adv_data_len, true) || - !tlv_data_is_valid(hdev, flags, cp->data + cp->adv_data_len, + if (!tlv_data_is_valid(flags, cp->data, cp->adv_data_len, true) || + !tlv_data_is_valid(flags, cp->data + cp->adv_data_len, cp->scan_rsp_len, false)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); -- cgit v1.2.1 From 2bb36870e8cb29949ef9acec37129cd8e70f1857 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Sun, 18 Sep 2016 12:50:05 +0200 Subject: Bluetooth: Unify advertising instance flags check This unifies max length and TLV validity checks. Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 85 +++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 97f70b7fb7b1..c96b0adc4971 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6005,34 +6005,59 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, return err; } -static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data) +static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) { u8 max_len = HCI_MAX_AD_LENGTH; - int i, cur_len; - bool flags_managed = false; - bool tx_power_managed = false; if (is_adv_data) { if (adv_flags & (MGMT_ADV_FLAG_DISCOV | MGMT_ADV_FLAG_LIMITED_DISCOV | - MGMT_ADV_FLAG_MANAGED_FLAGS)) { - flags_managed = true; + MGMT_ADV_FLAG_MANAGED_FLAGS)) max_len -= 3; - } - if (adv_flags & MGMT_ADV_FLAG_TX_POWER) { - tx_power_managed = true; + if (adv_flags & MGMT_ADV_FLAG_TX_POWER) max_len -= 3; - } } else { /* at least 1 byte of name should fit in */ if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) max_len -= 3; - if (adv_flags & MGMT_ADV_FLAG_APPEARANCE) + if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE)) max_len -= 4; } + return max_len; +} + +static bool flags_managed(u32 adv_flags) +{ + return adv_flags & (MGMT_ADV_FLAG_DISCOV | + MGMT_ADV_FLAG_LIMITED_DISCOV | + MGMT_ADV_FLAG_MANAGED_FLAGS); +} + +static bool tx_power_managed(u32 adv_flags) +{ + return adv_flags & MGMT_ADV_FLAG_TX_POWER; +} + +static bool name_managed(u32 adv_flags) +{ + return adv_flags & MGMT_ADV_FLAG_LOCAL_NAME; +} + +static bool appearance_managed(u32 adv_flags) +{ + return adv_flags & MGMT_ADV_FLAG_APPEARANCE; +} + +static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data) +{ + int i, cur_len; + u8 max_len; + + max_len = tlv_data_max_len(adv_flags, is_adv_data); + if (len > max_len) return false; @@ -6040,10 +6065,20 @@ static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data) for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) { cur_len = data[i]; - if (flags_managed && data[i + 1] == EIR_FLAGS) + if (data[i + 1] == EIR_FLAGS && flags_managed(adv_flags)) + return false; + + if (data[i + 1] == EIR_TX_POWER && tx_power_managed(adv_flags)) + return false; + + if (data[i + 1] == EIR_NAME_COMPLETE && name_managed(adv_flags)) + return false; + + if (data[i + 1] == EIR_NAME_SHORT && name_managed(adv_flags)) return false; - if (tx_power_managed && data[i + 1] == EIR_TX_POWER) + if (data[i + 1] == EIR_APPEARANCE && + appearance_managed(adv_flags)) return false; /* If the current field length would exceed the total data @@ -6351,30 +6386,6 @@ unlock: return err; } -static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) -{ - u8 max_len = HCI_MAX_AD_LENGTH; - - if (is_adv_data) { - if (adv_flags & (MGMT_ADV_FLAG_DISCOV | - MGMT_ADV_FLAG_LIMITED_DISCOV | - MGMT_ADV_FLAG_MANAGED_FLAGS)) - max_len -= 3; - - if (adv_flags & MGMT_ADV_FLAG_TX_POWER) - max_len -= 3; - } else { - /* at least 1 byte of name should fit in */ - if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) - max_len -= 3; - - if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE)) - max_len -= 4; - } - - return max_len; -} - static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { -- cgit v1.2.1 From 9c9db78dc0fbbd95177fefdad008e46ffaa777f2 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Sun, 18 Sep 2016 12:50:06 +0200 Subject: Bluetooth: Fix advertising instance validity check for flags Flags are not allowed in Scan Response. Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c96b0adc4971..2758c6a4425c 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6065,7 +6065,8 @@ static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data) for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) { cur_len = data[i]; - if (data[i + 1] == EIR_FLAGS && flags_managed(adv_flags)) + if (data[i + 1] == EIR_FLAGS && + (!is_adv_data || flags_managed(adv_flags))) return false; if (data[i + 1] == EIR_TX_POWER && tx_power_managed(adv_flags)) -- cgit v1.2.1 From 3310230c5dddfafe3d1ef87f1257812011681aca Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Sun, 18 Sep 2016 12:50:07 +0200 Subject: Bluetooth: Increment management interface revision Increment the mgmt revision due to the recently added Read Extended Controller Information and Set Appearance commands. Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 2758c6a4425c..54dd218d06f7 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 13 +#define MGMT_REVISION 14 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, -- cgit v1.2.1 From 7d5c11da1ff6389511c42448f59456373edfc103 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Mon, 19 Sep 2016 20:25:52 +0200 Subject: Bluetooth: Refactor read_ext_controller_info handler There is no need to allocate heap for reply only to copy stack data to it. This also fix rp memory leak and missing hdev unlock if kmalloc failed. Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 54dd218d06f7..604c48142848 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -881,42 +881,38 @@ static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { - struct mgmt_rp_read_ext_info *rp; - char buff[512]; + char buf[512]; + struct mgmt_rp_read_ext_info *rp = (void *)buf; u16 eir_len = 0; - u8 name_len; + size_t name_len; BT_DBG("sock %p %s", sk, hdev->name); + memset(&buf, 0, sizeof(buf)); + hci_dev_lock(hdev); + bacpy(&rp->bdaddr, &hdev->bdaddr); + + rp->version = hdev->hci_ver; + rp->manufacturer = cpu_to_le16(hdev->manufacturer); + + rp->supported_settings = cpu_to_le32(get_supported_settings(hdev)); + rp->current_settings = cpu_to_le32(get_current_settings(hdev)); + if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - eir_len = eir_append_data(buff, eir_len, - EIR_CLASS_OF_DEV, + eir_len = eir_append_data(rp->eir, eir_len, EIR_CLASS_OF_DEV, hdev->dev_class, 3); name_len = strlen(hdev->dev_name); - eir_len = eir_append_data(buff, eir_len, EIR_NAME_COMPLETE, + eir_len = eir_append_data(rp->eir, eir_len, EIR_NAME_COMPLETE, hdev->dev_name, name_len); name_len = strlen(hdev->short_name); - eir_len = eir_append_data(buff, eir_len, EIR_NAME_SHORT, + eir_len = eir_append_data(rp->eir, eir_len, EIR_NAME_SHORT, hdev->short_name, name_len); - rp = kzalloc(sizeof(*rp) + eir_len, GFP_KERNEL); - if (!rp) - return -ENOMEM; - rp->eir_len = cpu_to_le16(eir_len); - memcpy(rp->eir, buff, eir_len); - - bacpy(&rp->bdaddr, &hdev->bdaddr); - - rp->version = hdev->hci_ver; - rp->manufacturer = cpu_to_le16(hdev->manufacturer); - - rp->supported_settings = cpu_to_le32(get_supported_settings(hdev)); - rp->current_settings = cpu_to_le32(get_current_settings(hdev)); hci_dev_unlock(hdev); -- cgit v1.2.1 From cde7a863d36a4a629c111f37edc2297d6b822a82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Narajowski?= Date: Mon, 19 Sep 2016 20:25:53 +0200 Subject: Bluetooth: Factor appending EIR to separate helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will also be used for Extended Information Event handling. Signed-off-by: Michał Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 604c48142848..2b6fe10256b9 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -878,13 +878,32 @@ static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, return eir_len; } +static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir) +{ + u16 eir_len = 0; + size_t name_len; + + if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + eir_len = eir_append_data(eir, eir_len, EIR_CLASS_OF_DEV, + hdev->dev_class, 3); + + name_len = strlen(hdev->dev_name); + eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE, + hdev->dev_name, name_len); + + name_len = strlen(hdev->short_name); + eir_len = eir_append_data(eir, eir_len, EIR_NAME_SHORT, + hdev->short_name, name_len); + + return eir_len; +} + static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { char buf[512]; struct mgmt_rp_read_ext_info *rp = (void *)buf; - u16 eir_len = 0; - size_t name_len; + u16 eir_len; BT_DBG("sock %p %s", sk, hdev->name); @@ -900,18 +919,8 @@ static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, rp->supported_settings = cpu_to_le32(get_supported_settings(hdev)); rp->current_settings = cpu_to_le32(get_current_settings(hdev)); - if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - eir_len = eir_append_data(rp->eir, eir_len, EIR_CLASS_OF_DEV, - hdev->dev_class, 3); - - name_len = strlen(hdev->dev_name); - eir_len = eir_append_data(rp->eir, eir_len, EIR_NAME_COMPLETE, - hdev->dev_name, name_len); - - name_len = strlen(hdev->short_name); - eir_len = eir_append_data(rp->eir, eir_len, EIR_NAME_SHORT, - hdev->short_name, name_len); + eir_len = append_eir_data_to_buf(hdev, rp->eir); rp->eir_len = cpu_to_le16(eir_len); hci_dev_unlock(hdev); -- cgit v1.2.1 From 6a9e90bff9cfb33d5939c29e5bf2674c9176365d Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Mon, 19 Sep 2016 20:25:54 +0200 Subject: Bluetooth: Add appearance to Read Ext Controller Info command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If LE is enabled appearance is added to EIR data. Signed-off-by: Michał Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 2b6fe10256b9..d3837e0633af 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -878,6 +878,16 @@ static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, return eir_len; } +static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data) +{ + eir[eir_len++] = sizeof(type) + sizeof(data); + eir[eir_len++] = type; + put_unaligned_le16(data, &eir[eir_len]); + eir_len += sizeof(data); + + return eir_len; +} + static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir) { u16 eir_len = 0; @@ -887,6 +897,10 @@ static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir) eir_len = eir_append_data(eir, eir_len, EIR_CLASS_OF_DEV, hdev->dev_class, 3); + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + eir_len = eir_append_le16(eir, eir_len, EIR_APPEARANCE, + hdev->appearance); + name_len = strlen(hdev->dev_name); eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE, hdev->dev_name, name_len); -- cgit v1.2.1 From 5e9fae48f800b973e45887ce0b8d717d54c0bb11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Narajowski?= Date: Mon, 19 Sep 2016 20:25:55 +0200 Subject: Bluetooth: Add supported data types to ext info changed event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds EIR data to extended info changed event. Signed-off-by: Michał Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index d3837e0633af..29e5ce95c50c 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -954,12 +954,18 @@ static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, static int ext_info_changed(struct hci_dev *hdev, struct sock *skip) { - struct mgmt_ev_ext_info_changed ev; + char buf[512]; + struct mgmt_ev_ext_info_changed *ev = (void *)buf; + u16 eir_len; - ev.eir_len = cpu_to_le16(0); + memset(buf, 0, sizeof(buf)); + + eir_len = append_eir_data_to_buf(hdev, ev->eir); + ev->eir_len = cpu_to_le16(eir_len); - return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, &ev, - sizeof(ev), HCI_MGMT_EXT_INFO_EVENTS, skip); + return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, ev, + sizeof(*ev) + eir_len, + HCI_MGMT_EXT_INFO_EVENTS, skip); } static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) -- cgit v1.2.1 From e74317f43f5ce2d13cddaab867c59d42934d9585 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Narajowski?= Date: Mon, 19 Sep 2016 20:25:56 +0200 Subject: Bluetooth: Fix missing ext info event when setting appearance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds missing event when setting appearance, just like in the set local name command. Signed-off-by: Michał Narajowski Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 29e5ce95c50c..cd9f345894e0 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3187,6 +3187,8 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, if (hci_dev_test_flag(hdev, HCI_LE_ADV)) adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE); + + ext_info_changed(hdev, sk); } err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL, -- cgit v1.2.1 From af4168c5a925dc3b11b0246c2b91124327919f47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Narajowski?= Date: Mon, 19 Sep 2016 14:33:33 +0200 Subject: Bluetooth: Set appearance only for LE capable controllers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setting appearance on controllers without LE support will result in No Supported error. Signed-off-by: Michał Narajowski Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index cd9f345894e0..7b2bac492fb1 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3178,6 +3178,10 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG(""); + if (!lmp_le_capable(hdev)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE, + MGMT_STATUS_NOT_SUPPORTED); + apperance = le16_to_cpu(cp->appearance); hci_dev_lock(hdev); -- cgit v1.2.1 From d979a39d7242e0601bf9b60e89628fb8ac577179 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 19 Sep 2016 14:44:38 -0700 Subject: cgroup: duplicate cgroup reference when cloning sockets When a socket is cloned, the associated sock_cgroup_data is duplicated but not its reference on the cgroup. As a result, the cgroup reference count will underflow when both sockets are destroyed later on. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Link: http://lkml.kernel.org/r/20160914194846.11153-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Cc: Michal Hocko Cc: Vladimir Davydov Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/sock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 25dab8b60223..fd7b41edf1ce 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1362,7 +1362,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); - cgroup_sk_alloc(&sk->sk_cgrp_data); } return sk; @@ -1422,6 +1421,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); + cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); } @@ -1566,6 +1566,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + + cgroup_sk_alloc(&newsk->sk_cgrp_data); + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) -- cgit v1.2.1 From 07b26c9454a2a19fff86d6fcf2aba6bc801eb8d8 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 19 Sep 2016 12:58:47 +0200 Subject: gso: Support partial splitting at the frag_list pointer Since commit 8a29111c7 ("net: gro: allow to build full sized skb") gro may build buffers with a frag_list. This can hurt forwarding because most NICs can't offload such packets, they need to be segmented in software. This patch splits buffers with a frag_list at the frag_list pointer into buffers that can be TSO offloaded. Signed-off-by: Steffen Klassert Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/skbuff.c | 51 +++++++++++++++++++++++++++++++++++++++----------- net/ipv4/af_inet.c | 14 ++++++++++---- net/ipv4/gre_offload.c | 6 ++++-- net/ipv4/tcp_offload.c | 13 +++++++------ net/ipv4/udp_offload.c | 6 ++++-- net/ipv6/ip6_offload.c | 5 ++++- 6 files changed, 69 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1e329d411242..7bf82a28e10a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3097,11 +3097,31 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); - /* GSO partial only requires that we trim off any excess that - * doesn't fit into an MSS sized block, so take care of that - * now. - */ - if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) { + if (sg && csum && (mss != GSO_BY_FRAGS)) { + if (!(features & NETIF_F_GSO_PARTIAL)) { + struct sk_buff *iter; + + if (!list_skb || + !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) + goto normal; + + /* Split the buffer at the frag_list pointer. + * This is based on the assumption that all + * buffers in the chain excluding the last + * containing the same amount of data. + */ + skb_walk_frags(head_skb, iter) { + if (skb_headlen(iter)) + goto normal; + + len -= iter->len; + } + } + + /* GSO partial only requires that we trim off any excess that + * doesn't fit into an MSS sized block, so take care of that + * now. + */ partial_segs = len / mss; if (partial_segs > 1) mss *= partial_segs; @@ -3109,6 +3129,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, partial_segs = 0; } +normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -3300,21 +3321,29 @@ perform_csum_check: */ segs->prev = tail; - /* Update GSO info on first skb in partial sequence. */ if (partial_segs) { + struct sk_buff *iter; int type = skb_shinfo(head_skb)->gso_type; + unsigned short gso_size = skb_shinfo(head_skb)->gso_size; /* Update type to add partial and then remove dodgy if set */ - type |= SKB_GSO_PARTIAL; + type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; type &= ~SKB_GSO_DODGY; /* Update GSO info and prepare to start updating headers on * our way back down the stack of protocols. */ - skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size; - skb_shinfo(segs)->gso_segs = partial_segs; - skb_shinfo(segs)->gso_type = type; - SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset; + for (iter = segs; iter; iter = iter->next) { + skb_shinfo(iter)->gso_size = gso_size; + skb_shinfo(iter)->gso_segs = partial_segs; + skb_shinfo(iter)->gso_type = type; + SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; + } + + if (tail->len - doffset <= gso_size) + skb_shinfo(tail)->gso_size = 0; + else if (tail != segs) + skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); } /* Following permits correct backpressure, for protocols diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e94b47be0019..1effc986739e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1192,7 +1192,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { - bool udpfrag = false, fixedid = false, encap; + bool udpfrag = false, fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; unsigned int offset = 0; @@ -1245,6 +1245,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (IS_ERR_OR_NULL(segs)) goto out; + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); @@ -1259,9 +1261,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; } - tot_len = skb_shinfo(skb)->gso_size + - SKB_GSO_CB(skb)->data_offset + - skb->head - (unsigned char *)iph; + + if (gso_partial) + tot_len = skb_shinfo(skb)->gso_size + + SKB_GSO_CB(skb)->data_offset + + skb->head - (unsigned char *)iph; + else + tot_len = skb->len - nhoff; } else { if (!fixedid) iph->id = htons(id++); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index ecd1e09dbbf1..96e0efecefa6 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; - bool need_csum, ufo; + bool need_csum, ufo, gso_partial; if (!skb->encapsulation) goto out; @@ -69,6 +69,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, goto out; } + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + outer_hlen = skb_tnl_header_len(skb); gre_offset = outer_hlen - tnl_hlen; skb = segs; @@ -96,7 +98,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); - if (skb_is_gso(skb)) { + if (gso_partial) { unsigned int partial_adj; /* Adjust checksum to account for the fact that diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 5c5964962d0c..bc68da38ea86 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -90,12 +90,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, goto out; } - /* GSO partial only requires splitting the frame into an MSS - * multiple and possibly a remainder. So update the mss now. - */ - if (features & NETIF_F_GSO_PARTIAL) - mss = skb->len - (skb->len % mss); - copy_destructor = gso_skb->destructor == tcp_wfree; ooo_okay = gso_skb->ooo_okay; /* All segments but the first should have ooo_okay cleared */ @@ -108,6 +102,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, /* Only first segment might have ooo_okay set */ segs->ooo_okay = ooo_okay; + /* GSO partial and frag_list segmentation only requires splitting + * the frame into an MSS multiple and possibly a remainder, both + * cases return a GSO skb. So update the mss now. + */ + if (skb_is_gso(segs)) + mss *= skb_shinfo(segs)->gso_segs; + delta = htonl(oldlen + (thlen + mss)); skb = segs; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 81f253b6ff36..f9333c963607 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, __be16 new_protocol, bool is_ipv6) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - bool remcsum, need_csum, offload_csum, ufo; + bool remcsum, need_csum, offload_csum, ufo, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; @@ -88,6 +88,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, goto out; } + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + outer_hlen = skb_tnl_header_len(skb); udp_offset = outer_hlen - tnl_hlen; skb = segs; @@ -117,7 +119,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * will be using a length value equal to only one MSS sized * segment instead of the entire frame. */ - if (skb_is_gso(skb)) { + if (gso_partial) { uh->len = htons(skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)uh); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 22e90e56b5a9..e7bfd55899a3 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int offset = 0; bool encap, udpfrag; int nhoff; + bool gso_partial; skb_reset_network_header(skb); nhoff = skb_network_header(skb) - skb_mac_header(skb); @@ -101,9 +102,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (IS_ERR(segs)) goto out; + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - if (skb_is_gso(skb)) + if (gso_partial) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); -- cgit v1.2.1 From 6a5d58b67e205f2ffc62d0a9ee4ef7d237e9a7fb Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 18 Sep 2016 07:31:42 -0400 Subject: net sched ife action: add 16 bit helpers encoder and checker for 16 bits metadata Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_ife.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index e87cd81315e1..ccf7b4b655fe 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -63,6 +63,23 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval) } EXPORT_SYMBOL_GPL(ife_tlv_meta_encode); +int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi) +{ + u16 edata = 0; + + if (mi->metaval) + edata = *(u16 *)mi->metaval; + else if (metaval) + edata = metaval; + + if (!edata) /* will not encode */ + return 0; + + edata = htons(edata); + return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata); +} +EXPORT_SYMBOL_GPL(ife_encode_meta_u16); + int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi) { if (mi->metaval) @@ -81,6 +98,15 @@ int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi) } EXPORT_SYMBOL_GPL(ife_check_meta_u32); +int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi) +{ + if (metaval || mi->metaval) + return 8; /* T+L+(V) == 2+2+(2+2bytepad) */ + + return 0; +} +EXPORT_SYMBOL_GPL(ife_check_meta_u16); + int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi) { u32 edata = metaval; -- cgit v1.2.1 From 408fbc22ef1efb00dd896acd00e9f7d9b641e047 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 18 Sep 2016 07:31:43 -0400 Subject: net sched ife action: Introduce skb tcindex metadata encap decap Sample use case of how this is encoded: user space via tuntap (or a connected VM/Machine/container) encodes the tcindex TLV. Sample use case of decoding: IFE action decodes it and the skb->tc_index is then used to classify. So something like this for encoded ICMP packets: .. first decode then reclassify... skb->tcindex will be set sudo $TC filter add dev $ETH parent ffff: prio 2 protocol 0xbeef \ u32 match u32 0 0 flowid 1:1 \ action ife decode reclassify ...next match the decode icmp packet... sudo $TC filter add dev $ETH parent ffff: prio 4 protocol ip \ u32 match ip protocol 1 0xff flowid 1:1 \ action continue ... last classify it using the tcindex classifier and do someaction.. sudo $TC filter add dev $ETH parent ffff: prio 5 protocol ip \ handle 0x11 tcindex classid 1:1 \ action blah.. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/Kconfig | 5 +++ net/sched/Makefile | 1 + net/sched/act_meta_skbtcindex.c | 79 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 net/sched/act_meta_skbtcindex.c (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 7795d5a3f79a..87956a768d1b 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -793,6 +793,11 @@ config NET_IFE_SKBPRIO depends on NET_ACT_IFE ---help--- +config NET_IFE_SKBTCINDEX + tristate "Support to encoding decoding skb tcindex on IFE action" + depends on NET_ACT_IFE + ---help--- + config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index 148ae0d5ac2c..4bdda3634e0b 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o +obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c new file mode 100644 index 000000000000..3b35774ce890 --- /dev/null +++ b/net/sched/act_meta_skbtcindex.c @@ -0,0 +1,79 @@ +/* + * net/sched/act_meta_tc_index.c IFE skb->tc_index metadata module + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * copyright Jamal Hadi Salim (2016) + * +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int skbtcindex_encode(struct sk_buff *skb, void *skbdata, + struct tcf_meta_info *e) +{ + u32 ifetc_index = skb->tc_index; + + return ife_encode_meta_u16(ifetc_index, skbdata, e); +} + +static int skbtcindex_decode(struct sk_buff *skb, void *data, u16 len) +{ + u16 ifetc_index = *(u16 *)data; + + skb->tc_index = ntohs(ifetc_index); + return 0; +} + +static int skbtcindex_check(struct sk_buff *skb, struct tcf_meta_info *e) +{ + return ife_check_meta_u16(skb->tc_index, e); +} + +static struct tcf_meta_ops ife_skbtcindex_ops = { + .metaid = IFE_META_TCINDEX, + .metatype = NLA_U16, + .name = "tc_index", + .synopsis = "skb tc_index 16 bit metadata", + .check_presence = skbtcindex_check, + .encode = skbtcindex_encode, + .decode = skbtcindex_decode, + .get = ife_get_meta_u16, + .alloc = ife_alloc_meta_u16, + .release = ife_release_meta_gen, + .validate = ife_validate_meta_u16, + .owner = THIS_MODULE, +}; + +static int __init ifetc_index_init_module(void) +{ + return register_ife_op(&ife_skbtcindex_ops); +} + +static void __exit ifetc_index_cleanup_module(void) +{ + unregister_ife_op(&ife_skbtcindex_ops); +} + +module_init(ifetc_index_init_module); +module_exit(ifetc_index_cleanup_module); + +MODULE_AUTHOR("Jamal Hadi Salim(2016)"); +MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX); -- cgit v1.2.1 From f71b109f1730902b73f70d78764d8a41265080dd Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Sun, 18 Sep 2016 07:53:08 -0400 Subject: net sched actions police: peg drop stats for conforming traffic setting conforming action to drop is a valid policy. When it is set we need to at least see the stats indicating it for debugging. Signed-off-by: Roman Mashak Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_police.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 8a3be1d99775..ba7074b391ae 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -249,6 +249,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, police->tcfp_t_c = now; police->tcfp_toks = toks; police->tcfp_ptoks = ptoks; + if (police->tcfp_result == TC_ACT_SHOT) + police->tcf_qstats.drops++; spin_unlock(&police->tcf_lock); return police->tcfp_result; } -- cgit v1.2.1 From 5a7a5555a362f60350668cd124df9a396f546c61 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 18 Sep 2016 08:45:33 -0400 Subject: net sched: stylistic cleanups Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_api.c | 16 ++++++---------- net/sched/act_csum.c | 36 ++++++++++++++++++------------------ net/sched/act_gact.c | 3 ++- net/sched/act_mirred.c | 3 ++- net/sched/act_police.c | 10 ++++------ net/sched/cls_api.c | 18 ++++++++++-------- net/sched/cls_bpf.c | 6 ++++-- net/sched/cls_flow.c | 21 ++++++++++++++------- net/sched/cls_flower.c | 3 ++- net/sched/cls_fw.c | 10 +++++----- net/sched/cls_route.c | 9 +++------ net/sched/cls_tcindex.c | 12 ++++++------ net/sched/cls_u32.c | 30 ++++++++++++------------------ net/sched/sch_api.c | 41 ++++++++++++++++++++++++++--------------- 14 files changed, 114 insertions(+), 104 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d09d0687594b..d0aceb1740b1 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -592,9 +592,8 @@ err_out: return ERR_PTR(err); } -int tcf_action_init(struct net *net, struct nlattr *nla, - struct nlattr *est, char *name, int ovr, - int bind, struct list_head *actions) +int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind, struct list_head *actions) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -923,9 +922,8 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, return err; } -static int -tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, - u32 portid, int ovr) +static int tcf_action_add(struct net *net, struct nlattr *nla, + struct nlmsghdr *n, u32 portid, int ovr) { int ret = 0; LIST_HEAD(actions); @@ -988,8 +986,7 @@ replay: return ret; } -static struct nlattr * -find_dump_kind(const struct nlmsghdr *n) +static struct nlattr *find_dump_kind(const struct nlmsghdr *n) { struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; @@ -1016,8 +1013,7 @@ find_dump_kind(const struct nlmsghdr *n) return kind; } -static int -tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) +static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index b5dbf633a863..e0defcef376d 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -116,8 +116,8 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, return (void *)(skb_network_header(skb) + ihl); } -static int tcf_csum_ipv4_icmp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl) +static int tcf_csum_ipv4_icmp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) { struct icmphdr *icmph; @@ -152,8 +152,8 @@ static int tcf_csum_ipv4_igmp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv6_icmp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl) +static int tcf_csum_ipv6_icmp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) { struct icmp6hdr *icmp6h; const struct ipv6hdr *ip6h; @@ -174,8 +174,8 @@ static int tcf_csum_ipv6_icmp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv4_tcp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl) +static int tcf_csum_ipv4_tcp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) { struct tcphdr *tcph; const struct iphdr *iph; @@ -195,8 +195,8 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv6_tcp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl) +static int tcf_csum_ipv6_tcp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) { struct tcphdr *tcph; const struct ipv6hdr *ip6h; @@ -217,8 +217,8 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv4_udp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl, int udplite) +static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl, int udplite) { struct udphdr *udph; const struct iphdr *iph; @@ -270,8 +270,8 @@ ignore_obscure_skb: return 1; } -static int tcf_csum_ipv6_udp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl, int udplite) +static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl, int udplite) { struct udphdr *udph; const struct ipv6hdr *ip6h; @@ -380,8 +380,8 @@ fail: return 0; } -static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, - unsigned int ixhl, unsigned int *pl) +static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, unsigned int ixhl, + unsigned int *pl) { int off, len, optlen; unsigned char *xh = (void *)ip6xh; @@ -494,8 +494,8 @@ fail: return 0; } -static int tcf_csum(struct sk_buff *skb, - const struct tc_action *a, struct tcf_result *res) +static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_csum *p = to_tcf_csum(a); int action; @@ -531,8 +531,8 @@ drop: return TC_ACT_SHOT; } -static int tcf_csum_dump(struct sk_buff *skb, - struct tc_action *a, int bind, int ref) +static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, + int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_csum *p = to_tcf_csum(a); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e24a4093d6f6..e0aa30f83c6c 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -156,7 +156,8 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, packets); + _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, + packets); if (action == TC_ACT_SHOT) this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 6038c85d92f5..1c76387c5d9c 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -204,7 +204,8 @@ out: return retval; } -static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, + int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = to_mirred(a); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index ba7074b391ae..d1bd248fe146 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -263,8 +263,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, return police->tcf_action; } -static int -tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_police *police = to_police(a); @@ -349,14 +349,12 @@ static struct pernet_operations police_net_ops = { .size = sizeof(struct tc_action_net), }; -static int __init -police_init_module(void) +static int __init police_init_module(void) { return tcf_register_action(&act_police_ops, &police_net_ops); } -static void __exit -police_cleanup_module(void) +static void __exit police_cleanup_module(void) { tcf_unregister_action(&act_police_ops, &police_net_ops); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a7c5645373af..11da7da0b7c4 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -344,13 +344,15 @@ replay: if (err == 0) { struct tcf_proto *next = rtnl_dereference(tp->next); - tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); + tfilter_notify(net, skb, n, tp, fh, + RTM_DELTFILTER); if (tcf_destroy(tp, false)) RCU_INIT_POINTER(*back, next); } goto errout; case RTM_GETTFILTER: - err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); + err = tfilter_notify(net, skb, n, tp, fh, + RTM_NEWTFILTER); goto errout; default: err = -EINVAL; @@ -448,7 +450,8 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct net *net = sock_net(a->skb->sk); return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, - a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWTFILTER); } /* called with RTNL */ @@ -552,7 +555,7 @@ void tcf_exts_destroy(struct tcf_exts *exts) EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, - struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) + struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) { #ifdef CONFIG_NET_CLS_ACT { @@ -560,8 +563,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tb[exts->police], rate_tlv, - "police", ovr, - TCA_ACT_BIND); + "police", ovr, TCA_ACT_BIND); if (IS_ERR(act)) return PTR_ERR(act); @@ -573,8 +575,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, int err, i = 0; err = tcf_action_init(net, tb[exts->action], rate_tlv, - NULL, ovr, - TCA_ACT_BIND, &actions); + NULL, ovr, TCA_ACT_BIND, + &actions); if (err) return err; list_for_each_entry(act, &actions, list) diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1d92d4d3f222..c6f7a47541eb 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -55,7 +55,8 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, [TCA_BPF_FLAGS] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 }, - [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, + [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, + .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, @@ -409,7 +410,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, goto errout; } - ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr); + ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], + ovr); if (ret < 0) goto errout; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index a379bae1d74e..e39672394c7b 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -87,12 +87,14 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } -static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_proto(const struct sk_buff *skb, + const struct flow_keys *flow) { return flow->basic.ip_proto; } -static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_proto_src(const struct sk_buff *skb, + const struct flow_keys *flow) { if (flow->ports.ports) return ntohs(flow->ports.src); @@ -100,7 +102,8 @@ static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys return addr_fold(skb->sk); } -static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_proto_dst(const struct sk_buff *skb, + const struct flow_keys *flow) { if (flow->ports.ports) return ntohs(flow->ports.dst); @@ -149,7 +152,8 @@ static u32 flow_get_nfct(const struct sk_buff *skb) }) #endif -static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_nfct_src(const struct sk_buff *skb, + const struct flow_keys *flow) { switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): @@ -161,7 +165,8 @@ fallback: return flow_get_src(skb, flow); } -static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_nfct_dst(const struct sk_buff *skb, + const struct flow_keys *flow) { switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): @@ -173,14 +178,16 @@ fallback: return flow_get_dst(skb, flow); } -static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, + const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, src.u.all)); fallback: return flow_get_proto_src(skb, flow); } -static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, + const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, dst.u.all)); fallback: diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index a3f4c706dfaa..2af09c872a1a 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -241,7 +241,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, tc.type = TC_SETUP_CLSFLOWER; tc.cls_flower = &offload; - err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); + err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, + &tc); if (tc_skip_sw(flags)) return err; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index cc0bda945800..9dc63d54e167 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -57,7 +57,7 @@ static u32 fw_hash(u32 handle) } static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) + struct tcf_result *res) { struct fw_head *head = rcu_dereference_bh(tp->root); struct fw_filter *f; @@ -188,7 +188,8 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { static int fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, - struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr) + struct nlattr **tb, struct nlattr **tca, unsigned long base, + bool ovr) { struct fw_head *head = rtnl_dereference(tp->root); struct tcf_exts e; @@ -237,9 +238,8 @@ errout: static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, - u32 handle, - struct nlattr **tca, - unsigned long *arg, bool ovr) + u32 handle, struct nlattr **tca, unsigned long *arg, + bool ovr) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = (struct fw_filter *) *arg; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index c91e65d81a48..a4ce39b19be0 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -268,8 +268,7 @@ static int route4_init(struct tcf_proto *tp) return 0; } -static void -route4_delete_filter(struct rcu_head *head) +static void route4_delete_filter(struct rcu_head *head) { struct route4_filter *f = container_of(head, struct route4_filter, rcu); @@ -474,10 +473,8 @@ errout: } static int route4_change(struct net *net, struct sk_buff *in_skb, - struct tcf_proto *tp, unsigned long base, - u32 handle, - struct nlattr **tca, - unsigned long *arg, bool ovr) + struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, unsigned long *arg, bool ovr) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter __rcu **fp; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index d9500709831f..96144bdf30db 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -50,14 +50,13 @@ struct tcindex_data { struct rcu_head rcu; }; -static inline int -tcindex_filter_is_set(struct tcindex_filter_result *r) +static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) { return tcf_exts_is_predicative(&r->exts) || r->res.classid; } -static struct tcindex_filter_result * -tcindex_lookup(struct tcindex_data *p, u16 key) +static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, + u16 key) { if (p->perfect) { struct tcindex_filter_result *f = p->perfect + key; @@ -144,7 +143,8 @@ static void tcindex_destroy_rexts(struct rcu_head *head) static void tcindex_destroy_fexts(struct rcu_head *head) { - struct tcindex_filter *f = container_of(head, struct tcindex_filter, rcu); + struct tcindex_filter *f = container_of(head, struct tcindex_filter, + rcu); tcf_exts_destroy(&f->result.exts); kfree(f); @@ -550,7 +550,7 @@ static bool tcindex_destroy(struct tcf_proto *tp, bool force) static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index a29263a9d8c1..ae83c3aec308 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -104,7 +104,8 @@ static inline unsigned int u32_hash_fold(__be32 key, return h; } -static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) +static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) { struct { struct tc_u_knode *knode; @@ -256,8 +257,7 @@ deadloop: return -1; } -static struct tc_u_hnode * -u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) +static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) { struct tc_u_hnode *ht; @@ -270,8 +270,7 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) return ht; } -static struct tc_u_knode * -u32_lookup_key(struct tc_u_hnode *ht, u32 handle) +static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle) { unsigned int sel; struct tc_u_knode *n = NULL; @@ -360,8 +359,7 @@ static int u32_init(struct tcf_proto *tp) return 0; } -static int u32_destroy_key(struct tcf_proto *tp, - struct tc_u_knode *n, +static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, bool free_pf) { tcf_exts_destroy(&n->exts); @@ -448,9 +446,8 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) } } -static int u32_replace_hw_hnode(struct tcf_proto *tp, - struct tc_u_hnode *h, - u32 flags) +static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, + u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload u32_offload = {0}; @@ -496,9 +493,8 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) } } -static int u32_replace_hw_knode(struct tcf_proto *tp, - struct tc_u_knode *n, - u32 flags) +static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, + u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload u32_offload = {0}; @@ -763,8 +759,7 @@ errout: return err; } -static void u32_replace_knode(struct tcf_proto *tp, - struct tc_u_common *tp_c, +static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, struct tc_u_knode *n) { struct tc_u_knode __rcu **ins; @@ -845,8 +840,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, - unsigned long *arg, bool ovr) + struct nlattr **tca, unsigned long *arg, bool ovr) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; @@ -1088,7 +1082,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t) { struct tc_u_knode *n = (struct tc_u_knode *)fh; struct tc_u_hnode *ht_up, *ht_down; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d677b3484d81..206dc24add3a 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -389,7 +389,8 @@ static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) static struct qdisc_rate_table *qdisc_rtab_list; -struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, + struct nlattr *tab) { struct qdisc_rate_table *rtab; @@ -541,7 +542,8 @@ nla_put_failure: return -1; } -void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab) +void __qdisc_calculate_pkt_len(struct sk_buff *skb, + const struct qdisc_size_table *stab) { int pkt_len, slot; @@ -888,10 +890,10 @@ static struct lock_class_key qdisc_rx_lock; Parameters are passed via opt. */ -static struct Qdisc * -qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, - struct Qdisc *p, u32 parent, u32 handle, - struct nlattr **tca, int *errp) +static struct Qdisc *qdisc_create(struct net_device *dev, + struct netdev_queue *dev_queue, + struct Qdisc *p, u32 parent, u32 handle, + struct nlattr **tca, int *errp) { int err; struct nlattr *kind = tca[TCA_KIND]; @@ -1073,7 +1075,8 @@ struct check_loop_arg { int depth; }; -static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); +static int check_loop_fn(struct Qdisc *q, unsigned long cl, + struct qdisc_walker *w); static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) { @@ -1450,7 +1453,8 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, } else { if (!tc_qdisc_dump_ignore(q) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWQDISC) <= 0) goto done; q_idx++; } @@ -1471,7 +1475,8 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, } if (!tc_qdisc_dump_ignore(q) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWQDISC) <= 0) goto done; q_idx++; } @@ -1505,7 +1510,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_q_idx = 0; q_idx = 0; - if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, true) < 0) + if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, + true) < 0) goto done; dev_queue = dev_ingress_queue(dev); @@ -1640,7 +1646,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n) if (cops->delete) err = cops->delete(q, cl); if (err == 0) - tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS); + tclass_notify(net, skb, n, q, cl, + RTM_DELTCLASS); goto out; case RTM_GETTCLASS: err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); @@ -1738,12 +1745,14 @@ struct qdisc_dump_args { struct netlink_callback *cb; }; -static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) +static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, + struct qdisc_walker *arg) { struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, - a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWTCLASS); } static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, @@ -1976,10 +1985,12 @@ static int __init pktsched_init(void) rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL); rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL); + rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, + NULL); rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL); rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL); + rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, + NULL); return 0; } -- cgit v1.2.1 From a435a07f9164dda7c0c26e8ad758881f4bafc127 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Sun, 18 Sep 2016 17:46:07 +0200 Subject: net: ipv6: fallback to full lookup if table lookup is unsuitable Commit 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") introduced a regression: insertion of an IPv6 route in a table not containing the appropriate connected route for the gateway but which contained a non-connected route (like a default gateway) fails while it was previously working: $ ip link add eth0 type dummy $ ip link set up dev eth0 $ ip addr add 2001:db8::1/64 dev eth0 $ ip route add ::/0 via 2001:db8::5 dev eth0 table 20 $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 RTNETLINK answers: No route to host $ ip -6 route show table 20 default via 2001:db8::5 dev eth0 metric 1024 pref medium After this patch, we get: $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 $ ip -6 route show table 20 2001:db8:cafe::1 via 2001:db8::6 dev eth0 metric 1024 pref medium default via 2001:db8::5 dev eth0 metric 1024 pref medium Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") Signed-off-by: Vincent Bernat Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 49817555449e..e3a224b97905 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1986,9 +1986,18 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; - if (cfg->fc_table) + if (cfg->fc_table) { grt = ip6_nh_lookup_table(net, cfg, gw_addr); + if (grt) { + if (grt->rt6i_flags & RTF_GATEWAY || + (dev && dev != grt->dst.dev)) { + ip6_rt_put(grt); + grt = NULL; + } + } + } + if (!grt) grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); -- cgit v1.2.1 From 83e7e4ce9e93c3b020497144f4354b62aed5d894 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 19 Sep 2016 19:00:10 +0800 Subject: mac80211: Use rhltable instead of rhashtable mac80211 currently uses rhashtable with insecure_elasticity set to true. The latter is because of duplicate objects. What's more, mac80211 walks the rhashtable chains by hand which is broken as rhashtable may contain multiple tables due to resizing or rehashing. This patch fixes it by converting it to the newly added rhltable interface which is designed for use with duplicate objects. With rhltable a lookup returns a list of objects instead of a single one. This is then fed into the existing for_each_sta_info macro. This patch also deletes the sta_addr_hash function since rhashtable defaults to jhash. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/rx.c | 7 ++----- net/mac80211/sta_info.c | 52 +++++++++++++++++++--------------------------- net/mac80211/sta_info.h | 19 +++++++---------- net/mac80211/status.c | 7 ++----- 5 files changed, 33 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index c71c73594790..e496dee5af08 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1213,7 +1213,7 @@ struct ieee80211_local { spinlock_t tim_lock; unsigned long num_sta; struct list_head sta_list; - struct rhashtable sta_hash; + struct rhltable sta_hash; struct timer_list sta_cleanup; int sta_generation; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e796060b7c5e..f7cf342bab52 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4003,7 +4003,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, __le16 fc; struct ieee80211_rx_data rx; struct ieee80211_sub_if_data *prev; - struct rhash_head *tmp; + struct rhlist_head *tmp; int err = 0; fc = ((struct ieee80211_hdr *)skb->data)->frame_control; @@ -4046,13 +4046,10 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, goto out; } else if (ieee80211_is_data(fc)) { struct sta_info *sta, *prev_sta; - const struct bucket_table *tbl; prev_sta = NULL; - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - - for_each_sta_info(local, tbl, hdr->addr2, sta, tmp) { + for_each_sta_info(local, hdr->addr2, sta, tmp) { if (!prev_sta) { prev_sta = sta; continue; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 1b1b28ff4fdb..c803e2cb58bc 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -67,12 +67,10 @@ static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ - .insecure_elasticity = true, /* Disable chain-length checks. */ .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), .key_offset = offsetof(struct sta_info, addr), .key_len = ETH_ALEN, - .hashfn = sta_addr_hash, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; @@ -80,8 +78,8 @@ static const struct rhashtable_params sta_rht_params = { static int sta_info_hash_del(struct ieee80211_local *local, struct sta_info *sta) { - return rhashtable_remove_fast(&local->sta_hash, &sta->hash_node, - sta_rht_params); + return rhltable_remove(&local->sta_hash, &sta->hash_node, + sta_rht_params); } static void __cleanup_single_sta(struct sta_info *sta) @@ -157,19 +155,22 @@ static void cleanup_single_sta(struct sta_info *sta) sta_info_free(local, sta); } +struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, + const u8 *addr) +{ + return rhltable_lookup(&local->sta_hash, addr, sta_rht_params); +} + /* protected by RCU */ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; + struct rhlist_head *tmp; struct sta_info *sta; - struct rhash_head *tmp; - const struct bucket_table *tbl; rcu_read_lock(); - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - - for_each_sta_info(local, tbl, addr, sta, tmp) { + for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata) { rcu_read_unlock(); /* this is safe as the caller must already hold @@ -190,14 +191,11 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; + struct rhlist_head *tmp; struct sta_info *sta; - struct rhash_head *tmp; - const struct bucket_table *tbl; rcu_read_lock(); - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - - for_each_sta_info(local, tbl, addr, sta, tmp) { + for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); @@ -263,8 +261,8 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) static int sta_info_hash_add(struct ieee80211_local *local, struct sta_info *sta) { - return rhashtable_insert_fast(&local->sta_hash, &sta->hash_node, - sta_rht_params); + return rhltable_insert(&local->sta_hash, &sta->hash_node, + sta_rht_params); } static void sta_deliver_ps_frames(struct work_struct *wk) @@ -453,9 +451,9 @@ static int sta_info_insert_check(struct sta_info *sta) is_multicast_ether_addr(sta->sta.addr))) return -EINVAL; - /* Strictly speaking this isn't necessary as we hold the mutex, but - * the rhashtable code can't really deal with that distinction. We - * do require the mutex for correctness though. + /* The RCU read lock is required by rhashtable due to + * asynchronous resize/rehash. We also require the mutex + * for correctness. */ rcu_read_lock(); lockdep_assert_held(&sdata->local->sta_mtx); @@ -1043,16 +1041,11 @@ static void sta_info_cleanup(unsigned long data) round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); } -u32 sta_addr_hash(const void *key, u32 length, u32 seed) -{ - return jhash(key, ETH_ALEN, seed); -} - int sta_info_init(struct ieee80211_local *local) { int err; - err = rhashtable_init(&local->sta_hash, &sta_rht_params); + err = rhltable_init(&local->sta_hash, &sta_rht_params); if (err) return err; @@ -1068,7 +1061,7 @@ int sta_info_init(struct ieee80211_local *local) void sta_info_stop(struct ieee80211_local *local) { del_timer_sync(&local->sta_cleanup); - rhashtable_destroy(&local->sta_hash); + rhltable_destroy(&local->sta_hash); } @@ -1138,17 +1131,14 @@ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *localaddr) { struct ieee80211_local *local = hw_to_local(hw); + struct rhlist_head *tmp; struct sta_info *sta; - struct rhash_head *tmp; - const struct bucket_table *tbl; - - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); /* * Just return a random station if localaddr is NULL * ... first in list. */ - for_each_sta_info(local, tbl, addr, sta, tmp) { + for_each_sta_info(local, addr, sta, tmp) { if (localaddr && !ether_addr_equal(sta->sdata->vif.addr, localaddr)) continue; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 530231b73278..ed5fcb984a01 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -455,7 +455,7 @@ struct sta_info { /* General information, mostly static */ struct list_head list, free_list; struct rcu_head rcu_head; - struct rhash_head hash_node; + struct rhlist_head hash_node; u8 addr[ETH_ALEN]; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; @@ -638,6 +638,9 @@ rcu_dereference_protected_tid_tx(struct sta_info *sta, int tid) */ #define STA_INFO_CLEANUP_INTERVAL (10 * HZ) +struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, + const u8 *addr); + /* * Get a STA info, must be under RCU read lock. */ @@ -647,17 +650,9 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); -u32 sta_addr_hash(const void *key, u32 length, u32 seed); - -#define _sta_bucket_idx(_tbl, _a) \ - rht_bucket_index(_tbl, sta_addr_hash(_a, ETH_ALEN, (_tbl)->hash_rnd)) - -#define for_each_sta_info(local, tbl, _addr, _sta, _tmp) \ - rht_for_each_entry_rcu(_sta, _tmp, tbl, \ - _sta_bucket_idx(tbl, _addr), \ - hash_node) \ - /* compare address and run code only if it matches */ \ - if (ether_addr_equal(_sta->addr, (_addr))) +#define for_each_sta_info(local, _addr, _sta, _tmp) \ + rhl_for_each_entry_rcu(_sta, _tmp, \ + sta_info_hash_lookup(local, _addr), hash_node) /* * Get STA info by index, BROKEN! diff --git a/net/mac80211/status.c b/net/mac80211/status.c index ea39f8a7baf3..ddf71c648cab 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -746,8 +746,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); __le16 fc; struct ieee80211_supported_band *sband; + struct rhlist_head *tmp; struct sta_info *sta; - struct rhash_head *tmp; int retry_count; int rates_idx; bool send_to_cooked; @@ -755,7 +755,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) struct ieee80211_bar *bar; int shift = 0; int tid = IEEE80211_NUM_TIDS; - const struct bucket_table *tbl; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); @@ -764,9 +763,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) sband = local->hw.wiphy->bands[info->band]; fc = hdr->frame_control; - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - - for_each_sta_info(local, tbl, hdr->addr1, sta, tmp) { + for_each_sta_info(local, hdr->addr1, sta, tmp) { /* skip wrong virtual interface */ if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr)) continue; -- cgit v1.2.1 From 06f8ec9041f02d44bb0b75d47668e2fe00d5e0c3 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Mon, 19 Sep 2016 15:28:00 +0200 Subject: net-next: dsa: fix duplicate invocation of set_addr() commit 83c0afaec7b730b ("net: dsa: Add new binding implementation") has a duplicate invocation of the set_addr() operation callback. Remove one of them. Signed-off-by: John Crispin Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 8278385dcd21..cffc19e972a1 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -308,10 +308,6 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err < 0) return err; - err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); - if (err < 0) - return err; - if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) -- cgit v1.2.1 From 092183df0fa1f4b49baad3a980c55d55de07dfb7 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Mon, 19 Sep 2016 15:28:01 +0200 Subject: net-next: dsa: make the set_addr() operation optional Only 1 of the 3 drivers currently has a set_addr() operation. Make the set_addr() callback optional to reduce the amount of empty stubs inside the drivers. Signed-off-by: John Crispin Signed-off-by: David S. Miller --- net/dsa/dsa.c | 8 +++++--- net/dsa/dsa2.c | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 66e31acfcad8..a6902c1e2f28 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -378,9 +378,11 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) if (ret < 0) goto out; - ret = ops->set_addr(ds, dst->master_netdev->dev_addr); - if (ret < 0) - goto out; + if (ops->set_addr) { + ret = ops->set_addr(ds, dst->master_netdev->dev_addr); + if (ret < 0) + goto out; + } if (!ds->slave_mii_bus && ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(parent); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index cffc19e972a1..f8a7d9aab437 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -304,9 +304,11 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err < 0) return err; - err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); - if (err < 0) - return err; + if (ds->ops->set_addr) { + err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); + if (err < 0) + return err; + } if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); -- cgit v1.2.1 From 190aa3e77880a05332ea1ccb382a51285d57adb5 Mon Sep 17 00:00:00 2001 From: pravin shelar Date: Mon, 19 Sep 2016 13:50:59 -0700 Subject: openvswitch: Fix Frame-size larger than 1024 bytes warning. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need to declare separate key on stack, we can just use sw_flow->key to store the key directly. This commit fixes following warning: net/openvswitch/datapath.c: In function ‘ovs_flow_cmd_new’: net/openvswitch/datapath.c:1080:1: warning: the frame size of 1040 bytes is larger than 1024 bytes [-Wframe-larger-than=] Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0536ab3504d5..474e7a6bfeb7 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -928,7 +928,6 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; - struct sw_flow_key key; struct sw_flow_actions *acts; struct sw_flow_match match; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); @@ -956,20 +955,24 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - ovs_match_init(&match, &key, &mask); + ovs_match_init(&match, &new_flow->key, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; - ovs_flow_mask_key(&new_flow->key, &key, true, &mask); - /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], - &key, log); + &new_flow->key, log); if (error) goto err_kfree_flow; + /* unmasked key is needed to match when ufid is not used. */ + if (ovs_identifier_is_key(&new_flow->id)) + match.key = new_flow->id.unmasked_key; + + ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask); + /* Validate actions. */ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); @@ -996,7 +999,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (ovs_identifier_is_ufid(&new_flow->id)) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); if (!flow) - flow = ovs_flow_tbl_lookup(&dp->table, &key); + flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); -- cgit v1.2.1 From 2279994d07ab67ff7a1d09bfbd65588332dfb6d8 Mon Sep 17 00:00:00 2001 From: pravin shelar Date: Mon, 19 Sep 2016 13:51:00 -0700 Subject: openvswitch: avoid resetting flow key while installing new flow. since commit commit db74a3335e0f6 ("openvswitch: use percpu flow stats") flow alloc resets flow-key. So there is no need to reset the flow-key again if OVS is using newly allocated flow-key. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 8 ++++---- net/openvswitch/flow.c | 2 -- net/openvswitch/flow_netlink.c | 6 ++++-- net/openvswitch/flow_netlink.h | 3 ++- 4 files changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 474e7a6bfeb7..4d67ea856067 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -955,7 +955,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - ovs_match_init(&match, &new_flow->key, &mask); + ovs_match_init(&match, &new_flow->key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) @@ -1124,7 +1124,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, &mask); + ovs_match_init(&match, &key, true, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); } else if (!ufid_present) { @@ -1241,7 +1241,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, NULL); + ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); } else if (!ufid_present) { @@ -1300,7 +1300,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, NULL); + ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); if (unlikely(err)) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 0fa45439def1..634cc10d6dee 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -767,8 +767,6 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, { int err; - memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE); - /* Extract metadata from netlink attributes. */ err = ovs_nla_get_flow_metadata(net, attr, key, log); if (err) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 8efa718ddb5e..ae25ded82b3b 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1996,13 +1996,15 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key *key, + bool reset_key, struct sw_flow_mask *mask) { memset(match, 0, sizeof(*match)); match->key = key; match->mask = mask; - memset(key, 0, sizeof(*key)); + if (reset_key) + memset(key, 0, sizeof(*key)); if (mask) { memset(&mask->key, 0, sizeof(mask->key)); @@ -2049,7 +2051,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct nlattr *a; int err = 0, start, opts_type; - ovs_match_init(&match, &key, NULL); + ovs_match_init(&match, &key, true, NULL); opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) return opts_type; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 47dd142eca1c..45f9769e5aac 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -41,7 +41,8 @@ size_t ovs_tun_key_attr_size(void); size_t ovs_key_attr_size(void); void ovs_match_init(struct sw_flow_match *match, - struct sw_flow_key *key, struct sw_flow_mask *mask); + struct sw_flow_key *key, bool reset_key, + struct sw_flow_mask *mask); int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, int attr, bool is_mask, struct sk_buff *); -- cgit v1.2.1 From 36bbef52c7eb646ed6247055a2acd3851e317857 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Sep 2016 00:26:13 +0200 Subject: bpf: direct packet write and access for helpers for clsact progs This work implements direct packet access for helpers and direct packet write in a similar fashion as already available for XDP types via commits 4acf6c0b84c9 ("bpf: enable direct packet data write for xdp progs") and 6841de8b0d03 ("bpf: allow helpers access the packet directly"), and as a complementary feature to the already available direct packet read for tc (cls/act) programs. For enabling this, we need to introduce two helpers, bpf_skb_pull_data() and bpf_csum_update(). The first is generally needed for both, read and write, because they would otherwise only be limited to the current linear skb head. Usually, when the data_end test fails, programs just bail out, or, in the direct read case, use bpf_skb_load_bytes() as an alternative to overcome this limitation. If such data sits in non-linear parts, we can just pull them in once with the new helper, retest and eventually access them. At the same time, this also makes sure the skb is uncloned, which is, of course, a necessary condition for direct write. As this needs to be an invariant for the write part only, the verifier detects writes and adds a prologue that is calling bpf_skb_pull_data() to effectively unclone the skb from the very beginning in case it is indeed cloned. The heuristic makes use of a similar trick that was done in 233577a22089 ("net: filter: constify detection of pkt_type_offset"). This comes at zero cost for other programs that do not use the direct write feature. Should a program use this feature only sparsely and has read access for the most parts with, for example, drop return codes, then such write action can be delegated to a tail called program for mitigating this cost of potential uncloning to a late point in time where it would have been paid similarly with the bpf_skb_store_bytes() as well. Advantage of direct write is that the writes are inlined whereas the helper cannot make any length assumptions and thus needs to generate a call to memcpy() also for small sizes, as well as cost of helper call itself with sanity checks are avoided. Plus, when direct read is already used, we don't need to cache or perform rechecks on the data boundaries (due to verifier invalidating previous checks for helpers that change skb->data), so more complex programs using rewrites can benefit from switching to direct read plus write. For direct packet access to helpers, we save the otherwise needed copy into a temp struct sitting on stack memory when use-case allows. Both facilities are enabled via may_access_direct_pkt_data() in verifier. For now, we limit this to map helpers and csum_diff, and can successively enable other helpers where we find it makes sense. Helpers that definitely cannot be allowed for this are those part of bpf_helper_changes_skb_data() since they can change underlying data, and those that write into memory as this could happen for packet typed args when still cloned. bpf_csum_update() helper accommodates for the fact that we need to fixup checksum_complete when using direct write instead of bpf_skb_store_bytes(), meaning the programs can use available helpers like bpf_csum_diff(), and implement csum_add(), csum_sub(), csum_block_add(), csum_block_sub() equivalents in eBPF together with the new helper. A usage example will be provided for iproute2's examples/bpf/ directory. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 134 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 117 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 298b146b47e7..0920c2ac1d00 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1362,6 +1362,11 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, return err; } +static int bpf_try_make_head_writable(struct sk_buff *skb) +{ + return bpf_try_make_writable(skb, skb_headlen(skb)); +} + static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) @@ -1441,6 +1446,28 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; +BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto bpf_skb_pull_data_proto = { + .func = bpf_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -1567,6 +1594,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_STACK, .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, @@ -1575,6 +1603,26 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) +{ + /* The interface is to be used in combination with bpf_csum_diff() + * for direct packet writes. csum rotation for alignment as well + * as emulating csum_sub() can be done from the eBPF program. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + return (skb->csum = csum_add(skb->csum, csum)); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_csum_update_proto = { + .func = bpf_csum_update, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -1602,6 +1650,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; + struct sk_buff *clone; + int ret; if (unlikely(flags & ~(BPF_F_INGRESS))) return -EINVAL; @@ -1610,14 +1660,25 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) if (unlikely(!dev)) return -EINVAL; - skb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb)) + clone = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!clone)) return -ENOMEM; - bpf_push_mac_rcsum(skb); + /* For direct write, we need to keep the invariant that the skbs + * we're dealing with need to be uncloned. Should uncloning fail + * here, we need to free the just generated clone to unclone once + * again. + */ + ret = bpf_try_make_head_writable(skb); + if (unlikely(ret)) { + kfree_skb(clone); + return -ENOMEM; + } + + bpf_push_mac_rcsum(clone); return flags & BPF_F_INGRESS ? - __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); + __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -2063,19 +2124,14 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { bool bpf_helper_changes_skb_data(void *func) { - if (func == bpf_skb_vlan_push) - return true; - if (func == bpf_skb_vlan_pop) - return true; - if (func == bpf_skb_store_bytes) - return true; - if (func == bpf_skb_change_proto) - return true; - if (func == bpf_skb_change_tail) - return true; - if (func == bpf_l3_csum_replace) - return true; - if (func == bpf_l4_csum_replace) + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_tail || + func == bpf_skb_pull_data || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace) return true; return false; @@ -2440,8 +2496,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -2533,6 +2593,45 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) + */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return TC_ACT_SHOT; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); + *insn++ = BPF_EXIT_INSN(); + + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) @@ -2810,6 +2909,7 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, }; static const struct bpf_verifier_ops xdp_ops = { -- cgit v1.2.1 From aecc5cefc389735b5327d234e11d1fe505e1c280 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 19 Sep 2016 19:02:51 -0400 Subject: net sched actions: fix GETing actions With the batch changes that translated transient actions into a temporary list lost in the translation was the fact that tcf_action_destroy() will eventually delete the action from the permanent location if the refcount is zero. Example of what broke: ...add a gact action to drop sudo $TC actions add action drop index 10 ...now retrieve it, looks good sudo $TC actions get action gact index 10 ...retrieve it again and find it is gone! sudo $TC actions get action gact index 10 Fixes: 22dc13c837c3 ("net_sched: convert tcf_exts from list to pointer array"), Fixes: 824a7e8863b3 ("net_sched: remove an unnecessary list_del()") Fixes: f07fed82ad79 ("net_sched: remove the leftover cleanup_a()") Acked-by: Cong Wang Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_api.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d0aceb1740b1..c9102172ce3b 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -592,6 +592,17 @@ err_out: return ERR_PTR(err); } +static void cleanup_a(struct list_head *actions, int ovr) +{ + struct tc_action *a; + + if (!ovr) + return; + + list_for_each_entry(a, actions, list) + a->tcfa_refcnt--; +} + int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, struct list_head *actions) { @@ -611,8 +622,15 @@ int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, goto err; } act->order = i; + if (ovr) + act->tcfa_refcnt++; list_add_tail(&act->list, actions); } + + /* Remove the temp refcnt which was necessary to protect against + * destroying an existing action which was being replaced + */ + cleanup_a(actions, ovr); return 0; err: @@ -882,6 +900,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, goto err; } act->order = i; + if (event == RTM_GETACTION) + act->tcfa_refcnt++; list_add_tail(&act->list, &actions); } -- cgit v1.2.1 From f78e73e27fdeab6f9317667f7e9676b59c1ec1fb Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Mon, 19 Sep 2016 23:39:08 -0400 Subject: tcp: cdg: rename struct minmax in tcp_cdg.c to avoid a naming conflict The upcoming change "lib/win_minmax: windowed min or max estimator" introduces a struct called minmax, which is then included in include/linux/tcp.h in the upcoming change "tcp: use windowed min filter library for TCP min_rtt estimation". This would create a compilation error for tcp_cdg.c, which defines its own minmax struct. To avoid this naming conflict (and potentially others in the future), this commit renames the version used in tcp_cdg.c to cdg_minmax. Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Cc: Kenneth Klette Jonassen Acked-by: Kenneth Klette Jonassen Signed-off-by: David S. Miller --- net/ipv4/tcp_cdg.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 03725b294286..35b280361cb2 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -56,7 +56,7 @@ MODULE_PARM_DESC(use_shadow, "use shadow window heuristic"); module_param(use_tolerance, bool, 0644); MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); -struct minmax { +struct cdg_minmax { union { struct { s32 min; @@ -74,10 +74,10 @@ enum cdg_state { }; struct cdg { - struct minmax rtt; - struct minmax rtt_prev; - struct minmax *gradients; - struct minmax gsum; + struct cdg_minmax rtt; + struct cdg_minmax rtt_prev; + struct cdg_minmax *gradients; + struct cdg_minmax gsum; bool gfilled; u8 tail; u8 state; @@ -353,7 +353,7 @@ static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - struct minmax *gradients; + struct cdg_minmax *gradients; switch (ev) { case CA_EVENT_CWND_RESTART: -- cgit v1.2.1 From 6403389211e1f4d40ed963fe47a96fce1a3ba7a9 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:10 -0400 Subject: tcp: use windowed min filter library for TCP min_rtt estimation Refactor the TCP min_rtt code to reuse the new win_minmax library in lib/win_minmax.c to simplify the TCP code. This is a pure refactor: the functionality is exactly the same. We just moved the windowed min code to make TCP easier to read and maintain, and to allow other parts of the kernel to use the windowed min/max filter code. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 64 ++++-------------------------------------------- net/ipv4/tcp_minisocks.c | 2 +- 3 files changed, 7 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7dae800092e6..e79ed17ccfd6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - tp->rtt_min[0].rtt = ~0U; + minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dad3e7eeed94..6886f386464f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2879,67 +2879,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, *rexmit = REXMIT_LOST; } -/* Kathleen Nichols' algorithm for tracking the minimum value of - * a data stream over some fixed time interval. (E.g., the minimum - * RTT over the past five minutes.) It uses constant space and constant - * time per update yet almost always delivers the same minimum as an - * implementation that has to keep all the data in the window. - * - * The algorithm keeps track of the best, 2nd best & 3rd best min - * values, maintaining an invariant that the measurement time of the - * n'th best >= n-1'th best. It also makes sure that the three values - * are widely separated in the time window since that bounds the worse - * case error when that data is monotonically increasing over the window. - * - * Upon getting a new min, we can forget everything earlier because it - * has no value - the new min is <= everything else in the window by - * definition and it's the most recent. So we restart fresh on every new min - * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd - * best. - */ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { - const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; - struct rtt_meas *m = tcp_sk(sk)->rtt_min; - struct rtt_meas rttm = { - .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1), - .ts = now, - }; - u32 elapsed; - - /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ - if (unlikely(rttm.rtt <= m[0].rtt)) - m[0] = m[1] = m[2] = rttm; - else if (rttm.rtt <= m[1].rtt) - m[1] = m[2] = rttm; - else if (rttm.rtt <= m[2].rtt) - m[2] = rttm; - - elapsed = now - m[0].ts; - if (unlikely(elapsed > wlen)) { - /* Passed entire window without a new min so make 2nd choice - * the new min & 3rd choice the new 2nd. So forth and so on. - */ - m[0] = m[1]; - m[1] = m[2]; - m[2] = rttm; - if (now - m[0].ts > wlen) { - m[0] = m[1]; - m[1] = rttm; - if (now - m[0].ts > wlen) - m[0] = rttm; - } - } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) { - /* Passed a quarter of the window without a new min so - * take 2nd choice from the 2nd quarter of the window. - */ - m[2] = m[1] = rttm; - } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) { - /* Passed half the window without a new min so take the 3rd - * choice from the last half of the window. - */ - m[2] = rttm; - } + struct tcp_sock *tp = tcp_sk(sk); + u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; + + minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp, + rtt_us ? : jiffies_to_usecs(1)); } static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f63c73dc0acb..568947110b60 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -464,7 +464,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->srtt_us = 0; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - newtp->rtt_min[0].rtt = ~0U; + minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; -- cgit v1.2.1 From 77879147a3481babffd7e368d977ab682545a6bd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Sep 2016 23:39:11 -0400 Subject: net_sched: sch_fq: add low_rate_threshold parameter This commit adds to the fq module a low_rate_threshold parameter to insert a delay after all packets if the socket requests a pacing rate below the threshold. This helps achieve more precise control of the sending rate with low-rate paths, especially policers. The basic issue is that if a congestion control module detects a policer at a certain rate, it may want fq to be able to shape to that policed rate. That way the sender can avoid policer drops by having the packets arrive at the policer at or just under the policed rate. The default threshold of 550Kbps was chosen analytically so that for policers or links at 500Kbps or 512Kbps fq would very likely invoke this mechanism, even if the pacing rate was briefly slightly above the available bandwidth. This value was then empirically validated with two years of production testing on YouTube video servers. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index dc52cc10d6ed..5dd929cc1423 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -94,6 +94,7 @@ struct fq_sched_data { u32 flow_max_rate; /* optional max rate per flow */ u32 flow_plimit; /* max packets per flow */ u32 orphan_mask; /* mask for orphaned skb */ + u32 low_rate_threshold; struct rb_root *fq_root; u8 rate_enable; u8 fq_trees_log; @@ -433,7 +434,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; - u32 rate; + u32 rate, plen; skb = fq_dequeue_head(sch, &q->internal); if (skb) @@ -482,7 +483,7 @@ begin: prefetch(&skb->end); f->credit -= qdisc_pkt_len(skb); - if (f->credit > 0 || !q->rate_enable) + if (!q->rate_enable) goto out; /* Do not pace locally generated ack packets */ @@ -493,8 +494,15 @@ begin: if (skb->sk) rate = min(skb->sk->sk_pacing_rate, rate); + if (rate <= q->low_rate_threshold) { + f->credit = 0; + plen = qdisc_pkt_len(skb); + } else { + plen = max(qdisc_pkt_len(skb), q->quantum); + if (f->credit > 0) + goto out; + } if (rate != ~0U) { - u32 plen = max(qdisc_pkt_len(skb), q->quantum); u64 len = (u64)plen * NSEC_PER_SEC; if (likely(rate)) @@ -662,6 +670,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, + [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt) @@ -716,6 +725,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_FQ_FLOW_MAX_RATE]) q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) + q->low_rate_threshold = + nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); + if (tb[TCA_FQ_RATE_ENABLE]) { u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); @@ -781,6 +794,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->fq_root = NULL; q->fq_trees_log = ilog2(1024); q->orphan_mask = 1024 - 1; + q->low_rate_threshold = 550000 / 8; qdisc_watchdog_init(&q->watchdog, sch); if (opt) @@ -811,6 +825,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, jiffies_to_usecs(q->flow_refill_delay)) || nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || + nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, + q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) goto nla_put_failure; -- cgit v1.2.1 From b2d3ea4a730f812b9c0f67a67b6762ce66ddb17c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Sep 2016 23:39:12 -0400 Subject: tcp: switch back to proper tcp_skb_cb size check in tcp_init() Revert to the tcp_skb_cb size check that tcp_init() had before commit b4772ef879a8 ("net: use common macro for assering skb->cb[] available size in protocol families"). As related commit 744d5a3e9fe2 ("net: move skb->dropcount to skb->cb[]") explains, the sock_skb_cb_check_size() mechanism was added to ensure that there is space for dropcount, "for protocol families using it". But TCP is not a protocol using dropcount, so tcp_init() doesn't need to provision space for dropcount in the skb->cb[], and thus we can revert to the older form of the tcp_skb_cb size check. Doing so allows TCP to use 4 more bytes of the skb->cb[] space. Fixes: b4772ef879a8 ("net: use common macro for assering skb->cb[] available size in protocol families") Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e79ed17ccfd6..de02fb4b1349 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3261,11 +3261,12 @@ static void __init tcp_init_mem(void) void __init tcp_init(void) { - unsigned long limit; int max_rshare, max_wshare, cnt; + unsigned long limit; unsigned int i; - sock_skb_cb_check_size(sizeof(struct tcp_skb_cb)); + BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > + FIELD_SIZEOF(struct sk_buff, cb)); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); -- cgit v1.2.1 From 0682e6902a52aca7caf6ad42551b16ea0f87bc31 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:13 -0400 Subject: tcp: count packets marked lost for a TCP connection Count the number of packets that a TCP connection marks lost. Congestion control modules can use this loss rate information for more intelligent decisions about how fast to send. Specifically, this is used in TCP BBR policer detection. BBR uses a high packet loss rate as one signal in its policer detection and policer bandwidth estimation algorithm. The BBR policer detection algorithm cannot simply track retransmits, because a retransmit can be (and often is) an indicator of packets lost long, long ago. This is particularly true in a long CA_Loss period that repairs the initial massive losses when a policer kicks in. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6886f386464f..9413288c2778 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -899,12 +899,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } +/* Sum the number of packets on the wire we have marked as lost. + * There are two cases we care about here: + * a) Packet hasn't been marked lost (nor retransmitted), + * and this is the first loss. + * b) Packet has been marked both lost and retransmitted, + * and this means we think it was lost again. + */ +static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb) +{ + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + if (!(sacked & TCPCB_LOST) || + ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS))) + tp->lost += tcp_skb_pcount(skb); +} + static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) { if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tcp_verify_retransmit_hint(tp, skb); tp->lost_out += tcp_skb_pcount(skb); + tcp_sum_lost(tp, skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; } } @@ -913,6 +930,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) { tcp_verify_retransmit_hint(tp, skb); + tcp_sum_lost(tp, skb); if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1890,6 +1908,7 @@ void tcp_enter_loss(struct sock *sk) struct sk_buff *skb; bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ + bool mark_lost; /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || @@ -1923,8 +1942,12 @@ void tcp_enter_loss(struct sock *sk) if (skb == tcp_send_head(sk)) break; + mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || + is_reneg); + if (mark_lost) + tcp_sum_lost(tp, skb); TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { + if (mark_lost) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); -- cgit v1.2.1 From b9f64820fb226a4e8ab10591f46cecd91ca56b30 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:14 -0400 Subject: tcp: track data delivery rate for a TCP connection This patch generates data delivery rate (throughput) samples on a per-ACK basis. These rate samples can be used by congestion control modules, and specifically will be used by TCP BBR in later patches in this series. Key state: tp->delivered: Tracks the total number of data packets (original or not) delivered so far. This is an already-existing field. tp->delivered_mstamp: the last time tp->delivered was updated. Algorithm: A rate sample is calculated as (d1 - d0)/(t1 - t0) on a per-ACK basis: d1: the current tp->delivered after processing the ACK t1: the current time after processing the ACK d0: the prior tp->delivered when the acked skb was transmitted t0: the prior tp->delivered_mstamp when the acked skb was transmitted When an skb is transmitted, we snapshot d0 and t0 in its control block in tcp_rate_skb_sent(). When an ACK arrives, it may SACK and ACK some skbs. For each SACKed or ACKed skb, tcp_rate_skb_delivered() updates the rate_sample struct to reflect the latest (d0, t0). Finally, tcp_rate_gen() generates a rate sample by storing (d1 - d0) in rs->delivered and (t1 - t0) in rs->interval_us. One caveat: if an skb was sent with no packets in flight, then tp->delivered_mstamp may be either invalid (if the connection is starting) or outdated (if the connection was idle). In that case, we'll re-stamp tp->delivered_mstamp. At first glance it seems t0 should always be the time when an skb was transmitted, but actually this could over-estimate the rate due to phase mismatch between transmit and ACK events. To track the delivery rate, we ensure that if packets are in flight then t0 and and t1 are times at which packets were marked delivered. If the initial and final RTTs are different then one may be corrupted by some sort of noise. The noise we see most often is sending gaps caused by delayed, compressed, or stretched acks. This either affects both RTTs equally or artificially reduces the final RTT. We approach this by recording the info we need to compute the initial RTT (duration of the "send phase" of the window) when we recorded the associated inflight. Then, for a filter to avoid bandwidth overestimates, we generalize the per-sample bandwidth computation from: bw = delivered / ack_phase_rtt to the following: bw = delivered / max(send_phase_rtt, ack_phase_rtt) In large-scale experiments, this filtering approach incorporating send_phase_rtt is effective at avoiding bandwidth overestimates due to ACK compression or stretched ACKs. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/Makefile | 2 +- net/ipv4/tcp_input.c | 46 +++++++++++----- net/ipv4/tcp_output.c | 4 ++ net/ipv4/tcp_rate.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 186 insertions(+), 15 deletions(-) create mode 100644 net/ipv4/tcp_rate.c (limited to 'net') diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 24629b6f57cc..9cfff1a0bf71 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ - tcp_recovery.o \ + tcp_rate.o tcp_recovery.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9413288c2778..d9ed4bb96f74 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1112,6 +1112,7 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; + struct rate_sample *rate; int flag; }; @@ -1279,6 +1280,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, &skb->skb_mstamp); + tcp_rate_skb_delivered(sk, skb, state->rate); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1329,6 +1331,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_advance_highest_sack(sk, skb); tcp_skb_collapse_tstamp(prev, skb); + if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64)) + TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0; + tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); @@ -1558,6 +1563,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, dup_sack, tcp_skb_pcount(skb), &skb->skb_mstamp); + tcp_rate_skb_delivered(sk, skb, state->rate); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1640,8 +1646,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); - if (found_dup_sack) + if (found_dup_sack) { state->flag |= FLAG_DSACKING_ACK; + tp->delivered++; /* A spurious retransmission is delivered */ + } /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -3071,10 +3079,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_snd_una, int *acked, - struct tcp_sacktag_state *sack) + struct tcp_sacktag_state *sack, + struct skb_mstamp *now) { const struct inet_connection_sock *icsk = inet_csk(sk); - struct skb_mstamp first_ackt, last_ackt, now; + struct skb_mstamp first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; @@ -3106,7 +3115,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, acked_pcount = tcp_tso_acked(sk, skb); if (!acked_pcount) break; - fully_acked = false; } else { /* Speedup tcp_unlink_write_queue() and next loop */ @@ -3142,6 +3150,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->packets_out -= acked_pcount; pkts_acked += acked_pcount; + tcp_rate_skb_delivered(sk, skb, sack->rate); /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not @@ -3174,16 +3183,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - skb_mstamp_get(&now); if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { - seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); - ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt); + ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt); } if (sack->first_sackt.v64) { - sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt); - ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); + sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt); + ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt); } - + sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, ca_rtt_us); @@ -3211,7 +3219,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); } else if (skb && rtt_update && sack_rtt_us >= 0 && - sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { + sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3548,17 +3556,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_sacktag_state sack_state; + struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; u32 prior_fackets; int prior_packets = tp->packets_out; - u32 prior_delivered = tp->delivered; + u32 delivered = tp->delivered; + u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + struct skb_mstamp now; sack_state.first_sackt.v64 = 0; + sack_state.rate = &rs; /* We very likely will need to access write queue head. */ prefetchw(sk->sk_write_queue.next); @@ -3581,6 +3593,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; + skb_mstamp_get(&now); + if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -3591,6 +3605,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) } prior_fackets = tp->fackets_out; + rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet * is in window. @@ -3646,7 +3661,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state); + &sack_state, &now); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); @@ -3663,7 +3678,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag); + delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ + lost = tp->lost - lost; /* freshly marked lost */ + tcp_rate_gen(sk, delivered, lost, &now, &rs); + tcp_cong_control(sk, ack, delivered, flag); tcp_xmit_recovery(sk, rexmit); return 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8b45794eb6b2..e02c8ebf3ed4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -918,6 +918,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_mstamp_get(&skb->skb_mstamp); TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; + tcp_rate_skb_sent(sk, skb); if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -1213,6 +1214,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); + /* Update delivered info for the new segment */ + TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx; + /* If this packet has been sent out already, we must * adjust the various packet counters. */ diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c new file mode 100644 index 000000000000..1daed6af6e80 --- /dev/null +++ b/net/ipv4/tcp_rate.c @@ -0,0 +1,149 @@ +#include + +/* The bandwidth estimator estimates the rate at which the network + * can currently deliver outbound data packets for this flow. At a high + * level, it operates by taking a delivery rate sample for each ACK. + * + * A rate sample records the rate at which the network delivered packets + * for this flow, calculated over the time interval between the transmission + * of a data packet and the acknowledgment of that packet. + * + * Specifically, over the interval between each transmit and corresponding ACK, + * the estimator generates a delivery rate sample. Typically it uses the rate + * at which packets were acknowledged. However, the approach of using only the + * acknowledgment rate faces a challenge under the prevalent ACK decimation or + * compression: packets can temporarily appear to be delivered much quicker + * than the bottleneck rate. Since it is physically impossible to do that in a + * sustained fashion, when the estimator notices that the ACK rate is faster + * than the transmit rate, it uses the latter: + * + * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) + * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) + * bw = min(send_rate, ack_rate) + * + * Notice the estimator essentially estimates the goodput, not always the + * network bottleneck link rate when the sending or receiving is limited by + * other factors like applications or receiver window limits. The estimator + * deliberately avoids using the inter-packet spacing approach because that + * approach requires a large number of samples and sophisticated filtering. + */ + + +/* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* In general we need to start delivery rate samples from the + * time we received the most recent ACK, to ensure we include + * the full time the network needs to deliver all in-flight + * packets. If there are no packets in flight yet, then we + * know that any ACKs after now indicate that the network was + * able to deliver those packets completely in the sampling + * interval between now and the next ACK. + * + * Note that we use packets_out instead of tcp_packets_in_flight(tp) + * because the latter is a guess based on RTO and loss-marking + * heuristics. We don't want spurious RTOs or loss markings to cause + * a spuriously small time interval, causing a spuriously high + * bandwidth estimate. + */ + if (!tp->packets_out) { + tp->first_tx_mstamp = skb->skb_mstamp; + tp->delivered_mstamp = skb->skb_mstamp; + } + + TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; +} + +/* When an skb is sacked or acked, we fill in the rate sample with the (prior) + * delivery information when the skb was last transmitted. + * + * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is + * called multiple times. We favor the information from the most recently + * sent skb, i.e., the skb with the highest prior_delivered count. + */ +void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + + if (!scb->tx.delivered_mstamp.v64) + return; + + if (!rs->prior_delivered || + after(scb->tx.delivered, rs->prior_delivered)) { + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; + + /* Find the duration of the "send phase" of this window: */ + rs->interval_us = skb_mstamp_us_delta( + &skb->skb_mstamp, + &scb->tx.first_tx_mstamp); + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = skb->skb_mstamp; + } + /* Mark off the skb delivered once it's sacked to avoid being + * used again when it's cumulatively acked. For acked packets + * we don't need to reset since it'll be freed soon. + */ + if (scb->sacked & TCPCB_SACKED_ACKED) + scb->tx.delivered_mstamp.v64 = 0; +} + +/* Update the connection delivery information and generate a rate sample. */ +void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + struct skb_mstamp *now, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 snd_us, ack_us; + + /* TODO: there are multiple places throughout tcp_ack() to get + * current time. Refactor the code using a new "tcp_acktag_state" + * to carry current time, flags, stats like "tcp_sacktag_state". + */ + if (delivered) + tp->delivered_mstamp = *now; + + rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ + rs->losses = lost; /* freshly marked lost */ + /* Return an invalid sample if no timing information is available. */ + if (!rs->prior_mstamp.v64) { + rs->delivered = -1; + rs->interval_us = -1; + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; + + /* Model sending data and receiving ACKs as separate pipeline phases + * for a window. Usually the ACK phase is longer, but with ACK + * compression the send phase can be longer. To be safe we use the + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ + ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + + /* Normally we expect interval_us >= min-rtt. + * Note that rate may still be over-estimated when a spuriously + * retransmistted skb was first (s)acked because "interval_us" + * is under-estimated (up to an RTT). However continuously + * measuring the delivery rate during loss recovery is crucial + * for connections suffer heavy or prolonged losses. + */ + if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { + rs->interval_us = -1; + if (!rs->is_retrans) + pr_debug("tcp rate: %ld %d %u %u %u\n", + rs->interval_us, rs->delivered, + inet_csk(sk)->icsk_ca_state, + tp->rx_opt.sack_ok, tcp_min_rtt(tp)); + } +} -- cgit v1.2.1 From d7722e8570fc0f1e003cee7cf37694041828918b Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Mon, 19 Sep 2016 23:39:15 -0400 Subject: tcp: track application-limited rate samples This commit adds code to track whether the delivery rate represented by each rate_sample was limited by the application. Upon each transmit, we store in the is_app_limited field in the skb a boolean bit indicating whether there is a known "bubble in the pipe": a point in the rate sample interval where the sender was application-limited, and did not transmit even though the cwnd and pacing rate allowed it. This logic marks the flow app-limited on a write if *all* of the following are true: 1) There is less than 1 MSS of unsent data in the write queue available to transmit. 2) There is no packet in the sender's queues (e.g. in fq or the NIC tx queue). 3) The connection is not limited by cwnd. 4) There are no lost packets to retransmit. The tcp_rate_check_app_limited() code in tcp_rate.c determines whether the connection is application-limited at the moment. If the flow is application-limited, it sets the tp->app_limited field. If the flow is application-limited then that means there is effectively a "bubble" of silence in the pipe now, and this silence will be reflected in a lower bandwidth sample for any rate samples from now until we get an ACK indicating this bubble has exited the pipe: specifically, until we get an ACK for the next packet we transmit. When we send every skb we record in scb->tx.is_app_limited whether the resulting rate sample will be application-limited. The code in tcp_rate_gen() checks to see when it is safe to mark all known application-limited bubbles of silence as having exited the pipe. It does this by checking to see when the delivered count moves past the tp->app_limited marker. At this point it zeroes the tp->app_limited marker, as all known bubbles are out of the pipe. We make room for the tx.is_app_limited bit in the skb by borrowing a bit from the in_flight field used by NV to record the number of bytes in flight. The receive window in the TCP header is 16 bits, and the max receive window scaling shift factor is 14 (RFC 1323). So the max receive window offered by the TCP protocol is 2^(16+14) = 2^30. So we only need 30 bits for the tx.in_flight used by NV. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 8 ++++++++ net/ipv4/tcp_minisocks.c | 3 +++ net/ipv4/tcp_rate.c | 29 ++++++++++++++++++++++++++++- 3 files changed, 39 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index de02fb4b1349..2250f891f931 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -396,6 +396,9 @@ void tcp_init_sock(struct sock *sk) */ tp->snd_cwnd = TCP_INIT_CWND; + /* There's a bubble in the pipe until at least the first ACK. */ + tp->app_limited = ~0U; + /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ @@ -1014,6 +1017,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, flags); lock_sock(sk); + + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ + res = do_tcp_sendpages(sk, page, offset, size, flags); release_sock(sk); return res; @@ -1115,6 +1121,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ + /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 568947110b60..6234ebaa7db1 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -487,6 +487,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; + /* There's a bubble in the pipe until at least the first ACK. */ + newtp->app_limited = ~0U; + tcp_init_xmit_timers(newsk); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 1daed6af6e80..52ff84be59ab 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -26,9 +26,13 @@ * other factors like applications or receiver window limits. The estimator * deliberately avoids using the inter-packet spacing approach because that * approach requires a large number of samples and sophisticated filtering. + * + * TCP flows can often be application-limited in request/response workloads. + * The estimator marks a bandwidth sample as application-limited if there + * was some moment during the sampled window of packets when there was no data + * ready to send in the write queue. */ - /* Snapshot the current delivery information in the skb, to generate * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). */ @@ -58,6 +62,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; } /* When an skb is sacked or acked, we fill in the rate sample with the (prior) @@ -80,6 +85,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, after(scb->tx.delivered, rs->prior_delivered)) { rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; /* Find the duration of the "send phase" of this window: */ @@ -105,6 +111,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; + /* Clear app limited if bubble is acked and gone. */ + if (tp->app_limited && after(tp->delivered, tp->app_limited)) + tp->app_limited = 0; + /* TODO: there are multiple places throughout tcp_ack() to get * current time. Refactor the code using a new "tcp_acktag_state" * to carry current time, flags, stats like "tcp_sacktag_state". @@ -147,3 +157,20 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, tp->rx_opt.sack_ok, tcp_min_rtt(tp)); } } + +/* If a gap is detected between sends, mark the socket application-limited. */ +void tcp_rate_check_app_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (/* We have less than one packet to send. */ + tp->write_seq - tp->snd_nxt < tp->mss_cache && + /* Nothing in sending host's qdisc queues or NIC tx queue. */ + sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && + /* We are not limited by CWND. */ + tcp_packets_in_flight(tp) < tp->snd_cwnd && + /* All lost packets have been retransmitted. */ + tp->lost_out <= tp->retrans_out) + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; +} -- cgit v1.2.1 From eb8329e0a04db0061f714f033b4454326ba147f4 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:16 -0400 Subject: tcp: export data delivery rate This commit export two new fields in struct tcp_info: tcpi_delivery_rate: The most recent goodput, as measured by tcp_rate_gen(). If the socket is limited by the sending application (e.g., no data to send), it reports the highest measurement instead of the most recent. The unit is bytes per second (like other rate fields in tcp_info). tcpi_delivery_rate_app_limited: A boolean indicating if the goodput was measured when the socket's throughput was limited by the sending application. This delivery rate information can be useful for applications that want to know the current throughput the TCP connection is seeing, e.g. adaptive bitrate video streaming. It can also be very useful for debugging or troubleshooting. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 11 ++++++++++- net/ipv4/tcp_rate.c | 12 +++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2250f891f931..f253e5019d22 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2712,7 +2712,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); - u32 now = tcp_time_stamp; + u32 now = tcp_time_stamp, intv; unsigned int start; int notsent_bytes; u64 rate64; @@ -2802,6 +2802,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_in = tp->data_segs_in; info->tcpi_data_segs_out = tp->data_segs_out; + + info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0; + rate = READ_ONCE(tp->rate_delivered); + intv = READ_ONCE(tp->rate_interval_us); + if (rate && intv) { + rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; + do_div(rate64, intv); + put_unaligned(rate64, &info->tcpi_delivery_rate); + } } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 52ff84be59ab..9be1581a5a08 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -149,12 +149,22 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * for connections suffer heavy or prolonged losses. */ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { - rs->interval_us = -1; if (!rs->is_retrans) pr_debug("tcp rate: %ld %d %u %u %u\n", rs->interval_us, rs->delivered, inet_csk(sk)->icsk_ca_state, tp->rx_opt.sack_ok, tcp_min_rtt(tp)); + rs->interval_us = -1; + return; + } + + /* Record the last non-app-limited or the highest app-limited bw */ + if (!rs->is_app_limited || + ((u64)rs->delivered * tp->rate_interval_us >= + (u64)tp->rate_delivered * rs->interval_us)) { + tp->rate_delivered = rs->delivered; + tp->rate_interval_us = rs->interval_us; + tp->rate_app_limited = rs->is_app_limited; } } -- cgit v1.2.1 From ed6e7268b930e0a9a65d895d368eac79a438d992 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:17 -0400 Subject: tcp: allow congestion control module to request TSO skb segment count Add the tso_segs_goal() function in tcp_congestion_ops to allow the congestion control module to specify the number of segments that should be in a TSO skb sent by tcp_write_xmit() and tcp_xmit_retransmit_queue(). The congestion control module can either request a particular number of segments in TSO skb that we transmit, or return 0 if it doesn't care. This allows the upcoming BBR congestion control module to select small TSO skb sizes if the module detects that the bottleneck bandwidth is very low, or that the connection is policed to a low rate. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e02c8ebf3ed4..01379567a732 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1566,6 +1566,17 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) return min_t(u32, segs, sk->sk_gso_max_segs); } +/* Return the number of segments we want in the skb we are transmitting. + * See if congestion control module wants to decide; otherwise, autosize. + */ +static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) +{ + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; + + return tso_segs ? : tcp_tso_autosize(sk, mss_now); +} + /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, @@ -2061,7 +2072,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } } - max_segs = tcp_tso_autosize(sk, mss_now); + max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -2778,7 +2789,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) last_lost = tp->snd_una; } - max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk)); + max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); tcp_for_write_queue_from(skb, sk) { __u8 sacked; int segs; -- cgit v1.2.1 From 1b3878ca1551f3baab2c408d1e703b5ef785a1b2 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:18 -0400 Subject: tcp: export tcp_tso_autosize() and parameterize minimum number of TSO segments To allow congestion control modules to use the default TSO auto-sizing algorithm as one of the ingredients in their own decision about TSO sizing: 1) Export tcp_tso_autosize() so that CC modules can use it. 2) Change tcp_tso_autosize() to allow callers to specify a minimum number of segments per TSO skb, in case the congestion control module has a different notion of the best floor for TSO skbs for the connection right now. For very low-rate paths or policed connections it can be appropriate to use smaller TSO skbs. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 01379567a732..0bf3d481fa85 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1549,7 +1549,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, /* Return how many segs we'd like on a TSO packet, * to send one TSO packet per ms */ -static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) +u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + int min_tso_segs) { u32 bytes, segs; @@ -1561,10 +1562,11 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) * This preserves ACK clocking and is consistent * with tcp_tso_should_defer() heuristic. */ - segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs); + segs = max_t(u32, bytes / mss_now, min_tso_segs); return min_t(u32, segs, sk->sk_gso_max_segs); } +EXPORT_SYMBOL(tcp_tso_autosize); /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. @@ -1574,7 +1576,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; - return tso_segs ? : tcp_tso_autosize(sk, mss_now); + return tso_segs ? : + tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); } /* Returns the portion of skb which can be sent right away */ -- cgit v1.2.1 From 556c6b46d194cc0dbb6a5b22f1d2bbc699c86d8e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:19 -0400 Subject: tcp: export tcp_mss_to_mtu() for congestion control modules Export tcp_mss_to_mtu(), so that congestion control modules can use this to help calculate a pacing rate. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bf3d481fa85..7d025a7804b5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1362,6 +1362,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss) } return mtu; } +EXPORT_SYMBOL(tcp_mss_to_mtu); /* MTU probing init per socket */ void tcp_mtup_init(struct sock *sk) -- cgit v1.2.1 From 77bfc174c38e558a3425d3b069aa2762b2fedfdd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:20 -0400 Subject: tcp: allow congestion control to expand send buffer differently Currently the TCP send buffer expands to twice cwnd, in order to allow limited transmits in the CA_Recovery state. This assumes that cwnd does not increase in the CA_Recovery. For some congestion control algorithms, like the upcoming BBR module, if the losses in recovery do not indicate congestion then we may continue to raise cwnd multiplicatively in recovery. In such cases the current multiplier will falsely limit the sending rate, much as if it were limited by the application. This commit adds an optional congestion control callback to use a different multiplier to expand the TCP send buffer. For congestion control modules that do not specificy this callback, TCP continues to use the previous default of 2. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d9ed4bb96f74..13a2e70141f5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr static void tcp_sndbuf_expand(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; int sndmem, per_mss; u32 nr_segs; @@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk) * Cubic needs 1.7 factor, rounded to 2 to include * extra cushion (application might react slowly to POLLOUT) */ - sndmem = 2 * nr_segs * per_mss; + sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2; + sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); -- cgit v1.2.1 From c0402760f565ae066621ebf8720a32fba074d538 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:21 -0400 Subject: tcp: new CC hook to set sending rate with rate_sample in any CA state This commit introduces an optional new "omnipotent" hook, cong_control(), for congestion control modules. The cong_control() function is called at the end of processing an ACK (i.e., after updating sequence numbers, the SACK scoreboard, and loss detection). At that moment we have precise delivery rate information the congestion control module can use to control the sending behavior (using cwnd, TSO skb size, and pacing rate) in any CA state. This function can also be used by a congestion control that prefers not to use the default cwnd reduction approach (i.e., the PRR algorithm) during CA_Recovery to control the cwnd and sending rate during loss recovery. We take advantage of the fact that recent changes defer the retransmission or transmission of new data (e.g. by F-RTO) in recovery until the new tcp_cong_control() function is run. With this commit, we only run tcp_update_pacing_rate() if the congestion control is not using this new API. New congestion controls which use the new API do not want the TCP stack to run the default pacing rate calculation and overwrite whatever pacing rate they have chosen at initialization time. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_cong.c | 2 +- net/ipv4/tcp_input.c | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 882caa4e72bc..1294af4e0127 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -69,7 +69,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) int ret = 0; /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !ca->cong_avoid) { + if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 13a2e70141f5..980a83edfa63 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2536,6 +2536,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + if (inet_csk(sk)->icsk_ca_ops->cong_control) + return; + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { @@ -3312,8 +3315,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) * information. All transmission or retransmission are delayed afterwards. */ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, - int flag) + int flag, const struct rate_sample *rs) { + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->cong_control) { + icsk->icsk_ca_ops->cong_control(sk, rs); + return; + } + if (tcp_in_cwnd_reduction(sk)) { /* Reduce cwnd if state mandates */ tcp_cwnd_reduction(sk, acked_sacked, flag); @@ -3683,7 +3693,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ tcp_rate_gen(sk, delivered, lost, &now, &rs); - tcp_cong_control(sk, ack, delivered, flag); + tcp_cong_control(sk, ack, delivered, flag, &rs); tcp_xmit_recovery(sk, rexmit); return 1; @@ -5982,7 +5992,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } else tcp_init_metrics(sk); - tcp_update_pacing_rate(sk); + if (!inet_csk(sk)->icsk_ca_ops->cong_control) + tcp_update_pacing_rate(sk); /* Prevent spurious tcp_cwnd_restart() on first data packet */ tp->lsndtime = tcp_time_stamp; -- cgit v1.2.1 From 0f8782ea14974ce992618b55f0c041ef43ed0b78 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:23 -0400 Subject: tcp_bbr: add BBR congestion control This commit implements a new TCP congestion control algorithm: BBR (Bottleneck Bandwidth and RTT). A detailed description of BBR will be published in ACM Queue, Vol. 14 No. 5, September-October 2016, as "BBR: Congestion-Based Congestion Control". BBR has significantly increased throughput and reduced latency for connections on Google's internal backbone networks and google.com and YouTube Web servers. BBR requires only changes on the sender side, not in the network or the receiver side. Thus it can be incrementally deployed on today's Internet, or in datacenters. The Internet has predominantly used loss-based congestion control (largely Reno or CUBIC) since the 1980s, relying on packet loss as the signal to slow down. While this worked well for many years, loss-based congestion control is unfortunately out-dated in today's networks. On today's Internet, loss-based congestion control causes the infamous bufferbloat problem, often causing seconds of needless queuing delay, since it fills the bloated buffers in many last-mile links. On today's high-speed long-haul links using commodity switches with shallow buffers, loss-based congestion control has abysmal throughput because it over-reacts to losses caused by transient traffic bursts. In 1981 Kleinrock and Gale showed that the optimal operating point for a network maximizes delivered bandwidth while minimizing delay and loss, not only for single connections but for the network as a whole. Finding that optimal operating point has been elusive, since any single network measurement is ambiguous: network measurements are the result of both bandwidth and propagation delay, and those two cannot be measured simultaneously. While it is impossible to disambiguate any single bandwidth or RTT measurement, a connection's behavior over time tells a clearer story. BBR uses a measurement strategy designed to resolve this ambiguity. It combines these measurements with a robust servo loop using recent control systems advances to implement a distributed congestion control algorithm that reacts to actual congestion, not packet loss or transient queue delay, and is designed to converge with high probability to a point near the optimal operating point. In a nutshell, BBR creates an explicit model of the network pipe by sequentially probing the bottleneck bandwidth and RTT. On the arrival of each ACK, BBR derives the current delivery rate of the last round trip, and feeds it through a windowed max-filter to estimate the bottleneck bandwidth. Conversely it uses a windowed min-filter to estimate the round trip propagation delay. The max-filtered bandwidth and min-filtered RTT estimates form BBR's model of the network pipe. Using its model, BBR sets control parameters to govern sending behavior. The primary control is the pacing rate: BBR applies a gain multiplier to transmit faster or slower than the observed bottleneck bandwidth. The conventional congestion window (cwnd) is now the secondary control; the cwnd is set to a small multiple of the estimated BDP (bandwidth-delay product) in order to allow full utilization and bandwidth probing while bounding the potential amount of queue at the bottleneck. When a BBR connection starts, it enters STARTUP mode and applies a high gain to perform an exponential search to quickly probe the bottleneck bandwidth (doubling its sending rate each round trip, like slow start). However, instead of continuing until it fills up the buffer (i.e. a loss), or until delay or ACK spacing reaches some threshold (like Hystart), it uses its model of the pipe to estimate when that pipe is full: it estimates the pipe is full when it notices the estimated bandwidth has stopped growing. At that point it exits STARTUP and enters DRAIN mode, where it reduces its pacing rate to drain the queue it estimates it has created. Then BBR enters steady state. In steady state, PROBE_BW mode cycles between first pacing faster to probe for more bandwidth, then pacing slower to drain any queue that created if no more bandwidth was available, and then cruising at the estimated bandwidth to utilize the pipe without creating excess queue. Occasionally, on an as-needed basis, it sends significantly slower to probe for RTT (PROBE_RTT mode). BBR has been fully deployed on Google's wide-area backbone networks and we're experimenting with BBR on Google.com and YouTube on a global scale. Replacing CUBIC with BBR has resulted in significant improvements in network latency and application (RPC, browser, and video) metrics. For more details please refer to our upcoming ACM Queue publication. Example performance results, to illustrate the difference between BBR and CUBIC: Resilience to random loss (e.g. from shallow buffers): Consider a netperf TCP_STREAM test lasting 30 secs on an emulated path with a 10Gbps bottleneck, 100ms RTT, and 1% packet loss rate. CUBIC gets 3.27 Mbps, and BBR gets 9150 Mbps (2798x higher). Low latency with the bloated buffers common in today's last-mile links: Consider a netperf TCP_STREAM test lasting 120 secs on an emulated path with a 10Mbps bottleneck, 40ms RTT, and 1000-packet bottleneck buffer. Both fully utilize the bottleneck bandwidth, but BBR achieves this with a median RTT 25x lower (43 ms instead of 1.09 secs). Our long-term goal is to improve the congestion control algorithms used on the Internet. We are hopeful that BBR can help advance the efforts toward this goal, and motivate the community to do further research. Test results, performance evaluations, feedback, and BBR-related discussions are very welcome in the public e-mail list for BBR: https://groups.google.com/forum/#!forum/bbr-dev NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, since pacing is integral to the BBR design and implementation. BBR without pacing would not function properly, and may incur unnecessary high packet loss rates. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 18 ++ net/ipv4/Makefile | 1 + net/ipv4/tcp_bbr.c | 896 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 915 insertions(+) create mode 100644 net/ipv4/tcp_bbr.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 50d6a9b49f6c..300b06888fdf 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -640,6 +640,21 @@ config TCP_CONG_CDG D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg +config TCP_CONG_BBR + tristate "BBR TCP" + default n + ---help--- + + BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to + maximize network utilization and minimize queues. It builds an explicit + model of the the bottleneck delivery rate and path round-trip + propagation delay. It tolerates packet loss and delay unrelated to + congestion. It can operate over LAN, WAN, cellular, wifi, or cable + modem links. It can coexist with flows that use loss-based congestion + control, and can operate with shallow buffers, deep buffers, + bufferbloat, policers, or AQM schemes that do not provide a delay + signal. It requires the fq ("Fair Queue") pacing packet scheduler. + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -674,6 +689,9 @@ choice config DEFAULT_CDG bool "CDG" if TCP_CONG_CDG=y + config DEFAULT_BBR + bool "BBR" if TCP_CONG_BBR=y + config DEFAULT_RENO bool "Reno" endchoice diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9cfff1a0bf71..bc6a6c8b9bcd 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o +obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c new file mode 100644 index 000000000000..0ea66c2c9344 --- /dev/null +++ b/net/ipv4/tcp_bbr.c @@ -0,0 +1,896 @@ +/* Bottleneck Bandwidth and RTT (BBR) congestion control + * + * BBR congestion control computes the sending rate based on the delivery + * rate (throughput) estimated from ACKs. In a nutshell: + * + * On each ACK, update our model of the network path: + * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) + * min_rtt = windowed_min(rtt, 10 seconds) + * pacing_rate = pacing_gain * bottleneck_bandwidth + * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) + * + * The core algorithm does not react directly to packet losses or delays, + * although BBR may adjust the size of next send per ACK when loss is + * observed, or adjust the sending rate if it estimates there is a + * traffic policer, in order to keep the drop rate reasonable. + * + * BBR is described in detail in: + * "BBR: Congestion-Based Congestion Control", + * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, + * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. + * + * There is a public e-mail list for discussing BBR development and testing: + * https://groups.google.com/forum/#!forum/bbr-dev + * + * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, + * since pacing is integral to the BBR design and implementation. + * BBR without pacing would not function properly, and may incur unnecessary + * high packet loss rates. + */ +#include +#include +#include +#include +#include +#include + +/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. + * Since the minimum window is >=4 packets, the lower bound isn't + * an issue. The upper bound isn't an issue with existing technologies. + */ +#define BW_SCALE 24 +#define BW_UNIT (1 << BW_SCALE) + +#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ +#define BBR_UNIT (1 << BBR_SCALE) + +/* BBR has the following modes for deciding how fast to send: */ +enum bbr_mode { + BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ + BBR_DRAIN, /* drain any queue created during startup */ + BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ + BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */ +}; + +/* BBR congestion control block */ +struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ + struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ + u32 rtt_cnt; /* count of packet-timed rounds elapsed */ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ + u32 mode:3, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ + packet_conservation:1, /* use packet conservation? */ + restore_cwnd:1, /* decided to revert cwnd to old value */ + round_start:1, /* start of packet-timed tx->ack round? */ + tso_segs_goal:7, /* segments we want in each skb we send */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ + unused:5, + lt_is_sampling:1, /* taking long-term ("LT") samples now? */ + lt_rtt_cnt:7, /* round trips in long-term interval */ + lt_use_bw:1; /* use lt_bw as our bw estimate? */ + u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ + u32 lt_last_delivered; /* LT intvl start: tp->delivered */ + u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ + u32 lt_last_lost; /* LT intvl start: tp->lost */ + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_cnt:3, /* number of rounds without large bw gains */ + cycle_idx:3, /* current index in pacing_gain cycle array */ + unused_b:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ +}; + +#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ + +/* Window length of bw filter (in rounds): */ +static const int bbr_bw_rtts = CYCLE_LEN + 2; +/* Window length of min_rtt filter (in sec): */ +static const u32 bbr_min_rtt_win_sec = 10; +/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ +static const u32 bbr_probe_rtt_mode_ms = 200; +/* Skip TSO below the following bandwidth (bits/sec): */ +static const int bbr_min_tso_rate = 1200000; + +/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ +static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ +static const int bbr_cwnd_gain = BBR_UNIT * 2; +/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ +static const int bbr_pacing_gain[] = { + BBR_UNIT * 5 / 4, /* probe for more available bw */ + BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ + BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ + BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ +}; +/* Randomize the starting gain cycling phase over N phases: */ +static const u32 bbr_cycle_rand = 7; + +/* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet + * needs at least 4 packets in flight: + */ +static const u32 bbr_cwnd_min_target = 4; + +/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ +/* If bw has increased significantly (1.25x), there may be more bw available: */ +static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; +/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ +static const u32 bbr_full_bw_cnt = 3; + +/* "long-term" ("LT") bandwidth estimator parameters... */ +/* The minimum number of rounds in an LT bw sampling interval: */ +static const u32 bbr_lt_intvl_min_rtts = 4; +/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +static const u32 bbr_lt_loss_thresh = 50; +/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +static const u32 bbr_lt_bw_diff = 4000 / 8; +/* If we estimate we're policed, use lt_bw for this many round trips: */ +static const u32 bbr_lt_bw_max_rtts = 48; + +/* Do we estimate that STARTUP filled the pipe? */ +static bool bbr_full_bw_reached(const struct sock *sk) +{ + const struct bbr *bbr = inet_csk_ca(sk); + + return bbr->full_bw_cnt >= bbr_full_bw_cnt; +} + +/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ +static u32 bbr_max_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return minmax_get(&bbr->bw); +} + +/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ +static u32 bbr_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); +} + +/* Return rate in bytes per second, optionally with a gain. + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) +{ + rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); + rate *= gain; + rate >>= BBR_SCALE; + rate *= USEC_PER_SEC; + return rate >> BW_SCALE; +} + +/* Pace using current bw estimate and a gain factor. In order to help drive the + * network toward lower queues while maintaining high utilization and low + * latency, the average pacing rate aims to be slightly (~1%) lower than the + * estimated bandwidth. This is an important aspect of the design. In this + * implementation this slightly lower pacing rate is achieved implicitly by not + * including link-layer headers in the packet size used for the pacing rate. + */ +static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u64 rate = bw; + + rate = bbr_rate_bytes_per_sec(sk, rate, gain); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate) + sk->sk_pacing_rate = rate; +} + +/* Return count of segments we want in the skbs we send, or 0 for default. */ +static u32 bbr_tso_segs_goal(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return bbr->tso_segs_goal; +} + +static void bbr_set_tso_segs_goal(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 min_segs; + + min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; + bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), + 0x7FU); +} + +/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +static void bbr_save_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) + bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ + else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ + bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); +} + +static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (event == CA_EVENT_TX_START && tp->app_limited) { + bbr->idle_restart = 1; + /* Avoid pointless buffer overflows: pace at est. bw if we don't + * need more speed (we're restarting from idle and app-limited). + */ + if (bbr->mode == BBR_PROBE_BW) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + } +} + +/* Find target cwnd. Right-size the cwnd based on min RTT and the + * estimated bottleneck bandwidth: + * + * cwnd = bw * min_rtt * gain = BDP * gain + * + * The key factor, gain, controls the amount of queue. While a small gain + * builds a smaller queue, it becomes more vulnerable to noise in RTT + * measurements (e.g., delayed ACKs or other ACK compression effects). This + * noise may cause BBR to under-estimate the rate. + * + * To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 cwnd; + u64 w; + + /* If we've never had a valid RTT sample, cap cwnd at the initial + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which + * case we need to slow-start up toward something safe: TCP_INIT_CWND. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ + return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ + + w = (u64)bw * bbr->min_rtt_us; + + /* Apply a gain to the given value, then remove the BW_SCALE shift. */ + cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + /* Allow enough full-sized skbs in flight to utilize end systems. */ + cwnd += 3 * bbr->tso_segs_goal; + + /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ + cwnd = (cwnd + 1) & ~1U; + + return cwnd; +} + +/* An optimization in BBR to reduce losses: On the first round of recovery, we + * follow the packet conservation principle: send P packets per P packets acked. + * After that, we slow-start and send at most 2*P packets per P packets acked. + * After recovery finishes, or upon undo, we restore the cwnd we had when + * recovery started (capped by the target cwnd based on estimated BDP). + * + * TODO(ycheng/ncardwell): implement a rate-based approach. + */ +static bool bbr_set_cwnd_to_recover_or_restore( + struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; + u32 cwnd = tp->snd_cwnd; + + /* An ACK for P pkts should release at most 2*P packets. We do this + * in two steps. First, here we deduct the number of lost packets. + * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. + */ + if (rs->losses > 0) + cwnd = max_t(s32, cwnd - rs->losses, 1); + + if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { + /* Starting 1st round of Recovery, so do packet conservation. */ + bbr->packet_conservation = 1; + bbr->next_rtt_delivered = tp->delivered; /* start round now */ + /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ + cwnd = tcp_packets_in_flight(tp) + acked; + } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { + /* Exiting loss recovery; restore cwnd saved before recovery. */ + bbr->restore_cwnd = 1; + bbr->packet_conservation = 0; + } + bbr->prev_ca_state = state; + + if (bbr->restore_cwnd) { + /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ + cwnd = max(cwnd, bbr->prior_cwnd); + bbr->restore_cwnd = 0; + } + + if (bbr->packet_conservation) { + *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); + return true; /* yes, using packet conservation */ + } + *new_cwnd = cwnd; + return false; +} + +/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. + */ +static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + u32 acked, u32 bw, int gain) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 cwnd = 0, target_cwnd = 0; + + if (!acked) + return; + + if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) + goto done; + + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ + target_cwnd = bbr_target_cwnd(sk, bw, gain); + if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ + cwnd = min(cwnd + acked, target_cwnd); + else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) + cwnd = cwnd + acked; + cwnd = max(cwnd, bbr_cwnd_min_target); + +done: + tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ + tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); +} + +/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +static bool bbr_is_next_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool is_full_length = + skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > + bbr->min_rtt_us; + u32 inflight, bw; + + /* The pacing_gain of 1.0 paces at the estimated bw to try to fully + * use the pipe without increasing the queue. + */ + if (bbr->pacing_gain == BBR_UNIT) + return is_full_length; /* just use wall clock time */ + + inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ + bw = bbr_max_bw(sk); + + /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at + * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is + * small (e.g. on a LAN). We do not persist if packets are lost, since + * a path with small buffers may not hold that much. + */ + if (bbr->pacing_gain > BBR_UNIT) + return is_full_length && + (rs->losses || /* perhaps pacing_gain*BDP won't fit */ + inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); + + /* A pacing_gain < 1.0 tries to drain extra queue we added if bw + * probing didn't find more bw. If inflight falls to match BDP then we + * estimate queue is drained; persisting would underutilize the pipe. + */ + return is_full_length || + inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); +} + +static void bbr_advance_cycle_phase(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); + bbr->cycle_mstamp = tp->delivered_mstamp; + bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; +} + +/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +static void bbr_update_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && + bbr_is_next_cycle_phase(sk, rs)) + bbr_advance_cycle_phase(sk); +} + +static void bbr_reset_startup_mode(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->mode = BBR_STARTUP; + bbr->pacing_gain = bbr_high_gain; + bbr->cwnd_gain = bbr_high_gain; +} + +static void bbr_reset_probe_bw_mode(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->mode = BBR_PROBE_BW; + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = bbr_cwnd_gain; + bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); + bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +} + +static void bbr_reset_mode(struct sock *sk) +{ + if (!bbr_full_bw_reached(sk)) + bbr_reset_startup_mode(sk); + else + bbr_reset_probe_bw_mode(sk); +} + +/* Start a new long-term sampling interval. */ +static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; + bbr->lt_last_delivered = tp->delivered; + bbr->lt_last_lost = tp->lost; + bbr->lt_rtt_cnt = 0; +} + +/* Completely reset long-term bandwidth sampling. */ +static void bbr_reset_lt_bw_sampling(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->lt_bw = 0; + bbr->lt_use_bw = 0; + bbr->lt_is_sampling = false; + bbr_reset_lt_bw_sampling_interval(sk); +} + +/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 diff; + + if (bbr->lt_bw) { /* do we have bw from a previous interval? */ + /* Is new bw close to the lt_bw from the previous interval? */ + diff = abs(bw - bbr->lt_bw); + if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || + (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= + bbr_lt_bw_diff)) { + /* All criteria are met; estimate we're policed. */ + bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ + bbr->lt_use_bw = 1; + bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ + bbr->lt_rtt_cnt = 0; + return; + } + } + bbr->lt_bw = bw; + bbr_reset_lt_bw_sampling_interval(sk); +} + +/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of + * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and + * explicitly models their policed rate, to reduce unnecessary losses. We + * estimate that we're policed if we see 2 consecutive sampling intervals with + * consistent throughput and high packet loss. If we think we're being policed, + * set lt_bw to the "long-term" average delivery rate from those 2 intervals. + */ +static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 lost, delivered; + u64 bw; + s32 t; + + if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ + if (bbr->mode == BBR_PROBE_BW && bbr->round_start && + ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { + bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ + bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ + } + return; + } + + /* Wait for the first loss before sampling, to let the policer exhaust + * its tokens and estimate the steady-state rate allowed by the policer. + * Starting samples earlier includes bursts that over-estimate the bw. + */ + if (!bbr->lt_is_sampling) { + if (!rs->losses) + return; + bbr_reset_lt_bw_sampling_interval(sk); + bbr->lt_is_sampling = true; + } + + /* To avoid underestimates, reset sampling if we run out of data. */ + if (rs->is_app_limited) { + bbr_reset_lt_bw_sampling(sk); + return; + } + + if (bbr->round_start) + bbr->lt_rtt_cnt++; /* count round trips in this interval */ + if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) + return; /* sampling interval needs to be longer */ + if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { + bbr_reset_lt_bw_sampling(sk); /* interval is too long */ + return; + } + + /* End sampling interval when a packet is lost, so we estimate the + * policer tokens were exhausted. Stopping the sampling before the + * tokens are exhausted under-estimates the policed rate. + */ + if (!rs->losses) + return; + + /* Calculate packets lost and delivered in sampling interval. */ + lost = tp->lost - bbr->lt_last_lost; + delivered = tp->delivered - bbr->lt_last_delivered; + /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ + if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) + return; + + /* Find average delivery rate in this sampling interval. */ + t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); + if (t < 1) + return; /* interval is less than one jiffy, so wait */ + t = jiffies_to_usecs(t); + /* Interval long enough for jiffies_to_usecs() to return a bogus 0? */ + if (t < 1) { + bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ + return; + } + bw = (u64)delivered * BW_UNIT; + do_div(bw, t); + bbr_lt_bw_interval_done(sk, bw); +} + +/* Estimate the bandwidth based on how fast packets are delivered */ +static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->round_start = 0; + if (rs->delivered < 0 || rs->interval_us <= 0) + return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ + if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { + bbr->next_rtt_delivered = tp->delivered; + bbr->rtt_cnt++; + bbr->round_start = 1; + bbr->packet_conservation = 0; + } + + bbr_lt_bw_sampling(sk, rs); + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. + */ + bw = (u64)rs->delivered * BW_UNIT; + do_div(bw, rs->interval_us); + + /* If this sample is application-limited, it is likely to have a very + * low delivered count that represents application behavior rather than + * the available network rate. Such a sample could drag down estimated + * bw, causing needless slow-down. Thus, to continue to send at the + * last measured network rate, we filter out app-limited samples unless + * they describe the path bw at least as well as our bw model. + * + * So the goal during app-limited phase is to proceed with the best + * network rate no matter how long. We automatically leave this + * phase when app writes faster than the network can deliver :) + */ + if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { + /* Incorporate new sample into our max bw filter. */ + minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); + } +} + +/* Estimate when the pipe is full, using the change in delivery rate: BBR + * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by + * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited + * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the + * higher rwin, 3: we get higher delivery rate samples. Or transient + * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar + * design goal, but uses delay and inter-ACK spacing instead of bandwidth. + */ +static void bbr_check_full_bw_reached(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw_thresh; + + if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) + return; + + bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; + if (bbr_max_bw(sk) >= bw_thresh) { + bbr->full_bw = bbr_max_bw(sk); + bbr->full_bw_cnt = 0; + return; + } + ++bbr->full_bw_cnt; +} + +/* If pipe is probably full, drain the queue and then enter steady-state. */ +static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { + bbr->mode = BBR_DRAIN; /* drain queue we created */ + bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ + bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ + } /* fall through to check if in-flight is already small: */ + if (bbr->mode == BBR_DRAIN && + tcp_packets_in_flight(tcp_sk(sk)) <= + bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) + bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +} + +/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and + * periodically drain the bottleneck queue, to converge to measure the true + * min_rtt (unloaded propagation delay). This allows the flows to keep queues + * small (reducing queuing delay and packet loss) and achieve fairness among + * BBR flows. + * + * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, + * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. + * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed + * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and + * re-enter the previous mode. BBR uses 200ms to approximately bound the + * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). + * + * Note that flows need only pay 2% if they are busy sending over the last 10 + * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have + * natural silences or low-rate periods within 10 seconds where the rate is low + * enough for long enough to drain its queue in the bottleneck. We pick up + * these min RTT measurements opportunistically with our min_rtt filter. :-) + */ +static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool filter_expired; + + /* Track min RTT seen in the min_rtt_win_sec filter window: */ + filter_expired = after(tcp_time_stamp, + bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); + if (rs->rtt_us >= 0 && + (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { + bbr->min_rtt_us = rs->rtt_us; + bbr->min_rtt_stamp = tcp_time_stamp; + } + + if (bbr_probe_rtt_mode_ms > 0 && filter_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = BBR_UNIT; + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; + } + + if (bbr->mode == BBR_PROBE_RTT) { + /* Ignore low rate samples during this mode. */ + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). */ + if (!bbr->probe_rtt_done_stamp && + tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { + bbr->probe_rtt_done_stamp = tcp_time_stamp + + msecs_to_jiffies(bbr_probe_rtt_mode_ms); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { + if (bbr->round_start) + bbr->probe_rtt_round_done = 1; + if (bbr->probe_rtt_round_done && + after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { + bbr->min_rtt_stamp = tcp_time_stamp; + bbr->restore_cwnd = 1; /* snap to prior_cwnd */ + bbr_reset_mode(sk); + } + } + } + bbr->idle_restart = 0; +} + +static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) +{ + bbr_update_bw(sk, rs); + bbr_update_cycle_phase(sk, rs); + bbr_check_full_bw_reached(sk, rs); + bbr_check_drain(sk, rs); + bbr_update_min_rtt(sk, rs); +} + +static void bbr_main(struct sock *sk, const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw; + + bbr_update_model(sk, rs); + + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); + bbr_set_tso_segs_goal(sk); + bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); +} + +static void bbr_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->prior_cwnd = 0; + bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ + bbr->rtt_cnt = 0; + bbr->next_rtt_delivered = 0; + bbr->prev_ca_state = TCP_CA_Open; + bbr->packet_conservation = 0; + + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; + bbr->min_rtt_us = tcp_min_rtt(tp); + bbr->min_rtt_stamp = tcp_time_stamp; + + minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ + + /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ + bw = (u64)tp->snd_cwnd * BW_UNIT; + do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC); + sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ + bbr_set_pacing_rate(sk, bw, bbr_high_gain); + + bbr->restore_cwnd = 0; + bbr->round_start = 0; + bbr->idle_restart = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; + bbr->cycle_mstamp.v64 = 0; + bbr->cycle_idx = 0; + bbr_reset_lt_bw_sampling(sk); + bbr_reset_startup_mode(sk); +} + +static u32 bbr_sndbuf_expand(struct sock *sk) +{ + /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ + return 3; +} + +/* In theory BBR does not need to undo the cwnd since it does not + * always reduce cwnd on losses (see bbr_main()). Keep it for now. + */ +static u32 bbr_undo_cwnd(struct sock *sk) +{ + return tcp_sk(sk)->snd_cwnd; +} + +/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ +static u32 bbr_ssthresh(struct sock *sk) +{ + bbr_save_cwnd(sk); + return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ +} + +static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = bbr_bw(sk); + + bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; + memset(&info->bbr, 0, sizeof(info->bbr)); + info->bbr.bbr_bw_lo = (u32)bw; + info->bbr.bbr_bw_hi = (u32)(bw >> 32); + info->bbr.bbr_min_rtt = bbr->min_rtt_us; + info->bbr.bbr_pacing_gain = bbr->pacing_gain; + info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; + *attr = INET_DIAG_BBRINFO; + return sizeof(info->bbr); + } + return 0; +} + +static void bbr_set_state(struct sock *sk, u8 new_state) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { + struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; + bbr->full_bw = 0; + bbr->round_start = 1; /* treat RTO like end of a round */ + bbr_lt_bw_sampling(sk, &rs); + } +} + +static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { + .flags = TCP_CONG_NON_RESTRICTED, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, + .tso_segs_goal = bbr_tso_segs_goal, + .get_info = bbr_get_info, + .set_state = bbr_set_state, +}; + +static int __init bbr_register(void) +{ + BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_bbr_cong_ops); +} + +static void __exit bbr_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_bbr_cong_ops); +} + +module_init(bbr_register); +module_exit(bbr_unregister); + +MODULE_AUTHOR("Van Jacobson "); +MODULE_AUTHOR("Neal Cardwell "); +MODULE_AUTHOR("Yuchung Cheng "); +MODULE_AUTHOR("Soheil Hassas Yeganeh "); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); -- cgit v1.2.1 From b5036cd4ed3173ab8cdbc85e2ba74acf46bafb51 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 20 Sep 2016 16:17:22 +0200 Subject: ipmr, ip6mr: return lastuse relative to now When I introduced the lastuse member I made a subtle error because it was returned as an absolute value but that is meaningless to user-space as it doesn't allow to see how old exactly an entry is. Let's make it similar to how the bridge returns such values and make it relative to "now" (jiffies). This allows us to show the actual age of the entries and is much more useful (e.g. user-space daemons can age out entries, iproute2 can display the lastuse properly). Fixes: 43b9e1274060 ("net: ipmr/ip6mr: add support for keeping an entry age") Reported-by: Satish Ashok Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 7 +++++-- net/ipv6/ip6mr.c | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 26253328d227..a87bcd2d4a94 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2076,6 +2076,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct rta_mfc_stats mfcs; struct nlattr *mp_attr; struct rtnexthop *nhp; + unsigned long lastuse; int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ @@ -2105,12 +2106,14 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_nest_end(skb, mp_attr); + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; + mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, - jiffies_to_clock_t(c->mfc_un.res.lastuse), + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) return -EMSGSIZE; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 6122f9c5cc49..fccb5dd91902 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2239,6 +2239,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, struct rta_mfc_stats mfcs; struct nlattr *mp_attr; struct rtnexthop *nhp; + unsigned long lastuse; int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ @@ -2269,12 +2270,14 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, nla_nest_end(skb, mp_attr); + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; + mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, - jiffies_to_clock_t(c->mfc_un.res.lastuse), + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) return -EMSGSIZE; -- cgit v1.2.1 From 63c43787d35e45562a6b5927e2edc8f4783d95b8 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 19 Sep 2016 16:17:57 +0200 Subject: vti6: fix input path Since commit 1625f4529957, vti6 is broken, all input packets are dropped (LINUX_MIB_XFRMINNOSTATES is incremented). XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 is set by vti6_rcv() before calling xfrm6_rcv()/xfrm6_rcv_spi(), thus we cannot set to NULL that value in xfrm6_rcv_spi(). A new function xfrm6_rcv_tnl() that enables to pass a value to xfrm6_rcv_spi() is added, so that xfrm6_rcv() is not touched (this function is used in several handlers). CC: Alexey Kodanev Fixes: 1625f4529957 ("net/xfrm_input: fix possible NULL deref of tunnel.ip6->parms.i_key") Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- net/ipv6/ip6_vti.c | 4 +--- net/ipv6/xfrm6_input.c | 16 +++++++++++----- net/ipv6/xfrm6_tunnel.c | 2 +- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 52a2f735881f..5bd3afdcc771 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -321,11 +321,9 @@ static int vti6_rcv(struct sk_buff *skb) goto discard; } - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; - rcu_read_unlock(); - return xfrm6_rcv(skb); + return xfrm6_rcv_tnl(skb, t); } rcu_read_unlock(); return -EINVAL; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 00a2d40677d6..b5789562aded 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -21,9 +21,10 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm6_extract_header(skb); } -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t) { - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); @@ -49,13 +50,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) return -1; } -int xfrm6_rcv(struct sk_buff *skb) +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], - 0); + 0, t); } -EXPORT_SYMBOL(xfrm6_rcv); +EXPORT_SYMBOL(xfrm6_rcv_tnl); +int xfrm6_rcv(struct sk_buff *skb) +{ + return xfrm6_rcv_tnl(skb, NULL); +} +EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 5743044cd660..e1c0bbe7996c 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -236,7 +236,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb) __be32 spi; spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi); + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, -- cgit v1.2.1 From c2f672fc94642bae96821a393f342edcfa9794a6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 20 Sep 2016 15:45:26 +0200 Subject: xfrm: state lookup can be lockless This is called from the packet input path, we get lock contention if many cpus handle ipsec in parallel. After recent rcu conversion it is safe to call __xfrm_state_lookup without the spinlock. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ba8bf518ba14..a38fdead38ea 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1431,9 +1431,9 @@ xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 { struct xfrm_state *x; - spin_lock_bh(&net->xfrm.xfrm_state_lock); + rcu_read_lock(); x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); - spin_unlock_bh(&net->xfrm.xfrm_state_lock); + rcu_read_unlock(); return x; } EXPORT_SYMBOL(xfrm_state_lookup); -- cgit v1.2.1 From 332ae8e2f6ecda5e50c5c62ed62894963e3a83f5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:53 +0100 Subject: net: cls_bpf: add hardware offload This patch adds hardware offload capability to cls_bpf classifier, similar to what have been done with U32 and flower. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index c6f7a47541eb..6523c5b4c0a5 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -39,6 +39,7 @@ struct cls_bpf_prog { struct list_head link; struct tcf_result res; bool exts_integrated; + bool offloaded; struct tcf_exts exts; u32 handle; union { @@ -138,6 +139,71 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) return !prog->bpf_ops; } +static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, + enum tc_clsbpf_command cmd) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_bpf_offload bpf_offload = {}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSBPF; + offload.cls_bpf = &bpf_offload; + + bpf_offload.command = cmd; + bpf_offload.exts = &prog->exts; + bpf_offload.prog = prog->filter; + bpf_offload.name = prog->bpf_name; + bpf_offload.exts_integrated = prog->exts_integrated; + + return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); +} + +static void cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, + struct cls_bpf_prog *oldprog) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct cls_bpf_prog *obj = prog; + enum tc_clsbpf_command cmd; + + if (oldprog && oldprog->offloaded) { + if (tc_should_offload(dev, tp, 0)) { + cmd = TC_CLSBPF_REPLACE; + } else { + obj = oldprog; + cmd = TC_CLSBPF_DESTROY; + } + } else { + if (!tc_should_offload(dev, tp, 0)) + return; + cmd = TC_CLSBPF_ADD; + } + + if (cls_bpf_offload_cmd(tp, obj, cmd)) + return; + + obj->offloaded = true; + if (oldprog) + oldprog->offloaded = false; +} + +static void cls_bpf_stop_offload(struct tcf_proto *tp, + struct cls_bpf_prog *prog) +{ + int err; + + if (!prog->offloaded) + return; + + err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); + if (err) { + pr_err("Stopping hardware offload failed: %d\n", err); + return; + } + + prog->offloaded = false; +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -177,6 +243,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg; + cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); @@ -193,6 +260,7 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force) return false; list_for_each_entry_safe(prog, tmp, &head->plist, link) { + cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); @@ -415,6 +483,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (ret < 0) goto errout; + cls_bpf_offload(tp, prog, oldprog); + if (oldprog) { list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); -- cgit v1.2.1 From 0d01d45f1b251448590c710baa32f722e43c62c7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:54 +0100 Subject: net: cls_bpf: limit hardware offload by software-only flag Add cls_bpf support for the TCA_CLS_FLAGS_SKIP_HW flag. Unlike U32 and flower cls_bpf already has some netlink flags defined. Create a new attribute to be able to use the same flag values as the above. Unlike U32 and flower reject unknown flags. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6523c5b4c0a5..ebf01f7c1470 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -27,6 +27,8 @@ MODULE_AUTHOR("Daniel Borkmann "); MODULE_DESCRIPTION("TC BPF based classifier"); #define CLS_BPF_NAME_LEN 256 +#define CLS_BPF_SUPPORTED_GEN_FLAGS \ + TCA_CLS_FLAGS_SKIP_HW struct cls_bpf_head { struct list_head plist; @@ -40,6 +42,7 @@ struct cls_bpf_prog { struct tcf_result res; bool exts_integrated; bool offloaded; + u32 gen_flags; struct tcf_exts exts; u32 handle; union { @@ -55,6 +58,7 @@ struct cls_bpf_prog { static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, [TCA_BPF_FLAGS] = { .type = NLA_U32 }, + [TCA_BPF_FLAGS_GEN] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 }, [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, @@ -154,6 +158,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, bpf_offload.prog = prog->filter; bpf_offload.name = prog->bpf_name; bpf_offload.exts_integrated = prog->exts_integrated; + bpf_offload.gen_flags = prog->gen_flags; return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &offload); @@ -167,14 +172,14 @@ static void cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, enum tc_clsbpf_command cmd; if (oldprog && oldprog->offloaded) { - if (tc_should_offload(dev, tp, 0)) { + if (tc_should_offload(dev, tp, prog->gen_flags)) { cmd = TC_CLSBPF_REPLACE; } else { obj = oldprog; cmd = TC_CLSBPF_DESTROY; } } else { - if (!tc_should_offload(dev, tp, 0)) + if (!tc_should_offload(dev, tp, prog->gen_flags)) return; cmd = TC_CLSBPF_ADD; } @@ -370,6 +375,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, { bool is_bpf, is_ebpf, have_exts = false; struct tcf_exts exts; + u32 gen_flags = 0; int ret; is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; @@ -394,8 +400,17 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; } + if (tb[TCA_BPF_FLAGS_GEN]) { + gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]); + if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS || + !tc_flags_valid(gen_flags)) { + ret = -EINVAL; + goto errout; + } + } prog->exts_integrated = have_exts; + prog->gen_flags = gen_flags; ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : cls_bpf_prog_from_efd(tb, prog, tp); @@ -568,6 +583,9 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags)) goto nla_put_failure; + if (prog->gen_flags && + nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags)) + goto nla_put_failure; nla_nest_end(skb, nest); -- cgit v1.2.1 From eadb41489fd2249e71fd14b36fb488ed7217ca4b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:55 +0100 Subject: net: cls_bpf: add support for marking filters as hardware-only Add cls_bpf support for the TCA_CLS_FLAGS_SKIP_SW flag. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index ebf01f7c1470..1becc2fe1bc5 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -28,7 +28,7 @@ MODULE_DESCRIPTION("TC BPF based classifier"); #define CLS_BPF_NAME_LEN 256 #define CLS_BPF_SUPPORTED_GEN_FLAGS \ - TCA_CLS_FLAGS_SKIP_HW + (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW) struct cls_bpf_head { struct list_head plist; @@ -96,7 +96,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, qdisc_skb_cb(skb)->tc_classid = prog->res.classid; - if (at_ingress) { + if (tc_skip_sw(prog->gen_flags)) { + filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0; + } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); bpf_compute_data_end(skb); @@ -164,32 +166,42 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, tp->protocol, &offload); } -static void cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, - struct cls_bpf_prog *oldprog) +static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, + struct cls_bpf_prog *oldprog) { struct net_device *dev = tp->q->dev_queue->dev; struct cls_bpf_prog *obj = prog; enum tc_clsbpf_command cmd; + bool skip_sw; + int ret; + + skip_sw = tc_skip_sw(prog->gen_flags) || + (oldprog && tc_skip_sw(oldprog->gen_flags)); if (oldprog && oldprog->offloaded) { if (tc_should_offload(dev, tp, prog->gen_flags)) { cmd = TC_CLSBPF_REPLACE; - } else { + } else if (!tc_skip_sw(prog->gen_flags)) { obj = oldprog; cmd = TC_CLSBPF_DESTROY; + } else { + return -EINVAL; } } else { if (!tc_should_offload(dev, tp, prog->gen_flags)) - return; + return skip_sw ? -EINVAL : 0; cmd = TC_CLSBPF_ADD; } - if (cls_bpf_offload_cmd(tp, obj, cmd)) - return; + ret = cls_bpf_offload_cmd(tp, obj, cmd); + if (ret) + return skip_sw ? ret : 0; obj->offloaded = true; if (oldprog) oldprog->offloaded = false; + + return 0; } static void cls_bpf_stop_offload(struct tcf_proto *tp, @@ -498,7 +510,11 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (ret < 0) goto errout; - cls_bpf_offload(tp, prog, oldprog); + ret = cls_bpf_offload(tp, prog, oldprog); + if (ret) { + cls_bpf_delete_prog(tp, prog); + return ret; + } if (oldprog) { list_replace_rcu(&oldprog->link, &prog->link); -- cgit v1.2.1 From 68d640630d4ef2a4bf3f68b5073dec5e4c4f878b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:44:02 +0100 Subject: net: cls_bpf: allow offloaded filters to update stats Call into offloaded filters to update stats. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1becc2fe1bc5..bb1d5a487081 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -221,6 +221,15 @@ static void cls_bpf_stop_offload(struct tcf_proto *tp, prog->offloaded = false; } +static void cls_bpf_offload_update_stats(struct tcf_proto *tp, + struct cls_bpf_prog *prog) +{ + if (!prog->offloaded) + return; + + cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS); +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -577,6 +586,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, tm->tcm_handle = prog->handle; + cls_bpf_offload_update_stats(tp, prog); + nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; -- cgit v1.2.1 From 9798e6fe4f9b6a2847a40e24b75e68afdc7a01b3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:44:05 +0100 Subject: net: act_mirred: allow statistic updates from offloaded actions Implement .stats_update() callback. The implementation is generic and can be reused by other simple actions if needed. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1c76387c5d9c..667dc382df82 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -204,6 +204,13 @@ out: return retval; } +static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse) +{ + tcf_lastuse_update(&a->tcfa_tm); + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); +} + static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -281,6 +288,7 @@ static struct tc_action_ops act_mirred_ops = { .type = TCA_ACT_MIRRED, .owner = THIS_MODULE, .act = tcf_mirred, + .stats_update = tcf_stats_update, .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, .init = tcf_mirred_init, -- cgit v1.2.1 From 5a924b8951f835b5ff8a3d9f434f3b230fc9905f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:31 +0100 Subject: rxrpc: Don't store the rxrpc header in the Tx queue sk_buffs Don't store the rxrpc protocol header in sk_buffs on the transmit queue, but rather generate it on the fly and pass it to kernel_sendmsg() as a separate iov. This reduces the amount of storage required. Note that the security header is still stored in the sk_buff as it may get encrypted along with the data (and doesn't change with each transmission). Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 5 ++- net/rxrpc/call_event.c | 11 ++----- net/rxrpc/conn_object.c | 1 - net/rxrpc/output.c | 83 +++++++++++++++++++++++++++++++++---------------- net/rxrpc/rxkad.c | 8 ++--- net/rxrpc/sendmsg.c | 51 +++++------------------------- 6 files changed, 71 insertions(+), 88 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 034f525f2235..f021df4a6a22 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -385,10 +385,9 @@ struct rxrpc_connection { int debug_id; /* debug ID for printks */ atomic_t serial; /* packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ + u32 security_nonce; /* response re-use preventer */ u8 size_align; /* data size alignment (for security) */ - u8 header_size; /* rxrpc + security header size */ u8 security_size; /* security header size */ - u32 security_nonce; /* response re-use preventer */ u8 security_ix; /* security type */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ }; @@ -946,7 +945,7 @@ extern const s8 rxrpc_ack_priority[]; * output.c */ int rxrpc_send_call_packet(struct rxrpc_call *, u8); -int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *); +int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *); void rxrpc_reject_packets(struct rxrpc_local *); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 7d1b99824ed9..6247ce25eb21 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -139,7 +139,6 @@ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, */ static void rxrpc_resend(struct rxrpc_call *call) { - struct rxrpc_wire_header *whdr; struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t cursor, seq, top; @@ -201,15 +200,8 @@ static void rxrpc_resend(struct rxrpc_call *call) skb = call->rxtx_buffer[ix]; rxrpc_get_skb(skb, rxrpc_skb_tx_got); spin_unlock_bh(&call->lock); - sp = rxrpc_skb(skb); - - /* Each Tx packet needs a new serial number */ - sp->hdr.serial = atomic_inc_return(&call->conn->serial); - whdr = (struct rxrpc_wire_header *)skb->head; - whdr->serial = htonl(sp->hdr.serial); - - if (rxrpc_send_data_packet(call->conn, skb) < 0) { + if (rxrpc_send_data_packet(call, skb) < 0) { call->resend_at = now + 2; rxrpc_free_skb(skb, rxrpc_skb_tx_freed); return; @@ -217,6 +209,7 @@ static void rxrpc_resend(struct rxrpc_call *call) if (rxrpc_is_client_call(call)) rxrpc_expose_client_call(call); + sp = rxrpc_skb(skb); sp->resend_at = now + rxrpc_resend_timeout; rxrpc_free_skb(skb, rxrpc_skb_tx_freed); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 3b55aee0c436..e1e83af47866 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -53,7 +53,6 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) spin_lock_init(&conn->state_lock); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); conn->size_align = 4; - conn->header_size = sizeof(struct rxrpc_wire_header); conn->idle_timestamp = jiffies; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 16e18a94ffa6..817fb0e82d6a 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -208,19 +208,42 @@ out: /* * send a packet through the transport endpoint */ -int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb) +int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) { - struct kvec iov[1]; + struct rxrpc_connection *conn = call->conn; + struct rxrpc_wire_header whdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct msghdr msg; + struct kvec iov[2]; + rxrpc_serial_t serial; + size_t len; int ret, opt; _enter(",{%d}", skb->len); - iov[0].iov_base = skb->head; - iov[0].iov_len = skb->len; + /* Each transmission of a Tx packet needs a new serial number */ + serial = atomic_inc_return(&conn->serial); + + whdr.epoch = htonl(conn->proto.epoch); + whdr.cid = htonl(call->cid); + whdr.callNumber = htonl(call->call_id); + whdr.seq = htonl(sp->hdr.seq); + whdr.serial = htonl(serial); + whdr.type = RXRPC_PACKET_TYPE_DATA; + whdr.flags = sp->hdr.flags; + whdr.userStatus = 0; + whdr.securityIndex = call->security_ix; + whdr._rsvd = htons(sp->hdr._rsvd); + whdr.serviceId = htons(call->service_id); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = skb->head; + iov[1].iov_len = skb->len; + len = iov[0].iov_len + iov[1].iov_len; - msg.msg_name = &conn->params.peer->srx.transport; - msg.msg_namelen = conn->params.peer->srx.transport_len; + msg.msg_name = &call->peer->srx.transport; + msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -234,26 +257,33 @@ int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb) } } + _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq); + /* send the packet with the don't fragment bit set if we currently * think it's small enough */ - if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) { - down_read(&conn->params.local->defrag_sem); - /* send the packet by UDP - * - returns -EMSGSIZE if UDP would have to fragment the packet - * to go out of the interface - * - in which case, we'll have processed the ICMP error - * message and update the peer record - */ - ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, - iov[0].iov_len); - - up_read(&conn->params.local->defrag_sem); - if (ret == -EMSGSIZE) - goto send_fragmentable; - - _leave(" = %d [%u]", ret, conn->params.peer->maxdata); - return ret; + if (iov[1].iov_len >= call->peer->maxdata) + goto send_fragmentable; + + down_read(&conn->params.local->defrag_sem); + /* send the packet by UDP + * - returns -EMSGSIZE if UDP would have to fragment the packet + * to go out of the interface + * - in which case, we'll have processed the ICMP error + * message and update the peer record + */ + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + + up_read(&conn->params.local->defrag_sem); + if (ret == -EMSGSIZE) + goto send_fragmentable; + +done: + if (ret == 0) { + sp->resend_at = jiffies + rxrpc_resend_timeout; + sp->hdr.serial = serial; } + _leave(" = %d [%u]", ret, call->peer->maxdata); + return ret; send_fragmentable: /* attempt to send this message with fragmentation enabled */ @@ -268,8 +298,8 @@ send_fragmentable: SOL_IP, IP_MTU_DISCOVER, (char *)&opt, sizeof(opt)); if (ret == 0) { - ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, - iov[0].iov_len); + ret = kernel_sendmsg(conn->params.local->socket, &msg, + iov, 2, len); opt = IP_PMTUDISC_DO; kernel_setsockopt(conn->params.local->socket, SOL_IP, @@ -298,8 +328,7 @@ send_fragmentable: } up_write(&conn->params.local->defrag_sem); - _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata); - return ret; + goto done; } /* diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index ae392558829d..88d080a1a3de 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -80,12 +80,10 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) case RXRPC_SECURITY_AUTH: conn->size_align = 8; conn->security_size = sizeof(struct rxkad_level1_hdr); - conn->header_size += sizeof(struct rxkad_level1_hdr); break; case RXRPC_SECURITY_ENCRYPT: conn->size_align = 8; conn->security_size = sizeof(struct rxkad_level2_hdr); - conn->header_size += sizeof(struct rxkad_level2_hdr); break; default: ret = -EKEYREJECTED; @@ -161,7 +159,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, _enter(""); - check = sp->hdr.seq ^ sp->hdr.callNumber; + check = sp->hdr.seq ^ call->call_id; data_size |= (u32)check << 16; hdr.data_size = htonl(data_size); @@ -205,7 +203,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, _enter(""); - check = sp->hdr.seq ^ sp->hdr.callNumber; + check = sp->hdr.seq ^ call->call_id; rxkhdr.data_size = htonl(data_size | (u32)check << 16); rxkhdr.checksum = 0; @@ -277,7 +275,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, /* calculate the security checksum */ x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); x |= sp->hdr.seq & 0x3fffffff; - call->crypto_buf[0] = htonl(sp->hdr.callNumber); + call->crypto_buf[0] = htonl(call->call_id); call->crypto_buf[1] = htonl(x); sg_init_one(&sg, call->crypto_buf, 8); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 6a39ee97a0b7..814b17f23971 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -134,13 +134,11 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, write_unlock_bh(&call->state_lock); } - _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - if (seq == 1 && rxrpc_is_client_call(call)) rxrpc_expose_client_call(call); sp->resend_at = jiffies + rxrpc_resend_timeout; - ret = rxrpc_send_data_packet(call->conn, skb); + ret = rxrpc_send_data_packet(call, skb); if (ret < 0) { _debug("need instant resend %d", ret); rxrpc_instant_resend(call, ix); @@ -150,29 +148,6 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, _leave(""); } -/* - * Convert a host-endian header into a network-endian header. - */ -static void rxrpc_insert_header(struct sk_buff *skb) -{ - struct rxrpc_wire_header whdr; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.seq = htonl(sp->hdr.seq); - whdr.serial = htonl(sp->hdr.serial); - whdr.type = sp->hdr.type; - whdr.flags = sp->hdr.flags; - whdr.userStatus = sp->hdr.userStatus; - whdr.securityIndex = sp->hdr.securityIndex; - whdr._rsvd = htons(sp->hdr._rsvd); - whdr.serviceId = htons(sp->hdr.serviceId); - - memcpy(skb->head, &whdr, sizeof(whdr)); -} - /* * send data through a socket * - must be called in process context @@ -232,7 +207,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, space = chunk + call->conn->size_align; space &= ~(call->conn->size_align - 1UL); - size = space + call->conn->header_size; + size = space + call->conn->security_size; _debug("SIZE: %zu/%zu/%zu", chunk, space, size); @@ -248,9 +223,9 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, ASSERTCMP(skb->mark, ==, 0); - _debug("HS: %u", call->conn->header_size); - skb_reserve(skb, call->conn->header_size); - skb->len += call->conn->header_size; + _debug("HS: %u", call->conn->security_size); + skb_reserve(skb, call->conn->security_size); + skb->len += call->conn->security_size; sp = rxrpc_skb(skb); sp->remain = chunk; @@ -312,33 +287,23 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, seq = call->tx_top + 1; - sp->hdr.epoch = conn->proto.epoch; - sp->hdr.cid = call->cid; - sp->hdr.callNumber = call->call_id; sp->hdr.seq = seq; - sp->hdr.serial = atomic_inc_return(&conn->serial); - sp->hdr.type = RXRPC_PACKET_TYPE_DATA; - sp->hdr.userStatus = 0; - sp->hdr.securityIndex = call->security_ix; sp->hdr._rsvd = 0; - sp->hdr.serviceId = call->service_id; + sp->hdr.flags = conn->out_clientflag; - sp->hdr.flags = conn->out_clientflag; if (msg_data_left(msg) == 0 && !more) sp->hdr.flags |= RXRPC_LAST_PACKET; else if (call->tx_top - call->tx_hard_ack < call->tx_winsize) sp->hdr.flags |= RXRPC_MORE_PACKETS; - if (more && seq & 1) + if (seq & 1) sp->hdr.flags |= RXRPC_REQUEST_ACK; ret = conn->security->secure_packet( - call, skb, skb->mark, - skb->head + sizeof(struct rxrpc_wire_header)); + call, skb, skb->mark, skb->head); if (ret < 0) goto out; - rxrpc_insert_header(skb); rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); skb = NULL; } -- cgit v1.2.1 From f07373ead455a396e15a431bc08d8ce1dac6f1cf Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:32 +0100 Subject: rxrpc: Add re-sent Tx annotation Add a Tx-phase annotation for packet buffers to indicate that a buffer has already been retransmitted. This will be used by future congestion management. Re-retransmissions of a packet don't affect the congestion window managment in the same way as initial retransmissions. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/call_event.c | 28 +++++++++++++++++++--------- net/rxrpc/input.c | 14 +++++++++++--- 3 files changed, 32 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index f021df4a6a22..dcf54e3fb478 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -505,6 +505,8 @@ struct rxrpc_call { #define RXRPC_TX_ANNO_UNACK 1 #define RXRPC_TX_ANNO_NAK 2 #define RXRPC_TX_ANNO_RETRANS 3 +#define RXRPC_TX_ANNO_MASK 0x03 +#define RXRPC_TX_ANNO_RESENT 0x04 #define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */ #define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */ #define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 6247ce25eb21..34ad967f2d81 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -144,7 +144,7 @@ static void rxrpc_resend(struct rxrpc_call *call) rxrpc_seq_t cursor, seq, top; unsigned long resend_at, now; int ix; - u8 annotation; + u8 annotation, anno_type; _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); @@ -165,14 +165,16 @@ static void rxrpc_resend(struct rxrpc_call *call) for (seq = cursor + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; annotation = call->rxtx_annotations[ix]; - if (annotation == RXRPC_TX_ANNO_ACK) + anno_type = annotation & RXRPC_TX_ANNO_MASK; + annotation &= ~RXRPC_TX_ANNO_MASK; + if (anno_type == RXRPC_TX_ANNO_ACK) continue; skb = call->rxtx_buffer[ix]; rxrpc_see_skb(skb, rxrpc_skb_tx_seen); sp = rxrpc_skb(skb); - if (annotation == RXRPC_TX_ANNO_UNACK) { + if (anno_type == RXRPC_TX_ANNO_UNACK) { if (time_after(sp->resend_at, now)) { if (time_before(sp->resend_at, resend_at)) resend_at = sp->resend_at; @@ -181,7 +183,7 @@ static void rxrpc_resend(struct rxrpc_call *call) } /* Okay, we need to retransmit a packet. */ - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; } call->resend_at = resend_at; @@ -194,7 +196,8 @@ static void rxrpc_resend(struct rxrpc_call *call) for (seq = cursor + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; annotation = call->rxtx_annotations[ix]; - if (annotation != RXRPC_TX_ANNO_RETRANS) + anno_type = annotation & RXRPC_TX_ANNO_MASK; + if (anno_type != RXRPC_TX_ANNO_RETRANS) continue; skb = call->rxtx_buffer[ix]; @@ -220,10 +223,17 @@ static void rxrpc_resend(struct rxrpc_call *call) * received and the packet might have been hard-ACK'd (in which * case it will no longer be in the buffer). */ - if (after(seq, call->tx_hard_ack) && - (call->rxtx_annotations[ix] == RXRPC_TX_ANNO_RETRANS || - call->rxtx_annotations[ix] == RXRPC_TX_ANNO_NAK)) - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; + if (after(seq, call->tx_hard_ack)) { + annotation = call->rxtx_annotations[ix]; + anno_type = annotation & RXRPC_TX_ANNO_MASK; + if (anno_type == RXRPC_TX_ANNO_RETRANS || + anno_type == RXRPC_TX_ANNO_NAK) { + annotation &= ~RXRPC_TX_ANNO_MASK; + annotation |= RXRPC_TX_ANNO_UNACK; + } + annotation |= RXRPC_TX_ANNO_RESENT; + call->rxtx_annotations[ix] = annotation; + } if (after(call->tx_hard_ack, seq)) seq = call->tx_hard_ack; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 7ac1edf3aac7..aa261df9fc9e 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -388,17 +388,25 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, { bool resend = false; int ix; + u8 annotation, anno_type; for (; nr_acks > 0; nr_acks--, seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; + annotation = call->rxtx_annotations[ix]; + anno_type = annotation & RXRPC_TX_ANNO_MASK; + annotation &= ~RXRPC_TX_ANNO_MASK; switch (*acks++) { case RXRPC_ACK_TYPE_ACK: - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_ACK; + if (anno_type == RXRPC_TX_ANNO_ACK) + continue; + call->rxtx_annotations[ix] = + RXRPC_TX_ANNO_ACK | annotation; break; case RXRPC_ACK_TYPE_NACK: - if (call->rxtx_annotations[ix] == RXRPC_TX_ANNO_NAK) + if (anno_type == RXRPC_TX_ANNO_NAK) continue; - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_NAK; + call->rxtx_annotations[ix] = + RXRPC_TX_ANNO_NAK | annotation; resend = true; break; default: -- cgit v1.2.1 From cf1a6474f80735ff4a5d99f3dd68a94dbec8455f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:41:53 +0100 Subject: rxrpc: Add per-peer RTT tracker Add a function to track the average RTT for a peer. Sources of RTT data will be added in subsequent patches. The RTT data will be useful in the future for determining resend timeouts and for handling the slow-start part of the Rx protocol. Also add a pair of tracepoints, one to log transmissions to elicit a response for RTT purposes and one to log responses that contribute RTT data. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 25 +++++++++++++++++++++---- net/rxrpc/misc.c | 8 ++++++++ net/rxrpc/peer_event.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index dcf54e3fb478..79c671e552c3 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -258,10 +258,11 @@ struct rxrpc_peer { /* calculated RTT cache */ #define RXRPC_RTT_CACHE_SIZE 32 - suseconds_t rtt; /* current RTT estimate (in uS) */ - unsigned int rtt_point; /* next entry at which to insert */ - unsigned int rtt_usage; /* amount of cache actually used */ - suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */ + u64 rtt; /* Current RTT estimate (in nS) */ + u64 rtt_sum; /* Sum of cache contents */ + u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */ + u8 rtt_cursor; /* next entry at which to insert */ + u8 rtt_usage; /* amount of cache actually used */ }; /* @@ -657,6 +658,20 @@ enum rxrpc_recvmsg_trace { extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; +enum rxrpc_rtt_tx_trace { + rxrpc_rtt_tx_ping, + rxrpc_rtt_tx__nr_trace +}; + +extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5]; + +enum rxrpc_rtt_rx_trace { + rxrpc_rtt_rx_ping_response, + rxrpc_rtt_rx__nr_trace +}; + +extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); @@ -955,6 +970,8 @@ void rxrpc_reject_packets(struct rxrpc_local *); */ void rxrpc_error_report(struct sock *); void rxrpc_peer_error_distributor(struct work_struct *); +void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, + rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); /* * peer_object.c diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 026e1f2e83ff..6321c23f9a6e 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -182,3 +182,11 @@ const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = { [rxrpc_recvmsg_to_be_accepted] = "TBAC", [rxrpc_recvmsg_return] = "RETN", }; + +const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = { + [rxrpc_rtt_tx_ping] = "PING", +}; + +const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { + [rxrpc_rtt_rx_ping_response] = "PONG", +}; diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 18276e7cb9e0..bf13b8470c9a 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -305,3 +305,44 @@ void rxrpc_peer_error_distributor(struct work_struct *work) rxrpc_put_peer(peer); _leave(""); } + +/* + * Add RTT information to cache. This is called in softirq mode and has + * exclusive access to the peer RTT data. + */ +void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + ktime_t send_time, ktime_t resp_time) +{ + struct rxrpc_peer *peer = call->peer; + s64 rtt; + u64 sum = peer->rtt_sum, avg; + u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage; + + rtt = ktime_to_ns(ktime_sub(resp_time, send_time)); + if (rtt < 0) + return; + + /* Replace the oldest datum in the RTT buffer */ + sum -= peer->rtt_cache[cursor]; + sum += rtt; + peer->rtt_cache[cursor] = rtt; + peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1); + peer->rtt_sum = sum; + if (usage < RXRPC_RTT_CACHE_SIZE) { + usage++; + peer->rtt_usage = usage; + } + + /* Now recalculate the average */ + if (usage == RXRPC_RTT_CACHE_SIZE) { + avg = sum / RXRPC_RTT_CACHE_SIZE; + } else { + avg = sum; + do_div(avg, usage); + } + + peer->rtt = avg; + trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt, + usage, avg); +} -- cgit v1.2.1 From bfca4c520f7ea78138ddccea2de18dc062b0fefd Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:09 +0300 Subject: net: skbuff: Export __skb_vlan_pop This exports the functionality of extracting the tag from the payload, without moving next vlan tag into hw accel tag. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- net/core/skbuff.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7bf82a28e10a..6c22351bd519 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4522,8 +4522,10 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len) } EXPORT_SYMBOL(skb_ensure_writable); -/* remove VLAN header from packet and update csum accordingly. */ -static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +/* remove VLAN header from packet and update csum accordingly. + * expects a non skb_vlan_tag_present skb with a vlan tag payload + */ +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr; unsigned int offset = skb->data - skb_mac_header(skb); @@ -4554,6 +4556,7 @@ pull: return err; } +EXPORT_SYMBOL(__skb_vlan_pop); int skb_vlan_pop(struct sk_buff *skb) { -- cgit v1.2.1 From 45a497f2d149a4a8061c61518a79d59f1f3034b2 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:10 +0300 Subject: net/sched: act_vlan: Introduce TCA_VLAN_ACT_MODIFY vlan action TCA_VLAN_ACT_MODIFY allows one to change an existing tag. It accepts same attributes as TCA_VLAN_ACT_PUSH (protocol, id, priority). If packet is vlan tagged, then the tag gets overwritten according to user specified attributes. For example, this allows user to replace a tag's vid while preserving its priority bits (as opposed to "action vlan pop pipe action vlan push"). Signed-off-by: Shmulik Ladkani Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_vlan.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 59a8d3150ae2..a95c00b119da 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -30,6 +30,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_vlan *v = to_vlan(a); int action; int err; + u16 tci; spin_lock(&v->tcf_lock); tcf_lastuse_update(&v->tcf_tm); @@ -48,6 +49,30 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, if (err) goto drop; break; + case TCA_VLAN_ACT_MODIFY: + /* No-op if no vlan tag (either hw-accel or in-payload) */ + if (!skb_vlan_tagged(skb)) + goto unlock; + /* extract existing tag (and guarantee no hw-accel tag) */ + if (skb_vlan_tag_present(skb)) { + tci = skb_vlan_tag_get(skb); + skb->vlan_tci = 0; + } else { + /* in-payload vlan tag, pop it */ + err = __skb_vlan_pop(skb, &tci); + if (err) + goto drop; + } + /* replace the vid */ + tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid; + /* replace prio bits, if tcfv_push_prio specified */ + if (v->tcfv_push_prio) { + tci &= ~VLAN_PRIO_MASK; + tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT; + } + /* put updated tci as hwaccel tag */ + __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci); + break; default: BUG(); } @@ -102,6 +127,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, case TCA_VLAN_ACT_POP: break; case TCA_VLAN_ACT_PUSH: + case TCA_VLAN_ACT_MODIFY: if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) tcf_hash_release(*a, bind); @@ -185,7 +211,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if (v->tcfv_action == TCA_VLAN_ACT_PUSH && + if ((v->tcfv_action == TCA_VLAN_ACT_PUSH || + v->tcfv_action == TCA_VLAN_ACT_MODIFY) && (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto) || -- cgit v1.2.1 From 636c2628086e40c86dac7ddc84a1c4b4fcccc6e3 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Tue, 20 Sep 2016 12:48:36 +0300 Subject: net: skbuff: Remove errornous length validation in skb_vlan_pop() In 93515d53b1 "net: move vlan pop/push functions into common code" skb_vlan_pop was moved from its private location in openvswitch to skbuff common code. In case skb has non hw-accel vlan tag, the original 'pop_vlan()' assured that skb->len is sufficient (if skb->len < VLAN_ETH_HLEN then pop was considered a no-op). This validation was moved as is into the new common 'skb_vlan_pop'. Alas, in its original location (openvswitch), there was a guarantee that 'data' points to the mac_header, therefore the 'skb->len < VLAN_ETH_HLEN' condition made sense. However there's no such guarantee in the generic 'skb_vlan_pop'. For short packets received in rx path going through 'skb_vlan_pop', this causes 'skb_vlan_pop' to fail pop-ing a valid vlan hdr (in the non hw-accel case) or to fail moving next tag into hw-accel tag. Remove the 'skb->len < VLAN_ETH_HLEN' condition entirely: It is superfluous since inner '__skb_vlan_pop' already verifies there are VLAN_ETH_HLEN writable bytes at the mac_header. Note this presents a slight change to skb_vlan_pop() users: In case total length is smaller than VLAN_ETH_HLEN, skb_vlan_pop() now returns an error, as opposed to previous "no-op" behavior. Existing callers (e.g. tc act vlan, ovs) usually drop the packet if 'skb_vlan_pop' fails. Fixes: 93515d53b1 ("net: move vlan pop/push functions into common code") Signed-off-by: Shmulik Ladkani Cc: Pravin Shelar Reviewed-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/skbuff.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6c22351bd519..b2a51bf1b0f9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4567,9 +4567,8 @@ int skb_vlan_pop(struct sk_buff *skb) if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { - if (unlikely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (unlikely(skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); @@ -4577,9 +4576,8 @@ int skb_vlan_pop(struct sk_buff *skb) return err; } /* move next vlan tag to hw accel tag */ - if (likely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (likely(skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD))) return 0; vlan_proto = skb->protocol; -- cgit v1.2.1 From ecf4ee41d25832a6ec52f8b54dfaa46c08b949d5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Tue, 20 Sep 2016 12:48:37 +0300 Subject: net: skbuff: Coding: Use eth_type_vlan() instead of open coding it Fix 'skb_vlan_pop' to use eth_type_vlan instead of directly comparing skb->protocol to ETH_P_8021Q or ETH_P_8021AD. Signed-off-by: Shmulik Ladkani Reviewed-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/skbuff.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b2a51bf1b0f9..d36c7548952f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4567,8 +4567,7 @@ int skb_vlan_pop(struct sk_buff *skb) if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { - if (unlikely(skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD))) + if (unlikely(!eth_type_vlan(skb->protocol))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); @@ -4576,8 +4575,7 @@ int skb_vlan_pop(struct sk_buff *skb) return err; } /* move next vlan tag to hw accel tag */ - if (likely(skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD))) + if (likely(!eth_type_vlan(skb->protocol))) return 0; vlan_proto = skb->protocol; -- cgit v1.2.1 From adb03115f4590baa280ddc440a8eff08a6be0cb7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Sep 2016 18:06:17 -0700 Subject: net: get rid of an signed integer overflow in ip_idents_reserve() Jiri Pirko reported an UBSAN warning happening in ip_idents_reserve() [] UBSAN: Undefined behaviour in ./arch/x86/include/asm/atomic.h:156:11 [] signed integer overflow: [] -2117905507 + -695755206 cannot be represented in type 'int' Since we do not have uatomic_add_return() yet, use atomic_cmpxchg() so that the arithmetics can be done using unsigned int. Fixes: 04ca6973f7c1 ("ip: make IP identifiers less predictable") Signed-off-by: Eric Dumazet Reported-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/route.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a1f2830d8110..b5b47a26d4ec 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -476,12 +476,18 @@ u32 ip_idents_reserve(u32 hash, int segs) atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; u32 old = ACCESS_ONCE(*p_tstamp); u32 now = (u32)jiffies; - u32 delta = 0; + u32 new, delta = 0; if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); - return atomic_add_return(segs + delta, p_id) - segs; + /* Do not use atomic_add_return() as it makes UBSAN unhappy */ + do { + old = (u32)atomic_read(p_id); + new = old + delta + segs; + } while (atomic_cmpxchg(p_id, old, new) != old); + + return new - segs; } EXPORT_SYMBOL(ip_idents_reserve); -- cgit v1.2.1 From f9616c35a0d786bc64fff4bf819d1e4984873367 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Sep 2016 22:45:58 -0700 Subject: tcp: implement TSQ for retransmits We saw sch_fq drops caused by the per flow limit of 100 packets and TCP when dealing with large cwnd and bursts of retransmits. Even after increasing the limit to 1000, and even after commit 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit time"), we can still have these drops. Under certain conditions, TCP can spend a considerable amount of time queuing thousands of skbs in a single tcp_xmit_retransmit_queue() invocation, incurring latency spikes and stalls of other softirq handlers. This patch implements TSQ for retransmits, limiting number of packets and giving more chance for scheduling packets in both ways. Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 72 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7d025a7804b5..478dfc539178 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -734,9 +734,16 @@ static void tcp_tsq_handler(struct sock *sk) { if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | - TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) - tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, + TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) { + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->lost_out > tp->retrans_out && + tp->snd_cwnd > tcp_packets_in_flight(tp)) + tcp_xmit_retransmit_queue(sk); + + tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, 0, GFP_ATOMIC); + } } /* * One tasklet per cpu tries to send more skbs. @@ -2039,6 +2046,39 @@ static int tcp_mtu_probe(struct sock *sk) return -1; } +/* TCP Small Queues : + * Control number of packets in qdisc/devices to two packets / or ~1 ms. + * (These limits are doubled for retransmits) + * This allows for : + * - better RTT estimation and ACK scheduling + * - faster recovery + * - high rates + * Alas, some drivers / subsystems require a fair amount + * of queued bytes to ensure line rate. + * One example is wifi aggregation (802.11 AMPDU) + */ +static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, + unsigned int factor) +{ + unsigned int limit; + + limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); + limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); + limit <<= factor; + + if (atomic_read(&sk->sk_wmem_alloc) > limit) { + set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags); + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED, so we must + * test again the condition. + */ + smp_mb__after_atomic(); + if (atomic_read(&sk->sk_wmem_alloc) > limit) + return true; + } + return false; +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -2125,29 +2165,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; - /* TCP Small Queues : - * Control number of packets in qdisc/devices to two packets / or ~1 ms. - * This allows for : - * - better RTT estimation and ACK scheduling - * - faster recovery - * - high rates - * Alas, some drivers / subsystems require a fair amount - * of queued bytes to ensure line rate. - * One example is wifi aggregation (802.11 AMPDU) - */ - limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); - limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); - - if (atomic_read(&sk->sk_wmem_alloc) > limit) { - set_bit(TSQ_THROTTLED, &tp->tsq_flags); - /* It is possible TX completion already happened - * before we set TSQ_THROTTLED, so we must - * test again the condition. - */ - smp_mb__after_atomic(); - if (atomic_read(&sk->sk_wmem_alloc) > limit) - break; - } + if (tcp_small_queue_check(sk, skb, 0)) + break; if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2847,6 +2866,9 @@ begin_fwd: if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; + if (tcp_small_queue_check(sk, skb, 1)) + return; + if (tcp_retransmit_skb(sk, skb, segs)) return; -- cgit v1.2.1 From e2f036a97271cf5811ee754bf321a29a814577f9 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 21 Sep 2016 08:45:55 -0300 Subject: sctp: rename WORD_TRUNC/ROUND macros To something more meaningful these days, specially because this is working on packet headers or lengths and which are not tied to any CPU arch but to the protocol itself. So, WORD_TRUNC becomes SCTP_TRUNC4 and WORD_ROUND becomes SCTP_PAD4. Reported-by: David Laight Reported-by: David Miller Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/netfilter/xt_sctp.c | 2 +- net/sctp/associola.c | 2 +- net/sctp/chunk.c | 6 +++--- net/sctp/input.c | 8 ++++---- net/sctp/inqueue.c | 2 +- net/sctp/output.c | 12 ++++++------ net/sctp/sm_make_chunk.c | 28 ++++++++++++++-------------- net/sctp/sm_statefuns.c | 6 +++--- net/sctp/transport.c | 4 ++-- net/sctp/ulpevent.c | 4 ++-- 10 files changed, 37 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index ef36a56a02c6..4dedb96d1a06 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -68,7 +68,7 @@ match_packet(const struct sk_buff *skb, ++i, offset, sch->type, htons(sch->length), sch->flags); #endif - offset += WORD_ROUND(ntohs(sch->length)); + offset += SCTP_PAD4(ntohs(sch->length)); pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 1c23060c41a6..f10d3397f917 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1408,7 +1408,7 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc) transports) { if (t->pmtu_pending && t->dst) { sctp_transport_update_pmtu(sk, t, - WORD_TRUNC(dst_mtu(t->dst))); + SCTP_TRUNC4(dst_mtu(t->dst))); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index af9cc8055465..76eae828ec89 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -208,8 +208,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) - max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) + - hmac_desc->hmac_len); + max_data -= SCTP_PAD4(sizeof(sctp_auth_chunk_t) + + hmac_desc->hmac_len); } /* Now, check if we need to reduce our max */ @@ -229,7 +229,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max) - max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t)); + max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) diff --git a/net/sctp/input.c b/net/sctp/input.c index 69444d32ecda..a1d85065bfc0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -605,7 +605,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) /* PMTU discovery (RFC1191) */ if (ICMP_FRAG_NEEDED == code) { sctp_icmp_frag_needed(sk, asoc, transport, - WORD_TRUNC(info)); + SCTP_TRUNC4(info)); goto out_unlock; } else { if (ICMP_PROT_UNREACH == code) { @@ -673,7 +673,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb) if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = offset + WORD_ROUND(ntohs(ch->length)); + ch_end = offset + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb->len) break; @@ -1121,7 +1121,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) break; @@ -1190,7 +1190,7 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, * that the chunk length doesn't cause overflow. Otherwise, we'll * walk off the end. */ - if (WORD_ROUND(ntohs(ch->length)) > skb->len) + if (SCTP_PAD4(ntohs(ch->length)) > skb->len) return NULL; /* If this is INIT/INIT-ACK look inside the chunk too. */ diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 6437aa97cfd7..f731de3e8428 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -213,7 +213,7 @@ new_skb: } chunk->chunk_hdr = ch; - chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ diff --git a/net/sctp/output.c b/net/sctp/output.c index 0c605ec74dc4..2a5c1896d18f 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -297,7 +297,7 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { sctp_xmit_t retval = SCTP_XMIT_OK; - __u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length)); + __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length)); /* Check to see if this chunk will fit into the packet */ retval = sctp_packet_will_fit(packet, chunk, chunk_len); @@ -508,7 +508,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) if (gso) { pkt_size = packet->overhead; list_for_each_entry(chunk, &packet->chunk_list, list) { - int padded = WORD_ROUND(chunk->skb->len); + int padded = SCTP_PAD4(chunk->skb->len); if (pkt_size + padded > tp->pathmtu) break; @@ -538,7 +538,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * included in the chunk length field. The sender should * never pad with more than 3 bytes. * - * [This whole comment explains WORD_ROUND() below.] + * [This whole comment explains SCTP_PAD4() below.] */ pkt_size -= packet->overhead; @@ -560,7 +560,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) has_data = 1; } - padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; + padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len; if (padding) memset(skb_put(chunk->skb, padding), 0, padding); @@ -587,7 +587,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * acknowledged or have failed. * Re-queue auth chunks if needed. */ - pkt_size -= WORD_ROUND(chunk->skb->len); + pkt_size -= SCTP_PAD4(chunk->skb->len); if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) sctp_chunk_free(chunk); @@ -911,7 +911,7 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, */ maxsize = pmtu - packet->overhead; if (packet->auth) - maxsize -= WORD_ROUND(packet->auth->skb->len); + maxsize -= SCTP_PAD4(packet->auth->skb->len); if (chunk_len > maxsize) retval = SCTP_XMIT_PMTU_FULL; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 8c77b87a8565..79dd66079dd7 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -253,7 +253,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, num_types = sp->pf->supported_addrs(sp, types); chunksize = sizeof(init) + addrs_len; - chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types)); + chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types)); chunksize += sizeof(ecap_param); if (asoc->prsctp_enable) @@ -283,14 +283,14 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* Add HMACS parameter length if any were defined */ auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; if (auth_hmacs->length) - chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); + chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; /* Add CHUNKS parameter length */ auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; if (auth_chunks->length) - chunksize += WORD_ROUND(ntohs(auth_chunks->length)); + chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; @@ -300,8 +300,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* If we have any extensions to report, account for that */ if (num_ext) - chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * @@ -443,13 +443,13 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; if (auth_hmacs->length) - chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); + chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; if (auth_chunks->length) - chunksize += WORD_ROUND(ntohs(auth_chunks->length)); + chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; @@ -458,8 +458,8 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, } if (num_ext) - chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + + num_ext); /* Now allocate and fill out the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); @@ -1390,7 +1390,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, struct sock *sk; /* No need to allocate LL here, as this is only a chunk. */ - skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp); + skb = alloc_skb(SCTP_PAD4(sizeof(sctp_chunkhdr_t) + paylen), gfp); if (!skb) goto nodata; @@ -1482,7 +1482,7 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) void *target; void *padding; int chunklen = ntohs(chunk->chunk_hdr->length); - int padlen = WORD_ROUND(chunklen) - chunklen; + int padlen = SCTP_PAD4(chunklen) - chunklen; padding = skb_put(chunk->skb, padlen); target = skb_put(chunk->skb, len); @@ -1900,7 +1900,7 @@ static int sctp_process_missing_param(const struct sctp_association *asoc, struct __sctp_missing report; __u16 len; - len = WORD_ROUND(sizeof(report)); + len = SCTP_PAD4(sizeof(report)); /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. @@ -2098,9 +2098,9 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc, if (*errp) { if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM, - WORD_ROUND(ntohs(param.p->length)))) + SCTP_PAD4(ntohs(param.p->length)))) sctp_addto_chunk_fixed(*errp, - WORD_ROUND(ntohs(param.p->length)), + SCTP_PAD4(ntohs(param.p->length)), param.v); } else { /* If there is no memory for generating the ERROR diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d88bb2b0b699..026e3bca4a94 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3454,7 +3454,7 @@ sctp_disposition_t sctp_sf_ootb(struct net *net, } /* Report violation if chunk len overflows */ - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -4185,7 +4185,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, hdr = unk_chunk->chunk_hdr; err_chunk = sctp_make_op_error(asoc, unk_chunk, SCTP_ERROR_UNKNOWN_CHUNK, hdr, - WORD_ROUND(ntohs(hdr->length)), + SCTP_PAD4(ntohs(hdr->length)), 0); if (err_chunk) { sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, @@ -4203,7 +4203,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, hdr = unk_chunk->chunk_hdr; err_chunk = sctp_make_op_error(asoc, unk_chunk, SCTP_ERROR_UNKNOWN_CHUNK, hdr, - WORD_ROUND(ntohs(hdr->length)), + SCTP_PAD4(ntohs(hdr->length)), 0); if (err_chunk) { sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 81b86678be4d..ce54dce13ddb 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -233,7 +233,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) } if (transport->dst) { - transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); + transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); } else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } @@ -287,7 +287,7 @@ void sctp_transport_route(struct sctp_transport *transport, return; } if (transport->dst) { - transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); + transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index d85b803da11d..bea00058ce35 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -383,7 +383,7 @@ sctp_ulpevent_make_remote_error(const struct sctp_association *asoc, ch = (sctp_errhdr_t *)(chunk->skb->data); cause = ch->cause; - elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t); + elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(sctp_errhdr_t); /* Pull off the ERROR header. */ skb_pull(chunk->skb, sizeof(sctp_errhdr_t)); @@ -688,7 +688,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, * MUST ignore the padding bytes. */ len = ntohs(chunk->chunk_hdr->length); - padding = WORD_ROUND(len) - len; + padding = SCTP_PAD4(len) - len; /* Fixup cloned skb with just this chunks data. */ skb_trim(skb, chunk->chunk_end - padding - skb->data); -- cgit v1.2.1 From 4a225ce3950879a5426c56f306f5d1c9d6330292 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 21 Sep 2016 08:45:56 -0300 Subject: sctp: make use of SCTP_TRUNC4 macro And avoid the usage of '&~3'. This is the last place still not using the macro. Also break the line to make it easier to read. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/chunk.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 76eae828ec89..8afe2e90d003 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -195,9 +195,10 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, /* This is the biggest possible DATA chunk that can fit into * the packet */ - max_data = (asoc->pathmtu - - sctp_sk(asoc->base.sk)->pf->af->net_header_len - - sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk)) & ~3; + max_data = asoc->pathmtu - + sctp_sk(asoc->base.sk)->pf->af->net_header_len - + sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk); + max_data = SCTP_TRUNC4(max_data); max = asoc->frag_point; /* If the the peer requested that we authenticate DATA chunks -- cgit v1.2.1 From 8e83134db4ecb77a1dc3390b60ddeea840a5afbc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:31 +0100 Subject: rxrpc: Send pings to get RTT data Send a PING ACK packet to the peer when we get a new incoming call from a peer we don't have a record for. The PING RESPONSE ACK packet will tell us the following about the peer: (1) its receive window size (2) its MTU sizes (3) its support for jumbo DATA packets (4) if it supports slow start (similar to RFC 5681) (5) an estimate of the RTT This is necessary because the peer won't normally send us an ACK until it gets to the Rx phase and we send it a packet, but we would like to know some of this information before we start sending packets. A pair of tracepoints are added so that RTT determination can be observed. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 7 +++++-- net/rxrpc/input.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- net/rxrpc/misc.c | 11 ++++++----- net/rxrpc/output.c | 22 ++++++++++++++++++++++ 4 files changed, 80 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 79c671e552c3..8b47f468eb9d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -403,6 +403,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ + RXRPC_CALL_PINGING, /* Ping in process */ }; /* @@ -487,6 +488,8 @@ struct rxrpc_call { u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ int debug_id; /* debug ID for printks */ + unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ + unsigned short rx_pkt_len; /* Current recvmsg packet len */ /* Rx/Tx circular buffer, depending on phase. * @@ -530,8 +533,8 @@ struct rxrpc_call { u16 ackr_skew; /* skew on packet being ACK'd */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ - unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ - unsigned short rx_pkt_len; /* Current recvmsg packet len */ + rxrpc_serial_t ackr_ping; /* Last ping sent */ + ktime_t ackr_ping_time; /* Time last ping sent */ /* transmission-phase ACK management */ rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index aa261df9fc9e..a0a5bd108c9e 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -36,6 +36,19 @@ static void rxrpc_proto_abort(const char *why, } } +/* + * Ping the other end to fill our RTT cache and to retrieve the rwind + * and MTU parameters. + */ +static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, + int skew) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, + true, true); +} + /* * Apply a hard ACK by advancing the Tx window. */ @@ -342,6 +355,32 @@ ack: _leave(" [queued]"); } +/* + * Process a ping response. + */ +static void rxrpc_input_ping_response(struct rxrpc_call *call, + ktime_t resp_time, + rxrpc_serial_t orig_serial, + rxrpc_serial_t ack_serial) +{ + rxrpc_serial_t ping_serial; + ktime_t ping_time; + + ping_time = call->ackr_ping_time; + smp_rmb(); + ping_serial = call->ackr_ping; + + if (!test_bit(RXRPC_CALL_PINGING, &call->flags) || + before(orig_serial, ping_serial)) + return; + clear_bit(RXRPC_CALL_PINGING, &call->flags); + if (after(orig_serial, ping_serial)) + return; + + rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_ping_response, + orig_serial, ack_serial, ping_time, resp_time); +} + /* * Process the extra information that may be appended to an ACK packet */ @@ -438,6 +477,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_ackinfo info; u8 acks[RXRPC_MAXACKS]; } buf; + rxrpc_serial_t acked_serial; rxrpc_seq_t first_soft_ack, hard_ack; int nr_acks, offset; @@ -449,6 +489,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, } sp->offset += sizeof(buf.ack); + acked_serial = ntohl(buf.ack.serial); first_soft_ack = ntohl(buf.ack.firstPacket); hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; @@ -460,10 +501,14 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, ntohs(buf.ack.maxSkew), first_soft_ack, ntohl(buf.ack.previousPacket), - ntohl(buf.ack.serial), + acked_serial, rxrpc_acks(buf.ack.reason), buf.ack.nAcks); + if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) + rxrpc_input_ping_response(call, skb->tstamp, acked_serial, + sp->hdr.serial); + if (buf.ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", sp->hdr.serial); rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, @@ -830,6 +875,7 @@ void rxrpc_data_ready(struct sock *udp_sk) rcu_read_unlock(); goto reject_packet; } + rxrpc_send_ping(call, skb, skew); } rxrpc_input_call_packet(call, skb, skew); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 6321c23f9a6e..56e668352fc7 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -83,11 +83,12 @@ const s8 rxrpc_ack_priority[] = { [RXRPC_ACK_DELAY] = 1, [RXRPC_ACK_REQUESTED] = 2, [RXRPC_ACK_IDLE] = 3, - [RXRPC_ACK_PING_RESPONSE] = 4, - [RXRPC_ACK_DUPLICATE] = 5, - [RXRPC_ACK_OUT_OF_SEQUENCE] = 6, - [RXRPC_ACK_EXCEEDS_WINDOW] = 7, - [RXRPC_ACK_NOSPACE] = 8, + [RXRPC_ACK_DUPLICATE] = 4, + [RXRPC_ACK_OUT_OF_SEQUENCE] = 5, + [RXRPC_ACK_EXCEEDS_WINDOW] = 6, + [RXRPC_ACK_NOSPACE] = 7, + [RXRPC_ACK_PING_RESPONSE] = 8, + [RXRPC_ACK_PING] = 9, }; const char *rxrpc_acks(u8 reason) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 817fb0e82d6a..0d89cd3f2c01 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -57,6 +57,9 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ack.reason = call->ackr_reason; pkt->ack.nAcks = top - hard_ack; + if (pkt->ack.reason == RXRPC_ACK_PING) + pkt->whdr.flags |= RXRPC_REQUEST_ACK; + if (after(top, hard_ack)) { seq = hard_ack + 1; do { @@ -97,6 +100,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) struct kvec iov[2]; rxrpc_serial_t serial; size_t len, n; + bool ping = false; int ioc, ret; u32 abort_code; @@ -147,6 +151,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) ret = 0; goto out; } + ping = (call->ackr_reason == RXRPC_ACK_PING); n = rxrpc_fill_out_ack(call, pkt); call->ackr_reason = 0; @@ -183,12 +188,29 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) goto out; } + if (ping) { + call->ackr_ping = serial; + smp_wmb(); + /* We need to stick a time in before we send the packet in case + * the reply gets back before kernel_sendmsg() completes - but + * asking UDP to send the packet can take a relatively long + * time, so we update the time after, on the assumption that + * the packet transmission is more likely to happen towards the + * end of the kernel_sendmsg() call. + */ + call->ackr_ping_time = ktime_get_real(); + set_bit(RXRPC_CALL_PINGING, &call->flags); + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_ping, serial); + } ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); + if (ping) + call->ackr_ping_time = ktime_get_real(); if (ret < 0 && call->state < RXRPC_CALL_COMPLETE) { switch (type) { case RXRPC_PACKET_TYPE_ACK: + clear_bit(RXRPC_CALL_PINGING, &call->flags); rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), -- cgit v1.2.1 From 7aa51da7c88d42cc0bb85ab7d01429fbd4e51282 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:31 +0100 Subject: rxrpc: Expedite ping response transmission Expedite the transmission of a response to a PING ACK by sending it from sendmsg if one is pending. We're most likely to see a PING ACK during the client call Tx phase as the other side may use it to determine a number of parameters, such as the client's receive window size, the RTT and whether the client is doing slow start (similar to RFC5681). If we don't expedite it, it's left to the background processing thread to transmit. Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 814b17f23971..3c969de3ef05 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -180,6 +180,10 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, copied = 0; do { + /* Check to see if there's a ping ACK to reply to. */ + if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE) + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + if (!skb) { size_t size, chunk, max, space; -- cgit v1.2.1 From 50235c4b5a2fb9a9690f02cd1dea6ca047d7f79e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:31 +0100 Subject: rxrpc: Obtain RTT data by requesting ACKs on DATA packets In addition to sending a PING ACK to gain RTT data, we can set the RXRPC_REQUEST_ACK flag on a DATA packet and get a REQUESTED-ACK ACK. The ACK packet contains the serial number of the packet it is in response to, so we can look through the Tx buffer for a matching DATA packet. This requires that the data packets be stamped with the time of transmission as a ktime rather than having the resend_at time in jiffies. This further requires the resend code to do the resend determination in ktimes and convert to jiffies to set the timer. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 7 +++---- net/rxrpc/call_event.c | 19 +++++++++---------- net/rxrpc/input.c | 35 +++++++++++++++++++++++++++++++++++ net/rxrpc/misc.c | 6 ++++-- net/rxrpc/output.c | 7 +++++-- net/rxrpc/sendmsg.c | 1 - net/rxrpc/sysctl.c | 2 +- 7 files changed, 57 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 8b47f468eb9d..1c4597b2c6cd 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -142,10 +142,7 @@ struct rxrpc_host_header { */ struct rxrpc_skb_priv { union { - unsigned long resend_at; /* time in jiffies at which to resend */ - struct { - u8 nr_jumbo; /* Number of jumbo subpackets */ - }; + u8 nr_jumbo; /* Number of jumbo subpackets */ }; union { unsigned int offset; /* offset into buffer of next read */ @@ -663,6 +660,7 @@ extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_ping, + rxrpc_rtt_tx_data, rxrpc_rtt_tx__nr_trace }; @@ -670,6 +668,7 @@ extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5]; enum rxrpc_rtt_rx_trace { rxrpc_rtt_rx_ping_response, + rxrpc_rtt_rx_requested_ack, rxrpc_rtt_rx__nr_trace }; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 34ad967f2d81..adb2ec61e21f 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -142,12 +142,14 @@ static void rxrpc_resend(struct rxrpc_call *call) struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t cursor, seq, top; - unsigned long resend_at, now; + ktime_t now = ktime_get_real(), max_age, oldest, resend_at; int ix; u8 annotation, anno_type; _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); + max_age = ktime_sub_ms(now, rxrpc_resend_timeout); + spin_lock_bh(&call->lock); cursor = call->tx_hard_ack; @@ -160,8 +162,7 @@ static void rxrpc_resend(struct rxrpc_call *call) * the packets in the Tx buffer we're going to resend and what the new * resend timeout will be. */ - now = jiffies; - resend_at = now + rxrpc_resend_timeout; + oldest = now; for (seq = cursor + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; annotation = call->rxtx_annotations[ix]; @@ -175,9 +176,9 @@ static void rxrpc_resend(struct rxrpc_call *call) sp = rxrpc_skb(skb); if (anno_type == RXRPC_TX_ANNO_UNACK) { - if (time_after(sp->resend_at, now)) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; + if (ktime_after(skb->tstamp, max_age)) { + if (ktime_before(skb->tstamp, oldest)) + oldest = skb->tstamp; continue; } } @@ -186,7 +187,8 @@ static void rxrpc_resend(struct rxrpc_call *call) call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; } - call->resend_at = resend_at; + resend_at = ktime_sub(ktime_add_ns(oldest, rxrpc_resend_timeout), now); + call->resend_at = jiffies + nsecs_to_jiffies(ktime_to_ns(resend_at)); /* Now go through the Tx window and perform the retransmissions. We * have to drop the lock for each send. If an ACK comes in whilst the @@ -205,15 +207,12 @@ static void rxrpc_resend(struct rxrpc_call *call) spin_unlock_bh(&call->lock); if (rxrpc_send_data_packet(call, skb) < 0) { - call->resend_at = now + 2; rxrpc_free_skb(skb, rxrpc_skb_tx_freed); return; } if (rxrpc_is_client_call(call)) rxrpc_expose_client_call(call); - sp = rxrpc_skb(skb); - sp->resend_at = now + rxrpc_resend_timeout; rxrpc_free_skb(skb, rxrpc_skb_tx_freed); spin_lock_bh(&call->lock); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index a0a5bd108c9e..c121949de3c8 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -355,6 +355,38 @@ ack: _leave(" [queued]"); } +/* + * Process a requested ACK. + */ +static void rxrpc_input_requested_ack(struct rxrpc_call *call, + ktime_t resp_time, + rxrpc_serial_t orig_serial, + rxrpc_serial_t ack_serial) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + ktime_t sent_at; + int ix; + + for (ix = 0; ix < RXRPC_RXTX_BUFF_SIZE; ix++) { + skb = call->rxtx_buffer[ix]; + if (!skb) + continue; + + sp = rxrpc_skb(skb); + if (sp->hdr.serial != orig_serial) + continue; + smp_rmb(); + sent_at = skb->tstamp; + goto found; + } + return; + +found: + rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_requested_ack, + orig_serial, ack_serial, sent_at, resp_time); +} + /* * Process a ping response. */ @@ -508,6 +540,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) rxrpc_input_ping_response(call, skb->tstamp, acked_serial, sp->hdr.serial); + if (buf.ack.reason == RXRPC_ACK_REQUESTED) + rxrpc_input_requested_ack(call, skb->tstamp, acked_serial, + sp->hdr.serial); if (buf.ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", sp->hdr.serial); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 56e668352fc7..0d425e707f22 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -68,9 +68,9 @@ unsigned int rxrpc_rx_mtu = 5692; unsigned int rxrpc_rx_jumbo_max = 4; /* - * Time till packet resend (in jiffies). + * Time till packet resend (in milliseconds). */ -unsigned int rxrpc_resend_timeout = 4 * HZ; +unsigned int rxrpc_resend_timeout = 4 * 1000; const char *const rxrpc_pkts[] = { "?00", @@ -186,8 +186,10 @@ const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = { const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = { [rxrpc_rtt_tx_ping] = "PING", + [rxrpc_rtt_tx_data] = "DATA", }; const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { [rxrpc_rtt_rx_ping_response] = "PONG", + [rxrpc_rtt_rx_requested_ack] = "RACK", }; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 0d89cd3f2c01..db01fbb70d23 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -300,9 +300,12 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) goto send_fragmentable; done: - if (ret == 0) { - sp->resend_at = jiffies + rxrpc_resend_timeout; + if (ret >= 0) { + skb->tstamp = ktime_get_real(); + smp_wmb(); sp->hdr.serial = serial; + if (whdr.flags & RXRPC_REQUEST_ACK) + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); } _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 3c969de3ef05..607223f4f871 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -137,7 +137,6 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, if (seq == 1 && rxrpc_is_client_call(call)) rxrpc_expose_client_call(call); - sp->resend_at = jiffies + rxrpc_resend_timeout; ret = rxrpc_send_data_packet(call, skb); if (ret < 0) { _debug("need instant resend %d", ret); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index a03c61c672f5..13d1df03ebac 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -59,7 +59,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_resend_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&one, }, { -- cgit v1.2.1 From de1d657816c6fbb70f07b01d50ec669dff0d4e60 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 21 Sep 2016 16:16:14 -0700 Subject: tcp: fix under-accounting retransmit SNMP counters This patch fixes these under-accounting SNMP rtx stats LINUX_MIB_TCPFORWARDRETRANS LINUX_MIB_TCPFASTRETRANS LINUX_MIB_TCPSLOWSTARTRETRANS when retransmitting TSO packets Fixes: 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit time") Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f53d0cca5fa4..e15ec82a6319 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2831,7 +2831,7 @@ begin_fwd: if (tcp_retransmit_skb(sk, skb, segs)) return; - NET_INC_STATS(sock_net(sk), mib_idx); + NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); -- cgit v1.2.1 From 7e32b44361abc77fbc01f2b97b045c405b2583e5 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 21 Sep 2016 16:16:15 -0700 Subject: tcp: properly account Fast Open SYN-ACK retrans Since the TFO socket is accepted right off SYN-data, the socket owner can call getsockopt(TCP_INFO) to collect ongoing SYN-ACK retransmission or timeout stats (i.e., tcpi_total_retrans, tcpi_retransmits). Currently those stats are only updated upon handshake completes. This patch fixes it. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 2 ++ net/ipv4/tcp_timer.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ebf45b38bc3..08323bd95f2a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5885,7 +5885,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * so release it. */ if (req) { - tp->total_retrans = req->num_retrans; + inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); } else { /* Make sure socket is routed, for correct metrics. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e15ec82a6319..5288cec4a2b2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3568,6 +3568,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) if (!res) { __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + if (unlikely(tcp_passive_fastopen(sk))) + tcp_sk(sk)->total_retrans++; } return res; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d84930b2dd95..f712b411f6ed 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -384,6 +384,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) */ inet_rtx_syn_ack(sk, req); req->num_timeout++; + icsk->icsk_retransmits++; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); } -- cgit v1.2.1 From 0d4b103c008ac9f6f438d2618c155f6e868e5a67 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:31 +0100 Subject: rxrpc: Reduce the number of ACK-Requests sent Reduce the number of ACK-Requests we set on DATA packets that we're sending to reduce network traffic. We set the flag on odd-numbered DATA packets to start off the RTT cache until we have at least three entries in it and then probe once per second thereafter to keep it topped up. This could be made tunable in future. Note that from this point, the RXRPC_REQUEST_ACK flag is set on DATA packets as we transmit them and not stored statically in the sk_buff. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/output.c | 13 +++++++++++-- net/rxrpc/peer_object.c | 1 + net/rxrpc/sendmsg.c | 2 -- 4 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1c4597b2c6cd..b13754a6dd7a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -255,6 +255,7 @@ struct rxrpc_peer { /* calculated RTT cache */ #define RXRPC_RTT_CACHE_SIZE 32 + ktime_t rtt_last_req; /* Time of last RTT request */ u64 rtt; /* Current RTT estimate (in nS) */ u64 rtt_sum; /* Sum of cache contents */ u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */ diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index db01fbb70d23..282cb1e36d06 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -270,6 +270,12 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) msg.msg_controllen = 0; msg.msg_flags = 0; + /* If our RTT cache needs working on, request an ACK. */ + if ((call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || + ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + ktime_get_real())) + whdr.flags |= RXRPC_REQUEST_ACK; + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { @@ -301,11 +307,14 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) done: if (ret >= 0) { - skb->tstamp = ktime_get_real(); + ktime_t now = ktime_get_real(); + skb->tstamp = now; smp_wmb(); sp->hdr.serial = serial; - if (whdr.flags & RXRPC_REQUEST_ACK) + if (whdr.flags & RXRPC_REQUEST_ACK) { + call->peer->rtt_last_req = now; trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); + } } _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index f3e5766910fd..941b724d523b 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -244,6 +244,7 @@ static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key) peer->hash_key = hash_key; rxrpc_assess_MTU_size(peer); peer->mtu = peer->if_mtu; + peer->rtt_last_req = ktime_get_real(); switch (peer->srx.transport.family) { case AF_INET: diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 607223f4f871..ca7c3be60ad2 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -299,8 +299,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, else if (call->tx_top - call->tx_hard_ack < call->tx_winsize) sp->hdr.flags |= RXRPC_MORE_PACKETS; - if (seq & 1) - sp->hdr.flags |= RXRPC_REQUEST_ACK; ret = conn->security->secure_packet( call, skb, skb->mark, skb->head); -- cgit v1.2.1 From fc943f67773487bb85131273f39b5f183caafe95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:32 +0100 Subject: rxrpc: Reduce the number of PING ACKs sent We don't want to send a PING ACK for every new incoming call as that just adds to the network traffic. Instead, we send a PING ACK to the first three that we receive and then once per second thereafter. This could probably be made adjustable in future. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- net/rxrpc/input.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index adb2ec61e21f..6e2ea8f4ae75 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -142,7 +142,7 @@ static void rxrpc_resend(struct rxrpc_call *call) struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t cursor, seq, top; - ktime_t now = ktime_get_real(), max_age, oldest, resend_at; + ktime_t now = ktime_get_real(), max_age, oldest, resend_at; int ix; u8 annotation, anno_type; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c121949de3c8..cbb5d53f09d7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -44,9 +44,12 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, int skew) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + ktime_t now = skb->tstamp; - rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, - true, true); + if (call->peer->rtt_usage < 3 || + ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) + rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, + true, true); } /* -- cgit v1.2.1 From 2b03bf732488a3c2e920afe22c03b82cb8477e28 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Tue, 13 Sep 2016 13:49:53 +0200 Subject: netfilter: nft_numgen: add number generation offset Add support of an offset value for incremental counter and random. With this option the sysadmin is able to start the counter to a certain value and then apply the generated number. Example: meta mark set numgen inc mod 2 offset 100 This will generate marks with the serie 100, 101, 100, 101, ... Suggested-by: Pablo Neira Ayuso Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_numgen.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index f173ebec30a7..55bc5ab78d4a 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -23,6 +23,7 @@ struct nft_ng_inc { enum nft_registers dreg:8; u32 modulus; atomic_t counter; + u32 offset; }; static void nft_ng_inc_eval(const struct nft_expr *expr, @@ -37,13 +38,14 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, nval = (oval + 1 < priv->modulus) ? oval + 1 : 0; } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval); - regs->data[priv->dreg] = nval; + regs->data[priv->dreg] = nval + priv->offset; } static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { [NFTA_NG_DREG] = { .type = NLA_U32 }, [NFTA_NG_MODULUS] = { .type = NLA_U32 }, [NFTA_NG_TYPE] = { .type = NLA_U32 }, + [NFTA_NG_OFFSET] = { .type = NLA_U32 }, }; static int nft_ng_inc_init(const struct nft_ctx *ctx, @@ -52,10 +54,16 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, { struct nft_ng_inc *priv = nft_expr_priv(expr); + if (tb[NFTA_NG_OFFSET]) + priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET])); + priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS])); if (priv->modulus == 0) return -ERANGE; + if (priv->offset + priv->modulus - 1 < priv->offset) + return -EOVERFLOW; + priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); atomic_set(&priv->counter, 0); @@ -64,7 +72,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, } static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, - u32 modulus, enum nft_ng_types type) + u32 modulus, enum nft_ng_types type, u32 offset) { if (nft_dump_register(skb, NFTA_NG_DREG, dreg)) goto nla_put_failure; @@ -72,6 +80,8 @@ static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, goto nla_put_failure; if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type))) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_NG_OFFSET, htonl(offset))) + goto nla_put_failure; return 0; @@ -83,12 +93,14 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_inc *priv = nft_expr_priv(expr); - return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL); + return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL, + priv->offset); } struct nft_ng_random { enum nft_registers dreg:8; u32 modulus; + u32 offset; }; static void nft_ng_random_eval(const struct nft_expr *expr, @@ -97,9 +109,10 @@ static void nft_ng_random_eval(const struct nft_expr *expr, { struct nft_ng_random *priv = nft_expr_priv(expr); struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state); + u32 val; - regs->data[priv->dreg] = reciprocal_scale(prandom_u32_state(state), - priv->modulus); + val = reciprocal_scale(prandom_u32_state(state), priv->modulus); + regs->data[priv->dreg] = val + priv->offset; } static int nft_ng_random_init(const struct nft_ctx *ctx, @@ -108,10 +121,16 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, { struct nft_ng_random *priv = nft_expr_priv(expr); + if (tb[NFTA_NG_OFFSET]) + priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET])); + priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS])); if (priv->modulus == 0) return -ERANGE; + if (priv->offset + priv->modulus - 1 < priv->offset) + return -EOVERFLOW; + prandom_init_once(&nft_numgen_prandom_state); priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); @@ -124,7 +143,8 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_random *priv = nft_expr_priv(expr); - return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM); + return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM, + priv->offset); } static struct nft_expr_type nft_ng_type; -- cgit v1.2.1 From dd7e39bbfce1fa6de8315d790d1fe01e92cba44d Mon Sep 17 00:00:00 2001 From: Arek Lichwa Date: Thu, 22 Sep 2016 14:08:05 +0200 Subject: Bluetooth: Fix NULL pointer dereference in mgmt context Adds missing callback assignment to cmd_complete in pending management command context. Dump path involves security procedure performed on legacy (pre-SSP) devices with service security requirements set to HIGH (16digits PIN). It fails when shorter PIN is delivered by user. [ 1.517950] Bluetooth: PIN code is not 16 bytes long [ 1.518491] BUG: unable to handle kernel NULL pointer dereference at (null) [ 1.518584] IP: [< (null)>] (null) [ 1.518584] PGD 9e08067 PUD 9fdf067 PMD 0 [ 1.518584] Oops: 0010 [#1] SMP [ 1.518584] Modules linked in: [ 1.518584] CPU: 0 PID: 1002 Comm: kworker/u3:2 Not tainted 4.8.0-rc6-354649-gaf4168c #16 [ 1.518584] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.9.3-20160701_074356-anatol 04/01/2014 [ 1.518584] Workqueue: hci0 hci_rx_work [ 1.518584] task: ffff880009ce14c0 task.stack: ffff880009e10000 [ 1.518584] RIP: 0010:[<0000000000000000>] [< (null)>] (null) [ 1.518584] RSP: 0018:ffff880009e13bc8 EFLAGS: 00010293 [ 1.518584] RAX: 0000000000000000 RBX: ffff880009eed100 RCX: 0000000000000006 [ 1.518584] RDX: ffff880009ddc000 RSI: 0000000000000000 RDI: ffff880009eed100 [ 1.518584] RBP: ffff880009e13be0 R08: 0000000000000000 R09: 0000000000000001 [ 1.518584] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 1.518584] R13: ffff880009e13ccd R14: ffff880009ddc000 R15: ffff880009ddc010 [ 1.518584] FS: 0000000000000000(0000) GS:ffff88000bc00000(0000) knlGS:0000000000000000 [ 1.518584] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.518584] CR2: 0000000000000000 CR3: 0000000009fdd000 CR4: 00000000000006f0 [ 1.518584] Stack: [ 1.518584] ffffffff81909808 ffff880009e13cce ffff880009e0d40b ffff880009e13c68 [ 1.518584] ffffffff818f428d 00000000024000c0 ffff880009e13c08 ffffffff810ca903 [ 1.518584] ffff880009e13c48 ffffffff811ade34 ffffffff8178c31f ffff880009ee6200 [ 1.518584] Call Trace: [ 1.518584] [] ? mgmt_pin_code_neg_reply_complete+0x38/0x60 [ 1.518584] [] hci_cmd_complete_evt+0x69d/0x3200 [ 1.518584] [] ? rcu_read_lock_sched_held+0x53/0x60 [ 1.518584] [] ? kmem_cache_alloc+0x1a4/0x200 [ 1.518584] [] ? skb_clone+0x4f/0xa0 [ 1.518584] [] hci_event_packet+0x8e1/0x28e0 [ 1.518584] [] ? _raw_spin_unlock_irqrestore+0x31/0x50 [ 1.518584] [] ? trace_hardirqs_on_caller+0xee/0x1b0 [ 1.518584] [] hci_rx_work+0x1e1/0x5b0 [ 1.518584] [] ? process_one_work+0x1ed/0x6b0 [ 1.518584] [] process_one_work+0x268/0x6b0 [ 1.518584] [] ? process_one_work+0x1ed/0x6b0 [ 1.518584] [] worker_thread+0x43/0x4e0 [ 1.518584] [] ? process_one_work+0x6b0/0x6b0 [ 1.518584] [] ? process_one_work+0x6b0/0x6b0 [ 1.518584] [] kthread+0xdf/0x100 [ 1.518584] [] ret_from_fork+0x1f/0x40 [ 1.518584] [] ? kthread_create_on_node+0x210/0x210 Signed-off-by: Arek Lichwa Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7b2bac492fb1..63f42f45a96a 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2541,6 +2541,8 @@ static int send_pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev, if (!cmd) return -ENOMEM; + cmd->cmd_complete = addr_cmd_complete; + err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY, sizeof(cp->addr.bdaddr), &cp->addr.bdaddr); if (err < 0) -- cgit v1.2.1 From 7dc6f16c68757548a332a0c5fbe661987c2189a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Narajowski?= Date: Thu, 22 Sep 2016 16:01:39 +0200 Subject: Bluetooth: Fix not updating scan rsp when adv off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scan response data should not be updated unless there is an advertising instance. Signed-off-by: Michał Narajowski Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 63f42f45a96a..19b8a5e9420d 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3159,7 +3159,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, /* The name is stored in the scan response data and so * no need to udpate the advertising data here. */ - if (lmp_le_capable(hdev)) + if (lmp_le_capable(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING)) __hci_req_update_scan_rsp_data(&req, hdev->cur_adv_instance); err = hci_req_run(&req, set_name_complete); -- cgit v1.2.1 From 36b701fae12ac763a568037e4e7c96b5727a8b3e Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Wed, 14 Sep 2016 15:00:02 +0200 Subject: netfilter: nf_tables: validate maximum value of u32 netlink attributes Fetch value and validate u32 netlink attribute. This validation is usually required when the u32 netlink attributes are being stored in a field whose size is smaller. This patch revisits 4da449ae1df9 ("netfilter: nft_exthdr: Add size check on u8 nft_exthdr attributes"). Fixes: 96518518cc41 ("netfilter: add nftables") Suggested-by: Pablo Neira Ayuso Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 25 +++++++++++++++++++++++++ net/netfilter/nft_bitwise.c | 8 +++++++- net/netfilter/nft_byteorder.c | 15 +++++++++++++-- net/netfilter/nft_cmp.c | 3 +++ net/netfilter/nft_exthdr.c | 12 +++++++----- net/netfilter/nft_immediate.c | 4 ++++ 6 files changed, 59 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bd9715e5ff26..b70d3ea1430e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4409,6 +4409,31 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, return 0; } +/** + * nft_parse_u32_check - fetch u32 attribute and check for maximum value + * + * @attr: netlink attribute to fetch value from + * @max: maximum value to be stored in dest + * @dest: pointer to the variable + * + * Parse, check and store a given u32 netlink attribute into variable. + * This function returns -ERANGE if the value goes over maximum value. + * Otherwise a 0 is returned and the attribute value is stored in the + * destination variable. + */ +unsigned int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest) +{ + int val; + + val = ntohl(nla_get_be32(attr)); + if (val > max) + return -ERANGE; + + *dest = val; + return 0; +} +EXPORT_SYMBOL_GPL(nft_parse_u32_check); + /** * nft_parse_register - parse a register value from a netlink attribute * diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index d71cc18fa35d..31c15ed2e5fc 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -52,6 +52,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, { struct nft_bitwise *priv = nft_expr_priv(expr); struct nft_data_desc d1, d2; + u32 len; int err; if (tb[NFTA_BITWISE_SREG] == NULL || @@ -61,7 +62,12 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, tb[NFTA_BITWISE_XOR] == NULL) return -EINVAL; - priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN])); + err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len); + if (err < 0) + return err; + + priv->len = len; + priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]); err = nft_validate_register_load(priv->sreg, priv->len); if (err < 0) diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index b78c28ba465f..ee63d981268d 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -99,6 +99,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_byteorder *priv = nft_expr_priv(expr); + u32 size, len; int err; if (tb[NFTA_BYTEORDER_SREG] == NULL || @@ -117,7 +118,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, return -EINVAL; } - priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE])); + err = nft_parse_u32_check(tb[NFTA_BYTEORDER_SIZE], U8_MAX, &size); + if (err < 0) + return err; + + priv->size = size; + switch (priv->size) { case 2: case 4: @@ -128,7 +134,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, } priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]); - priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN])); + err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len); + if (err < 0) + return err; + + priv->len = len; + err = nft_validate_register_load(priv->sreg, priv->len); if (err < 0) return err; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index e25b35d70e4d..2e53739812b1 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -84,6 +84,9 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (err < 0) return err; + if (desc.len > U8_MAX) + return -ERANGE; + priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); priv->len = desc.len; return 0; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 82c264e40278..a84cf3d66056 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -59,7 +59,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); - u32 offset, len; + u32 offset, len, err; if (tb[NFTA_EXTHDR_DREG] == NULL || tb[NFTA_EXTHDR_TYPE] == NULL || @@ -67,11 +67,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, tb[NFTA_EXTHDR_LEN] == NULL) return -EINVAL; - offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET])); - len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN])); + err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); + if (err < 0) + return err; - if (offset > U8_MAX || len > U8_MAX) - return -ERANGE; + err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len); + if (err < 0) + return err; priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index db3b746858e3..d17018ff54e6 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -53,6 +53,10 @@ static int nft_immediate_init(const struct nft_ctx *ctx, tb[NFTA_IMMEDIATE_DATA]); if (err < 0) return err; + + if (desc.len > U8_MAX) + return -ERANGE; + priv->dlen = desc.len; priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]); -- cgit v1.2.1 From 8061bb54436c19fd16b7c734a69ff60bac26e3e9 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 14 Sep 2016 23:41:46 +0800 Subject: netfilter: nft_queue: add _SREG_QNUM attr to select the queue number Currently, the user can specify the queue numbers by _QUEUE_NUM and _QUEUE_TOTAL attributes, this is enough in most situations. But acctually, it is not very flexible, for example: tcp dport 80 mapped to queue0 tcp dport 81 mapped to queue1 tcp dport 82 mapped to queue2 In order to do this thing, we must add 3 nft rules, and more mapping meant more rules ... So take one register to select the queue number, then we can add one simple rule to mapping queues, maybe like this: queue num tcp dport map { 80:0, 81:1, 82:2 ... } Florian Westphal also proposed wider usage scenarios: queue num jhash ip saddr . ip daddr mod ... queue num meta cpu ... queue num meta mark ... The last point is how to load a queue number from sreg, although we can use *(u16*)®s->data[reg] to load the queue number, just like nat expr to load its l4port do. But we will cooperate with hash expr, meta cpu, meta mark expr and so on. They all store the result to u32 type, so cast it to u16 pointer and dereference it will generate wrong result in the big endian system. So just keep it simple, we treat queue number as u32 type, although u16 type is already enough. Suggested-by: Pablo Neira Ayuso Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_queue.c | 102 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index d16d59959ff6..393d359a1889 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -22,9 +22,10 @@ static u32 jhash_initval __read_mostly; struct nft_queue { - u16 queuenum; - u16 queues_total; - u16 flags; + enum nft_registers sreg_qnum:8; + u16 queuenum; + u16 queues_total; + u16 flags; }; static void nft_queue_eval(const struct nft_expr *expr, @@ -54,26 +55,39 @@ static void nft_queue_eval(const struct nft_expr *expr, regs->verdict.code = ret; } +static void nft_queue_sreg_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_queue *priv = nft_expr_priv(expr); + u32 queue, ret; + + queue = regs->data[priv->sreg_qnum]; + + ret = NF_QUEUE_NR(queue); + if (priv->flags & NFT_QUEUE_FLAG_BYPASS) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + + regs->verdict.code = ret; +} + static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { [NFTA_QUEUE_NUM] = { .type = NLA_U16 }, [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 }, [NFTA_QUEUE_FLAGS] = { .type = NLA_U16 }, + [NFTA_QUEUE_SREG_QNUM] = { .type = NLA_U32 }, }; static int nft_queue_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_queue *priv = nft_expr_priv(expr); u32 maxid; - if (tb[NFTA_QUEUE_NUM] == NULL) - return -EINVAL; - - init_hashrandom(&jhash_initval); priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); - if (tb[NFTA_QUEUE_TOTAL] != NULL) + if (tb[NFTA_QUEUE_TOTAL]) priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL])); else priv->queues_total = 1; @@ -85,11 +99,34 @@ static int nft_queue_init(const struct nft_ctx *ctx, if (maxid > U16_MAX) return -ERANGE; - if (tb[NFTA_QUEUE_FLAGS] != NULL) { + if (tb[NFTA_QUEUE_FLAGS]) { + priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); + if (priv->flags & ~NFT_QUEUE_FLAG_MASK) + return -EINVAL; + } + return 0; +} + +static int nft_queue_sreg_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_queue *priv = nft_expr_priv(expr); + int err; + + priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]); + err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32)); + if (err < 0) + return err; + + if (tb[NFTA_QUEUE_FLAGS]) { priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); if (priv->flags & ~NFT_QUEUE_FLAG_MASK) return -EINVAL; + if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) + return -EOPNOTSUPP; } + return 0; } @@ -108,6 +145,21 @@ nla_put_failure: return -1; } +static int +nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_queue *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_QUEUE_SREG_QNUM, priv->sreg_qnum) || + nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + static struct nft_expr_type nft_queue_type; static const struct nft_expr_ops nft_queue_ops = { .type = &nft_queue_type, @@ -117,9 +169,35 @@ static const struct nft_expr_ops nft_queue_ops = { .dump = nft_queue_dump, }; +static const struct nft_expr_ops nft_queue_sreg_ops = { + .type = &nft_queue_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)), + .eval = nft_queue_sreg_eval, + .init = nft_queue_sreg_init, + .dump = nft_queue_sreg_dump, +}; + +static const struct nft_expr_ops * +nft_queue_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_QUEUE_NUM] && tb[NFTA_QUEUE_SREG_QNUM]) + return ERR_PTR(-EINVAL); + + init_hashrandom(&jhash_initval); + + if (tb[NFTA_QUEUE_NUM]) + return &nft_queue_ops; + + if (tb[NFTA_QUEUE_SREG_QNUM]) + return &nft_queue_sreg_ops; + + return ERR_PTR(-EINVAL); +} + static struct nft_expr_type nft_queue_type __read_mostly = { .name = "queue", - .ops = &nft_queue_ops, + .select_ops = &nft_queue_select_ops, .policy = nft_queue_policy, .maxattr = NFTA_QUEUE_MAX, .owner = THIS_MODULE, -- cgit v1.2.1 From 8dc3c2b86bb16e8f345b80a8af69696e9a7edb65 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Thu, 15 Sep 2016 21:29:08 +0800 Subject: netfilter: nf_tables: improve nft payload fast eval There's an off-by-one issue in nft_payload_fast_eval, skb_tail_pointer and ptr + priv->len all point to the last valid address plus 1. So if they are equal, we can still fetch the valid data. It's unnecessary to fall back to nft_payload_eval. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index fb8b5892b5ff..36ba4e55d84e 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -98,7 +98,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, ptr += priv->offset; - if (unlikely(ptr + priv->len >= skb_tail_pointer(skb))) + if (unlikely(ptr + priv->len > skb_tail_pointer(skb))) return false; *dest = 0; -- cgit v1.2.1 From a20877b5edec4d2b62560b5245199af04846476c Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 17 Sep 2016 14:31:20 +0800 Subject: netfilter: nf_tables: check tprot_set first when we use xt.thoff pkt->xt.thoff is not always set properly, but we use it without any check. For payload expr, it will cause wrong results. For nftrace, we may notify the wrong network or transport header to the user space, furthermore, input the following nft rules, warning message will be printed out: # nft add rule arp filter output meta nftrace set 1 WARNING: CPU: 0 PID: 13428 at net/netfilter/nf_tables_trace.c:263 nft_trace_notify+0x4a3/0x5e0 [nf_tables] Call Trace: [] dump_stack+0x63/0x85 [] __warn+0xcb/0xf0 [] warn_slowpath_null+0x1d/0x20 [] nft_trace_notify+0x4a3/0x5e0 [nf_tables] [ ... ] [] nft_do_chain_arp+0x78/0x90 [nf_tables_arp] [] nf_iterate+0x62/0x80 [] nf_hook_slow+0x73/0xd0 [] arp_xmit+0x8f/0xb0 [ ... ] [] arp_solicit+0x106/0x2c0 So before we use pkt->xt.thoff, check the tprot_set first. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_core.c | 5 ++++- net/netfilter/nf_tables_trace.c | 20 +++++++++++--------- net/netfilter/nft_payload.c | 4 ++++ 3 files changed, 19 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 36ba4e55d84e..67259cefef06 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -93,8 +93,11 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) ptr = skb_network_header(skb); - else + else { + if (!pkt->tprot_set) + return false; ptr = skb_network_header(skb) + pkt->xt.thoff; + } ptr += priv->offset; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 39eb1cc62e91..696fe8f6f2f2 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -113,20 +113,22 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, const struct nft_pktinfo *pkt) { const struct sk_buff *skb = pkt->skb; - unsigned int len = min_t(unsigned int, - pkt->xt.thoff - skb_network_offset(skb), - NFT_TRACETYPE_NETWORK_HSIZE); int off = skb_network_offset(skb); + unsigned int len, nh_end; + nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len; + len = min_t(unsigned int, nh_end - skb_network_offset(skb), + NFT_TRACETYPE_NETWORK_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len)) return -1; - len = min_t(unsigned int, skb->len - pkt->xt.thoff, - NFT_TRACETYPE_TRANSPORT_HSIZE); - - if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, - pkt->xt.thoff, len)) - return -1; + if (pkt->tprot_set) { + len = min_t(unsigned int, skb->len - pkt->xt.thoff, + NFT_TRACETYPE_TRANSPORT_HSIZE); + if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, + pkt->xt.thoff, len)) + return -1; + } if (!skb_mac_header_was_set(skb)) return 0; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 12cd4bf16d17..b2f88617611a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -92,6 +92,8 @@ static void nft_payload_eval(const struct nft_expr *expr, offset = skb_network_offset(skb); break; case NFT_PAYLOAD_TRANSPORT_HEADER: + if (!pkt->tprot_set) + goto err; offset = pkt->xt.thoff; break; default: @@ -184,6 +186,8 @@ static void nft_payload_set_eval(const struct nft_expr *expr, offset = skb_network_offset(skb); break; case NFT_PAYLOAD_TRANSPORT_HEADER: + if (!pkt->tprot_set) + goto err; offset = pkt->xt.thoff; break; default: -- cgit v1.2.1 From 7bdc66242de7f9cbe8dbb01757042dd18744d800 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Sun, 18 Sep 2016 10:52:25 +0800 Subject: netfilter: Enhance the codes used to get random once There are some codes which are used to get one random once in netfilter. We could use net_get_random_once to simplify these codes. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_RATEEST.c | 6 +----- net/netfilter/xt_connlimit.c | 8 +------- net/netfilter/xt_recent.c | 7 ++----- 3 files changed, 4 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 515131f9e021..dbd6c4a12b97 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -24,7 +24,6 @@ static DEFINE_MUTEX(xt_rateest_mutex); #define RATEEST_HSIZE 16 static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly; static unsigned int jhash_rnd __read_mostly; -static bool rnd_inited __read_mostly; static unsigned int xt_rateest_hash(const char *name) { @@ -99,10 +98,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) } cfg; int ret; - if (unlikely(!rnd_inited)) { - get_random_bytes(&jhash_rnd, sizeof(jhash_rnd)); - rnd_inited = true; - } + net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); est = xt_rateest_lookup(info->name); if (est) { diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 99bbc829868d..b6dc322593a3 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -366,14 +366,8 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) unsigned int i; int ret; - if (unlikely(!connlimit_rnd)) { - u_int32_t rand; + net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd)); - do { - get_random_bytes(&rand, sizeof(rand)); - } while (!rand); - cmpxchg(&connlimit_rnd, 0, rand); - } ret = nf_ct_l3proto_try_module_get(par->family); if (ret < 0) { pr_info("cannot load conntrack support for " diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index d725a27743a1..e3b7a09b103e 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -110,7 +110,6 @@ static const struct file_operations recent_old_fops, recent_mt_fops; #endif static u_int32_t hash_rnd __read_mostly; -static bool hash_rnd_inited __read_mostly; static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr) { @@ -340,10 +339,8 @@ static int recent_mt_check(const struct xt_mtchk_param *par, int ret = -EINVAL; size_t sz; - if (unlikely(!hash_rnd_inited)) { - get_random_bytes(&hash_rnd, sizeof(hash_rnd)); - hash_rnd_inited = true; - } + net_get_random_once(&hash_rnd, sizeof(hash_rnd)); + if (info->check_set & ~XT_RECENT_VALID_FLAGS) { pr_info("Unsupported user space flags (%08x)\n", info->check_set); -- cgit v1.2.1 From b9d80f83bf8c3485ae53a4f3a715363d764bb0e4 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 20 Sep 2016 10:31:04 +0800 Subject: netfilter: xt_helper: Use sizeof(variable) instead of literal number It's better to use sizeof(info->name)-1 as index to force set the string tail instead of literal number '29'. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index 9f4ab00c8050..805c9f64a04c 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -65,7 +65,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par) par->family); return ret; } - info->name[29] = '\0'; + info->name[sizeof(info->name) - 1] = '\0'; return 0; } -- cgit v1.2.1 From 4004d5c374dabcbce201e16442e4596b764cc60b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 20 Sep 2016 18:22:46 +0200 Subject: netfilter: nft_lookup: remove superfluous element found check We already checked for !found just a bit before: if (!found) { regs->verdict.code = NFT_BREAK; return; } if (found && set->flags & NFT_SET_MAP) ^^^^^ So this redundant check can just go away. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_lookup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index e164325d1bc0..8166b6994cc7 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -43,7 +43,7 @@ static void nft_lookup_eval(const struct nft_expr *expr, return; } - if (found && set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP) nft_data_copy(®s->data[priv->dreg], nft_set_ext_data(ext), set->dlen); -- cgit v1.2.1 From 21641c2e1ffd0b504610a33beaeab8fcc5140677 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sun, 18 Sep 2016 15:52:20 -0700 Subject: net_sched: check NULL on error path in route4_change() On error path in route4_change(), 'f' could be NULL, so we should check NULL before calling tcf_exts_destroy(). Fixes: b9a24bb76bf6 ("net_sched: properly handle failure case of tcf_exts_init()") Reported-by: kbuild test robot Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/cls_route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index a4ce39b19be0..455fc8f83d0a 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -559,7 +559,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, return 0; errout: - tcf_exts_destroy(&f->exts); + if (f) + tcf_exts_destroy(&f->exts); kfree(f); return err; } -- cgit v1.2.1 From 2ed5c3f09627f72a2e0e407a86b2ac05494190f9 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sun, 18 Sep 2016 16:22:47 -0700 Subject: sch_qfq: keep backlog updated with qlen Reported-by: Stas Nichiporovich Fixes: 2ccccf5fb43f ("net_sched: update hierarchical backlog too") Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index f27ffee106f6..ca0516e6f743 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1153,6 +1153,7 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch) if (!skb) return NULL; + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; qdisc_bstats_update(sch, skb); @@ -1256,6 +1257,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } bstats_update(&cl->bstats, skb); + qdisc_qstats_backlog_inc(sch, skb); ++sch->q.qlen; agg = cl->agg; @@ -1476,6 +1478,7 @@ static void qfq_reset_qdisc(struct Qdisc *sch) qdisc_reset(cl->qdisc); } } + sch->qstats.backlog = 0; sch->q.qlen = 0; } -- cgit v1.2.1 From 3d4357fba82b3cf19ebf0a04d1c9cb086af15d02 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sun, 18 Sep 2016 16:22:48 -0700 Subject: sch_sfb: keep backlog updated with qlen Fixes: 2ccccf5fb43f ("net_sched: update hierarchical backlog too") Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_sfb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index add3cc7d37ec..20a350bd1b1d 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -400,6 +400,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, enqueue: ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; increment_qlen(skb, q); } else if (net_xmit_drop_count(ret)) { @@ -428,6 +429,7 @@ static struct sk_buff *sfb_dequeue(struct Qdisc *sch) if (skb) { qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; decrement_qlen(skb, q); } @@ -450,6 +452,7 @@ static void sfb_reset(struct Qdisc *sch) struct sfb_sched_data *q = qdisc_priv(sch); qdisc_reset(q->qdisc); + sch->qstats.backlog = 0; sch->q.qlen = 0; q->slot = 0; q->double_buffering = false; -- cgit v1.2.1 From a3007446e53af07c53bdb4cabad7b3ea60859da4 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 20 Sep 2016 18:19:13 -0300 Subject: sctp: fix the handling of SACK Gap Ack blocks sctp_acked() is using 32bit arithmetics on 16bits vars, via TSN_lte() macros, which is weird and confusing. Once the offset to ctsn is calculated, all wrapping is already handled and thus to verify the Gap Ack blocks we can just use pure less/big-or-equal than checks. Also, rename gap variable to tsn_offset, so it's more meaningful, as it doesn't point to any gap at all. Even so, I don't think this discrepancy resulted in any practical bug. This patch is a preparation for the next one, which will introduce typecheck() for TSN_lte() macros and would cause a compile error here. Suggested-by: David Laight Reported-by: David Laight Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 8c3f446d965c..3ec6da8bbb53 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1719,7 +1719,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) { int i; sctp_sack_variable_t *frags; - __u16 gap; + __u16 tsn_offset, blocks; __u32 ctsn = ntohl(sack->cum_tsn_ack); if (TSN_lte(tsn, ctsn)) @@ -1738,10 +1738,11 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) */ frags = sack->variable; - gap = tsn - ctsn; - for (i = 0; i < ntohs(sack->num_gap_ack_blocks); ++i) { - if (TSN_lte(ntohs(frags[i].gab.start), gap) && - TSN_lte(gap, ntohs(frags[i].gab.end))) + blocks = ntohs(sack->num_gap_ack_blocks); + tsn_offset = tsn - ctsn; + for (i = 0; i < blocks; ++i) { + if (tsn_offset >= ntohs(frags[i].gab.start) && + tsn_offset <= ntohs(frags[i].gab.end)) goto pass; } -- cgit v1.2.1 From fefa569a9d4bc4b7758c0fddd75bb0382c95da77 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Sep 2016 08:58:55 -0700 Subject: net_sched: sch_fq: account for schedule/timers drifts It looks like the following patch can make FQ very precise, even in VM or stressed hosts. It matters at high pacing rates. We take into account the difference between the time that was programmed when last packet was sent, and current time (a drift of tens of usecs is often observed) Add an EWMA of the unthrottle latency to help diagnostics. This latency is the difference between current time and oldest packet in delayed RB-tree. This accounts for the high resolution timer latency, but can be different under stress, as fq_check_throttled() can be opportunistically be called from a dequeue() called after an enqueue() for a different flow. Tested: // Start a 10Gbit flow $ netperf --google-pacing-rate 1250000000 -H lpaa24 -l 10000 -- -K bbr & Before patch : $ sar -n DEV 10 5 | grep eth0 | grep Average Average: eth0 17106.04 756876.84 1102.75 1119049.02 0.00 0.00 0.52 After patch : $ sar -n DEV 10 5 | grep eth0 | grep Average Average: eth0 17867.00 800245.90 1151.77 1183172.12 0.00 0.00 0.52 A new iproute2 tc can output the 'unthrottle latency' : $ tc -s qd sh dev eth0 | grep latency 0 gc, 0 highprio, 32490767 throttled, 2382 ns latency Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 5dd929cc1423..18e752439f6f 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -86,6 +86,7 @@ struct fq_sched_data { struct rb_root delayed; /* for rate limited flows */ u64 time_next_delayed_flow; + unsigned long unthrottle_latency_ns; struct fq_flow internal; /* for non classified or high prio packets */ u32 quantum; @@ -408,11 +409,19 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, static void fq_check_throttled(struct fq_sched_data *q, u64 now) { + unsigned long sample; struct rb_node *p; if (q->time_next_delayed_flow > now) return; + /* Update unthrottle latency EWMA. + * This is cheap and can help diagnosing timer/latency problems. + */ + sample = (unsigned long)(now - q->time_next_delayed_flow); + q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; + q->unthrottle_latency_ns += sample >> 3; + q->time_next_delayed_flow = ~0ULL; while ((p = rb_first(&q->delayed)) != NULL) { struct fq_flow *f = container_of(p, struct fq_flow, rate_node); @@ -515,7 +524,12 @@ begin: len = NSEC_PER_SEC; q->stat_pkts_too_long++; } - + /* Account for schedule/timers drifts. + * f->time_next_packet was set when prior packet was sent, + * and current time (@now) can be too late by tens of us. + */ + if (f->time_next_packet) + len -= min(len/2, now - f->time_next_packet); f->time_next_packet = now + len; } out: @@ -787,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); q->flow_refill_delay = msecs_to_jiffies(40); q->flow_max_rate = ~0U; + q->time_next_delayed_flow = ~0ULL; q->rate_enable = 1; q->new_flows.first = NULL; q->old_flows.first = NULL; @@ -854,8 +869,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.flows = q->flows; st.inactive_flows = q->inactive_flows; st.throttled_flows = q->throttled_flows; - st.pad = 0; - + st.unthrottle_latency_ns = min_t(unsigned long, + q->unthrottle_latency_ns, ~0U); sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); -- cgit v1.2.1 From 2fe664f1fcf7c4da6891f95708a7a56d3c024354 Mon Sep 17 00:00:00 2001 From: Douglas Caetano dos Santos Date: Thu, 22 Sep 2016 15:52:04 -0300 Subject: tcp: fix wrong checksum calculation on MTU probing With TCP MTU probing enabled and offload TX checksumming disabled, tcp_mtu_probe() calculated the wrong checksum when a fragment being copied into the probe's SKB had an odd length. This was caused by the direct use of skb_copy_and_csum_bits() to calculate the checksum, as it pads the fragment being copied, if needed. When this fragment was not the last, a subsequent call used the previous checksum without considering this padding. The effect was a stale connection in one way, as even retransmissions wouldn't solve the problem, because the checksum was never recalculated for the full SKB length. Signed-off-by: Douglas Caetano dos Santos Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5288cec4a2b2..d48d5571e62a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1966,12 +1966,14 @@ static int tcp_mtu_probe(struct sock *sk) len = 0; tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); - if (nskb->ip_summed) + if (nskb->ip_summed) { skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); - else - nskb->csum = skb_copy_and_csum_bits(skb, 0, - skb_put(nskb, copy), - copy, nskb->csum); + } else { + __wsum csum = skb_copy_and_csum_bits(skb, 0, + skb_put(nskb, copy), + copy, 0); + nskb->csum = csum_block_add(nskb->csum, csum, len); + } if (skb->len <= copy) { /* We've eaten all the data from this skb. -- cgit v1.2.1 From b24d2891cfb0a7975b0039743439c98fe7b7dea7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 13:17:33 +0100 Subject: rxrpc: Preset timestamp on Tx sk_buffs Set the timestamp on sk_buffs holding packets to be transmitted before queueing them because the moment the packet is on the queue it can be seen by the retransmission algorithm - which may see a completely random timestamp. If the retransmission algorithm sees such a timestamp, it may retransmit the packet and, in future, tell the congestion management algorithm that the retransmit timer expired. Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index ca7c3be60ad2..ca3811bfbd17 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -99,6 +99,11 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, ASSERTCMP(seq, ==, call->tx_top + 1); + /* We have to set the timestamp before queueing as the retransmit + * algorithm can see the packet as soon as we queue it. + */ + skb->tstamp = ktime_get_real(); + ix = seq & RXRPC_RXTX_BUFF_MASK; rxrpc_get_skb(skb, rxrpc_skb_tx_got); call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; -- cgit v1.2.1 From 9aff212bd677829189fae2e2e408cefc196ae5ae Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:23 +0100 Subject: rxrpc: Don't send an ACK at the end of service call response transmission Don't send an IDLE ACK at the end of the transmission of the response to a service call. The service end resends DATA packets until the client sends an ACK that hard-acks all the send data. At that point, the call is complete. Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 6ba4af5a8d95..99e4c0ae30f1 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -143,8 +143,6 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); - } else { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, false); } write_lock_bh(&call->state_lock); -- cgit v1.2.1 From c0d058c21c69b3685c3f1bb008aa11f1a5eaee7e Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:23 +0100 Subject: rxrpc: Make sure sendmsg() is woken on call completion Make sure that sendmsg() gets woken up if the call it is waiting for completes abnormally. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b13754a6dd7a..808ab750dc6b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -758,6 +758,7 @@ static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call, call->error = error; call->completion = compl, call->state = RXRPC_CALL_COMPLETE; + wake_up(&call->waitq); return true; } return false; -- cgit v1.2.1 From 90bd684ded900673d86f64f4b4197704a38f04bc Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:23 +0100 Subject: rxrpc: Should be using ktime_add_ms() not ktime_add_ns() ktime_add_ms() should be used to add the resend time (in ms) rather than ktime_add_ns(). Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 6e2ea8f4ae75..a2909da5d581 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -187,7 +187,7 @@ static void rxrpc_resend(struct rxrpc_call *call) call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; } - resend_at = ktime_sub(ktime_add_ns(oldest, rxrpc_resend_timeout), now); + resend_at = ktime_sub(ktime_add_ms(oldest, rxrpc_resend_timeout), now); call->resend_at = jiffies + nsecs_to_jiffies(ktime_to_ns(resend_at)); /* Now go through the Tx window and perform the retransmissions. We -- cgit v1.2.1 From 019b1c9fe32a2a32c1153e31375f87ec3e591273 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Sep 2016 17:54:00 -0700 Subject: tcp: fix a compile error in DBGUNDO() If DBGUNDO() is enabled (FASTRETRANS_DEBUG > 1), a compile error will happen, since inet6_sk(sk)->daddr became sk->sk_v6_daddr Fixes: efe4208f47f9 ("ipv6: make lookups simpler and faster") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 08323bd95f2a..a756b8749a26 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2329,10 +2329,9 @@ static void DBGUNDO(struct sock *sk, const char *msg) } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", msg, - &np->daddr, ntohs(inet->inet_dport), + &sk->sk_v6_daddr, ntohs(inet->inet_dport), tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); -- cgit v1.2.1 From 4acfee8143b33efa8bda6f03fe1462d545ff8170 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 22 Sep 2016 16:49:21 -0400 Subject: net: dsa: add port STP state helper Add a void helper to set the STP state of a port, checking first if the required routine is provided by the driver. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/slave.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 9ecbe787f102..fd78d4c9b197 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -69,6 +69,12 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p) return !!p->bridge_dev; } +static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state) +{ + if (ds->ops->port_stp_state_set) + ds->ops->port_stp_state_set(ds, port, state); +} + static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -104,8 +110,7 @@ static int dsa_slave_open(struct net_device *dev) goto clear_promisc; } - if (ds->ops->port_stp_state_set) - ds->ops->port_stp_state_set(ds, p->port, stp_state); + dsa_port_set_stp_state(ds, p->port, stp_state); if (p->phy) phy_start(p->phy); @@ -147,8 +152,7 @@ static int dsa_slave_close(struct net_device *dev) if (ds->ops->port_disable) ds->ops->port_disable(ds, p->port, p->phy); - if (ds->ops->port_stp_state_set) - ds->ops->port_stp_state_set(ds, p->port, BR_STATE_DISABLED); + dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED); return 0; } @@ -354,7 +358,7 @@ static int dsa_slave_stp_state_set(struct net_device *dev, if (switchdev_trans_ph_prepare(trans)) return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP; - ds->ops->port_stp_state_set(ds, p->port, attr->u.stp_state); + dsa_port_set_stp_state(ds, p->port, attr->u.stp_state); return 0; } @@ -556,8 +560,7 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev) /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, * so allow it to be in BR_STATE_FORWARDING to be kept functional */ - if (ds->ops->port_stp_state_set) - ds->ops->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING); + dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING); } static int dsa_slave_port_attr_get(struct net_device *dev, -- cgit v1.2.1 From 732f794c1baf58e1eb2be4431635829c3da655bd Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 22 Sep 2016 16:49:22 -0400 Subject: net: dsa: add port fast ageing Today the DSA drivers are in charge of flushing the MAC addresses associated to a port when its STP state changes from Learning or Forwarding, to Disabled or Blocking or Listening. This makes the drivers more complex and hides the generic switch logic. Introduce a new optional port_fast_age operation to dsa_switch_ops, to move this logic to the DSA layer and keep drivers simple. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/slave.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index fd78d4c9b197..6b1282c006b1 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -71,8 +71,26 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p) static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state) { + struct dsa_port *dp = &ds->ports[port]; + if (ds->ops->port_stp_state_set) ds->ops->port_stp_state_set(ds, port, state); + + if (ds->ops->port_fast_age) { + /* Fast age FDB entries or flush appropriate forwarding database + * for the given port, if we are moving it from Learning or + * Forwarding state, to Disabled or Blocking or Listening state. + */ + + if ((dp->stp_state == BR_STATE_LEARNING || + dp->stp_state == BR_STATE_FORWARDING) && + (state == BR_STATE_DISABLED || + state == BR_STATE_BLOCKING || + state == BR_STATE_LISTENING)) + ds->ops->port_fast_age(ds, port); + } + + dp->stp_state = state; } static int dsa_slave_open(struct net_device *dev) -- cgit v1.2.1 From 2d48c5f9335e48ddac7a52db10bf3bfd01986b9c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Sep 2016 01:28:35 +0200 Subject: bpf: use skb_to_full_sk helper in bpf_skb_under_cgroup We need to use skb_to_full_sk() helper introduced in commit bd5eb35f16a9 ("xfrm: take care of request sockets") as otherwise we miss tcp synack messages, since ownership is on request socket and therefore it would miss the sk_fullsock() check. Use skb_to_full_sk() as also done similarly in the bpf_get_cgroup_classid() helper via 2309236c13fe ("cls_cgroup: get sk_classid only from full sockets") fix to not let this fall through. Fixes: 4a482f34afcc ("cgroup: bpf: Add bpf_skb_in_cgroup_proto") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 0920c2ac1d00..e5d997759d5e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2408,7 +2408,7 @@ BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, struct cgroup *cgrp; struct sock *sk; - sk = skb->sk; + sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return -ENOENT; if (unlikely(idx >= array->map.max_entries)) -- cgit v1.2.1 From 669dc4d76d0ecc2d795df735839f43cfddf9f617 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Sep 2016 01:28:36 +0200 Subject: bpf: use bpf_get_smp_processor_id_proto instead of raw one Same motivation as in commit 80b48c445797 ("bpf: don't use raw processor id in generic helper"), but this time for XDP typed programs. Thus, allow for preemption checks when we have DEBUG_PREEMPT enabled, and otherwise use the raw variant. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index e5d997759d5e..acf84fbfb043 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2551,6 +2551,8 @@ xdp_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_xdp_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.1 From 7a4b28c6cc9ffac50f791b99cc7e46106436e5d8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Sep 2016 01:28:37 +0200 Subject: bpf: add helper to invalidate hash Add a small helper that complements 36bbef52c7eb ("bpf: direct packet write and access for helpers for clsact progs") for invalidating the current skb->hash after mangling on headers via direct packet write. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index acf84fbfb043..00351cdf7d0c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1777,6 +1777,22 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) +{ + /* After all direct packet write, this can be used once for + * triggering a lazy recalc on next skb_get_hash() invocation. + */ + skb_clear_hash(skb); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_invalid_proto = { + .func = bpf_set_hash_invalid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { @@ -2534,6 +2550,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: -- cgit v1.2.1 From 98dafac5697fbe1fb4bef9e3204baf9051641b00 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 14:04:38 +0100 Subject: rxrpc: Use before_eq() and friends to compare serial numbers before_eq() and friends should be used to compare serial numbers (when not checking for (non)equality) rather than casting to int, subtracting and checking the result. Signed-off-by: David Howells --- net/rxrpc/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index cbb5d53f09d7..06027b6d9c19 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -578,7 +578,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, } /* Discard any out-of-order or duplicate ACKs. */ - if ((int)sp->hdr.serial - (int)call->acks_latest <= 0) { + if (before_eq(sp->hdr.serial, call->acks_latest)) { _debug("discard ACK %d <= %d", sp->hdr.serial, call->acks_latest); return; -- cgit v1.2.1 From dfc3da4404ad1ec42a0a649a4ffa2b0f37e80352 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:23 +0100 Subject: rxrpc: Need to start the resend timer on initial transmission When a DATA packet has its initial transmission, we may need to start or adjust the resend timer. Without this we end up relying on being sent a NACK to initiate the resend. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_event.c | 2 +- net/rxrpc/sendmsg.c | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 808ab750dc6b..9e3ba4dc9578 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -704,6 +704,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ +void rxrpc_set_timer(struct rxrpc_call *); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); void rxrpc_process_call(struct work_struct *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index a2909da5d581..3a7f90a2659c 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -24,7 +24,7 @@ /* * Set the timer */ -static void rxrpc_set_timer(struct rxrpc_call *call) +void rxrpc_set_timer(struct rxrpc_call *call) { unsigned long t, now = jiffies; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index ca3811bfbd17..7cb34b2dfba9 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -146,6 +146,15 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, if (ret < 0) { _debug("need instant resend %d", ret); rxrpc_instant_resend(call, ix); + } else { + unsigned long resend_at; + + resend_at = jiffies + msecs_to_jiffies(rxrpc_resend_timeout); + + if (time_before(resend_at, call->resend_at)) { + call->resend_at = resend_at; + rxrpc_set_timer(call); + } } rxrpc_free_skb(skb, rxrpc_skb_tx_freed); -- cgit v1.2.1 From be8aa3380678183821bd7d7b5dec845f10d776ce Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:23 +0100 Subject: rxrpc: Fix accidental cancellation of scheduled resend by ACK parser When rxrpc_input_soft_acks() is parsing the soft-ACKs from an ACK packet, it updates the Tx packet annotations in the annotation buffer. If a soft-ACK is an ACK, then we overwrite unack'd, nak'd or to-be-retransmitted states and that is fine; but if the soft-ACK is an NACK, we overwrite the to-be-retransmitted with a nak - which isn't. Instead, we need to let any scheduled retransmission stand if the packet was NAK'd. Note that we don't reissue a resend if the annotation is in the to-be-retransmitted state because someone else must've scheduled the resend already. Signed-off-by: David Howells --- net/rxrpc/input.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 06027b6d9c19..d3d69ab1f0a1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -479,6 +479,8 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, case RXRPC_ACK_TYPE_NACK: if (anno_type == RXRPC_TX_ANNO_NAK) continue; + if (anno_type == RXRPC_TX_ANNO_RETRANS) + continue; call->rxtx_annotations[ix] = RXRPC_TX_ANNO_NAK | annotation; resend = true; -- cgit v1.2.1 From 01a88f7f6bd4514de9551c3fc9a6fd9e65cbf79d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:22 +0100 Subject: rxrpc: Fix call timer Fix the call timer in the following ways: (1) If call->resend_at or call->ack_at are before or equal to the current time, then ignore that timeout. (2) If call->expire_at is before or equal to the current time, then don't set the timer at all (possibly we should queue the call). (3) Don't skip modifying the timer if timer_pending() is true. This indicates that the timer is working, not that it has expired and is running/waiting to run its expiry handler. Also call rxrpc_set_timer() to start the call timer going rather than calling add_timer(). Signed-off-by: David Howells --- net/rxrpc/call_event.c | 25 ++++++++++++++----------- net/rxrpc/call_object.c | 4 ++-- 2 files changed, 16 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 3a7f90a2659c..8bc5c8e37ab4 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -28,24 +28,27 @@ void rxrpc_set_timer(struct rxrpc_call *call) { unsigned long t, now = jiffies; - _enter("{%ld,%ld,%ld:%ld}", - call->ack_at - now, call->resend_at - now, call->expire_at - now, - call->timer.expires - now); - read_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) { - t = call->ack_at; - if (time_before(call->resend_at, t)) + t = call->expire_at; + if (time_before_eq(t, now)) + goto out; + + if (time_after(call->resend_at, now) && + time_before(call->resend_at, t)) t = call->resend_at; - if (time_before(call->expire_at, t)) - t = call->expire_at; - if (!timer_pending(&call->timer) || - time_before(t, call->timer.expires)) { - _debug("set timer %ld", t - now); + + if (time_after(call->ack_at, now) && + time_before(call->ack_at, t)) + t = call->ack_at; + + if (call->timer.expires != t || !timer_pending(&call->timer)) { mod_timer(&call->timer, t); } } + +out: read_unlock_bh(&call->state_lock); } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f50a6094e198..f2fadf667e19 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -199,8 +199,8 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) call->expire_at = expire_at; call->ack_at = expire_at; call->resend_at = expire_at; - call->timer.expires = expire_at; - add_timer(&call->timer); + call->timer.expires = expire_at + 1; + rxrpc_set_timer(call); } /* -- cgit v1.2.1 From 70790dbe3f6651fb66ad38da0a1e24368778bc16 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:22 +0100 Subject: rxrpc: Pass the last Tx packet marker in the annotation buffer When the last packet of data to be transmitted on a call is queued, tx_top is set and then the RXRPC_CALL_TX_LAST flag is set. Unfortunately, this leaves a race in the ACK processing side of things because the flag affects the interpretation of tx_top and also allows us to start receiving reply data before we've finished transmitting. To fix this, make the following changes: (1) rxrpc_queue_packet() now sets a marker in the annotation buffer instead of setting the RXRPC_CALL_TX_LAST flag. (2) rxrpc_rotate_tx_window() detects the marker and sets the flag in the same context as the routines that use it. (3) rxrpc_end_tx_phase() is simplified to just shift the call state. The Tx window must have been rotated before calling to discard the last packet. (4) rxrpc_receiving_reply() is added to handle the arrival of the first DATA packet of a reply to a client call (which is an implicit ACK of the Tx phase). (5) The last part of rxrpc_input_ack() is reordered to perform Tx rotation, then soft-ACK application and then to end the phase if we've rotated the last packet. In the event of a terminal ACK, the soft-ACK application will be skipped as nAcks should be 0. (6) rxrpc_input_ackall() now has to rotate as well as ending the phase. In addition: (7) Alter the transmit tracepoint to log the rotation of the last packet. (8) Remove the no-longer relevant queue_reqack tracepoint note. The ACK-REQUESTED packet header flag is now set as needed when we actually transmit the packet and may vary by retransmission. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 7 +++- net/rxrpc/input.c | 102 +++++++++++++++++++++++++++++++----------------- net/rxrpc/misc.c | 3 +- net/rxrpc/sendmsg.c | 14 +++---- 4 files changed, 81 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9e3ba4dc9578..a494d56eb236 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -508,7 +508,9 @@ struct rxrpc_call { #define RXRPC_TX_ANNO_NAK 2 #define RXRPC_TX_ANNO_RETRANS 3 #define RXRPC_TX_ANNO_MASK 0x03 -#define RXRPC_TX_ANNO_RESENT 0x04 +#define RXRPC_TX_ANNO_LAST 0x04 +#define RXRPC_TX_ANNO_RESENT 0x08 + #define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */ #define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */ #define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */ @@ -621,9 +623,10 @@ extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; enum rxrpc_transmit_trace { rxrpc_transmit_wait, rxrpc_transmit_queue, - rxrpc_transmit_queue_reqack, rxrpc_transmit_queue_last, rxrpc_transmit_rotate, + rxrpc_transmit_rotate_last, + rxrpc_transmit_await_reply, rxrpc_transmit_end, rxrpc_transmit__nr_trace }; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index d3d69ab1f0a1..fb3e2f6afa3b 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -59,6 +59,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) { struct sk_buff *skb, *list = NULL; int ix; + u8 annotation; spin_lock(&call->lock); @@ -66,16 +67,22 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) call->tx_hard_ack++; ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; + annotation = call->rxtx_annotations[ix]; rxrpc_see_skb(skb, rxrpc_skb_tx_rotated); call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; skb->next = list; list = skb; + + if (annotation & RXRPC_TX_ANNO_LAST) + set_bit(RXRPC_CALL_TX_LAST, &call->flags); } spin_unlock(&call->lock); - trace_rxrpc_transmit(call, rxrpc_transmit_rotate); + trace_rxrpc_transmit(call, (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ? + rxrpc_transmit_rotate_last : + rxrpc_transmit_rotate)); wake_up(&call->waitq); while (list) { @@ -92,42 +99,65 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) * This occurs when we get an ACKALL packet, the first DATA packet of a reply, * or a final ACK packet. */ -static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why) +static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, + const char *abort_why) { - _enter(""); - - switch (call->state) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - return true; - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - case RXRPC_CALL_SERVER_AWAIT_ACK: - break; - default: - rxrpc_proto_abort(abort_why, call, call->tx_top); - return false; - } - rxrpc_rotate_tx_window(call, call->tx_top); + ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); write_lock(&call->state_lock); switch (call->state) { - default: - break; + case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_CLIENT_AWAIT_REPLY: - call->tx_phase = false; - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + if (reply_begun) + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + else + call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; break; + case RXRPC_CALL_SERVER_AWAIT_ACK: __rxrpc_call_completed(call); rxrpc_notify_socket(call); break; + + default: + goto bad_state; } write_unlock(&call->state_lock); - trace_rxrpc_transmit(call, rxrpc_transmit_end); + if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) { + trace_rxrpc_transmit(call, rxrpc_transmit_await_reply); + } else { + trace_rxrpc_transmit(call, rxrpc_transmit_end); + } _leave(" = ok"); return true; + +bad_state: + write_unlock(&call->state_lock); + kdebug("end_tx %s", rxrpc_call_states[call->state]); + rxrpc_proto_abort(abort_why, call, call->tx_top); + return false; +} + +/* + * Begin the reply reception phase of a call. + */ +static bool rxrpc_receiving_reply(struct rxrpc_call *call) +{ + rxrpc_seq_t top = READ_ONCE(call->tx_top); + + if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) + rxrpc_rotate_tx_window(call, top); + if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { + rxrpc_proto_abort("TXL", call, top); + return false; + } + if (!rxrpc_end_tx_phase(call, true, "ETD")) + return false; + call->tx_phase = false; + return true; } /* @@ -226,8 +256,9 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, /* Received data implicitly ACKs all of the request packets we sent * when we're acting as a client. */ - if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY && - !rxrpc_end_tx_phase(call, "ETD")) + if ((call->state == RXRPC_CALL_CLIENT_SEND_REQUEST || + call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) && + !rxrpc_receiving_reply(call)) return; call->ackr_prev_seq = seq; @@ -587,27 +618,26 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, } call->acks_latest = sp->hdr.serial; - if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && - hard_ack == call->tx_top) { - rxrpc_end_tx_phase(call, "ETA"); - return; - } - if (before(hard_ack, call->tx_hard_ack) || after(hard_ack, call->tx_top)) return rxrpc_proto_abort("AKW", call, 0); + if (nr_acks > call->tx_top - hard_ack) + return rxrpc_proto_abort("AKN", call, 0); if (after(hard_ack, call->tx_hard_ack)) rxrpc_rotate_tx_window(call, hard_ack); - if (after(first_soft_ack, call->tx_top)) + if (nr_acks > 0) { + if (skb_copy_bits(skb, sp->offset, buf.acks, nr_acks) < 0) + return rxrpc_proto_abort("XSA", call, 0); + rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks); + } + + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { + rxrpc_end_tx_phase(call, false, "ETA"); return; + } - if (nr_acks > call->tx_top - first_soft_ack + 1) - nr_acks = first_soft_ack - call->tx_top + 1; - if (skb_copy_bits(skb, sp->offset, buf.acks, nr_acks) < 0) - return rxrpc_proto_abort("XSA", call, 0); - rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks); } /* @@ -619,7 +649,9 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) _proto("Rx ACKALL %%%u", sp->hdr.serial); - rxrpc_end_tx_phase(call, "ETL"); + rxrpc_rotate_tx_window(call, call->tx_top); + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) + rxrpc_end_tx_phase(call, false, "ETL"); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 0d425e707f22..fe648711c2f7 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -155,9 +155,10 @@ const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = { const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = { [rxrpc_transmit_wait] = "WAI", [rxrpc_transmit_queue] = "QUE", - [rxrpc_transmit_queue_reqack] = "QRA", [rxrpc_transmit_queue_last] = "QLS", [rxrpc_transmit_rotate] = "ROT", + [rxrpc_transmit_rotate_last] = "RLS", + [rxrpc_transmit_await_reply] = "AWR", [rxrpc_transmit_end] = "END", }; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 7cb34b2dfba9..93e6584cd751 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -94,11 +94,15 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); rxrpc_seq_t seq = sp->hdr.seq; int ret, ix; + u8 annotation = RXRPC_TX_ANNO_UNACK; _net("queue skb %p [%d]", skb, seq); ASSERTCMP(seq, ==, call->tx_top + 1); + if (last) + annotation |= RXRPC_TX_ANNO_LAST; + /* We have to set the timestamp before queueing as the retransmit * algorithm can see the packet as soon as we queue it. */ @@ -106,18 +110,14 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, ix = seq & RXRPC_RXTX_BUFF_MASK; rxrpc_get_skb(skb, rxrpc_skb_tx_got); - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK; + call->rxtx_annotations[ix] = annotation; smp_wmb(); call->rxtx_buffer[ix] = skb; call->tx_top = seq; - if (last) { - set_bit(RXRPC_CALL_TX_LAST, &call->flags); + if (last) trace_rxrpc_transmit(call, rxrpc_transmit_queue_last); - } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - trace_rxrpc_transmit(call, rxrpc_transmit_queue_reqack); - } else { + else trace_rxrpc_transmit(call, rxrpc_transmit_queue); - } if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { _debug("________awaiting reply/ACK__________"); -- cgit v1.2.1 From b86e218e0d422488e0febb07620fa97ae9713779 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 15:08:48 +0100 Subject: rxrpc: Don't call the tx_ack tracepoint if don't generate an ACK rxrpc_send_call_packet() is invoking the tx_ack tracepoint before it checks whether there's an ACK to transmit (another thread may jump in and transmit it). Fix this by only invoking the tracepoint if we get a valid ACK to transmit. Further, only allocate a serial number if we're going to actually transmit something. Signed-off-by: David Howells --- net/rxrpc/output.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 282cb1e36d06..5c1e008a5323 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -80,9 +80,6 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, pkt->ackinfo.rwind = htonl(call->rx_winsize); pkt->ackinfo.jumbo_max = htonl(jmax); - trace_rxrpc_tx_ack(call, hard_ack + 1, serial, call->ackr_reason, - top - hard_ack); - *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; @@ -119,8 +116,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) return -ENOMEM; } - serial = atomic_inc_return(&conn->serial); - msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; @@ -131,7 +126,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) pkt->whdr.cid = htonl(call->cid); pkt->whdr.callNumber = htonl(call->call_id); pkt->whdr.seq = 0; - pkt->whdr.serial = htonl(serial); pkt->whdr.type = type; pkt->whdr.flags = conn->out_clientflag; pkt->whdr.userStatus = 0; @@ -157,14 +151,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) spin_unlock_bh(&call->lock); - _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - serial, - ntohs(pkt->ack.maxSkew), - ntohl(pkt->ack.firstPacket), - ntohl(pkt->ack.previousPacket), - ntohl(pkt->ack.serial), - rxrpc_acks(pkt->ack.reason), - pkt->ack.nAcks); iov[0].iov_len += sizeof(pkt->ack) + n; iov[1].iov_base = &pkt->ackinfo; @@ -176,7 +162,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) case RXRPC_PACKET_TYPE_ABORT: abort_code = call->abort_code; pkt->abort_code = htonl(abort_code); - _proto("Tx ABORT %%%u { %d }", serial, abort_code); iov[0].iov_len += sizeof(pkt->abort_code); len += sizeof(pkt->abort_code); ioc = 1; @@ -188,6 +173,17 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) goto out; } + serial = atomic_inc_return(&conn->serial); + pkt->whdr.serial = htonl(serial); + switch (type) { + case RXRPC_PACKET_TYPE_ACK: + trace_rxrpc_tx_ack(call, + ntohl(pkt->ack.firstPacket), + ntohl(pkt->ack.serial), + pkt->ack.reason, pkt->ack.nAcks); + break; + } + if (ping) { call->ackr_ping = serial; smp_wmb(); -- cgit v1.2.1 From fc7ab6d29a3af0b7f6df7c095509378c8caf85b5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 15:22:36 +0100 Subject: rxrpc: Add a tracepoint for the call timer Add a tracepoint to log call timer initiation, setting and expiry. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 13 ++++++++++++- net/rxrpc/call_event.c | 7 ++++--- net/rxrpc/call_object.c | 6 ++++-- net/rxrpc/misc.c | 8 ++++++++ net/rxrpc/sendmsg.c | 2 +- 5 files changed, 29 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a494d56eb236..e564eca75985 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -678,6 +678,17 @@ enum rxrpc_rtt_rx_trace { extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; +enum rxrpc_timer_trace { + rxrpc_timer_begin, + rxrpc_timer_expired, + rxrpc_timer_set_for_ack, + rxrpc_timer_set_for_resend, + rxrpc_timer_set_for_send, + rxrpc_timer__nr_trace +}; + +extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; + extern const char *const rxrpc_pkts[]; extern const char *rxrpc_acks(u8 reason); @@ -707,7 +718,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void rxrpc_set_timer(struct rxrpc_call *); +void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); void rxrpc_process_call(struct work_struct *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 8bc5c8e37ab4..90e970ba048a 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -24,7 +24,7 @@ /* * Set the timer */ -void rxrpc_set_timer(struct rxrpc_call *call) +void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why) { unsigned long t, now = jiffies; @@ -45,6 +45,7 @@ void rxrpc_set_timer(struct rxrpc_call *call) if (call->timer.expires != t || !timer_pending(&call->timer)) { mod_timer(&call->timer, t); + trace_rxrpc_timer(call, why, now); } } @@ -120,7 +121,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, _debug("deferred ACK %ld < %ld", expiry, call->ack_at - now); if (time_before(ack_at, call->ack_at)) { call->ack_at = ack_at; - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_set_for_ack); } } } @@ -293,7 +294,7 @@ recheck_state: goto recheck_state; } - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_set_for_resend); /* other events may have been raised since we started checking */ if (call->events && call->state < RXRPC_CALL_COMPLETE) { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f2fadf667e19..a53f4c2c0025 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -76,8 +76,10 @@ static void rxrpc_call_timer_expired(unsigned long _call) _enter("%d", call->debug_id); - if (call->state < RXRPC_CALL_COMPLETE) + if (call->state < RXRPC_CALL_COMPLETE) { + trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies); rxrpc_queue_call(call); + } } /* @@ -200,7 +202,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) call->ack_at = expire_at; call->resend_at = expire_at; call->timer.expires = expire_at + 1; - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_begin); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index fe648711c2f7..fa9942fabdf2 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -194,3 +194,11 @@ const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { [rxrpc_rtt_rx_ping_response] = "PONG", [rxrpc_rtt_rx_requested_ack] = "RACK", }; + +const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { + [rxrpc_timer_begin] = "Begin ", + [rxrpc_timer_expired] = "*EXPR*", + [rxrpc_timer_set_for_ack] = "SetAck", + [rxrpc_timer_set_for_send] = "SetTx ", + [rxrpc_timer_set_for_resend] = "SetRTx", +}; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 93e6584cd751..99939372b5a4 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -153,7 +153,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, if (time_before(resend_at, call->resend_at)) { call->resend_at = resend_at; - rxrpc_set_timer(call); + rxrpc_set_timer(call, rxrpc_timer_set_for_send); } } -- cgit v1.2.1 From be832aecc5ba811728e15a10f675f4a2187f25dd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:22 +0100 Subject: rxrpc: Add data Tx tracepoint and adjust Tx ACK tracepoint Add a tracepoint to log transmission of DATA packets (including loss injection). Adjust the ACK transmission tracepoint to include the packet serial number and to line this up with the DATA transmission display. Signed-off-by: David Howells --- net/rxrpc/conn_event.c | 5 ++--- net/rxrpc/output.c | 5 ++++- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 75a15a4c74c3..a1cf1ec5f29e 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -98,9 +98,6 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.info.rwind = htonl(rxrpc_rx_window_size); pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); len += sizeof(pkt.ack) + sizeof(pkt.info); - - trace_rxrpc_tx_ack(NULL, chan->last_seq, 0, - RXRPC_ACK_DUPLICATE, 0); break; } @@ -122,6 +119,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort); break; case RXRPC_PACKET_TYPE_ACK: + trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0, + RXRPC_ACK_DUPLICATE, 0); _proto("Tx ACK %%%u [re]", serial); break; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5c1e008a5323..e47fbd1c836d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -177,7 +177,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) pkt->whdr.serial = htonl(serial); switch (type) { case RXRPC_PACKET_TYPE_ACK: - trace_rxrpc_tx_ack(call, + trace_rxrpc_tx_ack(call, serial, ntohl(pkt->ack.firstPacket), ntohl(pkt->ack.serial), pkt->ack.reason, pkt->ack.nAcks); @@ -275,6 +275,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, + whdr.flags, true); rxrpc_lose_skb(skb, rxrpc_skb_tx_lost); _leave(" = 0 [lose]"); return 0; @@ -302,6 +304,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) goto send_fragmentable; done: + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, false); if (ret >= 0) { ktime_t now = ktime_get_real(); skb->tstamp = now; -- cgit v1.2.1 From 89b475abdb107a74f57497b65becaf837a0e5b6b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 12:39:22 +0100 Subject: rxrpc: Add a tracepoint to log injected Rx packet loss Add a tracepoint to log received packets that get discarded due to Rx packet loss. Signed-off-by: David Howells --- net/rxrpc/input.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index fb3e2f6afa3b..19b1e189f5dc 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -837,20 +837,19 @@ void rxrpc_data_ready(struct sock *udp_sk) skb_orphan(skb); sp = rxrpc_skb(skb); + /* dig out the RxRPC connection details */ + if (rxrpc_extract_header(sp, skb) < 0) + goto bad_message; + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { + trace_rxrpc_rx_lose(sp); rxrpc_lose_skb(skb, rxrpc_skb_rx_lost); return; } } - _net("Rx UDP packet from %08x:%04hu", - ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); - - /* dig out the RxRPC connection details */ - if (rxrpc_extract_header(sp, skb) < 0) - goto bad_message; trace_rxrpc_rx_packet(sp); _net("Rx RxRPC %s ep=%x call=%x:%x", -- cgit v1.2.1 From 9c7ad434441da6b5d4ac878cac368fbdaec99b56 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 13:50:40 +0100 Subject: rxrpc: Add tracepoint for ACK proposal Add a tracepoint to log proposed ACKs, including whether the proposal is used to update a pending ACK or is discarded in favour of an easlier, higher priority ACK. Whilst we're at it, get rid of the rxrpc_acks() function and access the name array directly. We do, however, need to validate the ACK reason number given to trace_rxrpc_rx_ack() to make sure we don't overrun the array. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 25 +++++++++++++++++++++++-- net/rxrpc/call_event.c | 21 ++++++++++++++------- net/rxrpc/input.c | 19 +++++++++++++------ net/rxrpc/misc.c | 30 +++++++++++++++++++----------- net/rxrpc/output.c | 3 ++- net/rxrpc/recvmsg.c | 3 ++- 6 files changed, 73 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e564eca75985..042dbcc52654 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -689,8 +689,28 @@ enum rxrpc_timer_trace { extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; +enum rxrpc_propose_ack_trace { + rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_params, + rxrpc_propose_ack_respond_to_ack, + rxrpc_propose_ack_respond_to_ping, + rxrpc_propose_ack_retry_tx, + rxrpc_propose_ack_terminal_ack, + rxrpc_propose_ack__nr_trace +}; + +enum rxrpc_propose_ack_outcome { + rxrpc_propose_ack_use, + rxrpc_propose_ack_update, + rxrpc_propose_ack_subsume, + rxrpc_propose_ack__nr_outcomes +}; + +extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8]; +extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes]; + extern const char *const rxrpc_pkts[]; -extern const char *rxrpc_acks(u8 reason); +extern const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; #include @@ -719,7 +739,8 @@ int rxrpc_reject_call(struct rxrpc_sock *); * call_event.c */ void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace); -void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool); +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool, + enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 90e970ba048a..fd5b11339ffb 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -58,14 +58,13 @@ out: */ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, u16 skew, u32 serial, bool immediate, - bool background) + bool background, + enum rxrpc_propose_ack_trace why) { + enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay; s8 prior = rxrpc_ack_priority[ack_reason]; - _enter("{%d},%s,%%%x,%u", - call->debug_id, rxrpc_acks(ack_reason), serial, immediate); - /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial * numbers, but we don't alter the timeout. */ @@ -74,15 +73,18 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]); if (ack_reason == call->ackr_reason) { if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) { + outcome = rxrpc_propose_ack_update; call->ackr_serial = serial; call->ackr_skew = skew; } if (!immediate) - return; + goto trace; } else if (prior > rxrpc_ack_priority[call->ackr_reason]) { call->ackr_reason = ack_reason; call->ackr_serial = serial; call->ackr_skew = skew; + } else { + outcome = rxrpc_propose_ack_subsume; } switch (ack_reason) { @@ -124,17 +126,22 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_set_timer(call, rxrpc_timer_set_for_ack); } } + +trace: + trace_rxrpc_propose_ack(call, why, ack_reason, serial, immediate, + background, outcome); } /* * propose an ACK be sent, locking the call structure */ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u16 skew, u32 serial, bool immediate, bool background) + u16 skew, u32 serial, bool immediate, bool background, + enum rxrpc_propose_ack_trace why) { spin_lock_bh(&call->lock); __rxrpc_propose_ACK(call, ack_reason, skew, serial, - immediate, background); + immediate, background, why); spin_unlock_bh(&call->lock); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 19b1e189f5dc..349698d87ad1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -49,7 +49,8 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, if (call->peer->rtt_usage < 3 || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, - true, true); + true, true, + rxrpc_propose_ack_ping_for_params); } /* @@ -382,7 +383,8 @@ skip: ack: if (ack) rxrpc_propose_ACK(call, ack, skew, ack_serial, - immediate_ack, true); + immediate_ack, true, + rxrpc_propose_ack_input_data); if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) rxrpc_notify_socket(call); @@ -539,6 +541,7 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, u16 skew) { + u8 ack_reason; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); union { struct rxrpc_ackpacket ack; @@ -561,8 +564,10 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, first_soft_ack = ntohl(buf.ack.firstPacket); hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; + ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? + buf.ack.reason : RXRPC_ACK__INVALID); - trace_rxrpc_rx_ack(call, first_soft_ack, buf.ack.reason, nr_acks); + trace_rxrpc_rx_ack(call, first_soft_ack, ack_reason, nr_acks); _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", sp->hdr.serial, @@ -570,7 +575,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, first_soft_ack, ntohl(buf.ack.previousPacket), acked_serial, - rxrpc_acks(buf.ack.reason), + rxrpc_ack_names[ack_reason], buf.ack.nAcks); if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) @@ -583,10 +588,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, if (buf.ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", sp->hdr.serial); rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - skew, sp->hdr.serial, true, true); + skew, sp->hdr.serial, true, true, + rxrpc_propose_ack_respond_to_ping); } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, - skew, sp->hdr.serial, true, true); + skew, sp->hdr.serial, true, true, + rxrpc_propose_ack_respond_to_ack); } offset = sp->offset + nr_acks + 3; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index fa9942fabdf2..1ca14835d87f 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -91,17 +91,10 @@ const s8 rxrpc_ack_priority[] = { [RXRPC_ACK_PING] = 9, }; -const char *rxrpc_acks(u8 reason) -{ - static const char *const str[] = { - "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", - "IDL", "-?-" - }; - - if (reason >= ARRAY_SIZE(str)) - reason = ARRAY_SIZE(str) - 1; - return str[reason]; -} +const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = { + "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", + "IDL", "-?-" +}; const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = { [rxrpc_skb_rx_cleaned] = "Rx CLN", @@ -202,3 +195,18 @@ const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { [rxrpc_timer_set_for_send] = "SetTx ", [rxrpc_timer_set_for_resend] = "SetRTx", }; + +const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { + [rxrpc_propose_ack_input_data] = "DataIn ", + [rxrpc_propose_ack_ping_for_params] = "Params ", + [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", + [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png", + [rxrpc_propose_ack_retry_tx] = "RetryTx", + [rxrpc_propose_ack_terminal_ack] = "ClTerm ", +}; + +const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = { + [rxrpc_propose_ack_use] = "", + [rxrpc_propose_ack_update] = " Update", + [rxrpc_propose_ack_subsume] = " Subsume", +}; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index e47fbd1c836d..0c563e325c9d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -210,7 +210,8 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), - true, true); + true, true, + rxrpc_propose_ack_retry_tx); break; case RXRPC_PACKET_TYPE_ABORT: break; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 99e4c0ae30f1..8c7f3de45bac 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -141,7 +141,8 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false); + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false, + rxrpc_propose_ack_terminal_ack); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); } -- cgit v1.2.1 From c6672e3fe4a641bf302d6309ab4d5ee55648e758 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Sep 2016 13:58:55 +0100 Subject: rxrpc: Add a tracepoint to log which packets will be retransmitted Add a tracepoint to log in rxrpc_resend() which packets will be retransmitted. Note that if a positive ACK comes in whilst we have dropped the lock to retransmit another packet, the actual retransmission may not happen, though some of the effects will (such as altering the congestion management). Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index fd5b11339ffb..a78a92fe5d77 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -196,6 +196,8 @@ static void rxrpc_resend(struct rxrpc_call *call) /* Okay, we need to retransmit a packet. */ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; + trace_rxrpc_retransmit(call, seq, annotation | anno_type, + ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } resend_at = ktime_sub(ktime_add_ms(oldest, rxrpc_resend_timeout), now); -- cgit v1.2.1 From 79aab093a0b5370d7fc4e99df75996f4744dc03f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:15 +0300 Subject: net: Update API for VF vlan protocol 802.1ad support Introduce new rtnl UAPI that exposes a list of vlans per VF, giving the ability for user-space application to specify it for the VF, as an option to support 802.1ad. We adjusted IP Link tool to support this option. For future use cases, the new UAPI supports multiple vlans. For now we limit the list size to a single vlan in kernel. Add IFLA_VF_VLAN_LIST in addition to IFLA_VF_VLAN to keep backward compatibility with older versions of IP Link tool. Add a vlan protocol parameter to the ndo_set_vf_vlan callback. We kept 802.1Q as the drivers' default vlan protocol. Suitable ip link tool command examples: Set vf vlan protocol 802.1ad: ip link set eth0 vf 1 vlan 100 proto 802.1ad Set vf to VST (802.1Q) mode: ip link set eth0 vf 1 vlan 100 proto 802.1Q Or by omitting the new parameter ip link set eth0 vf 1 vlan 100 Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 80 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0dbae4244a89..3ac8946bf244 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -843,7 +843,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, size += nla_total_size(num_vfs * sizeof(struct nlattr)); size += num_vfs * (nla_total_size(sizeof(struct ifla_vf_mac)) + - nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct nlattr)) + + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + @@ -1111,14 +1114,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct nlattr *vfinfo) { struct ifla_vf_rss_query_en vf_rss_query_en; + struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; - struct nlattr *vf, *vfstats; struct ifla_vf_mac vf_mac; struct ifla_vf_info ivi; @@ -1135,11 +1139,14 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; + /* VLAN Protocol by default is 802.1Q */ + ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; vf_mac.vf = vf_vlan.vf = + vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = @@ -1150,6 +1157,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; + vf_vlan_info.vlan = ivi.vlan; + vf_vlan_info.qos = ivi.qos; + vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; @@ -1158,10 +1168,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start(skb, IFLA_VF_INFO); - if (!vf) { - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vf) + goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), @@ -1177,17 +1185,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) - return -EMSGSIZE; + goto nla_put_vf_failure; + vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST); + if (!vfvlanlist) + goto nla_put_vf_failure; + if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), + &vf_vlan_info)) { + nla_nest_cancel(skb, vfvlanlist); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfvlanlist); memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start(skb, IFLA_VF_STATS); - if (!vfstats) { - nla_nest_cancel(skb, vf); - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vfstats) + goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, @@ -1199,11 +1213,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) - return -EMSGSIZE; + vf_stats.multicast, IFLA_VF_STATS_PAD)) { + nla_nest_cancel(skb, vfstats); + goto nla_put_vf_failure; + } nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); return 0; + +nla_put_vf_failure: + nla_nest_cancel(skb, vf); +nla_put_vfinfo_failure: + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) @@ -1448,6 +1470,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, @@ -1704,7 +1727,34 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, - ivv->qos); + ivv->qos, + htons(ETH_P_8021Q)); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_VLAN_LIST]) { + struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; + struct nlattr *attr; + int rem, len = 0; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_vlan) + return err; + + nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { + if (nla_type(attr) != IFLA_VF_VLAN_INFO || + nla_len(attr) < NLA_HDRLEN) { + return -EINVAL; + } + if (len >= MAX_VLAN_LIST_LEN) + return -EOPNOTSUPP; + ivvl[len] = nla_data(attr); + + len++; + } + err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, + ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } -- cgit v1.2.1 From db32e4e49ce2b0e5fcc17803d011a401c0a637f6 Mon Sep 17 00:00:00 2001 From: Lance Richardson Date: Fri, 23 Sep 2016 15:50:29 -0400 Subject: ip6_gre: fix flowi6_proto value in ip6gre_xmit_other() Similar to commit 3be07244b733 ("ip6_gre: fix flowi6_proto value in xmit path"), set flowi6_proto to IPPROTO_GRE for output route lookup. Up until now, ip6gre_xmit_other() has set flowi6_proto to a bogus value. This affected output route lookup for packets sent on an ip6gretap device in cases where routing was dependent on the value of flowi6_proto. Since the correct proto is already set in the tunnel flowi6 template via commit 252f3f5a1189 ("ip6_gre: Set flowi6_proto as IPPROTO_GRE in xmit path."), simply delete the line setting the incorrect flowi6_proto value. Suggested-by: Jiri Benc Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Reviewed-by: Shmulik Ladkani Signed-off-by: Lance Richardson Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 704274cbd495..edc3daab354e 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -648,7 +648,6 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = skb->protocol; err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) -- cgit v1.2.1 From 805b21b929e29192fb5de16154f616bfc1116e3e Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:26 +0100 Subject: rxrpc: Send an ACK after every few DATA packets we receive Send an ACK if we haven't sent one for the last two packets we've received. This keeps the other end apprised of where we've got to - which is important if they're doing slow-start. We do this in recvmsg so that we can dispatch a packet directly without the need to wake up the background thread. This should possibly be made configurable in future. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 3 +++ net/rxrpc/misc.c | 1 + net/rxrpc/output.c | 25 +++++++++++++++++-------- net/rxrpc/recvmsg.c | 13 ++++++++++++- 4 files changed, 33 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 042dbcc52654..e3bf9c0e3ad1 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -533,6 +533,8 @@ struct rxrpc_call { u16 ackr_skew; /* skew on packet being ACK'd */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ + rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */ + rxrpc_seq_t ackr_seen; /* Highest packet shown seen */ rxrpc_serial_t ackr_ping; /* Last ping sent */ ktime_t ackr_ping_time; /* Time last ping sent */ @@ -695,6 +697,7 @@ enum rxrpc_propose_ack_trace { rxrpc_propose_ack_respond_to_ack, rxrpc_propose_ack_respond_to_ping, rxrpc_propose_ack_retry_tx, + rxrpc_propose_ack_rotate_rx, rxrpc_propose_ack_terminal_ack, rxrpc_propose_ack__nr_trace }; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 1ca14835d87f..a473fd7dabaa 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -202,6 +202,7 @@ const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png", [rxrpc_propose_ack_retry_tx] = "RetryTx", + [rxrpc_propose_ack_rotate_rx] = "RxAck ", [rxrpc_propose_ack_terminal_ack] = "ClTerm ", }; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 0c563e325c9d..3eb01445e814 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -36,7 +36,9 @@ struct rxrpc_pkt_buffer { * Fill out an ACK packet. */ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, - struct rxrpc_pkt_buffer *pkt) + struct rxrpc_pkt_buffer *pkt, + rxrpc_seq_t *_hard_ack, + rxrpc_seq_t *_top) { rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top, seq; @@ -48,6 +50,8 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, serial = call->ackr_serial; hard_ack = READ_ONCE(call->rx_hard_ack); top = smp_load_acquire(&call->rx_top); + *_hard_ack = hard_ack; + *_top = top; pkt->ack.bufferSpace = htons(8); pkt->ack.maxSkew = htons(call->ackr_skew); @@ -96,6 +100,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) struct msghdr msg; struct kvec iov[2]; rxrpc_serial_t serial; + rxrpc_seq_t hard_ack, top; size_t len, n; bool ping = false; int ioc, ret; @@ -146,7 +151,7 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) goto out; } ping = (call->ackr_reason == RXRPC_ACK_PING); - n = rxrpc_fill_out_ack(call, pkt); + n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top); call->ackr_reason = 0; spin_unlock_bh(&call->lock); @@ -203,18 +208,22 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) if (ping) call->ackr_ping_time = ktime_get_real(); - if (ret < 0 && call->state < RXRPC_CALL_COMPLETE) { - switch (type) { - case RXRPC_PACKET_TYPE_ACK: + if (type == RXRPC_PACKET_TYPE_ACK && + call->state < RXRPC_CALL_COMPLETE) { + if (ret < 0) { clear_bit(RXRPC_CALL_PINGING, &call->flags); rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), true, true, rxrpc_propose_ack_retry_tx); - break; - case RXRPC_PACKET_TYPE_ABORT: - break; + } else { + spin_lock_bh(&call->lock); + if (after(hard_ack, call->ackr_consumed)) + call->ackr_consumed = hard_ack; + if (after(top, call->ackr_seen)) + call->ackr_seen = top; + spin_unlock_bh(&call->lock); } } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 8c7f3de45bac..a7458c398b9e 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -201,8 +201,19 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) _debug("%u,%u,%02x", hard_ack, top, flags); trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); - if (flags & RXRPC_LAST_PACKET) + if (flags & RXRPC_LAST_PACKET) { rxrpc_end_rx_phase(call); + } else { + /* Check to see if there's an ACK that needs sending. */ + if (after_eq(hard_ack, call->ackr_consumed + 2) || + after_eq(top, call->ackr_seen + 2) || + (hard_ack == top && after(hard_ack, call->ackr_consumed))) + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, + true, false, + rxrpc_propose_ack_rotate_rx); + if (call->ackr_reason) + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + } } /* -- cgit v1.2.1 From 50f4c7b73f831a53fa9ddeb9bdf4cfb5b23d3aa7 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Wed, 7 Sep 2016 10:40:24 +0800 Subject: netfilter: xt_TCPMSS: Refactor the codes to decrease one condition check and more readable The origin codes perform two condition checks with dst_mtu(skb_dst(skb)) and in_mtu. And the last statement is "min(dst_mtu(skb_dst(skb)), in_mtu) - minlen". It may let reader think about how about the result. Would it be negative. Now assign the result of min(dst_mtu(skb_dst(skb)), in_mtu) to a new variable, then only perform one condition check, and it is more readable. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_TCPMSS.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index e118397254af..872db2d0e2a9 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -110,18 +110,14 @@ tcpmss_mangle_packet(struct sk_buff *skb, if (info->mss == XT_TCPMSS_CLAMP_PMTU) { struct net *net = par->net; unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); + unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu); - if (dst_mtu(skb_dst(skb)) <= minlen) { + if (min_mtu <= minlen) { net_err_ratelimited("unknown or invalid path-MTU (%u)\n", - dst_mtu(skb_dst(skb))); + min_mtu); return -1; } - if (in_mtu <= minlen) { - net_err_ratelimited("unknown or invalid path-MTU (%u)\n", - in_mtu); - return -1; - } - newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen; + newmss = min_mtu - minlen; } else newmss = info->mss; -- cgit v1.2.1 From c5136b15ea364124299c8a9ba96b300e96061e3a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Sep 2016 11:35:01 -0400 Subject: netfilter: bridge: add and use br_nf_hook_thresh This replaces the last uses of NF_HOOK_THRESH(). Followup patch will remove it and rename nf_hook_thresh. The reason is that inet (non-bridge) netfilter no longer invokes the hooks from hooks, so we do no longer need the thresh value to skip hooks with a lower priority. The bridge netfilter however may need to do this. br_nf_hook_thresh is a wrapper that is supposed to do this, i.e. only call hooks with a priority that exceeds NF_BR_PRI_BRNF. It's used only in the recursion cases of br_netfilter. It invokes nf_hook_slow while holding an rcu read-side critical section to make a future cleanup simpler. Signed-off-by: Florian Westphal Signed-off-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 60 ++++++++++++++++++++++++++++++++++------- net/bridge/br_netfilter_ipv6.c | 12 ++++----- 2 files changed, 56 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 77e7f69bf80d..6029af47377d 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -395,11 +396,10 @@ bridged_dnat: skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, - NF_BR_PRE_ROUTING, - net, sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish_bridge, - 1); + br_nf_hook_thresh(NF_BR_PRE_ROUTING, + net, sk, skb, skb->dev, + NULL, + br_nf_pre_routing_finish); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); @@ -417,10 +417,8 @@ bridged_dnat: skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb, - skb->dev, NULL, - br_handle_frame_finish, 1); - + br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, + br_handle_frame_finish); return 0; } @@ -992,6 +990,50 @@ static struct notifier_block brnf_notifier __read_mostly = { .notifier_call = brnf_device_event, }; +/* recursively invokes nf_hook_slow (again), skipping already-called + * hooks (< NF_BR_PRI_BRNF). + * + * Called with rcu read lock held. + */ +int br_nf_hook_thresh(unsigned int hook, struct net *net, + struct sock *sk, struct sk_buff *skb, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct net *, struct sock *, + struct sk_buff *)) +{ + struct nf_hook_ops *elem; + struct nf_hook_state state; + struct list_head *head; + int ret; + + head = &net->nf.hooks[NFPROTO_BRIDGE][hook]; + + list_for_each_entry_rcu(elem, head, list) { + struct nf_hook_ops *next; + + next = list_entry_rcu(list_next_rcu(&elem->list), + struct nf_hook_ops, list); + if (next->priority <= NF_BR_PRI_BRNF) + continue; + } + + if (&elem->list == head) + return okfn(net, sk, skb); + + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); + nf_hook_state_init(&state, head, hook, NF_BR_PRI_BRNF + 1, + NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); + + ret = nf_hook_slow(skb, &state); + rcu_read_unlock(); + if (ret == 1) + ret = okfn(net, sk, skb); + + return ret; +} + #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 5e59a8457e7b..5989661c659f 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -187,10 +187,9 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, - net, sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish_bridge, - 1); + br_nf_hook_thresh(NF_BR_PRE_ROUTING, + net, sk, skb, skb->dev, NULL, + br_nf_pre_routing_finish_bridge); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); @@ -207,9 +206,8 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb, - skb->dev, NULL, - br_handle_frame_finish, 1); + br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, + skb->dev, NULL, br_handle_frame_finish); return 0; } -- cgit v1.2.1 From 2c1e2703ff812ccaa42a4bc8a25803955e342b85 Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Wed, 21 Sep 2016 11:35:03 -0400 Subject: netfilter: call nf_hook_ingress with rcu_read_lock This commit ensures that the rcu read-side lock is held while the ingress hook is called. This ensures that a call to nf_hook_slow (and ultimately nf_ingress) will be read protected. Signed-off-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- net/core/dev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 34b5322bc081..064919425b7d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4040,12 +4040,17 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, { #ifdef CONFIG_NETFILTER_INGRESS if (nf_hook_ingress_active(skb)) { + int ingress_retval; + if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } - return nf_hook_ingress(skb); + rcu_read_lock(); + ingress_retval = nf_hook_ingress(skb); + rcu_read_unlock(); + return ingress_retval; } #endif /* CONFIG_NETFILTER_INGRESS */ return 0; -- cgit v1.2.1 From e2361cb90a0327bdab34d01d1a7b9dbd67c31e60 Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Wed, 21 Sep 2016 11:35:04 -0400 Subject: netfilter: Remove explicit rcu_read_lock in nf_hook_slow All of the callers of nf_hook_slow already hold the rcu_read_lock, so this cleanup removes the recursive call. This is just a cleanup, as the locking code gracefully handles this situation. Signed-off-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebt_redirect.c | 2 +- net/bridge/netfilter/ebtables.c | 2 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +- net/netfilter/core.c | 6 +----- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_h323_main.c | 2 +- net/netfilter/nf_conntrack_helper.c | 2 +- net/netfilter/nfnetlink_cthelper.c | 2 +- net/netfilter/nfnetlink_log.c | 8 ++++++-- net/netfilter/nfnetlink_queue.c | 2 +- net/netfilter/xt_helper.c | 2 +- 14 files changed, 19 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index 203964997a51..2e7c4f974340 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -24,7 +24,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) return EBT_DROP; if (par->hooknum != NF_BR_BROUTING) - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ ether_addr_copy(eth_hdr(skb)->h_dest, br_port_get_rcu(par->in)->br->dev->dev_addr); else diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index cceac5bb658f..dd7133216c9c 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -146,7 +146,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, return 1; if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out))) return 1; - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ if (in && (p = br_port_get_rcu(in)) != NULL && NF_INVF(e, EBT_ILOGICALIN, ebt_dev_check(e->logical_in, p->br->dev))) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 870aebda2932..713c09a74b90 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -110,7 +110,7 @@ static unsigned int ipv4_helper(void *priv, if (!help) return NF_ACCEPT; - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); if (!helper) return NF_ACCEPT; diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 4b5904bc2614..d075b3cf2400 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -149,7 +149,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return -NF_ACCEPT; } - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); /* Ordinarily, we'd expect the inverted tupleproto, but it's diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 1aa5848764a7..963ee3848675 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -115,7 +115,7 @@ static unsigned int ipv6_helper(void *priv, help = nfct_help(ct); if (!help) return NF_ACCEPT; - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); if (!helper) return NF_ACCEPT; diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 660bc10c7a9c..f5a61bc3ec2b 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -165,7 +165,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, return -NF_ACCEPT; } - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum); /* Ordinarily, we'd expect the inverted tupleproto, but it's diff --git a/net/netfilter/core.c b/net/netfilter/core.c index f39276d1c2d7..c8faf8102394 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -291,16 +291,13 @@ repeat: /* Returns 1 if okfn() needs to be executed by the caller, - * -EPERM for NF_DROP, 0 otherwise. */ + * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) { struct nf_hook_ops *elem; unsigned int verdict; int ret = 0; - /* We may already have this, but read-locks nest anyway */ - rcu_read_lock(); - elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list); next_hook: verdict = nf_iterate(state->hook_list, skb, state, &elem); @@ -321,7 +318,6 @@ next_hook: kfree_skb(skb); } } - rcu_read_unlock(); return ret; } EXPORT_SYMBOL(nf_hook_slow); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8d1ddb9b63ed..c94ec197845c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1275,7 +1275,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, skb->nfct = NULL; } - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ l3proto = __nf_ct_l3proto_find(pf); ret = l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, &protonum); diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 5c0db5c64734..f65d93639d12 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -736,7 +736,7 @@ static int callforward_do_filter(struct net *net, const struct nf_afinfo *afinfo; int ret = 0; - /* rcu_read_lock()ed by nf_hook_slow() */ + /* rcu_read_lock()ed by nf_hook_thresh */ afinfo = nf_get_afinfo(family); if (!afinfo) return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 4ffe388a9a1e..336e21559e01 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -346,7 +346,7 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, /* Called from the helper function, this call never fails */ help = nfct_help(ct); - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index e924e95fcc7f..3b79f34b5095 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -43,7 +43,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff, if (help == NULL) return NF_DROP; - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); if (helper == NULL) return NF_DROP; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 6577db524ef6..eb086a192c5a 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -442,7 +442,9 @@ __build_packet_message(struct nfnl_log_net *log, if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, htonl(indev->ifindex)) || /* this is the bridge group "brX" */ - /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ + /* rcu_read_lock()ed by nf_hook_thresh or + * nf_log_packet. + */ nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; @@ -477,7 +479,9 @@ __build_packet_message(struct nfnl_log_net *log, if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, htonl(outdev->ifindex)) || /* this is the bridge group "brX" */ - /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ + /* rcu_read_lock()ed by nf_hook_thresh or + * nf_log_packet. + */ nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 808da34f94cd..7caa8b082c41 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -740,7 +740,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); - /* rcu_read_lock()ed by nf_hook_slow() */ + /* rcu_read_lock()ed by nf_hook_thresh */ queue = instance_lookup(q, queuenum); if (!queue) return -ESRCH; diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index 805c9f64a04c..f679dd4c272a 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -41,7 +41,7 @@ helper_mt(const struct sk_buff *skb, struct xt_action_param *par) if (!master_help) return ret; - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(master_help->helper); if (!helper) return ret; -- cgit v1.2.1 From d4bb5caa9cc1a802ba25f605b24b5640c025806b Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Wed, 21 Sep 2016 11:35:05 -0400 Subject: netfilter: Only allow sane values in nf_register_net_hook This commit adds an upfront check for sane values to be passed when registering a netfilter hook. This will be used in a future patch for a simplified hook list traversal. Signed-off-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index c8faf8102394..67b74287535d 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -89,6 +89,11 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) struct nf_hook_entry *entry; struct nf_hook_ops *elem; + if (reg->pf == NFPROTO_NETDEV && + (reg->hooknum != NF_NETDEV_INGRESS || + !reg->dev || dev_net(reg->dev) != net)) + return -EINVAL; + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; -- cgit v1.2.1 From a7056c5ba67ee6a956b42cf9ff9ba3a6a0bd9794 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:27 +0100 Subject: rxrpc: Send an immediate ACK if we fill in a hole Send an immediate ACK if we fill in a hole in the buffer left by an out-of-sequence packet. This may allow the congestion management in the peer to avoid a retransmission if packets got reordered on the wire. Signed-off-by: David Howells --- net/rxrpc/input.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 349698d87ad1..757c16f033a0 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -331,8 +331,16 @@ next_subpacket: call->rxtx_annotations[ix] = annotation; smp_wmb(); call->rxtx_buffer[ix] = skb; - if (after(seq, call->rx_top)) + if (after(seq, call->rx_top)) { smp_store_release(&call->rx_top, seq); + } else if (before(seq, call->rx_top)) { + /* Send an immediate ACK if we fill in a hole */ + if (!ack) { + ack = RXRPC_ACK_DELAY; + ack_serial = serial; + } + immediate_ack = true; + } if (flags & RXRPC_LAST_PACKET) { set_bit(RXRPC_CALL_RX_LAST, &call->flags); trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); -- cgit v1.2.1 From b69d94d7991f83928d3ea18fe12ab011fa852bb0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:27 +0100 Subject: rxrpc: Include the last reply DATA serial number in the final ACK In a client call, include the serial number of the last DATA packet of the reply in the final ACK. Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a7458c398b9e..038ae62ddb4d 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -133,7 +133,7 @@ static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, /* * End the packet reception phase. */ -static void rxrpc_end_rx_phase(struct rxrpc_call *call) +static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) { _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); @@ -141,7 +141,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call) ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, true, false, + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false, rxrpc_propose_ack_terminal_ack); rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); } @@ -202,7 +202,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) _debug("%u,%u,%02x", hard_ack, top, flags); trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); if (flags & RXRPC_LAST_PACKET) { - rxrpc_end_rx_phase(call); + rxrpc_end_rx_phase(call, serial); } else { /* Check to see if there's an ACK that needs sending. */ if (after_eq(hard_ack, call->ackr_consumed + 2) || -- cgit v1.2.1 From dd7c1ee59a90ca8a75bce72c721851d5550f3c59 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:27 +0100 Subject: rxrpc: Reinitialise the call ACK and timer state for client reply phase Clear the ACK reason, ACK timer and resend timer when entering the client reply phase when the first DATA packet is received. New ACKs will be proposed once the data is queued. The resend timer is no longer relevant and we need to cancel ACKs scheduled to probe for a lost reply. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/input.c | 9 +++++++++ net/rxrpc/misc.c | 1 + 3 files changed, 11 insertions(+) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e3bf9c0e3ad1..cdd35e2b40ba 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -682,6 +682,7 @@ extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; enum rxrpc_timer_trace { rxrpc_timer_begin, + rxrpc_timer_init_for_reply, rxrpc_timer_expired, rxrpc_timer_set_for_ack, rxrpc_timer_set_for_resend, diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 757c16f033a0..bda11eb2ab2a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -149,6 +149,15 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) { rxrpc_seq_t top = READ_ONCE(call->tx_top); + if (call->ackr_reason) { + spin_lock_bh(&call->lock); + call->ackr_reason = 0; + call->resend_at = call->expire_at; + call->ack_at = call->expire_at; + spin_unlock_bh(&call->lock); + rxrpc_set_timer(call, rxrpc_timer_init_for_reply); + } + if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) rxrpc_rotate_tx_window(call, top); if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index a473fd7dabaa..901c012a2700 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -191,6 +191,7 @@ const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { [rxrpc_timer_begin] = "Begin ", [rxrpc_timer_expired] = "*EXPR*", + [rxrpc_timer_init_for_reply] = "IniRpl", [rxrpc_timer_set_for_ack] = "SetAck", [rxrpc_timer_set_for_send] = "SetTx ", [rxrpc_timer_set_for_resend] = "SetRTx", -- cgit v1.2.1 From df0562a72dba13ab49c7dd7cb15170697b9848ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 23:00:54 +0100 Subject: rxrpc: Delay the resend timer to allow for nsec->jiffies conv error When determining the resend timer value, we have a value in nsec but the timer is in jiffies which may be a million or more times more coarse. nsecs_to_jiffies() rounds down - which means that the resend timeout expressed as jiffies is very likely earlier than the one expressed as nanoseconds from which it was derived. The problem is that rxrpc_resend() gets triggered by the timer, but can't then find anything to resend yet. It sets the timer again - but gets kicked off immediately again and again until the nanosecond-based expiry time is reached and we actually retransmit. Fix this by adding 1 to the jiffies-based resend_at value to counteract the rounding and make sure that the timer happens after the nanosecond-based expiry is passed. Alternatives would be to adjust the timestamp on the packets to align with the jiffie scale or to switch back to using jiffie-timestamps. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index a78a92fe5d77..d5bf9ce7ec6f 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -200,8 +200,14 @@ static void rxrpc_resend(struct rxrpc_call *call) ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } - resend_at = ktime_sub(ktime_add_ms(oldest, rxrpc_resend_timeout), now); - call->resend_at = jiffies + nsecs_to_jiffies(ktime_to_ns(resend_at)); + resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout); + call->resend_at = jiffies + + nsecs_to_jiffies(ktime_to_ns(ktime_sub(resend_at, now))) + + 1; /* We have to make sure that the calculated jiffies value + * falls at or after the nsec value, or we shall loop + * ceaselessly because the timer times out, but we haven't + * reached the nsec timeout yet. + */ /* Now go through the Tx window and perform the retransmissions. We * have to drop the lock for each send. If an ACK comes in whilst the -- cgit v1.2.1 From 31a1b989508ce64e8ead504884ced01e61870852 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:26 +0100 Subject: rxrpc: Generate a summary of the ACK state for later use Generate a summary of the Tx buffer packet state when an ACK is received for use in a later patch that does congestion management. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 14 ++++++++++++++ net/rxrpc/input.c | 45 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 48 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index cdd35e2b40ba..1a700b6a998b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -540,6 +540,20 @@ struct rxrpc_call { /* transmission-phase ACK management */ rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ + rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ +}; + +/* + * Summary of a new ACK and the changes it made. + */ +struct rxrpc_ack_summary { + u8 ack_reason; + u8 nr_acks; /* Number of ACKs in packet */ + u8 nr_nacks; /* Number of NACKs in packet */ + u8 nr_new_acks; /* Number of new ACKs in packet */ + u8 nr_new_nacks; /* Number of new NACKs in packet */ + u8 nr_rot_new_acks; /* Number of rotated new ACKs */ + bool new_low_nack; /* T if new low NACK found */ }; enum rxrpc_skb_trace { diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index bda11eb2ab2a..dd699667eeef 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -56,12 +56,20 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, /* * Apply a hard ACK by advancing the Tx window. */ -static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) +static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, + struct rxrpc_ack_summary *summary) { struct sk_buff *skb, *list = NULL; int ix; u8 annotation; + if (call->acks_lowest_nak == call->tx_hard_ack) { + call->acks_lowest_nak = to; + } else if (before_eq(call->acks_lowest_nak, to)) { + summary->new_low_nack = true; + call->acks_lowest_nak = to; + } + spin_lock(&call->lock); while (before(call->tx_hard_ack, to)) { @@ -77,6 +85,8 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to) if (annotation & RXRPC_TX_ANNO_LAST) set_bit(RXRPC_CALL_TX_LAST, &call->flags); + if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK) + summary->nr_rot_new_acks++; } spin_unlock(&call->lock); @@ -147,6 +157,7 @@ bad_state: */ static bool rxrpc_receiving_reply(struct rxrpc_call *call) { + struct rxrpc_ack_summary summary = { 0 }; rxrpc_seq_t top = READ_ONCE(call->tx_top); if (call->ackr_reason) { @@ -159,7 +170,7 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) } if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) - rxrpc_rotate_tx_window(call, top); + rxrpc_rotate_tx_window(call, top, &summary); if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { rxrpc_proto_abort("TXL", call, top); return false; @@ -508,7 +519,8 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, * the time the ACK was sent. */ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, - rxrpc_seq_t seq, int nr_acks) + rxrpc_seq_t seq, int nr_acks, + struct rxrpc_ack_summary *summary) { bool resend = false; int ix; @@ -521,14 +533,23 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, annotation &= ~RXRPC_TX_ANNO_MASK; switch (*acks++) { case RXRPC_ACK_TYPE_ACK: + summary->nr_acks++; if (anno_type == RXRPC_TX_ANNO_ACK) continue; + summary->nr_new_acks++; call->rxtx_annotations[ix] = RXRPC_TX_ANNO_ACK | annotation; break; case RXRPC_ACK_TYPE_NACK: + if (!summary->nr_nacks && + call->acks_lowest_nak != seq) { + call->acks_lowest_nak = seq; + summary->new_low_nack = true; + } + summary->nr_nacks++; if (anno_type == RXRPC_TX_ANNO_NAK) continue; + summary->nr_new_nacks++; if (anno_type == RXRPC_TX_ANNO_RETRANS) continue; call->rxtx_annotations[ix] = @@ -558,7 +579,7 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, u16 skew) { - u8 ack_reason; + struct rxrpc_ack_summary summary = { 0 }; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); union { struct rxrpc_ackpacket ack; @@ -581,10 +602,10 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, first_soft_ack = ntohl(buf.ack.firstPacket); hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; - ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? - buf.ack.reason : RXRPC_ACK__INVALID); + summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? + buf.ack.reason : RXRPC_ACK__INVALID); - trace_rxrpc_rx_ack(call, first_soft_ack, ack_reason, nr_acks); + trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks); _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", sp->hdr.serial, @@ -592,7 +613,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, first_soft_ack, ntohl(buf.ack.previousPacket), acked_serial, - rxrpc_ack_names[ack_reason], + rxrpc_ack_names[summary.ack_reason], buf.ack.nAcks); if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) @@ -649,12 +670,13 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, return rxrpc_proto_abort("AKN", call, 0); if (after(hard_ack, call->tx_hard_ack)) - rxrpc_rotate_tx_window(call, hard_ack); + rxrpc_rotate_tx_window(call, hard_ack, &summary); if (nr_acks > 0) { if (skb_copy_bits(skb, sp->offset, buf.acks, nr_acks) < 0) return rxrpc_proto_abort("XSA", call, 0); - rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks); + rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks, + &summary); } if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { @@ -669,11 +691,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, */ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) { + struct rxrpc_ack_summary summary = { 0 }; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); _proto("Rx ACKALL %%%u", sp->hdr.serial); - rxrpc_rotate_tx_window(call, call->tx_top); + rxrpc_rotate_tx_window(call, call->tx_top, &summary); if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) rxrpc_end_tx_phase(call, false, "ETL"); } -- cgit v1.2.1 From 0d967960d39ee89f9e0289692e9f7232f490e55c Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:27 +0100 Subject: rxrpc: Schedule an ACK if the reply to a client call appears overdue If we've sent all the request data in a client call but haven't seen any sign of the reply data yet, schedule an ACK to be sent to the server to find out if the reply data got lost. If the server hasn't yet hard-ACK'd the request data, we send a PING ACK to demand a response to find out whether we need to retransmit. If the server says it has received all of the data, we send an IDLE ACK to tell the server that we haven't received anything in the receive phase as yet. To make this work, a non-immediate PING ACK must carry a delay. I've chosen the same as the IDLE ACK for the moment. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/call_event.c | 1 + net/rxrpc/input.c | 8 ++++++++ net/rxrpc/misc.c | 2 ++ 4 files changed, 13 insertions(+) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1a700b6a998b..b1e697fc9ffb 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -707,7 +707,9 @@ enum rxrpc_timer_trace { extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; enum rxrpc_propose_ack_trace { + rxrpc_propose_ack_client_tx_end, rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_lost_reply, rxrpc_propose_ack_ping_for_params, rxrpc_propose_ack_respond_to_ack, rxrpc_propose_ack_respond_to_ping, diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index d5bf9ce7ec6f..05b94d1acf52 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -100,6 +100,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, expiry = rxrpc_soft_ack_delay; break; + case RXRPC_ACK_PING: case RXRPC_ACK_IDLE: if (rxrpc_idle_ack_delay < expiry) expiry = rxrpc_idle_ack_delay; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index dd699667eeef..0344f4494eb7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -138,6 +138,8 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, write_unlock(&call->state_lock); if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) { + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, true, + rxrpc_propose_ack_client_tx_end); trace_rxrpc_transmit(call, rxrpc_transmit_await_reply); } else { trace_rxrpc_transmit(call, rxrpc_transmit_end); @@ -684,6 +686,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, return; } + if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & + RXRPC_TX_ANNO_LAST && + summary.nr_acks == call->tx_top - hard_ack) + rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, + false, true, + rxrpc_propose_ack_ping_for_lost_reply); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 901c012a2700..a608769343e6 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -198,7 +198,9 @@ const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { }; const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { + [rxrpc_propose_ack_client_tx_end] = "ClTxEnd", [rxrpc_propose_ack_input_data] = "DataIn ", + [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl", [rxrpc_propose_ack_ping_for_params] = "Params ", [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png", -- cgit v1.2.1 From 57494343cb5d66962bb197878fb1cc576177db31 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Sep 2016 18:05:27 +0100 Subject: rxrpc: Implement slow-start Implement RxRPC slow-start, which is similar to RFC 5681 for TCP. A tracepoint is added to log the state of the congestion management algorithm and the decisions it makes. Notes: (1) Since we send fixed-size DATA packets (apart from the final packet in each phase), counters and calculations are in terms of packets rather than bytes. (2) The ACK packet carries the equivalent of TCP SACK. (3) The FLIGHT_SIZE calculation in RFC 5681 doesn't seem particularly suited to SACK of a small number of packets. It seems that, almost inevitably, by the time three 'duplicate' ACKs have been seen, we have narrowed the loss down to one or two missing packets, and the FLIGHT_SIZE calculation ends up as 2. (4) In rxrpc_resend(), if there was no data that apparently needed retransmission, we transmit a PING ACK to ask the peer to tell us what its Rx window state is. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 53 ++++++++++++++- net/rxrpc/call_event.c | 36 ++++++++++- net/rxrpc/call_object.c | 13 ++++ net/rxrpc/conn_event.c | 1 + net/rxrpc/input.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++-- net/rxrpc/misc.c | 19 ++++++ net/rxrpc/output.c | 9 ++- net/rxrpc/sendmsg.c | 7 +- 8 files changed, 294 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b1e697fc9ffb..ca96e547cb9a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -402,6 +402,7 @@ enum rxrpc_call_flag { RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_PINGING, /* Ping in process */ + RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ }; /* @@ -446,6 +447,17 @@ enum rxrpc_call_completion { NR__RXRPC_CALL_COMPLETIONS }; +/* + * Call Tx congestion management modes. + */ +enum rxrpc_congest_mode { + RXRPC_CALL_SLOW_START, + RXRPC_CALL_CONGEST_AVOIDANCE, + RXRPC_CALL_PACKET_LOSS, + RXRPC_CALL_FAST_RETRANSMIT, + NR__RXRPC_CONGEST_MODES +}; + /* * RxRPC call definition * - matched by { connection, call_id } @@ -518,6 +530,20 @@ struct rxrpc_call { * not hard-ACK'd packet follows this. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + + /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS + * is fixed, we keep these numbers in terms of segments (ie. DATA + * packets) rather than bytes. + */ +#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN + u8 cong_cwnd; /* Congestion window size */ + u8 cong_extra; /* Extra to send for congestion management */ + u8 cong_ssthresh; /* Slow-start threshold */ + enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */ + u8 cong_dup_acks; /* Count of ACKs showing missing packets */ + u8 cong_cumul_acks; /* Cumulative ACK count */ + ktime_t cong_tstamp; /* Last time cwnd was changed */ + rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not * consumed packet follows this. */ @@ -539,12 +565,13 @@ struct rxrpc_call { ktime_t ackr_ping_time; /* Time last ping sent */ /* transmission-phase ACK management */ + ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ }; /* - * Summary of a new ACK and the changes it made. + * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { u8 ack_reason; @@ -554,6 +581,14 @@ struct rxrpc_ack_summary { u8 nr_new_nacks; /* Number of new NACKs in packet */ u8 nr_rot_new_acks; /* Number of rotated new ACKs */ bool new_low_nack; /* T if new low NACK found */ + bool retrans_timeo; /* T if reTx due to timeout happened */ + u8 flight_size; /* Number of unreceived transmissions */ + /* Place to stash values for tracing */ + enum rxrpc_congest_mode mode:8; + u8 cwnd; + u8 ssthresh; + u8 dup_acks; + u8 cumulative_acks; }; enum rxrpc_skb_trace { @@ -709,6 +744,7 @@ extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; enum rxrpc_propose_ack_trace { rxrpc_propose_ack_client_tx_end, rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_lost_ack, rxrpc_propose_ack_ping_for_lost_reply, rxrpc_propose_ack_ping_for_params, rxrpc_propose_ack_respond_to_ack, @@ -729,6 +765,21 @@ enum rxrpc_propose_ack_outcome { extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8]; extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes]; +enum rxrpc_congest_change { + rxrpc_cong_begin_retransmission, + rxrpc_cong_cleared_nacks, + rxrpc_cong_new_low_nack, + rxrpc_cong_no_change, + rxrpc_cong_progress, + rxrpc_cong_retransmit_again, + rxrpc_cong_rtt_window_end, + rxrpc_cong_saw_nack, + rxrpc_congest__nr_change +}; + +extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10]; +extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9]; + extern const char *const rxrpc_pkts[]; extern const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 05b94d1acf52..0e8478012212 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -146,6 +146,14 @@ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, spin_unlock_bh(&call->lock); } +/* + * Handle congestion being detected by the retransmit timeout. + */ +static void rxrpc_congestion_timeout(struct rxrpc_call *call) +{ + set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags); +} + /* * Perform retransmission of NAK'd and unack'd packets. */ @@ -154,9 +162,9 @@ static void rxrpc_resend(struct rxrpc_call *call) struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t cursor, seq, top; - ktime_t now = ktime_get_real(), max_age, oldest, resend_at; + ktime_t now = ktime_get_real(), max_age, oldest, resend_at, ack_ts; int ix; - u8 annotation, anno_type; + u8 annotation, anno_type, retrans = 0, unacked = 0; _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); @@ -193,10 +201,13 @@ static void rxrpc_resend(struct rxrpc_call *call) oldest = skb->tstamp; continue; } + if (!(annotation & RXRPC_TX_ANNO_RESENT)) + unacked++; } /* Okay, we need to retransmit a packet. */ call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; + retrans++; trace_rxrpc_retransmit(call, seq, annotation | anno_type, ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } @@ -210,6 +221,25 @@ static void rxrpc_resend(struct rxrpc_call *call) * reached the nsec timeout yet. */ + if (unacked) + rxrpc_congestion_timeout(call); + + /* If there was nothing that needed retransmission then it's likely + * that an ACK got lost somewhere. Send a ping to find out instead of + * retransmitting data. + */ + if (!retrans) { + rxrpc_set_timer(call, rxrpc_timer_set_for_resend); + spin_unlock_bh(&call->lock); + ack_ts = ktime_sub(now, call->acks_latest_ts); + if (ktime_to_ns(ack_ts) < call->peer->rtt) + goto out; + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ack_ping_for_lost_ack); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + goto out; + } + /* Now go through the Tx window and perform the retransmissions. We * have to drop the lock for each send. If an ACK comes in whilst the * lock is dropped, it may clear some of the retransmission markers for @@ -260,6 +290,7 @@ static void rxrpc_resend(struct rxrpc_call *call) out_unlock: spin_unlock_bh(&call->lock); +out: _leave(""); } @@ -293,6 +324,7 @@ recheck_state: if (time_after_eq(now, call->expire_at)) { rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); set_bit(RXRPC_CALL_EV_ABORT, &call->events); + goto recheck_state; } if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index a53f4c2c0025..d4b3293b78fa 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -160,6 +160,14 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; call->rx_expect_next = 1; + + if (RXRPC_TX_SMSS > 2190) + call->cong_cwnd = 2; + else if (RXRPC_TX_SMSS > 1095) + call->cong_cwnd = 3; + else + call->cong_cwnd = 4; + call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1; return call; nomem_2: @@ -176,6 +184,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; + ktime_t now; _enter(""); @@ -185,6 +194,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; call->service_id = srx->srx_service; call->tx_phase = true; + now = ktime_get_real(); + call->acks_latest_ts = now; + call->cong_tstamp = now; _leave(" = %p", call); return call; @@ -325,6 +337,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->state = RXRPC_CALL_SERVER_ACCEPTING; if (sp->hdr.securityIndex > 0) call->state = RXRPC_CALL_SERVER_SECURING; + call->cong_tstamp = skb->tstamp; /* Set the channel for this call. We don't get channel_lock as we're * only defending against the data_ready handler (which we're called diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a1cf1ec5f29e..37609ce89f52 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -97,6 +97,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.info.maxMTU = htonl(mtu); pkt.info.rwind = htonl(rxrpc_rx_window_size); pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); + pkt.whdr.flags |= RXRPC_SLOW_START_OK; len += sizeof(pkt.ack) + sizeof(pkt.info); break; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 0344f4494eb7..094720dd1eaf 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -36,6 +36,166 @@ static void rxrpc_proto_abort(const char *why, } } +/* + * Do TCP-style congestion management [RFC 5681]. + */ +static void rxrpc_congestion_management(struct rxrpc_call *call, + struct sk_buff *skb, + struct rxrpc_ack_summary *summary) +{ + enum rxrpc_congest_change change = rxrpc_cong_no_change; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int cumulative_acks = call->cong_cumul_acks; + unsigned int cwnd = call->cong_cwnd; + bool resend = false; + + summary->flight_size = + (call->tx_top - call->tx_hard_ack) - summary->nr_acks; + + if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { + summary->retrans_timeo = true; + call->cong_ssthresh = max_t(unsigned int, + summary->flight_size / 2, 2); + cwnd = 1; + if (cwnd > call->cong_ssthresh && + call->cong_mode == RXRPC_CALL_SLOW_START) { + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_tstamp = skb->tstamp; + cumulative_acks = 0; + } + } + + cumulative_acks += summary->nr_new_acks; + cumulative_acks += summary->nr_rot_new_acks; + if (cumulative_acks > 255) + cumulative_acks = 255; + + summary->mode = call->cong_mode; + summary->cwnd = call->cong_cwnd; + summary->ssthresh = call->cong_ssthresh; + summary->cumulative_acks = cumulative_acks; + summary->dup_acks = call->cong_dup_acks; + + switch (call->cong_mode) { + case RXRPC_CALL_SLOW_START: + if (summary->nr_nacks > 0) + goto packet_loss_detected; + if (summary->cumulative_acks > 0) + cwnd += 1; + if (cwnd > call->cong_ssthresh) { + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_tstamp = skb->tstamp; + } + goto out; + + case RXRPC_CALL_CONGEST_AVOIDANCE: + if (summary->nr_nacks > 0) + goto packet_loss_detected; + + /* We analyse the number of packets that get ACK'd per RTT + * period and increase the window if we managed to fill it. + */ + if (call->peer->rtt_usage == 0) + goto out; + if (ktime_before(skb->tstamp, + ktime_add_ns(call->cong_tstamp, + call->peer->rtt))) + goto out_no_clear_ca; + change = rxrpc_cong_rtt_window_end; + call->cong_tstamp = skb->tstamp; + if (cumulative_acks >= cwnd) + cwnd++; + goto out; + + case RXRPC_CALL_PACKET_LOSS: + if (summary->nr_nacks == 0) + goto resume_normality; + + if (summary->new_low_nack) { + change = rxrpc_cong_new_low_nack; + call->cong_dup_acks = 1; + if (call->cong_extra > 1) + call->cong_extra = 1; + goto send_extra_data; + } + + call->cong_dup_acks++; + if (call->cong_dup_acks < 3) + goto send_extra_data; + + change = rxrpc_cong_begin_retransmission; + call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT; + call->cong_ssthresh = max_t(unsigned int, + summary->flight_size / 2, 2); + cwnd = call->cong_ssthresh + 3; + call->cong_extra = 0; + call->cong_dup_acks = 0; + resend = true; + goto out; + + case RXRPC_CALL_FAST_RETRANSMIT: + if (!summary->new_low_nack) { + if (summary->nr_new_acks == 0) + cwnd += 1; + call->cong_dup_acks++; + if (call->cong_dup_acks == 2) { + change = rxrpc_cong_retransmit_again; + call->cong_dup_acks = 0; + resend = true; + } + } else { + change = rxrpc_cong_progress; + cwnd = call->cong_ssthresh; + if (summary->nr_nacks == 0) + goto resume_normality; + } + goto out; + + default: + BUG(); + goto out; + } + +resume_normality: + change = rxrpc_cong_cleared_nacks; + call->cong_dup_acks = 0; + call->cong_extra = 0; + call->cong_tstamp = skb->tstamp; + if (cwnd <= call->cong_ssthresh) + call->cong_mode = RXRPC_CALL_SLOW_START; + else + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; +out: + cumulative_acks = 0; +out_no_clear_ca: + if (cwnd >= RXRPC_RXTX_BUFF_SIZE - 1) + cwnd = RXRPC_RXTX_BUFF_SIZE - 1; + call->cong_cwnd = cwnd; + call->cong_cumul_acks = cumulative_acks; + trace_rxrpc_congest(call, summary, sp->hdr.serial, change); + if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); + return; + +packet_loss_detected: + change = rxrpc_cong_saw_nack; + call->cong_mode = RXRPC_CALL_PACKET_LOSS; + call->cong_dup_acks = 0; + goto send_extra_data; + +send_extra_data: + /* Send some previously unsent DATA if we have some to advance the ACK + * state. + */ + if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & + RXRPC_TX_ANNO_LAST || + summary->nr_acks != call->tx_top - call->tx_hard_ack) { + call->cong_extra++; + wake_up(&call->waitq); + } + goto out_no_clear_ca; +} + /* * Ping the other end to fill our RTT cache and to retrieve the rwind * and MTU parameters. @@ -524,7 +684,6 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, rxrpc_seq_t seq, int nr_acks, struct rxrpc_ack_summary *summary) { - bool resend = false; int ix; u8 annotation, anno_type; @@ -556,16 +715,11 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, continue; call->rxtx_annotations[ix] = RXRPC_TX_ANNO_NAK | annotation; - resend = true; break; default: return rxrpc_proto_abort("SFT", call, 0); } } - - if (resend && - !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) - rxrpc_queue_call(call); } /* @@ -663,6 +817,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, sp->hdr.serial, call->acks_latest); return; } + call->acks_latest_ts = skb->tstamp; call->acks_latest = sp->hdr.serial; if (before(hard_ack, call->tx_hard_ack) || @@ -692,6 +847,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, false, true, rxrpc_propose_ack_ping_for_lost_reply); + + return rxrpc_congestion_management(call, skb, &summary); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index a608769343e6..aedb8978226d 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -200,6 +200,7 @@ const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { [rxrpc_propose_ack_client_tx_end] = "ClTxEnd", [rxrpc_propose_ack_input_data] = "DataIn ", + [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck", [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl", [rxrpc_propose_ack_ping_for_params] = "Params ", [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", @@ -214,3 +215,21 @@ const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = { [rxrpc_propose_ack_update] = " Update", [rxrpc_propose_ack_subsume] = " Subsume", }; + +const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = { + [RXRPC_CALL_SLOW_START] = "SlowStart", + [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid", + [RXRPC_CALL_PACKET_LOSS] = "PktLoss ", + [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ", +}; + +const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = { + [rxrpc_cong_begin_retransmission] = " Retrans", + [rxrpc_cong_cleared_nacks] = " Cleared", + [rxrpc_cong_new_low_nack] = " NewLowN", + [rxrpc_cong_no_change] = "", + [rxrpc_cong_progress] = " Progres", + [rxrpc_cong_retransmit_again] = " ReTxAgn", + [rxrpc_cong_rtt_window_end] = " RttWinE", + [rxrpc_cong_saw_nack] = " SawNack", +}; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 3eb01445e814..cf43a715685e 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -157,6 +157,8 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) spin_unlock_bh(&call->lock); + pkt->whdr.flags |= RXRPC_SLOW_START_OK; + iov[0].iov_len += sizeof(pkt->ack) + n; iov[1].iov_base = &pkt->ackinfo; iov[1].iov_len = sizeof(pkt->ackinfo); @@ -276,8 +278,11 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) msg.msg_controllen = 0; msg.msg_flags = 0; - /* If our RTT cache needs working on, request an ACK. */ - if ((call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || + /* If our RTT cache needs working on, request an ACK. Also request + * ACKs if a DATA packet appears to have been lost. + */ + if (call->cong_mode == RXRPC_CALL_FAST_RETRANSMIT || + (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) whdr.flags |= RXRPC_REQUEST_ACK; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 99939372b5a4..1f8040d82395 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -45,7 +45,9 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, for (;;) { set_current_state(TASK_INTERRUPTIBLE); ret = 0; - if (call->tx_top - call->tx_hard_ack < call->tx_winsize) + if (call->tx_top - call->tx_hard_ack < + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra)) break; if (call->state >= RXRPC_CALL_COMPLETE) { ret = -call->error; @@ -203,7 +205,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); if (call->tx_top - call->tx_hard_ack >= - call->tx_winsize) { + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra)) { ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; -- cgit v1.2.1 From c2675de447f8238e7e2e7eced78fa671d42a9a7e Mon Sep 17 00:00:00 2001 From: Lance Richardson Date: Sat, 24 Sep 2016 14:01:04 -0400 Subject: gre: use nla_get_be32() to extract flowinfo Eliminate a sparse endianness mismatch warning, use nla_get_be32() to extract a __be32 value instead of nla_get_u32(). Signed-off-by: Lance Richardson Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 397e1ed3daa3..4ce74f86291b 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1239,7 +1239,7 @@ static void ip6gre_netlink_parms(struct nlattr *data[], parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]); if (data[IFLA_GRE_FLOWINFO]) - parms->flowinfo = nla_get_u32(data[IFLA_GRE_FLOWINFO]); + parms->flowinfo = nla_get_be32(data[IFLA_GRE_FLOWINFO]); if (data[IFLA_GRE_FLAGS]) parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]); -- cgit v1.2.1 From e3b37f11e6e4e6b6f02cc762f182ce233d2c1c9d Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Wed, 21 Sep 2016 11:35:07 -0400 Subject: netfilter: replace list_head with single linked list The netfilter hook list never uses the prev pointer, and so can be trimmed to be a simple singly-linked list. In addition to having a more light weight structure for hook traversal, struct net becomes 5568 bytes (down from 6400) and struct net_device becomes 2176 bytes (down from 2240). Signed-off-by: Aaron Conole Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 19 ++---- net/netfilter/core.c | 141 ++++++++++++++++++++++++++-------------- net/netfilter/nf_internals.h | 10 +-- net/netfilter/nf_queue.c | 18 ++--- net/netfilter/nfnetlink_queue.c | 8 ++- 5 files changed, 118 insertions(+), 78 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 6029af47377d..2fe9345c1407 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1002,28 +1002,21 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { - struct nf_hook_ops *elem; + struct nf_hook_entry *elem; struct nf_hook_state state; - struct list_head *head; int ret; - head = &net->nf.hooks[NFPROTO_BRIDGE][hook]; + elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); - list_for_each_entry_rcu(elem, head, list) { - struct nf_hook_ops *next; + while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF)) + elem = rcu_dereference(elem->next); - next = list_entry_rcu(list_next_rcu(&elem->list), - struct nf_hook_ops, list); - if (next->priority <= NF_BR_PRI_BRNF) - continue; - } - - if (&elem->list == head) + if (!elem) return okfn(net, sk, skb); /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); - nf_hook_state_init(&state, head, hook, NF_BR_PRI_BRNF + 1, + nf_hook_state_init(&state, elem, hook, NF_BR_PRI_BRNF + 1, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 67b74287535d..72fc514ec676 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -61,33 +62,50 @@ EXPORT_SYMBOL(nf_hooks_needed); #endif static DEFINE_MUTEX(nf_hook_mutex); +#define nf_entry_dereference(e) \ + rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex)) -static struct list_head *nf_find_hook_list(struct net *net, - const struct nf_hook_ops *reg) +static struct nf_hook_entry *nf_hook_entry_head(struct net *net, + const struct nf_hook_ops *reg) { - struct list_head *hook_list = NULL; + struct nf_hook_entry *hook_head = NULL; if (reg->pf != NFPROTO_NETDEV) - hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; + hook_head = nf_entry_dereference(net->nf.hooks[reg->pf] + [reg->hooknum]); else if (reg->hooknum == NF_NETDEV_INGRESS) { #ifdef CONFIG_NETFILTER_INGRESS if (reg->dev && dev_net(reg->dev) == net) - hook_list = ®->dev->nf_hooks_ingress; + hook_head = + nf_entry_dereference( + reg->dev->nf_hooks_ingress); #endif } - return hook_list; + return hook_head; } -struct nf_hook_entry { - const struct nf_hook_ops *orig_ops; - struct nf_hook_ops ops; -}; +/* must hold nf_hook_mutex */ +static void nf_set_hooks_head(struct net *net, const struct nf_hook_ops *reg, + struct nf_hook_entry *entry) +{ + switch (reg->pf) { + case NFPROTO_NETDEV: + /* We already checked in nf_register_net_hook() that this is + * used from ingress. + */ + rcu_assign_pointer(reg->dev->nf_hooks_ingress, entry); + break; + default: + rcu_assign_pointer(net->nf.hooks[reg->pf][reg->hooknum], + entry); + break; + } +} int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *hook_list; + struct nf_hook_entry *hooks_entry; struct nf_hook_entry *entry; - struct nf_hook_ops *elem; if (reg->pf == NFPROTO_NETDEV && (reg->hooknum != NF_NETDEV_INGRESS || @@ -100,19 +118,30 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) entry->orig_ops = reg; entry->ops = *reg; + entry->next = NULL; + + mutex_lock(&nf_hook_mutex); + hooks_entry = nf_hook_entry_head(net, reg); - hook_list = nf_find_hook_list(net, reg); - if (!hook_list) { - kfree(entry); - return -ENOENT; + if (hooks_entry && hooks_entry->orig_ops->priority > reg->priority) { + /* This is the case where we need to insert at the head */ + entry->next = hooks_entry; + hooks_entry = NULL; } - mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, hook_list, list) { - if (reg->priority < elem->priority) - break; + while (hooks_entry && + reg->priority >= hooks_entry->orig_ops->priority && + nf_entry_dereference(hooks_entry->next)) { + hooks_entry = nf_entry_dereference(hooks_entry->next); + } + + if (hooks_entry) { + entry->next = nf_entry_dereference(hooks_entry->next); + rcu_assign_pointer(hooks_entry->next, entry); + } else { + nf_set_hooks_head(net, reg, entry); } - list_add_rcu(&entry->ops.list, elem->list.prev); + mutex_unlock(&nf_hook_mutex); #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) @@ -127,24 +156,33 @@ EXPORT_SYMBOL(nf_register_net_hook); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *hook_list; - struct nf_hook_entry *entry; - struct nf_hook_ops *elem; - - hook_list = nf_find_hook_list(net, reg); - if (!hook_list) - return; + struct nf_hook_entry *hooks_entry; mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, hook_list, list) { - entry = container_of(elem, struct nf_hook_entry, ops); - if (entry->orig_ops == reg) { - list_del_rcu(&entry->ops.list); - break; + hooks_entry = nf_hook_entry_head(net, reg); + if (hooks_entry->orig_ops == reg) { + nf_set_hooks_head(net, reg, + nf_entry_dereference(hooks_entry->next)); + goto unlock; + } + while (hooks_entry && nf_entry_dereference(hooks_entry->next)) { + struct nf_hook_entry *next = + nf_entry_dereference(hooks_entry->next); + struct nf_hook_entry *nnext; + + if (next->orig_ops != reg) { + hooks_entry = next; + continue; } + nnext = nf_entry_dereference(next->next); + rcu_assign_pointer(hooks_entry->next, nnext); + hooks_entry = next; + break; } + +unlock: mutex_unlock(&nf_hook_mutex); - if (&elem->list == hook_list) { + if (!hooks_entry) { WARN(1, "nf_unregister_net_hook: hook not found!\n"); return; } @@ -156,10 +194,10 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); - nf_queue_nf_hook_drop(net, &entry->ops); + nf_queue_nf_hook_drop(net, hooks_entry); /* other cpu might still process nfqueue verdict that used reg */ synchronize_net(); - kfree(entry); + kfree(hooks_entry); } EXPORT_SYMBOL(nf_unregister_net_hook); @@ -258,10 +296,9 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) } EXPORT_SYMBOL(nf_unregister_hooks); -unsigned int nf_iterate(struct list_head *head, - struct sk_buff *skb, +unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_ops **elemp) + struct nf_hook_entry **entryp) { unsigned int verdict; @@ -269,20 +306,23 @@ unsigned int nf_iterate(struct list_head *head, * The caller must not block between calls to this * function because of risk of continuing from deleted element. */ - list_for_each_entry_continue_rcu((*elemp), head, list) { - if (state->thresh > (*elemp)->priority) + while (*entryp) { + if (state->thresh > (*entryp)->ops.priority) { + *entryp = rcu_dereference((*entryp)->next); continue; + } /* Optimization: we don't need to hold module reference here, since function can't sleep. --RR */ repeat: - verdict = (*elemp)->hook((*elemp)->priv, skb, state); + verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) { NFDEBUG("Evil return from %p(%u).\n", - (*elemp)->hook, state->hook); + (*entryp)->ops.hook, state->hook); + *entryp = rcu_dereference((*entryp)->next); continue; } #endif @@ -290,6 +330,7 @@ repeat: return verdict; goto repeat; } + *entryp = rcu_dereference((*entryp)->next); } return NF_ACCEPT; } @@ -299,13 +340,13 @@ repeat: * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) { - struct nf_hook_ops *elem; + struct nf_hook_entry *entry; unsigned int verdict; int ret = 0; - elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list); + entry = rcu_dereference(state->hook_entries); next_hook: - verdict = nf_iterate(state->hook_list, skb, state, &elem); + verdict = nf_iterate(skb, state, &entry); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { @@ -314,8 +355,10 @@ next_hook: if (ret == 0) ret = -EPERM; } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { - int err = nf_queue(skb, elem, state, - verdict >> NF_VERDICT_QBITS); + int err; + + RCU_INIT_POINTER(state->hook_entries, entry); + err = nf_queue(skb, state, verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) @@ -442,7 +485,7 @@ static int __net_init netfilter_net_init(struct net *net) for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&net->nf.hooks[i][h]); + RCU_INIT_POINTER(net->nf.hooks[i][h], NULL); } #ifdef CONFIG_PROC_FS diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 065522564ac6..e0adb5959342 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -13,13 +13,13 @@ /* core.c */ -unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, - struct nf_hook_state *state, struct nf_hook_ops **elemp); +unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, + struct nf_hook_entry **entryp); /* nf_queue.c */ -int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, - struct nf_hook_state *state, unsigned int queuenum); -void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops); +int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, + unsigned int queuenum); +void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry); int __init netfilter_queue_init(void); /* nf_log.c */ diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index b19ad20a705c..96964a0070e1 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -96,14 +96,14 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); -void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) +void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry) { const struct nf_queue_handler *qh; rcu_read_lock(); qh = rcu_dereference(net->nf.queue_handler); if (qh) - qh->nf_hook_drop(net, ops); + qh->nf_hook_drop(net, entry); rcu_read_unlock(); } @@ -112,7 +112,6 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) * through nf_reinject(). */ int nf_queue(struct sk_buff *skb, - struct nf_hook_ops *elem, struct nf_hook_state *state, unsigned int queuenum) { @@ -141,7 +140,6 @@ int nf_queue(struct sk_buff *skb, *entry = (struct nf_queue_entry) { .skb = skb, - .elem = elem, .state = *state, .size = sizeof(*entry) + afinfo->route_key_size, }; @@ -165,11 +163,15 @@ err: void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { + struct nf_hook_entry *hook_entry; struct sk_buff *skb = entry->skb; - struct nf_hook_ops *elem = entry->elem; const struct nf_afinfo *afinfo; + struct nf_hook_ops *elem; int err; + hook_entry = rcu_dereference(entry->state.hook_entries); + elem = &hook_entry->ops; + nf_queue_entry_release_refs(entry); /* Continue traversal iff userspace said ok... */ @@ -186,8 +188,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) if (verdict == NF_ACCEPT) { next_hook: - verdict = nf_iterate(entry->state.hook_list, - skb, &entry->state, &elem); + verdict = nf_iterate(skb, &entry->state, &hook_entry); } switch (verdict & NF_VERDICT_MASK) { @@ -198,7 +199,8 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) local_bh_enable(); break; case NF_QUEUE: - err = nf_queue(skb, elem, &entry->state, + RCU_INIT_POINTER(entry->state.hook_entries, hook_entry); + err = nf_queue(skb, &entry->state, verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ESRCH && diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 7caa8b082c41..af832c526048 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -917,12 +917,14 @@ static struct notifier_block nfqnl_dev_notifier = { .notifier_call = nfqnl_rcv_dev_event, }; -static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr) +static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long entry_ptr) { - return entry->elem == (struct nf_hook_ops *)ops_ptr; + return rcu_access_pointer(entry->state.hook_entries) == + (struct nf_hook_entry *)entry_ptr; } -static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook) +static void nfqnl_nf_hook_drop(struct net *net, + const struct nf_hook_entry *hook) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); int i; -- cgit v1.2.1 From 8d11350f5f33378efc5f905bee325f3e76d6bcca Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 22 Sep 2016 14:53:53 +0800 Subject: netfilter: seqadj: Fix the wrong ack adjust for the RST packet without ack It is valid that the TCP RST packet which does not set ack flag, and bytes of ack number are zero. But current seqadj codes would adjust the "0" ack to invalid ack number. Actually seqadj need to check the ack flag before adjust it for these RST packets. The following is my test case client is 10.26.98.245, and add one iptable rule: iptables -I INPUT -p tcp --sport 12345 -m connbytes --connbytes 2: --connbytes-dir reply --connbytes-mode packets -j REJECT --reject-with tcp-reset This iptables rule could generate on TCP RST without ack flag. server:10.172.135.55 Enable the synproxy with seqadjust by the following iptables rules iptables -t raw -A PREROUTING -i eth0 -p tcp -d 10.172.135.55 --dport 12345 -m tcp --syn -j CT --notrack iptables -A INPUT -i eth0 -p tcp -d 10.172.135.55 --dport 12345 -m conntrack --ctstate INVALID,UNTRACKED -j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460 iptables -A OUTPUT -o eth0 -p tcp -s 10.172.135.55 --sport 12345 -m conntrack --ctstate INVALID,UNTRACKED -m tcp --tcp-flags SYN,RST,ACK SYN,ACK -j ACCEPT The following is my test result. 1. packet trace on client root@routers:/tmp# tcpdump -i eth0 tcp port 12345 -n tcpdump: verbose output suppressed, use -v or -vv for full protocol decode listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes IP 10.26.98.245.45154 > 10.172.135.55.12345: Flags [S], seq 3695959829, win 29200, options [mss 1460,sackOK,TS val 452367884 ecr 0,nop,wscale 7], length 0 IP 10.172.135.55.12345 > 10.26.98.245.45154: Flags [S.], seq 546723266, ack 3695959830, win 0, options [mss 1460,sackOK,TS val 15643479 ecr 452367884, nop,wscale 7], length 0 IP 10.26.98.245.45154 > 10.172.135.55.12345: Flags [.], ack 1, win 229, options [nop,nop,TS val 452367885 ecr 15643479], length 0 IP 10.172.135.55.12345 > 10.26.98.245.45154: Flags [.], ack 1, win 226, options [nop,nop,TS val 15643479 ecr 452367885], length 0 IP 10.26.98.245.45154 > 10.172.135.55.12345: Flags [R], seq 3695959830, win 0, length 0 2. seqadj log on server [62873.867319] Adjusting sequence number from 602341895->546723267, ack from 3695959830->3695959830 [62873.867644] Adjusting sequence number from 602341895->546723267, ack from 3695959830->3695959830 [62873.869040] Adjusting sequence number from 3695959830->3695959830, ack from 0->55618628 To summarize, it is clear that the seqadj codes adjust the 0 ack when receive one TCP RST packet without ack. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_seqadj.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index dff0f0cc59e4..ef7063eced7c 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -169,7 +169,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb, s32 seqoff, ackoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way, *other_way; - int res; + int res = 1; this_way = &seqadj->seq[dir]; other_way = &seqadj->seq[!dir]; @@ -184,27 +184,31 @@ int nf_ct_seq_adjust(struct sk_buff *skb, else seqoff = this_way->offset_before; + newseq = htonl(ntohl(tcph->seq) + seqoff); + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); + pr_debug("Adjusting sequence number from %u->%u\n", + ntohl(tcph->seq), ntohl(newseq)); + tcph->seq = newseq; + + if (!tcph->ack) + goto out; + if (after(ntohl(tcph->ack_seq) - other_way->offset_before, other_way->correction_pos)) ackoff = other_way->offset_after; else ackoff = other_way->offset_before; - newseq = htonl(ntohl(tcph->seq) + seqoff); newack = htonl(ntohl(tcph->ack_seq) - ackoff); - - inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, false); - - pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", + pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), ntohl(newack)); - - tcph->seq = newseq; tcph->ack_seq = newack; res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo); +out: spin_unlock_bh(&ct->lock); return res; -- cgit v1.2.1 From d767ff2c84f19be1aa403762f34eebbb403caf6d Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Thu, 22 Sep 2016 22:28:51 +0800 Subject: netfilter: nft_ct: unnecessary to require dir when use ct l3proto/protocol Currently, if the user want to match ct l3proto, we must specify the direction, for example: # nft add rule filter input ct original l3proto ipv4 ^^^^^^^^ Otherwise, error message will be reported: # nft add rule filter input ct l3proto ipv4 nft add rule filter input ct l3proto ipv4 :1:1-38: Error: Could not process rule: Invalid argument add rule filter input ct l3proto ipv4 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Actually, there's no need to require NFTA_CT_DIRECTION attr, because ct l3proto and protocol are unrelated to direction. And for compatibility, even if the user specify the NFTA_CT_DIRECTION attr, do not report error, just skip it. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 51e180f2a003..825fbbc62f48 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -128,15 +128,18 @@ static void nft_ct_get_eval(const struct nft_expr *expr, memcpy(dest, &count, sizeof(count)); return; } + case NFT_CT_L3PROTOCOL: + *dest = nf_ct_l3num(ct); + return; + case NFT_CT_PROTOCOL: + *dest = nf_ct_protonum(ct); + return; default: break; } tuple = &ct->tuplehash[priv->dir].tuple; switch (priv->key) { - case NFT_CT_L3PROTOCOL: - *dest = nf_ct_l3num(ct); - return; case NFT_CT_SRC: memcpy(dest, tuple->src.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); @@ -145,9 +148,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr, memcpy(dest, tuple->dst.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; - case NFT_CT_PROTOCOL: - *dest = nf_ct_protonum(ct); - return; case NFT_CT_PROTO_SRC: *dest = (__force __u16)tuple->src.u.all; return; @@ -283,8 +283,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: - if (tb[NFTA_CT_DIRECTION] == NULL) - return -EINVAL; + /* For compatibility, do not report error if NFTA_CT_DIRECTION + * attribute is specified. + */ len = sizeof(u8); break; case NFT_CT_SRC: @@ -432,8 +433,6 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; switch (priv->key) { - case NFT_CT_L3PROTOCOL: - case NFT_CT_PROTOCOL: case NFT_CT_SRC: case NFT_CT_DST: case NFT_CT_PROTO_SRC: -- cgit v1.2.1 From 7bfdde7045ad54d9fdccac70baffd094d9de73f8 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Thu, 22 Sep 2016 22:28:52 +0800 Subject: netfilter: nft_ct: report error if mark and dir specified simultaneously NFT_CT_MARK is unrelated to direction, so if NFTA_CT_DIRECTION attr is specified, report EINVAL to the userspace. This validation check was already done at nft_ct_get_init, but we missed it in nft_ct_set_init. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 825fbbc62f48..d7b0d171172a 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -364,6 +364,8 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; len = FIELD_SIZEOF(struct nf_conn, mark); break; #endif -- cgit v1.2.1 From 0dc60a4546fefc6dc9f54abf60beeeb3501726fa Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Thu, 22 Sep 2016 12:42:46 -0400 Subject: netfilter: xt_hashlimit: Prepare for revision 2 I am planning to add a revision 2 for the hashlimit xtables module to support higher packets per second rates. This patch renames all the functions and variables related to revision 1 by adding _v1 at the end of the names. Signed-off-by: Vishwanath Pai Signed-off-by: Joshua Hunt Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 61 ++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 178696852bde..e93d9e0a3f35 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -56,7 +56,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net) } /* need to declare this at the top */ -static const struct file_operations dl_file_ops; +static const struct file_operations dl_file_ops_v1; /* hash table crap */ struct dsthash_dst { @@ -215,8 +215,8 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) } static void htable_gc(struct work_struct *work); -static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, - u_int8_t family) +static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, + u_int8_t family) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); struct xt_hashlimit_htable *hinfo; @@ -265,7 +265,7 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, hinfo->pde = proc_create_data(minfo->name, 0, (family == NFPROTO_IPV4) ? hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, - &dl_file_ops, hinfo); + &dl_file_ops_v1, hinfo); if (hinfo->pde == NULL) { kfree(hinfo->name); vfree(hinfo); @@ -398,7 +398,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) (slowest userspace tool allows), which means CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie. */ -#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) +#define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24)) /* Repeated shift and or gives us all 1s, final shift and add 1 gives * us the power of 2 below the theoretical max, so GCC simply does a @@ -410,7 +410,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) -#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) +#define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1) /* in byte mode, the lowest possible rate is one packet/second. * credit_cap is used as a counter that tells us how many times we can @@ -428,11 +428,12 @@ static u32 xt_hashlimit_len_to_chunks(u32 len) static u32 user2credits(u32 user) { /* If multiplying would overflow... */ - if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1)) /* Divide first. */ - return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + return (user / XT_HASHLIMIT_SCALE) *\ + HZ * CREDITS_PER_JIFFY_v1; - return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE; + return (user * HZ * CREDITS_PER_JIFFY_v1) / XT_HASHLIMIT_SCALE; } static u32 user2credits_byte(u32 user) @@ -461,7 +462,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) return; } } else { - dh->rateinfo.credit += delta * CREDITS_PER_JIFFY; + dh->rateinfo.credit += delta * CREDITS_PER_JIFFY_v1; cap = dh->rateinfo.credit_cap; } if (dh->rateinfo.credit > cap) @@ -603,7 +604,7 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh) } static bool -hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; struct xt_hashlimit_htable *hinfo = info->hinfo; @@ -660,7 +661,7 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; } -static int hashlimit_mt_check(const struct xt_mtchk_param *par) +static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) { struct net *net = par->net; struct xt_hashlimit_mtinfo1 *info = par->matchinfo; @@ -701,7 +702,7 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par) mutex_lock(&hashlimit_mutex); info->hinfo = htable_find_get(net, info->name, par->family); if (info->hinfo == NULL) { - ret = htable_create(net, info, par->family); + ret = htable_create_v1(net, info, par->family); if (ret < 0) { mutex_unlock(&hashlimit_mutex); return ret; @@ -711,7 +712,7 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par) return 0; } -static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) +static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par) { const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; @@ -723,10 +724,10 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .name = "hashlimit", .revision = 1, .family = NFPROTO_IPV4, - .match = hashlimit_mt, + .match = hashlimit_mt_v1, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), - .checkentry = hashlimit_mt_check, - .destroy = hashlimit_mt_destroy, + .checkentry = hashlimit_mt_check_v1, + .destroy = hashlimit_mt_destroy_v1, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) @@ -734,10 +735,10 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .name = "hashlimit", .revision = 1, .family = NFPROTO_IPV6, - .match = hashlimit_mt, + .match = hashlimit_mt_v1, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), - .checkentry = hashlimit_mt_check, - .destroy = hashlimit_mt_destroy, + .checkentry = hashlimit_mt_check_v1, + .destroy = hashlimit_mt_destroy_v1, .me = THIS_MODULE, }, #endif @@ -786,8 +787,8 @@ static void dl_seq_stop(struct seq_file *s, void *v) spin_unlock_bh(&htable->lock); } -static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, - struct seq_file *s) +static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) { const struct xt_hashlimit_htable *ht = s->private; @@ -825,7 +826,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, return seq_has_overflowed(s); } -static int dl_seq_show(struct seq_file *s, void *v) +static int dl_seq_show_v1(struct seq_file *s, void *v) { struct xt_hashlimit_htable *htable = s->private; unsigned int *bucket = (unsigned int *)v; @@ -833,22 +834,22 @@ static int dl_seq_show(struct seq_file *s, void *v) if (!hlist_empty(&htable->hash[*bucket])) { hlist_for_each_entry(ent, &htable->hash[*bucket], node) - if (dl_seq_real_show(ent, htable->family, s)) + if (dl_seq_real_show_v1(ent, htable->family, s)) return -1; } return 0; } -static const struct seq_operations dl_seq_ops = { +static const struct seq_operations dl_seq_ops_v1 = { .start = dl_seq_start, .next = dl_seq_next, .stop = dl_seq_stop, - .show = dl_seq_show + .show = dl_seq_show_v1 }; -static int dl_proc_open(struct inode *inode, struct file *file) +static int dl_proc_open_v1(struct inode *inode, struct file *file) { - int ret = seq_open(file, &dl_seq_ops); + int ret = seq_open(file, &dl_seq_ops_v1); if (!ret) { struct seq_file *sf = file->private_data; @@ -857,9 +858,9 @@ static int dl_proc_open(struct inode *inode, struct file *file) return ret; } -static const struct file_operations dl_file_ops = { +static const struct file_operations dl_file_ops_v1 = { .owner = THIS_MODULE, - .open = dl_proc_open, + .open = dl_proc_open_v1, .read = seq_read, .llseek = seq_lseek, .release = seq_release -- cgit v1.2.1 From 11d5f15723c9f39d7c131d0149d024c17dbef676 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Thu, 22 Sep 2016 12:43:44 -0400 Subject: netfilter: xt_hashlimit: Create revision 2 to support higher pps rates Create a new revision for the hashlimit iptables extension module. Rev 2 will support higher pps of upto 1 million, Version 1 supports only 10k. To support this we have to increase the size of the variables avg and burst in hashlimit_cfg to 64-bit. Create two new structs hashlimit_cfg2 and xt_hashlimit_mtinfo2 and also create newer versions of all the functions for match, checkentry and destroy. Some of the functions like hashlimit_mt, hashlimit_mt_check etc are very similar in both rev1 and rev2 with only minor changes, so I have split those functions and moved all the common code to a *_common function. Signed-off-by: Vishwanath Pai Signed-off-by: Joshua Hunt Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 330 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 262 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index e93d9e0a3f35..44a095ecc7b7 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -57,6 +57,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net) /* need to declare this at the top */ static const struct file_operations dl_file_ops_v1; +static const struct file_operations dl_file_ops; /* hash table crap */ struct dsthash_dst { @@ -86,8 +87,8 @@ struct dsthash_ent { unsigned long expires; /* precalculated expiry time */ struct { unsigned long prev; /* last modification */ - u_int32_t credit; - u_int32_t credit_cap, cost; + u_int64_t credit; + u_int64_t credit_cap, cost; } rateinfo; struct rcu_head rcu; }; @@ -98,7 +99,7 @@ struct xt_hashlimit_htable { u_int8_t family; bool rnd_initialized; - struct hashlimit_cfg1 cfg; /* config */ + struct hashlimit_cfg2 cfg; /* config */ /* used internally */ spinlock_t lock; /* lock for list_head */ @@ -114,6 +115,30 @@ struct xt_hashlimit_htable { struct hlist_head hash[0]; /* hashtable itself */ }; +static int +cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision) +{ + if (revision == 1) { + struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from; + + to->mode = cfg->mode; + to->avg = cfg->avg; + to->burst = cfg->burst; + to->size = cfg->size; + to->max = cfg->max; + to->gc_interval = cfg->gc_interval; + to->expire = cfg->expire; + to->srcmask = cfg->srcmask; + to->dstmask = cfg->dstmask; + } else if (revision == 2) { + memcpy(to, from, sizeof(struct hashlimit_cfg2)); + } else { + return -EINVAL; + } + + return 0; +} + static DEFINE_MUTEX(hashlimit_mutex); /* protects htables list */ static struct kmem_cache *hashlimit_cachep __read_mostly; @@ -215,16 +240,18 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) } static void htable_gc(struct work_struct *work); -static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, - u_int8_t family) +static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg, + const char *name, u_int8_t family, + struct xt_hashlimit_htable **out_hinfo, + int revision) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); struct xt_hashlimit_htable *hinfo; - unsigned int size; - unsigned int i; + unsigned int size, i; + int ret; - if (minfo->cfg.size) { - size = minfo->cfg.size; + if (cfg->size) { + size = cfg->size; } else { size = (totalram_pages << PAGE_SHIFT) / 16384 / sizeof(struct list_head); @@ -238,10 +265,14 @@ static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, sizeof(struct list_head) * size); if (hinfo == NULL) return -ENOMEM; - minfo->hinfo = hinfo; + *out_hinfo = hinfo; /* copy match config into hashtable config */ - memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg)); + ret = cfg_copy(&hinfo->cfg, (void *)cfg, 2); + + if (ret) + return ret; + hinfo->cfg.size = size; if (hinfo->cfg.max == 0) hinfo->cfg.max = 8 * hinfo->cfg.size; @@ -255,17 +286,18 @@ static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, hinfo->count = 0; hinfo->family = family; hinfo->rnd_initialized = false; - hinfo->name = kstrdup(minfo->name, GFP_KERNEL); + hinfo->name = kstrdup(name, GFP_KERNEL); if (!hinfo->name) { vfree(hinfo); return -ENOMEM; } spin_lock_init(&hinfo->lock); - hinfo->pde = proc_create_data(minfo->name, 0, + hinfo->pde = proc_create_data(name, 0, (family == NFPROTO_IPV4) ? hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, - &dl_file_ops_v1, hinfo); + (revision == 1) ? &dl_file_ops_v1 : &dl_file_ops, + hinfo); if (hinfo->pde == NULL) { kfree(hinfo->name); vfree(hinfo); @@ -399,6 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie. */ #define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24)) +#define MAX_CPJ (0xFFFFFFFFFFFFFFFF / (HZ*60*60*24)) /* Repeated shift and or gives us all 1s, final shift and add 1 gives * us the power of 2 below the theoretical max, so GCC simply does a @@ -408,8 +441,11 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) #define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) #define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define _POW2_BELOW64(x) (_POW2_BELOW32(x)|_POW2_BELOW32((x)>>32)) #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) +#define POW2_BELOW64(x) ((_POW2_BELOW64(x)>>1) + 1) +#define CREDITS_PER_JIFFY POW2_BELOW64(MAX_CPJ) #define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1) /* in byte mode, the lowest possible rate is one packet/second. @@ -425,15 +461,24 @@ static u32 xt_hashlimit_len_to_chunks(u32 len) } /* Precision saver. */ -static u32 user2credits(u32 user) +static u64 user2credits(u64 user, int revision) { - /* If multiplying would overflow... */ - if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1)) - /* Divide first. */ - return (user / XT_HASHLIMIT_SCALE) *\ - HZ * CREDITS_PER_JIFFY_v1; + if (revision == 1) { + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1)) + /* Divide first. */ + return (user / XT_HASHLIMIT_SCALE) *\ + HZ * CREDITS_PER_JIFFY_v1; + + return (user * HZ * CREDITS_PER_JIFFY_v1) \ + / XT_HASHLIMIT_SCALE; + } else { + if (user > 0xFFFFFFFFFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + return (user / XT_HASHLIMIT_SCALE_v2) *\ + HZ * CREDITS_PER_JIFFY; - return (user * HZ * CREDITS_PER_JIFFY_v1) / XT_HASHLIMIT_SCALE; + return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE_v2; + } } static u32 user2credits_byte(u32 user) @@ -443,10 +488,11 @@ static u32 user2credits_byte(u32 user) return (u32) (us >> 32); } -static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) +static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, + u32 mode, int revision) { unsigned long delta = now - dh->rateinfo.prev; - u32 cap; + u64 cap, cpj; if (delta == 0) return; @@ -454,7 +500,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) dh->rateinfo.prev = now; if (mode & XT_HASHLIMIT_BYTES) { - u32 tmp = dh->rateinfo.credit; + u64 tmp = dh->rateinfo.credit; dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta; cap = CREDITS_PER_JIFFY_BYTES * HZ; if (tmp >= dh->rateinfo.credit) {/* overflow */ @@ -462,7 +508,9 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) return; } } else { - dh->rateinfo.credit += delta * CREDITS_PER_JIFFY_v1; + cpj = (revision == 1) ? + CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY; + dh->rateinfo.credit += delta * cpj; cap = dh->rateinfo.credit_cap; } if (dh->rateinfo.credit > cap) @@ -470,7 +518,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) } static void rateinfo_init(struct dsthash_ent *dh, - struct xt_hashlimit_htable *hinfo) + struct xt_hashlimit_htable *hinfo, int revision) { dh->rateinfo.prev = jiffies; if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { @@ -479,8 +527,8 @@ static void rateinfo_init(struct dsthash_ent *dh, dh->rateinfo.credit_cap = hinfo->cfg.burst; } else { dh->rateinfo.credit = user2credits(hinfo->cfg.avg * - hinfo->cfg.burst); - dh->rateinfo.cost = user2credits(hinfo->cfg.avg); + hinfo->cfg.burst, revision); + dh->rateinfo.cost = user2credits(hinfo->cfg.avg, revision); dh->rateinfo.credit_cap = dh->rateinfo.credit; } } @@ -604,15 +652,15 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh) } static bool -hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, + struct xt_hashlimit_htable *hinfo, + const struct hashlimit_cfg2 *cfg, int revision) { - const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; - struct xt_hashlimit_htable *hinfo = info->hinfo; unsigned long now = jiffies; struct dsthash_ent *dh; struct dsthash_dst dst; bool race = false; - u32 cost; + u64 cost; if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) goto hotdrop; @@ -627,18 +675,18 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) } else if (race) { /* Already got an entry, update expiration timeout */ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); - rateinfo_recalc(dh, now, hinfo->cfg.mode); + rateinfo_recalc(dh, now, hinfo->cfg.mode, revision); } else { dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); - rateinfo_init(dh, hinfo); + rateinfo_init(dh, hinfo, revision); } } else { /* update expiration timeout */ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); - rateinfo_recalc(dh, now, hinfo->cfg.mode); + rateinfo_recalc(dh, now, hinfo->cfg.mode, revision); } - if (info->cfg.mode & XT_HASHLIMIT_BYTES) + if (cfg->mode & XT_HASHLIMIT_BYTES) cost = hashlimit_byte_cost(skb->len, dh); else cost = dh->rateinfo.cost; @@ -648,70 +696,126 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) dh->rateinfo.credit -= cost; spin_unlock(&dh->lock); rcu_read_unlock_bh(); - return !(info->cfg.mode & XT_HASHLIMIT_INVERT); + return !(cfg->mode & XT_HASHLIMIT_INVERT); } spin_unlock(&dh->lock); rcu_read_unlock_bh(); /* default match is underlimit - so over the limit, we need to invert */ - return info->cfg.mode & XT_HASHLIMIT_INVERT; + return cfg->mode & XT_HASHLIMIT_INVERT; hotdrop: par->hotdrop = true; return false; } -static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) +static bool +hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + struct xt_hashlimit_htable *hinfo = info->hinfo; + struct hashlimit_cfg2 cfg = {}; + int ret; + + ret = cfg_copy(&cfg, (void *)&info->cfg, 1); + + if (ret) + return ret; + + return hashlimit_mt_common(skb, par, hinfo, &cfg, 1); +} + +static bool +hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + struct xt_hashlimit_htable *hinfo = info->hinfo; + + return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2); +} + +static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, + struct xt_hashlimit_htable **hinfo, + struct hashlimit_cfg2 *cfg, + const char *name, int revision) { struct net *net = par->net; - struct xt_hashlimit_mtinfo1 *info = par->matchinfo; int ret; - if (info->cfg.gc_interval == 0 || info->cfg.expire == 0) - return -EINVAL; - if (info->name[sizeof(info->name)-1] != '\0') + if (cfg->gc_interval == 0 || cfg->expire == 0) return -EINVAL; if (par->family == NFPROTO_IPV4) { - if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32) + if (cfg->srcmask > 32 || cfg->dstmask > 32) return -EINVAL; } else { - if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128) + if (cfg->srcmask > 128 || cfg->dstmask > 128) return -EINVAL; } - if (info->cfg.mode & ~XT_HASHLIMIT_ALL) { + if (cfg->mode & ~XT_HASHLIMIT_ALL) { pr_info("Unknown mode mask %X, kernel too old?\n", - info->cfg.mode); + cfg->mode); return -EINVAL; } /* Check for overflow. */ - if (info->cfg.mode & XT_HASHLIMIT_BYTES) { - if (user2credits_byte(info->cfg.avg) == 0) { - pr_info("overflow, rate too high: %u\n", info->cfg.avg); + if (cfg->mode & XT_HASHLIMIT_BYTES) { + if (user2credits_byte(cfg->avg) == 0) { + pr_info("overflow, rate too high: %llu\n", cfg->avg); return -EINVAL; } - } else if (info->cfg.burst == 0 || - user2credits(info->cfg.avg * info->cfg.burst) < - user2credits(info->cfg.avg)) { - pr_info("overflow, try lower: %u/%u\n", - info->cfg.avg, info->cfg.burst); + } else if (cfg->burst == 0 || + user2credits(cfg->avg * cfg->burst, revision) < + user2credits(cfg->avg, revision)) { + pr_info("overflow, try lower: %llu/%llu\n", + cfg->avg, cfg->burst); return -ERANGE; } mutex_lock(&hashlimit_mutex); - info->hinfo = htable_find_get(net, info->name, par->family); - if (info->hinfo == NULL) { - ret = htable_create_v1(net, info, par->family); + *hinfo = htable_find_get(net, name, par->family); + if (*hinfo == NULL) { + ret = htable_create(net, cfg, name, par->family, + hinfo, revision); if (ret < 0) { mutex_unlock(&hashlimit_mutex); return ret; } } mutex_unlock(&hashlimit_mutex); + return 0; } +static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) +{ + struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + struct hashlimit_cfg2 cfg = {}; + int ret; + + if (info->name[sizeof(info->name) - 1] != '\0') + return -EINVAL; + + ret = cfg_copy(&cfg, (void *)&info->cfg, 1); + + if (ret) + return ret; + + return hashlimit_mt_check_common(par, &info->hinfo, + &cfg, info->name, 1); +} + +static int hashlimit_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + + if (info->name[sizeof(info->name) - 1] != '\0') + return -EINVAL; + + return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg, + info->name, 2); +} + static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par) { const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; @@ -719,6 +823,13 @@ static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par) htable_put(info->hinfo); } +static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + + htable_put(info->hinfo); +} + static struct xt_match hashlimit_mt_reg[] __read_mostly = { { .name = "hashlimit", @@ -730,6 +841,16 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .destroy = hashlimit_mt_destroy_v1, .me = THIS_MODULE, }, + { + .name = "hashlimit", + .revision = 2, + .family = NFPROTO_IPV4, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo2), + .checkentry = hashlimit_mt_check, + .destroy = hashlimit_mt_destroy, + .me = THIS_MODULE, + }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "hashlimit", @@ -741,6 +862,16 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .destroy = hashlimit_mt_destroy_v1, .me = THIS_MODULE, }, + { + .name = "hashlimit", + .revision = 2, + .family = NFPROTO_IPV6, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo2), + .checkentry = hashlimit_mt_check, + .destroy = hashlimit_mt_destroy, + .me = THIS_MODULE, + }, #endif }; @@ -787,18 +918,12 @@ static void dl_seq_stop(struct seq_file *s, void *v) spin_unlock_bh(&htable->lock); } -static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, - struct seq_file *s) +static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) { - const struct xt_hashlimit_htable *ht = s->private; - - spin_lock(&ent->lock); - /* recalculate to show accurate numbers */ - rateinfo_recalc(ent, jiffies, ht->cfg.mode); - switch (family) { case NFPROTO_IPV4: - seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n", + seq_printf(s, "%ld %pI4:%u->%pI4:%u %llu %llu %llu\n", (long)(ent->expires - jiffies)/HZ, &ent->dst.ip.src, ntohs(ent->dst.src_port), @@ -809,7 +934,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, break; #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) case NFPROTO_IPV6: - seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n", + seq_printf(s, "%ld %pI6:%u->%pI6:%u %llu %llu %llu\n", (long)(ent->expires - jiffies)/HZ, &ent->dst.ip6.src, ntohs(ent->dst.src_port), @@ -822,6 +947,34 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, default: BUG(); } +} + +static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) +{ + const struct xt_hashlimit_htable *ht = s->private; + + spin_lock(&ent->lock); + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies, ht->cfg.mode, 1); + + dl_seq_print(ent, family, s); + + spin_unlock(&ent->lock); + return seq_has_overflowed(s); +} + +static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) +{ + const struct xt_hashlimit_htable *ht = s->private; + + spin_lock(&ent->lock); + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2); + + dl_seq_print(ent, family, s); + spin_unlock(&ent->lock); return seq_has_overflowed(s); } @@ -840,6 +993,20 @@ static int dl_seq_show_v1(struct seq_file *s, void *v) return 0; } +static int dl_seq_show(struct seq_file *s, void *v) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket = (unsigned int *)v; + struct dsthash_ent *ent; + + if (!hlist_empty(&htable->hash[*bucket])) { + hlist_for_each_entry(ent, &htable->hash[*bucket], node) + if (dl_seq_real_show(ent, htable->family, s)) + return -1; + } + return 0; +} + static const struct seq_operations dl_seq_ops_v1 = { .start = dl_seq_start, .next = dl_seq_next, @@ -847,6 +1014,13 @@ static const struct seq_operations dl_seq_ops_v1 = { .show = dl_seq_show_v1 }; +static const struct seq_operations dl_seq_ops = { + .start = dl_seq_start, + .next = dl_seq_next, + .stop = dl_seq_stop, + .show = dl_seq_show +}; + static int dl_proc_open_v1(struct inode *inode, struct file *file) { int ret = seq_open(file, &dl_seq_ops_v1); @@ -858,6 +1032,18 @@ static int dl_proc_open_v1(struct inode *inode, struct file *file) return ret; } +static int dl_proc_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &dl_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + + sf->private = PDE_DATA(inode); + } + return ret; +} + static const struct file_operations dl_file_ops_v1 = { .owner = THIS_MODULE, .open = dl_proc_open_v1, @@ -866,6 +1052,14 @@ static const struct file_operations dl_file_ops_v1 = { .release = seq_release }; +static const struct file_operations dl_file_ops = { + .owner = THIS_MODULE, + .open = dl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + static int __net_init hashlimit_proc_net_init(struct net *net) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); -- cgit v1.2.1 From 58e207e4983d7acea39b7fbec9343d8a6d218a18 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 Sep 2016 23:49:17 +0200 Subject: netfilter: evict stale entries when user reads /proc/net/nf_conntrack Fabian reports a possible conntrack memory leak (could not reproduce so far), however, one minor issue can be easily resolved: > cat /proc/net/nf_conntrack | wc -l = 5 > 4 minutes required to clean up the table. We should not report those timed-out entries to the user in first place. And instead of just skipping those timed-out entries while iterating over the table we can also zap them (we already do this during ctnetlink walks, but I forgot about the /proc interface). Fixes: f330a7fdbe16 ("netfilter: conntrack: get rid of conntrack timer") Reported-by: Fabian Frederick Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_standalone.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 7d52f8401afd..5f446cd9f3fd 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -212,6 +212,11 @@ static int ct_seq_show(struct seq_file *s, void *v) if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) return 0; + if (nf_ct_should_gc(ct)) { + nf_ct_kill(ct); + goto release; + } + /* we only want to print DIR_ORIGINAL */ if (NF_CT_DIRECTION(hash)) goto release; -- cgit v1.2.1 From 7a682575ad4829b4de3e672a6ad5f73a05826b82 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Fri, 23 Sep 2016 11:27:42 +0200 Subject: netfilter: xt_socket: fix transparent match for IPv6 request sockets The introduction of TCP_NEW_SYN_RECV state, and the addition of request sockets to the ehash table seems to have broken the --transparent option of the socket match for IPv6 (around commit a9407000). Now that the socket lookup finds the TCP_NEW_SYN_RECV socket instead of the listener, the --transparent option tries to match on the no_srccheck flag of the request socket. Unfortunately, that flag was only set for IPv4 sockets in tcp_v4_init_req() by copying the transparent flag of the listener socket. This effectively causes '-m socket --transparent' not match on the ACK packet sent by the client in a TCP handshake. Based on the suggestion from Eric Dumazet, this change moves the code initializing no_srccheck to tcp_conn_request(), rendering the above scenario working again. Fixes: a940700003 ("netfilter: xt_socket: prepare for TCP_NEW_SYN_RECV support") Signed-off-by: Alex Badics Signed-off-by: KOVACS Krisztian Signed-off-by: Pablo Neira Ayuso --- net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_ipv4.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8cd02c0b056c..f3a9f3c2c8d8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6269,6 +6269,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb, sk); + inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent; /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a75bf48d7950..13b05adf9d3e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1196,7 +1196,6 @@ static void tcp_v4_init_req(struct request_sock *req, sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->no_srccheck = inet_sk(sk_listener)->transparent; ireq->opt = tcp_v4_save_options(skb); } -- cgit v1.2.1 From 0f3cd9b3697708c86a825ae3cedabf7be6fd3e72 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 23 Sep 2016 15:23:33 +0200 Subject: netfilter: nf_tables: add range expression Inverse ranges != [a,b] are not currently possible because rules are composites of && operations, and we need to express this: data < a || data > b This patch adds a new range expression. Positive ranges can be already through two cmp expressions: cmp(sreg, data, >=) cmp(sreg, data, <=) This new range expression provides an alternative way to express this. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Makefile | 3 +- net/netfilter/nf_tables_core.c | 7 ++- net/netfilter/nft_range.c | 138 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 net/netfilter/nft_range.c (limited to 'net') diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 0c8581100ac6..c23c3c84416f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -71,8 +71,9 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o # nf_tables nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o -nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o +nf_tables-objs += nft_immediate.o nft_cmp.o nft_range.o nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o +nf_tables-objs += nft_lookup.o nft_dynset.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 67259cefef06..7c94ce0080d5 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -263,8 +263,13 @@ int __init nf_tables_core_module_init(void) if (err < 0) goto err7; - return 0; + err = nft_range_module_init(); + if (err < 0) + goto err8; + return 0; +err8: + nft_dynset_module_exit(); err7: nft_payload_module_exit(); err6: diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c new file mode 100644 index 000000000000..c6d5358482d1 --- /dev/null +++ b/net/netfilter/nft_range.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2016 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_range_expr { + struct nft_data data_from; + struct nft_data data_to; + enum nft_registers sreg:8; + u8 len; + enum nft_range_ops op:8; +}; + +static void nft_range_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_range_expr *priv = nft_expr_priv(expr); + bool mismatch; + int d1, d2; + + d1 = memcmp(®s->data[priv->sreg], &priv->data_from, priv->len); + d2 = memcmp(®s->data[priv->sreg], &priv->data_to, priv->len); + switch (priv->op) { + case NFT_RANGE_EQ: + mismatch = (d1 < 0 || d2 > 0); + break; + case NFT_RANGE_NEQ: + mismatch = (d1 >= 0 && d2 <= 0); + break; + } + + if (mismatch) + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = { + [NFTA_RANGE_SREG] = { .type = NLA_U32 }, + [NFTA_RANGE_OP] = { .type = NLA_U32 }, + [NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED }, + [NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED }, +}; + +static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_range_expr *priv = nft_expr_priv(expr); + struct nft_data_desc desc_from, desc_to; + int err; + + err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from), + &desc_from, tb[NFTA_RANGE_FROM_DATA]); + if (err < 0) + return err; + + err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to), + &desc_to, tb[NFTA_RANGE_TO_DATA]); + if (err < 0) + goto err1; + + if (desc_from.len != desc_to.len) { + err = -EINVAL; + goto err2; + } + + priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]); + err = nft_validate_register_load(priv->sreg, desc_from.len); + if (err < 0) + goto err2; + + priv->op = ntohl(nla_get_be32(tb[NFTA_RANGE_OP])); + priv->len = desc_from.len; + return 0; +err2: + nft_data_uninit(&priv->data_to, desc_to.type); +err1: + nft_data_uninit(&priv->data_from, desc_from.type); + return err; +} + +static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_range_expr *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_RANGE_SREG, priv->sreg)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_RANGE_OP, htonl(priv->op))) + goto nla_put_failure; + + if (nft_data_dump(skb, NFTA_RANGE_FROM_DATA, &priv->data_from, + NFT_DATA_VALUE, priv->len) < 0 || + nft_data_dump(skb, NFTA_RANGE_TO_DATA, &priv->data_to, + NFT_DATA_VALUE, priv->len) < 0) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_range_type; +static const struct nft_expr_ops nft_range_ops = { + .type = &nft_range_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_range_expr)), + .eval = nft_range_eval, + .init = nft_range_init, + .dump = nft_range_dump, +}; + +static struct nft_expr_type nft_range_type __read_mostly = { + .name = "range", + .ops = &nft_range_ops, + .policy = nft_range_policy, + .maxattr = NFTA_RANGE_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_range_module_init(void) +{ + return nft_register_expr(&nft_range_type); +} + +void nft_range_module_exit(void) +{ + nft_unregister_expr(&nft_range_type); +} -- cgit v1.2.1 From ff107d27761ff4b644c82c209e004ec9c8fbbc22 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 25 Sep 2016 16:35:56 +0800 Subject: netfilter: nft_log: complete NFTA_LOG_FLAGS attr support NFTA_LOG_FLAGS attribute is already supported, but the related NF_LOG_XXX flags are not exposed to the userspace. So we cannot explicitly enable log flags to log uid, tcp sequence, ip options and so on, i.e. such rule "nft add rule filter output log uid" is not supported yet. So move NF_LOG_XXX macro definitions to the uapi/../nf_log.h. In order to keep consistent with other modules, change NF_LOG_MASK to refer to all supported log flags. On the other hand, add a new NF_LOG_DEFAULT_MASK to refer to the original default log flags. Finally, if user specify the unsupported log flags or NFTA_LOG_GROUP and NFTA_LOG_FLAGS are set at the same time, report EINVAL to the userspace. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebt_log.c | 2 +- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv4/netfilter/nf_log_arp.c | 2 +- net/ipv4/netfilter/nf_log_ipv4.c | 4 ++-- net/ipv6/netfilter/ip6_tables.c | 2 +- net/ipv6/netfilter/nf_log_ipv6.c | 4 ++-- net/netfilter/nf_tables_core.c | 2 +- net/netfilter/nft_log.c | 9 ++++++++- 8 files changed, 17 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 152300d164ac..9a11086ba6ff 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -91,7 +91,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, if (loginfo->type == NF_LOG_TYPE_LOG) bitmask = loginfo->u.log.logflags; else - bitmask = NF_LOG_MASK; + bitmask = NF_LOG_DEFAULT_MASK; if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == htons(ETH_P_IP)) { diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f993545a3373..7c00ce90adb8 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -156,7 +156,7 @@ static struct nf_loginfo trace_loginfo = { .u = { .log = { .level = 4, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index 8945c2653814..b24795e2ee6d 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = { .u = { .log = { .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 20f225593a8b..5b571e1b5f15 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -29,7 +29,7 @@ static struct nf_loginfo default_loginfo = { .u = { .log = { .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; @@ -46,7 +46,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; else - logflags = NF_LOG_MASK; + logflags = NF_LOG_DEFAULT_MASK; ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 552fac2f390a..55aacea24396 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -190,7 +190,7 @@ static struct nf_loginfo trace_loginfo = { .u = { .log = { .level = LOGLEVEL_WARNING, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index c1bcf699a23d..f6aee2895fee 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = { .u = { .log = { .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; @@ -52,7 +52,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; else - logflags = NF_LOG_MASK; + logflags = NF_LOG_DEFAULT_MASK; ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); if (ih == NULL) { diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 7c94ce0080d5..0dd5c695482f 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -34,7 +34,7 @@ static struct nf_loginfo trace_loginfo = { .u = { .log = { .level = LOGLEVEL_WARNING, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 24a73bb26e94..1b01404bb33f 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -58,8 +58,11 @@ static int nft_log_init(const struct nft_ctx *ctx, if (tb[NFTA_LOG_LEVEL] != NULL && tb[NFTA_LOG_GROUP] != NULL) return -EINVAL; - if (tb[NFTA_LOG_GROUP] != NULL) + if (tb[NFTA_LOG_GROUP] != NULL) { li->type = NF_LOG_TYPE_ULOG; + if (tb[NFTA_LOG_FLAGS] != NULL) + return -EINVAL; + } nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { @@ -87,6 +90,10 @@ static int nft_log_init(const struct nft_ctx *ctx, if (tb[NFTA_LOG_FLAGS] != NULL) { li->u.log.logflags = ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS])); + if (li->u.log.logflags & ~NF_LOG_MASK) { + err = -EINVAL; + goto err1; + } } break; case NF_LOG_TYPE_ULOG: -- cgit v1.2.1 From 8cb2a7d5667ab9a9c2fdd356357b85b63b320901 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 25 Sep 2016 16:47:05 +0800 Subject: netfilter: nf_log: get rid of XT_LOG_* macros nf_log is used by both nftables and iptables, so use XT_LOG_XXX macros here is not appropriate. Replace them with NF_LOG_XXX. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_log_ipv4.c | 6 +++--- net/ipv6/netfilter/nf_log_ipv6.c | 14 +++++++------- net/netfilter/nf_log_common.c | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 5b571e1b5f15..856648966f4c 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -76,7 +76,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, if (ntohs(ih->frag_off) & IP_OFFSET) nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - if ((logflags & XT_LOG_IPOPT) && + if ((logflags & NF_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { const unsigned char *op; unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; @@ -250,7 +250,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, } /* Max length: 15 "UID=4294967295 " */ - if ((logflags & XT_LOG_UID) && !iphoff) + if ((logflags & NF_LOG_UID) && !iphoff) nf_log_dump_sk_uid_gid(m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ @@ -282,7 +282,7 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m, if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; - if (!(logflags & XT_LOG_MACDECODE)) + if (!(logflags & NF_LOG_MACDECODE)) goto fallback; switch (dev->type) { diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index f6aee2895fee..57d86066a13b 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -84,7 +84,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, } /* Max length: 48 "OPT (...) " */ - if (logflags & XT_LOG_IPOPT) + if (logflags & NF_LOG_IPOPT) nf_log_buf_add(m, "OPT ( "); switch (currenthdr) { @@ -121,7 +121,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: if (fragment) { - if (logflags & XT_LOG_IPOPT) + if (logflags & NF_LOG_IPOPT) nf_log_buf_add(m, ")"); return; } @@ -129,7 +129,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, break; /* Max Length */ case IPPROTO_AH: - if (logflags & XT_LOG_IPOPT) { + if (logflags & NF_LOG_IPOPT) { struct ip_auth_hdr _ahdr; const struct ip_auth_hdr *ah; @@ -161,7 +161,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, hdrlen = (hp->hdrlen+2)<<2; break; case IPPROTO_ESP: - if (logflags & XT_LOG_IPOPT) { + if (logflags & NF_LOG_IPOPT) { struct ip_esp_hdr _esph; const struct ip_esp_hdr *eh; @@ -194,7 +194,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr); return; } - if (logflags & XT_LOG_IPOPT) + if (logflags & NF_LOG_IPOPT) nf_log_buf_add(m, ") "); currenthdr = hp->nexthdr; @@ -277,7 +277,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, } /* Max length: 15 "UID=4294967295 " */ - if ((logflags & XT_LOG_UID) && recurse) + if ((logflags & NF_LOG_UID) && recurse) nf_log_dump_sk_uid_gid(m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ @@ -295,7 +295,7 @@ static void dump_ipv6_mac_header(struct nf_log_buf *m, if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; - if (!(logflags & XT_LOG_MACDECODE)) + if (!(logflags & NF_LOG_MACDECODE)) goto fallback; switch (dev->type) { diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index a5aa5967b8e1..119fe1cb1ea9 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -77,7 +77,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, nf_log_buf_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (logflags & XT_LOG_TCPSEQ) { + if (logflags & NF_LOG_TCPSEQ) { nf_log_buf_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); } @@ -107,7 +107,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, /* Max length: 11 "URGP=65535 " */ nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr)); - if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { + if ((logflags & NF_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { u_int8_t _opt[60 - sizeof(struct tcphdr)]; const u_int8_t *op; unsigned int i; -- cgit v1.2.1 From 2cf750704bb6d7ed8c7d732e071dd1bc890ea5e8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 25 Sep 2016 23:08:31 +0200 Subject: ipmr, ip6mr: fix scheduling while atomic and a deadlock with ipmr_get_route Since the commit below the ipmr/ip6mr rtnl_unicast() code uses the portid instead of the previous dst_pid which was copied from in_skb's portid. Since the skb is new the portid is 0 at that point so the packets are sent to the kernel and we get scheduling while atomic or a deadlock (depending on where it happens) by trying to acquire rtnl two times. Also since this is RTM_GETROUTE, it can be triggered by a normal user. Here's the sleeping while atomic trace: [ 7858.212557] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:620 [ 7858.212748] in_atomic(): 1, irqs_disabled(): 0, pid: 0, name: swapper/0 [ 7858.212881] 2 locks held by swapper/0/0: [ 7858.213013] #0: (((&mrt->ipmr_expire_timer))){+.-...}, at: [] call_timer_fn+0x5/0x350 [ 7858.213422] #1: (mfc_unres_lock){+.....}, at: [] ipmr_expire_process+0x25/0x130 [ 7858.213807] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.8.0-rc7+ #179 [ 7858.213934] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 7858.214108] 0000000000000000 ffff88005b403c50 ffffffff813a7804 0000000000000000 [ 7858.214412] ffffffff81a1338e ffff88005b403c78 ffffffff810a4a72 ffffffff81a1338e [ 7858.214716] 000000000000026c 0000000000000000 ffff88005b403ca8 ffffffff810a4b9f [ 7858.215251] Call Trace: [ 7858.215412] [] dump_stack+0x85/0xc1 [ 7858.215662] [] ___might_sleep+0x192/0x250 [ 7858.215868] [] __might_sleep+0x6f/0x100 [ 7858.216072] [] mutex_lock_nested+0x33/0x4d0 [ 7858.216279] [] ? netlink_lookup+0x25f/0x460 [ 7858.216487] [] rtnetlink_rcv+0x1b/0x40 [ 7858.216687] [] netlink_unicast+0x19c/0x260 [ 7858.216900] [] rtnl_unicast+0x20/0x30 [ 7858.217128] [] ipmr_destroy_unres+0xa9/0xf0 [ 7858.217351] [] ipmr_expire_process+0x8f/0x130 [ 7858.217581] [] ? ipmr_net_init+0x180/0x180 [ 7858.217785] [] ? ipmr_net_init+0x180/0x180 [ 7858.217990] [] call_timer_fn+0xa5/0x350 [ 7858.218192] [] ? call_timer_fn+0x5/0x350 [ 7858.218415] [] ? ipmr_net_init+0x180/0x180 [ 7858.218656] [] run_timer_softirq+0x260/0x640 [ 7858.218865] [] ? __do_softirq+0xbb/0x54f [ 7858.219068] [] __do_softirq+0xe8/0x54f [ 7858.219269] [] irq_exit+0xb8/0xc0 [ 7858.219463] [] smp_apic_timer_interrupt+0x42/0x50 [ 7858.219678] [] apic_timer_interrupt+0x8c/0xa0 [ 7858.219897] [] ? native_safe_halt+0x6/0x10 [ 7858.220165] [] ? trace_hardirqs_on+0xd/0x10 [ 7858.220373] [] default_idle+0x23/0x190 [ 7858.220574] [] arch_cpu_idle+0xf/0x20 [ 7858.220790] [] default_idle_call+0x4c/0x60 [ 7858.221016] [] cpu_startup_entry+0x39b/0x4d0 [ 7858.221257] [] rest_init+0x135/0x140 [ 7858.221469] [] start_kernel+0x50e/0x51b [ 7858.221670] [] ? early_idt_handler_array+0x120/0x120 [ 7858.221894] [] x86_64_start_reservations+0x2a/0x2c [ 7858.222113] [] x86_64_start_kernel+0x13b/0x14a Fixes: 2942e9005056 ("[RTNETLINK]: Use rtnl_unicast() for rtnetlink unicasts") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 3 ++- net/ipv4/route.c | 3 ++- net/ipv6/ip6mr.c | 5 +++-- net/ipv6/route.c | 4 +++- 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a87bcd2d4a94..5f006e13de56 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2123,7 +2123,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, - struct rtmsg *rtm, int nowait) + struct rtmsg *rtm, int nowait, u32 portid) { struct mfc_cache *cache; struct mr_table *mrt; @@ -2168,6 +2168,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, return -ENOMEM; } + NETLINK_CB(skb2).portid = portid; skb_push(skb2, sizeof(struct iphdr)); skb_reset_network_header(skb2); iph = ip_hdr(skb2); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b5b47a26d4ec..62c3ed0b7556 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2503,7 +2503,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, fl4->saddr, fl4->daddr, - r, nowait); + r, nowait, portid); + if (err <= 0) { if (!nowait) { if (err == 0) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index fccb5dd91902..7f4265b1649b 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2285,8 +2285,8 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, return 1; } -int ip6mr_get_route(struct net *net, - struct sk_buff *skb, struct rtmsg *rtm, int nowait) +int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, + int nowait, u32 portid) { int err; struct mr6_table *mrt; @@ -2331,6 +2331,7 @@ int ip6mr_get_route(struct net *net, return -ENOMEM; } + NETLINK_CB(skb2).portid = portid; skb_reset_transport_header(skb2); skb_put(skb2, sizeof(struct ipv6hdr)); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e3a224b97905..269218aacbea 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3202,7 +3202,9 @@ static int rt6_fill_node(struct net *net, if (iif) { #ifdef CONFIG_IPV6_MROUTE if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { - int err = ip6mr_get_route(net, skb, rtm, nowait); + int err = ip6mr_get_route(net, skb, rtm, nowait, + portid); + if (err <= 0) { if (!nowait) { if (err == 0) -- cgit v1.2.1 From 1190cfdb1a19d89561ae51cff7d9c2ead24b3ebe Mon Sep 17 00:00:00 2001 From: Jorgen Hansen Date: Mon, 26 Sep 2016 23:59:53 -0700 Subject: VSOCK: Don't dec ack backlog twice for rejected connections If a pending socket is marked as rejected, we will decrease the sk_ack_backlog twice. So don't decrement it for rejected sockets in vsock_pending_work(). Testing of the rejected socket path was done through code modifications. Reported-by: Stefan Hajnoczi Signed-off-by: Jorgen Hansen Reviewed-by: Adit Ranadive Reviewed-by: Aditya Sarwade Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 17dbbe64cd73..8a398b3fb532 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -465,6 +465,8 @@ void vsock_pending_work(struct work_struct *work) if (vsock_is_pending(sk)) { vsock_remove_pending(listener, sk); + + listener->sk_ack_backlog--; } else if (!vsk->rejected) { /* We are not on the pending list and accept() did not reject * us, so we must have been accepted by our user process. We @@ -475,8 +477,6 @@ void vsock_pending_work(struct work_struct *work) goto out; } - listener->sk_ack_backlog--; - /* We need to remove ourself from the global connected sockets list so * incoming packets can't find this socket, and to reduce the reference * count. @@ -2010,5 +2010,5 @@ EXPORT_SYMBOL_GPL(vsock_core_get_transport); MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMware Virtual Socket Family"); -MODULE_VERSION("1.0.1.0-k"); +MODULE_VERSION("1.0.2.0-k"); MODULE_LICENSE("GPL v2"); -- cgit v1.2.1 From 4b1d488a285a446329825d5fe91f987b7880e6e5 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Mon, 26 Sep 2016 13:45:25 +0300 Subject: act_ife: Fix external mac header on encode On ife encode side, external mac header is copied from the original packet and may be overridden if the user requests. Before, the mac header copy was done from memory region that might not be accessible anymore, as skb_cow_head might free it and copy the packet. This led to random values in the external mac header once the values were not set by user. This fix takes the internal mac header from the packet, after the call to skb_cow_head. Fixes: ef6980b6becb ("net sched: introduce IFE action") Acked-by: Jamal Hadi Salim Signed-off-by: Yotam Gigi Signed-off-by: David S. Miller --- net/sched/act_ife.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index e87cd81315e1..88ac9a8c9bbd 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -740,8 +740,6 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, return TC_ACT_SHOT; } - iethh = eth_hdr(skb); - err = skb_cow_head(skb, hdrm); if (unlikely(err)) { ife->tcf_qstats.drops++; @@ -752,6 +750,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, if (!(at & AT_EGRESS)) skb_push(skb, skb->dev->hard_header_len); + iethh = (struct ethhdr *)skb->data; __skb_push(skb, hdrm); memcpy(skb->data, iethh, skb->mac_len); skb_reset_mac_header(skb); -- cgit v1.2.1 From c006da0be033b6ddcd27ee603d0ee01491236642 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Mon, 26 Sep 2016 13:45:26 +0300 Subject: act_ife: Fix false encoding On ife encode side, the action stores the different tlvs inside the ife header, where each tlv length field should refer to the length of the whole tlv (without additional padding) and not just the data length. On ife decode side, the action iterates over the tlvs in the ife header and parses them one by one, where in each iteration the current pointer is advanced according to the tlv size. Before, the encoding encoded only the data length inside the tlv, which led to false parsing of ife the header. In addition, due to the fact that the loop counter was unsigned, it could lead to infinite parsing loop. This fix changes the loop counter to be signed and fixes the encoding to take into account the tlv type and size. Fixes: 28a10c426e81 ("net sched: fix encoding to use real length") Acked-by: Jamal Hadi Salim Signed-off-by: Yotam Gigi Signed-off-by: David S. Miller --- net/sched/act_ife.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 88ac9a8c9bbd..4a60cd5e1875 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -53,7 +53,7 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval) u32 *tlv = (u32 *)(skbdata); u16 totlen = nla_total_size(dlen); /*alignment + hdr */ char *dptr = (char *)tlv + NLA_HDRLEN; - u32 htlv = attrtype << 16 | dlen; + u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN); *tlv = htonl(htlv); memset(dptr, 0, totlen - NLA_HDRLEN); @@ -627,7 +627,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data; - u16 ifehdrln = ifehdr->metalen; + int ifehdrln = (int)ifehdr->metalen; struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data); spin_lock(&ife->tcf_lock); -- cgit v1.2.1 From eb523f42d77a43f80bb9c57a34fbdc8406c7b075 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Tue, 27 Sep 2016 11:21:18 +0300 Subject: net/sched: cls_flower: Use a proper mask value for enc key id parameter The current code use the encapsulation key id value as the mask of that parameter which is wrong. Fix that by using a full mask. Fixes: bc3103f1ed40 ('net/sched: cls_flower: Classify packet in ip tunnels') Signed-off-by: Hadar Hen Zion Acked-by: Amir Vadai Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 2af09c872a1a..f6f40fba599b 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -481,7 +481,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, } fl_set_key_val(tb, &key->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID, - &mask->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID, + &mask->enc_key_id.keyid, TCA_FLOWER_UNSPEC, sizeof(key->enc_key_id.keyid)); return 0; @@ -919,7 +919,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, goto nla_put_failure; if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID, - &mask->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID, + &mask->enc_key_id, TCA_FLOWER_UNSPEC, sizeof(key->enc_key_id))) goto nla_put_failure; -- cgit v1.2.1 From b90eb754949931b2e4481b1df9a03f84d4be66ba Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 26 Sep 2016 12:52:29 +0200 Subject: fib: introduce FIB notification infrastructure This allows to pass information about added/deleted FIB entries/rules to whoever is interested. This is done in a very similar way as devinet notifies address additions/removals. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 16 ++++++------- net/ipv4/fib_rules.c | 10 ++++++++ net/ipv4/fib_trie.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 77 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4e56a4c20a3c..86c43dc9a60e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -182,7 +182,7 @@ static void fib_flush(struct net *net) struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) - flushed += fib_table_flush(tb); + flushed += fib_table_flush(net, tb); } if (flushed) @@ -590,13 +590,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) - err = fib_table_delete(tb, &cfg); + err = fib_table_delete(net, tb, &cfg); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) - err = fib_table_insert(tb, &cfg); + err = fib_table_insert(net, tb, &cfg); else err = -ENOBUFS; } @@ -719,7 +719,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) goto errout; } - err = fib_table_delete(tb, &cfg); + err = fib_table_delete(net, tb, &cfg); errout: return err; } @@ -741,7 +741,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) goto errout; } - err = fib_table_insert(tb, &cfg); + err = fib_table_insert(net, tb, &cfg); errout: return err; } @@ -828,9 +828,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) - fib_table_insert(tb, &cfg); + fib_table_insert(net, tb, &cfg); else - fib_table_delete(tb, &cfg); + fib_table_delete(net, tb, &cfg); } void fib_add_ifaddr(struct in_ifaddr *ifa) @@ -1254,7 +1254,7 @@ static void ip_fib_net_exit(struct net *net) hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); - fib_table_flush(tb); + fib_table_flush(net, tb); fib_free_table(tb); } } diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 770bebed6b28..ebadf6b99499 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -164,6 +164,14 @@ static struct fib_table *fib_empty_table(struct net *net) return NULL; } +static int call_fib_rule_notifiers(struct net *net, + enum fib_event_type event_type) +{ + struct fib_notifier_info info; + + return call_fib_notifiers(net, event_type, &info); +} + static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { FRA_GENERIC_POLICY, [FRA_FLOW] = { .type = NLA_U32 }, @@ -221,6 +229,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, net->ipv4.fib_has_custom_rules = true; fib_flush_external(rule->fr_net); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD); err = 0; errout: @@ -243,6 +252,7 @@ static int fib4_rule_delete(struct fib_rule *rule) #endif net->ipv4.fib_has_custom_rules = true; fib_flush_external(rule->fr_net); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL); errout: return err; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 241f27bbd7ad..51a4537eb145 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include @@ -84,6 +85,44 @@ #include #include "fib_lookup.h" +static BLOCKING_NOTIFIER_HEAD(fib_chain); + +int register_fib_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&fib_chain, nb); +} +EXPORT_SYMBOL(register_fib_notifier); + +int unregister_fib_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&fib_chain, nb); +} +EXPORT_SYMBOL(unregister_fib_notifier); + +int call_fib_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->net = net; + return blocking_notifier_call_chain(&fib_chain, event_type, info); +} + +static int call_fib_entry_notifiers(struct net *net, + enum fib_event_type event_type, u32 dst, + int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id, u32 nlflags) +{ + struct fib_entry_notifier_info info = { + .dst = dst, + .dst_len = dst_len, + .fi = fi, + .tos = tos, + .type = type, + .tb_id = tb_id, + .nlflags = nlflags, + }; + return call_fib_notifiers(net, event_type, &info.info); +} + #define MAX_STAT_DEPTH 32 #define KEYLENGTH (8*sizeof(t_key)) @@ -1076,7 +1115,8 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, } /* Caller must hold RTNL. */ -int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) +int fib_table_insert(struct net *net, struct fib_table *tb, + struct fib_config *cfg) { struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; @@ -1193,6 +1233,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); + + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, + key, plen, fi, + new_fa->fa_tos, cfg->fc_type, + tb->tb_id, cfg->fc_nlflags); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1245,6 +1290,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos, + cfg->fc_type, tb->tb_id, cfg->fc_nlflags); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: @@ -1490,7 +1537,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, } /* Caller must hold RTNL. */ -int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) +int fib_table_delete(struct net *net, struct fib_table *tb, + struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *fa_to_delete; @@ -1546,6 +1594,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, cfg->fc_type, tb->tb_id); + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, + fa_to_delete->fa_info, tos, cfg->fc_type, + tb->tb_id, 0); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1809,7 +1860,7 @@ void fib_table_flush_external(struct fib_table *tb) } /* Caller must hold RTNL. */ -int fib_table_flush(struct fib_table *tb) +int fib_table_flush(struct net *net, struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; @@ -1861,6 +1912,11 @@ int fib_table_flush(struct fib_table *tb) switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, fa->fa_type, tb->tb_id); + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, + n->key, + KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id, 0); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); -- cgit v1.2.1 From c98501879b1b1af90c7325574f2672e9efca592c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 26 Sep 2016 12:52:30 +0200 Subject: fib: introduce FIB info offload flag helpers These helpers are to be used in case someone offloads the FIB entry. The result is that if the entry is offloaded to at least one device, the offload flag is set. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/switchdev/switchdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 10b819308439..abd8d2a38a7d 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1216,7 +1216,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, ipv4_fib.obj.orig_dev = dev; err = switchdev_port_obj_add(dev, &ipv4_fib.obj); if (!err) - fi->fib_flags |= RTNH_F_OFFLOAD; + fib_info_offload_inc(fi); return err == -EOPNOTSUPP ? 0 : err; } @@ -1260,7 +1260,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, ipv4_fib.obj.orig_dev = dev; err = switchdev_port_obj_del(dev, &ipv4_fib.obj); if (!err) - fi->fib_flags &= ~RTNH_F_OFFLOAD; + fib_info_offload_dec(fi); return err == -EOPNOTSUPP ? 0 : err; } -- cgit v1.2.1 From 347e3b28c1ba24c1ae2f30290d8247480ab9ce14 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 26 Sep 2016 12:52:33 +0200 Subject: switchdev: remove FIB offload infrastructure Since this is now taken care of by FIB notifier, remove the code, with all unused dependencies. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 13 ---- net/ipv4/fib_rules.c | 2 - net/ipv4/fib_trie.c | 104 +------------------------- net/switchdev/switchdev.c | 181 ---------------------------------------------- 4 files changed, 1 insertion(+), 299 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 86c43dc9a60e..c3b80478226e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -189,19 +189,6 @@ static void fib_flush(struct net *net) rt_cache_flush(net); } -void fib_flush_external(struct net *net) -{ - struct fib_table *tb; - struct hlist_head *head; - unsigned int h; - - for (h = 0; h < FIB_TABLE_HASHSZ; h++) { - head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry(tb, head, tb_hlist) - fib_table_flush_external(tb); - } -} - /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index ebadf6b99499..2e50062f642d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -228,7 +228,6 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->tos = frh->tos; net->ipv4.fib_has_custom_rules = true; - fib_flush_external(rule->fr_net); call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD); err = 0; @@ -251,7 +250,6 @@ static int fib4_rule_delete(struct fib_rule *rule) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; - fib_flush_external(rule->fr_net); call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL); errout: return err; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 51a4537eb145..31cef3602585 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -81,7 +81,6 @@ #include #include #include -#include #include #include "fib_lookup.h" @@ -1215,17 +1214,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; - err = switchdev_fib_ipv4_add(key, plen, fi, - new_fa->fa_tos, - cfg->fc_type, - cfg->fc_nlflags, - tb->tb_id); - if (err) { - switchdev_fib_ipv4_abort(fi); - kmem_cache_free(fn_alias_kmem, new_fa); - goto out; - } - hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); alias_free_mem_rcu(fa); @@ -1273,18 +1261,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; - /* (Optionally) offload fib entry to switch hardware. */ - err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type, - cfg->fc_nlflags, tb->tb_id); - if (err) { - switchdev_fib_ipv4_abort(fi); - goto out_free_new_fa; - } - /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) - goto out_sw_fib_del; + goto out_free_new_fa; if (!plen) tb->tb_num_default++; @@ -1297,8 +1277,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, succeeded: return 0; -out_sw_fib_del: - switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1591,9 +1569,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb, if (!fa_to_delete) return -ESRCH; - switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, - cfg->fc_type, tb->tb_id); - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, fa_to_delete->fa_info, tos, cfg->fc_type, tb->tb_id, 0); @@ -1785,80 +1760,6 @@ out: return NULL; } -/* Caller must hold RTNL */ -void fib_table_flush_external(struct fib_table *tb) -{ - struct trie *t = (struct trie *)tb->tb_data; - struct key_vector *pn = t->kv; - unsigned long cindex = 1; - struct hlist_node *tmp; - struct fib_alias *fa; - - /* walk trie in reverse order */ - for (;;) { - unsigned char slen = 0; - struct key_vector *n; - - if (!(cindex--)) { - t_key pkey = pn->key; - - /* cannot resize the trie vector */ - if (IS_TRIE(pn)) - break; - - /* resize completed node */ - pn = resize(t, pn); - cindex = get_index(pkey, pn); - - continue; - } - - /* grab the next available node */ - n = get_child(pn, cindex); - if (!n) - continue; - - if (IS_TNODE(n)) { - /* record pn and cindex for leaf walking */ - pn = n; - cindex = 1ul << n->bits; - - continue; - } - - hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { - struct fib_info *fi = fa->fa_info; - - /* if alias was cloned to local then we just - * need to remove the local copy from main - */ - if (tb->tb_id != fa->tb_id) { - hlist_del_rcu(&fa->fa_list); - alias_free_mem_rcu(fa); - continue; - } - - /* record local slen */ - slen = fa->fa_slen; - - if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD)) - continue; - - switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, fa->fa_type, - tb->tb_id); - } - - /* update leaf slen */ - n->slen = slen; - - if (hlist_empty(&n->leaf)) { - put_child_root(pn, n->key, NULL); - node_free(n); - } - } -} - /* Caller must hold RTNL. */ int fib_table_flush(struct net *net, struct fib_table *tb) { @@ -1909,9 +1810,6 @@ int fib_table_flush(struct net *net, struct fib_table *tb) continue; } - switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, fa->fa_type, - tb->tb_id); call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, n->key, KEYLENGTH - fa->fa_slen, diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index abd8d2a38a7d..02beb35f577f 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -21,7 +21,6 @@ #include #include #include -#include #include /** @@ -344,8 +343,6 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj) switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: return sizeof(struct switchdev_obj_port_vlan); - case SWITCHDEV_OBJ_ID_IPV4_FIB: - return sizeof(struct switchdev_obj_ipv4_fib); case SWITCHDEV_OBJ_ID_PORT_FDB: return sizeof(struct switchdev_obj_port_fdb); case SWITCHDEV_OBJ_ID_PORT_MDB: @@ -1108,184 +1105,6 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, } EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); -static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) -{ - const struct switchdev_ops *ops = dev->switchdev_ops; - struct net_device *lower_dev; - struct net_device *port_dev; - struct list_head *iter; - - /* Recusively search down until we find a sw port dev. - * (A sw port dev supports switchdev_port_attr_get). - */ - - if (ops && ops->switchdev_port_attr_get) - return dev; - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - port_dev = switchdev_get_lowest_dev(lower_dev); - if (port_dev) - return port_dev; - } - - return NULL; -} - -static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) -{ - struct switchdev_attr attr = { - .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, - }; - struct switchdev_attr prev_attr; - struct net_device *dev = NULL; - int nhsel; - - ASSERT_RTNL(); - - /* For this route, all nexthop devs must be on the same switch. */ - - for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { - const struct fib_nh *nh = &fi->fib_nh[nhsel]; - - if (!nh->nh_dev) - return NULL; - - dev = switchdev_get_lowest_dev(nh->nh_dev); - if (!dev) - return NULL; - - attr.orig_dev = dev; - if (switchdev_port_attr_get(dev, &attr)) - return NULL; - - if (nhsel > 0 && - !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid)) - return NULL; - - prev_attr = attr; - } - - return dev; -} - -/** - * switchdev_fib_ipv4_add - Add/modify switch IPv4 route entry - * - * @dst: route's IPv4 destination address - * @dst_len: destination address length (prefix length) - * @fi: route FIB info structure - * @tos: route TOS - * @type: route type - * @nlflags: netlink flags passed in (NLM_F_*) - * @tb_id: route table ID - * - * Add/modify switch IPv4 route entry. - */ -int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 nlflags, u32 tb_id) -{ - struct switchdev_obj_ipv4_fib ipv4_fib = { - .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, - .dst = dst, - .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .nlflags = nlflags, - .tb_id = tb_id, - }; - struct net_device *dev; - int err = 0; - - /* Don't offload route if using custom ip rules or if - * IPv4 FIB offloading has been disabled completely. - */ - -#ifdef CONFIG_IP_MULTIPLE_TABLES - if (fi->fib_net->ipv4.fib_has_custom_rules) - return 0; -#endif - - if (fi->fib_net->ipv4.fib_offload_disabled) - return 0; - - dev = switchdev_get_dev_by_nhs(fi); - if (!dev) - return 0; - - ipv4_fib.obj.orig_dev = dev; - err = switchdev_port_obj_add(dev, &ipv4_fib.obj); - if (!err) - fib_info_offload_inc(fi); - - return err == -EOPNOTSUPP ? 0 : err; -} -EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add); - -/** - * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch - * - * @dst: route's IPv4 destination address - * @dst_len: destination address length (prefix length) - * @fi: route FIB info structure - * @tos: route TOS - * @type: route type - * @tb_id: route table ID - * - * Delete IPv4 route entry from switch device. - */ -int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) -{ - struct switchdev_obj_ipv4_fib ipv4_fib = { - .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, - .dst = dst, - .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .nlflags = 0, - .tb_id = tb_id, - }; - struct net_device *dev; - int err = 0; - - if (!(fi->fib_flags & RTNH_F_OFFLOAD)) - return 0; - - dev = switchdev_get_dev_by_nhs(fi); - if (!dev) - return 0; - - ipv4_fib.obj.orig_dev = dev; - err = switchdev_port_obj_del(dev, &ipv4_fib.obj); - if (!err) - fib_info_offload_dec(fi); - - return err == -EOPNOTSUPP ? 0 : err; -} -EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del); - -/** - * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation - * - * @fi: route FIB info structure - */ -void switchdev_fib_ipv4_abort(struct fib_info *fi) -{ - /* There was a problem installing this route to the offload - * device. For now, until we come up with more refined - * policy handling, abruptly end IPv4 fib offloading for - * for entire net by flushing offload device(s) of all - * IPv4 routes, and mark IPv4 fib offloading broken from - * this point forward. - */ - - fib_flush_external(fi->fib_net); - fi->fib_net->ipv4.fib_offload_disabled = true; -} -EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); - bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { -- cgit v1.2.1 From 3acf3ec3f4b0fd4263989f2e4227bbd1c42b5fe1 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Tue, 27 Sep 2016 19:03:37 -0700 Subject: tcp: Change txhash on every SYN and RTO retransmit The current code changes txhash (flowlables) on every retransmitted SYN/ACK, but only after the 2nd retransmitted SYN and only after tcp_retries1 RTO retransmits. With this patch: 1) txhash is changed with every SYN retransmits 2) txhash is changed with every RTO. The result is that we can start re-routing around failed (or very congested paths) as soon as possible. Otherwise application health checks may fail and the connection may be terminated before we start to change txhash. v4: Removed sysctl, txhash is changed for all RTOs v3: Removed text saying default value of sysctl is 0 (it is 100) v2: Added sysctl documentation and cleaned code Tested with packetdrill tests Signed-off-by: Lawrence Brakmo Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index f712b411f6ed..3ea1cf804748 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -192,6 +192,8 @@ static int tcp_write_timeout(struct sock *sk) if (tp->syn_data && icsk->icsk_retransmits == 1) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); + } else if (!tp->syn_data && !tp->syn_fastopen) { + sk_rethink_txhash(sk); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; syn_set = true; @@ -213,6 +215,8 @@ static int tcp_write_timeout(struct sock *sk) tcp_mtu_probing(icsk, sk); dst_negative_advice(sk); + } else { + sk_rethink_txhash(sk); } retry_until = net->ipv4.sysctl_tcp_retries2; -- cgit v1.2.1 From 7836667cec5e02ed2ae3eb09b88047b5b5f2343a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Sep 2016 08:41:16 -0700 Subject: net: do not export sk_stream_write_space Since commit 900f65d361d3 ("tcp: move duplicate code from tcp_v4_init_sock()/tcp_v6_init_sock()") we no longer need to export sk_stream_write_space() From: Eric Dumazet Cc: Neal Cardwell Signed-off-by: David S. Miller --- net/core/stream.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/core/stream.c b/net/core/stream.c index 159516a11b7e..1086c8b280a8 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -43,7 +43,6 @@ void sk_stream_write_space(struct sock *sk) rcu_read_unlock(); } } -EXPORT_SYMBOL(sk_stream_write_space); /** * sk_stream_wait_connect - Wait for a socket to get into the connected state -- cgit v1.2.1 From 8732db67c6b6dcdb455b73773ea2fc1e1d5024b1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Sep 2016 22:37:15 +0100 Subject: rxrpc: Fix exclusive client connections Exclusive connections are currently reusable (which they shouldn't be) because rxrpc_alloc_client_connection() checks the exclusive flag in the rxrpc_connection struct before it's initialised from the function parameters. This means that the DONT_REUSE flag doesn't get set. Fix this by checking the function parameters for the exclusive flag. Signed-off-by: David Howells --- net/rxrpc/conn_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index c76a125df891..f5ee8bfa5bef 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -200,7 +200,7 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) } atomic_set(&conn->usage, 1); - if (conn->params.exclusive) + if (cp->exclusive) __set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); conn->params = *cp; -- cgit v1.2.1 From a1767077b0176de17fa40ec743a20cbdac7a0d56 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Sep 2016 22:37:15 +0100 Subject: rxrpc: Make Tx loss-injection go through normal return and adjust tracing In rxrpc_send_data_packet() make the loss-injection path return through the same code as the transmission path so that the RTT determination is initiated and any future timer shuffling will be done, despite the packet having been binned. Whilst we're at it: (1) Add to the tx_data tracepoint an indication of whether or not we're retransmitting a data packet. (2) When we're deciding whether or not to request an ACK, rather than checking if we're in fast-retransmit mode check instead if we're retransmitting. (3) Don't invoke the lose_skb tracepoint when losing a Tx packet as we're not altering the sk_buff refcount nor are we just seeing it after getting it off the Tx list. (4) The rxrpc_skb_tx_lost note is then no longer used so remove it. (5) rxrpc_lose_skb() no longer needs to deal with rxrpc_skb_tx_lost. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 3 +-- net/rxrpc/call_event.c | 2 +- net/rxrpc/misc.c | 1 - net/rxrpc/output.c | 17 +++++++++-------- net/rxrpc/sendmsg.c | 2 +- net/rxrpc/skbuff.c | 11 +++-------- 6 files changed, 15 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ca96e547cb9a..6aadaa7d8b43 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -603,7 +603,6 @@ enum rxrpc_skb_trace { rxrpc_skb_tx_cleaned, rxrpc_skb_tx_freed, rxrpc_skb_tx_got, - rxrpc_skb_tx_lost, rxrpc_skb_tx_new, rxrpc_skb_tx_rotated, rxrpc_skb_tx_seen, @@ -1073,7 +1072,7 @@ extern const s8 rxrpc_ack_priority[]; * output.c */ int rxrpc_send_call_packet(struct rxrpc_call *, u8); -int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *); +int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool); void rxrpc_reject_packets(struct rxrpc_local *); /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 0e8478012212..1f6c7633b964 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -256,7 +256,7 @@ static void rxrpc_resend(struct rxrpc_call *call) rxrpc_get_skb(skb, rxrpc_skb_tx_got); spin_unlock_bh(&call->lock); - if (rxrpc_send_data_packet(call, skb) < 0) { + if (rxrpc_send_data_packet(call, skb, true) < 0) { rxrpc_free_skb(skb, rxrpc_skb_tx_freed); return; } diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index aedb8978226d..47dddacdbb91 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -108,7 +108,6 @@ const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = { [rxrpc_skb_tx_cleaned] = "Tx CLN", [rxrpc_skb_tx_freed] = "Tx FRE", [rxrpc_skb_tx_got] = "Tx GOT", - [rxrpc_skb_tx_lost] = "Tx *L*", [rxrpc_skb_tx_new] = "Tx NEW", [rxrpc_skb_tx_rotated] = "Tx ROT", [rxrpc_skb_tx_seen] = "Tx SEE", diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index cf43a715685e..ac9a58b619a6 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -238,7 +238,8 @@ out: /* * send a packet through the transport endpoint */ -int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) +int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, + bool retrans) { struct rxrpc_connection *conn = call->conn; struct rxrpc_wire_header whdr; @@ -247,6 +248,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) struct kvec iov[2]; rxrpc_serial_t serial; size_t len; + bool lost = false; int ret, opt; _enter(",{%d}", skb->len); @@ -281,7 +283,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. */ - if (call->cong_mode == RXRPC_CALL_FAST_RETRANSMIT || + if (retrans || (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) @@ -290,11 +292,9 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { - trace_rxrpc_tx_data(call, sp->hdr.seq, serial, - whdr.flags, true); - rxrpc_lose_skb(skb, rxrpc_skb_tx_lost); - _leave(" = 0 [lose]"); - return 0; + ret = 0; + lost = true; + goto done; } } @@ -319,7 +319,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb) goto send_fragmentable; done: - trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, false); + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, + retrans, lost); if (ret >= 0) { ktime_t now = ktime_get_real(); skb->tstamp = now; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 1f8040d82395..d8dfdce874d8 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -144,7 +144,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, if (seq == 1 && rxrpc_is_client_call(call)) rxrpc_expose_client_call(call); - ret = rxrpc_send_data_packet(call, skb); + ret = rxrpc_send_data_packet(call, skb, false); if (ret < 0) { _debug("need instant resend %d", ret); rxrpc_instant_resend(call, ix); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 5154cbf7e540..67b02c45271b 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -77,14 +77,9 @@ void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) if (skb) { int n; CHECK_SLAB_OKAY(&skb->users); - if (op == rxrpc_skb_tx_lost) { - n = atomic_read(select_skb_count(op)); - trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); - } else { - n = atomic_dec_return(select_skb_count(op)); - trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); - kfree_skb(skb); - } + n = atomic_dec_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + kfree_skb(skb); } } -- cgit v1.2.1 From 2629c7fa7c0adfdf023051b404cd538951bd0354 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Sep 2016 22:37:15 +0100 Subject: rxrpc: When activating client conn channels, do state check inside lock In rxrpc_activate_channels(), the connection cache state is checked outside of the lock, which means it can change whilst we're waking calls up, thereby changing whether or not we're allowed to wake calls up. Fix this by moving the check inside the locked region. The check to see if all the channels are currently busy can stay outside of the locked region. Whilst we're at it: (1) Split the locked section out into its own function so that we can call it from other places in a later patch. (2) Determine the mask of channels dependent on the state as we're going to add another state in a later patch that will restrict the number of simultaneous calls to 1 on a connection. Signed-off-by: David Howells --- net/rxrpc/conn_client.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index f5ee8bfa5bef..60ef9605167e 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -575,29 +575,43 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, wake_up(&call->waitq); } +/* + * Assign channels and callNumbers to waiting calls with channel_lock + * held by caller. + */ +static void rxrpc_activate_channels_locked(struct rxrpc_connection *conn) +{ + u8 avail, mask; + + switch (conn->cache_state) { + case RXRPC_CONN_CLIENT_ACTIVE: + mask = RXRPC_ACTIVE_CHANS_MASK; + break; + default: + return; + } + + while (!list_empty(&conn->waiting_calls) && + (avail = ~conn->active_chans, + avail &= mask, + avail != 0)) + rxrpc_activate_one_channel(conn, __ffs(avail)); +} + /* * Assign channels and callNumbers to waiting calls. */ static void rxrpc_activate_channels(struct rxrpc_connection *conn) { - unsigned char mask; - _enter("%d", conn->debug_id); trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans); - if (conn->cache_state != RXRPC_CONN_CLIENT_ACTIVE || - conn->active_chans == RXRPC_ACTIVE_CHANS_MASK) + if (conn->active_chans == RXRPC_ACTIVE_CHANS_MASK) return; spin_lock(&conn->channel_lock); - - while (!list_empty(&conn->waiting_calls) && - (mask = ~conn->active_chans, - mask &= RXRPC_ACTIVE_CHANS_MASK, - mask != 0)) - rxrpc_activate_one_channel(conn, __ffs(mask)); - + rxrpc_activate_channels_locked(conn); spin_unlock(&conn->channel_lock); _leave(""); } -- cgit v1.2.1 From 1e9e5c9521d3667664a6e3c97075f71afec23720 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Sep 2016 22:37:15 +0100 Subject: rxrpc: Reduce the rxrpc_local::services list to a pointer Reduce the rxrpc_local::services list to just a pointer as we don't permit multiple service endpoints to bind to a single transport endpoints (this is excluded by rxrpc_lookup_local()). The reason we don't allow this is that if you send a request to an AFS filesystem service, it will try to talk back to your cache manager on the port you sent from (this is how file change notifications are handled). To prevent someone from stealing your CM callbacks, we don't let AF_RXRPC sockets share a UDP socket if at least one of them has a service bound. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 21 ++++++++------------- net/rxrpc/ar-internal.h | 3 +-- net/rxrpc/call_accept.c | 8 ++++---- net/rxrpc/local_object.c | 3 +-- net/rxrpc/security.c | 8 ++++---- 5 files changed, 18 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 8dbf7bed2cc4..44c9c2b0b190 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -136,7 +136,8 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; struct sock *sk = sock->sk; struct rxrpc_local *local; - struct rxrpc_sock *rx = rxrpc_sk(sk), *prx; + struct rxrpc_sock *rx = rxrpc_sk(sk); + u16 service_id = srx->srx_service; int ret; _enter("%p,%p,%d", rx, saddr, len); @@ -160,15 +161,12 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) goto error_unlock; } - if (rx->srx.srx_service) { + if (service_id) { write_lock(&local->services_lock); - hlist_for_each_entry(prx, &local->services, listen_link) { - if (prx->srx.srx_service == rx->srx.srx_service) - goto service_in_use; - } - + if (rcu_access_pointer(local->service)) + goto service_in_use; rx->local = local; - hlist_add_head_rcu(&rx->listen_link, &local->services); + rcu_assign_pointer(local->service, rx); write_unlock(&local->services_lock); rx->sk.sk_state = RXRPC_SERVER_BOUND; @@ -599,7 +597,6 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rx->family = protocol; rx->calls = RB_ROOT; - INIT_HLIST_NODE(&rx->listen_link); spin_lock_init(&rx->incoming_lock); INIT_LIST_HEAD(&rx->sock_calls); INIT_LIST_HEAD(&rx->to_be_accepted); @@ -681,11 +678,9 @@ static int rxrpc_release_sock(struct sock *sk) sk->sk_state = RXRPC_CLOSE; spin_unlock_bh(&sk->sk_receive_queue.lock); - ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1); - - if (!hlist_unhashed(&rx->listen_link)) { + if (rx->local && rx->local->service == rx) { write_lock(&rx->local->services_lock); - hlist_del_rcu(&rx->listen_link); + rx->local->service = NULL; write_unlock(&rx->local->services_lock); } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6aadaa7d8b43..539db54697f9 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -93,7 +93,6 @@ struct rxrpc_sock { rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ struct rxrpc_local *local; /* local endpoint */ - struct hlist_node listen_link; /* link in the local endpoint's listen list */ struct rxrpc_backlog *backlog; /* Preallocation for services */ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ struct list_head sock_calls; /* List of calls owned by this socket */ @@ -216,7 +215,7 @@ struct rxrpc_local { struct list_head link; struct socket *socket; /* my UDP socket */ struct work_struct processor; - struct hlist_head services; /* services listening on this endpoint */ + struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index a8d39d7cf42c..3cac231d8405 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -331,14 +331,14 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_sock *rx; struct rxrpc_call *call; + u16 service_id = sp->hdr.serviceId; _enter(""); /* Get the socket providing the service */ - hlist_for_each_entry_rcu_bh(rx, &local->services, listen_link) { - if (rx->srx.srx_service == sp->hdr.serviceId) - goto found_service; - } + rx = rcu_dereference(local->service); + if (service_id == rx->srx.srx_service) + goto found_service; trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, EOPNOTSUPP); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index e3fad80b0795..ff4864d550b8 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -86,7 +86,6 @@ static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx) atomic_set(&local->usage, 1); INIT_LIST_HEAD(&local->link); INIT_WORK(&local->processor, rxrpc_local_processor); - INIT_HLIST_HEAD(&local->services); init_rwsem(&local->defrag_sem); skb_queue_head_init(&local->reject_queue); skb_queue_head_init(&local->event_queue); @@ -292,7 +291,7 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local) mutex_unlock(&rxrpc_local_mutex); ASSERT(RB_EMPTY_ROOT(&local->client_conns)); - ASSERT(hlist_empty(&local->services)); + ASSERT(!local->service); if (socket) { local->socket = NULL; diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 82d8134e9287..7d921e56e715 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -131,10 +131,10 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) /* find the service */ read_lock(&local->services_lock); - hlist_for_each_entry(rx, &local->services, listen_link) { - if (rx->srx.srx_service == conn->params.service_id) - goto found_service; - } + rx = rcu_dereference_protected(local->service, + lockdep_is_held(&local->services_lock)); + if (rx && rx->srx.srx_service == conn->params.service_id) + goto found_service; /* the service appears to have died */ read_unlock(&local->services_lock); -- cgit v1.2.1 From b112a67081e4b06652ecde588bf1d5778fe43d75 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Sep 2016 22:37:16 +0100 Subject: rxrpc: Request more ACKs in slow-start mode Set the request-ACK on more DATA packets whilst we're in slow start mode so that we get sufficient ACKs back to supply information to configure the window. Signed-off-by: David Howells --- net/rxrpc/output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index ac9a58b619a6..0d47db886f6e 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -284,6 +284,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, * ACKs if a DATA packet appears to have been lost. */ if (retrans || + call->cong_mode == RXRPC_CALL_SLOW_START || (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) -- cgit v1.2.1 From ed1e8679d8bc6537077d1f24bc83b396f6062f09 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Sep 2016 22:37:16 +0100 Subject: rxrpc: Note serial number being ACK'd in the congestion management trace Note the serial number of the packet being ACK'd in the congestion management trace rather than the serial number of the ACK packet. Whilst the serial number of the ACK packet is useful for matching ACK packet in the output of wireshark, the serial number that the ACK is in response to is of more use in working out how different trace lines relate. Signed-off-by: David Howells --- net/rxrpc/input.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 094720dd1eaf..1461d30583c9 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -41,10 +41,10 @@ static void rxrpc_proto_abort(const char *why, */ static void rxrpc_congestion_management(struct rxrpc_call *call, struct sk_buff *skb, - struct rxrpc_ack_summary *summary) + struct rxrpc_ack_summary *summary, + rxrpc_serial_t acked_serial) { enum rxrpc_congest_change change = rxrpc_cong_no_change; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned int cumulative_acks = call->cong_cumul_acks; unsigned int cwnd = call->cong_cwnd; bool resend = false; @@ -172,7 +172,7 @@ out_no_clear_ca: cwnd = RXRPC_RXTX_BUFF_SIZE - 1; call->cong_cwnd = cwnd; call->cong_cumul_acks = cumulative_acks; - trace_rxrpc_congest(call, summary, sp->hdr.serial, change); + trace_rxrpc_congest(call, summary, acked_serial, change); if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) rxrpc_queue_call(call); return; @@ -848,7 +848,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, false, true, rxrpc_propose_ack_ping_for_lost_reply); - return rxrpc_congestion_management(call, skb, &summary); + return rxrpc_congestion_management(call, skb, &summary, acked_serial); } /* -- cgit v1.2.1 From f22d5c490990ecb6f4eb70c4ed478fc8cea78fe1 Mon Sep 17 00:00:00 2001 From: Jia He Date: Fri, 30 Sep 2016 11:28:59 +0800 Subject: proc: Reduce cache miss in snmp_seq_show This is to use the generic interfaces snmp_get_cpu_field{,64}_batch to aggregate the data by going through all the items of each cpu sequentially. Then snmp_seq_show is split into 2 parts to avoid build warning "the frame size" larger than 1024. Signed-off-by: Jia He Signed-off-by: David S. Miller --- net/ipv4/proc.c | 70 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 1ed015e4bc79..f51fc8803154 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -46,6 +46,8 @@ #include #include +#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX) + /* * Report socket allocation statistics [mea@utu.fi] */ @@ -379,13 +381,15 @@ static void icmp_put(struct seq_file *seq) /* * Called from the PROCfs module. This outputs /proc/net/snmp. */ -static int snmp_seq_show(struct seq_file *seq, void *v) +static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) { - int i; struct net *net = seq->private; + u64 buff64[IPSTATS_MIB_MAX]; + int i; - seq_puts(seq, "Ip: Forwarding DefaultTTL"); + memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64)); + seq_puts(seq, "Ip: Forwarding DefaultTTL"); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); @@ -394,57 +398,77 @@ static int snmp_seq_show(struct seq_file *seq, void *v) net->ipv4.sysctl_ip_default_ttl); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); + snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list, + net->mib.ip_statistics, + offsetof(struct ipstats_mib, syncp)); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) - seq_printf(seq, " %llu", - snmp_fold_field64(net->mib.ip_statistics, - snmp4_ipstats_list[i].entry, - offsetof(struct ipstats_mib, syncp))); + seq_printf(seq, " %llu", buff64[i]); - icmp_put(seq); /* RFC 2011 compatibility */ - icmpmsg_put(seq); + return 0; +} + +static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) +{ + unsigned long buff[TCPUDP_MIB_MAX]; + struct net *net = seq->private; + int i; + + memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); seq_puts(seq, "\nTcp:"); for (i = 0; snmp4_tcp_list[i].name != NULL; i++) seq_printf(seq, " %s", snmp4_tcp_list[i].name); seq_puts(seq, "\nTcp:"); + snmp_get_cpu_field_batch(buff, snmp4_tcp_list, + net->mib.tcp_statistics); for (i = 0; snmp4_tcp_list[i].name != NULL; i++) { /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) - seq_printf(seq, " %ld", - snmp_fold_field(net->mib.tcp_statistics, - snmp4_tcp_list[i].entry)); + seq_printf(seq, " %ld", buff[i]); else - seq_printf(seq, " %lu", - snmp_fold_field(net->mib.tcp_statistics, - snmp4_tcp_list[i].entry)); + seq_printf(seq, " %lu", buff[i]); } + memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); + + snmp_get_cpu_field_batch(buff, snmp4_udp_list, + net->mib.udp_statistics); seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); - seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) - seq_printf(seq, " %lu", - snmp_fold_field(net->mib.udp_statistics, - snmp4_udp_list[i].entry)); + seq_printf(seq, " %lu", buff[i]); + + memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); /* the UDP and UDP-Lite MIBs are the same */ seq_puts(seq, "\nUdpLite:"); + snmp_get_cpu_field_batch(buff, snmp4_udp_list, + net->mib.udplite_statistics); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); - seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) - seq_printf(seq, " %lu", - snmp_fold_field(net->mib.udplite_statistics, - snmp4_udp_list[i].entry)); + seq_printf(seq, " %lu", buff[i]); seq_putc(seq, '\n'); return 0; } +static int snmp_seq_show(struct seq_file *seq, void *v) +{ + snmp_seq_show_ipstats(seq, v); + + icmp_put(seq); /* RFC 2011 compatibility */ + icmpmsg_put(seq); + + snmp_seq_show_tcp_udp(seq, v); + + return 0; +} + static int snmp_seq_open(struct inode *inode, struct file *file) { return single_open_net(inode, file, snmp_seq_show); -- cgit v1.2.1 From 4a4857b1c81ef39a9dc719af6b498cd39d1c1eb0 Mon Sep 17 00:00:00 2001 From: Jia He Date: Fri, 30 Sep 2016 11:29:00 +0800 Subject: proc: Reduce cache miss in snmp6_seq_show This is to use the generic interfaces snmp_get_cpu_field{,64}_batch to aggregate the data by going through all the items of each cpu sequentially. Signed-off-by: Jia He Signed-off-by: David S. Miller --- net/ipv6/proc.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 679253d0af84..cc8e3ae9ca73 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -30,6 +30,11 @@ #include #include +#define MAX4(a, b, c, d) \ + max_t(u32, max_t(u32, a, b), max_t(u32, c, d)) +#define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \ + IPSTATS_MIB_MAX, ICMP_MIB_MAX) + static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; @@ -191,25 +196,34 @@ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, atomic_long_t *smib, const struct snmp_mib *itemlist) { + unsigned long buff[SNMP_MIB_MAX]; int i; - unsigned long val; - for (i = 0; itemlist[i].name; i++) { - val = pcpumib ? - snmp_fold_field(pcpumib, itemlist[i].entry) : - atomic_long_read(smib + itemlist[i].entry); - seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, val); + if (pcpumib) { + memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX); + + snmp_get_cpu_field_batch(buff, itemlist, pcpumib); + for (i = 0; itemlist[i].name; i++) + seq_printf(seq, "%-32s\t%lu\n", + itemlist[i].name, buff[i]); + } else { + for (i = 0; itemlist[i].name; i++) + seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, + atomic_long_read(smib + itemlist[i].entry)); } } static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, const struct snmp_mib *itemlist, size_t syncpoff) { + u64 buff64[SNMP_MIB_MAX]; int i; + memset(buff64, 0, sizeof(unsigned long) * SNMP_MIB_MAX); + + snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff); for (i = 0; itemlist[i].name; i++) - seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, - snmp_fold_field64(mib, itemlist[i].entry, syncpoff)); + seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]); } static int snmp6_seq_show(struct seq_file *seq, void *v) -- cgit v1.2.1 From 7d64a94be2f9fadb1dd95742650f1fdbac69f25b Mon Sep 17 00:00:00 2001 From: Jia He Date: Fri, 30 Sep 2016 11:29:01 +0800 Subject: proc: Reduce cache miss in sctp_snmp_seq_show This is to use the generic interfaces snmp_get_cpu_field{,64}_batch to aggregate the data by going through all the items of each cpu sequentially. Signed-off-by: Jia He Signed-off-by: David S. Miller --- net/sctp/proc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/proc.c b/net/sctp/proc.c index ef8ba77a5bea..09e16c2b5c17 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -73,13 +73,17 @@ static const struct snmp_mib sctp_snmp_list[] = { /* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */ static int sctp_snmp_seq_show(struct seq_file *seq, void *v) { + unsigned long buff[SCTP_MIB_MAX]; struct net *net = seq->private; int i; + memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX); + + snmp_get_cpu_field_batch(buff, sctp_snmp_list, + net->sctp.sctp_statistics); for (i = 0; sctp_snmp_list[i].name != NULL; i++) seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, - snmp_fold_field(net->sctp.sctp_statistics, - sctp_snmp_list[i].entry)); + buff[i]); return 0; } -- cgit v1.2.1 From 07613873f1731aa47fdf06a1cbd2e3cd1974c026 Mon Sep 17 00:00:00 2001 From: Jia He Date: Fri, 30 Sep 2016 11:29:02 +0800 Subject: proc: Reduce cache miss in xfrm_statistics_seq_show This is to use the generic interfaces snmp_get_cpu_field{,64}_batch to aggregate the data by going through all the items of each cpu sequentially. Signed-off-by: Jia He Signed-off-by: David S. Miller --- net/xfrm/xfrm_proc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index 9c4fbd8935f4..ba2b539879bc 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -50,12 +50,18 @@ static const struct snmp_mib xfrm_mib_list[] = { static int xfrm_statistics_seq_show(struct seq_file *seq, void *v) { + unsigned long buff[LINUX_MIB_XFRMMAX]; struct net *net = seq->private; int i; + + memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX); + + snmp_get_cpu_field_batch(buff, xfrm_mib_list, + net->mib.xfrm_statistics); for (i = 0; xfrm_mib_list[i].name; i++) seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name, - snmp_fold_field(net->mib.xfrm_statistics, - xfrm_mib_list[i].entry)); + buff[i]); + return 0; } -- cgit v1.2.1 From aca05671d58cea06dc60bbe554b8b399af7da409 Mon Sep 17 00:00:00 2001 From: Jia He Date: Fri, 30 Sep 2016 11:29:03 +0800 Subject: ipv6: Remove useless parameter in __snmp6_fill_statsdev The parameter items(is always ICMP6_MIB_MAX) is useless for __snmp6_fill_statsdev Signed-off-by: Jia He Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2f1f5d439788..35d4baa55c9d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4961,18 +4961,18 @@ static inline size_t inet6_if_nlmsg_size(void) } static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib, - int items, int bytes) + int bytes) { int i; - int pad = bytes - sizeof(u64) * items; + int pad = bytes - sizeof(u64) * ICMP6_MIB_MAX; BUG_ON(pad < 0); /* Use put_unaligned() because stats may not be aligned for u64. */ - put_unaligned(items, &stats[0]); - for (i = 1; i < items; i++) + put_unaligned(ICMP6_MIB_MAX, &stats[0]); + for (i = 1; i < ICMP6_MIB_MAX; i++) put_unaligned(atomic_long_read(&mib[i]), &stats[i]); - memset(&stats[items], 0, pad); + memset(&stats[ICMP6_MIB_MAX], 0, pad); } static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib, @@ -5005,7 +5005,7 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, offsetof(struct ipstats_mib, syncp)); break; case IFLA_INET6_ICMP6STATS: - __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes); + __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, bytes); break; } } -- cgit v1.2.1 From 6d4a741cbbfa6612a479656654ca5edf7becc72c Mon Sep 17 00:00:00 2001 From: Jia He Date: Fri, 30 Sep 2016 11:29:04 +0800 Subject: net: Suppress the "Comparison to NULL could be written" warnings This is to suppress the checkpatch.pl warning "Comparison to NULL could be written". No functional changes here. Signed-off-by: Jia He Signed-off-by: David S. Miller --- net/ipv4/proc.c | 32 ++++++++++++++++---------------- net/sctp/proc.c | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f51fc8803154..7143ca1a6af9 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -358,22 +358,22 @@ static void icmp_put(struct seq_file *seq) atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); - for (i = 0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); seq_puts(seq, " OutMsgs OutErrors"); - for (i = 0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); - for (i = 0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); - for (i = 0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); } @@ -390,7 +390,7 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64)); seq_puts(seq, "Ip: Forwarding DefaultTTL"); - for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) + for (i = 0; snmp4_ipstats_list[i].name; i++) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", @@ -401,7 +401,7 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); - for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) + for (i = 0; snmp4_ipstats_list[i].name; i++) seq_printf(seq, " %llu", buff64[i]); return 0; @@ -416,13 +416,13 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); seq_puts(seq, "\nTcp:"); - for (i = 0; snmp4_tcp_list[i].name != NULL; i++) + for (i = 0; snmp4_tcp_list[i].name; i++) seq_printf(seq, " %s", snmp4_tcp_list[i].name); seq_puts(seq, "\nTcp:"); snmp_get_cpu_field_batch(buff, snmp4_tcp_list, net->mib.tcp_statistics); - for (i = 0; snmp4_tcp_list[i].name != NULL; i++) { + for (i = 0; snmp4_tcp_list[i].name; i++) { /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", buff[i]); @@ -435,10 +435,10 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) snmp_get_cpu_field_batch(buff, snmp4_udp_list, net->mib.udp_statistics); seq_puts(seq, "\nUdp:"); - for (i = 0; snmp4_udp_list[i].name != NULL; i++) + for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdp:"); - for (i = 0; snmp4_udp_list[i].name != NULL; i++) + for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %lu", buff[i]); memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); @@ -447,10 +447,10 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) seq_puts(seq, "\nUdpLite:"); snmp_get_cpu_field_batch(buff, snmp4_udp_list, net->mib.udplite_statistics); - for (i = 0; snmp4_udp_list[i].name != NULL; i++) + for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdpLite:"); - for (i = 0; snmp4_udp_list[i].name != NULL; i++) + for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %lu", buff[i]); seq_putc(seq, '\n'); @@ -493,21 +493,21 @@ static int netstat_seq_show(struct seq_file *seq, void *v) struct net *net = seq->private; seq_puts(seq, "TcpExt:"); - for (i = 0; snmp4_net_list[i].name != NULL; i++) + for (i = 0; snmp4_net_list[i].name; i++) seq_printf(seq, " %s", snmp4_net_list[i].name); seq_puts(seq, "\nTcpExt:"); - for (i = 0; snmp4_net_list[i].name != NULL; i++) + for (i = 0; snmp4_net_list[i].name; i++) seq_printf(seq, " %lu", snmp_fold_field(net->mib.net_statistics, snmp4_net_list[i].entry)); seq_puts(seq, "\nIpExt:"); - for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) + for (i = 0; snmp4_ipextstats_list[i].name; i++) seq_printf(seq, " %s", snmp4_ipextstats_list[i].name); seq_puts(seq, "\nIpExt:"); - for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) + for (i = 0; snmp4_ipextstats_list[i].name; i++) seq_printf(seq, " %llu", snmp_fold_field64(net->mib.ip_statistics, snmp4_ipextstats_list[i].entry, diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 09e16c2b5c17..206377fe91ec 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -81,7 +81,7 @@ static int sctp_snmp_seq_show(struct seq_file *seq, void *v) snmp_get_cpu_field_batch(buff, sctp_snmp_list, net->sctp.sctp_statistics); - for (i = 0; sctp_snmp_list[i].name != NULL; i++) + for (i = 0; sctp_snmp_list[i].name; i++) seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, buff[i]); -- cgit v1.2.1 From bd11f0741fa5a2c296629898ad07759dd12b35bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 27 Sep 2016 23:57:58 -0700 Subject: ipv6 addrconf: implement RFC7559 router solicitation backoff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This implements: https://tools.ietf.org/html/rfc7559 Backoff is performed according to RFC3315 section 14: https://tools.ietf.org/html/rfc3315#section-14 We allow setting /proc/sys/net/ipv6/conf/*/router_solicitations to a negative value meaning an unlimited number of retransmits, and we make this the new default (inline with the RFC). We also add a new setting: /proc/sys/net/ipv6/conf/*/router_solicitation_max_interval defaulting to 1 hour (per RFC recommendation). Signed-off-by: Maciej Żenczykowski Acked-by: Erik Kline Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 35d4baa55c9d..87183983724d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -112,6 +112,27 @@ static inline u32 cstamp_delta(unsigned long cstamp) return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } +static inline s32 rfc3315_s14_backoff_init(s32 irt) +{ + /* multiply 'initial retransmission time' by 0.9 .. 1.1 */ + u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt; + do_div(tmp, 1000000); + return (s32)tmp; +} + +static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt) +{ + /* multiply 'retransmission timeout' by 1.9 .. 2.1 */ + u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt; + do_div(tmp, 1000000); + if ((s32)tmp > mrt) { + /* multiply 'maximum retransmission time' by 0.9 .. 1.1 */ + tmp = (900000 + prandom_u32() % 200001) * (u64)mrt; + do_div(tmp, 1000000); + } + return (s32)tmp; +} + #ifdef CONFIG_SYSCTL static int addrconf_sysctl_register(struct inet6_dev *idev); static void addrconf_sysctl_unregister(struct inet6_dev *idev); @@ -187,6 +208,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, @@ -232,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, @@ -3687,7 +3710,7 @@ static void addrconf_rs_timer(unsigned long data) if (idev->if_flags & IF_RA_RCVD) goto out; - if (idev->rs_probes++ < idev->cnf.rtr_solicits) { + if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) { write_unlock(&idev->lock); if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) ndisc_send_rs(dev, &lladdr, @@ -3696,11 +3719,13 @@ static void addrconf_rs_timer(unsigned long data) goto put; write_lock(&idev->lock); + idev->rs_interval = rfc3315_s14_backoff_update( + idev->rs_interval, idev->cnf.rtr_solicit_max_interval); /* The wait after the last probe can be shorter */ addrconf_mod_rs_timer(idev, (idev->rs_probes == idev->cnf.rtr_solicits) ? idev->cnf.rtr_solicit_delay : - idev->cnf.rtr_solicit_interval); + idev->rs_interval); } else { /* * Note: we do not support deprecated "all on-link" @@ -3949,7 +3974,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp); send_rs = send_mld && ipv6_accept_ra(ifp->idev) && - ifp->idev->cnf.rtr_solicits > 0 && + ifp->idev->cnf.rtr_solicits != 0 && (dev->flags&IFF_LOOPBACK) == 0; read_unlock_bh(&ifp->idev->lock); @@ -3971,10 +3996,11 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) write_lock_bh(&ifp->idev->lock); spin_lock(&ifp->lock); + ifp->idev->rs_interval = rfc3315_s14_backoff_init( + ifp->idev->cnf.rtr_solicit_interval); ifp->idev->rs_probes = 1; ifp->idev->if_flags |= IF_RS_SENT; - addrconf_mod_rs_timer(ifp->idev, - ifp->idev->cnf.rtr_solicit_interval); + addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval); spin_unlock(&ifp->lock); write_unlock_bh(&ifp->idev->lock); } @@ -4891,6 +4917,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; array[DEVCONF_RTR_SOLICIT_INTERVAL] = jiffies_to_msecs(cnf->rtr_solicit_interval); + array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] = + jiffies_to_msecs(cnf->rtr_solicit_max_interval); array[DEVCONF_RTR_SOLICIT_DELAY] = jiffies_to_msecs(cnf->rtr_solicit_delay); array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; @@ -5099,7 +5127,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) return -EINVAL; if (!ipv6_accept_ra(idev)) return -EINVAL; - if (idev->cnf.rtr_solicits <= 0) + if (idev->cnf.rtr_solicits == 0) return -EINVAL; write_lock_bh(&idev->lock); @@ -5128,8 +5156,10 @@ update_lft: if (update_rs) { idev->if_flags |= IF_RS_SENT; + idev->rs_interval = rfc3315_s14_backoff_init( + idev->cnf.rtr_solicit_interval); idev->rs_probes = 1; - addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval); + addrconf_mod_rs_timer(idev, idev->rs_interval); } /* Well, that's kinda nasty ... */ @@ -5777,6 +5807,13 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "router_solicitation_max_interval", + .data = &ipv6_devconf.rtr_solicit_max_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, { .procname = "router_solicitation_delay", .data = &ipv6_devconf.rtr_solicit_delay, -- cgit v1.2.1 From 0605483f6ace1f6b63e397c819a115ddcd13af0d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 29 Sep 2016 02:37:27 +0800 Subject: sctp: remove prsctp_param from sctp_chunk Now sctp uses chunk->prsctp_param to save the prsctp param for all the prsctp polices, we didn't need to introduce prsctp_param to sctp_chunk. We can just use chunk->sinfo.sinfo_timetolive for RTX and BUF polices, and reuse msg->expires_at for TTL policy, as the prsctp polices and old expires policy are mutual exclusive. This patch is to remove prsctp_param from sctp_chunk, and reuse msg's expires_at for TTL and chunk's sinfo.sinfo_timetolive for RTX and BUF polices. Note that sctp can't use chunk's sinfo.sinfo_timetolive for TTL policy, as it needs a u64 variables to save the expires_at time. This one also fixes the "netperf-Throughput_Mbps -37.2% regression" issue. Fixes: a6c2f792873a ("sctp: implement prsctp TTL policy") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/chunk.c | 9 +++++++-- net/sctp/outqueue.c | 4 ++-- net/sctp/sm_make_chunk.c | 15 --------------- 3 files changed, 9 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index a55e54738b81..14990e21cf55 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -179,6 +179,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, msg, msg->expires_at, jiffies); } + if (asoc->prsctp_enable && + SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags)) + msg->expires_at = + jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); + /* This is the biggest possible DATA chunk that can fit into * the packet */ @@ -349,14 +354,14 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) } if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && - time_after(jiffies, chunk->prsctp_param)) { + time_after(jiffies, chunk->msg->expires_at)) { if (chunk->sent_count) chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; else chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && - chunk->sent_count > chunk->prsctp_param) { + chunk->sent_count > chunk->sinfo.sinfo_timetolive) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; return 1; } diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 72e54a416af6..e084f35d40d2 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -383,7 +383,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, list_for_each_entry_safe(chk, temp, queue, transmitted_list) { if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || - chk->prsctp_param <= sinfo->sinfo_timetolive) + chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; list_del_init(&chk->transmitted_list); @@ -418,7 +418,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, list_for_each_entry_safe(chk, temp, queue, list) { if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || - chk->prsctp_param <= sinfo->sinfo_timetolive) + chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; list_del_init(&chk->list); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 8c77b87a8565..46ffecc57214 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -706,20 +706,6 @@ nodata: return retval; } -static void sctp_set_prsctp_policy(struct sctp_chunk *chunk, - const struct sctp_sndrcvinfo *sinfo) -{ - if (!chunk->asoc->prsctp_enable) - return; - - if (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags)) - chunk->prsctp_param = - jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); - else if (SCTP_PR_RTX_ENABLED(sinfo->sinfo_flags) || - SCTP_PR_PRIO_ENABLED(sinfo->sinfo_flags)) - chunk->prsctp_param = sinfo->sinfo_timetolive; -} - /* Make a DATA chunk for the given association from the provided * parameters. However, do not populate the data payload. */ @@ -753,7 +739,6 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); - sctp_set_prsctp_policy(retval, sinfo); nodata: return retval; -- cgit v1.2.1 From be4947bf46cb0e7a7d089e03c61bab35f1e695ce Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 29 Sep 2016 02:37:28 +0800 Subject: sctp: change to check peer prsctp_capable when using prsctp polices Now before using prsctp polices, sctp uses asoc->prsctp_enable to check if prsctp is enabled. However asoc->prsctp_enable is set only means local host support prsctp, sctp should not abandon packet if peer host doesn't enable prsctp. So this patch is to use asoc->peer.prsctp_capable to check if prsctp is enabled on both side, instead of asoc->prsctp_enable, as asoc's peer.prsctp_capable is set only when local and peer both enable prsctp. Fixes: a6c2f792873a ("sctp: implement prsctp TTL policy") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/chunk.c | 4 ++-- net/sctp/outqueue.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 14990e21cf55..0a3dbec0a8fb 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -179,7 +179,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, msg, msg->expires_at, jiffies); } - if (asoc->prsctp_enable && + if (asoc->peer.prsctp_capable && SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags)) msg->expires_at = jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); @@ -340,7 +340,7 @@ errout: /* Check whether this message has expired. */ int sctp_chunk_abandoned(struct sctp_chunk *chunk) { - if (!chunk->asoc->prsctp_enable || + if (!chunk->asoc->peer.prsctp_capable || !SCTP_PR_POLICY(chunk->sinfo.sinfo_flags)) { struct sctp_datamsg *msg = chunk->msg; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index e084f35d40d2..107233da5cc9 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -326,7 +326,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) sctp_chunk_hold(chunk); sctp_outq_tail_data(q, chunk); - if (chunk->asoc->prsctp_enable && + if (chunk->asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) chunk->asoc->sent_cnt_removable++; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) @@ -442,7 +442,7 @@ void sctp_prsctp_prune(struct sctp_association *asoc, { struct sctp_transport *transport; - if (!asoc->prsctp_enable || !asoc->sent_cnt_removable) + if (!asoc->peer.prsctp_capable || !asoc->sent_cnt_removable) return; msg_len = sctp_prsctp_prune_sent(asoc, sinfo, @@ -1055,7 +1055,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) /* Mark as failed send. */ sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); - if (asoc->prsctp_enable && + if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; sctp_chunk_free(chunk); @@ -1347,7 +1347,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) tsn = ntohl(tchunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(&tchunk->transmitted_list); - if (asoc->prsctp_enable && + if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; sctp_chunk_free(tchunk); -- cgit v1.2.1 From 1cceda7849809a8857fd9f26efe8846506c710e1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 29 Sep 2016 02:55:44 +0800 Subject: sctp: fix the issue sctp_diag uses lock_sock in rcu_read_lock When sctp dumps all the ep->assocs, it needs to lock_sock first, but now it locks sock in rcu_read_lock, and lock_sock may sleep, which would break rcu_read_lock. This patch is to get and hold one sock when traversing the list. After that and get out of rcu_read_lock, lock and dump it. Then it will traverse the list again to get the next one until all sctp socks are dumped. For sctp_diag_dump_one, it fixes this issue by holding asoc and moving cb() out of rcu_read_lock in sctp_transport_lookup_process. Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sctp_diag.c | 58 ++++++++++++++++++++++++++++++++++++---------------- net/sctp/socket.c | 10 ++++++--- 2 files changed, 47 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index f3508aa75815..cef0cee182d4 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -272,28 +272,17 @@ out: return err; } -static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) +static int sctp_sock_dump(struct sock *sk, void *p) { - struct sctp_endpoint *ep = tsp->asoc->ep; + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_comm_param *commp = p; - struct sock *sk = ep->base.sk; struct sk_buff *skb = commp->skb; struct netlink_callback *cb = commp->cb; const struct inet_diag_req_v2 *r = commp->r; - struct sctp_association *assoc = - list_entry(ep->asocs.next, struct sctp_association, asocs); + struct sctp_association *assoc; int err = 0; - /* find the ep only once through the transports by this condition */ - if (tsp->asoc != assoc) - goto out; - - if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) - goto out; - lock_sock(sk); - if (sk != assoc->base.sk) - goto release; list_for_each_entry(assoc, &ep->asocs, asocs) { if (cb->args[4] < cb->args[1]) goto next; @@ -312,7 +301,7 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh) < 0) { cb->args[3] = 1; - err = 2; + err = 1; goto release; } cb->args[3] = 1; @@ -321,7 +310,7 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) { - err = 2; + err = 1; goto release; } next: @@ -333,10 +322,35 @@ next: cb->args[4] = 0; release: release_sock(sk); + sock_put(sk); return err; +} + +static int sctp_get_sock(struct sctp_transport *tsp, void *p) +{ + struct sctp_endpoint *ep = tsp->asoc->ep; + struct sctp_comm_param *commp = p; + struct sock *sk = ep->base.sk; + struct netlink_callback *cb = commp->cb; + const struct inet_diag_req_v2 *r = commp->r; + struct sctp_association *assoc = + list_entry(ep->asocs.next, struct sctp_association, asocs); + + /* find the ep only once through the transports by this condition */ + if (tsp->asoc != assoc) + goto out; + + if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) + goto out; + + sock_hold(sk); + cb->args[5] = (long)sk; + + return 1; + out: cb->args[2]++; - return err; + return 0; } static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) @@ -472,10 +486,18 @@ skip: * 2 : to record the transport pos of this time's traversal * 3 : to mark if we have dumped the ep info of the current asoc * 4 : to work as a temporary variable to traversal list + * 5 : to save the sk we get from travelsing the tsp list. */ if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE))) goto done; - sctp_for_each_transport(sctp_tsp_dump, net, cb->args[2], &commp); + +next: + cb->args[5] = 0; + sctp_for_each_transport(sctp_get_sock, net, cb->args[2], &commp); + + if (cb->args[5] && !sctp_sock_dump((struct sock *)cb->args[5], &commp)) + goto next; + done: cb->args[1] = cb->args[4]; cb->args[4] = 0; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9fc417a8b476..8ed2d99bde6d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4469,17 +4469,21 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), const union sctp_addr *paddr, void *p) { struct sctp_transport *transport; - int err = 0; + int err = -ENOENT; rcu_read_lock(); transport = sctp_addrs_lookup_transport(net, laddr, paddr); if (!transport || !sctp_transport_hold(transport)) goto out; - err = cb(transport, p); + + sctp_association_hold(transport->asoc); sctp_transport_put(transport); -out: rcu_read_unlock(); + err = cb(transport, p); + sctp_association_put(transport->asoc); + +out: return err; } EXPORT_SYMBOL_GPL(sctp_transport_lookup_process); -- cgit v1.2.1 From 8782def204e57f6a507ff425e4944df4e010751a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Sep 2016 09:26:12 +0100 Subject: rxrpc: Switch to Congestion Avoidance mode at cwnd==ssthresh Switch to Congestion Avoidance mode at cwnd == ssthresh rather than relying on cwnd getting incremented beyond ssthresh and the window size, the mode being shifted and then cwnd being corrected. We need to make sure we switch into CA mode so that we stop marking every packet for ACK. Signed-off-by: David Howells --- net/rxrpc/input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 1461d30583c9..21746f0f7ae0 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -57,7 +57,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, call->cong_ssthresh = max_t(unsigned int, summary->flight_size / 2, 2); cwnd = 1; - if (cwnd > call->cong_ssthresh && + if (cwnd >= call->cong_ssthresh && call->cong_mode == RXRPC_CALL_SLOW_START) { call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; call->cong_tstamp = skb->tstamp; @@ -82,7 +82,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, goto packet_loss_detected; if (summary->cumulative_acks > 0) cwnd += 1; - if (cwnd > call->cong_ssthresh) { + if (cwnd >= call->cong_ssthresh) { call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; call->cong_tstamp = skb->tstamp; } @@ -161,7 +161,7 @@ resume_normality: call->cong_dup_acks = 0; call->cong_extra = 0; call->cong_tstamp = skb->tstamp; - if (cwnd <= call->cong_ssthresh) + if (cwnd < call->cong_ssthresh) call->cong_mode = RXRPC_CALL_SLOW_START; else call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; -- cgit v1.2.1 From 0851115090a3eb9585d6a804a61e47f3d89ac2a8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Sep 2016 09:33:27 +0100 Subject: rxrpc: Reduce ssthresh to peer's receive window When we receive an ACK from the peer that tells us what the peer's receive window (rwind) is, we should reduce ssthresh to rwind if rwind is smaller than ssthresh. Signed-off-by: David Howells --- net/rxrpc/input.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 21746f0f7ae0..7993473e56bb 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -658,6 +658,8 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, if (rwind > RXRPC_RXTX_BUFF_SIZE - 1) rwind = RXRPC_RXTX_BUFF_SIZE - 1; call->tx_winsize = rwind; + if (call->cong_ssthresh > rwind) + call->cong_ssthresh = rwind; mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU)); -- cgit v1.2.1 From 775e5b71db6aca47d49d43d08751f2e8ebad7f60 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Sep 2016 13:26:03 +0100 Subject: rxrpc: The offset field in struct rxrpc_skb_priv is unnecessary The offset field in struct rxrpc_skb_priv is unnecessary as the value can always be calculated. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 - net/rxrpc/conn_event.c | 3 ++- net/rxrpc/input.c | 23 ++++++++++++----------- net/rxrpc/local_event.c | 3 ++- net/rxrpc/recvmsg.c | 6 ++---- net/rxrpc/rxkad.c | 9 ++++++--- 6 files changed, 24 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 539db54697f9..fd64a2bd1072 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -144,7 +144,6 @@ struct rxrpc_skb_priv { u8 nr_jumbo; /* Number of jumbo subpackets */ }; union { - unsigned int offset; /* offset into buffer of next read */ int remain; /* amount of space remaining for next write */ u32 error; /* network error code */ }; diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 37609ce89f52..3f9d8d7ec632 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -276,7 +276,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return 0; case RXRPC_PACKET_TYPE_ABORT: - if (skb_copy_bits(skb, sp->offset, &wtmp, sizeof(wtmp)) < 0) + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &wtmp, sizeof(wtmp)) < 0) return -EPROTO; abort_code = ntohl(wtmp); _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 7993473e56bb..5ba35b4a907b 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -358,7 +358,7 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) static bool rxrpc_validate_jumbo(struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int offset = sp->offset; + unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len = skb->len; int nr_jumbo = 1; u8 flags = sp->hdr.flags; @@ -419,7 +419,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, u16 skew) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int offset = sp->offset; + unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int ix; rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0; rxrpc_seq_t seq = sp->hdr.seq, hard_ack; @@ -746,15 +746,16 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, } buf; rxrpc_serial_t acked_serial; rxrpc_seq_t first_soft_ack, hard_ack; - int nr_acks, offset; + int nr_acks, offset, ioffset; _enter(""); - if (skb_copy_bits(skb, sp->offset, &buf.ack, sizeof(buf.ack)) < 0) { + offset = sizeof(struct rxrpc_wire_header); + if (skb_copy_bits(skb, offset, &buf.ack, sizeof(buf.ack)) < 0) { _debug("extraction failure"); return rxrpc_proto_abort("XAK", call, 0); } - sp->offset += sizeof(buf.ack); + offset += sizeof(buf.ack); acked_serial = ntohl(buf.ack.serial); first_soft_ack = ntohl(buf.ack.firstPacket); @@ -792,9 +793,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_propose_ack_respond_to_ack); } - offset = sp->offset + nr_acks + 3; - if (skb->len >= offset + sizeof(buf.info)) { - if (skb_copy_bits(skb, offset, &buf.info, sizeof(buf.info)) < 0) + ioffset = offset + nr_acks + 3; + if (skb->len >= ioffset + sizeof(buf.info)) { + if (skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0) return rxrpc_proto_abort("XAI", call, 0); rxrpc_input_ackinfo(call, skb, &buf.info); } @@ -832,7 +833,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_rotate_tx_window(call, hard_ack, &summary); if (nr_acks > 0) { - if (skb_copy_bits(skb, sp->offset, buf.acks, nr_acks) < 0) + if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0) return rxrpc_proto_abort("XSA", call, 0); rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks, &summary); @@ -880,7 +881,8 @@ static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) _enter(""); if (skb->len >= 4 && - skb_copy_bits(skb, sp->offset, &wtmp, sizeof(wtmp)) >= 0) + skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &wtmp, sizeof(wtmp)) >= 0) abort_code = ntohl(wtmp); _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); @@ -996,7 +998,6 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) sp->hdr.securityIndex = whdr.securityIndex; sp->hdr._rsvd = ntohs(whdr._rsvd); sp->hdr.serviceId = ntohs(whdr.serviceId); - sp->offset = sizeof(whdr); return 0; } diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 190f68bd9e27..540d3955c1bc 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -95,7 +95,8 @@ void rxrpc_process_local_events(struct rxrpc_local *local) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: - if (skb_copy_bits(skb, sp->offset, &v, 1) < 0) + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &v, 1) < 0) return; _proto("Rx VERSION { %02x }", v); if (v == 0) diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 038ae62ddb4d..f05ea0a88076 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -261,15 +261,13 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, u8 *_annotation, unsigned int *_offset, unsigned int *_len) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int offset = *_offset; + unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len = *_len; int ret; u8 annotation = *_annotation; /* Locate the subpacket */ - offset = sp->offset; - len = skb->len - sp->offset; + len = skb->len - offset; if ((annotation & RXRPC_RX_ANNO_JUMBO) > 0) { offset += (((annotation & RXRPC_RX_ANNO_JUMBO) - 1) * RXRPC_JUMBO_SUBPKTLEN); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 88d080a1a3de..627abed5f999 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -771,7 +771,8 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, } abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, sp->offset, &challenge, sizeof(challenge)) < 0) + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &challenge, sizeof(challenge)) < 0) goto protocol_error; version = ntohl(challenge.version); @@ -1028,7 +1029,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, sp->offset, &response, sizeof(response)) < 0) + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &response, sizeof(response)) < 0) goto protocol_error; if (!pskb_pull(skb, sizeof(response))) BUG(); @@ -1057,7 +1059,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, return -ENOMEM; abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, sp->offset, ticket, ticket_len) < 0) + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + ticket, ticket_len) < 0) goto protocol_error_free; ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key, -- cgit v1.2.1 From c31410ea009d10501ea90f64cdda0083c8cf0161 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Sep 2016 13:42:31 +0100 Subject: rxrpc: Remove error from struct rxrpc_skb_priv as it is unused Remove error from struct rxrpc_skb_priv as it is no longer used. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index fd64a2bd1072..141c1458e719 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -145,7 +145,6 @@ struct rxrpc_skb_priv { }; union { int remain; /* amount of space remaining for next write */ - u32 error; /* network error code */ }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ -- cgit v1.2.1 From df0adc788ae74e35ab1a79f3db878df7fdc7db55 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 26 Sep 2016 22:12:49 +0100 Subject: rxrpc: Keep the call timeouts as ktimes rather than jiffies Keep that call timeouts as ktimes rather than jiffies so that they can be expressed as functions of RTT. Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 8 +++--- net/rxrpc/call_event.c | 73 ++++++++++++++++++++++++++----------------------- net/rxrpc/call_object.c | 16 ++++------- net/rxrpc/input.c | 3 +- net/rxrpc/misc.c | 15 ++++++---- net/rxrpc/sendmsg.c | 8 +++--- net/rxrpc/sysctl.c | 8 +++--- 7 files changed, 69 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 141c1458e719..d38dffd78085 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -464,9 +464,9 @@ struct rxrpc_call { struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ - unsigned long ack_at; /* When deferred ACK needs to happen */ - unsigned long resend_at; /* When next resend needs to happen */ - unsigned long expire_at; /* When the call times out */ + ktime_t ack_at; /* When deferred ACK needs to happen */ + ktime_t resend_at; /* When next resend needs to happen */ + ktime_t expire_at; /* When the call times out */ struct timer_list timer; /* Combined event timer */ struct work_struct processor; /* Event processor */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ @@ -805,7 +805,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace); +void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool, enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 1f6c7633b964..9ff3bb3ffb41 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -24,28 +24,40 @@ /* * Set the timer */ -void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why) +void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, + ktime_t now) { - unsigned long t, now = jiffies; + unsigned long t_j, now_j = jiffies; + ktime_t t; read_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) { t = call->expire_at; - if (time_before_eq(t, now)) + if (!ktime_after(t, now)) goto out; - if (time_after(call->resend_at, now) && - time_before(call->resend_at, t)) + if (ktime_after(call->resend_at, now) && + ktime_before(call->resend_at, t)) t = call->resend_at; - if (time_after(call->ack_at, now) && - time_before(call->ack_at, t)) + if (ktime_after(call->ack_at, now) && + ktime_before(call->ack_at, t)) t = call->ack_at; - if (call->timer.expires != t || !timer_pending(&call->timer)) { - mod_timer(&call->timer, t); - trace_rxrpc_timer(call, why, now); + t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now))); + t_j += jiffies; + + /* We have to make sure that the calculated jiffies value falls + * at or after the nsec value, or we may loop ceaselessly + * because the timer times out, but we haven't reached the nsec + * timeout yet. + */ + t_j++; + + if (call->timer.expires != t_j || !timer_pending(&call->timer)) { + mod_timer(&call->timer, t_j); + trace_rxrpc_timer(call, why, now, now_j); } } @@ -62,7 +74,8 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, enum rxrpc_propose_ack_trace why) { enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; - unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay; + unsigned int expiry = rxrpc_soft_ack_delay; + ktime_t now, ack_at; s8 prior = rxrpc_ack_priority[ack_reason]; /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial @@ -111,7 +124,6 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, break; } - now = jiffies; if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { _debug("already scheduled"); } else if (immediate || expiry == 0) { @@ -120,11 +132,11 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, background) rxrpc_queue_call(call); } else { - ack_at = now + expiry; - _debug("deferred ACK %ld < %ld", expiry, call->ack_at - now); - if (time_before(ack_at, call->ack_at)) { + now = ktime_get_real(); + ack_at = ktime_add_ms(now, expiry); + if (ktime_before(ack_at, call->ack_at)) { call->ack_at = ack_at; - rxrpc_set_timer(call, rxrpc_timer_set_for_ack); + rxrpc_set_timer(call, rxrpc_timer_set_for_ack, now); } } @@ -157,12 +169,12 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) /* * Perform retransmission of NAK'd and unack'd packets. */ -static void rxrpc_resend(struct rxrpc_call *call) +static void rxrpc_resend(struct rxrpc_call *call, ktime_t now) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t cursor, seq, top; - ktime_t now = ktime_get_real(), max_age, oldest, resend_at, ack_ts; + ktime_t max_age, oldest, ack_ts; int ix; u8 annotation, anno_type, retrans = 0, unacked = 0; @@ -212,14 +224,7 @@ static void rxrpc_resend(struct rxrpc_call *call) ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } - resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout); - call->resend_at = jiffies + - nsecs_to_jiffies(ktime_to_ns(ktime_sub(resend_at, now))) + - 1; /* We have to make sure that the calculated jiffies value - * falls at or after the nsec value, or we shall loop - * ceaselessly because the timer times out, but we haven't - * reached the nsec timeout yet. - */ + call->resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout); if (unacked) rxrpc_congestion_timeout(call); @@ -229,7 +234,7 @@ static void rxrpc_resend(struct rxrpc_call *call) * retransmitting data. */ if (!retrans) { - rxrpc_set_timer(call, rxrpc_timer_set_for_resend); + rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now); spin_unlock_bh(&call->lock); ack_ts = ktime_sub(now, call->acks_latest_ts); if (ktime_to_ns(ack_ts) < call->peer->rtt) @@ -301,7 +306,7 @@ void rxrpc_process_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); - unsigned long now; + ktime_t now; rxrpc_see_call(call); @@ -320,15 +325,15 @@ recheck_state: goto out_put; } - now = jiffies; - if (time_after_eq(now, call->expire_at)) { + now = ktime_get_real(); + if (ktime_before(call->expire_at, now)) { rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); set_bit(RXRPC_CALL_EV_ABORT, &call->events); goto recheck_state; } if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || - time_after_eq(now, call->ack_at)) { + ktime_before(call->ack_at, now)) { call->ack_at = call->expire_at; if (call->ackr_reason) { rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); @@ -337,12 +342,12 @@ recheck_state: } if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events) || - time_after_eq(now, call->resend_at)) { - rxrpc_resend(call); + ktime_before(call->resend_at, now)) { + rxrpc_resend(call, now); goto recheck_state; } - rxrpc_set_timer(call, rxrpc_timer_set_for_resend); + rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now); /* other events may have been raised since we started checking */ if (call->events && call->state < RXRPC_CALL_COMPLETE) { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index d4b3293b78fa..456ab752d473 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -19,11 +19,6 @@ #include #include "ar-internal.h" -/* - * Maximum lifetime of a call (in jiffies). - */ -unsigned int rxrpc_max_call_lifetime = 60 * HZ; - const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_UNINITIALISED] = "Uninit ", [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn", @@ -77,7 +72,8 @@ static void rxrpc_call_timer_expired(unsigned long _call) _enter("%d", call->debug_id); if (call->state < RXRPC_CALL_COMPLETE) { - trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies); + trace_rxrpc_timer(call, rxrpc_timer_expired, + ktime_get_real(), jiffies); rxrpc_queue_call(call); } } @@ -207,14 +203,14 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, */ static void rxrpc_start_call_timer(struct rxrpc_call *call) { - unsigned long expire_at; + ktime_t now = ktime_get_real(), expire_at; - expire_at = jiffies + rxrpc_max_call_lifetime; + expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime); call->expire_at = expire_at; call->ack_at = expire_at; call->resend_at = expire_at; - call->timer.expires = expire_at + 1; - rxrpc_set_timer(call, rxrpc_timer_begin); + call->timer.expires = jiffies + LONG_MAX / 2; + rxrpc_set_timer(call, rxrpc_timer_begin, now); } /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 5ba35b4a907b..3ad9f75031e3 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -328,7 +328,8 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) call->resend_at = call->expire_at; call->ack_at = call->expire_at; spin_unlock_bh(&call->lock); - rxrpc_set_timer(call, rxrpc_timer_init_for_reply); + rxrpc_set_timer(call, rxrpc_timer_init_for_reply, + ktime_get_real()); } if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 47dddacdbb91..9d1c721bc4e8 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -20,29 +20,34 @@ */ unsigned int rxrpc_max_backlog __read_mostly = 10; +/* + * Maximum lifetime of a call (in mx). + */ +unsigned int rxrpc_max_call_lifetime = 60 * 1000; + /* * How long to wait before scheduling ACK generation after seeing a - * packet with RXRPC_REQUEST_ACK set (in jiffies). + * packet with RXRPC_REQUEST_ACK set (in ms). */ unsigned int rxrpc_requested_ack_delay = 1; /* - * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). + * How long to wait before scheduling an ACK with subtype DELAY (in ms). * * We use this when we've received new data packets. If those packets aren't * all consumed within this time we will send a DELAY ACK if an ACK was not * requested to let the sender know it doesn't need to resend. */ -unsigned int rxrpc_soft_ack_delay = 1 * HZ; +unsigned int rxrpc_soft_ack_delay = 1 * 1000; /* - * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). + * How long to wait before scheduling an ACK with subtype IDLE (in ms). * * We use this when we've consumed some previously soft-ACK'd packets when * further packets aren't immediately received to decide when to send an IDLE * ACK let the other end know that it can free up its Tx buffer space. */ -unsigned int rxrpc_idle_ack_delay = 0.5 * HZ; +unsigned int rxrpc_idle_ack_delay = 0.5 * 1000; /* * Receive window size in packets. This indicates the maximum number of diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index d8dfdce874d8..3322543d460a 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -149,13 +149,13 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, _debug("need instant resend %d", ret); rxrpc_instant_resend(call, ix); } else { - unsigned long resend_at; + ktime_t now = ktime_get_real(), resend_at; - resend_at = jiffies + msecs_to_jiffies(rxrpc_resend_timeout); + resend_at = ktime_add_ms(now, rxrpc_resend_timeout); - if (time_before(resend_at, call->resend_at)) { + if (ktime_before(resend_at, call->resend_at)) { call->resend_at = resend_at; - rxrpc_set_timer(call, rxrpc_timer_set_for_send); + rxrpc_set_timer(call, rxrpc_timer_set_for_send, now); } } diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 13d1df03ebac..34c706d2f79c 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -35,7 +35,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_requested_ack_delay, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&zero, }, { @@ -43,7 +43,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_soft_ack_delay, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&one, }, { @@ -51,7 +51,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_idle_ack_delay, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&one, }, { @@ -85,7 +85,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_max_call_lifetime, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&one, }, -- cgit v1.2.1 From 405dea1debeb9956684de342903bba9ddd52f1cb Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Sep 2016 09:13:50 +0100 Subject: rxrpc: Fix the call timer handling The call timer's concept of a call timeout (of which there are three) that is inactive is that it is the timeout has the same expiration time as the call expiration timeout (the expiration timer is never inactive). However, I'm not resetting the timeouts when they expire, leading to repeated processing of expired timeouts when other timeout events occur. Fix this by: (1) Move the timer expiry detection into rxrpc_set_timer() inside the locked section. This means that if a timeout is set that will expire immediately, we deal with it immediately. (2) If a timeout is at or before now then it has expired. When an expiry is detected, an event is raised, the timeout is automatically inactivated and the event processor is queued. (3) If a timeout is at or after the expiry timeout then it is inactive. Inactive timeouts do not contribute to the timer setting. (4) The call timer callback can now just call rxrpc_set_timer() to handle things. (5) The call processor work function now checks the event flags rather than checking the timeouts directly. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 26 ++++++++++++++++++-------- net/rxrpc/call_object.c | 7 ++----- 2 files changed, 20 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 9ff3bb3ffb41..4f00476630b9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -29,6 +29,7 @@ void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, { unsigned long t_j, now_j = jiffies; ktime_t t; + bool queue = false; read_lock_bh(&call->state_lock); @@ -37,13 +38,21 @@ void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, if (!ktime_after(t, now)) goto out; - if (ktime_after(call->resend_at, now) && - ktime_before(call->resend_at, t)) + if (!ktime_after(call->resend_at, now)) { + call->resend_at = call->expire_at; + if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + queue = true; + } else if (ktime_before(call->resend_at, t)) { t = call->resend_at; + } - if (ktime_after(call->ack_at, now) && - ktime_before(call->ack_at, t)) + if (!ktime_after(call->ack_at, now)) { + call->ack_at = call->expire_at; + if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) + queue = true; + } else if (ktime_before(call->ack_at, t)) { t = call->ack_at; + } t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now))); t_j += jiffies; @@ -59,6 +68,9 @@ void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, mod_timer(&call->timer, t_j); trace_rxrpc_timer(call, why, now, now_j); } + + if (queue) + rxrpc_queue_call(call); } out: @@ -332,8 +344,7 @@ recheck_state: goto recheck_state; } - if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || - ktime_before(call->ack_at, now)) { + if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events)) { call->ack_at = call->expire_at; if (call->ackr_reason) { rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); @@ -341,8 +352,7 @@ recheck_state: } } - if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events) || - ktime_before(call->resend_at, now)) { + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) { rxrpc_resend(call, now); goto recheck_state; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 456ab752d473..364b42dc3dce 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -71,11 +71,8 @@ static void rxrpc_call_timer_expired(unsigned long _call) _enter("%d", call->debug_id); - if (call->state < RXRPC_CALL_COMPLETE) { - trace_rxrpc_timer(call, rxrpc_timer_expired, - ktime_get_real(), jiffies); - rxrpc_queue_call(call); - } + if (call->state < RXRPC_CALL_COMPLETE) + rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real()); } /* -- cgit v1.2.1 From d6169b0206db1c8c8d0e4c6b79fdf4b2fc6455f1 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 30 Sep 2016 15:24:31 -0700 Subject: net: Use ns_capable_noaudit() when determining net sysctl permissions The capability check should not be audited since it is only being used to determine the inode permissions. A failed check does not indicate a violation of security policy but, when an LSM is enabled, a denial audit message was being generated. The denial audit message caused confusion for some application authors because root-running Go applications always triggered the denial. To prevent this confusion, the capability check in net_ctl_permissions() is switched to the noaudit variant. BugLink: https://launchpad.net/bugs/1465724 Signed-off-by: Tyler Hicks Acked-by: Serge E. Hallyn Signed-off-by: James Morris [dtor: reapplied after e79c6a4fc923 ("net: make net namespace sysctls belong to container's owner") accidentally reverted the change.] Signed-off-by: Dmitry Torokhov Signed-off-by: David S. Miller --- net/sysctl_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 5bc1a3d57401..e0c71bd8f7cf 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -44,7 +44,7 @@ static int net_ctl_permissions(struct ctl_table_header *head, struct net *net = container_of(head->set, struct net, sysctls); /* Allow network administrator to have same access as root. */ - if (ns_capable(net->user_ns, CAP_NET_ADMIN)) { + if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN)) { int mode = (table->mode >> 6) & 7; return (mode << 6) | (mode << 3) | mode; } -- cgit v1.2.1 From d4ef9f72128d414ad83b27b49312faa971d77382 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Wed, 28 Sep 2016 15:05:28 -0700 Subject: netfilter: bridge: clarify bridge/netfilter message When using bridge without bridge netfilter enabled the message displayed is rather confusing and leads to belive that a deprecated feature is in use. Use IS_MODULE to be explicit that the message only affects users which use bridge netfilter as module and reword the message. Signed-off-by: Stefan Agner Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/bridge/br.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 3addc05b9a16..889e5640455f 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -227,9 +227,11 @@ static int __init br_init(void) br_fdb_test_addr_hook = br_fdb_test_addr; #endif - pr_info("bridge: automatic filtering via arp/ip/ip6tables has been " - "deprecated. Update your scripts to load br_netfilter if you " +#if IS_MODULE(CONFIG_BRIDGE_NETFILTER) + pr_info("bridge: filtering via arp/ip/ip6tables is no longer available " + "by default. Update your scripts to load br_netfilter if you " "need this.\n"); +#endif return 0; -- cgit v1.2.1 From cb9e684e89e69894cb6697a3fa1274a284d1d3bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Thu, 29 Sep 2016 00:33:43 -0700 Subject: ipv6 addrconf: remove addrconf_sysctl_hop_limit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is an effective no-op in terms of user observable behaviour. By preventing the overwrite of non-null extra1/extra2 fields in addrconf_sysctl() we can enable the use of proc_dointvec_minmax(). This allows us to eliminate the constant min/max (1..255) trampoline function that is addrconf_sysctl_hop_limit(). This is nice because it simplifies the code, and allows future sysctls with constant min/max limits to also not require trampolines. We still can't eliminate the trampoline for mtu because it isn't actually a constant (it depends on other tunables of the device) and thus requires at-write-time logic to enforce range. Signed-off-by: Maciej Żenczykowski Acked-by: Erik Kline Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 87183983724d..cbd9343751a2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5496,20 +5496,6 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_hop_limit(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table lctl; - int min_hl = 1, max_hl = 255; - - lctl = *ctl; - lctl.extra1 = &min_hl; - lctl.extra2 = &max_hl; - - return proc_dointvec_minmax(&lctl, write, buffer, lenp, ppos); -} - static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -5743,6 +5729,9 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, return ret; } +static const int one = 1; +static const int two_five_five = 255; + static const struct ctl_table addrconf_sysctl[] = { { .procname = "forwarding", @@ -5756,7 +5745,9 @@ static const struct ctl_table addrconf_sysctl[] = { .data = &ipv6_devconf.hop_limit, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = addrconf_sysctl_hop_limit, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&one, + .extra2 = (void *)&two_five_five, }, { .procname = "mtu", @@ -6081,8 +6072,14 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, for (i = 0; table[i].data; i++) { table[i].data += (char *)p - (char *)&ipv6_devconf; - table[i].extra1 = idev; /* embedded; no ref */ - table[i].extra2 = net; + /* If one of these is already set, then it is not safe to + * overwrite either of them: this makes proc_dointvec_minmax + * usable. + */ + if (!table[i].extra1 && !table[i].extra2) { + table[i].extra1 = idev; /* embedded; no ref */ + table[i].extra2 = net; + } } snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name); -- cgit v1.2.1 From 63d75463c91a5b5be7c0aca11ceb45ea5a0ae81d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 30 Sep 2016 16:56:45 +0200 Subject: net: pktgen: fix pkt_size The commit 879c7220e828 ("net: pktgen: Observe needed_headroom of the device") increased the 'pkt_overhead' field value by LL_RESERVED_SPACE. As a side effect the generated packet size, computed as: /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - pkt_dev->pkt_overhead; is decreased by the same value. The above changed slightly the behavior of existing pktgen users, and made the procfs interface somewhat inconsistent. Fix it by restoring the previous pkt_overhead value and using LL_RESERVED_SPACE as extralen in skb allocation. Also, change pktgen_alloc_skb() to only partially reserve the headroom to allow the caller to prefetch from ll header start. v1 -> v2: - fixed some typos in the comments Fixes: 879c7220e828 ("net: pktgen: Observe needed_headroom of the device") Suggested-by: Ben Greear Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/core/pktgen.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index bbd118b19aef..5219a9e2127a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2286,7 +2286,7 @@ out: static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) { - pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev); + pkt_dev->pkt_overhead = 0; pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32); pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev); pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); @@ -2777,13 +2777,13 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, } static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, - struct pktgen_dev *pkt_dev, - unsigned int extralen) + struct pktgen_dev *pkt_dev) { + unsigned int extralen = LL_RESERVED_SPACE(dev); struct sk_buff *skb = NULL; - unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen + - pkt_dev->pkt_overhead; + unsigned int size; + size = pkt_dev->cur_pkt_size + 64 + extralen + pkt_dev->pkt_overhead; if (pkt_dev->flags & F_NODE) { int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id(); @@ -2796,8 +2796,9 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } + /* the caller pre-fetches from skb->data and reserves for the mac hdr */ if (likely(skb)) - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, extralen - 16); return skb; } @@ -2830,16 +2831,14 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, mod_cur_headers(pkt_dev); queue_map = pkt_dev->cur_queue_map; - datalen = (odev->hard_header_len + 16) & ~0xf; - - skb = pktgen_alloc_skb(odev, pkt_dev, datalen); + skb = pktgen_alloc_skb(odev, pkt_dev); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; } prefetchw(skb->data); - skb_reserve(skb, datalen); + skb_reserve(skb, 16); /* Reserve for ethernet and IP header */ eth = (__u8 *) skb_push(skb, 14); @@ -2959,7 +2958,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, mod_cur_headers(pkt_dev); queue_map = pkt_dev->cur_queue_map; - skb = pktgen_alloc_skb(odev, pkt_dev, 16); + skb = pktgen_alloc_skb(odev, pkt_dev); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; -- cgit v1.2.1 From fa34cd94fb01fcb8d79d91e009451b37692e94e5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 30 Sep 2016 18:13:49 +0200 Subject: net: rtnl: avoid uninitialized data in IFLA_VF_VLAN_LIST handling With the newly added support for IFLA_VF_VLAN_LIST netlink messages, we get a warning about potential uninitialized variable use in the parsing of the user input when enabling the -Wmaybe-uninitialized warning: net/core/rtnetlink.c: In function 'do_setvfinfo': net/core/rtnetlink.c:1756:9: error: 'ivvl$' may be used uninitialized in this function [-Werror=maybe-uninitialized] I have not been able to prove whether it is possible to arrive in this code with an empty IFLA_VF_VLAN_LIST block, but if we do, then ndo_set_vf_vlan gets called with uninitialized arguments. This adds an explicit check for an empty list, making it obvious to the reader and the compiler that this cannot happen. Fixes: 79aab093a0b5 ("net: Update API for VF vlan protocol 802.1ad support") Signed-off-by: Arnd Bergmann Reviewed-by: Moshe Shemesh Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3ac8946bf244..b06d2f46b83e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1753,6 +1753,9 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) len++; } + if (len == 0) + return -EINVAL; + err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) -- cgit v1.2.1 From f7d49bce8e741e1e6aa14ce4db1b6cea7e4be4e8 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 30 Sep 2016 19:08:05 +0200 Subject: openvswitch: mpls: set network header correctly on key extract After the 48d2ab609b6b ("net: mpls: Fixups for GSO"), MPLS handling in openvswitch was changed to have network header pointing to the start of the MPLS headers and inner_network_header pointing after the MPLS headers. However, key_extract was missed by the mentioned commit, causing incorrect headers to be set when a MPLS packet just enters the bridge or after it is recirculated. Fixes: 48d2ab609b6b ("net: mpls: Fixups for GSO") Signed-off-by: Jiri Benc Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 634cc10d6dee..c8c82e109c68 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -633,12 +633,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) } else if (eth_p_mpls(key->eth.type)) { size_t stack_len = MPLS_HLEN; - /* In the presence of an MPLS label stack the end of the L2 - * header and the beginning of the L3 header differ. - * - * Advance network_header to the beginning of the L3 - * header. mac_len corresponds to the end of the L2 header. - */ + skb_set_inner_network_header(skb, skb->mac_len); while (1) { __be32 lse; @@ -646,12 +641,12 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (unlikely(error)) return 0; - memcpy(&lse, skb_network_header(skb), MPLS_HLEN); + memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN); if (stack_len == MPLS_HLEN) memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); - skb_set_network_header(skb, skb->mac_len + stack_len); + skb_set_inner_network_header(skb, skb->mac_len + stack_len); if (lse & htonl(MPLS_LS_S_MASK)) break; -- cgit v1.2.1 From 9095e10edd28e1e4a10ba5ca61fb54d9f74f8968 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 30 Sep 2016 19:08:06 +0200 Subject: mpls: move mpls_hdr to a common location This will be also used by openvswitch. Signed-off-by: Jiri Benc Acked-by: David Ahern Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/mpls/internal.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'net') diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 732a5c17e986..bdfef6c3271a 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -1,9 +1,6 @@ #ifndef MPLS_INTERNAL_H #define MPLS_INTERNAL_H - -struct mpls_shim_hdr { - __be32 label_stack_entry; -}; +#include struct mpls_entry_decoded { u32 label; @@ -93,11 +90,6 @@ struct mpls_route { /* next hop label forwarding entry */ #define endfor_nexthops(rt) } -static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) -{ - return (struct mpls_shim_hdr *)skb_network_header(skb); -} - static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos) { struct mpls_shim_hdr result; -- cgit v1.2.1 From 85de4a2101acb85c3b1dde465e84596ccca99f2c Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 30 Sep 2016 19:08:07 +0200 Subject: openvswitch: use mpls_hdr skb_mpls_header is equivalent to mpls_hdr now. Use the existing helper instead. Signed-off-by: Jiri Benc Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 863e992dfbc0..4e03f64709bc 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -160,7 +160,7 @@ static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_mpls *mpls) { - __be32 *new_mpls_lse; + struct mpls_shim_hdr *new_mpls_lse; /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ if (skb->encapsulation) @@ -180,8 +180,8 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, skb_reset_mac_header(skb); skb_set_network_header(skb, skb->mac_len); - new_mpls_lse = (__be32 *)skb_mpls_header(skb); - *new_mpls_lse = mpls->mpls_lse; + new_mpls_lse = mpls_hdr(skb); + new_mpls_lse->label_stack_entry = mpls->mpls_lse; skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); @@ -202,7 +202,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, if (unlikely(err)) return err; - skb_postpull_rcsum(skb, skb_mpls_header(skb), MPLS_HLEN); + skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), skb->mac_len); @@ -211,10 +211,10 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, skb_reset_mac_header(skb); skb_set_network_header(skb, skb->mac_len); - /* skb_mpls_header() is used to locate the ethertype - * field correctly in the presence of VLAN tags. + /* mpls_hdr() is used to locate the ethertype field correctly in the + * presence of VLAN tags. */ - hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); + hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); update_ethertype(skb, hdr, ethertype); if (eth_p_mpls(skb->protocol)) skb->protocol = ethertype; @@ -226,7 +226,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, const __be32 *mpls_lse, const __be32 *mask) { - __be32 *stack; + struct mpls_shim_hdr *stack; __be32 lse; int err; @@ -234,16 +234,16 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, if (unlikely(err)) return err; - stack = (__be32 *)skb_mpls_header(skb); - lse = OVS_MASKED(*stack, *mpls_lse, *mask); + stack = mpls_hdr(skb); + lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask); if (skb->ip_summed == CHECKSUM_COMPLETE) { - __be32 diff[] = { ~(*stack), lse }; + __be32 diff[] = { ~(stack->label_stack_entry), lse }; skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } - *stack = lse; + stack->label_stack_entry = lse; flow_key->mpls.top_lse = lse; return 0; } -- cgit v1.2.1 From f39acc84aad10710e89835c60d3b6694c43a8dd9 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Thu, 29 Sep 2016 12:10:40 +0300 Subject: net/sched: act_vlan: Push skb->data to mac_header prior calling skb_vlan_*() functions Generic skb_vlan_push/skb_vlan_pop functions don't properly handle the case where the input skb data pointer does not point at the mac header: - They're doing push/pop, but fail to properly unwind data back to its original location. For example, in the skb_vlan_push case, any subsequent 'skb_push(skb, skb->mac_len)' calls make the skb->data point 4 bytes BEFORE start of frame, leading to bogus frames that may be transmitted. - They update rcsum per the added/removed 4 bytes tag. Alas if data is originally after the vlan/eth headers, then these bytes were already pulled out of the csum. OTOH calling skb_vlan_push/skb_vlan_pop with skb->data at mac_header present no issues. act_vlan is the only caller to skb_vlan_*() that has skb->data pointing at network header (upon ingress). Other calles (ovs, bpf) already adjust skb->data at mac_header. This patch fixes act_vlan to point to the mac_header prior calling skb_vlan_*() functions, as other callers do. Signed-off-by: Shmulik Ladkani Cc: Daniel Borkmann Cc: Pravin Shelar Cc: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_vlan.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index a95c00b119da..b57fcbcefea1 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -37,6 +37,12 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, bstats_update(&v->tcf_bstats, skb); action = v->tcf_action; + /* Ensure 'data' points at mac_header prior calling vlan manipulating + * functions. + */ + if (skb_at_tc_ingress(skb)) + skb_push_rcsum(skb, skb->mac_len); + switch (v->tcfv_action) { case TCA_VLAN_ACT_POP: err = skb_vlan_pop(skb); @@ -83,6 +89,9 @@ drop: action = TC_ACT_SHOT; v->tcf_qstats.drops++; unlock: + if (skb_at_tc_ingress(skb)) + skb_pull_rcsum(skb, skb->mac_len); + spin_unlock(&v->tcf_lock); return action; } -- cgit v1.2.1 From b6a7920848cab619b5e434fdc0338778c63ef3f3 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Thu, 29 Sep 2016 12:10:41 +0300 Subject: net: skbuff: Limit skb_vlan_pop/push() to expect skb->data at mac header skb_vlan_pop/push were too generic, trying to support the cases where skb->data is at mac header, and cases where skb->data is arbitrarily elsewhere. Supporting an arbitrary skb->data was complex and bogus: - It failed to unwind skb->data to its original location post actual pop/push. (Also, semantic is not well defined for unwinding: If data was into the eth header, need to use same offset from start; But if data was at network header or beyond, need to adjust the original offset according to the push/pull) - It mangled the rcsum post actual push/pop, without taking into account that the eth bytes might already have been pulled out of the csum. Most callers (ovs, bpf) already had their skb->data at mac_header upon invoking skb_vlan_pop/push. Last caller that failed to do so (act_vlan) has been recently fixed. Therefore, to simplify things, no longer support arbitrary skb->data inputs for skb_vlan_pop/push(). skb->data is expected to be exactly at mac_header; WARN otherwise. Signed-off-by: Shmulik Ladkani Cc: Daniel Borkmann Cc: Pravin Shelar Cc: Jiri Pirko Signed-off-by: David S. Miller --- net/core/skbuff.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d36c7548952f..cbd19d250947 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4528,13 +4528,18 @@ EXPORT_SYMBOL(skb_ensure_writable); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr; - unsigned int offset = skb->data - skb_mac_header(skb); + int offset = skb->data - skb_mac_header(skb); int err; - __skb_push(skb, offset); + if (WARN_ONCE(offset, + "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", + offset)) { + return -EINVAL; + } + err = skb_ensure_writable(skb, VLAN_ETH_HLEN); if (unlikely(err)) - goto pull; + return err; skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); @@ -4551,13 +4556,14 @@ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_len(skb); -pull: - __skb_pull(skb, offset); return err; } EXPORT_SYMBOL(__skb_vlan_pop); +/* Pop a vlan tag either from hwaccel or from payload. + * Expects skb->data at mac header. + */ int skb_vlan_pop(struct sk_buff *skb) { u16 vlan_tci; @@ -4588,29 +4594,30 @@ int skb_vlan_pop(struct sk_buff *skb) } EXPORT_SYMBOL(skb_vlan_pop); +/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). + * Expects skb->data at mac header. + */ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { if (skb_vlan_tag_present(skb)) { - unsigned int offset = skb->data - skb_mac_header(skb); + int offset = skb->data - skb_mac_header(skb); int err; - /* __vlan_insert_tag expect skb->data pointing to mac header. - * So change skb->data before calling it and change back to - * original position later - */ - __skb_push(skb, offset); + if (WARN_ONCE(offset, + "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", + offset)) { + return -EINVAL; + } + err = __vlan_insert_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); - if (err) { - __skb_pull(skb, offset); + if (err) return err; - } skb->protocol = skb->vlan_proto; skb->mac_len += VLAN_HLEN; skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); - __skb_pull(skb, offset); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; -- cgit v1.2.1 From 93409033ae653f1c9a949202fb537ab095b2092f Mon Sep 17 00:00:00 2001 From: Andrew Collins Date: Mon, 3 Oct 2016 13:43:02 -0600 Subject: net: Add netdev all_adj_list refcnt propagation to fix panic This is a respin of a patch to fix a relatively easily reproducible kernel panic related to the all_adj_list handling for netdevs in recent kernels. The following sequence of commands will reproduce the issue: ip link add link eth0 name eth0.100 type vlan id 100 ip link add link eth0 name eth0.200 type vlan id 200 ip link add name testbr type bridge ip link set eth0.100 master testbr ip link set eth0.200 master testbr ip link add link testbr mac0 type macvlan ip link delete dev testbr This creates an upper/lower tree of (excuse the poor ASCII art): /---eth0.100-eth0 mac0-testbr- \---eth0.200-eth0 When testbr is deleted, the all_adj_lists are walked, and eth0 is deleted twice from the mac0 list. Unfortunately, during setup in __netdev_upper_dev_link, only one reference to eth0 is added, so this results in a panic. This change adds reference count propagation so things are handled properly. Matthias Schiffer reported a similar crash in batman-adv: https://github.com/freifunk-gluon/gluon/issues/680 https://www.open-mesh.org/issues/247 which this patch also seems to resolve. Signed-off-by: Andrew Collins Signed-off-by: David S. Miller --- net/core/dev.c | 68 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c0c291f721d6..f1fe26f66458 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5589,6 +5589,7 @@ static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, + u16 ref_nr, struct list_head *dev_list, void *private, bool master) { @@ -5598,7 +5599,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { - adj->ref_nr++; + adj->ref_nr += ref_nr; return 0; } @@ -5608,7 +5609,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->dev = adj_dev; adj->master = master; - adj->ref_nr = 1; + adj->ref_nr = ref_nr; adj->private = private; dev_hold(adj_dev); @@ -5647,6 +5648,7 @@ free_adj: static void __netdev_adjacent_dev_remove(struct net_device *dev, struct net_device *adj_dev, + u16 ref_nr, struct list_head *dev_list) { struct netdev_adjacent *adj; @@ -5659,10 +5661,10 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, BUG(); } - if (adj->ref_nr > 1) { - pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, - adj->ref_nr-1); - adj->ref_nr--; + if (adj->ref_nr > ref_nr) { + pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name, + ref_nr, adj->ref_nr-ref_nr); + adj->ref_nr -= ref_nr; return; } @@ -5681,21 +5683,22 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, static int __netdev_adjacent_dev_link_lists(struct net_device *dev, struct net_device *upper_dev, + u16 ref_nr, struct list_head *up_list, struct list_head *down_list, void *private, bool master) { int ret; - ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, - master); + ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list, + private, master); if (ret) return ret; - ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, - false); + ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list, + private, false); if (ret) { - __netdev_adjacent_dev_remove(dev, upper_dev, up_list); + __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); return ret; } @@ -5703,9 +5706,10 @@ static int __netdev_adjacent_dev_link_lists(struct net_device *dev, } static int __netdev_adjacent_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + u16 ref_nr) { - return __netdev_adjacent_dev_link_lists(dev, upper_dev, + return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr, &dev->all_adj_list.upper, &upper_dev->all_adj_list.lower, NULL, false); @@ -5713,17 +5717,19 @@ static int __netdev_adjacent_dev_link(struct net_device *dev, static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, struct net_device *upper_dev, + u16 ref_nr, struct list_head *up_list, struct list_head *down_list) { - __netdev_adjacent_dev_remove(dev, upper_dev, up_list); - __netdev_adjacent_dev_remove(upper_dev, dev, down_list); + __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); + __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); } static void __netdev_adjacent_dev_unlink(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + u16 ref_nr) { - __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr, &dev->all_adj_list.upper, &upper_dev->all_adj_list.lower); } @@ -5732,17 +5738,17 @@ static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device *upper_dev, void *private, bool master) { - int ret = __netdev_adjacent_dev_link(dev, upper_dev); + int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1); if (ret) return ret; - ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, + ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower, private, master); if (ret) { - __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink(dev, upper_dev, 1); return ret; } @@ -5752,8 +5758,8 @@ static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, struct net_device *upper_dev) { - __netdev_adjacent_dev_unlink(dev, upper_dev); - __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + __netdev_adjacent_dev_unlink(dev, upper_dev, 1); + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower); } @@ -5806,7 +5812,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { pr_debug("Interlinking %s with %s, non-neighbour\n", i->dev->name, j->dev->name); - ret = __netdev_adjacent_dev_link(i->dev, j->dev); + ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr); if (ret) goto rollback_mesh; } @@ -5816,7 +5822,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { pr_debug("linking %s's upper device %s with %s\n", upper_dev->name, i->dev->name, dev->name); - ret = __netdev_adjacent_dev_link(dev, i->dev); + ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr); if (ret) goto rollback_upper_mesh; } @@ -5825,7 +5831,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, list_for_each_entry(i, &dev->all_adj_list.lower, list) { pr_debug("linking %s's lower device %s with %s\n", dev->name, i->dev->name, upper_dev->name); - ret = __netdev_adjacent_dev_link(i->dev, upper_dev); + ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr); if (ret) goto rollback_lower_mesh; } @@ -5843,7 +5849,7 @@ rollback_lower_mesh: list_for_each_entry(i, &dev->all_adj_list.lower, list) { if (i == to_i) break; - __netdev_adjacent_dev_unlink(i->dev, upper_dev); + __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); } i = NULL; @@ -5853,7 +5859,7 @@ rollback_upper_mesh: list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { if (i == to_i) break; - __netdev_adjacent_dev_unlink(dev, i->dev); + __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); } i = j = NULL; @@ -5865,7 +5871,7 @@ rollback_mesh: list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { if (i == to_i && j == to_j) break; - __netdev_adjacent_dev_unlink(i->dev, j->dev); + __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); } if (i == to_i) break; @@ -5945,16 +5951,16 @@ void netdev_upper_dev_unlink(struct net_device *dev, */ list_for_each_entry(i, &dev->all_adj_list.lower, list) list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) - __netdev_adjacent_dev_unlink(i->dev, j->dev); + __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); /* remove also the devices itself from lower/upper device * list */ list_for_each_entry(i, &dev->all_adj_list.lower, list) - __netdev_adjacent_dev_unlink(i->dev, upper_dev); + __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) - __netdev_adjacent_dev_unlink(dev, i->dev); + __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, &changeupper_info.info); -- cgit v1.2.1 From d8cedaabe71236d27da1ff03d32ab1da06ed041f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 4 Oct 2016 11:25:47 +1100 Subject: net/ncsi: Avoid unused-value build warning from ia64-linux-gcc xchg() is used to set NCSI channel's state in order for consistent access to the state. xchg()'s return value should be used. Otherwise, one build warning will be raised (with -Wunused-value) as below message indicates. It is reported by ia64-linux-gcc (GCC) 4.9.0. net/ncsi/ncsi-manage.c: In function 'ncsi_channel_monitor': arch/ia64/include/uapi/asm/cmpxchg.h:56:2: warning: value computed is \ not used [-Wunused-value] ((__typeof__(*(ptr))) __xchg((unsigned long) (x), (ptr), sizeof(*(ptr)))) ^ net/ncsi/ncsi-manage.c:202:3: note: in expansion of macro 'xchg' xchg(&nc->state, NCSI_CHANNEL_INACTIVE); This removes the atomic access to NCSI channel's state avoid the above build warning. We have to hold the channel's lock when its state is readed or updated. No functional changes introduced. Signed-off-by: Gavin Shan Reviewed-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/ncsi-aen.c | 37 +++++++++++++++++++------- net/ncsi/ncsi-manage.c | 71 ++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 81 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index d463468442ae..b41a6617d498 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -53,7 +53,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, struct ncsi_aen_lsc_pkt *lsc; struct ncsi_channel *nc; struct ncsi_channel_mode *ncm; - unsigned long old_data; + bool chained; + int state; + unsigned long old_data, data; unsigned long flags; /* Find the NCSI channel */ @@ -62,20 +64,27 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, return -ENODEV; /* Update the link status */ - ncm = &nc->modes[NCSI_MODE_LINK]; lsc = (struct ncsi_aen_lsc_pkt *)h; + + spin_lock_irqsave(&nc->lock, flags); + ncm = &nc->modes[NCSI_MODE_LINK]; old_data = ncm->data[2]; - ncm->data[2] = ntohl(lsc->status); + data = ntohl(lsc->status); + ncm->data[2] = data; ncm->data[4] = ntohl(lsc->oem_status); - if (!((old_data ^ ncm->data[2]) & 0x1) || - !list_empty(&nc->link)) + + chained = !list_empty(&nc->link); + state = nc->state; + spin_unlock_irqrestore(&nc->lock, flags); + + if (!((old_data ^ data) & 0x1) || chained) return 0; - if (!(nc->state == NCSI_CHANNEL_INACTIVE && (ncm->data[2] & 0x1)) && - !(nc->state == NCSI_CHANNEL_ACTIVE && !(ncm->data[2] & 0x1))) + if (!(state == NCSI_CHANNEL_INACTIVE && (data & 0x1)) && + !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1))) return 0; if (!(ndp->flags & NCSI_DEV_HWA) && - nc->state == NCSI_CHANNEL_ACTIVE) + state == NCSI_CHANNEL_ACTIVE) ndp->flags |= NCSI_DEV_RESHUFFLE; ncsi_stop_channel_monitor(nc); @@ -97,13 +106,21 @@ static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp, if (!nc) return -ENODEV; + spin_lock_irqsave(&nc->lock, flags); if (!list_empty(&nc->link) || - nc->state != NCSI_CHANNEL_ACTIVE) + nc->state != NCSI_CHANNEL_ACTIVE) { + spin_unlock_irqrestore(&nc->lock, flags); return 0; + } + spin_unlock_irqrestore(&nc->lock, flags); ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_INVISIBLE; + spin_unlock_irqrestore(&nc->lock, flags); + spin_lock_irqsave(&ndp->lock, flags); - xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + nc->state = NCSI_CHANNEL_INACTIVE; list_add_tail_rcu(&nc->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index ef017b871857..a26ce5132549 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -132,6 +132,7 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) struct ncsi_dev *nd = &ndp->ndev; struct ncsi_package *np; struct ncsi_channel *nc; + unsigned long flags; nd->state = ncsi_dev_state_functional; if (force_down) { @@ -142,14 +143,21 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) nd->link_up = 0; NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { + spin_lock_irqsave(&nc->lock, flags); + if (!list_empty(&nc->link) || - nc->state != NCSI_CHANNEL_ACTIVE) + nc->state != NCSI_CHANNEL_ACTIVE) { + spin_unlock_irqrestore(&nc->lock, flags); continue; + } if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { + spin_unlock_irqrestore(&nc->lock, flags); nd->link_up = 1; goto report; } + + spin_unlock_irqrestore(&nc->lock, flags); } } @@ -163,20 +171,22 @@ static void ncsi_channel_monitor(unsigned long data) struct ncsi_package *np = nc->package; struct ncsi_dev_priv *ndp = np->ndp; struct ncsi_cmd_arg nca; - bool enabled; + bool enabled, chained; unsigned int timeout; unsigned long flags; - int ret; + int state, ret; spin_lock_irqsave(&nc->lock, flags); + state = nc->state; + chained = !list_empty(&nc->link); timeout = nc->timeout; enabled = nc->enabled; spin_unlock_irqrestore(&nc->lock, flags); - if (!enabled || !list_empty(&nc->link)) + if (!enabled || chained) return; - if (nc->state != NCSI_CHANNEL_INACTIVE && - nc->state != NCSI_CHANNEL_ACTIVE) + if (state != NCSI_CHANNEL_INACTIVE && + state != NCSI_CHANNEL_ACTIVE) return; if (!(timeout % 2)) { @@ -195,11 +205,15 @@ static void ncsi_channel_monitor(unsigned long data) if (timeout + 1 >= 3) { if (!(ndp->flags & NCSI_DEV_HWA) && - nc->state == NCSI_CHANNEL_ACTIVE) + state == NCSI_CHANNEL_ACTIVE) ncsi_report_link(ndp, true); + spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_INVISIBLE; + spin_unlock_irqrestore(&nc->lock, flags); + spin_lock_irqsave(&ndp->lock, flags); - xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + nc->state = NCSI_CHANNEL_INACTIVE; list_add_tail_rcu(&nc->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); ncsi_process_next_channel(ndp); @@ -508,6 +522,7 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) struct ncsi_package *np = ndp->active_package; struct ncsi_channel *nc = ndp->active_channel; struct ncsi_cmd_arg nca; + unsigned long flags; int ret; nca.ndp = ndp; @@ -556,7 +571,9 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) break; case ncsi_dev_state_suspend_done: - xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); ncsi_process_next_channel(ndp); break; @@ -574,6 +591,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) struct ncsi_channel *nc = ndp->active_channel; struct ncsi_cmd_arg nca; unsigned char index; + unsigned long flags; int ret; nca.ndp = ndp; @@ -675,10 +693,12 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) goto error; break; case ncsi_dev_state_config_done: + spin_lock_irqsave(&nc->lock, flags); if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) - xchg(&nc->state, NCSI_CHANNEL_ACTIVE); + nc->state = NCSI_CHANNEL_ACTIVE; else - xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); ncsi_start_channel_monitor(nc); ncsi_process_next_channel(ndp); @@ -707,18 +727,25 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) found = NULL; NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { + spin_lock_irqsave(&nc->lock, flags); + if (!list_empty(&nc->link) || - nc->state != NCSI_CHANNEL_INACTIVE) + nc->state != NCSI_CHANNEL_INACTIVE) { + spin_unlock_irqrestore(&nc->lock, flags); continue; + } if (!found) found = nc; ncm = &nc->modes[NCSI_MODE_LINK]; if (ncm->data[2] & 0x1) { + spin_unlock_irqrestore(&nc->lock, flags); found = nc; goto out; } + + spin_unlock_irqrestore(&nc->lock, flags); } } @@ -987,11 +1014,14 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) goto out; } - old_state = xchg(&nc->state, NCSI_CHANNEL_INVISIBLE); list_del_init(&nc->link); - spin_unlock_irqrestore(&ndp->lock, flags); + spin_lock_irqsave(&nc->lock, flags); + old_state = nc->state; + nc->state = NCSI_CHANNEL_INVISIBLE; + spin_unlock_irqrestore(&nc->lock, flags); + ndp->active_channel = nc; ndp->active_package = nc->package; @@ -1006,7 +1036,7 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) break; default: netdev_err(ndp->ndev.dev, "Invalid state 0x%x on %d:%d\n", - nc->state, nc->package->id, nc->id); + old_state, nc->package->id, nc->id); ncsi_report_link(ndp, false); return -EINVAL; } @@ -1151,6 +1181,8 @@ int ncsi_start_dev(struct ncsi_dev *nd) struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); struct ncsi_package *np; struct ncsi_channel *nc; + unsigned long flags; + bool chained; int old_state, ret; if (nd->state != ncsi_dev_state_registered && @@ -1166,8 +1198,13 @@ int ncsi_start_dev(struct ncsi_dev *nd) /* Reset channel's state and start over */ NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { - old_state = xchg(&nc->state, NCSI_CHANNEL_INACTIVE); - WARN_ON_ONCE(!list_empty(&nc->link) || + spin_lock_irqsave(&nc->lock, flags); + chained = !list_empty(&nc->link); + old_state = nc->state; + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); + + WARN_ON_ONCE(chained || old_state == NCSI_CHANNEL_INVISIBLE); } } -- cgit v1.2.1 From bc7e0f50aa6958676115bffc1e5e58703579e04b Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 4 Oct 2016 11:25:48 +1100 Subject: net/ncsi: Introduce NCSI_RESERVED_CHANNEL This defines NCSI_RESERVED_CHANNEL as the reserved NCSI channel ID (0x1f). No logical changes introduced. Signed-off-by: Gavin Shan Reviewed-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/internal.h | 1 + net/ncsi/ncsi-manage.c | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 33738c060547..66dc851d49ee 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -170,6 +170,7 @@ struct ncsi_package; #define NCSI_PACKAGE_SHIFT 5 #define NCSI_PACKAGE_INDEX(c) (((c) >> NCSI_PACKAGE_SHIFT) & 0x7) +#define NCSI_RESERVED_CHANNEL 0x1f #define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1)) #define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c)) diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index a26ce5132549..97c99bee8b68 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -542,7 +542,7 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) nca.package = np->id; if (nd->state == ncsi_dev_state_suspend_select) { nca.type = NCSI_PKT_CMD_SP; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; if (ndp->flags & NCSI_DEV_HWA) nca.bytes[0] = 0; else @@ -559,7 +559,7 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_suspend_deselect; } else if (nd->state == ncsi_dev_state_suspend_deselect) { nca.type = NCSI_PKT_CMD_DP; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; nd->state = ncsi_dev_state_suspend_done; } @@ -608,7 +608,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) else nca.bytes[0] = 1; nca.package = np->id; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; @@ -834,7 +834,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) /* Deselect all possible packages */ nca.type = NCSI_PKT_CMD_DP; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; for (index = 0; index < 8; index++) { nca.package = index; ret = ncsi_xmit_cmd(&nca); @@ -850,7 +850,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) /* Select all possible packages */ nca.type = NCSI_PKT_CMD_SP; nca.bytes[0] = 1; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; for (index = 0; index < 8; index++) { nca.package = index; ret = ncsi_xmit_cmd(&nca); @@ -903,7 +903,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nca.type = NCSI_PKT_CMD_SP; nca.bytes[0] = 1; nca.package = ndp->active_package->id; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; @@ -960,7 +960,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) /* Deselect the active package */ nca.type = NCSI_PKT_CMD_DP; nca.package = ndp->active_package->id; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; -- cgit v1.2.1 From 55e02d0837fb4cf023832252847bfbff453603cc Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 4 Oct 2016 11:25:49 +1100 Subject: net/ncsi: Don't probe on the reserved channel ID (0x1f) We needn't send CIS (Clear Initial State) command to the NCSI reserved channel (0x1f) in the enumeration. We shouldn't receive a valid response from CIS on NCSI channel 0x1f. Signed-off-by: Gavin Shan Reviewed-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/ncsi-manage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 97c99bee8b68..8c5e0160d578 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -911,12 +911,12 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_cis; break; case ncsi_dev_state_probe_cis: - ndp->pending_req_num = 32; + ndp->pending_req_num = NCSI_RESERVED_CHANNEL; /* Clear initial state */ nca.type = NCSI_PKT_CMD_CIS; nca.package = ndp->active_package->id; - for (index = 0; index < 0x20; index++) { + for (index = 0; index < NCSI_RESERVED_CHANNEL; index++) { nca.channel = index; ret = ncsi_xmit_cmd(&nca); if (ret) -- cgit v1.2.1 From a15af54f8f2a32d629781417503843bfbd02a004 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 4 Oct 2016 11:25:50 +1100 Subject: net/ncsi: Rework request index allocation The NCSI request index (struct ncsi_request::id) is put into instance ID (IID) field while sending NCSI command packet. It was designed the available IDs are given in round-robin fashion. @ndp->request_id was introduced to represent the next available ID, but it has been used as number of successively allocated IDs. It breaks the round-robin design. Besides, we shouldn't put 0 to NCSI command packet's IID field, meaning ID#0 should be reserved according section 6.3.1.1 in NCSI spec (v1.1.0). This fixes above two issues. With it applied, the available IDs will be assigned in round-robin fashion and ID#0 won't be assigned. Signed-off-by: Gavin Shan Reviewed-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/internal.h | 1 + net/ncsi/ncsi-manage.c | 17 +++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 66dc851d49ee..c956fe8d80c3 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -259,6 +259,7 @@ struct ncsi_dev_priv { struct list_head packages; /* List of packages */ struct ncsi_request requests[256]; /* Request table */ unsigned int request_id; /* Last used request ID */ +#define NCSI_REQ_START_IDX 1 unsigned int pending_req_num; /* Number of pending requests */ struct ncsi_package *active_package; /* Currently handled package */ struct ncsi_channel *active_channel; /* Currently handled channel */ diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 8c5e0160d578..00ce2c7fdb15 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -427,30 +427,31 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven) /* Check if there is one available request until the ceiling */ spin_lock_irqsave(&ndp->lock, flags); - for (i = ndp->request_id; !nr && i < limit; i++) { + for (i = ndp->request_id; i < limit; i++) { if (ndp->requests[i].used) continue; nr = &ndp->requests[i]; nr->used = true; nr->driven = driven; - if (++ndp->request_id >= limit) - ndp->request_id = 0; + ndp->request_id = i + 1; + goto found; } /* Fail back to check from the starting cursor */ - for (i = 0; !nr && i < ndp->request_id; i++) { + for (i = NCSI_REQ_START_IDX; i < ndp->request_id; i++) { if (ndp->requests[i].used) continue; nr = &ndp->requests[i]; nr->used = true; nr->driven = driven; - if (++ndp->request_id >= limit) - ndp->request_id = 0; + ndp->request_id = i + 1; + goto found; } - spin_unlock_irqrestore(&ndp->lock, flags); +found: + spin_unlock_irqrestore(&ndp->lock, flags); return nr; } @@ -1148,7 +1149,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, /* Initialize private NCSI device */ spin_lock_init(&ndp->lock); INIT_LIST_HEAD(&ndp->packages); - ndp->request_id = 0; + ndp->request_id = NCSI_REQ_START_IDX; for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) { ndp->requests[i].id = i; ndp->requests[i].ndp = ndp; -- cgit v1.2.1 From a0509cbeef5dafbab42c42622e012bcc94c3eb9e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 4 Oct 2016 11:25:51 +1100 Subject: net/ncsi: Allow to extend NCSI request properties There is only one NCSI request property for now: the response for the sent command need drive the workqueue or not. So we had one field (@driven) for the purpose. We lost the flexibility to extend NCSI request properties. This replaces @driven with @flags and @req_flags in NCSI request and NCSI command argument struct. Each bit of the newly introduced field can be used for one property. No functional changes introduced. Signed-off-by: Gavin Shan Reviewed-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/internal.h | 8 +++++--- net/ncsi/ncsi-cmd.c | 2 +- net/ncsi/ncsi-manage.c | 19 ++++++++++--------- net/ncsi/ncsi-rsp.c | 2 +- 4 files changed, 17 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index c956fe8d80c3..26e929595b5e 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -207,7 +207,8 @@ struct ncsi_package { struct ncsi_request { unsigned char id; /* Request ID - 0 to 255 */ bool used; /* Request that has been assigned */ - bool driven; /* Drive state machine */ + unsigned int flags; /* NCSI request property */ +#define NCSI_REQ_FLAG_EVENT_DRIVEN 1 struct ncsi_dev_priv *ndp; /* Associated NCSI device */ struct sk_buff *cmd; /* Associated NCSI command packet */ struct sk_buff *rsp; /* Associated NCSI response packet */ @@ -276,7 +277,7 @@ struct ncsi_cmd_arg { unsigned char package; /* Destination package ID */ unsigned char channel; /* Detination channel ID or 0x1f */ unsigned short payload; /* Command packet payload length */ - bool driven; /* Drive the state machine? */ + unsigned int req_flags; /* NCSI request properties */ union { unsigned char bytes[16]; /* Command packet specific data */ unsigned short words[8]; @@ -315,7 +316,8 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, unsigned char id, struct ncsi_package **np, struct ncsi_channel **nc); -struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven); +struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, + unsigned int req_flags); void ncsi_free_request(struct ncsi_request *nr); struct ncsi_dev *ncsi_find_dev(struct net_device *dev); int ncsi_process_next_channel(struct ncsi_dev_priv *ndp); diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index 21057a8ceeac..db7083bfd476 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -272,7 +272,7 @@ static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca) struct sk_buff *skb; struct ncsi_request *nr; - nr = ncsi_alloc_request(ndp, nca->driven); + nr = ncsi_alloc_request(ndp, nca->req_flags); if (!nr) return NULL; diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 00ce2c7fdb15..adf5401817c2 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -194,7 +194,7 @@ static void ncsi_channel_monitor(unsigned long data) nca.package = np->id; nca.channel = nc->id; nca.type = NCSI_PKT_CMD_GLS; - nca.driven = false; + nca.req_flags = 0; ret = ncsi_xmit_cmd(&nca); if (ret) { netdev_err(ndp->ndev.dev, "Error %d sending GLS\n", @@ -419,7 +419,8 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, * be same. Otherwise, the bogus response might be replied. So * the available IDs are allocated in round-robin fashion. */ -struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven) +struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, + unsigned int req_flags) { struct ncsi_request *nr = NULL; int i, limit = ARRAY_SIZE(ndp->requests); @@ -433,7 +434,7 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven) nr = &ndp->requests[i]; nr->used = true; - nr->driven = driven; + nr->flags = req_flags; ndp->request_id = i + 1; goto found; } @@ -445,7 +446,7 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven) nr = &ndp->requests[i]; nr->used = true; - nr->driven = driven; + nr->flags = req_flags; ndp->request_id = i + 1; goto found; } @@ -473,7 +474,7 @@ void ncsi_free_request(struct ncsi_request *nr) nr->cmd = NULL; nr->rsp = NULL; nr->used = false; - driven = nr->driven; + driven = !!(nr->flags & NCSI_REQ_FLAG_EVENT_DRIVEN); spin_unlock_irqrestore(&ndp->lock, flags); if (driven && cmd && --ndp->pending_req_num == 0) @@ -527,7 +528,7 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) int ret; nca.ndp = ndp; - nca.driven = true; + nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN; switch (nd->state) { case ncsi_dev_state_suspend: nd->state = ncsi_dev_state_suspend_select; @@ -596,7 +597,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) int ret; nca.ndp = ndp; - nca.driven = true; + nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN; switch (nd->state) { case ncsi_dev_state_config: case ncsi_dev_state_config_sp: @@ -825,7 +826,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) int ret; nca.ndp = ndp; - nca.driven = true; + nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN; switch (nd->state) { case ncsi_dev_state_probe: nd->state = ncsi_dev_state_probe_deselect; @@ -1101,7 +1102,7 @@ static int ncsi_inet6addr_event(struct notifier_block *this, return NOTIFY_OK; nca.ndp = ndp; - nca.driven = false; + nca.req_flags = 0; nca.package = np->id; nca.channel = nc->id; nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index af84389a6bf1..86cdaebd8d9e 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -317,7 +317,7 @@ static int ncsi_rsp_handler_gls(struct ncsi_request *nr) ncm->data[3] = ntohl(rsp->other); ncm->data[4] = ntohl(rsp->oem_status); - if (nr->driven) + if (nr->flags & NCSI_REQ_FLAG_EVENT_DRIVEN) return 0; /* Reset the channel monitor if it has been enabled */ -- cgit v1.2.1 From 83afdc6aad9d767cae271df1ca15641b9cbe3bfe Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 4 Oct 2016 11:25:52 +1100 Subject: net/ncsi: Rework the channel monitoring The original NCSI channel monitoring was implemented based on a backoff algorithm: the GLS response should be received in the specified interval. Otherwise, the channel is regarded as dead and failover should be taken if current channel is an active one. There are several problems in the implementation: (A) On BCM5718, we found when the IID (Instance ID) in the GLS command packet changes from 255 to 1, the response corresponding to IID#1 never comes in. It means we cannot make the unfair judgement that the channel is dead when one response is missed. (B) The code's readability should be improved. (C) We should do failover when current channel is active one and the channel monitoring should be marked as disabled before doing failover. This reworks the channel monitoring to address all above issues. The fields for channel monitoring is put into separate struct and the state of channel monitoring is predefined. The channel is regarded alive if the network controller responses to one of two GLS commands or both of them in 5 seconds. Signed-off-by: Gavin Shan Reviewed-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/internal.h | 12 +++++++++--- net/ncsi/ncsi-manage.c | 44 +++++++++++++++++++++++++------------------- net/ncsi/ncsi-rsp.c | 2 +- 3 files changed, 35 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 26e929595b5e..13290a70fa71 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -187,9 +187,15 @@ struct ncsi_channel { struct ncsi_channel_mode modes[NCSI_MODE_MAX]; struct ncsi_channel_filter *filters[NCSI_FILTER_MAX]; struct ncsi_channel_stats stats; - struct timer_list timer; /* Link monitor timer */ - bool enabled; /* Timer is enabled */ - unsigned int timeout; /* Times of timeout */ + struct { + struct timer_list timer; + bool enabled; + unsigned int state; +#define NCSI_CHANNEL_MONITOR_START 0 +#define NCSI_CHANNEL_MONITOR_RETRY 1 +#define NCSI_CHANNEL_MONITOR_WAIT 2 +#define NCSI_CHANNEL_MONITOR_WAIT_MAX 5 + } monitor; struct list_head node; struct list_head link; }; diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index adf5401817c2..4742c7c6c748 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -172,15 +172,15 @@ static void ncsi_channel_monitor(unsigned long data) struct ncsi_dev_priv *ndp = np->ndp; struct ncsi_cmd_arg nca; bool enabled, chained; - unsigned int timeout; + unsigned int monitor_state; unsigned long flags; int state, ret; spin_lock_irqsave(&nc->lock, flags); state = nc->state; chained = !list_empty(&nc->link); - timeout = nc->timeout; - enabled = nc->enabled; + enabled = nc->monitor.enabled; + monitor_state = nc->monitor.state; spin_unlock_irqrestore(&nc->lock, flags); if (!enabled || chained) @@ -189,7 +189,9 @@ static void ncsi_channel_monitor(unsigned long data) state != NCSI_CHANNEL_ACTIVE) return; - if (!(timeout % 2)) { + switch (monitor_state) { + case NCSI_CHANNEL_MONITOR_START: + case NCSI_CHANNEL_MONITOR_RETRY: nca.ndp = ndp; nca.package = np->id; nca.channel = nc->id; @@ -201,12 +203,16 @@ static void ncsi_channel_monitor(unsigned long data) ret); return; } - } - if (timeout + 1 >= 3) { + break; + case NCSI_CHANNEL_MONITOR_WAIT ... NCSI_CHANNEL_MONITOR_WAIT_MAX: + break; + default: if (!(ndp->flags & NCSI_DEV_HWA) && - state == NCSI_CHANNEL_ACTIVE) + state == NCSI_CHANNEL_ACTIVE) { ncsi_report_link(ndp, true); + ndp->flags |= NCSI_DEV_RESHUFFLE; + } spin_lock_irqsave(&nc->lock, flags); nc->state = NCSI_CHANNEL_INVISIBLE; @@ -221,10 +227,9 @@ static void ncsi_channel_monitor(unsigned long data) } spin_lock_irqsave(&nc->lock, flags); - nc->timeout = timeout + 1; - nc->enabled = true; + nc->monitor.state++; spin_unlock_irqrestore(&nc->lock, flags); - mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2))); + mod_timer(&nc->monitor.timer, jiffies + HZ); } void ncsi_start_channel_monitor(struct ncsi_channel *nc) @@ -232,12 +237,12 @@ void ncsi_start_channel_monitor(struct ncsi_channel *nc) unsigned long flags; spin_lock_irqsave(&nc->lock, flags); - WARN_ON_ONCE(nc->enabled); - nc->timeout = 0; - nc->enabled = true; + WARN_ON_ONCE(nc->monitor.enabled); + nc->monitor.enabled = true; + nc->monitor.state = NCSI_CHANNEL_MONITOR_START; spin_unlock_irqrestore(&nc->lock, flags); - mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2))); + mod_timer(&nc->monitor.timer, jiffies + HZ); } void ncsi_stop_channel_monitor(struct ncsi_channel *nc) @@ -245,14 +250,14 @@ void ncsi_stop_channel_monitor(struct ncsi_channel *nc) unsigned long flags; spin_lock_irqsave(&nc->lock, flags); - if (!nc->enabled) { + if (!nc->monitor.enabled) { spin_unlock_irqrestore(&nc->lock, flags); return; } - nc->enabled = false; + nc->monitor.enabled = false; spin_unlock_irqrestore(&nc->lock, flags); - del_timer_sync(&nc->timer); + del_timer_sync(&nc->monitor.timer); } struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, @@ -281,8 +286,9 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id) nc->id = id; nc->package = np; nc->state = NCSI_CHANNEL_INACTIVE; - nc->enabled = false; - setup_timer(&nc->timer, ncsi_channel_monitor, (unsigned long)nc); + nc->monitor.enabled = false; + setup_timer(&nc->monitor.timer, + ncsi_channel_monitor, (unsigned long)nc); spin_lock_init(&nc->lock); INIT_LIST_HEAD(&nc->link); for (index = 0; index < NCSI_CAP_MAX; index++) diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 86cdaebd8d9e..087db775b3dc 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -322,7 +322,7 @@ static int ncsi_rsp_handler_gls(struct ncsi_request *nr) /* Reset the channel monitor if it has been enabled */ spin_lock_irqsave(&nc->lock, flags); - nc->timeout = 0; + nc->monitor.state = NCSI_CHANNEL_MONITOR_START; spin_unlock_irqrestore(&nc->lock, flags); return 0; -- cgit v1.2.1 From c0cd1ba4f8bd8b5fef43bc51a2983673b8f086ff Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 4 Oct 2016 11:25:53 +1100 Subject: net/ncsi: Introduce ncsi_stop_dev() This introduces ncsi_stop_dev(), as counterpart to ncsi_start_dev(), to stop the NCSI device so that it can be reenabled in future. This API should be called when the network device driver is going to shutdown the device. There are 3 things done in the function: Stop the channel monitoring; Reset channels to inactive state; Report NCSI link down. Signed-off-by: Gavin Shan Reviewed-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/ncsi-manage.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 4742c7c6c748..5e509e547c2d 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1187,11 +1187,7 @@ EXPORT_SYMBOL_GPL(ncsi_register_dev); int ncsi_start_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); - struct ncsi_package *np; - struct ncsi_channel *nc; - unsigned long flags; - bool chained; - int old_state, ret; + int ret; if (nd->state != ncsi_dev_state_registered && nd->state != ncsi_dev_state_functional) @@ -1203,9 +1199,29 @@ int ncsi_start_dev(struct ncsi_dev *nd) return 0; } - /* Reset channel's state and start over */ + if (ndp->flags & NCSI_DEV_HWA) + ret = ncsi_enable_hwa(ndp); + else + ret = ncsi_choose_active_channel(ndp); + + return ret; +} +EXPORT_SYMBOL_GPL(ncsi_start_dev); + +void ncsi_stop_dev(struct ncsi_dev *nd) +{ + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); + struct ncsi_package *np; + struct ncsi_channel *nc; + bool chained; + int old_state; + unsigned long flags; + + /* Stop the channel monitor and reset channel's state */ NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&nc->lock, flags); chained = !list_empty(&nc->link); old_state = nc->state; @@ -1217,14 +1233,9 @@ int ncsi_start_dev(struct ncsi_dev *nd) } } - if (ndp->flags & NCSI_DEV_HWA) - ret = ncsi_enable_hwa(ndp); - else - ret = ncsi_choose_active_channel(ndp); - - return ret; + ncsi_report_link(ndp, true); } -EXPORT_SYMBOL_GPL(ncsi_start_dev); +EXPORT_SYMBOL_GPL(ncsi_stop_dev); void ncsi_unregister_dev(struct ncsi_dev *nd) { -- cgit v1.2.1