From 169bf9121b19dd6029e0a354d33513f61bfbe3d3 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Tue, 10 Mar 2015 12:23:34 -0400 Subject: tipc: ensure that idle links are deleted when a bearer is disabled commit afaa3f65f65fda2e7b190aac7e2a75d9a2a77cb6 (tipc: purge links when bearer is disabled) was an attempt to resolve a problem that turned out to have a more profound reason. When we disable a bearer, we delete all its pertaining links if there is no other bearer to perform failover to, or if the module is shutting down. In case there are dual bearers, we wait with deleting links until the failover procedure is finished. However, this misses the case when a link on the removed bearer was already down, so that there will be no failover procedure to finish the link delete. This causes confusion if a new bearer is added to replace the removed one, and also entails a small memory leak. This commit takes the current state of the link into account when deciding when to delete it, and also reverses the above-mentioned commit. Reviewed-by: Erik Hugne Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/link.c b/net/tipc/link.c index 14f09b3cb87c..98609fdfb06a 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -344,6 +344,7 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id, struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *link; struct tipc_node *node; + bool del_link; rcu_read_lock(); list_for_each_entry_rcu(node, &tn->node_list, list) { @@ -353,12 +354,13 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id, tipc_node_unlock(node); continue; } + del_link = !tipc_link_is_up(link) && !link->exp_msg_count; tipc_link_reset(link); if (del_timer(&link->timer)) tipc_link_put(link); link->flags |= LINK_STOPPED; /* Delete link now, or when failover is finished: */ - if (shutting_down || !tipc_node_is_up(node)) + if (shutting_down || !tipc_node_is_up(node) || del_link) tipc_link_delete(link); tipc_node_unlock(node); } -- cgit v1.2.3 From cf2157f88a5abf1a64b8c51a737a35e918dc67e5 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 13 Mar 2015 16:08:06 -0400 Subject: tipc: move message validation function to msg.c The function link_buf_validate() is in reality re-entrant and context independent, and will in later commits be called from several locations. Therefore, we move it to msg.c, make it outline and rename the it to tipc_msg_validate(). We also redesign the function to make proper use of pskb_may_pull() Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 58 +-------------------------------------------------------- net/tipc/msg.c | 44 ++++++++++++++++++++++++++++++++++++++++++- net/tipc/msg.h | 5 +++-- 3 files changed, 47 insertions(+), 60 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/link.c b/net/tipc/link.c index 98609fdfb06a..944c8c663a2d 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1047,61 +1047,6 @@ static void link_retrieve_defq(struct tipc_link *link, skb_queue_splice_tail_init(&link->deferred_queue, list); } -/** - * link_recv_buf_validate - validate basic format of received message - * - * This routine ensures a TIPC message has an acceptable header, and at least - * as much data as the header indicates it should. The routine also ensures - * that the entire message header is stored in the main fragment of the message - * buffer, to simplify future access to message header fields. - * - * Note: Having extra info present in the message header or data areas is OK. - * TIPC will ignore the excess, under the assumption that it is optional info - * introduced by a later release of the protocol. - */ -static int link_recv_buf_validate(struct sk_buff *buf) -{ - static u32 min_data_hdr_size[8] = { - SHORT_H_SIZE, MCAST_H_SIZE, NAMED_H_SIZE, BASIC_H_SIZE, - MAX_H_SIZE, MAX_H_SIZE, MAX_H_SIZE, MAX_H_SIZE - }; - - struct tipc_msg *msg; - u32 tipc_hdr[2]; - u32 size; - u32 hdr_size; - u32 min_hdr_size; - - /* If this packet comes from the defer queue, the skb has already - * been validated - */ - if (unlikely(TIPC_SKB_CB(buf)->deferred)) - return 1; - - if (unlikely(buf->len < MIN_H_SIZE)) - return 0; - - msg = skb_header_pointer(buf, 0, sizeof(tipc_hdr), tipc_hdr); - if (msg == NULL) - return 0; - - if (unlikely(msg_version(msg) != TIPC_VERSION)) - return 0; - - size = msg_size(msg); - hdr_size = msg_hdr_sz(msg); - min_hdr_size = msg_isdata(msg) ? - min_data_hdr_size[msg_type(msg)] : INT_H_SIZE; - - if (unlikely((hdr_size < min_hdr_size) || - (size < hdr_size) || - (buf->len < size) || - (size - hdr_size > TIPC_MAX_USER_MSG_SIZE))) - return 0; - - return pskb_may_pull(buf, hdr_size); -} - /** * tipc_rcv - process TIPC packets/messages arriving from off-node * @net: the applicable net namespace @@ -1127,7 +1072,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) while ((skb = __skb_dequeue(&head))) { /* Ensure message is well-formed */ - if (unlikely(!link_recv_buf_validate(skb))) + if (unlikely(!tipc_msg_validate(skb))) goto discard; /* Ensure message data is a single contiguous unit */ @@ -1398,7 +1343,6 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, if (tipc_link_defer_pkt(&l_ptr->deferred_queue, buf)) { l_ptr->stats.deferred_recv++; - TIPC_SKB_CB(buf)->deferred = true; if ((skb_queue_len(&l_ptr->deferred_queue) % 16) == 1) tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); } else { diff --git a/net/tipc/msg.c b/net/tipc/msg.c index b6eb90cd3ef7..4a64caf6fa20 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -1,7 +1,7 @@ /* * net/tipc/msg.c: TIPC message header routines * - * Copyright (c) 2000-2006, 2014, Ericsson AB + * Copyright (c) 2000-2006, 2014-2015, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -181,6 +181,48 @@ err: return 0; } +/* tipc_msg_validate - validate basic format of received message + * + * This routine ensures a TIPC message has an acceptable header, and at least + * as much data as the header indicates it should. The routine also ensures + * that the entire message header is stored in the main fragment of the message + * buffer, to simplify future access to message header fields. + * + * Note: Having extra info present in the message header or data areas is OK. + * TIPC will ignore the excess, under the assumption that it is optional info + * introduced by a later release of the protocol. + */ +bool tipc_msg_validate(struct sk_buff *skb) +{ + struct tipc_msg *msg; + int msz, hsz; + + if (unlikely(TIPC_SKB_CB(skb)->validated)) + return true; + if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE))) + return false; + + hsz = msg_hdr_sz(buf_msg(skb)); + if (unlikely(hsz < MIN_H_SIZE) || (hsz > MAX_H_SIZE)) + return false; + if (unlikely(!pskb_may_pull(skb, hsz))) + return false; + + msg = buf_msg(skb); + if (unlikely(msg_version(msg) != TIPC_VERSION)) + return false; + + msz = msg_size(msg); + if (unlikely(msz < hsz)) + return false; + if (unlikely((msz - hsz) > TIPC_MAX_USER_MSG_SIZE)) + return false; + if (unlikely(skb->len < msz)) + return false; + + TIPC_SKB_CB(skb)->validated = true; + return true; +} /** * tipc_msg_build - create buffer chain containing specified header and data diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 7cece647394c..62306b8d2410 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -1,7 +1,7 @@ /* * net/tipc/msg.h: Include file for TIPC message header routines * - * Copyright (c) 2000-2007, 2014, Ericsson AB + * Copyright (c) 2000-2007, 2014-2015 Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. * @@ -92,7 +92,7 @@ struct plist; struct tipc_skb_cb { void *handle; struct sk_buff *tail; - bool deferred; + bool validated; bool wakeup_pending; bool bundling; u16 chain_sz; @@ -758,6 +758,7 @@ static inline u32 msg_tot_origport(struct tipc_msg *m) } struct sk_buff *tipc_buf_acquire(u32 size); +bool tipc_msg_validate(struct sk_buff *skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, int err); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, -- cgit v1.2.3 From 1149557d64c97dc9adf3103347a1c0e8c06d3b89 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 13 Mar 2015 16:08:07 -0400 Subject: tipc: eliminate unnecessary linearization of incoming buffers Currently, TIPC linearizes all incoming buffers directly at reception before passing them upwards in the stack. This is clearly a waste of CPU resources, and must be avoided. In this commit, we eliminate this unnecessary linearization. We still ensure that at least the message header is linear, and that the buffer is linearized where this is still needed, i.e. when unbundling and when reversing messages. In addition, we ensure that fragmented messages are validated after reassembly before delivering them upwards in the stack. Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 5 ----- net/tipc/msg.c | 14 ++++++++++---- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/link.c b/net/tipc/link.c index 944c8c663a2d..8c6639d107fc 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1075,13 +1075,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) if (unlikely(!tipc_msg_validate(skb))) goto discard; - /* Ensure message data is a single contiguous unit */ - if (unlikely(skb_linearize(skb))) - goto discard; - /* Handle arrival of a non-unicast link message */ msg = buf_msg(skb); - if (unlikely(msg_non_seq(msg))) { if (msg_user(msg) == LINK_CONFIG) tipc_disc_rcv(net, skb, b_ptr); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 4a64caf6fa20..ff8c64cd1cd9 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -165,6 +165,9 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) } if (fragid == LAST_FRAGMENT) { + TIPC_SKB_CB(head)->validated = false; + if (unlikely(!tipc_msg_validate(head))) + goto err; *buf = head; TIPC_SKB_CB(head)->tail = NULL; *headbuf = NULL; @@ -172,7 +175,6 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) } *buf = NULL; return 0; - err: pr_warn_ratelimited("Unable to build fragment list\n"); kfree_skb(*buf); @@ -378,10 +380,14 @@ bool tipc_msg_bundle(struct sk_buff_head *list, struct sk_buff *skb, u32 mtu) */ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) { - struct tipc_msg *msg = buf_msg(skb); + struct tipc_msg *msg; int imsz; - struct tipc_msg *imsg = (struct tipc_msg *)(msg_data(msg) + *pos); + struct tipc_msg *imsg; + if (unlikely(skb_linearize(skb))) + return false; + msg = buf_msg(skb); + imsg = (struct tipc_msg *)(msg_data(msg) + *pos); /* Is there space left for shortest possible message? */ if (*pos > (msg_data_sz(msg) - SHORT_H_SIZE)) goto none; @@ -463,11 +469,11 @@ bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, if (skb_linearize(buf)) goto exit; + msg = buf_msg(buf); if (msg_dest_droppable(msg)) goto exit; if (msg_errcode(msg)) goto exit; - memcpy(&ohdr, msg, msg_hdr_sz(msg)); imp = min_t(uint, imp + 1, TIPC_CRITICAL_IMPORTANCE); if (msg_isdata(msg)) -- cgit v1.2.3 From c1336ee472f83a90ede01fdae095ed5d0a2934c9 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 13 Mar 2015 16:08:08 -0400 Subject: tipc: extract bundled buffers by cloning instead of copying When we currently extract a bundled buffer from a message bundle in the function tipc_msg_extract(), we allocate a new buffer and explicitly copy the linear data area. This is unnecessary, since we can just clone the buffer and do skb_pull() on the clone to move the data pointer to the correct position. This is what we do in this commit. Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 45 ++++++++++++--------------------------------- net/tipc/msg.c | 30 ++++++++++++++++-------------- 2 files changed, 28 insertions(+), 47 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/link.c b/net/tipc/link.c index 8c6639d107fc..56c39b1a53a9 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1,7 +1,7 @@ /* * net/tipc/link.c: TIPC link code * - * Copyright (c) 1996-2007, 2012-2014, Ericsson AB + * Copyright (c) 1996-2007, 2012-2015, Ericsson AB * Copyright (c) 2004-2007, 2010-2013, Wind River Systems * All rights reserved. * @@ -1117,7 +1117,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) ackd = msg_ack(msg); /* Release acked messages */ - if (n_ptr->bclink.recv_permitted) + if (likely(n_ptr->bclink.recv_permitted)) tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg)); released = 0; @@ -1712,45 +1712,24 @@ void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, } } -/** - * buf_extract - extracts embedded TIPC message from another message - * @skb: encapsulating message buffer - * @from_pos: offset to extract from - * - * Returns a new message buffer containing an embedded message. The - * encapsulating buffer is left unchanged. - */ -static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos) -{ - struct tipc_msg *msg = (struct tipc_msg *)(skb->data + from_pos); - u32 size = msg_size(msg); - struct sk_buff *eb; - - eb = tipc_buf_acquire(size); - if (eb) - skb_copy_to_linear_data(eb, msg, size); - return eb; -} - /* tipc_link_dup_rcv(): Receive a tunnelled DUPLICATE_MSG packet. * Owner node is locked. */ -static void tipc_link_dup_rcv(struct tipc_link *l_ptr, - struct sk_buff *t_buf) +static void tipc_link_dup_rcv(struct tipc_link *link, + struct sk_buff *skb) { - struct sk_buff *buf; + struct sk_buff *iskb; + int pos = 0; - if (!tipc_link_is_up(l_ptr)) + if (!tipc_link_is_up(link)) return; - buf = buf_extract(t_buf, INT_H_SIZE); - if (buf == NULL) { + if (!tipc_msg_extract(skb, &iskb, &pos)) { pr_warn("%sfailed to extract inner dup pkt\n", link_co_err); return; } - - /* Add buffer to deferred queue, if applicable: */ - link_handle_out_of_seq_msg(l_ptr, buf); + /* Append buffer to deferred queue, if applicable: */ + link_handle_out_of_seq_msg(link, iskb); } /* tipc_link_failover_rcv(): Receive a tunnelled ORIGINAL_MSG packet @@ -1762,6 +1741,7 @@ static struct sk_buff *tipc_link_failover_rcv(struct tipc_link *l_ptr, struct tipc_msg *t_msg = buf_msg(t_buf); struct sk_buff *buf = NULL; struct tipc_msg *msg; + int pos = 0; if (tipc_link_is_up(l_ptr)) tipc_link_reset(l_ptr); @@ -1773,8 +1753,7 @@ static struct sk_buff *tipc_link_failover_rcv(struct tipc_link *l_ptr, /* Should there be an inner packet? */ if (l_ptr->exp_msg_count) { l_ptr->exp_msg_count--; - buf = buf_extract(t_buf, INT_H_SIZE); - if (buf == NULL) { + if (!tipc_msg_extract(t_buf, &buf, &pos)) { pr_warn("%sno inner failover pkt\n", link_co_err); goto exit; } diff --git a/net/tipc/msg.c b/net/tipc/msg.c index ff8c64cd1cd9..333d2ae1cf76 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -372,38 +372,40 @@ bool tipc_msg_bundle(struct sk_buff_head *list, struct sk_buff *skb, u32 mtu) /** * tipc_msg_extract(): extract bundled inner packet from buffer - * @skb: linear outer buffer, to be extracted from. + * @skb: buffer to be extracted from. * @iskb: extracted inner buffer, to be returned - * @pos: position of msg to be extracted. Returns with pointer of next msg + * @pos: position in outer message of msg to be extracted. + * Returns position of next msg * Consumes outer buffer when last packet extracted * Returns true when when there is an extracted buffer, otherwise false */ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) { struct tipc_msg *msg; - int imsz; - struct tipc_msg *imsg; + int imsz, offset; + *iskb = NULL; if (unlikely(skb_linearize(skb))) - return false; + goto none; + msg = buf_msg(skb); - imsg = (struct tipc_msg *)(msg_data(msg) + *pos); - /* Is there space left for shortest possible message? */ - if (*pos > (msg_data_sz(msg) - SHORT_H_SIZE)) + offset = msg_hdr_sz(msg) + *pos; + if (unlikely(offset > (msg_size(msg) - MIN_H_SIZE))) goto none; - imsz = msg_size(imsg); - /* Is there space left for current message ? */ - if ((*pos + imsz) > msg_data_sz(msg)) + *iskb = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!*iskb)) goto none; - *iskb = tipc_buf_acquire(imsz); - if (!*iskb) + skb_pull(*iskb, offset); + imsz = msg_size(buf_msg(*iskb)); + skb_trim(*iskb, imsz); + if (unlikely(!tipc_msg_validate(*iskb))) goto none; - skb_copy_to_linear_data(*iskb, imsg, imsz); *pos += align(imsz); return true; none: kfree_skb(skb); + kfree_skb(*iskb); *iskb = NULL; return false; } -- cgit v1.2.3 From 2cdf3918e47e98c8f34f7a64455ea9fd433756e7 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 13 Mar 2015 16:08:09 -0400 Subject: tipc: eliminate unnecessary call to broadcast ack function The unicast packet header contains a broadcast acknowledge sequence number, that may need to be conveyed to the broadcast link for proper treatment. Currently, the function tipc_rcv(), which is on the most critical data path, calls the function tipc_bclink_acknowledge() to have this done. This call is made for each received packet, and results in the unconditional grabbing of the broadcast link spinlock. This is unnecessary, since we can see directly from tipc_rcv() if the acknowledged number differs from what has been previously acked from the node in question. In the vast majority of cases the numbers won't differ, and there is nothing to update. We now make the call to tipc_bclink_acknowledge() conditional to that the two ack values differ. Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 4 ++++ net/tipc/link.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 3e41704832de..5ee5076a8b27 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -215,7 +215,11 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) struct net *net = n_ptr->net; struct tipc_net *tn = net_generic(net, tipc_net_id); + if (unlikely(!n_ptr->bclink.recv_permitted)) + return; + tipc_bclink_lock(net); + /* Bail out if tx queue is empty (no clean up is required) */ skb = skb_peek(&tn->bcl->outqueue); if (!skb) diff --git a/net/tipc/link.c b/net/tipc/link.c index 56c39b1a53a9..2652c3286e2f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1117,7 +1117,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) ackd = msg_ack(msg); /* Release acked messages */ - if (likely(n_ptr->bclink.recv_permitted)) + if (unlikely(n_ptr->bclink.acked != msg_bcast_ack(msg))) tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg)); released = 0; -- cgit v1.2.3 From 05dcc5aa4dcced4f59f925625cea669e82b75519 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 13 Mar 2015 16:08:10 -0400 Subject: tipc: split link outqueue struct tipc_link contains one single queue for outgoing packets, where both transmitted and waiting packets are queued. This infrastructure is hard to maintain, because we need to keep a number of fields to keep track of which packets are sent or unsent, and the number of packets in each category. A lot of code becomes simpler if we split this queue into a transmission queue, where sent/unacknowledged packets are kept, and a backlog queue, where we keep the not yet sent packets. In this commit we do this separation. Reviewed-by: Erik Hugne Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 48 ++++++------- net/tipc/link.c | 208 ++++++++++++++++++++++++++----------------------------- net/tipc/link.h | 17 ++--- net/tipc/msg.c | 32 +++++---- net/tipc/msg.h | 6 +- net/tipc/node.c | 4 +- net/tipc/node.h | 2 +- 7 files changed, 150 insertions(+), 167 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 5ee5076a8b27..17cb0ff5f344 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -135,9 +135,10 @@ static void bclink_set_last_sent(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *bcl = tn->bcl; + struct sk_buff *skb = skb_peek(&bcl->backlogq); - if (bcl->next_out) - bcl->fsm_msg_cnt = mod(buf_seqno(bcl->next_out) - 1); + if (skb) + bcl->fsm_msg_cnt = mod(buf_seqno(skb) - 1); else bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1); } @@ -180,7 +181,7 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to) struct sk_buff *skb; struct tipc_link *bcl = tn->bcl; - skb_queue_walk(&bcl->outqueue, skb) { + skb_queue_walk(&bcl->transmq, skb) { if (more(buf_seqno(skb), after)) { tipc_link_retransmit(bcl, skb, mod(to - after)); break; @@ -210,7 +211,6 @@ void tipc_bclink_wakeup_users(struct net *net) void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) { struct sk_buff *skb, *tmp; - struct sk_buff *next; unsigned int released = 0; struct net *net = n_ptr->net; struct tipc_net *tn = net_generic(net, tipc_net_id); @@ -221,7 +221,7 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) tipc_bclink_lock(net); /* Bail out if tx queue is empty (no clean up is required) */ - skb = skb_peek(&tn->bcl->outqueue); + skb = skb_peek(&tn->bcl->transmq); if (!skb) goto exit; @@ -248,27 +248,19 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) } /* Skip over packets that node has previously acknowledged */ - skb_queue_walk(&tn->bcl->outqueue, skb) { + skb_queue_walk(&tn->bcl->transmq, skb) { if (more(buf_seqno(skb), n_ptr->bclink.acked)) break; } /* Update packets that node is now acknowledging */ - skb_queue_walk_from_safe(&tn->bcl->outqueue, skb, tmp) { + skb_queue_walk_from_safe(&tn->bcl->transmq, skb, tmp) { if (more(buf_seqno(skb), acked)) break; - - next = tipc_skb_queue_next(&tn->bcl->outqueue, skb); - if (skb != tn->bcl->next_out) { - bcbuf_decr_acks(skb); - } else { - bcbuf_set_acks(skb, 0); - tn->bcl->next_out = next; - bclink_set_last_sent(net); - } - + bcbuf_decr_acks(skb); + bclink_set_last_sent(net); if (bcbuf_acks(skb) == 0) { - __skb_unlink(skb, &tn->bcl->outqueue); + __skb_unlink(skb, &tn->bcl->transmq); kfree_skb(skb); released = 1; } @@ -276,7 +268,7 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) n_ptr->bclink.acked = acked; /* Try resolving broadcast link congestion, if necessary */ - if (unlikely(tn->bcl->next_out)) { + if (unlikely(skb_peek(&tn->bcl->backlogq))) { tipc_link_push_packets(tn->bcl); bclink_set_last_sent(net); } @@ -323,7 +315,7 @@ void tipc_bclink_update_link_state(struct tipc_node *n_ptr, buf = tipc_buf_acquire(INT_H_SIZE); if (buf) { struct tipc_msg *msg = buf_msg(buf); - struct sk_buff *skb = skb_peek(&n_ptr->bclink.deferred_queue); + struct sk_buff *skb = skb_peek(&n_ptr->bclink.deferdq); u32 to = skb ? buf_seqno(skb) - 1 : n_ptr->bclink.last_sent; tipc_msg_init(tn->own_addr, msg, BCAST_PROTOCOL, STATE_MSG, @@ -398,7 +390,7 @@ int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list) if (likely(bclink->bcast_nodes.count)) { rc = __tipc_link_xmit(net, bcl, list); if (likely(!rc)) { - u32 len = skb_queue_len(&bcl->outqueue); + u32 len = skb_queue_len(&bcl->transmq); bclink_set_last_sent(net); bcl->stats.queue_sz_counts++; @@ -563,25 +555,25 @@ receive: if (node->bclink.last_in == node->bclink.last_sent) goto unlock; - if (skb_queue_empty(&node->bclink.deferred_queue)) { + if (skb_queue_empty(&node->bclink.deferdq)) { node->bclink.oos_state = 1; goto unlock; } - msg = buf_msg(skb_peek(&node->bclink.deferred_queue)); + msg = buf_msg(skb_peek(&node->bclink.deferdq)); seqno = msg_seqno(msg); next_in = mod(next_in + 1); if (seqno != next_in) goto unlock; /* Take in-sequence message from deferred queue & deliver it */ - buf = __skb_dequeue(&node->bclink.deferred_queue); + buf = __skb_dequeue(&node->bclink.deferdq); goto receive; } /* Handle out-of-sequence broadcast message */ if (less(next_in, seqno)) { - deferred = tipc_link_defer_pkt(&node->bclink.deferred_queue, + deferred = tipc_link_defer_pkt(&node->bclink.deferdq, buf); bclink_update_last_sent(node, seqno); buf = NULL; @@ -638,7 +630,6 @@ static int tipc_bcbearer_send(struct net *net, struct sk_buff *buf, msg_set_non_seq(msg, 1); msg_set_mc_netid(msg, tn->net_id); tn->bcl->stats.sent_info++; - if (WARN_ON(!bclink->bcast_nodes.count)) { dump_stack(); return 0; @@ -917,8 +908,9 @@ int tipc_bclink_init(struct net *net) sprintf(bcbearer->media.name, "tipc-broadcast"); spin_lock_init(&bclink->lock); - __skb_queue_head_init(&bcl->outqueue); - __skb_queue_head_init(&bcl->deferred_queue); + __skb_queue_head_init(&bcl->transmq); + __skb_queue_head_init(&bcl->backlogq); + __skb_queue_head_init(&bcl->deferdq); skb_queue_head_init(&bcl->wakeupq); bcl->next_out_no = 1; spin_lock_init(&bclink->node.lock); diff --git a/net/tipc/link.c b/net/tipc/link.c index 2652c3286e2f..7e0036f5a364 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -194,10 +194,10 @@ static void link_timeout(unsigned long data) tipc_node_lock(l_ptr->owner); /* update counters used in statistical profiling of send traffic */ - l_ptr->stats.accu_queue_sz += skb_queue_len(&l_ptr->outqueue); + l_ptr->stats.accu_queue_sz += skb_queue_len(&l_ptr->transmq); l_ptr->stats.queue_sz_counts++; - skb = skb_peek(&l_ptr->outqueue); + skb = skb_peek(&l_ptr->transmq); if (skb) { struct tipc_msg *msg = buf_msg(skb); u32 length = msg_size(msg); @@ -229,7 +229,7 @@ static void link_timeout(unsigned long data) /* do all other link processing performed on a periodic basis */ link_state_event(l_ptr, TIMEOUT_EVT); - if (l_ptr->next_out) + if (skb_queue_len(&l_ptr->backlogq)) tipc_link_push_packets(l_ptr); tipc_node_unlock(l_ptr->owner); @@ -313,8 +313,9 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, link_init_max_pkt(l_ptr); l_ptr->next_out_no = 1; - __skb_queue_head_init(&l_ptr->outqueue); - __skb_queue_head_init(&l_ptr->deferred_queue); + __skb_queue_head_init(&l_ptr->transmq); + __skb_queue_head_init(&l_ptr->backlogq); + __skb_queue_head_init(&l_ptr->deferdq); skb_queue_head_init(&l_ptr->wakeupq); skb_queue_head_init(&l_ptr->inputq); skb_queue_head_init(&l_ptr->namedq); @@ -400,7 +401,7 @@ static bool link_schedule_user(struct tipc_link *link, u32 oport, */ void link_prepare_wakeup(struct tipc_link *link) { - uint pend_qsz = skb_queue_len(&link->outqueue); + uint pend_qsz = skb_queue_len(&link->backlogq); struct sk_buff *skb, *tmp; skb_queue_walk_safe(&link->wakeupq, skb, tmp) { @@ -430,8 +431,9 @@ void tipc_link_reset_fragments(struct tipc_link *l_ptr) */ void tipc_link_purge_queues(struct tipc_link *l_ptr) { - __skb_queue_purge(&l_ptr->deferred_queue); - __skb_queue_purge(&l_ptr->outqueue); + __skb_queue_purge(&l_ptr->deferdq); + __skb_queue_purge(&l_ptr->transmq); + __skb_queue_purge(&l_ptr->backlogq); tipc_link_reset_fragments(l_ptr); } @@ -464,15 +466,15 @@ void tipc_link_reset(struct tipc_link *l_ptr) } /* Clean up all queues, except inputq: */ - __skb_queue_purge(&l_ptr->outqueue); - __skb_queue_purge(&l_ptr->deferred_queue); + __skb_queue_purge(&l_ptr->transmq); + __skb_queue_purge(&l_ptr->backlogq); + __skb_queue_purge(&l_ptr->deferdq); if (!owner->inputq) owner->inputq = &l_ptr->inputq; skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq); if (!skb_queue_empty(owner->inputq)) owner->action_flags |= TIPC_MSG_EVT; - l_ptr->next_out = NULL; - l_ptr->unacked_window = 0; + l_ptr->rcv_unacked = 0; l_ptr->checkpoint = 1; l_ptr->next_out_no = 1; l_ptr->fsm_msg_cnt = 0; @@ -742,54 +744,51 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *list) { struct tipc_msg *msg = buf_msg(skb_peek(list)); - uint psz = msg_size(msg); - uint sndlim = link->queue_limit[0]; + unsigned int maxwin = link->window; uint imp = tipc_msg_tot_importance(msg); uint mtu = link->max_pkt; uint ack = mod(link->next_in_no - 1); uint seqno = link->next_out_no; uint bc_last_in = link->owner->bclink.last_in; struct tipc_media_addr *addr = &link->media_addr; - struct sk_buff_head *outqueue = &link->outqueue; + struct sk_buff_head *transmq = &link->transmq; + struct sk_buff_head *backlogq = &link->backlogq; struct sk_buff *skb, *tmp; /* Match queue limits against msg importance: */ - if (unlikely(skb_queue_len(outqueue) >= link->queue_limit[imp])) + if (unlikely(skb_queue_len(backlogq) >= link->queue_limit[imp])) return tipc_link_cong(link, list); /* Has valid packet limit been used ? */ - if (unlikely(psz > mtu)) { + if (unlikely(msg_size(msg) > mtu)) { __skb_queue_purge(list); return -EMSGSIZE; } - /* Prepare each packet for sending, and add to outqueue: */ + /* Prepare each packet for sending, and add to relevant queue: */ skb_queue_walk_safe(list, skb, tmp) { __skb_unlink(skb, list); msg = buf_msg(skb); - msg_set_word(msg, 2, ((ack << 16) | mod(seqno))); + msg_set_seqno(msg, seqno); + msg_set_ack(msg, ack); msg_set_bcast_ack(msg, bc_last_in); - if (skb_queue_len(outqueue) < sndlim) { - __skb_queue_tail(outqueue, skb); - tipc_bearer_send(net, link->bearer_id, - skb, addr); - link->next_out = NULL; - link->unacked_window = 0; - } else if (tipc_msg_bundle(outqueue, skb, mtu)) { + if (likely(skb_queue_len(transmq) < maxwin)) { + __skb_queue_tail(transmq, skb); + tipc_bearer_send(net, link->bearer_id, skb, addr); + link->rcv_unacked = 0; + seqno++; + continue; + } + if (tipc_msg_bundle(skb_peek_tail(backlogq), skb, mtu)) { link->stats.sent_bundled++; continue; - } else if (tipc_msg_make_bundle(outqueue, skb, mtu, - link->addr)) { + } + if (tipc_msg_make_bundle(&skb, mtu, link->addr)) { link->stats.sent_bundled++; link->stats.sent_bundles++; - if (!link->next_out) - link->next_out = skb_peek_tail(outqueue); - } else { - __skb_queue_tail(outqueue, skb); - if (!link->next_out) - link->next_out = skb; } + __skb_queue_tail(backlogq, skb); seqno++; } link->next_out_no = seqno; @@ -895,14 +894,6 @@ static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf) kfree_skb(buf); } -struct sk_buff *tipc_skb_queue_next(const struct sk_buff_head *list, - const struct sk_buff *skb) -{ - if (skb_queue_is_last(list, skb)) - return NULL; - return skb->next; -} - /* * tipc_link_push_packets - push unsent packets to bearer * @@ -911,30 +902,23 @@ struct sk_buff *tipc_skb_queue_next(const struct sk_buff_head *list, * * Called with node locked */ -void tipc_link_push_packets(struct tipc_link *l_ptr) +void tipc_link_push_packets(struct tipc_link *link) { - struct sk_buff_head *outqueue = &l_ptr->outqueue; - struct sk_buff *skb = l_ptr->next_out; + struct sk_buff *skb; struct tipc_msg *msg; - u32 next, first; + unsigned int ack = mod(link->next_in_no - 1); - skb_queue_walk_from(outqueue, skb) { - msg = buf_msg(skb); - next = msg_seqno(msg); - first = buf_seqno(skb_peek(outqueue)); - - if (mod(next - first) < l_ptr->queue_limit[0]) { - msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); - msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); - if (msg_user(msg) == MSG_BUNDLER) - TIPC_SKB_CB(skb)->bundling = false; - tipc_bearer_send(l_ptr->owner->net, - l_ptr->bearer_id, skb, - &l_ptr->media_addr); - l_ptr->next_out = tipc_skb_queue_next(outqueue, skb); - } else { + while (skb_queue_len(&link->transmq) < link->window) { + skb = __skb_dequeue(&link->backlogq); + if (!skb) break; - } + msg = buf_msg(skb); + msg_set_ack(msg, ack); + msg_set_bcast_ack(msg, link->owner->bclink.last_in); + link->rcv_unacked = 0; + __skb_queue_tail(&link->transmq, skb); + tipc_bearer_send(link->owner->net, link->bearer_id, + skb, &link->media_addr); } } @@ -1021,8 +1005,8 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb, l_ptr->stale_count = 1; } - skb_queue_walk_from(&l_ptr->outqueue, skb) { - if (!retransmits || skb == l_ptr->next_out) + skb_queue_walk_from(&l_ptr->transmq, skb) { + if (!retransmits) break; msg = buf_msg(skb); msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); @@ -1039,12 +1023,12 @@ static void link_retrieve_defq(struct tipc_link *link, { u32 seq_no; - if (skb_queue_empty(&link->deferred_queue)) + if (skb_queue_empty(&link->deferdq)) return; - seq_no = buf_seqno(skb_peek(&link->deferred_queue)); + seq_no = buf_seqno(skb_peek(&link->deferdq)); if (seq_no == mod(link->next_in_no)) - skb_queue_splice_tail_init(&link->deferred_queue, list); + skb_queue_splice_tail_init(&link->deferdq, list); } /** @@ -1121,17 +1105,16 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg)); released = 0; - skb_queue_walk_safe(&l_ptr->outqueue, skb1, tmp) { - if (skb1 == l_ptr->next_out || - more(buf_seqno(skb1), ackd)) + skb_queue_walk_safe(&l_ptr->transmq, skb1, tmp) { + if (more(buf_seqno(skb1), ackd)) break; - __skb_unlink(skb1, &l_ptr->outqueue); + __skb_unlink(skb1, &l_ptr->transmq); kfree_skb(skb1); released = 1; } /* Try sending any messages link endpoint has pending */ - if (unlikely(l_ptr->next_out)) + if (unlikely(skb_queue_len(&l_ptr->backlogq))) tipc_link_push_packets(l_ptr); if (released && !skb_queue_empty(&l_ptr->wakeupq)) @@ -1166,10 +1149,9 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) goto unlock; } l_ptr->next_in_no++; - if (unlikely(!skb_queue_empty(&l_ptr->deferred_queue))) + if (unlikely(!skb_queue_empty(&l_ptr->deferdq))) link_retrieve_defq(l_ptr, &head); - - if (unlikely(++l_ptr->unacked_window >= TIPC_MIN_LINK_WIN)) { + if (unlikely(++l_ptr->rcv_unacked >= TIPC_MIN_LINK_WIN)) { l_ptr->stats.sent_acks++; tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); } @@ -1336,9 +1318,9 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, return; } - if (tipc_link_defer_pkt(&l_ptr->deferred_queue, buf)) { + if (tipc_link_defer_pkt(&l_ptr->deferdq, buf)) { l_ptr->stats.deferred_recv++; - if ((skb_queue_len(&l_ptr->deferred_queue) % 16) == 1) + if ((skb_queue_len(&l_ptr->deferdq) % TIPC_MIN_LINK_WIN) == 1) tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); } else { l_ptr->stats.duplicates++; @@ -1375,11 +1357,11 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, if (!tipc_link_is_up(l_ptr)) return; - if (l_ptr->next_out) - next_sent = buf_seqno(l_ptr->next_out); + if (skb_queue_len(&l_ptr->backlogq)) + next_sent = buf_seqno(skb_peek(&l_ptr->backlogq)); msg_set_next_sent(msg, next_sent); - if (!skb_queue_empty(&l_ptr->deferred_queue)) { - u32 rec = buf_seqno(skb_peek(&l_ptr->deferred_queue)); + if (!skb_queue_empty(&l_ptr->deferdq)) { + u32 rec = buf_seqno(skb_peek(&l_ptr->deferdq)); gap = mod(rec - mod(l_ptr->next_in_no)); } msg_set_seq_gap(msg, gap); @@ -1431,10 +1413,9 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg)); buf->priority = TC_PRIO_CONTROL; - tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, buf, &l_ptr->media_addr); - l_ptr->unacked_window = 0; + l_ptr->rcv_unacked = 0; kfree_skb(buf); } @@ -1569,7 +1550,7 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, } if (msg_seq_gap(msg)) { l_ptr->stats.recv_nacks++; - tipc_link_retransmit(l_ptr, skb_peek(&l_ptr->outqueue), + tipc_link_retransmit(l_ptr, skb_peek(&l_ptr->transmq), msg_seq_gap(msg)); } break; @@ -1616,7 +1597,7 @@ static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr, */ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) { - u32 msgcount = skb_queue_len(&l_ptr->outqueue); + int msgcount; struct tipc_link *tunnel = l_ptr->owner->active_links[0]; struct tipc_msg tunnel_hdr; struct sk_buff *skb; @@ -1627,10 +1608,12 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, CHANGEOVER_PROTOCOL, ORIGINAL_MSG, INT_H_SIZE, l_ptr->addr); + skb_queue_splice_tail_init(&l_ptr->backlogq, &l_ptr->transmq); + msgcount = skb_queue_len(&l_ptr->transmq); msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id); msg_set_msgcnt(&tunnel_hdr, msgcount); - if (skb_queue_empty(&l_ptr->outqueue)) { + if (skb_queue_empty(&l_ptr->transmq)) { skb = tipc_buf_acquire(INT_H_SIZE); if (skb) { skb_copy_to_linear_data(skb, &tunnel_hdr, INT_H_SIZE); @@ -1646,7 +1629,7 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) split_bundles = (l_ptr->owner->active_links[0] != l_ptr->owner->active_links[1]); - skb_queue_walk(&l_ptr->outqueue, skb) { + skb_queue_walk(&l_ptr->transmq, skb) { struct tipc_msg *msg = buf_msg(skb); if ((msg_user(msg) == MSG_BUNDLER) && split_bundles) { @@ -1677,39 +1660,46 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) * and sequence order is preserved per sender/receiver socket pair. * Owner node is locked. */ -void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, - struct tipc_link *tunnel) +void tipc_link_dup_queue_xmit(struct tipc_link *link, + struct tipc_link *tnl) { struct sk_buff *skb; - struct tipc_msg tunnel_hdr; - - tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, CHANGEOVER_PROTOCOL, - DUPLICATE_MSG, INT_H_SIZE, l_ptr->addr); - msg_set_msgcnt(&tunnel_hdr, skb_queue_len(&l_ptr->outqueue)); - msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id); - skb_queue_walk(&l_ptr->outqueue, skb) { + struct tipc_msg tnl_hdr; + struct sk_buff_head *queue = &link->transmq; + int mcnt; + + tipc_msg_init(link_own_addr(link), &tnl_hdr, CHANGEOVER_PROTOCOL, + DUPLICATE_MSG, INT_H_SIZE, link->addr); + mcnt = skb_queue_len(&link->transmq) + skb_queue_len(&link->backlogq); + msg_set_msgcnt(&tnl_hdr, mcnt); + msg_set_bearer_id(&tnl_hdr, link->peer_bearer_id); + +tunnel_queue: + skb_queue_walk(queue, skb) { struct sk_buff *outskb; struct tipc_msg *msg = buf_msg(skb); - u32 length = msg_size(msg); + u32 len = msg_size(msg); - if (msg_user(msg) == MSG_BUNDLER) - msg_set_type(msg, CLOSED_MSG); - msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); /* Update */ - msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); - msg_set_size(&tunnel_hdr, length + INT_H_SIZE); - outskb = tipc_buf_acquire(length + INT_H_SIZE); + msg_set_ack(msg, mod(link->next_in_no - 1)); + msg_set_bcast_ack(msg, link->owner->bclink.last_in); + msg_set_size(&tnl_hdr, len + INT_H_SIZE); + outskb = tipc_buf_acquire(len + INT_H_SIZE); if (outskb == NULL) { pr_warn("%sunable to send duplicate msg\n", link_co_err); return; } - skb_copy_to_linear_data(outskb, &tunnel_hdr, INT_H_SIZE); - skb_copy_to_linear_data_offset(outskb, INT_H_SIZE, skb->data, - length); - __tipc_link_xmit_skb(tunnel, outskb); - if (!tipc_link_is_up(l_ptr)) + skb_copy_to_linear_data(outskb, &tnl_hdr, INT_H_SIZE); + skb_copy_to_linear_data_offset(outskb, INT_H_SIZE, + skb->data, len); + __tipc_link_xmit_skb(tnl, outskb); + if (!tipc_link_is_up(link)) return; } + if (queue == &link->backlogq) + return; + queue = &link->backlogq; + goto tunnel_queue; } /* tipc_link_dup_rcv(): Receive a tunnelled DUPLICATE_MSG packet. @@ -1823,6 +1813,8 @@ static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol) void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window) { + l_ptr->window = window; + /* Data messages from this node, inclusive FIRST_FRAGM */ l_ptr->queue_limit[TIPC_LOW_IMPORTANCE] = window; l_ptr->queue_limit[TIPC_MEDIUM_IMPORTANCE] = (window / 3) * 4; diff --git a/net/tipc/link.h b/net/tipc/link.h index 7aeb52092bf3..eec3ecf2d450 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -124,7 +124,8 @@ struct tipc_stats { * @max_pkt: current maximum packet size for this link * @max_pkt_target: desired maximum packet size for this link * @max_pkt_probes: # of probes based on current (max_pkt, max_pkt_target) - * @outqueue: outbound message queue + * @transmitq: queue for sent, non-acked messages + * @backlogq: queue for messages waiting to be sent * @next_out_no: next sequence number to use for outbound messages * @last_retransmitted: sequence number of most recently retransmitted message * @stale_count: # of identical retransmit requests made by peer @@ -177,20 +178,21 @@ struct tipc_link { u32 max_pkt_probes; /* Sending */ - struct sk_buff_head outqueue; + struct sk_buff_head transmq; + struct sk_buff_head backlogq; u32 next_out_no; + u32 window; u32 last_retransmitted; u32 stale_count; /* Reception */ u32 next_in_no; - struct sk_buff_head deferred_queue; - u32 unacked_window; + u32 rcv_unacked; + struct sk_buff_head deferdq; struct sk_buff_head inputq; struct sk_buff_head namedq; /* Congestion handling */ - struct sk_buff *next_out; struct sk_buff_head wakeupq; /* Fragmentation/reassembly */ @@ -302,9 +304,4 @@ static inline int link_reset_reset(struct tipc_link *l_ptr) return l_ptr->state == RESET_RESET; } -static inline int link_congested(struct tipc_link *l_ptr) -{ - return skb_queue_len(&l_ptr->outqueue) >= l_ptr->queue_limit[0]; -} - #endif diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 333d2ae1cf76..47c8fd8e2fb2 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -330,33 +330,36 @@ error: /** * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one - * @list: the buffer chain of the existing buffer ("bundle") + * @bskb: the buffer to append to ("bundle") * @skb: buffer to be appended * @mtu: max allowable size for the bundle buffer * Consumes buffer if successful * Returns true if bundling could be performed, otherwise false */ -bool tipc_msg_bundle(struct sk_buff_head *list, struct sk_buff *skb, u32 mtu) +bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) { - struct sk_buff *bskb = skb_peek_tail(list); - struct tipc_msg *bmsg = buf_msg(bskb); + struct tipc_msg *bmsg; struct tipc_msg *msg = buf_msg(skb); - unsigned int bsz = msg_size(bmsg); + unsigned int bsz; unsigned int msz = msg_size(msg); - u32 start = align(bsz); + u32 start, pad; u32 max = mtu - INT_H_SIZE; - u32 pad = start - bsz; if (likely(msg_user(msg) == MSG_FRAGMENTER)) return false; + if (!bskb) + return false; + bmsg = buf_msg(bskb); + bsz = msg_size(bmsg); + start = align(bsz); + pad = start - bsz; + if (unlikely(msg_user(msg) == CHANGEOVER_PROTOCOL)) return false; if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) return false; if (likely(msg_user(bmsg) != MSG_BUNDLER)) return false; - if (likely(!TIPC_SKB_CB(bskb)->bundling)) - return false; if (unlikely(skb_tailroom(bskb) < (pad + msz))) return false; if (unlikely(max < (start + msz))) @@ -419,12 +422,11 @@ none: * Replaces buffer if successful * Returns true if success, otherwise false */ -bool tipc_msg_make_bundle(struct sk_buff_head *list, - struct sk_buff *skb, u32 mtu, u32 dnode) +bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode) { struct sk_buff *bskb; struct tipc_msg *bmsg; - struct tipc_msg *msg = buf_msg(skb); + struct tipc_msg *msg = buf_msg(*skb); u32 msz = msg_size(msg); u32 max = mtu - INT_H_SIZE; @@ -448,9 +450,9 @@ bool tipc_msg_make_bundle(struct sk_buff_head *list, msg_set_seqno(bmsg, msg_seqno(msg)); msg_set_ack(bmsg, msg_ack(msg)); msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); - TIPC_SKB_CB(bskb)->bundling = true; - __skb_queue_tail(list, bskb); - return tipc_msg_bundle(list, skb, mtu); + tipc_msg_bundle(bskb, *skb, mtu); + *skb = bskb; + return true; } /** diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 62306b8d2410..e5fc5fdb2ea7 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -767,9 +767,9 @@ struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); -bool tipc_msg_bundle(struct sk_buff_head *list, struct sk_buff *skb, u32 mtu); -bool tipc_msg_make_bundle(struct sk_buff_head *list, - struct sk_buff *skb, u32 mtu, u32 dnode); +bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu); + +bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); diff --git a/net/tipc/node.c b/net/tipc/node.c index 86152de8248d..26d1de1bf34d 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -111,7 +111,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) INIT_LIST_HEAD(&n_ptr->list); INIT_LIST_HEAD(&n_ptr->publ_list); INIT_LIST_HEAD(&n_ptr->conn_sks); - __skb_queue_head_init(&n_ptr->bclink.deferred_queue); + __skb_queue_head_init(&n_ptr->bclink.deferdq); hlist_add_head_rcu(&n_ptr->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n_ptr->addr < temp_node->addr) @@ -354,7 +354,7 @@ static void node_lost_contact(struct tipc_node *n_ptr) /* Flush broadcast link info associated with lost node */ if (n_ptr->bclink.recv_permitted) { - __skb_queue_purge(&n_ptr->bclink.deferred_queue); + __skb_queue_purge(&n_ptr->bclink.deferdq); if (n_ptr->bclink.reasm_buf) { kfree_skb(n_ptr->bclink.reasm_buf); diff --git a/net/tipc/node.h b/net/tipc/node.h index f78be64e105b..e89ac04ec2c3 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -84,7 +84,7 @@ struct tipc_node_bclink { u32 last_sent; u32 oos_state; u32 deferred_size; - struct sk_buff_head deferred_queue; + struct sk_buff_head deferdq; struct sk_buff *reasm_buf; int inputq_map; bool recv_permitted; -- cgit v1.2.3 From e3eea1eb47ac616ee09cf0ae5d1e7790ef8461ea Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 13 Mar 2015 16:08:11 -0400 Subject: tipc: clean up handling of message priorities Messages transferred by TIPC are assigned an "importance priority", -an integer value indicating how to treat the message when there is link or destination socket congestion. There is no separate header field for this value. Instead, the message user values have been chosen in ascending order according to perceived importance, so that the message user field can be used for this. This is not a good solution. First, we have many more users than the needed priority levels, so we end up with treating more priority levels than necessary. Second, the user field cannot always accurately reflect the priority of the message. E.g., a message fragment packet should really have the priority of the enveloped user data message, and not the priority of the MSG_FRAGMENTER user. Until now, we have been working around this problem in different ways, but it is now time to implement a consistent way of handling such priorities, although still within the constraint that we cannot allocate any more bits in the regular data message header for this. In this commit, we define a new priority level, TIPC_SYSTEM_IMPORTANCE, that will be the only one used apart from the four (lower) user data levels. All non-data messages map down to this priority. Furthermore, we take some free bits from the MSG_FRAGMENTER header and allocate them to store the priority of the enveloped message. We then adjust the functions msg_importance()/msg_set_importance() so that they read/set the correct header fields depending on user type. This small protocol change is fully compatible, because the code at the receiving end of a link currently reads the importance level only from user data messages, where there is no change. Reviewed-by: Erik Hugne Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 1 - net/tipc/link.c | 40 +++++++++++++--------------------- net/tipc/msg.c | 5 +---- net/tipc/msg.h | 65 +++++++++++++++++++++++++++++--------------------------- 4 files changed, 50 insertions(+), 61 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 17cb0ff5f344..5aff0844d4d3 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -383,7 +383,6 @@ int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list) __skb_queue_purge(list); return -EHOSTUNREACH; } - /* Broadcast to all nodes */ if (likely(bclink)) { tipc_bclink_lock(net); diff --git a/net/tipc/link.c b/net/tipc/link.c index 7e0036f5a364..bc49120bfb44 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -35,6 +35,7 @@ */ #include "core.h" +#include "subscr.h" #include "link.h" #include "bcast.h" #include "socket.h" @@ -305,12 +306,10 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, msg_set_session(msg, (tn->random & 0xffff)); msg_set_bearer_id(msg, b_ptr->identity); strcpy((char *)msg_data(msg), if_name); - - l_ptr->priority = b_ptr->priority; - tipc_link_set_queue_limits(l_ptr, b_ptr->window); - l_ptr->net_plane = b_ptr->net_plane; link_init_max_pkt(l_ptr); + l_ptr->priority = b_ptr->priority; + tipc_link_set_queue_limits(l_ptr, b_ptr->window); l_ptr->next_out_no = 1; __skb_queue_head_init(&l_ptr->transmq); @@ -708,7 +707,7 @@ static int tipc_link_cong(struct tipc_link *link, struct sk_buff_head *list) { struct sk_buff *skb = skb_peek(list); struct tipc_msg *msg = buf_msg(skb); - uint imp = tipc_msg_tot_importance(msg); + int imp = msg_importance(msg); u32 oport = msg_tot_origport(msg); if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { @@ -745,7 +744,7 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, { struct tipc_msg *msg = buf_msg(skb_peek(list)); unsigned int maxwin = link->window; - uint imp = tipc_msg_tot_importance(msg); + unsigned int imp = msg_importance(msg); uint mtu = link->max_pkt; uint ack = mod(link->next_in_no - 1); uint seqno = link->next_out_no; @@ -755,7 +754,7 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *backlogq = &link->backlogq; struct sk_buff *skb, *tmp; - /* Match queue limits against msg importance: */ + /* Match queue limit against msg importance: */ if (unlikely(skb_queue_len(backlogq) >= link->queue_limit[imp])) return tipc_link_cong(link, list); @@ -1811,25 +1810,16 @@ static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol) l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->cont_intv) / 4); } -void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window) +void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) { - l_ptr->window = window; - - /* Data messages from this node, inclusive FIRST_FRAGM */ - l_ptr->queue_limit[TIPC_LOW_IMPORTANCE] = window; - l_ptr->queue_limit[TIPC_MEDIUM_IMPORTANCE] = (window / 3) * 4; - l_ptr->queue_limit[TIPC_HIGH_IMPORTANCE] = (window / 3) * 5; - l_ptr->queue_limit[TIPC_CRITICAL_IMPORTANCE] = (window / 3) * 6; - /* Transiting data messages,inclusive FIRST_FRAGM */ - l_ptr->queue_limit[TIPC_LOW_IMPORTANCE + 4] = 300; - l_ptr->queue_limit[TIPC_MEDIUM_IMPORTANCE + 4] = 600; - l_ptr->queue_limit[TIPC_HIGH_IMPORTANCE + 4] = 900; - l_ptr->queue_limit[TIPC_CRITICAL_IMPORTANCE + 4] = 1200; - l_ptr->queue_limit[CONN_MANAGER] = 1200; - l_ptr->queue_limit[CHANGEOVER_PROTOCOL] = 2500; - l_ptr->queue_limit[NAME_DISTRIBUTOR] = 3000; - /* FRAGMENT and LAST_FRAGMENT packets */ - l_ptr->queue_limit[MSG_FRAGMENTER] = 4000; + int max_bulk = TIPC_MAX_PUBLICATIONS / (l->max_pkt / ITEM_SIZE); + + l->window = win; + l->queue_limit[TIPC_LOW_IMPORTANCE] = win / 2; + l->queue_limit[TIPC_MEDIUM_IMPORTANCE] = win; + l->queue_limit[TIPC_HIGH_IMPORTANCE] = win / 2 * 3; + l->queue_limit[TIPC_CRITICAL_IMPORTANCE] = win * 2; + l->queue_limit[TIPC_SYSTEM_IMPORTANCE] = max_bulk; } /* tipc_link_find_owner - locate owner node of link by link's name diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 47c8fd8e2fb2..0c6dad8180a0 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -272,6 +272,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, FIRST_FRAGMENT, INT_H_SIZE, msg_destnode(mhdr)); msg_set_size(&pkthdr, pktmax); msg_set_fragm_no(&pkthdr, pktno); + msg_set_importance(&pkthdr, msg_importance(mhdr)); /* Prepare first fragment */ skb = tipc_buf_acquire(pktmax); @@ -467,7 +468,6 @@ bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, int err) { struct tipc_msg *msg = buf_msg(buf); - uint imp = msg_importance(msg); struct tipc_msg ohdr; uint rdsz = min_t(uint, msg_data_sz(msg), MAX_FORWARD_SIZE); @@ -479,9 +479,6 @@ bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, if (msg_errcode(msg)) goto exit; memcpy(&ohdr, msg, msg_hdr_sz(msg)); - imp = min_t(uint, imp + 1, TIPC_CRITICAL_IMPORTANCE); - if (msg_isdata(msg)) - msg_set_importance(msg, imp); msg_set_errcode(msg, err); msg_set_origport(msg, msg_destport(&ohdr)); msg_set_destport(msg, msg_origport(&ohdr)); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index e5fc5fdb2ea7..bd3969a80dd4 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -54,6 +54,8 @@ struct plist; * - TIPC_HIGH_IMPORTANCE * - TIPC_CRITICAL_IMPORTANCE */ +#define TIPC_SYSTEM_IMPORTANCE 4 + /* * Payload message types @@ -63,6 +65,19 @@ struct plist; #define TIPC_NAMED_MSG 2 #define TIPC_DIRECT_MSG 3 +/* + * Internal message users + */ +#define BCAST_PROTOCOL 5 +#define MSG_BUNDLER 6 +#define LINK_PROTOCOL 7 +#define CONN_MANAGER 8 +#define CHANGEOVER_PROTOCOL 10 +#define NAME_DISTRIBUTOR 11 +#define MSG_FRAGMENTER 12 +#define LINK_CONFIG 13 +#define SOCK_WAKEUP 14 /* pseudo user */ + /* * Message header sizes */ @@ -170,16 +185,6 @@ static inline void msg_set_user(struct tipc_msg *m, u32 n) msg_set_bits(m, 0, 25, 0xf, n); } -static inline u32 msg_importance(struct tipc_msg *m) -{ - return msg_bits(m, 0, 25, 0xf); -} - -static inline void msg_set_importance(struct tipc_msg *m, u32 i) -{ - msg_set_user(m, i); -} - static inline u32 msg_hdr_sz(struct tipc_msg *m) { return msg_bits(m, 0, 21, 0xf) << 2; @@ -336,6 +341,25 @@ static inline void msg_set_seqno(struct tipc_msg *m, u32 n) /* * Words 3-10 */ +static inline u32 msg_importance(struct tipc_msg *m) +{ + if (unlikely(msg_user(m) == MSG_FRAGMENTER)) + return msg_bits(m, 5, 13, 0x7); + if (likely(msg_isdata(m) && !msg_errcode(m))) + return msg_user(m); + return TIPC_SYSTEM_IMPORTANCE; +} + +static inline void msg_set_importance(struct tipc_msg *m, u32 i) +{ + if (unlikely(msg_user(m) == MSG_FRAGMENTER)) + msg_set_bits(m, 5, 13, 0x7, i); + else if (likely(i < TIPC_SYSTEM_IMPORTANCE)) + msg_set_user(m, i); + else + pr_warn("Trying to set illegal importance in message\n"); +} + static inline u32 msg_prevnode(struct tipc_msg *m) { return msg_word(m, 3); @@ -457,20 +481,6 @@ static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) * Constants and routines used to read and write TIPC internal message headers */ -/* - * Internal message users - */ -#define BCAST_PROTOCOL 5 -#define MSG_BUNDLER 6 -#define LINK_PROTOCOL 7 -#define CONN_MANAGER 8 -#define ROUTE_DISTRIBUTOR 9 /* obsoleted */ -#define CHANGEOVER_PROTOCOL 10 -#define NAME_DISTRIBUTOR 11 -#define MSG_FRAGMENTER 12 -#define LINK_CONFIG 13 -#define SOCK_WAKEUP 14 /* pseudo user */ - /* * Connection management protocol message types */ @@ -743,13 +753,6 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } -static inline u32 tipc_msg_tot_importance(struct tipc_msg *m) -{ - if ((msg_user(m) == MSG_FRAGMENTER) && (msg_type(m) == FIRST_FRAGMENT)) - return msg_importance(msg_get_wrapped(m)); - return msg_importance(m); -} - static inline u32 msg_tot_origport(struct tipc_msg *m) { if ((msg_user(m) == MSG_FRAGMENTER) && (msg_type(m) == FIRST_FRAGMENT)) -- cgit v1.2.3 From 3bd88ee7a2ea19dffe384e12fe452c59d9e53c29 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 19 Mar 2015 09:02:18 +0100 Subject: tipc: do not report -EHOSTUNREACH for failed local delivery Since commit 1186adf7df04 ("tipc: simplify message forwarding and rejection in socket layer") -EHOSTUNREACH is propagated back to the sending process if we fail to deliver the message to another socket local to the node. This is wrong, host unreachable should only be reported when the destination port/name does not exist in the cluster, and that check is always done before sending the message. Also, this introduces inconsistent sendmsg() behavior for local/remote destinations. Errors occurring on the receiving side should not trickle up to the sender. If message delivery fails TIPC should either discard the packet or reject it back to the sender based on the destination droppable option. Signed-off-by: Erik Hugne Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/link.c b/net/tipc/link.c index bc49120bfb44..8c98c4d00ad6 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -845,8 +845,10 @@ int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, if (link) return rc; - if (likely(in_own_node(net, dnode))) - return tipc_sk_rcv(net, list); + if (likely(in_own_node(net, dnode))) { + tipc_sk_rcv(net, list); + return 0; + } __skb_queue_purge(list); return rc; -- cgit v1.2.3 From 1f66d161ab3d8b518903fa6c3f9c1f48d6919e74 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 25 Mar 2015 12:07:24 -0400 Subject: tipc: introduce starvation free send algorithm Currently, we only use a single counter; the length of the backlog queue, to determine whether a message should be accepted to the queue or not. Each time a message is being sent, the queue length is compared to a threshold value for the message's importance priority. If the queue length is beyond this threshold, the message is rejected. This algorithm implies a risk of starvation of low importance senders during very high load, because it may take a long time before the backlog queue has decreased enough to accept a lower level message. We now eliminate this risk by introducing a counter for each importance priority. When a message is sent, we check only the queue level for that particular message's priority. If that is ok, the message can be added to the backlog, irrespective of the queue level for other priorities. This way, each level is guaranteed a certain portion of the total bandwidth, and any risk of starvation is eliminated. Reviewed-by: Ying Xue Reviewed-by: Erik Hugne Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 2 +- net/tipc/link.c | 58 +++++++++++++++++++++++++++++++++++--------------------- net/tipc/link.h | 7 +++++-- 3 files changed, 42 insertions(+), 25 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 52307397e0b1..79355531c3e2 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -831,7 +831,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP); if (!prop) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->queue_limit[0])) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window)) goto prop_msg_full; nla_nest_end(msg->skb, prop); diff --git a/net/tipc/link.c b/net/tipc/link.c index 8c98c4d00ad6..b9325a1bddaa 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -310,7 +310,6 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, link_init_max_pkt(l_ptr); l_ptr->priority = b_ptr->priority; tipc_link_set_queue_limits(l_ptr, b_ptr->window); - l_ptr->next_out_no = 1; __skb_queue_head_init(&l_ptr->transmq); __skb_queue_head_init(&l_ptr->backlogq); @@ -398,19 +397,22 @@ static bool link_schedule_user(struct tipc_link *link, u32 oport, * Move a number of waiting users, as permitted by available space in * the send queue, from link wait queue to node wait queue for wakeup */ -void link_prepare_wakeup(struct tipc_link *link) +void link_prepare_wakeup(struct tipc_link *l) { - uint pend_qsz = skb_queue_len(&link->backlogq); + int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,}; + int imp, lim; struct sk_buff *skb, *tmp; - skb_queue_walk_safe(&link->wakeupq, skb, tmp) { - if (pend_qsz >= link->queue_limit[TIPC_SKB_CB(skb)->chain_imp]) + skb_queue_walk_safe(&l->wakeupq, skb, tmp) { + imp = TIPC_SKB_CB(skb)->chain_imp; + lim = l->window + l->backlog[imp].limit; + pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; + if ((pnd[imp] + l->backlog[imp].len) >= lim) break; - pend_qsz += TIPC_SKB_CB(skb)->chain_sz; - skb_unlink(skb, &link->wakeupq); - skb_queue_tail(&link->inputq, skb); - link->owner->inputq = &link->inputq; - link->owner->action_flags |= TIPC_MSG_EVT; + skb_unlink(skb, &l->wakeupq); + skb_queue_tail(&l->inputq, skb); + l->owner->inputq = &l->inputq; + l->owner->action_flags |= TIPC_MSG_EVT; } } @@ -424,6 +426,16 @@ void tipc_link_reset_fragments(struct tipc_link *l_ptr) l_ptr->reasm_buf = NULL; } +static void tipc_link_purge_backlog(struct tipc_link *l) +{ + __skb_queue_purge(&l->backlogq); + l->backlog[TIPC_LOW_IMPORTANCE].len = 0; + l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0; + l->backlog[TIPC_HIGH_IMPORTANCE].len = 0; + l->backlog[TIPC_CRITICAL_IMPORTANCE].len = 0; + l->backlog[TIPC_SYSTEM_IMPORTANCE].len = 0; +} + /** * tipc_link_purge_queues - purge all pkt queues associated with link * @l_ptr: pointer to link @@ -432,7 +444,7 @@ void tipc_link_purge_queues(struct tipc_link *l_ptr) { __skb_queue_purge(&l_ptr->deferdq); __skb_queue_purge(&l_ptr->transmq); - __skb_queue_purge(&l_ptr->backlogq); + tipc_link_purge_backlog(l_ptr); tipc_link_reset_fragments(l_ptr); } @@ -466,13 +478,13 @@ void tipc_link_reset(struct tipc_link *l_ptr) /* Clean up all queues, except inputq: */ __skb_queue_purge(&l_ptr->transmq); - __skb_queue_purge(&l_ptr->backlogq); __skb_queue_purge(&l_ptr->deferdq); if (!owner->inputq) owner->inputq = &l_ptr->inputq; skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq); if (!skb_queue_empty(owner->inputq)) owner->action_flags |= TIPC_MSG_EVT; + tipc_link_purge_backlog(l_ptr); l_ptr->rcv_unacked = 0; l_ptr->checkpoint = 1; l_ptr->next_out_no = 1; @@ -754,16 +766,14 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *backlogq = &link->backlogq; struct sk_buff *skb, *tmp; - /* Match queue limit against msg importance: */ - if (unlikely(skb_queue_len(backlogq) >= link->queue_limit[imp])) + /* Match backlog limit against msg importance: */ + if (unlikely(link->backlog[imp].len >= link->backlog[imp].limit)) return tipc_link_cong(link, list); - /* Has valid packet limit been used ? */ if (unlikely(msg_size(msg) > mtu)) { __skb_queue_purge(list); return -EMSGSIZE; } - /* Prepare each packet for sending, and add to relevant queue: */ skb_queue_walk_safe(list, skb, tmp) { __skb_unlink(skb, list); @@ -786,8 +796,10 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, if (tipc_msg_make_bundle(&skb, mtu, link->addr)) { link->stats.sent_bundled++; link->stats.sent_bundles++; + imp = msg_importance(buf_msg(skb)); } __skb_queue_tail(backlogq, skb); + link->backlog[imp].len++; seqno++; } link->next_out_no = seqno; @@ -914,6 +926,7 @@ void tipc_link_push_packets(struct tipc_link *link) if (!skb) break; msg = buf_msg(skb); + link->backlog[msg_importance(msg)].len--; msg_set_ack(msg, ack); msg_set_bcast_ack(msg, link->owner->bclink.last_in); link->rcv_unacked = 0; @@ -1610,6 +1623,7 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, CHANGEOVER_PROTOCOL, ORIGINAL_MSG, INT_H_SIZE, l_ptr->addr); skb_queue_splice_tail_init(&l_ptr->backlogq, &l_ptr->transmq); + tipc_link_purge_backlog(l_ptr); msgcount = skb_queue_len(&l_ptr->transmq); msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id); msg_set_msgcnt(&tunnel_hdr, msgcount); @@ -1817,11 +1831,11 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) int max_bulk = TIPC_MAX_PUBLICATIONS / (l->max_pkt / ITEM_SIZE); l->window = win; - l->queue_limit[TIPC_LOW_IMPORTANCE] = win / 2; - l->queue_limit[TIPC_MEDIUM_IMPORTANCE] = win; - l->queue_limit[TIPC_HIGH_IMPORTANCE] = win / 2 * 3; - l->queue_limit[TIPC_CRITICAL_IMPORTANCE] = win * 2; - l->queue_limit[TIPC_SYSTEM_IMPORTANCE] = max_bulk; + l->backlog[TIPC_LOW_IMPORTANCE].limit = win / 2; + l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = win; + l->backlog[TIPC_HIGH_IMPORTANCE].limit = win / 2 * 3; + l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = win * 2; + l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; } /* tipc_link_find_owner - locate owner node of link by link's name @@ -2120,7 +2134,7 @@ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, link->tolerance)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, - link->queue_limit[TIPC_LOW_IMPORTANCE])) + link->window)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, link->priority)) goto prop_msg_full; diff --git a/net/tipc/link.h b/net/tipc/link.h index eec3ecf2d450..99543a46095a 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -118,7 +118,7 @@ struct tipc_stats { * @pmsg: convenience pointer to "proto_msg" field * @priority: current link priority * @net_plane: current link network plane ('A' through 'H') - * @queue_limit: outbound message queue congestion thresholds (indexed by user) + * @backlog_limit: backlog queue congestion thresholds (indexed by importance) * @exp_msg_count: # of tunnelled messages expected during link changeover * @reset_checkpoint: seq # of last acknowledged message at time of link reset * @max_pkt: current maximum packet size for this link @@ -166,7 +166,6 @@ struct tipc_link { struct tipc_msg *pmsg; u32 priority; char net_plane; - u32 queue_limit[15]; /* queue_limit[0]==window limit */ /* Changeover */ u32 exp_msg_count; @@ -180,6 +179,10 @@ struct tipc_link { /* Sending */ struct sk_buff_head transmq; struct sk_buff_head backlogq; + struct { + u16 len; + u16 limit; + } backlog[5]; u32 next_out_no; u32 window; u32 last_retransmitted; -- cgit v1.2.3 From 3127a0200d4a46cf279bb388cc0f71827cd60699 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 25 Mar 2015 12:07:25 -0400 Subject: tipc: clean up handling of link congestion After the recent changes in message importance handling it becomes possible to simplify handling of messages and sockets when we encounter link congestion. We merge the function tipc_link_cong() into link_schedule_user(), and simplify the code of the latter. The code should now be easier to follow, especially regarding return codes and handling of the message that caused the situation. In case the scheduling function is unable to pre-allocate a wakeup message buffer, it now returns -ENOBUFS, which is a more correct code than the previously used -EHOSTUNREACH. Reviewed-by: Ying Xue Reviewed-by: Erik Hugne Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 104 ++++++++++++++++++++++++++------------------------------ net/tipc/msg.h | 28 ++++++--------- 2 files changed, 60 insertions(+), 72 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/link.c b/net/tipc/link.c index b9325a1bddaa..58e2460682da 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -367,28 +367,43 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id, } /** - * link_schedule_user - schedule user for wakeup after congestion + * link_schedule_user - schedule a message sender for wakeup after congestion * @link: congested link - * @oport: sending port - * @chain_sz: size of buffer chain that was attempted sent - * @imp: importance of message attempted sent + * @list: message that was attempted sent * Create pseudo msg to send back to user when congestion abates + * Only consumes message if there is an error */ -static bool link_schedule_user(struct tipc_link *link, u32 oport, - uint chain_sz, uint imp) +static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) { - struct sk_buff *buf; + struct tipc_msg *msg = buf_msg(skb_peek(list)); + int imp = msg_importance(msg); + u32 oport = msg_origport(msg); + u32 addr = link_own_addr(link); + struct sk_buff *skb; - buf = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, - link_own_addr(link), link_own_addr(link), - oport, 0, 0); - if (!buf) - return false; - TIPC_SKB_CB(buf)->chain_sz = chain_sz; - TIPC_SKB_CB(buf)->chain_imp = imp; - skb_queue_tail(&link->wakeupq, buf); + /* This really cannot happen... */ + if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { + pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); + tipc_link_reset(link); + goto err; + } + /* Non-blocking sender: */ + if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending) + return -ELINKCONG; + + /* Create and schedule wakeup pseudo message */ + skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, + addr, addr, oport, 0, 0); + if (!skb) + goto err; + TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list); + TIPC_SKB_CB(skb)->chain_imp = imp; + skb_queue_tail(&link->wakeupq, skb); link->stats.link_congs++; - return true; + return -ELINKCONG; +err: + __skb_queue_purge(list); + return -ENOBUFS; } /** @@ -708,48 +723,15 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) } } -/* tipc_link_cong: determine return value and how to treat the - * sent buffer during link congestion. - * - For plain, errorless user data messages we keep the buffer and - * return -ELINKONG. - * - For all other messages we discard the buffer and return -EHOSTUNREACH - * - For TIPC internal messages we also reset the link - */ -static int tipc_link_cong(struct tipc_link *link, struct sk_buff_head *list) -{ - struct sk_buff *skb = skb_peek(list); - struct tipc_msg *msg = buf_msg(skb); - int imp = msg_importance(msg); - u32 oport = msg_tot_origport(msg); - - if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { - pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); - tipc_link_reset(link); - goto drop; - } - if (unlikely(msg_errcode(msg))) - goto drop; - if (unlikely(msg_reroute_cnt(msg))) - goto drop; - if (TIPC_SKB_CB(skb)->wakeup_pending) - return -ELINKCONG; - if (link_schedule_user(link, oport, skb_queue_len(list), imp)) - return -ELINKCONG; -drop: - __skb_queue_purge(list); - return -EHOSTUNREACH; -} - /** * __tipc_link_xmit(): same as tipc_link_xmit, but destlink is known & locked * @link: link to use * @list: chain of buffers containing message * - * Consumes the buffer chain, except when returning -ELINKCONG - * Returns 0 if success, otherwise errno: -ELINKCONG, -EMSGSIZE (plain socket - * user data messages) or -EHOSTUNREACH (all other messages/senders) - * Only the socket functions tipc_send_stream() and tipc_send_packet() need - * to act on the return value, since they may need to do more send attempts. + * Consumes the buffer chain, except when returning -ELINKCONG, + * since the caller then may want to make more send attempts. + * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS + * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted */ int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *list) @@ -768,7 +750,7 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, /* Match backlog limit against msg importance: */ if (unlikely(link->backlog[imp].len >= link->backlog[imp].limit)) - return tipc_link_cong(link, list); + return link_schedule_user(link, list); if (unlikely(msg_size(msg) > mtu)) { __skb_queue_purge(list); @@ -820,13 +802,25 @@ static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb) return __tipc_link_xmit(link->owner->net, link, &head); } +/* tipc_link_xmit_skb(): send single buffer to destination + * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE + * messages, which will not be rejected + * The only exception is datagram messages rerouted after secondary + * lookup, which are rare and safe to dispose of anyway. + * TODO: Return real return value, and let callers use + * tipc_wait_for_sendpkt() where applicable + */ int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, u32 selector) { struct sk_buff_head head; + int rc; skb2list(skb, &head); - return tipc_link_xmit(net, &head, dnode, selector); + rc = tipc_link_xmit(net, &head, dnode, selector); + if (rc == -ELINKCONG) + kfree_skb(skb); + return 0; } /** diff --git a/net/tipc/msg.h b/net/tipc/msg.h index bd3969a80dd4..6445db09c0c4 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -240,6 +240,15 @@ static inline void msg_set_size(struct tipc_msg *m, u32 sz) m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz); } +static inline unchar *msg_data(struct tipc_msg *m) +{ + return ((unchar *)m) + msg_hdr_sz(m); +} + +static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) +{ + return (struct tipc_msg *)msg_data(m); +} /* * Word 1 @@ -372,6 +381,8 @@ static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) static inline u32 msg_origport(struct tipc_msg *m) { + if (msg_user(m) == MSG_FRAGMENTER) + m = msg_get_wrapped(m); return msg_word(m, 4); } @@ -467,16 +478,6 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) msg_set_word(m, 10, n); } -static inline unchar *msg_data(struct tipc_msg *m) -{ - return ((unchar *)m) + msg_hdr_sz(m); -} - -static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) -{ - return (struct tipc_msg *)msg_data(m); -} - /* * Constants and routines used to read and write TIPC internal message headers */ @@ -753,13 +754,6 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } -static inline u32 msg_tot_origport(struct tipc_msg *m) -{ - if ((msg_user(m) == MSG_FRAGMENTER) && (msg_type(m) == FIRST_FRAGMENT)) - return msg_origport(msg_get_wrapped(m)); - return msg_origport(m); -} - struct sk_buff *tipc_buf_acquire(u32 size); bool tipc_msg_validate(struct sk_buff *skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, -- cgit v1.2.3 From 8b4ed8634f8b3f9aacfc42b4a872d30c36b9e255 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 25 Mar 2015 12:07:26 -0400 Subject: tipc: eliminate race condition at dual link establishment Despite recent improvements, the establishment of dual parallel links still has a small glitch where messages can bypass each other. When the second link in a dual-link configuration is established, part of the first link's traffic will be steered over to the new link. Although we do have a mechanism to ensure that packets sent before and after the establishment of the new link arrive in sequence to the destination node, this is not enough. The arriving messages will still be delivered upwards in different threads, something entailing a risk of message disordering during the transition phase. To fix this, we introduce a synchronization mechanism between the two parallel links, so that traffic arriving on the new link cannot be added to its input queue until we are guaranteed that all pre-establishment messages have been delivered on the old, parallel link. This problem seems to always have been around, but its occurrence is so rare that it has not been noticed until recent intensive testing. Reviewed-by: Ying Xue Reviewed-by: Erik Hugne Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ net/tipc/link.h | 2 ++ net/tipc/msg.h | 8 ++++++++ 3 files changed, 55 insertions(+) (limited to 'net/tipc/link.c') diff --git a/net/tipc/link.c b/net/tipc/link.c index 58e2460682da..1287161e9424 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -139,6 +139,13 @@ static void tipc_link_put(struct tipc_link *l_ptr) kref_put(&l_ptr->ref, tipc_link_release); } +static struct tipc_link *tipc_parallel_link(struct tipc_link *l) +{ + if (l->owner->active_links[0] != l) + return l->owner->active_links[0]; + return l->owner->active_links[1]; +} + static void link_init_max_pkt(struct tipc_link *l_ptr) { struct tipc_node *node = l_ptr->owner; @@ -1026,6 +1033,32 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb, } } +/* link_synch(): check if all packets arrived before the synch + * point have been consumed + * Returns true if the parallel links are synched, otherwise false + */ +static bool link_synch(struct tipc_link *l) +{ + unsigned int post_synch; + struct tipc_link *pl; + + pl = tipc_parallel_link(l); + if (pl == l) + goto synched; + + /* Was last pre-synch packet added to input queue ? */ + if (less_eq(pl->next_in_no, l->synch_point)) + return false; + + /* Is it still in the input queue ? */ + post_synch = mod(pl->next_in_no - l->synch_point) - 1; + if (skb_queue_len(&pl->inputq) > post_synch) + return false; +synched: + l->flags &= ~LINK_SYNCHING; + return true; +} + static void link_retrieve_defq(struct tipc_link *link, struct sk_buff_head *list) { @@ -1156,6 +1189,14 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) skb = NULL; goto unlock; } + /* Synchronize with parallel link if applicable */ + if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) { + link_handle_out_of_seq_msg(l_ptr, skb); + if (link_synch(l_ptr)) + link_retrieve_defq(l_ptr, &head); + skb = NULL; + goto unlock; + } l_ptr->next_in_no++; if (unlikely(!skb_queue_empty(&l_ptr->deferdq))) link_retrieve_defq(l_ptr, &head); @@ -1231,6 +1272,10 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) switch (msg_user(msg)) { case CHANGEOVER_PROTOCOL: + if (msg_dup(msg)) { + link->flags |= LINK_SYNCHING; + link->synch_point = msg_seqno(msg_get_wrapped(msg)); + } if (!tipc_link_tunnel_rcv(node, &skb)) break; if (msg_user(buf_msg(skb)) != MSG_BUNDLER) { diff --git a/net/tipc/link.h b/net/tipc/link.h index 99543a46095a..d2b5663643da 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -60,6 +60,7 @@ */ #define LINK_STARTED 0x0001 #define LINK_STOPPED 0x0002 +#define LINK_SYNCHING 0x0004 /* Starting value for maximum packet size negotiation on unicast links * (unless bearer MTU is less) @@ -170,6 +171,7 @@ struct tipc_link { /* Changeover */ u32 exp_msg_count; u32 reset_checkpoint; + u32 synch_point; /* Max packet negotiation */ u32 max_pkt; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 6445db09c0c4..d273207ede28 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -554,6 +554,14 @@ static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n) msg_set_bits(m, 1, 15, 0x1fff, n); } +static inline bool msg_dup(struct tipc_msg *m) +{ + if (likely(msg_user(m) != CHANGEOVER_PROTOCOL)) + return false; + if (msg_type(m) != DUPLICATE_MSG) + return false; + return true; +} /* * Word 2 -- cgit v1.2.3 From b952b2befb6f6b009e91f087285b9a0a6beb1cc8 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Thu, 26 Mar 2015 18:10:23 +0800 Subject: tipc: fix potential deadlock when all links are reset [ 60.988363] ====================================================== [ 60.988754] [ INFO: possible circular locking dependency detected ] [ 60.989152] 3.19.0+ #194 Not tainted [ 60.989377] ------------------------------------------------------- [ 60.989781] swapper/3/0 is trying to acquire lock: [ 60.990079] (&(&n_ptr->lock)->rlock){+.-...}, at: [] tipc_link_retransmit+0x1aa/0x240 [tipc] [ 60.990743] [ 60.990743] but task is already holding lock: [ 60.991106] (&(&bclink->lock)->rlock){+.-...}, at: [] tipc_bclink_lock+0x8e/0xa0 [tipc] [ 60.991738] [ 60.991738] which lock already depends on the new lock. [ 60.991738] [ 60.992174] [ 60.992174] the existing dependency chain (in reverse order) is: [ 60.992174] -> #1 (&(&bclink->lock)->rlock){+.-...}: [ 60.992174] [] lock_acquire+0x9c/0x140 [ 60.992174] [] _raw_spin_lock_bh+0x3f/0x50 [ 60.992174] [] tipc_bclink_lock+0x8e/0xa0 [tipc] [ 60.992174] [] tipc_bclink_add_node+0x97/0xf0 [tipc] [ 60.992174] [] tipc_node_link_up+0xf5/0x110 [tipc] [ 60.992174] [] link_state_event+0x2b3/0x4f0 [tipc] [ 60.992174] [] tipc_link_proto_rcv+0x24c/0x418 [tipc] [ 60.992174] [] tipc_rcv+0x827/0xac0 [tipc] [ 60.992174] [] tipc_l2_rcv_msg+0x73/0xd0 [tipc] [ 60.992174] [] __netif_receive_skb_core+0x746/0x980 [ 60.992174] [] __netif_receive_skb+0x21/0x70 [ 60.992174] [] netif_receive_skb_internal+0x35/0x130 [ 60.992174] [] napi_gro_receive+0x158/0x1d0 [ 60.992174] [] e1000_clean_rx_irq+0x155/0x490 [ 60.992174] [] e1000_clean+0x267/0x990 [ 60.992174] [] net_rx_action+0x150/0x360 [ 60.992174] [] __do_softirq+0x123/0x360 [ 60.992174] [] irq_exit+0x8e/0xb0 [ 60.992174] [] do_IRQ+0x65/0x110 [ 60.992174] [] ret_from_intr+0x0/0x13 [ 60.992174] [] arch_cpu_idle+0xf/0x20 [ 60.992174] [] cpu_startup_entry+0x2f6/0x3f0 [ 60.992174] [] start_secondary+0x13a/0x150 [ 60.992174] -> #0 (&(&n_ptr->lock)->rlock){+.-...}: [ 60.992174] [] __lock_acquire+0x163d/0x1ca0 [ 60.992174] [] lock_acquire+0x9c/0x140 [ 60.992174] [] _raw_spin_lock_bh+0x3f/0x50 [ 60.992174] [] tipc_link_retransmit+0x1aa/0x240 [tipc] [ 60.992174] [] tipc_bclink_rcv+0x611/0x640 [tipc] [ 60.992174] [] tipc_rcv+0x616/0xac0 [tipc] [ 60.992174] [] tipc_l2_rcv_msg+0x73/0xd0 [tipc] [ 60.992174] [] __netif_receive_skb_core+0x746/0x980 [ 60.992174] [] __netif_receive_skb+0x21/0x70 [ 60.992174] [] netif_receive_skb_internal+0x35/0x130 [ 60.992174] [] napi_gro_receive+0x158/0x1d0 [ 60.992174] [] e1000_clean_rx_irq+0x155/0x490 [ 60.992174] [] e1000_clean+0x267/0x990 [ 60.992174] [] net_rx_action+0x150/0x360 [ 60.992174] [] __do_softirq+0x123/0x360 [ 60.992174] [] irq_exit+0x8e/0xb0 [ 60.992174] [] do_IRQ+0x65/0x110 [ 60.992174] [] ret_from_intr+0x0/0x13 [ 60.992174] [] arch_cpu_idle+0xf/0x20 [ 60.992174] [] cpu_startup_entry+0x2f6/0x3f0 [ 60.992174] [] start_secondary+0x13a/0x150 [ 60.992174] [ 60.992174] other info that might help us debug this: [ 60.992174] [ 60.992174] Possible unsafe locking scenario: [ 60.992174] [ 60.992174] CPU0 CPU1 [ 60.992174] ---- ---- [ 60.992174] lock(&(&bclink->lock)->rlock); [ 60.992174] lock(&(&n_ptr->lock)->rlock); [ 60.992174] lock(&(&bclink->lock)->rlock); [ 60.992174] lock(&(&n_ptr->lock)->rlock); [ 60.992174] [ 60.992174] *** DEADLOCK *** [ 60.992174] [ 60.992174] 3 locks held by swapper/3/0: [ 60.992174] #0: (rcu_read_lock){......}, at: [] __netif_receive_skb_core+0x71/0x980 [ 60.992174] #1: (rcu_read_lock){......}, at: [] tipc_l2_rcv_msg+0x5/0xd0 [tipc] [ 60.992174] #2: (&(&bclink->lock)->rlock){+.-...}, at: [] tipc_bclink_lock+0x8e/0xa0 [tipc] [ 60.992174] The correct the sequence of grabbing n_ptr->lock and bclink->lock should be that the former is first held and the latter is then taken, which exactly happened on CPU1. But especially when the retransmission of broadcast link is failed, bclink->lock is first held in tipc_bclink_rcv(), and n_ptr->lock is taken in link_retransmit_failure() called by tipc_link_retransmit() subsequently, which is demonstrated on CPU0. As a result, deadlock occurs. If the order of holding the two locks happening on CPU0 is reversed, the deadlock risk will be relieved. Therefore, the node lock taken in link_retransmit_failure() originally is moved to tipc_bclink_rcv() so that it's obtained before bclink lock. But the precondition of the adjustment of node lock is that responding to bclink reset event must be moved from tipc_bclink_unlock() to tipc_node_unlock(). Reviewed-by: Erik Hugne Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bcast.c | 23 +---------------------- net/tipc/bcast.h | 4 ---- net/tipc/link.c | 5 +---- net/tipc/node.c | 5 ++++- net/tipc/node.h | 3 ++- 5 files changed, 8 insertions(+), 32 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 79355531c3e2..4289dd62f589 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -62,21 +62,8 @@ static void tipc_bclink_lock(struct net *net) static void tipc_bclink_unlock(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_node *node = NULL; - if (likely(!tn->bclink->flags)) { - spin_unlock_bh(&tn->bclink->lock); - return; - } - - if (tn->bclink->flags & TIPC_BCLINK_RESET) { - tn->bclink->flags &= ~TIPC_BCLINK_RESET; - node = tipc_bclink_retransmit_to(net); - } spin_unlock_bh(&tn->bclink->lock); - - if (node) - tipc_link_reset_all(node); } void tipc_bclink_input(struct net *net) @@ -91,13 +78,6 @@ uint tipc_bclink_get_mtu(void) return MAX_PKT_DEFAULT_MCAST; } -void tipc_bclink_set_flags(struct net *net, unsigned int flags) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - - tn->bclink->flags |= flags; -} - static u32 bcbuf_acks(struct sk_buff *buf) { return (u32)(unsigned long)TIPC_SKB_CB(buf)->handle; @@ -156,7 +136,6 @@ static void bclink_update_last_sent(struct tipc_node *node, u32 seqno) seqno : node->bclink.last_sent; } - /** * tipc_bclink_retransmit_to - get most recent node to request retransmission * @@ -476,13 +455,13 @@ void tipc_bclink_rcv(struct net *net, struct sk_buff *buf) goto unlock; if (msg_destnode(msg) == tn->own_addr) { tipc_bclink_acknowledge(node, msg_bcast_ack(msg)); - tipc_node_unlock(node); tipc_bclink_lock(net); bcl->stats.recv_nacks++; tn->bclink->retransmit_to = node; bclink_retransmit_pkt(tn, msg_bcgap_after(msg), msg_bcgap_to(msg)); tipc_bclink_unlock(net); + tipc_node_unlock(node); } else { tipc_node_unlock(node); bclink_peek_nack(net, msg); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 43f397fbac55..4bdc12277d33 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -55,7 +55,6 @@ struct tipc_bcbearer_pair { struct tipc_bearer *secondary; }; -#define TIPC_BCLINK_RESET 1 #define BCBEARER MAX_BEARERS /** @@ -86,7 +85,6 @@ struct tipc_bcbearer { * @lock: spinlock governing access to structure * @link: (non-standard) broadcast link structure * @node: (non-standard) node structure representing b'cast link's peer node - * @flags: represent bclink states * @bcast_nodes: map of broadcast-capable nodes * @retransmit_to: node that most recently requested a retransmit * @@ -96,7 +94,6 @@ struct tipc_bclink { spinlock_t lock; struct tipc_link link; struct tipc_node node; - unsigned int flags; struct sk_buff_head arrvq; struct sk_buff_head inputq; struct tipc_node_map bcast_nodes; @@ -117,7 +114,6 @@ static inline int tipc_nmap_equal(struct tipc_node_map *nm_a, int tipc_bclink_init(struct net *net); void tipc_bclink_stop(struct net *net); -void tipc_bclink_set_flags(struct net *tn, unsigned int flags); void tipc_bclink_add_node(struct net *net, u32 addr); void tipc_bclink_remove_node(struct net *net, u32 addr); struct tipc_node *tipc_bclink_retransmit_to(struct net *tn); diff --git a/net/tipc/link.c b/net/tipc/link.c index 1287161e9424..f5e086c5f724 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -980,7 +980,6 @@ static void link_retransmit_failure(struct tipc_link *l_ptr, (unsigned long) TIPC_SKB_CB(buf)->handle); n_ptr = tipc_bclink_retransmit_to(net); - tipc_node_lock(n_ptr); tipc_addr_string_fill(addr_string, n_ptr->addr); pr_info("Broadcast link info for %s\n", addr_string); @@ -992,9 +991,7 @@ static void link_retransmit_failure(struct tipc_link *l_ptr, n_ptr->bclink.oos_state, n_ptr->bclink.last_sent); - tipc_node_unlock(n_ptr); - - tipc_bclink_set_flags(net, TIPC_BCLINK_RESET); + n_ptr->action_flags |= TIPC_BCAST_RESET; l_ptr->stale_count = 0; } } diff --git a/net/tipc/node.c b/net/tipc/node.c index 26d1de1bf34d..5cc43d34ad0a 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -459,7 +459,7 @@ void tipc_node_unlock(struct tipc_node *node) TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP | TIPC_WAKEUP_BCAST_USERS | TIPC_BCAST_MSG_EVT | - TIPC_NAMED_MSG_EVT); + TIPC_NAMED_MSG_EVT | TIPC_BCAST_RESET); spin_unlock_bh(&node->lock); @@ -488,6 +488,9 @@ void tipc_node_unlock(struct tipc_node *node) if (flags & TIPC_BCAST_MSG_EVT) tipc_bclink_input(net); + + if (flags & TIPC_BCAST_RESET) + tipc_link_reset_all(node); } /* Caller should hold node lock for the passed node */ diff --git a/net/tipc/node.h b/net/tipc/node.h index e89ac04ec2c3..9629ecd2bdd8 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -64,7 +64,8 @@ enum { TIPC_NOTIFY_LINK_UP = (1 << 6), TIPC_NOTIFY_LINK_DOWN = (1 << 7), TIPC_NAMED_MSG_EVT = (1 << 8), - TIPC_BCAST_MSG_EVT = (1 << 9) + TIPC_BCAST_MSG_EVT = (1 << 9), + TIPC_BCAST_RESET = (1 << 10) }; /** -- cgit v1.2.3 From 8a0f6ebe8494c5c6ccfe12264385b64c280e3241 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Thu, 26 Mar 2015 18:10:24 +0800 Subject: tipc: involve reference counter for node structure TIPC node hash node table is protected with rcu lock on read side. tipc_node_find() is used to look for a node object with node address through iterating the hash node table. As the entire process of what tipc_node_find() traverses the table is guarded with rcu read lock, it's safe for us. However, when callers use the node object returned by tipc_node_find(), there is no rcu read lock applied. Therefore, this is absolutely unsafe for callers of tipc_node_find(). Now we introduce a reference counter for node structure. Before tipc_node_find() returns node object to its caller, it first increases the reference counter. Accordingly, after its caller used it up, it decreases the counter again. This can prevent a node being used by one thread from being freed by another thread. Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bcast.c | 5 +-- net/tipc/discover.c | 1 + net/tipc/link.c | 7 +++-- net/tipc/name_distr.c | 2 ++ net/tipc/node.c | 85 ++++++++++++++++++++++++++++++++++++--------------- net/tipc/node.h | 9 ++++-- 6 files changed, 79 insertions(+), 30 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 4289dd62f589..ae558dd7f8ee 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -329,13 +329,12 @@ static void bclink_peek_nack(struct net *net, struct tipc_msg *msg) return; tipc_node_lock(n_ptr); - if (n_ptr->bclink.recv_permitted && (n_ptr->bclink.last_in != n_ptr->bclink.last_sent) && (n_ptr->bclink.last_in == msg_bcgap_after(msg))) n_ptr->bclink.oos_state = 2; - tipc_node_unlock(n_ptr); + tipc_node_put(n_ptr); } /* tipc_bclink_xmit - deliver buffer chain to all nodes in cluster @@ -466,6 +465,7 @@ void tipc_bclink_rcv(struct net *net, struct sk_buff *buf) tipc_node_unlock(node); bclink_peek_nack(net, msg); } + tipc_node_put(node); goto exit; } @@ -570,6 +570,7 @@ receive: unlock: tipc_node_unlock(node); + tipc_node_put(node); exit: kfree_skb(buf); } diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 169f3dd038b9..967e292f53c8 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -260,6 +260,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, } } tipc_node_unlock(node); + tipc_node_put(node); } /** diff --git a/net/tipc/link.c b/net/tipc/link.c index f5e086c5f724..514466efc25c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -854,6 +854,7 @@ int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, if (link) rc = __tipc_link_xmit(net, link, list); tipc_node_unlock(node); + tipc_node_put(node); } if (link) return rc; @@ -1116,8 +1117,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) n_ptr = tipc_node_find(net, msg_prevnode(msg)); if (unlikely(!n_ptr)) goto discard; - tipc_node_lock(n_ptr); + tipc_node_lock(n_ptr); /* Locate unicast link endpoint that should handle message */ l_ptr = n_ptr->links[b_ptr->identity]; if (unlikely(!l_ptr)) @@ -1205,6 +1206,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) skb = NULL; unlock: tipc_node_unlock(n_ptr); + tipc_node_put(n_ptr); discard: if (unlikely(skb)) kfree_skb(skb); @@ -2236,7 +2238,6 @@ int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb) msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); - if (prev_node) { node = tipc_node_find(net, prev_node); if (!node) { @@ -2249,6 +2250,7 @@ int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->prev_seq = 1; goto out; } + tipc_node_put(node); list_for_each_entry_continue_rcu(node, &tn->node_list, list) { @@ -2256,6 +2258,7 @@ int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb) err = __tipc_nl_add_node_links(net, &msg, node, &prev_link); tipc_node_unlock(node); + tipc_node_put(node); if (err) goto out; diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 506aaa565da7..41e7b7e4dda0 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -244,6 +244,7 @@ static void tipc_publ_subscribe(struct net *net, struct publication *publ, tipc_node_lock(node); list_add_tail(&publ->nodesub_list, &node->publ_list); tipc_node_unlock(node); + tipc_node_put(node); } static void tipc_publ_unsubscribe(struct net *net, struct publication *publ, @@ -258,6 +259,7 @@ static void tipc_publ_unsubscribe(struct net *net, struct publication *publ, tipc_node_lock(node); list_del_init(&publ->nodesub_list); tipc_node_unlock(node); + tipc_node_put(node); } /** diff --git a/net/tipc/node.c b/net/tipc/node.c index 5cc43d34ad0a..3e4f04897c03 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -42,6 +42,7 @@ static void node_lost_contact(struct tipc_node *n_ptr); static void node_established_contact(struct tipc_node *n_ptr); +static void tipc_node_delete(struct tipc_node *node); struct tipc_sock_conn { u32 port; @@ -67,6 +68,23 @@ static unsigned int tipc_hashfn(u32 addr) return addr & (NODE_HTABLE_SIZE - 1); } +static void tipc_node_kref_release(struct kref *kref) +{ + struct tipc_node *node = container_of(kref, struct tipc_node, kref); + + tipc_node_delete(node); +} + +void tipc_node_put(struct tipc_node *node) +{ + kref_put(&node->kref, tipc_node_kref_release); +} + +static void tipc_node_get(struct tipc_node *node) +{ + kref_get(&node->kref); +} + /* * tipc_node_find - locate specified node object, if it exists */ @@ -82,6 +100,7 @@ struct tipc_node *tipc_node_find(struct net *net, u32 addr) hlist_for_each_entry_rcu(node, &tn->node_htable[tipc_hashfn(addr)], hash) { if (node->addr == addr) { + tipc_node_get(node); rcu_read_unlock(); return node; } @@ -106,6 +125,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) } n_ptr->addr = addr; n_ptr->net = net; + kref_init(&n_ptr->kref); spin_lock_init(&n_ptr->lock); INIT_HLIST_NODE(&n_ptr->hash); INIT_LIST_HEAD(&n_ptr->list); @@ -120,16 +140,17 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) list_add_tail_rcu(&n_ptr->list, &temp_node->list); n_ptr->action_flags = TIPC_WAIT_PEER_LINKS_DOWN; n_ptr->signature = INVALID_NODE_SIG; + tipc_node_get(n_ptr); exit: spin_unlock_bh(&tn->node_list_lock); return n_ptr; } -static void tipc_node_delete(struct tipc_net *tn, struct tipc_node *n_ptr) +static void tipc_node_delete(struct tipc_node *node) { - list_del_rcu(&n_ptr->list); - hlist_del_rcu(&n_ptr->hash); - kfree_rcu(n_ptr, rcu); + list_del_rcu(&node->list); + hlist_del_rcu(&node->hash); + kfree_rcu(node, rcu); } void tipc_node_stop(struct net *net) @@ -139,7 +160,7 @@ void tipc_node_stop(struct net *net) spin_lock_bh(&tn->node_list_lock); list_for_each_entry_safe(node, t_node, &tn->node_list, list) - tipc_node_delete(tn, node); + tipc_node_put(node); spin_unlock_bh(&tn->node_list_lock); } @@ -147,6 +168,7 @@ int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) { struct tipc_node *node; struct tipc_sock_conn *conn; + int err = 0; if (in_own_node(net, dnode)) return 0; @@ -157,8 +179,10 @@ int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) return -EHOSTUNREACH; } conn = kmalloc(sizeof(*conn), GFP_ATOMIC); - if (!conn) - return -EHOSTUNREACH; + if (!conn) { + err = -EHOSTUNREACH; + goto exit; + } conn->peer_node = dnode; conn->port = port; conn->peer_port = peer_port; @@ -166,7 +190,9 @@ int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) tipc_node_lock(node); list_add_tail(&conn->list, &node->conn_sks); tipc_node_unlock(node); - return 0; +exit: + tipc_node_put(node); + return err; } void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) @@ -189,6 +215,7 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) kfree(conn); } tipc_node_unlock(node); + tipc_node_put(node); } /** @@ -417,19 +444,25 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, char *linkname, size_t len) { struct tipc_link *link; + int err = -EINVAL; struct tipc_node *node = tipc_node_find(net, addr); - if ((bearer_id >= MAX_BEARERS) || !node) - return -EINVAL; + if (!node) + return err; + + if (bearer_id >= MAX_BEARERS) + goto exit; + tipc_node_lock(node); link = node->links[bearer_id]; if (link) { strncpy(linkname, link->name, len); - tipc_node_unlock(node); - return 0; + err = 0; } +exit: tipc_node_unlock(node); - return -EINVAL; + tipc_node_put(node); + return err; } void tipc_node_unlock(struct tipc_node *node) @@ -545,17 +578,21 @@ int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); - - if (last_addr && !tipc_node_find(net, last_addr)) { - rcu_read_unlock(); - /* We never set seq or call nl_dump_check_consistent() this - * means that setting prev_seq here will cause the consistence - * check to fail in the netlink callback handler. Resulting in - * the NLMSG_DONE message having the NLM_F_DUMP_INTR flag set if - * the node state changed while we released the lock. - */ - cb->prev_seq = 1; - return -EPIPE; + if (last_addr) { + node = tipc_node_find(net, last_addr); + if (!node) { + rcu_read_unlock(); + /* We never set seq or call nl_dump_check_consistent() + * this means that setting prev_seq here will cause the + * consistence check to fail in the netlink callback + * handler. Resulting in the NLMSG_DONE message having + * the NLM_F_DUMP_INTR flag set if the node state + * changed while we released the lock. + */ + cb->prev_seq = 1; + return -EPIPE; + } + tipc_node_put(node); } list_for_each_entry_rcu(node, &tn->node_list, list) { diff --git a/net/tipc/node.h b/net/tipc/node.h index 9629ecd2bdd8..02d5c20dc551 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -94,6 +94,7 @@ struct tipc_node_bclink { /** * struct tipc_node - TIPC node structure * @addr: network address of node + * @ref: reference counter to node object * @lock: spinlock governing access to structure * @net: the applicable net namespace * @hash: links to adjacent nodes in unsorted hash chain @@ -115,6 +116,7 @@ struct tipc_node_bclink { */ struct tipc_node { u32 addr; + struct kref kref; spinlock_t lock; struct net *net; struct hlist_node hash; @@ -137,6 +139,7 @@ struct tipc_node { }; struct tipc_node *tipc_node_find(struct net *net, u32 addr); +void tipc_node_put(struct tipc_node *node); struct tipc_node *tipc_node_create(struct net *net, u32 addr); void tipc_node_stop(struct net *net); void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); @@ -171,10 +174,12 @@ static inline uint tipc_node_get_mtu(struct net *net, u32 addr, u32 selector) node = tipc_node_find(net, addr); - if (likely(node)) + if (likely(node)) { mtu = node->act_mtus[selector & 1]; - else + tipc_node_put(node); + } else { mtu = MAX_MSG_SIZE; + } return mtu; } -- cgit v1.2.3 From 2da7142516527a5213588f47ed302e79a5d9527a Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 2 Apr 2015 09:33:00 -0400 Subject: tipc: drop tunneled packet duplicates at reception In commit 8b4ed8634f8b3f9aacfc42b4a872d30c36b9e255 ("tipc: eliminate race condition at dual link establishment") we introduced a parallel link synchronization mechanism that guarentees sequential delivery even for users switching from an old to a newly established link. The new mechanism makes it unnecessary to deliver the tunneled duplicate packets back to the old link, as we are currently doing. It is now sufficient to use the last tunneled packet's inner sequence number as synchronization point between the two parallel links, whereafter it can be dropped. In this commit, we drop the duplicate packets arriving on the new link, after updating the synchronization point at each new arrival. Although it would now have been sufficient for the other endpoint to only tunnel the last packet in its send queue, and not the entire queue, we must still do this to maintain compatibility with older nodes. This commit makes it possible to get rid if some complex interaction between the two parallel links. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 132 ++++++++++++++++++++------------------------------------ 1 file changed, 47 insertions(+), 85 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/link.c b/net/tipc/link.c index 514466efc25c..c697cf69da91 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -105,8 +105,6 @@ static void link_handle_out_of_seq_msg(struct tipc_link *link, struct sk_buff *skb); static void tipc_link_proto_rcv(struct tipc_link *link, struct sk_buff *skb); -static int tipc_link_tunnel_rcv(struct tipc_node *node, - struct sk_buff **skb); static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol); static void link_state_event(struct tipc_link *l_ptr, u32 event); static void link_reset_statistics(struct tipc_link *l_ptr); @@ -115,7 +113,8 @@ static void tipc_link_sync_xmit(struct tipc_link *l); static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); - +static bool tipc_link_failover_rcv(struct tipc_node *node, + struct sk_buff **skb); /* * Simple link routines */ @@ -1274,8 +1273,10 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) if (msg_dup(msg)) { link->flags |= LINK_SYNCHING; link->synch_point = msg_seqno(msg_get_wrapped(msg)); + kfree_skb(skb); + break; } - if (!tipc_link_tunnel_rcv(node, &skb)) + if (!tipc_link_failover_rcv(node, &skb)) break; if (msg_user(buf_msg(skb)) != MSG_BUNDLER) { tipc_data_input(link, skb); @@ -1755,101 +1756,62 @@ tunnel_queue: goto tunnel_queue; } -/* tipc_link_dup_rcv(): Receive a tunnelled DUPLICATE_MSG packet. - * Owner node is locked. - */ -static void tipc_link_dup_rcv(struct tipc_link *link, - struct sk_buff *skb) -{ - struct sk_buff *iskb; - int pos = 0; - - if (!tipc_link_is_up(link)) - return; - - if (!tipc_msg_extract(skb, &iskb, &pos)) { - pr_warn("%sfailed to extract inner dup pkt\n", link_co_err); - return; - } - /* Append buffer to deferred queue, if applicable: */ - link_handle_out_of_seq_msg(link, iskb); -} - /* tipc_link_failover_rcv(): Receive a tunnelled ORIGINAL_MSG packet * Owner node is locked. */ -static struct sk_buff *tipc_link_failover_rcv(struct tipc_link *l_ptr, - struct sk_buff *t_buf) +static bool tipc_link_failover_rcv(struct tipc_node *node, + struct sk_buff **skb) { - struct tipc_msg *t_msg = buf_msg(t_buf); - struct sk_buff *buf = NULL; - struct tipc_msg *msg; + struct tipc_msg *msg = buf_msg(*skb); + struct sk_buff *iskb = NULL; + struct tipc_link *link = NULL; + int bearer_id = msg_bearer_id(msg); int pos = 0; - if (tipc_link_is_up(l_ptr)) - tipc_link_reset(l_ptr); - - /* First failover packet? */ - if (l_ptr->exp_msg_count == START_CHANGEOVER) - l_ptr->exp_msg_count = msg_msgcnt(t_msg); - - /* Should there be an inner packet? */ - if (l_ptr->exp_msg_count) { - l_ptr->exp_msg_count--; - if (!tipc_msg_extract(t_buf, &buf, &pos)) { - pr_warn("%sno inner failover pkt\n", link_co_err); - goto exit; - } - msg = buf_msg(buf); - - if (less(msg_seqno(msg), l_ptr->reset_checkpoint)) { - kfree_skb(buf); - buf = NULL; - goto exit; - } - if (msg_user(msg) == MSG_FRAGMENTER) { - l_ptr->stats.recv_fragments++; - tipc_buf_append(&l_ptr->reasm_buf, &buf); - } + if (msg_type(msg) != ORIGINAL_MSG) { + pr_warn("%sunknown tunnel pkt received\n", link_co_err); + goto exit; } -exit: - if ((!l_ptr->exp_msg_count) && (l_ptr->flags & LINK_STOPPED)) - tipc_link_delete(l_ptr); - return buf; -} - -/* tipc_link_tunnel_rcv(): Receive a tunnelled packet, sent - * via other link as result of a failover (ORIGINAL_MSG) or - * a new active link (DUPLICATE_MSG). Failover packets are - * returned to the active link for delivery upwards. - * Owner node is locked. - */ -static int tipc_link_tunnel_rcv(struct tipc_node *n_ptr, - struct sk_buff **buf) -{ - struct sk_buff *t_buf = *buf; - struct tipc_link *l_ptr; - struct tipc_msg *t_msg = buf_msg(t_buf); - u32 bearer_id = msg_bearer_id(t_msg); + if (bearer_id >= MAX_BEARERS) + goto exit; + link = node->links[bearer_id]; + if (!link) + goto exit; + if (tipc_link_is_up(link)) + tipc_link_reset(link); - *buf = NULL; + /* First failover packet? */ + if (link->exp_msg_count == START_CHANGEOVER) + link->exp_msg_count = msg_msgcnt(msg); - if (bearer_id >= MAX_BEARERS) + /* Should we expect an inner packet? */ + if (!link->exp_msg_count) goto exit; - l_ptr = n_ptr->links[bearer_id]; - if (!l_ptr) + if (!tipc_msg_extract(*skb, &iskb, &pos)) { + pr_warn("%sno inner failover pkt\n", link_co_err); + *skb = NULL; goto exit; + } + link->exp_msg_count--; + *skb = NULL; - if (msg_type(t_msg) == DUPLICATE_MSG) - tipc_link_dup_rcv(l_ptr, t_buf); - else if (msg_type(t_msg) == ORIGINAL_MSG) - *buf = tipc_link_failover_rcv(l_ptr, t_buf); - else - pr_warn("%sunknown tunnel pkt received\n", link_co_err); + /* Was packet already delivered? */ + if (less(buf_seqno(iskb), link->reset_checkpoint)) { + kfree_skb(iskb); + iskb = NULL; + goto exit; + } + if (msg_user(buf_msg(iskb)) == MSG_FRAGMENTER) { + link->stats.recv_fragments++; + tipc_buf_append(&link->reasm_buf, &iskb); + } exit: - kfree_skb(t_buf); - return *buf != NULL; + if (link && (!link->exp_msg_count) && (link->flags & LINK_STOPPED)) + tipc_link_delete(link); + kfree_skb(*skb); + *skb = iskb; + return *skb; } static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol) -- cgit v1.2.3 From dff29b1a88524fe6afe296d6c477c491d1e02af0 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 2 Apr 2015 09:33:01 -0400 Subject: tipc: eliminate delayed link deletion at link failover When a bearer is disabled manually, all its links have to be reset and deleted. However, if there is a remaining, parallel link ready to take over a deleted link's traffic, we currently delay the delete of the removed link until the failover procedure is finished. This is because the remaining link needs to access state from the reset link, such as the last received packet number, and any partially reassembled buffer, in order to perform a successful failover. In this commit, we do instead move the state data over to the new link, so that it can fulfill the procedure autonomously, without accessing any data on the old link. This means that we can now proceed and delete all pertaining links immediately when a bearer is disabled. This saves us from some unnecessary complexity in such situations. We also choose to change the confusing definitions CHANGEOVER_PROTOCOL, ORIGINAL_MSG and DUPLICATE_MSG to the more descriptive TUNNEL_PROTOCOL, FAILOVER_MSG and SYNCH_MSG respectively. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 124 +++++++++++++++++++++++++------------------------------- net/tipc/link.h | 17 ++++---- net/tipc/msg.c | 4 +- net/tipc/msg.h | 10 ++--- net/tipc/node.c | 13 +++--- 5 files changed, 78 insertions(+), 90 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/link.c b/net/tipc/link.c index c697cf69da91..b1e17953eeea 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -89,17 +89,9 @@ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { #define TIMEOUT_EVT 560817u /* link timer expired */ /* - * The following two 'message types' is really just implementation - * data conveniently stored in the message header. - * They must not be considered part of the protocol + * State value stored in 'failover_pkts' */ -#define OPEN_MSG 0 -#define CLOSED_MSG 1 - -/* - * State value stored in 'exp_msg_count' - */ -#define START_CHANGEOVER 100000u +#define FIRST_FAILOVER 0xffffu static void link_handle_out_of_seq_msg(struct tipc_link *link, struct sk_buff *skb); @@ -113,8 +105,7 @@ static void tipc_link_sync_xmit(struct tipc_link *l); static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); -static bool tipc_link_failover_rcv(struct tipc_node *node, - struct sk_buff **skb); +static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb); /* * Simple link routines */ @@ -332,15 +323,19 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, } /** - * link_delete - Conditional deletion of link. - * If timer still running, real delete is done when it expires - * @link: link to be deleted + * tipc_link_delete - Delete a link + * @l: link to be deleted */ -void tipc_link_delete(struct tipc_link *link) +void tipc_link_delete(struct tipc_link *l) { - tipc_link_reset_fragments(link); - tipc_node_detach_link(link->owner, link); - tipc_link_put(link); + tipc_link_reset(l); + if (del_timer(&l->timer)) + tipc_link_put(l); + l->flags |= LINK_STOPPED; + /* Delete link now, or when timer is finished: */ + tipc_link_reset_fragments(l); + tipc_node_detach_link(l->owner, l); + tipc_link_put(l); } void tipc_link_delete_list(struct net *net, unsigned int bearer_id, @@ -349,23 +344,12 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id, struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *link; struct tipc_node *node; - bool del_link; rcu_read_lock(); list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_lock(node); link = node->links[bearer_id]; - if (!link) { - tipc_node_unlock(node); - continue; - } - del_link = !tipc_link_is_up(link) && !link->exp_msg_count; - tipc_link_reset(link); - if (del_timer(&link->timer)) - tipc_link_put(link); - link->flags |= LINK_STOPPED; - /* Delete link now, or when failover is finished: */ - if (shutting_down || !tipc_node_is_up(node) || del_link) + if (link) tipc_link_delete(link); tipc_node_unlock(node); } @@ -472,9 +456,9 @@ void tipc_link_purge_queues(struct tipc_link *l_ptr) void tipc_link_reset(struct tipc_link *l_ptr) { u32 prev_state = l_ptr->state; - u32 checkpoint = l_ptr->next_in_no; int was_active_link = tipc_link_is_active(l_ptr); struct tipc_node *owner = l_ptr->owner; + struct tipc_link *pl = tipc_parallel_link(l_ptr); msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff)); @@ -492,11 +476,15 @@ void tipc_link_reset(struct tipc_link *l_ptr) tipc_node_link_down(l_ptr->owner, l_ptr); tipc_bearer_remove_dest(owner->net, l_ptr->bearer_id, l_ptr->addr); - if (was_active_link && tipc_node_active_links(l_ptr->owner)) { - l_ptr->reset_checkpoint = checkpoint; - l_ptr->exp_msg_count = START_CHANGEOVER; + if (was_active_link && tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) { + l_ptr->flags |= LINK_FAILINGOVER; + l_ptr->failover_checkpt = l_ptr->next_in_no; + pl->failover_pkts = FIRST_FAILOVER; + pl->failover_checkpt = l_ptr->next_in_no; + pl->failover_skb = l_ptr->reasm_buf; + } else { + kfree_skb(l_ptr->reasm_buf); } - /* Clean up all queues, except inputq: */ __skb_queue_purge(&l_ptr->transmq); __skb_queue_purge(&l_ptr->deferdq); @@ -506,6 +494,7 @@ void tipc_link_reset(struct tipc_link *l_ptr) if (!skb_queue_empty(owner->inputq)) owner->action_flags |= TIPC_MSG_EVT; tipc_link_purge_backlog(l_ptr); + l_ptr->reasm_buf = NULL; l_ptr->rcv_unacked = 0; l_ptr->checkpoint = 1; l_ptr->next_out_no = 1; @@ -557,8 +546,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) if (!(l_ptr->flags & LINK_STARTED) && (event != STARTING_EVT)) return; /* Not yet. */ - /* Check whether changeover is going on */ - if (l_ptr->exp_msg_count) { + if (l_ptr->flags & LINK_FAILINGOVER) { if (event == TIMEOUT_EVT) link_set_timer(l_ptr, cont_intv); return; @@ -1242,7 +1230,7 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb) node->action_flags |= TIPC_NAMED_MSG_EVT; return true; case MSG_BUNDLER: - case CHANGEOVER_PROTOCOL: + case TUNNEL_PROTOCOL: case MSG_FRAGMENTER: case BCAST_PROTOCOL: return false; @@ -1269,14 +1257,14 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) return; switch (msg_user(msg)) { - case CHANGEOVER_PROTOCOL: + case TUNNEL_PROTOCOL: if (msg_dup(msg)) { link->flags |= LINK_SYNCHING; link->synch_point = msg_seqno(msg_get_wrapped(msg)); kfree_skb(skb); break; } - if (!tipc_link_failover_rcv(node, &skb)) + if (!tipc_link_failover_rcv(link, &skb)) break; if (msg_user(buf_msg(skb)) != MSG_BUNDLER) { tipc_data_input(link, skb); @@ -1391,8 +1379,8 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, u32 msg_size = sizeof(l_ptr->proto_msg); int r_flag; - /* Don't send protocol message during link changeover */ - if (l_ptr->exp_msg_count) + /* Don't send protocol message during link failover */ + if (l_ptr->flags & LINK_FAILINGOVER) return; /* Abort non-RESET send if communication with node is prohibited */ @@ -1444,7 +1432,7 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, } l_ptr->stats.sent_states++; } else { /* RESET_MSG or ACTIVATE_MSG */ - msg_set_ack(msg, mod(l_ptr->reset_checkpoint - 1)); + msg_set_ack(msg, mod(l_ptr->failover_checkpt - 1)); msg_set_seq_gap(msg, 0); msg_set_next_sent(msg, 1); msg_set_probe(msg, 0); @@ -1486,8 +1474,7 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, u32 msg_tol; struct tipc_msg *msg = buf_msg(buf); - /* Discard protocol message during link changeover */ - if (l_ptr->exp_msg_count) + if (l_ptr->flags & LINK_FAILINGOVER) goto exit; if (l_ptr->net_plane != msg_net_plane(msg)) @@ -1659,8 +1646,8 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) if (!tunnel) return; - tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, CHANGEOVER_PROTOCOL, - ORIGINAL_MSG, INT_H_SIZE, l_ptr->addr); + tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, TUNNEL_PROTOCOL, + FAILOVER_MSG, INT_H_SIZE, l_ptr->addr); skb_queue_splice_tail_init(&l_ptr->backlogq, &l_ptr->transmq); tipc_link_purge_backlog(l_ptr); msgcount = skb_queue_len(&l_ptr->transmq); @@ -1722,8 +1709,8 @@ void tipc_link_dup_queue_xmit(struct tipc_link *link, struct sk_buff_head *queue = &link->transmq; int mcnt; - tipc_msg_init(link_own_addr(link), &tnl_hdr, CHANGEOVER_PROTOCOL, - DUPLICATE_MSG, INT_H_SIZE, link->addr); + tipc_msg_init(link_own_addr(link), &tnl_hdr, TUNNEL_PROTOCOL, + SYNCH_MSG, INT_H_SIZE, link->addr); mcnt = skb_queue_len(&link->transmq) + skb_queue_len(&link->backlogq); msg_set_msgcnt(&tnl_hdr, mcnt); msg_set_bearer_id(&tnl_hdr, link->peer_bearer_id); @@ -1756,36 +1743,37 @@ tunnel_queue: goto tunnel_queue; } -/* tipc_link_failover_rcv(): Receive a tunnelled ORIGINAL_MSG packet +/* tipc_link_failover_rcv(): Receive a tunnelled FAILOVER_MSG packet * Owner node is locked. */ -static bool tipc_link_failover_rcv(struct tipc_node *node, +static bool tipc_link_failover_rcv(struct tipc_link *link, struct sk_buff **skb) { struct tipc_msg *msg = buf_msg(*skb); struct sk_buff *iskb = NULL; - struct tipc_link *link = NULL; + struct tipc_link *pl = NULL; int bearer_id = msg_bearer_id(msg); int pos = 0; - if (msg_type(msg) != ORIGINAL_MSG) { + if (msg_type(msg) != FAILOVER_MSG) { pr_warn("%sunknown tunnel pkt received\n", link_co_err); goto exit; } if (bearer_id >= MAX_BEARERS) goto exit; - link = node->links[bearer_id]; - if (!link) + + if (bearer_id == link->bearer_id) goto exit; - if (tipc_link_is_up(link)) - tipc_link_reset(link); - /* First failover packet? */ - if (link->exp_msg_count == START_CHANGEOVER) - link->exp_msg_count = msg_msgcnt(msg); + pl = link->owner->links[bearer_id]; + if (pl && tipc_link_is_up(pl)) + tipc_link_reset(pl); + + if (link->failover_pkts == FIRST_FAILOVER) + link->failover_pkts = msg_msgcnt(msg); /* Should we expect an inner packet? */ - if (!link->exp_msg_count) + if (!link->failover_pkts) goto exit; if (!tipc_msg_extract(*skb, &iskb, &pos)) { @@ -1793,22 +1781,22 @@ static bool tipc_link_failover_rcv(struct tipc_node *node, *skb = NULL; goto exit; } - link->exp_msg_count--; + link->failover_pkts--; *skb = NULL; - /* Was packet already delivered? */ - if (less(buf_seqno(iskb), link->reset_checkpoint)) { + /* Was this packet already delivered? */ + if (less(buf_seqno(iskb), link->failover_checkpt)) { kfree_skb(iskb); iskb = NULL; goto exit; } if (msg_user(buf_msg(iskb)) == MSG_FRAGMENTER) { link->stats.recv_fragments++; - tipc_buf_append(&link->reasm_buf, &iskb); + tipc_buf_append(&link->failover_skb, &iskb); } exit: - if (link && (!link->exp_msg_count) && (link->flags & LINK_STOPPED)) - tipc_link_delete(link); + if (!link->failover_pkts && pl) + pl->flags &= ~LINK_FAILINGOVER; kfree_skb(*skb); *skb = iskb; return *skb; diff --git a/net/tipc/link.h b/net/tipc/link.h index d2b5663643da..6e28f03c7905 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -58,9 +58,10 @@ /* Link endpoint execution states */ -#define LINK_STARTED 0x0001 -#define LINK_STOPPED 0x0002 -#define LINK_SYNCHING 0x0004 +#define LINK_STARTED 0x0001 +#define LINK_STOPPED 0x0002 +#define LINK_SYNCHING 0x0004 +#define LINK_FAILINGOVER 0x0008 /* Starting value for maximum packet size negotiation on unicast links * (unless bearer MTU is less) @@ -167,11 +168,12 @@ struct tipc_link { struct tipc_msg *pmsg; u32 priority; char net_plane; + u16 synch_point; - /* Changeover */ - u32 exp_msg_count; - u32 reset_checkpoint; - u32 synch_point; + /* Failover */ + u16 failover_pkts; + u16 failover_checkpt; + struct sk_buff *failover_skb; /* Max packet negotiation */ u32 max_pkt; @@ -201,7 +203,6 @@ struct tipc_link { struct sk_buff_head wakeupq; /* Fragmentation/reassembly */ - u32 long_msg_seq_no; struct sk_buff *reasm_buf; /* Statistics */ diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 3bb499c61918..c3e96e815418 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -355,7 +355,7 @@ bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu) start = align(bsz); pad = start - bsz; - if (unlikely(msg_user(msg) == CHANGEOVER_PROTOCOL)) + if (unlikely(msg_user(msg) == TUNNEL_PROTOCOL)) return false; if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) return false; @@ -433,7 +433,7 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode) if (msg_user(msg) == MSG_FRAGMENTER) return false; - if (msg_user(msg) == CHANGEOVER_PROTOCOL) + if (msg_user(msg) == TUNNEL_PROTOCOL) return false; if (msg_user(msg) == BCAST_PROTOCOL) return false; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index d273207ede28..e1d3595e2ee9 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -72,7 +72,7 @@ struct plist; #define MSG_BUNDLER 6 #define LINK_PROTOCOL 7 #define CONN_MANAGER 8 -#define CHANGEOVER_PROTOCOL 10 +#define TUNNEL_PROTOCOL 10 #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 @@ -512,8 +512,8 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) /* * Changeover tunnel message types */ -#define DUPLICATE_MSG 0 -#define ORIGINAL_MSG 1 +#define SYNCH_MSG 0 +#define FAILOVER_MSG 1 /* * Config protocol message types @@ -556,9 +556,9 @@ static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n) static inline bool msg_dup(struct tipc_msg *m) { - if (likely(msg_user(m) != CHANGEOVER_PROTOCOL)) + if (likely(msg_user(m) != TUNNEL_PROTOCOL)) return false; - if (msg_type(m) != DUPLICATE_MSG) + if (msg_type(m) != SYNCH_MSG) return false; return true; } diff --git a/net/tipc/node.c b/net/tipc/node.c index 3e4f04897c03..f3d522c2881a 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -394,18 +394,17 @@ static void node_lost_contact(struct tipc_node *n_ptr) n_ptr->bclink.recv_permitted = false; } - /* Abort link changeover */ + /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { struct tipc_link *l_ptr = n_ptr->links[i]; if (!l_ptr) continue; - l_ptr->reset_checkpoint = l_ptr->next_in_no; - l_ptr->exp_msg_count = 0; + l_ptr->flags &= ~LINK_FAILINGOVER; + l_ptr->failover_checkpt = 0; + l_ptr->failover_pkts = 0; + kfree_skb(l_ptr->failover_skb); + l_ptr->failover_skb = NULL; tipc_link_reset_fragments(l_ptr); - - /* Link marked for deletion after failover? => do it now */ - if (l_ptr->flags & LINK_STOPPED) - tipc_link_delete(l_ptr); } n_ptr->action_flags &= ~TIPC_WAIT_OWN_LINKS_DOWN; -- cgit v1.2.3 From ed193ece2649c194a87a9d8470195760d367c075 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 2 Apr 2015 09:33:02 -0400 Subject: tipc: simplify link mtu negotiation When a link is being established, the two endpoints advertise their respective interface MTU in the transmitted RESET and ACTIVATE messages. If there is any difference, the lower of the two MTUs will be selected for use by both endpoints. However, as a remnant of earlier attempts to introduce TIPC level routing. there also exists an MTU discovery mechanism. If an intermediate node has a lower MTU than the two endpoints, they will discover this through a bisectional approach, and finally adopt this MTU for common use. Since there is no TIPC level routing, and probably never will be, this mechanism doesn't make any sense, and only serves to make the link level protocol unecessarily complex. In this commit, we eliminate the MTU discovery algorithm,and fall back to the simple MTU advertising approach. This change is fully backwards compatible. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 4 +- net/tipc/link.c | 129 ++++++++++++++----------------------------------------- net/tipc/link.h | 12 +++--- net/tipc/node.c | 9 ++-- 4 files changed, 43 insertions(+), 111 deletions(-) (limited to 'net/tipc/link.c') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index ae558dd7f8ee..c5cbdcb1f0b5 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -413,7 +413,7 @@ static void bclink_accept_pkt(struct tipc_node *node, u32 seqno) */ if (((seqno - tn->own_addr) % TIPC_MIN_LINK_WIN) == 0) { tipc_link_proto_xmit(node->active_links[node->addr & 1], - STATE_MSG, 0, 0, 0, 0, 0); + STATE_MSG, 0, 0, 0, 0); tn->bcl->stats.sent_acks++; } } @@ -899,7 +899,7 @@ int tipc_bclink_init(struct net *net) skb_queue_head_init(&bclink->inputq); bcl->owner = &bclink->node; bcl->owner->net = net; - bcl->max_pkt = MAX_PKT_DEFAULT_MCAST; + bcl->mtu = MAX_PKT_DEFAULT_MCAST; tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); bcl->bearer_id = MAX_BEARERS; rcu_assign_pointer(tn->bearer_list[MAX_BEARERS], &bcbearer->bearer); diff --git a/net/tipc/link.c b/net/tipc/link.c index b1e17953eeea..a6b30df6ec02 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -136,34 +136,6 @@ static struct tipc_link *tipc_parallel_link(struct tipc_link *l) return l->owner->active_links[1]; } -static void link_init_max_pkt(struct tipc_link *l_ptr) -{ - struct tipc_node *node = l_ptr->owner; - struct tipc_net *tn = net_generic(node->net, tipc_net_id); - struct tipc_bearer *b_ptr; - u32 max_pkt; - - rcu_read_lock(); - b_ptr = rcu_dereference_rtnl(tn->bearer_list[l_ptr->bearer_id]); - if (!b_ptr) { - rcu_read_unlock(); - return; - } - max_pkt = (b_ptr->mtu & ~3); - rcu_read_unlock(); - - if (max_pkt > MAX_MSG_SIZE) - max_pkt = MAX_MSG_SIZE; - - l_ptr->max_pkt_target = max_pkt; - if (l_ptr->max_pkt_target < MAX_PKT_DEFAULT) - l_ptr->max_pkt = l_ptr->max_pkt_target; - else - l_ptr->max_pkt = MAX_PKT_DEFAULT; - - l_ptr->max_pkt_probes = 0; -} - /* * Simple non-static link routines (i.e. referenced outside this file) */ @@ -304,7 +276,8 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, msg_set_bearer_id(msg, b_ptr->identity); strcpy((char *)msg_data(msg), if_name); l_ptr->net_plane = b_ptr->net_plane; - link_init_max_pkt(l_ptr); + l_ptr->advertised_mtu = b_ptr->mtu; + l_ptr->mtu = l_ptr->advertised_mtu; l_ptr->priority = b_ptr->priority; tipc_link_set_queue_limits(l_ptr, b_ptr->window); l_ptr->next_out_no = 1; @@ -465,8 +438,8 @@ void tipc_link_reset(struct tipc_link *l_ptr) /* Link is down, accept any session */ l_ptr->peer_session = INVALID_SESSION; - /* Prepare for max packet size negotiation */ - link_init_max_pkt(l_ptr); + /* Prepare for renewed mtu size negotiation */ + l_ptr->mtu = l_ptr->advertised_mtu; l_ptr->state = RESET_UNKNOWN; @@ -563,11 +536,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) l_ptr->checkpoint = l_ptr->next_in_no; if (tipc_bclink_acks_missing(l_ptr->owner)) { tipc_link_proto_xmit(l_ptr, STATE_MSG, - 0, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - } else if (l_ptr->max_pkt < l_ptr->max_pkt_target) { - tipc_link_proto_xmit(l_ptr, STATE_MSG, - 1, 0, 0, 0, 0); + 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; } link_set_timer(l_ptr, cont_intv); @@ -575,7 +544,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) } l_ptr->state = WORKING_UNKNOWN; l_ptr->fsm_msg_cnt = 0; - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv / 4); break; @@ -586,7 +555,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) l_ptr->state = RESET_RESET; l_ptr->fsm_msg_cnt = 0; tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0, 0); + 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv); break; @@ -609,7 +578,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) l_ptr->state = RESET_RESET; l_ptr->fsm_msg_cnt = 0; tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0, 0); + 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv); break; @@ -620,13 +589,13 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) l_ptr->checkpoint = l_ptr->next_in_no; if (tipc_bclink_acks_missing(l_ptr->owner)) { tipc_link_proto_xmit(l_ptr, STATE_MSG, - 0, 0, 0, 0, 0); + 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; } link_set_timer(l_ptr, cont_intv); } else if (l_ptr->fsm_msg_cnt < l_ptr->abort_limit) { tipc_link_proto_xmit(l_ptr, STATE_MSG, - 1, 0, 0, 0, 0); + 1, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv / 4); } else { /* Link has failed */ @@ -636,7 +605,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) l_ptr->state = RESET_UNKNOWN; l_ptr->fsm_msg_cnt = 0; tipc_link_proto_xmit(l_ptr, RESET_MSG, - 0, 0, 0, 0, 0); + 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv); } @@ -656,7 +625,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) l_ptr->state = WORKING_WORKING; l_ptr->fsm_msg_cnt = 0; link_activate(l_ptr); - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); l_ptr->fsm_msg_cnt++; if (l_ptr->owner->working_links == 1) tipc_link_sync_xmit(l_ptr); @@ -666,7 +635,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) l_ptr->state = RESET_RESET; l_ptr->fsm_msg_cnt = 0; tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 1, 0, 0, 0, 0); + 1, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv); break; @@ -676,7 +645,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) link_set_timer(l_ptr, cont_intv); break; case TIMEOUT_EVT: - tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv); break; @@ -694,7 +663,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) l_ptr->state = WORKING_WORKING; l_ptr->fsm_msg_cnt = 0; link_activate(l_ptr); - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); l_ptr->fsm_msg_cnt++; if (l_ptr->owner->working_links == 1) tipc_link_sync_xmit(l_ptr); @@ -704,7 +673,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) break; case TIMEOUT_EVT: tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0, 0); + 0, 0, 0, 0); l_ptr->fsm_msg_cnt++; link_set_timer(l_ptr, cont_intv); break; @@ -733,7 +702,7 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct tipc_msg *msg = buf_msg(skb_peek(list)); unsigned int maxwin = link->window; unsigned int imp = msg_importance(msg); - uint mtu = link->max_pkt; + uint mtu = link->mtu; uint ack = mod(link->next_in_no - 1); uint seqno = link->next_out_no; uint bc_last_in = link->owner->bclink.last_in; @@ -1187,7 +1156,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) link_retrieve_defq(l_ptr, &head); if (unlikely(++l_ptr->rcv_unacked >= TIPC_MIN_LINK_WIN)) { l_ptr->stats.sent_acks++; - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); } tipc_link_input(l_ptr, skb); skb = NULL; @@ -1362,7 +1331,7 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, if (tipc_link_defer_pkt(&l_ptr->deferdq, buf)) { l_ptr->stats.deferred_recv++; if ((skb_queue_len(&l_ptr->deferdq) % TIPC_MIN_LINK_WIN) == 1) - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); } else { l_ptr->stats.duplicates++; } @@ -1372,7 +1341,7 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, * Send protocol message to the other endpoint. */ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, - u32 gap, u32 tolerance, u32 priority, u32 ack_mtu) + u32 gap, u32 tolerance, u32 priority) { struct sk_buff *buf = NULL; struct tipc_msg *msg = l_ptr->pmsg; @@ -1410,26 +1379,11 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, l_ptr->stats.sent_nacks++; msg_set_link_tolerance(msg, tolerance); msg_set_linkprio(msg, priority); - msg_set_max_pkt(msg, ack_mtu); + msg_set_max_pkt(msg, l_ptr->mtu); msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); msg_set_probe(msg, probe_msg != 0); - if (probe_msg) { - u32 mtu = l_ptr->max_pkt; - - if ((mtu < l_ptr->max_pkt_target) && - link_working_working(l_ptr) && - l_ptr->fsm_msg_cnt) { - msg_size = (mtu + (l_ptr->max_pkt_target - mtu)/2 + 2) & ~3; - if (l_ptr->max_pkt_probes == 10) { - l_ptr->max_pkt_target = (msg_size - 4); - l_ptr->max_pkt_probes = 0; - msg_size = (mtu + (l_ptr->max_pkt_target - mtu)/2 + 2) & ~3; - } - l_ptr->max_pkt_probes++; - } - + if (probe_msg) l_ptr->stats.sent_probes++; - } l_ptr->stats.sent_states++; } else { /* RESET_MSG or ACTIVATE_MSG */ msg_set_ack(msg, mod(l_ptr->failover_checkpt - 1)); @@ -1438,7 +1392,7 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, msg_set_probe(msg, 0); msg_set_link_tolerance(msg, l_ptr->tolerance); msg_set_linkprio(msg, l_ptr->priority); - msg_set_max_pkt(msg, l_ptr->max_pkt_target); + msg_set_max_pkt(msg, l_ptr->advertised_mtu); } r_flag = (l_ptr->owner->working_links > tipc_link_is_up(l_ptr)); @@ -1469,8 +1423,6 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, struct sk_buff *buf) { u32 rec_gap = 0; - u32 max_pkt_info; - u32 max_pkt_ack; u32 msg_tol; struct tipc_msg *msg = buf_msg(buf); @@ -1513,15 +1465,8 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, if (msg_linkprio(msg) > l_ptr->priority) l_ptr->priority = msg_linkprio(msg); - max_pkt_info = msg_max_pkt(msg); - if (max_pkt_info) { - if (max_pkt_info < l_ptr->max_pkt_target) - l_ptr->max_pkt_target = max_pkt_info; - if (l_ptr->max_pkt > l_ptr->max_pkt_target) - l_ptr->max_pkt = l_ptr->max_pkt_target; - } else { - l_ptr->max_pkt = l_ptr->max_pkt_target; - } + if (l_ptr->mtu > msg_max_pkt(msg)) + l_ptr->mtu = msg_max_pkt(msg); /* Synchronize broadcast link info, if not done previously */ if (!tipc_node_is_up(l_ptr->owner)) { @@ -1566,18 +1511,8 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, mod(l_ptr->next_in_no)); } - max_pkt_ack = msg_max_pkt(msg); - if (max_pkt_ack > l_ptr->max_pkt) { - l_ptr->max_pkt = max_pkt_ack; - l_ptr->max_pkt_probes = 0; - } - - max_pkt_ack = 0; - if (msg_probe(msg)) { + if (msg_probe(msg)) l_ptr->stats.recv_probes++; - if (msg_size(msg) > sizeof(l_ptr->proto_msg)) - max_pkt_ack = msg_size(msg); - } /* Protocol message before retransmits, reduce loss risk */ if (l_ptr->owner->bclink.recv_permitted) @@ -1585,8 +1520,8 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, msg_last_bcast(msg)); if (rec_gap || (msg_probe(msg))) { - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, rec_gap, 0, - 0, max_pkt_ack); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, + rec_gap, 0, 0); } if (msg_seq_gap(msg)) { l_ptr->stats.recv_nacks++; @@ -1816,7 +1751,7 @@ static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol) void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) { - int max_bulk = TIPC_MAX_PUBLICATIONS / (l->max_pkt / ITEM_SIZE); + int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE); l->window = win; l->backlog[TIPC_LOW_IMPORTANCE].limit = win / 2; @@ -1988,14 +1923,14 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info) tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); link_set_supervision_props(link, tol); - tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0, 0); + tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0); } if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); link->priority = prio; - tipc_link_proto_xmit(link, STATE_MSG, 0, 0, 0, prio, 0); + tipc_link_proto_xmit(link, STATE_MSG, 0, 0, 0, prio); } if (props[TIPC_NLA_PROP_WIN]) { u32 win; @@ -2100,7 +2035,7 @@ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, if (nla_put_u32(msg->skb, TIPC_NLA_LINK_DEST, tipc_cluster_mask(tn->own_addr))) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->max_pkt)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->next_in_no)) goto attr_msg_full; diff --git a/net/tipc/link.h b/net/tipc/link.h index 6e28f03c7905..b5b4e3554d4e 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -123,9 +123,8 @@ struct tipc_stats { * @backlog_limit: backlog queue congestion thresholds (indexed by importance) * @exp_msg_count: # of tunnelled messages expected during link changeover * @reset_checkpoint: seq # of last acknowledged message at time of link reset - * @max_pkt: current maximum packet size for this link - * @max_pkt_target: desired maximum packet size for this link - * @max_pkt_probes: # of probes based on current (max_pkt, max_pkt_target) + * @mtu: current maximum packet size for this link + * @advertised_mtu: advertised own mtu when link is being established * @transmitq: queue for sent, non-acked messages * @backlogq: queue for messages waiting to be sent * @next_out_no: next sequence number to use for outbound messages @@ -176,9 +175,8 @@ struct tipc_link { struct sk_buff *failover_skb; /* Max packet negotiation */ - u32 max_pkt; - u32 max_pkt_target; - u32 max_pkt_probes; + u16 mtu; + u16 advertised_mtu; /* Sending */ struct sk_buff_head transmq; @@ -233,7 +231,7 @@ int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dest, int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *list); void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int prob, - u32 gap, u32 tolerance, u32 priority, u32 acked_mtu); + u32 gap, u32 tolerance, u32 priority); void tipc_link_push_packets(struct tipc_link *l_ptr); u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *buf); void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window); diff --git a/net/tipc/node.c b/net/tipc/node.c index f3d522c2881a..22c059ad2999 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -254,8 +254,8 @@ void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr) active[0] = active[1] = l_ptr; exit: /* Leave room for changeover header when returning 'mtu' to users: */ - n_ptr->act_mtus[0] = active[0]->max_pkt - INT_H_SIZE; - n_ptr->act_mtus[1] = active[1]->max_pkt - INT_H_SIZE; + n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE; + n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE; } /** @@ -319,11 +319,10 @@ void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr) /* Leave room for changeover header when returning 'mtu' to users: */ if (active[0]) { - n_ptr->act_mtus[0] = active[0]->max_pkt - INT_H_SIZE; - n_ptr->act_mtus[1] = active[1]->max_pkt - INT_H_SIZE; + n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE; + n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE; return; } - /* Loopback link went down? No fragmentation needed from now on. */ if (n_ptr->addr == tn->own_addr) { n_ptr->act_mtus[0] = MAX_MSG_SIZE; -- cgit v1.2.3