From c0d5f9db1c7d1b8a9e2f217706e8ea233bac2754 Mon Sep 17 00:00:00 2001
From: Jim Schutt
Date: Fri, 16 Sep 2011 08:27:31 -0600
Subject: libceph: initialize ack_stamp to avoid unnecessary connection reset

Commit 4cf9d544631c recorded when an outgoing ceph message was ACKed, in
order to avoid unnecessary connection resets when an OSD is busy.

However, ack_stamp is uninitialized, so there is a window between when
the message is sent and when it is ACKed in which handle_timeout()
interprets the uninitialized value as an expired timeout, and resets the
connection unnecessarily.

Close the window by initializing ack_stamp.

Signed-off-by: Jim Schutt
Signed-off-by: Sage Weil
---
 net/ceph/messenger.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index c340e2e0765b..9918e9eb276e 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2307,6 +2307,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
 	m->front_max = front_len;
 	m->front_is_vmalloc = false;
 	m->more_to_follow = false;
+	m->ack_stamp = 0;
 	m->pool = NULL;
 
 	/* middle */
--
cgit v1.2.1

From 1cad78932a0d139dceff78e68808e160a224d57a Mon Sep 17 00:00:00 2001
From: Noah Watkins
Date: Mon, 12 Sep 2011 14:51:53 -0700
Subject: libceph: fix parse options memory leak

ceph_destroy_options does not free opt->mon_addr that is allocated in
ceph_parse_options.

Signed-off-by: Noah Watkins
Signed-off-by: Sage Weil
---
 net/ceph/ceph_common.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 132963abc266..2883ea01e680 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -232,6 +232,7 @@ void ceph_destroy_options(struct ceph_options *opt)
 		ceph_crypto_key_destroy(opt->key);
 		kfree(opt->key);
 	}
+	kfree(opt->mon_addr);
 	kfree(opt);
 }
 EXPORT_SYMBOL(ceph_destroy_options);
--
cgit v1.2.1
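A note on the ack_stamp fix above: the sketch below restates the race in
isolation. It is illustrative pseudologic under assumed names -- only
ack_stamp comes from the patch; the struct, helper, and timeout check
are invented stand-ins, not the actual libceph handle_timeout() code.

#include <linux/jiffies.h>
#include <linux/types.h>

struct example_msg {
	unsigned long ack_stamp;	/* jiffies of last ACK; 0 = never ACKed */
};

/* Hypothetical staleness check in the style of a keepalive timeout. */
static bool example_looks_stale(const struct example_msg *m,
				unsigned long timeout)
{
	/*
	 * If ack_stamp holds leftover heap garbage instead of 0, this can
	 * report an expired timeout for a message that was queued only a
	 * moment ago -- the spurious connection reset the patch prevents.
	 */
	return time_after(jiffies, m->ack_stamp + timeout);
}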
From 935b639a049053d0ccbcf7422f2f9cd221642f58 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Fri, 16 Sep 2011 11:13:17 -0700
Subject: libceph: fix linger request requeuing

The r_req_lru_item list node moves between several lists, and that cycle
is not directly related to (and does not begin with) __register_request().
Initialize it in the request constructor, not in __register_request().
This fixes later badness (below) when OSDs restart underneath an rbd
mount.

Crashes we've seen due to this include:

 [ 213.974288] kernel BUG at net/ceph/messenger.c:2193!

and

 [ 144.035274] BUG: unable to handle kernel NULL pointer dereference at 0000000000000048
 [ 144.035278] IP: [] con_work+0x1463/0x2ce0 [libceph]

Signed-off-by: Sage Weil
---
 net/ceph/osd_client.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 16836a7df7a6..88ad8a2501b5 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -217,6 +217,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	INIT_LIST_HEAD(&req->r_unsafe_item);
 	INIT_LIST_HEAD(&req->r_linger_item);
 	INIT_LIST_HEAD(&req->r_linger_osd);
+	INIT_LIST_HEAD(&req->r_req_lru_item);
 	req->r_flags = flags;
 
 	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
@@ -816,13 +817,10 @@ static void __register_request(struct ceph_osd_client *osdc,
 {
 	req->r_tid = ++osdc->last_tid;
 	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
-	INIT_LIST_HEAD(&req->r_req_lru_item);
-
 	dout("__register_request %p tid %lld\n", req, req->r_tid);
 	__insert_request(osdc, req);
 	ceph_osdc_get_request(req);
 	osdc->num_requests++;
-
 	if (osdc->num_requests == 1) {
 		dout(" first request, scheduling timeout\n");
 		__schedule_osd_timeout(osdc);
--
cgit v1.2.1

From 8b267b312df9343fea3bd679c509b36214b5a854 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli
Date: Wed, 21 Sep 2011 16:06:42 +0200
Subject: batman-adv: do_bcast has to be true for broadcast packets only

This corrects a critical bug in the GW feature: it caused all unicast
packets destined to a GW to be sent as broadcast. The bug is present
even if the sender's GW feature is configured as OFF. It's an urgent
bug fix and should be committed as soon as possible.

This was a regression introduced by
43676ab590c3f8686fd047d34c3e33803eef71f0

Signed-off-by: Antonio Quartulli
Signed-off-by: Marek Lindner
---
 net/batman-adv/soft-interface.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 3e2f91ffa4e2..05dd35114a27 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -565,7 +565,7 @@ static int interface_tx(struct sk_buff *skb, struct net_device *soft_iface)
 	struct orig_node *orig_node = NULL;
 	int data_len = skb->len, ret;
 	short vid = -1;
-	bool do_bcast = false;
+	bool do_bcast;
 
 	if (atomic_read(&bat_priv->mesh_state) != MESH_ACTIVE)
 		goto dropped;
@@ -598,15 +598,15 @@ static int interface_tx(struct sk_buff *skb, struct net_device *soft_iface)
 		tt_local_add(soft_iface, ethhdr->h_source);
 
 	orig_node = transtable_search(bat_priv, ethhdr->h_dest);
-	if (is_multicast_ether_addr(ethhdr->h_dest) ||
-	    (orig_node && orig_node->gw_flags)) {
+	do_bcast = is_multicast_ether_addr(ethhdr->h_dest);
+	if (do_bcast || (orig_node && orig_node->gw_flags)) {
 		ret = gw_is_target(bat_priv, skb, orig_node);
 
 		if (ret < 0)
 			goto dropped;
 
-		if (ret == 0)
-			do_bcast = true;
+		if (ret)
+			do_bcast = false;
 	}
 
 	/* ethernet packet should be broadcasted */
--
cgit v1.2.1
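A note on the do_bcast fix above: pulled out of interface_tx(), the
corrected decision reduces to the sketch below. The helper name and
parameters are invented for illustration; gw_claims stands in for a
positive gw_is_target() result.

#include <linux/types.h>

/*
 * Broadcast only genuinely multicast frames that the gateway code does
 * not claim. The old code could also take the broadcast path for
 * unicast frames whose destination happened to be a gateway.
 */
static bool example_should_broadcast(bool is_multicast, bool gw_claims)
{
	return is_multicast && !gw_claims;
}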
From 2015de5fe2a47086a3260802275932bfd810884e Mon Sep 17 00:00:00 2001
From: Ben Greear
Date: Tue, 27 Sep 2011 15:16:08 -0400
Subject: ipv6-multicast: Fix memory leak in input path.

We have to free the skb before returning if the fib lookup fails.

Signed-off-by: Ben Greear
Signed-off-by: David S. Miller
---
 net/ipv6/ip6mr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 705c82886281..825d02fa6586 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2052,8 +2052,10 @@ int ip6_mr_input(struct sk_buff *skb)
 	int err;
 
 	err = ip6mr_fib_lookup(net, &fl6, &mrt);
-	if (err < 0)
+	if (err < 0) {
+		kfree_skb(skb);
 		return err;
+	}
 
 	read_lock(&mrt_lock);
 	cache = ip6mr_cache_find(mrt,
--
cgit v1.2.1

From d4cae56219755ccf8acfc8e2c1927009ff29d8c6 Mon Sep 17 00:00:00 2001
From: Madalin Bucur
Date: Mon, 26 Sep 2011 07:04:36 +0000
Subject: net: check return value for dst_alloc

The return value of dst_alloc must be checked before use.

Signed-off-by: Madalin Bucur
Signed-off-by: David S. Miller
---
 net/xfrm/xfrm_policy.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 94fdcc7f1030..552df27dcf53 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1349,14 +1349,16 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 		BUG();
 	}
 	xdst = dst_alloc(dst_ops, NULL, 0, 0, 0);
-	memset(&xdst->u.rt6.rt6i_table, 0, sizeof(*xdst) - sizeof(struct dst_entry));
-	xfrm_policy_put_afinfo(afinfo);
 
-	if (likely(xdst))
+	if (likely(xdst)) {
+		memset(&xdst->u.rt6.rt6i_table, 0,
+		       sizeof(*xdst) - sizeof(struct dst_entry));
 		xdst->flo.ops = &xfrm_bundle_fc_ops;
-	else
+	} else
 		xdst = ERR_PTR(-ENOBUFS);
 
+	xfrm_policy_put_afinfo(afinfo);
+
 	return xdst;
 }
--
cgit v1.2.1

From fbe58186901155c0cb5398dd343337be0c456c04 Mon Sep 17 00:00:00 2001
From: Madalin Bucur
Date: Mon, 26 Sep 2011 07:04:56 +0000
Subject: ipv6: check return value for dst_alloc

The return value of dst_alloc must be checked before use.

Signed-off-by: Madalin Bucur
Signed-off-by: David S. Miller
---
 net/ipv6/route.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1250f9020670..fb545edef6ea 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -244,7 +244,9 @@ static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
 {
 	struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
 
-	memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
+	if (rt != NULL)
+		memset(&rt->rt6i_table, 0,
+		       sizeof(*rt) - sizeof(struct dst_entry));
 
 	return rt;
 }
--
cgit v1.2.1

From 67928c4041606f02725f3c95c4c0404e4532df1b Mon Sep 17 00:00:00 2001
From: Ben Greear
Date: Fri, 23 Sep 2011 13:11:01 +0000
Subject: ipv6-multicast: Fix memory leak in IPv6 multicast.

If reg_vif_xmit cannot find a routing entry, be sure to free the skb
before returning the error.

Signed-off-by: Ben Greear
Signed-off-by: David S. Miller
---
 net/ipv6/ip6mr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 825d02fa6586..def0538e2413 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -696,8 +696,10 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
 	int err;
 
 	err = ip6mr_fib_lookup(net, &fl6, &mrt);
-	if (err < 0)
+	if (err < 0) {
+		kfree_skb(skb);
 		return err;
+	}
 
 	read_lock(&mrt_lock);
 	dev->stats.tx_bytes += skb->len;
--
cgit v1.2.1
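A note on the two dst_alloc patches above: both apply the same rule --
never touch an allocation before the NULL check. A minimal sketch of the
shape of the ipv6 fix, with a stub type and invented names rather than
the tree code:

#include <linux/slab.h>
#include <linux/string.h>

struct example_rt {
	int table;	/* stand-in for the fields cleared after allocation */
};

static struct example_rt *example_dst_alloc(gfp_t gfp)
{
	struct example_rt *rt = kmalloc(sizeof(*rt), gfp);

	if (rt != NULL)			/* check first ...     */
		memset(rt, 0, sizeof(*rt));	/* ... then initialize */

	return rt;			/* caller must still handle NULL */
}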
From 782e182e91e97f529a1edb30fdece9f1bef90ecc Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Wed, 28 Sep 2011 10:08:27 -0700
Subject: libceph: fix pg_temp mapping calculation

We need to apply the modulo pg_num calculation before looking up a pgid
in the pg_temp mapping rbtree. This fixes pg_temp mappings, and fixes
(some) misdirected requests that result in messages like

 [WRN] client4104 10.0.1.219:0/275025290 misdirected client4104.1:129 0.1 to osd0 not [1,0] in e11/11

on the server, and stalls that make the client block without getting a
reply (at least until the pg_temp mapping goes away, but that can take
a long, long time).

Reorder calc_pg_raw() a bit to make more sense.

Signed-off-by: Sage Weil
---
 net/ceph/osdmap.c | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index e97c3588c3ec..eceb8d56703c 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1046,10 +1046,25 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 	struct ceph_pg_mapping *pg;
 	struct ceph_pg_pool_info *pool;
 	int ruleno;
-	unsigned poolid, ps, pps;
+	unsigned poolid, ps, pps, t;
 	int preferred;
 
+	poolid = le32_to_cpu(pgid.pool);
+	ps = le16_to_cpu(pgid.ps);
+	preferred = (s16)le16_to_cpu(pgid.preferred);
+
+	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	if (!pool)
+		return NULL;
+
 	/* pg_temp? */
+	if (preferred >= 0)
+		t = ceph_stable_mod(ps, le32_to_cpu(pool->v.lpg_num),
+				    pool->lpgp_num_mask);
+	else
+		t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num),
+				    pool->pgp_num_mask);
+	pgid.ps = cpu_to_le16(t);
 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
 		*num = pg->len;
@@ -1057,18 +1072,6 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 	}
 
 	/* crush */
-	poolid = le32_to_cpu(pgid.pool);
-	ps = le16_to_cpu(pgid.ps);
-	preferred = (s16)le16_to_cpu(pgid.preferred);
-
-	/* don't forcefeed bad device ids to crush */
-	if (preferred >= osdmap->max_osd ||
-	    preferred >= osdmap->crush->max_devices)
-		preferred = -1;
-
-	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
-	if (!pool)
-		return NULL;
 	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
 				 pool->v.type, pool->v.size);
 	if (ruleno < 0) {
@@ -1078,6 +1081,11 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 		return NULL;
 	}
 
+	/* don't forcefeed bad device ids to crush */
+	if (preferred >= osdmap->max_osd ||
+	    preferred >= osdmap->crush->max_devices)
+		preferred = -1;
+
 	if (preferred >= 0)
 		pps = ceph_stable_mod(ps,
 				      le32_to_cpu(pool->v.lpgp_num),
--
cgit v1.2.1
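A note on the modulo step above: ceph_stable_mod() is the fold the patch
moves in front of the pg_temp lookup. The definition below is copied, to
the best of my knowledge, from the ceph osdmap header of that era; the
worked example is mine.

/* Stable modulo: b is the bucket count, bmask the next power of two
 * minus one. Most inputs keep their low bits when b grows. */
static int example_stable_mod(int x, int b, int bmask)
{
	if ((x & bmask) < b)
		return x & bmask;
	else
		return x & (bmask >> 1);
}

/*
 * Worked example with b = 12 placement groups (bmask = 15): x = 13
 * gives 13 & 15 = 13, which is not < 12, so the result is 13 & 7 = 5.
 * Before the fix, the rbtree was searched for the raw ps value (13),
 * a key that was never inserted -- hence the missed pg_temp mappings.
 */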
From 8adc8b3d780363d5df0dd6ace10336e3d7e331a1 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Wed, 28 Sep 2011 10:11:04 -0700
Subject: libceph: fix pg_temp mapping update

The incremental map updates have a record for each pg_temp mapping that
is to be added/updated (len > 0) or removed (len == 0). The old code was
written as if the updates were a complete enumeration; that was just
wrong. Update the code to remove 0-length entries and drop the rbtree
traversal.

This avoids misdirected (and hung) requests that manifest as server
errors like

 [WRN] client4104 10.0.1.219:0/275025290 misdirected client4104.1:129 0.1 to osd0 not [1,0] in e11/11

Signed-off-by: Sage Weil
---
 net/ceph/osdmap.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 24 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index eceb8d56703c..fd863fe76934 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -339,6 +339,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 	struct ceph_pg_mapping *pg = NULL;
 	int c;
 
+	dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new);
 	while (*p) {
 		parent = *p;
 		pg = rb_entry(parent, struct ceph_pg_mapping, node);
@@ -366,16 +367,33 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
 	while (n) {
 		pg = rb_entry(n, struct ceph_pg_mapping, node);
 		c = pgid_cmp(pgid, pg->pgid);
-		if (c < 0)
+		if (c < 0) {
 			n = n->rb_left;
-		else if (c > 0)
+		} else if (c > 0) {
 			n = n->rb_right;
-		else
+		} else {
+			dout("__lookup_pg_mapping %llx got %p\n",
+			     *(u64 *)&pgid, pg);
 			return pg;
+		}
 	}
 	return NULL;
 }
 
+static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
+{
+	struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);
+
+	if (pg) {
+		dout("__remove_pg_mapping %llx %p\n", *(u64 *)&pgid, pg);
+		rb_erase(&pg->node, root);
+		kfree(pg);
+		return 0;
+	}
+	dout("__remove_pg_mapping %llx dne\n", *(u64 *)&pgid);
+	return -ENOENT;
+}
+
 /*
  * rbtree of pg pool info
  */
@@ -711,7 +729,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	void *start = *p;
 	int err = -EINVAL;
 	u16 version;
-	struct rb_node *rbp;
 
 	ceph_decode_16_safe(p, end, version, bad);
 	if (version > CEPH_OSDMAP_INC_VERSION) {
@@ -861,7 +878,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	}
 
 	/* new_pg_temp */
-	rbp = rb_first(&map->pg_temp);
 	ceph_decode_32_safe(p, end, len, bad);
 	while (len--) {
 		struct ceph_pg_mapping *pg;
@@ -872,18 +888,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		ceph_decode_copy(p, &pgid, sizeof(pgid));
 		pglen = ceph_decode_32(p);
 
-		/* remove any? */
-		while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
-						node)->pgid, pgid) <= 0) {
-			struct ceph_pg_mapping *cur =
-				rb_entry(rbp, struct ceph_pg_mapping, node);
-
-			rbp = rb_next(rbp);
-			dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-			rb_erase(&cur->node, &map->pg_temp);
-			kfree(cur);
-		}
-
 		if (pglen) {
 			/* insert */
 			ceph_decode_need(p, end, pglen*sizeof(u32), bad);
@@ -903,17 +907,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			}
 			dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
 			     pglen);
+		} else {
+			/* remove */
+			__remove_pg_mapping(&map->pg_temp, pgid);
 		}
 	}
-	while (rbp) {
-		struct ceph_pg_mapping *cur =
-			rb_entry(rbp, struct ceph_pg_mapping, node);
-
-		rbp = rb_next(rbp);
-		dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-		rb_erase(&cur->node, &map->pg_temp);
-		kfree(cur);
-	}
 
 	/* ignore the rest */
 	*p = end;
--
cgit v1.2.1
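A note on the decode loop above: the contract the fix restores can be
restated compactly. All names below are invented stand-ins for the
decode and rbtree calls in the patch; this sketches the semantics only.

/* Each new_pg_temp record is self-contained: a non-empty OSD list adds
 * or replaces the mapping, an empty one removes it. Mappings not
 * mentioned in the incremental are left alone. */
struct example_record {
	unsigned long long pgid;
	unsigned int nr_osds;		/* 0 means "remove this mapping" */
};

static void example_apply(const struct example_record *rec,
			  void (*upsert)(unsigned long long),
			  void (*remove)(unsigned long long))
{
	if (rec->nr_osds)
		upsert(rec->pgid);	/* add or update */
	else
		remove(rec->pgid);	/* the case the old range-prune mishandled */
}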
From aabdcb0b553b9c9547b1a506b34d55a764745870 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp
Date: Fri, 23 Sep 2011 08:23:47 +0000
Subject: can bcm: fix tx_setup off-by-one errors

This patch fixes two off-by-one errors that canceled each other out.

Checking for the same condition two times in bcm_tx_timeout_tsklet()
reduced the count of frames to be sent by one. This did not show up the
first time tx_setup was invoked, as an additional frame is sent due to
TX_ANNOUNCE. Invoking a second tx_setup on the same item led to a
reduced (by one) number of sent frames.

Reported-by: Andre Naujoks
Signed-off-by: Oliver Hartkopp
Signed-off-by: David S. Miller
---
 net/can/bcm.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/can/bcm.c b/net/can/bcm.c
index d6c8ae5b2e6a..c9cdb8d78e70 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -365,9 +365,6 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
 
 			bcm_send_to_user(op, &msg_head, NULL, 0);
 		}
-	}
-
-	if (op->kt_ival1.tv64 && (op->count > 0)) {
 
 		/* send (next) frame */
 		bcm_can_tx(op);
@@ -970,8 +967,9 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 		/* spec: send can_frame when starting timer */
 		op->flags |= TX_ANNOUNCE;
 
-		if (op->kt_ival1.tv64 && (op->count > 0)) {
-			/* op->count-- is done in bcm_tx_timeout_handler */
+		/* only start timer when having more frames than sent below */
+		if (op->kt_ival1.tv64 && (op->count > 1)) {
+			/* op->count-- is done in bcm_tx_timeout_tsklet */
 			hrtimer_start(&op->timer, op->kt_ival1,
 				      HRTIMER_MODE_REL);
 		} else
@@ -979,8 +977,11 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 				      HRTIMER_MODE_REL);
 	}
 
-	if (op->flags & TX_ANNOUNCE)
+	if (op->flags & TX_ANNOUNCE) {
 		bcm_can_tx(op);
+		if (op->kt_ival1.tv64 && (op->count > 0))
+			op->count--;
+	}
 
 	return msg_head->nframes * CFSIZ + MHSIZ;
 }
--
cgit v1.2.1

From 676a1184e8afd4fed7948232df1ff91517400859 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Sun, 25 Sep 2011 02:21:30 +0000
Subject: ipv6: nullify ipv6_ac_list and ipv6_fl_list when creating new socket

ipv6_ac_list and ipv6_fl_list from the listening socket are
inadvertently shared with the new socket created for a connection.

Signed-off-by: Zheng Yan
Signed-off-by: David S. Miller
---
 net/ipv6/tcp_ipv6.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3c9fa618b69d..79cc6469508d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1383,6 +1383,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
 #endif
 
+		newnp->ipv6_ac_list = NULL;
+		newnp->ipv6_fl_list = NULL;
 		newnp->pktoptions = NULL;
 		newnp->opt = NULL;
 		newnp->mcast_oif = inet6_iif(skb);
@@ -1447,6 +1449,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	   First: no IPv4 options.
 	 */
 	newinet->inet_opt = NULL;
+	newnp->ipv6_ac_list = NULL;
 	newnp->ipv6_fl_list = NULL;
 
 	/* Clone RX bits */
--
cgit v1.2.1
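A note on the two NULL assignments above: the accept path creates the
child socket as a wholesale copy of the listener, so pointer-valued
per-socket state must be severed explicitly. A sketch with invented
names, not the tcp_ipv6.c code:

struct example_np {
	void *ipv6_ac_list;	/* anycast memberships; must not be shared */
	void *ipv6_fl_list;	/* flow labels; must not be shared */
};

static void example_clone(struct example_np *child,
			  const struct example_np *listener)
{
	*child = *listener;	/* the bulk copy the accept path performs */

	/*
	 * Without this scrub, parent and child would manipulate and
	 * eventually free the same lists -- the sharing bug fixed above.
	 */
	child->ipv6_ac_list = NULL;
	child->ipv6_fl_list = NULL;
}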
From 85a64889492b45f931ddac87ec09d84aa7347ee1 Mon Sep 17 00:00:00 2001
From: Jonathan Lallinger
Date: Thu, 29 Sep 2011 07:58:41 +0000
Subject: RDSRDMA: Fix cleanup of rds_iw_mr_pool

In the rds_iw_mr_pool struct the free_pinned field keeps track of memory
pinned by free MRs. While this field is incremented properly upon
allocation, it is never decremented upon unmapping. This would cause the
rds_rdma module to crash the kernel upon unloading, by triggering the
BUG_ON in the rds_iw_destroy_mr_pool function.

This change keeps track of the MRs that become unpinned, so that
free_pinned can be decremented appropriately.

Signed-off-by: Jonathan Lallinger
Signed-off-by: Steve Wise
Signed-off-by: David S. Miller
---
 net/rds/iw_rdma.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index 8b77edbab272..4e1de171866c 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -84,7 +84,8 @@ static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
 static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
 static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 			struct list_head *unmap_list,
-			struct list_head *kill_list);
+			struct list_head *kill_list,
+			int *unpinned);
 static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
 
 static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
@@ -499,7 +500,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
 	LIST_HEAD(unmap_list);
 	LIST_HEAD(kill_list);
 	unsigned long flags;
-	unsigned int nfreed = 0, ncleaned = 0, free_goal;
+	unsigned int nfreed = 0, ncleaned = 0, unpinned = 0, free_goal;
 	int ret = 0;
 
 	rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
@@ -524,7 +525,8 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
 	 * will be destroyed by the unmap function.
 	 */
 	if (!list_empty(&unmap_list)) {
-		ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
+		ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
+						     &kill_list, &unpinned);
 		/* If we've been asked to destroy all MRs, move those
 		 * that were simply cleaned to the kill list */
 		if (free_all)
@@ -548,6 +550,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
 		spin_unlock_irqrestore(&pool->list_lock, flags);
 	}
 
+	atomic_sub(unpinned, &pool->free_pinned);
 	atomic_sub(ncleaned, &pool->dirty_count);
 	atomic_sub(nfreed, &pool->item_count);
 
@@ -828,7 +831,8 @@ static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
 
 static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 				struct list_head *unmap_list,
-				struct list_head *kill_list)
+				struct list_head *kill_list,
+				int *unpinned)
 {
 	struct rds_iw_mapping *mapping, *next;
 	unsigned int ncleaned = 0;
@@ -855,6 +859,7 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 
 	spin_lock_irqsave(&pool->list_lock, flags);
 	list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
+		*unpinned += mapping->m_sg.len;
 		list_move(&mapping->m_list, &laundered);
 		ncleaned++;
 	}
--
cgit v1.2.1
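A note on the failure mode above: the pool destructor asserts that
nothing is still accounted as pinned, which is how a bookkeeping leak
became a crash on module unload. A hedged sketch of the invariant, with
simplified, invented names:

#include <linux/atomic.h>
#include <linux/bug.h>

struct example_pool {
	atomic_t free_pinned;	/* pages owned by free but still-mapped MRs */
};

static void example_destroy_pool(struct example_pool *pool)
{
	/*
	 * Mirrors the BUG_ON in rds_iw_destroy_mr_pool(): without the
	 * atomic_sub() of the unpinned total, this counter never returns
	 * to zero and unloading rds_rdma dies here.
	 */
	BUG_ON(atomic_read(&pool->free_pinned));
}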
From 12d0d0d3a7349daa95dbfd5d7df8146255bc7c67 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp
Date: Thu, 29 Sep 2011 15:33:47 -0400
Subject: can bcm: fix incomplete tx_setup fix

The commit aabdcb0b553b9c9547b1a506b34d55a764745870 ("can bcm: fix
tx_setup off-by-one errors") fixed only a part of the original problem
reported by Andre Naujoks. It turned out that the original code needed
to be re-ordered to reduce complexity and to finally fix the reported
frame counting issues.

Signed-off-by: Oliver Hartkopp
Signed-off-by: David S. Miller
---
 net/can/bcm.c | 48 +++++++++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/can/bcm.c b/net/can/bcm.c
index c9cdb8d78e70..c84963d2dee6 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -344,6 +344,18 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
 	}
 }
 
+static void bcm_tx_start_timer(struct bcm_op *op)
+{
+	if (op->kt_ival1.tv64 && op->count)
+		hrtimer_start(&op->timer,
+			      ktime_add(ktime_get(), op->kt_ival1),
+			      HRTIMER_MODE_ABS);
+	else if (op->kt_ival2.tv64)
+		hrtimer_start(&op->timer,
+			      ktime_add(ktime_get(), op->kt_ival2),
+			      HRTIMER_MODE_ABS);
+}
+
 static void bcm_tx_timeout_tsklet(unsigned long data)
 {
 	struct bcm_op *op = (struct bcm_op *)data;
@@ -365,23 +377,12 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
 
 			bcm_send_to_user(op, &msg_head, NULL, 0);
 		}
-
-		/* send (next) frame */
 		bcm_can_tx(op);
-		hrtimer_start(&op->timer,
-			      ktime_add(ktime_get(), op->kt_ival1),
-			      HRTIMER_MODE_ABS);
 
-	} else {
-		if (op->kt_ival2.tv64) {
+	} else if (op->kt_ival2.tv64)
+		bcm_can_tx(op);
 
-			/* send (next) frame */
-			bcm_can_tx(op);
-			hrtimer_start(&op->timer,
-				      ktime_add(ktime_get(), op->kt_ival2),
-				      HRTIMER_MODE_ABS);
-		}
-	}
+	bcm_tx_start_timer(op);
 }
 
 /*
@@ -961,28 +962,21 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 			hrtimer_cancel(&op->timer);
 	}
 
-	if ((op->flags & STARTTIMER) &&
-	    ((op->kt_ival1.tv64 && op->count) || op->kt_ival2.tv64)) {
-
+	if (op->flags & STARTTIMER) {
+		hrtimer_cancel(&op->timer);
 		/* spec: send can_frame when starting timer */
 		op->flags |= TX_ANNOUNCE;
-
-		/* only start timer when having more frames than sent below */
-		if (op->kt_ival1.tv64 && (op->count > 1)) {
-			/* op->count-- is done in bcm_tx_timeout_tsklet */
-			hrtimer_start(&op->timer, op->kt_ival1,
-				      HRTIMER_MODE_REL);
-		} else
-			hrtimer_start(&op->timer, op->kt_ival2,
-				      HRTIMER_MODE_REL);
 	}
 
 	if (op->flags & TX_ANNOUNCE) {
 		bcm_can_tx(op);
-		if (op->kt_ival1.tv64 && (op->count > 0))
+		if (op->count)
 			op->count--;
 	}
 
+	if (op->flags & STARTTIMER)
+		bcm_tx_start_timer(op);
+
 	return msg_head->nframes * CFSIZ + MHSIZ;
 }
--
cgit v1.2.1
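A note on the tx_setup semantics the two can-bcm patches converge on,
seen from userspace: with count = 3 and both intervals set, exactly
three frames go out at ival1 spacing (the first immediately), then the
cycle continues at ival2. A hedged sketch -- the field layout follows
linux/can/bcm.h as I recall it from that era, and error handling is
omitted:

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/can.h>
#include <linux/can/bcm.h>

struct bcm_tx_msg {
	struct bcm_msg_head head;
	struct can_frame frame;
};

int main(void)
{
	struct sockaddr_can addr = { .can_family = AF_CAN };
	struct bcm_tx_msg msg;
	int s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM);

	/* in practice, addr.can_ifindex must name a real CAN interface */
	connect(s, (struct sockaddr *)&addr, sizeof(addr));

	memset(&msg, 0, sizeof(msg));
	msg.head.opcode       = TX_SETUP;
	msg.head.flags        = SETTIMER | STARTTIMER;
	msg.head.count        = 3;	/* three frames at ival1 spacing ... */
	msg.head.ival1.tv_sec = 1;
	msg.head.ival2.tv_sec = 5;	/* ... then repeat every ival2 */
	msg.head.can_id       = 0x42;
	msg.head.nframes      = 1;
	msg.frame.can_id      = 0x42;
	msg.frame.can_dlc     = 1;
	msg.frame.data[0]     = 0xab;

	/* the off-by-one fixes make the on-wire count match "count" */
	write(s, &msg, sizeof(msg));
	return 0;
}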
From 7091fbd82cd5686444ffe9935ed6a8190101fe9d Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Fri, 30 Sep 2011 10:38:28 +0000
Subject: make PACKET_STATISTICS getsockopt report consistently between ring and non-ring

This is a minor change.

Up until kernel 2.6.32, getsockopt(fd, SOL_PACKET, PACKET_STATISTICS,
...) would return total and dropped packets since its last invocation.
The introduction of socket queue overflow reporting [1] changed the
drop rate calculation in the normal packet socket path, but not when
using a packet ring. As a result, the getsockopt now returns different
statistics depending on the reception method used. With a ring, it
still returns the count since the last call, as counts are incremented
in tpacket_rcv and reset in getsockopt. Without a ring, it returns 0 if
no drops occurred since the last getsockopt and the total drops over
the lifespan of the socket otherwise.

The culprit is this line in packet_rcv, executed on a drop:

drop_n_acct:
	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

As it shows, the new drop number is taken from the socket drop counter,
which is not reset at getsockopt.

I put together a small example that demonstrates the issue [2]. It runs
for 10 seconds and overflows the queue/ring on every odd second. The
reported drop rates are:

  ring:     16, 0, 16, 0, 16, ...
  non-ring: 0, 15, 0, 30, 0, 46, 0, 60, 0, 74.

Note how the even non-ring counts monotonically increase. Because the
getsockopt adds tp_drops to tp_packets, total counts are similarly
reported cumulatively.

Long story short, reinstating the original code, as the patch below
does, fixes the issue at the cost of additional per-packet cycles.
Another solution that does not introduce per-packet overhead is to keep
the current data path, record the value of sk_drops at getsockopt() at
call N in a new field in struct packet_sock, and subtract that when
reporting at call N+1. I'll be happy to code that instead; it's just
more messy.

[1] http://patchwork.ozlabs.org/patch/35665/
[2] http://kernel.googlecode.com/files/test-packetsock-getstatistics.c

Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/packet/af_packet.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c698cec0a445..fabb4fafa281 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -961,7 +961,10 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 	return 0;
 
 drop_n_acct:
-	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
+	spin_lock(&sk->sk_receive_queue.lock);
+	po->stats.tp_drops++;
+	atomic_inc(&sk->sk_drops);
+	spin_unlock(&sk->sk_receive_queue.lock);
 
 drop_n_restore:
 	if (skb_head != skb->data && skb_shared(skb)) {
--
cgit v1.2.1
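A note on the userspace view of the statistics this patch makes
consistent again. A minimal sketch with no error handling; after the
fix, both fields are deltas since the previous call, and tp_packets
includes tp_drops, as the message above notes:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void print_deltas(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	/* Reads and resets both counters under the receive-queue lock. */
	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("%u packets (%u dropped) since the last call\n",
		       st.tp_packets, st.tp_drops);
}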
From 260fcbeb1ae9e768a44c9925338fbacb0d7e5ba9 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Thu, 29 Sep 2011 17:10:10 +0000
Subject: tcp: properly handle md5sig_pool references

tcp_v4_clear_md5_list() assumes that multiple tcp md5sig peers only
hold one reference to md5sig_pool, but tcp_v4_md5_do_add() increases
the use count of md5sig_pool for each peer. This patch makes
tcp_v4_md5_do_add() increase the use count only for the first tcp
md5sig peer.

Signed-off-by: Zheng Yan
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_ipv4.c | 11 +++++++----
 net/ipv6/tcp_ipv6.c |  8 +++++---
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c34f01513945..7963e03f1068 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -927,18 +927,21 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 			}
 			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 		}
-		if (tcp_alloc_md5sig_pool(sk) == NULL) {
+
+		md5sig = tp->md5sig_info;
+		if (md5sig->entries4 == 0 &&
+		    tcp_alloc_md5sig_pool(sk) == NULL) {
 			kfree(newkey);
 			return -ENOMEM;
 		}
-		md5sig = tp->md5sig_info;
 
 		if (md5sig->alloced4 == md5sig->entries4) {
 			keys = kmalloc((sizeof(*keys) *
 					(md5sig->entries4 + 1)), GFP_ATOMIC);
 			if (!keys) {
 				kfree(newkey);
-				tcp_free_md5sig_pool();
+				if (md5sig->entries4 == 0)
+					tcp_free_md5sig_pool();
 				return -ENOMEM;
 			}
 
@@ -982,6 +985,7 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 				kfree(tp->md5sig_info->keys4);
 				tp->md5sig_info->keys4 = NULL;
 				tp->md5sig_info->alloced4 = 0;
+				tcp_free_md5sig_pool();
 			} else if (tp->md5sig_info->entries4 != i) {
 				/* Need to do some manipulation */
 				memmove(&tp->md5sig_info->keys4[i],
@@ -989,7 +993,6 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 					(tp->md5sig_info->entries4 - i) *
 					sizeof(struct tcp4_md5sig_key));
 			}
-			tcp_free_md5sig_pool();
 			return 0;
 		}
 	}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 79cc6469508d..7b8fc5794352 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -591,7 +591,8 @@ static int tcp_v6_md5_do_add(struct sock *sk, const struct in6_addr *peer,
 			}
 			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 		}
-		if (tcp_alloc_md5sig_pool(sk) == NULL) {
+		if (tp->md5sig_info->entries6 == 0 &&
+		    tcp_alloc_md5sig_pool(sk) == NULL) {
 			kfree(newkey);
 			return -ENOMEM;
 		}
@@ -600,8 +601,9 @@ static int tcp_v6_md5_do_add(struct sock *sk, const struct in6_addr *peer,
 				      (tp->md5sig_info->entries6 + 1)),
 				      GFP_ATOMIC);
 			if (!keys) {
-				tcp_free_md5sig_pool();
 				kfree(newkey);
+				if (tp->md5sig_info->entries6 == 0)
+					tcp_free_md5sig_pool();
 				return -ENOMEM;
 			}
 
@@ -647,6 +649,7 @@ static int tcp_v6_md5_do_del(struct sock *sk, const struct in6_addr *peer)
 				kfree(tp->md5sig_info->keys6);
 				tp->md5sig_info->keys6 = NULL;
 				tp->md5sig_info->alloced6 = 0;
+				tcp_free_md5sig_pool();
 			} else {
 				/* shrink the database */
 				if (tp->md5sig_info->entries6 != i)
@@ -655,7 +658,6 @@ static int tcp_v6_md5_do_del(struct sock *sk, const struct in6_addr *peer)
 					(tp->md5sig_info->entries6 - i)
 					* sizeof (tp->md5sig_info->keys6[0]));
 			}
-			tcp_free_md5sig_pool();
 			return 0;
 		}
 	}
--
cgit v1.2.1
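A note for context on how the md5sig pool is reached from userspace: the
first key added on a socket is what triggers tcp_alloc_md5sig_pool(),
and deleting the last one is what may now free it. A hedged sketch of
adding a TCP-MD5 (RFC 2385) key, error handling omitted:

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>

static int example_add_md5_key(int fd, const struct sockaddr_in *peer)
{
	struct tcp_md5sig md5;

	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_keylen = 6;
	memcpy(md5.tcpm_key, "secret", 6);

	/* The first key on the socket allocates the shared md5sig pool. */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}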
From 1e5289e121372a3494402b1b131b41bfe1cf9b7f Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Sun, 2 Oct 2011 04:21:50 +0000
Subject: tcp: properly update lost_cnt_hint during shifting

lost_skb_hint is used by tcp_mark_head_lost() to mark the first
unhandled skb. lost_cnt_hint is the number of packets or sacked packets
before the lost_skb_hint.

When shifting a skb that is before the lost_skb_hint: if tcp_is_fack()
is true, the skb has already been counted in the lost_cnt_hint; if
tcp_is_fack() is false, tcp_sacktag_one() will increase the
lost_cnt_hint. So tcp_shifted_skb() does not need to adjust the
lost_cnt_hint by itself.

When shifting a skb that is equal to lost_skb_hint, the shifted packets
will not be counted by tcp_mark_head_lost(). So tcp_shifted_skb() should
adjust the lost_cnt_hint even if tcp_is_fack(tp) is true.

Signed-off-by: Zheng Yan
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 21fab3edb92c..d73aab3fbfc0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1389,9 +1389,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 
 	BUG_ON(!pcount);
 
-	/* Tweak before seqno plays */
-	if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint &&
-	    !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq))
+	if (skb == tp->lost_skb_hint)
 		tp->lost_cnt_hint += pcount;
 
 	TCP_SKB_CB(prev)->end_seq += shifted;
--
cgit v1.2.1

From 3458e21c0d384ca04b27a2ea24d9314c1b57530f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 5 Oct 2011 03:24:43 +0000
Subject: netfilter: Use proper rwlock init function

Replace the open-coded initialization with the init function.

Signed-off-by: Thomas Gleixner
Acked-by: Hans Schillstrom
Signed-off-by: David S. Miller
---
 net/netfilter/ipvs/ip_vs_ctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 2b771dc708a3..5290ac353a5e 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3679,7 +3679,7 @@ int __net_init ip_vs_control_net_init(struct net *net)
 	int idx;
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
+	rwlock_init(&ipvs->rs_lock);
 
 	/* Initialize rs_table */
 	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
--
cgit v1.2.1

From b64b73d7d0c480f75684519c6134e79d50c1b341 Mon Sep 17 00:00:00 2001
From: stephen hemminger
Date: Mon, 3 Oct 2011 18:14:45 +0000
Subject: bridge: leave carrier on for empty bridge

This resolves a regression seen by some users of bridging. Some users
use the bridge like a dummy device. They expect to be able to put an
IPv6 address on the device with no ports attached. Although there are
better ways of doing this, there is no reason not to allow it.

Note: the bridge will still reflect the state of its ports if any are
added.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/bridge/br_device.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 32b8f9f7f79e..ff3ed6086ce1 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -91,7 +91,6 @@ static int br_dev_open(struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
 
-	netif_carrier_off(dev);
 	netdev_update_features(dev);
 	netif_start_queue(dev);
 	br_stp_enable_bridge(br);
@@ -108,8 +107,6 @@ static int br_dev_stop(struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
 
-	netif_carrier_off(dev);
-
 	br_stp_disable_bridge(br);
 	br_multicast_stop(br);
--
cgit v1.2.1
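A closing note on the rwlock patch above: the difference between the two
initializations only shows up on debugging kernels. A sketch with
invented names, not the ipvs code:

#include <linux/spinlock.h>

struct example_ns {
	rwlock_t lock;
};

static void example_init(struct example_ns *ns)
{
	/*
	 * rwlock_init() performs the full dynamic setup; with lockdep
	 * enabled it registers a lock class for this object. Assigning
	 * __RW_LOCK_UNLOCKED(...) merely copies an initializer meant for
	 * static definitions and skips that bookkeeping.
	 */
	rwlock_init(&ns->lock);
}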