From 3181faa85bda3dc3f5e630a1846526c9caaa38e3 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Jun 2011 09:03:47 +0200 Subject: cfq-iosched: fix a rcu warning I got a rcu warnning at boot. the ioc->ioc_data is rcu_deferenced, but doesn't hold rcu_read_lock. Signed-off-by: Shaohua Li Cc: stable@kernel.org # after ab4bd22d Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f3799432676d..fd566c1e2617 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2773,11 +2773,14 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, smp_wmb(); cic->key = cfqd_dead_key(cfqd); + rcu_read_lock(); if (rcu_dereference(ioc->ioc_data) == cic) { + rcu_read_unlock(); spin_lock(&ioc->lock); rcu_assign_pointer(ioc->ioc_data, NULL); spin_unlock(&ioc->lock); - } + } else + rcu_read_unlock(); if (cic->cfqq[BLK_RW_ASYNC]) { cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); -- cgit v1.2.1 From 726e99ab88db059fe1422e15376ae404f8c66eb4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Jun 2011 09:03:48 +0200 Subject: cfq-iosched: make code consistent ioc->ioc_data is rcu protectd, so uses correct API to access it. This doesn't change any behavior, but just make code consistent. Signed-off-by: Shaohua Li Cc: stable@kernel.org # after ab4bd22d Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fd566c1e2617..ae21919f15e1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3087,7 +3087,8 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, spin_lock_irqsave(&ioc->lock, flags); - BUG_ON(ioc->ioc_data == cic); + BUG_ON(rcu_dereference_check(ioc->ioc_data, + lockdep_is_held(&ioc->lock)) == cic); radix_tree_delete(&ioc->radix_root, cfqd->cic_index); hlist_del_rcu(&cic->cic_list); -- cgit v1.2.1 From 0cfdd247d1779d5ffc8f685b172a526ecdc6773f Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 25 May 2011 11:14:35 +0200 Subject: drbd: Use the correct max_bio_size when creating resync requests Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_worker.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 4d76b06b6b20..4d3e6f6213ba 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -536,12 +536,7 @@ static int w_make_resync_request(struct drbd_conf *mdev, return 1; } - /* starting with drbd 8.3.8, we can handle multi-bio EEs, - * if it should be necessary */ - max_bio_size = - mdev->agreed_pro_version < 94 ? queue_max_hw_sectors(mdev->rq_queue) << 9 : - mdev->agreed_pro_version < 95 ? DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_BIO_SIZE; - + max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9; number = drbd_rs_number_requests(mdev); if (number == 0) goto requeue; -- cgit v1.2.1 From 829c60878626be290a4c248e8f1b86a0d5cbd38b Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 3 Jun 2011 21:18:13 +0200 Subject: drbd: add missing spinlock to bitmap receive During bitmap exchange, when using the RLE bitmap compression scheme, we have a code path that can set the whole bitmap at once. To avoid holding spin_lock_irq() for too long, we used to lock out other bitmap modifications during bitmap exchange by other means, and then, knowing we have exclusive access to the bitmap, modify it without the spinlock, and with IRQs enabled. Since we now allow local IO to continue, potentially setting additional bits during the bitmap receive phase, this is no longer true, and we get uncoordinated updates of bitmap members, causing bm_set to no longer accurately reflect the total number of set bits. To actually see this, you'd need to have a large bitmap, use RLE bitmap compression, and have busy IO during sync handshake and bitmap exchange. Fix this by taking the spin_lock_irq() in this code path as well, but calling cond_resched_lock() after each page worth of bits processed. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_bitmap.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index f440a02dfdb1..1e89a74ddb17 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -112,9 +112,6 @@ struct drbd_bitmap { struct task_struct *bm_task; }; -static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, - unsigned long e, int val, const enum km_type km); - #define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) { @@ -1256,7 +1253,7 @@ unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_f * expected to be called for only a few bits (e - s about BITS_PER_LONG). * Must hold bitmap lock already. */ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, - unsigned long e, int val, const enum km_type km) + unsigned long e, int val) { struct drbd_bitmap *b = mdev->bitmap; unsigned long *p_addr = NULL; @@ -1274,14 +1271,14 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, unsigned int page_nr = bm_bit_to_page_idx(b, bitnr); if (page_nr != last_page_nr) { if (p_addr) - __bm_unmap(p_addr, km); + __bm_unmap(p_addr, KM_IRQ1); if (c < 0) bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); else if (c > 0) bm_set_page_need_writeout(b->bm_pages[last_page_nr]); changed_total += c; c = 0; - p_addr = __bm_map_pidx(b, page_nr, km); + p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1); last_page_nr = page_nr; } if (val) @@ -1290,7 +1287,7 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr)); } if (p_addr) - __bm_unmap(p_addr, km); + __bm_unmap(p_addr, KM_IRQ1); if (c < 0) bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); else if (c > 0) @@ -1318,7 +1315,7 @@ static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags) bm_print_lock_info(mdev); - c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1); + c = __bm_change_bits_to(mdev, s, e, val); spin_unlock_irqrestore(&b->bm_lock, flags); return c; @@ -1343,16 +1340,17 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, { int i; int bits; - unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0); + unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1); for (i = first_word; i < last_word; i++) { bits = hweight_long(paddr[i]); paddr[i] = ~0UL; b->bm_set += BITS_PER_LONG - bits; } - kunmap_atomic(paddr, KM_USER0); + kunmap_atomic(paddr, KM_IRQ1); } -/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave. +/* Same thing as drbd_bm_set_bits, + * but more efficient for a large bit range. * You must first drbd_bm_lock(). * Can be called to set the whole bitmap in one go. * Sets bits from s to e _inclusive_. */ @@ -1366,6 +1364,7 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi * Do not use memset, because we must account for changes, * so we need to loop over the words with hweight() anyways. */ + struct drbd_bitmap *b = mdev->bitmap; unsigned long sl = ALIGN(s,BITS_PER_LONG); unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1); int first_page; @@ -1376,15 +1375,19 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi if (e - s <= 3*BITS_PER_LONG) { /* don't bother; el and sl may even be wrong. */ - __bm_change_bits_to(mdev, s, e, 1, KM_USER0); + spin_lock_irq(&b->bm_lock); + __bm_change_bits_to(mdev, s, e, 1); + spin_unlock_irq(&b->bm_lock); return; } /* difference is large enough that we can trust sl and el */ + spin_lock_irq(&b->bm_lock); + /* bits filling the current long */ if (sl) - __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0); + __bm_change_bits_to(mdev, s, sl-1, 1); first_page = sl >> (3 + PAGE_SHIFT); last_page = el >> (3 + PAGE_SHIFT); @@ -1397,7 +1400,7 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi /* first and full pages, unless first page == last page */ for (page_nr = first_page; page_nr < last_page; page_nr++) { bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word); - cond_resched(); + cond_resched_lock(&b->bm_lock); first_word = 0; } @@ -1411,7 +1414,8 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi * it would trigger an assert in __bm_change_bits_to() */ if (el <= e) - __bm_change_bits_to(mdev, el, e, 1, KM_USER0); + __bm_change_bits_to(mdev, el, e, 1); + spin_unlock_irq(&b->bm_lock); } /* returns bit state -- cgit v1.2.1 From 8ccee20e3ef4e12dbf02a18f17d386569b1f73ee Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 6 Jun 2011 11:31:42 +0200 Subject: drbd: don't cond_resched_lock with IRQs disabled The last commit, drbd: add missing spinlock to bitmap receive, introduced a cond_resched_lock(), where the lock in question is taken with irqs disabled. As we must not schedule with IRQs disabled, and cond_resched_lock_irq() does not exist, yet, we re-aquire the spin_lock_irq() for each bitmap page processed in turn. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_bitmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 1e89a74ddb17..61f4fae9f67e 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1400,8 +1400,10 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi /* first and full pages, unless first page == last page */ for (page_nr = first_page; page_nr < last_page; page_nr++) { bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word); - cond_resched_lock(&b->bm_lock); + spin_unlock_irq(&b->bm_lock); + cond_resched(); first_word = 0; + spin_lock_irq(&b->bm_lock); } /* last page (respectively only page, for first page == last page) */ -- cgit v1.2.1 From 5a8b424276f7ba50c51e7caf485b2be23739e5b8 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 14 Jun 2011 14:18:23 +0200 Subject: drbd: account bitmap IO during resync as resync-(related-)-io If we have a good resync rate, we will frequently update the on-disk bitmap, which, if not accounted for as resync io, may let an otherwise idle device appear to be "busy", and cause us to throttle resync. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_bitmap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 61f4fae9f67e..7b976296b564 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -991,6 +991,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must bio_endio(bio, -EIO); } else { submit_bio(rw, bio); + /* this should not count as user activity and cause the + * resync to throttle -- see drbd_rs_should_slow_down(). */ + atomic_add(len >> 9, &mdev->rs_sect_ev); } } -- cgit v1.2.1 From cb6518cbef5e3e36b7ae90fcab610a52ea7e9fc0 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 20 Jun 2011 14:44:45 +0200 Subject: drbd: when receive times out on meta socket, also check last receive time on data socket If we have an asymetrically congested network, we may send P_PING, but due to congestion, the corresponding P_PING_ACK would time out, and we would drop a (congested, but otherwise) healthy connection ("PingAck did not arrive in time.") Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 25d32c5aa50a..43beaca53179 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -4602,6 +4602,11 @@ int drbd_asender(struct drbd_thread *thi) dev_err(DEV, "meta connection shut down by peer.\n"); goto reconnect; } else if (rv == -EAGAIN) { + /* If the data socket received something meanwhile, + * that is good enough: peer is still alive. */ + if (time_after(mdev->last_received, + jiffies - mdev->meta.socket->sk->sk_rcvtimeo)) + continue; if (ping_timeout_active) { dev_err(DEV, "PingAck did not arrive in time.\n"); goto reconnect; @@ -4637,6 +4642,7 @@ int drbd_asender(struct drbd_thread *thi) goto reconnect; } if (received == expect) { + mdev->last_received = jiffies; D_ASSERT(cmd != NULL); if (!cmd->process(mdev, h)) goto reconnect; -- cgit v1.2.1 From 15b493d11fcce3c5547e3d7fb6d90e11ffe12777 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 28 Jun 2011 09:48:48 +0200 Subject: drbd: fix limit define, we support 1 PiByte now Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd_limits.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 246f576c981d..447c36752385 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -117,10 +117,10 @@ /* drbdsetup XY resize -d Z * you are free to reduce the device size to nothing, if you want to. * the upper limit with 64bit kernel, enough ram and flexible meta data - * is 16 TB, currently. */ + * is 1 PiB, currently. */ /* DRBD_MAX_SECTORS */ #define DRBD_DISK_SIZE_SECT_MIN 0 -#define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30)) +#define DRBD_DISK_SIZE_SECT_MAX (1 * (2LLU << 40)) #define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */ #define DRBD_ON_IO_ERROR_DEF EP_PASS_ON -- cgit v1.2.1 From 86e1e98e5c6b4edab97e2b058466ef553cfd878e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 28 Jun 2011 13:22:48 +0200 Subject: drbd: we should write meta data updates with FLUSH FUA We used to write these with BIO_RW_BARRIER aka REQ_HARDBARRIER (unless disabled in the configuration). The correct semantic now would be to write with FLUSH/FUA. For example, with activity log transactions, FUA alone is not enough, we need the corresponding bitmap update (and all related application updates) on stable storage as well. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 09ef9a878ef0..cf0e63dd97da 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -79,7 +79,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, md_io.error = 0; if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) - rw |= REQ_FUA; + rw |= REQ_FUA | REQ_FLUSH; rw |= REQ_SYNC; bio = bio_alloc(GFP_NOIO, 1); -- cgit v1.2.1