From 383606e0dea6a380097dbcb0c319b09ca372f36b Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 14 Jun 2012 14:21:32 +0200 Subject: drbd: differentiate between normal and forced detach Aborting local requests (not waiting for completion from the lower level disk) is dangerous: if the master bio has been completed to upper layers, data pages may be re-used for other things already. If local IO is still pending and later completes, this may cause crashes or corrupt unrelated data. Only abort local IO if explicitly requested. Intended use case is a lower level device that turned into a tarpit, not completing io requests, not even doing error completion. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/block/drbd/drbd_nl.c') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6d4de6a72e80..40a1c4f07190 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -950,6 +950,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp * to realize a "hot spare" feature (not that I'd recommend that) */ wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + /* make sure there is no leftover from previous force-detach attempts */ + clear_bit(FORCE_DETACH, &mdev->flags); + /* allocation not in the IO path, cqueue thread context */ nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); if (!nbc) { @@ -1345,6 +1348,7 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, } if (dt.detach_force) { + set_bit(FORCE_DETACH, &mdev->flags); drbd_force_state(mdev, NS(disk, D_FAILED)); reply->ret_code = SS_SUCCESS; goto out; -- cgit v1.2.1 From c2ba686f353972cc89a006ffb6bab7ba1822271e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 14 Jun 2012 15:14:06 +0200 Subject: drbd: report congestion if we are waiting for some userland callback If the drbd worker thread is synchronously waiting for some userland callback, we don't want some casual pageout to block on us. Have drbd_congested() report congestion in that case. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/block/drbd/drbd_nl.c') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 40a1c4f07190..03fc853be2b9 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -147,6 +147,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) char *argv[] = {usermode_helper, cmd, mb, NULL }; int ret; + if (current == mdev->worker.task) + set_bit(CALLBACK_PENDING, &mdev->flags); + snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); if (get_net_conf(mdev)) { @@ -189,6 +192,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) usermode_helper, cmd, mb, (ret >> 8) & 0xff, ret); + if (current == mdev->worker.task) + clear_bit(CALLBACK_PENDING, &mdev->flags); + if (ret < 0) /* Ignore any ERRNOs we got. */ ret = 0; -- cgit v1.2.1 From 0029d62434d9045bc3e8b2eb48ae696e30336e92 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 14 Jun 2012 18:02:52 +0200 Subject: drbd: do not reset rs_pending_cnt too early Fix asserts like block drbd0: in got_BlockAck:4634: rs_pending_cnt = -35 < 0 ! We reset the resync lru cache and related information (rs_pending_cnt), once we successfully finished a resync or online verify, or if the replication connection is lost. We also need to reset it if a resync or online verify is aborted because a lower level disk failed. In that case the replication link is still established, and we may still have packets queued in the network buffers which want to touch rs_pending_cnt. We do not have any synchronization mechanism to know for sure when all such pending resync related packets have been drained. To avoid this counter to go negative (and violate the ASSERT that it will always be >= 0), just do not reset it when we lose a disk. It is good enough to make sure it is re-initialized before the next resync can start: reset it when we re-attach a disk. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/block/drbd/drbd_nl.c') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 03fc853be2b9..a68d9bfb731c 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -959,6 +959,11 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp /* make sure there is no leftover from previous force-detach attempts */ clear_bit(FORCE_DETACH, &mdev->flags); + /* and no leftover from previously aborted resync or verify, either */ + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + /* allocation not in the IO path, cqueue thread context */ nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); if (!nbc) { -- cgit v1.2.1 From 7ee1fb93f390f7a7231abec4e34e6ab20abeed45 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Jun 2012 10:27:58 +0200 Subject: drbd: flush drbd work queue before invalidate/invalidate remote If you do back to back wait-sync/invalidate on a Primary in a tight loop, during application IO load, you could trigger a race: kernel: block drbd6: FIXME going to queue 'set_n_write from StartingSync' but 'write from resync_finished' still pending? Fix this by changing the order of the drbd_queue_work() and the wake_up() in dec_ap_pending(), and adding the additional drbd_flush_workqueue() before requesting the full sync. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/block/drbd/drbd_nl.c') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index a68d9bfb731c..c47df7cf7f80 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1977,9 +1977,11 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl int retcode; /* If there is still bitmap IO pending, probably because of a previous - * resync just being finished, wait for it before requesting a new resync. */ + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); + drbd_flush_workqueue(mdev); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); @@ -2018,9 +2020,11 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re int retcode; /* If there is still bitmap IO pending, probably because of a previous - * resync just being finished, wait for it before requesting a new resync. */ + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); + drbd_flush_workqueue(mdev); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); -- cgit v1.2.1 From db141b2f42b485b700465fe2401fbe65c65b190c Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 25 Jun 2012 19:15:58 +0200 Subject: drbd: fix max_bio_size to be unsigned We capped our max_bio_size respectively max_hw_sectors with min_t(int, lower level limit, our limit); unfortunately, some drivers, e.g. the kvm virtio block driver, initialize their limits to "-1U", and that is of course a smaller "int" value than our limit. Impact: we started to request 16 MB resync requests, which lead to protocol error and a reconnect loop. Fix all relevant constants and parameters to be unsigned int. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/block/drbd/drbd_nl.c') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index c47df7cf7f80..fb9dce8daa24 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -801,8 +801,8 @@ static int drbd_check_al_size(struct drbd_conf *mdev) static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) { struct request_queue * const q = mdev->rq_queue; - int max_hw_sectors = max_bio_size >> 9; - int max_segments = 0; + unsigned int max_hw_sectors = max_bio_size >> 9; + unsigned int max_segments = 0; if (get_ldev_if_state(mdev, D_ATTACHING)) { struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; @@ -835,7 +835,7 @@ static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) { - int now, new, local, peer; + unsigned int now, new, local, peer; now = queue_max_hw_sectors(mdev->rq_queue) << 9; local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */ @@ -846,13 +846,14 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) mdev->local_max_bio_size = local; put_ldev(mdev); } + local = min(local, DRBD_MAX_BIO_SIZE); /* We may ignore peer limits if the peer is modern enough. Because new from 8.3.8 onwards the peer can use multiple BIOs for a single peer_request */ if (mdev->state.conn >= C_CONNECTED) { if (mdev->agreed_pro_version < 94) { - peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ } else if (mdev->agreed_pro_version == 94) peer = DRBD_MAX_SIZE_H80_PACKET; @@ -860,10 +861,10 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) peer = DRBD_MAX_BIO_SIZE; } - new = min_t(int, local, peer); + new = min(local, peer); if (mdev->state.role == R_PRIMARY && new < now) - dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now); + dev_err(DEV, "ASSERT FAILED new < now; (%u < %u)\n", new, now); if (new != now) dev_info(DEV, "max BIO size = %u\n", new); -- cgit v1.2.1