From 6be9d4940134b36f9ed020aead36f831f19b49f1 Mon Sep 17 00:00:00 2001
From: Bernd Schubert
Date: Fri, 23 May 2008 13:04:34 -0700
Subject: md: raid5 rate limit error printk

Last night we had SCSI problems and a hardware raid unit was offlined
during heavy i/o.  While this happened we got, for about 3 minutes, a
huge number of messages like these:

Apr 12 03:36:07 pfs1n14 kernel: [197510.696595] raid5:md7: read error not correctable (sector 2993096568 on sdj2).

I guess the high error rate is responsible for not scheduling other
events - during this time the system was not pingable, and in the end
other devices also ran into SCSI command timeouts, causing problems on
these unrelated devices as well.

Signed-off-by: Bernd Schubert
Signed-off-by: Dan Williams
Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 93fde48c0f42..2f28745dacf9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -94,6 +94,8 @@
 #define __inline__
 #endif
 
+#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
+
 #if !RAID6_USE_EMPTY_ZERO_PAGE
 /* In .bss so it's zeroed */
 const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
@@ -1143,10 +1145,12 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
 			rdev = conf->disks[i].rdev;
-			printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
-			       mdname(conf->mddev), STRIPE_SECTORS,
-			       (unsigned long long)(sh->sector + rdev->data_offset),
-			       bdevname(rdev->bdev, b));
+			printk_rl(KERN_INFO "raid5:%s: read error corrected"
+				  " (%lu sectors at %llu on %s)\n",
+				  mdname(conf->mddev), STRIPE_SECTORS,
+				  (unsigned long long)(sh->sector +
+						       rdev->data_offset),
+				  bdevname(rdev->bdev, b));
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
 		}
@@ -1160,16 +1164,22 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
 		atomic_inc(&rdev->read_errors);
 		if (conf->mddev->degraded)
-			printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
-			       mdname(conf->mddev),
-			       (unsigned long long)(sh->sector + rdev->data_offset),
-			       bdn);
+			printk_rl(KERN_WARNING
+				  "raid5:%s: read error not correctable "
+				  "(sector %llu on %s).\n",
+				  mdname(conf->mddev),
+				  (unsigned long long)(sh->sector +
+						       rdev->data_offset),
+				  bdn);
 		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
 			/* Oh, no!!! */
-			printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
-			       mdname(conf->mddev),
-			       (unsigned long long)(sh->sector + rdev->data_offset),
-			       bdn);
+			printk_rl(KERN_WARNING
+				  "raid5:%s: read error NOT corrected!! "
+				  "(sector %llu on %s).\n",
+				  mdname(conf->mddev),
+				  (unsigned long long)(sh->sector +
+						       rdev->data_offset),
+				  bdn);
 		else if (atomic_read(&rdev->read_errors)
 			 > conf->max_nr_stripes)
 			printk(KERN_WARNING
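
The macro works because && short-circuits: when printk_ratelimit() says
no, printk() is never called, and its arguments (some of which, like
bdevname(), do real work) are never evaluated.  Below is a minimal
userspace sketch of the same pattern; log_ratelimit() is a hypothetical
stand-in for the kernel's printk_ratelimit(), using a crude fixed
window rather than the kernel's actual token-bucket limiter.

#include <stdio.h>
#include <time.h>

#define LOG_BURST    10   /* messages allowed per window */
#define LOG_WINDOW_S 5    /* window length in seconds */

static int log_ratelimit(void)
{
        static time_t window_start;
        static int count;
        time_t now = time(NULL);

        if (now - window_start >= LOG_WINDOW_S) {
                window_start = now;
                count = 0;
        }
        return count++ < LOG_BURST;
}

/* same shape as the printk_rl() macro added above */
#define log_rl(args...) ((void) (log_ratelimit() && printf(args)))

int main(void)
{
        /* simulate an error storm: only the first LOG_BURST lines of
         * each window actually reach the console */
        for (int i = 0; i < 1000; i++)
                log_rl("read error not correctable (sector %d)\n", i);
        return 0;
}
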
From dfc7064500061677720fa26352963c772d3ebe6b Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Fri, 23 May 2008 13:04:39 -0700
Subject: md: restart recovery cleanly after device failure.

When we get any IO error during a recovery (rebuilding a spare), we
abort the recovery and restart it.

For RAID6 (and multi-drive RAID1) it may not be best to restart at the
beginning: when multiple failures can be tolerated, the recovery may be
able to continue, and redoing all that has already been done doesn't
make sense.

We already have the infrastructure to record where a recovery is up to
and restart from there, but it is not being used properly.
This is because:
  - We sometimes abort with MD_RECOVERY_ERR rather than just
    MD_RECOVERY_INTR, which causes the recovery not to be checkpointed.
  - We remove spares and then re-add them, which loses important state
    information.

The distinction between MD_RECOVERY_ERR and MD_RECOVERY_INTR really
isn't needed.  If there is an error, the relevant drive will be marked
as Faulty, and that is enough to ensure correct handling of the error.
So we first remove MD_RECOVERY_ERR, changing some of the uses of it to
MD_RECOVERY_INTR.

Then we cause the attempt to remove a non-faulty device from an array
to fail (unless recovery is impossible as the array is too degraded).
Then when remove_and_add_spares attempts to remove the devices on which
recovery can continue, it will fail, they will remain in place, and
recovery will continue on them as desired.

Issue:  If we are halfway through rebuilding a spare and another drive
fails, and a new spare is immediately available, do we want to:
 1/ complete the current rebuild, then go back and rebuild the new
    spare, or
 2/ restart the rebuild from the start and rebuild both devices in
    parallel?

Both options can be argued for.  The code currently takes option 2 as
 a/ this requires the least code change, and
 b/ this results in a minimally-degraded array in minimal time.

Cc: "Eivind Sarto"
Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2f28745dacf9..425958a76b84 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1268,7 +1268,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		/*
 		 * if recovery was running, make sure it aborts.
 		 */
-		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	}
 	set_bit(Faulty, &rdev->flags);
 	printk (KERN_ALERT
@@ -4574,6 +4574,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
 			err = -EBUSY;
 			goto abort;
 		}
+		/* Only remove non-faulty devices if recovery
+		 * isn't possible.
+		 */
+		if (!test_bit(Faulty, &rdev->flags) &&
+		    mddev->degraded <= conf->max_degraded) {
+			err = -EBUSY;
+			goto abort;
+		}
 		p->rdev = NULL;
 		synchronize_rcu();
 		if (atomic_read(&rdev->nr_pending)) {
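
The second hunk is what keeps the recovery checkpoint usable: a
non-faulty device may only be removed once the array is so degraded
that recovery cannot proceed anyway.  Here is the same guard restated
as a standalone predicate, with simplified stand-in types rather than
the kernel's mddev/rdev structures:

#include <stdbool.h>
#include <stdio.h>

struct dev_state {
        bool faulty;    /* Faulty bit from rdev->flags */
};

/* degraded: missing/failed devices in the array
 * max_degraded: failures the level tolerates (1 for raid5, 2 for raid6) */
static bool may_remove(const struct dev_state *dev,
                       int degraded, int max_degraded)
{
        if (dev->faulty)
                return true;    /* failed devices can always go */
        /* non-faulty: keep it unless recovery is impossible */
        return degraded > max_degraded;
}

int main(void)
{
        struct dev_state spare = { .faulty = false };

        /* raid6 with one failure: recovery can continue, so removal is
         * refused (-EBUSY in the kernel) and the half-built spare keeps
         * its recovery checkpoint */
        printf("may remove: %s\n",
               may_remove(&spare, 1, 2) ? "yes" : "no (-EBUSY)");
        return 0;
}
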
From e0a115e5aa554b93150a8dc1c3fe15467708abb2 Mon Sep 17 00:00:00 2001
From: Dan Williams
Date: Thu, 5 Jun 2008 22:45:52 -0700
Subject: md: fix prexor vs sync_request race

During the initial array synchronization process there is a window,
between when a prexor operation is scheduled to a specific stripe and
when it completes, for a sync_request to be scheduled to the same
stripe.  When this happens the prexor completes and the stripe is
unconditionally marked "insync", effectively canceling the sync_request
for the stripe.

Prior to 2.6.23 this was not a problem because the prexor operation was
done under sh->lock.  The effect in older kernels was that the prexor
would still erroneously mark the stripe "insync", but sync_request
would be held off and re-mark the stripe as "!in_sync".

Change the write completion logic to not mark the stripe "in_sync" if
a prexor was performed.  The effect of the change is to sometimes not
set STRIPE_INSYNC.  The worst this can do is cause the resync to stall
waiting for STRIPE_INSYNC to be set.  If this were happening, then
STRIPE_SYNCING would be set and handle_issuing_new_read_requests would
cause all available blocks to eventually be read, at which point prexor
would never be used on that stripe any more and STRIPE_INSYNC would
eventually be set.

    echo repair > /sys/block/mdN/md/sync_action

will correct arrays that may have lost this race.

Cc:
Signed-off-by: Dan Williams
Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 425958a76b84..f0f0585c107e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2645,6 +2645,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	struct r5dev *dev;
 	unsigned long pending = 0;
 	mdk_rdev_t *blocked_rdev = NULL;
+	int prexor;
 
 	memset(&s, 0, sizeof(s));
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
@@ -2774,9 +2775,11 @@ static void handle_stripe5(struct stripe_head *sh)
 	/* leave prexor set until postxor is done, allows us to distinguish
 	 * a rmw from a rcw during biodrain
 	 */
+	prexor = 0;
 	if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
 		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
 
+		prexor = 1;
 		clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
 		clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
 		clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
@@ -2810,6 +2813,8 @@ static void handle_stripe5(struct stripe_head *sh)
 				if (!test_and_set_bit(
 					    STRIPE_OP_IO, &sh->ops.pending))
 					sh->ops.count++;
+				if (prexor)
+					continue;
 				if (!test_bit(R5_Insync, &dev->flags) ||
 				    (i == sh->pd_idx && s.failed == 0))
 					set_bit(STRIPE_INSYNC, &sh->state);
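
The shape of the fix is easiest to see in isolation: latch whether a
prexor (read-modify-write) ran before its completion flag is cleared,
and skip the in-sync marking on that path.  The sketch below uses
illustrative stand-in types, not the kernel's stripe_head:

#include <stdbool.h>
#include <stdio.h>

struct stripe {
        bool op_prexor_complete;
        bool op_postxor_complete;
        bool insync;
};

static void complete_write(struct stripe *sh)
{
        bool prexor = false;

        if (sh->op_prexor_complete && sh->op_postxor_complete) {
                prexor = true;                  /* latch before clearing */
                sh->op_prexor_complete = false; /* flag is consumed here */
        }

        /* an rmw write only touched the blocks being written plus
         * parity; nothing verified the others, so a concurrent
         * sync_request must not be cancelled */
        if (!prexor)
                sh->insync = true;
}

int main(void)
{
        struct stripe rmw = { .op_prexor_complete = true,
                              .op_postxor_complete = true };

        complete_write(&rmw);
        printf("rmw stripe marked insync: %s\n", rmw.insync ? "yes" : "no");
        return 0;
}
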
From c337869d95011495fa181536786e74aa2d7ff031 Mon Sep 17 00:00:00 2001
From: Dan Williams
Date: Thu, 5 Jun 2008 22:45:54 -0700
Subject: md: do not compute parity unless it is on a failed drive

If a block is computed (rather than read) then a check/repair operation
may be led to believe that the data on disk is correct, when in fact it
isn't.  So only compute blocks for failed devices.

This issue has been around since at least 2.6.12, but has become harder
to hit in recent kernels since most reads bypass the cache.

    echo repair > /sys/block/mdN/md/sync_action

will set the parity blocks to the correct state.

Cc:
Signed-off-by: Dan Williams
Signed-off-by: Neil Brown
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/md/raid5.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f0f0585c107e..c37e256b1176 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2002,6 +2002,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 		 * have quiesced.
 		 */
 		if ((s->uptodate == disks - 1) &&
+		    (s->failed && disk_idx == s->failed_num) &&
 		    !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
 			set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
 			set_bit(R5_Wantcompute, &dev->flags);
@@ -2087,7 +2088,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
 			/* we would like to get this block, possibly
 			 * by computing it, but we might not be able to
 			 */
-			if (s->uptodate == disks-1) {
+			if ((s->uptodate == disks - 1) &&
+			    (s->failed && (i == r6s->failed_num[0] ||
+					   i == r6s->failed_num[1]))) {
 				pr_debug("Computing stripe %llu block %d\n",
 					(unsigned long long)sh->sector, i);
 				compute_block_1(sh, i, 0);

From 8c2e870a625bd336b2e7a65a97c1836acef07322 Mon Sep 17 00:00:00 2001
From: Neil Brown
Date: Sat, 28 Jun 2008 08:30:52 +1000
Subject: Ensure interrupted recovery completed properly (v1 metadata plus bitmap)

If, while assembling an array, we find a device which is not fully
in-sync with the array, it is important to set the "fullsync" flag.
This is an exact analog to the setting of this flag in hot_add_disk
methods.

Currently, only v1.x metadata supports having devices in an array which
are not fully in-sync (it keeps track of how in-sync they are).  The
'fullsync' flag only makes a difference when a write-intent bitmap is
being used.  In this case it tells recovery to ignore the bitmap and
recover all blocks.

This fix is already in place for raid1, but not raid5/6 or raid10.

So without this fix, a raid1 or raid4/5/6 array with version 1.x
metadata and a write-intent bitmap, that is stopped in the middle of a
recovery, will appear to complete the recovery instantly after it is
reassembled, but the recovery will not be correct.

If you might have an array like that, issuing

    echo repair > /sys/block/mdXX/md/sync_action

will make sure recovery completes properly.

Cc:
Signed-off-by: Neil Brown
---
 drivers/md/raid5.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c37e256b1176..475fba4d371e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4305,7 +4305,9 @@ static int run(mddev_t *mddev)
 				" disk %d\n",
 				bdevname(rdev->bdev,b), raid_disk);
 			working_disks++;
-		}
+		} else
+			/* Cannot rely on bitmap to complete recovery */
+			conf->fullsync = 1;
 	}
 
 	/*
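
The assembly-time rule reduces to a single scan of the members: if any
device is not known to be fully in-sync, the write-intent bitmap cannot
be trusted and a full resync of that device is forced.  A sketch with
simplified stand-in types (not the kernel's mddev/rdev):

#include <stdbool.h>
#include <stdio.h>

struct member {
        bool in_sync;   /* In_sync bit from rdev->flags */
};

static bool needs_fullsync(const struct member *devs, int n)
{
        for (int i = 0; i < n; i++)
                if (!devs[i].in_sync)
                        return true;    /* bitmap can't be trusted */
        return false;
}

int main(void)
{
        /* array stopped mid-recovery: the spare is still not in-sync,
         * so recovery must ignore the bitmap and check every block */
        struct member devs[] = { { true }, { true }, { false } };

        printf("fullsync: %d\n", needs_fullsync(devs, 3));
        return 0;
}
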
From efe311431869b40d67911820a309f9a1a41306f3 Mon Sep 17 00:00:00 2001
From: Neil Brown
Date: Sat, 28 Jun 2008 08:31:14 +1000
Subject: Don't acknowledge that stripe-expand is complete until it really is.

We shouldn't acknowledge that a stripe has been expanded (when
reshaping a raid5 by adding a device) until the moved data has actually
been written out.  However we are currently acknowledging (by calling
md_done_sync) when the POST_XOR is complete and before the write.

So track in s.locked whether there are pending writes, and don't call
md_done_sync yet if there are.

Note: we also set R5_LOCKED on devices which we are about to read from.
This probably isn't technically necessary, but is usually done when
writing a block, and justifies the use of s.locked here.

This bug can lead to a crash if an array is stopped while a reshape is
in progress.

Cc:
Signed-off-by: Neil Brown
---
 drivers/md/raid5.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 475fba4d371e..54c8ee28fcc4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2898,6 +2898,8 @@ static void handle_stripe5(struct stripe_head *sh)
 
 		for (i = conf->raid_disks; i--; ) {
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
+			set_bit(R5_LOCKED, &dev->flags);
+			s.locked++;
 			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
 				sh->ops.count++;
 		}
@@ -2911,6 +2913,7 @@ static void handle_stripe5(struct stripe_head *sh)
 			conf->raid_disks);
 		s.locked += handle_write_operations5(sh, 1, 1);
 	} else if (s.expanded &&
+		   s.locked == 0 &&
 		   !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);

From 7a1fc53c5adb910751a9b212af90302eb4ffb527 Mon Sep 17 00:00:00 2001
From: Dan Williams
Date: Thu, 10 Jul 2008 04:54:57 -0700
Subject: md: ensure all blocks are uptodate or locked when syncing

Remove the dubious attempt to prefer 'compute' over 'read'.  Not only
is it wrong given commit c337869d (md: do not compute parity unless it
is on a failed drive), but it can trigger a BUG_ON in
handle_parity_checks5().

Cc:
Signed-off-by: Dan Williams
Signed-off-by: Neil Brown
---
 drivers/md/raid5.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 54c8ee28fcc4..3b27df52456b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2017,12 +2017,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 			 */
 			s->uptodate++;
 			return 0; /* uptodate + compute == disks */
-		} else if ((s->uptodate < disks - 1) &&
-			   test_bit(R5_Insync, &dev->flags)) {
-			/* Note: we hold off compute operations while checks are
-			 * in flight, but we still prefer 'compute' over 'read'
-			 * hence we only read if (uptodate < * disks-1)
-			 */
+		} else if (test_bit(R5_Insync, &dev->flags)) {
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantread, &dev->flags);
 			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
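
Taken together with the earlier "do not compute parity unless it is on
a failed drive" patch, the fetch policy for a wanted block becomes:
compute only when the block sits on the failed device and every other
block is up to date; otherwise read it from an in-sync device.  A
sketch of that decision, using illustrative stand-ins for the kernel's
stripe accounting:

#include <stdio.h>

enum fetch { FETCH_NONE, FETCH_READ, FETCH_COMPUTE };

static enum fetch how_to_fetch(int uptodate, int disks,
                               int failed, int failed_num,
                               int disk_idx, int insync)
{
        /* compute only if all other blocks are up to date AND the
         * block lives on the failed device (commit c337869d's rule) */
        if (uptodate == disks - 1 && failed && disk_idx == failed_num)
                return FETCH_COMPUTE;
        if (insync)
                return FETCH_READ;      /* never prefer compute over read */
        return FETCH_NONE;
}

int main(void)
{
        /* healthy disk 2 in a 4-disk array, all peers up to date:
         * still read from disk rather than compute */
        printf("%s\n", how_to_fetch(3, 4, 0, -1, 2, 1) == FETCH_READ ?
               "read" : "other");
        return 0;
}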