From 964147d5c86d63be79b442c30f3783d49860c078 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 18 May 2010 15:27:13 +1000 Subject: md/raid1: fix counting of write targets. There is a very small race window when writing to a RAID1 such that if a device is marked faulty at exactly the wrong time, the write-in-progress will not be sent to the device, but the bitmap (if present) will be updated to say that the write was sent. Then if the device turned out to still be usable as was re-added to the array, the bitmap-based-resync would skip resyncing that block, possibly leading to corruption. This would only be a problem if no further writes were issued to that area of the device (i.e. that bitmap chunk). Suitable for any pending -stable kernel. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/raid1.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f741f77eeb2b..1ab30f64848f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -911,9 +911,10 @@ static int make_request(struct request_queue *q, struct bio * bio) if (test_bit(Faulty, &rdev->flags)) { rdev_dec_pending(rdev, mddev); r1_bio->bios[i] = NULL; - } else + } else { r1_bio->bios[i] = bio; - targets++; + targets++; + } } else r1_bio->bios[i] = NULL; } -- cgit v1.2.1 From 7b92813c3c0b6990f14838e3985fb385d2655d0c Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Mon, 8 Mar 2010 16:02:40 +1100 Subject: drivers/md: Remove unnecessary casts of void * void pointers do not need to be cast to other pointer types. Signed-off-by: H Hartley Sweeten Signed-off-by: NeilBrown --- drivers/md/raid1.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1ab30f64848f..23a7516abbfd 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -262,7 +262,7 @@ static inline void update_head_pos(int disk, r1bio_t *r1_bio) static void raid1_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + r1bio_t *r1_bio = bio->bi_private; int mirror; conf_t *conf = r1_bio->mddev->private; @@ -307,7 +307,7 @@ static void raid1_end_read_request(struct bio *bio, int error) static void raid1_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + r1bio_t *r1_bio = bio->bi_private; int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); conf_t *conf = r1_bio->mddev->private; struct bio *to_put = NULL; @@ -1223,7 +1223,7 @@ abort: static void end_sync_read(struct bio *bio, int error) { - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + r1bio_t *r1_bio = bio->bi_private; int i; for (i=r1_bio->mddev->raid_disks; i--; ) @@ -1246,7 +1246,7 @@ static void end_sync_read(struct bio *bio, int error) static void end_sync_write(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + r1bio_t *r1_bio = bio->bi_private; mddev_t *mddev = r1_bio->mddev; conf_t *conf = mddev->private; int i; -- cgit v1.2.1 From 490773268cf64f68da2470e07b52c7944da6312d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 25 Mar 2010 16:20:56 +1100 Subject: md: move io accounting out of personalities into md_make_request While I generally prefer letting personalities do as much as possible, given that we have a central md_make_request anyway we may as well use it to simplify code. Also this centralises knowledge of ->gendisk which will help later. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 23a7516abbfd..e277013ac808 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -787,7 +787,6 @@ static int make_request(struct request_queue *q, struct bio * bio) struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); - int cpu; bool do_barriers; mdk_rdev_t *blocked_rdev; @@ -833,12 +832,6 @@ static int make_request(struct request_queue *q, struct bio * bio) bitmap = mddev->bitmap; - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); - part_stat_unlock(); - /* * make_request() can abort the operation when READA is being * used and no empty request is available. -- cgit v1.2.1 From b821eaa572fd737faaf6928ba046e571526c36c6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Mar 2010 11:18:15 +1100 Subject: md: remove ->changed and related code. We set ->changed to 1 and call check_disk_change at the end of md_open so that bd_invalidated would be set and thus partition rescan would happen appropriately. Now that we call revalidate_disk directly, which sets bd_invalidates, that indirection is no longer needed and can be removed. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e277013ac808..eebce166dafe 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2184,7 +2184,6 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { -- cgit v1.2.1 From 21a52c6d05c15f862797736393915bfa8cd40ee9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 1 Apr 2010 15:02:13 +1100 Subject: md: pass mddev to make_request functions rather than request_queue We used to pass the personality make_request function direct to the block layer so the first argument had to be a queue. But now we have the intermediary md_make_request so it makes at lot more sense to pass a struct mddev_s. It makes it possible to have an mddev without its own queue too. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index eebce166dafe..5ff75c4d3af6 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -773,9 +773,8 @@ do_sync_io: return NULL; } -static int make_request(struct request_queue *q, struct bio * bio) +static int make_request(mddev_t *mddev, struct bio * bio) { - mddev_t *mddev = q->queuedata; conf_t *conf = mddev->private; mirror_info_t *mirror; r1bio_t *r1_bio; -- cgit v1.2.1 From d754c5ae1ff76b20d3ecde8ad666d7865eada8ae Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 7 Apr 2010 12:14:43 +1000 Subject: md/raid1: fix confusing 'redirect sector' message. This message seems to suggest the named device is the one on which a read failed, however it is actually the device that the read will be redirected to. So make the message a little clearer. Reported-by: Tim Burgess Signed-off-by: NeilBrown --- drivers/md/raid1.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5ff75c4d3af6..2e08e48b02d9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1689,10 +1689,10 @@ static void raid1d(mddev_t *mddev) r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; if (printk_ratelimit()) - printk(KERN_ERR "raid1: %s: redirecting sector %llu to" - " another mirror\n", - bdevname(rdev->bdev,b), - (unsigned long long)r1_bio->sector); + printk(KERN_ERR "raid1: redirecting sector %llu to" + " other mirror: %s\n", + (unsigned long long)r1_bio->sector, + bdevname(rdev->bdev,b)); bio->bi_sector = r1_bio->sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; -- cgit v1.2.1 From e555190d82c0f58e825e3cbd9e6ebe2e7ac713bd Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 31 Mar 2010 11:21:44 +1100 Subject: md/raid1: delay reads that could overtake behind-writes. When a raid1 array is configured to support write-behind on some devices, it normally only reads from other devices. If all devices are write-behind (because the rest have failed) it is possible for a read request to be serviced before a behind-write request, which would appear as data corruption. So when forced to read from a WriteMostly device, wait for any write-behind to complete, and don't start any more behind-writes. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 2e08e48b02d9..cb2da87ad593 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -857,6 +857,15 @@ static int make_request(mddev_t *mddev, struct bio * bio) } mirror = conf->mirrors + rdisk; + if (test_bit(WriteMostly, &mirror->rdev->flags) && + bitmap) { + /* Reading from a write-mostly device must + * take care not to over-take any writes + * that are 'behind' + */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } r1_bio->read_disk = rdisk; read_bio = bio_clone(bio, GFP_NOIO); @@ -934,10 +943,14 @@ static int make_request(mddev_t *mddev, struct bio * bio) set_bit(R1BIO_Degraded, &r1_bio->state); } - /* do behind I/O ? */ + /* do behind I/O ? + * Not if there are too many, or cannot allocate memory, + * or a reader on WriteMostly is waiting for behind writes + * to flush */ if (bitmap && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && + !waitqueue_active(&bitmap->behind_wait) && (behind_pages = alloc_behind_pages(bio)) != NULL) set_bit(R1BIO_BehindIO, &r1_bio->state); @@ -2144,15 +2157,13 @@ static int stop(mddev_t *mddev) { conf_t *conf = mddev->private; struct bitmap *bitmap = mddev->bitmap; - int behind_wait = 0; /* wait for behind writes to complete */ - while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - behind_wait++; - printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); /* wait a second */ + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop.\n", mdname(mddev)); /* need to kick something here to make sure I/O goes? */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); } raise_barrier(conf); -- cgit v1.2.1 From 9dd1e2faf72f79a2af9dcbd059473c06648726c2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 May 2010 14:30:35 +1000 Subject: md/raid1: improve printk messages Make sure the array name is included in a uniform way in all printk messages. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 57 +++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 28 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index cb2da87ad593..1db02c4955a9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -296,7 +296,8 @@ static void raid1_end_read_request(struct bio *bio, int error) */ char b[BDEVNAME_SIZE]; if (printk_ratelimit()) - printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", + printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", + mdname(conf->mddev), bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); reschedule_retry(r1_bio); } @@ -1075,21 +1076,22 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" - "raid1: Operation continuing on %d devices.\n", - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); + printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" + KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", + mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) { int i; - printk("RAID1 conf printout:\n"); + printk(KERN_DEBUG "RAID1 conf printout:\n"); if (!conf) { - printk("(!conf)\n"); + printk(KERN_DEBUG "(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); rcu_read_lock(); @@ -1097,7 +1099,7 @@ static void print_conf(conf_t *conf) char b[BDEVNAME_SIZE]; mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev) - printk(" disk %d, wo:%d, o:%d, dev:%s\n", + printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", i, !test_bit(In_sync, &rdev->flags), !test_bit(Faulty, &rdev->flags), bdevname(rdev->bdev,b)); @@ -1458,9 +1460,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) char b[BDEVNAME_SIZE]; /* Cannot read from anywhere, array is toast */ md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" " for block %llu\n", - bdevname(bio->bi_bdev,b), + mdname(mddev), + bdevname(bio->bi_bdev, b), (unsigned long long)r1_bio->sector); md_done_sync(mddev, r1_bio->sectors, 0); put_buf(r1_bio); @@ -1582,7 +1585,7 @@ static void fix_read_error(conf_t *conf, int read_disk, else { atomic_add(s, &rdev->corrected_errors); printk(KERN_INFO - "raid1:%s: read error corrected " + "md/raid1:%s: read error corrected " "(%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect + @@ -1687,8 +1690,9 @@ static void raid1d(mddev_t *mddev) bio = r1_bio->bios[r1_bio->read_disk]; if ((disk=read_balance(conf, r1_bio)) == -1) { - printk(KERN_ALERT "raid1: %s: unrecoverable I/O" + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" " read error for block %llu\n", + mdname(mddev), bdevname(bio->bi_bdev,b), (unsigned long long)r1_bio->sector); raid_end_bio_io(r1_bio); @@ -1702,8 +1706,9 @@ static void raid1d(mddev_t *mddev) r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; if (printk_ratelimit()) - printk(KERN_ERR "raid1: redirecting sector %llu to" + printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" " other mirror: %s\n", + mdname(mddev), (unsigned long long)r1_bio->sector, bdevname(rdev->bdev,b)); bio->bi_sector = r1_bio->sector + rdev->data_offset; @@ -1760,13 +1765,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i int still_degraded = 0; if (!conf->r1buf_pool) - { -/* - printk("sync start - bitmap %p\n", mddev->bitmap); -*/ if (init_resync(conf)) return 0; - } max_sector = mddev->dev_sectors; if (sector_nr >= max_sector) { @@ -2047,7 +2047,7 @@ static conf_t *setup_conf(mddev_t *mddev) err = -EIO; if (conf->last_used < 0) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", + printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", mdname(mddev)); goto abort; } @@ -2055,7 +2055,7 @@ static conf_t *setup_conf(mddev_t *mddev) conf->thread = md_register_thread(raid1d, mddev, NULL); if (!conf->thread) { printk(KERN_ERR - "raid1: couldn't allocate thread for %s\n", + "md/raid1:%s: couldn't allocate thread\n", mdname(mddev)); goto abort; } @@ -2081,12 +2081,12 @@ static int run(mddev_t *mddev) mdk_rdev_t *rdev; if (mddev->level != 1) { - printk("raid1: %s: raid level not set to mirroring (%d)\n", + printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", mdname(mddev), mddev->level); return -EIO; } if (mddev->reshape_position != MaxSector) { - printk("raid1: %s: reshape_position set but not supported\n", + printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", mdname(mddev)); return -EIO; } @@ -2129,11 +2129,11 @@ static int run(mddev_t *mddev) mddev->recovery_cp = MaxSector; if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "raid1: %s is not clean" + printk(KERN_NOTICE "md/raid1:%s: not clean" " -- starting background reconstruction\n", mdname(mddev)); printk(KERN_INFO - "raid1: raid set %s active with %d out of %d mirrors\n", + "md/raid1:%s: active with %d out of %d mirrors\n", mdname(mddev), mddev->raid_disks - mddev->degraded, mddev->raid_disks); @@ -2160,7 +2160,8 @@ static int stop(mddev_t *mddev) /* wait for behind writes to complete */ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop.\n", mdname(mddev)); + printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); /* need to kick something here to make sure I/O goes? */ wait_event(bitmap->behind_wait, atomic_read(&bitmap->behind_writes) == 0); @@ -2288,9 +2289,9 @@ static int raid1_reshape(mddev_t *mddev) if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) printk(KERN_WARNING - "md/raid1: cannot register " - "%s for %s\n", - nm, mdname(mddev)); + "md/raid1:%s: cannot register " + "%s\n", + mdname(mddev), nm); } if (rdev) newmirrors[d2++].rdev = rdev; -- cgit v1.2.1 From af3a2cd6b8a479345786e7fe5e199ad2f6240e56 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 8 May 2010 08:20:17 +1000 Subject: md: Fix read balancing in RAID1 and RAID10 on drives > 2TB read_balance uses a "unsigned long" for a sector number which will get truncated beyond 2TB. This will cause read-balancing to be non-optimal, and can cause data to be read from the 'wrong' branch during a resync. This has a very small chance of returning wrong data. Reported-by: Jordan Russell Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/raid1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1db02c4955a9..a9b9972ff703 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -418,7 +418,7 @@ static void raid1_end_write_request(struct bio *bio, int error) */ static int read_balance(conf_t *conf, r1bio_t *r1_bio) { - const unsigned long this_sector = r1_bio->sector; + const sector_t this_sector = r1_bio->sector; int new_disk = conf->last_used, disk = new_disk; int wonly_disk = -1; const int sectors = r1_bio->sectors; @@ -434,7 +434,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) retry: if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { - /* Choose the first operation device, for consistancy */ + /* Choose the first operational device, for consistancy */ new_disk = 0; for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); -- cgit v1.2.1