summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorJeff Garzik <jgarzik@pretzel.yyz.us>2005-06-26 18:06:06 -0400
committerJeff Garzik <jgarzik@pobox.com>2005-06-26 18:06:06 -0400
commitaef7b83c92dd0b7e994805440655d1d64147287b (patch)
tree981f373358c1988e061625e8f272013065cb086f /drivers/md
parentb1fc5505e0dbcc3fd7c75bfe6bee39ec50080963 (diff)
parent8678887e7fb43cd6c9be6c9807b05e77848e0920 (diff)
downloadtalos-op-linux-aef7b83c92dd0b7e994805440655d1d64147287b.tar.gz
talos-op-linux-aef7b83c92dd0b7e994805440655d1d64147287b.zip
Merge /spare/repo/linux-2.6/
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Makefile3
-rw-r--r--drivers/md/bitmap.c1586
-rw-r--r--drivers/md/dm-crypt.c3
-rw-r--r--drivers/md/dm-ioctl.c14
-rw-r--r--drivers/md/dm-mpath.c3
-rw-r--r--drivers/md/linear.c8
-rw-r--r--drivers/md/md.c528
-rw-r--r--drivers/md/multipath.c11
-rw-r--r--drivers/md/raid0.c12
-rw-r--r--drivers/md/raid1.c249
-rw-r--r--drivers/md/raid10.c36
-rw-r--r--drivers/md/raid5.c19
-rw-r--r--drivers/md/raid6main.c18
13 files changed, 2238 insertions, 252 deletions
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 90de9c146a5f..d3efedf6a6ad 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -7,6 +7,7 @@ dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
dm-snapshot-objs := dm-snap.o dm-exception-store.o
dm-mirror-objs := dm-log.o dm-raid1.o
+md-mod-objs := md.o bitmap.o
raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \
raid6int1.o raid6int2.o raid6int4.o \
raid6int8.o raid6int16.o raid6int32.o \
@@ -28,7 +29,7 @@ obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
obj-$(CONFIG_MD_RAID6) += raid6.o xor.o
obj-$(CONFIG_MD_MULTIPATH) += multipath.o
obj-$(CONFIG_MD_FAULTY) += faulty.o
-obj-$(CONFIG_BLK_DEV_MD) += md.o
+obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
new file mode 100644
index 000000000000..95980ad6b27b
--- /dev/null
+++ b/drivers/md/bitmap.c
@@ -0,0 +1,1586 @@
+/*
+ * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
+ *
+ * bitmap_create - sets up the bitmap structure
+ * bitmap_destroy - destroys the bitmap structure
+ *
+ * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
+ * - added disk storage for bitmap
+ * - changes to allow various bitmap chunk sizes
+ * - added bitmap daemon (to asynchronously clear bitmap bits from disk)
+ */
+
+/*
+ * Still to do:
+ *
+ * flush after percent set rather than just time based. (maybe both).
+ * wait if count gets too high, wake when it drops to half.
+ * allow bitmap to be mirrored with superblock (before or after...)
+ * allow hot-add to re-instate a current device.
+ * allow hot-add of bitmap after quiescing device
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/config.h>
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/buffer_head.h>
+#include <linux/raid/md.h>
+#include <linux/raid/bitmap.h>
+
+/* debug macros */
+
+#define DEBUG 0
+
+#if DEBUG
+/* these are for debugging purposes only! */
+
+/* define one and only one of these */
+#define INJECT_FAULTS_1 0 /* cause bitmap_alloc_page to fail always */
+#define INJECT_FAULTS_2 0 /* cause bitmap file to be kicked when first bit set*/
+#define INJECT_FAULTS_3 0 /* treat bitmap file as kicked at init time */
+#define INJECT_FAULTS_4 0 /* undef */
+#define INJECT_FAULTS_5 0 /* undef */
+#define INJECT_FAULTS_6 0
+
+/* if these are defined, the driver will fail! debug only */
+#define INJECT_FATAL_FAULT_1 0 /* fail kmalloc, causing bitmap_create to fail */
+#define INJECT_FATAL_FAULT_2 0 /* undef */
+#define INJECT_FATAL_FAULT_3 0 /* undef */
+#endif
+
+//#define DPRINTK PRINTK /* set this NULL to avoid verbose debug output */
+#define DPRINTK(x...) do { } while(0)
+
+#ifndef PRINTK
+# if DEBUG > 0
+# define PRINTK(x...) printk(KERN_DEBUG x)
+# else
+# define PRINTK(x...)
+# endif
+#endif
+
+/* name to use in log messages: the md device name, or "mdX" if none is set */
+static inline char * bmname(struct bitmap *bitmap)
+{
+	if (!bitmap->mddev)
+		return "mdX";
+	return mdname(bitmap->mddev);
+}
+
+
+/*
+ * test if the bitmap is active
+ */
+int bitmap_active(struct bitmap *bitmap)
+{
+	unsigned long irqflags;
+	int active;
+
+	/* a missing bitmap is reported as inactive */
+	if (bitmap == NULL)
+		return 0;
+
+	/* sample the BITMAP_ACTIVE bit while holding the bitmap lock */
+	spin_lock_irqsave(&bitmap->lock, irqflags);
+	active = bitmap->flags & BITMAP_ACTIVE;
+	spin_unlock_irqrestore(&bitmap->lock, irqflags);
+
+	return active;
+}
+
+#define WRITE_POOL_SIZE 256
+/* mempool for queueing pending writes on the bitmap file */
+static void *write_pool_alloc(unsigned int gfp_flags, void *data)
+{
+	/* one struct page_list per pool element; 'data' is not used */
+	struct page_list *node = kmalloc(sizeof(struct page_list), gfp_flags);
+
+	return node;
+}
+
+/* release an element obtained from write_pool_alloc(); 'data' is not used */
+static void write_pool_free(void *ptr, void *data)
+{
+ kfree(ptr);
+}
+
+/*
+ * just a placeholder - calls kmalloc for bitmap pages
+ */
+static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
+{
+ /*
+ * Returns a PAGE_SIZE buffer from kmalloc, or NULL on failure.
+ * 'bitmap' is only used to name the device in the log messages.
+ */
+ unsigned char *page;
+
+ /* INJECT_FAULTS_1 forces the failure path for debugging */
+#if INJECT_FAULTS_1
+ page = NULL;
+#else
+ page = kmalloc(PAGE_SIZE, GFP_NOIO);
+#endif
+ if (!page)
+ printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
+ else
+ PRINTK("%s: bitmap_alloc_page: allocated page at %p\n",
+ bmname(bitmap), page);
+ return page;
+}
+
+/*
+ * for now just a placeholder -- just calls kfree for bitmap pages
+ */
+static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
+{
+ /* 'bitmap' is only used to name the device in the debug message */
+ PRINTK("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
+ kfree(page);
+}
+
+/*
+ * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
+ *
+ * 1) check to see if this page is allocated, if it's not then try to alloc
+ * 2) if the alloc fails, set the page's hijacked flag so we'll use the
+ * page pointer directly as a counter
+ *
+ * if we find our page, we increment the page's refcount so that it stays
+ * allocated while we're using it
+ */
+/*
+ * Returns -EINVAL if 'page' is out of range, -ENOENT if the page is absent
+ * and 'create' is 0, and 0 in every other case (page present, newly
+ * installed, or hijacked).
+ *
+ * The function drops bitmap->lock with spin_unlock_irq() around the
+ * allocation and retakes it before returning, so it presumably must be
+ * entered with that lock held and interrupts disabled -- confirm against
+ * the callers.
+ */
+static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create)
+{
+ unsigned char *mappage;
+
+ if (page >= bitmap->pages) {
+ printk(KERN_ALERT
+ "%s: invalid bitmap page request: %lu (> %lu)\n",
+ bmname(bitmap), page, bitmap->pages-1);
+ return -EINVAL;
+ }
+
+
+ if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */
+ return 0;
+
+ if (bitmap->bp[page].map) /* page is already allocated, just return */
+ return 0;
+
+ if (!create)
+ return -ENOENT;
+
+ /* release the lock for the duration of the allocation */
+ spin_unlock_irq(&bitmap->lock);
+
+ /* this page has not been allocated yet */
+
+ if ((mappage = bitmap_alloc_page(bitmap)) == NULL) {
+ PRINTK("%s: bitmap map page allocation failed, hijacking\n",
+ bmname(bitmap));
+ /* failed - set the hijacked flag so that we can use the
+ * pointer as a counter */
+ spin_lock_irq(&bitmap->lock);
+ /* only hijack if nobody installed a page while we were unlocked */
+ if (!bitmap->bp[page].map)
+ bitmap->bp[page].hijacked = 1;
+ goto out;
+ }
+
+ /* got a page */
+
+ spin_lock_irq(&bitmap->lock);
+
+ /* recheck the page */
+
+ if (bitmap->bp[page].map || bitmap->bp[page].hijacked) {
+ /* somebody beat us to getting the page */
+ bitmap_free_page(bitmap, mappage);
+ return 0;
+ }
+
+ /* no page was in place and we have one, so install it */
+
+ memset(mappage, 0, PAGE_SIZE);
+ bitmap->bp[page].map = mappage;
+ bitmap->missing_pages--;
+out:
+ return 0;
+}
+
+
+/* if page is completely empty, put it back on the free list, or dealloc it */
+/* if page was hijacked, unmark the flag so it might get alloced next time */
+/* Note: lock should be held when calling this */
+static inline void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
+{
+ char *ptr;
+
+ if (bitmap->bp[page].count) /* page is still busy */
+ return;
+
+ /* page is no longer in use, it can be released */
+
+ if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */
+ bitmap->bp[page].hijacked = 0;
+ bitmap->bp[page].map = NULL;
+ return;
+ }
+
+ /* normal case, free the page */
+
+ /* the "#if 0" arm is disabled: pages really are freed here */
+#if 0
+/* actually ... let's not. We will probably need the page again exactly when
+ * memory is tight and we are flushing to disk
+ */
+ return;
+#else
+ ptr = bitmap->bp[page].map;
+ bitmap->bp[page].map = NULL;
+ /* one more page is now unallocated */
+ bitmap->missing_pages++;
+ bitmap_free_page(bitmap, ptr);
+ return;
+#endif
+}
+
+
+/*
+ * bitmap file handling - read and write the bitmap file and its superblock
+ */
+
+/* copy the pathname of a file to a buffer */
+char *file_path(struct file *file, char *buf, int count)
+{
+	char *path;
+
+	/* no buffer, no path */
+	if (buf == NULL)
+		return NULL;
+
+	/* resolve the file's dentry/vfsmount pair into buf */
+	path = d_path(file->f_dentry, file->f_vfsmnt, buf, count);
+	if (IS_ERR(path))
+		return NULL;
+
+	return path;
+}
+
+/*
+ * basic page I/O operations
+ */
+
+/* IO operations when bitmap is stored near all superblocks */
+/*
+ * read_sb_page - read one bitmap page stored near the array superblocks
+ * @mddev:  array whose member devices are searched
+ * @offset: offset, in sectors, of the bitmap from the device superblock
+ * @index:  page number within the bitmap; also stored in page->index
+ *
+ * Each in-sync, non-faulty member device is tried once, in list order,
+ * until a read succeeds.
+ *
+ * Returns the freshly allocated page, ERR_PTR(-ENOMEM) if no page could be
+ * allocated, or ERR_PTR(-EIO) if no usable device could supply the data.
+ * The page is freed on the -EIO path so nothing leaks.
+ */
+static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long index)
+{
+	mdk_rdev_t *rdev;
+	struct list_head *tmp;
+	struct page *page = alloc_page(GFP_KERNEL);
+	sector_t target;
+
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	ITERATE_RDEV(mddev, rdev, tmp) {
+		if (!rdev->in_sync || rdev->faulty)
+			continue;
+
+		/* sb_offset is shifted left by one to get sectors; pages are PAGE_SIZE/512 sectors */
+		target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
+
+		if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
+			page->index = index;
+			return page;
+		}
+		/* read failed: move on to the next device instead of
+		 * retrying this one forever */
+	}
+
+	/* no device could be read: give the page back before failing */
+	__free_page(page);
+	return ERR_PTR(-EIO);
+}
+
+/*
+ * Queue 'page' for writing to every in-sync, non-faulty member device at
+ * the bitmap location given by 'offset' and page->index.  When 'wait' is
+ * set, sleep until the array has no pending superblock writes.
+ * Always returns 0.
+ */
+static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait)
+{
+	mdk_rdev_t *rdev;
+	struct list_head *tmp;
+
+	ITERATE_RDEV(mddev, rdev, tmp) {
+		if (!rdev->in_sync || rdev->faulty)
+			continue;
+
+		md_super_write(mddev, rdev,
+			       (rdev->sb_offset<<1) + offset
+			       + page->index * (PAGE_SIZE/512),
+			       PAGE_SIZE,
+			       page);
+	}
+
+	if (!wait)
+		return 0;
+
+	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+	return 0;
+}
+
+/*
+ * write out a page to a file
+ */
+/*
+ * With no bitmap file the page goes out through write_sb_page().
+ * Otherwise it is written through the file's address_space operations.
+ *
+ * wait != 0: block for the page lock and write synchronously.
+ * wait == 0: return -EAGAIN if the page is locked or under writeback;
+ * otherwise start the write and queue the page on complete_pages for
+ * the writeback daemon to wait on.
+ */
+static int write_page(struct bitmap *bitmap, struct page *page, int wait)
+{
+ int ret = -ENOMEM;
+
+ if (bitmap->file == NULL)
+ return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
+
+ if (wait)
+ lock_page(page);
+ else {
+ if (TestSetPageLocked(page))
+ return -EAGAIN; /* already locked */
+ if (PageWriteback(page)) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
+ }
+
+ /* prepare and commit the whole page via the mapping's a_ops */
+ ret = page->mapping->a_ops->prepare_write(NULL, page, 0, PAGE_SIZE);
+ if (!ret)
+ ret = page->mapping->a_ops->commit_write(NULL, page, 0,
+ PAGE_SIZE);
+ if (ret) {
+ unlock_page(page);
+ return ret;
+ }
+
+ set_page_dirty(page); /* force it to be written out */
+
+ if (!wait) {
+ /* add to list to be waited for by daemon */
+ struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO);
+ item->page = page;
+ /* extra reference, dropped by whoever dequeues the item */
+ page_cache_get(page);
+ spin_lock(&bitmap->write_lock);
+ list_add(&item->list, &bitmap->complete_pages);
+ spin_unlock(&bitmap->write_lock);
+ md_wakeup_thread(bitmap->writeback_daemon);
+ }
+ return write_one_page(page, wait);
+}
+
+/* read a page from a file, pinning it into cache, and return bytes_read */
+/*
+ * Returns the page, or an ERR_PTR on failure (-EIO when the page is not
+ * up to date or has its error flag set).  *bytes_read is written only on
+ * success.
+ */
+static struct page *read_page(struct file *file, unsigned long index,
+ unsigned long *bytes_read)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct page *page = NULL;
+ loff_t isize = i_size_read(inode);
+ /* index of the page that contains the end of the file */
+ unsigned long end_index = isize >> PAGE_CACHE_SHIFT;
+
+ PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_CACHE_SIZE,
+ (unsigned long long)index << PAGE_CACHE_SHIFT);
+
+ page = read_cache_page(inode->i_mapping, index,
+ (filler_t *)inode->i_mapping->a_ops->readpage, file);
+ if (IS_ERR(page))
+ goto out;
+ wait_on_page_locked(page);
+ if (!PageUptodate(page) || PageError(page)) {
+ page_cache_release(page);
+ page = ERR_PTR(-EIO);
+ goto out;
+ }
+
+ if (index > end_index) /* we have read beyond EOF */
+ *bytes_read = 0;
+ else if (index == end_index) /* possible short read */
+ *bytes_read = isize & ~PAGE_CACHE_MASK;
+ else
+ *bytes_read = PAGE_CACHE_SIZE; /* got a full page */
+out:
+ /* shared exit: log any failure before handing back the ERR_PTR */
+ if (IS_ERR(page))
+ printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n",
+ (int)PAGE_CACHE_SIZE,
+ (unsigned long long)index << PAGE_CACHE_SHIFT,
+ PTR_ERR(page));
+ return page;
+}
+
+/*
+ * bitmap file superblock operations
+ */
+
+/* update the event counter and sync the superblock to disk */
+/*
+ * Returns 0 when there is nothing to do (no bitmap, no mddev, or no
+ * superblock page); otherwise the result of the synchronous write_page().
+ */
+int bitmap_update_sb(struct bitmap *bitmap)
+{
+ bitmap_super_t *sb;
+ unsigned long flags;
+
+ if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
+ return 0;
+ spin_lock_irqsave(&bitmap->lock, flags);
+ if (!bitmap->sb_page) { /* no superblock */
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ return 0;
+ }
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ sb = (bitmap_super_t *)kmap(bitmap->sb_page);
+ sb->events = cpu_to_le64(bitmap->mddev->events);
+ /* events_cleared only advances while the array is not degraded */
+ if (!bitmap->mddev->degraded)
+ sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
+ kunmap(bitmap->sb_page);
+ return write_page(bitmap, bitmap->sb_page, 1);
+}
+
+/* print out the bitmap file superblock */
+/*
+ * Does nothing when there is no bitmap or no superblock page.  Integer
+ * fields are converted from little-endian before printing.
+ */
+void bitmap_print_sb(struct bitmap *bitmap)
+{
+ bitmap_super_t *sb;
+
+ if (!bitmap || !bitmap->sb_page)
+ return;
+ sb = (bitmap_super_t *)kmap(bitmap->sb_page);
+ printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
+ printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
+ printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
+ /* the 16-byte uuid is printed as four raw 32-bit words */
+ printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n",
+ *(__u32 *)(sb->uuid+0),
+ *(__u32 *)(sb->uuid+4),
+ *(__u32 *)(sb->uuid+8),
+ *(__u32 *)(sb->uuid+12));
+ printk(KERN_DEBUG " events: %llu\n",
+ (unsigned long long) le64_to_cpu(sb->events));
+ printk(KERN_DEBUG "events cleared: %llu\n",
+ (unsigned long long) le64_to_cpu(sb->events_cleared));
+ printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state));
+ printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize));
+ printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
+ /* sync_size is halved to print it in KB */
+ printk(KERN_DEBUG " sync size: %llu KB\n",
+ (unsigned long long)le64_to_cpu(sb->sync_size)/2);
+ kunmap(bitmap->sb_page);
+}
+
+/*
+ * bitmap_read_sb - read the bitmap superblock and initialize bitmap fields
+ *
+ * Page 0 is read from the bitmap file when one is attached, otherwise from
+ * the area near the array superblocks.  Magic, version, chunksize and
+ * daemon sleep are validated; for persistent arrays the UUID must match
+ * the mddev, and an event count older than the mddev's marks the bitmap
+ * stale.
+ *
+ * Returns 0 on success, the read error if page 0 could not be read,
+ * -ENOSPC for a truncated superblock and -EINVAL for an invalid one.  In
+ * the last two cases the superblock is dumped with bitmap_print_sb().
+ *
+ * sb->state is little-endian on disk (bitmap_print_sb decodes it with
+ * le32_to_cpu), so it is converted on every access here.
+ */
+static int bitmap_read_sb(struct bitmap *bitmap)
+{
+	char *reason = NULL;
+	bitmap_super_t *sb;
+	unsigned long chunksize, daemon_sleep;
+	unsigned long bytes_read;
+	unsigned long long events;
+	int err = -EINVAL;
+
+	/* page 0 is the superblock, read it... */
+	if (bitmap->file)
+		bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read);
+	else {
+		bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 0);
+		bytes_read = PAGE_SIZE;
+	}
+	if (IS_ERR(bitmap->sb_page)) {
+		err = PTR_ERR(bitmap->sb_page);
+		bitmap->sb_page = NULL;
+		return err;
+	}
+
+	sb = (bitmap_super_t *)kmap(bitmap->sb_page);
+
+	if (bytes_read < sizeof(*sb)) { /* short read */
+		printk(KERN_INFO "%s: bitmap file superblock truncated\n",
+			bmname(bitmap));
+		err = -ENOSPC;
+		goto out;
+	}
+
+	chunksize = le32_to_cpu(sb->chunksize);
+	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
+
+	/* verify that the bitmap-specific fields are valid */
+	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
+		reason = "bad magic";
+	else if (sb->version != cpu_to_le32(BITMAP_MAJOR))
+		reason = "unrecognized superblock version";
+	else if (chunksize < 512 || chunksize > (1024 * 1024 * 4))
+		reason = "bitmap chunksize out of range (512B - 4MB)";
+	else if ((1 << ffz(~chunksize)) != chunksize)
+		reason = "bitmap chunksize not a power of 2";
+	else if (daemon_sleep < 1 || daemon_sleep > 15)
+		reason = "daemon sleep period out of range";
+	if (reason) {
+		printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
+			bmname(bitmap), reason);
+		goto out;
+	}
+
+	/* keep the array size field of the bitmap superblock up to date */
+	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
+
+	if (!bitmap->mddev->persistent)
+		goto success;
+
+	/*
+	 * if we have a persistent array superblock, compare the
+	 * bitmap's UUID and event counter to the mddev's
+	 */
+	if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
+		printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n",
+			bmname(bitmap));
+		goto out;
+	}
+	events = le64_to_cpu(sb->events);
+	if (events < bitmap->mddev->events) {
+		printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) "
+			"-- forcing full recovery\n", bmname(bitmap), events,
+			(unsigned long long) bitmap->mddev->events);
+		/* store the flag in on-disk byte order */
+		sb->state |= cpu_to_le32(BITMAP_STALE);
+	}
+success:
+	/* assign fields using values from superblock */
+	bitmap->chunksize = chunksize;
+	bitmap->daemon_sleep = daemon_sleep;
+	/* in-memory flags are kept in CPU byte order */
+	bitmap->flags |= le32_to_cpu(sb->state);
+	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+	err = 0;
+out:
+	kunmap(bitmap->sb_page);
+	if (err)
+		bitmap_print_sb(bitmap);
+	return err;
+}
+
+/* operation selector for bitmap_mask_state() */
+enum bitmap_mask_op {
+	MASK_SET,	/* OR the given bits into the state */
+	MASK_UNSET	/* clear the given bits from the state */
+};
+
+/*
+ * bitmap_mask_state - set or clear state bits in the in-memory superblock
+ * @bitmap: bitmap to update; NULL or a bitmap without a superblock page is
+ *          silently ignored
+ * @bits:   state bits to change
+ * @op:     MASK_SET or MASK_UNSET; anything else is a BUG()
+ *
+ * Only the mapped superblock page is modified; nothing is written to disk
+ * here.  The page pointer is sampled once under bitmap->lock and pinned
+ * with an extra reference, so the same page is mapped, unmapped and
+ * released even if bitmap->sb_page changes after the lock is dropped.
+ */
+static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
+				enum bitmap_mask_op op)
+{
+	bitmap_super_t *sb;
+	struct page *sb_page;
+	unsigned long flags;
+
+	/* test the pointer before its lock is touched */
+	if (!bitmap)
+		return;
+
+	spin_lock_irqsave(&bitmap->lock, flags);
+	sb_page = bitmap->sb_page;
+	if (!sb_page) { /* can't set the state */
+		spin_unlock_irqrestore(&bitmap->lock, flags);
+		return;
+	}
+	page_cache_get(sb_page);
+	spin_unlock_irqrestore(&bitmap->lock, flags);
+
+	sb = (bitmap_super_t *)kmap(sb_page);
+	/* sb->state is stored little-endian, convert the mask to match */
+	switch (op) {
+	case MASK_SET:
+		sb->state |= cpu_to_le32((__u32)bits);
+		break;
+	case MASK_UNSET:
+		sb->state &= cpu_to_le32(~(__u32)bits);
+		break;
+	default:
+		BUG();
+	}
+	kunmap(sb_page);
+	page_cache_release(sb_page);
+}
+
+/*
+ * general bitmap file operations
+ */
+
+/* calculate the index of the page that contains this bit */
+static inline unsigned long file_page_index(unsigned long chunk)
+{
+ /* bit position of the chunk, shifted down to a page number */
+ return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT;
+}
+
+/* calculate the (bit) offset of this bit within a page */
+static inline unsigned long file_page_offset(unsigned long chunk)
+{
+ /* masking with PAGE_BITS - 1 keeps the bit position within one page */
+ return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1);
+}
+
+/*
+ * return a pointer to the page in the filemap that contains the given bit
+ *
+ * this lookup is complicated by the fact that the bitmap sb might be exactly
+ * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
+ * 0 or page 1
+ */
+static inline struct page *filemap_get_page(struct bitmap *bitmap,
+ unsigned long chunk)
+{
+ /* filemap[] is indexed relative to the page that holds chunk 0 */
+ return bitmap->filemap[file_page_index(chunk) - file_page_index(0)];
+}
+
+
+/*
+ * Detach the file map, its attribute array and the superblock page from
+ * the bitmap under the lock, then release them outside the lock.
+ */
+static void bitmap_file_unmap(struct bitmap *bitmap)
+{
+ struct page **map, *sb_page;
+ unsigned long *attr;
+ int pages;
+ unsigned long flags;
+
+ /* take private copies and clear the bitmap's fields atomically */
+ spin_lock_irqsave(&bitmap->lock, flags);
+ map = bitmap->filemap;
+ bitmap->filemap = NULL;
+ attr = bitmap->filemap_attr;
+ bitmap->filemap_attr = NULL;
+ pages = bitmap->file_pages;
+ bitmap->file_pages = 0;
+ sb_page = bitmap->sb_page;
+ bitmap->sb_page = NULL;
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+
+ while (pages--)
+ if (map[pages]->index != 0) /* 0 is sb_page, release it below */
+ page_cache_release(map[pages]);
+ kfree(map);
+ kfree(attr);
+
+ if (sb_page)
+ page_cache_release(sb_page);
+}
+
+static void bitmap_stop_daemons(struct bitmap *bitmap);
+
+/* dequeue the next item in a page list -- don't call from irq context */
+static struct page_list *dequeue_page(struct bitmap *bitmap)
+{
+	struct list_head *queue = &bitmap->complete_pages;
+	struct page_list *item = NULL;
+
+	spin_lock(&bitmap->write_lock);
+	if (!list_empty(queue)) {
+		/* entries are added at the head, so the tail is the oldest */
+		struct list_head *oldest = queue->prev;
+
+		item = list_entry(oldest, struct page_list, list);
+		list_del(oldest);
+	}
+	spin_unlock(&bitmap->write_lock);
+
+	/* NULL when the queue was empty */
+	return item;
+}
+
+/*
+ * Empty the pending-writeback queue without waiting for the writes: drop
+ * the page reference and return each item to the pool, then wake anyone
+ * sleeping on write_wait.
+ */
+static void drain_write_queues(struct bitmap *bitmap)
+{
+	struct page_list *entry;
+
+	for (;;) {
+		entry = dequeue_page(bitmap);
+		if (!entry)
+			break;
+		page_cache_release(entry->page);
+		mempool_free(entry, bitmap->write_pool);
+	}
+
+	wake_up(&bitmap->write_wait);
+}
+
+/*
+ * Detach the bitmap file: stop the daemons, drain the write queue, drop
+ * the page mappings and finally release the file reference.
+ */
+static void bitmap_file_put(struct bitmap *bitmap)
+{
+ struct file *file;
+ struct inode *inode;
+ unsigned long flags;
+
+ /* clear bitmap->file under the lock; 'file' keeps our private copy */
+ spin_lock_irqsave(&bitmap->lock, flags);
+ file = bitmap->file;
+ bitmap->file = NULL;
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+
+ bitmap_stop_daemons(bitmap);
+
+ drain_write_queues(bitmap);
+
+ bitmap_file_unmap(bitmap);
+
+ if (file) {
+ inode = file->f_mapping->host;
+ spin_lock(&inode->i_lock);
+ atomic_set(&inode->i_writecount, 1); /* allow writes again */
+ spin_unlock(&inode->i_lock);
+ fput(file);
+ }
+}
+
+
+/*
+ * bitmap_file_kick - if an error occurs while manipulating the bitmap file
+ * then it is no longer reliable, so we stop using it and we mark the file
+ * as failed in the superblock
+ */
+static void bitmap_file_kick(struct bitmap *bitmap)
+{
+ char *path, *ptr = NULL;
+
+ /* mark the bitmap stale in the superblock and try to write that out */
+ bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET);
+ bitmap_update_sb(bitmap);
+
+ if (bitmap->file) {
+ /* scratch buffer for the path; the message copes with failure */
+ path = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (path)
+ ptr = file_path(bitmap->file, path, PAGE_SIZE);
+
+ printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n",
+ bmname(bitmap), ptr ? ptr : "");
+
+ kfree(path);
+ }
+
+ /* stop using the file and release everything attached to it */
+ bitmap_file_put(bitmap);
+
+ return;
+}
+
+/* per-page flag bits stored in bitmap->filemap_attr[], indexed by page->index */
+enum bitmap_page_attr {
+ BITMAP_PAGE_DIRTY = 1, // there are set bits that need to be synced
+ BITMAP_PAGE_CLEAN = 2, // there are bits that might need to be cleared
+ BITMAP_PAGE_NEEDWRITE=4, // there are cleared bits that need to be synced
+};
+
+/* turn on the given attribute bit(s) for this page */
+static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
+				enum bitmap_page_attr attr)
+{
+	unsigned long *slot = &bitmap->filemap_attr[page->index];
+
+	*slot |= attr;
+}
+
+/* turn off the given attribute bit(s) for this page */
+static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
+				enum bitmap_page_attr attr)
+{
+	unsigned long *slot = &bitmap->filemap_attr[page->index];
+
+	*slot &= ~attr;
+}
+
+/* fetch the full attribute word recorded for this page */
+static inline unsigned long get_page_attr(struct bitmap *bitmap, struct page *page)
+{
+	unsigned long *attrs = bitmap->filemap_attr;
+
+	return attrs[page->index];
+}
+
+/*
+ * bitmap_file_set_bit -- called before performing a write to the md device
+ * to set (and eventually sync) a particular bit in the bitmap file
+ *
+ * we set the bit immediately, then we record the page number so that
+ * when an unplug occurs, we can flush the dirty pages out to disk
+ */
+static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
+{
+ unsigned long bit;
+ struct page *page;
+ void *kaddr;
+ /* chunk number that covers this block */
+ unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
+
+ /* nothing to do when no file map is set up */
+ if (!bitmap->filemap) {
+ return;
+ }
+
+ page = filemap_get_page(bitmap, chunk);
+ bit = file_page_offset(chunk);
+
+
+ /* make sure the page stays cached until it gets written out */
+ /* only the clean -> dirty transition takes the extra reference */
+ if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY))
+ page_cache_get(page);
+
+ /* set the bit */
+ kaddr = kmap_atomic(page, KM_USER0);
+ set_bit(bit, kaddr);
+ kunmap_atomic(kaddr, KM_USER0);
+ PRINTK("set file bit %lu page %lu\n", bit, page->index);
+
+ /* record page number so it gets flushed to disk when unplug occurs */
+ set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
+
+}
+
+/* this gets called when the md device is ready to unplug its underlying
+ * (slave) device queues -- before we let any writes go down, we need to
+ * sync the dirty pages of the bitmap file to disk */
+/*
+ * Returns 0 on success (or when there is no bitmap / no file map) and 1
+ * if writing a page failed.
+ */
+int bitmap_unplug(struct bitmap *bitmap)
+{
+ unsigned long i, attr, flags;
+ struct page *page;
+ int wait = 0;
+ int err;
+
+ if (!bitmap)
+ return 0;
+
+ /* look at each page to see if there are any set bits that need to be
+ * flushed out to disk */
+ for (i = 0; i < bitmap->file_pages; i++) {
+ spin_lock_irqsave(&bitmap->lock, flags);
+ if (!bitmap->filemap) {
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ return 0;
+ }
+ page = bitmap->filemap[i];
+ /* snapshot the attributes, then clear the two write-related bits */
+ attr = get_page_attr(bitmap, page);
+ clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
+ clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
+ if ((attr & BITMAP_PAGE_DIRTY))
+ wait = 1;
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+
+ if (attr & (BITMAP_PAGE_DIRTY | BITMAP_PAGE_NEEDWRITE)) {
+ /* try without blocking first */
+ err = write_page(bitmap, page, 0);
+ if (err == -EAGAIN) {
+ /* busy page: only a DIRTY page is retried with a
+ * blocking write, otherwise the write is skipped */
+ if (attr & BITMAP_PAGE_DIRTY)
+ err = write_page(bitmap, page, 1);
+ else
+ err = 0;
+ }
+ if (err)
+ return 1;
+ }
+ }
+ if (wait) { /* if any writes were performed, we need to wait on them */
+ if (bitmap->file) {
+ /* sleep until the writeback daemon has emptied complete_pages */
+ spin_lock_irq(&bitmap->write_lock);
+ wait_event_lock_irq(bitmap->write_wait,
+ list_empty(&bitmap->complete_pages), bitmap->write_lock,
+ wake_up_process(bitmap->writeback_daemon->tsk));
+ spin_unlock_irq(&bitmap->write_lock);
+ } else
+ wait_event(bitmap->mddev->sb_wait,
+ atomic_read(&bitmap->mddev->pending_writes)==0);
+ }
+ return 0;
+}
+
+static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset,
+ unsigned long sectors, int in_sync);
+/*
+ * bitmap_init_from_disk -- called at bitmap_create time to initialize
+ * the in-memory bitmap from the on-disk bitmap -- also, sets up the
+ * memory mapping of the bitmap file
+ * Special cases:
+ * if there's no bitmap file, or if the bitmap file had been
+ * previously kicked from the array, we mark all the bits as
+ * 1's in order to cause a full resync.
+ */
+/*
+ * Returns 0 on success, -ENOSPC if the bitmap file is too short, -ENOMEM
+ * if the page arrays cannot be allocated, or the error from reading or
+ * writing a page.
+ */
+static int bitmap_init_from_disk(struct bitmap *bitmap, int in_sync)
+{
+ unsigned long i, chunks, index, oldindex, bit;
+ struct page *page = NULL, *oldpage = NULL;
+ unsigned long num_pages, bit_cnt = 0;
+ struct file *file;
+ unsigned long bytes, offset, dummy;
+ int outofdate;
+ int ret = -ENOSPC;
+
+ chunks = bitmap->chunks;
+ file = bitmap->file;
+
+ /* the bitmap must live either in a file or at an offset */
+ BUG_ON(!file && !bitmap->offset);
+
+#if INJECT_FAULTS_3
+ outofdate = 1;
+#else
+ outofdate = bitmap->flags & BITMAP_STALE;
+#endif
+ if (outofdate)
+ printk(KERN_INFO "%s: bitmap file is out of date, doing full "
+ "recovery\n", bmname(bitmap));
+
+ /* one bit per chunk, rounded up to whole bytes */
+ bytes = (chunks + 7) / 8;
+
+ /* pages needed for the superblock plus the bits, rounded up */
+ num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) {
+ printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
+ bmname(bitmap),
+ (unsigned long) i_size_read(file->f_mapping->host),
+ bytes + sizeof(bitmap_super_t));
+ goto out;
+ }
+
+ ret = -ENOMEM;
+
+ bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
+ if (!bitmap->filemap)
+ goto out;
+
+ bitmap->filemap_attr = kmalloc(sizeof(long) * num_pages, GFP_KERNEL);
+ if (!bitmap->filemap_attr)
+ goto out;
+
+ memset(bitmap->filemap_attr, 0, sizeof(long) * num_pages);
+
+ /* impossible index so the first chunk always loads a page */
+ oldindex = ~0L;
+
+ for (i = 0; i < chunks; i++) {
+ index = file_page_index(i);
+ bit = file_page_offset(i);
+ if (index != oldindex) { /* this is a new page, read it in */
+ /* unmap the old page, we're done with it */
+ if (oldpage != NULL)
+ kunmap(oldpage);
+ if (index == 0) {
+ /*
+ * if we're here then the superblock page
+ * contains some bits (PAGE_SIZE != sizeof sb)
+ * we've already read it in, so just use it
+ */
+ page = bitmap->sb_page;
+ offset = sizeof(bitmap_super_t);
+ } else if (file) {
+ page = read_page(file, index, &dummy);
+ offset = 0;
+ } else {
+ page = read_sb_page(bitmap->mddev, bitmap->offset, index);
+ offset = 0;
+ }
+ if (IS_ERR(page)) { /* read error */
+ ret = PTR_ERR(page);
+ goto out;
+ }
+
+ oldindex = index;
+ oldpage = page;
+ kmap(page);
+
+ if (outofdate) {
+ /*
+ * if bitmap is out of date, dirty the
+ * whole page and write it out
+ */
+ /* 'offset' keeps the superblock bytes on page 0 intact */
+ memset(page_address(page) + offset, 0xff,
+ PAGE_SIZE - offset);
+ ret = write_page(bitmap, page, 1);
+ if (ret) {
+ kunmap(page);
+ /* release, page not in filemap yet */
+ page_cache_release(page);
+ goto out;
+ }
+ }
+
+ bitmap->filemap[bitmap->file_pages++] = page;
+ }
+ if (test_bit(bit, page_address(page))) {
+ /* if the disk bit is set, set the memory bit */
+ bitmap_set_memory_bits(bitmap,
+ i << CHUNK_BLOCK_SHIFT(bitmap), 1, in_sync);
+ bit_cnt++;
+ }
+ }
+
+ /* everything went OK */
+ ret = 0;
+ bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
+
+ if (page) /* unmap the last page */
+ kunmap(page);
+
+ if (bit_cnt) { /* Kick recovery if any bits were set */
+ set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
+ md_wakeup_thread(bitmap->mddev->thread);
+ }
+
+out:
+ /* reached on success and on every failure; 'ret' carries the status */
+ printk(KERN_INFO "%s: bitmap initialized from disk: "
+ "read %lu/%lu pages, set %lu bits, status: %d\n",
+ bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, ret);
+
+ return ret;
+}
+
+/*
+ * Flag every page of the bitmap (superblock included) as NEEDWRITE.
+ * Nothing is written here; only the attribute bits are set.
+ */
+void bitmap_write_all(struct bitmap *bitmap)
+{
+	unsigned long nr_chunks = bitmap->chunks;
+	unsigned long nr_bytes = (nr_chunks+7)/8 + sizeof(bitmap_super_t);
+	unsigned long nr_pages = (nr_bytes + PAGE_SIZE-1) / PAGE_SIZE;
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++)
+		bitmap->filemap_attr[i] |= BITMAP_PAGE_NEEDWRITE;
+}
+
+
+/*
+ * Adjust the usage count of the counter page that covers sector 'offset'
+ * by 'inc', then let bitmap_checkfree() release the page if it is unused.
+ */
+static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
+{
+	/* sector -> chunk -> index of the counter page holding that chunk */
+	sector_t chunk_nr = offset >> CHUNK_BLOCK_SHIFT(bitmap);
+	unsigned long page_nr = chunk_nr >> PAGE_COUNTER_SHIFT;
+
+	bitmap->bp[page_nr].count += inc;
+
+	bitmap_checkfree(bitmap, page_nr);
+}
+static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
+ sector_t offset, int *blocks,
+ int create);
+
+/*
+ * bitmap daemon -- periodically wakes up to clean bits and flush pages
+ * out to disk
+ */
+
+/*
+ * Runs at most once per daemon_sleep seconds.  Walks every chunk: a
+ * counter of 2 is aged to 1, a counter of 1 is zeroed and its on-disk bit
+ * cleared.  Pages flagged NEEDWRITE are written without blocking.
+ *
+ * Returns 0, or the error of the last failed write_page() (-EAGAIN is
+ * mapped to 0 after the page is re-flagged).
+ */
+int bitmap_daemon_work(struct bitmap *bitmap)
+{
+ unsigned long j;
+ unsigned long flags;
+ struct page *page = NULL, *lastpage = NULL;
+ int err = 0;
+ int blocks;
+ int attr;
+
+ if (bitmap == NULL)
+ return 0;
+ /* rate limit: do nothing until daemon_sleep seconds have passed */
+ if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ))
+ return 0;
+ bitmap->daemon_lastrun = jiffies;
+
+ for (j = 0; j < bitmap->chunks; j++) {
+ bitmap_counter_t *bmc;
+ spin_lock_irqsave(&bitmap->lock, flags);
+ if (!bitmap->filemap) {
+ /* error or shutdown */
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ break;
+ }
+
+ page = filemap_get_page(bitmap, j);
+
+ if (page != lastpage) {
+ /* skip this page unless it's marked as needing cleaning */
+ if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) {
+ if (attr & BITMAP_PAGE_NEEDWRITE) {
+ /* pin the page across the unlocked write below */
+ page_cache_get(page);
+ clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
+ }
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ if (attr & BITMAP_PAGE_NEEDWRITE) {
+ switch (write_page(bitmap, page, 0)) {
+ case -EAGAIN:
+ /* page busy: re-flag it so a later pass retries */
+ set_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
+ break;
+ case 0:
+ break;
+ default:
+ bitmap_file_kick(bitmap);
+ }
+ page_cache_release(page);
+ }
+ continue;
+ }
+
+ /* grab the new page, sync and release the old */
+ page_cache_get(page);
+ if (lastpage != NULL) {
+ if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) {
+ clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ err = write_page(bitmap, lastpage, 0);
+ if (err == -EAGAIN) {
+ err = 0;
+ set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
+ }
+ } else {
+ /* not flagged yet: flag it now, write on a later pass */
+ set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ }
+ kunmap(lastpage);
+ page_cache_release(lastpage);
+ /*
+ * NOTE(review): bitmap_file_kick() ends in
+ * bitmap_file_unmap(), which sets filemap_attr to NULL,
+ * yet the clear_page_attr() further down still runs
+ * for this iteration -- confirm this cannot oops.
+ */
+ if (err)
+ bitmap_file_kick(bitmap);
+ } else
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ lastpage = page;
+ kmap(page);
+/*
+ printk("bitmap clean at page %lu\n", j);
+*/
+ spin_lock_irqsave(&bitmap->lock, flags);
+ clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
+ }
+ /* create == 0: look the counter up, never allocate */
+ bmc = bitmap_get_counter(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
+ &blocks, 0);
+ if (bmc) {
+/*
+ if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc);
+*/
+ if (*bmc == 2) {
+ *bmc=1; /* maybe clear the bit next time */
+ set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
+ } else if (*bmc == 1) {
+ /* we can clear the bit */
+ *bmc = 0;
+ bitmap_count_page(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
+ -1);
+
+ /* clear the bit */
+ clear_bit(file_page_offset(j), page_address(page));
+ }
+ }
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ }
+
+ /* now sync the final page */
+ if (lastpage != NULL) {
+ kunmap(lastpage);
+ spin_lock_irqsave(&bitmap->lock, flags);
+ if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) {
+ clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ err = write_page(bitmap, lastpage, 0);
+ if (err == -EAGAIN) {
+ set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
+ err = 0;
+ }
+ } else {
+ set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ }
+
+ page_cache_release(lastpage);
+ }
+
+ return err;
+}
+
+/*
+ * Called by a bitmap daemon that wants to terminate itself.
+ *
+ * If *daemon is still set, no one is waiting on us: clear the pointer
+ * under bitmap->lock, free the md thread struct and exit the thread.
+ * If *daemon is already NULL, simply return and let the waiter clean
+ * things up.
+ */
+static void daemon_exit(struct bitmap *bitmap, mdk_thread_t **daemon)
+{
+	unsigned long flags;
+	mdk_thread_t *self;
+
+	spin_lock_irqsave(&bitmap->lock, flags);
+	self = *daemon;
+	if (!self) {
+		/* a waiter has taken over the cleanup */
+		spin_unlock_irqrestore(&bitmap->lock, flags);
+		return;
+	}
+	*daemon = NULL;
+	spin_unlock_irqrestore(&bitmap->lock, flags);
+
+	kfree(self);
+	complete_and_exit(NULL, 0); /* do_exit not exported */
+}
+
+/*
+ * Body of the bitmap writeback daemon.
+ *
+ * Drains the queue of pages returned by dequeue_page(), waiting for the
+ * writeback of each one to finish and dropping the reference taken on
+ * it.  On the first page that reports an error the bitmap file is kicked
+ * and the daemon exits via daemon_exit(); the same exit path is taken if
+ * a signal is pending on entry.  Waiters on bitmap->write_wait are woken
+ * in every case.
+ */
+static void bitmap_writeback_daemon(mddev_t *mddev)
+{
+	struct bitmap *bitmap = mddev->bitmap;
+	struct page *page;
+	struct page_list *item;
+	unsigned long index;
+	int err = 0;
+
+	if (signal_pending(current)) {
+		printk(KERN_INFO
+		       "%s: bitmap writeback daemon got signal, exiting...\n",
+		       bmname(bitmap));
+		err = -EINTR;
+		goto out;
+	}
+
+	PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
+	/* wait on bitmap page writebacks */
+	while ((item = dequeue_page(bitmap))) {
+		page = item->page;
+		mempool_free(item, bitmap->write_pool);
+		PRINTK("wait on page writeback: %p\n", page);
+		wait_on_page_writeback(page);
+		PRINTK("finished page writeback: %p\n", page);
+
+		/*
+		 * Collect everything we need from the page *before* dropping
+		 * our reference: once released, the page must not be touched
+		 * again (it may have been our last reference).
+		 */
+		err = PageError(page);
+		index = page->index;
+		page_cache_release(page);
+		if (err) {
+			printk(KERN_WARNING "%s: bitmap file writeback "
+			       "failed (page %lu): %d\n",
+			       bmname(bitmap), index, err);
+			bitmap_file_kick(bitmap);
+			goto out;
+		}
+	}
+ out:
+	wake_up(&bitmap->write_wait);
+	if (err) {
+		printk(KERN_INFO "%s: bitmap writeback daemon exiting (%d)\n",
+		       bmname(bitmap), err);
+		daemon_exit(bitmap, &bitmap->writeback_daemon);
+	}
+}
+
+/*
+ * Start one bitmap daemon thread running 'func' and publish it in *ptr.
+ *
+ * *ptr is reset to NULL first.  If the bitmap has no backing file no
+ * thread is created and 0 is returned with *ptr left NULL.  Otherwise the
+ * thread is registered through md_register_thread(), stored in *ptr and
+ * woken up.
+ *
+ * Returns 0 on success (or when no daemon is needed), -ECHILD if the
+ * thread could not be registered.
+ */
+static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr,
+ void (*func)(mddev_t *), char *name)
+{
+ mdk_thread_t *daemon;
+ unsigned long flags;
+ char namebuf[32];
+
+ spin_lock_irqsave(&bitmap->lock, flags);
+ *ptr = NULL;
+
+ if (!bitmap->file) /* no need for daemon if there's no backing file */
+ goto out_unlock;
+
+ /* drop the lock while the thread is being registered */
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+
+#if INJECT_FATAL_FAULT_2
+ /* fault injection: behave as if thread registration had failed */
+ daemon = NULL;
+#else
+ /* "%%" yields a literal '%', so namebuf holds the format "%s_<name>";
+  * NOTE(review): presumably md's thread code fills in the array name
+  * later - confirm against md_register_thread()/md_thread(), and that
+  * this on-stack buffer is not referenced after we return */
+ sprintf(namebuf, "%%s_%s", name);
+ daemon = md_register_thread(func, bitmap->mddev, namebuf);
+#endif
+ if (!daemon) {
+ printk(KERN_ERR "%s: failed to start bitmap daemon\n",
+ bmname(bitmap));
+ return -ECHILD;
+ }
+
+ /* publish the new thread under the lock */
+ spin_lock_irqsave(&bitmap->lock, flags);
+ *ptr = daemon;
+
+ md_wakeup_thread(daemon); /* start it running */
+
+ PRINTK("%s: %s daemon (pid %d) started...\n",
+ bmname(bitmap), name, daemon->tsk->pid);
+out_unlock:
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ return 0;
+}
+
+/*
+ * Start every daemon the bitmap needs; currently that is only the
+ * writeback daemon.  Returns whatever bitmap_start_daemon() returns.
+ */
+static int bitmap_start_daemons(struct bitmap *bitmap)
+{
+	return bitmap_start_daemon(bitmap, &bitmap->writeback_daemon,
+				   bitmap_writeback_daemon, "bitmap_wb");
+}
+
+/*
+ * Detach the daemon stored in *ptr (under bitmap->lock) and, if there
+ * was one, unregister its thread.  *ptr is NULL on return.
+ */
+static void bitmap_stop_daemon(struct bitmap *bitmap, mdk_thread_t **ptr)
+{
+	unsigned long flags;
+	mdk_thread_t *victim;
+
+	spin_lock_irqsave(&bitmap->lock, flags);
+	victim = *ptr;
+	*ptr = NULL;
+	spin_unlock_irqrestore(&bitmap->lock, flags);
+
+	if (!victim)
+		return;
+
+	md_unregister_thread(victim); /* destroy the thread */
+}
+
+/*
+ * Stop the bitmap's daemons, unless the caller *is* the daemon:
+ * the daemons can't stop themselves... they'll just exit instead...
+ */
+static void bitmap_stop_daemons(struct bitmap *bitmap)
+{
+	if (!bitmap->writeback_daemon)
+		return;
+
+	/* called from the writeback daemon itself: nothing to do here */
+	if (current->pid == bitmap->writeback_daemon->tsk->pid)
+		return;
+
+	bitmap_stop_daemon(bitmap, &bitmap->writeback_daemon);
+}
+
+/*
+ * Look up the in-memory counter covering sector 'offset'.
+ *
+ * On return *blocks holds the number of sectors from 'offset' up to the
+ * end of the region the returned counter covers (one chunk normally, a
+ * larger span when the page pointer is hijacked).  Returns NULL when
+ * bitmap_checkpage() fails; *blocks is still set in that case.
+ */
+static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
+					    sector_t offset, int *blocks,
+					    int create)
+{
+	/* If 'create', we might release the lock and reclaim it.
+	 * The lock must have been taken with interrupts enabled.
+	 * If !create, we don't release the lock.
+	 */
+	sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
+	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
+	unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
+	bitmap_counter_t *counter;
+	sector_t csize;
+
+	if (bitmap_checkpage(bitmap, page, create) < 0) {
+		csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
+		*blocks = csize - (offset & (csize- 1));
+		return NULL;
+	}
+	/* now locked ... */
+
+	if (bitmap->bp[page].hijacked) {
+		/* hijacked pointer: the pointer field itself holds two
+		 * counters; pick the first or the second one */
+		int hi = (pageoff > PAGE_COUNTER_MASK);
+
+		csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) +
+					  PAGE_COUNTER_SHIFT - 1);
+		counter = &((bitmap_counter_t *)
+			    &bitmap->bp[page].map)[hi];
+	} else {
+		/* page is allocated: one counter per chunk */
+		csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
+		counter = (bitmap_counter_t *)
+			&(bitmap->bp[page].map[pageoff]);
+	}
+
+	*blocks = csize - (offset & (csize- 1));
+	return counter;
+}
+
+/*
+ * Account for a write that is about to start on 'sectors' sectors from
+ * 'offset'.  For every counter covering the range: a counter at 0 gets
+ * its on-disk bit set, its page counted and the queue plugged; a counter
+ * at 0 or 1 is raised to 2; then the counter is incremented.
+ *
+ * Always returns 0 (also when there is no bitmap or a counter cannot be
+ * obtained, in which case processing stops early).
+ */
+int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors)
+{
+	bitmap_counter_t *counter;
+	int blocks;
+
+	if (!bitmap)
+		return 0;
+
+	while (sectors) {
+		spin_lock_irq(&bitmap->lock);
+		counter = bitmap_get_counter(bitmap, offset, &blocks, 1);
+		if (!counter) {
+			spin_unlock_irq(&bitmap->lock);
+			return 0;
+		}
+
+		if (*counter == 0) {
+			/* first write to this chunk: mark it dirty */
+			bitmap_file_set_bit(bitmap, offset);
+			bitmap_count_page(bitmap,offset, 1);
+			blk_plug_device(bitmap->mddev->queue);
+			*counter = 2;
+		} else if (*counter == 1) {
+			*counter = 2;
+		}
+
+		/* the counter must never overflow */
+		if ((*counter & COUNTER_MAX) == COUNTER_MAX)
+			BUG();
+		(*counter)++;
+
+		spin_unlock_irq(&bitmap->lock);
+
+		offset += blocks;
+		sectors = (sectors > blocks) ? sectors - blocks : 0;
+	}
+	return 0;
+}
+
+/*
+ * Account for the completion of a write on 'sectors' sectors from
+ * 'offset'.  For every counter covering the range: set NEEDED_MASK if
+ * the write failed, decrement the counter, and once it has dropped to 2
+ * or below flag the backing file page with BITMAP_PAGE_CLEAN.
+ * Stops early if a counter cannot be found.
+ */
+void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
+		     int success)
+{
+	bitmap_counter_t *counter;
+	unsigned long flags;
+	int blocks;
+
+	if (!bitmap)
+		return;
+
+	while (sectors) {
+		spin_lock_irqsave(&bitmap->lock, flags);
+		counter = bitmap_get_counter(bitmap, offset, &blocks, 0);
+		if (!counter) {
+			spin_unlock_irqrestore(&bitmap->lock, flags);
+			return;
+		}
+
+		/* a failed write leaves the chunk in need of a resync */
+		if (!success && ! (*counter & NEEDED_MASK))
+			*counter |= NEEDED_MASK;
+
+		(*counter)--;
+		if (*counter <= 2)
+			set_page_attr(bitmap,
+				      filemap_get_page(bitmap,
+					offset >> CHUNK_BLOCK_SHIFT(bitmap)),
+				      BITMAP_PAGE_CLEAN);
+
+		spin_unlock_irqrestore(&bitmap->lock, flags);
+
+		offset += blocks;
+		sectors = (sectors > blocks) ? sectors - blocks : 0;
+	}
+}
+
+/*
+ * Decide whether the region at 'offset' has to be resynced.
+ *
+ * Returns 1 if the counter already has RESYNC set, or has NEEDED set (in
+ * which case NEEDED is traded for RESYNC); 0 otherwise.  Without a
+ * bitmap the answer is always 1 and *blocks is set to 1024; otherwise
+ * *blocks is filled in by bitmap_get_counter().
+ */
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks)
+{
+	bitmap_counter_t *counter;
+	int rv = 0;
+
+	if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
+		*blocks = 1024;
+		return 1; /* always resync if no bitmap */
+	}
+
+	spin_lock_irq(&bitmap->lock);
+	counter = bitmap_get_counter(bitmap, offset, blocks, 0);
+	if (!counter)
+		goto unlock;
+
+	/* locked */
+	if (RESYNC(*counter)) {
+		rv = 1;
+		goto unlock;
+	}
+	if (NEEDED(*counter)) {
+		rv = 1;
+		*counter |= RESYNC_MASK;
+		*counter &= ~NEEDED_MASK;
+	}
+ unlock:
+	spin_unlock_irq(&bitmap->lock);
+	return rv;
+}
+
+/*
+ * Finish (or abort) the resync of the region at 'offset'.
+ *
+ * If the counter has RESYNC set it is cleared.  When the sync was
+ * aborted and NEEDED is not set, NEEDED is set again; otherwise, if the
+ * counter is 2 or below, the backing file page is flagged with
+ * BITMAP_PAGE_CLEAN.  Without a bitmap only *blocks is set (to 1024).
+ */
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted)
+{
+	bitmap_counter_t *counter;
+	unsigned long flags;
+
+	if (bitmap == NULL) {
+		*blocks = 1024;
+		return;
+	}
+
+	spin_lock_irqsave(&bitmap->lock, flags);
+	counter = bitmap_get_counter(bitmap, offset, blocks, 0);
+	if (counter == NULL)
+		goto unlock;
+	/* locked */
+	if (!RESYNC(*counter))
+		goto unlock;
+
+	*counter &= ~RESYNC_MASK;
+
+	if (!NEEDED(*counter) && aborted)
+		*counter |= NEEDED_MASK;
+	else if (*counter <= 2)
+		set_page_attr(bitmap,
+			      filemap_get_page(bitmap,
+				offset >> CHUNK_BLOCK_SHIFT(bitmap)),
+			      BITMAP_PAGE_CLEAN);
+ unlock:
+	spin_unlock_irqrestore(&bitmap->lock, flags);
+}
+
+/*
+ * Sync has finished, and any bitmap chunks that weren't synced
+ * properly have been aborted.  Walk the whole device and clear the
+ * RESYNC bit wherever it is still on.
+ */
+void bitmap_close_sync(struct bitmap *bitmap)
+{
+	sector_t sector;
+	int blocks;
+
+	if (!bitmap)
+		return;
+
+	/* bitmap_end_sync() reports in 'blocks' how far to step */
+	for (sector = 0;
+	     sector < bitmap->mddev->resync_max_sectors;
+	     sector += blocks)
+		bitmap_end_sync(bitmap, sector, &blocks, 0);
+}
+
+/*
+ * For each chunk covered by any of these sectors, set the counter to 1
+ * and set resync_needed unless in_sync.  They should all be 0 at this
+ * point; a counter that is already non-zero is left untouched.
+ *
+ * Stops early if a counter cannot be obtained.
+ */
+static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset,
+				   unsigned long sectors, int in_sync)
+{
+	while (sectors) {
+		int secs;
+		bitmap_counter_t *bmc;
+
+		spin_lock_irq(&bitmap->lock);
+		bmc = bitmap_get_counter(bitmap, offset, &secs, 1);
+		if (!bmc) {
+			spin_unlock_irq(&bitmap->lock);
+			return;
+		}
+		if (! *bmc) {
+			struct page *page;
+
+			*bmc = 1 | (in_sync? 0 : NEEDED_MASK);
+			bitmap_count_page(bitmap, offset, 1);
+			page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
+			set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
+		}
+		spin_unlock_irq(&bitmap->lock);
+
+		/*
+		 * Move on to the next counter.  Without this the loop would
+		 * keep revisiting the first chunk and never initialise the
+		 * remaining ones in the range.
+		 */
+		offset += secs;
+		if (sectors > secs)
+			sectors -= secs;
+		else
+			sectors = 0;
+	}
+}
+
+/*
+ * free memory that was allocated
+ *
+ * Disconnects the bitmap from the md device, releases the bitmap file
+ * and frees the write pool, every allocated counter page, the page table
+ * and the bitmap itself.  Safe to call when there is no bitmap, and on a
+ * bitmap that bitmap_create() only partially set up.
+ */
+void bitmap_destroy(mddev_t *mddev)
+{
+	unsigned long k, pages;
+	struct bitmap_page *bp;
+	struct bitmap *bitmap = mddev->bitmap;
+
+	if (!bitmap) /* there was no bitmap */
+		return;
+
+	mddev->bitmap = NULL; /* disconnect from the md device */
+
+	/* release the bitmap file and kill the daemon */
+	bitmap_file_put(bitmap);
+
+	bp = bitmap->bp;
+	pages = bitmap->pages;
+
+	/* free all allocated memory */
+
+	/*
+	 * bitmap_create() can fail before (or while) creating the pool,
+	 * leaving write_pool NULL; mempool_destroy() is not guaranteed to
+	 * accept a NULL pool, so only destroy what was really created.
+	 */
+	if (bitmap->write_pool)
+		mempool_destroy(bitmap->write_pool);
+
+	if (bp) /* deallocate the page memory */
+		for (k = 0; k < pages; k++)
+			if (bp[k].map && !bp[k].hijacked)
+				kfree(bp[k].map);
+	kfree(bp);
+	kfree(bitmap);
+}
+
+/*
+ * initialize the bitmap structure
+ * if this returns an error, bitmap_destroy must be called to do clean up
+ *
+ * Returns 0 when the array has no bitmap configured (neither a bitmap
+ * file nor a bitmap offset) or when the bitmap was set up successfully,
+ * -ENOMEM on allocation failure, or the error reported by one of the
+ * helpers (superblock read, load from disk, daemon start, superblock
+ * update).
+ */
+int bitmap_create(mddev_t *mddev)
+{
+	struct bitmap *bitmap;
+	unsigned long blocks = mddev->resync_max_sectors;
+	unsigned long chunks;
+	unsigned long pages;
+	struct file *file = mddev->bitmap_file;
+	int err;
+
+	BUG_ON(sizeof(bitmap_super_t) != 256);
+
+	if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */
+		return 0;
+
+	BUG_ON(file && mddev->bitmap_offset);
+
+	bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL);
+	if (!bitmap)
+		return -ENOMEM;
+
+	memset(bitmap, 0, sizeof(*bitmap));
+
+	spin_lock_init(&bitmap->lock);
+	bitmap->mddev = mddev;
+	/* attach early so that bitmap_destroy() can clean up after us */
+	mddev->bitmap = bitmap;
+
+	spin_lock_init(&bitmap->write_lock);
+	INIT_LIST_HEAD(&bitmap->complete_pages);
+	init_waitqueue_head(&bitmap->write_wait);
+	bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc,
+					    write_pool_free, NULL);
+	if (!bitmap->write_pool)
+		return -ENOMEM;
+
+	bitmap->file = file;
+	bitmap->offset = mddev->bitmap_offset;
+	if (file) get_file(file);
+	/* read superblock from bitmap file (this sets bitmap->chunksize) */
+	err = bitmap_read_sb(bitmap);
+	if (err)
+		return err;
+
+	/*
+	 * chunkshift is the position of the (single) set bit of chunksize.
+	 * find_first_bit() takes the search length in *bits*, not bytes;
+	 * passing a byte count would cap the result at sizeof(long) and
+	 * give a wrong shift for every realistic chunk size.
+	 */
+	bitmap->chunkshift = find_first_bit(&bitmap->chunksize,
+					    sizeof(bitmap->chunksize) * 8);
+
+	/* now that chunksize and chunkshift are set, we can use these macros */
+	chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) /
+			CHUNK_BLOCK_RATIO(bitmap);
+	pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
+
+	BUG_ON(!pages);
+
+	bitmap->chunks = chunks;
+	bitmap->pages = pages;
+	bitmap->missing_pages = pages;
+	bitmap->counter_bits = COUNTER_BITS;
+
+	bitmap->syncchunk = ~0UL;
+
+#if INJECT_FATAL_FAULT_1
+	/* fault injection: behave as if the allocation had failed */
+	bitmap->bp = NULL;
+#else
+	bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
+#endif
+	if (!bitmap->bp)
+		return -ENOMEM;
+	memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp));
+
+	bitmap->flags |= BITMAP_ACTIVE;
+
+	/* now that we have some pages available, initialize the in-memory
+	 * bitmap from the on-disk bitmap */
+	err = bitmap_init_from_disk(bitmap, mddev->recovery_cp == MaxSector);
+	if (err)
+		return err;
+
+	printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
+	       pages, bmname(bitmap));
+
+	/* kick off the bitmap daemons */
+	err = bitmap_start_daemons(bitmap);
+	if (err)
+		return err;
+	return bitmap_update_sb(bitmap);
+}
+
+/* the bitmap API -- for raid personalities */
+EXPORT_SYMBOL(bitmap_startwrite);
+EXPORT_SYMBOL(bitmap_endwrite);
+EXPORT_SYMBOL(bitmap_start_sync);
+EXPORT_SYMBOL(bitmap_end_sync);
+EXPORT_SYMBOL(bitmap_unplug);
+EXPORT_SYMBOL(bitmap_close_sync);
+EXPORT_SYMBOL(bitmap_daemon_work);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0dd6c2b5391b..d0a4bab220e5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -704,8 +704,7 @@ static void crypt_dtr(struct dm_target *ti)
mempool_destroy(cc->page_pool);
mempool_destroy(cc->io_pool);
- if (cc->iv_mode)
- kfree(cc->iv_mode);
+ kfree(cc->iv_mode);
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
cc->iv_gen_ops->dtr(cc);
crypto_free_tfm(cc->tfm);
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index ee3c869d9701..200a0688f717 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -122,14 +122,6 @@ static struct hash_cell *__get_uuid_cell(const char *str)
/*-----------------------------------------------------------------
* Inserting, removing and renaming a device.
*---------------------------------------------------------------*/
-static inline char *kstrdup(const char *str)
-{
- char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
- if (r)
- strcpy(r, str);
- return r;
-}
-
static struct hash_cell *alloc_cell(const char *name, const char *uuid,
struct mapped_device *md)
{
@@ -139,7 +131,7 @@ static struct hash_cell *alloc_cell(const char *name, const char *uuid,
if (!hc)
return NULL;
- hc->name = kstrdup(name);
+ hc->name = kstrdup(name, GFP_KERNEL);
if (!hc->name) {
kfree(hc);
return NULL;
@@ -149,7 +141,7 @@ static struct hash_cell *alloc_cell(const char *name, const char *uuid,
hc->uuid = NULL;
else {
- hc->uuid = kstrdup(uuid);
+ hc->uuid = kstrdup(uuid, GFP_KERNEL);
if (!hc->uuid) {
kfree(hc->name);
kfree(hc);
@@ -273,7 +265,7 @@ static int dm_hash_rename(const char *old, const char *new)
/*
* duplicate new.
*/
- new_name = kstrdup(new);
+ new_name = kstrdup(new, GFP_KERNEL);
if (!new_name)
return -ENOMEM;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 1e97b3c12bd5..0c1b8520ef86 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -985,6 +985,9 @@ static int do_end_io(struct multipath *m, struct bio *bio,
if (!error)
return 0; /* I/O complete */
+ if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+ return error;
+
spin_lock(&m->lock);
if (!m->nr_valid_paths) {
if (!m->queue_if_no_path || m->suspended) {
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 161e9aa87291..8d740013d74d 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -217,8 +217,7 @@ static int linear_run (mddev_t *mddev)
return 0;
out:
- if (conf)
- kfree(conf);
+ kfree(conf);
return 1;
}
@@ -269,9 +268,8 @@ static int linear_make_request (request_queue_t *q, struct bio *bio)
* split it.
*/
struct bio_pair *bp;
- bp = bio_split(bio, bio_split_pool,
- (bio->bi_sector + (bio->bi_size >> 9) -
- (tmp_dev->offset + tmp_dev->size))<<1);
+ bp = bio_split(bio, bio_split_pool,
+ ((tmp_dev->offset + tmp_dev->size)<<1) - bio->bi_sector);
if (linear_make_request(q, &bp->bio1))
generic_make_request(&bp->bio1);
if (linear_make_request(q, &bp->bio2))
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d899204d3743..3802f7a17f16 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -19,6 +19,9 @@
Neil Brown <neilb@cse.unsw.edu.au>.
+ - persistent bitmap code
+ Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
@@ -33,6 +36,7 @@
#include <linux/config.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
+#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
@@ -40,6 +44,8 @@
#include <linux/init.h>
+#include <linux/file.h>
+
#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif
@@ -189,8 +195,7 @@ static mddev_t * mddev_find(dev_t unit)
if (mddev->unit == unit) {
mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
- if (new)
- kfree(new);
+ kfree(new);
return mddev;
}
@@ -218,6 +223,8 @@ static mddev_t * mddev_find(dev_t unit)
INIT_LIST_HEAD(&new->all_mddevs);
init_timer(&new->safemode_timer);
atomic_set(&new->active, 1);
+ spin_lock_init(&new->write_lock);
+ init_waitqueue_head(&new->sb_wait);
new->queue = blk_alloc_queue(GFP_KERNEL);
if (!new->queue) {
@@ -320,6 +327,40 @@ static void free_disk_sb(mdk_rdev_t * rdev)
}
+/*
+ * Completion handler for the superblock writes issued by md_super_write().
+ *
+ * Returns 1 while the bio still has bytes outstanding.  Once the bio is
+ * complete: reports a failed write through md_error(), drops the array's
+ * pending_writes count (waking sb_wait when it reaches zero), releases
+ * the bio and returns 0.
+ */
+static int super_written(struct bio *bio, unsigned int bytes_done, int error)
+{
+	mdk_rdev_t *rdev = bio->bi_private;
+	if (bio->bi_size)
+		return 1;
+
+	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
+		md_error(rdev->mddev, rdev);
+
+	if (atomic_dec_and_test(&rdev->mddev->pending_writes))
+		wake_up(&rdev->mddev->sb_wait);
+	/*
+	 * md_super_write() allocated this bio and nobody else holds on to
+	 * it, so the reference has to be dropped here or every superblock
+	 * write leaks a bio.  The bio must not be touched after this.
+	 */
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * Asynchronously write the first 'size' bytes of 'page' to 'sector' of
+ * 'rdev'.
+ *
+ * mddev->pending_writes is incremented before the bio is submitted; the
+ * completion handler (super_written) decrements it again, wakes up
+ * sb_wait if zero is reached, and calls md_error if the write failed.
+ */
+void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+		    sector_t sector, int size, struct page *page)
+{
+	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+
+	/* completion side */
+	bio->bi_private = rdev;
+	bio->bi_end_io = super_written;
+
+	/* target and payload */
+	bio->bi_bdev = rdev->bdev;
+	bio->bi_sector = sector;
+	bio_add_page(bio, page, size, 0);
+
+	atomic_inc(&mddev->pending_writes);
+	submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio);
+}
+
static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
{
if (bio->bi_size)
@@ -329,7 +370,7 @@ static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
return 0;
}
-static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+int sync_page_io(struct block_device *bdev, sector_t sector, int size,
struct page *page, int rw)
{
struct bio *bio = bio_alloc(GFP_NOIO, 1);
@@ -416,11 +457,8 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
ret = 1;
abort:
- if (tmp1)
- kfree(tmp1);
- if (tmp2)
- kfree(tmp2);
-
+ kfree(tmp1);
+ kfree(tmp2);
return ret;
}
@@ -569,6 +607,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
mdp_disk_t *desc;
mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+ rdev->raid_disk = -1;
+ rdev->in_sync = 0;
if (mddev->raid_disks == 0) {
mddev->major_version = 0;
mddev->minor_version = sb->minor_version;
@@ -599,16 +639,35 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
mddev->max_disks = MD_SB_DISKS;
- } else {
- __u64 ev1;
- ev1 = md_event(sb);
+
+ if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
+ mddev->bitmap_file == NULL) {
+ if (mddev->level != 1) {
+ /* FIXME use a better test */
+ printk(KERN_WARNING "md: bitmaps only support for raid1\n");
+ return -EINVAL;
+ }
+ mddev->bitmap_offset = (MD_SB_BYTES >> 9);
+ }
+
+ } else if (mddev->pers == NULL) {
+ /* Insist on good event counter while assembling */
+ __u64 ev1 = md_event(sb);
++ev1;
if (ev1 < mddev->events)
return -EINVAL;
- }
+ } else if (mddev->bitmap) {
+ /* if adding to array with a bitmap, then we can accept an
+ * older device ... but not too old.
+ */
+ __u64 ev1 = md_event(sb);
+ if (ev1 < mddev->bitmap->events_cleared)
+ return 0;
+ } else /* just a hot-add of a new device, leave raid_disk at -1 */
+ return 0;
+
if (mddev->level != LEVEL_MULTIPATH) {
- rdev->raid_disk = -1;
- rdev->in_sync = rdev->faulty = 0;
+ rdev->faulty = 0;
desc = sb->disks + rdev->desc_nr;
if (desc->state & (1<<MD_DISK_FAULTY))
@@ -618,7 +677,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->in_sync = 1;
rdev->raid_disk = desc->raid_disk;
}
- }
+ } else /* MULTIPATH are always insync */
+ rdev->in_sync = 1;
return 0;
}
@@ -683,6 +743,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->layout = mddev->layout;
sb->chunk_size = mddev->chunk_size;
+ if (mddev->bitmap && mddev->bitmap_file == NULL)
+ sb->state |= (1<<MD_SB_BITMAP_PRESENT);
+
sb->disks[0].state = (1<<MD_DISK_REMOVED);
ITERATE_RDEV(mddev,rdev2,tmp) {
mdp_disk_t *d;
@@ -780,7 +843,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
case 0:
sb_offset = rdev->bdev->bd_inode->i_size >> 9;
sb_offset -= 8*2;
- sb_offset &= ~(4*2-1);
+ sb_offset &= ~(sector_t)(4*2-1);
/* convert from sectors to K */
sb_offset /= 2;
break;
@@ -860,6 +923,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+ rdev->raid_disk = -1;
+ rdev->in_sync = 0;
if (mddev->raid_disks == 0) {
mddev->major_version = 1;
mddev->patch_version = 0;
@@ -877,13 +942,30 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
memcpy(mddev->uuid, sb->set_uuid, 16);
mddev->max_disks = (4096-256)/2;
- } else {
- __u64 ev1;
- ev1 = le64_to_cpu(sb->events);
+
+ if ((le32_to_cpu(sb->feature_map) & 1) &&
+ mddev->bitmap_file == NULL ) {
+ if (mddev->level != 1) {
+ printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
+ return -EINVAL;
+ }
+ mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
+ }
+ } else if (mddev->pers == NULL) {
+ /* Insist of good event counter while assembling */
+ __u64 ev1 = le64_to_cpu(sb->events);
++ev1;
if (ev1 < mddev->events)
return -EINVAL;
- }
+ } else if (mddev->bitmap) {
+ /* If adding to array with a bitmap, then we can accept an
+ * older device, but not too old.
+ */
+ __u64 ev1 = le64_to_cpu(sb->events);
+ if (ev1 < mddev->bitmap->events_cleared)
+ return 0;
+ } else /* just a hot-add of a new device, leave raid_disk at -1 */
+ return 0;
if (mddev->level != LEVEL_MULTIPATH) {
int role;
@@ -891,14 +973,10 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) {
case 0xffff: /* spare */
- rdev->in_sync = 0;
rdev->faulty = 0;
- rdev->raid_disk = -1;
break;
case 0xfffe: /* faulty */
- rdev->in_sync = 0;
rdev->faulty = 1;
- rdev->raid_disk = -1;
break;
default:
rdev->in_sync = 1;
@@ -906,7 +984,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->raid_disk = role;
break;
}
- }
+ } else /* MULTIPATH are always insync */
+ rdev->in_sync = 1;
+
return 0;
}
@@ -933,6 +1013,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
else
sb->resync_offset = cpu_to_le64(0);
+ if (mddev->bitmap && mddev->bitmap_file == NULL) {
+ sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
+ sb->feature_map = cpu_to_le32(1);
+ }
+
max_dev = 0;
ITERATE_RDEV(mddev,rdev2,tmp)
if (rdev2->desc_nr+1 > max_dev)
@@ -1196,8 +1281,11 @@ void md_print_devices(void)
printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
printk("md: **********************************\n");
ITERATE_MDDEV(mddev,tmp) {
- printk("%s: ", mdname(mddev));
+ if (mddev->bitmap)
+ bitmap_print_sb(mddev->bitmap);
+ else
+ printk("%s: ", mdname(mddev));
ITERATE_RDEV(mddev,rdev,tmp2)
printk("<%s>", bdevname(rdev->bdev,b));
printk("\n");
@@ -1210,30 +1298,6 @@ void md_print_devices(void)
}
-static int write_disk_sb(mdk_rdev_t * rdev)
-{
- char b[BDEVNAME_SIZE];
- if (!rdev->sb_loaded) {
- MD_BUG();
- return 1;
- }
- if (rdev->faulty) {
- MD_BUG();
- return 1;
- }
-
- dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
- bdevname(rdev->bdev,b),
- (unsigned long long)rdev->sb_offset);
-
- if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
- return 0;
-
- printk("md: write_disk_sb failed for device %s\n",
- bdevname(rdev->bdev,b));
- return 1;
-}
-
static void sync_sbs(mddev_t * mddev)
{
mdk_rdev_t *rdev;
@@ -1248,12 +1312,14 @@ static void sync_sbs(mddev_t * mddev)
static void md_update_sb(mddev_t * mddev)
{
- int err, count = 100;
+ int err;
struct list_head *tmp;
mdk_rdev_t *rdev;
+ int sync_req;
- mddev->sb_dirty = 0;
repeat:
+ spin_lock(&mddev->write_lock);
+ sync_req = mddev->in_sync;
mddev->utime = get_seconds();
mddev->events ++;
@@ -1266,20 +1332,26 @@ repeat:
MD_BUG();
mddev->events --;
}
+ mddev->sb_dirty = 2;
sync_sbs(mddev);
/*
* do not write anything to disk if using
* nonpersistent superblocks
*/
- if (!mddev->persistent)
+ if (!mddev->persistent) {
+ mddev->sb_dirty = 0;
+ spin_unlock(&mddev->write_lock);
+ wake_up(&mddev->sb_wait);
return;
+ }
+ spin_unlock(&mddev->write_lock);
dprintk(KERN_INFO
"md: updating %s RAID superblock on device (in sync %d)\n",
mdname(mddev),mddev->in_sync);
- err = 0;
+ err = bitmap_update_sb(mddev->bitmap);
ITERATE_RDEV(mddev,rdev,tmp) {
char b[BDEVNAME_SIZE];
dprintk(KERN_INFO "md: ");
@@ -1288,22 +1360,32 @@ repeat:
dprintk("%s ", bdevname(rdev->bdev,b));
if (!rdev->faulty) {
- err += write_disk_sb(rdev);
+ md_super_write(mddev,rdev,
+ rdev->sb_offset<<1, MD_SB_BYTES,
+ rdev->sb_page);
+ dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
+ bdevname(rdev->bdev,b),
+ (unsigned long long)rdev->sb_offset);
+
} else
dprintk(")\n");
- if (!err && mddev->level == LEVEL_MULTIPATH)
+ if (mddev->level == LEVEL_MULTIPATH)
/* only need to write one superblock... */
break;
}
- if (err) {
- if (--count) {
- printk(KERN_ERR "md: errors occurred during superblock"
- " update, repeating\n");
- goto repeat;
- }
- printk(KERN_ERR \
- "md: excessive errors occurred during superblock update, exiting\n");
+ wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+ /* if there was a failure, sb_dirty was set to 1, and we re-write super */
+
+ spin_lock(&mddev->write_lock);
+ if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
+ /* have to write it out again */
+ spin_unlock(&mddev->write_lock);
+ goto repeat;
}
+ mddev->sb_dirty = 0;
+ spin_unlock(&mddev->write_lock);
+ wake_up(&mddev->sb_wait);
+
}
/*
@@ -1607,12 +1689,19 @@ static int do_md_run(mddev_t * mddev)
mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
- err = mddev->pers->run(mddev);
+ /* before we start the array running, initialise the bitmap */
+ err = bitmap_create(mddev);
+ if (err)
+ printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
+ mdname(mddev), err);
+ else
+ err = mddev->pers->run(mddev);
if (err) {
printk(KERN_ERR "md: pers->run() failed ...\n");
module_put(mddev->pers->owner);
mddev->pers = NULL;
- return -EINVAL;
+ bitmap_destroy(mddev);
+ return err;
}
atomic_set(&mddev->writes_pending,0);
mddev->safemode = 0;
@@ -1725,6 +1814,14 @@ static int do_md_stop(mddev_t * mddev, int ro)
if (ro)
set_disk_ro(disk, 1);
}
+
+ bitmap_destroy(mddev);
+ if (mddev->bitmap_file) {
+ atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1);
+ fput(mddev->bitmap_file);
+ mddev->bitmap_file = NULL;
+ }
+
/*
* Free resources if final stop
*/
@@ -1983,6 +2080,42 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
return 0;
}
+/*
+ * GET_BITMAP_FILE ioctl helper: copy the path name of the array's bitmap
+ * file to user space, or an empty string when the array has no
+ * file-backed bitmap.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails or the path
+ * cannot be resolved, -EFAULT if the copy to user space fails.
+ */
+static int get_bitmap_file(mddev_t * mddev, void * arg)
+{
+	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
+	char *ptr, *buf = NULL;
+	int err = -ENOMEM;
+
+	file = kmalloc(sizeof(*file), GFP_KERNEL);
+	if (!file)
+		goto out;
+
+	/*
+	 * The whole structure is copied to user space below, not just the
+	 * string, so clear it first: otherwise everything past the
+	 * terminating NUL would expose stale kernel heap contents.
+	 */
+	memset(file, 0, sizeof(*file));
+
+	/* bitmap disabled, zero the first byte and copy out */
+	if (!mddev->bitmap || !mddev->bitmap->file) {
+		file->pathname[0] = '\0';
+		goto copy_out;
+	}
+
+	buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
+	if (!ptr)
+		goto out;
+
+	/* ptr points into buf, which is no larger than file->pathname */
+	strcpy(file->pathname, ptr);
+
+copy_out:
+	err = 0;
+	if (copy_to_user(arg, file, sizeof(*file)))
+		err = -EFAULT;
+out:
+	kfree(buf);
+	kfree(file);
+	return err;
+}
+
static int get_disk_info(mddev_t * mddev, void __user * arg)
{
mdu_disk_info_t info;
@@ -2078,11 +2211,25 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
PTR_ERR(rdev));
return PTR_ERR(rdev);
}
+ /* set save_raid_disk if appropriate */
+ if (!mddev->persistent) {
+ if (info->state & (1<<MD_DISK_SYNC) &&
+ info->raid_disk < mddev->raid_disks)
+ rdev->raid_disk = info->raid_disk;
+ else
+ rdev->raid_disk = -1;
+ } else
+ super_types[mddev->major_version].
+ validate_super(mddev, rdev);
+ rdev->saved_raid_disk = rdev->raid_disk;
+
rdev->in_sync = 0; /* just to be sure */
rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
export_rdev(rdev);
+
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
if (mddev->thread)
md_wakeup_thread(mddev->thread);
return err;
@@ -2256,6 +2403,49 @@ abort_export:
return err;
}
+/*
+ * Similar to deny_write_access, but accounts for our holding a reference
+ * to the file ourselves: a write count of 1 is tolerated, anything above
+ * that means another writer exists and yields -ETXTBSY.  On success the
+ * inode's i_writecount is forced to -1 and 0 is returned.
+ */
+static int deny_bitmap_write_access(struct file * file)
+{
+	struct inode *inode = file->f_mapping->host;
+	int ret = 0;
+
+	spin_lock(&inode->i_lock);
+	if (atomic_read(&inode->i_writecount) > 1)
+		ret = -ETXTBSY;
+	else
+		atomic_set(&inode->i_writecount, -1);
+	spin_unlock(&inode->i_lock);
+
+	return ret;
+}
+
+/*
+ * SET_BITMAP_FILE ioctl helper: attach the file behind descriptor 'fd'
+ * as the array's bitmap file.
+ *
+ * Returns -EBUSY if the array already has a personality, -EBADF if 'fd'
+ * does not resolve to a file, the error from deny_bitmap_write_access()
+ * if the file is in use, 0 on success (bitmap_offset is then reset to 0).
+ *
+ * NOTE(review): a bitmap_file set by an earlier call is overwritten
+ * without being released - confirm callers never invoke this twice.
+ */
+static int set_bitmap_file(mddev_t *mddev, int fd)
+{
+	int err;
+
+	if (mddev->pers)
+		return -EBUSY;
+
+	mddev->bitmap_file = fget(fd);
+	if (!mddev->bitmap_file) {
+		printk(KERN_ERR "%s: error: failed to get bitmap file\n",
+		       mdname(mddev));
+		return -EBADF;
+	}
+
+	err = deny_bitmap_write_access(mddev->bitmap_file);
+	if (!err) {
+		mddev->bitmap_offset = 0; /* file overrides offset */
+		return 0;
+	}
+
+	/* somebody else has the file open for writing: back out */
+	printk(KERN_ERR "%s: error: bitmap file is already in use\n",
+	       mdname(mddev));
+	fput(mddev->bitmap_file);
+	mddev->bitmap_file = NULL;
+	return err;
+}
+
/*
* set_array_info is used two different ways
* The original usage is when creating a new array.
@@ -2567,8 +2757,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
/*
* Commands querying/configuring an existing array:
*/
- /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
- if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
+ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
+ * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
+ if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
+ && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
err = -ENODEV;
goto abort_unlock;
}
@@ -2582,6 +2774,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
err = get_array_info(mddev, argp);
goto done_unlock;
+ case GET_BITMAP_FILE:
+ err = get_bitmap_file(mddev, (void *)arg);
+ goto done_unlock;
+
case GET_DISK_INFO:
err = get_disk_info(mddev, argp);
goto done_unlock;
@@ -2662,6 +2858,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
err = do_md_run (mddev);
goto done_unlock;
+ case SET_BITMAP_FILE:
+ err = set_bitmap_file(mddev, (int)arg);
+ goto done_unlock;
+
default:
if (_IOC_TYPE(cmd) == MD_MAJOR)
printk(KERN_WARNING "md: %s(pid %d) used"
@@ -2773,10 +2973,10 @@ static int md_thread(void * arg)
while (thread->run) {
void (*run)(mddev_t *);
- wait_event_interruptible(thread->wqueue,
- test_bit(THREAD_WAKEUP, &thread->flags));
- if (current->flags & PF_FREEZE)
- refrigerator(PF_FREEZE);
+ wait_event_interruptible_timeout(thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags),
+ thread->timeout);
+ try_to_freeze();
clear_bit(THREAD_WAKEUP, &thread->flags);
@@ -2820,6 +3020,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
thread->run = run;
thread->mddev = mddev;
thread->name = name;
+ thread->timeout = MAX_SCHEDULE_TIMEOUT;
ret = kernel_thread(md_thread, thread, 0);
if (ret < 0) {
kfree(thread);
@@ -2858,13 +3059,13 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
if (!rdev || rdev->faulty)
return;
-
+/*
dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
mdname(mddev),
MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
__builtin_return_address(0),__builtin_return_address(1),
__builtin_return_address(2),__builtin_return_address(3));
-
+*/
if (!mddev->pers->error_handler)
return;
mddev->pers->error_handler(mddev,rdev);
@@ -3018,6 +3219,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
struct list_head *tmp2;
mdk_rdev_t *rdev;
int i;
+ struct bitmap *bitmap;
if (v == (void*)1) {
seq_printf(seq, "Personalities : ");
@@ -3070,10 +3272,35 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (mddev->pers) {
mddev->pers->status (seq, mddev);
seq_printf(seq, "\n ");
- if (mddev->curr_resync > 2)
+ if (mddev->curr_resync > 2) {
status_resync (seq, mddev);
- else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
- seq_printf(seq, " resync=DELAYED");
+ seq_printf(seq, "\n ");
+ } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
+ seq_printf(seq, " resync=DELAYED\n ");
+ } else
+ seq_printf(seq, "\n ");
+
+ if ((bitmap = mddev->bitmap)) {
+ unsigned long chunk_kb;
+ unsigned long flags;
+ spin_lock_irqsave(&bitmap->lock, flags);
+ chunk_kb = bitmap->chunksize >> 10;
+ seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
+ "%lu%s chunk",
+ bitmap->pages - bitmap->missing_pages,
+ bitmap->pages,
+ (bitmap->pages - bitmap->missing_pages)
+ << (PAGE_SHIFT - 10),
+ chunk_kb ? chunk_kb : bitmap->chunksize,
+ chunk_kb ? "KB" : "B");
+ if (bitmap->file) {
+ seq_printf(seq, ", file: ");
+ seq_path(seq, bitmap->file->f_vfsmnt,
+ bitmap->file->f_dentry," \t\n");
+ }
+
+ seq_printf(seq, "\n");
+ spin_unlock_irqrestore(&bitmap->lock, flags);
}
seq_printf(seq, "\n");
@@ -3176,19 +3403,28 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
}
-void md_write_start(mddev_t *mddev)
+/* md_write_start(mddev, bi)
+ * If we need to update some array metadata (e.g. 'active' flag
+ * in superblock) before writing, schedule a superblock update
+ * and wait for it to complete.
+ */
+void md_write_start(mddev_t *mddev, struct bio *bi)
{
- if (!atomic_read(&mddev->writes_pending)) {
- mddev_lock_uninterruptible(mddev);
+ DEFINE_WAIT(w);
+ if (bio_data_dir(bi) != WRITE)
+ return;
+
+ atomic_inc(&mddev->writes_pending);
+ if (mddev->in_sync) {
+ spin_lock(&mddev->write_lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
- del_timer(&mddev->safemode_timer);
- md_update_sb(mddev);
+ mddev->sb_dirty = 1;
+ md_wakeup_thread(mddev->thread);
}
- atomic_inc(&mddev->writes_pending);
- mddev_unlock(mddev);
- } else
- atomic_inc(&mddev->writes_pending);
+ spin_unlock(&mddev->write_lock);
+ }
+ wait_event(mddev->sb_wait, mddev->sb_dirty==0);
}
void md_write_end(mddev_t *mddev)
@@ -3201,37 +3437,6 @@ void md_write_end(mddev_t *mddev)
}
}
-static inline void md_enter_safemode(mddev_t *mddev)
-{
- if (!mddev->safemode) return;
- if (mddev->safemode == 2 &&
- (atomic_read(&mddev->writes_pending) || mddev->in_sync ||
- mddev->recovery_cp != MaxSector))
- return; /* avoid the lock */
- mddev_lock_uninterruptible(mddev);
- if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
- !mddev->in_sync && mddev->recovery_cp == MaxSector) {
- mddev->in_sync = 1;
- md_update_sb(mddev);
- }
- mddev_unlock(mddev);
-
- if (mddev->safemode == 1)
- mddev->safemode = 0;
-}
-
-void md_handle_safemode(mddev_t *mddev)
-{
- if (signal_pending(current)) {
- printk(KERN_INFO "md: %s in immediate safe mode\n",
- mdname(mddev));
- mddev->safemode = 2;
- flush_signals(current);
- }
- md_enter_safemode(mddev);
-}
-
-
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
#define SYNC_MARKS 10
@@ -3241,12 +3446,13 @@ static void md_do_sync(mddev_t *mddev)
mddev_t *mddev2;
unsigned int currspeed = 0,
window;
- sector_t max_sectors,j;
+ sector_t max_sectors,j, io_sectors;
unsigned long mark[SYNC_MARKS];
sector_t mark_cnt[SYNC_MARKS];
int last_mark,m;
struct list_head *tmp;
sector_t last_check;
+ int skipped = 0;
/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -3312,7 +3518,7 @@ static void md_do_sync(mddev_t *mddev)
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
/* resync follows the size requested by the personality,
- * which default to physical size, but can be virtual size
+ * which defaults to physical size, but can be virtual size
*/
max_sectors = mddev->resync_max_sectors;
else
@@ -3327,13 +3533,15 @@ static void md_do_sync(mddev_t *mddev)
sysctl_speed_limit_max);
is_mddev_idle(mddev); /* this also initializes IO event counters */
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ /* we don't use the checkpoint if there's a bitmap */
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap)
j = mddev->recovery_cp;
else
j = 0;
+ io_sectors = 0;
for (m = 0; m < SYNC_MARKS; m++) {
mark[m] = jiffies;
- mark_cnt[m] = j;
+ mark_cnt[m] = io_sectors;
}
last_mark = 0;
mddev->resync_mark = mark[last_mark];
@@ -3358,21 +3566,29 @@ static void md_do_sync(mddev_t *mddev)
}
while (j < max_sectors) {
- int sectors;
+ sector_t sectors;
- sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
- if (sectors < 0) {
+ skipped = 0;
+ sectors = mddev->pers->sync_request(mddev, j, &skipped,
+ currspeed < sysctl_speed_limit_min);
+ if (sectors == 0) {
set_bit(MD_RECOVERY_ERR, &mddev->recovery);
goto out;
}
- atomic_add(sectors, &mddev->recovery_active);
+
+ if (!skipped) { /* actual IO requested */
+ io_sectors += sectors;
+ atomic_add(sectors, &mddev->recovery_active);
+ }
+
j += sectors;
if (j>1) mddev->curr_resync = j;
- if (last_check + window > j || j == max_sectors)
+
+ if (last_check + window > io_sectors || j == max_sectors)
continue;
- last_check = j;
+ last_check = io_sectors;
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
test_bit(MD_RECOVERY_ERR, &mddev->recovery))
@@ -3386,7 +3602,7 @@ static void md_do_sync(mddev_t *mddev)
mddev->resync_mark = mark[next];
mddev->resync_mark_cnt = mark_cnt[next];
mark[next] = jiffies;
- mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+ mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
last_mark = next;
}
@@ -3413,7 +3629,8 @@ static void md_do_sync(mddev_t *mddev)
mddev->queue->unplug_fn(mddev->queue);
cond_resched();
- currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+ currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
+ /((jiffies-mddev->resync_mark)/HZ +1) +1;
if (currspeed > sysctl_speed_limit_min) {
if ((currspeed > sysctl_speed_limit_max) ||
@@ -3433,7 +3650,7 @@ static void md_do_sync(mddev_t *mddev)
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
/* tell personality that we are finished */
- mddev->pers->sync_request(mddev, max_sectors, 1);
+ mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
mddev->curr_resync > 2 &&
@@ -3447,7 +3664,6 @@ static void md_do_sync(mddev_t *mddev)
mddev->recovery_cp = MaxSector;
}
- md_enter_safemode(mddev);
skip:
mddev->curr_resync = 0;
wake_up(&resync_wait);
@@ -3484,20 +3700,48 @@ void md_check_recovery(mddev_t *mddev)
struct list_head *rtmp;
- dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
+ if (mddev->bitmap)
+ bitmap_daemon_work(mddev->bitmap);
if (mddev->ro)
return;
+
+ if (signal_pending(current)) {
+ if (mddev->pers->sync_request) {
+ printk(KERN_INFO "md: %s in immediate safe mode\n",
+ mdname(mddev));
+ mddev->safemode = 2;
+ }
+ flush_signals(current);
+ }
+
if ( ! (
mddev->sb_dirty ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
- test_bit(MD_RECOVERY_DONE, &mddev->recovery)
+ test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+ (mddev->safemode == 1) ||
+ (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
+ && !mddev->in_sync && mddev->recovery_cp == MaxSector)
))
return;
+
if (mddev_trylock(mddev)==0) {
int spares =0;
+
+ spin_lock(&mddev->write_lock);
+ if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
+ !mddev->in_sync && mddev->recovery_cp == MaxSector) {
+ mddev->in_sync = 1;
+ mddev->sb_dirty = 1;
+ }
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ spin_unlock(&mddev->write_lock);
+
if (mddev->sb_dirty)
md_update_sb(mddev);
+
+
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
/* resync/recovery still happening */
@@ -3515,6 +3759,14 @@ void md_check_recovery(mddev_t *mddev)
mddev->pers->spare_active(mddev);
}
md_update_sb(mddev);
+
+ /* if array is no longer degraded, then any saved_raid_disk
+ * information must be scrapped
+ */
+ if (!mddev->degraded)
+ ITERATE_RDEV(mddev,rdev,rtmp)
+ rdev->saved_raid_disk = -1;
+
mddev->recovery = 0;
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -3557,6 +3809,13 @@ void md_check_recovery(mddev_t *mddev)
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
if (!spares)
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ if (spares && mddev->bitmap && ! mddev->bitmap->file) {
+ /* We are adding a device or devices to an array
+ * which has the bitmap stored on all devices.
+ * So make sure all bitmap pages get written
+ */
+ bitmap_write_all(mddev->bitmap);
+ }
mddev->sync_thread = md_register_thread(md_do_sync,
mddev,
"%s_resync");
@@ -3624,6 +3883,8 @@ static int __init md_init(void)
" MD_SB_DISKS=%d\n",
MD_MAJOR_VERSION, MD_MINOR_VERSION,
MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+ printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR,
+ BITMAP_MINOR);
if (register_blkdev(MAJOR_NR, "md"))
return -1;
@@ -3739,7 +4000,6 @@ EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_write_start);
EXPORT_SYMBOL(md_write_end);
-EXPORT_SYMBOL(md_handle_safemode);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 4e4bfde3db5d..2d2ca7fa0265 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -462,10 +462,6 @@ static int multipath_run (mddev_t *mddev)
}
memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks);
- mddev->queue->unplug_fn = multipath_unplug;
-
- mddev->queue->issue_flush_fn = multipath_issue_flush;
-
conf->working_disks = 0;
ITERATE_RDEV(mddev,rdev,tmp) {
disk_idx = rdev->raid_disk;
@@ -528,13 +524,16 @@ static int multipath_run (mddev_t *mddev)
* Ok, everything is just fine now
*/
mddev->array_size = mddev->size;
+
+ mddev->queue->unplug_fn = multipath_unplug;
+ mddev->queue->issue_flush_fn = multipath_issue_flush;
+
return 0;
out_free_conf:
if (conf->pool)
mempool_destroy(conf->pool);
- if (conf->multipaths)
- kfree(conf->multipaths);
+ kfree(conf->multipaths);
kfree(conf);
mddev->private = NULL;
out:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e7d934eca06f..e11dd14d0b43 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -371,10 +371,8 @@ static int raid0_run (mddev_t *mddev)
return 0;
out_free_conf:
- if (conf->strip_zone)
- kfree(conf->strip_zone);
- if (conf->devlist)
- kfree (conf->devlist);
+ kfree(conf->strip_zone);
+ kfree(conf->devlist);
kfree(conf);
mddev->private = NULL;
out:
@@ -386,11 +384,11 @@ static int raid0_stop (mddev_t *mddev)
raid0_conf_t *conf = mddev_to_conf(mddev);
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
- kfree (conf->hash_table);
+ kfree(conf->hash_table);
conf->hash_table = NULL;
- kfree (conf->strip_zone);
+ kfree(conf->strip_zone);
conf->strip_zone = NULL;
- kfree (conf);
+ kfree(conf);
mddev->private = NULL;
return 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 83380b5d6593..ff1dbec864af 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -12,6 +12,15 @@
* Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
* Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
*
+ * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
+ * bitmapped intelligence in resync:
+ *
+ * - bitmap marked during normal i/o
+ * - bitmap used to skip nondirty blocks during sync
+ *
+ * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
+ * - persistent bitmap code
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
@@ -22,7 +31,16 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+#include "dm-bio-list.h"
#include <linux/raid/raid1.h>
+#include <linux/raid/bitmap.h>
+
+#define DEBUG 0
+#if DEBUG
+#define PRINTK(x...) printk(x)
+#else
+#define PRINTK(x...)
+#endif
/*
* Number of guaranteed r1bios in case of extreme VM load:
@@ -287,9 +305,11 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
- if (!uptodate)
+ if (!uptodate) {
md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
- else
+ /* an I/O failed, we can't clear the bitmap */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ } else
/*
* Set R1BIO_Uptodate in our master bio, so that
* we will return a good error code for to the higher
@@ -309,6 +329,10 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
* already.
*/
if (atomic_dec_and_test(&r1_bio->remaining)) {
+ /* clear the bitmap if all writes complete successfully */
+ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+ r1_bio->sectors,
+ !test_bit(R1BIO_Degraded, &r1_bio->state));
md_write_end(r1_bio->mddev);
raid_end_bio_io(r1_bio);
}
@@ -458,7 +482,10 @@ static void unplug_slaves(mddev_t *mddev)
static void raid1_unplug(request_queue_t *q)
{
- unplug_slaves(q->queuedata);
+ mddev_t *mddev = q->queuedata;
+
+ unplug_slaves(mddev);
+ md_wakeup_thread(mddev->thread);
}
static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -501,16 +528,16 @@ static void device_barrier(conf_t *conf, sector_t sect)
{
spin_lock_irq(&conf->resync_lock);
wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
- conf->resync_lock, unplug_slaves(conf->mddev));
+ conf->resync_lock, raid1_unplug(conf->mddev->queue));
if (!conf->barrier++) {
wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
- conf->resync_lock, unplug_slaves(conf->mddev));
+ conf->resync_lock, raid1_unplug(conf->mddev->queue));
if (conf->nr_pending)
BUG();
}
wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
- conf->resync_lock, unplug_slaves(conf->mddev));
+ conf->resync_lock, raid1_unplug(conf->mddev->queue));
conf->next_resync = sect;
spin_unlock_irq(&conf->resync_lock);
}
@@ -522,14 +549,20 @@ static int make_request(request_queue_t *q, struct bio * bio)
mirror_info_t *mirror;
r1bio_t *r1_bio;
struct bio *read_bio;
- int i, disks;
+ int i, targets = 0, disks;
mdk_rdev_t *rdev;
+ struct bitmap *bitmap = mddev->bitmap;
+ unsigned long flags;
+ struct bio_list bl;
+
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
* Continue immediately if no resync is active currently.
*/
+ md_write_start(mddev, bio); /* wait on superblock update early */
+
spin_lock_irq(&conf->resync_lock);
wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
conf->nr_pending++;
@@ -552,7 +585,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
r1_bio->master_bio = bio;
r1_bio->sectors = bio->bi_size >> 9;
-
+ r1_bio->state = 0;
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_sector;
@@ -595,6 +628,13 @@ static int make_request(request_queue_t *q, struct bio * bio)
* bios[x] to bio
*/
disks = conf->raid_disks;
+#if 0
+ { static int first=1;
+ if (first) printk("First Write sector %llu disks %d\n",
+ (unsigned long long)r1_bio->sector, disks);
+ first = 0;
+ }
+#endif
rcu_read_lock();
for (i = 0; i < disks; i++) {
if ((rdev=conf->mirrors[i].rdev) != NULL &&
@@ -605,13 +645,21 @@ static int make_request(request_queue_t *q, struct bio * bio)
r1_bio->bios[i] = NULL;
} else
r1_bio->bios[i] = bio;
+ targets++;
} else
r1_bio->bios[i] = NULL;
}
rcu_read_unlock();
- atomic_set(&r1_bio->remaining, 1);
- md_write_start(mddev);
+ if (targets < conf->raid_disks) {
+ /* array is degraded, we will not clear the bitmap
+ * on I/O completion (see raid1_end_write_request) */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ }
+
+ atomic_set(&r1_bio->remaining, 0);
+
+ bio_list_init(&bl);
for (i = 0; i < disks; i++) {
struct bio *mbio;
if (!r1_bio->bios[i])
@@ -627,14 +675,23 @@ static int make_request(request_queue_t *q, struct bio * bio)
mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining);
- generic_make_request(mbio);
- }
- if (atomic_dec_and_test(&r1_bio->remaining)) {
- md_write_end(mddev);
- raid_end_bio_io(r1_bio);
+ bio_list_add(&bl, mbio);
}
+ bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_merge(&conf->pending_bio_list, &bl);
+ bio_list_init(&bl);
+
+ blk_plug_device(mddev->queue);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+#if 0
+ while ((bio = bio_list_pop(&bl)) != NULL)
+ generic_make_request(bio);
+#endif
+
return 0;
}
@@ -714,7 +771,7 @@ static void close_sync(conf_t *conf)
{
spin_lock_irq(&conf->resync_lock);
wait_event_lock_irq(conf->wait_resume, !conf->barrier,
- conf->resync_lock, unplug_slaves(conf->mddev));
+ conf->resync_lock, raid1_unplug(conf->mddev->queue));
spin_unlock_irq(&conf->resync_lock);
if (conf->barrier) BUG();
@@ -754,9 +811,12 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
conf_t *conf = mddev->private;
int found = 0;
- int mirror;
+ int mirror = 0;
mirror_info_t *p;
+ if (rdev->saved_raid_disk >= 0 &&
+ conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+ mirror = rdev->saved_raid_disk;
for (mirror=0; mirror < mddev->raid_disks; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
@@ -773,6 +833,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
p->head_position = 0;
rdev->raid_disk = mirror;
found = 1;
+ if (rdev->saved_raid_disk != mirror)
+ conf->fullsync = 1;
p->rdev = rdev;
break;
}
@@ -828,10 +890,11 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
* or re-read if the read failed.
* We don't do much here, just schedule handling by raid1d
*/
- if (!uptodate)
+ if (!uptodate) {
md_error(r1_bio->mddev,
conf->mirrors[r1_bio->read_disk].rdev);
- else
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ } else
set_bit(R1BIO_Uptodate, &r1_bio->state);
rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
reschedule_retry(r1_bio);
@@ -855,8 +918,10 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
mirror = i;
break;
}
- if (!uptodate)
+ if (!uptodate) {
md_error(mddev, conf->mirrors[mirror].rdev);
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ }
update_head_pos(mirror, r1_bio);
if (atomic_dec_and_test(&r1_bio->remaining)) {
@@ -876,6 +941,9 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
bio = r1_bio->bios[r1_bio->read_disk];
+/*
+ if (r1_bio->sector == 0) printk("First sync write startss\n");
+*/
/*
* schedule writes
*/
@@ -903,10 +971,12 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
atomic_inc(&conf->mirrors[i].rdev->nr_pending);
atomic_inc(&r1_bio->remaining);
md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
+
generic_make_request(wbio);
}
if (atomic_dec_and_test(&r1_bio->remaining)) {
+ /* if we're here, all write(s) have completed, so clean up */
md_done_sync(mddev, r1_bio->sectors, 1);
put_buf(r1_bio);
}
@@ -931,11 +1001,30 @@ static void raid1d(mddev_t *mddev)
mdk_rdev_t *rdev;
md_check_recovery(mddev);
- md_handle_safemode(mddev);
for (;;) {
char b[BDEVNAME_SIZE];
spin_lock_irqsave(&conf->device_lock, flags);
+
+ if (conf->pending_bio_list.head) {
+ bio = bio_list_get(&conf->pending_bio_list);
+ blk_remove_plug(mddev->queue);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ /* flush any pending bitmap writes to disk before proceeding w/ I/O */
+ if (bitmap_unplug(mddev->bitmap) != 0)
+ printk("%s: bitmap file write failed!\n", mdname(mddev));
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = next;
+ }
+ unplug = 1;
+
+ continue;
+ }
+
if (list_empty(head))
break;
r1_bio = list_entry(head->prev, r1bio_t, retry_list);
@@ -1009,7 +1098,7 @@ static int init_resync(conf_t *conf)
* that can be installed to exclude normal IO requests.
*/
-static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
{
conf_t *conf = mddev_to_conf(mddev);
mirror_info_t *mirror;
@@ -1019,17 +1108,43 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
int disk;
int i;
int write_targets = 0;
+ int sync_blocks;
if (!conf->r1buf_pool)
+ {
+/*
+ printk("sync start - bitmap %p\n", mddev->bitmap);
+*/
if (init_resync(conf))
- return -ENOMEM;
+ return 0;
+ }
max_sector = mddev->size << 1;
if (sector_nr >= max_sector) {
+ /* If we aborted, we need to abort the
+ * sync on the 'current' bitmap chunk (there will
+ * only be one in raid1 resync).
+ * We can find the current address in mddev->curr_resync
+ */
+ if (!conf->fullsync) {
+ if (mddev->curr_resync < max_sector)
+ bitmap_end_sync(mddev->bitmap,
+ mddev->curr_resync,
+ &sync_blocks, 1);
+ bitmap_close_sync(mddev->bitmap);
+ }
+ if (mddev->curr_resync >= max_sector)
+ conf->fullsync = 0;
close_sync(conf);
return 0;
}
+ if (!conf->fullsync &&
+ !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks)) {
+ /* We can skip this block, and probably several more */
+ *skipped = 1;
+ return sync_blocks;
+ }
/*
* If there is non-resync activity waiting for us then
* put in a delay to throttle resync.
@@ -1068,6 +1183,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
r1_bio->mddev = mddev;
r1_bio->sector = sector_nr;
+ r1_bio->state = 0;
set_bit(R1BIO_IsSync, &r1_bio->state);
r1_bio->read_disk = disk;
@@ -1102,18 +1218,24 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
bio->bi_bdev = conf->mirrors[i].rdev->bdev;
bio->bi_private = r1_bio;
}
+
+ if (write_targets + 1 < conf->raid_disks)
+ /* array degraded, can't clear bitmap */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+
if (write_targets == 0) {
/* There is nowhere to write, so all non-sync
* drives must be failed - so we are finished
*/
- int rv = max_sector - sector_nr;
- md_done_sync(mddev, rv, 1);
+ sector_t rv = max_sector - sector_nr;
+ *skipped = 1;
put_buf(r1_bio);
rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
return rv;
}
nr_sectors = 0;
+ sync_blocks = 0;
do {
struct page *page;
int len = PAGE_SIZE;
@@ -1121,6 +1243,17 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
len = (max_sector - sector_nr) << 9;
if (len == 0)
break;
+ if (!conf->fullsync) {
+ if (sync_blocks == 0) {
+ if (!bitmap_start_sync(mddev->bitmap,
+ sector_nr, &sync_blocks))
+ break;
+ if (sync_blocks < (PAGE_SIZE>>9))
+ BUG();
+ if (len > (sync_blocks<<9)) len = sync_blocks<<9;
+ }
+ }
+
for (i=0 ; i < conf->raid_disks; i++) {
bio = r1_bio->bios[i];
if (bio->bi_end_io) {
@@ -1143,6 +1276,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
}
nr_sectors += len>>9;
sector_nr += len>>9;
+ sync_blocks -= (len>>9);
} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
bio_full:
bio = r1_bio->bios[disk];
@@ -1197,10 +1331,6 @@ static int run(mddev_t *mddev)
if (!conf->r1bio_pool)
goto out_no_mem;
- mddev->queue->unplug_fn = raid1_unplug;
-
- mddev->queue->issue_flush_fn = raid1_issue_flush;
-
ITERATE_RDEV(mddev, rdev, tmp) {
disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
@@ -1235,6 +1365,9 @@ static int run(mddev_t *mddev)
init_waitqueue_head(&conf->wait_idle);
init_waitqueue_head(&conf->wait_resume);
+ bio_list_init(&conf->pending_bio_list);
+ bio_list_init(&conf->flushing_bio_list);
+
if (!conf->working_disks) {
printk(KERN_ERR "raid1: no operational mirrors for %s\n",
mdname(mddev));
@@ -1263,16 +1396,15 @@ static int run(mddev_t *mddev)
conf->last_used = j;
-
- {
- mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
- if (!mddev->thread) {
- printk(KERN_ERR
- "raid1: couldn't allocate thread for %s\n",
- mdname(mddev));
- goto out_free_conf;
- }
+ mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
+ if (!mddev->thread) {
+ printk(KERN_ERR
+ "raid1: couldn't allocate thread for %s\n",
+ mdname(mddev));
+ goto out_free_conf;
}
+ if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+
printk(KERN_INFO
"raid1: raid set %s active with %d out of %d mirrors\n",
mdname(mddev), mddev->raid_disks - mddev->degraded,
@@ -1282,6 +1414,9 @@ static int run(mddev_t *mddev)
*/
mddev->array_size = mddev->size;
+ mddev->queue->unplug_fn = raid1_unplug;
+ mddev->queue->issue_flush_fn = raid1_issue_flush;
+
return 0;
out_no_mem:
@@ -1292,10 +1427,8 @@ out_free_conf:
if (conf) {
if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool);
- if (conf->mirrors)
- kfree(conf->mirrors);
- if (conf->poolinfo)
- kfree(conf->poolinfo);
+ kfree(conf->mirrors);
+ kfree(conf->poolinfo);
kfree(conf);
mddev->private = NULL;
}
@@ -1312,10 +1445,8 @@ static int stop(mddev_t *mddev)
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool);
- if (conf->mirrors)
- kfree(conf->mirrors);
- if (conf->poolinfo)
- kfree(conf->poolinfo);
+ kfree(conf->mirrors);
+ kfree(conf->poolinfo);
kfree(conf);
mddev->private = NULL;
return 0;
@@ -1350,17 +1481,26 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
* We allocate a new r1bio_pool if we can.
* Then raise a device barrier and wait until all IO stops.
* Then resize conf->mirrors and swap in the new r1bio pool.
+ *
+ * At the same time, we "pack" the devices so that all the missing
+ * devices have the higher raid_disk numbers.
*/
mempool_t *newpool, *oldpool;
struct pool_info *newpoolinfo;
mirror_info_t *newmirrors;
conf_t *conf = mddev_to_conf(mddev);
+ int cnt;
- int d;
+ int d, d2;
- for (d= raid_disks; d < conf->raid_disks; d++)
- if (conf->mirrors[d].rdev)
+ if (raid_disks < conf->raid_disks) {
+ cnt=0;
+ for (d= 0; d < conf->raid_disks; d++)
+ if (conf->mirrors[d].rdev)
+ cnt++;
+ if (cnt > raid_disks)
return -EBUSY;
+ }
newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
if (!newpoolinfo)
@@ -1385,14 +1525,18 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
spin_lock_irq(&conf->resync_lock);
conf->barrier++;
wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
- conf->resync_lock, unplug_slaves(mddev));
+ conf->resync_lock, raid1_unplug(mddev->queue));
spin_unlock_irq(&conf->resync_lock);
/* ok, everything is stopped */
oldpool = conf->r1bio_pool;
conf->r1bio_pool = newpool;
- for (d=0; d < raid_disks && d < conf->raid_disks; d++)
- newmirrors[d] = conf->mirrors[d];
+
+ for (d=d2=0; d < conf->raid_disks; d++)
+ if (conf->mirrors[d].rdev) {
+ conf->mirrors[d].rdev->raid_disk = d2;
+ newmirrors[d2++].rdev = conf->mirrors[d].rdev;
+ }
kfree(conf->mirrors);
conf->mirrors = newmirrors;
kfree(conf->poolinfo);
@@ -1401,6 +1545,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
mddev->degraded += (raid_disks - conf->raid_disks);
conf->raid_disks = mddev->raid_disks = raid_disks;
+ conf->last_used = 0; /* just make sure it is in-range */
spin_lock_irq(&conf->resync_lock);
conf->barrier--;
spin_unlock_irq(&conf->resync_lock);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e9dc2876a626..62ebb1bc72be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -700,6 +700,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
return 0;
}
+ md_write_start(mddev, bio);
+
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
@@ -774,7 +776,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
rcu_read_unlock();
atomic_set(&r10_bio->remaining, 1);
- md_write_start(mddev);
+
for (i = 0; i < conf->copies; i++) {
struct bio *mbio;
int d = r10_bio->devs[i].devnum;
@@ -1216,7 +1218,6 @@ static void raid10d(mddev_t *mddev)
mdk_rdev_t *rdev;
md_check_recovery(mddev);
- md_handle_safemode(mddev);
for (;;) {
char b[BDEVNAME_SIZE];
@@ -1319,7 +1320,7 @@ static int init_resync(conf_t *conf)
*
*/
-static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
{
conf_t *conf = mddev_to_conf(mddev);
r10bio_t *r10_bio;
@@ -1333,7 +1334,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
if (!conf->r10buf_pool)
if (init_resync(conf))
- return -ENOMEM;
+ return 0;
skipped:
max_sector = mddev->size << 1;
@@ -1341,15 +1342,15 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) {
close_sync(conf);
+ *skipped = 1;
return sectors_skipped;
}
if (chunks_skipped >= conf->raid_disks) {
/* if there has been nothing to do on any drive,
* then there is nothing to do at all..
*/
- sector_t sec = max_sector - sector_nr;
- md_done_sync(mddev, sec, 1);
- return sec + sectors_skipped;
+ *skipped = 1;
+ return (max_sector - sector_nr) + sectors_skipped;
}
/* make sure whole request will fit in a chunk - if chunks
@@ -1563,17 +1564,22 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
}
}
+ if (sectors_skipped)
+ /* pretend they weren't skipped, it makes
+ * no important difference in this case
+ */
+ md_done_sync(mddev, sectors_skipped, 1);
+
return sectors_skipped + nr_sectors;
giveup:
/* There is nowhere to write, so all non-sync
* drives must be failed, so try the next chunk...
*/
{
- int sec = max_sector - sector_nr;
+ sector_t sec = max_sector - sector_nr;
sectors_skipped += sec;
chunks_skipped ++;
sector_nr = max_sector;
- md_done_sync(mddev, sec, 1);
goto skipped;
}
}
@@ -1639,9 +1645,6 @@ static int run(mddev_t *mddev)
mdname(mddev));
goto out_free_conf;
}
- mddev->queue->unplug_fn = raid10_unplug;
-
- mddev->queue->issue_flush_fn = raid10_issue_flush;
ITERATE_RDEV(mddev, rdev, tmp) {
disk_idx = rdev->raid_disk;
@@ -1713,6 +1716,9 @@ static int run(mddev_t *mddev)
mddev->array_size = size/2;
mddev->resync_max_sectors = size;
+ mddev->queue->unplug_fn = raid10_unplug;
+ mddev->queue->issue_flush_fn = raid10_issue_flush;
+
/* Calculate max read-ahead size.
* We need to readahead at least twice a whole stripe....
* maybe...
@@ -1731,8 +1737,7 @@ static int run(mddev_t *mddev)
out_free_conf:
if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool);
- if (conf->mirrors)
- kfree(conf->mirrors);
+ kfree(conf->mirrors);
kfree(conf);
mddev->private = NULL;
out:
@@ -1748,8 +1753,7 @@ static int stop(mddev_t *mddev)
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool);
- if (conf->mirrors)
- kfree(conf->mirrors);
+ kfree(conf->mirrors);
kfree(conf);
mddev->private = NULL;
return 0;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e96e2a10a9c9..93a9726cc2d6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1411,6 +1411,8 @@ static int make_request (request_queue_t *q, struct bio * bi)
sector_t logical_sector, last_sector;
struct stripe_head *sh;
+ md_write_start(mddev, bi);
+
if (bio_data_dir(bi)==WRITE) {
disk_stat_inc(mddev->gendisk, writes);
disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
@@ -1423,8 +1425,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
last_sector = bi->bi_sector + (bi->bi_size>>9);
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
- if ( bio_data_dir(bi) == WRITE )
- md_write_start(mddev);
+
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w);
@@ -1475,7 +1476,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
}
/* FIXME go_faster isn't used */
-static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
{
raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
struct stripe_head *sh;
@@ -1498,8 +1499,8 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
* nothing we can do.
*/
if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- int rv = (mddev->size << 1) - sector_nr;
- md_done_sync(mddev, rv, 1);
+ sector_t rv = (mddev->size << 1) - sector_nr;
+ *skipped = 1;
return rv;
}
@@ -1546,7 +1547,6 @@ static void raid5d (mddev_t *mddev)
PRINTK("+++ raid5d active\n");
md_check_recovery(mddev);
- md_handle_safemode(mddev);
handled = 0;
spin_lock_irq(&conf->device_lock);
@@ -1620,9 +1620,6 @@ static int run (mddev_t *mddev)
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
- mddev->queue->unplug_fn = raid5_unplug_device;
- mddev->queue->issue_flush_fn = raid5_issue_flush;
-
PRINTK("raid5: run(%s) called.\n", mdname(mddev));
ITERATE_RDEV(mddev,rdev,tmp) {
@@ -1728,6 +1725,10 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
}
/* Ok, everything is just fine now */
+
+ mddev->queue->unplug_fn = raid5_unplug_device;
+ mddev->queue->issue_flush_fn = raid5_issue_flush;
+
mddev->array_size = mddev->size * (mddev->raid_disks - 1);
return 0;
abort:
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 8a33f351e092..f62ea1a73d0d 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1570,6 +1570,8 @@ static int make_request (request_queue_t *q, struct bio * bi)
sector_t logical_sector, last_sector;
struct stripe_head *sh;
+ md_write_start(mddev, bi);
+
if (bio_data_dir(bi)==WRITE) {
disk_stat_inc(mddev->gendisk, writes);
disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
@@ -1583,8 +1585,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
- if ( bio_data_dir(bi) == WRITE )
- md_write_start(mddev);
+
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w);
@@ -1634,7 +1635,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
}
/* FIXME go_faster isn't used */
-static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
{
raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
struct stripe_head *sh;
@@ -1657,8 +1658,8 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
* nothing we can do.
*/
if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- int rv = (mddev->size << 1) - sector_nr;
- md_done_sync(mddev, rv, 1);
+ sector_t rv = (mddev->size << 1) - sector_nr;
+ *skipped = 1;
return rv;
}
@@ -1705,7 +1706,6 @@ static void raid6d (mddev_t *mddev)
PRINTK("+++ raid6d active\n");
md_check_recovery(mddev);
- md_handle_safemode(mddev);
handled = 0;
spin_lock_irq(&conf->device_lock);
@@ -1779,9 +1779,6 @@ static int run (mddev_t *mddev)
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
- mddev->queue->unplug_fn = raid6_unplug_device;
- mddev->queue->issue_flush_fn = raid6_issue_flush;
-
PRINTK("raid6: run(%s) called.\n", mdname(mddev));
ITERATE_RDEV(mddev,rdev,tmp) {
@@ -1895,6 +1892,9 @@ static int run (mddev_t *mddev)
/* Ok, everything is just fine now */
mddev->array_size = mddev->size * (mddev->raid_disks - 2);
+
+ mddev->queue->unplug_fn = raid6_unplug_device;
+ mddev->queue->issue_flush_fn = raid6_issue_flush;
return 0;
abort:
if (conf) {
OpenPOWER on IntegriCloud