diff options
author | Mike Snitzer <snitzer@redhat.com> | 2014-03-21 18:33:41 -0400 |
---|---|---|
committer | Mike Snitzer <snitzer@redhat.com> | 2014-04-04 14:53:03 -0400 |
commit | 67324ea18812bc952ef96892fbd5817b9050413f (patch) | |
tree | 00c8f83280e10b94f3d4cbdf15d468b74cc6438a /drivers/md | |
parent | c140e1c4e23bdaf0a5c00b6a8b6d18f259d39a00 (diff) | |
download | talos-op-linux-67324ea18812bc952ef96892fbd5817b9050413f.tar.gz talos-op-linux-67324ea18812bc952ef96892fbd5817b9050413f.zip |
dm thin: sort the per thin deferred bios using an rb_tree
A thin-pool will allocate blocks using FIFO order for all thin devices
which share the thin-pool. Because of this simplistic allocation the
thin-pool's space can become fragmented quite easily; especially when
multiple threads are requesting blocks in parallel.
Sort each thin device's deferred_bio_list based on logical sector to
help reduce fragmentation of the thin-pool's ondisk layout.
The following tables illustrate the realized gains/potential offered by
sorting each thin device's deferred_bio_list. An "io size"-sized random
read of the device would result in "seeks/io" fragments being read, with
an average "distance/seek" between each fragment.
Data was written to a single thin device using multiple threads via
iozone (8 threads, 64K for both the block_size and io_size).
unsorted:
io size seeks/io distance/seek
--------------------------------------
4k 0.000 0b
16k 0.013 11m
64k 0.065 11m
256k 0.274 10m
1m 1.109 10m
4m 4.411 10m
16m 17.097 11m
64m 60.055 13m
256m 148.798 25m
1g 809.929 21m
sorted:
io size seeks/io distance/seek
--------------------------------------
4k 0.000 0b
16k 0.000 1g
64k 0.001 1g
256k 0.003 1g
1m 0.011 1g
4m 0.045 1g
16m 0.181 1g
64m 0.747 1011m
256m 3.299 1g
1g 14.373 1g
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/dm-thin.c | 84 |
1 files changed, 82 insertions, 2 deletions
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 08e62aef361d..53728be84dee 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -16,6 +16,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/rbtree.h> #define DM_MSG_PREFIX "thin" @@ -230,6 +231,7 @@ struct thin_c { spinlock_t lock; struct bio_list deferred_bio_list; struct bio_list retry_on_resume_list; + struct rb_root sort_bio_list; /* sorted list of deferred bios */ }; /*----------------------------------------------------------------*/ @@ -371,6 +373,7 @@ struct dm_thin_endio_hook { struct dm_deferred_entry *shared_read_entry; struct dm_deferred_entry *all_io_entry; struct dm_thin_new_mapping *overwrite_mapping; + struct rb_node rb_node; }; static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) @@ -1367,12 +1370,77 @@ static int need_commit_due_to_time(struct pool *pool) jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; } +#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node) +#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook)) + +static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio) +{ + struct rb_node **rbp, *parent; + struct dm_thin_endio_hook *pbd; + sector_t bi_sector = bio->bi_iter.bi_sector; + + rbp = &tc->sort_bio_list.rb_node; + parent = NULL; + while (*rbp) { + parent = *rbp; + pbd = thin_pbd(parent); + + if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector) + rbp = &(*rbp)->rb_left; + else + rbp = &(*rbp)->rb_right; + } + + pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + rb_link_node(&pbd->rb_node, parent, rbp); + rb_insert_color(&pbd->rb_node, &tc->sort_bio_list); +} + +static void __extract_sorted_bios(struct thin_c *tc) +{ + struct rb_node *node; + struct dm_thin_endio_hook *pbd; + struct bio *bio; + + for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) { + pbd = thin_pbd(node); + bio = thin_bio(pbd); + + bio_list_add(&tc->deferred_bio_list, bio); + rb_erase(&pbd->rb_node, &tc->sort_bio_list); + } + + WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list)); +} + +static void __sort_thin_deferred_bios(struct thin_c *tc) +{ + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + bio_list_merge(&bios, &tc->deferred_bio_list); + bio_list_init(&tc->deferred_bio_list); + + /* Sort deferred_bio_list using rb-tree */ + while ((bio = bio_list_pop(&bios))) + __thin_bio_rb_add(tc, bio); + + /* + * Transfer the sorted bios in sort_bio_list back to + * deferred_bio_list to allow lockless submission of + * all bios. + */ + __extract_sorted_bios(tc); +} + static void process_thin_deferred_bios(struct thin_c *tc) { struct pool *pool = tc->pool; unsigned long flags; struct bio *bio; struct bio_list bios; + struct blk_plug plug; if (tc->requeue_mode) { requeue_bio_list(tc, &tc->deferred_bio_list); @@ -1382,10 +1450,20 @@ static void process_thin_deferred_bios(struct thin_c *tc) bio_list_init(&bios); spin_lock_irqsave(&tc->lock, flags); + + if (bio_list_empty(&tc->deferred_bio_list)) { + spin_unlock_irqrestore(&tc->lock, flags); + return; + } + + __sort_thin_deferred_bios(tc); + bio_list_merge(&bios, &tc->deferred_bio_list); bio_list_init(&tc->deferred_bio_list); + spin_unlock_irqrestore(&tc->lock, flags); + blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { /* * If we've got no free new_mapping structs, and processing @@ -1405,6 +1483,7 @@ static void process_thin_deferred_bios(struct thin_c *tc) else pool->process_bio(tc, bio); } + blk_finish_plug(&plug); } static void process_deferred_bios(struct pool *pool) @@ -2964,7 +3043,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 11, 0}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -3040,6 +3119,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) spin_lock_init(&tc->lock); bio_list_init(&tc->deferred_bio_list); bio_list_init(&tc->retry_on_resume_list); + tc->sort_bio_list = RB_ROOT; if (argc == 3) { r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); @@ -3287,7 +3367,7 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 11, 0}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, |