Diffstat (limited to 'drivers/md/dm-thin.c')
-rw-r--r-- | drivers/md/dm-thin.c | 766
1 file changed, 614 insertions(+), 152 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 4843801173fe..8735543eacdb 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -11,11 +11,13 @@ #include <linux/device-mapper.h> #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> +#include <linux/log2.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/sort.h> #include <linux/rbtree.h> #define DM_MSG_PREFIX "thin" @@ -25,7 +27,6 @@ */ #define ENDIO_HOOK_POOL_SIZE 1024 #define MAPPING_POOL_SIZE 1024 -#define PRISON_CELLS 1024 #define COMMIT_PERIOD HZ #define NO_SPACE_TIMEOUT_SECS 60 @@ -114,7 +115,8 @@ static void build_data_key(struct dm_thin_device *td, { key->virtual = 0; key->dev = dm_thin_dev_id(td); - key->block = b; + key->block_begin = b; + key->block_end = b + 1ULL; } static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, @@ -122,7 +124,55 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, { key->virtual = 1; key->dev = dm_thin_dev_id(td); - key->block = b; + key->block_begin = b; + key->block_end = b + 1ULL; +} + +/*----------------------------------------------------------------*/ + +#define THROTTLE_THRESHOLD (1 * HZ) + +struct throttle { + struct rw_semaphore lock; + unsigned long threshold; + bool throttle_applied; +}; + +static void throttle_init(struct throttle *t) +{ + init_rwsem(&t->lock); + t->throttle_applied = false; +} + +static void throttle_work_start(struct throttle *t) +{ + t->threshold = jiffies + THROTTLE_THRESHOLD; +} + +static void throttle_work_update(struct throttle *t) +{ + if (!t->throttle_applied && jiffies > t->threshold) { + down_write(&t->lock); + t->throttle_applied = true; + } +} + +static void throttle_work_complete(struct throttle *t) +{ + if (t->throttle_applied) { + t->throttle_applied = false; + up_write(&t->lock); + } +} + +static void throttle_lock(struct throttle *t) +{ + down_read(&t->lock); +} + +static void throttle_unlock(struct throttle *t) +{ + up_read(&t->lock); } /*----------------------------------------------------------------*/ @@ -155,8 +205,11 @@ struct pool_features { struct thin_c; typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); +typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell); typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); +#define CELL_SORT_ARRAY_SIZE 8192 + struct pool { struct list_head list; struct dm_target *ti; /* Only set if a pool target is bound */ @@ -171,11 +224,13 @@ struct pool { struct pool_features pf; bool low_water_triggered:1; /* A dm event has been sent */ + bool suspended:1; struct dm_bio_prison *prison; struct dm_kcopyd_client *copier; struct workqueue_struct *wq; + struct throttle throttle; struct work_struct worker; struct delayed_work waker; struct delayed_work no_space_timeout; @@ -198,8 +253,13 @@ struct pool { process_bio_fn process_bio; process_bio_fn process_discard; + process_cell_fn process_cell; + process_cell_fn process_discard_cell; + process_mapping_fn process_prepared_mapping; process_mapping_fn process_prepared_discard; + + struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE]; }; static enum pool_mode get_pool_mode(struct pool *pool); @@ -232,8 +292,11 @@ struct thin_c { struct pool *pool; struct dm_thin_device *td; + struct mapped_device *thin_md; + bool requeue_mode:1; spinlock_t lock; + struct list_head deferred_cells; struct bio_list deferred_bio_list; struct bio_list retry_on_resume_list; 
struct rb_root sort_bio_list; /* sorted list of deferred bios */ @@ -290,6 +353,15 @@ static void cell_release(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } +static void cell_visit_release(struct pool *pool, + void (*fn)(void *, struct dm_bio_prison_cell *), + void *context, + struct dm_bio_prison_cell *cell) +{ + dm_cell_visit_release(pool->prison, fn, context, cell); + dm_bio_prison_free_cell(pool->prison, cell); +} + static void cell_release_no_holder(struct pool *pool, struct dm_bio_prison_cell *cell, struct bio_list *bios) @@ -298,19 +370,6 @@ static void cell_release_no_holder(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } -static void cell_defer_no_holder_no_free(struct thin_c *tc, - struct dm_bio_prison_cell *cell) -{ - struct pool *pool = tc->pool; - unsigned long flags; - - spin_lock_irqsave(&tc->lock, flags); - dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); - - wake_worker(pool); -} - static void cell_error_with_code(struct pool *pool, struct dm_bio_prison_cell *cell, int error_code) { @@ -323,6 +382,16 @@ static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) cell_error_with_code(pool, cell, -EIO); } +static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, 0); +} + +static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE); +} + /*----------------------------------------------------------------*/ /* @@ -393,44 +462,65 @@ struct dm_thin_endio_hook { struct rb_node rb_node; }; -static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) +static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) +{ + bio_list_merge(bios, master); + bio_list_init(master); +} + +static void error_bio_list(struct bio_list *bios, int error) { struct bio *bio; + + while ((bio = bio_list_pop(bios))) + bio_endio(bio, error); +} + +static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error) +{ struct bio_list bios; unsigned long flags; bio_list_init(&bios); spin_lock_irqsave(&tc->lock, flags); - bio_list_merge(&bios, master); - bio_list_init(master); + __merge_bio_list(&bios, master); spin_unlock_irqrestore(&tc->lock, flags); - while ((bio = bio_list_pop(&bios))) - bio_endio(bio, DM_ENDIO_REQUEUE); + error_bio_list(&bios, error); } -static void requeue_io(struct thin_c *tc) +static void requeue_deferred_cells(struct thin_c *tc) { - requeue_bio_list(tc, &tc->deferred_bio_list); - requeue_bio_list(tc, &tc->retry_on_resume_list); + struct pool *pool = tc->pool; + unsigned long flags; + struct list_head cells; + struct dm_bio_prison_cell *cell, *tmp; + + INIT_LIST_HEAD(&cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice_init(&tc->deferred_cells, &cells); + spin_unlock_irqrestore(&tc->lock, flags); + + list_for_each_entry_safe(cell, tmp, &cells, user_list) + cell_requeue(pool, cell); } -static void error_thin_retry_list(struct thin_c *tc) +static void requeue_io(struct thin_c *tc) { - struct bio *bio; - unsigned long flags; struct bio_list bios; + unsigned long flags; bio_list_init(&bios); spin_lock_irqsave(&tc->lock, flags); - bio_list_merge(&bios, &tc->retry_on_resume_list); - bio_list_init(&tc->retry_on_resume_list); + __merge_bio_list(&bios, &tc->deferred_bio_list); + __merge_bio_list(&bios, &tc->retry_on_resume_list); spin_unlock_irqrestore(&tc->lock, flags); - while ((bio = 
bio_list_pop(&bios))) - bio_io_error(bio); + error_bio_list(&bios, DM_ENDIO_REQUEUE); + requeue_deferred_cells(tc); } static void error_retry_list(struct pool *pool) @@ -439,7 +529,7 @@ static void error_retry_list(struct pool *pool) rcu_read_lock(); list_for_each_entry_rcu(tc, &pool->active_thins, list) - error_thin_retry_list(tc); + error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO); rcu_read_unlock(); } @@ -629,33 +719,75 @@ static void overwrite_endio(struct bio *bio, int err) */ /* - * This sends the bios in the cell back to the deferred_bios list. + * This sends the bios in the cell, except the original holder, back + * to the deferred_bios list. */ -static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) { struct pool *pool = tc->pool; unsigned long flags; spin_lock_irqsave(&tc->lock, flags); - cell_release(pool, cell, &tc->deferred_bio_list); + cell_release_no_holder(pool, cell, &tc->deferred_bio_list); spin_unlock_irqrestore(&tc->lock, flags); wake_worker(pool); } -/* - * Same as cell_defer above, except it omits the original holder of the cell. - */ -static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void thin_defer_bio(struct thin_c *tc, struct bio *bio); + +struct remap_info { + struct thin_c *tc; + struct bio_list defer_bios; + struct bio_list issue_bios; +}; + +static void __inc_remap_and_issue_cell(void *context, + struct dm_bio_prison_cell *cell) { - struct pool *pool = tc->pool; - unsigned long flags; + struct remap_info *info = context; + struct bio *bio; - spin_lock_irqsave(&tc->lock, flags); - cell_release_no_holder(pool, cell, &tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); + while ((bio = bio_list_pop(&cell->bios))) { + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) + bio_list_add(&info->defer_bios, bio); + else { + inc_all_io_entry(info->tc->pool, bio); - wake_worker(pool); + /* + * We can't issue the bios with the bio prison lock + * held, so we add them to a list to issue on + * return from this function. + */ + bio_list_add(&info->issue_bios, bio); + } + } +} + +static void inc_remap_and_issue_cell(struct thin_c *tc, + struct dm_bio_prison_cell *cell, + dm_block_t block) +{ + struct bio *bio; + struct remap_info info; + + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); + + /* + * We have to be careful to inc any bios we're about to issue + * before the cell is released, and avoid a race with new bios + * being added to the cell. + */ + cell_visit_release(tc->pool, __inc_remap_and_issue_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(info.tc, bio, block); } static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) @@ -706,10 +838,13 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) * the bios in the cell. 
*/ if (bio) { - cell_defer_no_holder(tc, m->cell); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); bio_endio(bio, 0); - } else - cell_defer(tc, m->cell); + } else { + inc_all_io_entry(tc->pool, m->cell->holder); + remap_and_issue(tc, m->cell->holder, m->data_block); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); + } out: list_del(&m->list); @@ -842,6 +977,20 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m, } } +static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio, + dm_block_t data_block, + struct dm_thin_new_mapping *m) +{ + struct pool *pool = tc->pool; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->overwrite_mapping = m; + m->bio = bio; + save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); + inc_all_io_entry(pool, bio); + remap_and_issue(tc, bio, data_block); +} + /* * A partial copy also needs to zero the uncopied region. */ @@ -876,15 +1025,9 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, * If the whole block of data is being overwritten, we can issue the * bio immediately. Otherwise we use kcopyd to clone the data first. */ - if (io_overwrites_block(pool, bio)) { - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); - - h->overwrite_mapping = m; - m->bio = bio; - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - inc_all_io_entry(pool, bio); - remap_and_issue(tc, bio, data_dest); - } else { + if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_dest, m); + else { struct dm_io_region from, to; from.bdev = origin->bdev; @@ -953,16 +1096,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, if (!pool->pf.zero_new_blocks) process_prepared_mapping(m); - else if (io_overwrites_block(pool, bio)) { - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); - - h->overwrite_mapping = m; - m->bio = bio; - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - inc_all_io_entry(pool, bio); - remap_and_issue(tc, bio, data_block); + else if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_block, m); - } else + else ll_zero(tc, m, data_block * pool->sectors_per_block, (data_block + 1) * pool->sectors_per_block); @@ -1134,29 +1271,25 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c bio_list_init(&bios); cell_release(pool, cell, &bios); - error = should_error_unserviceable_bio(pool); - if (error) - while ((bio = bio_list_pop(&bios))) - bio_endio(bio, error); - else - while ((bio = bio_list_pop(&bios))) - retry_on_resume(bio); + while ((bio = bio_list_pop(&bios))) + retry_on_resume(bio); } -static void process_discard(struct thin_c *tc, struct bio *bio) +static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { int r; - unsigned long flags; + struct bio *bio = cell->holder; struct pool *pool = tc->pool; - struct dm_bio_prison_cell *cell, *cell2; - struct dm_cell_key key, key2; + struct dm_bio_prison_cell *cell2; + struct dm_cell_key key2; dm_block_t block = get_bio_block(tc, bio); struct dm_thin_lookup_result lookup_result; struct dm_thin_new_mapping *m; - build_virtual_key(tc->td, block, &key); - if (bio_detain(tc->pool, &key, bio, &cell)) + if (tc->requeue_mode) { + cell_requeue(pool, cell); return; + } r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { @@ -1187,12 +1320,9 @@ static void process_discard(struct 
thin_c *tc, struct bio *bio) m->cell2 = cell2; m->bio = bio; - if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { - spin_lock_irqsave(&pool->lock, flags); - list_add_tail(&m->list, &pool->prepared_discards); - spin_unlock_irqrestore(&pool->lock, flags); - wake_worker(pool); - } + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) + pool->process_prepared_discard(m); + } else { inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); @@ -1227,6 +1357,19 @@ static void process_discard(struct thin_c *tc, struct bio *bio) } } +static void process_discard_bio(struct thin_c *tc, struct bio *bio) +{ + struct dm_bio_prison_cell *cell; + struct dm_cell_key key; + dm_block_t block = get_bio_block(tc, bio); + + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool, &key, bio, &cell)) + return; + + process_discard_cell(tc, cell); +} + static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, struct dm_cell_key *key, struct dm_thin_lookup_result *lookup_result, @@ -1255,11 +1398,53 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, } } +static void __remap_and_issue_shared_cell(void *context, + struct dm_bio_prison_cell *cell) +{ + struct remap_info *info = context; + struct bio *bio; + + while ((bio = bio_list_pop(&cell->bios))) { + if ((bio_data_dir(bio) == WRITE) || + (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))) + bio_list_add(&info->defer_bios, bio); + else { + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));; + + h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds); + inc_all_io_entry(info->tc->pool, bio); + bio_list_add(&info->issue_bios, bio); + } + } +} + +static void remap_and_issue_shared_cell(struct thin_c *tc, + struct dm_bio_prison_cell *cell, + dm_block_t block) +{ + struct bio *bio; + struct remap_info info; + + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); + + cell_visit_release(tc->pool, __remap_and_issue_shared_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(tc, bio, block); +} + static void process_shared_bio(struct thin_c *tc, struct bio *bio, dm_block_t block, - struct dm_thin_lookup_result *lookup_result) + struct dm_thin_lookup_result *lookup_result, + struct dm_bio_prison_cell *virt_cell) { - struct dm_bio_prison_cell *cell; + struct dm_bio_prison_cell *data_cell; struct pool *pool = tc->pool; struct dm_cell_key key; @@ -1268,19 +1453,23 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, * of being broken so we have nothing further to do here. 
*/ build_data_key(tc->td, lookup_result->block, &key); - if (bio_detain(pool, &key, bio, &cell)) + if (bio_detain(pool, &key, bio, &data_cell)) { + cell_defer_no_holder(tc, virt_cell); return; + } - if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) - break_sharing(tc, bio, block, &key, lookup_result, cell); - else { + if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) { + break_sharing(tc, bio, block, &key, lookup_result, data_cell); + cell_defer_no_holder(tc, virt_cell); + } else { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); inc_all_io_entry(pool, bio); - cell_defer_no_holder(tc, cell); - remap_and_issue(tc, bio, lookup_result->block); + + remap_and_issue_shared_cell(tc, data_cell, lookup_result->block); + remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block); } } @@ -1333,34 +1522,28 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block } } -static void process_bio(struct thin_c *tc, struct bio *bio) +static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { int r; struct pool *pool = tc->pool; + struct bio *bio = cell->holder; dm_block_t block = get_bio_block(tc, bio); - struct dm_bio_prison_cell *cell; - struct dm_cell_key key; struct dm_thin_lookup_result lookup_result; - /* - * If cell is already occupied, then the block is already - * being provisioned so we have nothing further to do here. - */ - build_virtual_key(tc->td, block, &key); - if (bio_detain(pool, &key, bio, &cell)) + if (tc->requeue_mode) { + cell_requeue(pool, cell); return; + } r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: - if (lookup_result.shared) { - process_shared_bio(tc, bio, block, &lookup_result); - cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */ - } else { + if (lookup_result.shared) + process_shared_bio(tc, bio, block, &lookup_result, cell); + else { inc_all_io_entry(pool, bio); - cell_defer_no_holder(tc, cell); - remap_and_issue(tc, bio, lookup_result.block); + inc_remap_and_issue_cell(tc, cell, lookup_result.block); } break; @@ -1394,7 +1577,26 @@ static void process_bio(struct thin_c *tc, struct bio *bio) } } -static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +static void process_bio(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + dm_block_t block = get_bio_block(tc, bio); + struct dm_bio_prison_cell *cell; + struct dm_cell_key key; + + /* + * If cell is already occupied, then the block is already + * being provisioned so we have nothing further to do here. 
+ */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(pool, &key, bio, &cell)) + return; + + process_cell(tc, cell); +} + +static void __process_bio_read_only(struct thin_c *tc, struct bio *bio, + struct dm_bio_prison_cell *cell) { int r; int rw = bio_data_dir(bio); @@ -1404,15 +1606,21 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: - if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) + if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) { handle_unserviceable_bio(tc->pool, bio); - else { + if (cell) + cell_defer_no_holder(tc, cell); + } else { inc_all_io_entry(tc->pool, bio); remap_and_issue(tc, bio, lookup_result.block); + if (cell) + inc_remap_and_issue_cell(tc, cell, lookup_result.block); } break; case -ENODATA: + if (cell) + cell_defer_no_holder(tc, cell); if (rw != READ) { handle_unserviceable_bio(tc->pool, bio); break; @@ -1431,11 +1639,23 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) default: DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", __func__, r); + if (cell) + cell_defer_no_holder(tc, cell); bio_io_error(bio); break; } } +static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +{ + __process_bio_read_only(tc, bio, NULL); +} + +static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + __process_bio_read_only(tc, cell->holder, cell); +} + static void process_bio_success(struct thin_c *tc, struct bio *bio) { bio_endio(bio, 0); @@ -1446,6 +1666,16 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio) bio_io_error(bio); } +static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + cell_success(tc->pool, cell); +} + +static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + cell_error(tc->pool, cell); +} + /* * FIXME: should we also commit due to size of transaction, measured in * metadata blocks? 
@@ -1527,9 +1757,10 @@ static void process_thin_deferred_bios(struct thin_c *tc) struct bio *bio; struct bio_list bios; struct blk_plug plug; + unsigned count = 0; if (tc->requeue_mode) { - requeue_bio_list(tc, &tc->deferred_bio_list); + error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE); return; } @@ -1568,10 +1799,97 @@ static void process_thin_deferred_bios(struct thin_c *tc) pool->process_discard(tc, bio); else pool->process_bio(tc, bio); + + if ((count++ & 127) == 0) { + throttle_work_update(&pool->throttle); + dm_pool_issue_prefetches(pool->pmd); + } } blk_finish_plug(&plug); } +static int cmp_cells(const void *lhs, const void *rhs) +{ + struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs); + struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs); + + BUG_ON(!lhs_cell->holder); + BUG_ON(!rhs_cell->holder); + + if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector) + return -1; + + if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector) + return 1; + + return 0; +} + +static unsigned sort_cells(struct pool *pool, struct list_head *cells) +{ + unsigned count = 0; + struct dm_bio_prison_cell *cell, *tmp; + + list_for_each_entry_safe(cell, tmp, cells, user_list) { + if (count >= CELL_SORT_ARRAY_SIZE) + break; + + pool->cell_sort_array[count++] = cell; + list_del(&cell->user_list); + } + + sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL); + + return count; +} + +static void process_thin_deferred_cells(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + unsigned long flags; + struct list_head cells; + struct dm_bio_prison_cell *cell; + unsigned i, j, count; + + INIT_LIST_HEAD(&cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice_init(&tc->deferred_cells, &cells); + spin_unlock_irqrestore(&tc->lock, flags); + + if (list_empty(&cells)) + return; + + do { + count = sort_cells(tc->pool, &cells); + + for (i = 0; i < count; i++) { + cell = pool->cell_sort_array[i]; + BUG_ON(!cell->holder); + + /* + * If we've got no free new_mapping structs, and processing + * this bio might require one, we pause until there are some + * prepared mappings to process. 
+ */ + if (ensure_next_mapping(pool)) { + for (j = i; j < count; j++) + list_add(&pool->cell_sort_array[j]->user_list, &cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice(&cells, &tc->deferred_cells); + spin_unlock_irqrestore(&tc->lock, flags); + return; + } + + if (cell->holder->bi_rw & REQ_DISCARD) + pool->process_discard_cell(tc, cell); + else + pool->process_cell(tc, cell); + } + } while (!list_empty(&cells)); +} + static void thin_get(struct thin_c *tc); static void thin_put(struct thin_c *tc); @@ -1620,6 +1938,7 @@ static void process_deferred_bios(struct pool *pool) tc = get_first_thin(pool); while (tc) { + process_thin_deferred_cells(tc); process_thin_deferred_bios(tc); tc = get_next_thin(pool, tc); } @@ -1653,9 +1972,15 @@ static void do_worker(struct work_struct *ws) { struct pool *pool = container_of(ws, struct pool, worker); + throttle_work_start(&pool->throttle); + dm_pool_issue_prefetches(pool->pmd); + throttle_work_update(&pool->throttle); process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); + throttle_work_update(&pool->throttle); process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); + throttle_work_update(&pool->throttle); process_deferred_bios(pool); + throttle_work_complete(&pool->throttle); } /* @@ -1792,6 +2117,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_fail; pool->process_discard = process_bio_fail; + pool->process_cell = process_cell_fail; + pool->process_discard_cell = process_cell_fail; pool->process_prepared_mapping = process_prepared_mapping_fail; pool->process_prepared_discard = process_prepared_discard_fail; @@ -1804,6 +2131,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_read_only; pool->process_discard = process_bio_success; + pool->process_cell = process_cell_read_only; + pool->process_discard_cell = process_cell_success; pool->process_prepared_mapping = process_prepared_mapping_fail; pool->process_prepared_discard = process_prepared_discard_passdown; @@ -1822,7 +2151,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) if (old_mode != new_mode) notify_of_pool_mode_change(pool, "out-of-data-space"); pool->process_bio = process_bio_read_only; - pool->process_discard = process_discard; + pool->process_discard = process_discard_bio; + pool->process_cell = process_cell_read_only; + pool->process_discard_cell = process_discard_cell; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard_passdown; @@ -1835,7 +2166,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) notify_of_pool_mode_change(pool, "write"); dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; - pool->process_discard = process_discard; + pool->process_discard = process_discard_bio; + pool->process_cell = process_cell; + pool->process_discard_cell = process_discard_cell; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard; break; @@ -1895,6 +2228,29 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) wake_worker(pool); } +static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + + throttle_lock(&pool->throttle); + thin_defer_bio(tc, bio); + throttle_unlock(&pool->throttle); 
+} + +static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + unsigned long flags; + struct pool *pool = tc->pool; + + throttle_lock(&pool->throttle); + spin_lock_irqsave(&tc->lock, flags); + list_add_tail(&cell->user_list, &tc->deferred_cells); + spin_unlock_irqrestore(&tc->lock, flags); + throttle_unlock(&pool->throttle); + + wake_worker(pool); +} + static void thin_hook_bio(struct thin_c *tc, struct bio *bio) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); @@ -1915,8 +2271,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) dm_block_t block = get_bio_block(tc, bio); struct dm_thin_device *td = tc->td; struct dm_thin_lookup_result result; - struct dm_bio_prison_cell cell1, cell2; - struct dm_bio_prison_cell *cell_result; + struct dm_bio_prison_cell *virt_cell, *data_cell; struct dm_cell_key key; thin_hook_bio(tc, bio); @@ -1932,10 +2287,18 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) } if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { - thin_defer_bio(tc, bio); + thin_defer_bio_with_throttle(tc, bio); return DM_MAPIO_SUBMITTED; } + /* + * We must hold the virtual cell before doing the lookup, otherwise + * there's a race with discard. + */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool, &key, bio, &virt_cell)) + return DM_MAPIO_SUBMITTED; + r = dm_thin_find_block(td, block, 0, &result); /* @@ -1958,23 +2321,19 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * More distant ancestors are irrelevant. The * shared flag will be set in their case. */ - thin_defer_bio(tc, bio); + thin_defer_cell(tc, virt_cell); return DM_MAPIO_SUBMITTED; } - build_virtual_key(tc->td, block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) - return DM_MAPIO_SUBMITTED; - build_data_key(tc->td, result.block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { - cell_defer_no_holder_no_free(tc, &cell1); + if (bio_detain(tc->pool, &key, bio, &data_cell)) { + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } inc_all_io_entry(tc->pool, bio); - cell_defer_no_holder_no_free(tc, &cell2); - cell_defer_no_holder_no_free(tc, &cell1); + cell_defer_no_holder(tc, data_cell); + cell_defer_no_holder(tc, virt_cell); remap(tc, bio, result.block); return DM_MAPIO_REMAPPED; @@ -1986,16 +2345,13 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * of doing so. */ handle_unserviceable_bio(tc->pool, bio); + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } /* fall through */ case -EWOULDBLOCK: - /* - * In future, the failed dm_thin_find_block above could - * provide the hint to load the metadata into cache. - */ - thin_defer_bio(tc, bio); + thin_defer_cell(tc, virt_cell); return DM_MAPIO_SUBMITTED; default: @@ -2005,6 +2361,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * pool is switched to fail-io mode. 
*/ bio_io_error(bio); + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } } @@ -2185,7 +2542,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->sectors_per_block_shift = __ffs(block_size); pool->low_water_blocks = 0; pool_features_init(&pool->pf); - pool->prison = dm_bio_prison_create(PRISON_CELLS); + pool->prison = dm_bio_prison_create(); if (!pool->prison) { *error = "Error creating pool's bio prison"; err_p = ERR_PTR(-ENOMEM); @@ -2211,6 +2568,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, goto bad_wq; } + throttle_init(&pool->throttle); INIT_WORK(&pool->worker, do_worker); INIT_DELAYED_WORK(&pool->waker, do_waker); INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); @@ -2220,6 +2578,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, INIT_LIST_HEAD(&pool->prepared_discards); INIT_LIST_HEAD(&pool->active_thins); pool->low_water_triggered = false; + pool->suspended = true; pool->shared_read_ds = dm_deferred_set_create(); if (!pool->shared_read_ds) { @@ -2756,20 +3115,77 @@ static int pool_preresume(struct dm_target *ti) return 0; } +static void pool_suspend_active_thins(struct pool *pool) +{ + struct thin_c *tc; + + /* Suspend all active thin devices */ + tc = get_first_thin(pool); + while (tc) { + dm_internal_suspend_noflush(tc->thin_md); + tc = get_next_thin(pool, tc); + } +} + +static void pool_resume_active_thins(struct pool *pool) +{ + struct thin_c *tc; + + /* Resume all active thin devices */ + tc = get_first_thin(pool); + while (tc) { + dm_internal_resume(tc->thin_md); + tc = get_next_thin(pool, tc); + } +} + static void pool_resume(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; unsigned long flags; + /* + * Must requeue active_thins' bios and then resume + * active_thins _before_ clearing 'suspend' flag. 
+ */ + requeue_bios(pool); + pool_resume_active_thins(pool); + spin_lock_irqsave(&pool->lock, flags); pool->low_water_triggered = false; + pool->suspended = false; spin_unlock_irqrestore(&pool->lock, flags); - requeue_bios(pool); do_waker(&pool->waker.work); } +static void pool_presuspend(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + pool->suspended = true; + spin_unlock_irqrestore(&pool->lock, flags); + + pool_suspend_active_thins(pool); +} + +static void pool_presuspend_undo(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + unsigned long flags; + + pool_resume_active_thins(pool); + + spin_lock_irqsave(&pool->lock, flags); + pool->suspended = false; + spin_unlock_irqrestore(&pool->lock, flags); +} + static void pool_postsuspend(struct dm_target *ti) { struct pool_c *pt = ti->private; @@ -2941,7 +3357,6 @@ static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct * create_thin <dev_id> * create_snap <dev_id> <origin_id> * delete <dev_id> - * trim <dev_id> <new_size_in_sectors> * set_transaction_id <current_trans_id> <new_trans_id> * reserve_metadata_snap * release_metadata_snap @@ -3169,15 +3584,35 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * If max_sectors is smaller than pool->sectors_per_block adjust it + * to the highest possible power-of-2 factor of pool->sectors_per_block. + * This is especially beneficial when the pool's data device is a RAID + * device that has a full stripe width that matches pool->sectors_per_block + * -- because even though partial RAID stripe-sized IOs will be issued to a + * single RAID stripe; when aggregated they will end on a full RAID stripe + * boundary.. which avoids additional partial RAID stripe writes cascading + */ + if (limits->max_sectors < pool->sectors_per_block) { + while (!is_factor(pool->sectors_per_block, limits->max_sectors)) { + if ((limits->max_sectors & (limits->max_sectors - 1)) == 0) + limits->max_sectors--; + limits->max_sectors = rounddown_pow_of_two(limits->max_sectors); + } + } /* * If the system-determined stacked limits are compatible with the * pool's blocksize (io_opt is a factor) do not override them. 
*/ if (io_opt_sectors < pool->sectors_per_block || - do_div(io_opt_sectors, pool->sectors_per_block)) { - blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); + !is_factor(io_opt_sectors, pool->sectors_per_block)) { + if (is_factor(pool->sectors_per_block, limits->max_sectors)) + blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT); + else + blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); } @@ -3206,11 +3641,13 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, .map = pool_map, + .presuspend = pool_presuspend, + .presuspend_undo = pool_presuspend_undo, .postsuspend = pool_postsuspend, .preresume = pool_preresume, .resume = pool_resume, @@ -3240,14 +3677,14 @@ static void thin_dtr(struct dm_target *ti) struct thin_c *tc = ti->private; unsigned long flags; - thin_put(tc); - wait_for_completion(&tc->can_destroy); - spin_lock_irqsave(&tc->pool->lock, flags); list_del_rcu(&tc->list); spin_unlock_irqrestore(&tc->pool->lock, flags); synchronize_rcu(); + thin_put(tc); + wait_for_completion(&tc->can_destroy); + mutex_lock(&dm_thin_pool_table.mutex); __pool_dec(tc->pool); @@ -3294,7 +3731,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) r = -ENOMEM; goto out_unlock; } + tc->thin_md = dm_table_get_md(ti->table); spin_lock_init(&tc->lock); + INIT_LIST_HEAD(&tc->deferred_cells); bio_list_init(&tc->deferred_bio_list); bio_list_init(&tc->retry_on_resume_list); tc->sort_bio_list = RB_ROOT; @@ -3339,18 +3778,18 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) if (get_pool_mode(tc->pool) == PM_FAIL) { ti->error = "Couldn't open thin device, Pool is in fail mode"; r = -EINVAL; - goto bad_thin_open; + goto bad_pool; } r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); if (r) { ti->error = "Couldn't open thin internal device"; - goto bad_thin_open; + goto bad_pool; } r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); if (r) - goto bad_target_max_io_len; + goto bad; ti->num_flush_bios = 1; ti->flush_supported = true; @@ -3365,14 +3804,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->split_discard_bios = true; } - dm_put(pool_md); - mutex_unlock(&dm_thin_pool_table.mutex); - atomic_set(&tc->refcount, 1); - init_completion(&tc->can_destroy); - spin_lock_irqsave(&tc->pool->lock, flags); + if (tc->pool->suspended) { + spin_unlock_irqrestore(&tc->pool->lock, flags); + mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */ + ti->error = "Unable to activate thin device while pool is suspended"; + r = -EINVAL; + goto bad; + } list_add_tail_rcu(&tc->list, &tc->pool->active_thins); spin_unlock_irqrestore(&tc->pool->lock, flags); /* @@ -3383,11 +3824,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) */ synchronize_rcu(); + dm_put(pool_md); + + atomic_set(&tc->refcount, 1); + init_completion(&tc->can_destroy); + return 0; -bad_target_max_io_len: +bad: dm_pool_close_thin_device(tc->td); -bad_thin_open: +bad_pool: __pool_dec(tc->pool); bad_pool_lookup: dm_put(pool_md); @@ -3533,6 +3979,21 @@ err: DMEMIT("Error"); } +static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct thin_c *tc = ti->private; + 
struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = tc->pool_dev->bdev; + bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + static int thin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { @@ -3557,7 +4018,7 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, @@ -3567,6 +4028,7 @@ static struct target_type thin_target = { .presuspend = thin_presuspend, .postsuspend = thin_postsuspend, .status = thin_status, + .merge = thin_merge, .iterate_devices = thin_iterate_devices, }; |
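
Note on the throttle hunks above: the new struct throttle is a back-pressure mechanism built on an rw_semaphore. do_worker() calls throttle_work_start() at the start of a pass and throttle_work_update() between stages; once a pass has run past THROTTLE_THRESHOLD the worker takes the semaphore exclusively, which stalls submitters (thin_defer_bio_with_throttle() and thin_defer_cell() take it shared) until throttle_work_complete() releases it. Below is a minimal userspace sketch of the same idea, assuming a pthread rwlock and time() in place of the kernel's rw_semaphore and jiffies; it is an illustration, not the driver code.

/*
 * Userspace analogue of the dm-thin throttle: pthread rwlock and time()
 * stand in for the kernel's rw_semaphore and jiffies.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define THROTTLE_THRESHOLD_SECS 1

struct throttle {
	pthread_rwlock_t lock;
	time_t threshold;
	bool throttle_applied;
};

void throttle_init(struct throttle *t)
{
	pthread_rwlock_init(&t->lock, NULL);
	t->throttle_applied = false;
}

/* Worker: start of a pass. */
void throttle_work_start(struct throttle *t)
{
	t->threshold = time(NULL) + THROTTLE_THRESHOLD_SECS;
}

/* Worker: called between stages; applies back-pressure once the pass is slow. */
void throttle_work_update(struct throttle *t)
{
	if (!t->throttle_applied && time(NULL) > t->threshold) {
		pthread_rwlock_wrlock(&t->lock);
		t->throttle_applied = true;
	}
}

/* Worker: end of the pass; lifts the back-pressure if it was applied. */
void throttle_work_complete(struct throttle *t)
{
	if (t->throttle_applied) {
		t->throttle_applied = false;
		pthread_rwlock_unlock(&t->lock);
	}
}

/* Submitters bracket their deferral with these; they block while throttled. */
void throttle_lock(struct throttle *t)   { pthread_rwlock_rdlock(&t->lock); }
void throttle_unlock(struct throttle *t) { pthread_rwlock_unlock(&t->lock); }

int main(void)
{
	struct throttle t;

	throttle_init(&t);
	throttle_work_start(&t);
	sleep(2);			/* simulate a slow worker pass */
	throttle_work_update(&t);	/* threshold exceeded: write lock taken */
	printf("throttled: %d\n", t.throttle_applied);
	throttle_work_complete(&t);	/* submitters may proceed again */
	return 0;
}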
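
Note on the cell-sorting hunks: process_thin_deferred_cells() batches up to CELL_SORT_ARRAY_SIZE deferred prison cells and sorts them by the start sector of each cell's holder bio, so deferred work is issued in roughly ascending LBA order. The sketch below shows the same comparator shape driven by qsort() in userspace; struct cell and the sector values are illustrative stand-ins, not the dm-bio-prison structures.

/* Sort an array of cell pointers by holder start sector, as cmp_cells() does. */
#include <stdio.h>
#include <stdlib.h>

struct cell {
	unsigned long long holder_sector; /* stands in for cell->holder->bi_iter.bi_sector */
};

static int cmp_cells(const void *lhs, const void *rhs)
{
	const struct cell *l = *(const struct cell * const *)lhs;
	const struct cell *r = *(const struct cell * const *)rhs;

	if (l->holder_sector < r->holder_sector)
		return -1;
	if (l->holder_sector > r->holder_sector)
		return 1;
	return 0;
}

int main(void)
{
	struct cell cells[] = { {4096}, {0}, {1024}, {8192}, {512} };
	struct cell *sorted[5];
	unsigned i;

	for (i = 0; i < 5; i++)
		sorted[i] = &cells[i];

	/* Same comparator logic as the kernel's cmp_cells(), fed to qsort(). */
	qsort(sorted, 5, sizeof(sorted[0]), cmp_cells);

	for (i = 0; i < 5; i++)
		printf("%llu\n", sorted[i]->holder_sector);

	return 0;
}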
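
Note on the pool_io_hints() hunk: when the stacked max_sectors is smaller than pool->sectors_per_block, it is reduced to the largest power-of-two factor of the block size, so that split bios still pack into whole thin blocks (and, for RAID data devices, whole stripes). The following standalone sketch mirrors that loop with simplified stand-ins for the kernel's is_factor() and rounddown_pow_of_two() helpers; the values in main() are examples chosen for illustration.

/* Reduce max_sectors to the largest power-of-two factor of the block size. */
#include <stdio.h>

static int is_factor(unsigned block_size, unsigned n)
{
	return n && !(block_size % n);
}

static unsigned rounddown_pow_of_two(unsigned n)
{
	while (n & (n - 1))
		n &= n - 1;	/* clear low bits until only the top bit remains */
	return n;
}

static unsigned adjust_max_sectors(unsigned sectors_per_block, unsigned max_sectors)
{
	if (max_sectors >= sectors_per_block)
		return max_sectors;

	while (!is_factor(sectors_per_block, max_sectors)) {
		if ((max_sectors & (max_sectors - 1)) == 0)
			max_sectors--;
		max_sectors = rounddown_pow_of_two(max_sectors);
	}
	return max_sectors;
}

int main(void)
{
	/* 768 KiB thin blocks (1536 sectors), 504 KiB max_sectors -> 512 sectors */
	printf("%u\n", adjust_max_sectors(1536, 1008));
	/* 640 KiB thin blocks (1280 sectors), 504 KiB max_sectors -> 256 sectors */
	printf("%u\n", adjust_max_sectors(1280, 1008));
	return 0;
}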