summaryrefslogtreecommitdiffstats
path: root/drivers/md/dm.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r--drivers/md/dm.c158
1 files changed, 91 insertions, 67 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 63a7c416b224..2b53c3841b53 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -646,26 +646,38 @@ static void free_tio(struct dm_target_io *tio)
bio_put(&tio->clone);
}
-int md_in_flight(struct mapped_device *md)
+static bool md_in_flight_bios(struct mapped_device *md)
{
- return atomic_read(&md->pending[READ]) +
- atomic_read(&md->pending[WRITE]);
+ int cpu;
+ struct hd_struct *part = &dm_disk(md)->part0;
+ long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
+ sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
+ }
+
+ return sum != 0;
+}
+
+static bool md_in_flight(struct mapped_device *md)
+{
+ if (queue_is_mq(md->queue))
+ return blk_mq_queue_inflight(md->queue);
+ else
+ return md_in_flight_bios(md);
}
static void start_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
- int rw = bio_data_dir(bio);
io->start_time = jiffies;
generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
&dm_disk(md)->part0);
- atomic_set(&dm_disk(md)->part0.in_flight[rw],
- atomic_inc_return(&md->pending[rw]));
-
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
bio->bi_iter.bi_sector, bio_sectors(bio),
@@ -677,8 +689,6 @@ static void end_io_acct(struct dm_io *io)
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
unsigned long duration = jiffies - io->start_time;
- int pending;
- int rw = bio_data_dir(bio);
generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
io->start_time);
@@ -688,16 +698,8 @@ static void end_io_acct(struct dm_io *io)
bio->bi_iter.bi_sector, bio_sectors(bio),
true, duration, &io->stats_aux);
- /*
- * After this is decremented the bio must not be touched if it is
- * a flush.
- */
- pending = atomic_dec_return(&md->pending[rw]);
- atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
- pending += atomic_read(&md->pending[rw^0x1]);
-
/* nudge anyone waiting on suspend queue */
- if (!pending)
+ if (unlikely(waitqueue_active(&md->wait)))
wake_up(&md->wait);
}
@@ -1318,7 +1320,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
__bio_clone_fast(clone, bio);
- if (unlikely(bio_integrity(bio) != NULL)) {
+ if (bio_integrity(bio)) {
int r;
if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
@@ -1334,11 +1336,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
return r;
}
- bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
- clone->bi_iter.bi_size = to_bytes(len);
-
- if (unlikely(bio_integrity(bio) != NULL))
- bio_integrity_trim(clone);
+ bio_trim(clone, sector - clone->bi_iter.bi_sector, len);
return 0;
}
@@ -1417,10 +1415,21 @@ static int __send_empty_flush(struct clone_info *ci)
unsigned target_nr = 0;
struct dm_target *ti;
+ /*
+ * Empty flush uses a statically initialized bio, as the base for
+ * cloning. However, blkg association requires that a bdev is
+ * associated with a gendisk, which doesn't happen until the bdev is
+ * opened. So, blkg association is done at issue time of the flush
+ * rather than when the device is created in alloc_dev().
+ */
+ bio_set_dev(ci->bio, ci->io->md->bdev);
+
BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
+ bio_disassociate_blkg(ci->bio);
+
return 0;
}
@@ -1473,11 +1482,9 @@ static bool is_split_required_for_discard(struct dm_target *ti)
}
static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
- get_num_bios_fn get_num_bios,
- is_split_required_fn is_split_required)
+ unsigned num_bios, bool is_split_required)
{
unsigned len;
- unsigned num_bios;
/*
* Even though the device advertised support for this type of
@@ -1485,11 +1492,10 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *
* reconfiguration might also have changed that since the
* check was performed.
*/
- num_bios = get_num_bios ? get_num_bios(ti) : 0;
if (!num_bios)
return -EOPNOTSUPP;
- if (is_split_required && !is_split_required(ti))
+ if (!is_split_required)
len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
else
len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
@@ -1504,23 +1510,23 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *
static int __send_discard(struct clone_info *ci, struct dm_target *ti)
{
- return __send_changing_extent_only(ci, ti, get_num_discard_bios,
- is_split_required_for_discard);
+ return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti),
+ is_split_required_for_discard(ti));
}
static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
{
- return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL);
+ return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti), false);
}
static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
{
- return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL);
+ return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti), false);
}
static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
{
- return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL);
+ return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti), false);
}
static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
@@ -1578,6 +1584,9 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
ci->sector = bio->bi_iter.bi_sector;
}
+#define __dm_part_stat_sub(part, field, subnd) \
+ (part_stat_get(part, field) -= (subnd))
+
/*
* Entry point to split a bio into clones and submit them to the targets.
*/
@@ -1598,7 +1607,16 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
init_clone_info(&ci, md, map, bio);
if (bio->bi_opf & REQ_PREFLUSH) {
- ci.bio = &ci.io->md->flush_bio;
+ struct bio flush_bio;
+
+ /*
+ * Use an on-stack bio for this, it's safe since we don't
+ * need to reference it after submit. It's just used as
+ * the basis for the clone(s).
+ */
+ bio_init(&flush_bio, NULL, 0);
+ flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ ci.bio = &flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
@@ -1623,7 +1641,21 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
GFP_NOIO, &md->queue->bio_split);
ci.io->orig_bio = b;
+
+ /*
+ * Adjust IO stats for each split, otherwise upon queue
+ * reentry there will be redundant IO accounting.
+ * NOTE: this is a stop-gap fix, a proper fix involves
+ * significant refactoring of DM core's bio splitting
+ * (by eliminating DM's splitting and just using bio_split)
+ */
+ part_stat_lock();
+ __dm_part_stat_sub(&dm_disk(md)->part0,
+ sectors[op_stat_group(bio_op(bio))], ci.sector_count);
+ part_stat_unlock();
+
bio_chain(b, bio);
+ trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
ret = generic_make_request(bio);
break;
}
@@ -1654,7 +1686,16 @@ static blk_qc_t __process_bio(struct mapped_device *md,
init_clone_info(&ci, md, map, bio);
if (bio->bi_opf & REQ_PREFLUSH) {
- ci.bio = &ci.io->md->flush_bio;
+ struct bio flush_bio;
+
+ /*
+ * Use an on-stack bio for this, it's safe since we don't
+ * need to reference it after submit. It's just used as
+ * the basis for the clone(s).
+ */
+ bio_init(&flush_bio, NULL, 0);
+ flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ ci.bio = &flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
@@ -1685,10 +1726,16 @@ out:
return ret;
}
-typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *);
+static blk_qc_t dm_process_bio(struct mapped_device *md,
+ struct dm_table *map, struct bio *bio)
+{
+ if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
+ return __process_bio(md, map, bio);
+ else
+ return __split_and_process_bio(md, map, bio);
+}
-static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio,
- process_bio_fn process_bio)
+static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
{
struct mapped_device *md = q->queuedata;
blk_qc_t ret = BLK_QC_T_NONE;
@@ -1708,26 +1755,12 @@ static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio,
return ret;
}
- ret = process_bio(md, map, bio);
+ ret = dm_process_bio(md, map, bio);
dm_put_live_table(md, srcu_idx);
return ret;
}
-/*
- * The request function that remaps the bio to one target and
- * splits off any remainder.
- */
-static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
-{
- return __dm_make_request(q, bio, __split_and_process_bio);
-}
-
-static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio)
-{
- return __dm_make_request(q, bio, __process_bio);
-}
-
static int dm_any_congested(void *congested_data, int bdi_bits)
{
int r = bdi_bits;
@@ -1898,7 +1931,7 @@ static struct mapped_device *alloc_dev(int minor)
INIT_LIST_HEAD(&md->table_devices);
spin_lock_init(&md->uevent_lock);
- md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
+ md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
if (!md->queue)
goto bad;
md->queue->queuedata = md;
@@ -1908,8 +1941,6 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->disk)
goto bad;
- atomic_set(&md->pending[0], 0);
- atomic_set(&md->pending[1], 0);
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
@@ -1940,10 +1971,6 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->bdev)
goto bad;
- bio_init(&md->flush_bio, NULL, 0);
- bio_set_dev(&md->flush_bio, md->bdev);
- md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
-
dm_stats_init(&md->stats);
/* Populate the mapping, nobody knows we exist yet */
@@ -2221,12 +2248,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
break;
case DM_TYPE_BIO_BASED:
case DM_TYPE_DAX_BIO_BASED:
- dm_init_normal_md_queue(md);
- blk_queue_make_request(md->queue, dm_make_request);
- break;
case DM_TYPE_NVME_BIO_BASED:
dm_init_normal_md_queue(md);
- blk_queue_make_request(md->queue, dm_make_request_nvme);
+ blk_queue_make_request(md->queue, dm_make_request);
break;
case DM_TYPE_NONE:
WARN_ON_ONCE(true);
@@ -2410,9 +2434,9 @@ static void dm_wq_work(struct work_struct *work)
break;
if (dm_request_based(md))
- generic_make_request(c);
+ (void) generic_make_request(c);
else
- __split_and_process_bio(md, map, c);
+ (void) dm_process_bio(md, map, c);
}
dm_put_live_table(md, srcu_idx);
OpenPOWER on IntegriCloud