summary | refs | log | tree | commit | diff | stats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/Kconfig 22
-rw-r--r-- drivers/md/Makefile 5
-rw-r--r-- drivers/md/bcache/request.c 11
-rw-r--r-- drivers/md/bcache/util.c 2
-rw-r--r-- drivers/md/dm-bufio.c 46
-rw-r--r-- drivers/md/dm-cache-target.c 3
-rw-r--r-- drivers/md/dm-crypt.c 28
-rw-r--r-- drivers/md/dm-exception-store.h 2
-rw-r--r-- drivers/md/dm-io.c 4
-rw-r--r-- drivers/md/dm-kcopyd.c 2
-rw-r--r-- drivers/md/dm-mpath.c 30
-rw-r--r-- drivers/md/dm-snap-persistent.c 5
-rw-r--r-- drivers/md/dm-snap-transient.c 4
-rw-r--r-- drivers/md/dm-snap.c 26
-rw-r--r-- drivers/md/dm-thin-metadata.c 148
-rw-r--r-- drivers/md/dm-thin.c 14
-rw-r--r-- drivers/md/dm-verity-fec.c 818
-rw-r--r-- drivers/md/dm-verity-fec.h 152
-rw-r--r-- drivers/md/dm-verity-target.c (renamed from drivers/md/dm-verity.c) 602
-rw-r--r-- drivers/md/dm-verity.h 129
-rw-r--r-- drivers/md/dm.c 13
-rw-r--r-- drivers/md/md.c 557
-rw-r--r-- drivers/md/md.h 48
-rw-r--r-- drivers/md/persistent-data/Kconfig 9
-rw-r--r-- drivers/md/persistent-data/dm-block-manager.c 11
-rw-r--r-- drivers/md/persistent-data/dm-btree.c 118
-rw-r--r-- drivers/md/persistent-data/dm-btree.h 14
-rw-r--r-- drivers/md/persistent-data/dm-space-map-metadata.c 29
-rw-r--r-- drivers/md/raid10.c 4
29 files changed, 1897 insertions, 959 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3e01e6fb3424..0a2e7273db9e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -123,6 +123,7 @@ config MD_RAID456
tristate "RAID-4/RAID-5/RAID-6 mode"
depends on BLK_DEV_MD
select RAID6_PQ
+ select LIBCRC32C
select ASYNC_MEMCPY
select ASYNC_XOR
select ASYNC_PQ
@@ -239,6 +240,15 @@ config DM_BUFIO
as a cache, holding recently-read blocks in memory and performing
delayed writes.
+config DM_DEBUG_BLOCK_STACK_TRACING
+ bool "Keep stack trace of persistent data block lock holders"
+ depends on STACKTRACE_SUPPORT && DM_BUFIO
+ select STACKTRACE
+ ---help---
+ Enable this for messages that may help debug problems with the
+ block manager locking used by thin provisioning and caching.
+
+ If unsure, say N.
config DM_BIO_PRISON
tristate
depends on BLK_DEV_DM
@@ -457,6 +467,18 @@ config DM_VERITY
If unsure, say N.
+config DM_VERITY_FEC
+ bool "Verity forward error correction support"
+ depends on DM_VERITY
+ select REED_SOLOMON
+ select REED_SOLOMON_DEC8
+ ---help---
+ Add forward error correction support to dm-verity. This option
+ makes it possible to use pre-generated error correction data to
+ recover from corrupted blocks.
+
+ If unsure, say N.
+
config DM_SWITCH
tristate "Switch target support (EXPERIMENTAL)"
depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index f34979cd141a..62a65764e8e0 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -16,6 +16,7 @@ dm-cache-mq-y += dm-cache-policy-mq.o
dm-cache-smq-y += dm-cache-policy-smq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
+dm-verity-y += dm-verity-target.o
md-mod-y += md.o bitmap.o
raid456-y += raid5.o raid5-cache.o
@@ -63,3 +64,7 @@ obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
endif
+
+ifeq ($(CONFIG_DM_VERITY_FEC),y)
+dm-verity-objs += dm-verity-fec.o
+endif
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 8e9877b04637..25fa8445bb24 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -958,7 +958,8 @@ static void cached_dev_nodata(struct closure *cl)
/* Cached devices - read & write stuff */
-static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t cached_dev_make_request(struct request_queue *q,
+ struct bio *bio)
{
struct search *s;
struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
@@ -997,6 +998,8 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
else
generic_make_request(bio);
}
+
+ return BLK_QC_T_NONE;
}
static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
@@ -1070,7 +1073,8 @@ static void flash_dev_nodata(struct closure *cl)
continue_at(cl, search_free, NULL);
}
-static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t flash_dev_make_request(struct request_queue *q,
+ struct bio *bio)
{
struct search *s;
struct closure *cl;
@@ -1093,7 +1097,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
continue_at_nobarrier(&s->cl,
flash_dev_nodata,
bcache_wq);
- return;
+ return BLK_QC_T_NONE;
} else if (rw) {
bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
&KEY(d->id, bio->bi_iter.bi_sector, 0),
@@ -1109,6 +1113,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
}
continue_at(cl, search_free, NULL);
+ return BLK_QC_T_NONE;
}
static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index db3ae4c2b223..dde6172f3f10 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -230,7 +230,7 @@ void bch_bio_map(struct bio *bio, void *base)
BUG_ON(!bio->bi_iter.bi_size);
BUG_ON(bio->bi_vcnt);
- bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
+ bv->bv_offset = base ? offset_in_page(base) : 0;
goto start;
for (; size; bio->bi_vcnt++, bv++) {
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 2dd33085b331..cd77216beff1 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -16,6 +16,7 @@
#include <linux/shrinker.h>
#include <linux/module.h>
#include <linux/rbtree.h>
+#include <linux/stacktrace.h>
#define DM_MSG_PREFIX "bufio"
@@ -149,6 +150,11 @@ struct dm_buffer {
struct list_head write_list;
struct bio bio;
struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+#define MAX_STACK 10
+ struct stack_trace stack_trace;
+ unsigned long stack_entries[MAX_STACK];
+#endif
};
/*----------------------------------------------------------------*/
@@ -253,6 +259,17 @@ static LIST_HEAD(dm_bufio_all_clients);
*/
static DEFINE_MUTEX(dm_bufio_clients_lock);
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+static void buffer_record_stack(struct dm_buffer *b)
+{
+ b->stack_trace.nr_entries = 0;
+ b->stack_trace.max_entries = MAX_STACK;
+ b->stack_trace.entries = b->stack_entries;
+ b->stack_trace.skip = 2;
+ save_stack_trace(&b->stack_trace);
+}
+#endif
+
/*----------------------------------------------------------------
* A red/black tree acts as an index for all the buffers.
*--------------------------------------------------------------*/
@@ -454,6 +471,9 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
adjust_total_allocated(b->data_mode, (long)c->block_size);
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ memset(&b->stack_trace, 0, sizeof(b->stack_trace));
+#endif
return b;
}
@@ -630,7 +650,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
do {
if (!bio_add_page(&b->bio, virt_to_page(ptr),
len < PAGE_SIZE ? len : PAGE_SIZE,
- virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
+ offset_in_page(ptr))) {
BUG_ON(b->c->block_size <= PAGE_SIZE);
use_dmio(b, rw, block, end_io);
return;
@@ -1063,12 +1083,16 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
dm_bufio_lock(c);
b = __bufio_new(c, block, nf, &need_submit, &write_list);
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ if (b && b->hold_count == 1)
+ buffer_record_stack(b);
+#endif
dm_bufio_unlock(c);
__flush_write_list(&write_list);
if (!b)
- return b;
+ return NULL;
if (need_submit)
submit_io(b, READ, b->block, read_endio);
@@ -1462,6 +1486,7 @@ static void drop_buffers(struct dm_bufio_client *c)
{
struct dm_buffer *b;
int i;
+ bool warned = false;
BUG_ON(dm_bufio_in_request());
@@ -1476,9 +1501,21 @@ static void drop_buffers(struct dm_bufio_client *c)
__free_buffer_wake(b);
for (i = 0; i < LIST_SIZE; i++)
- list_for_each_entry(b, &c->lru[i], lru_list)
+ list_for_each_entry(b, &c->lru[i], lru_list) {
+ WARN_ON(!warned);
+ warned = true;
DMERR("leaked buffer %llx, hold count %u, list %d",
(unsigned long long)b->block, b->hold_count, i);
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ print_stack_trace(&b->stack_trace, 1);
+ b->hold_count = 0; /* mark unclaimed to avoid BUG_ON below */
+#endif
+ }
+
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ while ((b = __get_unclaimed_buffer(c)))
+ __free_buffer_wake(b);
+#endif
for (i = 0; i < LIST_SIZE; i++)
BUG_ON(!list_empty(&c->lru[i]));
@@ -1891,8 +1928,7 @@ static void __exit dm_bufio_exit(void)
bug = 1;
}
- if (bug)
- BUG();
+ BUG_ON(bug);
}
module_init(dm_bufio_init)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 2fd4c8296144..5780accffa30 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -118,14 +118,12 @@ static void iot_io_end(struct io_tracker *iot, sector_t len)
*/
struct dm_hook_info {
bio_end_io_t *bi_end_io;
- void *bi_private;
};
static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
bio_end_io_t *bi_end_io, void *bi_private)
{
h->bi_end_io = bio->bi_end_io;
- h->bi_private = bio->bi_private;
bio->bi_end_io = bi_end_io;
bio->bi_private = bi_private;
@@ -134,7 +132,6 @@ static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
bio->bi_end_io = h->bi_end_io;
- bio->bi_private = h->bi_private;
}
/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3729b394432c..3147c8d09ea8 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -112,7 +112,8 @@ struct iv_tcw_private {
* and encrypts / decrypts at the same time.
*/
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
- DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
+ DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD,
+ DM_CRYPT_EXIT_THREAD};
/*
* The fields in here must be read only after initialization.
@@ -994,7 +995,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
struct bio_vec *bvec;
retry:
- if (unlikely(gfp_mask & __GFP_WAIT))
+ if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
mutex_lock(&cc->bio_alloc_lock);
clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
@@ -1010,7 +1011,7 @@ retry:
if (!page) {
crypt_free_buffer_pages(cc, clone);
bio_put(clone);
- gfp_mask |= __GFP_WAIT;
+ gfp_mask |= __GFP_DIRECT_RECLAIM;
goto retry;
}
@@ -1027,7 +1028,7 @@ retry:
}
return_clone:
- if (unlikely(gfp_mask & __GFP_WAIT))
+ if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
mutex_unlock(&cc->bio_alloc_lock);
return clone;
@@ -1203,20 +1204,18 @@ continue_locked:
if (!RB_EMPTY_ROOT(&cc->write_tree))
goto pop_from_list;
+ if (unlikely(test_bit(DM_CRYPT_EXIT_THREAD, &cc->flags))) {
+ spin_unlock_irq(&cc->write_thread_wait.lock);
+ break;
+ }
+
__set_current_state(TASK_INTERRUPTIBLE);
__add_wait_queue(&cc->write_thread_wait, &wait);
spin_unlock_irq(&cc->write_thread_wait.lock);
- if (unlikely(kthread_should_stop())) {
- set_task_state(current, TASK_RUNNING);
- remove_wait_queue(&cc->write_thread_wait, &wait);
- break;
- }
-
schedule();
- set_task_state(current, TASK_RUNNING);
spin_lock_irq(&cc->write_thread_wait.lock);
__remove_wait_queue(&cc->write_thread_wait, &wait);
goto continue_locked;
@@ -1531,8 +1530,13 @@ static void crypt_dtr(struct dm_target *ti)
if (!cc)
return;
- if (cc->write_thread)
+ if (cc->write_thread) {
+ spin_lock_irq(&cc->write_thread_wait.lock);
+ set_bit(DM_CRYPT_EXIT_THREAD, &cc->flags);
+ wake_up_locked(&cc->write_thread_wait);
+ spin_unlock_irq(&cc->write_thread_wait.lock);
kthread_stop(cc->write_thread);
+ }
if (cc->io_queue)
destroy_workqueue(cc->io_queue);
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index fae34e7a0b1e..12b5216c2cfe 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -69,7 +69,7 @@ struct dm_exception_store_type {
* Update the metadata with this exception.
*/
void (*commit_exception) (struct dm_exception_store *store,
- struct dm_exception *e,
+ struct dm_exception *e, int valid,
void (*callback) (void *, int success),
void *callback_context);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 81c5e1a1f363..06d426eb5a30 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -246,7 +246,7 @@ static void vm_dp_init(struct dpages *dp, void *data)
{
dp->get_page = vm_get_page;
dp->next_page = vm_next_page;
- dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
+ dp->context_u = offset_in_page(data);
dp->context_ptr = data;
}
@@ -271,7 +271,7 @@ static void km_dp_init(struct dpages *dp, void *data)
{
dp->get_page = km_get_page;
dp->next_page = km_next_page;
- dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
+ dp->context_u = offset_in_page(data);
dp->context_ptr = data;
}
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 3a7cade5e27d..1452ed9aacb4 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -244,7 +244,7 @@ static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
*pages = NULL;
do {
- pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY);
+ pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
if (unlikely(!pl)) {
/* Use reserved pages */
pl = kc->pages;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index aaa6caa46a9f..cfa29f574c2a 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1537,32 +1537,34 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
struct block_device **bdev, fmode_t *mode)
{
struct multipath *m = ti->private;
- struct pgpath *pgpath;
unsigned long flags;
int r;
- r = 0;
-
spin_lock_irqsave(&m->lock, flags);
if (!m->current_pgpath)
__choose_pgpath(m, 0);
- pgpath = m->current_pgpath;
-
- if (pgpath) {
- *bdev = pgpath->path.dev->bdev;
- *mode = pgpath->path.dev->mode;
+ if (m->current_pgpath) {
+ if (!m->queue_io) {
+ *bdev = m->current_pgpath->path.dev->bdev;
+ *mode = m->current_pgpath->path.dev->mode;
+ r = 0;
+ } else {
+ /* pg_init has not started or completed */
+ r = -ENOTCONN;
+ }
+ } else {
+ /* No path is available */
+ if (m->queue_if_no_path)
+ r = -ENOTCONN;
+ else
+ r = -EIO;
}
- if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
- r = -ENOTCONN;
- else if (!*bdev)
- r = -EIO;
-
spin_unlock_irqrestore(&m->lock, flags);
- if (r == -ENOTCONN && !fatal_signal_pending(current)) {
+ if (r == -ENOTCONN) {
spin_lock_irqsave(&m->lock, flags);
if (!m->current_pg) {
/* Path status changed, redo selection */
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 3164b8bce294..4d3909393f2c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -695,7 +695,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
}
static void persistent_commit_exception(struct dm_exception_store *store,
- struct dm_exception *e,
+ struct dm_exception *e, int valid,
void (*callback) (void *, int success),
void *callback_context)
{
@@ -704,6 +704,9 @@ static void persistent_commit_exception(struct dm_exception_store *store,
struct core_exception ce;
struct commit_callback *cb;
+ if (!valid)
+ ps->valid = 0;
+
ce.old_chunk = e->old_chunk;
ce.new_chunk = e->new_chunk;
write_exception(ps, ps->current_committed++, &ce);
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index 9b7c8c8049d6..4d50a12cf00c 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -52,12 +52,12 @@ static int transient_prepare_exception(struct dm_exception_store *store,
}
static void transient_commit_exception(struct dm_exception_store *store,
- struct dm_exception *e,
+ struct dm_exception *e, int valid,
void (*callback) (void *, int success),
void *callback_context)
{
/* Just succeed */
- callback(callback_context, 1);
+ callback(callback_context, valid);
}
static void transient_usage(struct dm_exception_store *store,
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c06b74e91cd6..3766386080a4 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -207,7 +207,6 @@ struct dm_snap_pending_exception {
*/
struct bio *full_bio;
bio_end_io_t *full_bio_end_io;
- void *full_bio_private;
};
/*
@@ -1438,8 +1437,9 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
dm_table_event(s->ti->table);
}
-static void pending_complete(struct dm_snap_pending_exception *pe, int success)
+static void pending_complete(void *context, int success)
{
+ struct dm_snap_pending_exception *pe = context;
struct dm_exception *e;
struct dm_snapshot *s = pe->snap;
struct bio *origin_bios = NULL;
@@ -1485,10 +1485,8 @@ out:
snapshot_bios = bio_list_get(&pe->snapshot_bios);
origin_bios = bio_list_get(&pe->origin_bios);
full_bio = pe->full_bio;
- if (full_bio) {
+ if (full_bio)
full_bio->bi_end_io = pe->full_bio_end_io;
- full_bio->bi_private = pe->full_bio_private;
- }
increment_pending_exceptions_done_count();
up_write(&s->lock);
@@ -1509,24 +1507,13 @@ out:
free_pending_exception(pe);
}
-static void commit_callback(void *context, int success)
-{
- struct dm_snap_pending_exception *pe = context;
-
- pending_complete(pe, success);
-}
-
static void complete_exception(struct dm_snap_pending_exception *pe)
{
struct dm_snapshot *s = pe->snap;
- if (unlikely(pe->copy_error))
- pending_complete(pe, 0);
-
- else
- /* Update the metadata if we are persistent */
- s->store->type->commit_exception(s->store, &pe->e,
- commit_callback, pe);
+ /* Update the metadata if we are persistent */
+ s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error,
+ pending_complete, pe);
}
/*
@@ -1605,7 +1592,6 @@ static void start_full_bio(struct dm_snap_pending_exception *pe,
pe->full_bio = bio;
pe->full_bio_end_io = bio->bi_end_io;
- pe->full_bio_private = bio->bi_private;
callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
copy_callback, pe);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 1fa45695b68a..f962d6453afd 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1207,6 +1207,12 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
dm_block_t held_root;
/*
+ * We commit to ensure the btree roots which we increment in a
+ * moment are up to date.
+ */
+ __commit_transaction(pmd);
+
+ /*
* Copy the superblock.
*/
dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
@@ -1389,8 +1395,21 @@ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
return td->snapshotted_time > time;
}
-int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
- int can_issue_io, struct dm_thin_lookup_result *result)
+static void unpack_lookup_result(struct dm_thin_device *td, __le64 value,
+ struct dm_thin_lookup_result *result)
+{
+ uint64_t block_time = 0;
+ dm_block_t exception_block;
+ uint32_t exception_time;
+
+ block_time = le64_to_cpu(value);
+ unpack_block_time(block_time, &exception_block, &exception_time);
+ result->block = exception_block;
+ result->shared = __snapshotted_since(td, exception_time);
+}
+
+static int __find_block(struct dm_thin_device *td, dm_block_t block,
+ int can_issue_io, struct dm_thin_lookup_result *result)
{
int r;
__le64 value;
@@ -1398,39 +1417,56 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
dm_block_t keys[2] = { td->id, block };
struct dm_btree_info *info;
- down_read(&pmd->root_lock);
- if (pmd->fail_io) {
- up_read(&pmd->root_lock);
- return -EINVAL;
- }
-
if (can_issue_io) {
info = &pmd->info;
} else
info = &pmd->nb_info;
r = dm_btree_lookup(info, pmd->root, keys, &value);
- if (!r) {
- uint64_t block_time = 0;
- dm_block_t exception_block;
- uint32_t exception_time;
-
- block_time = le64_to_cpu(value);
- unpack_block_time(block_time, &exception_block,
- &exception_time);
- result->block = exception_block;
- result->shared = __snapshotted_since(td, exception_time);
+ if (!r)
+ unpack_lookup_result(td, value, result);
+
+ return r;
+}
+
+int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
+ int can_issue_io, struct dm_thin_lookup_result *result)
+{
+ int r;
+ struct dm_pool_metadata *pmd = td->pmd;
+
+ down_read(&pmd->root_lock);
+ if (pmd->fail_io) {
+ up_read(&pmd->root_lock);
+ return -EINVAL;
}
+ r = __find_block(td, block, can_issue_io, result);
+
up_read(&pmd->root_lock);
return r;
}
-/* FIXME: write a more efficient one in btree */
-int dm_thin_find_mapped_range(struct dm_thin_device *td,
- dm_block_t begin, dm_block_t end,
- dm_block_t *thin_begin, dm_block_t *thin_end,
- dm_block_t *pool_begin, bool *maybe_shared)
+static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block,
+ dm_block_t *vblock,
+ struct dm_thin_lookup_result *result)
+{
+ int r;
+ __le64 value;
+ struct dm_pool_metadata *pmd = td->pmd;
+ dm_block_t keys[2] = { td->id, block };
+
+ r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value);
+ if (!r)
+ unpack_lookup_result(td, value, result);
+
+ return r;
+}
+
+static int __find_mapped_range(struct dm_thin_device *td,
+ dm_block_t begin, dm_block_t end,
+ dm_block_t *thin_begin, dm_block_t *thin_end,
+ dm_block_t *pool_begin, bool *maybe_shared)
{
int r;
dm_block_t pool_end;
@@ -1439,21 +1475,11 @@ int dm_thin_find_mapped_range(struct dm_thin_device *td,
if (end < begin)
return -ENODATA;
- /*
- * Find first mapped block.
- */
- while (begin < end) {
- r = dm_thin_find_block(td, begin, true, &lookup);
- if (r) {
- if (r != -ENODATA)
- return r;
- } else
- break;
-
- begin++;
- }
+ r = __find_next_mapped_block(td, begin, &begin, &lookup);
+ if (r)
+ return r;
- if (begin == end)
+ if (begin >= end)
return -ENODATA;
*thin_begin = begin;
@@ -1463,7 +1489,7 @@ int dm_thin_find_mapped_range(struct dm_thin_device *td,
begin++;
pool_end = *pool_begin + 1;
while (begin != end) {
- r = dm_thin_find_block(td, begin, true, &lookup);
+ r = __find_block(td, begin, true, &lookup);
if (r) {
if (r == -ENODATA)
break;
@@ -1483,6 +1509,24 @@ int dm_thin_find_mapped_range(struct dm_thin_device *td,
return 0;
}
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+ dm_block_t begin, dm_block_t end,
+ dm_block_t *thin_begin, dm_block_t *thin_end,
+ dm_block_t *pool_begin, bool *maybe_shared)
+{
+ int r = -EINVAL;
+ struct dm_pool_metadata *pmd = td->pmd;
+
+ down_read(&pmd->root_lock);
+ if (!pmd->fail_io) {
+ r = __find_mapped_range(td, begin, end, thin_begin, thin_end,
+ pool_begin, maybe_shared);
+ }
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
static int __insert(struct dm_thin_device *td, dm_block_t block,
dm_block_t data_block)
{
@@ -1538,7 +1582,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
{
int r;
- unsigned count;
+ unsigned count, total_count = 0;
struct dm_pool_metadata *pmd = td->pmd;
dm_block_t keys[1] = { td->id };
__le64 value;
@@ -1561,11 +1605,29 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_
if (r)
return r;
- r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
- if (r)
- return r;
+ /*
+ * Remove leaves stops at the first unmapped entry, so we have to
+ * loop round finding mapped ranges.
+ */
+ while (begin < end) {
+ r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value);
+ if (r == -ENODATA)
+ break;
+
+ if (r)
+ return r;
+
+ if (begin >= end)
+ break;
+
+ r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
+ if (r)
+ return r;
+
+ total_count += count;
+ }
- td->mapped_blocks -= count;
+ td->mapped_blocks -= total_count;
td->changed = 1;
/*
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 3897b90bd462..72d91f477683 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2432,6 +2432,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
case PM_WRITE:
if (old_mode != new_mode)
notify_of_pool_mode_change(pool, "write");
+ pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
dm_pool_metadata_read_write(pool->pmd);
pool->process_bio = process_bio;
pool->process_discard = process_discard_bio;
@@ -3452,8 +3453,8 @@ static void pool_postsuspend(struct dm_target *ti)
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- cancel_delayed_work(&pool->waker);
- cancel_delayed_work(&pool->no_space_timeout);
+ cancel_delayed_work_sync(&pool->waker);
+ cancel_delayed_work_sync(&pool->no_space_timeout);
flush_workqueue(pool->wq);
(void) commit(pool);
}
@@ -3885,7 +3886,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 16, 0},
+ .version = {1, 17, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -4249,10 +4250,9 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct thin_c *tc = ti->private;
struct pool *pool = tc->pool;
- struct queue_limits *pool_limits = dm_get_queue_limits(pool->pool_md);
- if (!pool_limits->discard_granularity)
- return; /* pool's discard support is disabled */
+ if (!pool->pf.discard_enabled)
+ return;
limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
@@ -4260,7 +4260,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 16, 0},
+ .version = {1, 17, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
new file mode 100644
index 000000000000..1cc10c4de701
--- /dev/null
+++ b/drivers/md/dm-verity-fec.c
@@ -0,0 +1,818 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * Author: Sami Tolvanen <samitolvanen@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include "dm-verity-fec.h"
+#include <linux/math64.h>
+
+#define DM_MSG_PREFIX "verity-fec"
+
+/*
+ * Report whether forward error correction is configured: true only when the
+ * FEC state has been allocated and a parity device has been set on it.
+ */
+bool verity_fec_is_enabled(struct dm_verity *v)
+{
+	if (!v->fec)
+		return false;
+
+	return v->fec->dev != NULL;
+}
+
+/*
+ * Locate the FEC per-bio state, which lives directly behind dm_verity_io and
+ * its variable length trailing fields.
+ */
+static inline struct dm_verity_fec_io *fec_io(struct dm_verity_io *io)
+{
+	struct dm_verity_fec_io *fio;
+
+	fio = (struct dm_verity_fec_io *) verity_io_digest_end(io->v, io);
+
+	return fio;
+}
+
+/*
+ * Map a byte offset within an RS block to its interleaved offset.
+ */
+static inline u64 fec_interleave(struct dm_verity *v, u64 offset)
+{
+	u64 quotient = offset;
+	u32 remainder;
+
+	/* do_div() leaves the quotient in its first argument */
+	remainder = do_div(quotient, v->fec->rsn);
+
+	return quotient +
+	       remainder * (v->fec->rounds << v->data_dev_block_bits);
+}
+
+/*
+ * Run the Reed-Solomon decoder over one RS block. @data holds the rsn data
+ * bytes, @fec the parity bytes and @neras the number of entries in
+ * fio->erasures. Returns whatever decode_rs8() returns.
+ */
+static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio,
+			  u8 *data, u8 *fec, int neras)
+{
+	uint16_t parity[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN];
+	int n = 0;
+
+	/* copy the parity bytes into the uint16_t array handed to decode_rs8 */
+	while (n < v->fec->roots) {
+		parity[n] = fec[n];
+		n++;
+	}
+
+	return decode_rs8(fio->rs, data, parity, v->fec->rsn, NULL, neras,
+			  fio->erasures, 0, NULL);
+}
+
+/*
+ * Read the parity block holding the error-correcting codes for RS block
+ * @rsb at position @index. On success returns a pointer to the block data,
+ * stores the byte offset of the parity bytes within it in @offset, and the
+ * caller must release @buf. On failure returns the ERR_PTR from
+ * dm_bufio_read() and sets *@buf to NULL.
+ */
+static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index,
+			   unsigned *offset, struct dm_buffer **buf)
+{
+	u64 byte_pos = (index + rsb) * v->fec->roots;
+	u64 blk = byte_pos >> v->data_dev_block_bits;
+	u8 *data;
+
+	*offset = (unsigned)(byte_pos - (blk << v->data_dev_block_bits));
+
+	data = dm_bufio_read(v->fec->bufio, v->fec->start + blk, buf);
+	if (likely(!IS_ERR(data)))
+		return data;
+
+	DMERR("%s: FEC %llu: parity read failed (block %llu): %ld",
+	      v->data_dev->name, (unsigned long long)rsb,
+	      (unsigned long long)(v->fec->start + blk),
+	      PTR_ERR(data));
+	*buf = NULL;
+
+	return data;
+}
+
+/*
+ * Loop over each preallocated buffer slot, i.e. indices
+ * 0 .. DM_VERITY_FEC_BUF_PREALLOC - 1. __i must be an assignable integer
+ * variable; it is used as the loop counter.
+ */
+#define fec_for_each_prealloc_buffer(__i) \
+ for (__i = 0; __i < DM_VERITY_FEC_BUF_PREALLOC; __i++)
+
+/*
+ * Loop over each extra buffer slot, i.e. indices
+ * DM_VERITY_FEC_BUF_PREALLOC .. DM_VERITY_FEC_BUF_MAX - 1. The io argument
+ * is not referenced by the expansion.
+ */
+#define fec_for_each_extra_buffer(io, __i) \
+ for (__i = DM_VERITY_FEC_BUF_PREALLOC; __i < DM_VERITY_FEC_BUF_MAX; __i++)
+
+/*
+ * Loop over each allocated buffer: indices 0 .. (io)->nbufs - 1.
+ */
+#define fec_for_each_buffer(io, __i) \
+ for (__i = 0; __i < (io)->nbufs; __i++)
+
+/*
+ * Loop over each RS block in each allocated buffer. Expands to two nested
+ * for loops (__i over buffers, __j over the 1 << DM_VERITY_FEC_BUF_RS_BITS
+ * RS blocks in a buffer), so a break in the body only leaves the inner loop.
+ */
+#define fec_for_each_buffer_rs_block(io, __i, __j) \
+ fec_for_each_buffer(io, __i) \
+ for (__j = 0; __j < 1 << DM_VERITY_FEC_BUF_RS_BITS; __j++)
+
+/*
+ * Return a pointer to RS block @j of buffer @i; intended for use inside
+ * fec_for_each_buffer_rs_block. Each RS block occupies fec->rsn bytes.
+ */
+static inline u8 *fec_buffer_rs_block(struct dm_verity *v,
+				      struct dm_verity_fec_io *fio,
+				      unsigned i, unsigned j)
+{
+	u8 *buffer = fio->bufs[i];
+
+	return buffer + j * v->fec->rsn;
+}
+
+/*
+ * Return the running index of RS block @j of buffer @i, counted across all
+ * buffers; intended for use inside fec_for_each_buffer_rs_block.
+ */
+static inline unsigned fec_buffer_rs_index(unsigned i, unsigned j)
+{
+	unsigned first_in_buffer = i << DM_VERITY_FEC_BUF_RS_BITS;
+
+	return first_in_buffer + j;
+}
+
+/*
+ * Decode all RS blocks from buffers and copy corrected bytes into fio->output
+ * starting from block_offset.
+ *
+ * @byte_index selects which byte of every decoded RS block belongs to the
+ * target block; @neras is the number of erasures recorded in fio->erasures.
+ *
+ * Returns the number of corrected errors, or a negative value if a parity
+ * read or RS decoding failed. The parity buffer obtained from
+ * fec_read_parity() is released on every path that still holds one.
+ */
+static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio,
+			   u64 rsb, int byte_index, unsigned block_offset,
+			   int neras)
+{
+	int r, corrected = 0, res;
+	struct dm_buffer *buf;
+	unsigned n, i, offset;
+	u8 *par, *block;
+
+	par = fec_read_parity(v, rsb, block_offset, &offset, &buf);
+	if (IS_ERR(par))
+		return PTR_ERR(par);
+
+	/*
+	 * Decode the RS blocks we have in bufs. Each RS block results in
+	 * one corrected target byte and consumes fec->roots parity bytes.
+	 */
+	fec_for_each_buffer_rs_block(fio, n, i) {
+		block = fec_buffer_rs_block(v, fio, n, i);
+		res = fec_decode_rs8(v, fio, block, &par[offset], neras);
+		if (res < 0) {
+			r = res;
+			goto error;
+		}
+
+		corrected += res;
+		fio->output[block_offset] = block[byte_index];
+
+		block_offset++;
+		if (block_offset >= 1 << v->data_dev_block_bits)
+			goto done;
+
+		/* read the next block when we run out of parity bytes */
+		offset += v->fec->roots;
+		if (offset >= 1 << v->data_dev_block_bits) {
+			dm_bufio_release(buf);
+
+			/* on failure no buffer is held, so return directly */
+			par = fec_read_parity(v, rsb, block_offset, &offset, &buf);
+			if (unlikely(IS_ERR(par)))
+				return PTR_ERR(par);
+		}
+	}
+done:
+	r = corrected;
+error:
+	/* both the success and the decode-failure path still hold buf */
+	dm_bufio_release(buf);
+
+	if (r < 0 && neras)
+		DMERR_LIMIT("%s: FEC %llu: failed to correct: %d",
+			    v->data_dev->name, (unsigned long long)rsb, r);
+	else if (r > 0)
+		DMWARN_LIMIT("%s: FEC %llu: corrected %d errors",
+			     v->data_dev->name, (unsigned long long)rsb, r);
+
+	return r;
+}
+
+/*
+ * Decide whether a data block is an erasure by hashing @data and comparing
+ * the result with @want_digest. Returns nonzero on a mismatch, and 0 when
+ * the digests match or the hash could not be computed.
+ */
+static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
+			  u8 *want_digest, u8 *data)
+{
+	u8 *real_digest = verity_io_real_digest(v, io);
+	int r;
+
+	r = verity_hash(v, verity_io_hash_desc(v, io), data,
+			1 << v->data_dev_block_bits, real_digest);
+	if (unlikely(r))
+		return 0;
+
+	return memcmp(real_digest, want_digest, v->digest_size) != 0;
+}
+
+/*
+ * Read data blocks that are part of the RS block and deinterleave as much as
+ * fits into buffers. Check for erasure locations if @neras is non-NULL.
+ *
+ * Returns the index of @target among the rsn blocks of the RS block, or -1
+ * if @target was not one of them. Every buffer obtained from dm_bufio_read()
+ * is released before moving on to the next block.
+ */
+static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
+			 u64 rsb, u64 target, unsigned block_offset,
+			 int *neras)
+{
+	bool is_zero;
+	int i, j, target_index = -1;
+	struct dm_buffer *buf;
+	struct dm_bufio_client *bufio;
+	struct dm_verity_fec_io *fio = fec_io(io);
+	u64 block, ileaved;
+	u8 *bbuf, *rs_block;
+	u8 want_digest[v->digest_size];
+	unsigned n, k;
+
+	if (neras)
+		*neras = 0;
+
+	/*
+	 * read each of the rsn data blocks that are part of the RS block, and
+	 * interleave contents to available bufs
+	 */
+	for (i = 0; i < v->fec->rsn; i++) {
+		ileaved = fec_interleave(v, rsb * v->fec->rsn + i);
+
+		/*
+		 * target is the data block we want to correct, target_index is
+		 * the index of this block within the rsn RS blocks
+		 */
+		if (ileaved == target)
+			target_index = i;
+
+		block = ileaved >> v->data_dev_block_bits;
+		bufio = v->fec->data_bufio;
+
+		if (block >= v->data_blocks) {
+			block -= v->data_blocks;
+
+			/*
+			 * blocks outside the area were assumed to contain
+			 * zeros when encoding data was generated
+			 */
+			if (unlikely(block >= v->fec->hash_blocks))
+				continue;
+
+			block += v->hash_start;
+			bufio = v->bufio;
+		}
+
+		bbuf = dm_bufio_read(bufio, block, &buf);
+		if (unlikely(IS_ERR(bbuf))) {
+			DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld",
+				     v->data_dev->name,
+				     (unsigned long long)rsb,
+				     (unsigned long long)block, PTR_ERR(bbuf));
+
+			/* assume the block is corrupted */
+			if (neras && *neras <= v->fec->roots)
+				fio->erasures[(*neras)++] = i;
+
+			/* nothing was read, so there is no buffer to release */
+			continue;
+		}
+
+		/* locate erasures if the block is on the data device */
+		if (bufio == v->fec->data_bufio &&
+		    verity_hash_for_block(v, io, block, want_digest,
+					  &is_zero) == 0) {
+			/*
+			 * skip known zero blocks entirely, but still drop the
+			 * buffer reference taken by dm_bufio_read()
+			 */
+			if (is_zero)
+				goto done;
+
+			/*
+			 * skip if we have already found the theoretical
+			 * maximum number (i.e. fec->roots) of erasures
+			 */
+			if (neras && *neras <= v->fec->roots &&
+			    fec_is_erasure(v, io, want_digest, bbuf))
+				fio->erasures[(*neras)++] = i;
+		}
+
+		/*
+		 * deinterleave and copy the bytes that fit into bufs,
+		 * starting from block_offset
+		 */
+		fec_for_each_buffer_rs_block(fio, n, j) {
+			k = fec_buffer_rs_index(n, j) + block_offset;
+
+			if (k >= 1 << v->data_dev_block_bits)
+				goto done;
+
+			rs_block = fec_buffer_rs_block(v, fio, n, j);
+			rs_block[i] = bbuf[k];
+		}
+done:
+		dm_bufio_release(buf);
+	}
+
+	return target_index;
+}
+
+/*
+ * Allocate RS control structure and FEC buffers from preallocated mempools,
+ * and attempt to allocate as many extra buffers as available.
+ *
+ * Anything already present in @fio is kept, so the function can be called
+ * again for a second decoding attempt. Sets fio->nbufs to the number of
+ * usable buffers. Returns 0 on success or -ENOMEM.
+ */
+static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
+{
+	unsigned n;
+
+	if (!fio->rs) {
+		/* same mask as the other mandatory allocations below */
+		fio->rs = mempool_alloc(v->fec->rs_pool, GFP_NOIO);
+		if (unlikely(!fio->rs)) {
+			DMERR("failed to allocate RS");
+			return -ENOMEM;
+		}
+	}
+
+	fec_for_each_prealloc_buffer(n) {
+		if (fio->bufs[n])
+			continue;
+
+		fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO);
+		if (unlikely(!fio->bufs[n])) {
+			DMERR("failed to allocate FEC buffer");
+			return -ENOMEM;
+		}
+	}
+
+	/* try to allocate the maximum number of buffers */
+	fec_for_each_extra_buffer(fio, n) {
+		if (fio->bufs[n])
+			continue;
+
+		/*
+		 * Extra buffers are optional, so do not wait for memory:
+		 * take what is available right now and stop at the first
+		 * failure.
+		 */
+		fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOWAIT);
+		/* we can manage with even one buffer if necessary */
+		if (unlikely(!fio->bufs[n]))
+			break;
+	}
+	fio->nbufs = n;
+
+	if (!fio->output) {
+		fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO);
+
+		if (!fio->output) {
+			DMERR("failed to allocate FEC page");
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Zero every allocated buffer and the erasure list. fec_read_bufs() relies
+ * on the buffers being zeroed before it deinterleaves into them.
+ */
+static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
+{
+	unsigned idx;
+
+	for (idx = 0; idx < fio->nbufs; idx++)
+		memset(fio->bufs[idx], 0,
+		       v->fec->rsn << DM_VERITY_FEC_BUF_RS_BITS);
+
+	memset(fio->erasures, 0, sizeof(fio->erasures));
+}
+
+/*
+ * Decode every RS block that contributes to one data block and leave the
+ * reconstructed target block (identified by @offset) in fio->output. When
+ * @use_erasures is set, verity hashes are used to locate erasures.
+ *
+ * Returns 0 if the reconstructed block matches the wanted digest, -EILSEQ if
+ * it does not, or the negative value of whichever step failed.
+ */
+static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
+			  struct dm_verity_fec_io *fio, u64 rsb, u64 offset,
+			  bool use_erasures)
+{
+	int neras = 0;
+	int *neras_arg = use_erasures ? &neras : NULL;
+	unsigned pos = 0;
+	int r;
+
+	r = fec_alloc_bufs(v, fio);
+	if (unlikely(r < 0))
+		return r;
+
+	/* work through the block in chunks of as many bytes as bufs can hold */
+	while (pos < 1 << v->data_dev_block_bits) {
+		fec_init_bufs(v, fio);
+
+		r = fec_read_bufs(v, io, rsb, offset, pos, neras_arg);
+		if (unlikely(r < 0))
+			return r;
+
+		/* r is the index of the target block within the RS block */
+		r = fec_decode_bufs(v, fio, rsb, r, pos, neras);
+		if (r < 0)
+			return r;
+
+		pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS;
+	}
+
+	/* Always re-validate the corrected block against the expected hash */
+	r = verity_hash(v, verity_io_hash_desc(v, io), fio->output,
+			1 << v->data_dev_block_bits,
+			verity_io_real_digest(v, io));
+	if (unlikely(r < 0))
+		return r;
+
+	if (!memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io),
+		    v->digest_size))
+		return 0;
+
+	DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)",
+		    v->data_dev->name, (unsigned long long)rsb, neras);
+
+	return -EILSEQ;
+}
+
+/*
+ * Copy the next @len bytes of the corrected output into @data and advance
+ * fio->output_pos. Passed to verity_for_bv_block() by verity_fec_decode().
+ * Always returns 0.
+ */
+static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data,
+		       size_t len)
+{
+	struct dm_verity_fec_io *fio = fec_io(io);
+	u8 *src = fio->output + fio->output_pos;
+
+	fio->output_pos += len;
+	memcpy(data, src, len);
+
+	return 0;
+}
+
+/*
+ * Correct errors in a block. Copies corrected block to dest if non-NULL,
+ * otherwise to a bio_vec starting from iter.
+ *
+ * @type says whether @block is a data or a metadata block number; metadata
+ * blocks are placed after the data blocks in the FEC-covered area.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if FEC is not enabled, or the negative
+ * error of the failed decoding attempt.
+ */
+int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
+		      enum verity_block_type type, sector_t block, u8 *dest,
+		      struct bvec_iter *iter)
+{
+	int r;
+	struct dm_verity_fec_io *fio = fec_io(io);
+	u64 offset, res, rsb;
+
+	if (!verity_fec_is_enabled(v))
+		return -EOPNOTSUPP;
+
+	if (type == DM_VERITY_BLOCK_TYPE_METADATA)
+		block += v->data_blocks;
+
+	/*
+	 * For RS(M, N), the continuous FEC data is divided into blocks of N
+	 * bytes. Since block size may not be divisible by N, the last block
+	 * is zero padded when decoding.
+	 *
+	 * Each byte of the block is covered by a different RS(M, N) code,
+	 * and each code is interleaved over N blocks to make it less likely
+	 * that bursty corruption will leave us in unrecoverable state.
+	 */
+
+	offset = block << v->data_dev_block_bits;
+
+	/*
+	 * div64_u64() returns the quotient and leaves its arguments alone,
+	 * so the result has to be assigned.
+	 */
+	res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits);
+
+	/*
+	 * The base RS block we can feed to the interleaver to find out all
+	 * blocks required for decoding.
+	 */
+	rsb = offset - res * (v->fec->rounds << v->data_dev_block_bits);
+
+	/*
+	 * Locating erasures is slow, so attempt to recover the block without
+	 * them first. Do a second attempt with erasures if the corruption is
+	 * bad enough.
+	 */
+	r = fec_decode_rsb(v, io, fio, rsb, offset, false);
+	if (r < 0) {
+		r = fec_decode_rsb(v, io, fio, rsb, offset, true);
+		if (r < 0)
+			return r;
+	}
+
+	if (dest)
+		memcpy(dest, fio->output, 1 << v->data_dev_block_bits);
+	else if (iter) {
+		fio->output_pos = 0;
+		r = verity_for_bv_block(v, io, iter, fec_bv_copy);
+	}
+
+	return r;
+}
+
+/*
+ * Release everything the FEC code attached to a bio's per-bio data: the RS
+ * control structure, all buffer slots and the output buffer. Does nothing
+ * when FEC is not enabled.
+ */
+void verity_fec_finish_io(struct dm_verity_io *io)
+{
+	struct dm_verity_fec_io *fio = fec_io(io);
+	struct dm_verity_fec *fec = io->v->fec;
+	unsigned slot;
+
+	if (!verity_fec_is_enabled(io->v))
+		return;
+
+	mempool_free(fio->rs, fec->rs_pool);
+
+	/* the first slots come from prealloc_pool, the rest from extra_pool */
+	for (slot = 0; slot < DM_VERITY_FEC_BUF_MAX; slot++) {
+		if (slot < DM_VERITY_FEC_BUF_PREALLOC)
+			mempool_free(fio->bufs[slot], fec->prealloc_pool);
+		else
+			mempool_free(fio->bufs[slot], fec->extra_pool);
+	}
+
+	mempool_free(fio->output, fec->output_pool);
+}
+
+/*
+ * Reset the FEC part of a bio's per-bio data so that nothing is considered
+ * allocated yet. Does nothing when FEC is not enabled.
+ */
+void verity_fec_init_io(struct dm_verity_io *io)
+{
+	struct dm_verity_fec_io *fio;
+
+	if (!verity_fec_is_enabled(io->v))
+		return;
+
+	fio = fec_io(io);
+
+	memset(fio->bufs, 0, sizeof(fio->bufs));
+	fio->nbufs = 0;
+	fio->rs = NULL;
+	fio->output = NULL;
+}
+
+/*
+ * Emit the FEC feature arguments and their values into the status table
+ * buffer. Returns the updated length; @sz is returned unchanged when FEC is
+ * not enabled.
+ */
+unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz,
+				 char *result, unsigned maxlen)
+{
+	if (verity_fec_is_enabled(v)) {
+		/* DMEMIT() works on the sz, result and maxlen parameters */
+		DMEMIT(" " DM_VERITY_OPT_FEC_DEV " %s "
+		       DM_VERITY_OPT_FEC_BLOCKS " %llu "
+		       DM_VERITY_OPT_FEC_START " %llu "
+		       DM_VERITY_OPT_FEC_ROOTS " %d",
+		       v->fec->dev->name,
+		       (unsigned long long)v->fec->blocks,
+		       (unsigned long long)v->fec->start,
+		       v->fec->roots);
+	}
+
+	return sz;
+}
+
+/*
+ * Tear down the FEC state of a target: destroy the mempools and the buffer
+ * cache, destroy the bufio clients, drop the parity device and free v->fec.
+ * Safe to call on a partially constructed state, which is what is left
+ * behind when verity_fec_ctr() fails half way.
+ */
+void verity_fec_dtr(struct dm_verity *v)
+{
+	struct dm_verity_fec *f = v->fec;
+
+	if (!verity_fec_is_enabled(v))
+		goto out;
+
+	mempool_destroy(f->rs_pool);
+	mempool_destroy(f->prealloc_pool);
+	mempool_destroy(f->extra_pool);
+	mempool_destroy(f->output_pool);
+	kmem_cache_destroy(f->cache);
+
+	/*
+	 * verity_fec_ctr() stores the dm_bufio_client_create() result
+	 * directly, so these may hold an ERR_PTR rather than NULL.
+	 */
+	if (!IS_ERR_OR_NULL(f->data_bufio))
+		dm_bufio_client_destroy(f->data_bufio);
+	if (!IS_ERR_OR_NULL(f->bufio))
+		dm_bufio_client_destroy(f->bufio);
+
+	if (f->dev)
+		dm_put_device(v->ti, f->dev);
+out:
+	kfree(f);
+	v->fec = NULL;
+}
+
+/*
+ * mempool allocation callback for rs_pool: create an rs_control for the
+ * configured number of roots. @pool_data is the struct dm_verity that was
+ * passed to mempool_create(); @gfp_mask is not used.
+ */
+static void *fec_rs_alloc(gfp_t gfp_mask, void *pool_data)
+{
+	struct dm_verity *v = pool_data;
+
+	return init_rs(8, 0x11d, 0, 1, v->fec->roots);
+}
+
+/*
+ * mempool free callback for rs_pool: release an rs_control, ignoring NULL.
+ */
+static void fec_rs_free(void *element, void *pool_data)
+{
+	struct rs_control *rs = element;
+
+	if (!rs)
+		return;
+
+	free_rs(rs);
+}
+
+/*
+ * Return true if @arg_name is one of the FEC feature argument names
+ * (compared case-insensitively).
+ */
+bool verity_is_fec_opt_arg(const char *arg_name)
+{
+	if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV))
+		return true;
+	if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS))
+		return true;
+	if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_START))
+		return true;
+
+	return !strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS);
+}
+
+/*
+ * Parse the value of one FEC feature argument. @arg_name is the argument
+ * name that was already taken from @as; its value is shifted off @as here
+ * and *@argc is decremented accordingly.
+ *
+ * Returns 0 on success. On failure returns a negative errno and points
+ * ti->error at a description.
+ */
+int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
+			      unsigned *argc, const char *arg_name)
+{
+	int r;
+	struct dm_target *ti = v->ti;
+	const char *arg_value;
+	unsigned long long num_ll;
+	unsigned char num_c;
+	char dummy;
+
+	if (!*argc) {
+		ti->error = "FEC feature arguments require a value";
+		return -EINVAL;
+	}
+
+	arg_value = dm_shift_arg(as);
+	(*argc)--;
+
+	if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) {
+		/*
+		 * Only one device reference is ever put back, so a second
+		 * lookup would leak the first one.
+		 */
+		if (v->fec->dev) {
+			ti->error = "FEC device already specified";
+			return -EINVAL;
+		}
+
+		r = dm_get_device(ti, arg_value, FMODE_READ, &v->fec->dev);
+		if (r) {
+			ti->error = "FEC device lookup failed";
+			return r;
+		}
+
+	} else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS)) {
+		/* reject values whose sector count does not fit in sector_t */
+		if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 ||
+		    ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT))
+		     >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) {
+			ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS;
+			return -EINVAL;
+		}
+		v->fec->blocks = num_ll;
+
+	} else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_START)) {
+		/* same overflow check as for the block count */
+		if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 ||
+		    ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) >>
+		     (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) {
+			ti->error = "Invalid " DM_VERITY_OPT_FEC_START;
+			return -EINVAL;
+		}
+		v->fec->start = num_ll;
+
+	} else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)) {
+		/* roots must keep N within [MIN_RSN, MAX_RSN] */
+		if (sscanf(arg_value, "%hhu%c", &num_c, &dummy) != 1 || !num_c ||
+		    num_c < (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MAX_RSN) ||
+		    num_c > (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN)) {
+			ti->error = "Invalid " DM_VERITY_OPT_FEC_ROOTS;
+			return -EINVAL;
+		}
+		v->fec->roots = num_c;
+
+	} else {
+		ti->error = "Unrecognized verity FEC feature request";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate a zeroed dm_verity_fec and attach it as v->fec. Has to run before
+ * verity_fec_ctr. Returns 0, or -ENOMEM with ti->error set.
+ */
+int verity_fec_ctr_alloc(struct dm_verity *v)
+{
+	struct dm_verity_fec *fec;
+
+	fec = kzalloc(sizeof(struct dm_verity_fec), GFP_KERNEL);
+	if (fec) {
+		v->fec = fec;
+		return 0;
+	}
+
+	v->ti->error = "Cannot allocate FEC structure";
+
+	return -ENOMEM;
+}
+
+/*
+ * Validate arguments and preallocate memory. Must be called after arguments
+ * have been parsed using verity_fec_parse_opt_args.
+ *
+ * If no FEC device was configured, the FEC state is torn down with
+ * verity_fec_dtr() and 0 is returned. Otherwise this checks the parsed
+ * parameters, creates bufio clients for the parity and the data device, sets
+ * up the RS, buffer and output mempools, and grows ti->per_bio_data_size by
+ * sizeof(struct dm_verity_fec_io).
+ *
+ * Returns 0 on success or a negative errno with ti->error set. Nothing that
+ * was set up here is released on the error paths; presumably the caller runs
+ * verity_fec_dtr() on failure - confirm.
+ *
+ * NOTE(review): in the "FEC device is too small" check, f->start (described
+ * as a block count in struct dm_verity_fec) is added to f->rounds * f->roots
+ * and the sum is then shifted right by the block bits before being compared
+ * with a device size - confirm the intended units.
+ */
+int verity_fec_ctr(struct dm_verity *v)
+{
+ struct dm_verity_fec *f = v->fec;
+ struct dm_target *ti = v->ti;
+ u64 hash_blocks;
+
+ if (!verity_fec_is_enabled(v)) {
+ verity_fec_dtr(v);
+ return 0;
+ }
+
+ /*
+ * FEC is computed over data blocks, possible metadata, and
+ * hash blocks. In other words, FEC covers total of fec_blocks
+ * blocks consisting of the following:
+ *
+ * data blocks | hash blocks | metadata (optional)
+ *
+ * We allow metadata after hash blocks to support a use case
+ * where all data is stored on the same device and FEC covers
+ * the entire area.
+ *
+ * If metadata is included, we require it to be available on the
+ * hash device after the hash blocks.
+ */
+
+ hash_blocks = v->hash_blocks - v->hash_start;
+
+ /*
+ * Require matching block sizes for data and hash devices for
+ * simplicity.
+ */
+ if (v->data_dev_block_bits != v->hash_dev_block_bits) {
+ ti->error = "Block sizes must match to use FEC";
+ return -EINVAL;
+ }
+
+ if (!f->roots) {
+ ti->error = "Missing " DM_VERITY_OPT_FEC_ROOTS;
+ return -EINVAL;
+ }
+ f->rsn = DM_VERITY_FEC_RSM - f->roots; /* N = M - roots */
+
+ if (!f->blocks) {
+ ti->error = "Missing " DM_VERITY_OPT_FEC_BLOCKS;
+ return -EINVAL;
+ }
+
+ f->rounds = f->blocks;
+ if (sector_div(f->rounds, f->rsn))
+ f->rounds++; /* round up when blocks is not a multiple of rsn */
+
+ /*
+ * Due to optional metadata, f->blocks can be larger than
+ * data_blocks and hash_blocks combined.
+ */
+ if (f->blocks < v->data_blocks + hash_blocks || !f->rounds) {
+ ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS;
+ return -EINVAL;
+ }
+
+ /*
+ * Metadata is accessed through the hash device, so we require
+ * it to be large enough.
+ */
+ f->hash_blocks = f->blocks - v->data_blocks;
+ if (dm_bufio_get_device_size(v->bufio) < f->hash_blocks) {
+ ti->error = "Hash device is too small for "
+ DM_VERITY_OPT_FEC_BLOCKS;
+ return -E2BIG;
+ }
+
+ f->bufio = dm_bufio_client_create(f->dev->bdev,
+ 1 << v->data_dev_block_bits,
+ 1, 0, NULL, NULL);
+ if (IS_ERR(f->bufio)) {
+ ti->error = "Cannot initialize FEC bufio client";
+ return PTR_ERR(f->bufio); /* f->bufio keeps the ERR_PTR value */
+ }
+
+ if (dm_bufio_get_device_size(f->bufio) <
+ ((f->start + f->rounds * f->roots) >> v->data_dev_block_bits)) {
+ ti->error = "FEC device is too small";
+ return -E2BIG;
+ }
+
+ f->data_bufio = dm_bufio_client_create(v->data_dev->bdev,
+ 1 << v->data_dev_block_bits,
+ 1, 0, NULL, NULL);
+ if (IS_ERR(f->data_bufio)) {
+ ti->error = "Cannot initialize FEC data bufio client";
+ return PTR_ERR(f->data_bufio); /* f->data_bufio keeps the ERR_PTR value */
+ }
+
+ if (dm_bufio_get_device_size(f->data_bufio) < v->data_blocks) {
+ ti->error = "Data device is too small";
+ return -E2BIG;
+ }
+
+ /*
+ * Preallocate an rs_control structure for each worker thread
+ * (the pool is sized by num_online_cpus() at construction time).
+ */
+ f->rs_pool = mempool_create(num_online_cpus(), fec_rs_alloc,
+ fec_rs_free, (void *) v);
+ if (!f->rs_pool) {
+ ti->error = "Cannot allocate RS pool";
+ return -ENOMEM;
+ }
+
+ f->cache = kmem_cache_create("dm_verity_fec_buffers",
+ f->rsn << DM_VERITY_FEC_BUF_RS_BITS,
+ 0, 0, NULL);
+ if (!f->cache) {
+ ti->error = "Cannot create FEC buffer cache";
+ return -ENOMEM;
+ }
+
+ /* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */
+ f->prealloc_pool = mempool_create_slab_pool(num_online_cpus() *
+ DM_VERITY_FEC_BUF_PREALLOC,
+ f->cache);
+ if (!f->prealloc_pool) {
+ ti->error = "Cannot allocate FEC buffer prealloc pool";
+ return -ENOMEM;
+ }
+
+ f->extra_pool = mempool_create_slab_pool(0, f->cache); /* no reserved elements */
+ if (!f->extra_pool) {
+ ti->error = "Cannot allocate FEC buffer extra pool";
+ return -ENOMEM;
+ }
+
+ /* Preallocate an output buffer for each thread */
+ f->output_pool = mempool_create_kmalloc_pool(num_online_cpus(),
+ 1 << v->data_dev_block_bits);
+ if (!f->output_pool) {
+ ti->error = "Cannot allocate FEC output pool";
+ return -ENOMEM;
+ }
+
+ /* Reserve space for our per-bio data */
+ ti->per_bio_data_size += sizeof(struct dm_verity_fec_io);
+
+ return 0;
+}
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
new file mode 100644
index 000000000000..7fa0298b995e
--- /dev/null
+++ b/drivers/md/dm-verity-fec.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * Author: Sami Tolvanen <samitolvanen@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#ifndef DM_VERITY_FEC_H
+#define DM_VERITY_FEC_H
+
+#include "dm-verity.h"
+#include <linux/rslib.h>
+
+/*
+ * Reed-Solomon(M, N) parameters. The number of parity bytes (roots) is
+ * M - N, so the N limits below bound the accepted fec_roots values to
+ * M - MAX_RSN .. M - MIN_RSN.
+ */
+#define DM_VERITY_FEC_RSM 255
+#define DM_VERITY_FEC_MAX_RSN 253
+#define DM_VERITY_FEC_MIN_RSN 231 /* ~10% space overhead */
+
+/* buffers for deinterleaving and decoding */
+#define DM_VERITY_FEC_BUF_PREALLOC 1 /* buffers to preallocate */
+#define DM_VERITY_FEC_BUF_RS_BITS 4 /* log2(RS blocks per buffer) */
+/*
+ * we need buffers for at most 1 << block size RS blocks; the slot count is
+ * derived from PAGE_SHIFT
+ */
+#define DM_VERITY_FEC_BUF_MAX \
+ (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS))
+
+#define DM_VERITY_OPT_FEC_DEV "use_fec_from_device"
+#define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks"
+#define DM_VERITY_OPT_FEC_START "fec_start"
+#define DM_VERITY_OPT_FEC_ROOTS "fec_roots"
+
+/*
+ * configuration
+ *
+ * Per-target FEC state. Allocated zeroed by verity_fec_ctr_alloc(), filled
+ * in by verity_fec_parse_opt_args() and verity_fec_ctr(), and freed by
+ * verity_fec_dtr().
+ */
+struct dm_verity_fec {
+ struct dm_dev *dev; /* parity data device */
+ struct dm_bufio_client *data_bufio; /* for data dev access */
+ struct dm_bufio_client *bufio; /* for parity data access */
+ sector_t start; /* parity data start in blocks */
+ sector_t blocks; /* number of blocks covered */
+ sector_t rounds; /* number of interleaving rounds */
+ sector_t hash_blocks; /* blocks covered after v->hash_start */
+ unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */
+ unsigned char rsn; /* N of RS(M, N) */
+ mempool_t *rs_pool; /* mempool for fio->rs */
+ mempool_t *prealloc_pool; /* mempool for preallocated buffers */
+ mempool_t *extra_pool; /* mempool for extra buffers */
+ mempool_t *output_pool; /* mempool for output */
+ struct kmem_cache *cache; /* cache for buffers */
+};
+
+/*
+ * per-bio data
+ *
+ * Stored after dm_verity_io and its variable length fields. The pointers
+ * are reset by verity_fec_init_io(), filled on demand by the decoding path
+ * and released by verity_fec_finish_io().
+ */
+struct dm_verity_fec_io {
+ struct rs_control *rs; /* Reed-Solomon state */
+ int erasures[DM_VERITY_FEC_MAX_RSN]; /* erasures for decode_rs8 */
+ u8 *bufs[DM_VERITY_FEC_BUF_MAX]; /* bufs for deinterleaving */
+ unsigned nbufs; /* number of buffers allocated */
+ u8 *output; /* buffer for corrected output */
+ size_t output_pos; /* next offset in output to copy out to a bio_vec */
+};
+
+#ifdef CONFIG_DM_VERITY_FEC
+
+/* each feature parameter requires a value */
+#define DM_VERITY_OPTS_FEC 8
+
+extern bool verity_fec_is_enabled(struct dm_verity *v);
+
+extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
+ enum verity_block_type type, sector_t block,
+ u8 *dest, struct bvec_iter *iter);
+
+extern unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz,
+ char *result, unsigned maxlen);
+
+extern void verity_fec_finish_io(struct dm_verity_io *io);
+extern void verity_fec_init_io(struct dm_verity_io *io);
+
+extern bool verity_is_fec_opt_arg(const char *arg_name);
+extern int verity_fec_parse_opt_args(struct dm_arg_set *as,
+ struct dm_verity *v, unsigned *argc,
+ const char *arg_name);
+
+extern void verity_fec_dtr(struct dm_verity *v);
+
+extern int verity_fec_ctr_alloc(struct dm_verity *v);
+extern int verity_fec_ctr(struct dm_verity *v);
+
+#else /* !CONFIG_DM_VERITY_FEC */
+
+#define DM_VERITY_OPTS_FEC 0 /* no FEC feature arguments without CONFIG_DM_VERITY_FEC */
+
+static inline bool verity_fec_is_enabled(struct dm_verity *v)
+{
+ return false; /* FEC is compiled out, so it is never enabled */
+}
+
+static inline int verity_fec_decode(struct dm_verity *v,
+ struct dm_verity_io *io,
+ enum verity_block_type type,
+ sector_t block, u8 *dest,
+ struct bvec_iter *iter)
+{
+ return -EOPNOTSUPP; /* same value the real function returns when FEC is disabled */
+}
+
+static inline unsigned verity_fec_status_table(struct dm_verity *v,
+ unsigned sz, char *result,
+ unsigned maxlen)
+{
+ return sz; /* nothing emitted: length is unchanged */
+}
+
+static inline void verity_fec_finish_io(struct dm_verity_io *io)
+{
+}
+
+static inline void verity_fec_init_io(struct dm_verity_io *io)
+{
+}
+
+static inline bool verity_is_fec_opt_arg(const char *arg_name)
+{
+ return false; /* no argument name is treated as a FEC argument */
+}
+
+static inline int verity_fec_parse_opt_args(struct dm_arg_set *as,
+ struct dm_verity *v,
+ unsigned *argc,
+ const char *arg_name)
+{
+ return -EINVAL; /* FEC arguments cannot be parsed in this configuration */
+}
+
+static inline void verity_fec_dtr(struct dm_verity *v)
+{
+}
+
+static inline int verity_fec_ctr_alloc(struct dm_verity *v)
+{
+ return 0; /* nothing to allocate */
+}
+
+static inline int verity_fec_ctr(struct dm_verity *v)
+{
+ return 0; /* nothing to validate or set up */
+}
+
+#endif /* CONFIG_DM_VERITY_FEC */
+
+#endif /* DM_VERITY_FEC_H */
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity-target.c
index ccf41886ebcf..5c5d30cb6ec5 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity-target.c
@@ -14,12 +14,11 @@
* access behavior.
*/
-#include "dm-bufio.h"
+#include "dm-verity.h"
+#include "dm-verity-fec.h"
#include <linux/module.h>
-#include <linux/device-mapper.h>
#include <linux/reboot.h>
-#include <crypto/hash.h>
#define DM_MSG_PREFIX "verity"
@@ -28,83 +27,18 @@
#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
-#define DM_VERITY_MAX_LEVELS 63
#define DM_VERITY_MAX_CORRUPTED_ERRS 100
#define DM_VERITY_OPT_LOGGING "ignore_corruption"
#define DM_VERITY_OPT_RESTART "restart_on_corruption"
+#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks"
+
+#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC)
static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
-enum verity_mode {
- DM_VERITY_MODE_EIO,
- DM_VERITY_MODE_LOGGING,
- DM_VERITY_MODE_RESTART
-};
-
-enum verity_block_type {
- DM_VERITY_BLOCK_TYPE_DATA,
- DM_VERITY_BLOCK_TYPE_METADATA
-};
-
-struct dm_verity {
- struct dm_dev *data_dev;
- struct dm_dev *hash_dev;
- struct dm_target *ti;
- struct dm_bufio_client *bufio;
- char *alg_name;
- struct crypto_shash *tfm;
- u8 *root_digest; /* digest of the root block */
- u8 *salt; /* salt: its size is salt_size */
- unsigned salt_size;
- sector_t data_start; /* data offset in 512-byte sectors */
- sector_t hash_start; /* hash start in blocks */
- sector_t data_blocks; /* the number of data blocks */
- sector_t hash_blocks; /* the number of hash blocks */
- unsigned char data_dev_block_bits; /* log2(data blocksize) */
- unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
- unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
- unsigned char levels; /* the number of tree levels */
- unsigned char version;
- unsigned digest_size; /* digest size for the current hash algorithm */
- unsigned shash_descsize;/* the size of temporary space for crypto */
- int hash_failed; /* set to 1 if hash of any block failed */
- enum verity_mode mode; /* mode for handling verification errors */
- unsigned corrupted_errs;/* Number of errors for corrupted blocks */
-
- struct workqueue_struct *verify_wq;
-
- /* starting blocks for each tree level. 0 is the lowest level. */
- sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
-};
-
-struct dm_verity_io {
- struct dm_verity *v;
-
- /* original values of bio->bi_end_io and bio->bi_private */
- bio_end_io_t *orig_bi_end_io;
- void *orig_bi_private;
-
- sector_t block;
- unsigned n_blocks;
-
- struct bvec_iter iter;
-
- struct work_struct work;
-
- /*
- * Three variably-size fields follow this struct:
- *
- * u8 hash_desc[v->shash_descsize];
- * u8 real_digest[v->digest_size];
- * u8 want_digest[v->digest_size];
- *
- * To access them use: io_hash_desc(), io_real_digest() and io_want_digest().
- */
-};
-
struct dm_verity_prefetch_work {
struct work_struct work;
struct dm_verity *v;
@@ -112,21 +46,6 @@ struct dm_verity_prefetch_work {
unsigned n_blocks;
};
-static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
-{
- return (struct shash_desc *)(io + 1);
-}
-
-static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io)
-{
- return (u8 *)(io + 1) + v->shash_descsize;
-}
-
-static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io)
-{
- return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
-}
-
/*
* Auxiliary structure appended to each dm-bufio buffer. If the value
* hash_verified is nonzero, hash of the block has been verified.
@@ -173,6 +92,84 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
return block >> (level * v->hash_per_block_bits);
}
+/*
+ * Start a hash computation on @desc using the target's transform. For format
+ * version 1 and later the salt is fed in first; version 0 adds it in
+ * verity_hash_final() instead. Returns 0 or a negative error.
+ */
+static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc)
+{
+	int err;
+
+	desc->tfm = v->tfm;
+	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	err = crypto_shash_init(desc);
+	if (unlikely(err < 0)) {
+		DMERR("crypto_shash_init failed: %d", err);
+		return err;
+	}
+
+	if (unlikely(v->version < 1))
+		return 0;
+
+	err = crypto_shash_update(desc, v->salt, v->salt_size);
+	if (unlikely(err < 0)) {
+		DMERR("crypto_shash_update failed: %d", err);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Feed @len bytes of @data into the running hash, logging any failure.
+ * Returns the crypto_shash_update() result.
+ */
+static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc,
+			      const u8 *data, size_t len)
+{
+	int err;
+
+	err = crypto_shash_update(desc, data, len);
+	if (likely(err >= 0))
+		return err;
+
+	DMERR("crypto_shash_update failed: %d", err);
+
+	return err;
+}
+
+/*
+ * Finish the hash computation and store the result in @digest. Format
+ * version 0 appends the salt before finalizing. Returns a negative error on
+ * failure, otherwise the crypto_shash_final() result.
+ */
+static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc,
+			     u8 *digest)
+{
+	int err;
+
+	if (unlikely(v->version == 0)) {
+		err = crypto_shash_update(desc, v->salt, v->salt_size);
+		if (err < 0) {
+			DMERR("crypto_shash_update failed: %d", err);
+			return err;
+		}
+	}
+
+	err = crypto_shash_final(desc, digest);
+	if (likely(err >= 0))
+		return err;
+
+	DMERR("crypto_shash_final failed: %d", err);
+
+	return err;
+}
+
+/*
+ * Hash @len bytes of @data in one go (init, update, final) and write the
+ * result to @digest. Stops at the first failing step and returns its
+ * negative error.
+ */
+int verity_hash(struct dm_verity *v, struct shash_desc *desc,
+		const u8 *data, size_t len, u8 *digest)
+{
+	int err;
+
+	err = verity_hash_init(v, desc);
+	if (likely(err >= 0))
+		err = verity_hash_update(v, desc, data, len);
+	if (unlikely(err < 0))
+		return err;
+
+	return verity_hash_final(v, desc, digest);
+}
+
static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
sector_t *hash_block, unsigned *offset)
{
@@ -246,17 +243,17 @@ out:
* Verify hash of a metadata block pertaining to the specified data block
* ("block" argument) at a specified level ("level" argument).
*
- * On successful return, io_want_digest(v, io) contains the hash value for
- * a lower tree level or for the data block (if we're at the lowest leve).
+ * On successful return, verity_io_want_digest(v, io) contains the hash value
+ * for a lower tree level or for the data block (if we're at the lowest level).
*
* If "skip_unverified" is true, unverified buffer is skipped and 1 is returned.
* If "skip_unverified" is false, unverified buffer is hashed and verified
- * against current value of io_want_digest(v, io).
+ * against current value of verity_io_want_digest(v, io).
*/
-static int verity_verify_level(struct dm_verity_io *io, sector_t block,
- int level, bool skip_unverified)
+static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
+ sector_t block, int level, bool skip_unverified,
+ u8 *want_digest)
{
- struct dm_verity *v = io->v;
struct dm_buffer *buf;
struct buffer_aux *aux;
u8 *data;
@@ -273,72 +270,128 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block,
aux = dm_bufio_get_aux_data(buf);
if (!aux->hash_verified) {
- struct shash_desc *desc;
- u8 *result;
-
if (skip_unverified) {
r = 1;
goto release_ret_r;
}
- desc = io_hash_desc(v, io);
- desc->tfm = v->tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- r = crypto_shash_init(desc);
- if (r < 0) {
- DMERR("crypto_shash_init failed: %d", r);
+ r = verity_hash(v, verity_io_hash_desc(v, io),
+ data, 1 << v->hash_dev_block_bits,
+ verity_io_real_digest(v, io));
+ if (unlikely(r < 0))
goto release_ret_r;
- }
-
- if (likely(v->version >= 1)) {
- r = crypto_shash_update(desc, v->salt, v->salt_size);
- if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
- goto release_ret_r;
- }
- }
- r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits);
- if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
+ if (likely(memcmp(verity_io_real_digest(v, io), want_digest,
+ v->digest_size) == 0))
+ aux->hash_verified = 1;
+ else if (verity_fec_decode(v, io,
+ DM_VERITY_BLOCK_TYPE_METADATA,
+ hash_block, data, NULL) == 0)
+ aux->hash_verified = 1;
+ else if (verity_handle_err(v,
+ DM_VERITY_BLOCK_TYPE_METADATA,
+ hash_block)) {
+ r = -EIO;
goto release_ret_r;
}
+ }
- if (!v->version) {
- r = crypto_shash_update(desc, v->salt, v->salt_size);
- if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
- goto release_ret_r;
- }
- }
+ data += offset;
+ memcpy(want_digest, data, v->digest_size);
+ r = 0;
- result = io_real_digest(v, io);
- r = crypto_shash_final(desc, result);
- if (r < 0) {
- DMERR("crypto_shash_final failed: %d", r);
- goto release_ret_r;
- }
- if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
- if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA,
- hash_block)) {
- r = -EIO;
- goto release_ret_r;
- }
- } else
- aux->hash_verified = 1;
+release_ret_r:
+ dm_bufio_release(buf);
+ return r;
+}
+
+/*
+ * Find a hash for a given block, write it to digest and verify the integrity
+ * of the hash tree if necessary.
+ */
+int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
+ sector_t block, u8 *digest, bool *is_zero)
+{
+ int r = 0, i;
+
+ if (likely(v->levels)) {
+ /*
+ * First, we try to get the requested hash for
+ * the current block. If the hash block itself is
+ * verified, zero is returned. If it isn't, this
+ * function returns 1 and we fall back to whole
+ * chain verification.
+ */
+ r = verity_verify_level(v, io, block, 0, true, digest);
+ if (likely(r <= 0))
+ goto out;
}
- data += offset;
+ memcpy(digest, v->root_digest, v->digest_size);
- memcpy(io_want_digest(v, io), data, v->digest_size);
+ for (i = v->levels - 1; i >= 0; i--) {
+ r = verity_verify_level(v, io, block, i, false, digest);
+ if (unlikely(r))
+ goto out;
+ }
+out:
+ if (!r && v->zero_digest)
+ *is_zero = !memcmp(v->zero_digest, digest, v->digest_size);
+ else
+ *is_zero = false;
+
+ return r;
+}
+
+/*
+ * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec
+ * starting from iter.
+ */
+int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
+ struct bvec_iter *iter,
+ int (*process)(struct dm_verity *v,
+ struct dm_verity_io *io, u8 *data,
+ size_t len))
+{
+ unsigned todo = 1 << v->data_dev_block_bits;
+ struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
+
+ do {
+ int r;
+ u8 *page;
+ unsigned len;
+ struct bio_vec bv = bio_iter_iovec(bio, *iter);
+
+ page = kmap_atomic(bv.bv_page);
+ len = bv.bv_len;
+
+ if (likely(len >= todo))
+ len = todo;
+
+ r = process(v, io, page + bv.bv_offset, len);
+ kunmap_atomic(page);
+
+ if (r < 0)
+ return r;
+
+ bio_advance_iter(bio, iter, len);
+ todo -= len;
+ } while (todo);
- dm_bufio_release(buf);
return 0;
+}
-release_ret_r:
- dm_bufio_release(buf);
+static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io,
+ u8 *data, size_t len)
+{ /* "process" callback handed to verity_for_bv_block() by verity_verify_io() */
+ return verity_hash_update(v, verity_io_hash_desc(v, io), data, len); /* hash this chunk with the per-io descriptor */
+}
- return r;
+static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io,
+ u8 *data, size_t len)
+{
+ memset(data, 0, len);
+ return 0;
}
/*
@@ -346,99 +399,56 @@ release_ret_r:
*/
static int verity_verify_io(struct dm_verity_io *io)
{
+ bool is_zero;
struct dm_verity *v = io->v;
- struct bio *bio = dm_bio_from_per_bio_data(io,
- v->ti->per_bio_data_size);
+ struct bvec_iter start;
unsigned b;
- int i;
for (b = 0; b < io->n_blocks; b++) {
- struct shash_desc *desc;
- u8 *result;
int r;
- unsigned todo;
+ struct shash_desc *desc = verity_io_hash_desc(v, io);
+
+ r = verity_hash_for_block(v, io, io->block + b,
+ verity_io_want_digest(v, io),
+ &is_zero);
+ if (unlikely(r < 0))
+ return r;
- if (likely(v->levels)) {
+ if (is_zero) {
/*
- * First, we try to get the requested hash for
- * the current block. If the hash block itself is
- * verified, zero is returned. If it isn't, this
- * function returns 0 and we fall back to whole
- * chain verification.
+ * If we expect a zero block, don't validate, just
+ * return zeros.
*/
- int r = verity_verify_level(io, io->block + b, 0, true);
- if (likely(!r))
- goto test_block_hash;
- if (r < 0)
+ r = verity_for_bv_block(v, io, &io->iter,
+ verity_bv_zero);
+ if (unlikely(r < 0))
return r;
- }
- memcpy(io_want_digest(v, io), v->root_digest, v->digest_size);
-
- for (i = v->levels - 1; i >= 0; i--) {
- int r = verity_verify_level(io, io->block + b, i, false);
- if (unlikely(r))
- return r;
+ continue;
}
-test_block_hash:
- desc = io_hash_desc(v, io);
- desc->tfm = v->tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- r = crypto_shash_init(desc);
- if (r < 0) {
- DMERR("crypto_shash_init failed: %d", r);
+ r = verity_hash_init(v, desc);
+ if (unlikely(r < 0))
return r;
- }
-
- if (likely(v->version >= 1)) {
- r = crypto_shash_update(desc, v->salt, v->salt_size);
- if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
- return r;
- }
- }
- todo = 1 << v->data_dev_block_bits;
- do {
- u8 *page;
- unsigned len;
- struct bio_vec bv = bio_iter_iovec(bio, io->iter);
-
- page = kmap_atomic(bv.bv_page);
- len = bv.bv_len;
- if (likely(len >= todo))
- len = todo;
- r = crypto_shash_update(desc, page + bv.bv_offset, len);
- kunmap_atomic(page);
-
- if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
- return r;
- }
-
- bio_advance_iter(bio, &io->iter, len);
- todo -= len;
- } while (todo);
- if (!v->version) {
- r = crypto_shash_update(desc, v->salt, v->salt_size);
- if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
- return r;
- }
- }
+ start = io->iter;
+ r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update);
+ if (unlikely(r < 0))
+ return r;
- result = io_real_digest(v, io);
- r = crypto_shash_final(desc, result);
- if (r < 0) {
- DMERR("crypto_shash_final failed: %d", r);
+ r = verity_hash_final(v, desc, verity_io_real_digest(v, io));
+ if (unlikely(r < 0))
return r;
- }
- if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
- if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
- io->block + b))
- return -EIO;
- }
+
+ if (likely(memcmp(verity_io_real_digest(v, io),
+ verity_io_want_digest(v, io), v->digest_size) == 0))
+ continue;
+ else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
+ io->block + b, NULL, &start) == 0)
+ continue;
+ else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
+ io->block + b))
+ return -EIO;
}
return 0;
@@ -453,9 +463,10 @@ static void verity_finish_io(struct dm_verity_io *io, int error)
struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
bio->bi_end_io = io->orig_bi_end_io;
- bio->bi_private = io->orig_bi_private;
bio->bi_error = error;
+ verity_fec_finish_io(io);
+
bio_endio(bio);
}
@@ -470,7 +481,7 @@ static void verity_end_io(struct bio *bio)
{
struct dm_verity_io *io = bio->bi_private;
- if (bio->bi_error) {
+ if (bio->bi_error && !verity_fec_is_enabled(io->v)) {
verity_finish_io(io, bio->bi_error);
return;
}
@@ -566,7 +577,6 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
io = dm_per_bio_data(bio, ti->per_bio_data_size);
io->v = v;
io->orig_bi_end_io = bio->bi_end_io;
- io->orig_bi_private = bio->bi_private;
io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits;
@@ -574,6 +584,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
bio->bi_private = io;
io->iter = bio->bi_iter;
+ verity_fec_init_io(io);
+
verity_submit_prefetch(v, io);
generic_make_request(bio);
@@ -588,6 +600,7 @@ static void verity_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
struct dm_verity *v = ti->private;
+ unsigned args = 0;
unsigned sz = 0;
unsigned x;
@@ -614,8 +627,17 @@ static void verity_status(struct dm_target *ti, status_type_t type,
else
for (x = 0; x < v->salt_size; x++)
DMEMIT("%02x", v->salt[x]);
+ if (v->mode != DM_VERITY_MODE_EIO)
+ args++;
+ if (verity_fec_is_enabled(v))
+ args += DM_VERITY_OPTS_FEC;
+ if (v->zero_digest)
+ args++;
+ if (!args)
+ return;
+ DMEMIT(" %u", args);
if (v->mode != DM_VERITY_MODE_EIO) {
- DMEMIT(" 1 ");
+ DMEMIT(" ");
switch (v->mode) {
case DM_VERITY_MODE_LOGGING:
DMEMIT(DM_VERITY_OPT_LOGGING);
@@ -627,6 +649,9 @@ static void verity_status(struct dm_target *ti, status_type_t type,
BUG();
}
}
+ if (v->zero_digest)
+ DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES);
+ sz = verity_fec_status_table(v, sz, result, maxlen);
break;
}
}
@@ -677,6 +702,7 @@ static void verity_dtr(struct dm_target *ti)
kfree(v->salt);
kfree(v->root_digest);
+ kfree(v->zero_digest);
if (v->tfm)
crypto_free_shash(v->tfm);
@@ -689,9 +715,94 @@ static void verity_dtr(struct dm_target *ti)
if (v->data_dev)
dm_put_device(ti, v->data_dev);
+ verity_fec_dtr(v);
+
kfree(v);
}
+static int verity_alloc_zero_digest(struct dm_verity *v)
+{ /* Allocate v->zero_digest and fill it with the verity_hash() of one all-zero data block. */
+ int r = -ENOMEM; /* result for every allocation-failure path below */
+ struct shash_desc *desc;
+ u8 *zero_data;
+
+ v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL);
+
+ if (!v->zero_digest)
+ return r;
+
+ desc = kmalloc(v->shash_descsize, GFP_KERNEL); /* temporary hash descriptor, freed at out: */
+
+ if (!desc)
+ return r; /* verity_dtr will free zero_digest */
+
+ zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); /* one zero-filled block of the data block size */
+
+ if (!zero_data)
+ goto out;
+
+ r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits,
+ v->zero_digest);
+
+out:
+ kfree(desc);
+ kfree(zero_data); /* NULL here when the kzalloc above failed */
+
+ return r; /* verity_hash() result, or -ENOMEM if zero_data could not be allocated */
+}
+
+static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
+{ /* Parse the optional feature-argument group from as into v; returns 0 on success, -EINVAL or a helper's error code otherwise. */
+ int r;
+ unsigned argc;
+ struct dm_target *ti = v->ti;
+ const char *arg_name;
+
+ static struct dm_arg _args[] = {
+ {0, DM_VERITY_OPTS_MAX, "Invalid number of feature args"},
+ };
+
+ r = dm_read_arg_group(_args, as, &argc, &ti->error); /* reads the feature-arg count into argc */
+ if (r)
+ return -EINVAL;
+
+ if (!argc)
+ return 0; /* a count of zero means there is nothing further to parse */
+
+ do {
+ arg_name = dm_shift_arg(as);
+ argc--;
+
+ if (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING)) { /* option names are matched case-insensitively */
+ v->mode = DM_VERITY_MODE_LOGGING;
+ continue;
+
+ } else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) {
+ v->mode = DM_VERITY_MODE_RESTART;
+ continue;
+
+ } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) {
+ r = verity_alloc_zero_digest(v); /* a non-NULL v->zero_digest is what enables the zero-block handling */
+ if (r) {
+ ti->error = "Cannot allocate zero digest";
+ return r;
+ }
+ continue;
+
+ } else if (verity_is_fec_opt_arg(arg_name)) {
+ r = verity_fec_parse_opt_args(as, v, &argc, arg_name); /* gets as and &argc, so presumably consumes the option's value(s) - helper not visible here */
+ if (r)
+ return r;
+ continue;
+ }
+
+ ti->error = "Unrecognized verity feature request"; /* reached only when no branch above matched */
+ return -EINVAL;
+ } while (argc && !r);
+
+ return r;
+}
+
/*
* Target parameters:
* <version> The current format is version 1.
@@ -710,18 +821,13 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
struct dm_verity *v;
struct dm_arg_set as;
- const char *opt_string;
- unsigned int num, opt_params;
+ unsigned int num;
unsigned long long num_ll;
int r;
int i;
sector_t hash_position;
char dummy;
- static struct dm_arg _args[] = {
- {0, 1, "Invalid number of feature args"},
- };
-
v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
if (!v) {
ti->error = "Cannot allocate verity structure";
@@ -730,6 +836,10 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->private = v;
v->ti = ti;
+ r = verity_fec_ctr_alloc(v);
+ if (r)
+ goto bad;
+
if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
ti->error = "Device must be readonly";
r = -EINVAL;
@@ -866,29 +976,9 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
as.argc = argc;
as.argv = argv;
- r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
- if (r)
+ r = verity_parse_opt_args(&as, v);
+ if (r < 0)
goto bad;
-
- while (opt_params) {
- opt_params--;
- opt_string = dm_shift_arg(&as);
- if (!opt_string) {
- ti->error = "Not enough feature arguments";
- r = -EINVAL;
- goto bad;
- }
-
- if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING))
- v->mode = DM_VERITY_MODE_LOGGING;
- else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART))
- v->mode = DM_VERITY_MODE_RESTART;
- else {
- ti->error = "Invalid feature arguments";
- r = -EINVAL;
- goto bad;
- }
- }
}
v->hash_per_block_bits =
@@ -938,8 +1028,6 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
- ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io));
-
/* WQ_UNBOUND greatly improves performance when running on ramdisk */
v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
if (!v->verify_wq) {
@@ -948,6 +1036,16 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
+ ti->per_bio_data_size = sizeof(struct dm_verity_io) +
+ v->shash_descsize + v->digest_size * 2;
+
+ r = verity_fec_ctr(v);
+ if (r)
+ goto bad;
+
+ ti->per_bio_data_size = roundup(ti->per_bio_data_size,
+ __alignof__(struct dm_verity_io));
+
return 0;
bad:
@@ -958,7 +1056,7 @@ bad:
static struct target_type verity_target = {
.name = "verity",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
new file mode 100644
index 000000000000..fb419f422d73
--- /dev/null
+++ b/drivers/md/dm-verity.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * Author: Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef DM_VERITY_H
+#define DM_VERITY_H
+
+#include "dm-bufio.h"
+#include <linux/device-mapper.h>
+#include <crypto/hash.h>
+
+#define DM_VERITY_MAX_LEVELS 63
+
+enum verity_mode { /* stored in dm_verity.mode */
+ DM_VERITY_MODE_EIO, /* value 0, so the default for a kzalloc'ed dm_verity; verity_status() emits no mode option for it */
+ DM_VERITY_MODE_LOGGING, /* selected by the DM_VERITY_OPT_LOGGING feature arg */
+ DM_VERITY_MODE_RESTART /* selected by the DM_VERITY_OPT_RESTART feature arg */
+};
+
+enum verity_block_type { /* passed to verity_fec_decode() and verity_handle_err() */
+ DM_VERITY_BLOCK_TYPE_DATA, /* used by verity_verify_io() for data blocks */
+ DM_VERITY_BLOCK_TYPE_METADATA /* used by verity_verify_level() for hash-tree blocks */
+};
+
+struct dm_verity_fec;
+
+struct dm_verity {
+ struct dm_dev *data_dev; /* released with dm_put_device() in verity_dtr() */
+ struct dm_dev *hash_dev;
+ struct dm_target *ti; /* back-pointer to the target, set in verity_ctr() */
+ struct dm_bufio_client *bufio;
+ char *alg_name;
+ struct crypto_shash *tfm; /* freed with crypto_free_shash() in verity_dtr() */
+ u8 *root_digest; /* digest of the root block */
+ u8 *salt; /* salt: its size is salt_size */
+ u8 *zero_digest; /* digest for a zero block; NULL unless DM_VERITY_OPT_IGN_ZEROES was given */
+ unsigned salt_size;
+ sector_t data_start; /* data offset in 512-byte sectors */
+ sector_t hash_start; /* hash start in blocks */
+ sector_t data_blocks; /* the number of data blocks */
+ sector_t hash_blocks; /* the number of hash blocks */
+ unsigned char data_dev_block_bits; /* log2(data blocksize) */
+ unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
+ unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
+ unsigned char levels; /* the number of tree levels */
+ unsigned char version; /* 0: verity_hash_final() hashes the salt just before finalizing */
+ unsigned digest_size; /* digest size for the current hash algorithm */
+ unsigned shash_descsize;/* the size of temporary space for crypto */
+ int hash_failed; /* set to 1 if hash of any block failed */
+ enum verity_mode mode; /* mode for handling verification errors */
+ unsigned corrupted_errs;/* Number of errors for corrupted blocks */
+
+ struct workqueue_struct *verify_wq; /* "kverityd" workqueue allocated in verity_ctr() */
+
+ /* starting blocks for each tree level. 0 is the lowest level. */
+ sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
+
+ struct dm_verity_fec *fec; /* forward error correction */
+};
+
+struct dm_verity_io { /* per-bio state; verity_map() obtains it with dm_per_bio_data() */
+ struct dm_verity *v; /* owning target, set in verity_map() */
+
+ /* original value of bio->bi_end_io */
+ bio_end_io_t *orig_bi_end_io;
+
+ sector_t block; /* first data block of the bio: bi_sector converted to data-block units */
+ unsigned n_blocks; /* bio size in data blocks */
+
+ struct bvec_iter iter; /* copy of bio->bi_iter taken in verity_map(); advanced as verity_verify_io() walks the blocks */
+
+ struct work_struct work;
+
+ /*
+ * Three variably-sized fields follow this struct:
+ *
+ * u8 hash_desc[v->shash_descsize];
+ * u8 real_digest[v->digest_size];
+ * u8 want_digest[v->digest_size];
+ *
+ * To access them use: verity_io_hash_desc(), verity_io_real_digest()
+ * and verity_io_want_digest().
+ */
+};
+
+static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v,
+ struct dm_verity_io *io)
+{
+ return (struct shash_desc *)(io + 1); /* first trailing field: starts immediately after the struct; v is unused here */
+}
+
+static inline u8 *verity_io_real_digest(struct dm_verity *v,
+ struct dm_verity_io *io)
+{
+ return (u8 *)(io + 1) + v->shash_descsize; /* second trailing field: skips the shash_descsize bytes of hash_desc */
+}
+
+static inline u8 *verity_io_want_digest(struct dm_verity *v,
+ struct dm_verity_io *io)
+{
+ return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; /* third trailing field: skips hash_desc and real_digest */
+}
+
+static inline u8 *verity_io_digest_end(struct dm_verity *v,
+ struct dm_verity_io *io)
+{
+ return verity_io_want_digest(v, io) + v->digest_size; /* first byte past want_digest, i.e. past the three trailing fields */
+}
+
+extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
+ struct bvec_iter *iter,
+ int (*process)(struct dm_verity *v,
+ struct dm_verity_io *io,
+ u8 *data, size_t len));
+
+extern int verity_hash(struct dm_verity *v, struct shash_desc *desc,
+ const u8 *data, size_t len, u8 *digest);
+
+extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
+ sector_t block, u8 *digest, bool *is_zero);
+
+#endif /* DM_VERITY_H */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 32440ad5f684..5df40480228b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -591,7 +591,7 @@ retry:
out:
dm_put_live_table(md, *srcu_idx);
- if (r == -ENOTCONN) {
+ if (r == -ENOTCONN && !fatal_signal_pending(current)) {
msleep(10);
goto retry;
}
@@ -603,9 +603,10 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
{
struct mapped_device *md = bdev->bd_disk->private_data;
struct dm_target *tgt;
+ struct block_device *tgt_bdev = NULL;
int srcu_idx, r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ r = dm_get_live_table_for_ioctl(md, &tgt, &tgt_bdev, &mode, &srcu_idx);
if (r < 0)
return r;
@@ -620,7 +621,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
goto out;
}
- r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+ r = __blkdev_driver_ioctl(tgt_bdev, mode, cmd, arg);
out:
dm_put_live_table(md, srcu_idx);
return r;
@@ -1755,7 +1756,7 @@ static void __split_and_process_bio(struct mapped_device *md,
* The request function that just remaps the bio built up by
* dm_merge_bvec.
*/
-static void dm_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
{
int rw = bio_data_dir(bio);
struct mapped_device *md = q->queuedata;
@@ -1774,12 +1775,12 @@ static void dm_make_request(struct request_queue *q, struct bio *bio)
queue_io(md, bio);
else
bio_io_error(bio);
- return;
+ return BLK_QC_T_NONE;
}
__split_and_process_bio(md, map, bio);
dm_put_live_table(md, srcu_idx);
- return;
+ return BLK_QC_T_NONE;
}
int dm_request_based(struct mapped_device *md)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3f9a514b5b9d..31b595479aa5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -34,6 +34,7 @@
#include <linux/kthread.h>
#include <linux/blkdev.h>
+#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
@@ -250,7 +251,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
* call has finished, the bio has been linked into some internal structure
* and so is visible to ->quiesce(), so we don't need the refcount any more.
*/
-static void md_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
const int rw = bio_data_dir(bio);
struct mddev *mddev = q->queuedata;
@@ -262,13 +263,13 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
if (mddev == NULL || mddev->pers == NULL
|| !mddev->ready) {
bio_io_error(bio);
- return;
+ return BLK_QC_T_NONE;
}
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
if (bio_sectors(bio) != 0)
bio->bi_error = -EROFS;
bio_endio(bio);
- return;
+ return BLK_QC_T_NONE;
}
smp_rmb(); /* Ensure implications of 'active' are visible */
rcu_read_lock();
@@ -302,6 +303,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
wake_up(&mddev->sb_wait);
+
+ return BLK_QC_T_NONE;
}
/* mddev_suspend makes sure no new requests are submitted
@@ -312,8 +315,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
*/
void mddev_suspend(struct mddev *mddev)
{
- BUG_ON(mddev->suspended);
- mddev->suspended = 1;
+ if (mddev->suspended++)
+ return;
synchronize_rcu();
wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
mddev->pers->quiesce(mddev, 1);
@@ -324,7 +327,8 @@ EXPORT_SYMBOL_GPL(mddev_suspend);
void mddev_resume(struct mddev *mddev)
{
- mddev->suspended = 0;
+ if (--mddev->suspended)
+ return;
wake_up(&mddev->sb_wait);
mddev->pers->quiesce(mddev, 0);
@@ -707,8 +711,7 @@ void md_rdev_clear(struct md_rdev *rdev)
put_page(rdev->bb_page);
rdev->bb_page = NULL;
}
- kfree(rdev->badblocks.page);
- rdev->badblocks.page = NULL;
+ badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);
@@ -1358,8 +1361,6 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
return cpu_to_le32(csum);
}
-static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged);
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
struct mdp_superblock_1 *sb;
@@ -1484,8 +1485,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
count <<= sb->bblog_shift;
if (bb + 1 == 0)
break;
- if (md_set_badblocks(&rdev->badblocks,
- sector, count, 1) == 0)
+ if (badblocks_set(&rdev->badblocks, sector, count, 1))
return -EINVAL;
}
} else if (sb->bblog_offset != 0)
@@ -1650,7 +1650,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
rdev->journal_tail = le64_to_cpu(sb->journal_tail);
if (mddev->recovery_cp == MaxSector)
set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
- rdev->raid_disk = mddev->raid_disks;
+ rdev->raid_disk = 0;
break;
default:
rdev->saved_raid_disk = role;
@@ -2317,7 +2317,7 @@ repeat:
rdev_for_each(rdev, mddev) {
if (rdev->badblocks.changed) {
rdev->badblocks.changed = 0;
- md_ack_all_badblocks(&rdev->badblocks);
+ ack_all_badblocks(&rdev->badblocks);
md_error(mddev, rdev);
}
clear_bit(Blocked, &rdev->flags);
@@ -2443,7 +2443,7 @@ repeat:
clear_bit(Blocked, &rdev->flags);
if (any_badblocks_changed)
- md_ack_all_badblocks(&rdev->badblocks);
+ ack_all_badblocks(&rdev->badblocks);
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
}
@@ -2771,6 +2771,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
/* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here.
*/
+ int err;
if (rdev->raid_disk != -1)
return -EBUSY;
@@ -2792,9 +2793,15 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
rdev->saved_raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
clear_bit(Bitmap_sync, &rdev->flags);
- remove_and_add_spares(rdev->mddev, rdev);
- if (rdev->raid_disk == -1)
- return -EBUSY;
+ err = rdev->mddev->pers->
+ hot_add_disk(rdev->mddev, rdev);
+ if (err) {
+ rdev->raid_disk = -1;
+ return err;
+ } else
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ if (sysfs_link_rdev(rdev->mddev, rdev))
+ /* failure here is OK */;
/* don't wakeup anyone, leave that to userspace. */
} else {
if (slot >= rdev->mddev->raid_disks &&
@@ -3044,11 +3051,17 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_
static struct rdev_sysfs_entry rdev_recovery_start =
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
-static ssize_t
-badblocks_show(struct badblocks *bb, char *page, int unack);
-static ssize_t
-badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
-
+/* sysfs access to bad-blocks list.
+ * We present two files.
+ * 'bad-blocks' lists sector numbers and lengths of ranges that
+ * are recorded as bad. The list is truncated to fit within
+ * the one-page limit of sysfs.
+ * Writing "sector length" to this file adds that range to the
+ * list as an acknowledged bad block.
+ * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
+ * been acknowledged. Writing to this file adds bad blocks
+ * without acknowledging them. This is largely for testing.
+ */
static ssize_t bb_show(struct md_rdev *rdev, char *page)
{
return badblocks_show(&rdev->badblocks, page, 0);
@@ -3163,14 +3176,7 @@ int md_rdev_init(struct md_rdev *rdev)
* This reserves the space even on arrays where it cannot
* be used - I wonder if that matters
*/
- rdev->badblocks.count = 0;
- rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
- rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
- seqlock_init(&rdev->badblocks.lock);
- if (rdev->badblocks.page == NULL)
- return -ENOMEM;
-
- return 0;
+ return badblocks_init(&rdev->badblocks, 0);
}
EXPORT_SYMBOL_GPL(md_rdev_init);
/*
@@ -4316,8 +4322,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
}
mddev_unlock(mddev);
}
- } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
- test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+ } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
else if (cmd_match(page, "resync"))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -4330,8 +4335,12 @@ action_store(struct mddev *mddev, const char *page, size_t len)
return -EINVAL;
err = mddev_lock(mddev);
if (!err) {
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- err = mddev->pers->start_reshape(mddev);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ err = -EBUSY;
+ else {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ err = mddev->pers->start_reshape(mddev);
+ }
mddev_unlock(mddev);
}
if (err)
@@ -8476,254 +8485,9 @@ void md_finish_reshape(struct mddev *mddev)
}
EXPORT_SYMBOL(md_finish_reshape);
-/* Bad block management.
- * We can record which blocks on each device are 'bad' and so just
- * fail those blocks, or that stripe, rather than the whole device.
- * Entries in the bad-block table are 64bits wide. This comprises:
- * Length of bad-range, in sectors: 0-511 for lengths 1-512
- * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
- * A 'shift' can be set so that larger blocks are tracked and
- * consequently larger devices can be covered.
- * 'Acknowledged' flag - 1 bit. - the most significant bit.
- *
- * Locking of the bad-block table uses a seqlock so md_is_badblock
- * might need to retry if it is very unlucky.
- * We will sometimes want to check for bad blocks in a bi_end_io function,
- * so we use the write_seqlock_irq variant.
- *
- * When looking for a bad block we specify a range and want to
- * know if any block in the range is bad. So we binary-search
- * to the last range that starts at-or-before the given endpoint,
- * (or "before the sector after the target range")
- * then see if it ends after the given start.
- * We return
- * 0 if there are no known bad blocks in the range
- * 1 if there are known bad block which are all acknowledged
- * -1 if there are bad blocks which have not yet been acknowledged in metadata.
- * plus the start/length of the first bad section we overlap.
- */
-int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
-{
- int hi;
- int lo;
- u64 *p = bb->page;
- int rv;
- sector_t target = s + sectors;
- unsigned seq;
-
- if (bb->shift > 0) {
- /* round the start down, and the end up */
- s >>= bb->shift;
- target += (1<<bb->shift) - 1;
- target >>= bb->shift;
- sectors = target - s;
- }
- /* 'target' is now the first block after the bad range */
-
-retry:
- seq = read_seqbegin(&bb->lock);
- lo = 0;
- rv = 0;
- hi = bb->count;
-
- /* Binary search between lo and hi for 'target'
- * i.e. for the last range that starts before 'target'
- */
- /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
- * are known not to be the last range before target.
- * VARIANT: hi-lo is the number of possible
- * ranges, and decreases until it reaches 1
- */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a < target)
- /* This could still be the one, earlier ranges
- * could not. */
- lo = mid;
- else
- /* This and later ranges are definitely out. */
- hi = mid;
- }
- /* 'lo' might be the last that started before target, but 'hi' isn't */
- if (hi > lo) {
- /* need to check all range that end after 's' to see if
- * any are unacknowledged.
- */
- while (lo >= 0 &&
- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
- if (BB_OFFSET(p[lo]) < target) {
- /* starts before the end, and finishes after
- * the start, so they must overlap
- */
- if (rv != -1 && BB_ACK(p[lo]))
- rv = 1;
- else
- rv = -1;
- *first_bad = BB_OFFSET(p[lo]);
- *bad_sectors = BB_LEN(p[lo]);
- }
- lo--;
- }
- }
-
- if (read_seqretry(&bb->lock, seq))
- goto retry;
-
- return rv;
-}
-EXPORT_SYMBOL_GPL(md_is_badblock);
-
-/*
- * Add a range of bad blocks to the table.
- * This might extend the table, or might contract it
- * if two adjacent ranges can be merged.
- * We binary-search to find the 'insertion' point, then
- * decide how best to handle it.
- */
-static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged)
-{
- u64 *p;
- int lo, hi;
- int rv = 1;
- unsigned long flags;
-
- if (bb->shift < 0)
- /* badblocks are disabled */
- return 0;
-
- if (bb->shift) {
- /* round the start down, and the end up */
- sector_t next = s + sectors;
- s >>= bb->shift;
- next += (1<<bb->shift) - 1;
- next >>= bb->shift;
- sectors = next - s;
- }
-
- write_seqlock_irqsave(&bb->lock, flags);
-
- p = bb->page;
- lo = 0;
- hi = bb->count;
- /* Find the last range that starts at-or-before 's' */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a <= s)
- lo = mid;
- else
- hi = mid;
- }
- if (hi > lo && BB_OFFSET(p[lo]) > s)
- hi = lo;
-
- if (hi > lo) {
- /* we found a range that might merge with the start
- * of our new range
- */
- sector_t a = BB_OFFSET(p[lo]);
- sector_t e = a + BB_LEN(p[lo]);
- int ack = BB_ACK(p[lo]);
- if (e >= s) {
- /* Yes, we can merge with a previous range */
- if (s == a && s + sectors >= e)
- /* new range covers old */
- ack = acknowledged;
- else
- ack = ack && acknowledged;
-
- if (e < s + sectors)
- e = s + sectors;
- if (e - a <= BB_MAX_LEN) {
- p[lo] = BB_MAKE(a, e-a, ack);
- s = e;
- } else {
- /* does not all fit in one range,
- * make p[lo] maximal
- */
- if (BB_LEN(p[lo]) != BB_MAX_LEN)
- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
- s = a + BB_MAX_LEN;
- }
- sectors = e - s;
- }
- }
- if (sectors && hi < bb->count) {
- /* 'hi' points to the first range that starts after 's'.
- * Maybe we can merge with the start of that range */
- sector_t a = BB_OFFSET(p[hi]);
- sector_t e = a + BB_LEN(p[hi]);
- int ack = BB_ACK(p[hi]);
- if (a <= s + sectors) {
- /* merging is possible */
- if (e <= s + sectors) {
- /* full overlap */
- e = s + sectors;
- ack = acknowledged;
- } else
- ack = ack && acknowledged;
-
- a = s;
- if (e - a <= BB_MAX_LEN) {
- p[hi] = BB_MAKE(a, e-a, ack);
- s = e;
- } else {
- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
- s = a + BB_MAX_LEN;
- }
- sectors = e - s;
- lo = hi;
- hi++;
- }
- }
- if (sectors == 0 && hi < bb->count) {
- /* we might be able to combine lo and hi */
- /* Note: 's' is at the end of 'lo' */
- sector_t a = BB_OFFSET(p[hi]);
- int lolen = BB_LEN(p[lo]);
- int hilen = BB_LEN(p[hi]);
- int newlen = lolen + hilen - (s - a);
- if (s >= a && newlen < BB_MAX_LEN) {
- /* yes, we can combine them */
- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
- memmove(p + hi, p + hi + 1,
- (bb->count - hi - 1) * 8);
- bb->count--;
- }
- }
- while (sectors) {
- /* didn't merge (it all).
- * Need to add a range just before 'hi' */
- if (bb->count >= MD_MAX_BADBLOCKS) {
- /* No room for more */
- rv = 0;
- break;
- } else {
- int this_sectors = sectors;
- memmove(p + hi + 1, p + hi,
- (bb->count - hi) * 8);
- bb->count++;
-
- if (this_sectors > BB_MAX_LEN)
- this_sectors = BB_MAX_LEN;
- p[hi] = BB_MAKE(s, this_sectors, acknowledged);
- sectors -= this_sectors;
- s += this_sectors;
- }
- }
-
- bb->changed = 1;
- if (!acknowledged)
- bb->unacked_exist = 1;
- write_sequnlock_irqrestore(&bb->lock, flags);
-
- return rv;
-}
+/* Bad block management */
+/* Returns 1 on success, 0 on failure */
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new)
{
@@ -8732,114 +8496,19 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
s += rdev->new_data_offset;
else
s += rdev->data_offset;
- rv = md_set_badblocks(&rdev->badblocks,
- s, sectors, 0);
- if (rv) {
+ rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
+ if (rv == 0) {
/* Make sure they get written out promptly */
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
md_wakeup_thread(rdev->mddev->thread);
- }
- return rv;
+ return 1;
+ } else
+ return 0;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);
-/*
- * Remove a range of bad blocks from the table.
- * This may involve extending the table if we spilt a region,
- * but it must not fail. So if the table becomes full, we just
- * drop the remove request.
- */
-static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
-{
- u64 *p;
- int lo, hi;
- sector_t target = s + sectors;
- int rv = 0;
-
- if (bb->shift > 0) {
- /* When clearing we round the start up and the end down.
- * This should not matter as the shift should align with
- * the block size and no rounding should ever be needed.
- * However it is better the think a block is bad when it
- * isn't than to think a block is not bad when it is.
- */
- s += (1<<bb->shift) - 1;
- s >>= bb->shift;
- target >>= bb->shift;
- sectors = target - s;
- }
-
- write_seqlock_irq(&bb->lock);
-
- p = bb->page;
- lo = 0;
- hi = bb->count;
- /* Find the last range that starts before 'target' */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a < target)
- lo = mid;
- else
- hi = mid;
- }
- if (hi > lo) {
- /* p[lo] is the last range that could overlap the
- * current range. Earlier ranges could also overlap,
- * but only this one can overlap the end of the range.
- */
- if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
- /* Partial overlap, leave the tail of this range */
- int ack = BB_ACK(p[lo]);
- sector_t a = BB_OFFSET(p[lo]);
- sector_t end = a + BB_LEN(p[lo]);
-
- if (a < s) {
- /* we need to split this range */
- if (bb->count >= MD_MAX_BADBLOCKS) {
- rv = -ENOSPC;
- goto out;
- }
- memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
- bb->count++;
- p[lo] = BB_MAKE(a, s-a, ack);
- lo++;
- }
- p[lo] = BB_MAKE(target, end - target, ack);
- /* there is no longer an overlap */
- hi = lo;
- lo--;
- }
- while (lo >= 0 &&
- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
- /* This range does overlap */
- if (BB_OFFSET(p[lo]) < s) {
- /* Keep the early parts of this range. */
- int ack = BB_ACK(p[lo]);
- sector_t start = BB_OFFSET(p[lo]);
- p[lo] = BB_MAKE(start, s - start, ack);
- /* now low doesn't overlap, so.. */
- break;
- }
- lo--;
- }
- /* 'lo' is strictly before, 'hi' is strictly after,
- * anything between needs to be discarded
- */
- if (hi - lo > 1) {
- memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
- bb->count -= (hi - lo - 1);
- }
- }
-
- bb->changed = 1;
-out:
- write_sequnlock_irq(&bb->lock);
- return rv;
-}
-
int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new)
{
@@ -8847,133 +8516,11 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
s += rdev->new_data_offset;
else
s += rdev->data_offset;
- return md_clear_badblocks(&rdev->badblocks,
+ return badblocks_clear(&rdev->badblocks,
s, sectors);
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
-/*
- * Acknowledge all bad blocks in a list.
- * This only succeeds if ->changed is clear. It is used by
- * in-kernel metadata updates
- */
-void md_ack_all_badblocks(struct badblocks *bb)
-{
- if (bb->page == NULL || bb->changed)
- /* no point even trying */
- return;
- write_seqlock_irq(&bb->lock);
-
- if (bb->changed == 0 && bb->unacked_exist) {
- u64 *p = bb->page;
- int i;
- for (i = 0; i < bb->count ; i++) {
- if (!BB_ACK(p[i])) {
- sector_t start = BB_OFFSET(p[i]);
- int len = BB_LEN(p[i]);
- p[i] = BB_MAKE(start, len, 1);
- }
- }
- bb->unacked_exist = 0;
- }
- write_sequnlock_irq(&bb->lock);
-}
-EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
-
-/* sysfs access to bad-blocks list.
- * We present two files.
- * 'bad-blocks' lists sector numbers and lengths of ranges that
- * are recorded as bad. The list is truncated to fit within
- * the one-page limit of sysfs.
- * Writing "sector length" to this file adds an acknowledged
- * bad block list.
- * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
- * been acknowledged. Writing to this file adds bad blocks
- * without acknowledging them. This is largely for testing.
- */
-
-static ssize_t
-badblocks_show(struct badblocks *bb, char *page, int unack)
-{
- size_t len;
- int i;
- u64 *p = bb->page;
- unsigned seq;
-
- if (bb->shift < 0)
- return 0;
-
-retry:
- seq = read_seqbegin(&bb->lock);
-
- len = 0;
- i = 0;
-
- while (len < PAGE_SIZE && i < bb->count) {
- sector_t s = BB_OFFSET(p[i]);
- unsigned int length = BB_LEN(p[i]);
- int ack = BB_ACK(p[i]);
- i++;
-
- if (unack && ack)
- continue;
-
- len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
- (unsigned long long)s << bb->shift,
- length << bb->shift);
- }
- if (unack && len == 0)
- bb->unacked_exist = 0;
-
- if (read_seqretry(&bb->lock, seq))
- goto retry;
-
- return len;
-}
-
-#define DO_DEBUG 1
-
-static ssize_t
-badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
-{
- unsigned long long sector;
- int length;
- char newline;
-#ifdef DO_DEBUG
- /* Allow clearing via sysfs *only* for testing/debugging.
- * Normally only a successful write may clear a badblock
- */
- int clear = 0;
- if (page[0] == '-') {
- clear = 1;
- page++;
- }
-#endif /* DO_DEBUG */
-
- switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
- case 3:
- if (newline != '\n')
- return -EINVAL;
- case 2:
- if (length <= 0)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
-
-#ifdef DO_DEBUG
- if (clear) {
- md_clear_badblocks(bb, sector, length);
- return len;
- }
-#endif /* DO_DEBUG */
- if (md_set_badblocks(bb, sector, length, !unack))
- return len;
- else
- return -ENOSPC;
-}
-
static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2bea51edfab7..75b9aaacb03f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -17,6 +17,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/badblocks.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mm.h>
@@ -28,13 +29,6 @@
#define MaxSector (~(sector_t)0)
-/* Bad block numbers are stored sorted in a single page.
- * 64bits is used for each block or extent.
- * 54 bits are sector number, 9 bits are extent size,
- * 1 bit is an 'acknowledged' flag.
- */
-#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
-
/*
* MD's 'extended' device
*/
@@ -117,22 +111,7 @@ struct md_rdev {
struct kernfs_node *sysfs_state; /* handle for 'state'
* sysfs entry */
- struct badblocks {
- int count; /* count of bad blocks */
- int unacked_exist; /* there probably are unacknowledged
- * bad blocks. This is only cleared
- * when a read discovers none
- */
- int shift; /* shift from sectors to block size
- * a -ve shift means badblocks are
- * disabled.*/
- u64 *page; /* badblock list */
- int changed;
- seqlock_t lock;
-
- sector_t sector;
- sector_t size; /* in sectors */
- } badblocks;
+ struct badblocks badblocks;
};
enum flag_bits {
Faulty, /* device is known to have a fault */
@@ -185,22 +164,11 @@ enum flag_bits {
*/
};
-#define BB_LEN_MASK (0x00000000000001FFULL)
-#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
-#define BB_ACK_MASK (0x8000000000000000ULL)
-#define BB_MAX_LEN 512
-#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
-#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
-#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
-#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
-
-extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors);
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
sector_t *first_bad, int *bad_sectors)
{
if (unlikely(rdev->badblocks.count)) {
- int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
+ int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s,
sectors,
first_bad, bad_sectors);
if (rv)
@@ -213,8 +181,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
-extern void md_ack_all_badblocks(struct badblocks *bb);
-
struct md_cluster_info;
struct mddev {
@@ -566,7 +532,9 @@ static inline char * mdname (struct mddev * mddev)
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
- if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
+ if (!test_bit(Replacement, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags) &&
+ mddev->kobj.sd) {
sprintf(nm, "rd%d", rdev->raid_disk);
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
} else
@@ -576,7 +544,9 @@ static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
- if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
+ if (!test_bit(Replacement, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags) &&
+ mddev->kobj.sd) {
sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm);
}
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 78c74bb71ba4..a53cbc928af1 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -7,12 +7,3 @@ config DM_PERSISTENT_DATA
Library providing immutable on-disk data structure support for
device-mapper targets such as the thin provisioning target.
-config DM_DEBUG_BLOCK_STACK_TRACING
- bool "Keep stack trace of persistent data block lock holders"
- depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
- select STACKTRACE
- ---help---
- Enable this for messages that may help debug problems with the
- block manager locking used by thin provisioning and caching.
-
- If unsure, say N.
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index f2393ba838eb..1e33dd51c21f 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -97,10 +97,6 @@ static void __del_holder(struct block_lock *lock, struct task_struct *task)
static int __check_holder(struct block_lock *lock)
{
unsigned i;
-#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
- static struct stack_trace t;
- static stack_entries entries;
-#endif
for (i = 0; i < MAX_HOLDERS; i++) {
if (lock->holders[i] == current) {
@@ -110,12 +106,7 @@ static int __check_holder(struct block_lock *lock)
print_stack_trace(lock->traces + i, 4);
DMERR("subsequent acquisition attempted here:");
- t.nr_entries = 0;
- t.max_entries = MAX_STACK;
- t.entries = entries;
- t.skip = 3;
- save_stack_trace(&t);
- print_stack_trace(&t, 4);
+ dump_stack();
#endif
return -EINVAL;
}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index c573402033b2..ea3d3b656fd0 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -63,6 +63,11 @@ int lower_bound(struct btree_node *n, uint64_t key)
return bsearch(n, key, 0);
}
+static int upper_bound(struct btree_node *n, uint64_t key)
+{
+ return bsearch(n, key, 1);
+}
+
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
struct dm_btree_value_type *vt)
{
@@ -252,6 +257,16 @@ static void pop_frame(struct del_stack *s)
dm_tm_unlock(s->tm, f->b);
}
+static void unlock_all_frames(struct del_stack *s)
+{
+ struct frame *f;
+
+ while (unprocessed_frames(s)) {
+ f = s->spine + s->top--;
+ dm_tm_unlock(s->tm, f->b);
+ }
+}
+
int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
int r;
@@ -308,9 +323,13 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
pop_frame(s);
}
}
-
out:
+ if (r) {
+ /* cleanup all frames of del_stack */
+ unlock_all_frames(s);
+ }
kfree(s);
+
return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);
@@ -392,6 +411,82 @@ int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
+static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
+ uint64_t key, uint64_t *rkey, void *value_le)
+{
+ int r, i;
+ uint32_t flags, nr_entries;
+ struct dm_block *node;
+ struct btree_node *n;
+
+ r = bn_read_lock(info, root, &node);
+ if (r)
+ return r;
+
+ n = dm_block_data(node);
+ flags = le32_to_cpu(n->header.flags);
+ nr_entries = le32_to_cpu(n->header.nr_entries);
+
+ if (flags & INTERNAL_NODE) {
+ i = lower_bound(n, key);
+ if (i < 0 || i >= nr_entries) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
+ if (r == -ENODATA && i < (nr_entries - 1)) {
+ i++;
+ r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
+ }
+
+ } else {
+ i = upper_bound(n, key);
+ if (i < 0 || i >= nr_entries) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ *rkey = le64_to_cpu(n->keys[i]);
+ memcpy(value_le, value_ptr(n, i), info->value_type.size);
+ }
+out:
+ dm_tm_unlock(info->tm, node);
+ return r;
+}
+
+int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, uint64_t *rkey, void *value_le)
+{
+ unsigned level;
+ int r = -ENODATA;
+ __le64 internal_value_le;
+ struct ro_spine spine;
+
+ init_ro_spine(&spine, info);
+ for (level = 0; level < info->levels - 1u; level++) {
+ r = btree_lookup_raw(&spine, root, keys[level],
+ lower_bound, rkey,
+ &internal_value_le, sizeof(uint64_t));
+ if (r)
+ goto out;
+
+ if (*rkey != keys[level]) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ root = le64_to_cpu(internal_value_le);
+ }
+
+ r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
+out:
+ exit_ro_spine(&spine);
+ return r;
+}
+
+EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
+
/*
* Splits a node by creating a sibling node and shifting half the nodes
* contents across. Assumes there is a parent node, and it has room for
@@ -473,8 +568,10 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
r = insert_at(sizeof(__le64), pn, parent_index + 1,
le64_to_cpu(rn->keys[0]), &location);
- if (r)
+ if (r) {
+ unlock_block(s->info, right);
return r;
+ }
if (key < le64_to_cpu(rn->keys[0])) {
unlock_block(s->info, right);
@@ -657,12 +754,19 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
return 0;
}
+static bool need_insert(struct btree_node *node, uint64_t *keys,
+ unsigned level, unsigned index)
+{
+ return ((index >= le32_to_cpu(node->header.nr_entries)) ||
+ (le64_to_cpu(node->keys[index]) != keys[level]));
+}
+
static int insert(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, void *value, dm_block_t *new_root,
int *inserted)
__dm_written_to_disk(value)
{
- int r, need_insert;
+ int r;
unsigned level, index = -1, last_level = info->levels - 1;
dm_block_t block = root;
struct shadow_spine spine;
@@ -678,10 +782,8 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
goto bad;
n = dm_block_data(shadow_current(&spine));
- need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
- (le64_to_cpu(n->keys[index]) != keys[level]));
- if (need_insert) {
+ if (need_insert(n, keys, level, index)) {
dm_block_t new_tree;
__le64 new_le;
@@ -708,10 +810,8 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
goto bad;
n = dm_block_data(shadow_current(&spine));
- need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
- (le64_to_cpu(n->keys[index]) != keys[level]));
- if (need_insert) {
+ if (need_insert(n, keys, level, index)) {
if (inserted)
*inserted = 1;
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index 11d8cf78621d..c74301fa5a37 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -110,6 +110,13 @@ int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, void *value_le);
/*
+ * Tries to find the first key where the bottom level key is >= to that
+ * given. Useful for skipping empty sections of the btree.
+ */
+int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, uint64_t *rkey, void *value_le);
+
+/*
* Insertion (or overwrite an existing value). O(ln(n))
*/
int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
@@ -135,9 +142,10 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, dm_block_t *new_root);
/*
- * Removes values between 'keys' and keys2, where keys2 is keys with the
- * final key replaced with 'end_key'. 'end_key' is the one-past-the-end
- * value. 'keys' may be altered.
+ * Removes a _contiguous_ run of values starting from 'keys' and not
+ * reaching keys2 (where keys2 is keys with the final key replaced with
+ * 'end_key'). 'end_key' is the one-past-the-end value. 'keys' may be
+ * altered.
*/
int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, uint64_t end_key,
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 53091295fce9..7e44005595c1 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -136,7 +136,7 @@ static int brb_push(struct bop_ring_buffer *brb,
return 0;
}
-static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
+static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result)
{
struct block_op *bop;
@@ -147,6 +147,14 @@ static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
result->type = bop->type;
result->block = bop->block;
+ return 0;
+}
+
+static int brb_pop(struct bop_ring_buffer *brb)
+{
+ if (brb_empty(brb))
+ return -ENODATA;
+
brb->begin = brb_next(brb, brb->begin);
return 0;
@@ -211,7 +219,7 @@ static int apply_bops(struct sm_metadata *smm)
while (!brb_empty(&smm->uncommitted)) {
struct block_op bop;
- r = brb_pop(&smm->uncommitted, &bop);
+ r = brb_peek(&smm->uncommitted, &bop);
if (r) {
DMERR("bug in bop ring buffer");
break;
@@ -220,6 +228,8 @@ static int apply_bops(struct sm_metadata *smm)
r = commit_bop(smm, &bop);
if (r)
break;
+
+ brb_pop(&smm->uncommitted);
}
return r;
@@ -683,7 +693,6 @@ static struct dm_space_map bootstrap_ops = {
static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
{
int r, i;
- enum allocation_event ev;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
dm_block_t old_len = smm->ll.nr_blocks;
@@ -705,11 +714,12 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
* allocate any new blocks.
*/
do {
- for (i = old_len; !r && i < smm->begin; i++) {
- r = sm_ll_inc(&smm->ll, i, &ev);
- if (r)
- goto out;
- }
+ for (i = old_len; !r && i < smm->begin; i++)
+ r = add_bop(smm, BOP_INC, i);
+
+ if (r)
+ goto out;
+
old_len = smm->begin;
r = apply_bops(smm);
@@ -754,7 +764,6 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
{
int r;
dm_block_t i;
- enum allocation_event ev;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
smm->begin = superblock + 1;
@@ -782,7 +791,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
* allocated blocks that they were built from.
*/
for (i = superblock; !r && i < smm->begin; i++)
- r = sm_ll_inc(&smm->ll, i, &ev);
+ r = add_bop(smm, BOP_INC, i);
if (r)
return r;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 41d70bc9ba2f..84e597e1c489 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1946,6 +1946,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
first = i;
fbio = r10_bio->devs[i].bio;
+ fbio->bi_iter.bi_size = r10_bio->sectors << 9;
+ fbio->bi_iter.bi_idx = 0;
vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
/* now find blocks with errors */
@@ -1989,7 +1991,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
bio_reset(tbio);
tbio->bi_vcnt = vcnt;
- tbio->bi_iter.bi_size = r10_bio->sectors << 9;
+ tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
tbio->bi_rw = WRITE;
tbio->bi_private = r10_bio;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
OpenPOWER on IntegriCloud