diff options
Diffstat (limited to 'drivers/md')
51 files changed, 2102 insertions, 1452 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 17bf109c58e9..f6e0a8b3a61e 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -1,7 +1,8 @@ config BCACHE tristate "Block device as cache" - ---help--- + select CRC64 + help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. @@ -10,7 +11,7 @@ config BCACHE config BCACHE_DEBUG bool "Bcache debugging" depends on BCACHE - ---help--- + help Don't select this option unless you're a developer Enables extra debugging tools, allows expensive runtime checks to be @@ -20,7 +21,7 @@ config BCACHE_CLOSURES_DEBUG bool "Debug closures" depends on BCACHE select DEBUG_FS - ---help--- + help Keeps all active closures in a linked list and provides a debugfs interface to list them, which makes it possible to see asynchronous operations that get stuck. diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 7fa2631b422c..7a28232d868b 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -87,8 +87,8 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) { struct cache *ca; struct bucket *b; - unsigned next = c->nbuckets * c->sb.bucket_size / 1024; - unsigned i; + unsigned int next = c->nbuckets * c->sb.bucket_size / 1024; + unsigned int i; int r; atomic_sub(sectors, &c->rescale); @@ -169,7 +169,7 @@ static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) #define bucket_prio(b) \ ({ \ - unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ + unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ \ (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ }) @@ -244,6 +244,7 @@ static void invalidate_buckets_random(struct cache *ca) while (!fifo_full(&ca->free_inc)) { size_t n; + get_random_bytes(&n, sizeof(n)); n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket); @@ -301,7 +302,7 @@ do { \ static int bch_allocator_push(struct cache *ca, long bucket) { - unsigned i; + unsigned int i; /* Prios/gens are actually the most important reserve */ if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) @@ -385,7 +386,7 @@ out: /* Allocation */ -long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) +long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) { DEFINE_WAIT(w); struct bucket *b; @@ -421,7 +422,7 @@ out: if (expensive_debug_checks(ca->set)) { size_t iter; long i; - unsigned j; + unsigned int j; for (iter = 0; iter < prio_buckets(ca) * 2; iter++) BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); @@ -470,14 +471,14 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) void bch_bucket_free(struct cache_set *c, struct bkey *k) { - unsigned i; + unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) __bch_bucket_free(PTR_CACHE(c, k, i), PTR_BUCKET(c, k, i)); } -int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, +int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, struct bkey *k, int n, bool wait) { int i; @@ -510,10 +511,11 @@ err: return -1; } -int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, +int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, struct bkey *k, int n, bool wait) { int ret; + mutex_lock(&c->bucket_lock); ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); mutex_unlock(&c->bucket_lock); @@ -524,8 +526,8 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, struct open_bucket { struct list_head list; - unsigned last_write_point; - unsigned sectors_free; + unsigned int last_write_point; + unsigned int sectors_free; BKEY_PADDED(key); }; @@ -556,7 +558,7 @@ struct open_bucket { */ static struct open_bucket *pick_data_bucket(struct cache_set *c, const struct bkey *search, - unsigned write_point, + unsigned int write_point, struct bkey *alloc) { struct open_bucket *ret, *ret_task = NULL; @@ -595,12 +597,16 @@ found: * * If s->writeback is true, will not fail. */ -bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, - unsigned write_point, unsigned write_prio, bool wait) +bool bch_alloc_sectors(struct cache_set *c, + struct bkey *k, + unsigned int sectors, + unsigned int write_point, + unsigned int write_prio, + bool wait) { struct open_bucket *b; BKEY_PADDED(key) alloc; - unsigned i; + unsigned int i; /* * We might have to allocate a new bucket, which we can't do with a @@ -613,7 +619,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, spin_lock(&c->data_bucket_lock); while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { - unsigned watermark = write_prio + unsigned int watermark = write_prio ? RESERVE_MOVINGGC : RESERVE_NONE; @@ -702,6 +708,7 @@ int bch_open_buckets_alloc(struct cache_set *c) for (i = 0; i < MAX_OPEN_BUCKETS; i++) { struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) return -ENOMEM; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d6bf294f3907..83504dd8100a 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -252,7 +252,7 @@ struct bcache_device { struct kobject kobj; struct cache_set *c; - unsigned id; + unsigned int id; #define BCACHEDEVNAME_SIZE 12 char name[BCACHEDEVNAME_SIZE]; @@ -264,18 +264,19 @@ struct bcache_device { #define BCACHE_DEV_UNLINK_DONE 2 #define BCACHE_DEV_WB_RUNNING 3 #define BCACHE_DEV_RATE_DW_RUNNING 4 - unsigned nr_stripes; - unsigned stripe_size; + unsigned int nr_stripes; + unsigned int stripe_size; atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; struct bio_set bio_split; - unsigned data_csum:1; + unsigned int data_csum:1; - int (*cache_miss)(struct btree *, struct search *, - struct bio *, unsigned); - int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long); + int (*cache_miss)(struct btree *b, struct search *s, + struct bio *bio, unsigned int sectors); + int (*ioctl)(struct bcache_device *d, fmode_t mode, + unsigned int cmd, unsigned long arg); }; struct io { @@ -284,7 +285,7 @@ struct io { struct list_head lru; unsigned long jiffies; - unsigned sequential; + unsigned int sequential; sector_t last; }; @@ -328,13 +329,6 @@ struct cached_dev { */ atomic_t has_dirty; - /* - * Set to zero by things that touch the backing volume-- except - * writeback. Incremented by writeback. Used to determine when to - * accelerate idle writeback. - */ - atomic_t backing_idle; - struct bch_ratelimit writeback_rate; struct delayed_work writeback_rate_update; @@ -365,18 +359,18 @@ struct cached_dev { struct cache_accounting accounting; /* The rest of this all shows up in sysfs */ - unsigned sequential_cutoff; - unsigned readahead; + unsigned int sequential_cutoff; + unsigned int readahead; - unsigned io_disable:1; - unsigned verify:1; - unsigned bypass_torture_test:1; + unsigned int io_disable:1; + unsigned int verify:1; + unsigned int bypass_torture_test:1; - unsigned partial_stripes_expensive:1; - unsigned writeback_metadata:1; - unsigned writeback_running:1; + unsigned int partial_stripes_expensive:1; + unsigned int writeback_metadata:1; + unsigned int writeback_running:1; unsigned char writeback_percent; - unsigned writeback_delay; + unsigned int writeback_delay; uint64_t writeback_rate_target; int64_t writeback_rate_proportional; @@ -384,16 +378,16 @@ struct cached_dev { int64_t writeback_rate_integral_scaled; int32_t writeback_rate_change; - unsigned writeback_rate_update_seconds; - unsigned writeback_rate_i_term_inverse; - unsigned writeback_rate_p_term_inverse; - unsigned writeback_rate_minimum; + unsigned int writeback_rate_update_seconds; + unsigned int writeback_rate_i_term_inverse; + unsigned int writeback_rate_p_term_inverse; + unsigned int writeback_rate_minimum; enum stop_on_failure stop_when_cache_set_failed; #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 atomic_t io_errors; - unsigned error_limit; - unsigned offline_seconds; + unsigned int error_limit; + unsigned int offline_seconds; char backing_dev_name[BDEVNAME_SIZE]; }; @@ -423,9 +417,9 @@ struct cache { /* * When allocating new buckets, prio_write() gets first dibs - since we * may not be allocate at all without writing priorities and gens. - * prio_buckets[] contains the last buckets we wrote priorities to (so - * gc can mark them as metadata), prio_next[] contains the buckets - * allocated for the next prio write. + * prio_last_buckets[] contains the last buckets we wrote priorities to + * (so gc can mark them as metadata), prio_buckets[] contains the + * buckets allocated for the next prio write. */ uint64_t *prio_buckets; uint64_t *prio_last_buckets; @@ -454,7 +448,7 @@ struct cache { * until a gc finishes - otherwise we could pointlessly burn a ton of * cpu */ - unsigned invalidate_needs_gc; + unsigned int invalidate_needs_gc; bool discard; /* Get rid of? */ @@ -474,11 +468,12 @@ struct cache { struct gc_stat { size_t nodes; + size_t nodes_pre; size_t key_bytes; size_t nkeys; uint64_t data; /* sectors */ - unsigned in_use; /* percent */ + unsigned int in_use; /* percent */ }; /* @@ -514,6 +509,8 @@ struct cache_set { struct cache_accounting accounting; unsigned long flags; + atomic_t idle_counter; + atomic_t at_max_writeback_rate; struct cache_sb sb; @@ -522,9 +519,11 @@ struct cache_set { int caches_loaded; struct bcache_device **devices; - unsigned devices_max_used; + unsigned int devices_max_used; + atomic_t attached_dev_nr; struct list_head cached_devs; uint64_t cached_dev_sectors; + atomic_long_t flash_dev_dirty_sectors; struct closure caching; struct closure sb_write; @@ -550,7 +549,7 @@ struct cache_set { * Default number of pages for a new btree node - may be less than a * full bucket */ - unsigned btree_pages; + unsigned int btree_pages; /* * Lists of struct btrees; lru is the list for structs that have memory @@ -573,7 +572,7 @@ struct cache_set { struct list_head btree_cache_freed; /* Number of elements in btree_cache + btree_cache_freeable lists */ - unsigned btree_cache_used; + unsigned int btree_cache_used; /* * If we need to allocate memory for a new btree node and that @@ -603,6 +602,10 @@ struct cache_set { */ atomic_t rescale; /* + * used for GC, identify if any front side I/Os is inflight + */ + atomic_t search_inflight; + /* * When we invalidate buckets, we use both the priority and the amount * of good data to determine which buckets to reuse first - to weight * those together consistently we keep track of the smallest nonzero @@ -611,8 +614,8 @@ struct cache_set { uint16_t min_prio; /* - * max(gen - last_gc) for all buckets. When it gets too big we have to gc - * to keep gens from wrapping around. + * max(gen - last_gc) for all buckets. When it gets too big we have to + * gc to keep gens from wrapping around. */ uint8_t need_gc; struct gc_stat gc_stats; @@ -647,7 +650,7 @@ struct cache_set { struct mutex verify_lock; #endif - unsigned nr_uuids; + unsigned int nr_uuids; struct uuid_entry *uuids; BKEY_PADDED(uuid_bucket); struct closure uuid_write; @@ -668,12 +671,12 @@ struct cache_set { struct journal journal; #define CONGESTED_MAX 1024 - unsigned congested_last_us; + unsigned int congested_last_us; atomic_t congested; /* The rest of this all shows up in sysfs */ - unsigned congested_read_threshold_us; - unsigned congested_write_threshold_us; + unsigned int congested_read_threshold_us; + unsigned int congested_write_threshold_us; struct time_stats btree_gc_time; struct time_stats btree_split_time; @@ -692,16 +695,16 @@ struct cache_set { ON_ERROR_PANIC, } on_error; #define DEFAULT_IO_ERROR_LIMIT 8 - unsigned error_limit; - unsigned error_decay; + unsigned int error_limit; + unsigned int error_decay; unsigned short journal_delay_ms; bool expensive_debug_checks; - unsigned verify:1; - unsigned key_merging_disabled:1; - unsigned gc_always_rewrite:1; - unsigned shrinker_disabled:1; - unsigned copy_gc_enabled:1; + unsigned int verify:1; + unsigned int key_merging_disabled:1; + unsigned int gc_always_rewrite:1; + unsigned int shrinker_disabled:1; + unsigned int copy_gc_enabled:1; #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; @@ -710,7 +713,7 @@ struct cache_set { }; struct bbio { - unsigned submit_time_us; + unsigned int submit_time_us; union { struct bkey key; uint64_t _pad[3]; @@ -727,10 +730,10 @@ struct bbio { #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) #define btree_blocks(b) \ - ((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits)) + ((unsigned int) (KEY_SIZE(&b->key) >> (b)->c->block_bits)) #define btree_default_blocks(c) \ - ((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) + ((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) #define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS) #define bucket_bytes(c) ((c)->sb.bucket_size << 9) @@ -759,21 +762,21 @@ static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) static inline struct cache *PTR_CACHE(struct cache_set *c, const struct bkey *k, - unsigned ptr) + unsigned int ptr) { return c->cache[PTR_DEV(k, ptr)]; } static inline size_t PTR_BUCKET_NR(struct cache_set *c, const struct bkey *k, - unsigned ptr) + unsigned int ptr) { return sector_to_bucket(c, PTR_OFFSET(k, ptr)); } static inline struct bucket *PTR_BUCKET(struct cache_set *c, const struct bkey *k, - unsigned ptr) + unsigned int ptr) { return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); } @@ -781,17 +784,18 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, static inline uint8_t gen_after(uint8_t a, uint8_t b) { uint8_t r = a - b; + return r > 128U ? 0 : r; } static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, - unsigned i) + unsigned int i) { return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); } static inline bool ptr_available(struct cache_set *c, const struct bkey *k, - unsigned i) + unsigned int i) { return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); } @@ -877,16 +881,16 @@ static inline uint8_t bucket_gc_gen(struct bucket *b) #define BUCKET_GC_GEN_MAX 96U #define kobj_attribute_write(n, fn) \ - static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) + static struct kobj_attribute ksysfs_##n = __ATTR(n, 0200, NULL, fn) #define kobj_attribute_rw(n, show, store) \ static struct kobj_attribute ksysfs_##n = \ - __ATTR(n, S_IWUSR|S_IRUSR, show, store) + __ATTR(n, 0600, show, store) static inline void wake_up_allocators(struct cache_set *c) { struct cache *ca; - unsigned i; + unsigned int i; for_each_cache(ca, c, i) wake_up_process(ca->alloc_thread); @@ -922,40 +926,43 @@ static inline void wait_for_kthread_stop(void) /* Forward declarations */ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); -void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); -void bch_bbio_count_io_errors(struct cache_set *, struct bio *, - blk_status_t, const char *); -void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, - const char *); -void bch_bbio_free(struct bio *, struct cache_set *); -struct bio *bch_bbio_alloc(struct cache_set *); - -void __bch_submit_bbio(struct bio *, struct cache_set *); -void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); - -uint8_t bch_inc_gen(struct cache *, struct bucket *); -void bch_rescale_priorities(struct cache_set *, int); - -bool bch_can_invalidate_bucket(struct cache *, struct bucket *); -void __bch_invalidate_one_bucket(struct cache *, struct bucket *); - -void __bch_bucket_free(struct cache *, struct bucket *); -void bch_bucket_free(struct cache_set *, struct bkey *); - -long bch_bucket_alloc(struct cache *, unsigned, bool); -int __bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, bool); -int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, bool); -bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, - unsigned, unsigned, bool); +void bch_count_io_errors(struct cache *ca, blk_status_t error, + int is_read, const char *m); +void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, + blk_status_t error, const char *m); +void bch_bbio_endio(struct cache_set *c, struct bio *bio, + blk_status_t error, const char *m); +void bch_bbio_free(struct bio *bio, struct cache_set *c); +struct bio *bch_bbio_alloc(struct cache_set *c); + +void __bch_submit_bbio(struct bio *bio, struct cache_set *c); +void bch_submit_bbio(struct bio *bio, struct cache_set *c, + struct bkey *k, unsigned int ptr); + +uint8_t bch_inc_gen(struct cache *ca, struct bucket *b); +void bch_rescale_priorities(struct cache_set *c, int sectors); + +bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b); +void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b); + +void __bch_bucket_free(struct cache *ca, struct bucket *b); +void bch_bucket_free(struct cache_set *c, struct bkey *k); + +long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait); +int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + struct bkey *k, int n, bool wait); +int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + struct bkey *k, int n, bool wait); +bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, + unsigned int sectors, unsigned int write_point, + unsigned int write_prio, bool wait); bool bch_cached_dev_error(struct cached_dev *dc); __printf(2, 3) -bool bch_cache_set_error(struct cache_set *, const char *, ...); +bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); -void bch_prio_write(struct cache *); -void bch_write_bdev_super(struct cached_dev *, struct closure *); +void bch_prio_write(struct cache *ca); +void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); extern struct workqueue_struct *bcache_wq; extern struct mutex bch_register_lock; @@ -967,35 +974,36 @@ extern struct kobj_type bch_cache_set_ktype; extern struct kobj_type bch_cache_set_internal_ktype; extern struct kobj_type bch_cache_ktype; -void bch_cached_dev_release(struct kobject *); -void bch_flash_dev_release(struct kobject *); -void bch_cache_set_release(struct kobject *); -void bch_cache_release(struct kobject *); +void bch_cached_dev_release(struct kobject *kobj); +void bch_flash_dev_release(struct kobject *kobj); +void bch_cache_set_release(struct kobject *kobj); +void bch_cache_release(struct kobject *kobj); -int bch_uuid_write(struct cache_set *); -void bcache_write_super(struct cache_set *); +int bch_uuid_write(struct cache_set *c); +void bcache_write_super(struct cache_set *c); int bch_flash_dev_create(struct cache_set *c, uint64_t size); -int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *); -void bch_cached_dev_detach(struct cached_dev *); -void bch_cached_dev_run(struct cached_dev *); -void bcache_device_stop(struct bcache_device *); +int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint8_t *set_uuid); +void bch_cached_dev_detach(struct cached_dev *dc); +void bch_cached_dev_run(struct cached_dev *dc); +void bcache_device_stop(struct bcache_device *d); -void bch_cache_set_unregister(struct cache_set *); -void bch_cache_set_stop(struct cache_set *); +void bch_cache_set_unregister(struct cache_set *c); +void bch_cache_set_stop(struct cache_set *c); -struct cache_set *bch_cache_set_alloc(struct cache_sb *); -void bch_btree_cache_free(struct cache_set *); -int bch_btree_cache_alloc(struct cache_set *); -void bch_moving_init_cache_set(struct cache_set *); -int bch_open_buckets_alloc(struct cache_set *); -void bch_open_buckets_free(struct cache_set *); +struct cache_set *bch_cache_set_alloc(struct cache_sb *sb); +void bch_btree_cache_free(struct cache_set *c); +int bch_btree_cache_alloc(struct cache_set *c); +void bch_moving_init_cache_set(struct cache_set *c); +int bch_open_buckets_alloc(struct cache_set *c); +void bch_open_buckets_free(struct cache_set *c); int bch_cache_allocator_start(struct cache *ca); void bch_debug_exit(void); -int bch_debug_init(struct kobject *); +void bch_debug_init(struct kobject *kobj); void bch_request_exit(void); int bch_request_init(void); diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f3403b45bc28..8f07fa6e1739 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -18,31 +18,31 @@ #ifdef CONFIG_BCACHE_DEBUG -void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set) { struct bkey *k, *next; for (k = i->start; k < bset_bkey_last(i); k = next) { next = bkey_next(k); - printk(KERN_ERR "block %u key %u/%u: ", set, - (unsigned) ((u64 *) k - i->d), i->keys); + pr_err("block %u key %u/%u: ", set, + (unsigned int) ((u64 *) k - i->d), i->keys); if (b->ops->key_dump) b->ops->key_dump(b, k); else - printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); + pr_err("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); if (next < bset_bkey_last(i) && bkey_cmp(k, b->ops->is_extents ? &START_KEY(next) : next) > 0) - printk(KERN_ERR "Key skipped backwards\n"); + pr_err("Key skipped backwards\n"); } } void bch_dump_bucket(struct btree_keys *b) { - unsigned i; + unsigned int i; console_lock(); for (i = 0; i <= b->nsets; i++) @@ -53,7 +53,7 @@ void bch_dump_bucket(struct btree_keys *b) int __bch_count_data(struct btree_keys *b) { - unsigned ret = 0; + unsigned int ret = 0; struct btree_iter iter; struct bkey *k; @@ -128,7 +128,7 @@ static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} /* Keylists */ -int __bch_keylist_realloc(struct keylist *l, unsigned u64s) +int __bch_keylist_realloc(struct keylist *l, unsigned int u64s) { size_t oldsize = bch_keylist_nkeys(l); size_t newsize = oldsize + u64s; @@ -180,7 +180,7 @@ void bch_keylist_pop_front(struct keylist *l) /* Key/pointer manipulation */ void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, - unsigned i) + unsigned int i) { BUG_ON(i > KEY_PTRS(src)); @@ -194,7 +194,7 @@ void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, bool __bch_cut_front(const struct bkey *where, struct bkey *k) { - unsigned i, len = 0; + unsigned int i, len = 0; if (bkey_cmp(where, &START_KEY(k)) <= 0) return false; @@ -214,7 +214,7 @@ bool __bch_cut_front(const struct bkey *where, struct bkey *k) bool __bch_cut_back(const struct bkey *where, struct bkey *k) { - unsigned len = 0; + unsigned int len = 0; if (bkey_cmp(where, k) >= 0) return false; @@ -240,9 +240,9 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k) #define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) struct bkey_float { - unsigned exponent:BKEY_EXPONENT_BITS; - unsigned m:BKEY_MID_BITS; - unsigned mantissa:BKEY_MANTISSA_BITS; + unsigned int exponent:BKEY_EXPONENT_BITS; + unsigned int m:BKEY_MID_BITS; + unsigned int mantissa:BKEY_MANTISSA_BITS; } __packed; /* @@ -311,7 +311,9 @@ void bch_btree_keys_free(struct btree_keys *b) } EXPORT_SYMBOL(bch_btree_keys_free); -int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp) +int bch_btree_keys_alloc(struct btree_keys *b, + unsigned int page_order, + gfp_t gfp) { struct bset_tree *t = b->set; @@ -345,7 +347,7 @@ EXPORT_SYMBOL(bch_btree_keys_alloc); void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, bool *expensive_debug_checks) { - unsigned i; + unsigned int i; b->ops = ops; b->expensive_debug_checks = expensive_debug_checks; @@ -366,7 +368,11 @@ EXPORT_SYMBOL(bch_btree_keys_init); /* Binary tree stuff for auxiliary search trees */ -static unsigned inorder_next(unsigned j, unsigned size) +/* + * return array index next to j when does in-order traverse + * of a binary tree which is stored in a linear array + */ +static unsigned int inorder_next(unsigned int j, unsigned int size) { if (j * 2 + 1 < size) { j = j * 2 + 1; @@ -379,7 +385,11 @@ static unsigned inorder_next(unsigned j, unsigned size) return j; } -static unsigned inorder_prev(unsigned j, unsigned size) +/* + * return array index previous to j when does in-order traverse + * of a binary tree which is stored in a linear array + */ +static unsigned int inorder_prev(unsigned int j, unsigned int size) { if (j * 2 < size) { j = j * 2; @@ -392,7 +402,8 @@ static unsigned inorder_prev(unsigned j, unsigned size) return j; } -/* I have no idea why this code works... and I'm the one who wrote it +/* + * I have no idea why this code works... and I'm the one who wrote it * * However, I do know what it does: * Given a binary tree constructed in an array (i.e. how you normally implement @@ -405,10 +416,12 @@ static unsigned inorder_prev(unsigned j, unsigned size) * extra is a function of size: * extra = (size - rounddown_pow_of_two(size - 1)) << 1; */ -static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra) +static unsigned int __to_inorder(unsigned int j, + unsigned int size, + unsigned int extra) { - unsigned b = fls(j); - unsigned shift = fls(size - 1) - b; + unsigned int b = fls(j); + unsigned int shift = fls(size - 1) - b; j ^= 1U << (b - 1); j <<= 1; @@ -421,14 +434,20 @@ static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra) return j; } -static unsigned to_inorder(unsigned j, struct bset_tree *t) +/* + * Return the cacheline index in bset_tree->data, where j is index + * from a linear array which stores the auxiliar binary tree + */ +static unsigned int to_inorder(unsigned int j, struct bset_tree *t) { return __to_inorder(j, t->size, t->extra); } -static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra) +static unsigned int __inorder_to_tree(unsigned int j, + unsigned int size, + unsigned int extra) { - unsigned shift; + unsigned int shift; if (j > extra) j += j - extra; @@ -441,7 +460,11 @@ static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra) return j; } -static unsigned inorder_to_tree(unsigned j, struct bset_tree *t) +/* + * Return an index from a linear array which stores the auxiliar binary + * tree, j is the cacheline index of t->data. + */ +static unsigned int inorder_to_tree(unsigned int j, struct bset_tree *t) { return __inorder_to_tree(j, t->size, t->extra); } @@ -452,14 +475,15 @@ void inorder_test(void) unsigned long done = 0; ktime_t start = ktime_get(); - for (unsigned size = 2; + for (unsigned int size = 2; size < 65536000; size++) { - unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1; - unsigned i = 1, j = rounddown_pow_of_two(size - 1); + unsigned int extra = + (size - rounddown_pow_of_two(size - 1)) << 1; + unsigned int i = 1, j = rounddown_pow_of_two(size - 1); if (!(size % 4096)) - printk(KERN_NOTICE "loop %u, %llu per us\n", size, + pr_notice("loop %u, %llu per us\n", size, done / ktime_us_delta(ktime_get(), start)); while (1) { @@ -502,30 +526,31 @@ void inorder_test(void) * of the previous key so we can walk backwards to it from t->tree[j]'s key. */ -static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline, - unsigned offset) +static struct bkey *cacheline_to_bkey(struct bset_tree *t, + unsigned int cacheline, + unsigned int offset) { return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8; } -static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k) +static unsigned int bkey_to_cacheline(struct bset_tree *t, struct bkey *k) { return ((void *) k - (void *) t->data) / BSET_CACHELINE; } -static unsigned bkey_to_cacheline_offset(struct bset_tree *t, - unsigned cacheline, +static unsigned int bkey_to_cacheline_offset(struct bset_tree *t, + unsigned int cacheline, struct bkey *k) { return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0); } -static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) +static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned int j) { return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m); } -static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j) +static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned int j) { return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]); } @@ -534,7 +559,7 @@ static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j) * For the write set - the one we're currently inserting keys into - we don't * maintain a full search tree, we just keep a simple lookup table in t->prev. */ -static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline) +static struct bkey *table_to_bkey(struct bset_tree *t, unsigned int cacheline) { return cacheline_to_bkey(t, cacheline, t->prev[cacheline]); } @@ -546,14 +571,29 @@ static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) return low; } -static inline unsigned bfloat_mantissa(const struct bkey *k, +/* + * Calculate mantissa value for struct bkey_float. + * If most significant bit of f->exponent is not set, then + * - f->exponent >> 6 is 0 + * - p[0] points to bkey->low + * - p[-1] borrows bits from KEY_INODE() of bkey->high + * if most isgnificant bits of f->exponent is set, then + * - f->exponent >> 6 is 1 + * - p[0] points to bits from KEY_INODE() of bkey->high + * - p[-1] points to other bits from KEY_INODE() of + * bkey->high too. + * See make_bfloat() to check when most significant bit of f->exponent + * is set or not. + */ +static inline unsigned int bfloat_mantissa(const struct bkey *k, struct bkey_float *f) { const uint64_t *p = &k->low - (f->exponent >> 6); + return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK; } -static void make_bfloat(struct bset_tree *t, unsigned j) +static void make_bfloat(struct bset_tree *t, unsigned int j) { struct bkey_float *f = &t->tree[j]; struct bkey *m = tree_to_bkey(t, j); @@ -570,6 +610,16 @@ static void make_bfloat(struct bset_tree *t, unsigned j) BUG_ON(m < l || m > r); BUG_ON(bkey_next(p) != m); + /* + * If l and r have different KEY_INODE values (different backing + * device), f->exponent records how many least significant bits + * are different in KEY_INODE values and sets most significant + * bits to 1 (by +64). + * If l and r have same KEY_INODE value, f->exponent records + * how many different bits in least significant bits of bkey->low. + * See bfloat_mantiss() how the most significant bit of + * f->exponent is used to calculate bfloat mantissa value. + */ if (KEY_INODE(l) != KEY_INODE(r)) f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64; else @@ -591,7 +641,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j) static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t) { if (t != b->set) { - unsigned j = roundup(t[-1].size, + unsigned int j = roundup(t[-1].size, 64 / sizeof(struct bkey_float)); t->tree = t[-1].tree + j; @@ -633,17 +683,26 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) } EXPORT_SYMBOL(bch_bset_init_next); +/* + * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to + * accelerate bkey search in a btree node (pointed by bset_tree->data in + * memory). After search in the auxiliar tree by calling bset_search_tree(), + * a struct bset_search_iter is returned which indicates range [l, r] from + * bset_tree->data where the searching bkey might be inside. Then a followed + * linear comparison does the exact search, see __bch_bset_search() for how + * the auxiliary tree is used. + */ void bch_bset_build_written_tree(struct btree_keys *b) { struct bset_tree *t = bset_tree_last(b); struct bkey *prev = NULL, *k = t->data->start; - unsigned j, cacheline = 1; + unsigned int j, cacheline = 1; b->last_set_unwritten = 0; bset_alloc_tree(b, t); - t->size = min_t(unsigned, + t->size = min_t(unsigned int, bkey_to_cacheline(t, bset_bkey_last(t->data)), b->set->tree + btree_keys_cachelines(b) - t->tree); @@ -683,7 +742,7 @@ EXPORT_SYMBOL(bch_bset_build_written_tree); void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k) { struct bset_tree *t; - unsigned inorder, j = 1; + unsigned int inorder, j = 1; for (t = b->set; t <= bset_tree_last(b); t++) if (k < bset_bkey_last(t->data)) @@ -730,14 +789,15 @@ static void bch_bset_fix_lookup_table(struct btree_keys *b, struct bset_tree *t, struct bkey *k) { - unsigned shift = bkey_u64s(k); - unsigned j = bkey_to_cacheline(t, k); + unsigned int shift = bkey_u64s(k); + unsigned int j = bkey_to_cacheline(t, k); /* We're getting called from btree_split() or btree_gc, just bail out */ if (!t->size) return; - /* k is the key we just inserted; we need to find the entry in the + /* + * k is the key we just inserted; we need to find the entry in the * lookup table for the first key that is strictly greater than k: * it's either k's cacheline or the next one */ @@ -745,7 +805,8 @@ static void bch_bset_fix_lookup_table(struct btree_keys *b, table_to_bkey(t, j) <= k) j++; - /* Adjust all the lookup table entries, and find a new key for any that + /* + * Adjust all the lookup table entries, and find a new key for any that * have gotten too big */ for (; j < t->size; j++) { @@ -770,7 +831,8 @@ static void bch_bset_fix_lookup_table(struct btree_keys *b, k != bset_bkey_last(t->data); k = bkey_next(k)) if (t->size == bkey_to_cacheline(t, k)) { - t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k); + t->prev[t->size] = + bkey_to_cacheline_offset(t, t->size, k); t->size++; } } @@ -818,10 +880,10 @@ void bch_bset_insert(struct btree_keys *b, struct bkey *where, } EXPORT_SYMBOL(bch_bset_insert); -unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k, +unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, struct bkey *replace_key) { - unsigned status = BTREE_INSERT_STATUS_NO_INSERT; + unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; struct btree_iter iter; @@ -873,10 +935,10 @@ struct bset_search_iter { static struct bset_search_iter bset_search_write_set(struct bset_tree *t, const struct bkey *search) { - unsigned li = 0, ri = t->size; + unsigned int li = 0, ri = t->size; while (li + 1 != ri) { - unsigned m = (li + ri) >> 1; + unsigned int m = (li + ri) >> 1; if (bkey_cmp(table_to_bkey(t, m), search) > 0) ri = m; @@ -895,10 +957,22 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, { struct bkey *l, *r; struct bkey_float *f; - unsigned inorder, j, n = 1; + unsigned int inorder, j, n = 1; do { - unsigned p = n << 4; + /* + * A bit trick here. + * If p < t->size, (int)(p - t->size) is a minus value and + * the most significant bit is set, right shifting 31 bits + * gets 1. If p >= t->size, the most significant bit is + * not set, right shifting 31 bits gets 0. + * So the following 2 lines equals to + * if (p >= t->size) + * p = 0; + * but a branch instruction is avoided. + */ + unsigned int p = n << 4; + p &= ((int) (p - t->size)) >> 31; prefetch(&t->tree[p]); @@ -907,6 +981,9 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, f = &t->tree[j]; /* + * Similar bit trick, use subtract operation to avoid a branch + * instruction. + * * n = (f->mantissa > bfloat_mantissa()) * ? j * 2 * : j * 2 + 1; @@ -915,7 +992,7 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, * to work - that's done in make_bfloat() */ if (likely(f->exponent != 127)) - n = j * 2 + (((unsigned) + n = j * 2 + (((unsigned int) (f->mantissa - bfloat_mantissa(search, f))) >> 31); else @@ -1046,6 +1123,7 @@ static struct bkey *__bch_btree_iter_init(struct btree_keys *b, struct bset_tree *start) { struct bkey *ret = NULL; + iter->size = ARRAY_SIZE(iter->data); iter->used = 0; @@ -1121,7 +1199,8 @@ void bch_bset_sort_state_free(struct bset_sort_state *state) mempool_exit(&state->pool); } -int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) +int bch_bset_sort_state_init(struct bset_sort_state *state, + unsigned int page_order) { spin_lock_init(&state->time.lock); @@ -1174,7 +1253,7 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out, } static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, - unsigned start, unsigned order, bool fixup, + unsigned int start, unsigned int order, bool fixup, struct bset_sort_state *state) { uint64_t start_time; @@ -1225,7 +1304,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, bch_time_stats_update(&state->time, start_time); } -void bch_btree_sort_partial(struct btree_keys *b, unsigned start, +void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; @@ -1235,7 +1314,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned start, __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); if (start) { - unsigned i; + unsigned int i; for (i = start; i <= b->nsets; i++) keys += b->set[i].data->keys; @@ -1260,8 +1339,8 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, struct bset_sort_state *state) { uint64_t start_time = local_clock(); - struct btree_iter iter; + bch_btree_iter_init(b, &iter, NULL); btree_mergesort(b, new->set->data, &iter, false, true); @@ -1275,7 +1354,7 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) { - unsigned crit = SORT_CRIT; + unsigned int crit = SORT_CRIT; int i; /* Don't sort if nothing to do */ @@ -1304,7 +1383,7 @@ EXPORT_SYMBOL(bch_btree_sort_lazy); void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) { - unsigned i; + unsigned int i; for (i = 0; i <= b->nsets; i++) { struct bset_tree *t = &b->set[i]; diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index b867f2200495..bac76aabca6d 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -163,10 +163,10 @@ struct bset_tree { */ /* size of the binary tree and prev array */ - unsigned size; + unsigned int size; /* function of size - precalculated for to_inorder() */ - unsigned extra; + unsigned int extra; /* copy of the last key in the set */ struct bkey end; @@ -187,18 +187,25 @@ struct bset_tree { }; struct btree_keys_ops { - bool (*sort_cmp)(struct btree_iter_set, - struct btree_iter_set); - struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *); - bool (*insert_fixup)(struct btree_keys *, struct bkey *, - struct btree_iter *, struct bkey *); - bool (*key_invalid)(struct btree_keys *, - const struct bkey *); - bool (*key_bad)(struct btree_keys *, const struct bkey *); - bool (*key_merge)(struct btree_keys *, - struct bkey *, struct bkey *); - void (*key_to_text)(char *, size_t, const struct bkey *); - void (*key_dump)(struct btree_keys *, const struct bkey *); + bool (*sort_cmp)(struct btree_iter_set l, + struct btree_iter_set r); + struct bkey *(*sort_fixup)(struct btree_iter *iter, + struct bkey *tmp); + bool (*insert_fixup)(struct btree_keys *b, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key); + bool (*key_invalid)(struct btree_keys *bk, + const struct bkey *k); + bool (*key_bad)(struct btree_keys *bk, + const struct bkey *k); + bool (*key_merge)(struct btree_keys *bk, + struct bkey *l, struct bkey *r); + void (*key_to_text)(char *buf, + size_t size, + const struct bkey *k); + void (*key_dump)(struct btree_keys *keys, + const struct bkey *k); /* * Only used for deciding whether to use START_KEY(k) or just the key @@ -211,7 +218,7 @@ struct btree_keys { const struct btree_keys_ops *ops; uint8_t page_order; uint8_t nsets; - unsigned last_set_unwritten:1; + unsigned int last_set_unwritten:1; bool *expensive_debug_checks; /* @@ -239,12 +246,14 @@ static inline bool bkey_written(struct btree_keys *b, struct bkey *k) return !b->last_set_unwritten || k < b->set[b->nsets].data->start; } -static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i) +static inline unsigned int bset_byte_offset(struct btree_keys *b, + struct bset *i) { return ((size_t) i) - ((size_t) b->set->data); } -static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i) +static inline unsigned int bset_sector_offset(struct btree_keys *b, + struct bset *i) { return bset_byte_offset(b, i) >> 9; } @@ -273,25 +282,27 @@ static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) } static inline struct bset *bset_next_set(struct btree_keys *b, - unsigned block_bytes) + unsigned int block_bytes) { struct bset *i = bset_tree_last(b)->data; return ((void *) i) + roundup(set_bytes(i), block_bytes); } -void bch_btree_keys_free(struct btree_keys *); -int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t); -void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *, - bool *); - -void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); -void bch_bset_build_written_tree(struct btree_keys *); -void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); -bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *); -void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); -unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *, - struct bkey *); +void bch_btree_keys_free(struct btree_keys *b); +int bch_btree_keys_alloc(struct btree_keys *b, unsigned int page_order, + gfp_t gfp); +void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, + bool *expensive_debug_checks); + +void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic); +void bch_bset_build_written_tree(struct btree_keys *b); +void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k); +bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r); +void bch_bset_insert(struct btree_keys *b, struct bkey *where, + struct bkey *insert); +unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, + struct bkey *replace_key); enum { BTREE_INSERT_STATUS_NO_INSERT = 0, @@ -313,18 +324,21 @@ struct btree_iter { } data[MAX_BSETS]; }; -typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *); +typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); -struct bkey *bch_btree_iter_next(struct btree_iter *); -struct bkey *bch_btree_iter_next_filter(struct btree_iter *, - struct btree_keys *, ptr_filter_fn); +struct bkey *bch_btree_iter_next(struct btree_iter *iter); +struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, + struct btree_keys *b, + ptr_filter_fn fn); -void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); -struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *, - struct bkey *); +void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, + struct bkey *end); +struct bkey *bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search); -struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *, - const struct bkey *); +struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, + const struct bkey *search); /* * Returns the first key that is strictly greater than search @@ -349,21 +363,23 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b, struct bset_sort_state { mempool_t pool; - unsigned page_order; - unsigned crit_factor; + unsigned int page_order; + unsigned int crit_factor; struct time_stats time; }; -void bch_bset_sort_state_free(struct bset_sort_state *); -int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); -void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *); -void bch_btree_sort_into(struct btree_keys *, struct btree_keys *, - struct bset_sort_state *); -void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *, - struct bset_sort_state *); -void bch_btree_sort_partial(struct btree_keys *, unsigned, - struct bset_sort_state *); +void bch_bset_sort_state_free(struct bset_sort_state *state); +int bch_bset_sort_state_init(struct bset_sort_state *state, + unsigned int page_order); +void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state); +void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, + struct bset_sort_state *state); +void bch_btree_sort_and_fix_extents(struct btree_keys *b, + struct btree_iter *iter, + struct bset_sort_state *state); +void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, + struct bset_sort_state *state); static inline void bch_btree_sort(struct btree_keys *b, struct bset_sort_state *state) @@ -377,13 +393,13 @@ struct bset_stats { size_t floats, failed; }; -void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *); +void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *state); /* Bkey utility code */ #define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) -static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) +static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx) { return bkey_idx(i->start, idx); } @@ -401,10 +417,10 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l, : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); } -void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, - unsigned); -bool __bch_cut_front(const struct bkey *, struct bkey *); -bool __bch_cut_back(const struct bkey *, struct bkey *); +void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, + unsigned int i); +bool __bch_cut_front(const struct bkey *where, struct bkey *k); +bool __bch_cut_back(const struct bkey *where, struct bkey *k); static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) { @@ -522,18 +538,20 @@ static inline size_t bch_keylist_bytes(struct keylist *l) return bch_keylist_nkeys(l) * sizeof(uint64_t); } -struct bkey *bch_keylist_pop(struct keylist *); -void bch_keylist_pop_front(struct keylist *); -int __bch_keylist_realloc(struct keylist *, unsigned); +struct bkey *bch_keylist_pop(struct keylist *l); +void bch_keylist_pop_front(struct keylist *l); +int __bch_keylist_realloc(struct keylist *l, unsigned int u64s); /* Debug stuff */ #ifdef CONFIG_BCACHE_DEBUG -int __bch_count_data(struct btree_keys *); -void __printf(2, 3) __bch_check_keys(struct btree_keys *, const char *, ...); -void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); -void bch_dump_bucket(struct btree_keys *); +int __bch_count_data(struct btree_keys *b); +void __printf(2, 3) __bch_check_keys(struct btree_keys *b, + const char *fmt, + ...); +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set); +void bch_dump_bucket(struct btree_keys *b); #else @@ -541,7 +559,7 @@ static inline int __bch_count_data(struct btree_keys *b) { return -1; } static inline void __printf(2, 3) __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} static inline void bch_dump_bucket(struct btree_keys *b) {} -void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set); #endif diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 547c9eedc2f4..e7d4817681f2 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -90,6 +90,9 @@ #define MAX_NEED_GC 64 #define MAX_SAVE_PRIO 72 +#define MAX_GC_TIMES 100 +#define MIN_GC_NODES 100 +#define GC_SLEEP_MS 100 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) @@ -180,7 +183,7 @@ static void bch_btree_init_next(struct btree *b) void bkey_put(struct cache_set *c, struct bkey *k) { - unsigned i; + unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) @@ -284,6 +287,7 @@ err: static void btree_node_read_endio(struct bio *bio) { struct closure *cl = bio->bi_private; + closure_put(cl); } @@ -432,7 +436,10 @@ static void do_btree_node_write(struct btree *b) continue_at(cl, btree_node_write_done, NULL); } else { - /* No problem for multipage bvec since the bio is just allocated */ + /* + * No problem for multipage bvec since the bio is + * just allocated + */ b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); @@ -476,7 +483,7 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent) void bch_btree_node_write(struct btree *b, struct closure *parent) { - unsigned nsets = b->keys.nsets; + unsigned int nsets = b->keys.nsets; lockdep_assert_held(&b->lock); @@ -578,7 +585,7 @@ static void mca_bucket_free(struct btree *b) list_move(&b->list, &b->c->btree_cache_freeable); } -static unsigned btree_order(struct bkey *k) +static unsigned int btree_order(struct bkey *k) { return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); } @@ -586,7 +593,7 @@ static unsigned btree_order(struct bkey *k) static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) { if (!bch_btree_keys_alloc(&b->keys, - max_t(unsigned, + max_t(unsigned int, ilog2(b->c->btree_pages), btree_order(k)), gfp)) { @@ -601,6 +608,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, struct bkey *k, gfp_t gfp) { struct btree *b = kzalloc(sizeof(struct btree), gfp); + if (!b) return NULL; @@ -617,7 +625,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, return b; } -static int mca_reap(struct btree *b, unsigned min_order, bool flush) +static int mca_reap(struct btree *b, unsigned int min_order, bool flush) { struct closure cl; @@ -743,6 +751,7 @@ void bch_btree_cache_free(struct cache_set *c) { struct btree *b; struct closure cl; + closure_init_stack(&cl); if (c->shrink.list.next) @@ -783,7 +792,7 @@ void bch_btree_cache_free(struct cache_set *c) int bch_btree_cache_alloc(struct cache_set *c) { - unsigned i; + unsigned int i; for (i = 0; i < mca_reserve(c); i++) if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) @@ -1008,6 +1017,13 @@ retry: BUG_ON(b->level != level); } + if (btree_node_io_error(b)) { + rw_unlock(write, b); + return ERR_PTR(-EIO); + } + + BUG_ON(!b->written); + b->parent = parent; b->accessed = 1; @@ -1019,13 +1035,6 @@ retry: for (; i <= b->keys.nsets; i++) prefetch(b->keys.set[i].data); - if (btree_node_io_error(b)) { - rw_unlock(write, b); - return ERR_PTR(-EIO); - } - - BUG_ON(!b->written); - return b; } @@ -1121,6 +1130,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, struct btree_op *op) { struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); + if (!IS_ERR_OR_NULL(n)) { mutex_lock(&n->write_lock); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); @@ -1133,7 +1143,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, static void make_btree_freeing_key(struct btree *b, struct bkey *k) { - unsigned i; + unsigned int i; mutex_lock(&b->c->bucket_lock); @@ -1154,7 +1164,7 @@ static int btree_check_reserve(struct btree *b, struct btree_op *op) { struct cache_set *c = b->c; struct cache *ca; - unsigned i, reserve = (c->root->level - b->level) * 2 + 1; + unsigned int i, reserve = (c->root->level - b->level) * 2 + 1; mutex_lock(&c->bucket_lock); @@ -1178,7 +1188,7 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) { uint8_t stale = 0; - unsigned i; + unsigned int i; struct bucket *g; /* @@ -1216,7 +1226,7 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, SET_GC_MARK(g, GC_MARK_RECLAIMABLE); /* guard against overflow */ - SET_GC_SECTORS_USED(g, min_t(unsigned, + SET_GC_SECTORS_USED(g, min_t(unsigned int, GC_SECTORS_USED(g) + KEY_SIZE(k), MAX_GC_SECTORS_USED)); @@ -1230,7 +1240,7 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) { - unsigned i; + unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i) && @@ -1256,7 +1266,7 @@ void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; - unsigned keys = 0, good_keys = 0; + unsigned int keys = 0, good_keys = 0; struct bkey *k; struct btree_iter iter; struct bset_tree *t; @@ -1299,16 +1309,18 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) struct gc_merge_info { struct btree *b; - unsigned keys; + unsigned int keys; }; -static int bch_btree_insert_node(struct btree *, struct btree_op *, - struct keylist *, atomic_t *, struct bkey *); +static int bch_btree_insert_node(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + atomic_t *journal_ref, + struct bkey *replace_key); static int btree_gc_coalesce(struct btree *b, struct btree_op *op, struct gc_stat *gc, struct gc_merge_info *r) { - unsigned i, nodes = 0, keys = 0, blocks; + unsigned int i, nodes = 0, keys = 0, blocks; struct btree *new_nodes[GC_MERGE_NODES]; struct keylist keylist; struct closure cl; @@ -1508,11 +1520,11 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, return -EINTR; } -static unsigned btree_gc_count_keys(struct btree *b) +static unsigned int btree_gc_count_keys(struct btree *b) { struct bkey *k; struct btree_iter iter; - unsigned ret = 0; + unsigned int ret = 0; for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) ret += bkey_u64s(k); @@ -1520,6 +1532,32 @@ static unsigned btree_gc_count_keys(struct btree *b) return ret; } +static size_t btree_gc_min_nodes(struct cache_set *c) +{ + size_t min_nodes; + + /* + * Since incremental GC would stop 100ms when front + * side I/O comes, so when there are many btree nodes, + * if GC only processes constant (100) nodes each time, + * GC would last a long time, and the front side I/Os + * would run out of the buckets (since no new bucket + * can be allocated during GC), and be blocked again. + * So GC should not process constant nodes, but varied + * nodes according to the number of btree nodes, which + * realized by dividing GC into constant(100) times, + * so when there are many btree nodes, GC can process + * more nodes each time, otherwise, GC will process less + * nodes each time (but no less than MIN_GC_NODES) + */ + min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; + if (min_nodes < MIN_GC_NODES) + min_nodes = MIN_GC_NODES; + + return min_nodes; +} + + static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) { @@ -1585,6 +1623,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); r->b = NULL; + if (atomic_read(&b->c->search_inflight) && + gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { + gc->nodes_pre = gc->nodes; + ret = -EAGAIN; + break; + } + if (need_resched()) { ret = -EAGAIN; break; @@ -1642,7 +1687,7 @@ static void btree_gc_start(struct cache_set *c) { struct cache *ca; struct bucket *b; - unsigned i; + unsigned int i; if (!c->gc_mark_valid) return; @@ -1668,7 +1713,7 @@ static void bch_btree_gc_finish(struct cache_set *c) { struct bucket *b; struct cache *ca; - unsigned i; + unsigned int i; mutex_lock(&c->bucket_lock); @@ -1686,7 +1731,7 @@ static void bch_btree_gc_finish(struct cache_set *c) struct bcache_device *d = c->devices[i]; struct cached_dev *dc; struct keybuf_key *w, *n; - unsigned j; + unsigned int j; if (!d || UUID_FLASH_ONLY(&c->uuids[i])) continue; @@ -1753,7 +1798,10 @@ static void bch_btree_gc(struct cache_set *c) closure_sync(&writes); cond_resched(); - if (ret && ret != -EAGAIN) + if (ret == -EAGAIN) + schedule_timeout_interruptible(msecs_to_jiffies + (GC_SLEEP_MS)); + else if (ret) pr_warn("gc failed!"); } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -1775,7 +1823,7 @@ static void bch_btree_gc(struct cache_set *c) static bool gc_should_run(struct cache_set *c) { struct cache *ca; - unsigned i; + unsigned int i; for_each_cache(ca, c, i) if (ca->invalidate_needs_gc) @@ -1834,8 +1882,14 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) do { k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); - if (k) + if (k) { btree_node_prefetch(b, k); + /* + * initiallize c->gc_stats.nodes + * for incremental GC + */ + b->c->gc_stats.nodes++; + } if (p) ret = btree(check_recurse, p, b, op); @@ -1860,7 +1914,7 @@ void bch_initial_gc_finish(struct cache_set *c) { struct cache *ca; struct bucket *b; - unsigned i; + unsigned int i; bch_btree_gc_finish(c); @@ -1900,7 +1954,7 @@ void bch_initial_gc_finish(struct cache_set *c) static bool btree_insert_key(struct btree *b, struct bkey *k, struct bkey *replace_key) { - unsigned status; + unsigned int status; BUG_ON(bkey_cmp(k, &b->key) > 0); @@ -1999,7 +2053,7 @@ static int btree_split(struct btree *b, struct btree_op *op, block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; if (split) { - unsigned keys = 0; + unsigned int keys = 0; trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); @@ -2177,10 +2231,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, rw_lock(true, b, b->level); if (b->key.ptr[0] != btree_ptr || - b->seq != seq + 1) { + b->seq != seq + 1) { op->lock = b->level; goto out; - } + } } SET_KEY_PTRS(check_key, 1); @@ -2255,7 +2309,7 @@ int bch_btree_insert(struct cache_set *c, struct keylist *keys, void bch_btree_set_root(struct btree *b) { - unsigned i; + unsigned int i; struct closure cl; closure_init_stack(&cl); @@ -2367,7 +2421,7 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, struct refill { struct btree_op op; - unsigned nr_found; + unsigned int nr_found; struct keybuf *buf; struct bkey *end; keybuf_pred_fn *pred; @@ -2443,6 +2497,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, if (!RB_EMPTY_ROOT(&buf->keys)) { struct keybuf_key *w; + w = RB_FIRST(&buf->keys, struct keybuf_key, node); buf->start = START_KEY(&w->key); @@ -2474,6 +2529,7 @@ bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, { bool ret = false; struct keybuf_key *p, *w, s; + s.key = *start; if (bkey_cmp(end, &buf->start) <= 0 || @@ -2500,6 +2556,7 @@ bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, struct keybuf_key *bch_keybuf_next(struct keybuf *buf) { struct keybuf_key *w; + spin_lock(&buf->lock); w = RB_FIRST(&buf->keys, struct keybuf_key, node); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d211e2c25b6b..a68d6c55783b 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -152,7 +152,7 @@ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ \ static inline void set_btree_node_ ## flag(struct btree *b) \ -{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ +{ set_bit(BTREE_NODE_ ## flag, &b->flags); } enum btree_flags { BTREE_NODE_io_error, @@ -184,7 +184,7 @@ static inline struct bset *btree_bset_last(struct btree *b) return bset_tree_last(&b->keys)->data; } -static inline unsigned bset_block_offset(struct btree *b, struct bset *i) +static inline unsigned int bset_block_offset(struct btree *b, struct bset *i) { return bset_sector_offset(&b->keys, i) >> b->c->block_bits; } @@ -213,7 +213,7 @@ struct btree_op { /* Btree level at which we start taking write locks */ short lock; - unsigned insert_collision:1; + unsigned int insert_collision:1; }; static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) @@ -238,26 +238,28 @@ static inline void rw_unlock(bool w, struct btree *b) (w ? up_write : up_read)(&b->lock); } -void bch_btree_node_read_done(struct btree *); -void __bch_btree_node_write(struct btree *, struct closure *); -void bch_btree_node_write(struct btree *, struct closure *); - -void bch_btree_set_root(struct btree *); -struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *, - int, bool, struct btree *); -struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *, - struct bkey *, int, bool, struct btree *); - -int bch_btree_insert_check_key(struct btree *, struct btree_op *, - struct bkey *); -int bch_btree_insert(struct cache_set *, struct keylist *, - atomic_t *, struct bkey *); - -int bch_gc_thread_start(struct cache_set *); -void bch_initial_gc_finish(struct cache_set *); -void bch_moving_gc(struct cache_set *); -int bch_btree_check(struct cache_set *); -void bch_initial_mark_key(struct cache_set *, int, struct bkey *); +void bch_btree_node_read_done(struct btree *b); +void __bch_btree_node_write(struct btree *b, struct closure *parent); +void bch_btree_node_write(struct btree *b, struct closure *parent); + +void bch_btree_set_root(struct btree *b); +struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + int level, bool wait, + struct btree *parent); +struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level, bool write, + struct btree *parent); + +int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, + struct bkey *check_key); +int bch_btree_insert(struct cache_set *c, struct keylist *keys, + atomic_t *journal_ref, struct bkey *replace_key); + +int bch_gc_thread_start(struct cache_set *c); +void bch_initial_gc_finish(struct cache_set *c); +void bch_moving_gc(struct cache_set *c); +int bch_btree_check(struct cache_set *c); +void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k); static inline void wake_up_gc(struct cache_set *c) { @@ -272,9 +274,9 @@ static inline void wake_up_gc(struct cache_set *c) #define MAP_END_KEY 1 -typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *); -int __bch_btree_map_nodes(struct btree_op *, struct cache_set *, - struct bkey *, btree_map_nodes_fn *, int); +typedef int (btree_map_nodes_fn)(struct btree_op *b_op, struct btree *b); +int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn, int flags); static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_nodes_fn *fn) @@ -290,21 +292,23 @@ static inline int bch_btree_map_leaf_nodes(struct btree_op *op, return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES); } -typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *, - struct bkey *); -int bch_btree_map_keys(struct btree_op *, struct cache_set *, - struct bkey *, btree_map_keys_fn *, int); - -typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); - -void bch_keybuf_init(struct keybuf *); -void bch_refill_keybuf(struct cache_set *, struct keybuf *, - struct bkey *, keybuf_pred_fn *); -bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, - struct bkey *); -void bch_keybuf_del(struct keybuf *, struct keybuf_key *); -struct keybuf_key *bch_keybuf_next(struct keybuf *); -struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, - struct bkey *, keybuf_pred_fn *); +typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b, + struct bkey *k); +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags); + +typedef bool (keybuf_pred_fn)(struct keybuf *buf, struct bkey *k); + +void bch_keybuf_init(struct keybuf *buf); +void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, + struct bkey *end, keybuf_pred_fn *pred); +bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, + struct bkey *end); +void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w); +struct keybuf_key *bch_keybuf_next(struct keybuf *buf); +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, + struct keybuf *buf, + struct bkey *end, + keybuf_pred_fn *pred); void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats); #endif diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 0e14969182c6..73f5319295bc 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Asynchronous refcounty things * @@ -162,12 +163,13 @@ static struct dentry *closure_debug; static int debug_seq_show(struct seq_file *f, void *data) { struct closure *cl; + spin_lock_irq(&closure_list_lock); list_for_each_entry(cl, &closure_list, all) { int r = atomic_read(&cl->remaining); - seq_printf(f, "%p: %pF -> %pf p %p r %i ", + seq_printf(f, "%p: %pS -> %pS p %p r %i ", cl, (void *) cl->ip, cl->fn, cl->parent, r & CLOSURE_REMAINING_MASK); @@ -177,7 +179,7 @@ static int debug_seq_show(struct seq_file *f, void *data) r & CLOSURE_RUNNING ? "R" : ""); if (r & CLOSURE_WAITING) - seq_printf(f, " W %pF\n", + seq_printf(f, " W %pS\n", (void *) cl->waiting_on); seq_printf(f, "\n"); @@ -199,11 +201,16 @@ static const struct file_operations debug_ops = { .release = single_release }; -int __init closure_debug_init(void) +void __init closure_debug_init(void) { - closure_debug = debugfs_create_file("closures", - 0400, bcache_debug, NULL, &debug_ops); - return IS_ERR_OR_NULL(closure_debug); + if (!IS_ERR_OR_NULL(bcache_debug)) + /* + * it is unnecessary to check return value of + * debugfs_create_file(), we should not care + * about this. + */ + closure_debug = debugfs_create_file( + "closures", 0400, bcache_debug, NULL, &debug_ops); } #endif diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 71427eb5fdae..eca0d496b686 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -159,7 +159,7 @@ struct closure { #define CLOSURE_MAGIC_DEAD 0xc054dead #define CLOSURE_MAGIC_ALIVE 0xc054a11e - unsigned magic; + unsigned int magic; struct list_head all; unsigned long ip; unsigned long waiting_on; @@ -186,13 +186,13 @@ static inline void closure_sync(struct closure *cl) #ifdef CONFIG_BCACHE_CLOSURES_DEBUG -int closure_debug_init(void); +void closure_debug_init(void); void closure_debug_create(struct closure *cl); void closure_debug_destroy(struct closure *cl); #else -static inline int closure_debug_init(void) { return 0; } +static inline void closure_debug_init(void) {} static inline void closure_debug_create(struct closure *cl) {} static inline void closure_debug_destroy(struct closure *cl) {} @@ -289,10 +289,12 @@ static inline void closure_init_stack(struct closure *cl) } /** - * closure_wake_up - wake up all closures on a wait list. + * closure_wake_up - wake up all closures on a wait list, + * with memory barrier */ static inline void closure_wake_up(struct closure_waitlist *list) { + /* Memory barrier for the wait list */ smp_mb(); __closure_wake_up(list); } diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index d030ce3025a6..06da66b2488a 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -67,34 +67,35 @@ void bch_btree_verify(struct btree *b) if (inmemory->keys != sorted->keys || memcmp(inmemory->start, sorted->start, - (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) { + (void *) bset_bkey_last(inmemory) - + (void *) inmemory->start)) { struct bset *i; - unsigned j; + unsigned int j; console_lock(); - printk(KERN_ERR "*** in memory:\n"); + pr_err("*** in memory:\n"); bch_dump_bset(&b->keys, inmemory, 0); - printk(KERN_ERR "*** read back in:\n"); + pr_err("*** read back in:\n"); bch_dump_bset(&v->keys, sorted, 0); for_each_written_bset(b, ondisk, i) { - unsigned block = ((void *) i - (void *) ondisk) / + unsigned int block = ((void *) i - (void *) ondisk) / block_bytes(b->c); - printk(KERN_ERR "*** on disk block %u:\n", block); + pr_err("*** on disk block %u:\n", block); bch_dump_bset(&b->keys, i, block); } - printk(KERN_ERR "*** block %zu not written\n", + pr_err("*** block %zu not written\n", ((void *) i - (void *) ondisk) / block_bytes(b->c)); for (j = 0; j < inmemory->keys; j++) if (inmemory->d[j] != sorted->d[j]) break; - printk(KERN_ERR "b->written %u\n", b->written); + pr_err("b->written %u\n", b->written); console_unlock(); panic("verify failed at %u\n", j); @@ -110,11 +111,15 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) struct bio_vec bv, cbv; struct bvec_iter iter, citer = { 0 }; - check = bio_clone_kmalloc(bio, GFP_NOIO); + check = bio_kmalloc(GFP_NOIO, bio_segments(bio)); if (!check) return; + check->bi_disk = bio->bi_disk; check->bi_opf = REQ_OP_READ; + check->bi_iter.bi_sector = bio->bi_iter.bi_sector; + check->bi_iter.bi_size = bio->bi_iter.bi_size; + bch_bio_map(check, NULL); if (bch_bio_alloc_pages(check, GFP_NOIO)) goto out_put; @@ -172,9 +177,9 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, while (size) { struct keybuf_key *w; - unsigned bytes = min(i->bytes, size); - + unsigned int bytes = min(i->bytes, size); int err = copy_to_user(buf, i->buf, bytes); + if (err) return err; @@ -233,8 +238,8 @@ void bch_debug_init_cache_set(struct cache_set *c) { if (!IS_ERR_OR_NULL(bcache_debug)) { char name[50]; - snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); + snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); c->debug = debugfs_create_file(name, 0400, bcache_debug, c, &cache_set_debug_ops); } @@ -248,11 +253,12 @@ void bch_debug_exit(void) debugfs_remove_recursive(bcache_debug); } -int __init bch_debug_init(struct kobject *kobj) +void __init bch_debug_init(struct kobject *kobj) { - if (!IS_ENABLED(CONFIG_DEBUG_FS)) - return 0; - + /* + * it is unnecessary to check return value of + * debugfs_create_file(), we should not care + * about this. + */ bcache_debug = debugfs_create_dir("bcache", NULL); - return IS_ERR_OR_NULL(bcache_debug); } diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index acc48d3fa274..fb3d4dff4b26 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -8,8 +8,8 @@ struct cache_set; #ifdef CONFIG_BCACHE_DEBUG -void bch_btree_verify(struct btree *); -void bch_data_verify(struct cached_dev *, struct bio *); +void bch_btree_verify(struct btree *b); +void bch_data_verify(struct cached_dev *dc, struct bio *bio); #define expensive_debug_checks(c) ((c)->expensive_debug_checks) #define key_merging_disabled(c) ((c)->key_merging_disabled) @@ -27,7 +27,7 @@ static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} #endif #ifdef CONFIG_DEBUG_FS -void bch_debug_init_cache_set(struct cache_set *); +void bch_debug_init_cache_set(struct cache_set *c); #else static inline void bch_debug_init_cache_set(struct cache_set *c) {} #endif diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 1d096742eb41..c809724e6571 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -46,7 +46,7 @@ static bool bch_key_sort_cmp(struct btree_iter_set l, static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) { - unsigned i; + unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { @@ -67,7 +67,7 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) { - unsigned i; + unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { @@ -96,7 +96,7 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) { - unsigned i = 0; + unsigned int i = 0; char *out = buf, *end = buf + size; #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) @@ -126,22 +126,22 @@ void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) { struct btree *b = container_of(keys, struct btree, keys); - unsigned j; + unsigned int j; char buf[80]; bch_extent_to_text(buf, sizeof(buf), k); - printk(" %s", buf); + pr_err(" %s", buf); for (j = 0; j < KEY_PTRS(k); j++) { size_t n = PTR_BUCKET_NR(b->c, k, j); - printk(" bucket %zu", n); + pr_err(" bucket %zu", n); if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) - printk(" prio %i", + pr_err(" prio %i", PTR_BUCKET(b->c, k, j)->prio); } - printk(" %s\n", bch_ptr_status(b->c, k)); + pr_err(" %s\n", bch_ptr_status(b->c, k)); } /* Btree ptrs */ @@ -166,12 +166,13 @@ bad: static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k) { struct btree *b = container_of(bk, struct btree, keys); + return __bch_btree_ptr_invalid(b->c, k); } static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) { - unsigned i; + unsigned int i; char buf[80]; struct bucket *g; @@ -204,7 +205,7 @@ err: static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) { struct btree *b = container_of(bk, struct btree, keys); - unsigned i; + unsigned int i; if (!bkey_cmp(k, &ZERO_KEY) || !KEY_PTRS(k) || @@ -327,13 +328,14 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, struct cache_set *c = container_of(b, struct btree, keys)->c; uint64_t old_offset; - unsigned old_size, sectors_found = 0; + unsigned int old_size, sectors_found = 0; BUG_ON(!KEY_OFFSET(insert)); BUG_ON(!KEY_SIZE(insert)); while (1) { struct bkey *k = bch_btree_iter_next(iter); + if (!k) break; @@ -363,7 +365,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, * k might have been split since we inserted/found the * key we're replacing */ - unsigned i; + unsigned int i; uint64_t offset = KEY_START(k) - KEY_START(replace_key); @@ -498,11 +500,12 @@ bad: static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) { struct btree *b = container_of(bk, struct btree, keys); + return __bch_extent_invalid(b->c, k); } static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, - unsigned ptr) + unsigned int ptr) { struct bucket *g = PTR_BUCKET(b->c, k, ptr); char buf[80]; @@ -534,7 +537,7 @@ err: static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) { struct btree *b = container_of(bk, struct btree, keys); - unsigned i, stale; + unsigned int i, stale; if (!KEY_PTRS(k) || bch_extent_invalid(bk, k)) @@ -574,10 +577,12 @@ static uint64_t merge_chksums(struct bkey *l, struct bkey *r) ~((uint64_t)1 << 63); } -static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r) +static bool bch_extent_merge(struct btree_keys *bk, + struct bkey *l, + struct bkey *r) { struct btree *b = container_of(bk, struct btree, keys); - unsigned i; + unsigned int i; if (key_merging_disabled(b->c)) return false; diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h index 0cd3575afa1d..4d667e05bb73 100644 --- a/drivers/md/bcache/extents.h +++ b/drivers/md/bcache/extents.h @@ -8,8 +8,8 @@ extern const struct btree_keys_ops bch_extent_keys_ops; struct bkey; struct cache_set; -void bch_extent_to_text(char *, size_t, const struct bkey *); -bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); -bool __bch_extent_invalid(struct cache_set *, const struct bkey *); +void bch_extent_to_text(char *buf, size_t size, const struct bkey *k); +bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k); +bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k); #endif /* _BCACHE_EXTENTS_H */ diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 9612873afee2..c25097968319 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -17,6 +17,7 @@ void bch_bbio_free(struct bio *bio, struct cache_set *c) { struct bbio *b = container_of(bio, struct bbio, bio); + mempool_free(b, &c->bio_meta); } @@ -42,9 +43,10 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) } void bch_submit_bbio(struct bio *bio, struct cache_set *c, - struct bkey *k, unsigned ptr) + struct bkey *k, unsigned int ptr) { struct bbio *b = container_of(bio, struct bbio, bio); + bch_bkey_copy_single_ptr(&b->key, k, ptr); __bch_submit_bbio(bio, c); } @@ -52,7 +54,7 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, /* IO errors */ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) { - unsigned errors; + unsigned int errors; WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); @@ -75,16 +77,16 @@ void bch_count_io_errors(struct cache *ca, */ if (ca->set->error_decay) { - unsigned count = atomic_inc_return(&ca->io_count); + unsigned int count = atomic_inc_return(&ca->io_count); while (count > ca->set->error_decay) { - unsigned errors; - unsigned old = count; - unsigned new = count - ca->set->error_decay; + unsigned int errors; + unsigned int old = count; + unsigned int new = count - ca->set->error_decay; /* * First we subtract refresh from count; each time we - * succesfully do so, we rescale the errors once: + * successfully do so, we rescale the errors once: */ count = atomic_cmpxchg(&ca->io_count, old, new); @@ -104,7 +106,7 @@ void bch_count_io_errors(struct cache *ca, } if (error) { - unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, + unsigned int errors = atomic_add_return(1 << IO_ERROR_SHIFT, &ca->io_errors); errors >>= IO_ERROR_SHIFT; @@ -126,18 +128,18 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, struct cache *ca = PTR_CACHE(c, &b->key, 0); int is_read = (bio_data_dir(bio) == READ ? 1 : 0); - unsigned threshold = op_is_write(bio_op(bio)) + unsigned int threshold = op_is_write(bio_op(bio)) ? c->congested_write_threshold_us : c->congested_read_threshold_us; if (threshold) { - unsigned t = local_clock_us(); - + unsigned int t = local_clock_us(); int us = t - b->submit_time_us; int congested = atomic_read(&c->congested); if (us > (int) threshold) { int ms = us / 1024; + c->congested_last_us = t; ms = min(ms, CONGESTED_MAX + congested); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 18f1b5239620..6116bbf870d8 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -28,11 +28,12 @@ static void journal_read_endio(struct bio *bio) { struct closure *cl = bio->bi_private; + closure_put(cl); } static int journal_read_bucket(struct cache *ca, struct list_head *list, - unsigned bucket_index) + unsigned int bucket_index) { struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio; @@ -40,7 +41,7 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, struct journal_replay *i; struct jset *j, *data = ca->set->journal.w[0].data; struct closure cl; - unsigned len, left, offset = 0; + unsigned int len, left, offset = 0; int ret = 0; sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); @@ -50,7 +51,7 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, while (offset < ca->sb.bucket_size) { reread: left = ca->sb.bucket_size - offset; - len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS); + len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); bio_reset(bio); bio->bi_iter.bi_sector = bucket + offset; @@ -154,12 +155,12 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) }) struct cache *ca; - unsigned iter; + unsigned int iter; for_each_cache(ca, c, iter) { struct journal_device *ja = &ca->journal; DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); - unsigned i, l, r, m; + unsigned int i, l, r, m; uint64_t seq; bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); @@ -192,7 +193,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); l < ca->sb.njournal_buckets; - l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1)) + l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, + l + 1)) if (read_bucket(l)) goto bsearch; @@ -304,7 +306,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) k < bset_bkey_last(&i->j); k = bkey_next(k)) if (!__bch_extent_invalid(c, k)) { - unsigned j; + unsigned int j; for (j = 0; j < KEY_PTRS(k); j++) if (ptr_available(c, k, j)) @@ -492,7 +494,7 @@ static void journal_reclaim(struct cache_set *c) struct bkey *k = &c->journal.key; struct cache *ca; uint64_t last_seq; - unsigned iter, n = 0; + unsigned int iter, n = 0; atomic_t p __maybe_unused; atomic_long_inc(&c->reclaim); @@ -526,7 +528,7 @@ static void journal_reclaim(struct cache_set *c) for_each_cache(ca, c, iter) { struct journal_device *ja = &ca->journal; - unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; /* No space available on this device */ if (next == ja->discard_idx) @@ -580,7 +582,7 @@ static void journal_write_endio(struct bio *bio) closure_put(&w->c->journal.io); } -static void journal_write(struct closure *); +static void journal_write(struct closure *cl); static void journal_write_done(struct closure *cl) { @@ -609,11 +611,12 @@ static void journal_write_unlocked(struct closure *cl) struct cache *ca; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; - unsigned i, sectors = set_blocks(w->data, block_bytes(c)) * + unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) * c->sb.block_size; struct bio *bio; struct bio_list list; + bio_list_init(&list); if (!w->need_write) { @@ -705,7 +708,7 @@ static void journal_try_write(struct cache_set *c) } static struct journal_write *journal_wait_for_write(struct cache_set *c, - unsigned nkeys) + unsigned int nkeys) __acquires(&c->journal.lock) { size_t sectors; @@ -828,6 +831,7 @@ void bch_journal_free(struct cache_set *c) free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); free_fifo(&c->journal.pin); + free_heap(&c->flush_btree); } int bch_journal_alloc(struct cache_set *c) diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index b5788199188f..66f0facff84b 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -110,7 +110,7 @@ struct journal { struct delayed_work work; /* Number of blocks free in the bucket(s) we're currently writing to */ - unsigned blocks_free; + unsigned int blocks_free; uint64_t seq; DECLARE_FIFO(atomic_t, pin); @@ -131,13 +131,13 @@ struct journal_device { uint64_t seq[SB_JOURNAL_BUCKETS]; /* Journal bucket we're currently writing to */ - unsigned cur_idx; + unsigned int cur_idx; /* Last journal bucket that still contains an open journal entry */ - unsigned last_idx; + unsigned int last_idx; /* Next journal bucket to be discarded */ - unsigned discard_idx; + unsigned int discard_idx; #define DISCARD_READY 0 #define DISCARD_IN_FLIGHT 1 @@ -167,14 +167,16 @@ struct cache_set; struct btree_op; struct keylist; -atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); -void bch_journal_next(struct journal *); -void bch_journal_mark(struct cache_set *, struct list_head *); -void bch_journal_meta(struct cache_set *, struct closure *); -int bch_journal_read(struct cache_set *, struct list_head *); -int bch_journal_replay(struct cache_set *, struct list_head *); - -void bch_journal_free(struct cache_set *); -int bch_journal_alloc(struct cache_set *); +atomic_t *bch_journal(struct cache_set *c, + struct keylist *keys, + struct closure *parent); +void bch_journal_next(struct journal *j); +void bch_journal_mark(struct cache_set *c, struct list_head *list); +void bch_journal_meta(struct cache_set *c, struct closure *cl); +int bch_journal_read(struct cache_set *c, struct list_head *list); +int bch_journal_replay(struct cache_set *c, struct list_head *list); + +void bch_journal_free(struct cache_set *c); +int bch_journal_alloc(struct cache_set *c); #endif /* _BCACHE_JOURNAL_H */ diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index a24c3a95b2c0..7891fb512736 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -23,7 +23,7 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) { struct cache_set *c = container_of(buf, struct cache_set, moving_gc_keys); - unsigned i; + unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i) && @@ -38,6 +38,7 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) static void moving_io_destructor(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); + kfree(io); } @@ -186,9 +187,10 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r) return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); } -static unsigned bucket_heap_top(struct cache *ca) +static unsigned int bucket_heap_top(struct cache *ca) { struct bucket *b; + return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; } @@ -196,7 +198,7 @@ void bch_moving_gc(struct cache_set *c) { struct cache *ca; struct bucket *b; - unsigned i; + unsigned int i; if (!c->copy_gc_enabled) return; @@ -204,9 +206,9 @@ void bch_moving_gc(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) { - unsigned sectors_to_move = 0; - unsigned reserve_sectors = ca->sb.bucket_size * - fifo_used(&ca->free[RESERVE_MOVINGGC]); + unsigned int sectors_to_move = 0; + unsigned int reserve_sectors = ca->sb.bucket_size * + fifo_used(&ca->free[RESERVE_MOVINGGC]); ca->heap.used = 0; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index ae67f5fa8047..51be355a3309 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -25,9 +25,9 @@ struct kmem_cache *bch_search_cache; -static void bch_data_insert_start(struct closure *); +static void bch_data_insert_start(struct closure *cl); -static unsigned cache_mode(struct cached_dev *dc) +static unsigned int cache_mode(struct cached_dev *dc) { return BDEV_CACHE_MODE(&dc->sb); } @@ -45,6 +45,7 @@ static void bio_csum(struct bio *bio, struct bkey *k) bio_for_each_segment(bv, bio, iter) { void *d = kmap(bv.bv_page) + bv.bv_offset; + csum = bch_crc64_update(csum, d, bv.bv_len); kunmap(bv.bv_page); } @@ -98,7 +99,7 @@ static void bch_data_insert_keys(struct closure *cl) closure_return(cl); } -static int bch_keylist_realloc(struct keylist *l, unsigned u64s, +static int bch_keylist_realloc(struct keylist *l, unsigned int u64s, struct cache_set *c) { size_t oldsize = bch_keylist_nkeys(l); @@ -107,7 +108,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned u64s, /* * The journalling code doesn't handle the case where the keys to insert * is bigger than an empty write: If we just return -ENOMEM here, - * bio_insert() and bio_invalidate() will insert the keys created so far + * bch_data_insert_keys() will insert the keys created so far * and finish the rest when the keylist is empty. */ if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) @@ -125,7 +126,7 @@ static void bch_data_invalidate(struct closure *cl) bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector); while (bio_sectors(bio)) { - unsigned sectors = min(bio_sectors(bio), + unsigned int sectors = min(bio_sectors(bio), 1U << (KEY_SIZE_BITS - 1)); if (bch_keylist_realloc(&op->insert_keys, 2, op->c)) @@ -135,7 +136,9 @@ static void bch_data_invalidate(struct closure *cl) bio->bi_iter.bi_size -= sectors << 9; bch_keylist_add(&op->insert_keys, - &KEY(op->inode, bio->bi_iter.bi_sector, sectors)); + &KEY(op->inode, + bio->bi_iter.bi_sector, + sectors)); } op->insert_data_done = true; @@ -151,7 +154,7 @@ static void bch_data_insert_error(struct closure *cl) /* * Our data write just errored, which means we've got a bunch of keys to - * insert that point to data that wasn't succesfully written. + * insert that point to data that wasn't successfully written. * * We don't have to insert those keys but we still have to invalidate * that region of the cache - so, if we just strip off all the pointers @@ -211,7 +214,7 @@ static void bch_data_insert_start(struct closure *cl) bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA); do { - unsigned i; + unsigned int i; struct bkey *k; struct bio_set *split = &op->c->bio_split; @@ -328,7 +331,7 @@ void bch_data_insert(struct closure *cl) /* Congested? */ -unsigned bch_get_congested(struct cache_set *c) +unsigned int bch_get_congested(struct cache_set *c) { int i; long rand; @@ -372,8 +375,8 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) { struct cache_set *c = dc->disk.c; - unsigned mode = cache_mode(dc); - unsigned sectors, congested = bch_get_congested(c); + unsigned int mode = cache_mode(dc); + unsigned int sectors, congested = bch_get_congested(c); struct task_struct *task = current; struct io *i; @@ -469,11 +472,11 @@ struct search { struct bio *cache_miss; struct bcache_device *d; - unsigned insert_bio_sectors; - unsigned recoverable:1; - unsigned write:1; - unsigned read_dirty_data:1; - unsigned cache_missed:1; + unsigned int insert_bio_sectors; + unsigned int recoverable:1; + unsigned int write:1; + unsigned int read_dirty_data:1; + unsigned int cache_missed:1; unsigned long start_time; @@ -514,20 +517,20 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) struct search *s = container_of(op, struct search, op); struct bio *n, *bio = &s->bio.bio; struct bkey *bio_key; - unsigned ptr; + unsigned int ptr; if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0) return MAP_CONTINUE; if (KEY_INODE(k) != s->iop.inode || KEY_START(k) > bio->bi_iter.bi_sector) { - unsigned bio_sectors = bio_sectors(bio); - unsigned sectors = KEY_INODE(k) == s->iop.inode + unsigned int bio_sectors = bio_sectors(bio); + unsigned int sectors = KEY_INODE(k) == s->iop.inode ? min_t(uint64_t, INT_MAX, KEY_START(k) - bio->bi_iter.bi_sector) : INT_MAX; - int ret = s->d->cache_miss(b, s, bio, sectors); + if (ret != MAP_CONTINUE) return ret; @@ -623,6 +626,7 @@ static void request_endio(struct bio *bio) if (bio->bi_status) { struct search *s = container_of(cl, struct search, cl); + s->iop.status = bio->bi_status; /* Only cache read errors are recoverable */ s->recoverable = false; @@ -667,8 +671,7 @@ static void backing_request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - generic_end_io_acct(s->d->disk->queue, - bio_data_dir(s->orig_bio), + generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio), &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); @@ -702,6 +705,8 @@ static void search_free(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); + atomic_dec(&s->d->c->search_inflight); + if (s->iop.bio) bio_put(s->iop.bio); @@ -719,6 +724,7 @@ static inline struct search *search_alloc(struct bio *bio, closure_init(&s->cl, NULL); do_bio_hook(s, bio, request_endio); + atomic_inc(&d->c->search_inflight); s->orig_bio = bio; s->cache_miss = NULL; @@ -811,7 +817,8 @@ static void cached_dev_read_done(struct closure *cl) if (s->iop.bio) { bio_reset(s->iop.bio); - s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector; + s->iop.bio->bi_iter.bi_sector = + s->cache_miss->bi_iter.bi_sector; bio_copy_dev(s->iop.bio, s->cache_miss); s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; bch_bio_map(s->iop.bio, NULL); @@ -854,10 +861,10 @@ static void cached_dev_read_done_bh(struct closure *cl) } static int cached_dev_cache_miss(struct btree *b, struct search *s, - struct bio *bio, unsigned sectors) + struct bio *bio, unsigned int sectors) { int ret = MAP_CONTINUE; - unsigned reada = 0; + unsigned int reada = 0; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; @@ -1062,8 +1069,7 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_end_io = ddip->bi_end_io; bio->bi_private = ddip->bi_private; - generic_end_io_acct(ddip->d->disk->queue, - bio_data_dir(bio), + generic_end_io_acct(ddip->d->disk->queue, bio_op(bio), &ddip->d->disk->part0, ddip->start_time); if (bio->bi_status) { @@ -1102,6 +1108,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) generic_make_request(bio); } +static void quit_max_writeback_rate(struct cache_set *c, + struct cached_dev *this_dc) +{ + int i; + struct bcache_device *d; + struct cached_dev *dc; + + /* + * mutex bch_register_lock may compete with other parallel requesters, + * or attach/detach operations on other backing device. Waiting to + * the mutex lock may increase I/O request latency for seconds or more. + * To avoid such situation, if mutext_trylock() failed, only writeback + * rate of current cached device is set to 1, and __update_write_back() + * will decide writeback rate of other cached devices (remember now + * c->idle_counter is 0 already). + */ + if (mutex_trylock(&bch_register_lock)) { + for (i = 0; i < c->devices_max_used; i++) { + if (!c->devices[i]) + continue; + + if (UUID_FLASH_ONLY(&c->uuids[i])) + continue; + + d = c->devices[i]; + dc = container_of(d, struct cached_dev, disk); + /* + * set writeback rate to default minimum value, + * then let update_writeback_rate() to decide the + * upcoming rate. + */ + atomic_long_set(&dc->writeback_rate.rate, 1); + } + mutex_unlock(&bch_register_lock); + } else + atomic_long_set(&this_dc->writeback_rate.rate, 1); +} + /* Cached devices - read & write stuff */ static blk_qc_t cached_dev_make_request(struct request_queue *q, @@ -1119,8 +1163,25 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, return BLK_QC_T_NONE; } - atomic_set(&dc->backing_idle, 0); - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); + if (likely(d->c)) { + if (atomic_read(&d->c->idle_counter)) + atomic_set(&d->c->idle_counter, 0); + /* + * If at_max_writeback_rate of cache set is true and new I/O + * comes, quit max writeback rate of all cached devices + * attached to this cache set, and set at_max_writeback_rate + * to false. + */ + if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) { + atomic_set(&d->c->at_max_writeback_rate, 0); + quit_max_writeback_rate(d->c, dc); + } + } + + generic_start_io_acct(q, + bio_op(bio), + bio_sectors(bio), + &d->disk->part0); bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += dc->sb.data_offset; @@ -1156,6 +1217,7 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, unsigned int cmd, unsigned long arg) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); + return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); } @@ -1170,7 +1232,7 @@ static int cached_dev_congested(void *data, int bits) return 1; if (cached_dev_get(dc)) { - unsigned i; + unsigned int i; struct cache *ca; for_each_cache(ca, d->c, i) { @@ -1197,9 +1259,9 @@ void bch_cached_dev_request_init(struct cached_dev *dc) /* Flash backed devices */ static int flash_dev_cache_miss(struct btree *b, struct search *s, - struct bio *bio, unsigned sectors) + struct bio *bio, unsigned int sectors) { - unsigned bytes = min(sectors, bio_sectors(bio)) << 9; + unsigned int bytes = min(sectors, bio_sectors(bio)) << 9; swap(bio->bi_iter.bi_size, bytes); zero_fill_bio(bio); @@ -1229,7 +1291,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, struct search *s; struct closure *cl; struct bcache_device *d = bio->bi_disk->private_data; - int rw = bio_data_dir(bio); if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { bio->bi_status = BLK_STS_IOERR; @@ -1237,7 +1298,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, return BLK_QC_T_NONE; } - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); + generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0); s = search_alloc(bio, d); cl = &s->cl; @@ -1254,7 +1315,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, flash_dev_nodata, bcache_wq); return BLK_QC_T_NONE; - } else if (rw) { + } else if (bio_data_dir(bio)) { bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &KEY(d->id, bio->bi_iter.bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); @@ -1283,7 +1344,7 @@ static int flash_dev_congested(void *data, int bits) struct bcache_device *d = data; struct request_queue *q; struct cache *ca; - unsigned i; + unsigned int i; int ret = 0; for_each_cache(ca, d->c, i) { @@ -1306,8 +1367,7 @@ void bch_flash_dev_request_init(struct bcache_device *d) void bch_request_exit(void) { - if (bch_search_cache) - kmem_cache_destroy(bch_search_cache); + kmem_cache_destroy(bch_search_cache); } int __init bch_request_init(void) diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index dea0886b81c1..aa055cfeb099 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -8,7 +8,7 @@ struct data_insert_op { struct bio *bio; struct workqueue_struct *wq; - unsigned inode; + unsigned int inode; uint16_t write_point; uint16_t write_prio; blk_status_t status; @@ -17,15 +17,15 @@ struct data_insert_op { uint16_t flags; struct { - unsigned bypass:1; - unsigned writeback:1; - unsigned flush_journal:1; - unsigned csum:1; + unsigned int bypass:1; + unsigned int writeback:1; + unsigned int flush_journal:1; + unsigned int csum:1; - unsigned replace:1; - unsigned replace_collision:1; + unsigned int replace:1; + unsigned int replace_collision:1; - unsigned insert_data_done:1; + unsigned int insert_data_done:1; }; }; @@ -33,7 +33,7 @@ struct data_insert_op { BKEY_PADDED(replace_key); }; -unsigned bch_get_congested(struct cache_set *); +unsigned int bch_get_congested(struct cache_set *c); void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index be119326297b..894410f3f829 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -33,11 +33,11 @@ * stored left shifted by 16, and scaled back in the sysfs show() function. */ -static const unsigned DAY_RESCALE = 288; -static const unsigned HOUR_RESCALE = 12; -static const unsigned FIVE_MINUTE_RESCALE = 1; -static const unsigned accounting_delay = (HZ * 300) / 22; -static const unsigned accounting_weight = 32; +static const unsigned int DAY_RESCALE = 288; +static const unsigned int HOUR_RESCALE = 12; +static const unsigned int FIVE_MINUTE_RESCALE = 1; +static const unsigned int accounting_delay = (HZ * 300) / 22; +static const unsigned int accounting_weight = 32; /* sysfs reading/writing */ @@ -152,7 +152,7 @@ static void scale_accounting(struct timer_list *t) struct cache_accounting *acc = from_timer(acc, t, timer); #define move_stat(name) do { \ - unsigned t = atomic_xchg(&acc->collector.name, 0); \ + unsigned int t = atomic_xchg(&acc->collector.name, 0); \ t <<= 16; \ acc->five_minute.name += t; \ acc->hour.name += t; \ @@ -200,6 +200,7 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, bool hit, bool bypass) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); + mark_cache_stats(&dc->accounting.collector, hit, bypass); mark_cache_stats(&c->accounting.collector, hit, bypass); } @@ -207,6 +208,7 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); + atomic_inc(&dc->accounting.collector.cache_readaheads); atomic_inc(&c->accounting.collector.cache_readaheads); } @@ -214,6 +216,7 @@ void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); + atomic_inc(&dc->accounting.collector.cache_miss_collisions); atomic_inc(&c->accounting.collector.cache_miss_collisions); } diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index 0b70f9de0c03..abfaabf7e7fc 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -23,7 +23,7 @@ struct cache_stats { unsigned long cache_miss_collisions; unsigned long sectors_bypassed; - unsigned rescale; + unsigned int rescale; }; struct cache_accounting { @@ -53,10 +53,13 @@ void bch_cache_accounting_clear(struct cache_accounting *acc); void bch_cache_accounting_destroy(struct cache_accounting *acc); -void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *, - bool, bool); -void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *); -void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *); -void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int); +void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, + bool hit, bool bypass); +void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d); +void bch_mark_cache_miss_collision(struct cache_set *c, + struct bcache_device *d); +void bch_mark_sectors_bypassed(struct cache_set *c, + struct cached_dev *dc, + int sectors); #endif /* _BCACHE_STATS_H_ */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fa4058e43202..94c756c66bd7 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * bcache setup/teardown code, and some metadata io - read a superblock and * figure out what to do with it. @@ -61,7 +62,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, const char *err; struct cache_sb *s; struct buffer_head *bh = __bread(bdev, 1, SB_SIZE); - unsigned i; + unsigned int i; if (!bh) return "IO error"; @@ -149,7 +150,8 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, goto err; err = "Invalid superblock: device too small"; - if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets) + if (get_capacity(bdev->bd_disk) < + sb->bucket_size * sb->nbuckets) goto err; err = "Bad UUID"; @@ -181,7 +183,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, goto err; } - sb->last_mount = get_seconds(); + sb->last_mount = (u32)ktime_get_real_seconds(); err = NULL; get_page(bh->b_page); @@ -202,7 +204,7 @@ static void write_bdev_super_endio(struct bio *bio) static void __write_super(struct cache_sb *sb, struct bio *bio) { struct cache_sb *out = page_address(bio_first_page_all(bio)); - unsigned i; + unsigned int i; bio->bi_iter.bi_sector = SB_SECTOR; bio->bi_iter.bi_size = SB_SIZE; @@ -282,7 +284,7 @@ void bcache_write_super(struct cache_set *c) { struct closure *cl = &c->sb_write; struct cache *ca; - unsigned i; + unsigned int i; down(&c->sb_write_mutex); closure_init(cl, &c->cl); @@ -334,7 +336,7 @@ static void uuid_io(struct cache_set *c, int op, unsigned long op_flags, { struct closure *cl = &c->uuid_write; struct uuid_entry *u; - unsigned i; + unsigned int i; char buf[80]; BUG_ON(!parent); @@ -415,8 +417,8 @@ static int __uuid_write(struct cache_set *c) { BKEY_PADDED(key) k; struct closure cl; - closure_init_stack(&cl); + closure_init_stack(&cl); lockdep_assert_held(&bch_register_lock); if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) @@ -456,6 +458,7 @@ static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid) static struct uuid_entry *uuid_find_empty(struct cache_set *c) { static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + return uuid_find(c, zero_uuid); } @@ -463,8 +466,8 @@ static struct uuid_entry *uuid_find_empty(struct cache_set *c) * Bucket priorities/gens: * * For each bucket, we store on disk its - * 8 bit gen - * 16 bit priority + * 8 bit gen + * 16 bit priority * * See alloc.c for an explanation of the gen. The priority is used to implement * lru (and in the future other) cache replacement policies; for most purposes @@ -587,7 +590,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) struct prio_set *p = ca->disk_buckets; struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; struct bucket *b; - unsigned bucket_nr = 0; + unsigned int bucket_nr = 0; for (b = ca->buckets; b < ca->buckets + ca->sb.nbuckets; @@ -599,7 +602,8 @@ static void prio_read(struct cache *ca, uint64_t bucket) prio_io(ca, bucket, REQ_OP_READ, 0); - if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) + if (p->csum != + bch_crc64(&p->magic, bucket_bytes(ca) - 8)) pr_warn("bad csum reading priorities"); if (p->magic != pset_magic(&ca->sb)) @@ -619,6 +623,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) static int open_dev(struct block_device *b, fmode_t mode) { struct bcache_device *d = b->bd_disk->private_data; + if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) return -ENXIO; @@ -629,6 +634,7 @@ static int open_dev(struct block_device *b, fmode_t mode) static void release_dev(struct gendisk *b, fmode_t mode) { struct bcache_device *d = b->private_data; + closure_put(&d->cl); } @@ -662,7 +668,7 @@ static void bcache_device_unlink(struct bcache_device *d) lockdep_assert_held(&bch_register_lock); if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { - unsigned i; + unsigned int i; struct cache *ca; sysfs_remove_link(&d->c->kobj, d->name); @@ -676,7 +682,7 @@ static void bcache_device_unlink(struct bcache_device *d) static void bcache_device_link(struct bcache_device *d, struct cache_set *c, const char *name) { - unsigned i; + unsigned int i; struct cache *ca; for_each_cache(ca, d->c, i) @@ -696,12 +702,14 @@ static void bcache_device_detach(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); + atomic_dec(&d->c->attached_dev_nr); + if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { struct uuid_entry *u = d->c->uuids + d->id; SET_UUID_FLASH_ONLY(u, 0); memcpy(u->uuid, invalid_uuid, 16); - u->invalidated = cpu_to_le32(get_seconds()); + u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds()); bch_uuid_write(d->c); } @@ -713,7 +721,7 @@ static void bcache_device_detach(struct bcache_device *d) } static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, - unsigned id) + unsigned int id) { d->id = id; d->c = c; @@ -760,7 +768,7 @@ static void bcache_device_free(struct bcache_device *d) closure_debug_destroy(&d->cl); } -static int bcache_device_init(struct bcache_device *d, unsigned block_size, +static int bcache_device_init(struct bcache_device *d, unsigned int block_size, sector_t sectors) { struct request_queue *q; @@ -776,7 +784,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->nr_stripes || d->nr_stripes > max_stripes) { pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)", - (unsigned)d->nr_stripes); + (unsigned int)d->nr_stripes); return -ENOMEM; } @@ -796,11 +804,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, return idx; if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), - BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) || - !(d->disk = alloc_disk(BCACHE_MINORS))) { - ida_simple_remove(&bcache_device_idx, idx); - return -ENOMEM; - } + BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) + goto err; + + d->disk = alloc_disk(BCACHE_MINORS); + if (!d->disk) + goto err; set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); @@ -834,6 +843,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, blk_queue_write_cache(q, true, true); return 0; + +err: + ida_simple_remove(&bcache_device_idx, idx); + return -ENOMEM; + } /* Cached device */ @@ -911,6 +925,7 @@ void bch_cached_dev_run(struct cached_dev *dc) if (!d->c && BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { struct closure cl; + closure_init_stack(&cl); SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE); @@ -920,8 +935,10 @@ void bch_cached_dev_run(struct cached_dev *dc) add_disk(d->disk); bd_link_disk_holder(dc->bdev, dc->disk.disk); - /* won't show up in the uevent file, use udevadm monitor -e instead - * only class / kset properties are persistent */ + /* + * won't show up in the uevent file, use udevadm monitor -e instead + * only class / kset properties are persistent + */ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); kfree(env[1]); kfree(env[2]); @@ -968,6 +985,7 @@ static void cached_dev_detach_finish(struct work_struct *w) { struct cached_dev *dc = container_of(w, struct cached_dev, detach); struct closure cl; + closure_init_stack(&cl); BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); @@ -1027,7 +1045,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint8_t *set_uuid) { - uint32_t rtime = cpu_to_le32(get_seconds()); + uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); struct uuid_entry *u; struct cached_dev *exist_dc, *t; @@ -1070,7 +1088,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE || BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) { memcpy(u->uuid, invalid_uuid, 16); - u->invalidated = cpu_to_le32(get_seconds()); + u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds()); u = NULL; } @@ -1089,12 +1107,14 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, } } - /* Deadlocks since we're called via sysfs... - sysfs_remove_file(&dc->kobj, &sysfs_attach); + /* + * Deadlocks since we're called via sysfs... + * sysfs_remove_file(&dc->kobj, &sysfs_attach); */ if (bch_is_zero(u->uuid, 16)) { struct closure cl; + closure_init_stack(&cl); memcpy(u->uuid, dc->sb.uuid, 16); @@ -1116,11 +1136,11 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, list_move(&dc->list, &c->cached_devs); calc_cached_dev_sectors(c); - smp_wmb(); /* * dc->c must be set before dc->count != 0 - paired with the mb in * cached_dev_get() */ + smp_wmb(); refcount_set(&dc->count, 1); /* Block writeback thread, but spawn it */ @@ -1138,6 +1158,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, bch_cached_dev_run(dc); bcache_device_link(&dc->disk, c, "bdev"); + atomic_inc(&c->attached_dev_nr); /* Allow the writeback thread to proceed */ up_write(&dc->writeback_lock); @@ -1203,7 +1224,7 @@ static void cached_dev_flush(struct closure *cl) continue_at(cl, cached_dev_free, system_wq); } -static int cached_dev_init(struct cached_dev *dc, unsigned block_size) +static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) { int ret; struct io *io; @@ -1285,6 +1306,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, pr_info("registered backing device %s", dc->backing_dev_name); list_add(&dc->list, &uncached_devices); + /* attach to a matched cache set if it exists */ list_for_each_entry(c, &bch_cache_sets, list) bch_cached_dev_attach(dc, c, NULL); @@ -1310,7 +1332,10 @@ void bch_flash_dev_release(struct kobject *kobj) static void flash_dev_free(struct closure *cl) { struct bcache_device *d = container_of(cl, struct bcache_device, cl); + mutex_lock(&bch_register_lock); + atomic_long_sub(bcache_dev_sectors_dirty(d), + &d->c->flash_dev_dirty_sectors); bcache_device_free(d); mutex_unlock(&bch_register_lock); kobject_put(&d->kobj); @@ -1390,7 +1415,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) get_random_bytes(u->uuid, 16); memset(u->label, 0, 32); - u->first_reg = u->last_reg = cpu_to_le32(get_seconds()); + u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds()); SET_UUID_FLASH_ONLY(u, 1); u->sectors = size >> 9; @@ -1447,17 +1472,18 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) pr_info("CACHE_SET_IO_DISABLE already set"); - /* XXX: we can be called from atomic context - acquire_console_sem(); - */ + /* + * XXX: we can be called from atomic context + * acquire_console_sem(); + */ - printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid); + pr_err("bcache: error on %pU: ", c->sb.set_uuid); va_start(args, fmt); vprintk(fmt, args); va_end(args); - printk(", disabling caching\n"); + pr_err(", disabling caching\n"); if (c->on_error == ON_ERROR_PANIC) panic("panic forced after error\n"); @@ -1469,6 +1495,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) void bch_cache_set_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); + kfree(c); module_put(THIS_MODULE); } @@ -1477,7 +1504,7 @@ static void cache_set_free(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, cl); struct cache *ca; - unsigned i; + unsigned int i; if (!IS_ERR_OR_NULL(c->debug)) debugfs_remove(c->debug); @@ -1520,7 +1547,7 @@ static void cache_set_flush(struct closure *cl) struct cache_set *c = container_of(cl, struct cache_set, caching); struct cache *ca; struct btree *b; - unsigned i; + unsigned int i; bch_cache_accounting_destroy(&c->accounting); @@ -1659,6 +1686,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) { int iter_size; struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL); + if (!c) return NULL; @@ -1687,6 +1715,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); c->devices_max_used = 0; + atomic_set(&c->attached_dev_nr, 0); c->btree_pages = bucket_pages(c); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, @@ -1718,8 +1747,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) || mempool_init_slab_pool(&c->search, 32, bch_search_cache) || mempool_init_kmalloc_pool(&c->bio_meta, 2, - sizeof(struct bbio) + sizeof(struct bio_vec) * - bucket_pages(c)) || + sizeof(struct bbio) + sizeof(struct bio_vec) * + bucket_pages(c)) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) || @@ -1749,7 +1778,7 @@ static void run_cache_set(struct cache_set *c) struct cached_dev *dc, *t; struct cache *ca; struct closure cl; - unsigned i; + unsigned int i; closure_init_stack(&cl); @@ -1791,7 +1820,9 @@ static void run_cache_set(struct cache_set *c) goto err; err = "error reading btree root"; - c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL); + c->root = bch_btree_node_get(c, NULL, k, + j->btree_level, + true, NULL); if (IS_ERR_OR_NULL(c->root)) goto err; @@ -1840,7 +1871,7 @@ static void run_cache_set(struct cache_set *c) pr_notice("invalidating existing data"); for_each_cache(ca, c, i) { - unsigned j; + unsigned int j; ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, 2, SB_JOURNAL_BUCKETS); @@ -1894,7 +1925,7 @@ static void run_cache_set(struct cache_set *c) goto err; closure_sync(&cl); - c->sb.last_mount = get_seconds(); + c->sb.last_mount = (u32)ktime_get_real_seconds(); bcache_write_super(c); list_for_each_entry_safe(dc, t, &uncached_devices, list) @@ -1985,7 +2016,7 @@ err: void bch_cache_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); - unsigned i; + unsigned int i; if (ca->set) { BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca); @@ -2085,7 +2116,9 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, goto err; } - if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) { + if (kobject_add(&ca->kobj, + &part_to_dev(bdev->bd_part)->kobj, + "bcache")) { err = "error calling kobject_add"; ret = -ENOMEM; goto out; @@ -2114,13 +2147,14 @@ err: /* Global interfaces/init */ -static ssize_t register_bcache(struct kobject *, struct kobj_attribute *, - const char *, size_t); +static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size); kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); -static bool bch_is_open_backing(struct block_device *bdev) { +static bool bch_is_open_backing(struct block_device *bdev) +{ struct cache_set *c, *tc; struct cached_dev *dc, *t; @@ -2134,10 +2168,11 @@ static bool bch_is_open_backing(struct block_device *bdev) { return false; } -static bool bch_is_open_cache(struct block_device *bdev) { +static bool bch_is_open_cache(struct block_device *bdev) +{ struct cache_set *c, *tc; struct cache *ca; - unsigned i; + unsigned int i; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) for_each_cache(ca, c, i) @@ -2146,7 +2181,8 @@ static bool bch_is_open_cache(struct block_device *bdev) { return false; } -static bool bch_is_open(struct block_device *bdev) { +static bool bch_is_open(struct block_device *bdev) +{ return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); } @@ -2163,8 +2199,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!try_module_get(THIS_MODULE)) return -EBUSY; - if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || - !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) + path = kstrndup(buffer, size, GFP_KERNEL); + if (!path) + goto err; + + sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL); + if (!sb) goto err; err = "failed to open device"; @@ -2199,6 +2239,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, err = "failed to register device"; if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); + if (!dc) goto err_close; @@ -2207,6 +2248,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, mutex_unlock(&bch_register_lock); } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) goto err_close; @@ -2324,13 +2366,21 @@ static int __init bcache_init(void) return bcache_major; } - if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || - !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || - bch_request_init() || - bch_debug_init(bcache_kobj) || closure_debug_init() || + bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); + if (!bcache_wq) + goto err; + + bcache_kobj = kobject_create_and_add("bcache", fs_kobj); + if (!bcache_kobj) + goto err; + + if (bch_request_init() || sysfs_create_files(bcache_kobj, files)) goto err; + bch_debug_init(bcache_kobj); + closure_debug_init(); + return 0; err: bcache_exit(); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 225b15aa0340..150cf4f4cf74 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -130,8 +130,10 @@ rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); rw_attribute(size); -static ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], - size_t selected) +static ssize_t bch_snprint_string_list(char *buf, + size_t size, + const char * const list[], + size_t selected) { char *out = buf; size_t i; @@ -148,7 +150,8 @@ SHOW(__bch_cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); - const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; + char const *states[] = { "no cache", "clean", "dirty", "inconsistent" }; + int wb = dc->writeback_running; #define var(stat) (dc->stat) @@ -170,7 +173,8 @@ SHOW(__bch_cached_dev) var_printf(writeback_running, "%i"); var_print(writeback_delay); var_print(writeback_percent); - sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); + sysfs_hprint(writeback_rate, + wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); sysfs_printf(io_error_limit, "%i", dc->error_limit); sysfs_printf(io_disable, "%i", dc->io_disable); @@ -188,15 +192,22 @@ SHOW(__bch_cached_dev) char change[20]; s64 next_io; - bch_hprint(rate, dc->writeback_rate.rate << 9); - bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); - bch_hprint(target, dc->writeback_rate_target << 9); - bch_hprint(proportional,dc->writeback_rate_proportional << 9); - bch_hprint(integral, dc->writeback_rate_integral_scaled << 9); - bch_hprint(change, dc->writeback_rate_change << 9); - - next_io = div64_s64(dc->writeback_rate.next - local_clock(), - NSEC_PER_MSEC); + /* + * Except for dirty and target, other values should + * be 0 if writeback is not running. + */ + bch_hprint(rate, + wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 + : 0); + bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); + bch_hprint(target, dc->writeback_rate_target << 9); + bch_hprint(proportional, + wb ? dc->writeback_rate_proportional << 9 : 0); + bch_hprint(integral, + wb ? dc->writeback_rate_integral_scaled << 9 : 0); + bch_hprint(change, wb ? dc->writeback_rate_change << 9 : 0); + next_io = wb ? div64_s64(dc->writeback_rate.next-local_clock(), + NSEC_PER_MSEC) : 0; return sprintf(buf, "rate:\t\t%s/sec\n" @@ -255,8 +266,19 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); - sysfs_strtoul_clamp(writeback_rate, - dc->writeback_rate.rate, 1, INT_MAX); + if (attr == &sysfs_writeback_rate) { + ssize_t ret; + long int v = atomic_long_read(&dc->writeback_rate.rate); + + ret = strtoul_safe_clamp(buf, v, 1, INT_MAX); + + if (!ret) { + atomic_long_set(&dc->writeback_rate.rate, v); + ret = size; + } + + return ret; + } sysfs_strtoul_clamp(writeback_rate_update_seconds, dc->writeback_rate_update_seconds, @@ -287,7 +309,7 @@ STORE(__cached_dev) if (v < 0) return v; - if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) { + if ((unsigned int) v != BDEV_CACHE_MODE(&dc->sb)) { SET_BDEV_CACHE_MODE(&dc->sb, v); bch_write_bdev_super(dc, NULL); } @@ -321,8 +343,9 @@ STORE(__cached_dev) add_uevent_var(env, "DRIVER=bcache"); add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid), add_uevent_var(env, "CACHED_LABEL=%s", buf); - kobject_uevent_env( - &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp); + kobject_uevent_env(&disk_to_dev(dc->disk.disk)->kobj, + KOBJ_CHANGE, + env->envp); kfree(env); } @@ -338,8 +361,8 @@ STORE(__cached_dev) if (!v) return size; } - - pr_err("Can't attach %s: cache set not found", buf); + if (v == -ENOENT) + pr_err("Can't attach %s: cache set not found", buf); return v; } @@ -439,6 +462,7 @@ STORE(__bch_flash_dev) if (attr == &sysfs_size) { uint64_t v; + strtoi_h_or_return(buf, v); u->sectors = v >> 9; @@ -513,9 +537,9 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf) op.stats.floats, op.stats.failed); } -static unsigned bch_root_usage(struct cache_set *c) +static unsigned int bch_root_usage(struct cache_set *c) { - unsigned bytes = 0; + unsigned int bytes = 0; struct bkey *k; struct btree *b; struct btree_iter iter; @@ -550,9 +574,9 @@ static size_t bch_cache_size(struct cache_set *c) return ret; } -static unsigned bch_cache_max_chain(struct cache_set *c) +static unsigned int bch_cache_max_chain(struct cache_set *c) { - unsigned ret = 0; + unsigned int ret = 0; struct hlist_head *h; mutex_lock(&c->bucket_lock); @@ -560,7 +584,7 @@ static unsigned bch_cache_max_chain(struct cache_set *c) for (h = c->bucket_hash; h < c->bucket_hash + (1 << BUCKET_HASH_BITS); h++) { - unsigned i = 0; + unsigned int i = 0; struct hlist_node *p; hlist_for_each(p, h) @@ -573,13 +597,13 @@ static unsigned bch_cache_max_chain(struct cache_set *c) return ret; } -static unsigned bch_btree_used(struct cache_set *c) +static unsigned int bch_btree_used(struct cache_set *c) { return div64_u64(c->gc_stats.key_bytes * 100, (c->gc_stats.nodes ?: 1) * btree_bytes(c)); } -static unsigned bch_average_key_size(struct cache_set *c) +static unsigned int bch_average_key_size(struct cache_set *c) { return c->gc_stats.nkeys ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) @@ -683,6 +707,7 @@ STORE(__bch_cache_set) if (attr == &sysfs_flash_vol_create) { int r; uint64_t v; + strtoi_h_or_return(buf, v); r = bch_flash_dev_create(c, v); @@ -716,6 +741,7 @@ STORE(__bch_cache_set) if (attr == &sysfs_prune_cache) { struct shrink_control sc; + sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); c->shrink.scan_objects(&c->shrink, &sc); @@ -769,12 +795,14 @@ STORE_LOCKED(bch_cache_set) SHOW(bch_cache_set_internal) { struct cache_set *c = container_of(kobj, struct cache_set, internal); + return bch_cache_set_show(&c->kobj, attr, buf); } STORE(bch_cache_set_internal) { struct cache_set *c = container_of(kobj, struct cache_set, internal); + return bch_cache_set_store(&c->kobj, attr, buf, size); } @@ -976,7 +1004,7 @@ STORE(__bch_cache) if (v < 0) return v; - if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) { + if ((unsigned int) v != CACHE_REPLACEMENT(&ca->sb)) { mutex_lock(&ca->set->bucket_lock); SET_CACHE_REPLACEMENT(&ca->sb, v); mutex_unlock(&ca->set->bucket_lock); diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h index b54fe9602529..3fe82425859c 100644 --- a/drivers/md/bcache/sysfs.h +++ b/drivers/md/bcache/sysfs.h @@ -44,9 +44,9 @@ STORE(fn) \ static struct attribute sysfs_##_name = \ { .name = #_name, .mode = _mode } -#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) -#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) -#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) +#define write_attribute(n) __sysfs_attribute(n, 0200) +#define read_attribute(n) __sysfs_attribute(n, 0444) +#define rw_attribute(n) __sysfs_attribute(n, 0644) #define sysfs_printf(file, fmt, ...) \ do { \ diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index fc479b026d6d..20eddeac1531 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * random utiility code, for bcache but in theory not specific to bcache * @@ -133,6 +134,7 @@ bool bch_is_zero(const char *p, size_t n) int bch_parse_uuid(const char *s, char *uuid) { size_t i, j, x; + memset(uuid, 0, 16); for (i = 0, j = 0; @@ -200,7 +202,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) { uint64_t now = local_clock(); - d->next += div_u64(done * NSEC_PER_SEC, d->rate); + d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate)); /* Bound the time. Don't let us fall further than 2 seconds behind * (this prevents unnecessary backlog that would make it impossible @@ -279,134 +281,3 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) return 0; } - -/* - * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any - * use permitted, subject to terms of PostgreSQL license; see.) - - * If we have a 64-bit integer type, then a 64-bit CRC looks just like the - * usual sort of implementation. (See Ross Williams' excellent introduction - * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from - * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) - * If we have no working 64-bit type, then fake it with two 32-bit registers. - * - * The present implementation is a normal (not "reflected", in Williams' - * terms) 64-bit CRC, using initial all-ones register contents and a final - * bit inversion. The chosen polynomial is borrowed from the DLT1 spec - * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): - * - * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + - * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + - * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + - * x^7 + x^4 + x + 1 -*/ - -static const uint64_t crc_table[256] = { - 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, - 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, - 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, - 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, - 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, - 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, - 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, - 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, - 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, - 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, - 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, - 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, - 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, - 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, - 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, - 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, - 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, - 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, - 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, - 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, - 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, - 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, - 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, - 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, - 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, - 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, - 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, - 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, - 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, - 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, - 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, - 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, - 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, - 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, - 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, - 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, - 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, - 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, - 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, - 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, - 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, - 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, - 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, - 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, - 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, - 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, - 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, - 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, - 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, - 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, - 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, - 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, - 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, - 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, - 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, - 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, - 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, - 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, - 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, - 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, - 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, - 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, - 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, - 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, - 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, - 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, - 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, - 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, - 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, - 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, - 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, - 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, - 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, - 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, - 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, - 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, - 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, - 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, - 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, - 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, - 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, - 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, - 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, - 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, - 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, - 0x9AFCE626CE85B507ULL, -}; - -uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len) -{ - const unsigned char *data = _data; - - while (len--) { - int i = ((int) (crc >> 56) ^ *data++) & 0xFF; - crc = crc_table[i] ^ (crc << 8); - } - - return crc; -} - -uint64_t bch_crc64(const void *data, size_t len) -{ - uint64_t crc = 0xffffffffffffffffULL; - - crc = bch_crc64_update(crc, data, len); - - return crc ^ 0xffffffffffffffffULL; -} diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cced87f8eb27..00aab6abcfe4 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -11,6 +11,7 @@ #include <linux/ratelimit.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> +#include <linux/crc64.h> #include "closure.h" @@ -288,10 +289,10 @@ do { \ #define ANYSINT_MAX(t) \ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) -int bch_strtoint_h(const char *, int *); -int bch_strtouint_h(const char *, unsigned int *); -int bch_strtoll_h(const char *, long long *); -int bch_strtoull_h(const char *, unsigned long long *); +int bch_strtoint_h(const char *cp, int *res); +int bch_strtouint_h(const char *cp, unsigned int *res); +int bch_strtoll_h(const char *cp, long long *res); +int bch_strtoull_h(const char *cp, unsigned long long *res); static inline int bch_strtol_h(const char *cp, long *res) { @@ -347,7 +348,7 @@ static inline int bch_strtoul_h(const char *cp, long *res) snprintf(buf, size, \ __builtin_types_compatible_p(typeof(var), int) \ ? "%i\n" : \ - __builtin_types_compatible_p(typeof(var), unsigned) \ + __builtin_types_compatible_p(typeof(var), unsigned int) \ ? "%u\n" : \ __builtin_types_compatible_p(typeof(var), long) \ ? "%li\n" : \ @@ -379,7 +380,7 @@ struct time_stats { void bch_time_stats_update(struct time_stats *stats, uint64_t time); -static inline unsigned local_clock_us(void) +static inline unsigned int local_clock_us(void) { return local_clock() >> 10; } @@ -402,7 +403,8 @@ do { \ __print_time_stat(stats, name, \ average_duration, duration_units); \ sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \ - div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\ + div_u64((stats)->max_duration, \ + NSEC_PER_ ## duration_units)); \ \ sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ ? div_s64(local_clock() - (stats)->last, \ @@ -442,7 +444,7 @@ struct bch_ratelimit { * Rate at which we want to do work, in units per second * The units here correspond to the units passed to bch_next_delay() */ - uint32_t rate; + atomic_long_t rate; }; static inline void bch_ratelimit_reset(struct bch_ratelimit *d) @@ -542,10 +544,27 @@ dup: \ #define RB_PREV(ptr, member) \ container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) +static inline uint64_t bch_crc64(const void *p, size_t len) +{ + uint64_t crc = 0xffffffffffffffffULL; + + crc = crc64_be(crc, p, len); + return crc ^ 0xffffffffffffffffULL; +} + +static inline uint64_t bch_crc64_update(uint64_t crc, + const void *p, + size_t len) +{ + crc = crc64_be(crc, p, len); + return crc; +} + /* Does linear interpolation between powers of two */ -static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) +static inline unsigned int fract_exp_two(unsigned int x, + unsigned int fract_bits) { - unsigned fract = x & ~(~0 << fract_bits); + unsigned int fract = x & ~(~0 << fract_bits); x >>= fract_bits; x = 1 << x; @@ -561,8 +580,4 @@ static inline sector_t bdev_sectors(struct block_device *bdev) { return bdev->bd_inode->i_size >> 9; } - -uint64_t bch_crc64_update(uint64_t, const void *, size_t); -uint64_t bch_crc64(const void *, size_t); - #endif /* _BCACHE_UTIL_H */ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index ad45ebe1a74b..08c3a9f9676c 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -27,7 +27,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc) * flash-only devices */ uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - - bcache_flash_devs_sectors_dirty(c); + atomic_long_read(&c->flash_dev_dirty_sectors); /* * Unfortunately there is no control of global dirty data. If the @@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc) dc->writeback_rate_proportional = proportional_scaled; dc->writeback_rate_integral_scaled = integral_scaled; - dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; - dc->writeback_rate.rate = new_rate; + dc->writeback_rate_change = new_rate - + atomic_long_read(&dc->writeback_rate.rate); + atomic_long_set(&dc->writeback_rate.rate, new_rate); dc->writeback_rate_target = target; } +static bool set_at_max_writeback_rate(struct cache_set *c, + struct cached_dev *dc) +{ + /* + * Idle_counter is increased everytime when update_writeback_rate() is + * called. If all backing devices attached to the same cache set have + * identical dc->writeback_rate_update_seconds values, it is about 6 + * rounds of update_writeback_rate() on each backing device before + * c->at_max_writeback_rate is set to 1, and then max wrteback rate set + * to each dc->writeback_rate.rate. + * In order to avoid extra locking cost for counting exact dirty cached + * devices number, c->attached_dev_nr is used to calculate the idle + * throushold. It might be bigger if not all cached device are in write- + * back mode, but it still works well with limited extra rounds of + * update_writeback_rate(). + */ + if (atomic_inc_return(&c->idle_counter) < + atomic_read(&c->attached_dev_nr) * 6) + return false; + + if (atomic_read(&c->at_max_writeback_rate) != 1) + atomic_set(&c->at_max_writeback_rate, 1); + + atomic_long_set(&dc->writeback_rate.rate, INT_MAX); + + /* keep writeback_rate_target as existing value */ + dc->writeback_rate_proportional = 0; + dc->writeback_rate_integral_scaled = 0; + dc->writeback_rate_change = 0; + + /* + * Check c->idle_counter and c->at_max_writeback_rate agagain in case + * new I/O arrives during before set_at_max_writeback_rate() returns. + * Then the writeback rate is set to 1, and its new value should be + * decided via __update_writeback_rate(). + */ + if ((atomic_read(&c->idle_counter) < + atomic_read(&c->attached_dev_nr) * 6) || + !atomic_read(&c->at_max_writeback_rate)) + return false; + + return true; +} + static void update_writeback_rate(struct work_struct *work) { struct cached_dev *dc = container_of(to_delayed_work(work), @@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work) return; } - down_read(&dc->writeback_lock); - - if (atomic_read(&dc->has_dirty) && - dc->writeback_percent) - __update_writeback_rate(dc); + if (atomic_read(&dc->has_dirty) && dc->writeback_percent) { + /* + * If the whole cache set is idle, set_at_max_writeback_rate() + * will set writeback rate to a max number. Then it is + * unncessary to update writeback rate for an idle cache set + * in maximum writeback rate number(s). + */ + if (!set_at_max_writeback_rate(c, dc)) { + down_read(&dc->writeback_lock); + __update_writeback_rate(dc); + up_read(&dc->writeback_lock); + } + } - up_read(&dc->writeback_lock); /* * CACHE_SET_IO_DISABLE might be set via sysfs interface, @@ -163,7 +215,8 @@ static void update_writeback_rate(struct work_struct *work) smp_mb(); } -static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) +static unsigned int writeback_delay(struct cached_dev *dc, + unsigned int sectors) { if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || !dc->writeback_percent) @@ -197,6 +250,7 @@ static void dirty_init(struct keybuf_key *w) static void dirty_io_destructor(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); + kfree(io); } @@ -211,7 +265,7 @@ static void write_dirty_finish(struct closure *cl) /* This is kind of a dumb way of signalling errors. */ if (KEY_DIRTY(&w->key)) { int ret; - unsigned i; + unsigned int i; struct keylist keys; bch_keylist_init(&keys); @@ -325,7 +379,7 @@ static void read_dirty_submit(struct closure *cl) static void read_dirty(struct cached_dev *dc) { - unsigned delay = 0; + unsigned int delay = 0; struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w; size_t size; int nk, i; @@ -390,7 +444,8 @@ static void read_dirty(struct cached_dev *dc) io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) * - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + DIV_ROUND_UP(KEY_SIZE(&w->key), + PAGE_SECTORS), GFP_KERNEL); if (!io) goto err; @@ -413,7 +468,8 @@ static void read_dirty(struct cached_dev *dc) down(&dc->in_flight); - /* We've acquired a semaphore for the maximum + /* + * We've acquired a semaphore for the maximum * simultaneous number of writebacks; from here * everything happens asynchronously. */ @@ -422,27 +478,6 @@ static void read_dirty(struct cached_dev *dc) delay = writeback_delay(dc, size); - /* If the control system would wait for at least half a - * second, and there's been no reqs hitting the backing disk - * for awhile: use an alternate mode where we have at most - * one contiguous set of writebacks in flight at a time. If - * someone wants to do IO it will be quick, as it will only - * have to contend with one operation in flight, and we'll - * be round-tripping data to the backing disk as quickly as - * it can accept it. - */ - if (delay >= HZ / 2) { - /* 3 means at least 1.5 seconds, up to 7.5 if we - * have slowed way down. - */ - if (atomic_inc_return(&dc->backing_idle) >= 3) { - /* Wait for current I/Os to finish */ - closure_sync(&cl); - /* And immediately launch a new set. */ - delay = 0; - } - } - while (!kthread_should_stop() && !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && delay) { @@ -467,20 +502,23 @@ err: /* Scan for dirty data */ -void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, uint64_t offset, int nr_sectors) { struct bcache_device *d = c->devices[inode]; - unsigned stripe_offset, stripe, sectors_dirty; + unsigned int stripe_offset, stripe, sectors_dirty; if (!d) return; + if (UUID_FLASH_ONLY(&c->uuids[inode])) + atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors); + stripe = offset_to_stripe(d, offset); stripe_offset = offset & (d->stripe_size - 1); while (nr_sectors) { - int s = min_t(unsigned, abs(nr_sectors), + int s = min_t(unsigned int, abs(nr_sectors), d->stripe_size - stripe_offset); if (nr_sectors < 0) @@ -504,7 +542,9 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, static bool dirty_pred(struct keybuf *buf, struct bkey *k) { - struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys); + struct cached_dev *dc = container_of(buf, + struct cached_dev, + writeback_keys); BUG_ON(KEY_INODE(k) != dc->disk.id); @@ -514,7 +554,7 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) static void refill_full_stripes(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; - unsigned start_stripe, stripe, next_stripe; + unsigned int start_stripe, stripe, next_stripe; bool wrapped = false; stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); @@ -645,8 +685,10 @@ static int bch_writeback_thread(void *arg) * data on cache. BCACHE_DEV_DETACHING flag is set in * bch_cached_dev_detach(). */ - if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) { + up_write(&dc->writeback_lock); break; + } } up_write(&dc->writeback_lock); @@ -654,7 +696,7 @@ static int bch_writeback_thread(void *arg) read_dirty(dc); if (searched_full_index) { - unsigned delay = dc->writeback_delay * HZ; + unsigned int delay = dc->writeback_delay * HZ; while (delay && !kthread_should_stop() && @@ -673,10 +715,14 @@ static int bch_writeback_thread(void *arg) } /* Init */ +#define INIT_KEYS_EACH_TIME 500000 +#define INIT_KEYS_SLEEP_MS 100 struct sectors_dirty_init { struct btree_op op; - unsigned inode; + unsigned int inode; + size_t count; + struct bkey start; }; static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, @@ -691,18 +737,37 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); + op->count++; + if (atomic_read(&b->c->search_inflight) && + !(op->count % INIT_KEYS_EACH_TIME)) { + bkey_copy_key(&op->start, k); + return -EAGAIN; + } + return MAP_CONTINUE; } void bch_sectors_dirty_init(struct bcache_device *d) { struct sectors_dirty_init op; + int ret; bch_btree_op_init(&op.op, -1); op.inode = d->id; - - bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), - sectors_dirty_init_fn, 0); + op.count = 0; + op.start = KEY(op.inode, 0, 0); + + do { + ret = bch_btree_map_keys(&op.op, d->c, &op.start, + sectors_dirty_init_fn, 0); + if (ret == -EAGAIN) + schedule_timeout_interruptible( + msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); + else if (ret < 0) { + pr_warn("sectors dirty init failed, ret=%d!", ret); + break; + } + } while (ret == -EAGAIN); } void bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -715,7 +780,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_running = true; dc->writeback_percent = 10; dc->writeback_delay = 30; - dc->writeback_rate.rate = 1024; + atomic_long_set(&dc->writeback_rate.rate, 1024); dc->writeback_rate_minimum = 8; dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 610fb01de629..d2b9fdbc8994 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -28,26 +28,7 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) -{ - uint64_t i, ret = 0; - - mutex_lock(&bch_register_lock); - - for (i = 0; i < c->devices_max_used; i++) { - struct bcache_device *d = c->devices[i]; - - if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) - continue; - ret += bcache_dev_sectors_dirty(d); - } - - mutex_unlock(&bch_register_lock); - - return ret; -} - -static inline unsigned offset_to_stripe(struct bcache_device *d, +static inline unsigned int offset_to_stripe(struct bcache_device *d, uint64_t offset) { do_div(offset, d->stripe_size); @@ -56,9 +37,9 @@ static inline unsigned offset_to_stripe(struct bcache_device *d, static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, uint64_t offset, - unsigned nr_sectors) + unsigned int nr_sectors) { - unsigned stripe = offset_to_stripe(&dc->disk, offset); + unsigned int stripe = offset_to_stripe(&dc->disk, offset); while (1) { if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) @@ -73,9 +54,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, } static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, - unsigned cache_mode, bool would_skip) + unsigned int cache_mode, bool would_skip) { - unsigned in_use = dc->disk.c->gc_stats.in_use; + unsigned int in_use = dc->disk.c->gc_stats.in_use; if (cache_mode != CACHE_MODE_WRITEBACK || test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || @@ -115,10 +96,11 @@ static inline void bch_writeback_add(struct cached_dev *dc) } } -void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, + uint64_t offset, int nr_sectors); -void bch_sectors_dirty_init(struct bcache_device *); -void bch_cached_dev_writeback_init(struct cached_dev *); -int bch_cached_dev_writeback_start(struct cached_dev *); +void bch_sectors_dirty_init(struct bcache_device *d); +void bch_cached_dev_writeback_init(struct cached_dev *dc); +int bch_cached_dev_writeback_start(struct cached_dev *dc); #endif diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 0d7212410e21..69dddeab124c 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -363,7 +363,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->version = cpu_to_le32(cmd->version); memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); - disk_super->policy_hint_size = 0; + disk_super->policy_hint_size = cpu_to_le32(0); __copy_sm_root(cmd, disk_super); @@ -701,6 +701,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]); disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]); + disk_super->policy_hint_size = cpu_to_le32(cmd->policy_hint_size); disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); @@ -1322,6 +1323,7 @@ static int __load_mapping_v1(struct dm_cache_metadata *cmd, dm_oblock_t oblock; unsigned flags; + bool dirty = true; dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le); memcpy(&mapping, mapping_value_le, sizeof(mapping)); @@ -1332,8 +1334,10 @@ static int __load_mapping_v1(struct dm_cache_metadata *cmd, dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le); memcpy(&hint, hint_value_le, sizeof(hint)); } + if (cmd->clean_when_opened) + dirty = flags & M_DIRTY; - r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY, + r = fn(context, oblock, to_cblock(cb), dirty, le32_to_cpu(hint), hints_valid); if (r) { DMERR("policy couldn't load cache block %llu", @@ -1361,7 +1365,7 @@ static int __load_mapping_v2(struct dm_cache_metadata *cmd, dm_oblock_t oblock; unsigned flags; - bool dirty; + bool dirty = true; dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le); memcpy(&mapping, mapping_value_le, sizeof(mapping)); @@ -1372,8 +1376,9 @@ static int __load_mapping_v2(struct dm_cache_metadata *cmd, dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le); memcpy(&hint, hint_value_le, sizeof(hint)); } + if (cmd->clean_when_opened) + dirty = dm_bitset_cursor_get_value(dirty_cursor); - dirty = dm_bitset_cursor_get_value(dirty_cursor); r = fn(context, oblock, to_cblock(cb), dirty, le32_to_cpu(hint), hints_valid); if (r) { diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index ce14a3d1f609..a53413371725 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1188,9 +1188,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) queue_continuation(mg->cache->wq, &mg->k); } -static int copy(struct dm_cache_migration *mg, bool promote) +static void copy(struct dm_cache_migration *mg, bool promote) { - int r; struct dm_io_region o_region, c_region; struct cache *cache = mg->cache; @@ -1203,11 +1202,9 @@ static int copy(struct dm_cache_migration *mg, bool promote) c_region.count = cache->sectors_per_block; if (promote) - r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); + dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); else - r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); - - return r; + dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); } static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) @@ -1449,12 +1446,7 @@ static void mg_full_copy(struct work_struct *ws) } init_continuation(&mg->k, mg_upgrade_lock); - - if (copy(mg, is_policy_promote)) { - DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); - mg->k.input = BLK_STS_IOERR; - mg_complete(mg, false); - } + copy(mg, is_policy_promote); } static void mg_copy(struct work_struct *ws) @@ -2250,7 +2242,7 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, {0, 2, "Invalid number of cache feature arguments"}, }; - int r; + int r, mode_ctr = 0; unsigned argc; const char *arg; struct cache_features *cf = &ca->features; @@ -2264,14 +2256,20 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, while (argc--) { arg = dm_shift_arg(as); - if (!strcasecmp(arg, "writeback")) + if (!strcasecmp(arg, "writeback")) { cf->io_mode = CM_IO_WRITEBACK; + mode_ctr++; + } - else if (!strcasecmp(arg, "writethrough")) + else if (!strcasecmp(arg, "writethrough")) { cf->io_mode = CM_IO_WRITETHROUGH; + mode_ctr++; + } - else if (!strcasecmp(arg, "passthrough")) + else if (!strcasecmp(arg, "passthrough")) { cf->io_mode = CM_IO_PASSTHROUGH; + mode_ctr++; + } else if (!strcasecmp(arg, "metadata2")) cf->metadata_version = 2; @@ -2282,6 +2280,11 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, } } + if (mode_ctr > 1) { + *error = "Duplicate cache io_mode features requested"; + return -EINVAL; + } + return 0; } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b61b069c33af..f266c81f396f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -99,7 +99,7 @@ struct crypt_iv_operations { }; struct iv_essiv_private { - struct crypto_ahash *hash_tfm; + struct crypto_shash *hash_tfm; u8 *salt; }; @@ -144,7 +144,7 @@ struct crypt_config { struct workqueue_struct *io_queue; struct workqueue_struct *crypt_queue; - wait_queue_head_t write_thread_wait; + spinlock_t write_thread_lock; struct task_struct *write_thread; struct rb_root write_tree; @@ -327,25 +327,22 @@ static int crypt_iv_plain64be_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_essiv_init(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - AHASH_REQUEST_ON_STACK(req, essiv->hash_tfm); - struct scatterlist sg; + SHASH_DESC_ON_STACK(desc, essiv->hash_tfm); struct crypto_cipher *essiv_tfm; int err; - sg_init_one(&sg, cc->key, cc->key_size); - ahash_request_set_tfm(req, essiv->hash_tfm); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); - ahash_request_set_crypt(req, &sg, essiv->salt, cc->key_size); + desc->tfm = essiv->hash_tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - err = crypto_ahash_digest(req); - ahash_request_zero(req); + err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt); + shash_desc_zero(desc); if (err) return err; essiv_tfm = cc->iv_private; err = crypto_cipher_setkey(essiv_tfm, essiv->salt, - crypto_ahash_digestsize(essiv->hash_tfm)); + crypto_shash_digestsize(essiv->hash_tfm)); if (err) return err; @@ -356,7 +353,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) static int crypt_iv_essiv_wipe(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - unsigned salt_size = crypto_ahash_digestsize(essiv->hash_tfm); + unsigned salt_size = crypto_shash_digestsize(essiv->hash_tfm); struct crypto_cipher *essiv_tfm; int r, err = 0; @@ -408,7 +405,7 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc) struct crypto_cipher *essiv_tfm; struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - crypto_free_ahash(essiv->hash_tfm); + crypto_free_shash(essiv->hash_tfm); essiv->hash_tfm = NULL; kzfree(essiv->salt); @@ -426,7 +423,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { struct crypto_cipher *essiv_tfm = NULL; - struct crypto_ahash *hash_tfm = NULL; + struct crypto_shash *hash_tfm = NULL; u8 *salt = NULL; int err; @@ -436,14 +433,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, } /* Allocate hash algorithm */ - hash_tfm = crypto_alloc_ahash(opts, 0, CRYPTO_ALG_ASYNC); + hash_tfm = crypto_alloc_shash(opts, 0, 0); if (IS_ERR(hash_tfm)) { ti->error = "Error initializing ESSIV hash"; err = PTR_ERR(hash_tfm); goto bad; } - salt = kzalloc(crypto_ahash_digestsize(hash_tfm), GFP_KERNEL); + salt = kzalloc(crypto_shash_digestsize(hash_tfm), GFP_KERNEL); if (!salt) { ti->error = "Error kmallocing salt storage in ESSIV"; err = -ENOMEM; @@ -454,7 +451,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, cc->iv_gen_private.essiv.hash_tfm = hash_tfm; essiv_tfm = alloc_essiv_cipher(cc, ti, salt, - crypto_ahash_digestsize(hash_tfm)); + crypto_shash_digestsize(hash_tfm)); if (IS_ERR(essiv_tfm)) { crypt_iv_essiv_dtr(cc); return PTR_ERR(essiv_tfm); @@ -465,7 +462,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, bad: if (hash_tfm && !IS_ERR(hash_tfm)) - crypto_free_ahash(hash_tfm); + crypto_free_shash(hash_tfm); kfree(salt); return err; } @@ -1620,36 +1617,31 @@ static int dmcrypt_write(void *data) struct rb_root write_tree; struct blk_plug plug; - DECLARE_WAITQUEUE(wait, current); - - spin_lock_irq(&cc->write_thread_wait.lock); + spin_lock_irq(&cc->write_thread_lock); continue_locked: if (!RB_EMPTY_ROOT(&cc->write_tree)) goto pop_from_list; set_current_state(TASK_INTERRUPTIBLE); - __add_wait_queue(&cc->write_thread_wait, &wait); - spin_unlock_irq(&cc->write_thread_wait.lock); + spin_unlock_irq(&cc->write_thread_lock); if (unlikely(kthread_should_stop())) { set_current_state(TASK_RUNNING); - remove_wait_queue(&cc->write_thread_wait, &wait); break; } schedule(); set_current_state(TASK_RUNNING); - spin_lock_irq(&cc->write_thread_wait.lock); - __remove_wait_queue(&cc->write_thread_wait, &wait); + spin_lock_irq(&cc->write_thread_lock); goto continue_locked; pop_from_list: write_tree = cc->write_tree; cc->write_tree = RB_ROOT; - spin_unlock_irq(&cc->write_thread_wait.lock); + spin_unlock_irq(&cc->write_thread_lock); BUG_ON(rb_parent(write_tree.rb_node)); @@ -1693,7 +1685,9 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) return; } - spin_lock_irqsave(&cc->write_thread_wait.lock, flags); + spin_lock_irqsave(&cc->write_thread_lock, flags); + if (RB_EMPTY_ROOT(&cc->write_tree)) + wake_up_process(cc->write_thread); rbp = &cc->write_tree.rb_node; parent = NULL; sector = io->sector; @@ -1706,9 +1700,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) } rb_link_node(&io->rb_node, parent, rbp); rb_insert_color(&io->rb_node, &cc->write_tree); - - wake_up_locked(&cc->write_thread_wait); - spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags); + spin_unlock_irqrestore(&cc->write_thread_lock, flags); } static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) @@ -2831,7 +2823,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - init_waitqueue_head(&cc->write_thread_wait); + spin_lock_init(&cc->write_thread_lock); cc->write_tree = RB_ROOT; cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write"); @@ -3069,11 +3061,11 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) */ limits->max_segment_size = PAGE_SIZE; - if (cc->sector_size != (1 << SECTOR_SHIFT)) { - limits->logical_block_size = cc->sector_size; - limits->physical_block_size = cc->sector_size; - blk_limits_io_min(limits, cc->sector_size); - } + limits->logical_block_size = + max_t(unsigned short, limits->logical_block_size, cc->sector_size); + limits->physical_block_size = + max_t(unsigned, limits->physical_block_size, cc->sector_size); + limits->io_min = max_t(unsigned, limits->io_min, cc->sector_size); } static struct target_type crypt_target = { diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 1783d80c9cad..2fb7bb4304ad 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -17,6 +17,13 @@ #define DM_MSG_PREFIX "delay" +struct delay_class { + struct dm_dev *dev; + sector_t start; + unsigned delay; + unsigned ops; +}; + struct delay_c { struct timer_list delay_timer; struct mutex timer_lock; @@ -25,19 +32,16 @@ struct delay_c { struct list_head delayed_bios; atomic_t may_delay; - struct dm_dev *dev_read; - sector_t start_read; - unsigned read_delay; - unsigned reads; + struct delay_class read; + struct delay_class write; + struct delay_class flush; - struct dm_dev *dev_write; - sector_t start_write; - unsigned write_delay; - unsigned writes; + int argc; }; struct dm_delay_info { struct delay_c *context; + struct delay_class *class; struct list_head list; unsigned long expires; }; @@ -77,7 +81,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) { struct dm_delay_info *delayed, *next; unsigned long next_expires = 0; - int start_timer = 0; + unsigned long start_timer = 0; struct bio_list flush_bios = { }; mutex_lock(&delayed_bios_lock); @@ -87,10 +91,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) sizeof(struct dm_delay_info)); list_del(&delayed->list); bio_list_add(&flush_bios, bio); - if ((bio_data_dir(bio) == WRITE)) - delayed->context->writes--; - else - delayed->context->reads--; + delayed->class->ops--; continue; } @@ -100,7 +101,6 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) } else next_expires = min(next_expires, delayed->expires); } - mutex_unlock(&delayed_bios_lock); if (start_timer) @@ -117,6 +117,50 @@ static void flush_expired_bios(struct work_struct *work) flush_bios(flush_delayed_bios(dc, 0)); } +static void delay_dtr(struct dm_target *ti) +{ + struct delay_c *dc = ti->private; + + destroy_workqueue(dc->kdelayd_wq); + + if (dc->read.dev) + dm_put_device(ti, dc->read.dev); + if (dc->write.dev) + dm_put_device(ti, dc->write.dev); + if (dc->flush.dev) + dm_put_device(ti, dc->flush.dev); + + mutex_destroy(&dc->timer_lock); + + kfree(dc); +} + +static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **argv) +{ + int ret; + unsigned long long tmpll; + char dummy; + + if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { + ti->error = "Invalid device sector"; + return -EINVAL; + } + c->start = tmpll; + + if (sscanf(argv[2], "%u%c", &c->delay, &dummy) != 1) { + ti->error = "Invalid delay"; + return -EINVAL; + } + + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &c->dev); + if (ret) { + ti->error = "Device lookup failed"; + return ret; + } + + return 0; +} + /* * Mapping parameters: * <device> <offset> <delay> [<write_device> <write_offset> <write_delay>] @@ -128,134 +172,89 @@ static void flush_expired_bios(struct work_struct *work) static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct delay_c *dc; - unsigned long long tmpll; - char dummy; int ret; - if (argc != 3 && argc != 6) { - ti->error = "Requires exactly 3 or 6 arguments"; + if (argc != 3 && argc != 6 && argc != 9) { + ti->error = "Requires exactly 3, 6 or 9 arguments"; return -EINVAL; } - dc = kmalloc(sizeof(*dc), GFP_KERNEL); + dc = kzalloc(sizeof(*dc), GFP_KERNEL); if (!dc) { ti->error = "Cannot allocate context"; return -ENOMEM; } - dc->reads = dc->writes = 0; + ti->private = dc; + timer_setup(&dc->delay_timer, handle_delayed_timer, 0); + INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); + INIT_LIST_HEAD(&dc->delayed_bios); + mutex_init(&dc->timer_lock); + atomic_set(&dc->may_delay, 1); + dc->argc = argc; - ret = -EINVAL; - if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { - ti->error = "Invalid device sector"; + ret = delay_class_ctr(ti, &dc->read, argv); + if (ret) goto bad; - } - dc->start_read = tmpll; - if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) { - ti->error = "Invalid delay"; - goto bad; + if (argc == 3) { + ret = delay_class_ctr(ti, &dc->write, argv); + if (ret) + goto bad; + ret = delay_class_ctr(ti, &dc->flush, argv); + if (ret) + goto bad; + goto out; } - ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), - &dc->dev_read); - if (ret) { - ti->error = "Device lookup failed"; + ret = delay_class_ctr(ti, &dc->write, argv + 3); + if (ret) goto bad; - } - - ret = -EINVAL; - dc->dev_write = NULL; - if (argc == 3) + if (argc == 6) { + ret = delay_class_ctr(ti, &dc->flush, argv + 3); + if (ret) + goto bad; goto out; - - if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { - ti->error = "Invalid write device sector"; - goto bad_dev_read; } - dc->start_write = tmpll; - if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) { - ti->error = "Invalid write delay"; - goto bad_dev_read; - } - - ret = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), - &dc->dev_write); - if (ret) { - ti->error = "Write device lookup failed"; - goto bad_dev_read; - } + ret = delay_class_ctr(ti, &dc->flush, argv + 6); + if (ret) + goto bad; out: - ret = -EINVAL; dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); if (!dc->kdelayd_wq) { + ret = -EINVAL; DMERR("Couldn't start kdelayd"); - goto bad_queue; + goto bad; } - timer_setup(&dc->delay_timer, handle_delayed_timer, 0); - - INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); - INIT_LIST_HEAD(&dc->delayed_bios); - mutex_init(&dc->timer_lock); - atomic_set(&dc->may_delay, 1); - ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->per_io_data_size = sizeof(struct dm_delay_info); - ti->private = dc; return 0; -bad_queue: - if (dc->dev_write) - dm_put_device(ti, dc->dev_write); -bad_dev_read: - dm_put_device(ti, dc->dev_read); bad: - kfree(dc); + delay_dtr(ti); return ret; } -static void delay_dtr(struct dm_target *ti) -{ - struct delay_c *dc = ti->private; - - destroy_workqueue(dc->kdelayd_wq); - - dm_put_device(ti, dc->dev_read); - - if (dc->dev_write) - dm_put_device(ti, dc->dev_write); - - mutex_destroy(&dc->timer_lock); - - kfree(dc); -} - -static int delay_bio(struct delay_c *dc, int delay, struct bio *bio) +static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) { struct dm_delay_info *delayed; unsigned long expires = 0; - if (!delay || !atomic_read(&dc->may_delay)) + if (!c->delay || !atomic_read(&dc->may_delay)) return DM_MAPIO_REMAPPED; delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); delayed->context = dc; - delayed->expires = expires = jiffies + msecs_to_jiffies(delay); + delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay); mutex_lock(&delayed_bios_lock); - - if (bio_data_dir(bio) == WRITE) - dc->writes++; - else - dc->reads++; - + c->ops++; list_add_tail(&delayed->list, &dc->delayed_bios); - mutex_unlock(&delayed_bios_lock); queue_timeout(dc, expires); @@ -282,23 +281,28 @@ static void delay_resume(struct dm_target *ti) static int delay_map(struct dm_target *ti, struct bio *bio) { struct delay_c *dc = ti->private; - - if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { - bio_set_dev(bio, dc->dev_write->bdev); - if (bio_sectors(bio)) - bio->bi_iter.bi_sector = dc->start_write + - dm_target_offset(ti, bio->bi_iter.bi_sector); - - return delay_bio(dc, dc->write_delay, bio); + struct delay_class *c; + struct dm_delay_info *delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); + + if (bio_data_dir(bio) == WRITE) { + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) + c = &dc->flush; + else + c = &dc->write; + } else { + c = &dc->read; } + delayed->class = c; + bio_set_dev(bio, c->dev->bdev); + if (bio_sectors(bio)) + bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector); - bio_set_dev(bio, dc->dev_read->bdev); - bio->bi_iter.bi_sector = dc->start_read + - dm_target_offset(ti, bio->bi_iter.bi_sector); - - return delay_bio(dc, dc->read_delay, bio); + return delay_bio(dc, c, bio); } +#define DMEMIT_DELAY_CLASS(c) \ + DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay) + static void delay_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { @@ -307,17 +311,19 @@ static void delay_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - DMEMIT("%u %u", dc->reads, dc->writes); + DMEMIT("%u %u %u", dc->read.ops, dc->write.ops, dc->flush.ops); break; case STATUSTYPE_TABLE: - DMEMIT("%s %llu %u", dc->dev_read->name, - (unsigned long long) dc->start_read, - dc->read_delay); - if (dc->dev_write) - DMEMIT(" %s %llu %u", dc->dev_write->name, - (unsigned long long) dc->start_write, - dc->write_delay); + DMEMIT_DELAY_CLASS(&dc->read); + if (dc->argc >= 6) { + DMEMIT(" "); + DMEMIT_DELAY_CLASS(&dc->write); + } + if (dc->argc >= 9) { + DMEMIT(" "); + DMEMIT_DELAY_CLASS(&dc->flush); + } break; } } @@ -328,12 +334,15 @@ static int delay_iterate_devices(struct dm_target *ti, struct delay_c *dc = ti->private; int ret = 0; - ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data); + ret = fn(ti, dc->read.dev, dc->read.start, ti->len, data); + if (ret) + goto out; + ret = fn(ti, dc->write.dev, dc->write.start, ti->len, data); + if (ret) + goto out; + ret = fn(ti, dc->flush.dev, dc->flush.start, ti->len, data); if (ret) goto out; - - if (dc->dev_write) - ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data); out: return ret; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 86438b2f10dd..378878599466 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -31,6 +31,8 @@ #define MIN_LOG2_INTERLEAVE_SECTORS 3 #define MAX_LOG2_INTERLEAVE_SECTORS 31 #define METADATA_WORKQUEUE_MAX_ACTIVE 16 +#define RECALC_SECTORS 8192 +#define RECALC_WRITE_SUPER 16 /* * Warning - DEBUG_PRINT prints security-sensitive data to the log, @@ -44,7 +46,8 @@ */ #define SB_MAGIC "integrt" -#define SB_VERSION 1 +#define SB_VERSION_1 1 +#define SB_VERSION_2 2 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -57,9 +60,12 @@ struct superblock { __u64 provided_data_sectors; /* userspace uses this value */ __u32 flags; __u8 log2_sectors_per_block; + __u8 pad[3]; + __u64 recalc_sector; }; #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 +#define SB_FLAG_RECALCULATING 0x2 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -139,6 +145,7 @@ struct alg_spec { struct dm_integrity_c { struct dm_dev *dev; + struct dm_dev *meta_dev; unsigned tag_size; __s8 log2_tag_size; sector_t start; @@ -170,7 +177,8 @@ struct dm_integrity_c { unsigned short journal_section_sectors; unsigned journal_sections; unsigned journal_entries; - sector_t device_sectors; + sector_t data_device_sectors; + sector_t meta_device_sectors; unsigned initial_sectors; unsigned metadata_run; __s8 log2_metadata_run; @@ -178,7 +186,7 @@ struct dm_integrity_c { __u8 sectors_per_block; unsigned char mode; - bool suspending; + int suspending; int failed; @@ -186,6 +194,7 @@ struct dm_integrity_c { /* these variables are locked with endio_wait.lock */ struct rb_root in_progress; + struct list_head wait_list; wait_queue_head_t endio_wait; struct workqueue_struct *wait_wq; @@ -210,6 +219,11 @@ struct dm_integrity_c { struct workqueue_struct *writer_wq; struct work_struct writer_work; + struct workqueue_struct *recalc_wq; + struct work_struct recalc_work; + u8 *recalc_buffer; + u8 *recalc_tags; + struct bio_list flush_bio_list; unsigned long autocommit_jiffies; @@ -233,7 +247,14 @@ struct dm_integrity_c { struct dm_integrity_range { sector_t logical_sector; unsigned n_sectors; - struct rb_node node; + bool waiting; + union { + struct rb_node node; + struct { + struct task_struct *task; + struct list_head wait_entry; + }; + }; }; struct dm_integrity_io { @@ -337,10 +358,14 @@ static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i, static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector, sector_t *area, sector_t *offset) { - __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors; - - *area = data_sector >> log2_interleave_sectors; - *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1); + if (!ic->meta_dev) { + __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors; + *area = data_sector >> log2_interleave_sectors; + *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1); + } else { + *area = 0; + *offset = data_sector; + } } #define sector_to_block(ic, n) \ @@ -379,6 +404,9 @@ static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector { sector_t result; + if (ic->meta_dev) + return offset; + result = area << ic->sb->log2_interleave_sectors; if (likely(ic->log2_metadata_run >= 0)) result += (area + 1) << ic->log2_metadata_run; @@ -386,6 +414,8 @@ static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector result += (area + 1) * ic->metadata_run; result += (sector_t)ic->initial_sectors + offset; + result += ic->start; + return result; } @@ -395,6 +425,14 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) *sec_ptr -= ic->journal_sections; } +static void sb_set_version(struct dm_integrity_c *ic) +{ + if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + ic->sb->version = SB_VERSION_2; + else + ic->sb->version = SB_VERSION_1; +} + static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) { struct dm_io_request io_req; @@ -406,7 +444,7 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) io_req.mem.ptr.addr = ic->sb; io_req.notify.fn = NULL; io_req.client = ic->io; - io_loc.bdev = ic->dev->bdev; + io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev; io_loc.sector = ic->start; io_loc.count = SB_SECTORS; @@ -753,7 +791,7 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned io_req.notify.fn = NULL; } io_req.client = ic->io; - io_loc.bdev = ic->dev->bdev; + io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev; io_loc.sector = ic->start + SB_SECTORS + sector; io_loc.count = n_sectors; @@ -857,7 +895,7 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsig io_req.notify.context = data; io_req.client = ic->io; io_loc.bdev = ic->dev->bdev; - io_loc.sector = ic->start + target; + io_loc.sector = target; io_loc.count = n_sectors; r = dm_io(&io_req, 1, &io_loc, NULL); @@ -867,13 +905,27 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsig } } -static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +static bool ranges_overlap(struct dm_integrity_range *range1, struct dm_integrity_range *range2) +{ + return range1->logical_sector < range2->logical_sector + range2->n_sectors && + range2->logical_sector + range2->n_sectors > range2->logical_sector; +} + +static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range, bool check_waiting) { struct rb_node **n = &ic->in_progress.rb_node; struct rb_node *parent; BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1)); + if (likely(check_waiting)) { + struct dm_integrity_range *range; + list_for_each_entry(range, &ic->wait_list, wait_entry) { + if (unlikely(ranges_overlap(range, new_range))) + return false; + } + } + parent = NULL; while (*n) { @@ -898,7 +950,22 @@ static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range * static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range) { rb_erase(&range->node, &ic->in_progress); - wake_up_locked(&ic->endio_wait); + while (unlikely(!list_empty(&ic->wait_list))) { + struct dm_integrity_range *last_range = + list_first_entry(&ic->wait_list, struct dm_integrity_range, wait_entry); + struct task_struct *last_range_task; + if (!ranges_overlap(range, last_range)) + break; + last_range_task = last_range->task; + list_del(&last_range->wait_entry); + if (!add_new_range(ic, last_range, false)) { + last_range->task = last_range_task; + list_add(&last_range->wait_entry, &ic->wait_list); + break; + } + last_range->waiting = false; + wake_up_process(last_range_task); + } } static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range) @@ -910,6 +977,19 @@ static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *r spin_unlock_irqrestore(&ic->endio_wait.lock, flags); } +static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +{ + new_range->waiting = true; + list_add_tail(&new_range->wait_entry, &ic->wait_list); + new_range->task = current; + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&ic->endio_wait.lock); + io_schedule(); + spin_lock_irq(&ic->endio_wait.lock); + } while (unlikely(new_range->waiting)); +} + static void init_journal_node(struct journal_node *node) { RB_CLEAR_NODE(&node->node); @@ -1599,8 +1679,12 @@ retry: dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors << ic->sb->log2_sectors_per_block); - if (unlikely(!dio->range.n_sectors)) - goto sleep; + if (unlikely(!dio->range.n_sectors)) { + if (from_map) + goto offload_to_thread; + sleep_on_endio_wait(ic); + goto retry; + } range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block; ic->free_sectors -= range_sectors; journal_section = ic->free_section; @@ -1654,22 +1738,20 @@ retry: } } } - if (unlikely(!add_new_range(ic, &dio->range))) { + if (unlikely(!add_new_range(ic, &dio->range, true))) { /* * We must not sleep in the request routine because it could * stall bios on current->bio_list. * So, we offload the bio to a workqueue if we have to sleep. */ -sleep: if (from_map) { +offload_to_thread: spin_unlock_irq(&ic->endio_wait.lock); INIT_WORK(&dio->work, integrity_bio_wait); queue_work(ic->wait_wq, &dio->work); return; - } else { - sleep_on_endio_wait(ic); - goto retry; } + wait_and_add_new_range(ic, &dio->range); } spin_unlock_irq(&ic->endio_wait.lock); @@ -1701,14 +1783,18 @@ sleep: bio->bi_end_io = integrity_end_io; bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT; - bio->bi_iter.bi_sector += ic->start; generic_make_request(bio); if (need_sync_io) { wait_for_completion_io(&read_comp); + if (unlikely(ic->recalc_wq != NULL) && + ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector)) + goto skip_check; if (likely(!bio->bi_status)) integrity_metadata(&dio->work); else +skip_check: dec_in_flight(dio); } else { @@ -1892,8 +1978,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; spin_lock_irq(&ic->endio_wait.lock); - while (unlikely(!add_new_range(ic, &io->range))) - sleep_on_endio_wait(ic); + if (unlikely(!add_new_range(ic, &io->range, true))) + wait_and_add_new_range(ic, &io->range); if (likely(!from_replay)) { struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; @@ -1981,7 +2067,7 @@ static void integrity_writer(struct work_struct *w) unsigned prev_free_sectors; /* the following test is not needed, but it tests the replay code */ - if (READ_ONCE(ic->suspending)) + if (READ_ONCE(ic->suspending) && !ic->meta_dev) return; spin_lock_irq(&ic->endio_wait.lock); @@ -2008,6 +2094,108 @@ static void integrity_writer(struct work_struct *w) spin_unlock_irq(&ic->endio_wait.lock); } +static void recalc_write_super(struct dm_integrity_c *ic) +{ + int r; + + dm_integrity_flush_buffers(ic); + if (dm_integrity_failed(ic)) + return; + + sb_set_version(ic); + r = sync_rw_sb(ic, REQ_OP_WRITE, 0); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); +} + +static void integrity_recalc(struct work_struct *w) +{ + struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work); + struct dm_integrity_range range; + struct dm_io_request io_req; + struct dm_io_region io_loc; + sector_t area, offset; + sector_t metadata_block; + unsigned metadata_offset; + __u8 *t; + unsigned i; + int r; + unsigned super_counter = 0; + + spin_lock_irq(&ic->endio_wait.lock); + +next_chunk: + + if (unlikely(READ_ONCE(ic->suspending))) + goto unlock_ret; + + range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); + if (unlikely(range.logical_sector >= ic->provided_data_sectors)) + goto unlock_ret; + + get_area_and_offset(ic, range.logical_sector, &area, &offset); + range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector); + if (!ic->meta_dev) + range.n_sectors = min(range.n_sectors, (1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); + + if (unlikely(!add_new_range(ic, &range, true))) + wait_and_add_new_range(ic, &range); + + spin_unlock_irq(&ic->endio_wait.lock); + + if (unlikely(++super_counter == RECALC_WRITE_SUPER)) { + recalc_write_super(ic); + super_counter = 0; + } + + if (unlikely(dm_integrity_failed(ic))) + goto err; + + io_req.bi_op = REQ_OP_READ; + io_req.bi_op_flags = 0; + io_req.mem.type = DM_IO_VMA; + io_req.mem.ptr.addr = ic->recalc_buffer; + io_req.notify.fn = NULL; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = get_data_sector(ic, area, offset); + io_loc.count = range.n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + dm_integrity_io_error(ic, "reading data", r); + goto err; + } + + t = ic->recalc_tags; + for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) { + integrity_sector_checksum(ic, range.logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); + t += ic->tag_size; + } + + metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset); + + r = dm_integrity_rw_tag(ic, ic->recalc_tags, &metadata_block, &metadata_offset, t - ic->recalc_tags, TAG_WRITE); + if (unlikely(r)) { + dm_integrity_io_error(ic, "writing tags", r); + goto err; + } + + spin_lock_irq(&ic->endio_wait.lock); + remove_range_unlocked(ic, &range); + ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors); + goto next_chunk; + +err: + remove_range(ic, &range); + return; + +unlock_ret: + spin_unlock_irq(&ic->endio_wait.lock); + + recalc_write_super(ic); +} + static void init_journal(struct dm_integrity_c *ic, unsigned start_section, unsigned n_sections, unsigned char commit_seq) { @@ -2210,17 +2398,22 @@ static void dm_integrity_postsuspend(struct dm_target *ti) del_timer_sync(&ic->autocommit_timer); - ic->suspending = true; + WRITE_ONCE(ic->suspending, 1); + + if (ic->recalc_wq) + drain_workqueue(ic->recalc_wq); queue_work(ic->commit_wq, &ic->commit_work); drain_workqueue(ic->commit_wq); if (ic->mode == 'J') { + if (ic->meta_dev) + queue_work(ic->writer_wq, &ic->writer_work); drain_workqueue(ic->writer_wq); dm_integrity_flush_buffers(ic); } - ic->suspending = false; + WRITE_ONCE(ic->suspending, 0); BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); @@ -2232,6 +2425,16 @@ static void dm_integrity_resume(struct dm_target *ti) struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; replay_journal(ic); + + if (ic->recalc_wq && ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector); + if (recalc_pos < ic->provided_data_sectors) { + queue_work(ic->recalc_wq, &ic->recalc_work); + } else if (recalc_pos > ic->provided_data_sectors) { + ic->sb->recalc_sector = cpu_to_le64(ic->provided_data_sectors); + recalc_write_super(ic); + } + } } static void dm_integrity_status(struct dm_target *ti, status_type_t type, @@ -2243,7 +2446,13 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - DMEMIT("%llu", (unsigned long long)atomic64_read(&ic->number_of_mismatches)); + DMEMIT("%llu %llu", + (unsigned long long)atomic64_read(&ic->number_of_mismatches), + (unsigned long long)ic->provided_data_sectors); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + DMEMIT(" %llu", (unsigned long long)le64_to_cpu(ic->sb->recalc_sector)); + else + DMEMIT(" -"); break; case STATUSTYPE_TABLE: { @@ -2251,19 +2460,25 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, watermark_percentage += ic->journal_entries / 2; do_div(watermark_percentage, ic->journal_entries); arg_count = 5; + arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; + arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, ic->tag_size, ic->mode, arg_count); + if (ic->meta_dev) + DMEMIT(" meta_device:%s", ic->meta_dev->name); + if (ic->sectors_per_block != 1) + DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + DMEMIT(" recalculate"); DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); DMEMIT(" commit_time:%u", ic->autocommit_msec); - if (ic->sectors_per_block != 1) - DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); #define EMIT_ALG(a, n) \ do { \ @@ -2286,7 +2501,10 @@ static int dm_integrity_iterate_devices(struct dm_target *ti, { struct dm_integrity_c *ic = ti->private; - return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data); + if (!ic->meta_dev) + return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data); + else + return fn(ti, ic->dev, 0, ti->len, data); } static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits) @@ -2319,26 +2537,38 @@ static void calculate_journal_section_size(struct dm_integrity_c *ic) static int calculate_device_limits(struct dm_integrity_c *ic) { __u64 initial_sectors; - sector_t last_sector, last_area, last_offset; calculate_journal_section_size(ic); initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections; - if (initial_sectors + METADATA_PADDING_SECTORS >= ic->device_sectors || initial_sectors > UINT_MAX) + if (initial_sectors + METADATA_PADDING_SECTORS >= ic->meta_device_sectors || initial_sectors > UINT_MAX) return -EINVAL; ic->initial_sectors = initial_sectors; - ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), - (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; - if (!(ic->metadata_run & (ic->metadata_run - 1))) - ic->log2_metadata_run = __ffs(ic->metadata_run); - else - ic->log2_metadata_run = -1; + if (!ic->meta_dev) { + sector_t last_sector, last_area, last_offset; - get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset); - last_sector = get_data_sector(ic, last_area, last_offset); + ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), + (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; + if (!(ic->metadata_run & (ic->metadata_run - 1))) + ic->log2_metadata_run = __ffs(ic->metadata_run); + else + ic->log2_metadata_run = -1; - if (ic->start + last_sector < last_sector || ic->start + last_sector >= ic->device_sectors) - return -EINVAL; + get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset); + last_sector = get_data_sector(ic, last_area, last_offset); + if (last_sector < ic->start || last_sector >= ic->meta_device_sectors) + return -EINVAL; + } else { + __u64 meta_size = ic->provided_data_sectors * ic->tag_size; + meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1)) + >> (ic->log2_buffer_sectors + SECTOR_SHIFT); + meta_size <<= ic->log2_buffer_sectors; + if (ic->initial_sectors + meta_size < ic->initial_sectors || + ic->initial_sectors + meta_size > ic->meta_device_sectors) + return -EINVAL; + ic->metadata_run = 1; + ic->log2_metadata_run = 0; + } return 0; } @@ -2350,7 +2580,6 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT); memcpy(ic->sb->magic, SB_MAGIC, 8); - ic->sb->version = SB_VERSION; ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size); ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block); if (ic->journal_mac_alg.alg_string) @@ -2360,28 +2589,55 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec journal_sections = journal_sectors / ic->journal_section_sectors; if (!journal_sections) journal_sections = 1; - ic->sb->journal_sections = cpu_to_le32(journal_sections); - if (!interleave_sectors) - interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; - ic->sb->log2_interleave_sectors = __fls(interleave_sectors); - ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); - ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); - - ic->provided_data_sectors = 0; - for (test_bit = fls64(ic->device_sectors) - 1; test_bit >= 3; test_bit--) { - __u64 prev_data_sectors = ic->provided_data_sectors; + if (!ic->meta_dev) { + ic->sb->journal_sections = cpu_to_le32(journal_sections); + if (!interleave_sectors) + interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; + ic->sb->log2_interleave_sectors = __fls(interleave_sectors); + ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + + ic->provided_data_sectors = 0; + for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) { + __u64 prev_data_sectors = ic->provided_data_sectors; + + ic->provided_data_sectors |= (sector_t)1 << test_bit; + if (calculate_device_limits(ic)) + ic->provided_data_sectors = prev_data_sectors; + } + if (!ic->provided_data_sectors) + return -EINVAL; + } else { + ic->sb->log2_interleave_sectors = 0; + ic->provided_data_sectors = ic->data_device_sectors; + ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1); + +try_smaller_buffer: + ic->sb->journal_sections = cpu_to_le32(0); + for (test_bit = fls(journal_sections) - 1; test_bit >= 0; test_bit--) { + __u32 prev_journal_sections = le32_to_cpu(ic->sb->journal_sections); + __u32 test_journal_sections = prev_journal_sections | (1U << test_bit); + if (test_journal_sections > journal_sections) + continue; + ic->sb->journal_sections = cpu_to_le32(test_journal_sections); + if (calculate_device_limits(ic)) + ic->sb->journal_sections = cpu_to_le32(prev_journal_sections); - ic->provided_data_sectors |= (sector_t)1 << test_bit; - if (calculate_device_limits(ic)) - ic->provided_data_sectors = prev_data_sectors; + } + if (!le32_to_cpu(ic->sb->journal_sections)) { + if (ic->log2_buffer_sectors > 3) { + ic->log2_buffer_sectors--; + goto try_smaller_buffer; + } + return -EINVAL; + } } - if (!ic->provided_data_sectors) - return -EINVAL; - ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); + sb_set_version(ic); + return 0; } @@ -2828,6 +3084,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) {0, 9, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; + bool recalculate; bool should_write_sb; __u64 threshold; unsigned long long start; @@ -2848,6 +3105,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->per_io_data_size = sizeof(struct dm_integrity_io); ic->in_progress = RB_ROOT; + INIT_LIST_HEAD(&ic->wait_list); init_waitqueue_head(&ic->endio_wait); bio_list_init(&ic->flush_bio_list); init_waitqueue_head(&ic->copy_to_journal_wait); @@ -2883,13 +3141,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - ic->device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT; - journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, - ic->device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); + journal_sectors = 0; interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; buffer_sectors = DEFAULT_BUFFER_SECTORS; journal_watermark = DEFAULT_JOURNAL_WATERMARK; sync_msec = DEFAULT_SYNC_MSEC; + recalculate = false; ic->sectors_per_block = 1; as.argc = argc - DIRECT_ARGUMENTS; @@ -2908,7 +3165,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1) - journal_sectors = val; + journal_sectors = val ? val : 1; else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1) interleave_sectors = val; else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1) @@ -2917,7 +3174,17 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) journal_watermark = val; else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1) sync_msec = val; - else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) { + else if (!memcmp(opt_string, "meta_device:", strlen("meta_device:"))) { + if (ic->meta_dev) { + dm_put_device(ti, ic->meta_dev); + ic->meta_dev = NULL; + } + r = dm_get_device(ti, strchr(opt_string, ':') + 1, dm_table_get_mode(ti->table), &ic->meta_dev); + if (r) { + ti->error = "Device lookup failed"; + goto bad; + } + } else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) { if (val < 1 << SECTOR_SHIFT || val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT || (val & (val -1))) { @@ -2941,6 +3208,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) "Invalid journal_mac argument"); if (r) goto bad; + } else if (!strcmp(opt_string, "recalculate")) { + recalculate = true; } else { r = -EINVAL; ti->error = "Invalid argument"; @@ -2948,6 +3217,21 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } } + ic->data_device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT; + if (!ic->meta_dev) + ic->meta_device_sectors = ic->data_device_sectors; + else + ic->meta_device_sectors = i_size_read(ic->meta_dev->bdev->bd_inode) >> SECTOR_SHIFT; + + if (!journal_sectors) { + journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, + ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); + } + + if (!buffer_sectors) + buffer_sectors = 1; + ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT); + r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error, "Invalid internal hash", "Error setting internal hash key"); if (r) @@ -3062,7 +3346,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = true; } - if (ic->sb->version != SB_VERSION) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_2) { r = -EINVAL; ti->error = "Unknown version"; goto bad; @@ -3083,11 +3367,19 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } /* make sure that ti->max_io_len doesn't overflow */ - if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS || - ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) { - r = -EINVAL; - ti->error = "Invalid interleave_sectors in the superblock"; - goto bad; + if (!ic->meta_dev) { + if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS || + ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) { + r = -EINVAL; + ti->error = "Invalid interleave_sectors in the superblock"; + goto bad; + } + } else { + if (ic->sb->log2_interleave_sectors) { + r = -EINVAL; + ti->error = "Invalid interleave_sectors in the superblock"; + goto bad; + } } ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) { @@ -3101,20 +3393,28 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "Journal mac mismatch"; goto bad; } + +try_smaller_buffer: r = calculate_device_limits(ic); if (r) { + if (ic->meta_dev) { + if (ic->log2_buffer_sectors > 3) { + ic->log2_buffer_sectors--; + goto try_smaller_buffer; + } + } ti->error = "The device is too small"; goto bad; } + if (!ic->meta_dev) + ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run)); + if (ti->len > ic->provided_data_sectors) { r = -EINVAL; ti->error = "Not enough provided sectors for requested mapping size"; goto bad; } - if (!buffer_sectors) - buffer_sectors = 1; - ic->log2_buffer_sectors = min3((int)__fls(buffer_sectors), (int)__ffs(ic->metadata_run), 31 - SECTOR_SHIFT); threshold = (__u64)ic->journal_entries * (100 - journal_watermark); threshold += 50; @@ -3138,8 +3438,40 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) (unsigned long long)ic->provided_data_sectors); DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); - ic->bufio = dm_bufio_client_create(ic->dev->bdev, 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), - 1, 0, NULL, NULL); + if (recalculate && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + if (!ic->internal_hash) { + r = -EINVAL; + ti->error = "Recalculate is only valid with internal hash"; + goto bad; + } + ic->recalc_wq = alloc_workqueue("dm-intergrity-recalc", WQ_MEM_RECLAIM, 1); + if (!ic->recalc_wq ) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + INIT_WORK(&ic->recalc_work, integrity_recalc); + ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT); + if (!ic->recalc_buffer) { + ti->error = "Cannot allocate buffer for recalculating"; + r = -ENOMEM; + goto bad; + } + ic->recalc_tags = kvmalloc((RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size, GFP_KERNEL); + if (!ic->recalc_tags) { + ti->error = "Cannot allocate tags for recalculating"; + r = -ENOMEM; + goto bad; + } + } + + ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev, + 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL); if (IS_ERR(ic->bufio)) { r = PTR_ERR(ic->bufio); ti->error = "Cannot initialize dm-bufio"; @@ -3171,9 +3503,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ic->just_formatted = true; } - r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors); - if (r) - goto bad; + if (!ic->meta_dev) { + r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors); + if (r) + goto bad; + } if (!ic->internal_hash) dm_integrity_set(ti, ic); @@ -3192,6 +3526,7 @@ static void dm_integrity_dtr(struct dm_target *ti) struct dm_integrity_c *ic = ti->private; BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); + BUG_ON(!list_empty(&ic->wait_list)); if (ic->metadata_wq) destroy_workqueue(ic->metadata_wq); @@ -3201,6 +3536,12 @@ static void dm_integrity_dtr(struct dm_target *ti) destroy_workqueue(ic->commit_wq); if (ic->writer_wq) destroy_workqueue(ic->writer_wq); + if (ic->recalc_wq) + destroy_workqueue(ic->recalc_wq); + if (ic->recalc_buffer) + vfree(ic->recalc_buffer); + if (ic->recalc_tags) + kvfree(ic->recalc_tags); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); mempool_exit(&ic->journal_io_mempool); @@ -3208,6 +3549,8 @@ static void dm_integrity_dtr(struct dm_target *ti) dm_io_client_destroy(ic->io); if (ic->dev) dm_put_device(ti, ic->dev); + if (ic->meta_dev) + dm_put_device(ti, ic->meta_dev); dm_integrity_free_page_list(ic, ic->journal); dm_integrity_free_page_list(ic, ic->journal_io); dm_integrity_free_page_list(ic, ic->journal_xor); @@ -3248,7 +3591,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3c7547a3c371..2fc4213e02b5 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -487,6 +487,8 @@ static int run_complete_job(struct kcopyd_job *job) if (atomic_dec_and_test(&kc->nr_jobs)) wake_up(&kc->destroyq); + cond_resched(); + return 0; } @@ -741,9 +743,9 @@ static void split_job(struct kcopyd_job *master_job) } } -int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, - unsigned int num_dests, struct dm_io_region *dests, - unsigned int flags, dm_kcopyd_notify_fn fn, void *context) +void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, + unsigned int num_dests, struct dm_io_region *dests, + unsigned int flags, dm_kcopyd_notify_fn fn, void *context) { struct kcopyd_job *job; int i; @@ -818,16 +820,14 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, job->progress = 0; split_job(job); } - - return 0; } EXPORT_SYMBOL(dm_kcopyd_copy); -int dm_kcopyd_zero(struct dm_kcopyd_client *kc, - unsigned num_dests, struct dm_io_region *dests, - unsigned flags, dm_kcopyd_notify_fn fn, void *context) +void dm_kcopyd_zero(struct dm_kcopyd_client *kc, + unsigned num_dests, struct dm_io_region *dests, + unsigned flags, dm_kcopyd_notify_fn fn, void *context) { - return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context); + dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context); } EXPORT_SYMBOL(dm_kcopyd_zero); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 75df4c9d8b54..cae689de75fd 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3859,7 +3859,7 @@ static int __load_dirty_region_bitmap(struct raid_set *rs) /* Try loading the bitmap unless "raid0", which does not have one */ if (!rs_is_raid0(rs) && !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) { - r = bitmap_load(&rs->md); + r = md_bitmap_load(&rs->md); if (r) DMERR("Failed to load bitmap"); } @@ -3987,8 +3987,8 @@ static int raid_preresume(struct dm_target *ti) /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */ if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap && mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) { - r = bitmap_resize(mddev->bitmap, mddev->dev_sectors, - to_bytes(rs->requested_bitmap_chunk_sectors), 0); + r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, + to_bytes(rs->requested_bitmap_chunk_sectors), 0); if (r) DMERR("Failed to resize bitmap"); } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 5903e492bb34..79eab1071ec2 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -326,9 +326,8 @@ static void recovery_complete(int read_err, unsigned long write_err, dm_rh_recovery_end(reg, !(read_err || write_err)); } -static int recover(struct mirror_set *ms, struct dm_region *reg) +static void recover(struct mirror_set *ms, struct dm_region *reg) { - int r; unsigned i; struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; struct mirror *m; @@ -367,10 +366,8 @@ static int recover(struct mirror_set *ms, struct dm_region *reg) if (!errors_handled(ms)) set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); - r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, - flags, recovery_complete, reg); - - return r; + dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, + flags, recovery_complete, reg); } static void reset_ms_flags(struct mirror_set *ms) @@ -388,7 +385,6 @@ static void do_recovery(struct mirror_set *ms) { struct dm_region *reg; struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - int r; /* * Start quiescing some regions. @@ -398,11 +394,8 @@ static void do_recovery(struct mirror_set *ms) /* * Copy any already quiesced regions. */ - while ((reg = dm_rh_recovery_start(ms->rh))) { - r = recover(ms, reg); - if (r) - dm_rh_recovery_end(reg, 0); - } + while ((reg = dm_rh_recovery_start(ms->rh))) + recover(ms, reg); /* * Update the in sync flag. diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 97de7a7334d4..ae4b33d10924 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -85,7 +85,7 @@ struct dm_snapshot { * A list of pending exceptions that completed out of order. * Protected by kcopyd single-threaded callback. */ - struct list_head out_of_order_list; + struct rb_root out_of_order_tree; mempool_t pending_pool; @@ -200,7 +200,7 @@ struct dm_snap_pending_exception { /* A sequence number, it is used for in-order completion. */ sector_t exception_sequence; - struct list_head out_of_order_entry; + struct rb_node out_of_order_node; /* * For writing a complete chunk, bypassing the copy. @@ -1173,7 +1173,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) atomic_set(&s->pending_exceptions_count, 0); s->exception_start_sequence = 0; s->exception_complete_sequence = 0; - INIT_LIST_HEAD(&s->out_of_order_list); + s->out_of_order_tree = RB_ROOT; mutex_init(&s->lock); INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); @@ -1539,28 +1539,41 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) pe->copy_error = read_err || write_err; if (pe->exception_sequence == s->exception_complete_sequence) { + struct rb_node *next; + s->exception_complete_sequence++; complete_exception(pe); - while (!list_empty(&s->out_of_order_list)) { - pe = list_entry(s->out_of_order_list.next, - struct dm_snap_pending_exception, out_of_order_entry); + next = rb_first(&s->out_of_order_tree); + while (next) { + pe = rb_entry(next, struct dm_snap_pending_exception, + out_of_order_node); if (pe->exception_sequence != s->exception_complete_sequence) break; + next = rb_next(next); s->exception_complete_sequence++; - list_del(&pe->out_of_order_entry); + rb_erase(&pe->out_of_order_node, &s->out_of_order_tree); complete_exception(pe); + cond_resched(); } } else { - struct list_head *lh; + struct rb_node *parent = NULL; + struct rb_node **p = &s->out_of_order_tree.rb_node; struct dm_snap_pending_exception *pe2; - list_for_each_prev(lh, &s->out_of_order_list) { - pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry); - if (pe2->exception_sequence < pe->exception_sequence) - break; + while (*p) { + pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node); + parent = *p; + + BUG_ON(pe->exception_sequence == pe2->exception_sequence); + if (pe->exception_sequence < pe2->exception_sequence) + p = &((*p)->rb_left); + else + p = &((*p)->rb_right); } - list_add(&pe->out_of_order_entry, lh); + + rb_link_node(&pe->out_of_order_node, parent, p); + rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree); } } @@ -1694,8 +1707,6 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!s->valid) return DM_MAPIO_KILL; - /* FIXME: should only take write lock if we need - * to copy an exception */ mutex_lock(&s->lock); if (!s->valid || (unlikely(s->snapshot_overflowed) && diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index b900723bbd0f..7bd60a150f8f 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1220,18 +1220,13 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m, sector_t begin, sector_t end) { - int r; struct dm_io_region to; to.bdev = tc->pool_dev->bdev; to.sector = begin; to.count = end - begin; - r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m); - if (r < 0) { - DMERR_LIMIT("dm_kcopyd_zero() failed"); - copy_complete(1, 1, m); - } + dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m); } static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio, @@ -1257,7 +1252,6 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, struct dm_bio_prison_cell *cell, struct bio *bio, sector_t len) { - int r; struct pool *pool = tc->pool; struct dm_thin_new_mapping *m = get_next_mapping(pool); @@ -1296,19 +1290,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, to.sector = data_dest * pool->sectors_per_block; to.count = len; - r = dm_kcopyd_copy(pool->copier, &from, 1, &to, - 0, copy_complete, m); - if (r < 0) { - DMERR_LIMIT("dm_kcopyd_copy() failed"); - copy_complete(1, 1, m); - - /* - * We allow the zero to be issued, to simplify the - * error path. Otherwise we'd need to start - * worrying about decrementing the prepare_actions - * counter. - */ - } + dm_kcopyd_copy(pool->copier, &from, 1, &to, + 0, copy_complete, m); /* * Do we need to zero a tail region? @@ -2520,6 +2503,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) case PM_WRITE: if (old_mode != new_mode) notify_of_pool_mode_change(pool, "write"); + if (old_mode == PM_OUT_OF_DATA_SPACE) + cancel_delayed_work_sync(&pool->no_space_timeout); pool->out_of_data_space = false; pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space; dm_pool_metadata_read_write(pool->pmd); @@ -3890,6 +3875,8 @@ static void pool_status(struct dm_target *ti, status_type_t type, else DMEMIT("- "); + DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt)); + break; case STATUSTYPE_TABLE: @@ -3979,7 +3966,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 19, 0}, + .version = {1, 20, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4353,7 +4340,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 19, 0}, + .version = {1, 20, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 87107c995cb5..5f1f80d424dd 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -268,9 +268,8 @@ static int persistent_memory_claim(struct dm_writecache *wc) i = 0; do { long daa; - void *dummy_addr; daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i, - &dummy_addr, &pfn); + NULL, &pfn); if (daa <= 0) { r = daa ? daa : -EINVAL; goto err3; @@ -457,7 +456,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc) COMPLETION_INITIALIZER_ONSTACK(endio.c), ATOMIC_INIT(1), }; - unsigned bitmap_bits = wc->dirty_bitmap_size * BITS_PER_LONG; + unsigned bitmap_bits = wc->dirty_bitmap_size * 8; unsigned i = 0; while (1) { @@ -2240,6 +2239,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type, DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's', wc->dev->name, wc->ssd_dev->name, wc->block_size); extra_args = 0; + if (wc->start_sector) + extra_args += 2; if (wc->high_wm_percent_set) extra_args += 2; if (wc->low_wm_percent_set) @@ -2254,6 +2255,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args++; DMEMIT("%u", extra_args); + if (wc->start_sector) + DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector); if (wc->high_wm_percent_set) { x = (uint64_t)wc->freelist_high_watermark * 100; x += wc->n_blocks / 2; @@ -2280,7 +2283,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type, static struct target_type writecache_target = { .name = "writecache", - .version = {1, 1, 0}, + .version = {1, 1, 1}, .module = THIS_MODULE, .ctr = writecache_ctr, .dtr = writecache_dtr, diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 44a119e12f1a..edf4b95eb075 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -161,10 +161,8 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, /* Copy the valid region */ set_bit(DMZ_RECLAIM_KCOPY, &zrc->flags); - ret = dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags, - dmz_reclaim_kcopy_end, zrc); - if (ret) - return ret; + dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags, + dmz_reclaim_kcopy_end, zrc); /* Wait for copy to complete */ wait_on_bit_io(&zrc->flags, DMZ_RECLAIM_KCOPY, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b0dd7027848b..20f7e4ef5342 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -609,7 +609,8 @@ static void start_io_acct(struct dm_io *io) io->start_time = jiffies; - generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0); + generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), + &dm_disk(md)->part0); atomic_set(&dm_disk(md)->part0.in_flight[rw], atomic_inc_return(&md->pending[rw])); @@ -628,7 +629,8 @@ static void end_io_acct(struct dm_io *io) int pending; int rw = bio_data_dir(bio); - generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time); + generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0, + io->start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index f983c3fdf204..2fc8c113977f 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -46,8 +46,8 @@ static inline char *bmname(struct bitmap *bitmap) * if we find our page, we increment the page's refcount so that it stays * allocated while we're using it */ -static int bitmap_checkpage(struct bitmap_counts *bitmap, - unsigned long page, int create, int no_hijack) +static int md_bitmap_checkpage(struct bitmap_counts *bitmap, + unsigned long page, int create, int no_hijack) __releases(bitmap->lock) __acquires(bitmap->lock) { @@ -115,7 +115,7 @@ __acquires(bitmap->lock) /* if page is completely empty, put it back on the free list, or dealloc it */ /* if page was hijacked, unmark the flag so it might get alloced next time */ /* Note: lock should be held when calling this */ -static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page) +static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page) { char *ptr; @@ -280,7 +280,7 @@ restart: return -EINVAL; } -static void bitmap_file_kick(struct bitmap *bitmap); +static void md_bitmap_file_kick(struct bitmap *bitmap); /* * write out a page to a file */ @@ -310,7 +310,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) atomic_read(&bitmap->pending_writes)==0); } if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) - bitmap_file_kick(bitmap); + md_bitmap_file_kick(bitmap); } static void end_bitmap_write(struct buffer_head *bh, int uptodate) @@ -421,11 +421,11 @@ out: */ /* - * bitmap_wait_writes() should be called before writing any bitmap + * md_bitmap_wait_writes() should be called before writing any bitmap * blocks, to ensure previous writes, particularly from - * bitmap_daemon_work(), have completed. + * md_bitmap_daemon_work(), have completed. */ -static void bitmap_wait_writes(struct bitmap *bitmap) +static void md_bitmap_wait_writes(struct bitmap *bitmap) { if (bitmap->storage.file) wait_event(bitmap->write_wait, @@ -443,7 +443,7 @@ static void bitmap_wait_writes(struct bitmap *bitmap) /* update the event counter and sync the superblock to disk */ -void bitmap_update_sb(struct bitmap *bitmap) +void md_bitmap_update_sb(struct bitmap *bitmap) { bitmap_super_t *sb; @@ -476,10 +476,10 @@ void bitmap_update_sb(struct bitmap *bitmap) kunmap_atomic(sb); write_page(bitmap, bitmap->storage.sb_page, 1); } -EXPORT_SYMBOL(bitmap_update_sb); +EXPORT_SYMBOL(md_bitmap_update_sb); /* print out the bitmap file superblock */ -void bitmap_print_sb(struct bitmap *bitmap) +void md_bitmap_print_sb(struct bitmap *bitmap) { bitmap_super_t *sb; @@ -518,7 +518,7 @@ void bitmap_print_sb(struct bitmap *bitmap) * * Returns: 0 on success, -Exxx on error */ -static int bitmap_new_disk_sb(struct bitmap *bitmap) +static int md_bitmap_new_disk_sb(struct bitmap *bitmap) { bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; @@ -577,7 +577,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) } /* read the superblock from the bitmap file and initialize some bitmap fields */ -static int bitmap_read_sb(struct bitmap *bitmap) +static int md_bitmap_read_sb(struct bitmap *bitmap) { char *reason = NULL; bitmap_super_t *sb; @@ -727,7 +727,7 @@ out_no_sb: bitmap->mddev->bitmap_info.space > sectors_reserved) bitmap->mddev->bitmap_info.space = sectors_reserved; if (err) { - bitmap_print_sb(bitmap); + md_bitmap_print_sb(bitmap); if (bitmap->cluster_slot < 0) md_cluster_stop(bitmap->mddev); } @@ -774,9 +774,9 @@ static inline struct page *filemap_get_page(struct bitmap_storage *store, return store->filemap[file_page_index(store, chunk)]; } -static int bitmap_storage_alloc(struct bitmap_storage *store, - unsigned long chunks, int with_super, - int slot_number) +static int md_bitmap_storage_alloc(struct bitmap_storage *store, + unsigned long chunks, int with_super, + int slot_number) { int pnum, offset = 0; unsigned long num_pages; @@ -830,7 +830,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store, return 0; } -static void bitmap_file_unmap(struct bitmap_storage *store) +static void md_bitmap_file_unmap(struct bitmap_storage *store) { struct page **map, *sb_page; int pages; @@ -862,12 +862,12 @@ static void bitmap_file_unmap(struct bitmap_storage *store) * then it is no longer reliable, so we stop using it and we mark the file * as failed in the superblock */ -static void bitmap_file_kick(struct bitmap *bitmap) +static void md_bitmap_file_kick(struct bitmap *bitmap) { char *path, *ptr = NULL; if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { - bitmap_update_sb(bitmap); + md_bitmap_update_sb(bitmap); if (bitmap->storage.file) { path = kmalloc(PAGE_SIZE, GFP_KERNEL); @@ -923,7 +923,7 @@ static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum, * we set the bit immediately, then we record the page number so that * when an unplug occurs, we can flush the dirty pages out to disk */ -static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) +static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) { unsigned long bit; struct page *page; @@ -952,7 +952,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); } -static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) +static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) { unsigned long bit; struct page *page; @@ -980,7 +980,7 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) } } -static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) +static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) { unsigned long bit; struct page *page; @@ -1005,7 +1005,7 @@ static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) /* this gets called when the md device is ready to unplug its underlying * (slave) device queues -- before we let any writes go down, we need to * sync the dirty pages of the bitmap file to disk */ -void bitmap_unplug(struct bitmap *bitmap) +void md_bitmap_unplug(struct bitmap *bitmap) { unsigned long i; int dirty, need_write; @@ -1025,7 +1025,7 @@ void bitmap_unplug(struct bitmap *bitmap) BITMAP_PAGE_NEEDWRITE); if (dirty || need_write) { if (!writing) { - bitmap_wait_writes(bitmap); + md_bitmap_wait_writes(bitmap); if (bitmap->mddev->queue) blk_add_trace_msg(bitmap->mddev->queue, "md bitmap_unplug"); @@ -1036,14 +1036,14 @@ void bitmap_unplug(struct bitmap *bitmap) } } if (writing) - bitmap_wait_writes(bitmap); + md_bitmap_wait_writes(bitmap); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) - bitmap_file_kick(bitmap); + md_bitmap_file_kick(bitmap); } -EXPORT_SYMBOL(bitmap_unplug); +EXPORT_SYMBOL(md_bitmap_unplug); -static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); +static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); /* * bitmap_init_from_disk -- called at bitmap_create time to initialize * the in-memory bitmap from the on-disk bitmap -- also, sets up the * memory mapping of the bitmap file @@ -1055,7 +1055,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n * We ignore all bits for sectors that end earlier than 'start'. * This is used when reading an out-of-date bitmap... */ -static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) +static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) { unsigned long i, chunks, index, oldindex, bit, node_offset = 0; struct page *page = NULL; @@ -1078,9 +1078,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) /* if the disk bit is set, set the memory bit */ int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift) >= start); - bitmap_set_memory_bits(bitmap, - (sector_t)i << bitmap->counts.chunkshift, - needed); + md_bitmap_set_memory_bits(bitmap, + (sector_t)i << bitmap->counts.chunkshift, + needed); } return 0; } @@ -1159,9 +1159,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) /* if the disk bit is set, set the memory bit */ int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift >= start); - bitmap_set_memory_bits(bitmap, - (sector_t)i << bitmap->counts.chunkshift, - needed); + md_bitmap_set_memory_bits(bitmap, + (sector_t)i << bitmap->counts.chunkshift, + needed); bit_cnt++; } offset = 0; @@ -1179,7 +1179,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) return ret; } -void bitmap_write_all(struct bitmap *bitmap) +void md_bitmap_write_all(struct bitmap *bitmap) { /* We don't actually write all bitmap blocks here, * just flag them as needing to be written @@ -1198,16 +1198,16 @@ void bitmap_write_all(struct bitmap *bitmap) bitmap->allclean = 0; } -static void bitmap_count_page(struct bitmap_counts *bitmap, - sector_t offset, int inc) +static void md_bitmap_count_page(struct bitmap_counts *bitmap, + sector_t offset, int inc) { sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; bitmap->bp[page].count += inc; - bitmap_checkfree(bitmap, page); + md_bitmap_checkfree(bitmap, page); } -static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset) +static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset) { sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; @@ -1217,16 +1217,16 @@ static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset) bp->pending = 1; } -static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, - sector_t offset, sector_t *blocks, - int create); +static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, + sector_t offset, sector_t *blocks, + int create); /* * bitmap daemon -- periodically wakes up to clean bits and flush pages * out to disk */ -void bitmap_daemon_work(struct mddev *mddev) +void md_bitmap_daemon_work(struct mddev *mddev) { struct bitmap *bitmap; unsigned long j; @@ -1301,10 +1301,8 @@ void bitmap_daemon_work(struct mddev *mddev) } counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0; } - bmc = bitmap_get_counter(counts, - block, - &blocks, 0); + bmc = md_bitmap_get_counter(counts, block, &blocks, 0); if (!bmc) { j |= PAGE_COUNTER_MASK; continue; @@ -1312,17 +1310,17 @@ void bitmap_daemon_work(struct mddev *mddev) if (*bmc == 1 && !bitmap->need_sync) { /* We can clear the bit */ *bmc = 0; - bitmap_count_page(counts, block, -1); - bitmap_file_clear_bit(bitmap, block); + md_bitmap_count_page(counts, block, -1); + md_bitmap_file_clear_bit(bitmap, block); } else if (*bmc && *bmc <= 2) { *bmc = 1; - bitmap_set_pending(counts, block); + md_bitmap_set_pending(counts, block); bitmap->allclean = 0; } } spin_unlock_irq(&counts->lock); - bitmap_wait_writes(bitmap); + md_bitmap_wait_writes(bitmap); /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. * DIRTY pages need to be written by bitmap_unplug so it can wait * for them. @@ -1352,9 +1350,9 @@ void bitmap_daemon_work(struct mddev *mddev) mutex_unlock(&mddev->bitmap_info.mutex); } -static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, - sector_t offset, sector_t *blocks, - int create) +static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, + sector_t offset, sector_t *blocks, + int create) __releases(bitmap->lock) __acquires(bitmap->lock) { @@ -1368,7 +1366,7 @@ __acquires(bitmap->lock) sector_t csize; int err; - err = bitmap_checkpage(bitmap, page, create, 0); + err = md_bitmap_checkpage(bitmap, page, create, 0); if (bitmap->bp[page].hijacked || bitmap->bp[page].map == NULL) @@ -1394,7 +1392,7 @@ __acquires(bitmap->lock) &(bitmap->bp[page].map[pageoff]); } -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) +int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) { if (!bitmap) return 0; @@ -1415,7 +1413,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect bitmap_counter_t *bmc; spin_lock_irq(&bitmap->counts.lock); - bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); + bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); if (!bmc) { spin_unlock_irq(&bitmap->counts.lock); return 0; @@ -1437,8 +1435,8 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect switch (*bmc) { case 0: - bitmap_file_set_bit(bitmap, offset); - bitmap_count_page(&bitmap->counts, offset, 1); + md_bitmap_file_set_bit(bitmap, offset); + md_bitmap_count_page(&bitmap->counts, offset, 1); /* fall through */ case 1: *bmc = 2; @@ -1456,10 +1454,10 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect } return 0; } -EXPORT_SYMBOL(bitmap_startwrite); +EXPORT_SYMBOL(md_bitmap_startwrite); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success, int behind) +void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind) { if (!bitmap) return; @@ -1477,7 +1475,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto bitmap_counter_t *bmc; spin_lock_irqsave(&bitmap->counts.lock, flags); - bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); + bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); if (!bmc) { spin_unlock_irqrestore(&bitmap->counts.lock, flags); return; @@ -1498,7 +1496,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto (*bmc)--; if (*bmc <= 2) { - bitmap_set_pending(&bitmap->counts, offset); + md_bitmap_set_pending(&bitmap->counts, offset); bitmap->allclean = 0; } spin_unlock_irqrestore(&bitmap->counts.lock, flags); @@ -1509,7 +1507,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto sectors = 0; } } -EXPORT_SYMBOL(bitmap_endwrite); +EXPORT_SYMBOL(md_bitmap_endwrite); static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded) @@ -1521,7 +1519,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t return 1; /* always resync if no bitmap */ } spin_lock_irq(&bitmap->counts.lock); - bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); + bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); rv = 0; if (bmc) { /* locked */ @@ -1539,8 +1537,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t return rv; } -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, - int degraded) +int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, + int degraded) { /* bitmap_start_sync must always report on multiples of whole * pages, otherwise resync (which is very PAGE_SIZE based) will @@ -1561,9 +1559,9 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, } return rv; } -EXPORT_SYMBOL(bitmap_start_sync); +EXPORT_SYMBOL(md_bitmap_start_sync); -void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) +void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) { bitmap_counter_t *bmc; unsigned long flags; @@ -1573,7 +1571,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i return; } spin_lock_irqsave(&bitmap->counts.lock, flags); - bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); + bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc == NULL) goto unlock; /* locked */ @@ -1584,7 +1582,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i *bmc |= NEEDED_MASK; else { if (*bmc <= 2) { - bitmap_set_pending(&bitmap->counts, offset); + md_bitmap_set_pending(&bitmap->counts, offset); bitmap->allclean = 0; } } @@ -1592,9 +1590,9 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i unlock: spin_unlock_irqrestore(&bitmap->counts.lock, flags); } -EXPORT_SYMBOL(bitmap_end_sync); +EXPORT_SYMBOL(md_bitmap_end_sync); -void bitmap_close_sync(struct bitmap *bitmap) +void md_bitmap_close_sync(struct bitmap *bitmap) { /* Sync has finished, and any bitmap chunks that weren't synced * properly have been aborted. It remains to us to clear the @@ -1605,13 +1603,13 @@ void bitmap_close_sync(struct bitmap *bitmap) if (!bitmap) return; while (sector < bitmap->mddev->resync_max_sectors) { - bitmap_end_sync(bitmap, sector, &blocks, 0); + md_bitmap_end_sync(bitmap, sector, &blocks, 0); sector += blocks; } } -EXPORT_SYMBOL(bitmap_close_sync); +EXPORT_SYMBOL(md_bitmap_close_sync); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) +void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) { sector_t s = 0; sector_t blocks; @@ -1633,15 +1631,15 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { - bitmap_end_sync(bitmap, s, &blocks, 0); + md_bitmap_end_sync(bitmap, s, &blocks, 0); s += blocks; } bitmap->last_end_sync = jiffies; sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); } -EXPORT_SYMBOL(bitmap_cond_end_sync); +EXPORT_SYMBOL(md_bitmap_cond_end_sync); -void bitmap_sync_with_cluster(struct mddev *mddev, +void md_bitmap_sync_with_cluster(struct mddev *mddev, sector_t old_lo, sector_t old_hi, sector_t new_lo, sector_t new_hi) { @@ -1649,20 +1647,20 @@ void bitmap_sync_with_cluster(struct mddev *mddev, sector_t sector, blocks = 0; for (sector = old_lo; sector < new_lo; ) { - bitmap_end_sync(bitmap, sector, &blocks, 0); + md_bitmap_end_sync(bitmap, sector, &blocks, 0); sector += blocks; } WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); for (sector = old_hi; sector < new_hi; ) { - bitmap_start_sync(bitmap, sector, &blocks, 0); + md_bitmap_start_sync(bitmap, sector, &blocks, 0); sector += blocks; } WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); } -EXPORT_SYMBOL(bitmap_sync_with_cluster); +EXPORT_SYMBOL(md_bitmap_sync_with_cluster); -static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) +static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) { /* For each chunk covered by any of these sectors, set the * counter to 2 and possibly set resync_needed. They should all @@ -1672,15 +1670,15 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n sector_t secs; bitmap_counter_t *bmc; spin_lock_irq(&bitmap->counts.lock); - bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1); + bmc = md_bitmap_get_counter(&bitmap->counts, offset, &secs, 1); if (!bmc) { spin_unlock_irq(&bitmap->counts.lock); return; } if (!*bmc) { *bmc = 2; - bitmap_count_page(&bitmap->counts, offset, 1); - bitmap_set_pending(&bitmap->counts, offset); + md_bitmap_count_page(&bitmap->counts, offset, 1); + md_bitmap_set_pending(&bitmap->counts, offset); bitmap->allclean = 0; } if (needed) @@ -1689,14 +1687,14 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n } /* dirty the memory and file bits for bitmap chunks "s" to "e" */ -void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) +void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) { unsigned long chunk; for (chunk = s; chunk <= e; chunk++) { sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; - bitmap_set_memory_bits(bitmap, sec, 1); - bitmap_file_set_bit(bitmap, sec); + md_bitmap_set_memory_bits(bitmap, sec, 1); + md_bitmap_file_set_bit(bitmap, sec); if (sec < bitmap->mddev->recovery_cp) /* We are asserting that the array is dirty, * so move the recovery_cp address back so @@ -1709,7 +1707,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) /* * flush out any pending updates */ -void bitmap_flush(struct mddev *mddev) +void md_bitmap_flush(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; long sleep; @@ -1722,18 +1720,18 @@ void bitmap_flush(struct mddev *mddev) */ sleep = mddev->bitmap_info.daemon_sleep * 2; bitmap->daemon_lastrun -= sleep; - bitmap_daemon_work(mddev); + md_bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; - bitmap_daemon_work(mddev); + md_bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; - bitmap_daemon_work(mddev); - bitmap_update_sb(bitmap); + md_bitmap_daemon_work(mddev); + md_bitmap_update_sb(bitmap); } /* * free memory that was allocated */ -void bitmap_free(struct bitmap *bitmap) +void md_bitmap_free(struct bitmap *bitmap) { unsigned long k, pages; struct bitmap_page *bp; @@ -1753,7 +1751,7 @@ void bitmap_free(struct bitmap *bitmap) atomic_read(&bitmap->pending_writes) == 0); /* release the bitmap file */ - bitmap_file_unmap(&bitmap->storage); + md_bitmap_file_unmap(&bitmap->storage); bp = bitmap->counts.bp; pages = bitmap->counts.pages; @@ -1767,9 +1765,9 @@ void bitmap_free(struct bitmap *bitmap) kfree(bp); kfree(bitmap); } -EXPORT_SYMBOL(bitmap_free); +EXPORT_SYMBOL(md_bitmap_free); -void bitmap_wait_behind_writes(struct mddev *mddev) +void md_bitmap_wait_behind_writes(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; @@ -1783,14 +1781,14 @@ void bitmap_wait_behind_writes(struct mddev *mddev) } } -void bitmap_destroy(struct mddev *mddev) +void md_bitmap_destroy(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) /* there was no bitmap */ return; - bitmap_wait_behind_writes(mddev); + md_bitmap_wait_behind_writes(mddev); mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1800,7 +1798,7 @@ void bitmap_destroy(struct mddev *mddev) if (mddev->thread) mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - bitmap_free(bitmap); + md_bitmap_free(bitmap); } /* @@ -1808,7 +1806,7 @@ void bitmap_destroy(struct mddev *mddev) * if this returns an error, bitmap_destroy must be called to do clean up * once mddev->bitmap is set */ -struct bitmap *bitmap_create(struct mddev *mddev, int slot) +struct bitmap *md_bitmap_create(struct mddev *mddev, int slot) { struct bitmap *bitmap; sector_t blocks = mddev->resync_max_sectors; @@ -1863,9 +1861,9 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot) * instructing us to create a new on-disk bitmap instance. */ if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags)) - err = bitmap_new_disk_sb(bitmap); + err = md_bitmap_new_disk_sb(bitmap); else - err = bitmap_read_sb(bitmap); + err = md_bitmap_read_sb(bitmap); } else { err = 0; if (mddev->bitmap_info.chunksize == 0 || @@ -1878,7 +1876,7 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot) goto error; bitmap->daemon_lastrun = jiffies; - err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); + err = md_bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); if (err) goto error; @@ -1891,11 +1889,11 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot) return bitmap; error: - bitmap_free(bitmap); + md_bitmap_free(bitmap); return ERR_PTR(err); } -int bitmap_load(struct mddev *mddev) +int md_bitmap_load(struct mddev *mddev) { int err = 0; sector_t start = 0; @@ -1915,10 +1913,10 @@ int bitmap_load(struct mddev *mddev) */ while (sector < mddev->resync_max_sectors) { sector_t blocks; - bitmap_start_sync(bitmap, sector, &blocks, 0); + md_bitmap_start_sync(bitmap, sector, &blocks, 0); sector += blocks; } - bitmap_close_sync(bitmap); + md_bitmap_close_sync(bitmap); if (mddev->degraded == 0 || bitmap->events_cleared == mddev->events) @@ -1927,7 +1925,7 @@ int bitmap_load(struct mddev *mddev) start = mddev->recovery_cp; mutex_lock(&mddev->bitmap_info.mutex); - err = bitmap_init_from_disk(bitmap, start); + err = md_bitmap_init_from_disk(bitmap, start); mutex_unlock(&mddev->bitmap_info.mutex); if (err) @@ -1940,29 +1938,29 @@ int bitmap_load(struct mddev *mddev) mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; md_wakeup_thread(mddev->thread); - bitmap_update_sb(bitmap); + md_bitmap_update_sb(bitmap); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) err = -EIO; out: return err; } -EXPORT_SYMBOL_GPL(bitmap_load); +EXPORT_SYMBOL_GPL(md_bitmap_load); struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) { int rv = 0; struct bitmap *bitmap; - bitmap = bitmap_create(mddev, slot); + bitmap = md_bitmap_create(mddev, slot); if (IS_ERR(bitmap)) { rv = PTR_ERR(bitmap); return ERR_PTR(rv); } - rv = bitmap_init_from_disk(bitmap, 0); + rv = md_bitmap_init_from_disk(bitmap, 0); if (rv) { - bitmap_free(bitmap); + md_bitmap_free(bitmap); return ERR_PTR(rv); } @@ -1973,7 +1971,7 @@ EXPORT_SYMBOL(get_bitmap_from_slot); /* Loads the bitmap associated with slot and copies the resync information * to our bitmap */ -int bitmap_copy_from_slot(struct mddev *mddev, int slot, +int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low, sector_t *high, bool clear_bits) { int rv = 0, i, j; @@ -1990,35 +1988,35 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot, counts = &bitmap->counts; for (j = 0; j < counts->chunks; j++) { block = (sector_t)j << counts->chunkshift; - if (bitmap_file_test_bit(bitmap, block)) { + if (md_bitmap_file_test_bit(bitmap, block)) { if (!lo) lo = block; hi = block; - bitmap_file_clear_bit(bitmap, block); - bitmap_set_memory_bits(mddev->bitmap, block, 1); - bitmap_file_set_bit(mddev->bitmap, block); + md_bitmap_file_clear_bit(bitmap, block); + md_bitmap_set_memory_bits(mddev->bitmap, block, 1); + md_bitmap_file_set_bit(mddev->bitmap, block); } } if (clear_bits) { - bitmap_update_sb(bitmap); + md_bitmap_update_sb(bitmap); /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ for (i = 0; i < bitmap->storage.file_pages; i++) if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); - bitmap_unplug(bitmap); + md_bitmap_unplug(bitmap); } - bitmap_unplug(mddev->bitmap); + md_bitmap_unplug(mddev->bitmap); *low = lo; *high = hi; return rv; } -EXPORT_SYMBOL_GPL(bitmap_copy_from_slot); +EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot); -void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) +void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap) { unsigned long chunk_kb; struct bitmap_counts *counts; @@ -2045,7 +2043,7 @@ void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) seq_printf(seq, "\n"); } -int bitmap_resize(struct bitmap *bitmap, sector_t blocks, +int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, int init) { /* If chunk_size is 0, choose an appropriate chunk size. @@ -2106,12 +2104,12 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); memset(&store, 0, sizeof(store)); if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) - ret = bitmap_storage_alloc(&store, chunks, - !bitmap->mddev->bitmap_info.external, - mddev_is_clustered(bitmap->mddev) - ? bitmap->cluster_slot : 0); + ret = md_bitmap_storage_alloc(&store, chunks, + !bitmap->mddev->bitmap_info.external, + mddev_is_clustered(bitmap->mddev) + ? bitmap->cluster_slot : 0); if (ret) { - bitmap_file_unmap(&store); + md_bitmap_file_unmap(&store); goto err; } @@ -2120,7 +2118,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, new_bp = kcalloc(pages, sizeof(*new_bp), GFP_KERNEL); ret = -ENOMEM; if (!new_bp) { - bitmap_file_unmap(&store); + md_bitmap_file_unmap(&store); goto err; } @@ -2134,7 +2132,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), sizeof(bitmap_super_t)); - bitmap_file_unmap(&bitmap->storage); + md_bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; old_counts = bitmap->counts; @@ -2154,7 +2152,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, if (mddev_is_clustered(bitmap->mddev)) { unsigned long page; for (page = 0; page < pages; page++) { - ret = bitmap_checkpage(&bitmap->counts, page, 1, 1); + ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1); if (ret) { unsigned long k; @@ -2184,27 +2182,23 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, bitmap_counter_t *bmc_old, *bmc_new; int set; - bmc_old = bitmap_get_counter(&old_counts, block, - &old_blocks, 0); + bmc_old = md_bitmap_get_counter(&old_counts, block, &old_blocks, 0); set = bmc_old && NEEDED(*bmc_old); if (set) { - bmc_new = bitmap_get_counter(&bitmap->counts, block, - &new_blocks, 1); + bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1); if (*bmc_new == 0) { /* need to set on-disk bits too. */ sector_t end = block + new_blocks; sector_t start = block >> chunkshift; start <<= chunkshift; while (start < end) { - bitmap_file_set_bit(bitmap, block); + md_bitmap_file_set_bit(bitmap, block); start += 1 << chunkshift; } *bmc_new = 2; - bitmap_count_page(&bitmap->counts, - block, 1); - bitmap_set_pending(&bitmap->counts, - block); + md_bitmap_count_page(&bitmap->counts, block, 1); + md_bitmap_set_pending(&bitmap->counts, block); } *bmc_new |= NEEDED_MASK; if (new_blocks < old_blocks) @@ -2225,18 +2219,15 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, int i; while (block < (chunks << chunkshift)) { bitmap_counter_t *bmc; - bmc = bitmap_get_counter(&bitmap->counts, block, - &new_blocks, 1); + bmc = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1); if (bmc) { /* new space. It needs to be resynced, so * we set NEEDED_MASK. */ if (*bmc == 0) { *bmc = NEEDED_MASK | 2; - bitmap_count_page(&bitmap->counts, - block, 1); - bitmap_set_pending(&bitmap->counts, - block); + md_bitmap_count_page(&bitmap->counts, block, 1); + md_bitmap_set_pending(&bitmap->counts, block); } } block += new_blocks; @@ -2247,14 +2238,14 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, spin_unlock_irq(&bitmap->counts.lock); if (!init) { - bitmap_unplug(bitmap); + md_bitmap_unplug(bitmap); bitmap->mddev->pers->quiesce(bitmap->mddev, 0); } ret = 0; err: return ret; } -EXPORT_SYMBOL_GPL(bitmap_resize); +EXPORT_SYMBOL_GPL(md_bitmap_resize); static ssize_t location_show(struct mddev *mddev, char *page) @@ -2298,7 +2289,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } if (mddev->pers) { mddev->pers->quiesce(mddev, 1); - bitmap_destroy(mddev); + md_bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); } mddev->bitmap_info.offset = 0; @@ -2337,18 +2328,18 @@ location_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers) { struct bitmap *bitmap; mddev->pers->quiesce(mddev, 1); - bitmap = bitmap_create(mddev, -1); + bitmap = md_bitmap_create(mddev, -1); if (IS_ERR(bitmap)) rv = PTR_ERR(bitmap); else { mddev->bitmap = bitmap; - rv = bitmap_load(mddev); + rv = md_bitmap_load(mddev); if (rv) mddev->bitmap_info.offset = 0; } mddev->pers->quiesce(mddev, 0); if (rv) { - bitmap_destroy(mddev); + md_bitmap_destroy(mddev); goto out; } } diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 5df35ca90f58..cfd7395de8fd 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -236,43 +236,43 @@ struct bitmap { /* the bitmap API */ /* these are used only by md/bitmap */ -struct bitmap *bitmap_create(struct mddev *mddev, int slot); -int bitmap_load(struct mddev *mddev); -void bitmap_flush(struct mddev *mddev); -void bitmap_destroy(struct mddev *mddev); +struct bitmap *md_bitmap_create(struct mddev *mddev, int slot); +int md_bitmap_load(struct mddev *mddev); +void md_bitmap_flush(struct mddev *mddev); +void md_bitmap_destroy(struct mddev *mddev); -void bitmap_print_sb(struct bitmap *bitmap); -void bitmap_update_sb(struct bitmap *bitmap); -void bitmap_status(struct seq_file *seq, struct bitmap *bitmap); +void md_bitmap_print_sb(struct bitmap *bitmap); +void md_bitmap_update_sb(struct bitmap *bitmap); +void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap); -int bitmap_setallbits(struct bitmap *bitmap); -void bitmap_write_all(struct bitmap *bitmap); +int md_bitmap_setallbits(struct bitmap *bitmap); +void md_bitmap_write_all(struct bitmap *bitmap); -void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); +void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); /* these are exported */ -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int behind); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, +int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int success, int behind); -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); -void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); -void bitmap_close_sync(struct bitmap *bitmap); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); -void bitmap_sync_with_cluster(struct mddev *mddev, - sector_t old_lo, sector_t old_hi, - sector_t new_lo, sector_t new_hi); - -void bitmap_unplug(struct bitmap *bitmap); -void bitmap_daemon_work(struct mddev *mddev); - -int bitmap_resize(struct bitmap *bitmap, sector_t blocks, - int chunksize, int init); +int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); +void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); +void md_bitmap_close_sync(struct bitmap *bitmap); +void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); +void md_bitmap_sync_with_cluster(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi); + +void md_bitmap_unplug(struct bitmap *bitmap); +void md_bitmap_daemon_work(struct mddev *mddev); + +int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, int init); struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); -int bitmap_copy_from_slot(struct mddev *mddev, int slot, - sector_t *lo, sector_t *hi, bool clear_bits); -void bitmap_free(struct bitmap *bitmap); -void bitmap_wait_behind_writes(struct mddev *mddev); +int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, + sector_t *lo, sector_t *hi, bool clear_bits); +void md_bitmap_free(struct bitmap *bitmap); +void md_bitmap_wait_behind_writes(struct mddev *mddev); #endif #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 021cbf9ef1bf..94329e03001e 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -304,15 +304,6 @@ static void recover_bitmaps(struct md_thread *thread) while (cinfo->recovery_map) { slot = fls64((u64)cinfo->recovery_map) - 1; - /* Clear suspend_area associated with the bitmap */ - spin_lock_irq(&cinfo->suspend_lock); - list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) - if (slot == s->slot) { - list_del(&s->list); - kfree(s); - } - spin_unlock_irq(&cinfo->suspend_lock); - snprintf(str, 64, "bitmap%04d", slot); bm_lockres = lockres_init(mddev, str, NULL, 1); if (!bm_lockres) { @@ -326,19 +317,35 @@ static void recover_bitmaps(struct md_thread *thread) str, ret); goto clear_bit; } - ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); + ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); if (ret) { pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); goto clear_bit; } + + /* Clear suspend_area associated with the bitmap */ + spin_lock_irq(&cinfo->suspend_lock); + list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) + if (slot == s->slot) { + list_del(&s->list); + kfree(s); + } + spin_unlock_irq(&cinfo->suspend_lock); + if (hi > 0) { if (lo < mddev->recovery_cp) mddev->recovery_cp = lo; /* wake up thread to continue resync in case resync * is not finished */ if (mddev->recovery_cp != MaxSector) { - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + /* + * clear the REMOTE flag since we will launch + * resync thread in current node. + */ + clear_bit(MD_RESYNCING_REMOTE, + &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); } } clear_bit: @@ -457,6 +464,11 @@ static void process_suspend_info(struct mddev *mddev, struct suspend_info *s; if (!hi) { + /* + * clear the REMOTE flag since resync or recovery is finished + * in remote node. + */ + clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery); remove_suspend_info(mddev, slot); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -480,9 +492,7 @@ static void process_suspend_info(struct mddev *mddev, * resync thread is running in another node, * so we don't need to do the resync again * with the same section */ - bitmap_sync_with_cluster(mddev, cinfo->sync_low, - cinfo->sync_hi, - lo, hi); + md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, cinfo->sync_hi, lo, hi); cinfo->sync_low = lo; cinfo->sync_hi = hi; @@ -585,6 +595,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) revalidate_disk(mddev->gendisk); break; case RESYNCING: + set_bit(MD_RESYNCING_REMOTE, &mddev->recovery); process_suspend_info(mddev, le32_to_cpu(msg->slot), le64_to_cpu(msg->low), le64_to_cpu(msg->high)); @@ -829,7 +840,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) } /* Read the disk bitmap sb and check if it needs recovery */ - ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false); + ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false); if (ret) { pr_warn("md-cluster: Could not gather bitmaps from slot %d", i); lockres_free(bm_lockres); @@ -1127,13 +1138,13 @@ static int cluster_check_sync_size(struct mddev *mddev) bm_lockres = lockres_init(mddev, str, NULL, 1); if (!bm_lockres) { pr_err("md-cluster: Cannot initialize %s\n", str); - bitmap_free(bitmap); + md_bitmap_free(bitmap); return -1; } bm_lockres->flags |= DLM_LKF_NOQUEUE; rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (!rv) - bitmap_update_sb(bitmap); + md_bitmap_update_sb(bitmap); lockres_free(bm_lockres); sb = kmap_atomic(bitmap->storage.sb_page); @@ -1141,11 +1152,11 @@ static int cluster_check_sync_size(struct mddev *mddev) sync_size = sb->sync_size; else if (sync_size != sb->sync_size) { kunmap_atomic(sb); - bitmap_free(bitmap); + md_bitmap_free(bitmap); return -1; } kunmap_atomic(sb); - bitmap_free(bitmap); + md_bitmap_free(bitmap); } return (my_sync_size == sync_size) ? 0 : -1; @@ -1265,8 +1276,18 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) static int resync_finish(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + + clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery); dlm_unlock_sync(cinfo->resync_lockres); - return resync_info_update(mddev, 0, 0); + + /* + * If resync thread is interrupted so we can't say resync is finished, + * another node will launch resync thread to continue. + */ + if (test_bit(MD_CLOSING, &mddev->flags)) + return 0; + else + return resync_info_update(mddev, 0, 0); } static int area_resyncing(struct mddev *mddev, int direction, @@ -1442,7 +1463,7 @@ static int gather_bitmaps(struct md_rdev *rdev) for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) { if (sn == (cinfo->slot_number - 1)) continue; - err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false); + err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false); if (err) { pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); goto out; diff --git a/drivers/md/md.c b/drivers/md/md.c index 994aed2f9dff..63ceabb4e020 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -204,10 +204,6 @@ static int start_readonly; */ static bool create_on_open = true; -/* bio_clone_mddev - * like bio_clone_bioset, but with a local bio set - */ - struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev) { @@ -335,6 +331,7 @@ EXPORT_SYMBOL(md_handle_request); static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); + const int sgrp = op_stat_group(bio_op(bio)); struct mddev *mddev = q->queuedata; unsigned int sectors; int cpu; @@ -363,8 +360,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) md_handle_request(mddev, bio); cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors); part_stat_unlock(); return BLK_QC_T_NONE; @@ -2571,7 +2568,7 @@ repeat: if (mddev->queue) blk_add_trace_msg(mddev->queue, "md md_update_sb"); rewrite: - bitmap_update_sb(mddev->bitmap); + md_bitmap_update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; @@ -4384,10 +4381,10 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) if (buf == end) break; } if (*end && !isspace(*end)) break; - bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); + md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); buf = skip_spaces(end); } - bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ + md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ out: mddev_unlock(mddev); return len; @@ -5615,7 +5612,7 @@ int md_run(struct mddev *mddev) (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { struct bitmap *bitmap; - bitmap = bitmap_create(mddev, -1); + bitmap = md_bitmap_create(mddev, -1); if (IS_ERR(bitmap)) { err = PTR_ERR(bitmap); pr_warn("%s: failed to create bitmap (%d)\n", @@ -5630,7 +5627,7 @@ int md_run(struct mddev *mddev) pers->free(mddev, mddev->private); mddev->private = NULL; module_put(pers->owner); - bitmap_destroy(mddev); + md_bitmap_destroy(mddev); goto abort; } if (mddev->queue) { @@ -5715,9 +5712,9 @@ static int do_md_run(struct mddev *mddev) err = md_run(mddev); if (err) goto out; - err = bitmap_load(mddev); + err = md_bitmap_load(mddev); if (err) { - bitmap_destroy(mddev); + md_bitmap_destroy(mddev); goto out; } @@ -5859,7 +5856,7 @@ static void __md_stop_writes(struct mddev *mddev) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - bitmap_flush(mddev); + md_bitmap_flush(mddev); if (mddev->ro == 0 && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || @@ -5881,7 +5878,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - bitmap_wait_behind_writes(mddev); + md_bitmap_wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -5894,7 +5891,7 @@ static void mddev_detach(struct mddev *mddev) static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - bitmap_destroy(mddev); + md_bitmap_destroy(mddev); mddev_detach(mddev); /* Ensure ->event_work is done */ flush_workqueue(md_misc_wq); @@ -6713,21 +6710,21 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (fd >= 0) { struct bitmap *bitmap; - bitmap = bitmap_create(mddev, -1); + bitmap = md_bitmap_create(mddev, -1); mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; - err = bitmap_load(mddev); + err = md_bitmap_load(mddev); } else err = PTR_ERR(bitmap); if (err) { - bitmap_destroy(mddev); + md_bitmap_destroy(mddev); fd = -1; } mddev_resume(mddev); } else if (fd < 0) { mddev_suspend(mddev); - bitmap_destroy(mddev); + md_bitmap_destroy(mddev); mddev_resume(mddev); } } @@ -7013,15 +7010,15 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - bitmap = bitmap_create(mddev, -1); + bitmap = md_bitmap_create(mddev, -1); mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; - rv = bitmap_load(mddev); + rv = md_bitmap_load(mddev); } else rv = PTR_ERR(bitmap); if (rv) - bitmap_destroy(mddev); + md_bitmap_destroy(mddev); mddev_resume(mddev); } else { /* remove the bitmap */ @@ -7046,7 +7043,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) md_cluster_ops->leave(mddev); } mddev_suspend(mddev); - bitmap_destroy(mddev); + md_bitmap_destroy(mddev); mddev_resume(mddev); mddev->bitmap_info.offset = 0; } @@ -7680,6 +7677,23 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) resync -= atomic_read(&mddev->recovery_active); if (resync == 0) { + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags) && + rdev->recovery_offset != MaxSector && + rdev->recovery_offset) { + seq_printf(seq, "\trecover=REMOTE"); + return 1; + } + if (mddev->reshape_position != MaxSector) + seq_printf(seq, "\treshape=REMOTE"); + else + seq_printf(seq, "\tresync=REMOTE"); + return 1; + } if (mddev->recovery_cp < MaxSector) { seq_printf(seq, "\tresync=PENDING"); return 1; @@ -7909,7 +7923,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } else seq_printf(seq, "\n "); - bitmap_status(seq, mddev->bitmap); + md_bitmap_status(seq, mddev->bitmap); seq_printf(seq, "\n"); } @@ -8046,8 +8060,7 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + - (int)part_stat_read(&disk->part0, sectors[1]) - + curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and @@ -8781,7 +8794,7 @@ void md_check_recovery(struct mddev *mddev) return; if (mddev->bitmap) - bitmap_daemon_work(mddev); + md_bitmap_daemon_work(mddev); if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { @@ -8918,7 +8931,7 @@ void md_check_recovery(struct mddev *mddev) * which has the bitmap stored on all devices. * So make sure all bitmap pages get written */ - bitmap_write_all(mddev->bitmap); + md_bitmap_write_all(mddev->bitmap); } INIT_WORK(&mddev->del_work, md_start_sync); queue_work(md_misc_wq, &mddev->del_work); @@ -9166,7 +9179,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (ret) pr_info("md-cluster: resize failed\n"); else - bitmap_update_sb(mddev->bitmap); + md_bitmap_update_sb(mddev->bitmap); } /* Check for change of roles in the active devices */ diff --git a/drivers/md/md.h b/drivers/md/md.h index 2d148bdaba74..8afd6bfdbfb9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -496,6 +496,7 @@ enum recovery_flags { MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */ MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */ MD_RECOVERY_WAIT, /* waiting for pers->start() to finish */ + MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; static inline int __must_check mddev_lock(struct mddev *mddev) diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 829b4ce057d8..0a3b8ae4a29c 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -69,9 +69,9 @@ static struct dm_block_validator index_validator = { */ #define BITMAP_CSUM_XOR 240779 -static void bitmap_prepare_for_write(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) +static void dm_bitmap_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) { struct disk_bitmap_header *disk_header = dm_block_data(b); @@ -81,9 +81,9 @@ static void bitmap_prepare_for_write(struct dm_block_validator *v, BITMAP_CSUM_XOR)); } -static int bitmap_check(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) +static int dm_bitmap_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) { struct disk_bitmap_header *disk_header = dm_block_data(b); __le32 csum_disk; @@ -108,8 +108,8 @@ static int bitmap_check(struct dm_block_validator *v, static struct dm_block_validator dm_sm_bitmap_validator = { .name = "sm_bitmap", - .prepare_for_write = bitmap_prepare_for_write, - .check = bitmap_check + .prepare_for_write = dm_bitmap_prepare_for_write, + .check = dm_bitmap_check, }; /*----------------------------------------------------------------*/ @@ -124,7 +124,7 @@ static void *dm_bitmap_data(struct dm_block *b) #define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL -static unsigned bitmap_word_used(void *addr, unsigned b) +static unsigned dm_bitmap_word_used(void *addr, unsigned b) { __le64 *words_le = addr; __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); @@ -170,7 +170,7 @@ static int sm_find_free(void *addr, unsigned begin, unsigned end, { while (begin < end) { if (!(begin & (ENTRIES_PER_WORD - 1)) && - bitmap_word_used(addr, begin)) { + dm_bitmap_word_used(addr, begin)) { begin += ENTRIES_PER_WORD; continue; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 8e05c1092aef..4e990246225e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -385,10 +385,10 @@ static void close_write(struct r1bio *r1_bio) r1_bio->behind_master_bio = NULL; } /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - test_bit(R1BIO_BehindIO, &r1_bio->state)); + md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + test_bit(R1BIO_BehindIO, &r1_bio->state)); md_write_end(r1_bio->mddev); } @@ -781,7 +781,7 @@ static int raid1_congested(struct mddev *mddev, int bits) static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - bitmap_unplug(conf->mddev->bitmap); + md_bitmap_unplug(conf->mddev->bitmap); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ @@ -1470,10 +1470,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, alloc_behind_master_bio(r1_bio, bio); } - bitmap_startwrite(bitmap, r1_bio->sector, - r1_bio->sectors, - test_bit(R1BIO_BehindIO, - &r1_bio->state)); + md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); first_clone = 0; } @@ -1880,8 +1878,7 @@ static void end_sync_write(struct bio *bio) long sectors_to_go = r1_bio->sectors; /* make sure these bits doesn't get cleared. */ do { - bitmap_end_sync(mddev->bitmap, s, - &sync_blocks, 1); + md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1); s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); @@ -2626,12 +2623,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * We can find the current addess in mddev->curr_resync */ if (mddev->curr_resync < max_sector) /* aborted */ - bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); else /* completed sync */ conf->fullsync = 0; - bitmap_close_sync(mddev->bitmap); + md_bitmap_close_sync(mddev->bitmap); close_sync(conf); if (mddev_is_clustered(mddev)) { @@ -2651,7 +2648,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* before building a request, check if we can skip these blocks.. * This call the bitmap_start_sync doesn't actually record anything */ - if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ *skipped = 1; @@ -2669,7 +2666,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * sector_nr + two times RESYNC_SECTORS */ - bitmap_cond_end_sync(mddev->bitmap, sector_nr, + md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); @@ -2828,8 +2825,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; if (sync_blocks == 0) { - if (!bitmap_start_sync(mddev->bitmap, sector_nr, - &sync_blocks, still_degraded) && + if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, + &sync_blocks, still_degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; @@ -3165,7 +3162,7 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > newsize) return -EINVAL; if (mddev->bitmap) { - int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0); + int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); if (ret) return ret; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 35bd3a62451b..981898049491 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -440,10 +440,10 @@ static void raid10_end_read_request(struct bio *bio) static void close_write(struct r10bio *r10_bio) { /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, - r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state), - 0); + md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, + r10_bio->sectors, + !test_bit(R10BIO_Degraded, &r10_bio->state), + 0); md_write_end(r10_bio->mddev); } @@ -917,7 +917,7 @@ static void flush_pending_writes(struct r10conf *conf) blk_start_plug(&plug); /* flush any pending bitmap writes to disk * before proceeding w/ I/O */ - bitmap_unplug(conf->mddev->bitmap); + md_bitmap_unplug(conf->mddev->bitmap); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ @@ -1102,7 +1102,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) /* we aren't scheduling, so we can do the write-out directly. */ bio = bio_list_get(&plug->pending); - bitmap_unplug(mddev->bitmap); + md_bitmap_unplug(mddev->bitmap); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ @@ -1519,7 +1519,7 @@ retry_write: } atomic_set(&r10_bio->remaining, 1); - bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); + md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); for (i = 0; i < conf->copies; i++) { if (r10_bio->devs[i].bio) @@ -2991,13 +2991,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); else for (i = 0; i < conf->geo.raid_disks; i++) { sector_t sect = raid10_find_virt(conf, mddev->curr_resync, i); - bitmap_end_sync(mddev->bitmap, sect, - &sync_blocks, 1); + md_bitmap_end_sync(mddev->bitmap, sect, + &sync_blocks, 1); } } else { /* completed sync */ @@ -3018,7 +3018,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } conf->fullsync = 0; } - bitmap_close_sync(mddev->bitmap); + md_bitmap_close_sync(mddev->bitmap); close_sync(conf); *skipped = 1; return sectors_skipped; @@ -3112,8 +3112,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * we only need to recover the block if it is set in * the bitmap */ - must_sync = bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, 1); + must_sync = md_bitmap_start_sync(mddev->bitmap, sect, + &sync_blocks, 1); if (sync_blocks < max_sync) max_sync = sync_blocks; if (!must_sync && @@ -3158,8 +3158,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } } - must_sync = bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, still_degraded); + must_sync = md_bitmap_start_sync(mddev->bitmap, sect, + &sync_blocks, still_degraded); any_working = 0; for (j=0; j<conf->copies;j++) { @@ -3335,13 +3335,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * safety reason, which ensures curr_resync_completed is * updated in bitmap_cond_end_sync. */ - bitmap_cond_end_sync(mddev->bitmap, sector_nr, - mddev_is_clustered(mddev) && - (sector_nr + 2 * RESYNC_SECTORS > - conf->cluster_sync_high)); + md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - if (!bitmap_start_sync(mddev->bitmap, sector_nr, - &sync_blocks, mddev->degraded) && + if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, + &sync_blocks, mddev->degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block */ @@ -4022,7 +4021,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > size) return -EINVAL; if (mddev->bitmap) { - int ret = bitmap_resize(mddev->bitmap, size, 0, 0); + int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0); if (ret) return ret; } @@ -4287,10 +4286,9 @@ static int raid10_start_reshape(struct mddev *mddev) spin_unlock_irq(&conf->device_lock); if (mddev->delta_disks && mddev->bitmap) { - ret = bitmap_resize(mddev->bitmap, - raid10_size(mddev, 0, - conf->geo.raid_disks), - 0, 0); + ret = md_bitmap_resize(mddev->bitmap, + raid10_size(mddev, 0, conf->geo.raid_disks), + 0, 0); if (ret) goto abort; } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 2b775abf377b..e6e925add700 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -324,10 +324,10 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, if (sh->dev[i].written) { set_bit(R5_UPTODATE, &sh->dev[i].flags); r5c_return_dev_pending_writes(conf, &sh->dev[i]); - bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, - !test_bit(STRIPE_DEGRADED, &sh->state), - 0); + md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, + !test_bit(STRIPE_DEGRADED, &sh->state), + 0); } } } @@ -717,7 +717,6 @@ static void r5c_disable_writeback_async(struct work_struct *work) static void r5l_submit_current_io(struct r5l_log *log) { struct r5l_io_unit *io = log->current_io; - struct bio *bio; struct r5l_meta_block *block; unsigned long flags; u32 crc; @@ -730,7 +729,6 @@ static void r5l_submit_current_io(struct r5l_log *log) block->meta_size = cpu_to_le32(io->meta_offset); crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); block->checksum = cpu_to_le32(crc); - bio = io->current_bio; log->current_io = NULL; spin_lock_irqsave(&log->io_list_lock, flags); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2031506a0ecd..4ce0d7502fad 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -409,16 +409,14 @@ void raid5_release_stripe(struct stripe_head *sh) md_wakeup_thread(conf->mddev->thread); return; slow_path: - local_irq_save(flags); /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ - if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { + if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) { INIT_LIST_HEAD(&list); hash = sh->hash_lock_index; do_release_stripe(conf, sh, &list); - spin_unlock(&conf->device_lock); + spin_unlock_irqrestore(&conf->device_lock, flags); release_inactive_stripe_list(conf, &list, hash); } - local_irq_restore(flags); } static inline void remove_hash(struct stripe_head *sh) @@ -3301,8 +3299,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, */ set_bit(STRIPE_BITMAP_PENDING, &sh->state); spin_unlock_irq(&sh->stripe_lock); - bitmap_startwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0); + md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0); spin_lock_irq(&sh->stripe_lock); clear_bit(STRIPE_BITMAP_PENDING, &sh->state); if (!sh->batch_head) { @@ -3392,8 +3390,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi = nextbi; } if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); + md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; @@ -3438,8 +3436,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, } } if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); + md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); /* If we were in the middle of a write the parity block might * still be locked - so just clear all R5_LOCKED flags */ @@ -3779,10 +3777,10 @@ returnbi: bio_endio(wbi); wbi = wbi2; } - bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, - !test_bit(STRIPE_DEGRADED, &sh->state), - 0); + md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, + !test_bit(STRIPE_DEGRADED, &sh->state), + 0); if (head_sh->batch_head) { sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -4521,6 +4519,12 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) s->failed++; if (rdev && !test_bit(Faulty, &rdev->flags)) do_recovery = 1; + else if (!rdev) { + rdev = rcu_dereference( + conf->disks[i].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags)) + do_recovery = 1; + } } if (test_bit(R5_InJournal, &dev->flags)) @@ -5539,10 +5543,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) for (d = 0; d < conf->raid_disks - conf->max_degraded; d++) - bitmap_startwrite(mddev->bitmap, - sh->sector, - STRIPE_SECTORS, - 0); + md_bitmap_startwrite(mddev->bitmap, + sh->sector, + STRIPE_SECTORS, + 0); sh->bm_seq = conf->seq_flush + 1; set_bit(STRIPE_BIT_DELAY, &sh->state); } @@ -6020,11 +6024,11 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (mddev->curr_resync < max_sector) /* aborted */ - bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); + md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); else /* completed sync */ conf->fullsync = 0; - bitmap_close_sync(mddev->bitmap); + md_bitmap_close_sync(mddev->bitmap); return 0; } @@ -6053,7 +6057,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && - !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && sync_blocks >= STRIPE_SECTORS) { /* we can skip this block, and probably more */ sync_blocks /= STRIPE_SECTORS; @@ -6061,7 +6065,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } - bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); + md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); if (sh == NULL) { @@ -6084,7 +6088,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } rcu_read_unlock(); - bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); + md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); set_bit(STRIPE_SYNC_REQUESTED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); @@ -6285,7 +6289,7 @@ static void raid5d(struct md_thread *thread) /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; spin_unlock_irq(&conf->device_lock); - bitmap_unplug(mddev->bitmap); + md_bitmap_unplug(mddev->bitmap); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; activate_bit_delay(conf, conf->temp_inactive_list); @@ -7741,7 +7745,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > newsize) return -EINVAL; if (mddev->bitmap) { - int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); + int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0); if (ret) return ret; } |