Diffstat (limited to 'drivers/md/bcache')
-rw-r--r--  drivers/md/bcache/Makefile    |   2
-rw-r--r--  drivers/md/bcache/alloc.c     |  23
-rw-r--r--  drivers/md/bcache/bcache.h    |   9
-rw-r--r--  drivers/md/bcache/bset.c      |  22
-rw-r--r--  drivers/md/bcache/bset.h      |   3
-rw-r--r--  drivers/md/bcache/btree.c     |  56
-rw-r--r--  drivers/md/bcache/btree.h     |   2
-rw-r--r--  drivers/md/bcache/closure.c   |  17
-rw-r--r--  drivers/md/bcache/debug.c     |   5
-rw-r--r--  drivers/md/bcache/journal.c   |  78
-rw-r--r--  drivers/md/bcache/request.c   |  29
-rw-r--r--  drivers/md/bcache/stats.c     |  10
-rw-r--r--  drivers/md/bcache/super.c     | 230
-rw-r--r--  drivers/md/bcache/sysfs.c     |  30
-rw-r--r--  drivers/md/bcache/writeback.c |   4
15 files changed, 339 insertions, 181 deletions
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index d26b35195825..fd714628da6a 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -5,5 +5,3 @@ obj-$(CONFIG_BCACHE)	+= bcache.o
 bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
 	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
 	util.o writeback.o
-
-CFLAGS_request.o	+= -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 6f776823b9ba..8bc1faf71ff2 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -67,6 +67,7 @@
 #include <linux/blkdev.h>
 #include <linux/kthread.h>
 #include <linux/random.h>
+#include <linux/sched/signal.h>
 #include <trace/events/bcache.h>
 
 #define MAX_OPEN_BUCKETS 128
@@ -377,7 +378,10 @@ retry_invalidate:
 		if (!fifo_full(&ca->free_inc))
 			goto retry_invalidate;
 
-		bch_prio_write(ca);
+		if (bch_prio_write(ca, false) < 0) {
+			ca->invalidate_needs_gc = 1;
+			wake_up_gc(ca->set);
+		}
 	}
 }
 out:
@@ -730,8 +734,21 @@ int bch_open_buckets_alloc(struct cache_set *c)
 
 int bch_cache_allocator_start(struct cache *ca)
 {
-	struct task_struct *k = kthread_run(bch_allocator_thread,
-					    ca, "bcache_allocator");
+	struct task_struct *k;
+
+	/*
+	 * If the previous btree check operation occupied too much
+	 * system memory for the bcache btree node cache, the
+	 * registering process may be selected by the OOM killer.
+	 * Just ignore any SIGKILL sent by the OOM killer, to keep
+	 * kthread_run() from failing due to pending signals. The
+	 * bcache registering process will exit after the
+	 * registration is done.
+	 */
+	if (signal_pending(current))
+		flush_signals(current);
+
+	k = kthread_run(bch_allocator_thread, ca, "bcache_allocator");
 	if (IS_ERR(k))
 		return PTR_ERR(k);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 013e35a9e317..74a9849ea164 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -301,6 +301,7 @@ struct cached_dev {
 	struct block_device	*bdev;
 
 	struct cache_sb		sb;
+	struct cache_sb_disk	*sb_disk;
 	struct bio		sb_bio;
 	struct bio_vec		sb_bv[1];
 	struct closure		sb_write;
@@ -329,6 +330,9 @@ struct cached_dev {
 	 */
 	atomic_t		has_dirty;
 
+#define BCH_CACHE_READA_ALL		0
+#define BCH_CACHE_READA_META_ONLY	1
+	unsigned int		cache_readahead_policy;
 	struct bch_ratelimit	writeback_rate;
 	struct delayed_work	writeback_rate_update;
 
@@ -403,6 +407,7 @@ enum alloc_reserve {
 struct cache {
 	struct cache_set	*set;
 	struct cache_sb		sb;
+	struct cache_sb_disk	*sb_disk;
 	struct bio		sb_bio;
 	struct bio_vec		sb_bv[1];
 
@@ -582,6 +587,7 @@ struct cache_set {
 	 */
 	wait_queue_head_t	btree_cache_wait;
 	struct task_struct	*btree_cache_alloc_lock;
+	spinlock_t		btree_cannibalize_lock;
 
 	/*
 	 * When we free a btree node, we increment the gen of the bucket the
@@ -723,6 +729,7 @@ struct cache_set {
 	unsigned int		gc_always_rewrite:1;
 	unsigned int		shrinker_disabled:1;
 	unsigned int		copy_gc_enabled:1;
+	unsigned int		idle_max_writeback_rate_enabled:1;
 
 #define BUCKET_HASH_BITS	12
 	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
@@ -977,7 +984,7 @@ bool bch_cached_dev_error(struct cached_dev *dc);
 __printf(2, 3)
 bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...);
 
-void bch_prio_write(struct cache *ca);
+int bch_prio_write(struct cache *ca, bool wait);
 void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);
 
 extern struct workqueue_struct *bcache_wq;
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 08768796b543..4385303836d8 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -155,6 +155,7 @@ int __bch_keylist_realloc(struct keylist *l, unsigned int u64s)
 	return 0;
 }
 
+/* Pop the top key of keylist by pointing l->top to its previous key */
 struct bkey *bch_keylist_pop(struct keylist *l)
 {
 	struct bkey *k = l->keys;
@@ -168,6 +169,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
 	return l->top = k;
 }
 
+/* Pop the bottom key of keylist and update l->top_p */
 void bch_keylist_pop_front(struct keylist *l)
 {
 	l->top_p -= bkey_u64s(l->keys);
@@ -309,7 +311,6 @@ void bch_btree_keys_free(struct btree_keys *b)
 	t->tree = NULL;
 	t->data = NULL;
 }
-EXPORT_SYMBOL(bch_btree_keys_free);
 
 int bch_btree_keys_alloc(struct btree_keys *b,
 			 unsigned int page_order,
@@ -342,7 +343,6 @@ err:
 	bch_btree_keys_free(b);
 	return -ENOMEM;
 }
-EXPORT_SYMBOL(bch_btree_keys_alloc);
 
 void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
 			 bool *expensive_debug_checks)
@@ -361,7 +361,6 @@ void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
 	 * any more.
 	 */
 }
-EXPORT_SYMBOL(bch_btree_keys_init);
 
 /* Binary tree stuff for auxiliary search trees */
 
@@ -678,7 +677,6 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
 
 	bch_bset_build_unwritten_tree(b);
 }
-EXPORT_SYMBOL(bch_bset_init_next);
 
 /*
  * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to
@@ -732,7 +730,6 @@ void bch_bset_build_written_tree(struct btree_keys *b)
 	     j = inorder_next(j, t->size))
 		make_bfloat(t, j);
 }
-EXPORT_SYMBOL(bch_bset_build_written_tree);
 
 /* Insert */
 
@@ -780,7 +777,6 @@ fix_right:	do {
 			j = j * 2 + 1;
 		} while (j < t->size);
 }
-EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
 
 static void bch_bset_fix_lookup_table(struct btree_keys *b,
 				      struct bset_tree *t,
@@ -855,7 +851,6 @@ bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
 
 	return b->ops->key_merge(b, l, r);
 }
-EXPORT_SYMBOL(bch_bkey_try_merge);
 
 void bch_bset_insert(struct btree_keys *b, struct bkey *where,
 		     struct bkey *insert)
@@ -875,7 +870,6 @@ void bch_bset_insert(struct btree_keys *b, struct bkey *where,
 	bkey_copy(where, insert);
 	bch_bset_fix_lookup_table(b, t, where);
 }
-EXPORT_SYMBOL(bch_bset_insert);
 
 unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
 			      struct bkey *replace_key)
@@ -931,7 +925,6 @@ copy:	bkey_copy(m, k);
 merged:
 	return status;
 }
-EXPORT_SYMBOL(bch_btree_insert_key);
 
 /* Lookup */
 
@@ -1077,7 +1070,6 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
 
 	return i.l;
 }
-EXPORT_SYMBOL(__bch_bset_search);
 
 /* Btree iterator */
 
@@ -1132,7 +1124,6 @@ struct bkey *bch_btree_iter_init(struct btree_keys *b,
 {
 	return __bch_btree_iter_init(b, iter, search, b->set);
 }
-EXPORT_SYMBOL(bch_btree_iter_init);
 
 static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
 						 btree_iter_cmp_fn *cmp)
@@ -1165,7 +1156,6 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
 
 	return __bch_btree_iter_next(iter, btree_iter_cmp);
 }
-EXPORT_SYMBOL(bch_btree_iter_next);
 
 struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
 					struct btree_keys *b, ptr_filter_fn fn)
@@ -1196,7 +1186,6 @@ int bch_bset_sort_state_init(struct bset_sort_state *state,
 
 	return mempool_init_page_pool(&state->pool, 1, page_order);
 }
-EXPORT_SYMBOL(bch_bset_sort_state_init);
 
 static void btree_mergesort(struct btree_keys *b, struct bset *out,
 			    struct btree_iter *iter,
@@ -1268,6 +1257,11 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
 		 * Our temporary buffer is the same size as the btree node's
 		 * buffer, we can just swap buffers instead of doing a big
 		 * memcpy()
+		 *
+		 * Don't worry even if 'out' is allocated from the mempool, it
+		 * can still be swapped here, because state->pool is a page
+		 * mempool created by mempool_init_page_pool(), which really
+		 * allocates pages with alloc_pages().
 		 */
 
 		out->magic	= b->set->data->magic;
@@ -1313,7 +1307,6 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
 
 	EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
 }
-EXPORT_SYMBOL(bch_btree_sort_partial);
 
 void bch_btree_sort_and_fix_extents(struct btree_keys *b,
 				    struct btree_iter *iter,
@@ -1366,7 +1359,6 @@ void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
 out:
 	bch_bset_build_written_tree(b);
 }
-EXPORT_SYMBOL(bch_btree_sort_lazy);
 
 void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
 {
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index c71365e7c1fa..a50dcfda656f 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -397,7 +397,8 @@ void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *state);
 
 /* Bkey utility code */
 
-#define bset_bkey_last(i)	bkey_idx((struct bkey *) (i)->d, (i)->keys)
+#define bset_bkey_last(i)	bkey_idx((struct bkey *) (i)->d, \
+					 (unsigned int)(i)->keys)
 
 static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx)
 {
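A note on the bset.c hunk above: the new comment in __btree_sort() relies on a property of page mempools that is easy to miss. A pool created by mempool_init_page_pool() allocates and frees plain pages via alloc_pages()/__free_pages(), so the page handed back to the pool does not need to be the same page that was taken out of it. A minimal sketch of the idea, assuming an order-0 pool; swap_with_pool_buf() and bufp are illustrative names, not bcache code:

#include <linux/mempool.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static void *swap_with_pool_buf(mempool_t *pool, void **bufp)
{
	/* mempool_alloc() on a page pool really calls alloc_pages() */
	void *out = page_address(mempool_alloc(pool, GFP_NOIO));

	/* ... fill 'out', e.g. with the merge-sorted keys ... */

	swap(out, *bufp);			/* keep 'out', give up the old buffer */
	mempool_free(virt_to_page(out), pool);	/* freed via __free_pages() */
	return *bufp;
}

This is exactly why __btree_sort() can return the btree node's original buffer to state->pool after the swap instead of doing a large memcpy().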
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index ba434d9ac720..b12186c87f52 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -34,6 +34,7 @@
 #include <linux/random.h>
 #include <linux/rcupdate.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/signal.h>
 #include <linux/rculist.h>
 #include <linux/delay.h>
 #include <trace/events/bcache.h>
@@ -543,6 +544,11 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 
 	set_btree_node_dirty(b);
 
+	/*
+	 * w->journal is always the oldest journal pin of all bkeys
+	 * in the leaf node, to make sure the oldest jset seq won't
+	 * be increased before this btree node is flushed.
+	 */
 	if (journal_ref) {
 		if (w->journal &&
 		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
@@ -723,38 +729,38 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 	 * IO can always make forward progress:
 	 */
 	nr /= c->btree_pages;
+	if (nr == 0)
+		nr = 1;
 	nr = min_t(unsigned long, nr, mca_can_free(c));
 
 	i = 0;
 	btree_cache_used = c->btree_cache_used;
-	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+	list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) {
 		if (nr <= 0)
 			goto out;
 
-		if (++i > 3 &&
-		    !mca_reap(b, 0, false)) {
+		if (!mca_reap(b, 0, false)) {
 			mca_data_free(b);
 			rw_unlock(true, b);
 			freed++;
 		}
 		nr--;
+		i++;
 	}
 
-	for (;  (nr--) && i < btree_cache_used; i++) {
-		if (list_empty(&c->btree_cache))
+	list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+		if (nr <= 0 || i >= btree_cache_used)
 			goto out;
 
-		b = list_first_entry(&c->btree_cache, struct btree, list);
-		list_rotate_left(&c->btree_cache);
-
-		if (!b->accessed &&
-		    !mca_reap(b, 0, false)) {
+		if (!mca_reap(b, 0, false)) {
 			mca_bucket_free(b);
 			mca_data_free(b);
 			rw_unlock(true, b);
 			freed++;
-		} else
-			b->accessed = 0;
+		}
+
+		nr--;
+		i++;
 	}
 out:
 	mutex_unlock(&c->bucket_lock);
@@ -884,15 +890,17 @@ out:
 
 static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
 {
-	struct task_struct *old;
-
-	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
-	if (old && old != current) {
+	spin_lock(&c->btree_cannibalize_lock);
+	if (likely(c->btree_cache_alloc_lock == NULL)) {
+		c->btree_cache_alloc_lock = current;
+	} else if (c->btree_cache_alloc_lock != current) {
 		if (op)
 			prepare_to_wait(&c->btree_cache_wait, &op->wait,
 					TASK_UNINTERRUPTIBLE);
+		spin_unlock(&c->btree_cannibalize_lock);
 		return -EINTR;
 	}
+	spin_unlock(&c->btree_cannibalize_lock);
 
 	return 0;
 }
@@ -927,10 +935,12 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
  */
 static void bch_cannibalize_unlock(struct cache_set *c)
 {
+	spin_lock(&c->btree_cannibalize_lock);
 	if (c->btree_cache_alloc_lock == current) {
 		c->btree_cache_alloc_lock = NULL;
 		wake_up(&c->btree_cache_wait);
 	}
+	spin_unlock(&c->btree_cannibalize_lock);
 }
 
 static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
@@ -1058,7 +1068,6 @@ retry:
 	BUG_ON(!b->written);
 
 	b->parent = parent;
-	b->accessed = 1;
 
 	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
 		prefetch(b->keys.set[i].tree);
@@ -1149,7 +1158,6 @@ retry:
 		goto retry;
 	}
 
-	b->accessed = 1;
 	b->parent = parent;
 	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
 
@@ -1906,6 +1914,18 @@ static int bch_gc_thread(void *arg)
 
 int bch_gc_thread_start(struct cache_set *c)
 {
+	/*
+	 * If the previous btree check operation occupied too much
+	 * system memory for the bcache btree node cache, the
+	 * registering process may be selected by the OOM killer.
+	 * Just ignore any SIGKILL sent by the OOM killer, to keep
+	 * kthread_run() from failing due to pending signals. The
+	 * bcache registering process will exit after the
+	 * registration is done.
+	 */
+	if (signal_pending(current))
+		flush_signals(current);
+
 	c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
 	return PTR_ERR_OR_ZERO(c->gc_thread);
 }
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 76cfd121a486..f4dcca449391 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -121,8 +121,6 @@ struct btree {
 	/* Key/pointer for this btree node */
 	BKEY_PADDED(key);
 
-	/* Single bit - set when accessed, cleared by shrinker */
-	unsigned long accessed;
 	unsigned long seq;
 	struct rw_semaphore lock;
 	struct cache_set *c;
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 73f5319295bc..0164a1fe94a9 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -45,7 +45,6 @@ void closure_sub(struct closure *cl, int v)
 {
 	closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
 }
-EXPORT_SYMBOL(closure_sub);
 
 /*
  * closure_put - decrement a closure's refcount
@@ -54,7 +53,6 @@ void closure_put(struct closure *cl)
 {
 	closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
 }
-EXPORT_SYMBOL(closure_put);
 
 /*
  * closure_wake_up - wake up all closures on a wait list, without memory barrier
@@ -76,7 +74,6 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
 		closure_sub(cl, CLOSURE_WAITING + 1);
 	}
 }
-EXPORT_SYMBOL(__closure_wake_up);
 
 /**
  * closure_wait - add a closure to a waitlist
@@ -96,7 +93,6 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
 
 	return true;
 }
-EXPORT_SYMBOL(closure_wait);
 
 struct closure_syncer {
 	struct task_struct	*task;
@@ -105,8 +101,14 @@ struct closure_syncer {
 
 static void closure_sync_fn(struct closure *cl)
 {
-	cl->s->done = 1;
-	wake_up_process(cl->s->task);
+	struct closure_syncer *s = cl->s;
+	struct task_struct *p;
+
+	rcu_read_lock();
+	p = READ_ONCE(s->task);
+	s->done = 1;
+	wake_up_process(p);
+	rcu_read_unlock();
 }
 
 void __sched __closure_sync(struct closure *cl)
@@ -125,7 +127,6 @@ void __sched __closure_sync(struct closure *cl)
 
 	__set_current_state(TASK_RUNNING);
 }
-EXPORT_SYMBOL(__closure_sync);
 
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 
@@ -143,7 +144,6 @@ void closure_debug_create(struct closure *cl)
 	list_add(&cl->all, &closure_list);
 	spin_unlock_irqrestore(&closure_list_lock, flags);
 }
-EXPORT_SYMBOL(closure_debug_create);
 
 void closure_debug_destroy(struct closure *cl)
 {
@@ -156,7 +156,6 @@ void closure_debug_destroy(struct closure *cl)
 	list_del(&cl->all);
 	spin_unlock_irqrestore(&closure_list_lock, flags);
 }
-EXPORT_SYMBOL(closure_debug_destroy);
 
 static struct dentry *closure_debug;
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 8b123be05254..336f43910383 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -178,10 +178,9 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
 	while (size) {
 		struct keybuf_key *w;
 		unsigned int bytes = min(i->bytes, size);
-		int err = copy_to_user(buf, i->buf, bytes);
 
-		if (err)
-			return err;
+		if (copy_to_user(buf, i->buf, bytes))
+			return -EFAULT;
 
 		ret += bytes;
 		buf += bytes;
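A note on the closure.c hunk above: once closure_sync_fn() publishes done = 1, the waiter in __closure_sync() may return, so its on-stack closure_syncer can be reused and the waiting task may even exit. The fix therefore loads the task pointer before publishing 'done' and performs the wake-up under rcu_read_lock(), so the task_struct cannot be freed underneath the waker. A hedged sketch of the pattern, with struct my_syncer standing in for struct closure_syncer:

#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/compiler.h>

struct my_syncer {
	struct task_struct	*task;
	int			done;
};

static void wake_waiter(struct my_syncer *s)
{
	struct task_struct *p;

	rcu_read_lock();
	p = READ_ONCE(s->task);	/* load before publishing 'done' */
	s->done = 1;		/* after this, 's' may already be gone */
	wake_up_process(p);	/* safe: task_struct is RCU protected */
	rcu_read_unlock();
}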
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index be2a2a201603..0e3ff9745ac7 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -420,7 +420,10 @@ err:
 static void btree_flush_write(struct cache_set *c)
 {
 	struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR];
-	unsigned int i, n;
+	unsigned int i, nr;
+	int ref_nr;
+	atomic_t *fifo_front_p, *now_fifo_front_p;
+	size_t mask;
 
 	if (c->journal.btree_flushing)
 		return;
@@ -433,12 +436,50 @@ static void btree_flush_write(struct cache_set *c)
 	c->journal.btree_flushing = true;
 	spin_unlock(&c->journal.flush_write_lock);
 
+	/* get the oldest journal entry and check its refcount */
+	spin_lock(&c->journal.lock);
+	fifo_front_p = &fifo_front(&c->journal.pin);
+	ref_nr = atomic_read(fifo_front_p);
+	if (ref_nr <= 0) {
+		/*
+		 * do nothing if no btree node references
+		 * the oldest journal entry
+		 */
+		spin_unlock(&c->journal.lock);
+		goto out;
+	}
+	spin_unlock(&c->journal.lock);
+
+	mask = c->journal.pin.mask;
+	nr = 0;
 	atomic_long_inc(&c->flush_write);
 	memset(btree_nodes, 0, sizeof(btree_nodes));
-	n = 0;
 
 	mutex_lock(&c->bucket_lock);
 	list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+		/*
+		 * It is safe to get now_fifo_front_p without holding
+		 * c->journal.lock here, because we don't need an exact
+		 * value, only to check whether the front pointer of
+		 * c->journal.pin has changed.
+		 */
+		now_fifo_front_p = &fifo_front(&c->journal.pin);
+		/*
+		 * If the oldest journal entry is reclaimed and the front
+		 * pointer of c->journal.pin changes, there is no need to
+		 * keep scanning c->btree_cache; just quit the loop and
+		 * flush out what we already have.
+		 */
+		if (now_fifo_front_p != fifo_front_p)
+			break;
+		/*
+		 * quit this loop if all matching btree nodes are
+		 * scanned and recorded in btree_nodes[] already.
+		 */
+		ref_nr = atomic_read(fifo_front_p);
+		if (nr >= ref_nr)
+			break;
+
 		if (btree_node_journal_flush(b))
 			pr_err("BUG: flush_write bit should not be set here!");
 
@@ -454,17 +495,43 @@ static void btree_flush_write(struct cache_set *c)
 			continue;
 		}
 
+		/*
+		 * Only select the btree node which exactly references
+		 * the oldest journal entry.
+		 *
+		 * If the journal entry pointed to by fifo_front_p is
+		 * reclaimed in parallel, don't worry:
+		 * - the list_for_each_xxx loop will quit when checking
+		 *   the next now_fifo_front_p.
+		 * - If there are matched nodes recorded in btree_nodes[],
+		 *   they are clean now (this is why and how the oldest
+		 *   journal entry can be reclaimed). These selected nodes
+		 *   will be ignored and skipped in the following for-loop.
+		 */
+		if (((btree_current_write(b)->journal - fifo_front_p) &
+		     mask) != 0) {
+			mutex_unlock(&b->write_lock);
+			continue;
+		}
+
 		set_btree_node_journal_flush(b);
 
 		mutex_unlock(&b->write_lock);
 
-		btree_nodes[n++] = b;
-		if (n == BTREE_FLUSH_NR)
+		btree_nodes[nr++] = b;
+		/*
+		 * To avoid holding c->bucket_lock for too long, scan
+		 * for at most BTREE_FLUSH_NR matched btree nodes. If
+		 * more btree nodes reference the oldest journal entry,
+		 * flush them the next time btree_flush_write() is
+		 * called.
+		 */
+		if (nr == BTREE_FLUSH_NR)
 			break;
 	}
 	mutex_unlock(&c->bucket_lock);
 
-	for (i = 0; i < n; i++) {
+	for (i = 0; i < nr; i++) {
 		b = btree_nodes[i];
 		if (!b) {
 			pr_err("BUG: btree_nodes[%d] is NULL", i);
@@ -497,6 +564,7 @@ static void btree_flush_write(struct cache_set *c)
 		mutex_unlock(&b->write_lock);
 	}
 
+out:
 	spin_lock(&c->journal.flush_write_lock);
 	c->journal.btree_flushing = false;
 	spin_unlock(&c->journal.flush_write_lock);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 41adcd1546f1..820d8402a1dc 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -62,18 +62,6 @@ static void bch_data_insert_keys(struct closure *cl)
 	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
 	int ret;
 
-	/*
-	 * If we're looping, might already be waiting on
-	 * another journal write - can't wait on more than one journal write at
-	 * a time
-	 *
-	 * XXX: this looks wrong
-	 */
-#if 0
-	while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
-		closure_sync(&s->cl);
-#endif
-
 	if (!op->replace)
 		journal_ref = bch_journal(op->c, &op->insert_keys,
 					  op->flush_journal ? cl : NULL);
@@ -391,13 +379,20 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 		goto skip;
 
 	/*
-	 * Flag for bypass if the IO is for read-ahead or background,
-	 * unless the read-ahead request is for metadata
+	 * If the bio is for read-ahead or background IO, whether it is
+	 * bypassed depends on the following conditions:
+	 * - If the IO is for metadata, always cache it and do not bypass.
+	 * - If the IO is not for metadata, check dc->cache_readahead_policy:
+	 *      BCH_CACHE_READA_ALL: cache it and do not bypass.
+	 *      BCH_CACHE_READA_META_ONLY: do not cache it and bypass.
+	 * That is, read-ahead requests for metadata always get cached
 	 * (eg, for gfs2 or xfs).
 	 */
-	if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
-	    !(bio->bi_opf & (REQ_META|REQ_PRIO)))
-		goto skip;
+	if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) {
+		if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
+		    (dc->cache_readahead_policy != BCH_CACHE_READA_ALL))
+			goto skip;
+	}
 
 	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
 	    bio_sectors(bio) & (c->sb.block_size - 1)) {
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index ba1c93791d8d..503aafe188dc 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -109,9 +109,13 @@ int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
 
 void bch_cache_accounting_clear(struct cache_accounting *acc)
 {
-	memset(&acc->total.cache_hits,
-	       0,
-	       sizeof(struct cache_stats));
+	acc->total.cache_hits = 0;
+	acc->total.cache_misses = 0;
+	acc->total.cache_bypass_hits = 0;
+	acc->total.cache_bypass_misses = 0;
+	acc->total.cache_readaheads = 0;
+	acc->total.cache_miss_collisions = 0;
+	acc->total.sectors_bypassed = 0;
 }
 
 void bch_cache_accounting_destroy(struct cache_accounting *acc)
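A note on the journal.c hunk above: the node-selection test in btree_flush_write() depends on the journal pin FIFO having a power-of-two size, so the distance between two entry pointers masked by c->journal.pin.mask is zero exactly when they refer to the same slot. A sketch of the test, where pins_oldest_entry() is an illustrative helper, not a bcache function:

#include <linux/types.h>
#include <linux/atomic.h>

/*
 * 'journal_pin' is a btree node's journal pin (an atomic_t inside the
 * ring), 'fifo_front_p' is the pin of the oldest journal entry, and
 * 'mask' is the ring size minus one.
 */
static inline bool pins_oldest_entry(atomic_t *journal_pin,
				     atomic_t *fifo_front_p, size_t mask)
{
	return ((journal_pin - fifo_front_p) & mask) == 0;
}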
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 20ed838e9413..0c3c5419c52b 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -15,7 +15,6 @@
 #include "writeback.h"
 
 #include <linux/blkdev.h>
-#include <linux/buffer_head.h>
 #include <linux/debugfs.h>
 #include <linux/genhd.h>
 #include <linux/idr.h>
@@ -60,17 +59,18 @@ struct workqueue_struct *bch_journal_wq;
 /* Superblock */
 
 static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
-			      struct page **res)
+			      struct cache_sb_disk **res)
 {
 	const char *err;
-	struct cache_sb *s;
-	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
+	struct cache_sb_disk *s;
+	struct page *page;
 	unsigned int i;
 
-	if (!bh)
+	page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+				   SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
+	if (IS_ERR(page))
 		return "IO error";
-
-	s = (struct cache_sb *) bh->b_data;
+	s = page_address(page) + offset_in_page(SB_OFFSET);
 
 	sb->offset		= le64_to_cpu(s->offset);
 	sb->version		= le64_to_cpu(s->version);
@@ -92,10 +92,11 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
 	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
 		 sb->version, sb->flags, sb->seq, sb->keys);
 
-	err = "Not a bcache superblock";
+	err = "Not a bcache superblock (bad offset)";
 	if (sb->offset != SB_SECTOR)
 		goto err;
 
+	err = "Not a bcache superblock (bad magic)";
 	if (memcmp(sb->magic, bcache_magic, 16))
 		goto err;
 
@@ -187,12 +188,10 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
 	}
 
 	sb->last_mount = (u32)ktime_get_real_seconds();
-	err = NULL;
-
-	get_page(bh->b_page);
-	*res = bh->b_page;
+	*res = s;
+	return NULL;
 err:
-	put_bh(bh);
+	put_page(page);
 	return err;
 }
@@ -206,15 +205,15 @@ static void write_bdev_super_endio(struct bio *bio)
 	closure_put(&dc->sb_write);
 }
 
-static void __write_super(struct cache_sb *sb, struct bio *bio)
+static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
+		struct bio *bio)
 {
-	struct cache_sb *out = page_address(bio_first_page_all(bio));
 	unsigned int i;
 
+	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
 	bio->bi_iter.bi_sector	= SB_SECTOR;
-	bio->bi_iter.bi_size	= SB_SIZE;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
-	bch_bio_map(bio, NULL);
+	__bio_add_page(bio, virt_to_page(out), SB_SIZE,
+			offset_in_page(out));
 
 	out->offset		= cpu_to_le64(sb->offset);
 	out->version		= cpu_to_le64(sb->version);
@@ -256,14 +255,14 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
 	down(&dc->sb_write_mutex);
 	closure_init(cl, parent);
 
-	bio_reset(bio);
+	bio_init(bio, dc->sb_bv, 1);
 	bio_set_dev(bio, dc->bdev);
 	bio->bi_end_io	= write_bdev_super_endio;
 	bio->bi_private = dc;
 
 	closure_get(cl);
 	/* I/O request sent to backing device */
-	__write_super(&dc->sb, bio);
+	__write_super(&dc->sb, dc->sb_disk, bio);
 
 	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
 }
@@ -305,13 +304,13 @@ void bcache_write_super(struct cache_set *c)
 
 		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
 
-		bio_reset(bio);
+		bio_init(bio, ca->sb_bv, 1);
 		bio_set_dev(bio, ca->bdev);
 		bio->bi_end_io	= write_super_endio;
 		bio->bi_private = ca;
 
 		closure_get(cl);
-		__write_super(&ca->sb, bio);
+		__write_super(&ca->sb, ca->sb_disk, bio);
 	}
 
 	closure_return_with_destructor(cl, bcache_write_super_unlock);
@@ -529,12 +528,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op,
 	closure_sync(cl);
 }
 
-void bch_prio_write(struct cache *ca)
+int bch_prio_write(struct cache *ca, bool wait)
 {
 	int i;
 	struct bucket *b;
 	struct closure cl;
 
+	pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu",
+		 fifo_used(&ca->free[RESERVE_PRIO]),
+		 fifo_used(&ca->free[RESERVE_NONE]),
+		 fifo_used(&ca->free_inc));
+
+	/*
+	 * Pre-check if there are enough free buckets. In the non-blocking
+	 * scenario it's better to fail early rather than starting to allocate
+	 * buckets and do a cleanup later in case of failure.
+	 */
+	if (!wait) {
+		size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
+			       fifo_used(&ca->free[RESERVE_NONE]);
+		if (prio_buckets(ca) > avail)
+			return -ENOMEM;
+	}
+
 	closure_init_stack(&cl);
 
 	lockdep_assert_held(&ca->set->bucket_lock);
@@ -544,9 +560,6 @@ void bch_prio_write(struct cache *ca)
 	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
 			&ca->meta_sectors_written);
 
-	//pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
-	//	 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
-
 	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
 		long bucket;
 		struct prio_set *p = ca->disk_buckets;
@@ -564,7 +577,7 @@ void bch_prio_write(struct cache *ca)
 		p->magic	= pset_magic(&ca->sb);
 		p->csum		= bch_crc64(&p->magic, bucket_bytes(ca) - 8);
 
-		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
+		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
 		BUG_ON(bucket == -1);
 
 		mutex_unlock(&ca->set->bucket_lock);
@@ -593,14 +606,16 @@ void bch_prio_write(struct cache *ca)
 
 		ca->prio_last_buckets[i] = ca->prio_buckets[i];
 	}
+
+	return 0;
 }
 
-static void prio_read(struct cache *ca, uint64_t bucket)
+static int prio_read(struct cache *ca, uint64_t bucket)
 {
 	struct prio_set *p = ca->disk_buckets;
 	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
 	struct bucket *b;
 	unsigned int bucket_nr = 0;
+	int ret = -EIO;
 
 	for (b = ca->buckets;
 	     b < ca->buckets + ca->sb.nbuckets;
@@ -613,11 +628,15 @@ static void prio_read(struct cache *ca, uint64_t bucket)
 			prio_io(ca, bucket, REQ_OP_READ, 0);
 
 			if (p->csum !=
-			    bch_crc64(&p->magic, bucket_bytes(ca) - 8))
+			    bch_crc64(&p->magic, bucket_bytes(ca) - 8)) {
 				pr_warn("bad csum reading priorities");
+				goto out;
+			}
 
-			if (p->magic != pset_magic(&ca->sb))
+			if (p->magic != pset_magic(&ca->sb)) {
 				pr_warn("bad magic reading priorities");
+				goto out;
+			}
 
 			bucket = p->next_bucket;
 			d = p->data;
@@ -626,6 +645,10 @@ static void prio_read(struct cache *ca, uint64_t bucket)
 		b->prio = le16_to_cpu(d->prio);
 		b->gen = b->last_gc = d->gen;
 	}
+
+	ret = 0;
+out:
+	return ret;
 }
 
 /* Bcache device */
@@ -761,20 +784,28 @@ static inline int idx_to_first_minor(int idx)
 
 static void bcache_device_free(struct bcache_device *d)
 {
+	struct gendisk *disk = d->disk;
+
 	lockdep_assert_held(&bch_register_lock);
 
-	pr_info("%s stopped", d->disk->disk_name);
+	if (disk)
+		pr_info("%s stopped", disk->disk_name);
+	else
+		pr_err("bcache device (NULL gendisk) stopped");
 
 	if (d->c)
 		bcache_device_detach(d);
-	if (d->disk && d->disk->flags & GENHD_FL_UP)
-		del_gendisk(d->disk);
-	if (d->disk && d->disk->queue)
-		blk_cleanup_queue(d->disk->queue);
-	if (d->disk) {
+
+	if (disk) {
+		if (disk->flags & GENHD_FL_UP)
+			del_gendisk(disk);
+
+		if (disk->queue)
+			blk_cleanup_queue(disk->queue);
+
 		ida_simple_remove(&bcache_device_idx,
-				  first_minor_to_idx(d->disk->first_minor));
-		put_disk(d->disk);
+				  first_minor_to_idx(disk->first_minor));
+		put_disk(disk);
 	}
 
 	bioset_exit(&d->bio_split);
@@ -1251,6 +1282,9 @@ static void cached_dev_free(struct closure *cl)
 
 	mutex_unlock(&bch_register_lock);
 
+	if (dc->sb_disk)
+		put_page(virt_to_page(dc->sb_disk));
+
 	if (!IS_ERR_OR_NULL(dc->bdev))
 		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 
@@ -1326,7 +1360,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
 
 /* Cached device - bcache superblock */
 
-static int register_bdev(struct cache_sb *sb, struct page *sb_page,
+static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
 				 struct block_device *bdev,
 				 struct cached_dev *dc)
 {
@@ -1338,11 +1372,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page,
 	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
 	dc->bdev = bdev;
 	dc->bdev->bd_holder = dc;
-
-	bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
-	bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
-	get_page(sb_page);
-
+	dc->sb_disk = sb_disk;
 
 	if (cached_dev_init(dc, sb->block_size << 9))
 		goto err;
@@ -1769,6 +1799,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	sema_init(&c->sb_write_mutex, 1);
 	mutex_init(&c->bucket_lock);
 	init_waitqueue_head(&c->btree_cache_wait);
+	spin_lock_init(&c->btree_cannibalize_lock);
 	init_waitqueue_head(&c->bucket_wait);
 	init_waitqueue_head(&c->gc_wait);
 	sema_init(&c->uuid_write_mutex, 1);
@@ -1809,6 +1840,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	c->congested_read_threshold_us	= 2000;
 	c->congested_write_threshold_us	= 20000;
 	c->error_limit	= DEFAULT_IO_ERROR_LIMIT;
+	c->idle_max_writeback_rate_enabled = 1;
 	WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
 	return c;
@@ -1850,8 +1882,10 @@ static int run_cache_set(struct cache_set *c)
 		j = &list_entry(journal.prev, struct journal_replay, list)->j;
 
 		err = "IO error reading priorities";
-		for_each_cache(ca, c, i)
-			prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
+		for_each_cache(ca, c, i) {
+			if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
+				goto err;
+		}
 
 		/*
 		 * If prio_read() fails it'll call cache_set_error and we'll
@@ -1883,23 +1917,6 @@ static int run_cache_set(struct cache_set *c)
 		if (bch_btree_check(c))
 			goto err;
 
-		/*
-		 * bch_btree_check() may occupy too much system memory which
-		 * has negative effects to user space application (e.g. data
-		 * base) performance. Shrink the mca cache memory proactively
-		 * here to avoid competing memory with user space workloads..
-		 */
-		if (!c->shrinker_disabled) {
-			struct shrink_control sc;
-
-			sc.gfp_mask = GFP_KERNEL;
-			sc.nr_to_scan = c->btree_cache_used * c->btree_pages;
-			/* first run to clear b->accessed tag */
-			c->shrink.scan_objects(&c->shrink, &sc);
-			/* second run to reap non-accessed nodes */
-			c->shrink.scan_objects(&c->shrink, &sc);
-		}
-
 		bch_journal_mark(c, &journal);
 		bch_initial_gc_finish(c);
 		pr_debug("btree_check() done");
@@ -1954,7 +1971,7 @@ static int run_cache_set(struct cache_set *c)
 
 		mutex_lock(&c->bucket_lock);
 		for_each_cache(ca, c, i)
-			bch_prio_write(ca);
+			bch_prio_write(ca, true);
 		mutex_unlock(&c->bucket_lock);
 
 		err = "cannot allocate new UUID bucket";
@@ -2110,8 +2127,8 @@ void bch_cache_release(struct kobject *kobj)
 	for (i = 0; i < RESERVE_NR; i++)
 		free_fifo(&ca->free[i]);
 
-	if (ca->sb_bio.bi_inline_vecs[0].bv_page)
-		put_page(bio_first_page_all(&ca->sb_bio));
+	if (ca->sb_disk)
+		put_page(virt_to_page(ca->sb_disk));
 
 	if (!IS_ERR_OR_NULL(ca->bdev))
 		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -2233,7 +2250,7 @@ err_free:
 	return ret;
 }
 
-static int register_cache(struct cache_sb *sb, struct page *sb_page,
+static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
 				struct block_device *bdev, struct cache *ca)
 {
 	const char *err = NULL; /* must be set for any error case */
@@ -2243,10 +2260,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
 	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
 	ca->bdev = bdev;
 	ca->bdev->bd_holder = ca;
-
-	bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
-	bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
-	get_page(sb_page);
+	ca->sb_disk = sb_disk;
 
 	if (blk_queue_discard(bdev_get_queue(bdev)))
 		ca->discard = CACHE_DISCARD(&ca->sb);
@@ -2346,29 +2360,35 @@ static bool bch_is_open(struct block_device *bdev)
 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 			       const char *buffer, size_t size)
 {
-	ssize_t ret = -EINVAL;
-	const char *err = "cannot allocate memory";
+	const char *err;
 	char *path = NULL;
-	struct cache_sb *sb = NULL;
-	struct block_device *bdev = NULL;
-	struct page *sb_page = NULL;
+	struct cache_sb *sb;
+	struct cache_sb_disk *sb_disk;
+	struct block_device *bdev;
+	ssize_t ret;
 
+	ret = -EBUSY;
+	err = "failed to reference bcache module";
 	if (!try_module_get(THIS_MODULE))
-		return -EBUSY;
+		goto out;
 
 	/* For latest state of bcache_is_reboot */
 	smp_mb();
+	err = "bcache is in reboot";
 	if (bcache_is_reboot)
-		return -EBUSY;
+		goto out_module_put;
 
+	ret = -ENOMEM;
+	err = "cannot allocate memory";
 	path = kstrndup(buffer, size, GFP_KERNEL);
 	if (!path)
-		goto err;
+		goto out_module_put;
 
 	sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
 	if (!sb)
-		goto err;
+		goto out_free_path;
 
+	ret = -EINVAL;
 	err = "failed to open device";
 	bdev = blkdev_get_by_path(strim(path),
 				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
@@ -2385,57 +2405,63 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 		if (!IS_ERR(bdev))
 			bdput(bdev);
 		if (attr == &ksysfs_register_quiet)
-			goto quiet_out;
+			goto done;
 		}
-		goto err;
+		goto out_free_sb;
 	}
 
 	err = "failed to set blocksize";
 	if (set_blocksize(bdev, 4096))
-		goto err_close;
+		goto out_blkdev_put;
 
-	err = read_super(sb, bdev, &sb_page);
+	err = read_super(sb, bdev, &sb_disk);
 	if (err)
-		goto err_close;
+		goto out_blkdev_put;
 
 	err = "failed to register device";
 	if (SB_IS_BDEV(sb)) {
 		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
 
 		if (!dc)
-			goto err_close;
+			goto out_put_sb_page;
 
 		mutex_lock(&bch_register_lock);
-		ret = register_bdev(sb, sb_page, bdev, dc);
+		ret = register_bdev(sb, sb_disk, bdev, dc);
 		mutex_unlock(&bch_register_lock);
 		/* blkdev_put() will be called in cached_dev_free() */
 		if (ret < 0)
-			goto err;
+			goto out_free_sb;
 	} else {
 		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 
 		if (!ca)
-			goto err_close;
+			goto out_put_sb_page;
 
 		/* blkdev_put() will be called in bch_cache_release() */
-		if (register_cache(sb, sb_page, bdev, ca) != 0)
-			goto err;
+		if (register_cache(sb, sb_disk, bdev, ca) != 0)
+			goto out_free_sb;
 	}
-quiet_out:
-	ret = size;
-out:
-	if (sb_page)
-		put_page(sb_page);
+
+done:
 	kfree(sb);
 	kfree(path);
 	module_put(THIS_MODULE);
-	return ret;
+	return size;
 
-err_close:
-	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-err:
-	pr_info("error %s: %s", path, err);
-	goto out;
+out_put_sb_page:
+	put_page(virt_to_page(sb_disk));
+out_blkdev_put:
+	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+out_free_sb:
+	kfree(sb);
+out_free_path:
+	kfree(path);
+	path = NULL;
+out_module_put:
+	module_put(THIS_MODULE);
+out:
+	pr_info("error %s: %s", path ? path : "", err);
+	return ret;
 }
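A note on the super.c hunks above: the register_bcache() rework replaces the old catch-all "err:" exit with the kernel's usual unwind ladder, where each label releases exactly what was acquired before the failing step, in reverse order of acquisition. A condensed sketch of the shape, using hypothetical resources rather than the real path/sb/bdev chain:

#include <linux/slab.h>

static int register_sketch(void)
{
	void *a, *b;
	int ret;

	ret = -ENOMEM;
	a = kmalloc(16, GFP_KERNEL);
	if (!a)
		goto out;
	b = kmalloc(16, GFP_KERNEL);
	if (!b)
		goto out_free_a;	/* only 'a' needs undoing here */

	/* ... actual registration work; ownership transferred on success ... */
	return 0;

out_free_a:
	kfree(a);
out:
	return ret;
}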
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index e2059af90791..3470fae4eabc 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -27,6 +27,12 @@ static const char * const bch_cache_modes[] = {
 	NULL
 };
 
+static const char * const bch_reada_cache_policies[] = {
+	"all",
+	"meta-only",
+	NULL
+};
+
 /* Default is 0 ("auto") */
 static const char * const bch_stop_on_failure_modes[] = {
 	"auto",
@@ -100,6 +106,7 @@ rw_attribute(congested_write_threshold_us);
 rw_attribute(sequential_cutoff);
 rw_attribute(data_csum);
 rw_attribute(cache_mode);
+rw_attribute(readahead_cache_policy);
 rw_attribute(stop_when_cache_set_failed);
 rw_attribute(writeback_metadata);
 rw_attribute(writeback_running);
@@ -134,6 +141,7 @@ rw_attribute(expensive_debug_checks);
 rw_attribute(cache_replacement_policy);
 rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);
+rw_attribute(idle_max_writeback_rate);
 rw_attribute(gc_after_writeback);
 rw_attribute(size);
 
@@ -167,6 +175,11 @@ SHOW(__bch_cached_dev)
 						     bch_cache_modes,
 						     BDEV_CACHE_MODE(&dc->sb));
 
+	if (attr == &sysfs_readahead_cache_policy)
+		return bch_snprint_string_list(buf, PAGE_SIZE,
+					      bch_reada_cache_policies,
+					      dc->cache_readahead_policy);
+
 	if (attr == &sysfs_stop_when_cache_set_failed)
 		return bch_snprint_string_list(buf, PAGE_SIZE,
 					bch_stop_on_failure_modes,
@@ -352,6 +365,15 @@ STORE(__cached_dev)
 		}
 	}
 
+	if (attr == &sysfs_readahead_cache_policy) {
+		v = __sysfs_match_string(bch_reada_cache_policies, -1, buf);
+		if (v < 0)
+			return v;
+
+		if ((unsigned int) v != dc->cache_readahead_policy)
+			dc->cache_readahead_policy = v;
+	}
+
 	if (attr == &sysfs_stop_when_cache_set_failed) {
 		v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
 		if (v < 0)
@@ -466,6 +488,7 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_data_csum,
 #endif
 	&sysfs_cache_mode,
+	&sysfs_readahead_cache_policy,
 	&sysfs_stop_when_cache_set_failed,
 	&sysfs_writeback_metadata,
 	&sysfs_writeback_running,
@@ -747,6 +770,8 @@ SHOW(__bch_cache_set)
 	sysfs_printf(gc_always_rewrite,	"%i", c->gc_always_rewrite);
 	sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
 	sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+	sysfs_printf(idle_max_writeback_rate, "%i",
+		     c->idle_max_writeback_rate_enabled);
 	sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback);
 	sysfs_printf(io_disable, "%i",
 		     test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -864,6 +889,9 @@ STORE(__bch_cache_set)
 	sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite);
 	sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled);
 	sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled);
+	sysfs_strtoul_bool(idle_max_writeback_rate,
+			   c->idle_max_writeback_rate_enabled);
+
 	/*
 	 * write gc_after_writeback here may overwrite an already set
 	 * BCH_DO_AUTO_GC, it doesn't matter because this flag will be
@@ -954,6 +982,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_gc_always_rewrite,
 	&sysfs_btree_shrinker_disabled,
 	&sysfs_copy_gc_enabled,
+	&sysfs_idle_max_writeback_rate,
 	&sysfs_gc_after_writeback,
 	&sysfs_io_disable,
 	&sysfs_cutoff_writeback,
@@ -964,6 +993,7 @@ KTYPE(bch_cache_set_internal);
 
 static int __bch_cache_cmp(const void *l, const void *r)
 {
+	cond_resched();
 	return *((uint16_t *)r) - *((uint16_t *)l);
 }
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index d60268fe49e1..4a40f9eadeaf 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -122,6 +122,10 @@ static void __update_writeback_rate(struct cached_dev *dc)
 static bool set_at_max_writeback_rate(struct cache_set *c,
 				       struct cached_dev *dc)
 {
+	/* Don't set max writeback rate if it is disabled */
+	if (!c->idle_max_writeback_rate_enabled)
+		return false;
+
 	/* Don't set max writeback rate if gc is running */
 	if (!c->gc_mark_valid)
 		return false;
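Taken together, the request.c and sysfs.c changes reduce to a small decision: read-ahead and background IO is cached or bypassed according to the new readahead_cache_policy knob, while metadata read-ahead is always cached. A condensed, illustrative form (reada_should_bypass() is not a real bcache function; the BCH_CACHE_READA_* macros are the ones added in bcache.h above):

#include <linux/blk_types.h>

static bool reada_should_bypass(unsigned int opf, unsigned int policy)
{
	if (!(opf & (REQ_RAHEAD | REQ_BACKGROUND)))
		return false;			/* not read-ahead/background */
	if (opf & (REQ_META | REQ_PRIO))
		return false;			/* metadata: always cache */
	return policy != BCH_CACHE_READA_ALL;	/* "meta-only" bypasses */
}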