From c2421edf5f9151d0eb28affbf76e9e4f8ddd03c6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:09 +0800 Subject: bcache: comment on direct access to bvec table All direct access to bvec table are safe even after multipage bvec is supported. Cc: linux-bcache@vger.kernel.org Acked-by: Coly Li Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md/bcache/btree.c') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 81e8dc3dbe5e..02a4cf646fdc 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -432,6 +432,7 @@ static void do_btree_node_write(struct btree *b) continue_at(cl, btree_node_write_done, NULL); } else { + /* No problem for multipage bvec since the bio is just allocated */ b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); -- cgit v1.2.3 From 25d8be77e19224d8f21b363d77b5283c5dc21a57 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:10 +0800 Subject: block: move bio_alloc_pages() to bcache bcache is the only user of bio_alloc_pages(), so move this function into bcache, and avoid it being misused in the future. Also rename it to bch_bio_allo_pages() since it is bcache only. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 28 ---------------------------- drivers/md/bcache/btree.c | 2 +- drivers/md/bcache/debug.c | 2 +- drivers/md/bcache/movinggc.c | 2 +- drivers/md/bcache/request.c | 2 +- drivers/md/bcache/util.c | 27 +++++++++++++++++++++++++++ drivers/md/bcache/util.h | 1 + drivers/md/bcache/writeback.c | 2 +- include/linux/bio.h | 1 - 9 files changed, 33 insertions(+), 34 deletions(-) (limited to 'drivers/md/bcache/btree.c') diff --git a/block/bio.c b/block/bio.c index 8bfdea58159b..fe1efbeaf4aa 100644 --- a/block/bio.c +++ b/block/bio.c @@ -968,34 +968,6 @@ void bio_advance(struct bio *bio, unsigned bytes) } EXPORT_SYMBOL(bio_advance); -/** - * bio_alloc_pages - allocates a single page for each bvec in a bio - * @bio: bio to allocate pages for - * @gfp_mask: flags for allocation - * - * Allocates pages up to @bio->bi_vcnt. - * - * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are - * freed. - */ -int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) -{ - int i; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, i) { - bv->bv_page = alloc_page(gfp_mask); - if (!bv->bv_page) { - while (--bv >= bio->bi_io_vec) - __free_page(bv->bv_page); - return -ENOMEM; - } - } - - return 0; -} -EXPORT_SYMBOL(bio_alloc_pages); - /** * bio_copy_data - copy contents of data buffers from one chain of bios to * another diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 02a4cf646fdc..ebb1874218e7 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b) SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_sector_offset(&b->keys, i)); - if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { + if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index c7a02c4900da..879ab21074c6 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) return; check->bi_opf = REQ_OP_READ; - if (bio_alloc_pages(check, GFP_NOIO)) + if (bch_bio_alloc_pages(check, GFP_NOIO)) goto out_put; submit_bio_wait(check); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index d50c1c97da68..a24c3a95b2c0 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c) bio_set_op_attrs(bio, REQ_OP_READ, 0); bio->bi_end_io = read_moving_endio; - if (bio_alloc_pages(bio, GFP_KERNEL)) + if (bch_bio_alloc_pages(bio, GFP_KERNEL)) goto err; trace_bcache_gc_copy(&w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 643c3021624f..c493fb947dc9 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -841,7 +841,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, cache_bio->bi_private = &s->cl; bch_bio_map(cache_bio, NULL); - if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) + if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; if (reada) diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 61813d230015..a23cd6a14b74 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -283,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, } } +/** + * bch_bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. + */ +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, i) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} + /* * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any * use permitted, subject to terms of PostgreSQL license; see.) diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ed5e8a412eb8..4df4c5c1cab2 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -558,6 +558,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) } void bch_bio_map(struct bio *bio, void *base); +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask); static inline sector_t bdev_sectors(struct block_device *bdev) { diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 56a37884ca8b..1ac2af6128b1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -278,7 +278,7 @@ static void read_dirty(struct cached_dev *dc) bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); io->bio.bi_end_io = read_dirty_endio; - if (bio_alloc_pages(&io->bio, GFP_KERNEL)) + if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) goto err_free; trace_bcache_writeback(&w->key); diff --git a/include/linux/bio.h b/include/linux/bio.h index 435ddf04e889..367a979fd4a6 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -500,7 +500,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi) #endif extern void bio_copy_data(struct bio *dst, struct bio *src); -extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern void bio_free_pages(struct bio *bio); extern struct bio *bio_copy_user_iov(struct request_queue *, -- cgit v1.2.3 From 9d13411784e27227162857df25ab6817a1db2a73 Mon Sep 17 00:00:00 2001 From: Vasyl Gomonovych Date: Mon, 8 Jan 2018 12:21:20 -0800 Subject: bcache: Use PTR_ERR_OR_ZERO() Fix ptr_ret.cocci warnings: drivers/md/bcache/btree.c:1800:1-3: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Signed-off-by: Vasyl Gomonovych Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'drivers/md/bcache/btree.c') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index ebb1874218e7..9e30713dbdb8 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1804,10 +1804,7 @@ static int bch_gc_thread(void *arg) int bch_gc_thread_start(struct cache_set *c) { c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); - if (IS_ERR(c->gc_thread)) - return PTR_ERR(c->gc_thread); - - return 0; + return PTR_ERR_OR_ZERO(c->gc_thread); } /* Initial partial gc */ -- cgit v1.2.3 From 2831231d4c3f999d2d062b23dfbc8b0faa4bc6e0 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 8 Jan 2018 12:21:28 -0800 Subject: bcache: reduce cache_set devices iteration by devices_max_used Member devices of struct cache_set is used to reference all attached bcache devices to this cache set. If it is treated as array of pointers, size of devices[] is indicated by member nr_uuids of struct cache_set. nr_uuids is calculated in drivers/md/super.c:bch_cache_set_alloc(), bucket_bytes(c) / sizeof(struct uuid_entry) Bucket size is determined by user space tool "make-bcache", by default it is 1024 sectors (defined in bcache-tools/make-bcache.c:main()). So default nr_uuids value is 4096 from the above calculation. Every time when bcache code iterates bcache devices of a cache set, all the 4096 pointers are checked even only 1 bcache device is attached to the cache set, that's a wast of time and unncessary. This patch adds a member devices_max_used to struct cache_set. Its value is 1 + the maximum used index of devices[] in a cache set. When iterating all valid bcache devices of a cache set, use c->devices_max_used in for-loop may reduce a lot of useless checking. Personally, my motivation of this patch is not for performance, I use it in bcache debugging, which helps me to narrow down the scape to check valid bcached devices of a cache set. Signed-off-by: Coly Li Reviewed-by: Michael Lyle Reviewed-by: Tang Junhui Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/btree.c | 2 +- drivers/md/bcache/super.c | 9 ++++++--- drivers/md/bcache/writeback.h | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) (limited to 'drivers/md/bcache/btree.c') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 5f7b0b2513cc..9117da5f494b 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -497,6 +497,7 @@ struct cache_set { int caches_loaded; struct bcache_device **devices; + unsigned devices_max_used; struct list_head cached_devs; uint64_t cached_dev_sectors; struct closure caching; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 9e30713dbdb8..bf3a48aa9a9a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1679,7 +1679,7 @@ static void bch_btree_gc_finish(struct cache_set *c) /* don't reclaim buckets to which writeback keys point */ rcu_read_lock(); - for (i = 0; i < c->nr_uuids; i++) { + for (i = 0; i < c->devices_max_used; i++) { struct bcache_device *d = c->devices[i]; struct cached_dev *dc; struct keybuf_key *w, *n; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 553e841e897d..d13e4ccb30a0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -721,6 +721,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, d->c = c; c->devices[id] = d; + if (id >= c->devices_max_used) + c->devices_max_used = id + 1; + closure_get(&c->caching); } @@ -1267,7 +1270,7 @@ static int flash_devs_run(struct cache_set *c) struct uuid_entry *u; for (u = c->uuids; - u < c->uuids + c->nr_uuids && !ret; + u < c->uuids + c->devices_max_used && !ret; u++) if (UUID_FLASH_ONLY(u)) ret = flash_dev_run(c, u); @@ -1433,7 +1436,7 @@ static void __cache_set_unregister(struct closure *cl) mutex_lock(&bch_register_lock); - for (i = 0; i < c->nr_uuids; i++) + for (i = 0; i < c->devices_max_used; i++) if (c->devices[i]) { if (!UUID_FLASH_ONLY(&c->uuids[i]) && test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { @@ -1496,7 +1499,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->bucket_bits = ilog2(sb->bucket_size); c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); - + c->devices_max_used = 0; c->btree_pages = bucket_pages(c); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 6d26927267f8..f102b1f9bc51 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -24,7 +24,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) mutex_lock(&bch_register_lock); - for (i = 0; i < c->nr_uuids; i++) { + for (i = 0; i < c->devices_max_used; i++) { struct bcache_device *d = c->devices[i]; if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) -- cgit v1.2.3 From 682811b3ce1a5a4e20d700939a9042f01dbc66c4 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 7 Feb 2018 11:41:43 -0800 Subject: bcache: fix for allocator and register thread race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After long time running of random small IO writing, I reboot the machine, and after the machine power on, I found bcache got stuck, the stack is: [root@ceph153 ~]# cat /proc/2510/task/*/stack [] closure_sync+0x25/0x90 [bcache] [] bch_journal+0x118/0x2b0 [bcache] [] bch_journal_meta+0x47/0x70 [bcache] [] bch_prio_write+0x237/0x340 [bcache] [] bch_allocator_thread+0x3c8/0x3d0 [bcache] [] kthread+0xcf/0xe0 [] ret_from_fork+0x58/0x90 [] 0xffffffffffffffff [root@ceph153 ~]# cat /proc/2038/task/*/stack [] __bch_btree_map_nodes+0x12d/0x150 [bcache] [] bch_btree_insert+0xf1/0x170 [bcache] [] bch_journal_replay+0x13f/0x230 [bcache] [] run_cache_set+0x79a/0x7c2 [bcache] [] register_bcache+0xd48/0x1310 [bcache] [] kobj_attr_store+0xf/0x20 [] sysfs_write_file+0xc6/0x140 [] vfs_write+0xbd/0x1e0 [] SyS_write+0x7f/0xe0 [] system_call_fastpath+0x16/0x1 The stack shows the register thread and allocator thread were getting stuck when registering cache device. I reboot the machine several times, the issue always exsit in this machine. I debug the code, and found the call trace as bellow: register_bcache() ==>run_cache_set() ==>bch_journal_replay() ==>bch_btree_insert() ==>__bch_btree_map_nodes() ==>btree_insert_fn() ==>btree_split() //node need split ==>btree_check_reserve() In btree_check_reserve(), It will check if there is enough buckets of RESERVE_BTREE type, since allocator thread did not work yet, so no buckets of RESERVE_BTREE type allocated, so the register thread waits on c->btree_cache_wait, and goes to sleep. Then the allocator thread initialized, the call trace is bellow: bch_allocator_thread() ==>bch_prio_write() ==>bch_journal_meta() ==>bch_journal() ==>journal_wait_for_write() In journal_wait_for_write(), It will check if journal is full by journal_full(), but the long time random small IO writing causes the exhaustion of journal buckets(journal.blocks_free=0), In order to release the journal buckets, the allocator calls btree_flush_write() to flush keys to btree nodes, and waits on c->journal.wait until btree nodes writing over or there has already some journal buckets space, then the allocator thread goes to sleep. but in btree_flush_write(), since bch_journal_replay() is not finished, so no btree nodes have journal (condition "if (btree_current_write(b)->journal)" never satisfied), so we got no btree node to flush, no journal bucket released, and allocator sleep all the times. Through the above analysis, we can see that: 1) Register thread wait for allocator thread to allocate buckets of RESERVE_BTREE type; 2) Alloctor thread wait for register thread to replay journal, so it can flush btree nodes and get journal bucket. then they are all got stuck by waiting for each other. Hua Rui provided a patch for me, by allocating some buckets of RESERVE_BTREE type in advance, so the register thread can get bucket when btree node splitting and no need to waiting for the allocator thread. I tested it, it has effect, and register thread run a step forward, but finally are still got stuck, the reason is only 8 bucket of RESERVE_BTREE type were allocated, and in bch_journal_replay(), after 2 btree nodes splitting, only 4 bucket of RESERVE_BTREE type left, then btree_check_reserve() is not satisfied anymore, so it goes to sleep again, and in the same time, alloctor thread did not flush enough btree nodes to release a journal bucket, so they all got stuck again. So we need to allocate more buckets of RESERVE_BTREE type in advance, but how much is enough? By experience and test, I think it should be as much as journal buckets. Then I modify the code as this patch, and test in the machine, and it works. This patch modified base on Hua Rui’s patch, and allocate more buckets of RESERVE_BTREE type in advance to avoid register thread and allocate thread going to wait for each other. [patch v2] ca->sb.njournal_buckets would be 0 in the first time after cache creation, and no journal exists, so just 8 btree buckets is OK. Signed-off-by: Hua Rui Signed-off-by: Tang Junhui Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 9 ++++++--- drivers/md/bcache/super.c | 13 ++++++++++++- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'drivers/md/bcache/btree.c') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index bf3a48aa9a9a..fad9fe8817eb 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1869,14 +1869,17 @@ void bch_initial_gc_finish(struct cache_set *c) */ for_each_cache(ca, c, i) { for_each_bucket(b, ca) { - if (fifo_full(&ca->free[RESERVE_PRIO])) + if (fifo_full(&ca->free[RESERVE_PRIO]) && + fifo_full(&ca->free[RESERVE_BTREE])) break; if (bch_can_invalidate_bucket(ca, b) && !GC_MARK(b)) { __bch_invalidate_one_bucket(ca, b); - fifo_push(&ca->free[RESERVE_PRIO], - b - ca->buckets); + if (!fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets)) + fifo_push(&ca->free[RESERVE_BTREE], + b - ca->buckets); } } } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e29150ec17e7..a2ad37a8afc0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1833,6 +1833,7 @@ void bch_cache_release(struct kobject *kobj) static int cache_alloc(struct cache *ca) { size_t free; + size_t btree_buckets; struct bucket *b; __module_get(THIS_MODULE); @@ -1840,9 +1841,19 @@ static int cache_alloc(struct cache *ca) bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8); + /* + * when ca->sb.njournal_buckets is not zero, journal exists, + * and in bch_journal_replay(), tree node may split, + * so bucket of RESERVE_BTREE type is needed, + * the worst situation is all journal buckets are valid journal, + * and all the keys need to replay, + * so the number of RESERVE_BTREE type buckets should be as much + * as journal buckets + */ + btree_buckets = ca->sb.njournal_buckets ?: 8; free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; - if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || + if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) || !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || -- cgit v1.2.3