summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorKent Overstreet <kmo@daterainc.com>2013-09-10 19:07:00 -0700
committerKent Overstreet <kmo@daterainc.com>2013-11-10 21:56:37 -0800
commita1f0358b2bf69be216cb6e4ea40fe7ae4d38b8a6 (patch)
tree287cf5c3daa6ae4ec2ffc55b33b6df52c617f7f7 /drivers/md
parent8835c1234dd9a838993a2d5cb7572f57992ebbee (diff)
downloadtalos-op-linux-a1f0358b2bf69be216cb6e4ea40fe7ae4d38b8a6.tar.gz
talos-op-linux-a1f0358b2bf69be216cb6e4ea40fe7ae4d38b8a6.zip
bcache: Incremental gc
Big garbage collection rewrite; now, garbage collection uses the same mechanisms as used elsewhere for inserting/updating btree node pointers, instead of rewriting interior btree nodes in place. This makes the code significantly cleaner and less fragile, and means we can now make garbage collection incremental - it doesn't have to hold a write lock on the root of the btree for the entire duration of garbage collection. This means that there's less of a latency hit for doing garbage collection, which means we can gc more frequently (and do a better job of reclaiming from the cache), and we can coalesce across more btree nodes (improving our space efficiency). Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/bcache.h1
-rw-r--r--drivers/md/bcache/btree.c388
-rw-r--r--drivers/md/bcache/btree.h2
-rw-r--r--drivers/md/bcache/sysfs.c2
4 files changed, 226 insertions, 167 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d03bc6f66493..d6970a651e42 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -477,7 +477,6 @@ struct gc_stat {
size_t nkeys;
uint64_t data; /* sectors */
- uint64_t dirty; /* sectors */
unsigned in_use; /* percent */
};
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index a3f8ca4ee6e0..7d283d217438 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1176,12 +1176,10 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
-static int btree_gc_mark_node(struct btree *b, unsigned *keys,
- struct gc_stat *gc)
+static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
{
uint8_t stale = 0;
- unsigned last_dev = -1;
- struct bcache_device *d = NULL;
+ unsigned keys = 0, good_keys = 0;
struct bkey *k;
struct btree_iter iter;
struct bset_tree *t;
@@ -1189,27 +1187,17 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
gc->nodes++;
for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
- if (last_dev != KEY_INODE(k)) {
- last_dev = KEY_INODE(k);
-
- d = KEY_INODE(k) < b->c->nr_uuids
- ? b->c->devices[last_dev]
- : NULL;
- }
-
stale = max(stale, btree_mark_key(b, k));
+ keys++;
if (bch_ptr_bad(b, k))
continue;
- *keys += bkey_u64s(k);
-
gc->key_bytes += bkey_u64s(k);
gc->nkeys++;
+ good_keys++;
gc->data += KEY_SIZE(k);
- if (KEY_DIRTY(k))
- gc->dirty += KEY_SIZE(k);
}
for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1218,79 +1206,74 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
bkey_cmp(&b->key, &t->end) < 0,
b, "found short btree key in gc");
- return stale;
-}
+ if (b->c->gc_always_rewrite)
+ return true;
-static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k)
-{
- /*
- * We block priorities from being written for the duration of garbage
- * collection, so we can't sleep in btree_alloc() ->
- * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
- * our closure.
- */
- struct btree *n = btree_node_alloc_replacement(b);
-
- if (!IS_ERR_OR_NULL(n)) {
- swap(b, n);
+ if (stale > 10)
+ return true;
- memcpy(k->ptr, b->key.ptr,
- sizeof(uint64_t) * KEY_PTRS(&b->key));
-
- btree_node_free(n);
- up_write(&n->lock);
- }
+ if ((keys - good_keys) * 2 > keys)
+ return true;
- return b;
+ return false;
}
-/*
- * Leaving this at 2 until we've got incremental garbage collection done; it
- * could be higher (and has been tested with 4) except that garbage collection
- * could take much longer, adversely affecting latency.
- */
-#define GC_MERGE_NODES 2U
+#define GC_MERGE_NODES 4U
struct gc_merge_info {
struct btree *b;
- struct bkey *k;
unsigned keys;
};
-static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
- struct gc_merge_info *r)
+static int bch_btree_insert_node(struct btree *, struct btree_op *,
+ struct keylist *, atomic_t *, struct bkey *);
+
+static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
+ struct keylist *keylist, struct gc_stat *gc,
+ struct gc_merge_info *r)
{
- unsigned nodes = 0, keys = 0, blocks;
- int i;
+ unsigned i, nodes = 0, keys = 0, blocks;
+ struct btree *new_nodes[GC_MERGE_NODES];
struct closure cl;
+ struct bkey *k;
+ memset(new_nodes, 0, sizeof(new_nodes));
closure_init_stack(&cl);
- while (nodes < GC_MERGE_NODES && r[nodes].b)
+ while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
keys += r[nodes++].keys;
blocks = btree_default_blocks(b->c) * 2 / 3;
if (nodes < 2 ||
__set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
- return;
-
- for (i = nodes - 1; i >= 0; --i) {
- if (r[i].b->written)
- r[i].b = btree_gc_alloc(r[i].b, r[i].k);
+ return 0;
- if (r[i].b->written)
- return;
+ for (i = 0; i < nodes; i++) {
+ new_nodes[i] = btree_node_alloc_replacement(r[i].b);
+ if (IS_ERR_OR_NULL(new_nodes[i]))
+ goto out_nocoalesce;
}
for (i = nodes - 1; i > 0; --i) {
- struct bset *n1 = r[i].b->sets->data;
- struct bset *n2 = r[i - 1].b->sets->data;
+ struct bset *n1 = new_nodes[i]->sets->data;
+ struct bset *n2 = new_nodes[i - 1]->sets->data;
struct bkey *k, *last = NULL;
keys = 0;
- if (i == 1) {
+ if (i > 1) {
+ for (k = n2->start;
+ k < end(n2);
+ k = bkey_next(k)) {
+ if (__set_blocks(n1, n1->keys + keys +
+ bkey_u64s(k), b->c) > blocks)
+ break;
+
+ last = k;
+ keys += bkey_u64s(k);
+ }
+ } else {
/*
* Last node we're not getting rid of - we're getting
* rid of the node at r[0]. Have to try and fit all of
@@ -1299,37 +1282,27 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
* length keys (shouldn't be possible in practice,
* though)
*/
- if (__set_blocks(n1, n1->keys + r->keys,
- b->c) > btree_blocks(r[i].b))
- return;
+ if (__set_blocks(n1, n1->keys + n2->keys,
+ b->c) > btree_blocks(new_nodes[i]))
+ goto out_nocoalesce;
keys = n2->keys;
+ /* Take the key of the node we're getting rid of */
last = &r->b->key;
- } else
- for (k = n2->start;
- k < end(n2);
- k = bkey_next(k)) {
- if (__set_blocks(n1, n1->keys + keys +
- bkey_u64s(k), b->c) > blocks)
- break;
-
- last = k;
- keys += bkey_u64s(k);
- }
+ }
BUG_ON(__set_blocks(n1, n1->keys + keys,
- b->c) > btree_blocks(r[i].b));
+ b->c) > btree_blocks(new_nodes[i]));
- if (last) {
- bkey_copy_key(&r[i].b->key, last);
- bkey_copy_key(r[i].k, last);
- }
+ if (last)
+ bkey_copy_key(&new_nodes[i]->key, last);
memcpy(end(n1),
n2->start,
(void *) node(n2, keys) - (void *) n2->start);
n1->keys += keys;
+ r[i].keys = n1->keys;
memmove(n2->start,
node(n2, keys),
@@ -1337,93 +1310,175 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
n2->keys -= keys;
- r[i].keys = n1->keys;
- r[i - 1].keys = n2->keys;
+ if (bch_keylist_realloc(keylist,
+ KEY_PTRS(&new_nodes[i]->key), b->c))
+ goto out_nocoalesce;
+
+ bch_btree_node_write(new_nodes[i], &cl);
+ bch_keylist_add(keylist, &new_nodes[i]->key);
}
- btree_node_free(r->b);
- up_write(&r->b->lock);
+ for (i = 0; i < nodes; i++) {
+ if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
+ goto out_nocoalesce;
- trace_bcache_btree_gc_coalesce(nodes);
+ make_btree_freeing_key(r[i].b, keylist->top);
+ bch_keylist_push(keylist);
+ }
+
+ /* We emptied out this node */
+ BUG_ON(new_nodes[0]->sets->data->keys);
+ btree_node_free(new_nodes[0]);
+ rw_unlock(true, new_nodes[0]);
+ closure_sync(&cl);
+
+ for (i = 0; i < nodes; i++) {
+ btree_node_free(r[i].b);
+ rw_unlock(true, r[i].b);
+
+ r[i].b = new_nodes[i];
+ }
+
+ bch_btree_insert_node(b, op, keylist, NULL, NULL);
+ BUG_ON(!bch_keylist_empty(keylist));
+
+ memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
+ r[nodes - 1].b = ERR_PTR(-EINTR);
+
+ trace_bcache_btree_gc_coalesce(nodes);
gc->nodes--;
- nodes--;
- memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
- memset(&r[nodes], 0, sizeof(struct gc_merge_info));
+ /* Invalidated our iterator */
+ return -EINTR;
+
+out_nocoalesce:
+ closure_sync(&cl);
+
+ while ((k = bch_keylist_pop(keylist)))
+ if (!bkey_cmp(k, &ZERO_KEY))
+ atomic_dec(&b->c->prio_blocked);
+
+ for (i = 0; i < nodes; i++)
+ if (!IS_ERR_OR_NULL(new_nodes[i])) {
+ btree_node_free(new_nodes[i]);
+ rw_unlock(true, new_nodes[i]);
+ }
+ return 0;
}
-static int btree_gc_recurse(struct btree *b, struct btree_op *op,
- struct closure *writes, struct gc_stat *gc)
+static unsigned btree_gc_count_keys(struct btree *b)
{
- void write(struct btree *r)
- {
- if (!r->written || btree_node_dirty(r))
- bch_btree_node_write(r, writes);
+ struct bkey *k;
+ struct btree_iter iter;
+ unsigned ret = 0;
- up_write(&r->lock);
- }
+ for_each_key_filter(b, k, &iter, bch_ptr_bad)
+ ret += bkey_u64s(k);
- int ret = 0, stale;
+ return ret;
+}
+
+static int btree_gc_recurse(struct btree *b, struct btree_op *op,
+ struct closure *writes, struct gc_stat *gc)
+{
unsigned i;
+ int ret = 0;
+ bool should_rewrite;
+ struct btree *n;
+ struct bkey *k;
+ struct keylist keys;
+ struct btree_iter iter;
struct gc_merge_info r[GC_MERGE_NODES];
+ struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
- memset(r, 0, sizeof(r));
+ bch_keylist_init(&keys);
+ bch_btree_iter_init(b, &iter, &b->c->gc_done);
- while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
- r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true);
+ for (i = 0; i < GC_MERGE_NODES; i++)
+ r[i].b = ERR_PTR(-EINTR);
- if (IS_ERR(r->b)) {
- ret = PTR_ERR(r->b);
- break;
+ while (1) {
+ k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
+ if (k) {
+ r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
+ if (IS_ERR(r->b)) {
+ ret = PTR_ERR(r->b);
+ break;
+ }
+
+ r->keys = btree_gc_count_keys(r->b);
+
+ ret = btree_gc_coalesce(b, op, &keys, gc, r);
+ if (ret)
+ break;
}
- r->keys = 0;
- stale = btree_gc_mark_node(r->b, &r->keys, gc);
+ if (!last->b)
+ break;
- if (!b->written &&
- (r->b->level || stale > 10 ||
- b->c->gc_always_rewrite))
- r->b = btree_gc_alloc(r->b, r->k);
+ if (!IS_ERR(last->b)) {
+ should_rewrite = btree_gc_mark_node(last->b, gc);
+ if (should_rewrite) {
+ n = btree_node_alloc_replacement(last->b);
- if (r->b->level)
- ret = btree_gc_recurse(r->b, op, writes, gc);
+ if (!IS_ERR_OR_NULL(n)) {
+ bch_btree_node_write_sync(n);
+ bch_keylist_add(&keys, &n->key);
- if (ret) {
- write(r->b);
- break;
- }
+ make_btree_freeing_key(last->b,
+ keys.top);
+ bch_keylist_push(&keys);
+
+ btree_node_free(last->b);
- bkey_copy_key(&b->c->gc_done, r->k);
+ bch_btree_insert_node(b, op, &keys,
+ NULL, NULL);
+ BUG_ON(!bch_keylist_empty(&keys));
- if (!b->written)
- btree_gc_coalesce(b, gc, r);
+ rw_unlock(true, last->b);
+ last->b = n;
+
+ /* Invalidated our iterator */
+ ret = -EINTR;
+ break;
+ }
+ }
- if (r[GC_MERGE_NODES - 1].b)
- write(r[GC_MERGE_NODES - 1].b);
+ if (last->b->level) {
+ ret = btree_gc_recurse(last->b, op, writes, gc);
+ if (ret)
+ break;
+ }
- memmove(&r[1], &r[0],
- sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));
+ bkey_copy_key(&b->c->gc_done, &last->b->key);
+
+ /*
+ * Must flush leaf nodes before gc ends, since replace
+ * operations aren't journalled
+ */
+ if (btree_node_dirty(last->b))
+ bch_btree_node_write(last->b, writes);
+ rw_unlock(true, last->b);
+ }
+
+ memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
+ r->b = NULL;
- /* When we've got incremental GC working, we'll want to do
- * if (should_resched())
- * return -EAGAIN;
- */
- cond_resched();
-#if 0
if (need_resched()) {
ret = -EAGAIN;
break;
}
-#endif
}
- for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
- write(r[i].b);
+ for (i = 0; i < GC_MERGE_NODES; i++)
+ if (!IS_ERR_OR_NULL(r[i].b)) {
+ if (btree_node_dirty(r[i].b))
+ bch_btree_node_write(r[i].b, writes);
+ rw_unlock(true, r[i].b);
+ }
- /* Might have freed some children, must remove their keys */
- if (!b->written)
- bch_btree_sort(b);
+ bch_keylist_free(&keys);
return ret;
}
@@ -1432,27 +1487,31 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
struct closure *writes, struct gc_stat *gc)
{
struct btree *n = NULL;
- unsigned keys = 0;
- int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
+ int ret = 0;
+ bool should_rewrite;
- if (b->level || stale > 10)
+ should_rewrite = btree_gc_mark_node(b, gc);
+ if (should_rewrite) {
n = btree_node_alloc_replacement(b);
- if (!IS_ERR_OR_NULL(n))
- swap(b, n);
-
- if (b->level)
- ret = btree_gc_recurse(b, op, writes, gc);
+ if (!IS_ERR_OR_NULL(n)) {
+ bch_btree_node_write_sync(n);
+ bch_btree_set_root(n);
+ btree_node_free(b);
+ rw_unlock(true, n);
- if (!b->written || btree_node_dirty(b))
- bch_btree_node_write_sync(b);
+ return -EINTR;
+ }
+ }
- if (!IS_ERR_OR_NULL(n)) {
- bch_btree_set_root(b);
- btree_node_free(n);
- rw_unlock(true, b);
+ if (b->level) {
+ ret = btree_gc_recurse(b, op, writes, gc);
+ if (ret)
+ return ret;
}
+ bkey_copy_key(&b->c->gc_done, &b->key);
+
return ret;
}
@@ -1550,29 +1609,20 @@ static void bch_btree_gc(struct cache_set *c)
btree_gc_start(c);
- atomic_inc(&c->prio_blocked);
-
- ret = btree_root(gc_root, c, &op, &writes, &stats);
- closure_sync(&writes);
-
- if (ret) {
- pr_warn("gc failed!");
- return;
- }
+ do {
+ ret = btree_root(gc_root, c, &op, &writes, &stats);
+ closure_sync(&writes);
- /* Possibly wait for new UUIDs or whatever to hit disk */
- bch_journal_meta(c, &writes);
- closure_sync(&writes);
+ if (ret && ret != -EAGAIN)
+ pr_warn("gc failed!");
+ } while (ret);
available = bch_btree_gc_finish(c);
-
- atomic_dec(&c->prio_blocked);
wake_up_allocators(c);
bch_time_stats_update(&c->btree_gc_time, start_time);
stats.key_bytes *= sizeof(uint64_t);
- stats.dirty <<= 9;
stats.data <<= 9;
stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
@@ -1585,14 +1635,28 @@ static void bch_btree_gc(struct cache_set *c)
static int bch_gc_thread(void *arg)
{
struct cache_set *c = arg;
+ struct cache *ca;
+ unsigned i;
while (1) {
+again:
bch_btree_gc(c);
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop())
break;
+ mutex_lock(&c->bucket_lock);
+
+ for_each_cache(ca, c, i)
+ if (ca->invalidate_needs_gc) {
+ mutex_unlock(&c->bucket_lock);
+ set_current_state(TASK_RUNNING);
+ goto again;
+ }
+
+ mutex_unlock(&c->bucket_lock);
+
try_to_freeze();
schedule();
}
@@ -2083,8 +2147,6 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
bch_keylist_init(&split_keys);
- BUG_ON(b->level);
-
do {
BUG_ON(b->level && replace_key);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index e11bb8571d24..b5a46affe8eb 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -201,7 +201,7 @@ static inline bool bkey_written(struct btree *b, struct bkey *k)
static inline void set_gc_sectors(struct cache_set *c)
{
- atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8);
+ atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
}
static inline struct bkey *bch_btree_iter_init(struct btree *b,
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 9687771ec6f3..c5f73e34d016 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -489,7 +489,6 @@ lock_root:
sysfs_print(btree_used_percent, btree_used(c));
sysfs_print(btree_nodes, c->gc_stats.nodes);
- sysfs_hprint(dirty_data, c->gc_stats.dirty);
sysfs_hprint(average_key_size, average_key_size(c));
sysfs_print(cache_read_races,
@@ -642,7 +641,6 @@ static struct attribute *bch_cache_set_files[] = {
&sysfs_cache_available_percent,
&sysfs_average_key_size,
- &sysfs_dirty_data,
&sysfs_errors,
&sysfs_io_error_limit,
OpenPOWER on IntegriCloud