dm cache policy mq: implement writeback_work() and mq_{set,clear}_dirty()

There are now two multiqueues for in-cache blocks: a clean one and a dirty one. writeback_work comes from the dirty one. Demotions come from the clean one.

There are two benefits:
- Performance improvement, since demoting a clean block is a no-op.
- The cache cleans itself when I/O load is light.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
author: Joe Thornber <ejt@redhat.com> 2013-10-24 14:10:28 -0400
committer: Mike Snitzer <snitzer@redhat.com> 2013-11-09 18:20:25 -0500
commit: 01911c19bea63b1a958b9d9024504c2e9079f155 (patch)
tree: 613f9334c9d85702ff89168f587d4d90d24378b9 /drivers/md
parent: ffcbcb6720ab6a4bb6e0a51b3711e8c60872d281 (diff)
download: talos-op-linux-01911c19bea63b1a958b9d9024504c2e9079f155.tar.gz
talos-op-linux-01911c19bea63b1a958b9d9024504c2e9079f155.zip
1 files changed, 128 insertions, 19 deletions
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index a9a25de5b011..6710e038c730 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -224,6 +224,7 @@ struct entry {
 	 * FIXME: pack these better
 	 */
 	bool in_cache:1;
+	bool dirty:1;
 	unsigned hit_count;
 	unsigned generation;
 	unsigned tick;
@@ -238,13 +239,15 @@ struct mq_policy {
 	struct io_tracker tracker;
 
 	/*
-	 * We maintain two queues of entries.  The cache proper contains
-	 * the currently active mappings.  Whereas the pre_cache tracks
-	 * blocks that are being hit frequently and potential candidates
-	 * for promotion to the cache.
+	 * We maintain three queues of entries.  The cache proper,
+	 * consisting of a clean and dirty queue, contains the currently
+	 * active mappings.  Whereas the pre_cache tracks blocks that
+	 * are being hit frequently and potential candidates for promotion
+	 * to the cache.
 	 */
 	struct queue pre_cache;
-	struct queue cache;
+	struct queue cache_clean;
+	struct queue cache_dirty;
 
 	/*
 	 * Keeps track of time, incremented by the core.  We use this to
@@ -324,7 +327,8 @@ static void free_entries(struct mq_policy *mq)
 	struct entry *e, *tmp;
 
 	concat_queue(&mq->free, &mq->pre_cache);
-	concat_queue(&mq->free, &mq->cache);
+	concat_queue(&mq->free, &mq->cache_clean);
+	concat_queue(&mq->free, &mq->cache_dirty);
 
 	list_for_each_entry_safe(e, tmp, &mq->free, list)
 		kmem_cache_free(mq_entry_cache, e);
@@ -508,7 +512,8 @@ static void push(struct mq_policy *mq, struct entry *e)
 
 	if (e->in_cache) {
 		alloc_cblock(mq, e->cblock);
-		queue_push(&mq->cache, queue_level(e), &e->list);
+		queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
+			   queue_level(e), &e->list);
 	} else
 		queue_push(&mq->pre_cache, queue_level(e), &e->list);
 }
@@ -558,7 +563,8 @@ static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
  * of the entries.
  *
  * At the moment the threshold is taken by averaging the hit counts of some
- * of the entries in the cache (the first 20 entries of the first level).
+ * of the entries in the cache (the first 20 entries across all levels in
+ * ascending order, giving preference to the clean entries at each level).
  *
  * We can be much cleverer than this though.  For example, each promotion
  * could bump up the threshold helping to prevent churn.  Much more to do
@@ -580,7 +586,16 @@ static void check_generation(struct mq_policy *mq)
 		mq->generation++;
 
 		for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
-			head = mq->cache.qs + level;
+			head = mq->cache_clean.qs + level;
+			list_for_each_entry(e, head, list) {
+				nr++;
+				total += e->hit_count;
+
+				if (++count >= MAX_TO_AVERAGE)
+					break;
+			}
+
+			head = mq->cache_dirty.qs + level;
 			list_for_each_entry(e, head, list) {
 				nr++;
 				total += e->hit_count;
@@ -633,19 +648,28 @@ static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
  * - set the hit count to a hard coded value other than 1, eg, is it better
  *   if it goes in at level 2?
  */
-static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
+static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock, dm_cblock_t *cblock)
 {
-	dm_cblock_t result;
-	struct entry *demoted = pop(mq, &mq->cache);
+	struct entry *demoted = pop(mq, &mq->cache_clean);
 
-	BUG_ON(!demoted);
-	result = demoted->cblock;
+	if (!demoted)
+		/*
+		 * We could get a block from mq->cache_dirty, but that
+		 * would add extra latency to the triggering bio as it
+		 * waits for the writeback.  Better to not promote this
+		 * time and hope there's a clean block next time this block
+		 * is hit.
+		 */
+		return -ENOSPC;
+
+	*cblock = demoted->cblock;
 	*oblock = demoted->oblock;
 	demoted->in_cache = false;
+	demoted->dirty = false;
 	demoted->hit_count = 1;
 	push(mq, demoted);
 
-	return result;
+	return 0;
 }
 
 /*
@@ -705,11 +729,16 @@ static int cache_entry_found(struct mq_policy *mq,
 static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 			      struct policy_result *result)
 {
+	int r;
 	dm_cblock_t cblock;
 
 	if (find_free_cblock(mq, &cblock) == -ENOSPC) {
 		result->op = POLICY_REPLACE;
-		cblock = demote_cblock(mq, &result->old_oblock);
+		r = demote_cblock(mq, &result->old_oblock, &cblock);
+		if (r) {
+			result->op = POLICY_MISS;
+			return 0;
+		}
 	} else
 		result->op = POLICY_NEW;
 
@@ -717,6 +746,7 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 
 	del(mq, e);
 	e->in_cache = true;
+	e->dirty = false;
 	push(mq, e);
 
 	return 0;
@@ -760,6 +790,7 @@ static void insert_in_pre_cache(struct mq_policy *mq,
 	}
 
 	e->in_cache = false;
+	e->dirty = false;
 	e->oblock = oblock;
 	e->hit_count = 1;
 	e->generation = mq->generation;
@@ -787,6 +818,7 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 	e->oblock = oblock;
 	e->cblock = cblock;
 	e->in_cache = true;
+	e->dirty = false;
 	e->hit_count = 1;
 	e->generation = mq->generation;
 	push(mq, e);
@@ -917,6 +949,40 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t
 	return r;
 }
 
+/*
+ * FIXME: __mq_set_clear_dirty can block due to mutex.
+ * Ideally a policy should not block in functions called
+ * from the map() function.  Explore using RCU.
+ */
+static void __mq_set_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock, bool set)
+{
+	struct mq_policy *mq = to_mq_policy(p);
+	struct entry *e;
+
+	mutex_lock(&mq->lock);
+	e = hash_lookup(mq, oblock);
+	if (!e)
+		DMWARN("__mq_set_clear_dirty called for a block that isn't in the cache");
+	else {
+		BUG_ON(!e->in_cache);
+
+		del(mq, e);
+		e->dirty = set;
+		push(mq, e);
+	}
+	mutex_unlock(&mq->lock);
+}
+
+static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	__mq_set_clear_dirty(p, oblock, true);
+}
+
+static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	__mq_set_clear_dirty(p, oblock, false);
+}
+
 static int mq_load_mapping(struct dm_cache_policy *p,
 			   dm_oblock_t oblock, dm_cblock_t cblock,
 			   uint32_t hint, bool hint_valid)
@@ -931,6 +997,7 @@ static int mq_load_mapping(struct dm_cache_policy *p,
 	e->cblock = cblock;
 	e->oblock = oblock;
 	e->in_cache = true;
+	e->dirty = false;	/* this gets corrected in a minute */
 	e->hit_count = hint_valid ? hint : 1;
 	e->generation = mq->generation;
 	push(mq, e);
@@ -949,7 +1016,14 @@ static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
 	mutex_lock(&mq->lock);
 
 	for (level = 0; level < NR_QUEUE_LEVELS; level++)
-		list_for_each_entry(e, &mq->cache.qs[level], list) {
+		list_for_each_entry(e, &mq->cache_clean.qs[level], list) {
+			r = fn(context, e->cblock, e->oblock, e->hit_count);
+			if (r)
+				goto out;
+		}
+
+	for (level = 0; level < NR_QUEUE_LEVELS; level++)
+		list_for_each_entry(e, &mq->cache_dirty.qs[level], list) {
 			r = fn(context, e->cblock, e->oblock, e->hit_count);
 			if (r)
 				goto out;
@@ -974,11 +1048,41 @@ static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
 
 	del(mq, e);
 	e->in_cache = false;
+	e->dirty = false;
 	push(mq, e);
 
 	mutex_unlock(&mq->lock);
 }
 
+static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
+			      dm_cblock_t *cblock)
+{
+	struct entry *e = pop(mq, &mq->cache_dirty);
+
+	if (!e)
+		return -ENODATA;
+
+	*oblock = e->oblock;
+	*cblock = e->cblock;
+	e->dirty = false;
+	push(mq, e);
+
+	return 0;
+}
+
+static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
+			     dm_cblock_t *cblock)
+{
+	int r;
+	struct mq_policy *mq = to_mq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = __mq_writeback_work(mq, oblock, cblock);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
 static void force_mapping(struct mq_policy *mq,
 			  dm_oblock_t current_oblock, dm_oblock_t new_oblock)
 {
@@ -988,6 +1092,7 @@ static void force_mapping(struct mq_policy *mq,
 
 	del(mq, e);
 	e->oblock = new_oblock;
+	e->dirty = true;
 	push(mq, e);
 }
 
@@ -1063,10 +1168,12 @@ static void init_policy_functions(struct mq_policy *mq)
 	mq->policy.destroy = mq_destroy;
 	mq->policy.map = mq_map;
 	mq->policy.lookup = mq_lookup;
+	mq->policy.set_dirty = mq_set_dirty;
+	mq->policy.clear_dirty = mq_clear_dirty;
 	mq->policy.load_mapping = mq_load_mapping;
 	mq->policy.walk_mappings = mq_walk_mappings;
 	mq->policy.remove_mapping = mq_remove_mapping;
-	mq->policy.writeback_work = NULL;
+	mq->policy.writeback_work = mq_writeback_work;
 	mq->policy.force_mapping = mq_force_mapping;
 	mq->policy.residency = mq_residency;
 	mq->policy.tick = mq_tick;
@@ -1099,7 +1206,9 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
 	mq->find_free_last_word = 0;
 
 	queue_init(&mq->pre_cache);
-	queue_init(&mq->cache);
+	queue_init(&mq->cache_clean);
+	queue_init(&mq->cache_dirty);
+
 	mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
 
 	mq->nr_entries = 2 * from_cblock(cache_size);
author	Joe Thornber <ejt@redhat.com>	2013-10-24 14:10:28 -0400
committer	Mike Snitzer <snitzer@redhat.com>	2013-11-09 18:20:25 -0500
commit	01911c19bea63b1a958b9d9024504c2e9079f155 (patch)
tree	613f9334c9d85702ff89168f587d4d90d24378b9 /drivers/md
parent	ffcbcb6720ab6a4bb6e0a51b3711e8c60872d281 (diff)
download	talos-op-linux-01911c19bea63b1a958b9d9024504c2e9079f155.tar.gz talos-op-linux-01911c19bea63b1a958b9d9024504c2e9079f155.zip