dm cache: significant rework to leverage dm-bio-prison-v2

The cache policy interfaces have been updated to work well with the new bio-prison v2 interface's ability to queue work immediately (promotion, demotion, etc) -- overriding benefit being reduced latency on processing IO through the cache. Previously such work would be left for the DM cache core to queue on various lists and then process in batches later -- this caused a serious delay in latency for IO driven by the cache. The background tracker code was factored out so that all cache policies can make use of it. Also, the "cleaner" policy has been removed and is now a variant of the smq policy that simply disallows migrations. Signed-off-by: Joe Thornber <ejt@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2016-12-15 04:57:31 -05:00 · 2016-12-15 04:57:31 -05:00 · b29d4986d0
parent 742c8fdc31
commit b29d4986d0
10 changed files with 2087 additions and 2564 deletions
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@ -325,14 +325,6 @@ config DM_CACHE_SMQ
         of less memory utilization, improved performance and increased
         adaptability in the face of changing workloads.

-config DM_CACHE_CLEANER
-       tristate "Cleaner Cache Policy (EXPERIMENTAL)"
-       depends on DM_CACHE
-       default y
-       ---help---
-         A simple cache policy that writes back all data to the
-         origin.  Used when decommissioning a dm-cache.
-
 config DM_ERA
       tristate "Era target (EXPERIMENTAL)"
       depends on BLK_DEV_DM
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@ -13,9 +13,9 @@ dm-log-userspace-y \
 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
 dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o
 dm-thin-pool-y	+= dm-thin.o dm-thin-metadata.o
-dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
+dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
+		    dm-cache-background-tracker.o
 dm-cache-smq-y   += dm-cache-policy-smq.o
-dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
 dm-verity-y	+= dm-verity-target.o
 md-mod-y	+= md.o bitmap.o
@ -57,7 +57,6 @@ obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
 obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
 obj-$(CONFIG_DM_CACHE_SMQ)	+= dm-cache-smq.o
-obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
 obj-$(CONFIG_DM_ERA)		+= dm-era.o
 obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o

--- a/drivers/md/dm-cache-background-tracker.c
+++ b/drivers/md/dm-cache-background-tracker.c
@ -0,0 +1,238 @@
+/*
+ * Copyright (C) 2017 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-background-tracker.h"
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "dm-background-tracker"
+
+struct bt_work {
+	struct list_head list;
+	struct rb_node node;
+	struct policy_work work;
+};
+
+struct background_tracker {
+	unsigned max_work;
+	atomic_t pending_promotes;
+	atomic_t pending_writebacks;
+	atomic_t pending_demotes;
+
+	struct list_head issued;
+	struct list_head queued;
+	struct rb_root pending;
+
+	struct kmem_cache *work_cache;
+};
+
+struct background_tracker *btracker_create(unsigned max_work)
+{
+	struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL);
+
+	b->max_work = max_work;
+	atomic_set(&b->pending_promotes, 0);
+	atomic_set(&b->pending_writebacks, 0);
+	atomic_set(&b->pending_demotes, 0);
+
+	INIT_LIST_HEAD(&b->issued);
+	INIT_LIST_HEAD(&b->queued);
+
+	b->pending = RB_ROOT;
+	b->work_cache = KMEM_CACHE(bt_work, 0);
+	if (!b->work_cache) {
+		DMERR("couldn't create mempool for background work items");
+		kfree(b);
+		b = NULL;
+	}
+
+	return b;
+}
+EXPORT_SYMBOL_GPL(btracker_create);
+
+void btracker_destroy(struct background_tracker *b)
+{
+	kmem_cache_destroy(b->work_cache);
+	kfree(b);
+}
+EXPORT_SYMBOL_GPL(btracker_destroy);
+
+static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs)
+{
+	if (from_oblock(lhs) < from_oblock(rhs))
+		return -1;
+
+	if (from_oblock(rhs) < from_oblock(lhs))
+		return 1;
+
+	return 0;
+}
+
+static bool __insert_pending(struct background_tracker *b,
+			     struct bt_work *nw)
+{
+	int cmp;
+	struct bt_work *w;
+	struct rb_node **new = &b->pending.rb_node, *parent = NULL;
+
+	while (*new) {
+		w = container_of(*new, struct bt_work, node);
+
+		parent = *new;
+		cmp = cmp_oblock(w->work.oblock, nw->work.oblock);
+		if (cmp < 0)
+			new = &((*new)->rb_left);
+
+		else if (cmp > 0)
+			new = &((*new)->rb_right);
+
+		else
+			/* already present */
+			return false;
+	}
+
+	rb_link_node(&nw->node, parent, new);
+	rb_insert_color(&nw->node, &b->pending);
+
+	return true;
+}
+
+static struct bt_work *__find_pending(struct background_tracker *b,
+				      dm_oblock_t oblock)
+{
+	int cmp;
+	struct bt_work *w;
+	struct rb_node **new = &b->pending.rb_node;
+
+	while (*new) {
+		w = container_of(*new, struct bt_work, node);
+
+		cmp = cmp_oblock(w->work.oblock, oblock);
+		if (cmp < 0)
+			new = &((*new)->rb_left);
+
+		else if (cmp > 0)
+			new = &((*new)->rb_right);
+
+		else
+			break;
+	}
+
+	return *new ? w : NULL;
+}
+
+
+static void update_stats(struct background_tracker *b, struct policy_work *w, int delta)
+{
+	switch (w->op) {
+	case POLICY_PROMOTE:
+		atomic_add(delta, &b->pending_promotes);
+		break;
+
+	case POLICY_DEMOTE:
+		atomic_add(delta, &b->pending_demotes);
+		break;
+
+	case POLICY_WRITEBACK:
+		atomic_add(delta, &b->pending_writebacks);
+		break;
+	}
+}
+
+unsigned btracker_nr_writebacks_queued(struct background_tracker *b)
+{
+	return atomic_read(&b->pending_writebacks);
+}
+EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued);
+
+unsigned btracker_nr_demotions_queued(struct background_tracker *b)
+{
+	return atomic_read(&b->pending_demotes);
+}
+EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued);
+
+static bool max_work_reached(struct background_tracker *b)
+{
+	// FIXME: finish
+	return false;
+}
+
+int btracker_queue(struct background_tracker *b,
+		   struct policy_work *work,
+		   struct policy_work **pwork)
+{
+	struct bt_work *w;
+
+	if (pwork)
+		*pwork = NULL;
+
+	if (max_work_reached(b))
+		return -ENOMEM;
+
+	w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
+	if (!w)
+		return -ENOMEM;
+
+	memcpy(&w->work, work, sizeof(*work));
+
+	if (!__insert_pending(b, w)) {
+		/*
+		 * There was a race, we'll just ignore this second
+		 * bit of work for the same oblock.
+		 */
+		kmem_cache_free(b->work_cache, w);
+		return -EINVAL;
+	}
+
+	if (pwork) {
+		*pwork = &w->work;
+		list_add(&w->list, &b->issued);
+	} else
+		list_add(&w->list, &b->queued);
+	update_stats(b, &w->work, 1);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(btracker_queue);
+
+/*
+ * Returns -ENODATA if there's no work.
+ */
+int btracker_issue(struct background_tracker *b, struct policy_work **work)
+{
+	struct bt_work *w;
+
+	if (list_empty(&b->queued))
+		return -ENODATA;
+
+	w = list_first_entry(&b->queued, struct bt_work, list);
+	list_move(&w->list, &b->issued);
+	*work = &w->work;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(btracker_issue);
+
+void btracker_complete(struct background_tracker *b,
+		       struct policy_work *op)
+{
+	struct bt_work *w = container_of(op, struct bt_work, work);
+
+	update_stats(b, &w->work, -1);
+	rb_erase(&w->node, &b->pending);
+	list_del(&w->list);
+	kmem_cache_free(b->work_cache, w);
+}
+EXPORT_SYMBOL_GPL(btracker_complete);
+
+bool btracker_promotion_already_present(struct background_tracker *b,
+					dm_oblock_t oblock)
+{
+	return __find_pending(b, oblock) != NULL;
+}
+EXPORT_SYMBOL_GPL(btracker_promotion_already_present);
+
+/*----------------------------------------------------------------*/
--- a/drivers/md/dm-cache-background-tracker.h
+++ b/drivers/md/dm-cache-background-tracker.h
@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2017 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_BACKGROUND_WORK_H
+#define DM_CACHE_BACKGROUND_WORK_H
+
+#include <linux/vmalloc.h>
+#include "dm-cache-policy.h"
+
+/*----------------------------------------------------------------*/
+
+struct background_work;
+struct background_tracker;
+
+/*
+ * FIXME: discuss lack of locking in all methods.
+ */
+struct background_tracker *btracker_create(unsigned max_work);
+void btracker_destroy(struct background_tracker *b);
+
+unsigned btracker_nr_writebacks_queued(struct background_tracker *b);
+unsigned btracker_nr_demotions_queued(struct background_tracker *b);
+
+/*
+ * returns -EINVAL iff the work is already queued.  -ENOMEM if the work
+ * couldn't be queued for another reason.
+ */
+int btracker_queue(struct background_tracker *b,
+		   struct policy_work *work,
+		   struct policy_work **pwork);
+
+/*
+ * Returns -ENODATA if there's no work.
+ */
+int btracker_issue(struct background_tracker *b, struct policy_work **work);
+void btracker_complete(struct background_tracker *b,
+		       struct policy_work *op);
+bool btracker_promotion_already_present(struct background_tracker *b,
+					dm_oblock_t oblock);
+
+/*----------------------------------------------------------------*/
+
+#endif
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@ -50,6 +50,8 @@
 #define DM_CACHE_FEATURE_COMPAT_RO_SUPP	  0UL
 #define DM_CACHE_FEATURE_INCOMPAT_SUPP	  0UL

+struct dm_cache_metadata;
+
 /*
 * Reopens or creates a new, empty metadata volume.  Returns an ERR_PTR on
 * failure.  If reopening then features must match.
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@ -1,469 +0,0 @@
-/*
- * Copyright (C) 2012 Red Hat. All rights reserved.
- *
- * writeback cache policy supporting flushing out dirty cache blocks.
- *
- * This file is released under the GPL.
- */
-
-#include "dm-cache-policy.h"
-#include "dm.h"
-
-#include <linux/hash.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-/*----------------------------------------------------------------*/
-
-#define DM_MSG_PREFIX "cache cleaner"
-
-/* Cache entry struct. */
-struct wb_cache_entry {
-	struct list_head list;
-	struct hlist_node hlist;
-
-	dm_oblock_t oblock;
-	dm_cblock_t cblock;
-	bool dirty:1;
-	bool pending:1;
-};
-
-struct hash {
-	struct hlist_head *table;
-	dm_block_t hash_bits;
-	unsigned nr_buckets;
-};
-
-struct policy {
-	struct dm_cache_policy policy;
-	spinlock_t lock;
-
-	struct list_head free;
-	struct list_head clean;
-	struct list_head clean_pending;
-	struct list_head dirty;
-
-	/*
-	 * We know exactly how many cblocks will be needed,
-	 * so we can allocate them up front.
-	 */
-	dm_cblock_t cache_size, nr_cblocks_allocated;
-	struct wb_cache_entry *cblocks;
-	struct hash chash;
-};
-
-/*----------------------------------------------------------------------------*/
-
-/*
- * Low-level functions.
- */
-static unsigned next_power(unsigned n, unsigned min)
-{
-	return roundup_pow_of_two(max(n, min));
-}
-
-static struct policy *to_policy(struct dm_cache_policy *p)
-{
-	return container_of(p, struct policy, policy);
-}
-
-static struct list_head *list_pop(struct list_head *q)
-{
-	struct list_head *r = q->next;
-
-	list_del(r);
-
-	return r;
-}
-
-/*----------------------------------------------------------------------------*/
-
-/* Allocate/free various resources. */
-static int alloc_hash(struct hash *hash, unsigned elts)
-{
-	hash->nr_buckets = next_power(elts >> 4, 16);
-	hash->hash_bits = __ffs(hash->nr_buckets);
-	hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets);
-
-	return hash->table ? 0 : -ENOMEM;
-}
-
-static void free_hash(struct hash *hash)
-{
-	vfree(hash->table);
-}
-
-static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size)
-{
-	int r = -ENOMEM;
-
-	p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size));
-	if (p->cblocks) {
-		unsigned u = from_cblock(cache_size);
-
-		while (u--)
-			list_add(&p->cblocks[u].list, &p->free);
-
-		p->nr_cblocks_allocated = 0;
-
-		/* Cache entries hash. */
-		r = alloc_hash(&p->chash, from_cblock(cache_size));
-		if (r)
-			vfree(p->cblocks);
-	}
-
-	return r;
-}
-
-static void free_cache_blocks_and_hash(struct policy *p)
-{
-	free_hash(&p->chash);
-	vfree(p->cblocks);
-}
-
-static struct wb_cache_entry *alloc_cache_entry(struct policy *p)
-{
-	struct wb_cache_entry *e;
-
-	BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size));
-
-	e = list_entry(list_pop(&p->free), struct wb_cache_entry, list);
-	p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1);
-
-	return e;
-}
-
-/*----------------------------------------------------------------------------*/
-
-/* Hash functions (lookup, insert, remove). */
-static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock)
-{
-	struct hash *hash = &p->chash;
-	unsigned h = hash_64(from_oblock(oblock), hash->hash_bits);
-	struct wb_cache_entry *cur;
-	struct hlist_head *bucket = &hash->table[h];
-
-	hlist_for_each_entry(cur, bucket, hlist) {
-		if (cur->oblock == oblock) {
-			/* Move upfront bucket for faster access. */
-			hlist_del(&cur->hlist);
-			hlist_add_head(&cur->hlist, bucket);
-			return cur;
-		}
-	}
-
-	return NULL;
-}
-
-static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e)
-{
-	unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits);
-
-	hlist_add_head(&e->hlist, &p->chash.table[h]);
-}
-
-static void remove_cache_hash_entry(struct wb_cache_entry *e)
-{
-	hlist_del(&e->hlist);
-}
-
-/* Public interface (see dm-cache-policy.h */
-static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
-		  bool can_block, bool can_migrate, bool discarded_oblock,
-		  struct bio *bio, struct policy_locker *locker,
-		  struct policy_result *result)
-{
-	struct policy *p = to_policy(pe);
-	struct wb_cache_entry *e;
-	unsigned long flags;
-
-	result->op = POLICY_MISS;
-
-	if (can_block)
-		spin_lock_irqsave(&p->lock, flags);
-
-	else if (!spin_trylock_irqsave(&p->lock, flags))
-		return -EWOULDBLOCK;
-
-	e = lookup_cache_entry(p, oblock);
-	if (e) {
-		result->op = POLICY_HIT;
-		result->cblock = e->cblock;
-
-	}
-
-	spin_unlock_irqrestore(&p->lock, flags);
-
-	return 0;
-}
-
-static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock)
-{
-	int r;
-	struct policy *p = to_policy(pe);
-	struct wb_cache_entry *e;
-	unsigned long flags;
-
-	if (!spin_trylock_irqsave(&p->lock, flags))
-		return -EWOULDBLOCK;
-
-	e = lookup_cache_entry(p, oblock);
-	if (e) {
-		*cblock = e->cblock;
-		r = 0;
-
-	} else
-		r = -ENOENT;
-
-	spin_unlock_irqrestore(&p->lock, flags);
-
-	return r;
-}
-
-static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set)
-{
-	struct policy *p = to_policy(pe);
-	struct wb_cache_entry *e;
-
-	e = lookup_cache_entry(p, oblock);
-	BUG_ON(!e);
-
-	if (set) {
-		if (!e->dirty) {
-			e->dirty = true;
-			list_move(&e->list, &p->dirty);
-		}
-
-	} else {
-		if (e->dirty) {
-			e->pending = false;
-			e->dirty = false;
-			list_move(&e->list, &p->clean);
-		}
-	}
-}
-
-static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
-{
-	struct policy *p = to_policy(pe);
-	unsigned long flags;
-
-	spin_lock_irqsave(&p->lock, flags);
-	__set_clear_dirty(pe, oblock, true);
-	spin_unlock_irqrestore(&p->lock, flags);
-}
-
-static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
-{
-	struct policy *p = to_policy(pe);
-	unsigned long flags;
-
-	spin_lock_irqsave(&p->lock, flags);
-	__set_clear_dirty(pe, oblock, false);
-	spin_unlock_irqrestore(&p->lock, flags);
-}
-
-static void add_cache_entry(struct policy *p, struct wb_cache_entry *e)
-{
-	insert_cache_hash_entry(p, e);
-	if (e->dirty)
-		list_add(&e->list, &p->dirty);
-	else
-		list_add(&e->list, &p->clean);
-}
-
-static int wb_load_mapping(struct dm_cache_policy *pe,
-			   dm_oblock_t oblock, dm_cblock_t cblock,
-			   uint32_t hint, bool hint_valid)
-{
-	int r;
-	struct policy *p = to_policy(pe);
-	struct wb_cache_entry *e = alloc_cache_entry(p);
-
-	if (e) {
-		e->cblock = cblock;
-		e->oblock = oblock;
-		e->dirty = false; /* blocks default to clean */
-		add_cache_entry(p, e);
-		r = 0;
-
-	} else
-		r = -ENOMEM;
-
-	return r;
-}
-
-static void wb_destroy(struct dm_cache_policy *pe)
-{
-	struct policy *p = to_policy(pe);
-
-	free_cache_blocks_and_hash(p);
-	kfree(p);
-}
-
-static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock)
-{
-	struct wb_cache_entry *r = lookup_cache_entry(p, oblock);
-
-	BUG_ON(!r);
-
-	remove_cache_hash_entry(r);
-	list_del(&r->list);
-
-	return r;
-}
-
-static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock)
-{
-	struct policy *p = to_policy(pe);
-	struct wb_cache_entry *e;
-	unsigned long flags;
-
-	spin_lock_irqsave(&p->lock, flags);
-	e = __wb_force_remove_mapping(p, oblock);
-	list_add_tail(&e->list, &p->free);
-	BUG_ON(!from_cblock(p->nr_cblocks_allocated));
-	p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1);
-	spin_unlock_irqrestore(&p->lock, flags);
-}
-
-static void wb_force_mapping(struct dm_cache_policy *pe,
-				dm_oblock_t current_oblock, dm_oblock_t oblock)
-{
-	struct policy *p = to_policy(pe);
-	struct wb_cache_entry *e;
-	unsigned long flags;
-
-	spin_lock_irqsave(&p->lock, flags);
-	e = __wb_force_remove_mapping(p, current_oblock);
-	e->oblock = oblock;
-	add_cache_entry(p, e);
-	spin_unlock_irqrestore(&p->lock, flags);
-}
-
-static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
-{
-	struct list_head *l;
-	struct wb_cache_entry *r;
-
-	if (list_empty(&p->dirty))
-		return NULL;
-
-	l = list_pop(&p->dirty);
-	r = container_of(l, struct wb_cache_entry, list);
-	list_add(l, &p->clean_pending);
-
-	return r;
-}
-
-static int wb_writeback_work(struct dm_cache_policy *pe,
-			     dm_oblock_t *oblock,
-			     dm_cblock_t *cblock,
-			     bool critical_only)
-{
-	int r = -ENOENT;
-	struct policy *p = to_policy(pe);
-	struct wb_cache_entry *e;
-	unsigned long flags;
-
-	spin_lock_irqsave(&p->lock, flags);
-
-	e = get_next_dirty_entry(p);
-	if (e) {
-		*oblock = e->oblock;
-		*cblock = e->cblock;
-		r = 0;
-	}
-
-	spin_unlock_irqrestore(&p->lock, flags);
-
-	return r;
-}
-
-static dm_cblock_t wb_residency(struct dm_cache_policy *pe)
-{
-	return to_policy(pe)->nr_cblocks_allocated;
-}
-
-/* Init the policy plugin interface function pointers. */
-static void init_policy_functions(struct policy *p)
-{
-	p->policy.destroy = wb_destroy;
-	p->policy.map = wb_map;
-	p->policy.lookup = wb_lookup;
-	p->policy.set_dirty = wb_set_dirty;
-	p->policy.clear_dirty = wb_clear_dirty;
-	p->policy.load_mapping = wb_load_mapping;
-	p->policy.get_hint = NULL;
-	p->policy.remove_mapping = wb_remove_mapping;
-	p->policy.writeback_work = wb_writeback_work;
-	p->policy.force_mapping = wb_force_mapping;
-	p->policy.residency = wb_residency;
-	p->policy.tick = NULL;
-}
-
-static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
-					 sector_t origin_size,
-					 sector_t cache_block_size)
-{
-	int r;
-	struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL);
-
-	if (!p)
-		return NULL;
-
-	init_policy_functions(p);
-	INIT_LIST_HEAD(&p->free);
-	INIT_LIST_HEAD(&p->clean);
-	INIT_LIST_HEAD(&p->clean_pending);
-	INIT_LIST_HEAD(&p->dirty);
-
-	p->cache_size = cache_size;
-	spin_lock_init(&p->lock);
-
-	/* Allocate cache entry structs and add them to free list. */
-	r = alloc_cache_blocks_with_hash(p, cache_size);
-	if (!r)
-		return &p->policy;
-
-	kfree(p);
-
-	return NULL;
-}
-/*----------------------------------------------------------------------------*/
-
-static struct dm_cache_policy_type wb_policy_type = {
-	.name = "cleaner",
-	.version = {1, 0, 0},
-	.hint_size = 4,
-	.owner = THIS_MODULE,
-	.create = wb_create
-};
-
-static int __init wb_init(void)
-{
-	int r = dm_cache_policy_register(&wb_policy_type);
-
-	if (r < 0)
-		DMERR("register failed %d", r);
-	else
-		DMINFO("version %u.%u.%u loaded",
-		       wb_policy_type.version[0],
-		       wb_policy_type.version[1],
-		       wb_policy_type.version[2]);
-
-	return r;
-}
-
-static void __exit wb_exit(void)
-{
-	dm_cache_policy_unregister(&wb_policy_type);
-}
-
-module_init(wb_init);
-module_exit(wb_exit);
-
-MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("cleaner cache policy");
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@ -12,40 +12,59 @@

 /*----------------------------------------------------------------*/

-/*
- * Little inline functions that simplify calling the policy methods.
- */
-static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
-			     bool can_block, bool can_migrate, bool discarded_oblock,
-			     struct bio *bio, struct policy_locker *locker,
-			     struct policy_result *result)
+static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
+				int data_dir, bool fast_copy, bool *background_queued)
 {
-	return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result);
+	return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued);
 }

-static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+static inline int policy_lookup_with_work(struct dm_cache_policy *p,
+					  dm_oblock_t oblock, dm_cblock_t *cblock,
+					  int data_dir, bool fast_copy,
+					  struct policy_work **work)
 {
-	BUG_ON(!p->lookup);
-	return p->lookup(p, oblock, cblock);
+	if (!p->lookup_with_work) {
+		*work = NULL;
+		return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL);
+	}
+
+	return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work);
 }

-static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+static inline int policy_get_background_work(struct dm_cache_policy *p,
+					     bool idle, struct policy_work **result)
 {
-	if (p->set_dirty)
-		p->set_dirty(p, oblock);
+	return p->get_background_work(p, idle, result);
 }

-static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+static inline void policy_complete_background_work(struct dm_cache_policy *p,
+						   struct policy_work *work,
+						   bool success)
 {
-	if (p->clear_dirty)
-		p->clear_dirty(p, oblock);
+	return p->complete_background_work(p, work, success);
+}
+
+static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+	p->set_dirty(p, cblock);
+}
+
+static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+	p->clear_dirty(p, cblock);
 }

 static inline int policy_load_mapping(struct dm_cache_policy *p,
 				      dm_oblock_t oblock, dm_cblock_t cblock,
-				      uint32_t hint, bool hint_valid)
+				      bool dirty, uint32_t hint, bool hint_valid)
 {
-	return p->load_mapping(p, oblock, cblock, hint, hint_valid);
+	return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid);
+}
+
+static inline int policy_invalidate_mapping(struct dm_cache_policy *p,
+					    dm_cblock_t cblock)
+{
+	return p->invalidate_mapping(p, cblock);
 }

 static inline uint32_t policy_get_hint(struct dm_cache_policy *p,
@ -54,30 +73,6 @@ static inline uint32_t policy_get_hint(struct dm_cache_policy *p,
 	return p->get_hint ? p->get_hint(p, cblock) : 0;
 }

-static inline int policy_writeback_work(struct dm_cache_policy *p,
-					dm_oblock_t *oblock,
-					dm_cblock_t *cblock,
-					bool critical_only)
-{
-	return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT;
-}
-
-static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
-{
-	p->remove_mapping(p, oblock);
-}
-
-static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
-{
-	return p->remove_cblock(p, cblock);
-}
-
-static inline void policy_force_mapping(struct dm_cache_policy *p,
-					dm_oblock_t current_oblock, dm_oblock_t new_oblock)
-{
-	return p->force_mapping(p, current_oblock, new_oblock);
-}
-
 static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
 {
 	return p->residency(p);
@ -107,6 +102,11 @@ static inline int policy_set_config_value(struct dm_cache_policy *p,
 	return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL;
 }

+static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow)
+{
+	return p->allow_migrations(p, allow);
+}
+
 /*----------------------------------------------------------------*/

 /*
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@ -13,147 +13,94 @@

 /*----------------------------------------------------------------*/

-/* FIXME: make it clear which methods are optional.  Get debug policy to
- * double check this at start.
- */
-
 /*
 * The cache policy makes the important decisions about which blocks get to
 * live on the faster cache device.
- *
- * When the core target has to remap a bio it calls the 'map' method of the
- * policy.  This returns an instruction telling the core target what to do.
- *
- * POLICY_HIT:
- *   That block is in the cache.  Remap to the cache and carry on.
- *
- * POLICY_MISS:
- *   This block is on the origin device.  Remap and carry on.
- *
- * POLICY_NEW:
- *   This block is currently on the origin device, but the policy wants to
- *   move it.  The core should:
- *
- *   - hold any further io to this origin block
- *   - copy the origin to the given cache block
- *   - release all the held blocks
- *   - remap the original block to the cache
- *
- * POLICY_REPLACE:
- *   This block is currently on the origin device.  The policy wants to
- *   move it to the cache, with the added complication that the destination
- *   cache block needs a writeback first.  The core should:
- *
- *   - hold any further io to this origin block
- *   - hold any further io to the origin block that's being written back
- *   - writeback
- *   - copy new block to cache
- *   - release held blocks
- *   - remap bio to cache and reissue.
- *
- * Should the core run into trouble while processing a POLICY_NEW or
- * POLICY_REPLACE instruction it will roll back the policies mapping using
- * remove_mapping() or force_mapping().  These methods must not fail.  This
- * approach avoids having transactional semantics in the policy (ie, the
- * core informing the policy when a migration is complete), and hence makes
- * it easier to write new policies.
- *
- * In general policy methods should never block, except in the case of the
- * map function when can_migrate is set.  So be careful to implement using
- * bounded, preallocated memory.
 */
 enum policy_operation {
-	POLICY_HIT,
-	POLICY_MISS,
-	POLICY_NEW,
-	POLICY_REPLACE
-};
-
-/*
- * When issuing a POLICY_REPLACE the policy needs to make a callback to
- * lock the block being demoted.  This doesn't need to occur during a
- * writeback operation since the block remains in the cache.
- */
-struct policy_locker;
-typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock);
-
-struct policy_locker {
-	policy_lock_fn fn;
+	POLICY_PROMOTE,
+	POLICY_DEMOTE,
+	POLICY_WRITEBACK
 };

 /*
 * This is the instruction passed back to the core target.
 */
-struct policy_result {
+struct policy_work {
 	enum policy_operation op;
-	dm_oblock_t old_oblock;	/* POLICY_REPLACE */
-	dm_cblock_t cblock;	/* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */
+	dm_oblock_t oblock;
+	dm_cblock_t cblock;
 };

 /*
- * The cache policy object.  Just a bunch of methods.  It is envisaged that
- * this structure will be embedded in a bigger, policy specific structure
- * (ie. use container_of()).
+ * The cache policy object.  It is envisaged that this structure will be
+ * embedded in a bigger, policy specific structure (ie. use container_of()).
 */
 struct dm_cache_policy {
-
-	/*
-	 * FIXME: make it clear which methods are optional, and which may
-	 * block.
-	 */
-
 	/*
 	 * Destroys this object.
 	 */
 	void (*destroy)(struct dm_cache_policy *p);

 	/*
-	 * See large comment above.
-	 *
-	 * oblock      - the origin block we're interested in.
-	 *
-	 * can_block - indicates whether the current thread is allowed to
-	 *             block.  -EWOULDBLOCK returned if it can't and would.
-	 *
-	 * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE
-	 *               instructions.  If denied and the policy would have
-	 *               returned one of these instructions it should
-	 *               return -EWOULDBLOCK.
-	 *
-	 * discarded_oblock - indicates whether the whole origin block is
-	 *               in a discarded state (FIXME: better to tell the
-	 *               policy about this sooner, so it can recycle that
-	 *               cache block if it wants.)
-	 * bio         - the bio that triggered this call.
-	 * result      - gets filled in with the instruction.
-	 *
-	 * May only return 0, or -EWOULDBLOCK (if !can_migrate)
-	 */
-	int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
-		   bool can_block, bool can_migrate, bool discarded_oblock,
-		   struct bio *bio, struct policy_locker *locker,
-		   struct policy_result *result);
-
-	/*
-	 * Sometimes we want to see if a block is in the cache, without
-	 * triggering any update of stats.  (ie. it's not a real hit).
+	 * Find the location of a block.
 	 *
 	 * Must not block.
 	 *
-	 * Returns 0 if in cache, -ENOENT if not, < 0 for other errors
-	 * (-EWOULDBLOCK would be typical).
+	 * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for
+	 * other errors (-EWOULDBLOCK would be typical).  data_dir should be
+	 * READ or WRITE. fast_copy should be set if migrating this block would
+	 * be 'cheap' somehow (eg, discarded data). background_queued will be set
+	 * if a migration has just been queued.
 	 */
-	int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
+	int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
+		      int data_dir, bool fast_copy, bool *background_queued);

-	void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
-	void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
+	/*
+	 * Sometimes the core target can optimise a migration, eg, the
+	 * block may be discarded, or the bio may cover an entire block.
+	 * In order to optimise it needs the migration immediately though
+	 * so it knows to do something different with the bio.
+	 *
+	 * This method is optional (policy-internal will fallback to using
+	 * lookup).
+	 */
+	int (*lookup_with_work)(struct dm_cache_policy *p,
+				dm_oblock_t oblock, dm_cblock_t *cblock,
+				int data_dir, bool fast_copy,
+				struct policy_work **work);
+
+	/*
+	 * Retrieves background work.  Returns -ENODATA when there's no
+	 * background work.
+	 */
+	int (*get_background_work)(struct dm_cache_policy *p, bool idle,
+			           struct policy_work **result);
+
+	/*
+	 * You must pass in the same work pointer that you were given, not
+	 * a copy.
+	 */
+	void (*complete_background_work)(struct dm_cache_policy *p,
+					 struct policy_work *work,
+					 bool success);
+
+	void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock);
+	void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock);

 	/*
 	 * Called when a cache target is first created.  Used to load a
 	 * mapping from the metadata device into the policy.
 	 */
 	int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
-			    dm_cblock_t cblock, uint32_t hint, bool hint_valid);
+			    dm_cblock_t cblock, bool dirty,
+			    uint32_t hint, bool hint_valid);
+
+	/*
+	 * Drops the mapping, irrespective of whether it's clean or dirty.
+	 * Returns -ENODATA if cblock is not mapped.
+	 */
+	int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock);

 	/*
 	 * Gets the hint for a given cblock.  Called in a single threaded
@ -161,36 +108,6 @@ struct dm_cache_policy {
 	 */
 	uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock);

-	/*
-	 * Override functions used on the error paths of the core target.
-	 * They must succeed.
-	 */
-	void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock);
-	void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
-			      dm_oblock_t new_oblock);
-
-	/*
-	 * This is called via the invalidate_cblocks message.  It is
-	 * possible the particular cblock has already been removed due to a
-	 * write io in passthrough mode.  In which case this should return
-	 * -ENODATA.
-	 */
-	int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
-
-	/*
-	 * Provide a dirty block to be written back by the core target.  If
-	 * critical_only is set then the policy should only provide work if
-	 * it urgently needs it.
-	 *
-	 * Returns:
-	 *
-	 * 0 and @cblock,@oblock: block to write back provided
-	 *
-	 * -ENODATA: no dirty blocks available
-	 */
-	int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock,
-			      bool critical_only);
-
 	/*
 	 * How full is the cache?
 	 */
@ -202,6 +119,8 @@ struct dm_cache_policy {
 	 * queue merging has occurred).  To stop the policy being fooled by
 	 * these, the core target sends regular tick() calls to the policy.
 	 * The policy should only count an entry as hit once per tick.
+	 *
+	 * This method is optional.
 	 */
 	void (*tick)(struct dm_cache_policy *p, bool can_block);

@ -213,6 +132,8 @@ struct dm_cache_policy {
 	int (*set_config_value)(struct dm_cache_policy *p,
 				const char *key, const char *value);

+	void (*allow_migrations)(struct dm_cache_policy *p, bool allow);
+
 	/*
 	 * Book keeping ptr for the policy register, not for general use.
 	 */
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c