Blame - src/kernel/linux/v4.14/drivers/md/dm-cache-target.c - T103

blob: 280873b13e744126f922602de92dbde6f3870379 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2012 Red Hat. All rights reserved.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include "dm.h"
				8	#include "dm-bio-prison-v2.h"
				9	#include "dm-bio-record.h"
				10	#include "dm-cache-metadata.h"
				11
				12	#include <linux/dm-io.h>
				13	#include <linux/dm-kcopyd.h>
				14	#include <linux/jiffies.h>
				15	#include <linux/init.h>
				16	#include <linux/mempool.h>
				17	#include <linux/module.h>
				18	#include <linux/rwsem.h>
				19	#include <linux/slab.h>
				20	#include <linux/vmalloc.h>
				21
				22	#define DM_MSG_PREFIX "cache"
				23
				24	DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
				25	"A percentage of time allocated for copying to and/or from cache");
				26
				27	/*----------------------------------------------------------------*/
				28
				29	/*
				30	* Glossary:
				31	*
				32	* oblock: index of an origin block
				33	* cblock: index of a cache block
				34	* promotion: movement of a block from origin to cache
				35	* demotion: movement of a block from cache to origin
				36	* migration: movement of a block between the origin and cache device,
				37	* either direction
				38	*/
				39
				40	/*----------------------------------------------------------------*/
				41
				42	struct io_tracker {
				43	spinlock_t lock;
				44
				45	/*
				46	* Sectors of in-flight IO.
				47	*/
				48	sector_t in_flight;
				49
				50	/*
				51	* The time, in jiffies, when this device became idle (if it is
				52	* indeed idle).
				53	*/
				54	unsigned long idle_time;
				55	unsigned long last_update_time;
				56	};
				57
				58	static void iot_init(struct io_tracker *iot)
				59	{
				60	spin_lock_init(&iot->lock);
				61	iot->in_flight = 0ul;
				62	iot->idle_time = 0ul;
				63	iot->last_update_time = jiffies;
				64	}
				65
				66	static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
				67	{
				68	if (iot->in_flight)
				69	return false;
				70
				71	return time_after(jiffies, iot->idle_time + jifs);
				72	}
				73
				74	static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
				75	{
				76	bool r;
				77	unsigned long flags;
				78
				79	spin_lock_irqsave(&iot->lock, flags);
				80	r = __iot_idle_for(iot, jifs);
				81	spin_unlock_irqrestore(&iot->lock, flags);
				82
				83	return r;
				84	}
				85
				86	static void iot_io_begin(struct io_tracker *iot, sector_t len)
				87	{
				88	unsigned long flags;
				89
				90	spin_lock_irqsave(&iot->lock, flags);
				91	iot->in_flight += len;
				92	spin_unlock_irqrestore(&iot->lock, flags);
				93	}
				94
				95	static void __iot_io_end(struct io_tracker *iot, sector_t len)
				96	{
				97	if (!len)
				98	return;
				99
				100	iot->in_flight -= len;
				101	if (!iot->in_flight)
				102	iot->idle_time = jiffies;
				103	}
				104
				105	static void iot_io_end(struct io_tracker *iot, sector_t len)
				106	{
				107	unsigned long flags;
				108
				109	spin_lock_irqsave(&iot->lock, flags);
				110	__iot_io_end(iot, len);
				111	spin_unlock_irqrestore(&iot->lock, flags);
				112	}
				113
				114	/*----------------------------------------------------------------*/
				115
				116	/*
				117	* Represents a chunk of future work. 'input' allows continuations to pass
				118	* values between themselves, typically error values.
				119	*/
				120	struct continuation {
				121	struct work_struct ws;
				122	blk_status_t input;
				123	};
				124
				125	static inline void init_continuation(struct continuation *k,
				126	void (fn)(struct work_struct ))
				127	{
				128	INIT_WORK(&k->ws, fn);
				129	k->input = 0;
				130	}
				131
				132	static inline void queue_continuation(struct workqueue_struct *wq,
				133	struct continuation *k)
				134	{
				135	queue_work(wq, &k->ws);
				136	}
				137
				138	/*----------------------------------------------------------------*/
				139
				140	/*
				141	* The batcher collects together pieces of work that need a particular
				142	* operation to occur before they can proceed (typically a commit).
				143	*/
				144	struct batcher {
				145	/*
				146	* The operation that everyone is waiting for.
				147	*/
				148	blk_status_t (commit_op)(void context);
				149	void *commit_context;
				150
				151	/*
				152	* This is how bios should be issued once the commit op is complete
				153	* (accounted_request).
				154	*/
				155	void (issue_op)(struct bio bio, void *context);
				156	void *issue_context;
				157
				158	/*
				159	* Queued work gets put on here after commit.
				160	*/
				161	struct workqueue_struct *wq;
				162
				163	spinlock_t lock;
				164	struct list_head work_items;
				165	struct bio_list bios;
				166	struct work_struct commit_work;
				167
				168	bool commit_scheduled;
				169	};
				170
				171	static void __commit(struct work_struct *_ws)
				172	{
				173	struct batcher *b = container_of(_ws, struct batcher, commit_work);
				174	blk_status_t r;
				175	unsigned long flags;
				176	struct list_head work_items;
				177	struct work_struct ws, tmp;
				178	struct continuation *k;
				179	struct bio *bio;
				180	struct bio_list bios;
				181
				182	INIT_LIST_HEAD(&work_items);
				183	bio_list_init(&bios);
				184
				185	/*
				186	* We have to grab these before the commit_op to avoid a race
				187	* condition.
				188	*/
				189	spin_lock_irqsave(&b->lock, flags);
				190	list_splice_init(&b->work_items, &work_items);
				191	bio_list_merge(&bios, &b->bios);
				192	bio_list_init(&b->bios);
				193	b->commit_scheduled = false;
				194	spin_unlock_irqrestore(&b->lock, flags);
				195
				196	r = b->commit_op(b->commit_context);
				197
				198	list_for_each_entry_safe(ws, tmp, &work_items, entry) {
				199	k = container_of(ws, struct continuation, ws);
				200	k->input = r;
				201	INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
				202	queue_work(b->wq, ws);
				203	}
				204
				205	while ((bio = bio_list_pop(&bios))) {
				206	if (r) {
				207	bio->bi_status = r;
				208	bio_endio(bio);
				209	} else
				210	b->issue_op(bio, b->issue_context);
				211	}
				212	}
				213
				214	static void batcher_init(struct batcher *b,
				215	blk_status_t (commit_op)(void ),
				216	void *commit_context,
				217	void (issue_op)(struct bio bio, void *),
				218	void *issue_context,
				219	struct workqueue_struct *wq)
				220	{
				221	b->commit_op = commit_op;
				222	b->commit_context = commit_context;
				223	b->issue_op = issue_op;
				224	b->issue_context = issue_context;
				225	b->wq = wq;
				226
				227	spin_lock_init(&b->lock);
				228	INIT_LIST_HEAD(&b->work_items);
				229	bio_list_init(&b->bios);
				230	INIT_WORK(&b->commit_work, __commit);
				231	b->commit_scheduled = false;
				232	}
				233
				234	static void async_commit(struct batcher *b)
				235	{
				236	queue_work(b->wq, &b->commit_work);
				237	}
				238
				239	static void continue_after_commit(struct batcher b, struct continuation k)
				240	{
				241	unsigned long flags;
				242	bool commit_scheduled;
				243
				244	spin_lock_irqsave(&b->lock, flags);
				245	commit_scheduled = b->commit_scheduled;
				246	list_add_tail(&k->ws.entry, &b->work_items);
				247	spin_unlock_irqrestore(&b->lock, flags);
				248
				249	if (commit_scheduled)
				250	async_commit(b);
				251	}
				252
				253	/*
				254	* Bios are errored if commit failed.
				255	*/
				256	static void issue_after_commit(struct batcher b, struct bio bio)
				257	{
				258	unsigned long flags;
				259	bool commit_scheduled;
				260
				261	spin_lock_irqsave(&b->lock, flags);
				262	commit_scheduled = b->commit_scheduled;
				263	bio_list_add(&b->bios, bio);
				264	spin_unlock_irqrestore(&b->lock, flags);
				265
				266	if (commit_scheduled)
				267	async_commit(b);
				268	}
				269
				270	/*
				271	* Call this if some urgent work is waiting for the commit to complete.
				272	*/
				273	static void schedule_commit(struct batcher *b)
				274	{
				275	bool immediate;
				276	unsigned long flags;
				277
				278	spin_lock_irqsave(&b->lock, flags);
				279	immediate = !list_empty(&b->work_items) \|\| !bio_list_empty(&b->bios);
				280	b->commit_scheduled = true;
				281	spin_unlock_irqrestore(&b->lock, flags);
				282
				283	if (immediate)
				284	async_commit(b);
				285	}
				286
				287	/*
				288	* There are a couple of places where we let a bio run, but want to do some
				289	* work before calling its endio function. We do this by temporarily
				290	* changing the endio fn.
				291	*/
				292	struct dm_hook_info {
				293	bio_end_io_t *bi_end_io;
				294	};
				295
				296	static void dm_hook_bio(struct dm_hook_info h, struct bio bio,
				297	bio_end_io_t bi_end_io, void bi_private)
				298	{
				299	h->bi_end_io = bio->bi_end_io;
				300
				301	bio->bi_end_io = bi_end_io;
				302	bio->bi_private = bi_private;
				303	}
				304
				305	static void dm_unhook_bio(struct dm_hook_info h, struct bio bio)
				306	{
				307	bio->bi_end_io = h->bi_end_io;
				308	}
				309
				310	/*----------------------------------------------------------------*/
				311
				#define MIGRATION_POOL_SIZE 128
				#define COMMIT_PERIOD HZ
				#define MIGRATION_COUNT_WINDOW 10

				/*
				 * Limits, in sectors, on the block size of the device holding cache
				 * data: at least 32KB and at most 1GB.
				 */
				#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
				#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
				322
				/*
				 * Metadata access modes.  The declaration order matters: code in this
				 * file compares modes with >= CM_READ_ONLY.
				 */
				enum cache_metadata_mode {
					CM_WRITE,	/* metadata may be changed */
					CM_READ_ONLY,	/* metadata may not be changed */
					CM_FAIL
				};
				328
				enum cache_io_mode {
					/*
					 * Writes go to cached blocks only, and those blocks are marked
					 * dirty.  Losing the cache device means losing data.
					 * Potential performance increase for both reads and writes.
					 */
					CM_IO_WRITEBACK,

					/*
					 * Writes go to both cache and origin, so blocks are never
					 * dirty.  Potential performance benefit for reads only.
					 */
					CM_IO_WRITETHROUGH,

					/*
					 * A degraded mode useful for various cache coherency
					 * situations (eg, rolling back snapshots).  Reads and writes
					 * always go to the origin.  A write to a cached oblock
					 * invalidates the cache block.
					 */
					CM_IO_PASSTHROUGH
				};
				351
				352	struct cache_features {
				353	enum cache_metadata_mode mode;
				354	enum cache_io_mode io_mode;
				355	unsigned metadata_version;
				356	};
				357
				358	struct cache_stats {
				359	atomic_t read_hit;
				360	atomic_t read_miss;
				361	atomic_t write_hit;
				362	atomic_t write_miss;
				363	atomic_t demotion;
				364	atomic_t promotion;
				365	atomic_t writeback;
				366	atomic_t copies_avoided;
				367	atomic_t cache_cell_clash;
				368	atomic_t commit_count;
				369	atomic_t discard_count;
				370	};
				371
				372	struct cache {
				373	struct dm_target *ti;
				374	struct dm_target_callbacks callbacks;
				375
				376	struct dm_cache_metadata *cmd;
				377
				378	/*
				379	* Metadata is written to this device.
				380	*/
				381	struct dm_dev *metadata_dev;
				382
				383	/*
				384	* The slower of the two data devices. Typically a spindle.
				385	*/
				386	struct dm_dev *origin_dev;
				387
				388	/*
				389	* The faster of the two data devices. Typically an SSD.
				390	*/
				391	struct dm_dev *cache_dev;
				392
				393	/*
				394	* Size of the origin device in _complete_ blocks and native sectors.
				395	*/
				396	dm_oblock_t origin_blocks;
				397	sector_t origin_sectors;
				398
				399	/*
				400	* Size of the cache device in blocks.
				401	*/
				402	dm_cblock_t cache_size;
				403
				404	/*
				405	* Fields for converting from sectors to blocks.
				406	*/
				407	sector_t sectors_per_block;
				408	int sectors_per_block_shift;
				409
				410	spinlock_t lock;
				411	struct list_head deferred_cells;
				412	struct bio_list deferred_bios;
				413	sector_t migration_threshold;
				414	wait_queue_head_t migration_wait;
				415	atomic_t nr_allocated_migrations;
				416
				417	/*
				418	* The number of in flight migrations that are performing
				419	* background io. eg, promotion, writeback.
				420	*/
				421	atomic_t nr_io_migrations;
				422
				423	struct rw_semaphore quiesce_lock;
				424
				425	/*
				426	* cache_size entries, dirty if set
				427	*/
				428	atomic_t nr_dirty;
				429	unsigned long *dirty_bitset;
				430
				431	/*
				432	* origin_blocks entries, discarded if set.
				433	*/
				434	dm_dblock_t discard_nr_blocks;
				435	unsigned long *discard_bitset;
				436	uint32_t discard_block_size; /* a power of 2 times sectors per block */
				437
				438	/*
				439	* Rather than reconstructing the table line for the status we just
				440	* save it and regurgitate.
				441	*/
				442	unsigned nr_ctr_args;
				443	const char **ctr_args;
				444
				445	struct dm_kcopyd_client *copier;
				446	struct workqueue_struct *wq;
				447	struct work_struct deferred_bio_worker;
				448	struct work_struct migration_worker;
				449	struct delayed_work waker;
				450	struct dm_bio_prison_v2 *prison;
				451	struct bio_set *bs;
				452
				453	mempool_t *migration_pool;
				454
				455	struct dm_cache_policy *policy;
				456	unsigned policy_nr_args;
				457
				458	bool need_tick_bio:1;
				459	bool sized:1;
				460	bool invalidate:1;
				461	bool commit_requested:1;
				462	bool loaded_mappings:1;
				463	bool loaded_discards:1;
				464
				465	/*
				466	* Cache features such as write-through.
				467	*/
				468	struct cache_features features;
				469
				470	struct cache_stats stats;
				471
				472	/*
				473	* Invalidation fields.
				474	*/
				475	spinlock_t invalidation_lock;
				476	struct list_head invalidation_requests;
				477
				478	struct io_tracker tracker;
				479
				480	struct work_struct commit_ws;
				481	struct batcher committer;
				482
				483	struct rw_semaphore background_work_lock;
				484	};
				485
				486	struct per_bio_data {
				487	bool tick:1;
				488	unsigned req_nr:2;
				489	struct dm_bio_prison_cell_v2 *cell;
				490	struct dm_hook_info hook_info;
				491	sector_t len;
				492	};
				493
				494	struct dm_cache_migration {
				495	struct continuation k;
				496	struct cache *cache;
				497
				498	struct policy_work *op;
				499	struct bio *overwrite_bio;
				500	struct dm_bio_prison_cell_v2 *cell;
				501
				502	dm_cblock_t invalidate_cblock;
				503	dm_oblock_t invalidate_oblock;
				504	};
				505
				506	/*----------------------------------------------------------------*/
				507
				508	static bool writethrough_mode(struct cache *cache)
				509	{
				510	return cache->features.io_mode == CM_IO_WRITETHROUGH;
				511	}
				512
				513	static bool writeback_mode(struct cache *cache)
				514	{
				515	return cache->features.io_mode == CM_IO_WRITEBACK;
				516	}
				517
				518	static inline bool passthrough_mode(struct cache *cache)
				519	{
				520	return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
				521	}
				522
				523	/*----------------------------------------------------------------*/
				524
				525	static void wake_deferred_bio_worker(struct cache *cache)
				526	{
				527	queue_work(cache->wq, &cache->deferred_bio_worker);
				528	}
				529
				530	static void wake_migration_worker(struct cache *cache)
				531	{
				532	if (passthrough_mode(cache))
				533	return;
				534
				535	queue_work(cache->wq, &cache->migration_worker);
				536	}
				537
				538	/*----------------------------------------------------------------*/
				539
				540	static struct dm_bio_prison_cell_v2 alloc_prison_cell(struct cache cache)
				541	{
				542	return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
				543	}
				544
				545	static void free_prison_cell(struct cache cache, struct dm_bio_prison_cell_v2 cell)
				546	{
				547	dm_bio_prison_free_cell_v2(cache->prison, cell);
				548	}
				549
				550	static struct dm_cache_migration alloc_migration(struct cache cache)
				551	{
				552	struct dm_cache_migration *mg;
				553
				554	mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
				555	if (mg) {
				556	mg->cache = cache;
				557	atomic_inc(&mg->cache->nr_allocated_migrations);
				558	}
				559
				560	return mg;
				561	}
				562
				563	static void free_migration(struct dm_cache_migration *mg)
				564	{
				565	struct cache *cache = mg->cache;
				566
				567	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
				568	wake_up(&cache->migration_wait);
				569
				570	mempool_free(mg, cache->migration_pool);
				571	}
				572
				573	/*----------------------------------------------------------------*/
				574
				575	static inline dm_oblock_t oblock_succ(dm_oblock_t b)
				576	{
				577	return to_oblock(from_oblock(b) + 1ull);
				578	}
				579
				580	static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
				581	{
				582	key->virtual = 0;
				583	key->dev = 0;
				584	key->block_begin = from_oblock(begin);
				585	key->block_end = from_oblock(end);
				586	}
				587
				588	/*
				589	* We have two lock levels. Level 0, which is used to prevent WRITEs, and
				590	* level 1 which prevents both READs and WRITEs.
				591	*/
				592	#define WRITE_LOCK_LEVEL 0
				593	#define READ_WRITE_LOCK_LEVEL 1
				594
				595	static unsigned lock_level(struct bio *bio)
				596	{
				597	return bio_data_dir(bio) == WRITE ?
				598	WRITE_LOCK_LEVEL :
				599	READ_WRITE_LOCK_LEVEL;
				600	}
				601
				602	/*----------------------------------------------------------------
				603	* Per bio data
				604	*--------------------------------------------------------------*/
				605
				606	static size_t get_per_bio_data_size(struct cache *cache)
				607	{
				608	return sizeof(struct per_bio_data);
				609	}
				610
				611	static struct per_bio_data get_per_bio_data(struct bio bio, size_t data_size)
				612	{
				613	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
				614	BUG_ON(!pb);
				615	return pb;
				616	}
				617
				618	static struct per_bio_data init_per_bio_data(struct bio bio, size_t data_size)
				619	{
				620	struct per_bio_data *pb = get_per_bio_data(bio, data_size);
				621
				622	pb->tick = false;
				623	pb->req_nr = dm_bio_get_target_bio_nr(bio);
				624	pb->cell = NULL;
				625	pb->len = 0;
				626
				627	return pb;
				628	}
				629
				630	/*----------------------------------------------------------------*/
				631
				632	static void defer_bio(struct cache cache, struct bio bio)
				633	{
				634	unsigned long flags;
				635
				636	spin_lock_irqsave(&cache->lock, flags);
				637	bio_list_add(&cache->deferred_bios, bio);
				638	spin_unlock_irqrestore(&cache->lock, flags);
				639
				640	wake_deferred_bio_worker(cache);
				641	}
				642
				643	static void defer_bios(struct cache cache, struct bio_list bios)
				644	{
				645	unsigned long flags;
				646
				647	spin_lock_irqsave(&cache->lock, flags);
				648	bio_list_merge(&cache->deferred_bios, bios);
				649	bio_list_init(bios);
				650	spin_unlock_irqrestore(&cache->lock, flags);
				651
				652	wake_deferred_bio_worker(cache);
				653	}
				654
				655	/*----------------------------------------------------------------*/
				656
				657	static bool bio_detain_shared(struct cache cache, dm_oblock_t oblock, struct bio bio)
				658	{
				659	bool r;
				660	size_t pb_size;
				661	struct per_bio_data *pb;
				662	struct dm_cell_key_v2 key;
				663	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
				664	struct dm_bio_prison_cell_v2 cell_prealloc, cell;
				665
				666	cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
				667	if (!cell_prealloc) {
				668	defer_bio(cache, bio);
				669	return false;
				670	}
				671
				672	build_key(oblock, end, &key);
				673	r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
				674	if (!r) {
				675	/*
				676	* Failed to get the lock.
				677	*/
				678	free_prison_cell(cache, cell_prealloc);
				679	return r;
				680	}
				681
				682	if (cell != cell_prealloc)
				683	free_prison_cell(cache, cell_prealloc);
				684
				685	pb_size = get_per_bio_data_size(cache);
				686	pb = get_per_bio_data(bio, pb_size);
				687	pb->cell = cell;
				688
				689	return r;
				690	}
				691
				692	/*----------------------------------------------------------------*/
				693
				694	static bool is_dirty(struct cache *cache, dm_cblock_t b)
				695	{
				696	return test_bit(from_cblock(b), cache->dirty_bitset);
				697	}
				698
				699	static void set_dirty(struct cache *cache, dm_cblock_t cblock)
				700	{
				701	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
				702	atomic_inc(&cache->nr_dirty);
				703	policy_set_dirty(cache->policy, cblock);
				704	}
				705	}
				706
				707	/*
				708	* These two are called when setting after migrations to force the policy
				709	* and dirty bitset to be in sync.
				710	*/
				711	static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
				712	{
				713	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
				714	atomic_inc(&cache->nr_dirty);
				715	policy_set_dirty(cache->policy, cblock);
				716	}
				717
				718	static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
				719	{
				720	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
				721	if (atomic_dec_return(&cache->nr_dirty) == 0)
				722	dm_table_event(cache->ti->table);
				723	}
				724
				725	policy_clear_dirty(cache->policy, cblock);
				726	}
				727
				728	/*----------------------------------------------------------------*/
				729
				730	static bool block_size_is_power_of_two(struct cache *cache)
				731	{
				732	return cache->sectors_per_block_shift >= 0;
				733	}
				734
				735	/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
				736	#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
				737	__always_inline
				738	#endif
				739	static dm_block_t block_div(dm_block_t b, uint32_t n)
				740	{
				741	do_div(b, n);
				742
				743	return b;
				744	}
				745
				746	static dm_block_t oblocks_per_dblock(struct cache *cache)
				747	{
				748	dm_block_t oblocks = cache->discard_block_size;
				749
				750	if (block_size_is_power_of_two(cache))
				751	oblocks >>= cache->sectors_per_block_shift;
				752	else
				753	oblocks = block_div(oblocks, cache->sectors_per_block);
				754
				755	return oblocks;
				756	}
				757
				758	static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
				759	{
				760	return to_dblock(block_div(from_oblock(oblock),
				761	oblocks_per_dblock(cache)));
				762	}
				763
				764	static void set_discard(struct cache *cache, dm_dblock_t b)
				765	{
				766	unsigned long flags;
				767
				768	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
				769	atomic_inc(&cache->stats.discard_count);
				770
				771	spin_lock_irqsave(&cache->lock, flags);
				772	set_bit(from_dblock(b), cache->discard_bitset);
				773	spin_unlock_irqrestore(&cache->lock, flags);
				774	}
				775
				776	static void clear_discard(struct cache *cache, dm_dblock_t b)
				777	{
				778	unsigned long flags;
				779
				780	spin_lock_irqsave(&cache->lock, flags);
				781	clear_bit(from_dblock(b), cache->discard_bitset);
				782	spin_unlock_irqrestore(&cache->lock, flags);
				783	}
				784
				785	static bool is_discarded(struct cache *cache, dm_dblock_t b)
				786	{
				787	int r;
				788	unsigned long flags;
				789
				790	spin_lock_irqsave(&cache->lock, flags);
				791	r = test_bit(from_dblock(b), cache->discard_bitset);
				792	spin_unlock_irqrestore(&cache->lock, flags);
				793
				794	return r;
				795	}
				796
				797	static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
				798	{
				799	int r;
				800	unsigned long flags;
				801
				802	spin_lock_irqsave(&cache->lock, flags);
				803	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
				804	cache->discard_bitset);
				805	spin_unlock_irqrestore(&cache->lock, flags);
				806
				807	return r;
				808	}
				809
				810	/*----------------------------------------------------------------
				811	* Remapping
				812	*--------------------------------------------------------------*/
				813	static void remap_to_origin(struct cache cache, struct bio bio)
				814	{
				815	bio_set_dev(bio, cache->origin_dev->bdev);
				816	}
				817
				818	static void remap_to_cache(struct cache cache, struct bio bio,
				819	dm_cblock_t cblock)
				820	{
				821	sector_t bi_sector = bio->bi_iter.bi_sector;
				822	sector_t block = from_cblock(cblock);
				823
				824	bio_set_dev(bio, cache->cache_dev->bdev);
				825	if (!block_size_is_power_of_two(cache))
				826	bio->bi_iter.bi_sector =
				827	(block * cache->sectors_per_block) +
				828	sector_div(bi_sector, cache->sectors_per_block);
				829	else
				830	bio->bi_iter.bi_sector =
				831	(block << cache->sectors_per_block_shift) \|
				832	(bi_sector & (cache->sectors_per_block - 1));
				833	}
				834
				835	static void check_if_tick_bio_needed(struct cache cache, struct bio bio)
				836	{
				837	unsigned long flags;
				838	size_t pb_data_size = get_per_bio_data_size(cache);
				839	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
				840
				841	spin_lock_irqsave(&cache->lock, flags);
				842	if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
				843	bio_op(bio) != REQ_OP_DISCARD) {
				844	pb->tick = true;
				845	cache->need_tick_bio = false;
				846	}
				847	spin_unlock_irqrestore(&cache->lock, flags);
				848	}
				849
				850	static void __remap_to_origin_clear_discard(struct cache cache, struct bio bio,
				851	dm_oblock_t oblock, bool bio_has_pbd)
				852	{
				853	if (bio_has_pbd)
				854	check_if_tick_bio_needed(cache, bio);
				855	remap_to_origin(cache, bio);
				856	if (bio_data_dir(bio) == WRITE)
				857	clear_discard(cache, oblock_to_dblock(cache, oblock));
				858	}
				859
				860	static void remap_to_origin_clear_discard(struct cache cache, struct bio bio,
				861	dm_oblock_t oblock)
				862	{
				863	// FIXME: check_if_tick_bio_needed() is called way too much through this interface
				864	__remap_to_origin_clear_discard(cache, bio, oblock, true);
				865	}
				866
				867	static void remap_to_cache_dirty(struct cache cache, struct bio bio,
				868	dm_oblock_t oblock, dm_cblock_t cblock)
				869	{
				870	check_if_tick_bio_needed(cache, bio);
				871	remap_to_cache(cache, bio, cblock);
				872	if (bio_data_dir(bio) == WRITE) {
				873	set_dirty(cache, cblock);
				874	clear_discard(cache, oblock_to_dblock(cache, oblock));
				875	}
				876	}
				877
				878	static dm_oblock_t get_bio_block(struct cache cache, struct bio bio)
				879	{
				880	sector_t block_nr = bio->bi_iter.bi_sector;
				881
				882	if (!block_size_is_power_of_two(cache))
				883	(void) sector_div(block_nr, cache->sectors_per_block);
				884	else
				885	block_nr >>= cache->sectors_per_block_shift;
				886
				887	return to_oblock(block_nr);
				888	}
				889
				890	static bool accountable_bio(struct cache cache, struct bio bio)
				891	{
				892	return bio_op(bio) != REQ_OP_DISCARD;
				893	}
				894
				895	static void accounted_begin(struct cache cache, struct bio bio)
				896	{
				897	size_t pb_data_size = get_per_bio_data_size(cache);
				898	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
				899
				900	if (accountable_bio(cache, bio)) {
				901	pb->len = bio_sectors(bio);
				902	iot_io_begin(&cache->tracker, pb->len);
				903	}
				904	}
				905
				906	static void accounted_complete(struct cache cache, struct bio bio)
				907	{
				908	size_t pb_data_size = get_per_bio_data_size(cache);
				909	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
				910
				911	iot_io_end(&cache->tracker, pb->len);
				912	}
				913
				/* Account @bio as in flight, then submit it. */
				static void accounted_request(struct cache *cache, struct bio *bio)
				{
					accounted_begin(cache, bio);
					generic_make_request(bio);
				}
				919
				/*
				 * Issues @bio as an accounted request; @context is the owning
				 * struct cache.  The signature matches batcher::issue_op.
				 */
				static void issue_op(struct bio *bio, void *context)
				{
					struct cache *cache = context;
					accounted_request(cache, bio);
				}
				925
				926	/*
				927	* When running in writethrough mode we need to send writes to clean blocks
				928	* to both the cache and origin devices. Clone the bio and send them in parallel.
				929	*/
				930	static void remap_to_origin_and_cache(struct cache cache, struct bio bio,
				931	dm_oblock_t oblock, dm_cblock_t cblock)
				932	{
				933	struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs);
				934
				935	BUG_ON(!origin_bio);
				936
				937	bio_chain(origin_bio, bio);
				938	/*
				939	* Passing false to __remap_to_origin_clear_discard() skips
				940	* all code that might use per_bio_data (since clone doesn't have it)
				941	*/
				942	__remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
				943	submit_bio(origin_bio);
				944
				945	remap_to_cache(cache, bio, cblock);
				946	}
				947
				948	/*----------------------------------------------------------------
				949	* Failure modes
				950	*--------------------------------------------------------------*/
				951	static enum cache_metadata_mode get_cache_mode(struct cache *cache)
				952	{
				953	return cache->features.mode;
				954	}
				955
				956	static const char cache_device_name(struct cache cache)
				957	{
				958	return dm_device_name(dm_table_get_md(cache->ti->table));
				959	}
				960
				961	static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
				962	{
				963	const char *descs[] = {
				964	"write",
				965	"read-only",
				966	"fail"
				967	};
				968
				969	dm_table_event(cache->ti->table);
				970	DMINFO("%s: switching cache to %s mode",
				971	cache_device_name(cache), descs[(int)mode]);
				972	}
				973
				974	static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
				975	{
				976	bool needs_check;
				977	enum cache_metadata_mode old_mode = get_cache_mode(cache);
				978
				979	if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
				980	DMERR("%s: unable to read needs_check flag, setting failure mode.",
				981	cache_device_name(cache));
				982	new_mode = CM_FAIL;
				983	}
				984
				985	if (new_mode == CM_WRITE && needs_check) {
				986	DMERR("%s: unable to switch cache to write mode until repaired.",
				987	cache_device_name(cache));
				988	if (old_mode != new_mode)
				989	new_mode = old_mode;
				990	else
				991	new_mode = CM_READ_ONLY;
				992	}
				993
				994	/* Never move out of fail mode */
				995	if (old_mode == CM_FAIL)
				996	new_mode = CM_FAIL;
				997
				998	switch (new_mode) {
				999	case CM_FAIL:
				1000	case CM_READ_ONLY:
				1001	dm_cache_metadata_set_read_only(cache->cmd);
				1002	break;
				1003
				1004	case CM_WRITE:
				1005	dm_cache_metadata_set_read_write(cache->cmd);
				1006	break;
				1007	}
				1008
				1009	cache->features.mode = new_mode;
				1010
				1011	if (new_mode != old_mode)
				1012	notify_mode_switch(cache, new_mode);
				1013	}
				1014
				1015	static void abort_transaction(struct cache *cache)
				1016	{
				1017	const char *dev_name = cache_device_name(cache);
				1018
				1019	if (get_cache_mode(cache) >= CM_READ_ONLY)
				1020	return;
				1021
				1022	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
				1023	DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
				1024	set_cache_mode(cache, CM_FAIL);
				1025	}
				1026
				1027	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
				1028	if (dm_cache_metadata_abort(cache->cmd)) {
				1029	DMERR("%s: failed to abort metadata transaction", dev_name);
				1030	set_cache_mode(cache, CM_FAIL);
				1031	}
				1032	}
				1033
				1034	static void metadata_operation_failed(struct cache cache, const char op, int r)
				1035	{
				1036	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
				1037	cache_device_name(cache), op, r);
				1038	abort_transaction(cache);
				1039	set_cache_mode(cache, CM_READ_ONLY);
				1040	}
				1041
/*----------------------------------------------------------------*/
				1043
				1044	static void load_stats(struct cache *cache)
				1045	{
				1046	struct dm_cache_statistics stats;
				1047
				1048	dm_cache_metadata_get_stats(cache->cmd, &stats);
				1049	atomic_set(&cache->stats.read_hit, stats.read_hits);
				1050	atomic_set(&cache->stats.read_miss, stats.read_misses);
				1051	atomic_set(&cache->stats.write_hit, stats.write_hits);
				1052	atomic_set(&cache->stats.write_miss, stats.write_misses);
				1053	}
				1054
				1055	static void save_stats(struct cache *cache)
				1056	{
				1057	struct dm_cache_statistics stats;
				1058
				1059	if (get_cache_mode(cache) >= CM_READ_ONLY)
				1060	return;
				1061
				1062	stats.read_hits = atomic_read(&cache->stats.read_hit);
				1063	stats.read_misses = atomic_read(&cache->stats.read_miss);
				1064	stats.write_hits = atomic_read(&cache->stats.write_hit);
				1065	stats.write_misses = atomic_read(&cache->stats.write_miss);
				1066
				1067	dm_cache_metadata_set_stats(cache->cmd, &stats);
				1068	}
				1069
				1070	static void update_stats(struct cache_stats *stats, enum policy_operation op)
				1071	{
				1072	switch (op) {
				1073	case POLICY_PROMOTE:
				1074	atomic_inc(&stats->promotion);
				1075	break;
				1076
				1077	case POLICY_DEMOTE:
				1078	atomic_inc(&stats->demotion);
				1079	break;
				1080
				1081	case POLICY_WRITEBACK:
				1082	atomic_inc(&stats->writeback);
				1083	break;
				1084	}
				1085	}
				1086
/*----------------------------------------------------------------
 * Migration processing
 *
 * Migration covers moving data from the origin device to the cache, or
 * vice versa.
 *--------------------------------------------------------------*/
				1093
				1094	static void inc_io_migrations(struct cache *cache)
				1095	{
				1096	atomic_inc(&cache->nr_io_migrations);
				1097	}
				1098
				1099	static void dec_io_migrations(struct cache *cache)
				1100	{
				1101	atomic_dec(&cache->nr_io_migrations);
				1102	}
				1103
				1104	static bool discard_or_flush(struct bio *bio)
				1105	{
				1106	return bio_op(bio) == REQ_OP_DISCARD \|\| op_is_flush(bio->bi_opf);
				1107	}
				1108
				1109	static void calc_discard_block_range(struct cache cache, struct bio bio,
				1110	dm_dblock_t b, dm_dblock_t e)
				1111	{
				1112	sector_t sb = bio->bi_iter.bi_sector;
				1113	sector_t se = bio_end_sector(bio);
				1114
				1115	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
				1116
				1117	if (se - sb < cache->discard_block_size)
				1118	e = b;
				1119	else
				1120	*e = to_dblock(block_div(se, cache->discard_block_size));
				1121	}
				1122
/*----------------------------------------------------------------*/
				1124
				1125	static void prevent_background_work(struct cache *cache)
				1126	{
				1127	lockdep_off();
				1128	down_write(&cache->background_work_lock);
				1129	lockdep_on();
				1130	}
				1131
				1132	static void allow_background_work(struct cache *cache)
				1133	{
				1134	lockdep_off();
				1135	up_write(&cache->background_work_lock);
				1136	lockdep_on();
				1137	}
				1138
				1139	static bool background_work_begin(struct cache *cache)
				1140	{
				1141	bool r;
				1142
				1143	lockdep_off();
				1144	r = down_read_trylock(&cache->background_work_lock);
				1145	lockdep_on();
				1146
				1147	return r;
				1148	}
				1149
				1150	static void background_work_end(struct cache *cache)
				1151	{
				1152	lockdep_off();
				1153	up_read(&cache->background_work_lock);
				1154	lockdep_on();
				1155	}
				1156
/*----------------------------------------------------------------*/
				1158
				1159	static bool bio_writes_complete_block(struct cache cache, struct bio bio)
				1160	{
				1161	return (bio_data_dir(bio) == WRITE) &&
				1162	(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
				1163	}
				1164
				1165	static bool optimisable_bio(struct cache cache, struct bio bio, dm_oblock_t block)
				1166	{
				1167	return writeback_mode(cache) &&
				1168	(is_discarded_oblock(cache, block) \|\| bio_writes_complete_block(cache, bio));
				1169	}
				1170
				1171	static void quiesce(struct dm_cache_migration *mg,
				1172	void (continuation)(struct work_struct ))
				1173	{
				1174	init_continuation(&mg->k, continuation);
				1175	dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
				1176	}
				1177
				1178	static struct dm_cache_migration ws_to_mg(struct work_struct ws)
				1179	{
				1180	struct continuation *k = container_of(ws, struct continuation, ws);
				1181	return container_of(k, struct dm_cache_migration, k);
				1182	}
				1183
				1184	static void copy_complete(int read_err, unsigned long write_err, void *context)
				1185	{
				1186	struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
				1187
				1188	if (read_err \|\| write_err)
				1189	mg->k.input = BLK_STS_IOERR;
				1190
				1191	queue_continuation(mg->cache->wq, &mg->k);
				1192	}
				1193
				1194	static int copy(struct dm_cache_migration *mg, bool promote)
				1195	{
				1196	int r;
				1197	struct dm_io_region o_region, c_region;
				1198	struct cache *cache = mg->cache;
				1199
				1200	o_region.bdev = cache->origin_dev->bdev;
				1201	o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
				1202	o_region.count = cache->sectors_per_block;
				1203
				1204	c_region.bdev = cache->cache_dev->bdev;
				1205	c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
				1206	c_region.count = cache->sectors_per_block;
				1207
				1208	if (promote)
				1209	r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
				1210	else
				1211	r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
				1212
				1213	return r;
				1214	}
				1215
				1216	static void bio_drop_shared_lock(struct cache cache, struct bio bio)
				1217	{
				1218	size_t pb_data_size = get_per_bio_data_size(cache);
				1219	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
				1220
				1221	if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
				1222	free_prison_cell(cache, pb->cell);
				1223	pb->cell = NULL;
				1224	}
				1225
				1226	static void overwrite_endio(struct bio *bio)
				1227	{
				1228	struct dm_cache_migration *mg = bio->bi_private;
				1229	struct cache *cache = mg->cache;
				1230	size_t pb_data_size = get_per_bio_data_size(cache);
				1231	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
				1232
				1233	dm_unhook_bio(&pb->hook_info, bio);
				1234
				1235	if (bio->bi_status)
				1236	mg->k.input = bio->bi_status;
				1237
				1238	queue_continuation(mg->cache->wq, &mg->k);
				1239	}
				1240
				1241	static void overwrite(struct dm_cache_migration *mg,
				1242	void (continuation)(struct work_struct ))
				1243	{
				1244	struct bio *bio = mg->overwrite_bio;
				1245	size_t pb_data_size = get_per_bio_data_size(mg->cache);
				1246	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
				1247
				1248	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
				1249
				1250	/*
				1251	* The overwrite bio is part of the copy operation, as such it does
				1252	* not set/clear discard or dirty flags.
				1253	*/
				1254	if (mg->op->op == POLICY_PROMOTE)
				1255	remap_to_cache(mg->cache, bio, mg->op->cblock);
				1256	else
				1257	remap_to_origin(mg->cache, bio);
				1258
				1259	init_continuation(&mg->k, continuation);
				1260	accounted_request(mg->cache, bio);
				1261	}
				1262
				1263	/*
				1264	* Migration steps:
				1265	*
				1266	* 1) exclusive lock preventing WRITEs
				1267	* 2) quiesce
				1268	* 3) copy or issue overwrite bio
				1269	* 4) upgrade to exclusive lock preventing READs and WRITEs
				1270	* 5) quiesce
				1271	* 6) update metadata and commit
				1272	* 7) unlock
				1273	*/
				1274	static void mg_complete(struct dm_cache_migration *mg, bool success)
				1275	{
				1276	struct bio_list bios;
				1277	struct cache *cache = mg->cache;
				1278	struct policy_work *op = mg->op;
				1279	dm_cblock_t cblock = op->cblock;
				1280
				1281	if (success)
				1282	update_stats(&cache->stats, op->op);
				1283
				1284	switch (op->op) {
				1285	case POLICY_PROMOTE:
				1286	clear_discard(cache, oblock_to_dblock(cache, op->oblock));
				1287	policy_complete_background_work(cache->policy, op, success);
				1288
				1289	if (mg->overwrite_bio) {
				1290	if (success)
				1291	force_set_dirty(cache, cblock);
				1292	else if (mg->k.input)
				1293	mg->overwrite_bio->bi_status = mg->k.input;
				1294	else
				1295	mg->overwrite_bio->bi_status = BLK_STS_IOERR;
				1296	bio_endio(mg->overwrite_bio);
				1297	} else {
				1298	if (success)
				1299	force_clear_dirty(cache, cblock);
				1300	dec_io_migrations(cache);
				1301	}
				1302	break;
				1303
				1304	case POLICY_DEMOTE:
				1305	/*
				1306	* We clear dirty here to update the nr_dirty counter.
				1307	*/
				1308	if (success)
				1309	force_clear_dirty(cache, cblock);
				1310	policy_complete_background_work(cache->policy, op, success);
				1311	dec_io_migrations(cache);
				1312	break;
				1313
				1314	case POLICY_WRITEBACK:
				1315	if (success)
				1316	force_clear_dirty(cache, cblock);
				1317	policy_complete_background_work(cache->policy, op, success);
				1318	dec_io_migrations(cache);
				1319	break;
				1320	}
				1321
				1322	bio_list_init(&bios);
				1323	if (mg->cell) {
				1324	if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
				1325	free_prison_cell(cache, mg->cell);
				1326	}
				1327
				1328	free_migration(mg);
				1329	defer_bios(cache, &bios);
				1330	wake_migration_worker(cache);
				1331
				1332	background_work_end(cache);
				1333	}
				1334
				1335	static void mg_success(struct work_struct *ws)
				1336	{
				1337	struct dm_cache_migration *mg = ws_to_mg(ws);
				1338	mg_complete(mg, mg->k.input == 0);
				1339	}
				1340
				1341	static void mg_update_metadata(struct work_struct *ws)
				1342	{
				1343	int r;
				1344	struct dm_cache_migration *mg = ws_to_mg(ws);
				1345	struct cache *cache = mg->cache;
				1346	struct policy_work *op = mg->op;
				1347
				1348	switch (op->op) {
				1349	case POLICY_PROMOTE:
				1350	r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
				1351	if (r) {
				1352	DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
				1353	cache_device_name(cache));
				1354	metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
				1355
				1356	mg_complete(mg, false);
				1357	return;
				1358	}
				1359	mg_complete(mg, true);
				1360	break;
				1361
				1362	case POLICY_DEMOTE:
				1363	r = dm_cache_remove_mapping(cache->cmd, op->cblock);
				1364	if (r) {
				1365	DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
				1366	cache_device_name(cache));
				1367	metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
				1368
				1369	mg_complete(mg, false);
				1370	return;
				1371	}
				1372
				1373	/*
				1374	* It would be nice if we only had to commit when a REQ_FLUSH
				1375	* comes through. But there's one scenario that we have to
				1376	* look out for:
				1377	*
				1378	* - vblock x in a cache block
				1379	* - domotion occurs
				1380	* - cache block gets reallocated and over written
				1381	* - crash
				1382	*
				1383	* When we recover, because there was no commit the cache will
				1384	* rollback to having the data for vblock x in the cache block.
				1385	* But the cache block has since been overwritten, so it'll end
				1386	* up pointing to data that was never in 'x' during the history
				1387	* of the device.
				1388	*
				1389	* To avoid this issue we require a commit as part of the
				1390	* demotion operation.
				1391	*/
				1392	init_continuation(&mg->k, mg_success);
				1393	continue_after_commit(&cache->committer, &mg->k);
				1394	schedule_commit(&cache->committer);
				1395	break;
				1396
				1397	case POLICY_WRITEBACK:
				1398	mg_complete(mg, true);
				1399	break;
				1400	}
				1401	}
				1402
				1403	static void mg_update_metadata_after_copy(struct work_struct *ws)
				1404	{
				1405	struct dm_cache_migration *mg = ws_to_mg(ws);
				1406
				1407	/*
				1408	* Did the copy succeed?
				1409	*/
				1410	if (mg->k.input)
				1411	mg_complete(mg, false);
				1412	else
				1413	mg_update_metadata(ws);
				1414	}
				1415
				1416	static void mg_upgrade_lock(struct work_struct *ws)
				1417	{
				1418	int r;
				1419	struct dm_cache_migration *mg = ws_to_mg(ws);
				1420
				1421	/*
				1422	* Did the copy succeed?
				1423	*/
				1424	if (mg->k.input)
				1425	mg_complete(mg, false);
				1426
				1427	else {
				1428	/*
				1429	* Now we want the lock to prevent both reads and writes.
				1430	*/
				1431	r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
				1432	READ_WRITE_LOCK_LEVEL);
				1433	if (r < 0)
				1434	mg_complete(mg, false);
				1435
				1436	else if (r)
				1437	quiesce(mg, mg_update_metadata);
				1438
				1439	else
				1440	mg_update_metadata(ws);
				1441	}
				1442	}
				1443
				1444	static void mg_full_copy(struct work_struct *ws)
				1445	{
				1446	struct dm_cache_migration *mg = ws_to_mg(ws);
				1447	struct cache *cache = mg->cache;
				1448	struct policy_work *op = mg->op;
				1449	bool is_policy_promote = (op->op == POLICY_PROMOTE);
				1450
				1451	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) \|\|
				1452	is_discarded_oblock(cache, op->oblock)) {
				1453	mg_upgrade_lock(ws);
				1454	return;
				1455	}
				1456
				1457	init_continuation(&mg->k, mg_upgrade_lock);
				1458
				1459	if (copy(mg, is_policy_promote)) {
				1460	DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
				1461	mg->k.input = BLK_STS_IOERR;
				1462	mg_complete(mg, false);
				1463	}
				1464	}
				1465
				1466	static void mg_copy(struct work_struct *ws)
				1467	{
				1468	struct dm_cache_migration *mg = ws_to_mg(ws);
				1469
				1470	if (mg->overwrite_bio) {
				1471	/*
				1472	* No exclusive lock was held when we last checked if the bio
				1473	* was optimisable. So we have to check again in case things
				1474	* have changed (eg, the block may no longer be discarded).
				1475	*/
				1476	if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
				1477	/*
				1478	* Fallback to a real full copy after doing some tidying up.
				1479	*/
				1480	bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
				1481	BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */
				1482	mg->overwrite_bio = NULL;
				1483	inc_io_migrations(mg->cache);
				1484	mg_full_copy(ws);
				1485	return;
				1486	}
				1487
				1488	/*
				1489	* It's safe to do this here, even though it's new data
				1490	* because all IO has been locked out of the block.
				1491	*
				1492	* mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
				1493	* so _not_ using mg_upgrade_lock() as continutation.
				1494	*/
				1495	overwrite(mg, mg_update_metadata_after_copy);
				1496
				1497	} else
				1498	mg_full_copy(ws);
				1499	}
				1500
				1501	static int mg_lock_writes(struct dm_cache_migration *mg)
				1502	{
				1503	int r;
				1504	struct dm_cell_key_v2 key;
				1505	struct cache *cache = mg->cache;
				1506	struct dm_bio_prison_cell_v2 *prealloc;
				1507
				1508	prealloc = alloc_prison_cell(cache);
				1509	if (!prealloc) {
				1510	DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
				1511	mg_complete(mg, false);
				1512	return -ENOMEM;
				1513	}
				1514
				1515	/*
				1516	* Prevent writes to the block, but allow reads to continue.
				1517	* Unless we're using an overwrite bio, in which case we lock
				1518	* everything.
				1519	*/
				1520	build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
				1521	r = dm_cell_lock_v2(cache->prison, &key,
				1522	mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
				1523	prealloc, &mg->cell);
				1524	if (r < 0) {
				1525	free_prison_cell(cache, prealloc);
				1526	mg_complete(mg, false);
				1527	return r;
				1528	}
				1529
				1530	if (mg->cell != prealloc)
				1531	free_prison_cell(cache, prealloc);
				1532
				1533	if (r == 0)
				1534	mg_copy(&mg->k.ws);
				1535	else
				1536	quiesce(mg, mg_copy);
				1537
				1538	return 0;
				1539	}
				1540
				1541	static int mg_start(struct cache cache, struct policy_work op, struct bio *bio)
				1542	{
				1543	struct dm_cache_migration *mg;
				1544
				1545	if (!background_work_begin(cache)) {
				1546	policy_complete_background_work(cache->policy, op, false);
				1547	return -EPERM;
				1548	}
				1549
				1550	mg = alloc_migration(cache);
				1551	if (!mg) {
				1552	policy_complete_background_work(cache->policy, op, false);
				1553	background_work_end(cache);
				1554	return -ENOMEM;
				1555	}
				1556
				1557	memset(mg, 0, sizeof(*mg));
				1558
				1559	mg->cache = cache;
				1560	mg->op = op;
				1561	mg->overwrite_bio = bio;
				1562
				1563	if (!bio)
				1564	inc_io_migrations(cache);
				1565
				1566	return mg_lock_writes(mg);
				1567	}
				1568
/*----------------------------------------------------------------
 * invalidation processing
 *--------------------------------------------------------------*/
				1572
				1573	static void invalidate_complete(struct dm_cache_migration *mg, bool success)
				1574	{
				1575	struct bio_list bios;
				1576	struct cache *cache = mg->cache;
				1577
				1578	bio_list_init(&bios);
				1579	if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
				1580	free_prison_cell(cache, mg->cell);
				1581
				1582	if (!success && mg->overwrite_bio)
				1583	bio_io_error(mg->overwrite_bio);
				1584
				1585	free_migration(mg);
				1586	defer_bios(cache, &bios);
				1587
				1588	background_work_end(cache);
				1589	}
				1590
				1591	static void invalidate_completed(struct work_struct *ws)
				1592	{
				1593	struct dm_cache_migration *mg = ws_to_mg(ws);
				1594	invalidate_complete(mg, !mg->k.input);
				1595	}
				1596
				1597	static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
				1598	{
				1599	int r = policy_invalidate_mapping(cache->policy, cblock);
				1600	if (!r) {
				1601	r = dm_cache_remove_mapping(cache->cmd, cblock);
				1602	if (r) {
				1603	DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
				1604	cache_device_name(cache));
				1605	metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
				1606	}
				1607
				1608	} else if (r == -ENODATA) {
				1609	/*
				1610	* Harmless, already unmapped.
				1611	*/
				1612	r = 0;
				1613
				1614	} else
				1615	DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
				1616
				1617	return r;
				1618	}
				1619
				1620	static void invalidate_remove(struct work_struct *ws)
				1621	{
				1622	int r;
				1623	struct dm_cache_migration *mg = ws_to_mg(ws);
				1624	struct cache *cache = mg->cache;
				1625
				1626	r = invalidate_cblock(cache, mg->invalidate_cblock);
				1627	if (r) {
				1628	invalidate_complete(mg, false);
				1629	return;
				1630	}
				1631
				1632	init_continuation(&mg->k, invalidate_completed);
				1633	continue_after_commit(&cache->committer, &mg->k);
				1634	remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
				1635	mg->overwrite_bio = NULL;
				1636	schedule_commit(&cache->committer);
				1637	}
				1638
				1639	static int invalidate_lock(struct dm_cache_migration *mg)
				1640	{
				1641	int r;
				1642	struct dm_cell_key_v2 key;
				1643	struct cache *cache = mg->cache;
				1644	struct dm_bio_prison_cell_v2 *prealloc;
				1645
				1646	prealloc = alloc_prison_cell(cache);
				1647	if (!prealloc) {
				1648	invalidate_complete(mg, false);
				1649	return -ENOMEM;
				1650	}
				1651
				1652	build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
				1653	r = dm_cell_lock_v2(cache->prison, &key,
				1654	READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
				1655	if (r < 0) {
				1656	free_prison_cell(cache, prealloc);
				1657	invalidate_complete(mg, false);
				1658	return r;
				1659	}
				1660
				1661	if (mg->cell != prealloc)
				1662	free_prison_cell(cache, prealloc);
				1663
				1664	if (r)
				1665	quiesce(mg, invalidate_remove);
				1666
				1667	else {
				1668	/*
				1669	* We can't call invalidate_remove() directly here because we
				1670	* might still be in request context.
				1671	*/
				1672	init_continuation(&mg->k, invalidate_remove);
				1673	queue_work(cache->wq, &mg->k.ws);
				1674	}
				1675
				1676	return 0;
				1677	}
				1678
				1679	static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
				1680	dm_oblock_t oblock, struct bio *bio)
				1681	{
				1682	struct dm_cache_migration *mg;
				1683
				1684	if (!background_work_begin(cache))
				1685	return -EPERM;
				1686
				1687	mg = alloc_migration(cache);
				1688	if (!mg) {
				1689	background_work_end(cache);
				1690	return -ENOMEM;
				1691	}
				1692
				1693	memset(mg, 0, sizeof(*mg));
				1694
				1695	mg->cache = cache;
				1696	mg->overwrite_bio = bio;
				1697	mg->invalidate_cblock = cblock;
				1698	mg->invalidate_oblock = oblock;
				1699
				1700	return invalidate_lock(mg);
				1701	}
				1702
/*----------------------------------------------------------------
 * bio processing
 *--------------------------------------------------------------*/
				1706
				1707	enum busy {
				1708	IDLE,
				1709	BUSY
				1710	};
				1711
				1712	static enum busy spare_migration_bandwidth(struct cache *cache)
				1713	{
				1714	bool idle = iot_idle_for(&cache->tracker, HZ);
				1715	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
				1716	cache->sectors_per_block;
				1717
				1718	if (idle && current_volume <= cache->migration_threshold)
				1719	return IDLE;
				1720	else
				1721	return BUSY;
				1722	}
				1723
				1724	static void inc_hit_counter(struct cache cache, struct bio bio)
				1725	{
				1726	atomic_inc(bio_data_dir(bio) == READ ?
				1727	&cache->stats.read_hit : &cache->stats.write_hit);
				1728	}
				1729
				1730	static void inc_miss_counter(struct cache cache, struct bio bio)
				1731	{
				1732	atomic_inc(bio_data_dir(bio) == READ ?
				1733	&cache->stats.read_miss : &cache->stats.write_miss);
				1734	}
				1735
/*----------------------------------------------------------------*/
				1737
				1738	static int map_bio(struct cache cache, struct bio bio, dm_oblock_t block,
				1739	bool *commit_needed)
				1740	{
				1741	int r, data_dir;
				1742	bool rb, background_queued;
				1743	dm_cblock_t cblock;
				1744	size_t pb_data_size = get_per_bio_data_size(cache);
				1745	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
				1746
				1747	*commit_needed = false;
				1748
				1749	rb = bio_detain_shared(cache, block, bio);
				1750	if (!rb) {
				1751	/*
				1752	* An exclusive lock is held for this block, so we have to
				1753	* wait. We set the commit_needed flag so the current
				1754	* transaction will be committed asap, allowing this lock
				1755	* to be dropped.
				1756	*/
				1757	*commit_needed = true;
				1758	return DM_MAPIO_SUBMITTED;
				1759	}
				1760
				1761	data_dir = bio_data_dir(bio);
				1762
				1763	if (optimisable_bio(cache, bio, block)) {
				1764	struct policy_work *op = NULL;
				1765
				1766	r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
				1767	if (unlikely(r && r != -ENOENT)) {
				1768	DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
				1769	cache_device_name(cache), r);
				1770	bio_io_error(bio);
				1771	return DM_MAPIO_SUBMITTED;
				1772	}
				1773
				1774	if (r == -ENOENT && op) {
				1775	bio_drop_shared_lock(cache, bio);
				1776	BUG_ON(op->op != POLICY_PROMOTE);
				1777	mg_start(cache, op, bio);
				1778	return DM_MAPIO_SUBMITTED;
				1779	}
				1780	} else {
				1781	r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
				1782	if (unlikely(r && r != -ENOENT)) {
				1783	DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
				1784	cache_device_name(cache), r);
				1785	bio_io_error(bio);
				1786	return DM_MAPIO_SUBMITTED;
				1787	}
				1788
				1789	if (background_queued)
				1790	wake_migration_worker(cache);
				1791	}
				1792
				1793	if (r == -ENOENT) {
				1794	/*
				1795	* Miss.
				1796	*/
				1797	inc_miss_counter(cache, bio);
				1798	if (pb->req_nr == 0) {
				1799	accounted_begin(cache, bio);
				1800	remap_to_origin_clear_discard(cache, bio, block);
				1801
				1802	} else {
				1803	/*
				1804	* This is a duplicate writethrough io that is no
				1805	* longer needed because the block has been demoted.
				1806	*/
				1807	bio_endio(bio);
				1808	return DM_MAPIO_SUBMITTED;
				1809	}
				1810	} else {
				1811	/*
				1812	* Hit.
				1813	*/
				1814	inc_hit_counter(cache, bio);
				1815
				1816	/*
				1817	* Passthrough always maps to the origin, invalidating any
				1818	* cache blocks that are written to.
				1819	*/
				1820	if (passthrough_mode(cache)) {
				1821	if (bio_data_dir(bio) == WRITE) {
				1822	bio_drop_shared_lock(cache, bio);
				1823	atomic_inc(&cache->stats.demotion);
				1824	invalidate_start(cache, cblock, block, bio);
				1825	} else
				1826	remap_to_origin_clear_discard(cache, bio, block);
				1827
				1828	} else {
				1829	if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
				1830	!is_dirty(cache, cblock)) {
				1831	remap_to_origin_and_cache(cache, bio, block, cblock);
				1832	accounted_begin(cache, bio);
				1833	} else
				1834	remap_to_cache_dirty(cache, bio, block, cblock);
				1835	}
				1836	}
				1837
				1838	/*
				1839	* dm core turns FUA requests into a separate payload and FLUSH req.
				1840	*/
				1841	if (bio->bi_opf & REQ_FUA) {
				1842	/*
				1843	* issue_after_commit will call accounted_begin a second time. So
				1844	* we call accounted_complete() to avoid double accounting.
				1845	*/
				1846	accounted_complete(cache, bio);
				1847	issue_after_commit(&cache->committer, bio);
				1848	*commit_needed = true;
				1849	return DM_MAPIO_SUBMITTED;
				1850	}
				1851
				1852	return DM_MAPIO_REMAPPED;
				1853	}
				1854
				1855	static bool process_bio(struct cache cache, struct bio bio)
				1856	{
				1857	bool commit_needed;
				1858
				1859	if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
				1860	generic_make_request(bio);
				1861
				1862	return commit_needed;
				1863	}
				1864
				1865	/*
				1866	* A non-zero return indicates read_only or fail_io mode.
				1867	*/
				1868	static int commit(struct cache *cache, bool clean_shutdown)
				1869	{
				1870	int r;
				1871
				1872	if (get_cache_mode(cache) >= CM_READ_ONLY)
				1873	return -EINVAL;
				1874
				1875	atomic_inc(&cache->stats.commit_count);
				1876	r = dm_cache_commit(cache->cmd, clean_shutdown);
				1877	if (r)
				1878	metadata_operation_failed(cache, "dm_cache_commit", r);
				1879
				1880	return r;
				1881	}
				1882
				1883	/*
				1884	* Used by the batcher.
				1885	*/
				1886	static blk_status_t commit_op(void *context)
				1887	{
				1888	struct cache *cache = context;
				1889
				1890	if (dm_cache_changed_this_transaction(cache->cmd))
				1891	return errno_to_blk_status(commit(cache, false));
				1892
				1893	return 0;
				1894	}
				1895
/*----------------------------------------------------------------*/
				1897
				1898	static bool process_flush_bio(struct cache cache, struct bio bio)
				1899	{
				1900	size_t pb_data_size = get_per_bio_data_size(cache);
				1901	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
				1902
				1903	if (!pb->req_nr)
				1904	remap_to_origin(cache, bio);
				1905	else
				1906	remap_to_cache(cache, bio, 0);
				1907
				1908	issue_after_commit(&cache->committer, bio);
				1909	return true;
				1910	}
				1911
				1912	static bool process_discard_bio(struct cache cache, struct bio bio)
				1913	{
				1914	dm_dblock_t b, e;
				1915
				1916	// FIXME: do we need to lock the region? Or can we just assume the
				1917	// user wont be so foolish as to issue discard concurrently with
				1918	// other IO?
				1919	calc_discard_block_range(cache, bio, &b, &e);
				1920	while (b != e) {
				1921	set_discard(cache, b);
				1922	b = to_dblock(from_dblock(b) + 1);
				1923	}
				1924
				1925	bio_endio(bio);
				1926
				1927	return false;
				1928	}
				1929
				1930	static void process_deferred_bios(struct work_struct *ws)
				1931	{
				1932	struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
				1933
				1934	unsigned long flags;
				1935	bool commit_needed = false;
				1936	struct bio_list bios;
				1937	struct bio *bio;
				1938
				1939	bio_list_init(&bios);
				1940
				1941	spin_lock_irqsave(&cache->lock, flags);
				1942	bio_list_merge(&bios, &cache->deferred_bios);
				1943	bio_list_init(&cache->deferred_bios);
				1944	spin_unlock_irqrestore(&cache->lock, flags);
				1945
				1946	while ((bio = bio_list_pop(&bios))) {
				1947	if (bio->bi_opf & REQ_PREFLUSH)
				1948	commit_needed = process_flush_bio(cache, bio) \|\| commit_needed;
				1949
				1950	else if (bio_op(bio) == REQ_OP_DISCARD)
				1951	commit_needed = process_discard_bio(cache, bio) \|\| commit_needed;
				1952
				1953	else
				1954	commit_needed = process_bio(cache, bio) \|\| commit_needed;
				1955	}
				1956
				1957	if (commit_needed)
				1958	schedule_commit(&cache->committer);
				1959	}
				1960
/*----------------------------------------------------------------
 * Main worker loop
 *--------------------------------------------------------------*/
				1964
				1965	static void requeue_deferred_bios(struct cache *cache)
				1966	{
				1967	struct bio *bio;
				1968	struct bio_list bios;
				1969
				1970	bio_list_init(&bios);
				1971	bio_list_merge(&bios, &cache->deferred_bios);
				1972	bio_list_init(&cache->deferred_bios);
				1973
				1974	while ((bio = bio_list_pop(&bios))) {
				1975	bio->bi_status = BLK_STS_DM_REQUEUE;
				1976	bio_endio(bio);
				1977	}
				1978	}
				1979
				1980	/*
				1981	* We want to commit periodically so that not too much
				1982	* unwritten metadata builds up.
				1983	*/
				1984	static void do_waker(struct work_struct *ws)
				1985	{
				1986	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
				1987
				1988	policy_tick(cache->policy, true);
				1989	wake_migration_worker(cache);
				1990	schedule_commit(&cache->committer);
				1991	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
				1992	}
				1993
				1994	static void check_migrations(struct work_struct *ws)
				1995	{
				1996	int r;
				1997	struct policy_work *op;
				1998	struct cache *cache = container_of(ws, struct cache, migration_worker);
				1999	enum busy b;
				2000
				2001	for (;;) {
				2002	b = spare_migration_bandwidth(cache);
				2003
				2004	r = policy_get_background_work(cache->policy, b == IDLE, &op);
				2005	if (r == -ENODATA)
				2006	break;
				2007
				2008	if (r) {
				2009	DMERR_LIMIT("%s: policy_background_work failed",
				2010	cache_device_name(cache));
				2011	break;
				2012	}
				2013
				2014	r = mg_start(cache, op, NULL);
				2015	if (r)
				2016	break;
				2017	}
				2018	}
				2019
/*----------------------------------------------------------------
 * Target methods
 *--------------------------------------------------------------*/
				2023
				2024	/*
				2025	* This function gets called on the error paths of the constructor, so we
				2026	* have to cope with a partially initialised struct.
				2027	*/
				2028	static void destroy(struct cache *cache)
				2029	{
				2030	unsigned i;
				2031
				2032	mempool_destroy(cache->migration_pool);
				2033
				2034	if (cache->prison)
				2035	dm_bio_prison_destroy_v2(cache->prison);
				2036
				2037	if (cache->wq)
				2038	destroy_workqueue(cache->wq);
				2039
				2040	if (cache->dirty_bitset)
				2041	free_bitset(cache->dirty_bitset);
				2042
				2043	if (cache->discard_bitset)
				2044	free_bitset(cache->discard_bitset);
				2045
				2046	if (cache->copier)
				2047	dm_kcopyd_client_destroy(cache->copier);
				2048
				2049	if (cache->cmd)
				2050	dm_cache_metadata_close(cache->cmd);
				2051
				2052	if (cache->metadata_dev)
				2053	dm_put_device(cache->ti, cache->metadata_dev);
				2054
				2055	if (cache->origin_dev)
				2056	dm_put_device(cache->ti, cache->origin_dev);
				2057
				2058	if (cache->cache_dev)
				2059	dm_put_device(cache->ti, cache->cache_dev);
				2060
				2061	if (cache->policy)
				2062	dm_cache_policy_destroy(cache->policy);
				2063
				2064	for (i = 0; i < cache->nr_ctr_args ; i++)
				2065	kfree(cache->ctr_args[i]);
				2066	kfree(cache->ctr_args);
				2067
				2068	if (cache->bs)
				2069	bioset_free(cache->bs);
				2070
				2071	kfree(cache);
				2072	}
				2073
				2074	static void cache_dtr(struct dm_target *ti)
				2075	{
				2076	struct cache *cache = ti->private;
				2077
				2078	destroy(cache);
				2079	}
				2080
				2081	static sector_t get_dev_size(struct dm_dev *dev)
				2082	{
				2083	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
				2084	}
				2085
/*----------------------------------------------------------------*/
				2087
				2088	/*
				2089	* Construct a cache device mapping.
				2090	*
				2091	* cache <metadata dev> <cache dev> <origin dev> <block size>
				2092	* <#feature args> [<feature arg>]*
				2093	* <policy> <#policy args> [<policy arg>]*
				2094	*
				2095	* metadata dev : fast device holding the persistent metadata
				2096	* cache dev : fast device holding cached data blocks
				2097	* origin dev : slow device holding original data blocks
				2098	* block size : cache unit size in sectors
				2099	*
				2100	* #feature args : number of feature arguments passed
				2101	* feature args : writethrough. (The default is writeback.)
				2102	*
				2103	* policy : the replacement policy to use
				2104	* #policy args : an even number of policy arguments corresponding
				2105	* to key/value pairs passed to the policy
				2106	* policy args : key/value pairs passed to the policy
				2107	* E.g. 'sequential_threshold 1024'
				2108	* See cache-policies.txt for details.
				2109	*
				2110	* Optional feature arguments are:
				2111	* writethrough : write through caching that prohibits cache block
				2112	* content from being different from origin block content.
				2113	* Without this argument, the default behaviour is to write
				2114	* back cache block contents later for performance reasons,
				2115	* so they may differ from the corresponding origin blocks.
				2116	*/
				2117	struct cache_args {
				2118	struct dm_target *ti;
				2119
				2120	struct dm_dev *metadata_dev;
				2121
				2122	struct dm_dev *cache_dev;
				2123	sector_t cache_sectors;
				2124
				2125	struct dm_dev *origin_dev;
				2126	sector_t origin_sectors;
				2127
				2128	uint32_t block_size;
				2129
				2130	const char *policy_name;
				2131	int policy_argc;
				2132	const char **policy_argv;
				2133
				2134	struct cache_features features;
				2135	};
				2136
				2137	static void destroy_cache_args(struct cache_args *ca)
				2138	{
				2139	if (ca->metadata_dev)
				2140	dm_put_device(ca->ti, ca->metadata_dev);
				2141
				2142	if (ca->cache_dev)
				2143	dm_put_device(ca->ti, ca->cache_dev);
				2144
				2145	if (ca->origin_dev)
				2146	dm_put_device(ca->ti, ca->origin_dev);
				2147
				2148	kfree(ca);
				2149	}
				2150
				2151	static bool at_least_one_arg(struct dm_arg_set as, char *error)
				2152	{
				2153	if (!as->argc) {
				2154	*error = "Insufficient args";
				2155	return false;
				2156	}
				2157
				2158	return true;
				2159	}
				2160
				2161	static int parse_metadata_dev(struct cache_args ca, struct dm_arg_set as,
				2162	char **error)
				2163	{
				2164	int r;
				2165	sector_t metadata_dev_size;
				2166	char b[BDEVNAME_SIZE];
				2167
				2168	if (!at_least_one_arg(as, error))
				2169	return -EINVAL;
				2170
				2171	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				2172	&ca->metadata_dev);
				2173	if (r) {
				2174	*error = "Error opening metadata device";
				2175	return r;
				2176	}
				2177
				2178	metadata_dev_size = get_dev_size(ca->metadata_dev);
				2179	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
				2180	DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
				2181	bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
				2182
				2183	return 0;
				2184	}
				2185
				2186	static int parse_cache_dev(struct cache_args ca, struct dm_arg_set as,
				2187	char **error)
				2188	{
				2189	int r;
				2190
				2191	if (!at_least_one_arg(as, error))
				2192	return -EINVAL;
				2193
				2194	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				2195	&ca->cache_dev);
				2196	if (r) {
				2197	*error = "Error opening cache device";
				2198	return r;
				2199	}
				2200	ca->cache_sectors = get_dev_size(ca->cache_dev);
				2201
				2202	return 0;
				2203	}
				2204
				2205	static int parse_origin_dev(struct cache_args ca, struct dm_arg_set as,
				2206	char **error)
				2207	{
				2208	int r;
				2209
				2210	if (!at_least_one_arg(as, error))
				2211	return -EINVAL;
				2212
				2213	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				2214	&ca->origin_dev);
				2215	if (r) {
				2216	*error = "Error opening origin device";
				2217	return r;
				2218	}
				2219
				2220	ca->origin_sectors = get_dev_size(ca->origin_dev);
				2221	if (ca->ti->len > ca->origin_sectors) {
				2222	*error = "Device size larger than cached device";
				2223	return -EINVAL;
				2224	}
				2225
				2226	return 0;
				2227	}
				2228
				2229	static int parse_block_size(struct cache_args ca, struct dm_arg_set as,
				2230	char **error)
				2231	{
				2232	unsigned long block_size;
				2233
				2234	if (!at_least_one_arg(as, error))
				2235	return -EINVAL;
				2236
				2237	if (kstrtoul(dm_shift_arg(as), 10, &block_size) \|\| !block_size \|\|
				2238	block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
				2239	block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS \|\|
				2240	block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
				2241	*error = "Invalid data block size";
				2242	return -EINVAL;
				2243	}
				2244
				2245	if (block_size > ca->cache_sectors) {
				2246	*error = "Data block size is larger than the cache device";
				2247	return -EINVAL;
				2248	}
				2249
				2250	ca->block_size = block_size;
				2251
				2252	return 0;
				2253	}
				2254
				2255	static void init_features(struct cache_features *cf)
				2256	{
				2257	cf->mode = CM_WRITE;
				2258	cf->io_mode = CM_IO_WRITEBACK;
				2259	cf->metadata_version = 1;
				2260	}
				2261
				2262	static int parse_features(struct cache_args ca, struct dm_arg_set as,
				2263	char **error)
				2264	{
				2265	static const struct dm_arg _args[] = {
				2266	{0, 2, "Invalid number of cache feature arguments"},
				2267	};
				2268
				2269	int r, mode_ctr = 0;
				2270	unsigned argc;
				2271	const char *arg;
				2272	struct cache_features *cf = &ca->features;
				2273
				2274	init_features(cf);
				2275
				2276	r = dm_read_arg_group(_args, as, &argc, error);
				2277	if (r)
				2278	return -EINVAL;
				2279
				2280	while (argc--) {
				2281	arg = dm_shift_arg(as);
				2282
				2283	if (!strcasecmp(arg, "writeback")) {
				2284	cf->io_mode = CM_IO_WRITEBACK;
				2285	mode_ctr++;
				2286	}
				2287
				2288	else if (!strcasecmp(arg, "writethrough")) {
				2289	cf->io_mode = CM_IO_WRITETHROUGH;
				2290	mode_ctr++;
				2291	}
				2292
				2293	else if (!strcasecmp(arg, "passthrough")) {
				2294	cf->io_mode = CM_IO_PASSTHROUGH;
				2295	mode_ctr++;
				2296	}
				2297
				2298	else if (!strcasecmp(arg, "metadata2"))
				2299	cf->metadata_version = 2;
				2300
				2301	else {
				2302	*error = "Unrecognised cache feature requested";
				2303	return -EINVAL;
				2304	}
				2305	}
				2306
				2307	if (mode_ctr > 1) {
				2308	*error = "Duplicate cache io_mode features requested";
				2309	return -EINVAL;
				2310	}
				2311
				2312	return 0;
				2313	}
				2314
				2315	static int parse_policy(struct cache_args ca, struct dm_arg_set as,
				2316	char **error)
				2317	{
				2318	static const struct dm_arg _args[] = {
				2319	{0, 1024, "Invalid number of policy arguments"},
				2320	};
				2321
				2322	int r;
				2323
				2324	if (!at_least_one_arg(as, error))
				2325	return -EINVAL;
				2326
				2327	ca->policy_name = dm_shift_arg(as);
				2328
				2329	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
				2330	if (r)
				2331	return -EINVAL;
				2332
				2333	ca->policy_argv = (const char **)as->argv;
				2334	dm_consume_args(as, ca->policy_argc);
				2335
				2336	return 0;
				2337	}
				2338
				2339	static int parse_cache_args(struct cache_args ca, int argc, char *argv,
				2340	char **error)
				2341	{
				2342	int r;
				2343	struct dm_arg_set as;
				2344
				2345	as.argc = argc;
				2346	as.argv = argv;
				2347
				2348	r = parse_metadata_dev(ca, &as, error);
				2349	if (r)
				2350	return r;
				2351
				2352	r = parse_cache_dev(ca, &as, error);
				2353	if (r)
				2354	return r;
				2355
				2356	r = parse_origin_dev(ca, &as, error);
				2357	if (r)
				2358	return r;
				2359
				2360	r = parse_block_size(ca, &as, error);
				2361	if (r)
				2362	return r;
				2363
				2364	r = parse_features(ca, &as, error);
				2365	if (r)
				2366	return r;
				2367
				2368	r = parse_policy(ca, &as, error);
				2369	if (r)
				2370	return r;
				2371
				2372	return 0;
				2373	}
				2374
/*----------------------------------------------------------------*/
				2376
				2377	static struct kmem_cache *migration_cache;
				2378
				2379	#define NOT_CORE_OPTION 1
				2380
				2381	static int process_config_option(struct cache cache, const char key, const char *value)
				2382	{
				2383	unsigned long tmp;
				2384
				2385	if (!strcasecmp(key, "migration_threshold")) {
				2386	if (kstrtoul(value, 10, &tmp))
				2387	return -EINVAL;
				2388
				2389	cache->migration_threshold = tmp;
				2390	return 0;
				2391	}
				2392
				2393	return NOT_CORE_OPTION;
				2394	}
				2395
				2396	static int set_config_value(struct cache cache, const char key, const char *value)
				2397	{
				2398	int r = process_config_option(cache, key, value);
				2399
				2400	if (r == NOT_CORE_OPTION)
				2401	r = policy_set_config_value(cache->policy, key, value);
				2402
				2403	if (r)
				2404	DMWARN("bad config value for %s: %s", key, value);
				2405
				2406	return r;
				2407	}
				2408
				2409	static int set_config_values(struct cache cache, int argc, const char *argv)
				2410	{
				2411	int r = 0;
				2412
				2413	if (argc & 1) {
				2414	DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
				2415	return -EINVAL;
				2416	}
				2417
				2418	while (argc) {
				2419	r = set_config_value(cache, argv[0], argv[1]);
				2420	if (r)
				2421	break;
				2422
				2423	argc -= 2;
				2424	argv += 2;
				2425	}
				2426
				2427	return r;
				2428	}
				2429
				2430	static int create_cache_policy(struct cache cache, struct cache_args ca,
				2431	char **error)
				2432	{
				2433	struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
				2434	cache->cache_size,
				2435	cache->origin_sectors,
				2436	cache->sectors_per_block);
				2437	if (IS_ERR(p)) {
				2438	*error = "Error creating cache's policy";
				2439	return PTR_ERR(p);
				2440	}
				2441	cache->policy = p;
				2442	BUG_ON(!cache->policy);
				2443
				2444	return 0;
				2445	}
				2446
				2447	/*
				2448	* We want the discard block size to be at least the size of the cache
				2449	* block size and have no more than 2^14 discard blocks across the origin.
				2450	*/
				2451	#define MAX_DISCARD_BLOCKS (1 << 14)
				2452
				2453	static bool too_many_discard_blocks(sector_t discard_block_size,
				2454	sector_t origin_size)
				2455	{
				2456	(void) sector_div(origin_size, discard_block_size);
				2457
				2458	return origin_size > MAX_DISCARD_BLOCKS;
				2459	}
				2460
				2461	static sector_t calculate_discard_block_size(sector_t cache_block_size,
				2462	sector_t origin_size)
				2463	{
				2464	sector_t discard_block_size = cache_block_size;
				2465
				2466	if (origin_size)
				2467	while (too_many_discard_blocks(discard_block_size, origin_size))
				2468	discard_block_size *= 2;
				2469
				2470	return discard_block_size;
				2471	}
				2472
				2473	static void set_cache_size(struct cache *cache, dm_cblock_t size)
				2474	{
				2475	dm_block_t nr_blocks = from_cblock(size);
				2476
				2477	if (nr_blocks > (1 << 20) && cache->cache_size != size)
				2478	DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
				2479	"All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
				2480	"Please consider increasing the cache block size to reduce the overall cache block count.",
				2481	(unsigned long long) nr_blocks);
				2482
				2483	cache->cache_size = size;
				2484	}
				2485
				2486	static int is_congested(struct dm_dev *dev, int bdi_bits)
				2487	{
				2488	struct request_queue *q = bdev_get_queue(dev->bdev);
				2489	return bdi_congested(q->backing_dev_info, bdi_bits);
				2490	}
				2491
				2492	static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
				2493	{
				2494	struct cache *cache = container_of(cb, struct cache, callbacks);
				2495
				2496	return is_congested(cache->origin_dev, bdi_bits) \|\|
				2497	is_congested(cache->cache_dev, bdi_bits);
				2498	}
				2499
				2500	#define DEFAULT_MIGRATION_THRESHOLD 2048
				2501
				2502	static int cache_create(struct cache_args ca, struct cache *result)
				2503	{
				2504	int r = 0;
				2505	char **error = &ca->ti->error;
				2506	struct cache *cache;
				2507	struct dm_target *ti = ca->ti;
				2508	dm_block_t origin_blocks;
				2509	struct dm_cache_metadata *cmd;
				2510	bool may_format = ca->features.mode == CM_WRITE;
				2511
				2512	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
				2513	if (!cache)
				2514	return -ENOMEM;
				2515
				2516	cache->ti = ca->ti;
				2517	ti->private = cache;
				2518	ti->num_flush_bios = 2;
				2519	ti->flush_supported = true;
				2520
				2521	ti->num_discard_bios = 1;
				2522	ti->discards_supported = true;
				2523	ti->split_discard_bios = false;
				2524
				2525	cache->features = ca->features;
				2526	ti->per_io_data_size = get_per_bio_data_size(cache);
				2527
				2528	if (writethrough_mode(cache)) {
				2529	/* Create bioset for writethrough bios issued to origin */
				2530	cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0);
				2531	if (!cache->bs)
				2532	goto bad;
				2533	}
				2534
				2535	cache->callbacks.congested_fn = cache_is_congested;
				2536	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
				2537
				2538	cache->metadata_dev = ca->metadata_dev;
				2539	cache->origin_dev = ca->origin_dev;
				2540	cache->cache_dev = ca->cache_dev;
				2541
				2542	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
				2543
				2544	origin_blocks = cache->origin_sectors = ca->origin_sectors;
				2545	origin_blocks = block_div(origin_blocks, ca->block_size);
				2546	cache->origin_blocks = to_oblock(origin_blocks);
				2547
				2548	cache->sectors_per_block = ca->block_size;
				2549	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
				2550	r = -EINVAL;
				2551	goto bad;
				2552	}
				2553
				2554	if (ca->block_size & (ca->block_size - 1)) {
				2555	dm_block_t cache_size = ca->cache_sectors;
				2556
				2557	cache->sectors_per_block_shift = -1;
				2558	cache_size = block_div(cache_size, ca->block_size);
				2559	set_cache_size(cache, to_cblock(cache_size));
				2560	} else {
				2561	cache->sectors_per_block_shift = __ffs(ca->block_size);
				2562	set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
				2563	}
				2564
				2565	r = create_cache_policy(cache, ca, error);
				2566	if (r)
				2567	goto bad;
				2568
				2569	cache->policy_nr_args = ca->policy_argc;
				2570	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
				2571
				2572	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
				2573	if (r) {
				2574	*error = "Error setting cache policy's config values";
				2575	goto bad;
				2576	}
				2577
				2578	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
				2579	ca->block_size, may_format,
				2580	dm_cache_policy_get_hint_size(cache->policy),
				2581	ca->features.metadata_version);
				2582	if (IS_ERR(cmd)) {
				2583	*error = "Error creating metadata object";
				2584	r = PTR_ERR(cmd);
				2585	goto bad;
				2586	}
				2587	cache->cmd = cmd;
				2588	set_cache_mode(cache, CM_WRITE);
				2589	if (get_cache_mode(cache) != CM_WRITE) {
				2590	*error = "Unable to get write access to metadata, please check/repair metadata.";
				2591	r = -EINVAL;
				2592	goto bad;
				2593	}
				2594
				2595	if (passthrough_mode(cache)) {
				2596	bool all_clean;
				2597
				2598	r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
				2599	if (r) {
				2600	*error = "dm_cache_metadata_all_clean() failed";
				2601	goto bad;
				2602	}
				2603
				2604	if (!all_clean) {
				2605	*error = "Cannot enter passthrough mode unless all blocks are clean";
				2606	r = -EINVAL;
				2607	goto bad;
				2608	}
				2609
				2610	policy_allow_migrations(cache->policy, false);
				2611	}
				2612
				2613	spin_lock_init(&cache->lock);
				2614	INIT_LIST_HEAD(&cache->deferred_cells);
				2615	bio_list_init(&cache->deferred_bios);
				2616	atomic_set(&cache->nr_allocated_migrations, 0);
				2617	atomic_set(&cache->nr_io_migrations, 0);
				2618	init_waitqueue_head(&cache->migration_wait);
				2619
				2620	r = -ENOMEM;
				2621	atomic_set(&cache->nr_dirty, 0);
				2622	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
				2623	if (!cache->dirty_bitset) {
				2624	*error = "could not allocate dirty bitset";
				2625	goto bad;
				2626	}
				2627	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
				2628
				2629	cache->discard_block_size =
				2630	calculate_discard_block_size(cache->sectors_per_block,
				2631	cache->origin_sectors);
				2632	cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
				2633	cache->discard_block_size));
				2634	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
				2635	if (!cache->discard_bitset) {
				2636	*error = "could not allocate discard bitset";
				2637	goto bad;
				2638	}
				2639	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
				2640
				2641	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
				2642	if (IS_ERR(cache->copier)) {
				2643	*error = "could not create kcopyd client";
				2644	r = PTR_ERR(cache->copier);
				2645	goto bad;
				2646	}
				2647
				2648	cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
				2649	if (!cache->wq) {
				2650	*error = "could not create workqueue for metadata object";
				2651	goto bad;
				2652	}
				2653	INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
				2654	INIT_WORK(&cache->migration_worker, check_migrations);
				2655	INIT_DELAYED_WORK(&cache->waker, do_waker);
				2656
				2657	cache->prison = dm_bio_prison_create_v2(cache->wq);
				2658	if (!cache->prison) {
				2659	*error = "could not create bio prison";
				2660	goto bad;
				2661	}
				2662
				2663	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
				2664	migration_cache);
				2665	if (!cache->migration_pool) {
				2666	*error = "Error creating cache's migration mempool";
				2667	goto bad;
				2668	}
				2669
				2670	cache->need_tick_bio = true;
				2671	cache->sized = false;
				2672	cache->invalidate = false;
				2673	cache->commit_requested = false;
				2674	cache->loaded_mappings = false;
				2675	cache->loaded_discards = false;
				2676
				2677	load_stats(cache);
				2678
				2679	atomic_set(&cache->stats.demotion, 0);
				2680	atomic_set(&cache->stats.promotion, 0);
				2681	atomic_set(&cache->stats.copies_avoided, 0);
				2682	atomic_set(&cache->stats.cache_cell_clash, 0);
				2683	atomic_set(&cache->stats.commit_count, 0);
				2684	atomic_set(&cache->stats.discard_count, 0);
				2685
				2686	spin_lock_init(&cache->invalidation_lock);
				2687	INIT_LIST_HEAD(&cache->invalidation_requests);
				2688
				2689	batcher_init(&cache->committer, commit_op, cache,
				2690	issue_op, cache, cache->wq);
				2691	iot_init(&cache->tracker);
				2692
				2693	init_rwsem(&cache->background_work_lock);
				2694	prevent_background_work(cache);
				2695
				2696	*result = cache;
				2697	return 0;
				2698	bad:
				2699	destroy(cache);
				2700	return r;
				2701	}
				2702
				2703	static int copy_ctr_args(struct cache cache, int argc, const char *argv)
				2704	{
				2705	unsigned i;
				2706	const char **copy;
				2707
				2708	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
				2709	if (!copy)
				2710	return -ENOMEM;
				2711	for (i = 0; i < argc; i++) {
				2712	copy[i] = kstrdup(argv[i], GFP_KERNEL);
				2713	if (!copy[i]) {
				2714	while (i--)
				2715	kfree(copy[i]);
				2716	kfree(copy);
				2717	return -ENOMEM;
				2718	}
				2719	}
				2720
				2721	cache->nr_ctr_args = argc;
				2722	cache->ctr_args = copy;
				2723
				2724	return 0;
				2725	}
				2726
				2727	static int cache_ctr(struct dm_target ti, unsigned argc, char *argv)
				2728	{
				2729	int r = -EINVAL;
				2730	struct cache_args *ca;
				2731	struct cache *cache = NULL;
				2732
				2733	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
				2734	if (!ca) {
				2735	ti->error = "Error allocating memory for cache";
				2736	return -ENOMEM;
				2737	}
				2738	ca->ti = ti;
				2739
				2740	r = parse_cache_args(ca, argc, argv, &ti->error);
				2741	if (r)
				2742	goto out;
				2743
				2744	r = cache_create(ca, &cache);
				2745	if (r)
				2746	goto out;
				2747
				2748	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
				2749	if (r) {
				2750	destroy(cache);
				2751	goto out;
				2752	}
				2753
				2754	ti->private = cache;
				2755	out:
				2756	destroy_cache_args(ca);
				2757	return r;
				2758	}
				2759
/*----------------------------------------------------------------*/
				2761
				2762	static int cache_map(struct dm_target ti, struct bio bio)
				2763	{
				2764	struct cache *cache = ti->private;
				2765
				2766	int r;
				2767	bool commit_needed;
				2768	dm_oblock_t block = get_bio_block(cache, bio);
				2769	size_t pb_data_size = get_per_bio_data_size(cache);
				2770
				2771	init_per_bio_data(bio, pb_data_size);
				2772	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
				2773	/*
				2774	* This can only occur if the io goes to a partial block at
				2775	* the end of the origin device. We don't cache these.
				2776	* Just remap to the origin and carry on.
				2777	*/
				2778	remap_to_origin(cache, bio);
				2779	accounted_begin(cache, bio);
				2780	return DM_MAPIO_REMAPPED;
				2781	}
				2782
				2783	if (discard_or_flush(bio)) {
				2784	defer_bio(cache, bio);
				2785	return DM_MAPIO_SUBMITTED;
				2786	}
				2787
				2788	r = map_bio(cache, bio, block, &commit_needed);
				2789	if (commit_needed)
				2790	schedule_commit(&cache->committer);
				2791
				2792	return r;
				2793	}
				2794
				2795	static int cache_end_io(struct dm_target ti, struct bio bio,
				2796	blk_status_t *error)
				2797	{
				2798	struct cache *cache = ti->private;
				2799	unsigned long flags;
				2800	size_t pb_data_size = get_per_bio_data_size(cache);
				2801	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
				2802
				2803	if (pb->tick) {
				2804	policy_tick(cache->policy, false);
				2805
				2806	spin_lock_irqsave(&cache->lock, flags);
				2807	cache->need_tick_bio = true;
				2808	spin_unlock_irqrestore(&cache->lock, flags);
				2809	}
				2810
				2811	bio_drop_shared_lock(cache, bio);
				2812	accounted_complete(cache, bio);
				2813
				2814	return DM_ENDIO_DONE;
				2815	}
				2816
				2817	static int write_dirty_bitset(struct cache *cache)
				2818	{
				2819	int r;
				2820
				2821	if (get_cache_mode(cache) >= CM_READ_ONLY)
				2822	return -EINVAL;
				2823
				2824	r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
				2825	if (r)
				2826	metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
				2827
				2828	return r;
				2829	}
				2830
				2831	static int write_discard_bitset(struct cache *cache)
				2832	{
				2833	unsigned i, r;
				2834
				2835	if (get_cache_mode(cache) >= CM_READ_ONLY)
				2836	return -EINVAL;
				2837
				2838	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
				2839	cache->discard_nr_blocks);
				2840	if (r) {
				2841	DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
				2842	metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
				2843	return r;
				2844	}
				2845
				2846	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
				2847	r = dm_cache_set_discard(cache->cmd, to_dblock(i),
				2848	is_discarded(cache, to_dblock(i)));
				2849	if (r) {
				2850	metadata_operation_failed(cache, "dm_cache_set_discard", r);
				2851	return r;
				2852	}
				2853	}
				2854
				2855	return 0;
				2856	}
				2857
				2858	static int write_hints(struct cache *cache)
				2859	{
				2860	int r;
				2861
				2862	if (get_cache_mode(cache) >= CM_READ_ONLY)
				2863	return -EINVAL;
				2864
				2865	r = dm_cache_write_hints(cache->cmd, cache->policy);
				2866	if (r) {
				2867	metadata_operation_failed(cache, "dm_cache_write_hints", r);
				2868	return r;
				2869	}
				2870
				2871	return 0;
				2872	}
				2873
				2874	/*
				2875	* returns true on success
				2876	*/
				2877	static bool sync_metadata(struct cache *cache)
				2878	{
				2879	int r1, r2, r3, r4;
				2880
				2881	r1 = write_dirty_bitset(cache);
				2882	if (r1)
				2883	DMERR("%s: could not write dirty bitset", cache_device_name(cache));
				2884
				2885	r2 = write_discard_bitset(cache);
				2886	if (r2)
				2887	DMERR("%s: could not write discard bitset", cache_device_name(cache));
				2888
				2889	save_stats(cache);
				2890
				2891	r3 = write_hints(cache);
				2892	if (r3)
				2893	DMERR("%s: could not write hints", cache_device_name(cache));
				2894
				2895	/*
				2896	* If writing the above metadata failed, we still commit, but don't
				2897	* set the clean shutdown flag. This will effectively force every
				2898	* dirty bit to be set on reload.
				2899	*/
				2900	r4 = commit(cache, !r1 && !r2 && !r3);
				2901	if (r4)
				2902	DMERR("%s: could not write cache metadata", cache_device_name(cache));
				2903
				2904	return !r1 && !r2 && !r3 && !r4;
				2905	}
				2906
				2907	static void cache_postsuspend(struct dm_target *ti)
				2908	{
				2909	struct cache *cache = ti->private;
				2910
				2911	prevent_background_work(cache);
				2912	BUG_ON(atomic_read(&cache->nr_io_migrations));
				2913
				2914	cancel_delayed_work_sync(&cache->waker);
				2915	drain_workqueue(cache->wq);
				2916	WARN_ON(cache->tracker.in_flight);
				2917
				2918	/*
				2919	* If it's a flush suspend there won't be any deferred bios, so this
				2920	* call is harmless.
				2921	*/
				2922	requeue_deferred_bios(cache);
				2923
				2924	if (get_cache_mode(cache) == CM_WRITE)
				2925	(void) sync_metadata(cache);
				2926	}
				2927
				2928	static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
				2929	bool dirty, uint32_t hint, bool hint_valid)
				2930	{
				2931	int r;
				2932	struct cache *cache = context;
				2933
				2934	if (dirty) {
				2935	set_bit(from_cblock(cblock), cache->dirty_bitset);
				2936	atomic_inc(&cache->nr_dirty);
				2937	} else
				2938	clear_bit(from_cblock(cblock), cache->dirty_bitset);
				2939
				2940	r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
				2941	if (r)
				2942	return r;
				2943
				2944	return 0;
				2945	}
				2946
				2947	/*
				2948	* The discard block size in the on disk metadata is not
				2949	* neccessarily the same as we're currently using. So we have to
				2950	* be careful to only set the discarded attribute if we know it
				2951	* covers a complete block of the new size.
				2952	*/
				2953	struct discard_load_info {
				2954	struct cache *cache;
				2955
				2956	/*
				2957	* These blocks are sized using the on disk dblock size, rather
				2958	* than the current one.
				2959	*/
				2960	dm_block_t block_size;
				2961	dm_block_t discard_begin, discard_end;
				2962	};
				2963
				2964	static void discard_load_info_init(struct cache *cache,
				2965	struct discard_load_info *li)
				2966	{
				2967	li->cache = cache;
				2968	li->discard_begin = li->discard_end = 0;
				2969	}
				2970
				2971	static void set_discard_range(struct discard_load_info *li)
				2972	{
				2973	sector_t b, e;
				2974
				2975	if (li->discard_begin == li->discard_end)
				2976	return;
				2977
				2978	/*
				2979	* Convert to sectors.
				2980	*/
				2981	b = li->discard_begin * li->block_size;
				2982	e = li->discard_end * li->block_size;
				2983
				2984	/*
				2985	* Then convert back to the current dblock size.
				2986	*/
				2987	b = dm_sector_div_up(b, li->cache->discard_block_size);
				2988	sector_div(e, li->cache->discard_block_size);
				2989
				2990	/*
				2991	* The origin may have shrunk, so we need to check we're still in
				2992	* bounds.
				2993	*/
				2994	if (e > from_dblock(li->cache->discard_nr_blocks))
				2995	e = from_dblock(li->cache->discard_nr_blocks);
				2996
				2997	for (; b < e; b++)
				2998	set_discard(li->cache, to_dblock(b));
				2999	}
				3000
				3001	static int load_discard(void *context, sector_t discard_block_size,
				3002	dm_dblock_t dblock, bool discard)
				3003	{
				3004	struct discard_load_info *li = context;
				3005
				3006	li->block_size = discard_block_size;
				3007
				3008	if (discard) {
				3009	if (from_dblock(dblock) == li->discard_end)
				3010	/*
				3011	* We're already in a discard range, just extend it.
				3012	*/
				3013	li->discard_end = li->discard_end + 1ULL;
				3014
				3015	else {
				3016	/*
				3017	* Emit the old range and start a new one.
				3018	*/
				3019	set_discard_range(li);
				3020	li->discard_begin = from_dblock(dblock);
				3021	li->discard_end = li->discard_begin + 1ULL;
				3022	}
				3023	} else {
				3024	set_discard_range(li);
				3025	li->discard_begin = li->discard_end = 0;
				3026	}
				3027
				3028	return 0;
				3029	}
				3030
				3031	static dm_cblock_t get_cache_dev_size(struct cache *cache)
				3032	{
				3033	sector_t size = get_dev_size(cache->cache_dev);
				3034	(void) sector_div(size, cache->sectors_per_block);
				3035	return to_cblock(size);
				3036	}
				3037
				3038	static bool can_resize(struct cache *cache, dm_cblock_t new_size)
				3039	{
				3040	if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
				3041	if (cache->sized) {
				3042	DMERR("%s: unable to extend cache due to missing cache table reload",
				3043	cache_device_name(cache));
				3044	return false;
				3045	}
				3046	}
				3047
				3048	/*
				3049	* We can't drop a dirty block when shrinking the cache.
				3050	*/
				3051	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
				3052	new_size = to_cblock(from_cblock(new_size) + 1);
				3053	if (is_dirty(cache, new_size)) {
				3054	DMERR("%s: unable to shrink cache; cache block %llu is dirty",
				3055	cache_device_name(cache),
				3056	(unsigned long long) from_cblock(new_size));
				3057	return false;
				3058	}
				3059	}
				3060
				3061	return true;
				3062	}
				3063
				3064	static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
				3065	{
				3066	int r;
				3067
				3068	r = dm_cache_resize(cache->cmd, new_size);
				3069	if (r) {
				3070	DMERR("%s: could not resize cache metadata", cache_device_name(cache));
				3071	metadata_operation_failed(cache, "dm_cache_resize", r);
				3072	return r;
				3073	}
				3074
				3075	set_cache_size(cache, new_size);
				3076
				3077	return 0;
				3078	}
				3079
				3080	static int cache_preresume(struct dm_target *ti)
				3081	{
				3082	int r = 0;
				3083	struct cache *cache = ti->private;
				3084	dm_cblock_t csize = get_cache_dev_size(cache);
				3085
				3086	/*
				3087	* Check to see if the cache has resized.
				3088	*/
				3089	if (!cache->sized) {
				3090	r = resize_cache_dev(cache, csize);
				3091	if (r)
				3092	return r;
				3093
				3094	cache->sized = true;
				3095
				3096	} else if (csize != cache->cache_size) {
				3097	if (!can_resize(cache, csize))
				3098	return -EINVAL;
				3099
				3100	r = resize_cache_dev(cache, csize);
				3101	if (r)
				3102	return r;
				3103	}
				3104
				3105	if (!cache->loaded_mappings) {
				3106	r = dm_cache_load_mappings(cache->cmd, cache->policy,
				3107	load_mapping, cache);
				3108	if (r) {
				3109	DMERR("%s: could not load cache mappings", cache_device_name(cache));
				3110	metadata_operation_failed(cache, "dm_cache_load_mappings", r);
				3111	return r;
				3112	}
				3113
				3114	cache->loaded_mappings = true;
				3115	}
				3116
				3117	if (!cache->loaded_discards) {
				3118	struct discard_load_info li;
				3119
				3120	/*
				3121	* The discard bitset could have been resized, or the
				3122	* discard block size changed. To be safe we start by
				3123	* setting every dblock to not discarded.
				3124	*/
				3125	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
				3126
				3127	discard_load_info_init(cache, &li);
				3128	r = dm_cache_load_discards(cache->cmd, load_discard, &li);
				3129	if (r) {
				3130	DMERR("%s: could not load origin discards", cache_device_name(cache));
				3131	metadata_operation_failed(cache, "dm_cache_load_discards", r);
				3132	return r;
				3133	}
				3134	set_discard_range(&li);
				3135
				3136	cache->loaded_discards = true;
				3137	}
				3138
				3139	return r;
				3140	}
				3141
				3142	static void cache_resume(struct dm_target *ti)
				3143	{
				3144	struct cache *cache = ti->private;
				3145
				3146	cache->need_tick_bio = true;
				3147	allow_background_work(cache);
				3148	do_waker(&cache->waker.work);
				3149	}
				3150
				3151	/*
				3152	* Status format:
				3153	*
				3154	* <metadata block size> <#used metadata blocks>/<#total metadata blocks>
				3155	* <cache block size> <#used cache blocks>/<#total cache blocks>
				3156	* <#read hits> <#read misses> <#write hits> <#write misses>
				3157	* <#demotions> <#promotions> <#dirty>
				3158	* <#features> <features>*
				3159	* <#core args> <core args>
				3160	* <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
				3161	*/
				3162	static void cache_status(struct dm_target *ti, status_type_t type,
				3163	unsigned status_flags, char *result, unsigned maxlen)
				3164	{
				3165	int r = 0;
				3166	unsigned i;
				3167	ssize_t sz = 0;
				3168	dm_block_t nr_free_blocks_metadata = 0;
				3169	dm_block_t nr_blocks_metadata = 0;
				3170	char buf[BDEVNAME_SIZE];
				3171	struct cache *cache = ti->private;
				3172	dm_cblock_t residency;
				3173	bool needs_check;
				3174
				3175	switch (type) {
				3176	case STATUSTYPE_INFO:
				3177	if (get_cache_mode(cache) == CM_FAIL) {
				3178	DMEMIT("Fail");
				3179	break;
				3180	}
				3181
				3182	/* Commit to ensure statistics aren't out-of-date */
				3183	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
				3184	(void) commit(cache, false);
				3185
				3186	r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
				3187	if (r) {
				3188	DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
				3189	cache_device_name(cache), r);
				3190	goto err;
				3191	}
				3192
				3193	r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
				3194	if (r) {
				3195	DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
				3196	cache_device_name(cache), r);
				3197	goto err;
				3198	}
				3199
				3200	residency = policy_residency(cache->policy);
				3201
				3202	DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
				3203	(unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
				3204	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
				3205	(unsigned long long)nr_blocks_metadata,
				3206	(unsigned long long)cache->sectors_per_block,
				3207	(unsigned long long) from_cblock(residency),
				3208	(unsigned long long) from_cblock(cache->cache_size),
				3209	(unsigned) atomic_read(&cache->stats.read_hit),
				3210	(unsigned) atomic_read(&cache->stats.read_miss),
				3211	(unsigned) atomic_read(&cache->stats.write_hit),
				3212	(unsigned) atomic_read(&cache->stats.write_miss),
				3213	(unsigned) atomic_read(&cache->stats.demotion),
				3214	(unsigned) atomic_read(&cache->stats.promotion),
				3215	(unsigned long) atomic_read(&cache->nr_dirty));
				3216
				3217	if (cache->features.metadata_version == 2)
				3218	DMEMIT("2 metadata2 ");
				3219	else
				3220	DMEMIT("1 ");
				3221
				3222	if (writethrough_mode(cache))
				3223	DMEMIT("writethrough ");
				3224
				3225	else if (passthrough_mode(cache))
				3226	DMEMIT("passthrough ");
				3227
				3228	else if (writeback_mode(cache))
				3229	DMEMIT("writeback ");
				3230
				3231	else {
				3232	DMERR("%s: internal error: unknown io mode: %d",
				3233	cache_device_name(cache), (int) cache->features.io_mode);
				3234	goto err;
				3235	}
				3236
				3237	DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
				3238
				3239	DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
				3240	if (sz < maxlen) {
				3241	r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
				3242	if (r)
				3243	DMERR("%s: policy_emit_config_values returned %d",
				3244	cache_device_name(cache), r);
				3245	}
				3246
				3247	if (get_cache_mode(cache) == CM_READ_ONLY)
				3248	DMEMIT("ro ");
				3249	else
				3250	DMEMIT("rw ");
				3251
				3252	r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
				3253
				3254	if (r \|\| needs_check)
				3255	DMEMIT("needs_check ");
				3256	else
				3257	DMEMIT("- ");
				3258
				3259	break;
				3260
				3261	case STATUSTYPE_TABLE:
				3262	format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
				3263	DMEMIT("%s ", buf);
				3264	format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
				3265	DMEMIT("%s ", buf);
				3266	format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
				3267	DMEMIT("%s", buf);
				3268
				3269	for (i = 0; i < cache->nr_ctr_args - 1; i++)
				3270	DMEMIT(" %s", cache->ctr_args[i]);
				3271	if (cache->nr_ctr_args)
				3272	DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
				3273	}
				3274
				3275	return;
				3276
				3277	err:
				3278	DMEMIT("Error");
				3279	}
				3280
				3281	/*
				3282	* Defines a range of cblocks, begin to (end - 1) are in the range. end is
				3283	* the one-past-the-end value.
				3284	*/
				3285	struct cblock_range {
				3286	dm_cblock_t begin;
				3287	dm_cblock_t end;
				3288	};
				3289
				3290	/*
				3291	* A cache block range can take two forms:
				3292	*
				3293	* i) A single cblock, eg. '3456'
				3294	* ii) A begin and end cblock with a dash between, eg. 123-234
				3295	*/
				3296	static int parse_cblock_range(struct cache cache, const char str,
				3297	struct cblock_range *result)
				3298	{
				3299	char dummy;
				3300	uint64_t b, e;
				3301	int r;
				3302
				3303	/*
				3304	* Try and parse form (ii) first.
				3305	*/
				3306	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
				3307	if (r < 0)
				3308	return r;
				3309
				3310	if (r == 2) {
				3311	result->begin = to_cblock(b);
				3312	result->end = to_cblock(e);
				3313	return 0;
				3314	}
				3315
				3316	/*
				3317	* That didn't work, try form (i).
				3318	*/
				3319	r = sscanf(str, "%llu%c", &b, &dummy);
				3320	if (r < 0)
				3321	return r;
				3322
				3323	if (r == 1) {
				3324	result->begin = to_cblock(b);
				3325	result->end = to_cblock(from_cblock(result->begin) + 1u);
				3326	return 0;
				3327	}
				3328
				3329	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
				3330	return -EINVAL;
				3331	}
				3332
				3333	static int validate_cblock_range(struct cache cache, struct cblock_range range)
				3334	{
				3335	uint64_t b = from_cblock(range->begin);
				3336	uint64_t e = from_cblock(range->end);
				3337	uint64_t n = from_cblock(cache->cache_size);
				3338
				3339	if (b >= n) {
				3340	DMERR("%s: begin cblock out of range: %llu >= %llu",
				3341	cache_device_name(cache), b, n);
				3342	return -EINVAL;
				3343	}
				3344
				3345	if (e > n) {
				3346	DMERR("%s: end cblock out of range: %llu > %llu",
				3347	cache_device_name(cache), e, n);
				3348	return -EINVAL;
				3349	}
				3350
				3351	if (b >= e) {
				3352	DMERR("%s: invalid cblock range: %llu >= %llu",
				3353	cache_device_name(cache), b, e);
				3354	return -EINVAL;
				3355	}
				3356
				3357	return 0;
				3358	}
				3359
				3360	static inline dm_cblock_t cblock_succ(dm_cblock_t b)
				3361	{
				3362	return to_cblock(from_cblock(b) + 1);
				3363	}
				3364
				3365	static int request_invalidation(struct cache cache, struct cblock_range range)
				3366	{
				3367	int r = 0;
				3368
				3369	/*
				3370	* We don't need to do any locking here because we know we're in
				3371	* passthrough mode. There's is potential for a race between an
				3372	* invalidation triggered by an io and an invalidation message. This
				3373	* is harmless, we must not worry if the policy call fails.
				3374	*/
				3375	while (range->begin != range->end) {
				3376	r = invalidate_cblock(cache, range->begin);
				3377	if (r)
				3378	return r;
				3379
				3380	range->begin = cblock_succ(range->begin);
				3381	}
				3382
				3383	cache->commit_requested = true;
				3384	return r;
				3385	}
				3386
				3387	static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
				3388	const char **cblock_ranges)
				3389	{
				3390	int r = 0;
				3391	unsigned i;
				3392	struct cblock_range range;
				3393
				3394	if (!passthrough_mode(cache)) {
				3395	DMERR("%s: cache has to be in passthrough mode for invalidation",
				3396	cache_device_name(cache));
				3397	return -EPERM;
				3398	}
				3399
				3400	for (i = 0; i < count; i++) {
				3401	r = parse_cblock_range(cache, cblock_ranges[i], &range);
				3402	if (r)
				3403	break;
				3404
				3405	r = validate_cblock_range(cache, &range);
				3406	if (r)
				3407	break;
				3408
				3409	/*
				3410	* Pass begin and end origin blocks to the worker and wake it.
				3411	*/
				3412	r = request_invalidation(cache, &range);
				3413	if (r)
				3414	break;
				3415	}
				3416
				3417	return r;
				3418	}
				3419
				3420	/*
				3421	* Supports
				3422	* "<key> <value>"
				3423	* and
				3424	* "invalidate_cblocks [(<begin>)\|(<begin>-<end>)]*
				3425	*
				3426	* The key migration_threshold is supported by the cache target core.
				3427	*/
				3428	static int cache_message(struct dm_target ti, unsigned argc, char *argv)
				3429	{
				3430	struct cache *cache = ti->private;
				3431
				3432	if (!argc)
				3433	return -EINVAL;
				3434
				3435	if (get_cache_mode(cache) >= CM_READ_ONLY) {
				3436	DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
				3437	cache_device_name(cache));
				3438	return -EOPNOTSUPP;
				3439	}
				3440
				3441	if (!strcasecmp(argv[0], "invalidate_cblocks"))
				3442	return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
				3443
				3444	if (argc != 2)
				3445	return -EINVAL;
				3446
				3447	return set_config_value(cache, argv[0], argv[1]);
				3448	}
				3449
				3450	static int cache_iterate_devices(struct dm_target *ti,
				3451	iterate_devices_callout_fn fn, void *data)
				3452	{
				3453	int r = 0;
				3454	struct cache *cache = ti->private;
				3455
				3456	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
				3457	if (!r)
				3458	r = fn(ti, cache->origin_dev, 0, ti->len, data);
				3459
				3460	return r;
				3461	}
				3462
				3463	static void set_discard_limits(struct cache cache, struct queue_limits limits)
				3464	{
				3465	/*
				3466	* FIXME: these limits may be incompatible with the cache device
				3467	*/
				3468	limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
				3469	cache->origin_sectors);
				3470	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
				3471	}
				3472
				3473	static void cache_io_hints(struct dm_target ti, struct queue_limits limits)
				3474	{
				3475	struct cache *cache = ti->private;
				3476	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
				3477
				3478	/*
				3479	* If the system-determined stacked limits are compatible with the
				3480	* cache's blocksize (io_opt is a factor) do not override them.
				3481	*/
				3482	if (io_opt_sectors < cache->sectors_per_block \|\|
				3483	do_div(io_opt_sectors, cache->sectors_per_block)) {
				3484	blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
				3485	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
				3486	}
				3487	set_discard_limits(cache, limits);
				3488	}
				3489
				3490	/*----------------------------------------------------------------*/
				3491
				3492	static struct target_type cache_target = {
				3493	.name = "cache",
				3494	.version = {2, 0, 0},
				3495	.module = THIS_MODULE,
				3496	.ctr = cache_ctr,
				3497	.dtr = cache_dtr,
				3498	.map = cache_map,
				3499	.end_io = cache_end_io,
				3500	.postsuspend = cache_postsuspend,
				3501	.preresume = cache_preresume,
				3502	.resume = cache_resume,
				3503	.status = cache_status,
				3504	.message = cache_message,
				3505	.iterate_devices = cache_iterate_devices,
				3506	.io_hints = cache_io_hints,
				3507	};
				3508
				3509	static int __init dm_cache_init(void)
				3510	{
				3511	int r;
				3512
				3513	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
				3514	if (!migration_cache)
				3515	return -ENOMEM;
				3516
				3517	r = dm_register_target(&cache_target);
				3518	if (r) {
				3519	DMERR("cache target registration failed: %d", r);
				3520	kmem_cache_destroy(migration_cache);
				3521	return r;
				3522	}
				3523
				3524	return 0;
				3525	}
				3526
				3527	static void __exit dm_cache_exit(void)
				3528	{
				3529	dm_unregister_target(&cache_target);
				3530	kmem_cache_destroy(migration_cache);
				3531	}
				3532
				3533	module_init(dm_cache_init);
				3534	module_exit(dm_cache_exit);
				3535
				3536	MODULE_DESCRIPTION(DM_NAME " cache target");
				3537	MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
				3538	MODULE_LICENSE("GPL");