Blame - marvell/linux/drivers/md/dm-cache-target.c - T108

blob: c1d2e3376afcd4cdbea7a70457e2ded10f8e2492 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2012 Red Hat. All rights reserved.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include "dm.h"
				8	#include "dm-bio-prison-v2.h"
				9	#include "dm-bio-record.h"
				10	#include "dm-cache-metadata.h"
				11
				12	#include <linux/dm-io.h>
				13	#include <linux/dm-kcopyd.h>
				14	#include <linux/jiffies.h>
				15	#include <linux/init.h>
				16	#include <linux/mempool.h>
				17	#include <linux/module.h>
				18	#include <linux/rwsem.h>
				19	#include <linux/slab.h>
				20	#include <linux/vmalloc.h>
				21
				22	#define DM_MSG_PREFIX "cache"
				23
				24	DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
				25	"A percentage of time allocated for copying to and/or from cache");
				26
				27	/*----------------------------------------------------------------*/
				28
				29	/*
				30	* Glossary:
				31	*
				32	* oblock: index of an origin block
				33	* cblock: index of a cache block
				34	* promotion: movement of a block from origin to cache
				35	* demotion: movement of a block from cache to origin
				36	* migration: movement of a block between the origin and cache device,
				37	* either direction
				38	*/
				39
				40	/*----------------------------------------------------------------*/
				41
				42	struct io_tracker {
				43	spinlock_t lock;
				44
				45	/*
				46	* Sectors of in-flight IO.
				47	*/
				48	sector_t in_flight;
				49
				50	/*
				51	* The time, in jiffies, when this device became idle (if it is
				52	* indeed idle).
				53	*/
				54	unsigned long idle_time;
				55	unsigned long last_update_time;
				56	};
				57
				58	static void iot_init(struct io_tracker *iot)
				59	{
				60	spin_lock_init(&iot->lock);
				61	iot->in_flight = 0ul;
				62	iot->idle_time = 0ul;
				63	iot->last_update_time = jiffies;
				64	}
				65
				66	static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
				67	{
				68	if (iot->in_flight)
				69	return false;
				70
				71	return time_after(jiffies, iot->idle_time + jifs);
				72	}
				73
				74	static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
				75	{
				76	bool r;
				77	unsigned long flags;
				78
				79	spin_lock_irqsave(&iot->lock, flags);
				80	r = __iot_idle_for(iot, jifs);
				81	spin_unlock_irqrestore(&iot->lock, flags);
				82
				83	return r;
				84	}
				85
				86	static void iot_io_begin(struct io_tracker *iot, sector_t len)
				87	{
				88	unsigned long flags;
				89
				90	spin_lock_irqsave(&iot->lock, flags);
				91	iot->in_flight += len;
				92	spin_unlock_irqrestore(&iot->lock, flags);
				93	}
				94
				95	static void __iot_io_end(struct io_tracker *iot, sector_t len)
				96	{
				97	if (!len)
				98	return;
				99
				100	iot->in_flight -= len;
				101	if (!iot->in_flight)
				102	iot->idle_time = jiffies;
				103	}
				104
				105	static void iot_io_end(struct io_tracker *iot, sector_t len)
				106	{
				107	unsigned long flags;
				108
				109	spin_lock_irqsave(&iot->lock, flags);
				110	__iot_io_end(iot, len);
				111	spin_unlock_irqrestore(&iot->lock, flags);
				112	}
				113
				114	/*----------------------------------------------------------------*/
				115
				116	/*
				117	* Represents a chunk of future work. 'input' allows continuations to pass
				118	* values between themselves, typically error values.
				119	*/
				120	struct continuation {
				121	struct work_struct ws;
				122	blk_status_t input;
				123	};
				124
				125	static inline void init_continuation(struct continuation *k,
				126	void (fn)(struct work_struct ))
				127	{
				128	INIT_WORK(&k->ws, fn);
				129	k->input = 0;
				130	}
				131
				132	static inline void queue_continuation(struct workqueue_struct *wq,
				133	struct continuation *k)
				134	{
				135	queue_work(wq, &k->ws);
				136	}
				137
				138	/*----------------------------------------------------------------*/
				139
				140	/*
				141	* The batcher collects together pieces of work that need a particular
				142	* operation to occur before they can proceed (typically a commit).
				143	*/
				144	struct batcher {
				145	/*
				146	* The operation that everyone is waiting for.
				147	*/
				148	blk_status_t (commit_op)(void context);
				149	void *commit_context;
				150
				151	/*
				152	* This is how bios should be issued once the commit op is complete
				153	* (accounted_request).
				154	*/
				155	void (issue_op)(struct bio bio, void *context);
				156	void *issue_context;
				157
				158	/*
				159	* Queued work gets put on here after commit.
				160	*/
				161	struct workqueue_struct *wq;
				162
				163	spinlock_t lock;
				164	struct list_head work_items;
				165	struct bio_list bios;
				166	struct work_struct commit_work;
				167
				168	bool commit_scheduled;
				169	};
				170
				171	static void __commit(struct work_struct *_ws)
				172	{
				173	struct batcher *b = container_of(_ws, struct batcher, commit_work);
				174	blk_status_t r;
				175	unsigned long flags;
				176	struct list_head work_items;
				177	struct work_struct ws, tmp;
				178	struct continuation *k;
				179	struct bio *bio;
				180	struct bio_list bios;
				181
				182	INIT_LIST_HEAD(&work_items);
				183	bio_list_init(&bios);
				184
				185	/*
				186	* We have to grab these before the commit_op to avoid a race
				187	* condition.
				188	*/
				189	spin_lock_irqsave(&b->lock, flags);
				190	list_splice_init(&b->work_items, &work_items);
				191	bio_list_merge(&bios, &b->bios);
				192	bio_list_init(&b->bios);
				193	b->commit_scheduled = false;
				194	spin_unlock_irqrestore(&b->lock, flags);
				195
				196	r = b->commit_op(b->commit_context);
				197
				198	list_for_each_entry_safe(ws, tmp, &work_items, entry) {
				199	k = container_of(ws, struct continuation, ws);
				200	k->input = r;
				201	INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
				202	queue_work(b->wq, ws);
				203	}
				204
				205	while ((bio = bio_list_pop(&bios))) {
				206	if (r) {
				207	bio->bi_status = r;
				208	bio_endio(bio);
				209	} else
				210	b->issue_op(bio, b->issue_context);
				211	}
				212	}
				213
				214	static void batcher_init(struct batcher *b,
				215	blk_status_t (commit_op)(void ),
				216	void *commit_context,
				217	void (issue_op)(struct bio bio, void *),
				218	void *issue_context,
				219	struct workqueue_struct *wq)
				220	{
				221	b->commit_op = commit_op;
				222	b->commit_context = commit_context;
				223	b->issue_op = issue_op;
				224	b->issue_context = issue_context;
				225	b->wq = wq;
				226
				227	spin_lock_init(&b->lock);
				228	INIT_LIST_HEAD(&b->work_items);
				229	bio_list_init(&b->bios);
				230	INIT_WORK(&b->commit_work, __commit);
				231	b->commit_scheduled = false;
				232	}
				233
				234	static void async_commit(struct batcher *b)
				235	{
				236	queue_work(b->wq, &b->commit_work);
				237	}
				238
				239	static void continue_after_commit(struct batcher b, struct continuation k)
				240	{
				241	unsigned long flags;
				242	bool commit_scheduled;
				243
				244	spin_lock_irqsave(&b->lock, flags);
				245	commit_scheduled = b->commit_scheduled;
				246	list_add_tail(&k->ws.entry, &b->work_items);
				247	spin_unlock_irqrestore(&b->lock, flags);
				248
				249	if (commit_scheduled)
				250	async_commit(b);
				251	}
				252
				253	/*
				254	* Bios are errored if commit failed.
				255	*/
				256	static void issue_after_commit(struct batcher b, struct bio bio)
				257	{
				258	unsigned long flags;
				259	bool commit_scheduled;
				260
				261	spin_lock_irqsave(&b->lock, flags);
				262	commit_scheduled = b->commit_scheduled;
				263	bio_list_add(&b->bios, bio);
				264	spin_unlock_irqrestore(&b->lock, flags);
				265
				266	if (commit_scheduled)
				267	async_commit(b);
				268	}
				269
				270	/*
				271	* Call this if some urgent work is waiting for the commit to complete.
				272	*/
				273	static void schedule_commit(struct batcher *b)
				274	{
				275	bool immediate;
				276	unsigned long flags;
				277
				278	spin_lock_irqsave(&b->lock, flags);
				279	immediate = !list_empty(&b->work_items) \|\| !bio_list_empty(&b->bios);
				280	b->commit_scheduled = true;
				281	spin_unlock_irqrestore(&b->lock, flags);
				282
				283	if (immediate)
				284	async_commit(b);
				285	}
				286
				287	/*
				288	* There are a couple of places where we let a bio run, but want to do some
				289	* work before calling its endio function. We do this by temporarily
				290	* changing the endio fn.
				291	*/
				292	struct dm_hook_info {
				293	bio_end_io_t *bi_end_io;
				294	};
				295
				296	static void dm_hook_bio(struct dm_hook_info h, struct bio bio,
				297	bio_end_io_t bi_end_io, void bi_private)
				298	{
				299	h->bi_end_io = bio->bi_end_io;
				300
				301	bio->bi_end_io = bi_end_io;
				302	bio->bi_private = bi_private;
				303	}
				304
				305	static void dm_unhook_bio(struct dm_hook_info h, struct bio bio)
				306	{
				307	bio->bi_end_io = h->bi_end_io;
				308	}
				309
				310	/*----------------------------------------------------------------*/
				311
				/* Tunables for migrations and periodic commits. */
				#define MIGRATION_POOL_SIZE 128
				#define COMMIT_PERIOD HZ
				#define MIGRATION_COUNT_WINDOW 10

				/*
				 * Limits, in sectors, on the block size of the device holding the
				 * cache data: no smaller than 32KB and no larger than 1GB.
				 */
				#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
				#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
				322
				/*
				 * Metadata access mode.  The enumerator order is relied upon: code in
				 * this file tests "mode >= CM_READ_ONLY".
				 */
				enum cache_metadata_mode {
					CM_WRITE,	/* metadata may be changed */
					CM_READ_ONLY,	/* metadata may not be changed */
					CM_FAIL		/* failure mode; set_cache_mode() never leaves it */
				};
				328
				/* How writes are distributed between the cache and the origin. */
				enum cache_io_mode {
					/*
					 * Writes go to the cached block only, which is then marked
					 * dirty.  Losing the cache device means losing data.
					 * Reads and writes can both get faster.
					 */
					CM_IO_WRITEBACK,

					/*
					 * Writes go to both cache and origin, so blocks are never
					 * dirty.  Potential performance benefit for reads only.
					 */
					CM_IO_WRITETHROUGH,

					/*
					 * A degraded mode useful for various cache coherency
					 * situations (eg, rolling back snapshots).  Reads and writes
					 * always go to the origin.  A write to a cached oblock
					 * invalidates the cache block.
					 */
					CM_IO_PASSTHROUGH
				};
				351
				352	struct cache_features {
				353	enum cache_metadata_mode mode;
				354	enum cache_io_mode io_mode;
				355	unsigned metadata_version;
				356	bool discard_passdown:1;
				357	};
				358
				359	struct cache_stats {
				360	atomic_t read_hit;
				361	atomic_t read_miss;
				362	atomic_t write_hit;
				363	atomic_t write_miss;
				364	atomic_t demotion;
				365	atomic_t promotion;
				366	atomic_t writeback;
				367	atomic_t copies_avoided;
				368	atomic_t cache_cell_clash;
				369	atomic_t commit_count;
				370	atomic_t discard_count;
				371	};
				372
				373	struct cache {
				374	struct dm_target *ti;
				375	spinlock_t lock;
				376
				377	/*
				378	* Fields for converting from sectors to blocks.
				379	*/
				380	int sectors_per_block_shift;
				381	sector_t sectors_per_block;
				382
				383	struct dm_cache_metadata *cmd;
				384
				385	/*
				386	* Metadata is written to this device.
				387	*/
				388	struct dm_dev *metadata_dev;
				389
				390	/*
				391	* The slower of the two data devices. Typically a spindle.
				392	*/
				393	struct dm_dev *origin_dev;
				394
				395	/*
				396	* The faster of the two data devices. Typically an SSD.
				397	*/
				398	struct dm_dev *cache_dev;
				399
				400	/*
				401	* Size of the origin device in _complete_ blocks and native sectors.
				402	*/
				403	dm_oblock_t origin_blocks;
				404	sector_t origin_sectors;
				405
				406	/*
				407	* Size of the cache device in blocks.
				408	*/
				409	dm_cblock_t cache_size;
				410
				411	/*
				412	* Invalidation fields.
				413	*/
				414	spinlock_t invalidation_lock;
				415	struct list_head invalidation_requests;
				416
				417	sector_t migration_threshold;
				418	wait_queue_head_t migration_wait;
				419	atomic_t nr_allocated_migrations;
				420
				421	/*
				422	* The number of in flight migrations that are performing
				423	* background io. eg, promotion, writeback.
				424	*/
				425	atomic_t nr_io_migrations;
				426
				427	struct bio_list deferred_bios;
				428
				429	struct rw_semaphore quiesce_lock;
				430
				431	struct dm_target_callbacks callbacks;
				432
				433	/*
				434	* origin_blocks entries, discarded if set.
				435	*/
				436	dm_dblock_t discard_nr_blocks;
				437	unsigned long *discard_bitset;
				438	uint32_t discard_block_size; /* a power of 2 times sectors per block */
				439
				440	/*
				441	* Rather than reconstructing the table line for the status we just
				442	* save it and regurgitate.
				443	*/
				444	unsigned nr_ctr_args;
				445	const char **ctr_args;
				446
				447	struct dm_kcopyd_client *copier;
				448	struct work_struct deferred_bio_worker;
				449	struct work_struct migration_worker;
				450	struct workqueue_struct *wq;
				451	struct delayed_work waker;
				452	struct dm_bio_prison_v2 *prison;
				453
				454	/*
				455	* cache_size entries, dirty if set
				456	*/
				457	unsigned long *dirty_bitset;
				458	atomic_t nr_dirty;
				459
				460	unsigned policy_nr_args;
				461	struct dm_cache_policy *policy;
				462
				463	/*
				464	* Cache features such as write-through.
				465	*/
				466	struct cache_features features;
				467
				468	struct cache_stats stats;
				469
				470	bool need_tick_bio:1;
				471	bool sized:1;
				472	bool invalidate:1;
				473	bool commit_requested:1;
				474	bool loaded_mappings:1;
				475	bool loaded_discards:1;
				476
				477	struct rw_semaphore background_work_lock;
				478
				479	struct batcher committer;
				480	struct work_struct commit_ws;
				481
				482	struct io_tracker tracker;
				483
				484	mempool_t migration_pool;
				485
				486	struct bio_set bs;
				487	};
				488
				489	struct per_bio_data {
				490	bool tick:1;
				491	unsigned req_nr:2;
				492	struct dm_bio_prison_cell_v2 *cell;
				493	struct dm_hook_info hook_info;
				494	sector_t len;
				495	};
				496
				497	struct dm_cache_migration {
				498	struct continuation k;
				499	struct cache *cache;
				500
				501	struct policy_work *op;
				502	struct bio *overwrite_bio;
				503	struct dm_bio_prison_cell_v2 *cell;
				504
				505	dm_cblock_t invalidate_cblock;
				506	dm_oblock_t invalidate_oblock;
				507	};
				508
				509	/*----------------------------------------------------------------*/
				510
				511	static bool writethrough_mode(struct cache *cache)
				512	{
				513	return cache->features.io_mode == CM_IO_WRITETHROUGH;
				514	}
				515
				516	static bool writeback_mode(struct cache *cache)
				517	{
				518	return cache->features.io_mode == CM_IO_WRITEBACK;
				519	}
				520
				521	static inline bool passthrough_mode(struct cache *cache)
				522	{
				523	return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
				524	}
				525
				526	/*----------------------------------------------------------------*/
				527
				528	static void wake_deferred_bio_worker(struct cache *cache)
				529	{
				530	queue_work(cache->wq, &cache->deferred_bio_worker);
				531	}
				532
				533	static void wake_migration_worker(struct cache *cache)
				534	{
				535	if (passthrough_mode(cache))
				536	return;
				537
				538	queue_work(cache->wq, &cache->migration_worker);
				539	}
				540
				541	/*----------------------------------------------------------------*/
				542
				543	static struct dm_bio_prison_cell_v2 alloc_prison_cell(struct cache cache)
				544	{
				545	return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO);
				546	}
				547
				548	static void free_prison_cell(struct cache cache, struct dm_bio_prison_cell_v2 cell)
				549	{
				550	dm_bio_prison_free_cell_v2(cache->prison, cell);
				551	}
				552
				553	static struct dm_cache_migration alloc_migration(struct cache cache)
				554	{
				555	struct dm_cache_migration *mg;
				556
				557	mg = mempool_alloc(&cache->migration_pool, GFP_NOIO);
				558
				559	memset(mg, 0, sizeof(*mg));
				560
				561	mg->cache = cache;
				562	atomic_inc(&cache->nr_allocated_migrations);
				563
				564	return mg;
				565	}
				566
				567	static void free_migration(struct dm_cache_migration *mg)
				568	{
				569	struct cache *cache = mg->cache;
				570
				571	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
				572	wake_up(&cache->migration_wait);
				573
				574	mempool_free(mg, &cache->migration_pool);
				575	}
				576
				577	/*----------------------------------------------------------------*/
				578
				579	static inline dm_oblock_t oblock_succ(dm_oblock_t b)
				580	{
				581	return to_oblock(from_oblock(b) + 1ull);
				582	}
				583
				584	static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
				585	{
				586	key->virtual = 0;
				587	key->dev = 0;
				588	key->block_begin = from_oblock(begin);
				589	key->block_end = from_oblock(end);
				590	}
				591
				592	/*
				593	* We have two lock levels. Level 0, which is used to prevent WRITEs, and
				594	* level 1 which prevents both READs and WRITEs.
				595	*/
				596	#define WRITE_LOCK_LEVEL 0
				597	#define READ_WRITE_LOCK_LEVEL 1
				598
				599	static unsigned lock_level(struct bio *bio)
				600	{
				601	return bio_data_dir(bio) == WRITE ?
				602	WRITE_LOCK_LEVEL :
				603	READ_WRITE_LOCK_LEVEL;
				604	}
				605
				606	/*----------------------------------------------------------------
				607	* Per bio data
				608	*--------------------------------------------------------------*/
				609
				610	static struct per_bio_data get_per_bio_data(struct bio bio)
				611	{
				612	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
				613	BUG_ON(!pb);
				614	return pb;
				615	}
				616
				617	static struct per_bio_data init_per_bio_data(struct bio bio)
				618	{
				619	struct per_bio_data *pb = get_per_bio_data(bio);
				620
				621	pb->tick = false;
				622	pb->req_nr = dm_bio_get_target_bio_nr(bio);
				623	pb->cell = NULL;
				624	pb->len = 0;
				625
				626	return pb;
				627	}
				628
				629	/*----------------------------------------------------------------*/
				630
				631	static void defer_bio(struct cache cache, struct bio bio)
				632	{
				633	unsigned long flags;
				634
				635	spin_lock_irqsave(&cache->lock, flags);
				636	bio_list_add(&cache->deferred_bios, bio);
				637	spin_unlock_irqrestore(&cache->lock, flags);
				638
				639	wake_deferred_bio_worker(cache);
				640	}
				641
				642	static void defer_bios(struct cache cache, struct bio_list bios)
				643	{
				644	unsigned long flags;
				645
				646	spin_lock_irqsave(&cache->lock, flags);
				647	bio_list_merge(&cache->deferred_bios, bios);
				648	bio_list_init(bios);
				649	spin_unlock_irqrestore(&cache->lock, flags);
				650
				651	wake_deferred_bio_worker(cache);
				652	}
				653
				654	/*----------------------------------------------------------------*/
				655
				656	static bool bio_detain_shared(struct cache cache, dm_oblock_t oblock, struct bio bio)
				657	{
				658	bool r;
				659	struct per_bio_data *pb;
				660	struct dm_cell_key_v2 key;
				661	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
				662	struct dm_bio_prison_cell_v2 cell_prealloc, cell;
				663
				664	cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
				665
				666	build_key(oblock, end, &key);
				667	r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
				668	if (!r) {
				669	/*
				670	* Failed to get the lock.
				671	*/
				672	free_prison_cell(cache, cell_prealloc);
				673	return r;
				674	}
				675
				676	if (cell != cell_prealloc)
				677	free_prison_cell(cache, cell_prealloc);
				678
				679	pb = get_per_bio_data(bio);
				680	pb->cell = cell;
				681
				682	return r;
				683	}
				684
				685	/*----------------------------------------------------------------*/
				686
				687	static bool is_dirty(struct cache *cache, dm_cblock_t b)
				688	{
				689	return test_bit(from_cblock(b), cache->dirty_bitset);
				690	}
				691
				692	static void set_dirty(struct cache *cache, dm_cblock_t cblock)
				693	{
				694	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
				695	atomic_inc(&cache->nr_dirty);
				696	policy_set_dirty(cache->policy, cblock);
				697	}
				698	}
				699
				700	/*
				701	* These two are called when setting after migrations to force the policy
				702	* and dirty bitset to be in sync.
				703	*/
				704	static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
				705	{
				706	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
				707	atomic_inc(&cache->nr_dirty);
				708	policy_set_dirty(cache->policy, cblock);
				709	}
				710
				711	static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
				712	{
				713	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
				714	if (atomic_dec_return(&cache->nr_dirty) == 0)
				715	dm_table_event(cache->ti->table);
				716	}
				717
				718	policy_clear_dirty(cache->policy, cblock);
				719	}
				720
				721	/*----------------------------------------------------------------*/
				722
				723	static bool block_size_is_power_of_two(struct cache *cache)
				724	{
				725	return cache->sectors_per_block_shift >= 0;
				726	}
				727
				728	/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
				729	#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
				730	__always_inline
				731	#endif
				732	static dm_block_t block_div(dm_block_t b, uint32_t n)
				733	{
				734	do_div(b, n);
				735
				736	return b;
				737	}
				738
				739	static dm_block_t oblocks_per_dblock(struct cache *cache)
				740	{
				741	dm_block_t oblocks = cache->discard_block_size;
				742
				743	if (block_size_is_power_of_two(cache))
				744	oblocks >>= cache->sectors_per_block_shift;
				745	else
				746	oblocks = block_div(oblocks, cache->sectors_per_block);
				747
				748	return oblocks;
				749	}
				750
				751	static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
				752	{
				753	return to_dblock(block_div(from_oblock(oblock),
				754	oblocks_per_dblock(cache)));
				755	}
				756
				757	static void set_discard(struct cache *cache, dm_dblock_t b)
				758	{
				759	unsigned long flags;
				760
				761	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
				762	atomic_inc(&cache->stats.discard_count);
				763
				764	spin_lock_irqsave(&cache->lock, flags);
				765	set_bit(from_dblock(b), cache->discard_bitset);
				766	spin_unlock_irqrestore(&cache->lock, flags);
				767	}
				768
				769	static void clear_discard(struct cache *cache, dm_dblock_t b)
				770	{
				771	unsigned long flags;
				772
				773	spin_lock_irqsave(&cache->lock, flags);
				774	clear_bit(from_dblock(b), cache->discard_bitset);
				775	spin_unlock_irqrestore(&cache->lock, flags);
				776	}
				777
				778	static bool is_discarded(struct cache *cache, dm_dblock_t b)
				779	{
				780	int r;
				781	unsigned long flags;
				782
				783	spin_lock_irqsave(&cache->lock, flags);
				784	r = test_bit(from_dblock(b), cache->discard_bitset);
				785	spin_unlock_irqrestore(&cache->lock, flags);
				786
				787	return r;
				788	}
				789
				790	static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
				791	{
				792	int r;
				793	unsigned long flags;
				794
				795	spin_lock_irqsave(&cache->lock, flags);
				796	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
				797	cache->discard_bitset);
				798	spin_unlock_irqrestore(&cache->lock, flags);
				799
				800	return r;
				801	}
				802
				803	/*----------------------------------------------------------------
				804	* Remapping
				805	*--------------------------------------------------------------*/
				806	static void remap_to_origin(struct cache cache, struct bio bio)
				807	{
				808	bio_set_dev(bio, cache->origin_dev->bdev);
				809	}
				810
				811	static void remap_to_cache(struct cache cache, struct bio bio,
				812	dm_cblock_t cblock)
				813	{
				814	sector_t bi_sector = bio->bi_iter.bi_sector;
				815	sector_t block = from_cblock(cblock);
				816
				817	bio_set_dev(bio, cache->cache_dev->bdev);
				818	if (!block_size_is_power_of_two(cache))
				819	bio->bi_iter.bi_sector =
				820	(block * cache->sectors_per_block) +
				821	sector_div(bi_sector, cache->sectors_per_block);
				822	else
				823	bio->bi_iter.bi_sector =
				824	(block << cache->sectors_per_block_shift) \|
				825	(bi_sector & (cache->sectors_per_block - 1));
				826	}
				827
				828	static void check_if_tick_bio_needed(struct cache cache, struct bio bio)
				829	{
				830	unsigned long flags;
				831	struct per_bio_data *pb;
				832
				833	spin_lock_irqsave(&cache->lock, flags);
				834	if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
				835	bio_op(bio) != REQ_OP_DISCARD) {
				836	pb = get_per_bio_data(bio);
				837	pb->tick = true;
				838	cache->need_tick_bio = false;
				839	}
				840	spin_unlock_irqrestore(&cache->lock, flags);
				841	}
				842
				843	static void __remap_to_origin_clear_discard(struct cache cache, struct bio bio,
				844	dm_oblock_t oblock, bool bio_has_pbd)
				845	{
				846	if (bio_has_pbd)
				847	check_if_tick_bio_needed(cache, bio);
				848	remap_to_origin(cache, bio);
				849	if (bio_data_dir(bio) == WRITE)
				850	clear_discard(cache, oblock_to_dblock(cache, oblock));
				851	}
				852
				853	static void remap_to_origin_clear_discard(struct cache cache, struct bio bio,
				854	dm_oblock_t oblock)
				855	{
				856	// FIXME: check_if_tick_bio_needed() is called way too much through this interface
				857	__remap_to_origin_clear_discard(cache, bio, oblock, true);
				858	}
				859
				860	static void remap_to_cache_dirty(struct cache cache, struct bio bio,
				861	dm_oblock_t oblock, dm_cblock_t cblock)
				862	{
				863	check_if_tick_bio_needed(cache, bio);
				864	remap_to_cache(cache, bio, cblock);
				865	if (bio_data_dir(bio) == WRITE) {
				866	set_dirty(cache, cblock);
				867	clear_discard(cache, oblock_to_dblock(cache, oblock));
				868	}
				869	}
				870
				871	static dm_oblock_t get_bio_block(struct cache cache, struct bio bio)
				872	{
				873	sector_t block_nr = bio->bi_iter.bi_sector;
				874
				875	if (!block_size_is_power_of_two(cache))
				876	(void) sector_div(block_nr, cache->sectors_per_block);
				877	else
				878	block_nr >>= cache->sectors_per_block_shift;
				879
				880	return to_oblock(block_nr);
				881	}
				882
				883	static bool accountable_bio(struct cache cache, struct bio bio)
				884	{
				885	return bio_op(bio) != REQ_OP_DISCARD;
				886	}
				887
				888	static void accounted_begin(struct cache cache, struct bio bio)
				889	{
				890	struct per_bio_data *pb;
				891
				892	if (accountable_bio(cache, bio)) {
				893	pb = get_per_bio_data(bio);
				894	pb->len = bio_sectors(bio);
				895	iot_io_begin(&cache->tracker, pb->len);
				896	}
				897	}
				898
				899	static void accounted_complete(struct cache cache, struct bio bio)
				900	{
				901	struct per_bio_data *pb = get_per_bio_data(bio);
				902
				903	iot_io_end(&cache->tracker, pb->len);
				904	}
				905
				/* Begin accounting for @bio and submit it via generic_make_request(). */
				static void accounted_request(struct cache *cache, struct bio *bio)
				{
					accounted_begin(cache, bio);
					generic_make_request(bio);
				}
				911
				/*
				 * Issue callback with the (bio, context) shape the batcher expects;
				 * @context is the struct cache.
				 */
				static void issue_op(struct bio *bio, void *context)
				{
					struct cache *cache = context;
					accounted_request(cache, bio);
				}
				917
				918	/*
				919	* When running in writethrough mode we need to send writes to clean blocks
				920	* to both the cache and origin devices. Clone the bio and send them in parallel.
				921	*/
				922	static void remap_to_origin_and_cache(struct cache cache, struct bio bio,
				923	dm_oblock_t oblock, dm_cblock_t cblock)
				924	{
				925	struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs);
				926
				927	BUG_ON(!origin_bio);
				928
				929	bio_chain(origin_bio, bio);
				930	/*
				931	* Passing false to __remap_to_origin_clear_discard() skips
				932	* all code that might use per_bio_data (since clone doesn't have it)
				933	*/
				934	__remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
				935	submit_bio(origin_bio);
				936
				937	remap_to_cache(cache, bio, cblock);
				938	}
				939
				940	/*----------------------------------------------------------------
				941	* Failure modes
				942	*--------------------------------------------------------------*/
				943	static enum cache_metadata_mode get_cache_mode(struct cache *cache)
				944	{
				945	return cache->features.mode;
				946	}
				947
				948	static const char cache_device_name(struct cache cache)
				949	{
				950	return dm_device_name(dm_table_get_md(cache->ti->table));
				951	}
				952
				953	static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
				954	{
				955	const char *descs[] = {
				956	"write",
				957	"read-only",
				958	"fail"
				959	};
				960
				961	dm_table_event(cache->ti->table);
				962	DMINFO("%s: switching cache to %s mode",
				963	cache_device_name(cache), descs[(int)mode]);
				964	}
				965
				966	static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
				967	{
				968	bool needs_check;
				969	enum cache_metadata_mode old_mode = get_cache_mode(cache);
				970
				971	if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
				972	DMERR("%s: unable to read needs_check flag, setting failure mode.",
				973	cache_device_name(cache));
				974	new_mode = CM_FAIL;
				975	}
				976
				977	if (new_mode == CM_WRITE && needs_check) {
				978	DMERR("%s: unable to switch cache to write mode until repaired.",
				979	cache_device_name(cache));
				980	if (old_mode != new_mode)
				981	new_mode = old_mode;
				982	else
				983	new_mode = CM_READ_ONLY;
				984	}
				985
				986	/* Never move out of fail mode */
				987	if (old_mode == CM_FAIL)
				988	new_mode = CM_FAIL;
				989
				990	switch (new_mode) {
				991	case CM_FAIL:
				992	case CM_READ_ONLY:
				993	dm_cache_metadata_set_read_only(cache->cmd);
				994	break;
				995
				996	case CM_WRITE:
				997	dm_cache_metadata_set_read_write(cache->cmd);
				998	break;
				999	}
				1000
				1001	cache->features.mode = new_mode;
				1002
				1003	if (new_mode != old_mode)
				1004	notify_mode_switch(cache, new_mode);
				1005	}
				1006
				1007	static void abort_transaction(struct cache *cache)
				1008	{
				1009	const char *dev_name = cache_device_name(cache);
				1010
				1011	if (get_cache_mode(cache) >= CM_READ_ONLY)
				1012	return;
				1013
				1014	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
				1015	if (dm_cache_metadata_abort(cache->cmd)) {
				1016	DMERR("%s: failed to abort metadata transaction", dev_name);
				1017	set_cache_mode(cache, CM_FAIL);
				1018	}
				1019
				1020	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
				1021	DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
				1022	set_cache_mode(cache, CM_FAIL);
				1023	}
				1024	}
				1025
				1026	static void metadata_operation_failed(struct cache cache, const char op, int r)
				1027	{
				1028	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
				1029	cache_device_name(cache), op, r);
				1030	abort_transaction(cache);
				1031	set_cache_mode(cache, CM_READ_ONLY);
				1032	}
				1033
				1034	/*----------------------------------------------------------------*/
				1035
				1036	static void load_stats(struct cache *cache)
				1037	{
				1038	struct dm_cache_statistics stats;
				1039
				1040	dm_cache_metadata_get_stats(cache->cmd, &stats);
				1041	atomic_set(&cache->stats.read_hit, stats.read_hits);
				1042	atomic_set(&cache->stats.read_miss, stats.read_misses);
				1043	atomic_set(&cache->stats.write_hit, stats.write_hits);
				1044	atomic_set(&cache->stats.write_miss, stats.write_misses);
				1045	}
				1046
				1047	static void save_stats(struct cache *cache)
				1048	{
				1049	struct dm_cache_statistics stats;
				1050
				1051	if (get_cache_mode(cache) >= CM_READ_ONLY)
				1052	return;
				1053
				1054	stats.read_hits = atomic_read(&cache->stats.read_hit);
				1055	stats.read_misses = atomic_read(&cache->stats.read_miss);
				1056	stats.write_hits = atomic_read(&cache->stats.write_hit);
				1057	stats.write_misses = atomic_read(&cache->stats.write_miss);
				1058
				1059	dm_cache_metadata_set_stats(cache->cmd, &stats);
				1060	}
				1061
				1062	static void update_stats(struct cache_stats *stats, enum policy_operation op)
				1063	{
				1064	switch (op) {
				1065	case POLICY_PROMOTE:
				1066	atomic_inc(&stats->promotion);
				1067	break;
				1068
				1069	case POLICY_DEMOTE:
				1070	atomic_inc(&stats->demotion);
				1071	break;
				1072
				1073	case POLICY_WRITEBACK:
				1074	atomic_inc(&stats->writeback);
				1075	break;
				1076	}
				1077	}
				1078
				1079	/*----------------------------------------------------------------
				1080	* Migration processing
				1081	*
				1082	* Migration covers moving data from the origin device to the cache, or
				1083	* vice versa.
				1084	 *--------------------------------------------------------------*/
				1085
				1086	static void inc_io_migrations(struct cache *cache)
				1087	{
				1088	atomic_inc(&cache->nr_io_migrations);
				1089	}
				1090
				1091	static void dec_io_migrations(struct cache *cache)
				1092	{
				1093	atomic_dec(&cache->nr_io_migrations);
				1094	}
				1095
				1096	static bool discard_or_flush(struct bio *bio)
				1097	{
				1098	return bio_op(bio) == REQ_OP_DISCARD \|\| op_is_flush(bio->bi_opf);
				1099	}
				1100
				1101	static void calc_discard_block_range(struct cache cache, struct bio bio,
				1102	dm_dblock_t b, dm_dblock_t e)
				1103	{
				1104	sector_t sb = bio->bi_iter.bi_sector;
				1105	sector_t se = bio_end_sector(bio);
				1106
				1107	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
				1108
				1109	if (se - sb < cache->discard_block_size)
				1110	e = b;
				1111	else
				1112	*e = to_dblock(block_div(se, cache->discard_block_size));
				1113	}
				1114
				1115	/*----------------------------------------------------------------*/
				1116
				1117	static void prevent_background_work(struct cache *cache)
				1118	{
				1119	lockdep_off();
				1120	down_write(&cache->background_work_lock);
				1121	lockdep_on();
				1122	}
				1123
				1124	static void allow_background_work(struct cache *cache)
				1125	{
				1126	lockdep_off();
				1127	up_write(&cache->background_work_lock);
				1128	lockdep_on();
				1129	}
				1130
				1131	static bool background_work_begin(struct cache *cache)
				1132	{
				1133	bool r;
				1134
				1135	lockdep_off();
				1136	r = down_read_trylock(&cache->background_work_lock);
				1137	lockdep_on();
				1138
				1139	return r;
				1140	}
				1141
				1142	static void background_work_end(struct cache *cache)
				1143	{
				1144	lockdep_off();
				1145	up_read(&cache->background_work_lock);
				1146	lockdep_on();
				1147	}
				1148
				1149	/*----------------------------------------------------------------*/
				1150
				1151	static bool bio_writes_complete_block(struct cache cache, struct bio bio)
				1152	{
				1153	return (bio_data_dir(bio) == WRITE) &&
				1154	(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
				1155	}
				1156
				1157	static bool optimisable_bio(struct cache cache, struct bio bio, dm_oblock_t block)
				1158	{
				1159	return writeback_mode(cache) &&
				1160	(is_discarded_oblock(cache, block) \|\| bio_writes_complete_block(cache, bio));
				1161	}
				1162
				1163	static void quiesce(struct dm_cache_migration *mg,
				1164	void (continuation)(struct work_struct ))
				1165	{
				1166	init_continuation(&mg->k, continuation);
				1167	dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
				1168	}
				1169
				1170	static struct dm_cache_migration ws_to_mg(struct work_struct ws)
				1171	{
				1172	struct continuation *k = container_of(ws, struct continuation, ws);
				1173	return container_of(k, struct dm_cache_migration, k);
				1174	}
				1175
				1176	static void copy_complete(int read_err, unsigned long write_err, void *context)
				1177	{
				1178	struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
				1179
				1180	if (read_err \|\| write_err)
				1181	mg->k.input = BLK_STS_IOERR;
				1182
				1183	queue_continuation(mg->cache->wq, &mg->k);
				1184	}
				1185
				1186	static void copy(struct dm_cache_migration *mg, bool promote)
				1187	{
				1188	struct dm_io_region o_region, c_region;
				1189	struct cache *cache = mg->cache;
				1190
				1191	o_region.bdev = cache->origin_dev->bdev;
				1192	o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
				1193	o_region.count = cache->sectors_per_block;
				1194
				1195	c_region.bdev = cache->cache_dev->bdev;
				1196	c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
				1197	c_region.count = cache->sectors_per_block;
				1198
				1199	if (promote)
				1200	dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
				1201	else
				1202	dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
				1203	}
				1204
				1205	static void bio_drop_shared_lock(struct cache cache, struct bio bio)
				1206	{
				1207	struct per_bio_data *pb = get_per_bio_data(bio);
				1208
				1209	if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
				1210	free_prison_cell(cache, pb->cell);
				1211	pb->cell = NULL;
				1212	}
				1213
				1214	static void overwrite_endio(struct bio *bio)
				1215	{
				1216	struct dm_cache_migration *mg = bio->bi_private;
				1217	struct cache *cache = mg->cache;
				1218	struct per_bio_data *pb = get_per_bio_data(bio);
				1219
				1220	dm_unhook_bio(&pb->hook_info, bio);
				1221
				1222	if (bio->bi_status)
				1223	mg->k.input = bio->bi_status;
				1224
				1225	queue_continuation(cache->wq, &mg->k);
				1226	}
				1227
				1228	static void overwrite(struct dm_cache_migration *mg,
				1229	void (continuation)(struct work_struct ))
				1230	{
				1231	struct bio *bio = mg->overwrite_bio;
				1232	struct per_bio_data *pb = get_per_bio_data(bio);
				1233
				1234	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
				1235
				1236	/*
				1237	* The overwrite bio is part of the copy operation, as such it does
				1238	* not set/clear discard or dirty flags.
				1239	*/
				1240	if (mg->op->op == POLICY_PROMOTE)
				1241	remap_to_cache(mg->cache, bio, mg->op->cblock);
				1242	else
				1243	remap_to_origin(mg->cache, bio);
				1244
				1245	init_continuation(&mg->k, continuation);
				1246	accounted_request(mg->cache, bio);
				1247	}
				1248
				1249	/*
				1250	* Migration steps:
				1251	*
				1252	* 1) exclusive lock preventing WRITEs
				1253	* 2) quiesce
				1254	* 3) copy or issue overwrite bio
				1255	* 4) upgrade to exclusive lock preventing READs and WRITEs
				1256	* 5) quiesce
				1257	* 6) update metadata and commit
				1258	* 7) unlock
				1259	*/
				1260	static void mg_complete(struct dm_cache_migration *mg, bool success)
				1261	{
				1262	struct bio_list bios;
				1263	struct cache *cache = mg->cache;
				1264	struct policy_work *op = mg->op;
				1265	dm_cblock_t cblock = op->cblock;
				1266
				1267	if (success)
				1268	update_stats(&cache->stats, op->op);
				1269
				1270	switch (op->op) {
				1271	case POLICY_PROMOTE:
				1272	clear_discard(cache, oblock_to_dblock(cache, op->oblock));
				1273	policy_complete_background_work(cache->policy, op, success);
				1274
				1275	if (mg->overwrite_bio) {
				1276	if (success)
				1277	force_set_dirty(cache, cblock);
				1278	else if (mg->k.input)
				1279	mg->overwrite_bio->bi_status = mg->k.input;
				1280	else
				1281	mg->overwrite_bio->bi_status = BLK_STS_IOERR;
				1282	bio_endio(mg->overwrite_bio);
				1283	} else {
				1284	if (success)
				1285	force_clear_dirty(cache, cblock);
				1286	dec_io_migrations(cache);
				1287	}
				1288	break;
				1289
				1290	case POLICY_DEMOTE:
				1291	/*
				1292	* We clear dirty here to update the nr_dirty counter.
				1293	*/
				1294	if (success)
				1295	force_clear_dirty(cache, cblock);
				1296	policy_complete_background_work(cache->policy, op, success);
				1297	dec_io_migrations(cache);
				1298	break;
				1299
				1300	case POLICY_WRITEBACK:
				1301	if (success)
				1302	force_clear_dirty(cache, cblock);
				1303	policy_complete_background_work(cache->policy, op, success);
				1304	dec_io_migrations(cache);
				1305	break;
				1306	}
				1307
				1308	bio_list_init(&bios);
				1309	if (mg->cell) {
				1310	if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
				1311	free_prison_cell(cache, mg->cell);
				1312	}
				1313
				1314	free_migration(mg);
				1315	defer_bios(cache, &bios);
				1316	wake_migration_worker(cache);
				1317
				1318	background_work_end(cache);
				1319	}
				1320
				1321	static void mg_success(struct work_struct *ws)
				1322	{
				1323	struct dm_cache_migration *mg = ws_to_mg(ws);
				1324	mg_complete(mg, mg->k.input == 0);
				1325	}
				1326
				1327	static void mg_update_metadata(struct work_struct *ws)
				1328	{
				1329	int r;
				1330	struct dm_cache_migration *mg = ws_to_mg(ws);
				1331	struct cache *cache = mg->cache;
				1332	struct policy_work *op = mg->op;
				1333
				1334	switch (op->op) {
				1335	case POLICY_PROMOTE:
				1336	r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
				1337	if (r) {
				1338	DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
				1339	cache_device_name(cache));
				1340	metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
				1341
				1342	mg_complete(mg, false);
				1343	return;
				1344	}
				1345	mg_complete(mg, true);
				1346	break;
				1347
				1348	case POLICY_DEMOTE:
				1349	r = dm_cache_remove_mapping(cache->cmd, op->cblock);
				1350	if (r) {
				1351	DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
				1352	cache_device_name(cache));
				1353	metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
				1354
				1355	mg_complete(mg, false);
				1356	return;
				1357	}
				1358
				1359	/*
				1360	* It would be nice if we only had to commit when a REQ_FLUSH
				1361	* comes through. But there's one scenario that we have to
				1362	* look out for:
				1363	*
				1364	* - vblock x in a cache block
				1365	* - domotion occurs
				1366	* - cache block gets reallocated and over written
				1367	* - crash
				1368	*
				1369	* When we recover, because there was no commit the cache will
				1370	* rollback to having the data for vblock x in the cache block.
				1371	* But the cache block has since been overwritten, so it'll end
				1372	* up pointing to data that was never in 'x' during the history
				1373	* of the device.
				1374	*
				1375	* To avoid this issue we require a commit as part of the
				1376	* demotion operation.
				1377	*/
				1378	init_continuation(&mg->k, mg_success);
				1379	continue_after_commit(&cache->committer, &mg->k);
				1380	schedule_commit(&cache->committer);
				1381	break;
				1382
				1383	case POLICY_WRITEBACK:
				1384	mg_complete(mg, true);
				1385	break;
				1386	}
				1387	}
				1388
				1389	static void mg_update_metadata_after_copy(struct work_struct *ws)
				1390	{
				1391	struct dm_cache_migration *mg = ws_to_mg(ws);
				1392
				1393	/*
				1394	* Did the copy succeed?
				1395	*/
				1396	if (mg->k.input)
				1397	mg_complete(mg, false);
				1398	else
				1399	mg_update_metadata(ws);
				1400	}
				1401
				1402	static void mg_upgrade_lock(struct work_struct *ws)
				1403	{
				1404	int r;
				1405	struct dm_cache_migration *mg = ws_to_mg(ws);
				1406
				1407	/*
				1408	* Did the copy succeed?
				1409	*/
				1410	if (mg->k.input)
				1411	mg_complete(mg, false);
				1412
				1413	else {
				1414	/*
				1415	* Now we want the lock to prevent both reads and writes.
				1416	*/
				1417	r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
				1418	READ_WRITE_LOCK_LEVEL);
				1419	if (r < 0)
				1420	mg_complete(mg, false);
				1421
				1422	else if (r)
				1423	quiesce(mg, mg_update_metadata);
				1424
				1425	else
				1426	mg_update_metadata(ws);
				1427	}
				1428	}
				1429
				1430	static void mg_full_copy(struct work_struct *ws)
				1431	{
				1432	struct dm_cache_migration *mg = ws_to_mg(ws);
				1433	struct cache *cache = mg->cache;
				1434	struct policy_work *op = mg->op;
				1435	bool is_policy_promote = (op->op == POLICY_PROMOTE);
				1436
				1437	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) \|\|
				1438	is_discarded_oblock(cache, op->oblock)) {
				1439	mg_upgrade_lock(ws);
				1440	return;
				1441	}
				1442
				1443	init_continuation(&mg->k, mg_upgrade_lock);
				1444	copy(mg, is_policy_promote);
				1445	}
				1446
				1447	static void mg_copy(struct work_struct *ws)
				1448	{
				1449	struct dm_cache_migration *mg = ws_to_mg(ws);
				1450
				1451	if (mg->overwrite_bio) {
				1452	/*
				1453	* No exclusive lock was held when we last checked if the bio
				1454	* was optimisable. So we have to check again in case things
				1455	* have changed (eg, the block may no longer be discarded).
				1456	*/
				1457	if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
				1458	/*
				1459	* Fallback to a real full copy after doing some tidying up.
				1460	*/
				1461	bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
				1462	BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */
				1463	mg->overwrite_bio = NULL;
				1464	inc_io_migrations(mg->cache);
				1465	mg_full_copy(ws);
				1466	return;
				1467	}
				1468
				1469	/*
				1470	* It's safe to do this here, even though it's new data
				1471	* because all IO has been locked out of the block.
				1472	*
				1473	* mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
				1474	* so _not_ using mg_upgrade_lock() as continutation.
				1475	*/
				1476	overwrite(mg, mg_update_metadata_after_copy);
				1477
				1478	} else
				1479	mg_full_copy(ws);
				1480	}
				1481
				1482	static int mg_lock_writes(struct dm_cache_migration *mg)
				1483	{
				1484	int r;
				1485	struct dm_cell_key_v2 key;
				1486	struct cache *cache = mg->cache;
				1487	struct dm_bio_prison_cell_v2 *prealloc;
				1488
				1489	prealloc = alloc_prison_cell(cache);
				1490
				1491	/*
				1492	* Prevent writes to the block, but allow reads to continue.
				1493	* Unless we're using an overwrite bio, in which case we lock
				1494	* everything.
				1495	*/
				1496	build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
				1497	r = dm_cell_lock_v2(cache->prison, &key,
				1498	mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
				1499	prealloc, &mg->cell);
				1500	if (r < 0) {
				1501	free_prison_cell(cache, prealloc);
				1502	mg_complete(mg, false);
				1503	return r;
				1504	}
				1505
				1506	if (mg->cell != prealloc)
				1507	free_prison_cell(cache, prealloc);
				1508
				1509	if (r == 0)
				1510	mg_copy(&mg->k.ws);
				1511	else
				1512	quiesce(mg, mg_copy);
				1513
				1514	return 0;
				1515	}
				1516
				1517	static int mg_start(struct cache cache, struct policy_work op, struct bio *bio)
				1518	{
				1519	struct dm_cache_migration *mg;
				1520
				1521	if (!background_work_begin(cache)) {
				1522	policy_complete_background_work(cache->policy, op, false);
				1523	return -EPERM;
				1524	}
				1525
				1526	mg = alloc_migration(cache);
				1527
				1528	mg->op = op;
				1529	mg->overwrite_bio = bio;
				1530
				1531	if (!bio)
				1532	inc_io_migrations(cache);
				1533
				1534	return mg_lock_writes(mg);
				1535	}
				1536
				1537	/*----------------------------------------------------------------
				1538	* invalidation processing
				1539	 *--------------------------------------------------------------*/
				1540
				1541	static void invalidate_complete(struct dm_cache_migration *mg, bool success)
				1542	{
				1543	struct bio_list bios;
				1544	struct cache *cache = mg->cache;
				1545
				1546	bio_list_init(&bios);
				1547	if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
				1548	free_prison_cell(cache, mg->cell);
				1549
				1550	if (!success && mg->overwrite_bio)
				1551	bio_io_error(mg->overwrite_bio);
				1552
				1553	free_migration(mg);
				1554	defer_bios(cache, &bios);
				1555
				1556	background_work_end(cache);
				1557	}
				1558
				1559	static void invalidate_completed(struct work_struct *ws)
				1560	{
				1561	struct dm_cache_migration *mg = ws_to_mg(ws);
				1562	invalidate_complete(mg, !mg->k.input);
				1563	}
				1564
				1565	static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
				1566	{
				1567	int r = policy_invalidate_mapping(cache->policy, cblock);
				1568	if (!r) {
				1569	r = dm_cache_remove_mapping(cache->cmd, cblock);
				1570	if (r) {
				1571	DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
				1572	cache_device_name(cache));
				1573	metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
				1574	}
				1575
				1576	} else if (r == -ENODATA) {
				1577	/*
				1578	* Harmless, already unmapped.
				1579	*/
				1580	r = 0;
				1581
				1582	} else
				1583	DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
				1584
				1585	return r;
				1586	}
				1587
				1588	static void invalidate_remove(struct work_struct *ws)
				1589	{
				1590	int r;
				1591	struct dm_cache_migration *mg = ws_to_mg(ws);
				1592	struct cache *cache = mg->cache;
				1593
				1594	r = invalidate_cblock(cache, mg->invalidate_cblock);
				1595	if (r) {
				1596	invalidate_complete(mg, false);
				1597	return;
				1598	}
				1599
				1600	init_continuation(&mg->k, invalidate_completed);
				1601	continue_after_commit(&cache->committer, &mg->k);
				1602	remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
				1603	mg->overwrite_bio = NULL;
				1604	schedule_commit(&cache->committer);
				1605	}
				1606
				1607	static int invalidate_lock(struct dm_cache_migration *mg)
				1608	{
				1609	int r;
				1610	struct dm_cell_key_v2 key;
				1611	struct cache *cache = mg->cache;
				1612	struct dm_bio_prison_cell_v2 *prealloc;
				1613
				1614	prealloc = alloc_prison_cell(cache);
				1615
				1616	build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
				1617	r = dm_cell_lock_v2(cache->prison, &key,
				1618	READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
				1619	if (r < 0) {
				1620	free_prison_cell(cache, prealloc);
				1621	invalidate_complete(mg, false);
				1622	return r;
				1623	}
				1624
				1625	if (mg->cell != prealloc)
				1626	free_prison_cell(cache, prealloc);
				1627
				1628	if (r)
				1629	quiesce(mg, invalidate_remove);
				1630
				1631	else {
				1632	/*
				1633	* We can't call invalidate_remove() directly here because we
				1634	* might still be in request context.
				1635	*/
				1636	init_continuation(&mg->k, invalidate_remove);
				1637	queue_work(cache->wq, &mg->k.ws);
				1638	}
				1639
				1640	return 0;
				1641	}
				1642
				1643	static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
				1644	dm_oblock_t oblock, struct bio *bio)
				1645	{
				1646	struct dm_cache_migration *mg;
				1647
				1648	if (!background_work_begin(cache))
				1649	return -EPERM;
				1650
				1651	mg = alloc_migration(cache);
				1652
				1653	mg->overwrite_bio = bio;
				1654	mg->invalidate_cblock = cblock;
				1655	mg->invalidate_oblock = oblock;
				1656
				1657	return invalidate_lock(mg);
				1658	}
				1659
				1660	/*----------------------------------------------------------------
				1661	* bio processing
				1662	 *--------------------------------------------------------------*/
				1663
				/* Result of spare_migration_bandwidth(). */
				enum busy {
					IDLE,	/* device idle and migration volume within the threshold */
					BUSY	/* anything else */
				};
				1668
				1669	static enum busy spare_migration_bandwidth(struct cache *cache)
				1670	{
				1671	bool idle = iot_idle_for(&cache->tracker, HZ);
				1672	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
				1673	cache->sectors_per_block;
				1674
				1675	if (idle && current_volume <= cache->migration_threshold)
				1676	return IDLE;
				1677	else
				1678	return BUSY;
				1679	}
				1680
				1681	static void inc_hit_counter(struct cache cache, struct bio bio)
				1682	{
				1683	atomic_inc(bio_data_dir(bio) == READ ?
				1684	&cache->stats.read_hit : &cache->stats.write_hit);
				1685	}
				1686
				1687	static void inc_miss_counter(struct cache cache, struct bio bio)
				1688	{
				1689	atomic_inc(bio_data_dir(bio) == READ ?
				1690	&cache->stats.read_miss : &cache->stats.write_miss);
				1691	}
				1692
				1693	/*----------------------------------------------------------------*/
				1694
				1695	static int map_bio(struct cache cache, struct bio bio, dm_oblock_t block,
				1696	bool *commit_needed)
				1697	{
				1698	int r, data_dir;
				1699	bool rb, background_queued;
				1700	dm_cblock_t cblock;
				1701
				1702	*commit_needed = false;
				1703
				1704	rb = bio_detain_shared(cache, block, bio);
				1705	if (!rb) {
				1706	/*
				1707	* An exclusive lock is held for this block, so we have to
				1708	* wait. We set the commit_needed flag so the current
				1709	* transaction will be committed asap, allowing this lock
				1710	* to be dropped.
				1711	*/
				1712	*commit_needed = true;
				1713	return DM_MAPIO_SUBMITTED;
				1714	}
				1715
				1716	data_dir = bio_data_dir(bio);
				1717
				1718	if (optimisable_bio(cache, bio, block)) {
				1719	struct policy_work *op = NULL;
				1720
				1721	r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
				1722	if (unlikely(r && r != -ENOENT)) {
				1723	DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
				1724	cache_device_name(cache), r);
				1725	bio_io_error(bio);
				1726	return DM_MAPIO_SUBMITTED;
				1727	}
				1728
				1729	if (r == -ENOENT && op) {
				1730	bio_drop_shared_lock(cache, bio);
				1731	BUG_ON(op->op != POLICY_PROMOTE);
				1732	mg_start(cache, op, bio);
				1733	return DM_MAPIO_SUBMITTED;
				1734	}
				1735	} else {
				1736	r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
				1737	if (unlikely(r && r != -ENOENT)) {
				1738	DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
				1739	cache_device_name(cache), r);
				1740	bio_io_error(bio);
				1741	return DM_MAPIO_SUBMITTED;
				1742	}
				1743
				1744	if (background_queued)
				1745	wake_migration_worker(cache);
				1746	}
				1747
				1748	if (r == -ENOENT) {
				1749	struct per_bio_data *pb = get_per_bio_data(bio);
				1750
				1751	/*
				1752	* Miss.
				1753	*/
				1754	inc_miss_counter(cache, bio);
				1755	if (pb->req_nr == 0) {
				1756	accounted_begin(cache, bio);
				1757	remap_to_origin_clear_discard(cache, bio, block);
				1758	} else {
				1759	/*
				1760	* This is a duplicate writethrough io that is no
				1761	* longer needed because the block has been demoted.
				1762	*/
				1763	bio_endio(bio);
				1764	return DM_MAPIO_SUBMITTED;
				1765	}
				1766	} else {
				1767	/*
				1768	* Hit.
				1769	*/
				1770	inc_hit_counter(cache, bio);
				1771
				1772	/*
				1773	* Passthrough always maps to the origin, invalidating any
				1774	* cache blocks that are written to.
				1775	*/
				1776	if (passthrough_mode(cache)) {
				1777	if (bio_data_dir(bio) == WRITE) {
				1778	bio_drop_shared_lock(cache, bio);
				1779	atomic_inc(&cache->stats.demotion);
				1780	invalidate_start(cache, cblock, block, bio);
				1781	} else
				1782	remap_to_origin_clear_discard(cache, bio, block);
				1783	} else {
				1784	if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
				1785	!is_dirty(cache, cblock)) {
				1786	remap_to_origin_and_cache(cache, bio, block, cblock);
				1787	accounted_begin(cache, bio);
				1788	} else
				1789	remap_to_cache_dirty(cache, bio, block, cblock);
				1790	}
				1791	}
				1792
				1793	/*
				1794	* dm core turns FUA requests into a separate payload and FLUSH req.
				1795	*/
				1796	if (bio->bi_opf & REQ_FUA) {
				1797	/*
				1798	* issue_after_commit will call accounted_begin a second time. So
				1799	* we call accounted_complete() to avoid double accounting.
				1800	*/
				1801	accounted_complete(cache, bio);
				1802	issue_after_commit(&cache->committer, bio);
				1803	*commit_needed = true;
				1804	return DM_MAPIO_SUBMITTED;
				1805	}
				1806
				1807	return DM_MAPIO_REMAPPED;
				1808	}
				1809
				1810	static bool process_bio(struct cache cache, struct bio bio)
				1811	{
				1812	bool commit_needed;
				1813
				1814	if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
				1815	generic_make_request(bio);
				1816
				1817	return commit_needed;
				1818	}
				1819
				1820	/*
				1821	* A non-zero return indicates read_only or fail_io mode.
				1822	*/
				1823	static int commit(struct cache *cache, bool clean_shutdown)
				1824	{
				1825	int r;
				1826
				1827	if (get_cache_mode(cache) >= CM_READ_ONLY)
				1828	return -EINVAL;
				1829
				1830	atomic_inc(&cache->stats.commit_count);
				1831	r = dm_cache_commit(cache->cmd, clean_shutdown);
				1832	if (r)
				1833	metadata_operation_failed(cache, "dm_cache_commit", r);
				1834
				1835	return r;
				1836	}
				1837
				1838	/*
				1839	* Used by the batcher.
				1840	*/
				1841	static blk_status_t commit_op(void *context)
				1842	{
				1843	struct cache *cache = context;
				1844
				1845	if (dm_cache_changed_this_transaction(cache->cmd))
				1846	return errno_to_blk_status(commit(cache, false));
				1847
				1848	return 0;
				1849	}
				1850
				1851	/*----------------------------------------------------------------*/
				1852
				1853	static bool process_flush_bio(struct cache cache, struct bio bio)
				1854	{
				1855	struct per_bio_data *pb = get_per_bio_data(bio);
				1856
				1857	if (!pb->req_nr)
				1858	remap_to_origin(cache, bio);
				1859	else
				1860	remap_to_cache(cache, bio, 0);
				1861
				1862	issue_after_commit(&cache->committer, bio);
				1863	return true;
				1864	}
				1865
				1866	static bool process_discard_bio(struct cache cache, struct bio bio)
				1867	{
				1868	dm_dblock_t b, e;
				1869
				1870	// FIXME: do we need to lock the region? Or can we just assume the
				1871	// user wont be so foolish as to issue discard concurrently with
				1872	// other IO?
				1873	calc_discard_block_range(cache, bio, &b, &e);
				1874	while (b != e) {
				1875	set_discard(cache, b);
				1876	b = to_dblock(from_dblock(b) + 1);
				1877	}
				1878
				1879	if (cache->features.discard_passdown) {
				1880	remap_to_origin(cache, bio);
				1881	generic_make_request(bio);
				1882	} else
				1883	bio_endio(bio);
				1884
				1885	return false;
				1886	}
				1887
				1888	static void process_deferred_bios(struct work_struct *ws)
				1889	{
				1890	struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
				1891
				1892	unsigned long flags;
				1893	bool commit_needed = false;
				1894	struct bio_list bios;
				1895	struct bio *bio;
				1896
				1897	bio_list_init(&bios);
				1898
				1899	spin_lock_irqsave(&cache->lock, flags);
				1900	bio_list_merge(&bios, &cache->deferred_bios);
				1901	bio_list_init(&cache->deferred_bios);
				1902	spin_unlock_irqrestore(&cache->lock, flags);
				1903
				1904	while ((bio = bio_list_pop(&bios))) {
				1905	if (bio->bi_opf & REQ_PREFLUSH)
				1906	commit_needed = process_flush_bio(cache, bio) \|\| commit_needed;
				1907
				1908	else if (bio_op(bio) == REQ_OP_DISCARD)
				1909	commit_needed = process_discard_bio(cache, bio) \|\| commit_needed;
				1910
				1911	else
				1912	commit_needed = process_bio(cache, bio) \|\| commit_needed;
				1913	cond_resched();
				1914	}
				1915
				1916	if (commit_needed)
				1917	schedule_commit(&cache->committer);
				1918	}
				1919
				1920	/*----------------------------------------------------------------
				1921	* Main worker loop
				1922	 *--------------------------------------------------------------*/
				1923
				1924	static void requeue_deferred_bios(struct cache *cache)
				1925	{
				1926	struct bio *bio;
				1927	struct bio_list bios;
				1928
				1929	bio_list_init(&bios);
				1930	bio_list_merge(&bios, &cache->deferred_bios);
				1931	bio_list_init(&cache->deferred_bios);
				1932
				1933	while ((bio = bio_list_pop(&bios))) {
				1934	bio->bi_status = BLK_STS_DM_REQUEUE;
				1935	bio_endio(bio);
				1936	cond_resched();
				1937	}
				1938	}
				1939
				1940	/*
				1941	* We want to commit periodically so that not too much
				1942	* unwritten metadata builds up.
				1943	*/
				1944	static void do_waker(struct work_struct *ws)
				1945	{
				1946	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
				1947
				1948	policy_tick(cache->policy, true);
				1949	wake_migration_worker(cache);
				1950	schedule_commit(&cache->committer);
				1951	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
				1952	}
				1953
				1954	static void check_migrations(struct work_struct *ws)
				1955	{
				1956	int r;
				1957	struct policy_work *op;
				1958	struct cache *cache = container_of(ws, struct cache, migration_worker);
				1959	enum busy b;
				1960
				1961	for (;;) {
				1962	b = spare_migration_bandwidth(cache);
				1963
				1964	r = policy_get_background_work(cache->policy, b == IDLE, &op);
				1965	if (r == -ENODATA)
				1966	break;
				1967
				1968	if (r) {
				1969	DMERR_LIMIT("%s: policy_background_work failed",
				1970	cache_device_name(cache));
				1971	break;
				1972	}
				1973
				1974	r = mg_start(cache, op, NULL);
				1975	if (r)
				1976	break;
				1977
				1978	cond_resched();
				1979	}
				1980	}
				1981
				1982	/*----------------------------------------------------------------
				1983	* Target methods
				1984	 *--------------------------------------------------------------*/
				1985
				1986	/*
				1987	* This function gets called on the error paths of the constructor, so we
				1988	* have to cope with a partially initialised struct.
				1989	*/
				1990	static void destroy(struct cache *cache)
				1991	{
				1992	unsigned i;
				1993
				1994	mempool_exit(&cache->migration_pool);
				1995
				1996	if (cache->prison)
				1997	dm_bio_prison_destroy_v2(cache->prison);
				1998
				1999	cancel_delayed_work_sync(&cache->waker);
				2000	if (cache->wq)
				2001	destroy_workqueue(cache->wq);
				2002
				2003	if (cache->dirty_bitset)
				2004	free_bitset(cache->dirty_bitset);
				2005
				2006	if (cache->discard_bitset)
				2007	free_bitset(cache->discard_bitset);
				2008
				2009	if (cache->copier)
				2010	dm_kcopyd_client_destroy(cache->copier);
				2011
				2012	if (cache->cmd)
				2013	dm_cache_metadata_close(cache->cmd);
				2014
				2015	if (cache->metadata_dev)
				2016	dm_put_device(cache->ti, cache->metadata_dev);
				2017
				2018	if (cache->origin_dev)
				2019	dm_put_device(cache->ti, cache->origin_dev);
				2020
				2021	if (cache->cache_dev)
				2022	dm_put_device(cache->ti, cache->cache_dev);
				2023
				2024	if (cache->policy)
				2025	dm_cache_policy_destroy(cache->policy);
				2026
				2027	for (i = 0; i < cache->nr_ctr_args ; i++)
				2028	kfree(cache->ctr_args[i]);
				2029	kfree(cache->ctr_args);
				2030
				2031	bioset_exit(&cache->bs);
				2032
				2033	kfree(cache);
				2034	}
				2035
				2036	static void cache_dtr(struct dm_target *ti)
				2037	{
				2038	struct cache *cache = ti->private;
				2039
				2040	destroy(cache);
				2041	}
				2042
				2043	static sector_t get_dev_size(struct dm_dev *dev)
				2044	{
				2045	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
				2046	}
				2047
/*----------------------------------------------------------------*/
				2049
				2050	/*
				2051	* Construct a cache device mapping.
				2052	*
				2053	* cache <metadata dev> <cache dev> <origin dev> <block size>
				2054	* <#feature args> [<feature arg>]*
				2055	* <policy> <#policy args> [<policy arg>]*
				2056	*
				2057	* metadata dev : fast device holding the persistent metadata
				2058	* cache dev : fast device holding cached data blocks
				2059	* origin dev : slow device holding original data blocks
				2060	* block size : cache unit size in sectors
				2061	*
				2062	* #feature args : number of feature arguments passed
				2063	* feature args : writethrough. (The default is writeback.)
				2064	*
				2065	* policy : the replacement policy to use
				2066	* #policy args : an even number of policy arguments corresponding
				2067	* to key/value pairs passed to the policy
				2068	* policy args : key/value pairs passed to the policy
				2069	* E.g. 'sequential_threshold 1024'
				2070	* See cache-policies.txt for details.
				2071	*
				2072	* Optional feature arguments are:
				2073	* writethrough : write through caching that prohibits cache block
				2074	* content from being different from origin block content.
				2075	* Without this argument, the default behaviour is to write
				2076	* back cache block contents later for performance reasons,
				2077	* so they may differ from the corresponding origin blocks.
				2078	*/
				2079	struct cache_args {
				2080	struct dm_target *ti;
				2081
				2082	struct dm_dev *metadata_dev;
				2083
				2084	struct dm_dev *cache_dev;
				2085	sector_t cache_sectors;
				2086
				2087	struct dm_dev *origin_dev;
				2088
				2089	uint32_t block_size;
				2090
				2091	const char *policy_name;
				2092	int policy_argc;
				2093	const char **policy_argv;
				2094
				2095	struct cache_features features;
				2096	};
				2097
				2098	static void destroy_cache_args(struct cache_args *ca)
				2099	{
				2100	if (ca->metadata_dev)
				2101	dm_put_device(ca->ti, ca->metadata_dev);
				2102
				2103	if (ca->cache_dev)
				2104	dm_put_device(ca->ti, ca->cache_dev);
				2105
				2106	if (ca->origin_dev)
				2107	dm_put_device(ca->ti, ca->origin_dev);
				2108
				2109	kfree(ca);
				2110	}
				2111
				2112	static bool at_least_one_arg(struct dm_arg_set as, char *error)
				2113	{
				2114	if (!as->argc) {
				2115	*error = "Insufficient args";
				2116	return false;
				2117	}
				2118
				2119	return true;
				2120	}
				2121
				2122	static int parse_metadata_dev(struct cache_args ca, struct dm_arg_set as,
				2123	char **error)
				2124	{
				2125	int r;
				2126	sector_t metadata_dev_size;
				2127	char b[BDEVNAME_SIZE];
				2128
				2129	if (!at_least_one_arg(as, error))
				2130	return -EINVAL;
				2131
				2132	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				2133	&ca->metadata_dev);
				2134	if (r) {
				2135	*error = "Error opening metadata device";
				2136	return r;
				2137	}
				2138
				2139	metadata_dev_size = get_dev_size(ca->metadata_dev);
				2140	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
				2141	DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
				2142	bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
				2143
				2144	return 0;
				2145	}
				2146
				2147	static int parse_cache_dev(struct cache_args ca, struct dm_arg_set as,
				2148	char **error)
				2149	{
				2150	int r;
				2151
				2152	if (!at_least_one_arg(as, error))
				2153	return -EINVAL;
				2154
				2155	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				2156	&ca->cache_dev);
				2157	if (r) {
				2158	*error = "Error opening cache device";
				2159	return r;
				2160	}
				2161	ca->cache_sectors = get_dev_size(ca->cache_dev);
				2162
				2163	return 0;
				2164	}
				2165
				2166	static int parse_origin_dev(struct cache_args ca, struct dm_arg_set as,
				2167	char **error)
				2168	{
				2169	sector_t origin_sectors;
				2170	int r;
				2171
				2172	if (!at_least_one_arg(as, error))
				2173	return -EINVAL;
				2174
				2175	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				2176	&ca->origin_dev);
				2177	if (r) {
				2178	*error = "Error opening origin device";
				2179	return r;
				2180	}
				2181
				2182	origin_sectors = get_dev_size(ca->origin_dev);
				2183	if (ca->ti->len > origin_sectors) {
				2184	*error = "Device size larger than cached device";
				2185	return -EINVAL;
				2186	}
				2187
				2188	return 0;
				2189	}
				2190
				2191	static int parse_block_size(struct cache_args ca, struct dm_arg_set as,
				2192	char **error)
				2193	{
				2194	unsigned long block_size;
				2195
				2196	if (!at_least_one_arg(as, error))
				2197	return -EINVAL;
				2198
				2199	if (kstrtoul(dm_shift_arg(as), 10, &block_size) \|\| !block_size \|\|
				2200	block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
				2201	block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS \|\|
				2202	block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
				2203	*error = "Invalid data block size";
				2204	return -EINVAL;
				2205	}
				2206
				2207	if (block_size > ca->cache_sectors) {
				2208	*error = "Data block size is larger than the cache device";
				2209	return -EINVAL;
				2210	}
				2211
				2212	ca->block_size = block_size;
				2213
				2214	return 0;
				2215	}
				2216
				2217	static void init_features(struct cache_features *cf)
				2218	{
				2219	cf->mode = CM_WRITE;
				2220	cf->io_mode = CM_IO_WRITEBACK;
				2221	cf->metadata_version = 1;
				2222	cf->discard_passdown = true;
				2223	}
				2224
				2225	static int parse_features(struct cache_args ca, struct dm_arg_set as,
				2226	char **error)
				2227	{
				2228	static const struct dm_arg _args[] = {
				2229	{0, 3, "Invalid number of cache feature arguments"},
				2230	};
				2231
				2232	int r, mode_ctr = 0;
				2233	unsigned argc;
				2234	const char *arg;
				2235	struct cache_features *cf = &ca->features;
				2236
				2237	init_features(cf);
				2238
				2239	r = dm_read_arg_group(_args, as, &argc, error);
				2240	if (r)
				2241	return -EINVAL;
				2242
				2243	while (argc--) {
				2244	arg = dm_shift_arg(as);
				2245
				2246	if (!strcasecmp(arg, "writeback")) {
				2247	cf->io_mode = CM_IO_WRITEBACK;
				2248	mode_ctr++;
				2249	}
				2250
				2251	else if (!strcasecmp(arg, "writethrough")) {
				2252	cf->io_mode = CM_IO_WRITETHROUGH;
				2253	mode_ctr++;
				2254	}
				2255
				2256	else if (!strcasecmp(arg, "passthrough")) {
				2257	cf->io_mode = CM_IO_PASSTHROUGH;
				2258	mode_ctr++;
				2259	}
				2260
				2261	else if (!strcasecmp(arg, "metadata2"))
				2262	cf->metadata_version = 2;
				2263
				2264	else if (!strcasecmp(arg, "no_discard_passdown"))
				2265	cf->discard_passdown = false;
				2266
				2267	else {
				2268	*error = "Unrecognised cache feature requested";
				2269	return -EINVAL;
				2270	}
				2271	}
				2272
				2273	if (mode_ctr > 1) {
				2274	*error = "Duplicate cache io_mode features requested";
				2275	return -EINVAL;
				2276	}
				2277
				2278	return 0;
				2279	}
				2280
				2281	static int parse_policy(struct cache_args ca, struct dm_arg_set as,
				2282	char **error)
				2283	{
				2284	static const struct dm_arg _args[] = {
				2285	{0, 1024, "Invalid number of policy arguments"},
				2286	};
				2287
				2288	int r;
				2289
				2290	if (!at_least_one_arg(as, error))
				2291	return -EINVAL;
				2292
				2293	ca->policy_name = dm_shift_arg(as);
				2294
				2295	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
				2296	if (r)
				2297	return -EINVAL;
				2298
				2299	ca->policy_argv = (const char **)as->argv;
				2300	dm_consume_args(as, ca->policy_argc);
				2301
				2302	return 0;
				2303	}
				2304
				2305	static int parse_cache_args(struct cache_args ca, int argc, char *argv,
				2306	char **error)
				2307	{
				2308	int r;
				2309	struct dm_arg_set as;
				2310
				2311	as.argc = argc;
				2312	as.argv = argv;
				2313
				2314	r = parse_metadata_dev(ca, &as, error);
				2315	if (r)
				2316	return r;
				2317
				2318	r = parse_cache_dev(ca, &as, error);
				2319	if (r)
				2320	return r;
				2321
				2322	r = parse_origin_dev(ca, &as, error);
				2323	if (r)
				2324	return r;
				2325
				2326	r = parse_block_size(ca, &as, error);
				2327	if (r)
				2328	return r;
				2329
				2330	r = parse_features(ca, &as, error);
				2331	if (r)
				2332	return r;
				2333
				2334	r = parse_policy(ca, &as, error);
				2335	if (r)
				2336	return r;
				2337
				2338	return 0;
				2339	}
				2340
/*----------------------------------------------------------------*/
				2342
				2343	static struct kmem_cache *migration_cache;
				2344
				2345	#define NOT_CORE_OPTION 1
				2346
				2347	static int process_config_option(struct cache cache, const char key, const char *value)
				2348	{
				2349	unsigned long tmp;
				2350
				2351	if (!strcasecmp(key, "migration_threshold")) {
				2352	if (kstrtoul(value, 10, &tmp))
				2353	return -EINVAL;
				2354
				2355	cache->migration_threshold = tmp;
				2356	return 0;
				2357	}
				2358
				2359	return NOT_CORE_OPTION;
				2360	}
				2361
				2362	static int set_config_value(struct cache cache, const char key, const char *value)
				2363	{
				2364	int r = process_config_option(cache, key, value);
				2365
				2366	if (r == NOT_CORE_OPTION)
				2367	r = policy_set_config_value(cache->policy, key, value);
				2368
				2369	if (r)
				2370	DMWARN("bad config value for %s: %s", key, value);
				2371
				2372	return r;
				2373	}
				2374
				2375	static int set_config_values(struct cache cache, int argc, const char *argv)
				2376	{
				2377	int r = 0;
				2378
				2379	if (argc & 1) {
				2380	DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
				2381	return -EINVAL;
				2382	}
				2383
				2384	while (argc) {
				2385	r = set_config_value(cache, argv[0], argv[1]);
				2386	if (r)
				2387	break;
				2388
				2389	argc -= 2;
				2390	argv += 2;
				2391	}
				2392
				2393	return r;
				2394	}
				2395
				2396	static int create_cache_policy(struct cache cache, struct cache_args ca,
				2397	char **error)
				2398	{
				2399	struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
				2400	cache->cache_size,
				2401	cache->origin_sectors,
				2402	cache->sectors_per_block);
				2403	if (IS_ERR(p)) {
				2404	*error = "Error creating cache's policy";
				2405	return PTR_ERR(p);
				2406	}
				2407	cache->policy = p;
				2408	BUG_ON(!cache->policy);
				2409
				2410	return 0;
				2411	}
				2412
				2413	/*
				2414	* We want the discard block size to be at least the size of the cache
				2415	* block size and have no more than 2^14 discard blocks across the origin.
				2416	*/
				2417	#define MAX_DISCARD_BLOCKS (1 << 14)
				2418
				2419	static bool too_many_discard_blocks(sector_t discard_block_size,
				2420	sector_t origin_size)
				2421	{
				2422	(void) sector_div(origin_size, discard_block_size);
				2423
				2424	return origin_size > MAX_DISCARD_BLOCKS;
				2425	}
				2426
				2427	static sector_t calculate_discard_block_size(sector_t cache_block_size,
				2428	sector_t origin_size)
				2429	{
				2430	sector_t discard_block_size = cache_block_size;
				2431
				2432	if (origin_size)
				2433	while (too_many_discard_blocks(discard_block_size, origin_size))
				2434	discard_block_size *= 2;
				2435
				2436	return discard_block_size;
				2437	}
				2438
				2439	static void set_cache_size(struct cache *cache, dm_cblock_t size)
				2440	{
				2441	dm_block_t nr_blocks = from_cblock(size);
				2442
				2443	if (nr_blocks > (1 << 20) && cache->cache_size != size)
				2444	DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
				2445	"All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
				2446	"Please consider increasing the cache block size to reduce the overall cache block count.",
				2447	(unsigned long long) nr_blocks);
				2448
				2449	cache->cache_size = size;
				2450	}
				2451
				2452	static int is_congested(struct dm_dev *dev, int bdi_bits)
				2453	{
				2454	struct request_queue *q = bdev_get_queue(dev->bdev);
				2455	return bdi_congested(q->backing_dev_info, bdi_bits);
				2456	}
				2457
				2458	static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
				2459	{
				2460	struct cache *cache = container_of(cb, struct cache, callbacks);
				2461
				2462	return is_congested(cache->origin_dev, bdi_bits) \|\|
				2463	is_congested(cache->cache_dev, bdi_bits);
				2464	}
				2465
				2466	#define DEFAULT_MIGRATION_THRESHOLD 2048
				2467
				2468	static int cache_create(struct cache_args ca, struct cache *result)
				2469	{
				2470	int r = 0;
				2471	char **error = &ca->ti->error;
				2472	struct cache *cache;
				2473	struct dm_target *ti = ca->ti;
				2474	dm_block_t origin_blocks;
				2475	struct dm_cache_metadata *cmd;
				2476	bool may_format = ca->features.mode == CM_WRITE;
				2477
				2478	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
				2479	if (!cache)
				2480	return -ENOMEM;
				2481
				2482	cache->ti = ca->ti;
				2483	ti->private = cache;
				2484	ti->num_flush_bios = 2;
				2485	ti->flush_supported = true;
				2486
				2487	ti->num_discard_bios = 1;
				2488	ti->discards_supported = true;
				2489
				2490	ti->per_io_data_size = sizeof(struct per_bio_data);
				2491
				2492	cache->features = ca->features;
				2493	if (writethrough_mode(cache)) {
				2494	/* Create bioset for writethrough bios issued to origin */
				2495	r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0);
				2496	if (r)
				2497	goto bad;
				2498	}
				2499
				2500	cache->callbacks.congested_fn = cache_is_congested;
				2501	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
				2502
				2503	cache->metadata_dev = ca->metadata_dev;
				2504	cache->origin_dev = ca->origin_dev;
				2505	cache->cache_dev = ca->cache_dev;
				2506
				2507	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
				2508
				2509	origin_blocks = cache->origin_sectors = ti->len;
				2510	origin_blocks = block_div(origin_blocks, ca->block_size);
				2511	cache->origin_blocks = to_oblock(origin_blocks);
				2512
				2513	cache->sectors_per_block = ca->block_size;
				2514	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
				2515	r = -EINVAL;
				2516	goto bad;
				2517	}
				2518
				2519	if (ca->block_size & (ca->block_size - 1)) {
				2520	dm_block_t cache_size = ca->cache_sectors;
				2521
				2522	cache->sectors_per_block_shift = -1;
				2523	cache_size = block_div(cache_size, ca->block_size);
				2524	set_cache_size(cache, to_cblock(cache_size));
				2525	} else {
				2526	cache->sectors_per_block_shift = __ffs(ca->block_size);
				2527	set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
				2528	}
				2529
				2530	r = create_cache_policy(cache, ca, error);
				2531	if (r)
				2532	goto bad;
				2533
				2534	cache->policy_nr_args = ca->policy_argc;
				2535	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
				2536
				2537	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
				2538	if (r) {
				2539	*error = "Error setting cache policy's config values";
				2540	goto bad;
				2541	}
				2542
				2543	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
				2544	ca->block_size, may_format,
				2545	dm_cache_policy_get_hint_size(cache->policy),
				2546	ca->features.metadata_version);
				2547	if (IS_ERR(cmd)) {
				2548	*error = "Error creating metadata object";
				2549	r = PTR_ERR(cmd);
				2550	goto bad;
				2551	}
				2552	cache->cmd = cmd;
				2553	set_cache_mode(cache, CM_WRITE);
				2554	if (get_cache_mode(cache) != CM_WRITE) {
				2555	*error = "Unable to get write access to metadata, please check/repair metadata.";
				2556	r = -EINVAL;
				2557	goto bad;
				2558	}
				2559
				2560	if (passthrough_mode(cache)) {
				2561	bool all_clean;
				2562
				2563	r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
				2564	if (r) {
				2565	*error = "dm_cache_metadata_all_clean() failed";
				2566	goto bad;
				2567	}
				2568
				2569	if (!all_clean) {
				2570	*error = "Cannot enter passthrough mode unless all blocks are clean";
				2571	r = -EINVAL;
				2572	goto bad;
				2573	}
				2574
				2575	policy_allow_migrations(cache->policy, false);
				2576	}
				2577
				2578	spin_lock_init(&cache->lock);
				2579	bio_list_init(&cache->deferred_bios);
				2580	atomic_set(&cache->nr_allocated_migrations, 0);
				2581	atomic_set(&cache->nr_io_migrations, 0);
				2582	init_waitqueue_head(&cache->migration_wait);
				2583
				2584	r = -ENOMEM;
				2585	atomic_set(&cache->nr_dirty, 0);
				2586	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
				2587	if (!cache->dirty_bitset) {
				2588	*error = "could not allocate dirty bitset";
				2589	goto bad;
				2590	}
				2591	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
				2592
				2593	cache->discard_block_size =
				2594	calculate_discard_block_size(cache->sectors_per_block,
				2595	cache->origin_sectors);
				2596	cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
				2597	cache->discard_block_size));
				2598	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
				2599	if (!cache->discard_bitset) {
				2600	*error = "could not allocate discard bitset";
				2601	goto bad;
				2602	}
				2603	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
				2604
				2605	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
				2606	if (IS_ERR(cache->copier)) {
				2607	*error = "could not create kcopyd client";
				2608	r = PTR_ERR(cache->copier);
				2609	goto bad;
				2610	}
				2611
				2612	cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
				2613	if (!cache->wq) {
				2614	*error = "could not create workqueue for metadata object";
				2615	goto bad;
				2616	}
				2617	INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
				2618	INIT_WORK(&cache->migration_worker, check_migrations);
				2619	INIT_DELAYED_WORK(&cache->waker, do_waker);
				2620
				2621	cache->prison = dm_bio_prison_create_v2(cache->wq);
				2622	if (!cache->prison) {
				2623	*error = "could not create bio prison";
				2624	goto bad;
				2625	}
				2626
				2627	r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
				2628	migration_cache);
				2629	if (r) {
				2630	*error = "Error creating cache's migration mempool";
				2631	goto bad;
				2632	}
				2633
				2634	cache->need_tick_bio = true;
				2635	cache->sized = false;
				2636	cache->invalidate = false;
				2637	cache->commit_requested = false;
				2638	cache->loaded_mappings = false;
				2639	cache->loaded_discards = false;
				2640
				2641	load_stats(cache);
				2642
				2643	atomic_set(&cache->stats.demotion, 0);
				2644	atomic_set(&cache->stats.promotion, 0);
				2645	atomic_set(&cache->stats.copies_avoided, 0);
				2646	atomic_set(&cache->stats.cache_cell_clash, 0);
				2647	atomic_set(&cache->stats.commit_count, 0);
				2648	atomic_set(&cache->stats.discard_count, 0);
				2649
				2650	spin_lock_init(&cache->invalidation_lock);
				2651	INIT_LIST_HEAD(&cache->invalidation_requests);
				2652
				2653	batcher_init(&cache->committer, commit_op, cache,
				2654	issue_op, cache, cache->wq);
				2655	iot_init(&cache->tracker);
				2656
				2657	init_rwsem(&cache->background_work_lock);
				2658	prevent_background_work(cache);
				2659
				2660	*result = cache;
				2661	return 0;
				2662	bad:
				2663	destroy(cache);
				2664	return r;
				2665	}
				2666
				2667	static int copy_ctr_args(struct cache cache, int argc, const char *argv)
				2668	{
				2669	unsigned i;
				2670	const char **copy;
				2671
				2672	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
				2673	if (!copy)
				2674	return -ENOMEM;
				2675	for (i = 0; i < argc; i++) {
				2676	copy[i] = kstrdup(argv[i], GFP_KERNEL);
				2677	if (!copy[i]) {
				2678	while (i--)
				2679	kfree(copy[i]);
				2680	kfree(copy);
				2681	return -ENOMEM;
				2682	}
				2683	}
				2684
				2685	cache->nr_ctr_args = argc;
				2686	cache->ctr_args = copy;
				2687
				2688	return 0;
				2689	}
				2690
				2691	static int cache_ctr(struct dm_target ti, unsigned argc, char *argv)
				2692	{
				2693	int r = -EINVAL;
				2694	struct cache_args *ca;
				2695	struct cache *cache = NULL;
				2696
				2697	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
				2698	if (!ca) {
				2699	ti->error = "Error allocating memory for cache";
				2700	return -ENOMEM;
				2701	}
				2702	ca->ti = ti;
				2703
				2704	r = parse_cache_args(ca, argc, argv, &ti->error);
				2705	if (r)
				2706	goto out;
				2707
				2708	r = cache_create(ca, &cache);
				2709	if (r)
				2710	goto out;
				2711
				2712	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
				2713	if (r) {
				2714	destroy(cache);
				2715	goto out;
				2716	}
				2717
				2718	ti->private = cache;
				2719	out:
				2720	destroy_cache_args(ca);
				2721	return r;
				2722	}
				2723
/*----------------------------------------------------------------*/
				2725
				2726	static int cache_map(struct dm_target ti, struct bio bio)
				2727	{
				2728	struct cache *cache = ti->private;
				2729
				2730	int r;
				2731	bool commit_needed;
				2732	dm_oblock_t block = get_bio_block(cache, bio);
				2733
				2734	init_per_bio_data(bio);
				2735	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
				2736	/*
				2737	* This can only occur if the io goes to a partial block at
				2738	* the end of the origin device. We don't cache these.
				2739	* Just remap to the origin and carry on.
				2740	*/
				2741	remap_to_origin(cache, bio);
				2742	accounted_begin(cache, bio);
				2743	return DM_MAPIO_REMAPPED;
				2744	}
				2745
				2746	if (discard_or_flush(bio)) {
				2747	defer_bio(cache, bio);
				2748	return DM_MAPIO_SUBMITTED;
				2749	}
				2750
				2751	r = map_bio(cache, bio, block, &commit_needed);
				2752	if (commit_needed)
				2753	schedule_commit(&cache->committer);
				2754
				2755	return r;
				2756	}
				2757
				2758	static int cache_end_io(struct dm_target ti, struct bio bio, blk_status_t *error)
				2759	{
				2760	struct cache *cache = ti->private;
				2761	unsigned long flags;
				2762	struct per_bio_data *pb = get_per_bio_data(bio);
				2763
				2764	if (pb->tick) {
				2765	policy_tick(cache->policy, false);
				2766
				2767	spin_lock_irqsave(&cache->lock, flags);
				2768	cache->need_tick_bio = true;
				2769	spin_unlock_irqrestore(&cache->lock, flags);
				2770	}
				2771
				2772	bio_drop_shared_lock(cache, bio);
				2773	accounted_complete(cache, bio);
				2774
				2775	return DM_ENDIO_DONE;
				2776	}
				2777
				2778	static int write_dirty_bitset(struct cache *cache)
				2779	{
				2780	int r;
				2781
				2782	if (get_cache_mode(cache) >= CM_READ_ONLY)
				2783	return -EINVAL;
				2784
				2785	r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
				2786	if (r)
				2787	metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
				2788
				2789	return r;
				2790	}
				2791
				2792	static int write_discard_bitset(struct cache *cache)
				2793	{
				2794	unsigned i, r;
				2795
				2796	if (get_cache_mode(cache) >= CM_READ_ONLY)
				2797	return -EINVAL;
				2798
				2799	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
				2800	cache->discard_nr_blocks);
				2801	if (r) {
				2802	DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
				2803	metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
				2804	return r;
				2805	}
				2806
				2807	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
				2808	r = dm_cache_set_discard(cache->cmd, to_dblock(i),
				2809	is_discarded(cache, to_dblock(i)));
				2810	if (r) {
				2811	metadata_operation_failed(cache, "dm_cache_set_discard", r);
				2812	return r;
				2813	}
				2814	}
				2815
				2816	return 0;
				2817	}
				2818
				2819	static int write_hints(struct cache *cache)
				2820	{
				2821	int r;
				2822
				2823	if (get_cache_mode(cache) >= CM_READ_ONLY)
				2824	return -EINVAL;
				2825
				2826	r = dm_cache_write_hints(cache->cmd, cache->policy);
				2827	if (r) {
				2828	metadata_operation_failed(cache, "dm_cache_write_hints", r);
				2829	return r;
				2830	}
				2831
				2832	return 0;
				2833	}
				2834
				2835	/*
				2836	* returns true on success
				2837	*/
				2838	static bool sync_metadata(struct cache *cache)
				2839	{
				2840	int r1, r2, r3, r4;
				2841
				2842	r1 = write_dirty_bitset(cache);
				2843	if (r1)
				2844	DMERR("%s: could not write dirty bitset", cache_device_name(cache));
				2845
				2846	r2 = write_discard_bitset(cache);
				2847	if (r2)
				2848	DMERR("%s: could not write discard bitset", cache_device_name(cache));
				2849
				2850	save_stats(cache);
				2851
				2852	r3 = write_hints(cache);
				2853	if (r3)
				2854	DMERR("%s: could not write hints", cache_device_name(cache));
				2855
				2856	/*
				2857	* If writing the above metadata failed, we still commit, but don't
				2858	* set the clean shutdown flag. This will effectively force every
				2859	* dirty bit to be set on reload.
				2860	*/
				2861	r4 = commit(cache, !r1 && !r2 && !r3);
				2862	if (r4)
				2863	DMERR("%s: could not write cache metadata", cache_device_name(cache));
				2864
				2865	return !r1 && !r2 && !r3 && !r4;
				2866	}
				2867
				2868	static void cache_postsuspend(struct dm_target *ti)
				2869	{
				2870	struct cache *cache = ti->private;
				2871
				2872	prevent_background_work(cache);
				2873	BUG_ON(atomic_read(&cache->nr_io_migrations));
				2874
				2875	cancel_delayed_work_sync(&cache->waker);
				2876	drain_workqueue(cache->wq);
				2877	WARN_ON(cache->tracker.in_flight);
				2878
				2879	/*
				2880	* If it's a flush suspend there won't be any deferred bios, so this
				2881	* call is harmless.
				2882	*/
				2883	requeue_deferred_bios(cache);
				2884
				2885	if (get_cache_mode(cache) == CM_WRITE)
				2886	(void) sync_metadata(cache);
				2887	}
				2888
				2889	static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
				2890	bool dirty, uint32_t hint, bool hint_valid)
				2891	{
				2892	int r;
				2893	struct cache *cache = context;
				2894
				2895	if (dirty) {
				2896	set_bit(from_cblock(cblock), cache->dirty_bitset);
				2897	atomic_inc(&cache->nr_dirty);
				2898	} else
				2899	clear_bit(from_cblock(cblock), cache->dirty_bitset);
				2900
				2901	r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
				2902	if (r)
				2903	return r;
				2904
				2905	return 0;
				2906	}
				2907
				2908	/*
				2909	* The discard block size in the on disk metadata is not
				2910	* neccessarily the same as we're currently using. So we have to
				2911	* be careful to only set the discarded attribute if we know it
				2912	* covers a complete block of the new size.
				2913	*/
				2914	struct discard_load_info {
				2915	struct cache *cache;
				2916
				2917	/*
				2918	* These blocks are sized using the on disk dblock size, rather
				2919	* than the current one.
				2920	*/
				2921	dm_block_t block_size;
				2922	dm_block_t discard_begin, discard_end;
				2923	};
				2924
				2925	static void discard_load_info_init(struct cache *cache,
				2926	struct discard_load_info *li)
				2927	{
				2928	li->cache = cache;
				2929	li->discard_begin = li->discard_end = 0;
				2930	}
				2931
				2932	static void set_discard_range(struct discard_load_info *li)
				2933	{
				2934	sector_t b, e;
				2935
				2936	if (li->discard_begin == li->discard_end)
				2937	return;
				2938
				2939	/*
				2940	* Convert to sectors.
				2941	*/
				2942	b = li->discard_begin * li->block_size;
				2943	e = li->discard_end * li->block_size;
				2944
				2945	/*
				2946	* Then convert back to the current dblock size.
				2947	*/
				2948	b = dm_sector_div_up(b, li->cache->discard_block_size);
				2949	sector_div(e, li->cache->discard_block_size);
				2950
				2951	/*
				2952	* The origin may have shrunk, so we need to check we're still in
				2953	* bounds.
				2954	*/
				2955	if (e > from_dblock(li->cache->discard_nr_blocks))
				2956	e = from_dblock(li->cache->discard_nr_blocks);
				2957
				2958	for (; b < e; b++)
				2959	set_discard(li->cache, to_dblock(b));
				2960	}
				2961
				2962	static int load_discard(void *context, sector_t discard_block_size,
				2963	dm_dblock_t dblock, bool discard)
				2964	{
				2965	struct discard_load_info *li = context;
				2966
				2967	li->block_size = discard_block_size;
				2968
				2969	if (discard) {
				2970	if (from_dblock(dblock) == li->discard_end)
				2971	/*
				2972	* We're already in a discard range, just extend it.
				2973	*/
				2974	li->discard_end = li->discard_end + 1ULL;
				2975
				2976	else {
				2977	/*
				2978	* Emit the old range and start a new one.
				2979	*/
				2980	set_discard_range(li);
				2981	li->discard_begin = from_dblock(dblock);
				2982	li->discard_end = li->discard_begin + 1ULL;
				2983	}
				2984	} else {
				2985	set_discard_range(li);
				2986	li->discard_begin = li->discard_end = 0;
				2987	}
				2988
				2989	return 0;
				2990	}
				2991
				2992	static dm_cblock_t get_cache_dev_size(struct cache *cache)
				2993	{
				2994	sector_t size = get_dev_size(cache->cache_dev);
				2995	(void) sector_div(size, cache->sectors_per_block);
				2996	return to_cblock(size);
				2997	}
				2998
				2999	static bool can_resize(struct cache *cache, dm_cblock_t new_size)
				3000	{
				3001	if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
				3002	DMERR("%s: unable to extend cache due to missing cache table reload",
				3003	cache_device_name(cache));
				3004	return false;
				3005	}
				3006
				3007	/*
				3008	* We can't drop a dirty block when shrinking the cache.
				3009	*/
				3010	if (cache->loaded_mappings) {
				3011	new_size = to_cblock(find_next_bit(cache->dirty_bitset,
				3012	from_cblock(cache->cache_size),
				3013	from_cblock(new_size)));
				3014	if (new_size != cache->cache_size) {
				3015	DMERR("%s: unable to shrink cache; cache block %llu is dirty",
				3016	cache_device_name(cache),
				3017	(unsigned long long) from_cblock(new_size));
				3018	return false;
				3019	}
				3020	}
				3021
				3022	return true;
				3023	}
				3024
				3025	static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
				3026	{
				3027	int r;
				3028
				3029	r = dm_cache_resize(cache->cmd, new_size);
				3030	if (r) {
				3031	DMERR("%s: could not resize cache metadata", cache_device_name(cache));
				3032	metadata_operation_failed(cache, "dm_cache_resize", r);
				3033	return r;
				3034	}
				3035
				3036	set_cache_size(cache, new_size);
				3037
				3038	return 0;
				3039	}
				3040
				3041	static int cache_preresume(struct dm_target *ti)
				3042	{
				3043	int r = 0;
				3044	struct cache *cache = ti->private;
				3045	dm_cblock_t csize = get_cache_dev_size(cache);
				3046
				3047	/*
				3048	* Check to see if the cache has resized.
				3049	*/
				3050	if (!cache->sized \|\| csize != cache->cache_size) {
				3051	if (!can_resize(cache, csize))
				3052	return -EINVAL;
				3053
				3054	r = resize_cache_dev(cache, csize);
				3055	if (r)
				3056	return r;
				3057
				3058	cache->sized = true;
				3059	}
				3060
				3061	if (!cache->loaded_mappings) {
				3062	r = dm_cache_load_mappings(cache->cmd, cache->policy,
				3063	load_mapping, cache);
				3064	if (r) {
				3065	DMERR("%s: could not load cache mappings", cache_device_name(cache));
				3066	metadata_operation_failed(cache, "dm_cache_load_mappings", r);
				3067	return r;
				3068	}
				3069
				3070	cache->loaded_mappings = true;
				3071	}
				3072
				3073	if (!cache->loaded_discards) {
				3074	struct discard_load_info li;
				3075
				3076	/*
				3077	* The discard bitset could have been resized, or the
				3078	* discard block size changed. To be safe we start by
				3079	* setting every dblock to not discarded.
				3080	*/
				3081	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
				3082
				3083	discard_load_info_init(cache, &li);
				3084	r = dm_cache_load_discards(cache->cmd, load_discard, &li);
				3085	if (r) {
				3086	DMERR("%s: could not load origin discards", cache_device_name(cache));
				3087	metadata_operation_failed(cache, "dm_cache_load_discards", r);
				3088	return r;
				3089	}
				3090	set_discard_range(&li);
				3091
				3092	cache->loaded_discards = true;
				3093	}
				3094
				3095	return r;
				3096	}
				3097
				3098	static void cache_resume(struct dm_target *ti)
				3099	{
				3100	struct cache *cache = ti->private;
				3101
				3102	cache->need_tick_bio = true;
				3103	allow_background_work(cache);
				3104	do_waker(&cache->waker.work);
				3105	}
				3106
				3107	static void emit_flags(struct cache cache, char result,
				3108	unsigned maxlen, ssize_t *sz_ptr)
				3109	{
				3110	ssize_t sz = *sz_ptr;
				3111	struct cache_features *cf = &cache->features;
				3112	unsigned count = (cf->metadata_version == 2) + !cf->discard_passdown + 1;
				3113
				3114	DMEMIT("%u ", count);
				3115
				3116	if (cf->metadata_version == 2)
				3117	DMEMIT("metadata2 ");
				3118
				3119	if (writethrough_mode(cache))
				3120	DMEMIT("writethrough ");
				3121
				3122	else if (passthrough_mode(cache))
				3123	DMEMIT("passthrough ");
				3124
				3125	else if (writeback_mode(cache))
				3126	DMEMIT("writeback ");
				3127
				3128	else {
				3129	DMEMIT("unknown ");
				3130	DMERR("%s: internal error: unknown io mode: %d",
				3131	cache_device_name(cache), (int) cf->io_mode);
				3132	}
				3133
				3134	if (!cf->discard_passdown)
				3135	DMEMIT("no_discard_passdown ");
				3136
				3137	*sz_ptr = sz;
				3138	}
				3139
				3140	/*
				3141	* Status format:
				3142	*
				3143	* <metadata block size> <#used metadata blocks>/<#total metadata blocks>
				3144	* <cache block size> <#used cache blocks>/<#total cache blocks>
				3145	* <#read hits> <#read misses> <#write hits> <#write misses>
				3146	* <#demotions> <#promotions> <#dirty>
				3147	* <#features> <features>*
				3148	* <#core args> <core args>
				3149	* <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
				3150	*/
				3151	static void cache_status(struct dm_target *ti, status_type_t type,
				3152	unsigned status_flags, char *result, unsigned maxlen)
				3153	{
				3154	int r = 0;
				3155	unsigned i;
				3156	ssize_t sz = 0;
				3157	dm_block_t nr_free_blocks_metadata = 0;
				3158	dm_block_t nr_blocks_metadata = 0;
				3159	char buf[BDEVNAME_SIZE];
				3160	struct cache *cache = ti->private;
				3161	dm_cblock_t residency;
				3162	bool needs_check;
				3163
				3164	switch (type) {
				3165	case STATUSTYPE_INFO:
				3166	if (get_cache_mode(cache) == CM_FAIL) {
				3167	DMEMIT("Fail");
				3168	break;
				3169	}
				3170
				3171	/* Commit to ensure statistics aren't out-of-date */
				3172	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
				3173	(void) commit(cache, false);
				3174
				3175	r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
				3176	if (r) {
				3177	DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
				3178	cache_device_name(cache), r);
				3179	goto err;
				3180	}
				3181
				3182	r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
				3183	if (r) {
				3184	DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
				3185	cache_device_name(cache), r);
				3186	goto err;
				3187	}
				3188
				3189	residency = policy_residency(cache->policy);
				3190
				3191	DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
				3192	(unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
				3193	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
				3194	(unsigned long long)nr_blocks_metadata,
				3195	(unsigned long long)cache->sectors_per_block,
				3196	(unsigned long long) from_cblock(residency),
				3197	(unsigned long long) from_cblock(cache->cache_size),
				3198	(unsigned) atomic_read(&cache->stats.read_hit),
				3199	(unsigned) atomic_read(&cache->stats.read_miss),
				3200	(unsigned) atomic_read(&cache->stats.write_hit),
				3201	(unsigned) atomic_read(&cache->stats.write_miss),
				3202	(unsigned) atomic_read(&cache->stats.demotion),
				3203	(unsigned) atomic_read(&cache->stats.promotion),
				3204	(unsigned long) atomic_read(&cache->nr_dirty));
				3205
				3206	emit_flags(cache, result, maxlen, &sz);
				3207
				3208	DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
				3209
				3210	DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
				3211	if (sz < maxlen) {
				3212	r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
				3213	if (r)
				3214	DMERR("%s: policy_emit_config_values returned %d",
				3215	cache_device_name(cache), r);
				3216	}
				3217
				3218	if (get_cache_mode(cache) == CM_READ_ONLY)
				3219	DMEMIT("ro ");
				3220	else
				3221	DMEMIT("rw ");
				3222
				3223	r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
				3224
				3225	if (r \|\| needs_check)
				3226	DMEMIT("needs_check ");
				3227	else
				3228	DMEMIT("- ");
				3229
				3230	break;
				3231
				3232	case STATUSTYPE_TABLE:
				3233	format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
				3234	DMEMIT("%s ", buf);
				3235	format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
				3236	DMEMIT("%s ", buf);
				3237	format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
				3238	DMEMIT("%s", buf);
				3239
				3240	for (i = 0; i < cache->nr_ctr_args - 1; i++)
				3241	DMEMIT(" %s", cache->ctr_args[i]);
				3242	if (cache->nr_ctr_args)
				3243	DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
				3244	}
				3245
				3246	return;
				3247
				3248	err:
				3249	DMEMIT("Error");
				3250	}
				3251
				3252	/*
				3253	* Defines a range of cblocks, begin to (end - 1) are in the range. end is
				3254	* the one-past-the-end value.
				3255	*/
				3256	struct cblock_range {
				3257	dm_cblock_t begin;
				3258	dm_cblock_t end;
				3259	};
				3260
				3261	/*
				3262	* A cache block range can take two forms:
				3263	*
				3264	* i) A single cblock, eg. '3456'
				3265	* ii) A begin and end cblock with a dash between, eg. 123-234
				3266	*/
				3267	static int parse_cblock_range(struct cache cache, const char str,
				3268	struct cblock_range *result)
				3269	{
				3270	char dummy;
				3271	uint64_t b, e;
				3272	int r;
				3273
				3274	/*
				3275	* Try and parse form (ii) first.
				3276	*/
				3277	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
				3278	if (r < 0)
				3279	return r;
				3280
				3281	if (r == 2) {
				3282	result->begin = to_cblock(b);
				3283	result->end = to_cblock(e);
				3284	return 0;
				3285	}
				3286
				3287	/*
				3288	* That didn't work, try form (i).
				3289	*/
				3290	r = sscanf(str, "%llu%c", &b, &dummy);
				3291	if (r < 0)
				3292	return r;
				3293
				3294	if (r == 1) {
				3295	result->begin = to_cblock(b);
				3296	result->end = to_cblock(from_cblock(result->begin) + 1u);
				3297	return 0;
				3298	}
				3299
				3300	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
				3301	return -EINVAL;
				3302	}
				3303
				3304	static int validate_cblock_range(struct cache cache, struct cblock_range range)
				3305	{
				3306	uint64_t b = from_cblock(range->begin);
				3307	uint64_t e = from_cblock(range->end);
				3308	uint64_t n = from_cblock(cache->cache_size);
				3309
				3310	if (b >= n) {
				3311	DMERR("%s: begin cblock out of range: %llu >= %llu",
				3312	cache_device_name(cache), b, n);
				3313	return -EINVAL;
				3314	}
				3315
				3316	if (e > n) {
				3317	DMERR("%s: end cblock out of range: %llu > %llu",
				3318	cache_device_name(cache), e, n);
				3319	return -EINVAL;
				3320	}
				3321
				3322	if (b >= e) {
				3323	DMERR("%s: invalid cblock range: %llu >= %llu",
				3324	cache_device_name(cache), b, e);
				3325	return -EINVAL;
				3326	}
				3327
				3328	return 0;
				3329	}
				3330
				3331	static inline dm_cblock_t cblock_succ(dm_cblock_t b)
				3332	{
				3333	return to_cblock(from_cblock(b) + 1);
				3334	}
				3335
				3336	static int request_invalidation(struct cache cache, struct cblock_range range)
				3337	{
				3338	int r = 0;
				3339
				3340	/*
				3341	* We don't need to do any locking here because we know we're in
				3342	* passthrough mode. There's is potential for a race between an
				3343	* invalidation triggered by an io and an invalidation message. This
				3344	* is harmless, we must not worry if the policy call fails.
				3345	*/
				3346	while (range->begin != range->end) {
				3347	r = invalidate_cblock(cache, range->begin);
				3348	if (r)
				3349	return r;
				3350
				3351	range->begin = cblock_succ(range->begin);
				3352	}
				3353
				3354	cache->commit_requested = true;
				3355	return r;
				3356	}
				3357
				3358	static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
				3359	const char **cblock_ranges)
				3360	{
				3361	int r = 0;
				3362	unsigned i;
				3363	struct cblock_range range;
				3364
				3365	if (!passthrough_mode(cache)) {
				3366	DMERR("%s: cache has to be in passthrough mode for invalidation",
				3367	cache_device_name(cache));
				3368	return -EPERM;
				3369	}
				3370
				3371	for (i = 0; i < count; i++) {
				3372	r = parse_cblock_range(cache, cblock_ranges[i], &range);
				3373	if (r)
				3374	break;
				3375
				3376	r = validate_cblock_range(cache, &range);
				3377	if (r)
				3378	break;
				3379
				3380	/*
				3381	* Pass begin and end origin blocks to the worker and wake it.
				3382	*/
				3383	r = request_invalidation(cache, &range);
				3384	if (r)
				3385	break;
				3386	}
				3387
				3388	return r;
				3389	}
				3390
				3391	/*
				3392	* Supports
				3393	* "<key> <value>"
				3394	* and
				3395	* "invalidate_cblocks [(<begin>)\|(<begin>-<end>)]*
				3396	*
				3397	* The key migration_threshold is supported by the cache target core.
				3398	*/
				3399	static int cache_message(struct dm_target ti, unsigned argc, char *argv,
				3400	char *result, unsigned maxlen)
				3401	{
				3402	struct cache *cache = ti->private;
				3403
				3404	if (!argc)
				3405	return -EINVAL;
				3406
				3407	if (get_cache_mode(cache) >= CM_READ_ONLY) {
				3408	DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
				3409	cache_device_name(cache));
				3410	return -EOPNOTSUPP;
				3411	}
				3412
				3413	if (!strcasecmp(argv[0], "invalidate_cblocks"))
				3414	return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
				3415
				3416	if (argc != 2)
				3417	return -EINVAL;
				3418
				3419	return set_config_value(cache, argv[0], argv[1]);
				3420	}
				3421
				3422	static int cache_iterate_devices(struct dm_target *ti,
				3423	iterate_devices_callout_fn fn, void *data)
				3424	{
				3425	int r = 0;
				3426	struct cache *cache = ti->private;
				3427
				3428	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
				3429	if (!r)
				3430	r = fn(ti, cache->origin_dev, 0, ti->len, data);
				3431
				3432	return r;
				3433	}
				3434
				3435	static bool origin_dev_supports_discard(struct block_device *origin_bdev)
				3436	{
				3437	struct request_queue *q = bdev_get_queue(origin_bdev);
				3438
				3439	return q && blk_queue_discard(q);
				3440	}
				3441
				3442	/*
				3443	* If discard_passdown was enabled verify that the origin device
				3444	* supports discards. Disable discard_passdown if not.
				3445	*/
				3446	static void disable_passdown_if_not_supported(struct cache *cache)
				3447	{
				3448	struct block_device *origin_bdev = cache->origin_dev->bdev;
				3449	struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
				3450	const char *reason = NULL;
				3451	char buf[BDEVNAME_SIZE];
				3452
				3453	if (!cache->features.discard_passdown)
				3454	return;
				3455
				3456	if (!origin_dev_supports_discard(origin_bdev))
				3457	reason = "discard unsupported";
				3458
				3459	else if (origin_limits->max_discard_sectors < cache->sectors_per_block)
				3460	reason = "max discard sectors smaller than a block";
				3461
				3462	if (reason) {
				3463	DMWARN("Origin device (%s) %s: Disabling discard passdown.",
				3464	bdevname(origin_bdev, buf), reason);
				3465	cache->features.discard_passdown = false;
				3466	}
				3467	}
				3468
				3469	static void set_discard_limits(struct cache cache, struct queue_limits limits)
				3470	{
				3471	struct block_device *origin_bdev = cache->origin_dev->bdev;
				3472	struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
				3473
				3474	if (!cache->features.discard_passdown) {
				3475	/* No passdown is done so setting own virtual limits */
				3476	limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
				3477	cache->origin_sectors);
				3478	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
				3479	return;
				3480	}
				3481
				3482	/*
				3483	* cache_iterate_devices() is stacking both origin and fast device limits
				3484	* but discards aren't passed to fast device, so inherit origin's limits.
				3485	*/
				3486	limits->max_discard_sectors = origin_limits->max_discard_sectors;
				3487	limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors;
				3488	limits->discard_granularity = origin_limits->discard_granularity;
				3489	limits->discard_alignment = origin_limits->discard_alignment;
				3490	limits->discard_misaligned = origin_limits->discard_misaligned;
				3491	}
				3492
				3493	static void cache_io_hints(struct dm_target ti, struct queue_limits limits)
				3494	{
				3495	struct cache *cache = ti->private;
				3496	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
				3497
				3498	/*
				3499	* If the system-determined stacked limits are compatible with the
				3500	* cache's blocksize (io_opt is a factor) do not override them.
				3501	*/
				3502	if (io_opt_sectors < cache->sectors_per_block \|\|
				3503	do_div(io_opt_sectors, cache->sectors_per_block)) {
				3504	blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
				3505	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
				3506	}
				3507
				3508	disable_passdown_if_not_supported(cache);
				3509	set_discard_limits(cache, limits);
				3510	}
				3511
/*----------------------------------------------------------------*/
				3513
				3514	static struct target_type cache_target = {
				3515	.name = "cache",
				3516	.version = {2, 1, 0},
				3517	.module = THIS_MODULE,
				3518	.ctr = cache_ctr,
				3519	.dtr = cache_dtr,
				3520	.map = cache_map,
				3521	.end_io = cache_end_io,
				3522	.postsuspend = cache_postsuspend,
				3523	.preresume = cache_preresume,
				3524	.resume = cache_resume,
				3525	.status = cache_status,
				3526	.message = cache_message,
				3527	.iterate_devices = cache_iterate_devices,
				3528	.io_hints = cache_io_hints,
				3529	};
				3530
				3531	static int __init dm_cache_init(void)
				3532	{
				3533	int r;
				3534
				3535	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
				3536	if (!migration_cache)
				3537	return -ENOMEM;
				3538
				3539	r = dm_register_target(&cache_target);
				3540	if (r) {
				3541	DMERR("cache target registration failed: %d", r);
				3542	kmem_cache_destroy(migration_cache);
				3543	return r;
				3544	}
				3545
				3546	return 0;
				3547	}
				3548
				3549	static void __exit dm_cache_exit(void)
				3550	{
				3551	dm_unregister_target(&cache_target);
				3552	kmem_cache_destroy(migration_cache);
				3553	}
				3554
				3555	module_init(dm_cache_init);
				3556	module_exit(dm_cache_exit);
				3557
				3558	MODULE_DESCRIPTION(DM_NAME " cache target");
				3559	MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
				3560	MODULE_LICENSE("GPL");