Blame - src/kernel/linux/v4.19/drivers/md/dm-cache-target.c - T800

blob: 84ff70027c2520d4e04da2b4b725d0abadc74b3b [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2012 Red Hat. All rights reserved.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include "dm.h"
				8	#include "dm-bio-prison-v2.h"
				9	#include "dm-bio-record.h"
				10	#include "dm-cache-metadata.h"
				11
				12	#include <linux/dm-io.h>
				13	#include <linux/dm-kcopyd.h>
				14	#include <linux/jiffies.h>
				15	#include <linux/init.h>
				16	#include <linux/mempool.h>
				17	#include <linux/module.h>
				18	#include <linux/rwsem.h>
				19	#include <linux/slab.h>
				20	#include <linux/vmalloc.h>
				21
				22	#define DM_MSG_PREFIX "cache"
				23
				24	DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
				25	"A percentage of time allocated for copying to and/or from cache");
				26
				27	/*----------------------------------------------------------------*/
				28
				29	/*
				30	* Glossary:
				31	*
				32	* oblock: index of an origin block
				33	* cblock: index of a cache block
				34	* promotion: movement of a block from origin to cache
				35	* demotion: movement of a block from cache to origin
				36	* migration: movement of a block between the origin and cache device,
				37	* either direction
				38	*/
				39
				40	/*----------------------------------------------------------------*/
				41
				42	struct io_tracker {
				43	spinlock_t lock;
				44
				45	/*
				46	* Sectors of in-flight IO.
				47	*/
				48	sector_t in_flight;
				49
				50	/*
				51	* The time, in jiffies, when this device became idle (if it is
				52	* indeed idle).
				53	*/
				54	unsigned long idle_time;
				55	unsigned long last_update_time;
				56	};
				57
				58	static void iot_init(struct io_tracker *iot)
				59	{
				60	spin_lock_init(&iot->lock);
				61	iot->in_flight = 0ul;
				62	iot->idle_time = 0ul;
				63	iot->last_update_time = jiffies;
				64	}
				65
				66	static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
				67	{
				68	if (iot->in_flight)
				69	return false;
				70
				71	return time_after(jiffies, iot->idle_time + jifs);
				72	}
				73
				74	static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
				75	{
				76	bool r;
				77	unsigned long flags;
				78
				79	spin_lock_irqsave(&iot->lock, flags);
				80	r = __iot_idle_for(iot, jifs);
				81	spin_unlock_irqrestore(&iot->lock, flags);
				82
				83	return r;
				84	}
				85
				86	static void iot_io_begin(struct io_tracker *iot, sector_t len)
				87	{
				88	unsigned long flags;
				89
				90	spin_lock_irqsave(&iot->lock, flags);
				91	iot->in_flight += len;
				92	spin_unlock_irqrestore(&iot->lock, flags);
				93	}
				94
				95	static void __iot_io_end(struct io_tracker *iot, sector_t len)
				96	{
				97	if (!len)
				98	return;
				99
				100	iot->in_flight -= len;
				101	if (!iot->in_flight)
				102	iot->idle_time = jiffies;
				103	}
				104
				105	static void iot_io_end(struct io_tracker *iot, sector_t len)
				106	{
				107	unsigned long flags;
				108
				109	spin_lock_irqsave(&iot->lock, flags);
				110	__iot_io_end(iot, len);
				111	spin_unlock_irqrestore(&iot->lock, flags);
				112	}
				113
				114	/*----------------------------------------------------------------*/
				115
				116	/*
				117	* Represents a chunk of future work. 'input' allows continuations to pass
				118	* values between themselves, typically error values.
				119	*/
				120	struct continuation {
				121	struct work_struct ws;
				122	blk_status_t input;
				123	};
				124
				125	static inline void init_continuation(struct continuation *k,
				126	void (fn)(struct work_struct ))
				127	{
				128	INIT_WORK(&k->ws, fn);
				129	k->input = 0;
				130	}
				131
				132	static inline void queue_continuation(struct workqueue_struct *wq,
				133	struct continuation *k)
				134	{
				135	queue_work(wq, &k->ws);
				136	}
				137
				138	/*----------------------------------------------------------------*/
				139
				140	/*
				141	* The batcher collects together pieces of work that need a particular
				142	* operation to occur before they can proceed (typically a commit).
				143	*/
				144	struct batcher {
				145	/*
				146	* The operation that everyone is waiting for.
				147	*/
				148	blk_status_t (commit_op)(void context);
				149	void *commit_context;
				150
				151	/*
				152	* This is how bios should be issued once the commit op is complete
				153	* (accounted_request).
				154	*/
				155	void (issue_op)(struct bio bio, void *context);
				156	void *issue_context;
				157
				158	/*
				159	* Queued work gets put on here after commit.
				160	*/
				161	struct workqueue_struct *wq;
				162
				163	spinlock_t lock;
				164	struct list_head work_items;
				165	struct bio_list bios;
				166	struct work_struct commit_work;
				167
				168	bool commit_scheduled;
				169	};
				170
				171	static void __commit(struct work_struct *_ws)
				172	{
				173	struct batcher *b = container_of(_ws, struct batcher, commit_work);
				174	blk_status_t r;
				175	unsigned long flags;
				176	struct list_head work_items;
				177	struct work_struct ws, tmp;
				178	struct continuation *k;
				179	struct bio *bio;
				180	struct bio_list bios;
				181
				182	INIT_LIST_HEAD(&work_items);
				183	bio_list_init(&bios);
				184
				185	/*
				186	* We have to grab these before the commit_op to avoid a race
				187	* condition.
				188	*/
				189	spin_lock_irqsave(&b->lock, flags);
				190	list_splice_init(&b->work_items, &work_items);
				191	bio_list_merge(&bios, &b->bios);
				192	bio_list_init(&b->bios);
				193	b->commit_scheduled = false;
				194	spin_unlock_irqrestore(&b->lock, flags);
				195
				196	r = b->commit_op(b->commit_context);
				197
				198	list_for_each_entry_safe(ws, tmp, &work_items, entry) {
				199	k = container_of(ws, struct continuation, ws);
				200	k->input = r;
				201	INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
				202	queue_work(b->wq, ws);
				203	}
				204
				205	while ((bio = bio_list_pop(&bios))) {
				206	if (r) {
				207	bio->bi_status = r;
				208	bio_endio(bio);
				209	} else
				210	b->issue_op(bio, b->issue_context);
				211	}
				212	}
				213
				214	static void batcher_init(struct batcher *b,
				215	blk_status_t (commit_op)(void ),
				216	void *commit_context,
				217	void (issue_op)(struct bio bio, void *),
				218	void *issue_context,
				219	struct workqueue_struct *wq)
				220	{
				221	b->commit_op = commit_op;
				222	b->commit_context = commit_context;
				223	b->issue_op = issue_op;
				224	b->issue_context = issue_context;
				225	b->wq = wq;
				226
				227	spin_lock_init(&b->lock);
				228	INIT_LIST_HEAD(&b->work_items);
				229	bio_list_init(&b->bios);
				230	INIT_WORK(&b->commit_work, __commit);
				231	b->commit_scheduled = false;
				232	}
				233
				234	static void async_commit(struct batcher *b)
				235	{
				236	queue_work(b->wq, &b->commit_work);
				237	}
				238
				239	static void continue_after_commit(struct batcher b, struct continuation k)
				240	{
				241	unsigned long flags;
				242	bool commit_scheduled;
				243
				244	spin_lock_irqsave(&b->lock, flags);
				245	commit_scheduled = b->commit_scheduled;
				246	list_add_tail(&k->ws.entry, &b->work_items);
				247	spin_unlock_irqrestore(&b->lock, flags);
				248
				249	if (commit_scheduled)
				250	async_commit(b);
				251	}
				252
				253	/*
				254	* Bios are errored if commit failed.
				255	*/
				256	static void issue_after_commit(struct batcher b, struct bio bio)
				257	{
				258	unsigned long flags;
				259	bool commit_scheduled;
				260
				261	spin_lock_irqsave(&b->lock, flags);
				262	commit_scheduled = b->commit_scheduled;
				263	bio_list_add(&b->bios, bio);
				264	spin_unlock_irqrestore(&b->lock, flags);
				265
				266	if (commit_scheduled)
				267	async_commit(b);
				268	}
				269
				270	/*
				271	* Call this if some urgent work is waiting for the commit to complete.
				272	*/
				273	static void schedule_commit(struct batcher *b)
				274	{
				275	bool immediate;
				276	unsigned long flags;
				277
				278	spin_lock_irqsave(&b->lock, flags);
				279	immediate = !list_empty(&b->work_items) \|\| !bio_list_empty(&b->bios);
				280	b->commit_scheduled = true;
				281	spin_unlock_irqrestore(&b->lock, flags);
				282
				283	if (immediate)
				284	async_commit(b);
				285	}
				286
				287	/*
				288	* There are a couple of places where we let a bio run, but want to do some
				289	* work before calling its endio function. We do this by temporarily
				290	* changing the endio fn.
				291	*/
				292	struct dm_hook_info {
				293	bio_end_io_t *bi_end_io;
				294	};
				295
				296	static void dm_hook_bio(struct dm_hook_info h, struct bio bio,
				297	bio_end_io_t bi_end_io, void bi_private)
				298	{
				299	h->bi_end_io = bio->bi_end_io;
				300
				301	bio->bi_end_io = bi_end_io;
				302	bio->bi_private = bi_private;
				303	}
				304
				305	static void dm_unhook_bio(struct dm_hook_info h, struct bio bio)
				306	{
				307	bio->bi_end_io = h->bi_end_io;
				308	}
				309
				310	/*----------------------------------------------------------------*/
				311
				#define MIGRATION_POOL_SIZE 128
				#define COMMIT_PERIOD HZ	/* one second, expressed in jiffies */
				#define MIGRATION_COUNT_WINDOW 10

				/*
				 * Bounds on the block size of the device holding cache data:
				 * 32KB minimum, 1GB maximum, both expressed in sectors.
				 */
				#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
				#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
				322
				/*
				 * State of the cache metadata.  The ordering matters: code in this file
				 * tests "mode >= CM_READ_ONLY" to mean "metadata must not be modified".
				 */
				enum cache_metadata_mode {
					CM_WRITE,	/* metadata may be changed */
					CM_READ_ONLY,	/* metadata may not be changed */
					CM_FAIL		/* terminal: set_cache_mode() never leaves this mode */
				};
				328
				/* How writes are routed between the cache and origin devices. */
				enum cache_io_mode {
					/*
					 * Writes go to cached blocks only and those blocks are marked
					 * dirty.  Losing the cache device therefore loses data.
					 * Potential performance increase for both reads and writes.
					 */
					CM_IO_WRITEBACK,

					/*
					 * Writes go to both cache and origin, so blocks are never
					 * dirty.  Potential performance benefit for reads only.
					 */
					CM_IO_WRITETHROUGH,

					/*
					 * A degraded mode useful for various cache coherency situations
					 * (eg, rolling back snapshots).  Reads and writes always go to
					 * the origin.  If a write goes to a cached oblock, then the
					 * cache block is invalidated.
					 */
					CM_IO_PASSTHROUGH
				};
				351
				352	struct cache_features {
				353	enum cache_metadata_mode mode;
				354	enum cache_io_mode io_mode;
				355	unsigned metadata_version;
				356	};
				357
				358	struct cache_stats {
				359	atomic_t read_hit;
				360	atomic_t read_miss;
				361	atomic_t write_hit;
				362	atomic_t write_miss;
				363	atomic_t demotion;
				364	atomic_t promotion;
				365	atomic_t writeback;
				366	atomic_t copies_avoided;
				367	atomic_t cache_cell_clash;
				368	atomic_t commit_count;
				369	atomic_t discard_count;
				370	};
				371
				372	struct cache {
				373	struct dm_target *ti;
				374	spinlock_t lock;
				375
				376	/*
				377	* Fields for converting from sectors to blocks.
				378	*/
				379	int sectors_per_block_shift;
				380	sector_t sectors_per_block;
				381
				382	struct dm_cache_metadata *cmd;
				383
				384	/*
				385	* Metadata is written to this device.
				386	*/
				387	struct dm_dev *metadata_dev;
				388
				389	/*
				390	* The slower of the two data devices. Typically a spindle.
				391	*/
				392	struct dm_dev *origin_dev;
				393
				394	/*
				395	* The faster of the two data devices. Typically an SSD.
				396	*/
				397	struct dm_dev *cache_dev;
				398
				399	/*
				400	* Size of the origin device in _complete_ blocks and native sectors.
				401	*/
				402	dm_oblock_t origin_blocks;
				403	sector_t origin_sectors;
				404
				405	/*
				406	* Size of the cache device in blocks.
				407	*/
				408	dm_cblock_t cache_size;
				409
				410	/*
				411	* Invalidation fields.
				412	*/
				413	spinlock_t invalidation_lock;
				414	struct list_head invalidation_requests;
				415
				416	sector_t migration_threshold;
				417	wait_queue_head_t migration_wait;
				418	atomic_t nr_allocated_migrations;
				419
				420	/*
				421	* The number of in flight migrations that are performing
				422	* background io. eg, promotion, writeback.
				423	*/
				424	atomic_t nr_io_migrations;
				425
				426	struct bio_list deferred_bios;
				427
				428	struct rw_semaphore quiesce_lock;
				429
				430	struct dm_target_callbacks callbacks;
				431
				432	/*
				433	* origin_blocks entries, discarded if set.
				434	*/
				435	dm_dblock_t discard_nr_blocks;
				436	unsigned long *discard_bitset;
				437	uint32_t discard_block_size; /* a power of 2 times sectors per block */
				438
				439	/*
				440	* Rather than reconstructing the table line for the status we just
				441	* save it and regurgitate.
				442	*/
				443	unsigned nr_ctr_args;
				444	const char **ctr_args;
				445
				446	struct dm_kcopyd_client *copier;
				447	struct work_struct deferred_bio_worker;
				448	struct work_struct migration_worker;
				449	struct workqueue_struct *wq;
				450	struct delayed_work waker;
				451	struct dm_bio_prison_v2 *prison;
				452
				453	/*
				454	* cache_size entries, dirty if set
				455	*/
				456	unsigned long *dirty_bitset;
				457	atomic_t nr_dirty;
				458
				459	unsigned policy_nr_args;
				460	struct dm_cache_policy *policy;
				461
				462	/*
				463	* Cache features such as write-through.
				464	*/
				465	struct cache_features features;
				466
				467	struct cache_stats stats;
				468
				469	bool need_tick_bio:1;
				470	bool sized:1;
				471	bool invalidate:1;
				472	bool commit_requested:1;
				473	bool loaded_mappings:1;
				474	bool loaded_discards:1;
				475
				476	struct rw_semaphore background_work_lock;
				477
				478	struct batcher committer;
				479	struct work_struct commit_ws;
				480
				481	struct io_tracker tracker;
				482
				483	mempool_t migration_pool;
				484
				485	struct bio_set bs;
				486	};
				487
				488	struct per_bio_data {
				489	bool tick:1;
				490	unsigned req_nr:2;
				491	struct dm_bio_prison_cell_v2 *cell;
				492	struct dm_hook_info hook_info;
				493	sector_t len;
				494	};
				495
				496	struct dm_cache_migration {
				497	struct continuation k;
				498	struct cache *cache;
				499
				500	struct policy_work *op;
				501	struct bio *overwrite_bio;
				502	struct dm_bio_prison_cell_v2 *cell;
				503
				504	dm_cblock_t invalidate_cblock;
				505	dm_oblock_t invalidate_oblock;
				506	};
				507
				508	/*----------------------------------------------------------------*/
				509
				510	static bool writethrough_mode(struct cache *cache)
				511	{
				512	return cache->features.io_mode == CM_IO_WRITETHROUGH;
				513	}
				514
				515	static bool writeback_mode(struct cache *cache)
				516	{
				517	return cache->features.io_mode == CM_IO_WRITEBACK;
				518	}
				519
				520	static inline bool passthrough_mode(struct cache *cache)
				521	{
				522	return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
				523	}
				524
				525	/*----------------------------------------------------------------*/
				526
				527	static void wake_deferred_bio_worker(struct cache *cache)
				528	{
				529	queue_work(cache->wq, &cache->deferred_bio_worker);
				530	}
				531
				532	static void wake_migration_worker(struct cache *cache)
				533	{
				534	if (passthrough_mode(cache))
				535	return;
				536
				537	queue_work(cache->wq, &cache->migration_worker);
				538	}
				539
				540	/*----------------------------------------------------------------*/
				541
				542	static struct dm_bio_prison_cell_v2 alloc_prison_cell(struct cache cache)
				543	{
				544	return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO);
				545	}
				546
				547	static void free_prison_cell(struct cache cache, struct dm_bio_prison_cell_v2 cell)
				548	{
				549	dm_bio_prison_free_cell_v2(cache->prison, cell);
				550	}
				551
				552	static struct dm_cache_migration alloc_migration(struct cache cache)
				553	{
				554	struct dm_cache_migration *mg;
				555
				556	mg = mempool_alloc(&cache->migration_pool, GFP_NOIO);
				557
				558	memset(mg, 0, sizeof(*mg));
				559
				560	mg->cache = cache;
				561	atomic_inc(&cache->nr_allocated_migrations);
				562
				563	return mg;
				564	}
				565
				566	static void free_migration(struct dm_cache_migration *mg)
				567	{
				568	struct cache *cache = mg->cache;
				569
				570	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
				571	wake_up(&cache->migration_wait);
				572
				573	mempool_free(mg, &cache->migration_pool);
				574	}
				575
				576	/*----------------------------------------------------------------*/
				577
				578	static inline dm_oblock_t oblock_succ(dm_oblock_t b)
				579	{
				580	return to_oblock(from_oblock(b) + 1ull);
				581	}
				582
				583	static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
				584	{
				585	key->virtual = 0;
				586	key->dev = 0;
				587	key->block_begin = from_oblock(begin);
				588	key->block_end = from_oblock(end);
				589	}
				590
				/*
				 * Two lock levels are used with the bio prison: level 0 blocks WRITEs
				 * only, level 1 blocks both READs and WRITEs.
				 */
				#define WRITE_LOCK_LEVEL 0
				#define READ_WRITE_LOCK_LEVEL 1
				597
				598	static unsigned lock_level(struct bio *bio)
				599	{
				600	return bio_data_dir(bio) == WRITE ?
				601	WRITE_LOCK_LEVEL :
				602	READ_WRITE_LOCK_LEVEL;
				603	}
				604
				605	/*----------------------------------------------------------------
				606	* Per bio data
				607	*--------------------------------------------------------------*/
				608
				609	static struct per_bio_data get_per_bio_data(struct bio bio)
				610	{
				611	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
				612	BUG_ON(!pb);
				613	return pb;
				614	}
				615
				616	static struct per_bio_data init_per_bio_data(struct bio bio)
				617	{
				618	struct per_bio_data *pb = get_per_bio_data(bio);
				619
				620	pb->tick = false;
				621	pb->req_nr = dm_bio_get_target_bio_nr(bio);
				622	pb->cell = NULL;
				623	pb->len = 0;
				624
				625	return pb;
				626	}
				627
				628	/*----------------------------------------------------------------*/
				629
				630	static void defer_bio(struct cache cache, struct bio bio)
				631	{
				632	unsigned long flags;
				633
				634	spin_lock_irqsave(&cache->lock, flags);
				635	bio_list_add(&cache->deferred_bios, bio);
				636	spin_unlock_irqrestore(&cache->lock, flags);
				637
				638	wake_deferred_bio_worker(cache);
				639	}
				640
				641	static void defer_bios(struct cache cache, struct bio_list bios)
				642	{
				643	unsigned long flags;
				644
				645	spin_lock_irqsave(&cache->lock, flags);
				646	bio_list_merge(&cache->deferred_bios, bios);
				647	bio_list_init(bios);
				648	spin_unlock_irqrestore(&cache->lock, flags);
				649
				650	wake_deferred_bio_worker(cache);
				651	}
				652
				653	/*----------------------------------------------------------------*/
				654
				655	static bool bio_detain_shared(struct cache cache, dm_oblock_t oblock, struct bio bio)
				656	{
				657	bool r;
				658	struct per_bio_data *pb;
				659	struct dm_cell_key_v2 key;
				660	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
				661	struct dm_bio_prison_cell_v2 cell_prealloc, cell;
				662
				663	cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
				664
				665	build_key(oblock, end, &key);
				666	r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
				667	if (!r) {
				668	/*
				669	* Failed to get the lock.
				670	*/
				671	free_prison_cell(cache, cell_prealloc);
				672	return r;
				673	}
				674
				675	if (cell != cell_prealloc)
				676	free_prison_cell(cache, cell_prealloc);
				677
				678	pb = get_per_bio_data(bio);
				679	pb->cell = cell;
				680
				681	return r;
				682	}
				683
				684	/*----------------------------------------------------------------*/
				685
				686	static bool is_dirty(struct cache *cache, dm_cblock_t b)
				687	{
				688	return test_bit(from_cblock(b), cache->dirty_bitset);
				689	}
				690
				691	static void set_dirty(struct cache *cache, dm_cblock_t cblock)
				692	{
				693	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
				694	atomic_inc(&cache->nr_dirty);
				695	policy_set_dirty(cache->policy, cblock);
				696	}
				697	}
				698
				699	/*
				700	* These two are called when setting after migrations to force the policy
				701	* and dirty bitset to be in sync.
				702	*/
				703	static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
				704	{
				705	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
				706	atomic_inc(&cache->nr_dirty);
				707	policy_set_dirty(cache->policy, cblock);
				708	}
				709
				710	static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
				711	{
				712	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
				713	if (atomic_dec_return(&cache->nr_dirty) == 0)
				714	dm_table_event(cache->ti->table);
				715	}
				716
				717	policy_clear_dirty(cache->policy, cblock);
				718	}
				719
				720	/*----------------------------------------------------------------*/
				721
				722	static bool block_size_is_power_of_two(struct cache *cache)
				723	{
				724	return cache->sectors_per_block_shift >= 0;
				725	}
				726
				727	/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
				728	#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
				729	__always_inline
				730	#endif
				731	static dm_block_t block_div(dm_block_t b, uint32_t n)
				732	{
				733	do_div(b, n);
				734
				735	return b;
				736	}
				737
				738	static dm_block_t oblocks_per_dblock(struct cache *cache)
				739	{
				740	dm_block_t oblocks = cache->discard_block_size;
				741
				742	if (block_size_is_power_of_two(cache))
				743	oblocks >>= cache->sectors_per_block_shift;
				744	else
				745	oblocks = block_div(oblocks, cache->sectors_per_block);
				746
				747	return oblocks;
				748	}
				749
				750	static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
				751	{
				752	return to_dblock(block_div(from_oblock(oblock),
				753	oblocks_per_dblock(cache)));
				754	}
				755
				756	static void set_discard(struct cache *cache, dm_dblock_t b)
				757	{
				758	unsigned long flags;
				759
				760	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
				761	atomic_inc(&cache->stats.discard_count);
				762
				763	spin_lock_irqsave(&cache->lock, flags);
				764	set_bit(from_dblock(b), cache->discard_bitset);
				765	spin_unlock_irqrestore(&cache->lock, flags);
				766	}
				767
				768	static void clear_discard(struct cache *cache, dm_dblock_t b)
				769	{
				770	unsigned long flags;
				771
				772	spin_lock_irqsave(&cache->lock, flags);
				773	clear_bit(from_dblock(b), cache->discard_bitset);
				774	spin_unlock_irqrestore(&cache->lock, flags);
				775	}
				776
				777	static bool is_discarded(struct cache *cache, dm_dblock_t b)
				778	{
				779	int r;
				780	unsigned long flags;
				781
				782	spin_lock_irqsave(&cache->lock, flags);
				783	r = test_bit(from_dblock(b), cache->discard_bitset);
				784	spin_unlock_irqrestore(&cache->lock, flags);
				785
				786	return r;
				787	}
				788
				789	static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
				790	{
				791	int r;
				792	unsigned long flags;
				793
				794	spin_lock_irqsave(&cache->lock, flags);
				795	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
				796	cache->discard_bitset);
				797	spin_unlock_irqrestore(&cache->lock, flags);
				798
				799	return r;
				800	}
				801
				802	/*----------------------------------------------------------------
				803	* Remapping
				804	*--------------------------------------------------------------*/
				805	static void remap_to_origin(struct cache cache, struct bio bio)
				806	{
				807	bio_set_dev(bio, cache->origin_dev->bdev);
				808	}
				809
				810	static void remap_to_cache(struct cache cache, struct bio bio,
				811	dm_cblock_t cblock)
				812	{
				813	sector_t bi_sector = bio->bi_iter.bi_sector;
				814	sector_t block = from_cblock(cblock);
				815
				816	bio_set_dev(bio, cache->cache_dev->bdev);
				817	if (!block_size_is_power_of_two(cache))
				818	bio->bi_iter.bi_sector =
				819	(block * cache->sectors_per_block) +
				820	sector_div(bi_sector, cache->sectors_per_block);
				821	else
				822	bio->bi_iter.bi_sector =
				823	(block << cache->sectors_per_block_shift) \|
				824	(bi_sector & (cache->sectors_per_block - 1));
				825	}
				826
				827	static void check_if_tick_bio_needed(struct cache cache, struct bio bio)
				828	{
				829	unsigned long flags;
				830	struct per_bio_data *pb;
				831
				832	spin_lock_irqsave(&cache->lock, flags);
				833	if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
				834	bio_op(bio) != REQ_OP_DISCARD) {
				835	pb = get_per_bio_data(bio);
				836	pb->tick = true;
				837	cache->need_tick_bio = false;
				838	}
				839	spin_unlock_irqrestore(&cache->lock, flags);
				840	}
				841
				842	static void __remap_to_origin_clear_discard(struct cache cache, struct bio bio,
				843	dm_oblock_t oblock, bool bio_has_pbd)
				844	{
				845	if (bio_has_pbd)
				846	check_if_tick_bio_needed(cache, bio);
				847	remap_to_origin(cache, bio);
				848	if (bio_data_dir(bio) == WRITE)
				849	clear_discard(cache, oblock_to_dblock(cache, oblock));
				850	}
				851
				852	static void remap_to_origin_clear_discard(struct cache cache, struct bio bio,
				853	dm_oblock_t oblock)
				854	{
				855	// FIXME: check_if_tick_bio_needed() is called way too much through this interface
				856	__remap_to_origin_clear_discard(cache, bio, oblock, true);
				857	}
				858
				859	static void remap_to_cache_dirty(struct cache cache, struct bio bio,
				860	dm_oblock_t oblock, dm_cblock_t cblock)
				861	{
				862	check_if_tick_bio_needed(cache, bio);
				863	remap_to_cache(cache, bio, cblock);
				864	if (bio_data_dir(bio) == WRITE) {
				865	set_dirty(cache, cblock);
				866	clear_discard(cache, oblock_to_dblock(cache, oblock));
				867	}
				868	}
				869
				870	static dm_oblock_t get_bio_block(struct cache cache, struct bio bio)
				871	{
				872	sector_t block_nr = bio->bi_iter.bi_sector;
				873
				874	if (!block_size_is_power_of_two(cache))
				875	(void) sector_div(block_nr, cache->sectors_per_block);
				876	else
				877	block_nr >>= cache->sectors_per_block_shift;
				878
				879	return to_oblock(block_nr);
				880	}
				881
				882	static bool accountable_bio(struct cache cache, struct bio bio)
				883	{
				884	return bio_op(bio) != REQ_OP_DISCARD;
				885	}
				886
				887	static void accounted_begin(struct cache cache, struct bio bio)
				888	{
				889	struct per_bio_data *pb;
				890
				891	if (accountable_bio(cache, bio)) {
				892	pb = get_per_bio_data(bio);
				893	pb->len = bio_sectors(bio);
				894	iot_io_begin(&cache->tracker, pb->len);
				895	}
				896	}
				897
				898	static void accounted_complete(struct cache cache, struct bio bio)
				899	{
				900	struct per_bio_data *pb = get_per_bio_data(bio);
				901
				902	iot_io_end(&cache->tracker, pb->len);
				903	}
				904
				/* Account for 'bio' in the io_tracker, then submit it. */
				static void accounted_request(struct cache *cache, struct bio *bio)
				{
					accounted_begin(cache, bio);
					generic_make_request(bio);
				}
				910
				/*
				 * Bio-issuing callback with the signature struct batcher's issue_op
				 * expects; 'context' is the struct cache.
				 */
				static void issue_op(struct bio *bio, void *context)
				{
					struct cache *cache = context;
					accounted_request(cache, bio);
				}
				916
				917	/*
				918	* When running in writethrough mode we need to send writes to clean blocks
				919	* to both the cache and origin devices. Clone the bio and send them in parallel.
				920	*/
				921	static void remap_to_origin_and_cache(struct cache cache, struct bio bio,
				922	dm_oblock_t oblock, dm_cblock_t cblock)
				923	{
				924	struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs);
				925
				926	BUG_ON(!origin_bio);
				927
				928	bio_chain(origin_bio, bio);
				929	/*
				930	* Passing false to __remap_to_origin_clear_discard() skips
				931	* all code that might use per_bio_data (since clone doesn't have it)
				932	*/
				933	__remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
				934	submit_bio(origin_bio);
				935
				936	remap_to_cache(cache, bio, cblock);
				937	}
				938
				939	/*----------------------------------------------------------------
				940	* Failure modes
				941	*--------------------------------------------------------------*/
				942	static enum cache_metadata_mode get_cache_mode(struct cache *cache)
				943	{
				944	return cache->features.mode;
				945	}
				946
				947	static const char cache_device_name(struct cache cache)
				948	{
				949	return dm_device_name(dm_table_get_md(cache->ti->table));
				950	}
				951
				952	static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
				953	{
				954	const char *descs[] = {
				955	"write",
				956	"read-only",
				957	"fail"
				958	};
				959
				960	dm_table_event(cache->ti->table);
				961	DMINFO("%s: switching cache to %s mode",
				962	cache_device_name(cache), descs[(int)mode]);
				963	}
				964
				965	static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
				966	{
				967	bool needs_check;
				968	enum cache_metadata_mode old_mode = get_cache_mode(cache);
				969
				970	if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
				971	DMERR("%s: unable to read needs_check flag, setting failure mode.",
				972	cache_device_name(cache));
				973	new_mode = CM_FAIL;
				974	}
				975
				976	if (new_mode == CM_WRITE && needs_check) {
				977	DMERR("%s: unable to switch cache to write mode until repaired.",
				978	cache_device_name(cache));
				979	if (old_mode != new_mode)
				980	new_mode = old_mode;
				981	else
				982	new_mode = CM_READ_ONLY;
				983	}
				984
				985	/* Never move out of fail mode */
				986	if (old_mode == CM_FAIL)
				987	new_mode = CM_FAIL;
				988
				989	switch (new_mode) {
				990	case CM_FAIL:
				991	case CM_READ_ONLY:
				992	dm_cache_metadata_set_read_only(cache->cmd);
				993	break;
				994
				995	case CM_WRITE:
				996	dm_cache_metadata_set_read_write(cache->cmd);
				997	break;
				998	}
				999
				1000	cache->features.mode = new_mode;
				1001
				1002	if (new_mode != old_mode)
				1003	notify_mode_switch(cache, new_mode);
				1004	}
				1005
				1006	static void abort_transaction(struct cache *cache)
				1007	{
				1008	const char *dev_name = cache_device_name(cache);
				1009
				1010	if (get_cache_mode(cache) >= CM_READ_ONLY)
				1011	return;
				1012
				1013	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
				1014	DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
				1015	set_cache_mode(cache, CM_FAIL);
				1016	}
				1017
				1018	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
				1019	if (dm_cache_metadata_abort(cache->cmd)) {
				1020	DMERR("%s: failed to abort metadata transaction", dev_name);
				1021	set_cache_mode(cache, CM_FAIL);
				1022	}
				1023	}
				1024
				1025	static void metadata_operation_failed(struct cache cache, const char op, int r)
				1026	{
				1027	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
				1028	cache_device_name(cache), op, r);
				1029	abort_transaction(cache);
				1030	set_cache_mode(cache, CM_READ_ONLY);
				1031	}
				1032
				1033	/*----------------------------------------------------------------*/
				1034
				1035	static void load_stats(struct cache *cache)
				1036	{
				1037	struct dm_cache_statistics stats;
				1038
				1039	dm_cache_metadata_get_stats(cache->cmd, &stats);
				1040	atomic_set(&cache->stats.read_hit, stats.read_hits);
				1041	atomic_set(&cache->stats.read_miss, stats.read_misses);
				1042	atomic_set(&cache->stats.write_hit, stats.write_hits);
				1043	atomic_set(&cache->stats.write_miss, stats.write_misses);
				1044	}
				1045
				1046	static void save_stats(struct cache *cache)
				1047	{
				1048	struct dm_cache_statistics stats;
				1049
				1050	if (get_cache_mode(cache) >= CM_READ_ONLY)
				1051	return;
				1052
				1053	stats.read_hits = atomic_read(&cache->stats.read_hit);
				1054	stats.read_misses = atomic_read(&cache->stats.read_miss);
				1055	stats.write_hits = atomic_read(&cache->stats.write_hit);
				1056	stats.write_misses = atomic_read(&cache->stats.write_miss);
				1057
				1058	dm_cache_metadata_set_stats(cache->cmd, &stats);
				1059	}
				1060
				1061	static void update_stats(struct cache_stats *stats, enum policy_operation op)
				1062	{
				1063	switch (op) {
				1064	case POLICY_PROMOTE:
				1065	atomic_inc(&stats->promotion);
				1066	break;
				1067
				1068	case POLICY_DEMOTE:
				1069	atomic_inc(&stats->demotion);
				1070	break;
				1071
				1072	case POLICY_WRITEBACK:
				1073	atomic_inc(&stats->writeback);
				1074	break;
				1075	}
				1076	}
				1077
/*----------------------------------------------------------------
 * Migration processing
 *
 * Migration covers moving data from the origin device to the cache, or
 * vice versa.
 *--------------------------------------------------------------*/
				1084
				1085	static void inc_io_migrations(struct cache *cache)
				1086	{
				1087	atomic_inc(&cache->nr_io_migrations);
				1088	}
				1089
				1090	static void dec_io_migrations(struct cache *cache)
				1091	{
				1092	atomic_dec(&cache->nr_io_migrations);
				1093	}
				1094
				1095	static bool discard_or_flush(struct bio *bio)
				1096	{
				1097	return bio_op(bio) == REQ_OP_DISCARD \|\| op_is_flush(bio->bi_opf);
				1098	}
				1099
				1100	static void calc_discard_block_range(struct cache cache, struct bio bio,
				1101	dm_dblock_t b, dm_dblock_t e)
				1102	{
				1103	sector_t sb = bio->bi_iter.bi_sector;
				1104	sector_t se = bio_end_sector(bio);
				1105
				1106	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
				1107
				1108	if (se - sb < cache->discard_block_size)
				1109	e = b;
				1110	else
				1111	*e = to_dblock(block_div(se, cache->discard_block_size));
				1112	}
				1113
/*----------------------------------------------------------------*/
				1115
				1116	static void prevent_background_work(struct cache *cache)
				1117	{
				1118	lockdep_off();
				1119	down_write(&cache->background_work_lock);
				1120	lockdep_on();
				1121	}
				1122
				1123	static void allow_background_work(struct cache *cache)
				1124	{
				1125	lockdep_off();
				1126	up_write(&cache->background_work_lock);
				1127	lockdep_on();
				1128	}
				1129
				1130	static bool background_work_begin(struct cache *cache)
				1131	{
				1132	bool r;
				1133
				1134	lockdep_off();
				1135	r = down_read_trylock(&cache->background_work_lock);
				1136	lockdep_on();
				1137
				1138	return r;
				1139	}
				1140
				1141	static void background_work_end(struct cache *cache)
				1142	{
				1143	lockdep_off();
				1144	up_read(&cache->background_work_lock);
				1145	lockdep_on();
				1146	}
				1147
/*----------------------------------------------------------------*/
				1149
				1150	static bool bio_writes_complete_block(struct cache cache, struct bio bio)
				1151	{
				1152	return (bio_data_dir(bio) == WRITE) &&
				1153	(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
				1154	}
				1155
				1156	static bool optimisable_bio(struct cache cache, struct bio bio, dm_oblock_t block)
				1157	{
				1158	return writeback_mode(cache) &&
				1159	(is_discarded_oblock(cache, block) \|\| bio_writes_complete_block(cache, bio));
				1160	}
				1161
				1162	static void quiesce(struct dm_cache_migration *mg,
				1163	void (continuation)(struct work_struct ))
				1164	{
				1165	init_continuation(&mg->k, continuation);
				1166	dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
				1167	}
				1168
				1169	static struct dm_cache_migration ws_to_mg(struct work_struct ws)
				1170	{
				1171	struct continuation *k = container_of(ws, struct continuation, ws);
				1172	return container_of(k, struct dm_cache_migration, k);
				1173	}
				1174
				1175	static void copy_complete(int read_err, unsigned long write_err, void *context)
				1176	{
				1177	struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
				1178
				1179	if (read_err \|\| write_err)
				1180	mg->k.input = BLK_STS_IOERR;
				1181
				1182	queue_continuation(mg->cache->wq, &mg->k);
				1183	}
				1184
				1185	static void copy(struct dm_cache_migration *mg, bool promote)
				1186	{
				1187	struct dm_io_region o_region, c_region;
				1188	struct cache *cache = mg->cache;
				1189
				1190	o_region.bdev = cache->origin_dev->bdev;
				1191	o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
				1192	o_region.count = cache->sectors_per_block;
				1193
				1194	c_region.bdev = cache->cache_dev->bdev;
				1195	c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
				1196	c_region.count = cache->sectors_per_block;
				1197
				1198	if (promote)
				1199	dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
				1200	else
				1201	dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
				1202	}
				1203
				1204	static void bio_drop_shared_lock(struct cache cache, struct bio bio)
				1205	{
				1206	struct per_bio_data *pb = get_per_bio_data(bio);
				1207
				1208	if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
				1209	free_prison_cell(cache, pb->cell);
				1210	pb->cell = NULL;
				1211	}
				1212
				1213	static void overwrite_endio(struct bio *bio)
				1214	{
				1215	struct dm_cache_migration *mg = bio->bi_private;
				1216	struct cache *cache = mg->cache;
				1217	struct per_bio_data *pb = get_per_bio_data(bio);
				1218
				1219	dm_unhook_bio(&pb->hook_info, bio);
				1220
				1221	if (bio->bi_status)
				1222	mg->k.input = bio->bi_status;
				1223
				1224	queue_continuation(cache->wq, &mg->k);
				1225	}
				1226
				1227	static void overwrite(struct dm_cache_migration *mg,
				1228	void (continuation)(struct work_struct ))
				1229	{
				1230	struct bio *bio = mg->overwrite_bio;
				1231	struct per_bio_data *pb = get_per_bio_data(bio);
				1232
				1233	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
				1234
				1235	/*
				1236	* The overwrite bio is part of the copy operation, as such it does
				1237	* not set/clear discard or dirty flags.
				1238	*/
				1239	if (mg->op->op == POLICY_PROMOTE)
				1240	remap_to_cache(mg->cache, bio, mg->op->cblock);
				1241	else
				1242	remap_to_origin(mg->cache, bio);
				1243
				1244	init_continuation(&mg->k, continuation);
				1245	accounted_request(mg->cache, bio);
				1246	}
				1247
				1248	/*
				1249	* Migration steps:
				1250	*
				1251	* 1) exclusive lock preventing WRITEs
				1252	* 2) quiesce
				1253	* 3) copy or issue overwrite bio
				1254	* 4) upgrade to exclusive lock preventing READs and WRITEs
				1255	* 5) quiesce
				1256	* 6) update metadata and commit
				1257	* 7) unlock
				1258	*/
				1259	static void mg_complete(struct dm_cache_migration *mg, bool success)
				1260	{
				1261	struct bio_list bios;
				1262	struct cache *cache = mg->cache;
				1263	struct policy_work *op = mg->op;
				1264	dm_cblock_t cblock = op->cblock;
				1265
				1266	if (success)
				1267	update_stats(&cache->stats, op->op);
				1268
				1269	switch (op->op) {
				1270	case POLICY_PROMOTE:
				1271	clear_discard(cache, oblock_to_dblock(cache, op->oblock));
				1272	policy_complete_background_work(cache->policy, op, success);
				1273
				1274	if (mg->overwrite_bio) {
				1275	if (success)
				1276	force_set_dirty(cache, cblock);
				1277	else if (mg->k.input)
				1278	mg->overwrite_bio->bi_status = mg->k.input;
				1279	else
				1280	mg->overwrite_bio->bi_status = BLK_STS_IOERR;
				1281	bio_endio(mg->overwrite_bio);
				1282	} else {
				1283	if (success)
				1284	force_clear_dirty(cache, cblock);
				1285	dec_io_migrations(cache);
				1286	}
				1287	break;
				1288
				1289	case POLICY_DEMOTE:
				1290	/*
				1291	* We clear dirty here to update the nr_dirty counter.
				1292	*/
				1293	if (success)
				1294	force_clear_dirty(cache, cblock);
				1295	policy_complete_background_work(cache->policy, op, success);
				1296	dec_io_migrations(cache);
				1297	break;
				1298
				1299	case POLICY_WRITEBACK:
				1300	if (success)
				1301	force_clear_dirty(cache, cblock);
				1302	policy_complete_background_work(cache->policy, op, success);
				1303	dec_io_migrations(cache);
				1304	break;
				1305	}
				1306
				1307	bio_list_init(&bios);
				1308	if (mg->cell) {
				1309	if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
				1310	free_prison_cell(cache, mg->cell);
				1311	}
				1312
				1313	free_migration(mg);
				1314	defer_bios(cache, &bios);
				1315	wake_migration_worker(cache);
				1316
				1317	background_work_end(cache);
				1318	}
				1319
				1320	static void mg_success(struct work_struct *ws)
				1321	{
				1322	struct dm_cache_migration *mg = ws_to_mg(ws);
				1323	mg_complete(mg, mg->k.input == 0);
				1324	}
				1325
				1326	static void mg_update_metadata(struct work_struct *ws)
				1327	{
				1328	int r;
				1329	struct dm_cache_migration *mg = ws_to_mg(ws);
				1330	struct cache *cache = mg->cache;
				1331	struct policy_work *op = mg->op;
				1332
				1333	switch (op->op) {
				1334	case POLICY_PROMOTE:
				1335	r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
				1336	if (r) {
				1337	DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
				1338	cache_device_name(cache));
				1339	metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
				1340
				1341	mg_complete(mg, false);
				1342	return;
				1343	}
				1344	mg_complete(mg, true);
				1345	break;
				1346
				1347	case POLICY_DEMOTE:
				1348	r = dm_cache_remove_mapping(cache->cmd, op->cblock);
				1349	if (r) {
				1350	DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
				1351	cache_device_name(cache));
				1352	metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
				1353
				1354	mg_complete(mg, false);
				1355	return;
				1356	}
				1357
				1358	/*
				1359	* It would be nice if we only had to commit when a REQ_FLUSH
				1360	* comes through. But there's one scenario that we have to
				1361	* look out for:
				1362	*
				1363	* - vblock x in a cache block
				1364	* - domotion occurs
				1365	* - cache block gets reallocated and over written
				1366	* - crash
				1367	*
				1368	* When we recover, because there was no commit the cache will
				1369	* rollback to having the data for vblock x in the cache block.
				1370	* But the cache block has since been overwritten, so it'll end
				1371	* up pointing to data that was never in 'x' during the history
				1372	* of the device.
				1373	*
				1374	* To avoid this issue we require a commit as part of the
				1375	* demotion operation.
				1376	*/
				1377	init_continuation(&mg->k, mg_success);
				1378	continue_after_commit(&cache->committer, &mg->k);
				1379	schedule_commit(&cache->committer);
				1380	break;
				1381
				1382	case POLICY_WRITEBACK:
				1383	mg_complete(mg, true);
				1384	break;
				1385	}
				1386	}
				1387
				1388	static void mg_update_metadata_after_copy(struct work_struct *ws)
				1389	{
				1390	struct dm_cache_migration *mg = ws_to_mg(ws);
				1391
				1392	/*
				1393	* Did the copy succeed?
				1394	*/
				1395	if (mg->k.input)
				1396	mg_complete(mg, false);
				1397	else
				1398	mg_update_metadata(ws);
				1399	}
				1400
				1401	static void mg_upgrade_lock(struct work_struct *ws)
				1402	{
				1403	int r;
				1404	struct dm_cache_migration *mg = ws_to_mg(ws);
				1405
				1406	/*
				1407	* Did the copy succeed?
				1408	*/
				1409	if (mg->k.input)
				1410	mg_complete(mg, false);
				1411
				1412	else {
				1413	/*
				1414	* Now we want the lock to prevent both reads and writes.
				1415	*/
				1416	r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
				1417	READ_WRITE_LOCK_LEVEL);
				1418	if (r < 0)
				1419	mg_complete(mg, false);
				1420
				1421	else if (r)
				1422	quiesce(mg, mg_update_metadata);
				1423
				1424	else
				1425	mg_update_metadata(ws);
				1426	}
				1427	}
				1428
				1429	static void mg_full_copy(struct work_struct *ws)
				1430	{
				1431	struct dm_cache_migration *mg = ws_to_mg(ws);
				1432	struct cache *cache = mg->cache;
				1433	struct policy_work *op = mg->op;
				1434	bool is_policy_promote = (op->op == POLICY_PROMOTE);
				1435
				1436	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) \|\|
				1437	is_discarded_oblock(cache, op->oblock)) {
				1438	mg_upgrade_lock(ws);
				1439	return;
				1440	}
				1441
				1442	init_continuation(&mg->k, mg_upgrade_lock);
				1443	copy(mg, is_policy_promote);
				1444	}
				1445
				1446	static void mg_copy(struct work_struct *ws)
				1447	{
				1448	struct dm_cache_migration *mg = ws_to_mg(ws);
				1449
				1450	if (mg->overwrite_bio) {
				1451	/*
				1452	* No exclusive lock was held when we last checked if the bio
				1453	* was optimisable. So we have to check again in case things
				1454	* have changed (eg, the block may no longer be discarded).
				1455	*/
				1456	if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
				1457	/*
				1458	* Fallback to a real full copy after doing some tidying up.
				1459	*/
				1460	bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
				1461	BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */
				1462	mg->overwrite_bio = NULL;
				1463	inc_io_migrations(mg->cache);
				1464	mg_full_copy(ws);
				1465	return;
				1466	}
				1467
				1468	/*
				1469	* It's safe to do this here, even though it's new data
				1470	* because all IO has been locked out of the block.
				1471	*
				1472	* mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
				1473	* so _not_ using mg_upgrade_lock() as continutation.
				1474	*/
				1475	overwrite(mg, mg_update_metadata_after_copy);
				1476
				1477	} else
				1478	mg_full_copy(ws);
				1479	}
				1480
				1481	static int mg_lock_writes(struct dm_cache_migration *mg)
				1482	{
				1483	int r;
				1484	struct dm_cell_key_v2 key;
				1485	struct cache *cache = mg->cache;
				1486	struct dm_bio_prison_cell_v2 *prealloc;
				1487
				1488	prealloc = alloc_prison_cell(cache);
				1489
				1490	/*
				1491	* Prevent writes to the block, but allow reads to continue.
				1492	* Unless we're using an overwrite bio, in which case we lock
				1493	* everything.
				1494	*/
				1495	build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
				1496	r = dm_cell_lock_v2(cache->prison, &key,
				1497	mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
				1498	prealloc, &mg->cell);
				1499	if (r < 0) {
				1500	free_prison_cell(cache, prealloc);
				1501	mg_complete(mg, false);
				1502	return r;
				1503	}
				1504
				1505	if (mg->cell != prealloc)
				1506	free_prison_cell(cache, prealloc);
				1507
				1508	if (r == 0)
				1509	mg_copy(&mg->k.ws);
				1510	else
				1511	quiesce(mg, mg_copy);
				1512
				1513	return 0;
				1514	}
				1515
				1516	static int mg_start(struct cache cache, struct policy_work op, struct bio *bio)
				1517	{
				1518	struct dm_cache_migration *mg;
				1519
				1520	if (!background_work_begin(cache)) {
				1521	policy_complete_background_work(cache->policy, op, false);
				1522	return -EPERM;
				1523	}
				1524
				1525	mg = alloc_migration(cache);
				1526
				1527	mg->op = op;
				1528	mg->overwrite_bio = bio;
				1529
				1530	if (!bio)
				1531	inc_io_migrations(cache);
				1532
				1533	return mg_lock_writes(mg);
				1534	}
				1535
/*----------------------------------------------------------------
 * invalidation processing
 *--------------------------------------------------------------*/
				1539
				1540	static void invalidate_complete(struct dm_cache_migration *mg, bool success)
				1541	{
				1542	struct bio_list bios;
				1543	struct cache *cache = mg->cache;
				1544
				1545	bio_list_init(&bios);
				1546	if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
				1547	free_prison_cell(cache, mg->cell);
				1548
				1549	if (!success && mg->overwrite_bio)
				1550	bio_io_error(mg->overwrite_bio);
				1551
				1552	free_migration(mg);
				1553	defer_bios(cache, &bios);
				1554
				1555	background_work_end(cache);
				1556	}
				1557
				1558	static void invalidate_completed(struct work_struct *ws)
				1559	{
				1560	struct dm_cache_migration *mg = ws_to_mg(ws);
				1561	invalidate_complete(mg, !mg->k.input);
				1562	}
				1563
				1564	static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
				1565	{
				1566	int r = policy_invalidate_mapping(cache->policy, cblock);
				1567	if (!r) {
				1568	r = dm_cache_remove_mapping(cache->cmd, cblock);
				1569	if (r) {
				1570	DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
				1571	cache_device_name(cache));
				1572	metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
				1573	}
				1574
				1575	} else if (r == -ENODATA) {
				1576	/*
				1577	* Harmless, already unmapped.
				1578	*/
				1579	r = 0;
				1580
				1581	} else
				1582	DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
				1583
				1584	return r;
				1585	}
				1586
				1587	static void invalidate_remove(struct work_struct *ws)
				1588	{
				1589	int r;
				1590	struct dm_cache_migration *mg = ws_to_mg(ws);
				1591	struct cache *cache = mg->cache;
				1592
				1593	r = invalidate_cblock(cache, mg->invalidate_cblock);
				1594	if (r) {
				1595	invalidate_complete(mg, false);
				1596	return;
				1597	}
				1598
				1599	init_continuation(&mg->k, invalidate_completed);
				1600	continue_after_commit(&cache->committer, &mg->k);
				1601	remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
				1602	mg->overwrite_bio = NULL;
				1603	schedule_commit(&cache->committer);
				1604	}
				1605
				1606	static int invalidate_lock(struct dm_cache_migration *mg)
				1607	{
				1608	int r;
				1609	struct dm_cell_key_v2 key;
				1610	struct cache *cache = mg->cache;
				1611	struct dm_bio_prison_cell_v2 *prealloc;
				1612
				1613	prealloc = alloc_prison_cell(cache);
				1614
				1615	build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
				1616	r = dm_cell_lock_v2(cache->prison, &key,
				1617	READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
				1618	if (r < 0) {
				1619	free_prison_cell(cache, prealloc);
				1620	invalidate_complete(mg, false);
				1621	return r;
				1622	}
				1623
				1624	if (mg->cell != prealloc)
				1625	free_prison_cell(cache, prealloc);
				1626
				1627	if (r)
				1628	quiesce(mg, invalidate_remove);
				1629
				1630	else {
				1631	/*
				1632	* We can't call invalidate_remove() directly here because we
				1633	* might still be in request context.
				1634	*/
				1635	init_continuation(&mg->k, invalidate_remove);
				1636	queue_work(cache->wq, &mg->k.ws);
				1637	}
				1638
				1639	return 0;
				1640	}
				1641
				1642	static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
				1643	dm_oblock_t oblock, struct bio *bio)
				1644	{
				1645	struct dm_cache_migration *mg;
				1646
				1647	if (!background_work_begin(cache))
				1648	return -EPERM;
				1649
				1650	mg = alloc_migration(cache);
				1651
				1652	mg->overwrite_bio = bio;
				1653	mg->invalidate_cblock = cblock;
				1654	mg->invalidate_oblock = oblock;
				1655
				1656	return invalidate_lock(mg);
				1657	}
				1658
/*----------------------------------------------------------------
 * bio processing
 *--------------------------------------------------------------*/
				1662
				1663	enum busy {
				1664	IDLE,
				1665	BUSY
				1666	};
				1667
				1668	static enum busy spare_migration_bandwidth(struct cache *cache)
				1669	{
				1670	bool idle = iot_idle_for(&cache->tracker, HZ);
				1671	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
				1672	cache->sectors_per_block;
				1673
				1674	if (idle && current_volume <= cache->migration_threshold)
				1675	return IDLE;
				1676	else
				1677	return BUSY;
				1678	}
				1679
				1680	static void inc_hit_counter(struct cache cache, struct bio bio)
				1681	{
				1682	atomic_inc(bio_data_dir(bio) == READ ?
				1683	&cache->stats.read_hit : &cache->stats.write_hit);
				1684	}
				1685
				1686	static void inc_miss_counter(struct cache cache, struct bio bio)
				1687	{
				1688	atomic_inc(bio_data_dir(bio) == READ ?
				1689	&cache->stats.read_miss : &cache->stats.write_miss);
				1690	}
				1691
/*----------------------------------------------------------------*/
				1693
				1694	static int map_bio(struct cache cache, struct bio bio, dm_oblock_t block,
				1695	bool *commit_needed)
				1696	{
				1697	int r, data_dir;
				1698	bool rb, background_queued;
				1699	dm_cblock_t cblock;
				1700
				1701	*commit_needed = false;
				1702
				1703	rb = bio_detain_shared(cache, block, bio);
				1704	if (!rb) {
				1705	/*
				1706	* An exclusive lock is held for this block, so we have to
				1707	* wait. We set the commit_needed flag so the current
				1708	* transaction will be committed asap, allowing this lock
				1709	* to be dropped.
				1710	*/
				1711	*commit_needed = true;
				1712	return DM_MAPIO_SUBMITTED;
				1713	}
				1714
				1715	data_dir = bio_data_dir(bio);
				1716
				1717	if (optimisable_bio(cache, bio, block)) {
				1718	struct policy_work *op = NULL;
				1719
				1720	r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
				1721	if (unlikely(r && r != -ENOENT)) {
				1722	DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
				1723	cache_device_name(cache), r);
				1724	bio_io_error(bio);
				1725	return DM_MAPIO_SUBMITTED;
				1726	}
				1727
				1728	if (r == -ENOENT && op) {
				1729	bio_drop_shared_lock(cache, bio);
				1730	BUG_ON(op->op != POLICY_PROMOTE);
				1731	mg_start(cache, op, bio);
				1732	return DM_MAPIO_SUBMITTED;
				1733	}
				1734	} else {
				1735	r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
				1736	if (unlikely(r && r != -ENOENT)) {
				1737	DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
				1738	cache_device_name(cache), r);
				1739	bio_io_error(bio);
				1740	return DM_MAPIO_SUBMITTED;
				1741	}
				1742
				1743	if (background_queued)
				1744	wake_migration_worker(cache);
				1745	}
				1746
				1747	if (r == -ENOENT) {
				1748	struct per_bio_data *pb = get_per_bio_data(bio);
				1749
				1750	/*
				1751	* Miss.
				1752	*/
				1753	inc_miss_counter(cache, bio);
				1754	if (pb->req_nr == 0) {
				1755	accounted_begin(cache, bio);
				1756	remap_to_origin_clear_discard(cache, bio, block);
				1757	} else {
				1758	/*
				1759	* This is a duplicate writethrough io that is no
				1760	* longer needed because the block has been demoted.
				1761	*/
				1762	bio_endio(bio);
				1763	return DM_MAPIO_SUBMITTED;
				1764	}
				1765	} else {
				1766	/*
				1767	* Hit.
				1768	*/
				1769	inc_hit_counter(cache, bio);
				1770
				1771	/*
				1772	* Passthrough always maps to the origin, invalidating any
				1773	* cache blocks that are written to.
				1774	*/
				1775	if (passthrough_mode(cache)) {
				1776	if (bio_data_dir(bio) == WRITE) {
				1777	bio_drop_shared_lock(cache, bio);
				1778	atomic_inc(&cache->stats.demotion);
				1779	invalidate_start(cache, cblock, block, bio);
				1780	} else
				1781	remap_to_origin_clear_discard(cache, bio, block);
				1782	} else {
				1783	if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
				1784	!is_dirty(cache, cblock)) {
				1785	remap_to_origin_and_cache(cache, bio, block, cblock);
				1786	accounted_begin(cache, bio);
				1787	} else
				1788	remap_to_cache_dirty(cache, bio, block, cblock);
				1789	}
				1790	}
				1791
				1792	/*
				1793	* dm core turns FUA requests into a separate payload and FLUSH req.
				1794	*/
				1795	if (bio->bi_opf & REQ_FUA) {
				1796	/*
				1797	* issue_after_commit will call accounted_begin a second time. So
				1798	* we call accounted_complete() to avoid double accounting.
				1799	*/
				1800	accounted_complete(cache, bio);
				1801	issue_after_commit(&cache->committer, bio);
				1802	*commit_needed = true;
				1803	return DM_MAPIO_SUBMITTED;
				1804	}
				1805
				1806	return DM_MAPIO_REMAPPED;
				1807	}
				1808
				1809	static bool process_bio(struct cache cache, struct bio bio)
				1810	{
				1811	bool commit_needed;
				1812
				1813	if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
				1814	generic_make_request(bio);
				1815
				1816	return commit_needed;
				1817	}
				1818
				1819	/*
				1820	* A non-zero return indicates read_only or fail_io mode.
				1821	*/
				1822	static int commit(struct cache *cache, bool clean_shutdown)
				1823	{
				1824	int r;
				1825
				1826	if (get_cache_mode(cache) >= CM_READ_ONLY)
				1827	return -EINVAL;
				1828
				1829	atomic_inc(&cache->stats.commit_count);
				1830	r = dm_cache_commit(cache->cmd, clean_shutdown);
				1831	if (r)
				1832	metadata_operation_failed(cache, "dm_cache_commit", r);
				1833
				1834	return r;
				1835	}
				1836
				1837	/*
				1838	* Used by the batcher.
				1839	*/
				1840	static blk_status_t commit_op(void *context)
				1841	{
				1842	struct cache *cache = context;
				1843
				1844	if (dm_cache_changed_this_transaction(cache->cmd))
				1845	return errno_to_blk_status(commit(cache, false));
				1846
				1847	return 0;
				1848	}
				1849
/*----------------------------------------------------------------*/
				1851
				1852	static bool process_flush_bio(struct cache cache, struct bio bio)
				1853	{
				1854	struct per_bio_data *pb = get_per_bio_data(bio);
				1855
				1856	if (!pb->req_nr)
				1857	remap_to_origin(cache, bio);
				1858	else
				1859	remap_to_cache(cache, bio, 0);
				1860
				1861	issue_after_commit(&cache->committer, bio);
				1862	return true;
				1863	}
				1864
				1865	static bool process_discard_bio(struct cache cache, struct bio bio)
				1866	{
				1867	dm_dblock_t b, e;
				1868
				1869	// FIXME: do we need to lock the region? Or can we just assume the
				1870	// user wont be so foolish as to issue discard concurrently with
				1871	// other IO?
				1872	calc_discard_block_range(cache, bio, &b, &e);
				1873	while (b != e) {
				1874	set_discard(cache, b);
				1875	b = to_dblock(from_dblock(b) + 1);
				1876	}
				1877
				1878	bio_endio(bio);
				1879
				1880	return false;
				1881	}
				1882
				1883	static void process_deferred_bios(struct work_struct *ws)
				1884	{
				1885	struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
				1886
				1887	unsigned long flags;
				1888	bool commit_needed = false;
				1889	struct bio_list bios;
				1890	struct bio *bio;
				1891
				1892	bio_list_init(&bios);
				1893
				1894	spin_lock_irqsave(&cache->lock, flags);
				1895	bio_list_merge(&bios, &cache->deferred_bios);
				1896	bio_list_init(&cache->deferred_bios);
				1897	spin_unlock_irqrestore(&cache->lock, flags);
				1898
				1899	while ((bio = bio_list_pop(&bios))) {
				1900	if (bio->bi_opf & REQ_PREFLUSH)
				1901	commit_needed = process_flush_bio(cache, bio) \|\| commit_needed;
				1902
				1903	else if (bio_op(bio) == REQ_OP_DISCARD)
				1904	commit_needed = process_discard_bio(cache, bio) \|\| commit_needed;
				1905
				1906	else
				1907	commit_needed = process_bio(cache, bio) \|\| commit_needed;
				1908	}
				1909
				1910	if (commit_needed)
				1911	schedule_commit(&cache->committer);
				1912	}
				1913
/*----------------------------------------------------------------
 * Main worker loop
 *--------------------------------------------------------------*/
				1917
				1918	static void requeue_deferred_bios(struct cache *cache)
				1919	{
				1920	struct bio *bio;
				1921	struct bio_list bios;
				1922
				1923	bio_list_init(&bios);
				1924	bio_list_merge(&bios, &cache->deferred_bios);
				1925	bio_list_init(&cache->deferred_bios);
				1926
				1927	while ((bio = bio_list_pop(&bios))) {
				1928	bio->bi_status = BLK_STS_DM_REQUEUE;
				1929	bio_endio(bio);
				1930	}
				1931	}
				1932
				1933	/*
				1934	* We want to commit periodically so that not too much
				1935	* unwritten metadata builds up.
				1936	*/
				1937	static void do_waker(struct work_struct *ws)
				1938	{
				1939	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
				1940
				1941	policy_tick(cache->policy, true);
				1942	wake_migration_worker(cache);
				1943	schedule_commit(&cache->committer);
				1944	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
				1945	}
				1946
				1947	static void check_migrations(struct work_struct *ws)
				1948	{
				1949	int r;
				1950	struct policy_work *op;
				1951	struct cache *cache = container_of(ws, struct cache, migration_worker);
				1952	enum busy b;
				1953
				1954	for (;;) {
				1955	b = spare_migration_bandwidth(cache);
				1956
				1957	r = policy_get_background_work(cache->policy, b == IDLE, &op);
				1958	if (r == -ENODATA)
				1959	break;
				1960
				1961	if (r) {
				1962	DMERR_LIMIT("%s: policy_background_work failed",
				1963	cache_device_name(cache));
				1964	break;
				1965	}
				1966
				1967	r = mg_start(cache, op, NULL);
				1968	if (r)
				1969	break;
				1970	}
				1971	}
				1972
/*----------------------------------------------------------------
 * Target methods
 *--------------------------------------------------------------*/
				1976
				1977	/*
				1978	* This function gets called on the error paths of the constructor, so we
				1979	* have to cope with a partially initialised struct.
				1980	*/
				1981	static void destroy(struct cache *cache)
				1982	{
				1983	unsigned i;
				1984
				1985	mempool_exit(&cache->migration_pool);
				1986
				1987	if (cache->prison)
				1988	dm_bio_prison_destroy_v2(cache->prison);
				1989
				1990	if (cache->wq)
				1991	destroy_workqueue(cache->wq);
				1992
				1993	if (cache->dirty_bitset)
				1994	free_bitset(cache->dirty_bitset);
				1995
				1996	if (cache->discard_bitset)
				1997	free_bitset(cache->discard_bitset);
				1998
				1999	if (cache->copier)
				2000	dm_kcopyd_client_destroy(cache->copier);
				2001
				2002	if (cache->cmd)
				2003	dm_cache_metadata_close(cache->cmd);
				2004
				2005	if (cache->metadata_dev)
				2006	dm_put_device(cache->ti, cache->metadata_dev);
				2007
				2008	if (cache->origin_dev)
				2009	dm_put_device(cache->ti, cache->origin_dev);
				2010
				2011	if (cache->cache_dev)
				2012	dm_put_device(cache->ti, cache->cache_dev);
				2013
				2014	if (cache->policy)
				2015	dm_cache_policy_destroy(cache->policy);
				2016
				2017	for (i = 0; i < cache->nr_ctr_args ; i++)
				2018	kfree(cache->ctr_args[i]);
				2019	kfree(cache->ctr_args);
				2020
				2021	bioset_exit(&cache->bs);
				2022
				2023	kfree(cache);
				2024	}
				2025
				2026	static void cache_dtr(struct dm_target *ti)
				2027	{
				2028	struct cache *cache = ti->private;
				2029
				2030	destroy(cache);
				2031	}
				2032
				2033	static sector_t get_dev_size(struct dm_dev *dev)
				2034	{
				2035	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
				2036	}
				2037
				2038	/*----------------------------------------------------------------*/
				2039
				2040	/*
				2041	* Construct a cache device mapping.
				2042	*
				2043	* cache <metadata dev> <cache dev> <origin dev> <block size>
				2044	* <#feature args> [<feature arg>]*
				2045	* <policy> <#policy args> [<policy arg>]*
				2046	*
				2047	* metadata dev : fast device holding the persistent metadata
				2048	* cache dev : fast device holding cached data blocks
				2049	* origin dev : slow device holding original data blocks
				2050	* block size : cache unit size in sectors
				2051	*
				2052	* #feature args : number of feature arguments passed
				2053	* feature args : writethrough. (The default is writeback.)
				2054	*
				2055	* policy : the replacement policy to use
				2056	* #policy args : an even number of policy arguments corresponding
				2057	* to key/value pairs passed to the policy
				2058	* policy args : key/value pairs passed to the policy
				2059	* E.g. 'sequential_threshold 1024'
				2060	* See cache-policies.txt for details.
				2061	*
				2062	* Optional feature arguments are:
				2063	* writethrough : write through caching that prohibits cache block
				2064	* content from being different from origin block content.
				2065	* Without this argument, the default behaviour is to write
				2066	* back cache block contents later for performance reasons,
				2067	* so they may differ from the corresponding origin blocks.
				2068	*/
				2069	struct cache_args {
				2070	struct dm_target *ti;
				2071
				2072	struct dm_dev *metadata_dev;
				2073
				2074	struct dm_dev *cache_dev;
				2075	sector_t cache_sectors;
				2076
				2077	struct dm_dev *origin_dev;
				2078	sector_t origin_sectors;
				2079
				2080	uint32_t block_size;
				2081
				2082	const char *policy_name;
				2083	int policy_argc;
				2084	const char **policy_argv;
				2085
				2086	struct cache_features features;
				2087	};
				2088
				2089	static void destroy_cache_args(struct cache_args *ca)
				2090	{
				2091	if (ca->metadata_dev)
				2092	dm_put_device(ca->ti, ca->metadata_dev);
				2093
				2094	if (ca->cache_dev)
				2095	dm_put_device(ca->ti, ca->cache_dev);
				2096
				2097	if (ca->origin_dev)
				2098	dm_put_device(ca->ti, ca->origin_dev);
				2099
				2100	kfree(ca);
				2101	}
				2102
				2103	static bool at_least_one_arg(struct dm_arg_set as, char *error)
				2104	{
				2105	if (!as->argc) {
				2106	*error = "Insufficient args";
				2107	return false;
				2108	}
				2109
				2110	return true;
				2111	}
				2112
				2113	static int parse_metadata_dev(struct cache_args ca, struct dm_arg_set as,
				2114	char **error)
				2115	{
				2116	int r;
				2117	sector_t metadata_dev_size;
				2118	char b[BDEVNAME_SIZE];
				2119
				2120	if (!at_least_one_arg(as, error))
				2121	return -EINVAL;
				2122
				2123	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				2124	&ca->metadata_dev);
				2125	if (r) {
				2126	*error = "Error opening metadata device";
				2127	return r;
				2128	}
				2129
				2130	metadata_dev_size = get_dev_size(ca->metadata_dev);
				2131	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
				2132	DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
				2133	bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
				2134
				2135	return 0;
				2136	}
				2137
				2138	static int parse_cache_dev(struct cache_args ca, struct dm_arg_set as,
				2139	char **error)
				2140	{
				2141	int r;
				2142
				2143	if (!at_least_one_arg(as, error))
				2144	return -EINVAL;
				2145
				2146	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				2147	&ca->cache_dev);
				2148	if (r) {
				2149	*error = "Error opening cache device";
				2150	return r;
				2151	}
				2152	ca->cache_sectors = get_dev_size(ca->cache_dev);
				2153
				2154	return 0;
				2155	}
				2156
				2157	static int parse_origin_dev(struct cache_args ca, struct dm_arg_set as,
				2158	char **error)
				2159	{
				2160	int r;
				2161
				2162	if (!at_least_one_arg(as, error))
				2163	return -EINVAL;
				2164
				2165	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ \| FMODE_WRITE,
				2166	&ca->origin_dev);
				2167	if (r) {
				2168	*error = "Error opening origin device";
				2169	return r;
				2170	}
				2171
				2172	ca->origin_sectors = get_dev_size(ca->origin_dev);
				2173	if (ca->ti->len > ca->origin_sectors) {
				2174	*error = "Device size larger than cached device";
				2175	return -EINVAL;
				2176	}
				2177
				2178	return 0;
				2179	}
				2180
				2181	static int parse_block_size(struct cache_args ca, struct dm_arg_set as,
				2182	char **error)
				2183	{
				2184	unsigned long block_size;
				2185
				2186	if (!at_least_one_arg(as, error))
				2187	return -EINVAL;
				2188
				2189	if (kstrtoul(dm_shift_arg(as), 10, &block_size) \|\| !block_size \|\|
				2190	block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
				2191	block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS \|\|
				2192	block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
				2193	*error = "Invalid data block size";
				2194	return -EINVAL;
				2195	}
				2196
				2197	if (block_size > ca->cache_sectors) {
				2198	*error = "Data block size is larger than the cache device";
				2199	return -EINVAL;
				2200	}
				2201
				2202	ca->block_size = block_size;
				2203
				2204	return 0;
				2205	}
				2206
				2207	static void init_features(struct cache_features *cf)
				2208	{
				2209	cf->mode = CM_WRITE;
				2210	cf->io_mode = CM_IO_WRITEBACK;
				2211	cf->metadata_version = 1;
				2212	}
				2213
				2214	static int parse_features(struct cache_args ca, struct dm_arg_set as,
				2215	char **error)
				2216	{
				2217	static const struct dm_arg _args[] = {
				2218	{0, 2, "Invalid number of cache feature arguments"},
				2219	};
				2220
				2221	int r, mode_ctr = 0;
				2222	unsigned argc;
				2223	const char *arg;
				2224	struct cache_features *cf = &ca->features;
				2225
				2226	init_features(cf);
				2227
				2228	r = dm_read_arg_group(_args, as, &argc, error);
				2229	if (r)
				2230	return -EINVAL;
				2231
				2232	while (argc--) {
				2233	arg = dm_shift_arg(as);
				2234
				2235	if (!strcasecmp(arg, "writeback")) {
				2236	cf->io_mode = CM_IO_WRITEBACK;
				2237	mode_ctr++;
				2238	}
				2239
				2240	else if (!strcasecmp(arg, "writethrough")) {
				2241	cf->io_mode = CM_IO_WRITETHROUGH;
				2242	mode_ctr++;
				2243	}
				2244
				2245	else if (!strcasecmp(arg, "passthrough")) {
				2246	cf->io_mode = CM_IO_PASSTHROUGH;
				2247	mode_ctr++;
				2248	}
				2249
				2250	else if (!strcasecmp(arg, "metadata2"))
				2251	cf->metadata_version = 2;
				2252
				2253	else {
				2254	*error = "Unrecognised cache feature requested";
				2255	return -EINVAL;
				2256	}
				2257	}
				2258
				2259	if (mode_ctr > 1) {
				2260	*error = "Duplicate cache io_mode features requested";
				2261	return -EINVAL;
				2262	}
				2263
				2264	return 0;
				2265	}
				2266
				2267	static int parse_policy(struct cache_args ca, struct dm_arg_set as,
				2268	char **error)
				2269	{
				2270	static const struct dm_arg _args[] = {
				2271	{0, 1024, "Invalid number of policy arguments"},
				2272	};
				2273
				2274	int r;
				2275
				2276	if (!at_least_one_arg(as, error))
				2277	return -EINVAL;
				2278
				2279	ca->policy_name = dm_shift_arg(as);
				2280
				2281	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
				2282	if (r)
				2283	return -EINVAL;
				2284
				2285	ca->policy_argv = (const char **)as->argv;
				2286	dm_consume_args(as, ca->policy_argc);
				2287
				2288	return 0;
				2289	}
				2290
				2291	static int parse_cache_args(struct cache_args ca, int argc, char *argv,
				2292	char **error)
				2293	{
				2294	int r;
				2295	struct dm_arg_set as;
				2296
				2297	as.argc = argc;
				2298	as.argv = argv;
				2299
				2300	r = parse_metadata_dev(ca, &as, error);
				2301	if (r)
				2302	return r;
				2303
				2304	r = parse_cache_dev(ca, &as, error);
				2305	if (r)
				2306	return r;
				2307
				2308	r = parse_origin_dev(ca, &as, error);
				2309	if (r)
				2310	return r;
				2311
				2312	r = parse_block_size(ca, &as, error);
				2313	if (r)
				2314	return r;
				2315
				2316	r = parse_features(ca, &as, error);
				2317	if (r)
				2318	return r;
				2319
				2320	r = parse_policy(ca, &as, error);
				2321	if (r)
				2322	return r;
				2323
				2324	return 0;
				2325	}
				2326
				2327	/*----------------------------------------------------------------*/
				2328
				2329	static struct kmem_cache *migration_cache;
				2330
				2331	#define NOT_CORE_OPTION 1
				2332
				2333	static int process_config_option(struct cache cache, const char key, const char *value)
				2334	{
				2335	unsigned long tmp;
				2336
				2337	if (!strcasecmp(key, "migration_threshold")) {
				2338	if (kstrtoul(value, 10, &tmp))
				2339	return -EINVAL;
				2340
				2341	cache->migration_threshold = tmp;
				2342	return 0;
				2343	}
				2344
				2345	return NOT_CORE_OPTION;
				2346	}
				2347
				2348	static int set_config_value(struct cache cache, const char key, const char *value)
				2349	{
				2350	int r = process_config_option(cache, key, value);
				2351
				2352	if (r == NOT_CORE_OPTION)
				2353	r = policy_set_config_value(cache->policy, key, value);
				2354
				2355	if (r)
				2356	DMWARN("bad config value for %s: %s", key, value);
				2357
				2358	return r;
				2359	}
				2360
				2361	static int set_config_values(struct cache cache, int argc, const char *argv)
				2362	{
				2363	int r = 0;
				2364
				2365	if (argc & 1) {
				2366	DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
				2367	return -EINVAL;
				2368	}
				2369
				2370	while (argc) {
				2371	r = set_config_value(cache, argv[0], argv[1]);
				2372	if (r)
				2373	break;
				2374
				2375	argc -= 2;
				2376	argv += 2;
				2377	}
				2378
				2379	return r;
				2380	}
				2381
				2382	static int create_cache_policy(struct cache cache, struct cache_args ca,
				2383	char **error)
				2384	{
				2385	struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
				2386	cache->cache_size,
				2387	cache->origin_sectors,
				2388	cache->sectors_per_block);
				2389	if (IS_ERR(p)) {
				2390	*error = "Error creating cache's policy";
				2391	return PTR_ERR(p);
				2392	}
				2393	cache->policy = p;
				2394	BUG_ON(!cache->policy);
				2395
				2396	return 0;
				2397	}
				2398
				2399	/*
				2400	* We want the discard block size to be at least the size of the cache
				2401	* block size and have no more than 2^14 discard blocks across the origin.
				2402	*/
				2403	#define MAX_DISCARD_BLOCKS (1 << 14)
				2404
				2405	static bool too_many_discard_blocks(sector_t discard_block_size,
				2406	sector_t origin_size)
				2407	{
				2408	(void) sector_div(origin_size, discard_block_size);
				2409
				2410	return origin_size > MAX_DISCARD_BLOCKS;
				2411	}
				2412
				2413	static sector_t calculate_discard_block_size(sector_t cache_block_size,
				2414	sector_t origin_size)
				2415	{
				2416	sector_t discard_block_size = cache_block_size;
				2417
				2418	if (origin_size)
				2419	while (too_many_discard_blocks(discard_block_size, origin_size))
				2420	discard_block_size *= 2;
				2421
				2422	return discard_block_size;
				2423	}
				2424
				2425	static void set_cache_size(struct cache *cache, dm_cblock_t size)
				2426	{
				2427	dm_block_t nr_blocks = from_cblock(size);
				2428
				2429	if (nr_blocks > (1 << 20) && cache->cache_size != size)
				2430	DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
				2431	"All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
				2432	"Please consider increasing the cache block size to reduce the overall cache block count.",
				2433	(unsigned long long) nr_blocks);
				2434
				2435	cache->cache_size = size;
				2436	}
				2437
				2438	static int is_congested(struct dm_dev *dev, int bdi_bits)
				2439	{
				2440	struct request_queue *q = bdev_get_queue(dev->bdev);
				2441	return bdi_congested(q->backing_dev_info, bdi_bits);
				2442	}
				2443
				2444	static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
				2445	{
				2446	struct cache *cache = container_of(cb, struct cache, callbacks);
				2447
				2448	return is_congested(cache->origin_dev, bdi_bits) \|\|
				2449	is_congested(cache->cache_dev, bdi_bits);
				2450	}
				2451
				2452	#define DEFAULT_MIGRATION_THRESHOLD 2048
				2453
				2454	static int cache_create(struct cache_args ca, struct cache *result)
				2455	{
				2456	int r = 0;
				2457	char **error = &ca->ti->error;
				2458	struct cache *cache;
				2459	struct dm_target *ti = ca->ti;
				2460	dm_block_t origin_blocks;
				2461	struct dm_cache_metadata *cmd;
				2462	bool may_format = ca->features.mode == CM_WRITE;
				2463
				2464	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
				2465	if (!cache)
				2466	return -ENOMEM;
				2467
				2468	cache->ti = ca->ti;
				2469	ti->private = cache;
				2470	ti->num_flush_bios = 2;
				2471	ti->flush_supported = true;
				2472
				2473	ti->num_discard_bios = 1;
				2474	ti->discards_supported = true;
				2475	ti->split_discard_bios = false;
				2476
				2477	ti->per_io_data_size = sizeof(struct per_bio_data);
				2478
				2479	cache->features = ca->features;
				2480	if (writethrough_mode(cache)) {
				2481	/* Create bioset for writethrough bios issued to origin */
				2482	r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0);
				2483	if (r)
				2484	goto bad;
				2485	}
				2486
				2487	cache->callbacks.congested_fn = cache_is_congested;
				2488	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
				2489
				2490	cache->metadata_dev = ca->metadata_dev;
				2491	cache->origin_dev = ca->origin_dev;
				2492	cache->cache_dev = ca->cache_dev;
				2493
				2494	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
				2495
				2496	origin_blocks = cache->origin_sectors = ca->origin_sectors;
				2497	origin_blocks = block_div(origin_blocks, ca->block_size);
				2498	cache->origin_blocks = to_oblock(origin_blocks);
				2499
				2500	cache->sectors_per_block = ca->block_size;
				2501	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
				2502	r = -EINVAL;
				2503	goto bad;
				2504	}
				2505
				2506	if (ca->block_size & (ca->block_size - 1)) {
				2507	dm_block_t cache_size = ca->cache_sectors;
				2508
				2509	cache->sectors_per_block_shift = -1;
				2510	cache_size = block_div(cache_size, ca->block_size);
				2511	set_cache_size(cache, to_cblock(cache_size));
				2512	} else {
				2513	cache->sectors_per_block_shift = __ffs(ca->block_size);
				2514	set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
				2515	}
				2516
				2517	r = create_cache_policy(cache, ca, error);
				2518	if (r)
				2519	goto bad;
				2520
				2521	cache->policy_nr_args = ca->policy_argc;
				2522	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
				2523
				2524	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
				2525	if (r) {
				2526	*error = "Error setting cache policy's config values";
				2527	goto bad;
				2528	}
				2529
				2530	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
				2531	ca->block_size, may_format,
				2532	dm_cache_policy_get_hint_size(cache->policy),
				2533	ca->features.metadata_version);
				2534	if (IS_ERR(cmd)) {
				2535	*error = "Error creating metadata object";
				2536	r = PTR_ERR(cmd);
				2537	goto bad;
				2538	}
				2539	cache->cmd = cmd;
				2540	set_cache_mode(cache, CM_WRITE);
				2541	if (get_cache_mode(cache) != CM_WRITE) {
				2542	*error = "Unable to get write access to metadata, please check/repair metadata.";
				2543	r = -EINVAL;
				2544	goto bad;
				2545	}
				2546
				2547	if (passthrough_mode(cache)) {
				2548	bool all_clean;
				2549
				2550	r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
				2551	if (r) {
				2552	*error = "dm_cache_metadata_all_clean() failed";
				2553	goto bad;
				2554	}
				2555
				2556	if (!all_clean) {
				2557	*error = "Cannot enter passthrough mode unless all blocks are clean";
				2558	r = -EINVAL;
				2559	goto bad;
				2560	}
				2561
				2562	policy_allow_migrations(cache->policy, false);
				2563	}
				2564
				2565	spin_lock_init(&cache->lock);
				2566	bio_list_init(&cache->deferred_bios);
				2567	atomic_set(&cache->nr_allocated_migrations, 0);
				2568	atomic_set(&cache->nr_io_migrations, 0);
				2569	init_waitqueue_head(&cache->migration_wait);
				2570
				2571	r = -ENOMEM;
				2572	atomic_set(&cache->nr_dirty, 0);
				2573	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
				2574	if (!cache->dirty_bitset) {
				2575	*error = "could not allocate dirty bitset";
				2576	goto bad;
				2577	}
				2578	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
				2579
				2580	cache->discard_block_size =
				2581	calculate_discard_block_size(cache->sectors_per_block,
				2582	cache->origin_sectors);
				2583	cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
				2584	cache->discard_block_size));
				2585	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
				2586	if (!cache->discard_bitset) {
				2587	*error = "could not allocate discard bitset";
				2588	goto bad;
				2589	}
				2590	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
				2591
				2592	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
				2593	if (IS_ERR(cache->copier)) {
				2594	*error = "could not create kcopyd client";
				2595	r = PTR_ERR(cache->copier);
				2596	goto bad;
				2597	}
				2598
				2599	cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
				2600	if (!cache->wq) {
				2601	*error = "could not create workqueue for metadata object";
				2602	goto bad;
				2603	}
				2604	INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
				2605	INIT_WORK(&cache->migration_worker, check_migrations);
				2606	INIT_DELAYED_WORK(&cache->waker, do_waker);
				2607
				2608	cache->prison = dm_bio_prison_create_v2(cache->wq);
				2609	if (!cache->prison) {
				2610	*error = "could not create bio prison";
				2611	goto bad;
				2612	}
				2613
				2614	r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
				2615	migration_cache);
				2616	if (r) {
				2617	*error = "Error creating cache's migration mempool";
				2618	goto bad;
				2619	}
				2620
				2621	cache->need_tick_bio = true;
				2622	cache->sized = false;
				2623	cache->invalidate = false;
				2624	cache->commit_requested = false;
				2625	cache->loaded_mappings = false;
				2626	cache->loaded_discards = false;
				2627
				2628	load_stats(cache);
				2629
				2630	atomic_set(&cache->stats.demotion, 0);
				2631	atomic_set(&cache->stats.promotion, 0);
				2632	atomic_set(&cache->stats.copies_avoided, 0);
				2633	atomic_set(&cache->stats.cache_cell_clash, 0);
				2634	atomic_set(&cache->stats.commit_count, 0);
				2635	atomic_set(&cache->stats.discard_count, 0);
				2636
				2637	spin_lock_init(&cache->invalidation_lock);
				2638	INIT_LIST_HEAD(&cache->invalidation_requests);
				2639
				2640	batcher_init(&cache->committer, commit_op, cache,
				2641	issue_op, cache, cache->wq);
				2642	iot_init(&cache->tracker);
				2643
				2644	init_rwsem(&cache->background_work_lock);
				2645	prevent_background_work(cache);
				2646
				2647	*result = cache;
				2648	return 0;
				2649	bad:
				2650	destroy(cache);
				2651	return r;
				2652	}
				2653
				2654	static int copy_ctr_args(struct cache cache, int argc, const char *argv)
				2655	{
				2656	unsigned i;
				2657	const char **copy;
				2658
				2659	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
				2660	if (!copy)
				2661	return -ENOMEM;
				2662	for (i = 0; i < argc; i++) {
				2663	copy[i] = kstrdup(argv[i], GFP_KERNEL);
				2664	if (!copy[i]) {
				2665	while (i--)
				2666	kfree(copy[i]);
				2667	kfree(copy);
				2668	return -ENOMEM;
				2669	}
				2670	}
				2671
				2672	cache->nr_ctr_args = argc;
				2673	cache->ctr_args = copy;
				2674
				2675	return 0;
				2676	}
				2677
				2678	static int cache_ctr(struct dm_target ti, unsigned argc, char *argv)
				2679	{
				2680	int r = -EINVAL;
				2681	struct cache_args *ca;
				2682	struct cache *cache = NULL;
				2683
				2684	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
				2685	if (!ca) {
				2686	ti->error = "Error allocating memory for cache";
				2687	return -ENOMEM;
				2688	}
				2689	ca->ti = ti;
				2690
				2691	r = parse_cache_args(ca, argc, argv, &ti->error);
				2692	if (r)
				2693	goto out;
				2694
				2695	r = cache_create(ca, &cache);
				2696	if (r)
				2697	goto out;
				2698
				2699	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
				2700	if (r) {
				2701	destroy(cache);
				2702	goto out;
				2703	}
				2704
				2705	ti->private = cache;
				2706	out:
				2707	destroy_cache_args(ca);
				2708	return r;
				2709	}
				2710
				2711	/*----------------------------------------------------------------*/
				2712
				2713	static int cache_map(struct dm_target ti, struct bio bio)
				2714	{
				2715	struct cache *cache = ti->private;
				2716
				2717	int r;
				2718	bool commit_needed;
				2719	dm_oblock_t block = get_bio_block(cache, bio);
				2720
				2721	init_per_bio_data(bio);
				2722	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
				2723	/*
				2724	* This can only occur if the io goes to a partial block at
				2725	* the end of the origin device. We don't cache these.
				2726	* Just remap to the origin and carry on.
				2727	*/
				2728	remap_to_origin(cache, bio);
				2729	accounted_begin(cache, bio);
				2730	return DM_MAPIO_REMAPPED;
				2731	}
				2732
				2733	if (discard_or_flush(bio)) {
				2734	defer_bio(cache, bio);
				2735	return DM_MAPIO_SUBMITTED;
				2736	}
				2737
				2738	r = map_bio(cache, bio, block, &commit_needed);
				2739	if (commit_needed)
				2740	schedule_commit(&cache->committer);
				2741
				2742	return r;
				2743	}
				2744
				2745	static int cache_end_io(struct dm_target ti, struct bio bio, blk_status_t *error)
				2746	{
				2747	struct cache *cache = ti->private;
				2748	unsigned long flags;
				2749	struct per_bio_data *pb = get_per_bio_data(bio);
				2750
				2751	if (pb->tick) {
				2752	policy_tick(cache->policy, false);
				2753
				2754	spin_lock_irqsave(&cache->lock, flags);
				2755	cache->need_tick_bio = true;
				2756	spin_unlock_irqrestore(&cache->lock, flags);
				2757	}
				2758
				2759	bio_drop_shared_lock(cache, bio);
				2760	accounted_complete(cache, bio);
				2761
				2762	return DM_ENDIO_DONE;
				2763	}
				2764
				2765	static int write_dirty_bitset(struct cache *cache)
				2766	{
				2767	int r;
				2768
				2769	if (get_cache_mode(cache) >= CM_READ_ONLY)
				2770	return -EINVAL;
				2771
				2772	r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
				2773	if (r)
				2774	metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
				2775
				2776	return r;
				2777	}
				2778
				2779	static int write_discard_bitset(struct cache *cache)
				2780	{
				2781	unsigned i, r;
				2782
				2783	if (get_cache_mode(cache) >= CM_READ_ONLY)
				2784	return -EINVAL;
				2785
				2786	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
				2787	cache->discard_nr_blocks);
				2788	if (r) {
				2789	DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
				2790	metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
				2791	return r;
				2792	}
				2793
				2794	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
				2795	r = dm_cache_set_discard(cache->cmd, to_dblock(i),
				2796	is_discarded(cache, to_dblock(i)));
				2797	if (r) {
				2798	metadata_operation_failed(cache, "dm_cache_set_discard", r);
				2799	return r;
				2800	}
				2801	}
				2802
				2803	return 0;
				2804	}
				2805
				2806	static int write_hints(struct cache *cache)
				2807	{
				2808	int r;
				2809
				2810	if (get_cache_mode(cache) >= CM_READ_ONLY)
				2811	return -EINVAL;
				2812
				2813	r = dm_cache_write_hints(cache->cmd, cache->policy);
				2814	if (r) {
				2815	metadata_operation_failed(cache, "dm_cache_write_hints", r);
				2816	return r;
				2817	}
				2818
				2819	return 0;
				2820	}
				2821
				2822	/*
				2823	* returns true on success
				2824	*/
				2825	static bool sync_metadata(struct cache *cache)
				2826	{
				2827	int r1, r2, r3, r4;
				2828
				2829	r1 = write_dirty_bitset(cache);
				2830	if (r1)
				2831	DMERR("%s: could not write dirty bitset", cache_device_name(cache));
				2832
				2833	r2 = write_discard_bitset(cache);
				2834	if (r2)
				2835	DMERR("%s: could not write discard bitset", cache_device_name(cache));
				2836
				2837	save_stats(cache);
				2838
				2839	r3 = write_hints(cache);
				2840	if (r3)
				2841	DMERR("%s: could not write hints", cache_device_name(cache));
				2842
				2843	/*
				2844	* If writing the above metadata failed, we still commit, but don't
				2845	* set the clean shutdown flag. This will effectively force every
				2846	* dirty bit to be set on reload.
				2847	*/
				2848	r4 = commit(cache, !r1 && !r2 && !r3);
				2849	if (r4)
				2850	DMERR("%s: could not write cache metadata", cache_device_name(cache));
				2851
				2852	return !r1 && !r2 && !r3 && !r4;
				2853	}
				2854
				2855	static void cache_postsuspend(struct dm_target *ti)
				2856	{
				2857	struct cache *cache = ti->private;
				2858
				2859	prevent_background_work(cache);
				2860	BUG_ON(atomic_read(&cache->nr_io_migrations));
				2861
				2862	cancel_delayed_work(&cache->waker);
				2863	flush_workqueue(cache->wq);
				2864	WARN_ON(cache->tracker.in_flight);
				2865
				2866	/*
				2867	* If it's a flush suspend there won't be any deferred bios, so this
				2868	* call is harmless.
				2869	*/
				2870	requeue_deferred_bios(cache);
				2871
				2872	if (get_cache_mode(cache) == CM_WRITE)
				2873	(void) sync_metadata(cache);
				2874	}
				2875
				2876	static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
				2877	bool dirty, uint32_t hint, bool hint_valid)
				2878	{
				2879	int r;
				2880	struct cache *cache = context;
				2881
				2882	if (dirty) {
				2883	set_bit(from_cblock(cblock), cache->dirty_bitset);
				2884	atomic_inc(&cache->nr_dirty);
				2885	} else
				2886	clear_bit(from_cblock(cblock), cache->dirty_bitset);
				2887
				2888	r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
				2889	if (r)
				2890	return r;
				2891
				2892	return 0;
				2893	}
				2894
				2895	/*
				2896	* The discard block size in the on disk metadata is not
				2897	* neccessarily the same as we're currently using. So we have to
				2898	* be careful to only set the discarded attribute if we know it
				2899	* covers a complete block of the new size.
				2900	*/
				2901	struct discard_load_info {
				2902	struct cache *cache;
				2903
				2904	/*
				2905	* These blocks are sized using the on disk dblock size, rather
				2906	* than the current one.
				2907	*/
				2908	dm_block_t block_size;
				2909	dm_block_t discard_begin, discard_end;
				2910	};
				2911
				2912	static void discard_load_info_init(struct cache *cache,
				2913	struct discard_load_info *li)
				2914	{
				2915	li->cache = cache;
				2916	li->discard_begin = li->discard_end = 0;
				2917	}
				2918
				2919	static void set_discard_range(struct discard_load_info *li)
				2920	{
				2921	sector_t b, e;
				2922
				2923	if (li->discard_begin == li->discard_end)
				2924	return;
				2925
				2926	/*
				2927	* Convert to sectors.
				2928	*/
				2929	b = li->discard_begin * li->block_size;
				2930	e = li->discard_end * li->block_size;
				2931
				2932	/*
				2933	* Then convert back to the current dblock size.
				2934	*/
				2935	b = dm_sector_div_up(b, li->cache->discard_block_size);
				2936	sector_div(e, li->cache->discard_block_size);
				2937
				2938	/*
				2939	* The origin may have shrunk, so we need to check we're still in
				2940	* bounds.
				2941	*/
				2942	if (e > from_dblock(li->cache->discard_nr_blocks))
				2943	e = from_dblock(li->cache->discard_nr_blocks);
				2944
				2945	for (; b < e; b++)
				2946	set_discard(li->cache, to_dblock(b));
				2947	}
				2948
				2949	static int load_discard(void *context, sector_t discard_block_size,
				2950	dm_dblock_t dblock, bool discard)
				2951	{
				2952	struct discard_load_info *li = context;
				2953
				2954	li->block_size = discard_block_size;
				2955
				2956	if (discard) {
				2957	if (from_dblock(dblock) == li->discard_end)
				2958	/*
				2959	* We're already in a discard range, just extend it.
				2960	*/
				2961	li->discard_end = li->discard_end + 1ULL;
				2962
				2963	else {
				2964	/*
				2965	* Emit the old range and start a new one.
				2966	*/
				2967	set_discard_range(li);
				2968	li->discard_begin = from_dblock(dblock);
				2969	li->discard_end = li->discard_begin + 1ULL;
				2970	}
				2971	} else {
				2972	set_discard_range(li);
				2973	li->discard_begin = li->discard_end = 0;
				2974	}
				2975
				2976	return 0;
				2977	}
				2978
				2979	static dm_cblock_t get_cache_dev_size(struct cache *cache)
				2980	{
				2981	sector_t size = get_dev_size(cache->cache_dev);
				2982	(void) sector_div(size, cache->sectors_per_block);
				2983	return to_cblock(size);
				2984	}
				2985
				2986	static bool can_resize(struct cache *cache, dm_cblock_t new_size)
				2987	{
				2988	if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
				2989	if (cache->sized) {
				2990	DMERR("%s: unable to extend cache due to missing cache table reload",
				2991	cache_device_name(cache));
				2992	return false;
				2993	}
				2994	}
				2995
				2996	/*
				2997	* We can't drop a dirty block when shrinking the cache.
				2998	*/
				2999	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
				3000	new_size = to_cblock(from_cblock(new_size) + 1);
				3001	if (is_dirty(cache, new_size)) {
				3002	DMERR("%s: unable to shrink cache; cache block %llu is dirty",
				3003	cache_device_name(cache),
				3004	(unsigned long long) from_cblock(new_size));
				3005	return false;
				3006	}
				3007	}
				3008
				3009	return true;
				3010	}
				3011
				3012	static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
				3013	{
				3014	int r;
				3015
				3016	r = dm_cache_resize(cache->cmd, new_size);
				3017	if (r) {
				3018	DMERR("%s: could not resize cache metadata", cache_device_name(cache));
				3019	metadata_operation_failed(cache, "dm_cache_resize", r);
				3020	return r;
				3021	}
				3022
				3023	set_cache_size(cache, new_size);
				3024
				3025	return 0;
				3026	}
				3027
				3028	static int cache_preresume(struct dm_target *ti)
				3029	{
				3030	int r = 0;
				3031	struct cache *cache = ti->private;
				3032	dm_cblock_t csize = get_cache_dev_size(cache);
				3033
				3034	/*
				3035	* Check to see if the cache has resized.
				3036	*/
				3037	if (!cache->sized) {
				3038	r = resize_cache_dev(cache, csize);
				3039	if (r)
				3040	return r;
				3041
				3042	cache->sized = true;
				3043
				3044	} else if (csize != cache->cache_size) {
				3045	if (!can_resize(cache, csize))
				3046	return -EINVAL;
				3047
				3048	r = resize_cache_dev(cache, csize);
				3049	if (r)
				3050	return r;
				3051	}
				3052
				3053	if (!cache->loaded_mappings) {
				3054	r = dm_cache_load_mappings(cache->cmd, cache->policy,
				3055	load_mapping, cache);
				3056	if (r) {
				3057	DMERR("%s: could not load cache mappings", cache_device_name(cache));
				3058	metadata_operation_failed(cache, "dm_cache_load_mappings", r);
				3059	return r;
				3060	}
				3061
				3062	cache->loaded_mappings = true;
				3063	}
				3064
				3065	if (!cache->loaded_discards) {
				3066	struct discard_load_info li;
				3067
				3068	/*
				3069	* The discard bitset could have been resized, or the
				3070	* discard block size changed. To be safe we start by
				3071	* setting every dblock to not discarded.
				3072	*/
				3073	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
				3074
				3075	discard_load_info_init(cache, &li);
				3076	r = dm_cache_load_discards(cache->cmd, load_discard, &li);
				3077	if (r) {
				3078	DMERR("%s: could not load origin discards", cache_device_name(cache));
				3079	metadata_operation_failed(cache, "dm_cache_load_discards", r);
				3080	return r;
				3081	}
				3082	set_discard_range(&li);
				3083
				3084	cache->loaded_discards = true;
				3085	}
				3086
				3087	return r;
				3088	}
				3089
				3090	static void cache_resume(struct dm_target *ti)
				3091	{
				3092	struct cache *cache = ti->private;
				3093
				3094	cache->need_tick_bio = true;
				3095	allow_background_work(cache);
				3096	do_waker(&cache->waker.work);
				3097	}
				3098
				3099	/*
				3100	* Status format:
				3101	*
				3102	* <metadata block size> <#used metadata blocks>/<#total metadata blocks>
				3103	* <cache block size> <#used cache blocks>/<#total cache blocks>
				3104	* <#read hits> <#read misses> <#write hits> <#write misses>
				3105	* <#demotions> <#promotions> <#dirty>
				3106	* <#features> <features>*
				3107	* <#core args> <core args>
				3108	* <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
				3109	*/
				3110	static void cache_status(struct dm_target *ti, status_type_t type,
				3111	unsigned status_flags, char *result, unsigned maxlen)
				3112	{
				3113	int r = 0;
				3114	unsigned i;
				3115	ssize_t sz = 0;
				3116	dm_block_t nr_free_blocks_metadata = 0;
				3117	dm_block_t nr_blocks_metadata = 0;
				3118	char buf[BDEVNAME_SIZE];
				3119	struct cache *cache = ti->private;
				3120	dm_cblock_t residency;
				3121	bool needs_check;
				3122
				3123	switch (type) {
				3124	case STATUSTYPE_INFO:
				3125	if (get_cache_mode(cache) == CM_FAIL) {
				3126	DMEMIT("Fail");
				3127	break;
				3128	}
				3129
				3130	/* Commit to ensure statistics aren't out-of-date */
				3131	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
				3132	(void) commit(cache, false);
				3133
				3134	r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
				3135	if (r) {
				3136	DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
				3137	cache_device_name(cache), r);
				3138	goto err;
				3139	}
				3140
				3141	r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
				3142	if (r) {
				3143	DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
				3144	cache_device_name(cache), r);
				3145	goto err;
				3146	}
				3147
				3148	residency = policy_residency(cache->policy);
				3149
				3150	DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
				3151	(unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
				3152	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
				3153	(unsigned long long)nr_blocks_metadata,
				3154	(unsigned long long)cache->sectors_per_block,
				3155	(unsigned long long) from_cblock(residency),
				3156	(unsigned long long) from_cblock(cache->cache_size),
				3157	(unsigned) atomic_read(&cache->stats.read_hit),
				3158	(unsigned) atomic_read(&cache->stats.read_miss),
				3159	(unsigned) atomic_read(&cache->stats.write_hit),
				3160	(unsigned) atomic_read(&cache->stats.write_miss),
				3161	(unsigned) atomic_read(&cache->stats.demotion),
				3162	(unsigned) atomic_read(&cache->stats.promotion),
				3163	(unsigned long) atomic_read(&cache->nr_dirty));
				3164
				3165	if (cache->features.metadata_version == 2)
				3166	DMEMIT("2 metadata2 ");
				3167	else
				3168	DMEMIT("1 ");
				3169
				3170	if (writethrough_mode(cache))
				3171	DMEMIT("writethrough ");
				3172
				3173	else if (passthrough_mode(cache))
				3174	DMEMIT("passthrough ");
				3175
				3176	else if (writeback_mode(cache))
				3177	DMEMIT("writeback ");
				3178
				3179	else {
				3180	DMERR("%s: internal error: unknown io mode: %d",
				3181	cache_device_name(cache), (int) cache->features.io_mode);
				3182	goto err;
				3183	}
				3184
				3185	DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
				3186
				3187	DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
				3188	if (sz < maxlen) {
				3189	r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
				3190	if (r)
				3191	DMERR("%s: policy_emit_config_values returned %d",
				3192	cache_device_name(cache), r);
				3193	}
				3194
				3195	if (get_cache_mode(cache) == CM_READ_ONLY)
				3196	DMEMIT("ro ");
				3197	else
				3198	DMEMIT("rw ");
				3199
				3200	r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
				3201
				3202	if (r \|\| needs_check)
				3203	DMEMIT("needs_check ");
				3204	else
				3205	DMEMIT("- ");
				3206
				3207	break;
				3208
				3209	case STATUSTYPE_TABLE:
				3210	format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
				3211	DMEMIT("%s ", buf);
				3212	format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
				3213	DMEMIT("%s ", buf);
				3214	format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
				3215	DMEMIT("%s", buf);
				3216
				3217	for (i = 0; i < cache->nr_ctr_args - 1; i++)
				3218	DMEMIT(" %s", cache->ctr_args[i]);
				3219	if (cache->nr_ctr_args)
				3220	DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
				3221	}
				3222
				3223	return;
				3224
				3225	err:
				3226	DMEMIT("Error");
				3227	}
				3228
				3229	/*
				3230	* Defines a range of cblocks, begin to (end - 1) are in the range. end is
				3231	* the one-past-the-end value.
				3232	*/
				3233	struct cblock_range {
				3234	dm_cblock_t begin;
				3235	dm_cblock_t end;
				3236	};
				3237
				3238	/*
				3239	* A cache block range can take two forms:
				3240	*
				3241	* i) A single cblock, eg. '3456'
				3242	* ii) A begin and end cblock with a dash between, eg. 123-234
				3243	*/
				3244	static int parse_cblock_range(struct cache cache, const char str,
				3245	struct cblock_range *result)
				3246	{
				3247	char dummy;
				3248	uint64_t b, e;
				3249	int r;
				3250
				3251	/*
				3252	* Try and parse form (ii) first.
				3253	*/
				3254	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
				3255	if (r < 0)
				3256	return r;
				3257
				3258	if (r == 2) {
				3259	result->begin = to_cblock(b);
				3260	result->end = to_cblock(e);
				3261	return 0;
				3262	}
				3263
				3264	/*
				3265	* That didn't work, try form (i).
				3266	*/
				3267	r = sscanf(str, "%llu%c", &b, &dummy);
				3268	if (r < 0)
				3269	return r;
				3270
				3271	if (r == 1) {
				3272	result->begin = to_cblock(b);
				3273	result->end = to_cblock(from_cblock(result->begin) + 1u);
				3274	return 0;
				3275	}
				3276
				3277	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
				3278	return -EINVAL;
				3279	}
				3280
				3281	static int validate_cblock_range(struct cache cache, struct cblock_range range)
				3282	{
				3283	uint64_t b = from_cblock(range->begin);
				3284	uint64_t e = from_cblock(range->end);
				3285	uint64_t n = from_cblock(cache->cache_size);
				3286
				3287	if (b >= n) {
				3288	DMERR("%s: begin cblock out of range: %llu >= %llu",
				3289	cache_device_name(cache), b, n);
				3290	return -EINVAL;
				3291	}
				3292
				3293	if (e > n) {
				3294	DMERR("%s: end cblock out of range: %llu > %llu",
				3295	cache_device_name(cache), e, n);
				3296	return -EINVAL;
				3297	}
				3298
				3299	if (b >= e) {
				3300	DMERR("%s: invalid cblock range: %llu >= %llu",
				3301	cache_device_name(cache), b, e);
				3302	return -EINVAL;
				3303	}
				3304
				3305	return 0;
				3306	}
				3307
				3308	static inline dm_cblock_t cblock_succ(dm_cblock_t b)
				3309	{
				3310	return to_cblock(from_cblock(b) + 1);
				3311	}
				3312
				3313	static int request_invalidation(struct cache cache, struct cblock_range range)
				3314	{
				3315	int r = 0;
				3316
				3317	/*
				3318	* We don't need to do any locking here because we know we're in
				3319	* passthrough mode. There's is potential for a race between an
				3320	* invalidation triggered by an io and an invalidation message. This
				3321	* is harmless, we must not worry if the policy call fails.
				3322	*/
				3323	while (range->begin != range->end) {
				3324	r = invalidate_cblock(cache, range->begin);
				3325	if (r)
				3326	return r;
				3327
				3328	range->begin = cblock_succ(range->begin);
				3329	}
				3330
				3331	cache->commit_requested = true;
				3332	return r;
				3333	}
				3334
				3335	static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
				3336	const char **cblock_ranges)
				3337	{
				3338	int r = 0;
				3339	unsigned i;
				3340	struct cblock_range range;
				3341
				3342	if (!passthrough_mode(cache)) {
				3343	DMERR("%s: cache has to be in passthrough mode for invalidation",
				3344	cache_device_name(cache));
				3345	return -EPERM;
				3346	}
				3347
				3348	for (i = 0; i < count; i++) {
				3349	r = parse_cblock_range(cache, cblock_ranges[i], &range);
				3350	if (r)
				3351	break;
				3352
				3353	r = validate_cblock_range(cache, &range);
				3354	if (r)
				3355	break;
				3356
				3357	/*
				3358	* Pass begin and end origin blocks to the worker and wake it.
				3359	*/
				3360	r = request_invalidation(cache, &range);
				3361	if (r)
				3362	break;
				3363	}
				3364
				3365	return r;
				3366	}
				3367
				3368	/*
				3369	* Supports
				3370	* "<key> <value>"
				3371	* and
				3372	* "invalidate_cblocks [(<begin>)\|(<begin>-<end>)]*
				3373	*
				3374	* The key migration_threshold is supported by the cache target core.
				3375	*/
				3376	static int cache_message(struct dm_target ti, unsigned argc, char *argv,
				3377	char *result, unsigned maxlen)
				3378	{
				3379	struct cache *cache = ti->private;
				3380
				3381	if (!argc)
				3382	return -EINVAL;
				3383
				3384	if (get_cache_mode(cache) >= CM_READ_ONLY) {
				3385	DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
				3386	cache_device_name(cache));
				3387	return -EOPNOTSUPP;
				3388	}
				3389
				3390	if (!strcasecmp(argv[0], "invalidate_cblocks"))
				3391	return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
				3392
				3393	if (argc != 2)
				3394	return -EINVAL;
				3395
				3396	return set_config_value(cache, argv[0], argv[1]);
				3397	}
				3398
				3399	static int cache_iterate_devices(struct dm_target *ti,
				3400	iterate_devices_callout_fn fn, void *data)
				3401	{
				3402	int r = 0;
				3403	struct cache *cache = ti->private;
				3404
				3405	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
				3406	if (!r)
				3407	r = fn(ti, cache->origin_dev, 0, ti->len, data);
				3408
				3409	return r;
				3410	}
				3411
				3412	static void set_discard_limits(struct cache cache, struct queue_limits limits)
				3413	{
				3414	/*
				3415	* FIXME: these limits may be incompatible with the cache device
				3416	*/
				3417	limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
				3418	cache->origin_sectors);
				3419	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
				3420	}
				3421
				3422	static void cache_io_hints(struct dm_target ti, struct queue_limits limits)
				3423	{
				3424	struct cache *cache = ti->private;
				3425	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
				3426
				3427	/*
				3428	* If the system-determined stacked limits are compatible with the
				3429	* cache's blocksize (io_opt is a factor) do not override them.
				3430	*/
				3431	if (io_opt_sectors < cache->sectors_per_block \|\|
				3432	do_div(io_opt_sectors, cache->sectors_per_block)) {
				3433	blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
				3434	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
				3435	}
				3436	set_discard_limits(cache, limits);
				3437	}
				3438
/*----------------------------------------------------------------*/
				3440
				3441	static struct target_type cache_target = {
				3442	.name = "cache",
				3443	.version = {2, 0, 0},
				3444	.module = THIS_MODULE,
				3445	.ctr = cache_ctr,
				3446	.dtr = cache_dtr,
				3447	.map = cache_map,
				3448	.end_io = cache_end_io,
				3449	.postsuspend = cache_postsuspend,
				3450	.preresume = cache_preresume,
				3451	.resume = cache_resume,
				3452	.status = cache_status,
				3453	.message = cache_message,
				3454	.iterate_devices = cache_iterate_devices,
				3455	.io_hints = cache_io_hints,
				3456	};
				3457
				3458	static int __init dm_cache_init(void)
				3459	{
				3460	int r;
				3461
				3462	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
				3463	if (!migration_cache)
				3464	return -ENOMEM;
				3465
				3466	r = dm_register_target(&cache_target);
				3467	if (r) {
				3468	DMERR("cache target registration failed: %d", r);
				3469	kmem_cache_destroy(migration_cache);
				3470	return r;
				3471	}
				3472
				3473	return 0;
				3474	}
				3475
				3476	static void __exit dm_cache_exit(void)
				3477	{
				3478	dm_unregister_target(&cache_target);
				3479	kmem_cache_destroy(migration_cache);
				3480	}
				3481
				3482	module_init(dm_cache_init);
				3483	module_exit(dm_cache_exit);
				3484
				3485	MODULE_DESCRIPTION(DM_NAME " cache target");
				3486	MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
				3487	MODULE_LICENSE("GPL");