Blame - src/kernel/linux/v4.14/drivers/md/dm-thin.c - T103

blob: 0ee5eae716909ae945cf4bf5f62fc79302b52989 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2011-2012 Red Hat UK.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include "dm-thin-metadata.h"
				8	#include "dm-bio-prison-v1.h"
				9	#include "dm.h"
				10
				11	#include <linux/device-mapper.h>
				12	#include <linux/dm-io.h>
				13	#include <linux/dm-kcopyd.h>
				14	#include <linux/jiffies.h>
				15	#include <linux/log2.h>
				16	#include <linux/list.h>
				17	#include <linux/rculist.h>
				18	#include <linux/init.h>
				19	#include <linux/module.h>
				20	#include <linux/slab.h>
				21	#include <linux/vmalloc.h>
				22	#include <linux/sort.h>
				23	#include <linux/rbtree.h>
				24
				25	#define DM_MSG_PREFIX "thin"
				26
				27	/*
				28	* Tunable constants
				29	*/
				30	#define ENDIO_HOOK_POOL_SIZE 1024
				31	#define MAPPING_POOL_SIZE 1024
				32	#define COMMIT_PERIOD HZ
				33	#define NO_SPACE_TIMEOUT_SECS 60
				34
				35	static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
				36
				37	DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
				38	"A percentage of time allocated for copy on write");
				39
				40	/*
				41	* The block size of the device holding pool data must be
				42	* between 64KB and 1GB.
				43	*/
				44	#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
				45	#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
				46
				47	/*
				48	* Device id is restricted to 24 bits.
				49	*/
				50	#define MAX_DEV_ID ((1 << 24) - 1)
				51
				52	/*
				53	* How do we handle breaking sharing of data blocks?
				54	* =================================================
				55	*
				56	* We use a standard copy-on-write btree to store the mappings for the
				57	* devices (note I'm talking about copy-on-write of the metadata here, not
				58	* the data). When you take an internal snapshot you clone the root node
				59	* of the origin btree. After this there is no concept of an origin or a
				60	* snapshot. They are just two device trees that happen to point to the
				61	* same data blocks.
				62	*
				63	* When we get a write in we decide if it's to a shared data block using
				64	* some timestamp magic. If it is, we have to break sharing.
				65	*
				66	* Let's say we write to a shared block in what was the origin. The
				67	* steps are:
				68	*
				69	* i) plug io further to this physical block. (see bio_prison code).
				70	*
				71	* ii) quiesce any read io to that shared data block. Obviously
				72	* including all devices that share this block. (see dm_deferred_set code)
				73	*
				74	* iii) copy the data block to a newly allocated block. This step can be
				75	* missed out if the io covers the block. (schedule_copy).
				76	*
				77	* iv) insert the new mapping into the origin's btree
				78	* (process_prepared_mapping). This act of inserting breaks some
				79	* sharing of btree nodes between the two devices. Breaking sharing only
				80	* affects the btree of that specific device. Btrees for the other
				81	* devices that share the block never change. The btree for the origin
				82	* device as it was after the last commit is untouched, ie. we're using
				83	* persistent data structures in the functional programming sense.
				84	*
				85	* v) unplug io to this physical block, including the io that triggered
				86	* the breaking of sharing.
				87	*
				88	* Steps (ii) and (iii) occur in parallel.
				89	*
				90	* The metadata _doesn't_ need to be committed before the io continues. We
				91	* get away with this because the io is always written to a _new_ block.
				92	* If there's a crash, then:
				93	*
				94	* - The origin mapping will point to the old origin block (the shared
				95	* one). This will contain the data as it was before the io that triggered
				96	* the breaking of sharing came in.
				97	*
				98	* - The snap mapping still points to the old block. As it would after
				99	* the commit.
				100	*
				101	* The downside of this scheme is the timestamp magic isn't perfect, and
				102	* will continue to think that data block in the snapshot device is shared
				103	* even after the write to the origin has broken sharing. I suspect data
				104	* blocks will typically be shared by many different devices, so we're
				105	* breaking sharing n + 1 times, rather than n, where n is the number of
				106	* devices that reference this data block. At the moment I think the
				107	* benefits far, far outweigh the disadvantages.
				108	*/
				109
				110	/*----------------------------------------------------------------*/
				111
				112	/*
				113	* Key building.
				114	*/
				115	enum lock_space {
				116	VIRTUAL,
				117	PHYSICAL
				118	};
				119
				120	static void build_key(struct dm_thin_device *td, enum lock_space ls,
				121	dm_block_t b, dm_block_t e, struct dm_cell_key *key)
				122	{
				123	key->virtual = (ls == VIRTUAL);
				124	key->dev = dm_thin_dev_id(td);
				125	key->block_begin = b;
				126	key->block_end = e;
				127	}
				128
				129	static void build_data_key(struct dm_thin_device *td, dm_block_t b,
				130	struct dm_cell_key *key)
				131	{
				132	build_key(td, PHYSICAL, b, b + 1llu, key);
				133	}
				134
				135	static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
				136	struct dm_cell_key *key)
				137	{
				138	build_key(td, VIRTUAL, b, b + 1llu, key);
				139	}
				140
				141	/*----------------------------------------------------------------*/
				142
				143	#define THROTTLE_THRESHOLD (1 * HZ)
				144
				145	struct throttle {
				146	struct rw_semaphore lock;
				147	unsigned long threshold;
				148	bool throttle_applied;
				149	};
				150
				151	static void throttle_init(struct throttle *t)
				152	{
				153	init_rwsem(&t->lock);
				154	t->throttle_applied = false;
				155	}
				156
				157	static void throttle_work_start(struct throttle *t)
				158	{
				159	t->threshold = jiffies + THROTTLE_THRESHOLD;
				160	}
				161
				162	static void throttle_work_update(struct throttle *t)
				163	{
				164	if (!t->throttle_applied && jiffies > t->threshold) {
				165	down_write(&t->lock);
				166	t->throttle_applied = true;
				167	}
				168	}
				169
				170	static void throttle_work_complete(struct throttle *t)
				171	{
				172	if (t->throttle_applied) {
				173	t->throttle_applied = false;
				174	up_write(&t->lock);
				175	}
				176	}
				177
				178	static void throttle_lock(struct throttle *t)
				179	{
				180	down_read(&t->lock);
				181	}
				182
				183	static void throttle_unlock(struct throttle *t)
				184	{
				185	up_read(&t->lock);
				186	}
				187
				188	/*----------------------------------------------------------------*/
				189
				190	/*
				191	* A pool device ties together a metadata device and a data device. It
				192	* also provides the interface for creating and destroying internal
				193	* devices.
				194	*/
				195	struct dm_thin_new_mapping;
				196
				197	/*
				198	* The pool runs in various modes. Ordered in degraded order for comparisons.
				199	*/
				200	enum pool_mode {
				201	PM_WRITE, /* metadata may be changed */
				202	PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
				203
				204	/*
				205	* Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
				206	*/
				207	PM_OUT_OF_METADATA_SPACE,
				208	PM_READ_ONLY, /* metadata may not be changed */
				209
				210	PM_FAIL, /* all I/O fails */
				211	};
				212
				213	struct pool_features {
				214	enum pool_mode mode;
				215
				216	bool zero_new_blocks:1;
				217	bool discard_enabled:1;
				218	bool discard_passdown:1;
				219	bool error_if_no_space:1;
				220	};
				221
				222	struct thin_c;
				223	typedef void (process_bio_fn)(struct thin_c tc, struct bio *bio);
				224	typedef void (process_cell_fn)(struct thin_c tc, struct dm_bio_prison_cell *cell);
				225	typedef void (process_mapping_fn)(struct dm_thin_new_mapping m);
				226
				227	#define CELL_SORT_ARRAY_SIZE 8192
				228
				229	struct pool {
				230	struct list_head list;
				231	struct dm_target ti; / Only set if a pool target is bound */
				232
				233	struct mapped_device *pool_md;
				234	struct block_device *md_dev;
				235	struct dm_pool_metadata *pmd;
				236
				237	dm_block_t low_water_blocks;
				238	uint32_t sectors_per_block;
				239	int sectors_per_block_shift;
				240
				241	struct pool_features pf;
				242	bool low_water_triggered:1; /* A dm event has been sent */
				243	bool suspended:1;
				244	bool out_of_data_space:1;
				245
				246	struct dm_bio_prison *prison;
				247	struct dm_kcopyd_client *copier;
				248
				249	struct workqueue_struct *wq;
				250	struct throttle throttle;
				251	struct work_struct worker;
				252	struct delayed_work waker;
				253	struct delayed_work no_space_timeout;
				254
				255	unsigned long last_commit_jiffies;
				256	unsigned ref_count;
				257
				258	spinlock_t lock;
				259	struct bio_list deferred_flush_bios;
				260	struct bio_list deferred_flush_completions;
				261	struct list_head prepared_mappings;
				262	struct list_head prepared_discards;
				263	struct list_head prepared_discards_pt2;
				264	struct list_head active_thins;
				265
				266	struct dm_deferred_set *shared_read_ds;
				267	struct dm_deferred_set *all_io_ds;
				268
				269	struct dm_thin_new_mapping *next_mapping;
				270	mempool_t *mapping_pool;
				271
				272	process_bio_fn process_bio;
				273	process_bio_fn process_discard;
				274
				275	process_cell_fn process_cell;
				276	process_cell_fn process_discard_cell;
				277
				278	process_mapping_fn process_prepared_mapping;
				279	process_mapping_fn process_prepared_discard;
				280	process_mapping_fn process_prepared_discard_pt2;
				281
				282	struct dm_bio_prison_cell **cell_sort_array;
				283	};
				284
				285	static void metadata_operation_failed(struct pool pool, const char op, int r);
				286
				287	static enum pool_mode get_pool_mode(struct pool *pool)
				288	{
				289	return pool->pf.mode;
				290	}
				291
				292	static void notify_of_pool_mode_change(struct pool *pool)
				293	{
				294	const char *descs[] = {
				295	"write",
				296	"out-of-data-space",
				297	"read-only",
				298	"read-only",
				299	"fail"
				300	};
				301	const char *extra_desc = NULL;
				302	enum pool_mode mode = get_pool_mode(pool);
				303
				304	if (mode == PM_OUT_OF_DATA_SPACE) {
				305	if (!pool->pf.error_if_no_space)
				306	extra_desc = " (queue IO)";
				307	else
				308	extra_desc = " (error IO)";
				309	}
				310
				311	dm_table_event(pool->ti->table);
				312	DMINFO("%s: switching pool to %s%s mode",
				313	dm_device_name(pool->pool_md),
				314	descs[(int)mode], extra_desc ? : "");
				315	}
				316
				317	/*
				318	* Target context for a pool.
				319	*/
				320	struct pool_c {
				321	struct dm_target *ti;
				322	struct pool *pool;
				323	struct dm_dev *data_dev;
				324	struct dm_dev *metadata_dev;
				325	struct dm_target_callbacks callbacks;
				326
				327	dm_block_t low_water_blocks;
				328	struct pool_features requested_pf; /* Features requested during table load */
				329	struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
				330	};
				331
				332	/*
				333	* Target context for a thin.
				334	*/
				335	struct thin_c {
				336	struct list_head list;
				337	struct dm_dev *pool_dev;
				338	struct dm_dev *origin_dev;
				339	sector_t origin_size;
				340	dm_thin_id dev_id;
				341
				342	struct pool *pool;
				343	struct dm_thin_device *td;
				344	struct mapped_device *thin_md;
				345
				346	bool requeue_mode:1;
				347	spinlock_t lock;
				348	struct list_head deferred_cells;
				349	struct bio_list deferred_bio_list;
				350	struct bio_list retry_on_resume_list;
				351	struct rb_root sort_bio_list; /* sorted list of deferred bios */
				352
				353	/*
				354	* Ensures the thin is not destroyed until the worker has finished
				355	* iterating the active_thins list.
				356	*/
				357	atomic_t refcount;
				358	struct completion can_destroy;
				359	};
				360
				361	/*----------------------------------------------------------------*/
				362
				363	static bool block_size_is_power_of_two(struct pool *pool)
				364	{
				365	return pool->sectors_per_block_shift >= 0;
				366	}
				367
				368	static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
				369	{
				370	return block_size_is_power_of_two(pool) ?
				371	(b << pool->sectors_per_block_shift) :
				372	(b * pool->sectors_per_block);
				373	}
				374
				375	/*----------------------------------------------------------------*/
				376
				377	struct discard_op {
				378	struct thin_c *tc;
				379	struct blk_plug plug;
				380	struct bio *parent_bio;
				381	struct bio *bio;
				382	};
				383
				384	static void begin_discard(struct discard_op op, struct thin_c tc, struct bio *parent)
				385	{
				386	BUG_ON(!parent);
				387
				388	op->tc = tc;
				389	blk_start_plug(&op->plug);
				390	op->parent_bio = parent;
				391	op->bio = NULL;
				392	}
				393
				394	static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
				395	{
				396	struct thin_c *tc = op->tc;
				397	sector_t s = block_to_sectors(tc->pool, data_b);
				398	sector_t len = block_to_sectors(tc->pool, data_e - data_b);
				399
				400	return __blkdev_issue_discard(tc->pool_dev->bdev, s, len,
				401	GFP_NOWAIT, 0, &op->bio);
				402	}
				403
				404	static void end_discard(struct discard_op *op, int r)
				405	{
				406	if (op->bio) {
				407	/*
				408	* Even if one of the calls to issue_discard failed, we
				409	* need to wait for the chain to complete.
				410	*/
				411	bio_chain(op->bio, op->parent_bio);
				412	bio_set_op_attrs(op->bio, REQ_OP_DISCARD, 0);
				413	submit_bio(op->bio);
				414	}
				415
				416	blk_finish_plug(&op->plug);
				417
				418	/*
				419	* Even if r is set, there could be sub discards in flight that we
				420	* need to wait for.
				421	*/
				422	if (r && !op->parent_bio->bi_status)
				423	op->parent_bio->bi_status = errno_to_blk_status(r);
				424	bio_endio(op->parent_bio);
				425	}
				426
				427	/*----------------------------------------------------------------*/
				428
				429	/*
				430	* wake_worker() is used when new work is queued and when pool_resume is
				431	* ready to continue deferred IO processing.
				432	*/
				433	static void wake_worker(struct pool *pool)
				434	{
				435	queue_work(pool->wq, &pool->worker);
				436	}
				437
				438	/*----------------------------------------------------------------*/
				439
				440	static int bio_detain(struct pool pool, struct dm_cell_key key, struct bio *bio,
				441	struct dm_bio_prison_cell **cell_result)
				442	{
				443	int r;
				444	struct dm_bio_prison_cell *cell_prealloc;
				445
				446	/*
				447	* Allocate a cell from the prison's mempool.
				448	* This might block but it can't fail.
				449	*/
				450	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
				451
				452	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
				453	if (r)
				454	/*
				455	* We reused an old cell; we can get rid of
				456	* the new one.
				457	*/
				458	dm_bio_prison_free_cell(pool->prison, cell_prealloc);
				459
				460	return r;
				461	}
				462
				463	static void cell_release(struct pool *pool,
				464	struct dm_bio_prison_cell *cell,
				465	struct bio_list *bios)
				466	{
				467	dm_cell_release(pool->prison, cell, bios);
				468	dm_bio_prison_free_cell(pool->prison, cell);
				469	}
				470
				471	static void cell_visit_release(struct pool *pool,
				472	void (fn)(void , struct dm_bio_prison_cell *),
				473	void *context,
				474	struct dm_bio_prison_cell *cell)
				475	{
				476	dm_cell_visit_release(pool->prison, fn, context, cell);
				477	dm_bio_prison_free_cell(pool->prison, cell);
				478	}
				479
				480	static void cell_release_no_holder(struct pool *pool,
				481	struct dm_bio_prison_cell *cell,
				482	struct bio_list *bios)
				483	{
				484	dm_cell_release_no_holder(pool->prison, cell, bios);
				485	dm_bio_prison_free_cell(pool->prison, cell);
				486	}
				487
				488	static void cell_error_with_code(struct pool *pool,
				489	struct dm_bio_prison_cell *cell, blk_status_t error_code)
				490	{
				491	dm_cell_error(pool->prison, cell, error_code);
				492	dm_bio_prison_free_cell(pool->prison, cell);
				493	}
				494
				495	static blk_status_t get_pool_io_error_code(struct pool *pool)
				496	{
				497	return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
				498	}
				499
				500	static void cell_error(struct pool pool, struct dm_bio_prison_cell cell)
				501	{
				502	cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
				503	}
				504
				505	static void cell_success(struct pool pool, struct dm_bio_prison_cell cell)
				506	{
				507	cell_error_with_code(pool, cell, 0);
				508	}
				509
				510	static void cell_requeue(struct pool pool, struct dm_bio_prison_cell cell)
				511	{
				512	cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
				513	}
				514
				515	/*----------------------------------------------------------------*/
				516
				517	/*
				518	* A global list of pools that uses a struct mapped_device as a key.
				519	*/
				520	static struct dm_thin_pool_table {
				521	struct mutex mutex;
				522	struct list_head pools;
				523	} dm_thin_pool_table;
				524
				525	static void pool_table_init(void)
				526	{
				527	mutex_init(&dm_thin_pool_table.mutex);
				528	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
				529	}
				530
				531	static void __pool_table_insert(struct pool *pool)
				532	{
				533	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				534	list_add(&pool->list, &dm_thin_pool_table.pools);
				535	}
				536
				537	static void __pool_table_remove(struct pool *pool)
				538	{
				539	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				540	list_del(&pool->list);
				541	}
				542
				543	static struct pool __pool_table_lookup(struct mapped_device md)
				544	{
				545	struct pool pool = NULL, tmp;
				546
				547	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				548
				549	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
				550	if (tmp->pool_md == md) {
				551	pool = tmp;
				552	break;
				553	}
				554	}
				555
				556	return pool;
				557	}
				558
				559	static struct pool __pool_table_lookup_metadata_dev(struct block_device md_dev)
				560	{
				561	struct pool pool = NULL, tmp;
				562
				563	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				564
				565	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
				566	if (tmp->md_dev == md_dev) {
				567	pool = tmp;
				568	break;
				569	}
				570	}
				571
				572	return pool;
				573	}
				574
				575	/*----------------------------------------------------------------*/
				576
				577	struct dm_thin_endio_hook {
				578	struct thin_c *tc;
				579	struct dm_deferred_entry *shared_read_entry;
				580	struct dm_deferred_entry *all_io_entry;
				581	struct dm_thin_new_mapping *overwrite_mapping;
				582	struct rb_node rb_node;
				583	struct dm_bio_prison_cell *cell;
				584	};
				585
				586	static void __merge_bio_list(struct bio_list bios, struct bio_list master)
				587	{
				588	bio_list_merge(bios, master);
				589	bio_list_init(master);
				590	}
				591
				592	static void error_bio_list(struct bio_list *bios, blk_status_t error)
				593	{
				594	struct bio *bio;
				595
				596	while ((bio = bio_list_pop(bios))) {
				597	bio->bi_status = error;
				598	bio_endio(bio);
				599	}
				600	}
				601
				602	static void error_thin_bio_list(struct thin_c tc, struct bio_list master,
				603	blk_status_t error)
				604	{
				605	struct bio_list bios;
				606	unsigned long flags;
				607
				608	bio_list_init(&bios);
				609
				610	spin_lock_irqsave(&tc->lock, flags);
				611	__merge_bio_list(&bios, master);
				612	spin_unlock_irqrestore(&tc->lock, flags);
				613
				614	error_bio_list(&bios, error);
				615	}
				616
				617	static void requeue_deferred_cells(struct thin_c *tc)
				618	{
				619	struct pool *pool = tc->pool;
				620	unsigned long flags;
				621	struct list_head cells;
				622	struct dm_bio_prison_cell cell, tmp;
				623
				624	INIT_LIST_HEAD(&cells);
				625
				626	spin_lock_irqsave(&tc->lock, flags);
				627	list_splice_init(&tc->deferred_cells, &cells);
				628	spin_unlock_irqrestore(&tc->lock, flags);
				629
				630	list_for_each_entry_safe(cell, tmp, &cells, user_list)
				631	cell_requeue(pool, cell);
				632	}
				633
				634	static void requeue_io(struct thin_c *tc)
				635	{
				636	struct bio_list bios;
				637	unsigned long flags;
				638
				639	bio_list_init(&bios);
				640
				641	spin_lock_irqsave(&tc->lock, flags);
				642	__merge_bio_list(&bios, &tc->deferred_bio_list);
				643	__merge_bio_list(&bios, &tc->retry_on_resume_list);
				644	spin_unlock_irqrestore(&tc->lock, flags);
				645
				646	error_bio_list(&bios, BLK_STS_DM_REQUEUE);
				647	requeue_deferred_cells(tc);
				648	}
				649
				650	static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
				651	{
				652	struct thin_c *tc;
				653
				654	rcu_read_lock();
				655	list_for_each_entry_rcu(tc, &pool->active_thins, list)
				656	error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
				657	rcu_read_unlock();
				658	}
				659
				660	static void error_retry_list(struct pool *pool)
				661	{
				662	error_retry_list_with_code(pool, get_pool_io_error_code(pool));
				663	}
				664
				665	/*
				666	* This section of code contains the logic for processing a thin device's IO.
				667	* Much of the code depends on pool object resources (lists, workqueues, etc)
				668	* but most is exclusively called from the thin target rather than the thin-pool
				669	* target.
				670	*/
				671
				672	static dm_block_t get_bio_block(struct thin_c tc, struct bio bio)
				673	{
				674	struct pool *pool = tc->pool;
				675	sector_t block_nr = bio->bi_iter.bi_sector;
				676
				677	if (block_size_is_power_of_two(pool))
				678	block_nr >>= pool->sectors_per_block_shift;
				679	else
				680	(void) sector_div(block_nr, pool->sectors_per_block);
				681
				682	return block_nr;
				683	}
				684
				685	/*
				686	* Returns the _complete_ blocks that this bio covers.
				687	*/
				688	static void get_bio_block_range(struct thin_c tc, struct bio bio,
				689	dm_block_t begin, dm_block_t end)
				690	{
				691	struct pool *pool = tc->pool;
				692	sector_t b = bio->bi_iter.bi_sector;
				693	sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
				694
				695	b += pool->sectors_per_block - 1ull; /* so we round up */
				696
				697	if (block_size_is_power_of_two(pool)) {
				698	b >>= pool->sectors_per_block_shift;
				699	e >>= pool->sectors_per_block_shift;
				700	} else {
				701	(void) sector_div(b, pool->sectors_per_block);
				702	(void) sector_div(e, pool->sectors_per_block);
				703	}
				704
				705	if (e < b)
				706	/* Can happen if the bio is within a single block. */
				707	e = b;
				708
				709	*begin = b;
				710	*end = e;
				711	}
				712
				713	static void remap(struct thin_c tc, struct bio bio, dm_block_t block)
				714	{
				715	struct pool *pool = tc->pool;
				716	sector_t bi_sector = bio->bi_iter.bi_sector;
				717
				718	bio_set_dev(bio, tc->pool_dev->bdev);
				719	if (block_size_is_power_of_two(pool))
				720	bio->bi_iter.bi_sector =
				721	(block << pool->sectors_per_block_shift) \|
				722	(bi_sector & (pool->sectors_per_block - 1));
				723	else
				724	bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
				725	sector_div(bi_sector, pool->sectors_per_block);
				726	}
				727
				728	static void remap_to_origin(struct thin_c tc, struct bio bio)
				729	{
				730	bio_set_dev(bio, tc->origin_dev->bdev);
				731	}
				732
				733	static int bio_triggers_commit(struct thin_c tc, struct bio bio)
				734	{
				735	return op_is_flush(bio->bi_opf) &&
				736	dm_thin_changed_this_transaction(tc->td);
				737	}
				738
				739	static void inc_all_io_entry(struct pool pool, struct bio bio)
				740	{
				741	struct dm_thin_endio_hook *h;
				742
				743	if (bio_op(bio) == REQ_OP_DISCARD)
				744	return;
				745
				746	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
				747	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
				748	}
				749
				750	static void issue(struct thin_c tc, struct bio bio)
				751	{
				752	struct pool *pool = tc->pool;
				753	unsigned long flags;
				754
				755	if (!bio_triggers_commit(tc, bio)) {
				756	generic_make_request(bio);
				757	return;
				758	}
				759
				760	/*
				761	* Complete bio with an error if earlier I/O caused changes to
				762	* the metadata that can't be committed e.g, due to I/O errors
				763	* on the metadata device.
				764	*/
				765	if (dm_thin_aborted_changes(tc->td)) {
				766	bio_io_error(bio);
				767	return;
				768	}
				769
				770	/*
				771	* Batch together any bios that trigger commits and then issue a
				772	* single commit for them in process_deferred_bios().
				773	*/
				774	spin_lock_irqsave(&pool->lock, flags);
				775	bio_list_add(&pool->deferred_flush_bios, bio);
				776	spin_unlock_irqrestore(&pool->lock, flags);
				777	}
				778
				779	static void remap_to_origin_and_issue(struct thin_c tc, struct bio bio)
				780	{
				781	remap_to_origin(tc, bio);
				782	issue(tc, bio);
				783	}
				784
				785	static void remap_and_issue(struct thin_c tc, struct bio bio,
				786	dm_block_t block)
				787	{
				788	remap(tc, bio, block);
				789	issue(tc, bio);
				790	}
				791
				792	/*----------------------------------------------------------------*/
				793
				794	/*
				795	* Bio endio functions.
				796	*/
				797	struct dm_thin_new_mapping {
				798	struct list_head list;
				799
				800	bool pass_discard:1;
				801	bool maybe_shared:1;
				802
				803	/*
				804	* Track quiescing, copying and zeroing preparation actions. When this
				805	* counter hits zero the block is prepared and can be inserted into the
				806	* btree.
				807	*/
				808	atomic_t prepare_actions;
				809
				810	blk_status_t status;
				811	struct thin_c *tc;
				812	dm_block_t virt_begin, virt_end;
				813	dm_block_t data_block;
				814	struct dm_bio_prison_cell *cell;
				815
				816	/*
				817	* If the bio covers the whole area of a block then we can avoid
				818	* zeroing or copying. Instead this bio is hooked. The bio will
				819	* still be in the cell, so care has to be taken to avoid issuing
				820	* the bio twice.
				821	*/
				822	struct bio *bio;
				823	bio_end_io_t *saved_bi_end_io;
				824	};
				825
				826	static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
				827	{
				828	struct pool *pool = m->tc->pool;
				829
				830	if (atomic_dec_and_test(&m->prepare_actions)) {
				831	list_add_tail(&m->list, &pool->prepared_mappings);
				832	wake_worker(pool);
				833	}
				834	}
				835
				836	static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
				837	{
				838	unsigned long flags;
				839	struct pool *pool = m->tc->pool;
				840
				841	spin_lock_irqsave(&pool->lock, flags);
				842	__complete_mapping_preparation(m);
				843	spin_unlock_irqrestore(&pool->lock, flags);
				844	}
				845
				846	static void copy_complete(int read_err, unsigned long write_err, void *context)
				847	{
				848	struct dm_thin_new_mapping *m = context;
				849
				850	m->status = read_err \|\| write_err ? BLK_STS_IOERR : 0;
				851	complete_mapping_preparation(m);
				852	}
				853
				854	static void overwrite_endio(struct bio *bio)
				855	{
				856	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
				857	struct dm_thin_new_mapping *m = h->overwrite_mapping;
				858
				859	bio->bi_end_io = m->saved_bi_end_io;
				860
				861	m->status = bio->bi_status;
				862	complete_mapping_preparation(m);
				863	}
				864
				865	/*----------------------------------------------------------------*/
				866
				867	/*
				868	* Workqueue.
				869	*/
				870
				871	/*
				872	* Prepared mapping jobs.
				873	*/
				874
				875	/*
				876	* This sends the bios in the cell, except the original holder, back
				877	* to the deferred_bios list.
				878	*/
				879	static void cell_defer_no_holder(struct thin_c tc, struct dm_bio_prison_cell cell)
				880	{
				881	struct pool *pool = tc->pool;
				882	unsigned long flags;
				883
				884	spin_lock_irqsave(&tc->lock, flags);
				885	cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
				886	spin_unlock_irqrestore(&tc->lock, flags);
				887
				888	wake_worker(pool);
				889	}
				890
				891	static void thin_defer_bio(struct thin_c tc, struct bio bio);
				892
				893	struct remap_info {
				894	struct thin_c *tc;
				895	struct bio_list defer_bios;
				896	struct bio_list issue_bios;
				897	};
				898
				899	static void __inc_remap_and_issue_cell(void *context,
				900	struct dm_bio_prison_cell *cell)
				901	{
				902	struct remap_info *info = context;
				903	struct bio *bio;
				904
				905	while ((bio = bio_list_pop(&cell->bios))) {
				906	if (op_is_flush(bio->bi_opf) \|\| bio_op(bio) == REQ_OP_DISCARD)
				907	bio_list_add(&info->defer_bios, bio);
				908	else {
				909	inc_all_io_entry(info->tc->pool, bio);
				910
				911	/*
				912	* We can't issue the bios with the bio prison lock
				913	* held, so we add them to a list to issue on
				914	* return from this function.
				915	*/
				916	bio_list_add(&info->issue_bios, bio);
				917	}
				918	}
				919	}
				920
				921	static void inc_remap_and_issue_cell(struct thin_c *tc,
				922	struct dm_bio_prison_cell *cell,
				923	dm_block_t block)
				924	{
				925	struct bio *bio;
				926	struct remap_info info;
				927
				928	info.tc = tc;
				929	bio_list_init(&info.defer_bios);
				930	bio_list_init(&info.issue_bios);
				931
				932	/*
				933	* We have to be careful to inc any bios we're about to issue
				934	* before the cell is released, and avoid a race with new bios
				935	* being added to the cell.
				936	*/
				937	cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
				938	&info, cell);
				939
				940	while ((bio = bio_list_pop(&info.defer_bios)))
				941	thin_defer_bio(tc, bio);
				942
				943	while ((bio = bio_list_pop(&info.issue_bios)))
				944	remap_and_issue(info.tc, bio, block);
				945	}
				946
				947	static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
				948	{
				949	cell_error(m->tc->pool, m->cell);
				950	list_del(&m->list);
				951	mempool_free(m, m->tc->pool->mapping_pool);
				952	}
				953
				954	static void complete_overwrite_bio(struct thin_c tc, struct bio bio)
				955	{
				956	struct pool *pool = tc->pool;
				957	unsigned long flags;
				958
				959	/*
				960	* If the bio has the REQ_FUA flag set we must commit the metadata
				961	* before signaling its completion.
				962	*/
				963	if (!bio_triggers_commit(tc, bio)) {
				964	bio_endio(bio);
				965	return;
				966	}
				967
				968	/*
				969	* Complete bio with an error if earlier I/O caused changes to the
				970	* metadata that can't be committed, e.g, due to I/O errors on the
				971	* metadata device.
				972	*/
				973	if (dm_thin_aborted_changes(tc->td)) {
				974	bio_io_error(bio);
				975	return;
				976	}
				977
				978	/*
				979	* Batch together any bios that trigger commits and then issue a
				980	* single commit for them in process_deferred_bios().
				981	*/
				982	spin_lock_irqsave(&pool->lock, flags);
				983	bio_list_add(&pool->deferred_flush_completions, bio);
				984	spin_unlock_irqrestore(&pool->lock, flags);
				985	}
				986
				987	static void process_prepared_mapping(struct dm_thin_new_mapping *m)
				988	{
				989	struct thin_c *tc = m->tc;
				990	struct pool *pool = tc->pool;
				991	struct bio *bio = m->bio;
				992	int r;
				993
				994	if (m->status) {
				995	cell_error(pool, m->cell);
				996	goto out;
				997	}
				998
				999	/*
				1000	* Commit the prepared block into the mapping btree.
				1001	* Any I/O for this block arriving after this point will get
				1002	* remapped to it directly.
				1003	*/
				1004	r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
				1005	if (r) {
				1006	metadata_operation_failed(pool, "dm_thin_insert_block", r);
				1007	cell_error(pool, m->cell);
				1008	goto out;
				1009	}
				1010
				1011	/*
				1012	* Release any bios held while the block was being provisioned.
				1013	* If we are processing a write bio that completely covers the block,
				1014	* we already processed it so can ignore it now when processing
				1015	* the bios in the cell.
				1016	*/
				1017	if (bio) {
				1018	inc_remap_and_issue_cell(tc, m->cell, m->data_block);
				1019	complete_overwrite_bio(tc, bio);
				1020	} else {
				1021	inc_all_io_entry(tc->pool, m->cell->holder);
				1022	remap_and_issue(tc, m->cell->holder, m->data_block);
				1023	inc_remap_and_issue_cell(tc, m->cell, m->data_block);
				1024	}
				1025
				1026	out:
				1027	list_del(&m->list);
				1028	mempool_free(m, pool->mapping_pool);
				1029	}
				1030
				1031	/*----------------------------------------------------------------*/
				1032
				1033	static void free_discard_mapping(struct dm_thin_new_mapping *m)
				1034	{
				1035	struct thin_c *tc = m->tc;
				1036	if (m->cell)
				1037	cell_defer_no_holder(tc, m->cell);
				1038	mempool_free(m, tc->pool->mapping_pool);
				1039	}
				1040
				1041	static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
				1042	{
				1043	bio_io_error(m->bio);
				1044	free_discard_mapping(m);
				1045	}
				1046
				1047	static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
				1048	{
				1049	bio_endio(m->bio);
				1050	free_discard_mapping(m);
				1051	}
				1052
				1053	static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
				1054	{
				1055	int r;
				1056	struct thin_c *tc = m->tc;
				1057
				1058	r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
				1059	if (r) {
				1060	metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
				1061	bio_io_error(m->bio);
				1062	} else
				1063	bio_endio(m->bio);
				1064
				1065	cell_defer_no_holder(tc, m->cell);
				1066	mempool_free(m, tc->pool->mapping_pool);
				1067	}
				1068
				1069	/*----------------------------------------------------------------*/
				1070
				1071	static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
				1072	struct bio *discard_parent)
				1073	{
				1074	/*
				1075	* We've already unmapped this range of blocks, but before we
				1076	* passdown we have to check that these blocks are now unused.
				1077	*/
				1078	int r = 0;
				1079	bool shared = true;
				1080	struct thin_c *tc = m->tc;
				1081	struct pool *pool = tc->pool;
				1082	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
				1083	struct discard_op op;
				1084
				1085	begin_discard(&op, tc, discard_parent);
				1086	while (b != end) {
				1087	/* find start of unmapped run */
				1088	for (; b < end; b++) {
				1089	r = dm_pool_block_is_shared(pool->pmd, b, &shared);
				1090	if (r)
				1091	goto out;
				1092
				1093	if (!shared)
				1094	break;
				1095	}
				1096
				1097	if (b == end)
				1098	break;
				1099
				1100	/* find end of run */
				1101	for (e = b + 1; e != end; e++) {
				1102	r = dm_pool_block_is_shared(pool->pmd, e, &shared);
				1103	if (r)
				1104	goto out;
				1105
				1106	if (shared)
				1107	break;
				1108	}
				1109
				1110	r = issue_discard(&op, b, e);
				1111	if (r)
				1112	goto out;
				1113
				1114	b = e;
				1115	}
				1116	out:
				1117	end_discard(&op, r);
				1118	}
				1119
				1120	static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
				1121	{
				1122	unsigned long flags;
				1123	struct pool *pool = m->tc->pool;
				1124
				1125	spin_lock_irqsave(&pool->lock, flags);
				1126	list_add_tail(&m->list, &pool->prepared_discards_pt2);
				1127	spin_unlock_irqrestore(&pool->lock, flags);
				1128	wake_worker(pool);
				1129	}
				1130
				1131	static void passdown_endio(struct bio *bio)
				1132	{
				1133	/*
				1134	* It doesn't matter if the passdown discard failed, we still want
				1135	* to unmap (we ignore err).
				1136	*/
				1137	queue_passdown_pt2(bio->bi_private);
				1138	bio_put(bio);
				1139	}
				1140
				1141	static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
				1142	{
				1143	int r;
				1144	struct thin_c *tc = m->tc;
				1145	struct pool *pool = tc->pool;
				1146	struct bio *discard_parent;
				1147	dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
				1148
				1149	/*
				1150	* Only this thread allocates blocks, so we can be sure that the
				1151	* newly unmapped blocks will not be allocated before the end of
				1152	* the function.
				1153	*/
				1154	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
				1155	if (r) {
				1156	metadata_operation_failed(pool, "dm_thin_remove_range", r);
				1157	bio_io_error(m->bio);
				1158	cell_defer_no_holder(tc, m->cell);
				1159	mempool_free(m, pool->mapping_pool);
				1160	return;
				1161	}
				1162
				1163	/*
				1164	* Increment the unmapped blocks. This prevents a race between the
				1165	* passdown io and reallocation of freed blocks.
				1166	*/
				1167	r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
				1168	if (r) {
				1169	metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
				1170	bio_io_error(m->bio);
				1171	cell_defer_no_holder(tc, m->cell);
				1172	mempool_free(m, pool->mapping_pool);
				1173	return;
				1174	}
				1175
				1176	discard_parent = bio_alloc(GFP_NOIO, 1);
				1177	if (!discard_parent) {
				1178	DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
				1179	dm_device_name(tc->pool->pool_md));
				1180	queue_passdown_pt2(m);
				1181
				1182	} else {
				1183	discard_parent->bi_end_io = passdown_endio;
				1184	discard_parent->bi_private = m;
				1185
				1186	if (m->maybe_shared)
				1187	passdown_double_checking_shared_status(m, discard_parent);
				1188	else {
				1189	struct discard_op op;
				1190
				1191	begin_discard(&op, tc, discard_parent);
				1192	r = issue_discard(&op, m->data_block, data_end);
				1193	end_discard(&op, r);
				1194	}
				1195	}
				1196	}
				1197
				1198	static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
				1199	{
				1200	int r;
				1201	struct thin_c *tc = m->tc;
				1202	struct pool *pool = tc->pool;
				1203
				1204	/*
				1205	* The passdown has completed, so now we can decrement all those
				1206	* unmapped blocks.
				1207	*/
				1208	r = dm_pool_dec_data_range(pool->pmd, m->data_block,
				1209	m->data_block + (m->virt_end - m->virt_begin));
				1210	if (r) {
				1211	metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
				1212	bio_io_error(m->bio);
				1213	} else
				1214	bio_endio(m->bio);
				1215
				1216	cell_defer_no_holder(tc, m->cell);
				1217	mempool_free(m, pool->mapping_pool);
				1218	}
				1219
				1220	static void process_prepared(struct pool pool, struct list_head head,
				1221	process_mapping_fn *fn)
				1222	{
				1223	unsigned long flags;
				1224	struct list_head maps;
				1225	struct dm_thin_new_mapping m, tmp;
				1226
				1227	INIT_LIST_HEAD(&maps);
				1228	spin_lock_irqsave(&pool->lock, flags);
				1229	list_splice_init(head, &maps);
				1230	spin_unlock_irqrestore(&pool->lock, flags);
				1231
				1232	list_for_each_entry_safe(m, tmp, &maps, list)
				1233	(*fn)(m);
				1234	}
				1235
				1236	/*
				1237	* Deferred bio jobs.
				1238	*/
				1239	static int io_overlaps_block(struct pool pool, struct bio bio)
				1240	{
				1241	return bio->bi_iter.bi_size ==
				1242	(pool->sectors_per_block << SECTOR_SHIFT);
				1243	}
				1244
				1245	static int io_overwrites_block(struct pool pool, struct bio bio)
				1246	{
				1247	return (bio_data_dir(bio) == WRITE) &&
				1248	io_overlaps_block(pool, bio);
				1249	}
				1250
				1251	static void save_and_set_endio(struct bio bio, bio_end_io_t *save,
				1252	bio_end_io_t *fn)
				1253	{
				1254	*save = bio->bi_end_io;
				1255	bio->bi_end_io = fn;
				1256	}
				1257
				1258	static int ensure_next_mapping(struct pool *pool)
				1259	{
				1260	if (pool->next_mapping)
				1261	return 0;
				1262
				1263	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
				1264
				1265	return pool->next_mapping ? 0 : -ENOMEM;
				1266	}
				1267
				1268	static struct dm_thin_new_mapping get_next_mapping(struct pool pool)
				1269	{
				1270	struct dm_thin_new_mapping *m = pool->next_mapping;
				1271
				1272	BUG_ON(!pool->next_mapping);
				1273
				1274	memset(m, 0, sizeof(struct dm_thin_new_mapping));
				1275	INIT_LIST_HEAD(&m->list);
				1276	m->bio = NULL;
				1277
				1278	pool->next_mapping = NULL;
				1279
				1280	return m;
				1281	}
				1282
				1283	static void ll_zero(struct thin_c tc, struct dm_thin_new_mapping m,
				1284	sector_t begin, sector_t end)
				1285	{
				1286	int r;
				1287	struct dm_io_region to;
				1288
				1289	to.bdev = tc->pool_dev->bdev;
				1290	to.sector = begin;
				1291	to.count = end - begin;
				1292
				1293	r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
				1294	if (r < 0) {
				1295	DMERR_LIMIT("dm_kcopyd_zero() failed");
				1296	copy_complete(1, 1, m);
				1297	}
				1298	}
				1299
				1300	static void remap_and_issue_overwrite(struct thin_c tc, struct bio bio,
				1301	dm_block_t data_begin,
				1302	struct dm_thin_new_mapping *m)
				1303	{
				1304	struct pool *pool = tc->pool;
				1305	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
				1306
				1307	h->overwrite_mapping = m;
				1308	m->bio = bio;
				1309	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
				1310	inc_all_io_entry(pool, bio);
				1311	remap_and_issue(tc, bio, data_begin);
				1312	}
				1313
				1314	/*
				1315	* A partial copy also needs to zero the uncopied region.
				1316	*/
				1317	static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
				1318	struct dm_dev *origin, dm_block_t data_origin,
				1319	dm_block_t data_dest,
				1320	struct dm_bio_prison_cell cell, struct bio bio,
				1321	sector_t len)
				1322	{
				1323	int r;
				1324	struct pool *pool = tc->pool;
				1325	struct dm_thin_new_mapping *m = get_next_mapping(pool);
				1326
				1327	m->tc = tc;
				1328	m->virt_begin = virt_block;
				1329	m->virt_end = virt_block + 1u;
				1330	m->data_block = data_dest;
				1331	m->cell = cell;
				1332
				1333	/*
				1334	* quiesce action + copy action + an extra reference held for the
				1335	* duration of this function (we may need to inc later for a
				1336	* partial zero).
				1337	*/
				1338	atomic_set(&m->prepare_actions, 3);
				1339
				1340	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
				1341	complete_mapping_preparation(m); /* already quiesced */
				1342
				1343	/*
				1344	* IO to pool_dev remaps to the pool target's data_dev.
				1345	*
				1346	* If the whole block of data is being overwritten, we can issue the
				1347	* bio immediately. Otherwise we use kcopyd to clone the data first.
				1348	*/
				1349	if (io_overwrites_block(pool, bio))
				1350	remap_and_issue_overwrite(tc, bio, data_dest, m);
				1351	else {
				1352	struct dm_io_region from, to;
				1353
				1354	from.bdev = origin->bdev;
				1355	from.sector = data_origin * pool->sectors_per_block;
				1356	from.count = len;
				1357
				1358	to.bdev = tc->pool_dev->bdev;
				1359	to.sector = data_dest * pool->sectors_per_block;
				1360	to.count = len;
				1361
				1362	r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				1363	0, copy_complete, m);
				1364	if (r < 0) {
				1365	DMERR_LIMIT("dm_kcopyd_copy() failed");
				1366	copy_complete(1, 1, m);
				1367
				1368	/*
				1369	* We allow the zero to be issued, to simplify the
				1370	* error path. Otherwise we'd need to start
				1371	* worrying about decrementing the prepare_actions
				1372	* counter.
				1373	*/
				1374	}
				1375
				1376	/*
				1377	* Do we need to zero a tail region?
				1378	*/
				1379	if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
				1380	atomic_inc(&m->prepare_actions);
				1381	ll_zero(tc, m,
				1382	data_dest * pool->sectors_per_block + len,
				1383	(data_dest + 1) * pool->sectors_per_block);
				1384	}
				1385	}
				1386
				1387	complete_mapping_preparation(m); /* drop our ref */
				1388	}
				1389
				1390	static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				1391	dm_block_t data_origin, dm_block_t data_dest,
				1392	struct dm_bio_prison_cell cell, struct bio bio)
				1393	{
				1394	schedule_copy(tc, virt_block, tc->pool_dev,
				1395	data_origin, data_dest, cell, bio,
				1396	tc->pool->sectors_per_block);
				1397	}
				1398
				1399	static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
				1400	dm_block_t data_block, struct dm_bio_prison_cell *cell,
				1401	struct bio *bio)
				1402	{
				1403	struct pool *pool = tc->pool;
				1404	struct dm_thin_new_mapping *m = get_next_mapping(pool);
				1405
				1406	atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
				1407	m->tc = tc;
				1408	m->virt_begin = virt_block;
				1409	m->virt_end = virt_block + 1u;
				1410	m->data_block = data_block;
				1411	m->cell = cell;
				1412
				1413	/*
				1414	* If the whole block of data is being overwritten or we are not
				1415	* zeroing pre-existing data, we can issue the bio immediately.
				1416	* Otherwise we use kcopyd to zero the data first.
				1417	*/
				1418	if (pool->pf.zero_new_blocks) {
				1419	if (io_overwrites_block(pool, bio))
				1420	remap_and_issue_overwrite(tc, bio, data_block, m);
				1421	else
				1422	ll_zero(tc, m, data_block * pool->sectors_per_block,
				1423	(data_block + 1) * pool->sectors_per_block);
				1424	} else
				1425	process_prepared_mapping(m);
				1426	}
				1427
				1428	static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
				1429	dm_block_t data_dest,
				1430	struct dm_bio_prison_cell cell, struct bio bio)
				1431	{
				1432	struct pool *pool = tc->pool;
				1433	sector_t virt_block_begin = virt_block * pool->sectors_per_block;
				1434	sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
				1435
				1436	if (virt_block_end <= tc->origin_size)
				1437	schedule_copy(tc, virt_block, tc->origin_dev,
				1438	virt_block, data_dest, cell, bio,
				1439	pool->sectors_per_block);
				1440
				1441	else if (virt_block_begin < tc->origin_size)
				1442	schedule_copy(tc, virt_block, tc->origin_dev,
				1443	virt_block, data_dest, cell, bio,
				1444	tc->origin_size - virt_block_begin);
				1445
				1446	else
				1447	schedule_zero(tc, virt_block, data_dest, cell, bio);
				1448	}
				1449
				1450	static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
				1451
				1452	static void requeue_bios(struct pool *pool);
				1453
				1454	static bool is_read_only_pool_mode(enum pool_mode mode)
				1455	{
				1456	return (mode == PM_OUT_OF_METADATA_SPACE \|\| mode == PM_READ_ONLY);
				1457	}
				1458
				1459	static bool is_read_only(struct pool *pool)
				1460	{
				1461	return is_read_only_pool_mode(get_pool_mode(pool));
				1462	}
				1463
				1464	static void check_for_metadata_space(struct pool *pool)
				1465	{
				1466	int r;
				1467	const char *ooms_reason = NULL;
				1468	dm_block_t nr_free;
				1469
				1470	r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
				1471	if (r)
				1472	ooms_reason = "Could not get free metadata blocks";
				1473	else if (!nr_free)
				1474	ooms_reason = "No free metadata blocks";
				1475
				1476	if (ooms_reason && !is_read_only(pool)) {
				1477	DMERR("%s", ooms_reason);
				1478	set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
				1479	}
				1480	}
				1481
				1482	static void check_for_data_space(struct pool *pool)
				1483	{
				1484	int r;
				1485	dm_block_t nr_free;
				1486
				1487	if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
				1488	return;
				1489
				1490	r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
				1491	if (r)
				1492	return;
				1493
				1494	if (nr_free) {
				1495	set_pool_mode(pool, PM_WRITE);
				1496	requeue_bios(pool);
				1497	}
				1498	}
				1499
				1500	/*
				1501	* A non-zero return indicates read_only or fail_io mode.
				1502	* Many callers don't care about the return value.
				1503	*/
				1504	static int commit(struct pool *pool)
				1505	{
				1506	int r;
				1507
				1508	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
				1509	return -EINVAL;
				1510
				1511	r = dm_pool_commit_metadata(pool->pmd);
				1512	if (r)
				1513	metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
				1514	else {
				1515	check_for_metadata_space(pool);
				1516	check_for_data_space(pool);
				1517	}
				1518
				1519	return r;
				1520	}
				1521
				1522	static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
				1523	{
				1524	unsigned long flags;
				1525
				1526	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
				1527	DMWARN("%s: reached low water mark for data device: sending event.",
				1528	dm_device_name(pool->pool_md));
				1529	spin_lock_irqsave(&pool->lock, flags);
				1530	pool->low_water_triggered = true;
				1531	spin_unlock_irqrestore(&pool->lock, flags);
				1532	dm_table_event(pool->ti->table);
				1533	}
				1534	}
				1535
				1536	static int alloc_data_block(struct thin_c tc, dm_block_t result)
				1537	{
				1538	int r;
				1539	dm_block_t free_blocks;
				1540	struct pool *pool = tc->pool;
				1541
				1542	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
				1543	return -EINVAL;
				1544
				1545	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
				1546	if (r) {
				1547	metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
				1548	return r;
				1549	}
				1550
				1551	check_low_water_mark(pool, free_blocks);
				1552
				1553	if (!free_blocks) {
				1554	/*
				1555	* Try to commit to see if that will free up some
				1556	* more space.
				1557	*/
				1558	r = commit(pool);
				1559	if (r)
				1560	return r;
				1561
				1562	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
				1563	if (r) {
				1564	metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
				1565	return r;
				1566	}
				1567
				1568	if (!free_blocks) {
				1569	set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
				1570	return -ENOSPC;
				1571	}
				1572	}
				1573
				1574	r = dm_pool_alloc_data_block(pool->pmd, result);
				1575	if (r) {
				1576	if (r == -ENOSPC)
				1577	set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
				1578	else
				1579	metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
				1580	return r;
				1581	}
				1582
				1583	r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
				1584	if (r) {
				1585	metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
				1586	return r;
				1587	}
				1588
				1589	if (!free_blocks) {
				1590	/* Let's commit before we use up the metadata reserve. */
				1591	r = commit(pool);
				1592	if (r)
				1593	return r;
				1594	}
				1595
				1596	return 0;
				1597	}
				1598
				1599	/*
				1600	* If we have run out of space, queue bios until the device is
				1601	* resumed, presumably after having been reloaded with more space.
				1602	*/
				1603	static void retry_on_resume(struct bio *bio)
				1604	{
				1605	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
				1606	struct thin_c *tc = h->tc;
				1607	unsigned long flags;
				1608
				1609	spin_lock_irqsave(&tc->lock, flags);
				1610	bio_list_add(&tc->retry_on_resume_list, bio);
				1611	spin_unlock_irqrestore(&tc->lock, flags);
				1612	}
				1613
				1614	static blk_status_t should_error_unserviceable_bio(struct pool *pool)
				1615	{
				1616	enum pool_mode m = get_pool_mode(pool);
				1617
				1618	switch (m) {
				1619	case PM_WRITE:
				1620	/* Shouldn't get here */
				1621	DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
				1622	return BLK_STS_IOERR;
				1623
				1624	case PM_OUT_OF_DATA_SPACE:
				1625	return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
				1626
				1627	case PM_OUT_OF_METADATA_SPACE:
				1628	case PM_READ_ONLY:
				1629	case PM_FAIL:
				1630	return BLK_STS_IOERR;
				1631	default:
				1632	/* Shouldn't get here */
				1633	DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
				1634	return BLK_STS_IOERR;
				1635	}
				1636	}
				1637
				1638	static void handle_unserviceable_bio(struct pool pool, struct bio bio)
				1639	{
				1640	blk_status_t error = should_error_unserviceable_bio(pool);
				1641
				1642	if (error) {
				1643	bio->bi_status = error;
				1644	bio_endio(bio);
				1645	} else
				1646	retry_on_resume(bio);
				1647	}
				1648
				1649	static void retry_bios_on_resume(struct pool pool, struct dm_bio_prison_cell cell)
				1650	{
				1651	struct bio *bio;
				1652	struct bio_list bios;
				1653	blk_status_t error;
				1654
				1655	error = should_error_unserviceable_bio(pool);
				1656	if (error) {
				1657	cell_error_with_code(pool, cell, error);
				1658	return;
				1659	}
				1660
				1661	bio_list_init(&bios);
				1662	cell_release(pool, cell, &bios);
				1663
				1664	while ((bio = bio_list_pop(&bios)))
				1665	retry_on_resume(bio);
				1666	}
				1667
				1668	static void process_discard_cell_no_passdown(struct thin_c *tc,
				1669	struct dm_bio_prison_cell *virt_cell)
				1670	{
				1671	struct pool *pool = tc->pool;
				1672	struct dm_thin_new_mapping *m = get_next_mapping(pool);
				1673
				1674	/*
				1675	* We don't need to lock the data blocks, since there's no
				1676	* passdown. We only lock data blocks for allocation and breaking sharing.
				1677	*/
				1678	m->tc = tc;
				1679	m->virt_begin = virt_cell->key.block_begin;
				1680	m->virt_end = virt_cell->key.block_end;
				1681	m->cell = virt_cell;
				1682	m->bio = virt_cell->holder;
				1683
				1684	if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
				1685	pool->process_prepared_discard(m);
				1686	}
				1687
				1688	static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
				1689	struct bio *bio)
				1690	{
				1691	struct pool *pool = tc->pool;
				1692
				1693	int r;
				1694	bool maybe_shared;
				1695	struct dm_cell_key data_key;
				1696	struct dm_bio_prison_cell *data_cell;
				1697	struct dm_thin_new_mapping *m;
				1698	dm_block_t virt_begin, virt_end, data_begin;
				1699
				1700	while (begin != end) {
				1701	r = ensure_next_mapping(pool);
				1702	if (r)
				1703	/* we did our best */
				1704	return;
				1705
				1706	r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
				1707	&data_begin, &maybe_shared);
				1708	if (r)
				1709	/*
				1710	* Silently fail, letting any mappings we've
				1711	* created complete.
				1712	*/
				1713	break;
				1714
				1715	build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
				1716	if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
				1717	/* contention, we'll give up with this range */
				1718	begin = virt_end;
				1719	continue;
				1720	}
				1721
				1722	/*
				1723	* IO may still be going to the destination block. We must
				1724	* quiesce before we can do the removal.
				1725	*/
				1726	m = get_next_mapping(pool);
				1727	m->tc = tc;
				1728	m->maybe_shared = maybe_shared;
				1729	m->virt_begin = virt_begin;
				1730	m->virt_end = virt_end;
				1731	m->data_block = data_begin;
				1732	m->cell = data_cell;
				1733	m->bio = bio;
				1734
				1735	/*
				1736	* The parent bio must not complete before sub discard bios are
				1737	* chained to it (see end_discard's bio_chain)!
				1738	*
				1739	* This per-mapping bi_remaining increment is paired with
				1740	* the implicit decrement that occurs via bio_endio() in
				1741	* end_discard().
				1742	*/
				1743	bio_inc_remaining(bio);
				1744	if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
				1745	pool->process_prepared_discard(m);
				1746
				1747	begin = virt_end;
				1748	}
				1749	}
				1750
				1751	static void process_discard_cell_passdown(struct thin_c tc, struct dm_bio_prison_cell virt_cell)
				1752	{
				1753	struct bio *bio = virt_cell->holder;
				1754	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
				1755
				1756	/*
				1757	* The virt_cell will only get freed once the origin bio completes.
				1758	* This means it will remain locked while all the individual
				1759	* passdown bios are in flight.
				1760	*/
				1761	h->cell = virt_cell;
				1762	break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
				1763
				1764	/*
				1765	* We complete the bio now, knowing that the bi_remaining field
				1766	* will prevent completion until the sub range discards have
				1767	* completed.
				1768	*/
				1769	bio_endio(bio);
				1770	}
				1771
				1772	static void process_discard_bio(struct thin_c tc, struct bio bio)
				1773	{
				1774	dm_block_t begin, end;
				1775	struct dm_cell_key virt_key;
				1776	struct dm_bio_prison_cell *virt_cell;
				1777
				1778	get_bio_block_range(tc, bio, &begin, &end);
				1779	if (begin == end) {
				1780	/*
				1781	* The discard covers less than a block.
				1782	*/
				1783	bio_endio(bio);
				1784	return;
				1785	}
				1786
				1787	build_key(tc->td, VIRTUAL, begin, end, &virt_key);
				1788	if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
				1789	/*
				1790	* Potential starvation issue: We're relying on the
				1791	* fs/application being well behaved, and not trying to
				1792	* send IO to a region at the same time as discarding it.
				1793	* If they do this persistently then it's possible this
				1794	* cell will never be granted.
				1795	*/
				1796	return;
				1797
				1798	tc->pool->process_discard_cell(tc, virt_cell);
				1799	}
				1800
				1801	static void break_sharing(struct thin_c tc, struct bio bio, dm_block_t block,
				1802	struct dm_cell_key *key,
				1803	struct dm_thin_lookup_result *lookup_result,
				1804	struct dm_bio_prison_cell *cell)
				1805	{
				1806	int r;
				1807	dm_block_t data_block;
				1808	struct pool *pool = tc->pool;
				1809
				1810	r = alloc_data_block(tc, &data_block);
				1811	switch (r) {
				1812	case 0:
				1813	schedule_internal_copy(tc, block, lookup_result->block,
				1814	data_block, cell, bio);
				1815	break;
				1816
				1817	case -ENOSPC:
				1818	retry_bios_on_resume(pool, cell);
				1819	break;
				1820
				1821	default:
				1822	DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
				1823	__func__, r);
				1824	cell_error(pool, cell);
				1825	break;
				1826	}
				1827	}
				1828
				1829	static void __remap_and_issue_shared_cell(void *context,
				1830	struct dm_bio_prison_cell *cell)
				1831	{
				1832	struct remap_info *info = context;
				1833	struct bio *bio;
				1834
				1835	while ((bio = bio_list_pop(&cell->bios))) {
				1836	if (bio_data_dir(bio) == WRITE \|\| op_is_flush(bio->bi_opf) \|\|
				1837	bio_op(bio) == REQ_OP_DISCARD)
				1838	bio_list_add(&info->defer_bios, bio);
				1839	else {
				1840	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));;
				1841
				1842	h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
				1843	inc_all_io_entry(info->tc->pool, bio);
				1844	bio_list_add(&info->issue_bios, bio);
				1845	}
				1846	}
				1847	}
				1848
				1849	static void remap_and_issue_shared_cell(struct thin_c *tc,
				1850	struct dm_bio_prison_cell *cell,
				1851	dm_block_t block)
				1852	{
				1853	struct bio *bio;
				1854	struct remap_info info;
				1855
				1856	info.tc = tc;
				1857	bio_list_init(&info.defer_bios);
				1858	bio_list_init(&info.issue_bios);
				1859
				1860	cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
				1861	&info, cell);
				1862
				1863	while ((bio = bio_list_pop(&info.defer_bios)))
				1864	thin_defer_bio(tc, bio);
				1865
				1866	while ((bio = bio_list_pop(&info.issue_bios)))
				1867	remap_and_issue(tc, bio, block);
				1868	}
				1869
				1870	static void process_shared_bio(struct thin_c tc, struct bio bio,
				1871	dm_block_t block,
				1872	struct dm_thin_lookup_result *lookup_result,
				1873	struct dm_bio_prison_cell *virt_cell)
				1874	{
				1875	struct dm_bio_prison_cell *data_cell;
				1876	struct pool *pool = tc->pool;
				1877	struct dm_cell_key key;
				1878
				1879	/*
				1880	* If cell is already occupied, then sharing is already in the process
				1881	* of being broken so we have nothing further to do here.
				1882	*/
				1883	build_data_key(tc->td, lookup_result->block, &key);
				1884	if (bio_detain(pool, &key, bio, &data_cell)) {
				1885	cell_defer_no_holder(tc, virt_cell);
				1886	return;
				1887	}
				1888
				1889	if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
				1890	break_sharing(tc, bio, block, &key, lookup_result, data_cell);
				1891	cell_defer_no_holder(tc, virt_cell);
				1892	} else {
				1893	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
				1894
				1895	h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
				1896	inc_all_io_entry(pool, bio);
				1897	remap_and_issue(tc, bio, lookup_result->block);
				1898
				1899	remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
				1900	remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
				1901	}
				1902	}
				1903
				1904	static void provision_block(struct thin_c tc, struct bio bio, dm_block_t block,
				1905	struct dm_bio_prison_cell *cell)
				1906	{
				1907	int r;
				1908	dm_block_t data_block;
				1909	struct pool *pool = tc->pool;
				1910
				1911	/*
				1912	* Remap empty bios (flushes) immediately, without provisioning.
				1913	*/
				1914	if (!bio->bi_iter.bi_size) {
				1915	inc_all_io_entry(pool, bio);
				1916	cell_defer_no_holder(tc, cell);
				1917
				1918	remap_and_issue(tc, bio, 0);
				1919	return;
				1920	}
				1921
				1922	/*
				1923	* Fill read bios with zeroes and complete them immediately.
				1924	*/
				1925	if (bio_data_dir(bio) == READ) {
				1926	zero_fill_bio(bio);
				1927	cell_defer_no_holder(tc, cell);
				1928	bio_endio(bio);
				1929	return;
				1930	}
				1931
				1932	r = alloc_data_block(tc, &data_block);
				1933	switch (r) {
				1934	case 0:
				1935	if (tc->origin_dev)
				1936	schedule_external_copy(tc, block, data_block, cell, bio);
				1937	else
				1938	schedule_zero(tc, block, data_block, cell, bio);
				1939	break;
				1940
				1941	case -ENOSPC:
				1942	retry_bios_on_resume(pool, cell);
				1943	break;
				1944
				1945	default:
				1946	DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
				1947	__func__, r);
				1948	cell_error(pool, cell);
				1949	break;
				1950	}
				1951	}
				1952
				1953	static void process_cell(struct thin_c tc, struct dm_bio_prison_cell cell)
				1954	{
				1955	int r;
				1956	struct pool *pool = tc->pool;
				1957	struct bio *bio = cell->holder;
				1958	dm_block_t block = get_bio_block(tc, bio);
				1959	struct dm_thin_lookup_result lookup_result;
				1960
				1961	if (tc->requeue_mode) {
				1962	cell_requeue(pool, cell);
				1963	return;
				1964	}
				1965
				1966	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
				1967	switch (r) {
				1968	case 0:
				1969	if (lookup_result.shared)
				1970	process_shared_bio(tc, bio, block, &lookup_result, cell);
				1971	else {
				1972	inc_all_io_entry(pool, bio);
				1973	remap_and_issue(tc, bio, lookup_result.block);
				1974	inc_remap_and_issue_cell(tc, cell, lookup_result.block);
				1975	}
				1976	break;
				1977
				1978	case -ENODATA:
				1979	if (bio_data_dir(bio) == READ && tc->origin_dev) {
				1980	inc_all_io_entry(pool, bio);
				1981	cell_defer_no_holder(tc, cell);
				1982
				1983	if (bio_end_sector(bio) <= tc->origin_size)
				1984	remap_to_origin_and_issue(tc, bio);
				1985
				1986	else if (bio->bi_iter.bi_sector < tc->origin_size) {
				1987	zero_fill_bio(bio);
				1988	bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
				1989	remap_to_origin_and_issue(tc, bio);
				1990
				1991	} else {
				1992	zero_fill_bio(bio);
				1993	bio_endio(bio);
				1994	}
				1995	} else
				1996	provision_block(tc, bio, block, cell);
				1997	break;
				1998
				1999	default:
				2000	DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
				2001	__func__, r);
				2002	cell_defer_no_holder(tc, cell);
				2003	bio_io_error(bio);
				2004	break;
				2005	}
				2006	}
				2007
				2008	static void process_bio(struct thin_c tc, struct bio bio)
				2009	{
				2010	struct pool *pool = tc->pool;
				2011	dm_block_t block = get_bio_block(tc, bio);
				2012	struct dm_bio_prison_cell *cell;
				2013	struct dm_cell_key key;
				2014
				2015	/*
				2016	* If cell is already occupied, then the block is already
				2017	* being provisioned so we have nothing further to do here.
				2018	*/
				2019	build_virtual_key(tc->td, block, &key);
				2020	if (bio_detain(pool, &key, bio, &cell))
				2021	return;
				2022
				2023	process_cell(tc, cell);
				2024	}
				2025
				2026	static void __process_bio_read_only(struct thin_c tc, struct bio bio,
				2027	struct dm_bio_prison_cell *cell)
				2028	{
				2029	int r;
				2030	int rw = bio_data_dir(bio);
				2031	dm_block_t block = get_bio_block(tc, bio);
				2032	struct dm_thin_lookup_result lookup_result;
				2033
				2034	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
				2035	switch (r) {
				2036	case 0:
				2037	if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
				2038	handle_unserviceable_bio(tc->pool, bio);
				2039	if (cell)
				2040	cell_defer_no_holder(tc, cell);
				2041	} else {
				2042	inc_all_io_entry(tc->pool, bio);
				2043	remap_and_issue(tc, bio, lookup_result.block);
				2044	if (cell)
				2045	inc_remap_and_issue_cell(tc, cell, lookup_result.block);
				2046	}
				2047	break;
				2048
				2049	case -ENODATA:
				2050	if (cell)
				2051	cell_defer_no_holder(tc, cell);
				2052	if (rw != READ) {
				2053	handle_unserviceable_bio(tc->pool, bio);
				2054	break;
				2055	}
				2056
				2057	if (tc->origin_dev) {
				2058	inc_all_io_entry(tc->pool, bio);
				2059	remap_to_origin_and_issue(tc, bio);
				2060	break;
				2061	}
				2062
				2063	zero_fill_bio(bio);
				2064	bio_endio(bio);
				2065	break;
				2066
				2067	default:
				2068	DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
				2069	__func__, r);
				2070	if (cell)
				2071	cell_defer_no_holder(tc, cell);
				2072	bio_io_error(bio);
				2073	break;
				2074	}
				2075	}
				2076
				2077	static void process_bio_read_only(struct thin_c tc, struct bio bio)
				2078	{
				2079	__process_bio_read_only(tc, bio, NULL);
				2080	}
				2081
				2082	static void process_cell_read_only(struct thin_c tc, struct dm_bio_prison_cell cell)
				2083	{
				2084	__process_bio_read_only(tc, cell->holder, cell);
				2085	}
				2086
				2087	static void process_bio_success(struct thin_c tc, struct bio bio)
				2088	{
				2089	bio_endio(bio);
				2090	}
				2091
				2092	static void process_bio_fail(struct thin_c tc, struct bio bio)
				2093	{
				2094	bio_io_error(bio);
				2095	}
				2096
				2097	static void process_cell_success(struct thin_c tc, struct dm_bio_prison_cell cell)
				2098	{
				2099	cell_success(tc->pool, cell);
				2100	}
				2101
				2102	static void process_cell_fail(struct thin_c tc, struct dm_bio_prison_cell cell)
				2103	{
				2104	cell_error(tc->pool, cell);
				2105	}
				2106
				2107	/*
				2108	* FIXME: should we also commit due to size of transaction, measured in
				2109	* metadata blocks?
				2110	*/
				2111	static int need_commit_due_to_time(struct pool *pool)
				2112	{
				2113	return !time_in_range(jiffies, pool->last_commit_jiffies,
				2114	pool->last_commit_jiffies + COMMIT_PERIOD);
				2115	}
				2116
/*
 * thin_pbd():  rb-tree node -> the struct dm_thin_endio_hook embedding it.
 * thin_bio():  struct dm_thin_endio_hook -> the bio it is per-bio data of.
 */
#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
				2119
				2120	static void __thin_bio_rb_add(struct thin_c tc, struct bio bio)
				2121	{
				2122	struct rb_node *rbp, parent;
				2123	struct dm_thin_endio_hook *pbd;
				2124	sector_t bi_sector = bio->bi_iter.bi_sector;
				2125
				2126	rbp = &tc->sort_bio_list.rb_node;
				2127	parent = NULL;
				2128	while (*rbp) {
				2129	parent = *rbp;
				2130	pbd = thin_pbd(parent);
				2131
				2132	if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
				2133	rbp = &(*rbp)->rb_left;
				2134	else
				2135	rbp = &(*rbp)->rb_right;
				2136	}
				2137
				2138	pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
				2139	rb_link_node(&pbd->rb_node, parent, rbp);
				2140	rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
				2141	}
				2142
				2143	static void __extract_sorted_bios(struct thin_c *tc)
				2144	{
				2145	struct rb_node *node;
				2146	struct dm_thin_endio_hook *pbd;
				2147	struct bio *bio;
				2148
				2149	for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
				2150	pbd = thin_pbd(node);
				2151	bio = thin_bio(pbd);
				2152
				2153	bio_list_add(&tc->deferred_bio_list, bio);
				2154	rb_erase(&pbd->rb_node, &tc->sort_bio_list);
				2155	}
				2156
				2157	WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
				2158	}
				2159
				2160	static void __sort_thin_deferred_bios(struct thin_c *tc)
				2161	{
				2162	struct bio *bio;
				2163	struct bio_list bios;
				2164
				2165	bio_list_init(&bios);
				2166	bio_list_merge(&bios, &tc->deferred_bio_list);
				2167	bio_list_init(&tc->deferred_bio_list);
				2168
				2169	/* Sort deferred_bio_list using rb-tree */
				2170	while ((bio = bio_list_pop(&bios)))
				2171	__thin_bio_rb_add(tc, bio);
				2172
				2173	/*
				2174	* Transfer the sorted bios in sort_bio_list back to
				2175	* deferred_bio_list to allow lockless submission of
				2176	* all bios.
				2177	*/
				2178	__extract_sorted_bios(tc);
				2179	}
				2180
				2181	static void process_thin_deferred_bios(struct thin_c *tc)
				2182	{
				2183	struct pool *pool = tc->pool;
				2184	unsigned long flags;
				2185	struct bio *bio;
				2186	struct bio_list bios;
				2187	struct blk_plug plug;
				2188	unsigned count = 0;
				2189
				2190	if (tc->requeue_mode) {
				2191	error_thin_bio_list(tc, &tc->deferred_bio_list,
				2192	BLK_STS_DM_REQUEUE);
				2193	return;
				2194	}
				2195
				2196	bio_list_init(&bios);
				2197
				2198	spin_lock_irqsave(&tc->lock, flags);
				2199
				2200	if (bio_list_empty(&tc->deferred_bio_list)) {
				2201	spin_unlock_irqrestore(&tc->lock, flags);
				2202	return;
				2203	}
				2204
				2205	__sort_thin_deferred_bios(tc);
				2206
				2207	bio_list_merge(&bios, &tc->deferred_bio_list);
				2208	bio_list_init(&tc->deferred_bio_list);
				2209
				2210	spin_unlock_irqrestore(&tc->lock, flags);
				2211
				2212	blk_start_plug(&plug);
				2213	while ((bio = bio_list_pop(&bios))) {
				2214	/*
				2215	* If we've got no free new_mapping structs, and processing
				2216	* this bio might require one, we pause until there are some
				2217	* prepared mappings to process.
				2218	*/
				2219	if (ensure_next_mapping(pool)) {
				2220	spin_lock_irqsave(&tc->lock, flags);
				2221	bio_list_add(&tc->deferred_bio_list, bio);
				2222	bio_list_merge(&tc->deferred_bio_list, &bios);
				2223	spin_unlock_irqrestore(&tc->lock, flags);
				2224	break;
				2225	}
				2226
				2227	if (bio_op(bio) == REQ_OP_DISCARD)
				2228	pool->process_discard(tc, bio);
				2229	else
				2230	pool->process_bio(tc, bio);
				2231
				2232	if ((count++ & 127) == 0) {
				2233	throttle_work_update(&pool->throttle);
				2234	dm_pool_issue_prefetches(pool->pmd);
				2235	}
				2236	}
				2237	blk_finish_plug(&plug);
				2238	}
				2239
				2240	static int cmp_cells(const void lhs, const void rhs)
				2241	{
				2242	struct dm_bio_prison_cell lhs_cell = ((struct dm_bio_prison_cell **) lhs);
				2243	struct dm_bio_prison_cell rhs_cell = ((struct dm_bio_prison_cell **) rhs);
				2244
				2245	BUG_ON(!lhs_cell->holder);
				2246	BUG_ON(!rhs_cell->holder);
				2247
				2248	if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
				2249	return -1;
				2250
				2251	if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
				2252	return 1;
				2253
				2254	return 0;
				2255	}
				2256
				2257	static unsigned sort_cells(struct pool pool, struct list_head cells)
				2258	{
				2259	unsigned count = 0;
				2260	struct dm_bio_prison_cell cell, tmp;
				2261
				2262	list_for_each_entry_safe(cell, tmp, cells, user_list) {
				2263	if (count >= CELL_SORT_ARRAY_SIZE)
				2264	break;
				2265
				2266	pool->cell_sort_array[count++] = cell;
				2267	list_del(&cell->user_list);
				2268	}
				2269
				2270	sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
				2271
				2272	return count;
				2273	}
				2274
				2275	static void process_thin_deferred_cells(struct thin_c *tc)
				2276	{
				2277	struct pool *pool = tc->pool;
				2278	unsigned long flags;
				2279	struct list_head cells;
				2280	struct dm_bio_prison_cell *cell;
				2281	unsigned i, j, count;
				2282
				2283	INIT_LIST_HEAD(&cells);
				2284
				2285	spin_lock_irqsave(&tc->lock, flags);
				2286	list_splice_init(&tc->deferred_cells, &cells);
				2287	spin_unlock_irqrestore(&tc->lock, flags);
				2288
				2289	if (list_empty(&cells))
				2290	return;
				2291
				2292	do {
				2293	count = sort_cells(tc->pool, &cells);
				2294
				2295	for (i = 0; i < count; i++) {
				2296	cell = pool->cell_sort_array[i];
				2297	BUG_ON(!cell->holder);
				2298
				2299	/*
				2300	* If we've got no free new_mapping structs, and processing
				2301	* this bio might require one, we pause until there are some
				2302	* prepared mappings to process.
				2303	*/
				2304	if (ensure_next_mapping(pool)) {
				2305	for (j = i; j < count; j++)
				2306	list_add(&pool->cell_sort_array[j]->user_list, &cells);
				2307
				2308	spin_lock_irqsave(&tc->lock, flags);
				2309	list_splice(&cells, &tc->deferred_cells);
				2310	spin_unlock_irqrestore(&tc->lock, flags);
				2311	return;
				2312	}
				2313
				2314	if (bio_op(cell->holder) == REQ_OP_DISCARD)
				2315	pool->process_discard_cell(tc, cell);
				2316	else
				2317	pool->process_cell(tc, cell);
				2318	}
				2319	} while (!list_empty(&cells));
				2320	}
				2321
/* Forward declarations: used by get_first_thin()/get_next_thin() below. */
static void thin_get(struct thin_c *tc);
static void thin_put(struct thin_c *tc);
				2324
				2325	/*
				2326	* We can't hold rcu_read_lock() around code that can block. So we
				2327	* find a thin with the rcu lock held; bump a refcount; then drop
				2328	* the lock.
				2329	*/
				2330	static struct thin_c get_first_thin(struct pool pool)
				2331	{
				2332	struct thin_c *tc = NULL;
				2333
				2334	rcu_read_lock();
				2335	if (!list_empty(&pool->active_thins)) {
				2336	tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
				2337	thin_get(tc);
				2338	}
				2339	rcu_read_unlock();
				2340
				2341	return tc;
				2342	}
				2343
				2344	static struct thin_c get_next_thin(struct pool pool, struct thin_c *tc)
				2345	{
				2346	struct thin_c *old_tc = tc;
				2347
				2348	rcu_read_lock();
				2349	list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
				2350	thin_get(tc);
				2351	thin_put(old_tc);
				2352	rcu_read_unlock();
				2353	return tc;
				2354	}
				2355	thin_put(old_tc);
				2356	rcu_read_unlock();
				2357
				2358	return NULL;
				2359	}
				2360
				2361	static void process_deferred_bios(struct pool *pool)
				2362	{
				2363	unsigned long flags;
				2364	struct bio *bio;
				2365	struct bio_list bios, bio_completions;
				2366	struct thin_c *tc;
				2367
				2368	tc = get_first_thin(pool);
				2369	while (tc) {
				2370	process_thin_deferred_cells(tc);
				2371	process_thin_deferred_bios(tc);
				2372	tc = get_next_thin(pool, tc);
				2373	}
				2374
				2375	/*
				2376	* If there are any deferred flush bios, we must commit the metadata
				2377	* before issuing them or signaling their completion.
				2378	*/
				2379	bio_list_init(&bios);
				2380	bio_list_init(&bio_completions);
				2381
				2382	spin_lock_irqsave(&pool->lock, flags);
				2383	bio_list_merge(&bios, &pool->deferred_flush_bios);
				2384	bio_list_init(&pool->deferred_flush_bios);
				2385
				2386	bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
				2387	bio_list_init(&pool->deferred_flush_completions);
				2388	spin_unlock_irqrestore(&pool->lock, flags);
				2389
				2390	if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
				2391	!(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
				2392	return;
				2393
				2394	if (commit(pool)) {
				2395	bio_list_merge(&bios, &bio_completions);
				2396
				2397	while ((bio = bio_list_pop(&bios)))
				2398	bio_io_error(bio);
				2399	return;
				2400	}
				2401	pool->last_commit_jiffies = jiffies;
				2402
				2403	while ((bio = bio_list_pop(&bio_completions)))
				2404	bio_endio(bio);
				2405
				2406	while ((bio = bio_list_pop(&bios)))
				2407	generic_make_request(bio);
				2408	}
				2409
/*
 * Work handler for pool->worker.
 *
 * One pass runs, in this order: metadata prefetches, the prepared
 * mappings, the prepared discards, the second-stage prepared discards
 * and finally the deferred bios.  The throttle is started first,
 * updated between stages and completed at the end.
 */
static void do_worker(struct work_struct *ws)
{
	struct pool *pool = container_of(ws, struct pool, worker);

	throttle_work_start(&pool->throttle);
	dm_pool_issue_prefetches(pool->pmd);
	throttle_work_update(&pool->throttle);
	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
	throttle_work_update(&pool->throttle);
	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
	throttle_work_update(&pool->throttle);
	process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2);
	throttle_work_update(&pool->throttle);
	process_deferred_bios(pool);
	throttle_work_complete(&pool->throttle);
}
				2426
				2427	/*
				2428	* We want to commit periodically so that not too much
				2429	* unwritten data builds up.
				2430	*/
				2431	static void do_waker(struct work_struct *ws)
				2432	{
				2433	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
				2434	wake_worker(pool);
				2435	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
				2436	}
				2437
				2438	/*
				2439	* We're holding onto IO to allow userland time to react. After the
				2440	* timeout either the pool will have been resized (and thus back in
				2441	* PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
				2442	*/
				2443	static void do_no_space_timeout(struct work_struct *ws)
				2444	{
				2445	struct pool *pool = container_of(to_delayed_work(ws), struct pool,
				2446	no_space_timeout);
				2447
				2448	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
				2449	pool->pf.error_if_no_space = true;
				2450	notify_of_pool_mode_change(pool);
				2451	error_retry_list_with_code(pool, BLK_STS_NOSPC);
				2452	}
				2453	}
				2454
/*----------------------------------------------------------------*/
				2456
/*
 * A work item paired with a completion, so the submitter can wait for
 * the handler to finish (see pool_work_wait()/pool_work_complete()).
 */
struct pool_work {
	struct work_struct worker;	/* queued on pool->wq by pool_work_wait() */
	struct completion complete;	/* signalled by pool_work_complete() */
};
				2461
				2462	static struct pool_work to_pool_work(struct work_struct ws)
				2463	{
				2464	return container_of(ws, struct pool_work, worker);
				2465	}
				2466
/* Signal the completion that pool_work_wait() is waiting on. */
static void pool_work_complete(struct pool_work *pw)
{
	complete(&pw->complete);
}
				2471
				2472	static void pool_work_wait(struct pool_work pw, struct pool pool,
				2473	void (fn)(struct work_struct ))
				2474	{
				2475	INIT_WORK_ONSTACK(&pw->worker, fn);
				2476	init_completion(&pw->complete);
				2477	queue_work(pool->wq, &pw->worker);
				2478	wait_for_completion(&pw->complete);
				2479	}
				2480
/*----------------------------------------------------------------*/
				2482
/* A pool_work that carries the thin device its handler operates on. */
struct noflush_work {
	struct pool_work pw;
	struct thin_c *tc;	/* thin whose requeue_mode the handlers toggle */
};
				2487
				2488	static struct noflush_work to_noflush(struct work_struct ws)
				2489	{
				2490	return container_of(to_pool_work(ws), struct noflush_work, pw);
				2491	}
				2492
				2493	static void do_noflush_start(struct work_struct *ws)
				2494	{
				2495	struct noflush_work *w = to_noflush(ws);
				2496	w->tc->requeue_mode = true;
				2497	requeue_io(w->tc);
				2498	pool_work_complete(&w->pw);
				2499	}
				2500
				2501	static void do_noflush_stop(struct work_struct *ws)
				2502	{
				2503	struct noflush_work *w = to_noflush(ws);
				2504	w->tc->requeue_mode = false;
				2505	pool_work_complete(&w->pw);
				2506	}
				2507
				2508	static void noflush_work(struct thin_c tc, void (fn)(struct work_struct *))
				2509	{
				2510	struct noflush_work w;
				2511
				2512	w.tc = tc;
				2513	pool_work_wait(&w.pw, tc->pool, fn);
				2514	}
				2515
/*----------------------------------------------------------------*/
				2517
/* Returns the discard_passdown flag from the pool target's adjusted features. */
static bool passdown_enabled(struct pool_c *pt)
{
	return pt->adjusted_pf.discard_passdown;
}
				2522
				2523	static void set_discard_callbacks(struct pool *pool)
				2524	{
				2525	struct pool_c *pt = pool->ti->private;
				2526
				2527	if (passdown_enabled(pt)) {
				2528	pool->process_discard_cell = process_discard_cell_passdown;
				2529	pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
				2530	pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
				2531	} else {
				2532	pool->process_discard_cell = process_discard_cell_no_passdown;
				2533	pool->process_prepared_discard = process_prepared_discard_no_passdown;
				2534	}
				2535	}
				2536
/*
 * Switch the pool to @new_mode (possibly overridden, see below) and
 * install the matching set of process_* handlers.
 *
 * The requested mode is overridden in two cases: PM_WRITE is refused
 * while the metadata's needs_check flag is set, and a pool already in
 * PM_FAIL stays in PM_FAIL.  The resulting mode is stored in both
 * pool->pf.mode and pt->adjusted_pf.mode, and
 * notify_of_pool_mode_change() is called if it differs from the old mode.
 */
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
	struct pool_c *pt = pool->ti->private;
	bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
	enum pool_mode old_mode = get_pool_mode(pool);
	/* Snapshot of the no_space_timeout_secs variable, converted to jiffies. */
	unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;

	/*
	 * Never allow the pool to transition to PM_WRITE mode if user
	 * intervention is required to verify metadata and data consistency.
	 */
	if (new_mode == PM_WRITE && needs_check) {
		DMERR("%s: unable to switch pool to write mode until repaired.",
		      dm_device_name(pool->pool_md));
		/* Keep the current mode, unless that is PM_WRITE itself. */
		if (old_mode != new_mode)
			new_mode = old_mode;
		else
			new_mode = PM_READ_ONLY;
	}
	/*
	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
	 * not going to recover without a thin_repair.	So we never let the
	 * pool move out of the old mode.
	 */
	if (old_mode == PM_FAIL)
		new_mode = old_mode;

	switch (new_mode) {
	case PM_FAIL:
		/* Every handler fails its bio/cell/mapping. */
		dm_pool_metadata_read_only(pool->pmd);
		pool->process_bio = process_bio_fail;
		pool->process_discard = process_bio_fail;
		pool->process_cell = process_cell_fail;
		pool->process_discard_cell = process_cell_fail;
		pool->process_prepared_mapping = process_prepared_mapping_fail;
		pool->process_prepared_discard = process_prepared_discard_fail;

		error_retry_list(pool);
		break;

	case PM_OUT_OF_METADATA_SPACE:
	case PM_READ_ONLY:
		/* Read-only handlers; discards are completed as successes. */
		dm_pool_metadata_read_only(pool->pmd);
		pool->process_bio = process_bio_read_only;
		pool->process_discard = process_bio_success;
		pool->process_cell = process_cell_read_only;
		pool->process_discard_cell = process_cell_success;
		pool->process_prepared_mapping = process_prepared_mapping_fail;
		pool->process_prepared_discard = process_prepared_discard_success;

		error_retry_list(pool);
		break;

	case PM_OUT_OF_DATA_SPACE:
		/*
		 * Ideally we'd never hit this state; the low water mark
		 * would trigger userland to extend the pool before we
		 * completely run out of data space.  However, many small
		 * IOs to unprovisioned space can consume data space at an
		 * alarming rate.  Adjust your low water mark if you're
		 * frequently seeing this mode.
		 */
		pool->out_of_data_space = true;
		pool->process_bio = process_bio_read_only;
		pool->process_discard = process_discard_bio;
		pool->process_cell = process_cell_read_only;
		pool->process_prepared_mapping = process_prepared_mapping;
		set_discard_callbacks(pool);

		/* Arm the no-space timer unless IO is already being errored. */
		if (!pool->pf.error_if_no_space && no_space_timeout)
			queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
		break;

	case PM_WRITE:
		if (old_mode == PM_OUT_OF_DATA_SPACE)
			cancel_delayed_work_sync(&pool->no_space_timeout);
		pool->out_of_data_space = false;
		/* Restore the value the table originally requested. */
		pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
		dm_pool_metadata_read_write(pool->pmd);
		pool->process_bio = process_bio;
		pool->process_discard = process_discard_bio;
		pool->process_cell = process_cell;
		pool->process_prepared_mapping = process_prepared_mapping;
		set_discard_callbacks(pool);
		break;
	}

	pool->pf.mode = new_mode;
	/*
	 * The pool mode may have changed, sync it so bind_control_target()
	 * doesn't cause an unexpected mode transition on resume.
	 */
	pt->adjusted_pf.mode = new_mode;

	if (old_mode != new_mode)
		notify_of_pool_mode_change(pool);
}
				2634
				2635	static void abort_transaction(struct pool *pool)
				2636	{
				2637	const char *dev_name = dm_device_name(pool->pool_md);
				2638
				2639	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
				2640	if (dm_pool_abort_metadata(pool->pmd)) {
				2641	DMERR("%s: failed to abort metadata transaction", dev_name);
				2642	set_pool_mode(pool, PM_FAIL);
				2643	}
				2644
				2645	if (dm_pool_metadata_set_needs_check(pool->pmd)) {
				2646	DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
				2647	set_pool_mode(pool, PM_FAIL);
				2648	}
				2649	}
				2650
				2651	static void metadata_operation_failed(struct pool pool, const char op, int r)
				2652	{
				2653	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
				2654	dm_device_name(pool->pool_md), op, r);
				2655
				2656	abort_transaction(pool);
				2657	set_pool_mode(pool, PM_READ_ONLY);
				2658	}
				2659
/*----------------------------------------------------------------*/
				2661
				2662	/*
				2663	* Mapping functions.
				2664	*/
				2665
				2666	/*
				2667	* Called only while mapping a thin bio to hand it over to the workqueue.
				2668	*/
				2669	static void thin_defer_bio(struct thin_c tc, struct bio bio)
				2670	{
				2671	unsigned long flags;
				2672	struct pool *pool = tc->pool;
				2673
				2674	spin_lock_irqsave(&tc->lock, flags);
				2675	bio_list_add(&tc->deferred_bio_list, bio);
				2676	spin_unlock_irqrestore(&tc->lock, flags);
				2677
				2678	wake_worker(pool);
				2679	}
				2680
				2681	static void thin_defer_bio_with_throttle(struct thin_c tc, struct bio bio)
				2682	{
				2683	struct pool *pool = tc->pool;
				2684
				2685	throttle_lock(&pool->throttle);
				2686	thin_defer_bio(tc, bio);
				2687	throttle_unlock(&pool->throttle);
				2688	}
				2689
				2690	static void thin_defer_cell(struct thin_c tc, struct dm_bio_prison_cell cell)
				2691	{
				2692	unsigned long flags;
				2693	struct pool *pool = tc->pool;
				2694
				2695	throttle_lock(&pool->throttle);
				2696	spin_lock_irqsave(&tc->lock, flags);
				2697	list_add_tail(&cell->user_list, &tc->deferred_cells);
				2698	spin_unlock_irqrestore(&tc->lock, flags);
				2699	throttle_unlock(&pool->throttle);
				2700
				2701	wake_worker(pool);
				2702	}
				2703
				2704	static void thin_hook_bio(struct thin_c tc, struct bio bio)
				2705	{
				2706	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
				2707
				2708	h->tc = tc;
				2709	h->shared_read_entry = NULL;
				2710	h->all_io_entry = NULL;
				2711	h->overwrite_mapping = NULL;
				2712	h->cell = NULL;
				2713	}
				2714
				2715	/*
				2716	* Non-blocking function called from the thin target's map function.
				2717	*/
				2718	static int thin_bio_map(struct dm_target ti, struct bio bio)
				2719	{
				2720	int r;
				2721	struct thin_c *tc = ti->private;
				2722	dm_block_t block = get_bio_block(tc, bio);
				2723	struct dm_thin_device *td = tc->td;
				2724	struct dm_thin_lookup_result result;
				2725	struct dm_bio_prison_cell virt_cell, data_cell;
				2726	struct dm_cell_key key;
				2727
				2728	thin_hook_bio(tc, bio);
				2729
				2730	if (tc->requeue_mode) {
				2731	bio->bi_status = BLK_STS_DM_REQUEUE;
				2732	bio_endio(bio);
				2733	return DM_MAPIO_SUBMITTED;
				2734	}
				2735
				2736	if (get_pool_mode(tc->pool) == PM_FAIL) {
				2737	bio_io_error(bio);
				2738	return DM_MAPIO_SUBMITTED;
				2739	}
				2740
				2741	if (op_is_flush(bio->bi_opf) \|\| bio_op(bio) == REQ_OP_DISCARD) {
				2742	thin_defer_bio_with_throttle(tc, bio);
				2743	return DM_MAPIO_SUBMITTED;
				2744	}
				2745
				2746	/*
				2747	* We must hold the virtual cell before doing the lookup, otherwise
				2748	* there's a race with discard.
				2749	*/
				2750	build_virtual_key(tc->td, block, &key);
				2751	if (bio_detain(tc->pool, &key, bio, &virt_cell))
				2752	return DM_MAPIO_SUBMITTED;
				2753
				2754	r = dm_thin_find_block(td, block, 0, &result);
				2755
				2756	/*
				2757	* Note that we defer readahead too.
				2758	*/
				2759	switch (r) {
				2760	case 0:
				2761	if (unlikely(result.shared)) {
				2762	/*
				2763	* We have a race condition here between the
				2764	* result.shared value returned by the lookup and
				2765	* snapshot creation, which may cause new
				2766	* sharing.
				2767	*
				2768	* To avoid this always quiesce the origin before
				2769	* taking the snap. You want to do this anyway to
				2770	* ensure a consistent application view
				2771	* (i.e. lockfs).
				2772	*
				2773	* More distant ancestors are irrelevant. The
				2774	* shared flag will be set in their case.
				2775	*/
				2776	thin_defer_cell(tc, virt_cell);
				2777	return DM_MAPIO_SUBMITTED;
				2778	}
				2779
				2780	build_data_key(tc->td, result.block, &key);
				2781	if (bio_detain(tc->pool, &key, bio, &data_cell)) {
				2782	cell_defer_no_holder(tc, virt_cell);
				2783	return DM_MAPIO_SUBMITTED;
				2784	}
				2785
				2786	inc_all_io_entry(tc->pool, bio);
				2787	cell_defer_no_holder(tc, data_cell);
				2788	cell_defer_no_holder(tc, virt_cell);
				2789
				2790	remap(tc, bio, result.block);
				2791	return DM_MAPIO_REMAPPED;
				2792
				2793	case -ENODATA:
				2794	case -EWOULDBLOCK:
				2795	thin_defer_cell(tc, virt_cell);
				2796	return DM_MAPIO_SUBMITTED;
				2797
				2798	default:
				2799	/*
				2800	* Must always call bio_io_error on failure.
				2801	* dm_thin_find_block can fail with -EINVAL if the
				2802	* pool is switched to fail-io mode.
				2803	*/
				2804	bio_io_error(bio);
				2805	cell_defer_no_holder(tc, virt_cell);
				2806	return DM_MAPIO_SUBMITTED;
				2807	}
				2808	}
				2809
				2810	static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
				2811	{
				2812	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
				2813	struct request_queue *q;
				2814
				2815	if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
				2816	return 1;
				2817
				2818	q = bdev_get_queue(pt->data_dev->bdev);
				2819	return bdi_congested(q->backing_dev_info, bdi_bits);
				2820	}
				2821
				2822	static void requeue_bios(struct pool *pool)
				2823	{
				2824	unsigned long flags;
				2825	struct thin_c *tc;
				2826
				2827	rcu_read_lock();
				2828	list_for_each_entry_rcu(tc, &pool->active_thins, list) {
				2829	spin_lock_irqsave(&tc->lock, flags);
				2830	bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
				2831	bio_list_init(&tc->retry_on_resume_list);
				2832	spin_unlock_irqrestore(&tc->lock, flags);
				2833	}
				2834	rcu_read_unlock();
				2835	}
				2836
/*----------------------------------------------------------------
 * Binding of control targets to a pool object
 *--------------------------------------------------------------*/
				2840	static bool data_dev_supports_discard(struct pool_c *pt)
				2841	{
				2842	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
				2843
				2844	return q && blk_queue_discard(q);
				2845	}
				2846
				2847	static bool is_factor(sector_t block_size, uint32_t n)
				2848	{
				2849	return !sector_div(block_size, n);
				2850	}
				2851
				2852	/*
				2853	* If discard_passdown was enabled verify that the data device
				2854	* supports discards. Disable discard_passdown if not.
				2855	*/
				2856	static void disable_passdown_if_not_supported(struct pool_c *pt)
				2857	{
				2858	struct pool *pool = pt->pool;
				2859	struct block_device *data_bdev = pt->data_dev->bdev;
				2860	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
				2861	const char *reason = NULL;
				2862	char buf[BDEVNAME_SIZE];
				2863
				2864	if (!pt->adjusted_pf.discard_passdown)
				2865	return;
				2866
				2867	if (!data_dev_supports_discard(pt))
				2868	reason = "discard unsupported";
				2869
				2870	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
				2871	reason = "max discard sectors smaller than a block";
				2872
				2873	if (reason) {
				2874	DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
				2875	pt->adjusted_pf.discard_passdown = false;
				2876	}
				2877	}
				2878
				2879	static int bind_control_target(struct pool pool, struct dm_target ti)
				2880	{
				2881	struct pool_c *pt = ti->private;
				2882
				2883	/*
				2884	* We want to make sure that a pool in PM_FAIL mode is never upgraded.
				2885	*/
				2886	enum pool_mode old_mode = get_pool_mode(pool);
				2887	enum pool_mode new_mode = pt->adjusted_pf.mode;
				2888
				2889	/*
				2890	* Don't change the pool's mode until set_pool_mode() below.
				2891	* Otherwise the pool's process_* function pointers may
				2892	* not match the desired pool mode.
				2893	*/
				2894	pt->adjusted_pf.mode = old_mode;
				2895
				2896	pool->ti = ti;
				2897	pool->pf = pt->adjusted_pf;
				2898	pool->low_water_blocks = pt->low_water_blocks;
				2899
				2900	set_pool_mode(pool, new_mode);
				2901
				2902	return 0;
				2903	}
				2904
				2905	static void unbind_control_target(struct pool pool, struct dm_target ti)
				2906	{
				2907	if (pool->ti == ti)
				2908	pool->ti = NULL;
				2909	}
				2910
/*----------------------------------------------------------------
 * Pool creation
 *--------------------------------------------------------------*/
				2914	/* Initialize pool features. */
				2915	static void pool_features_init(struct pool_features *pf)
				2916	{
				2917	pf->mode = PM_WRITE;
				2918	pf->zero_new_blocks = true;
				2919	pf->discard_enabled = true;
				2920	pf->discard_passdown = true;
				2921	pf->error_if_no_space = false;
				2922	}
				2923
				2924	static void __pool_destroy(struct pool *pool)
				2925	{
				2926	__pool_table_remove(pool);
				2927
				2928	vfree(pool->cell_sort_array);
				2929	if (dm_pool_metadata_close(pool->pmd) < 0)
				2930	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
				2931
				2932	dm_bio_prison_destroy(pool->prison);
				2933	dm_kcopyd_client_destroy(pool->copier);
				2934
				2935	if (pool->wq)
				2936	destroy_workqueue(pool->wq);
				2937
				2938	if (pool->next_mapping)
				2939	mempool_free(pool->next_mapping, pool->mapping_pool);
				2940	mempool_destroy(pool->mapping_pool);
				2941	dm_deferred_set_destroy(pool->shared_read_ds);
				2942	dm_deferred_set_destroy(pool->all_io_ds);
				2943	kfree(pool);
				2944	}
				2945
				2946	static struct kmem_cache *_new_mapping_cache;
				2947
				2948	static struct pool pool_create(struct mapped_device pool_md,
				2949	struct block_device *metadata_dev,
				2950	unsigned long block_size,
				2951	int read_only, char **error)
				2952	{
				2953	int r;
				2954	void *err_p;
				2955	struct pool *pool;
				2956	struct dm_pool_metadata *pmd;
				2957	bool format_device = read_only ? false : true;
				2958
				2959	pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
				2960	if (IS_ERR(pmd)) {
				2961	*error = "Error creating metadata object";
				2962	return (struct pool *)pmd;
				2963	}
				2964
				2965	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
				2966	if (!pool) {
				2967	*error = "Error allocating memory for pool";
				2968	err_p = ERR_PTR(-ENOMEM);
				2969	goto bad_pool;
				2970	}
				2971
				2972	pool->pmd = pmd;
				2973	pool->sectors_per_block = block_size;
				2974	if (block_size & (block_size - 1))
				2975	pool->sectors_per_block_shift = -1;
				2976	else
				2977	pool->sectors_per_block_shift = __ffs(block_size);
				2978	pool->low_water_blocks = 0;
				2979	pool_features_init(&pool->pf);
				2980	pool->prison = dm_bio_prison_create();
				2981	if (!pool->prison) {
				2982	*error = "Error creating pool's bio prison";
				2983	err_p = ERR_PTR(-ENOMEM);
				2984	goto bad_prison;
				2985	}
				2986
				2987	pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
				2988	if (IS_ERR(pool->copier)) {
				2989	r = PTR_ERR(pool->copier);
				2990	*error = "Error creating pool's kcopyd client";
				2991	err_p = ERR_PTR(r);
				2992	goto bad_kcopyd_client;
				2993	}
				2994
				2995	/*
				2996	* Create singlethreaded workqueue that will service all devices
				2997	* that use this metadata.
				2998	*/
				2999	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
				3000	if (!pool->wq) {
				3001	*error = "Error creating pool's workqueue";
				3002	err_p = ERR_PTR(-ENOMEM);
				3003	goto bad_wq;
				3004	}
				3005
				3006	throttle_init(&pool->throttle);
				3007	INIT_WORK(&pool->worker, do_worker);
				3008	INIT_DELAYED_WORK(&pool->waker, do_waker);
				3009	INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
				3010	spin_lock_init(&pool->lock);
				3011	bio_list_init(&pool->deferred_flush_bios);
				3012	bio_list_init(&pool->deferred_flush_completions);
				3013	INIT_LIST_HEAD(&pool->prepared_mappings);
				3014	INIT_LIST_HEAD(&pool->prepared_discards);
				3015	INIT_LIST_HEAD(&pool->prepared_discards_pt2);
				3016	INIT_LIST_HEAD(&pool->active_thins);
				3017	pool->low_water_triggered = false;
				3018	pool->suspended = true;
				3019	pool->out_of_data_space = false;
				3020
				3021	pool->shared_read_ds = dm_deferred_set_create();
				3022	if (!pool->shared_read_ds) {
				3023	*error = "Error creating pool's shared read deferred set";
				3024	err_p = ERR_PTR(-ENOMEM);
				3025	goto bad_shared_read_ds;
				3026	}
				3027
				3028	pool->all_io_ds = dm_deferred_set_create();
				3029	if (!pool->all_io_ds) {
				3030	*error = "Error creating pool's all io deferred set";
				3031	err_p = ERR_PTR(-ENOMEM);
				3032	goto bad_all_io_ds;
				3033	}
				3034
				3035	pool->next_mapping = NULL;
				3036	pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
				3037	_new_mapping_cache);
				3038	if (!pool->mapping_pool) {
				3039	*error = "Error creating pool's mapping mempool";
				3040	err_p = ERR_PTR(-ENOMEM);
				3041	goto bad_mapping_pool;
				3042	}
				3043
				3044	pool->cell_sort_array = vmalloc(sizeof(pool->cell_sort_array) CELL_SORT_ARRAY_SIZE);
				3045	if (!pool->cell_sort_array) {
				3046	*error = "Error allocating cell sort array";
				3047	err_p = ERR_PTR(-ENOMEM);
				3048	goto bad_sort_array;
				3049	}
				3050
				3051	pool->ref_count = 1;
				3052	pool->last_commit_jiffies = jiffies;
				3053	pool->pool_md = pool_md;
				3054	pool->md_dev = metadata_dev;
				3055	__pool_table_insert(pool);
				3056
				3057	return pool;
				3058
				3059	bad_sort_array:
				3060	mempool_destroy(pool->mapping_pool);
				3061	bad_mapping_pool:
				3062	dm_deferred_set_destroy(pool->all_io_ds);
				3063	bad_all_io_ds:
				3064	dm_deferred_set_destroy(pool->shared_read_ds);
				3065	bad_shared_read_ds:
				3066	destroy_workqueue(pool->wq);
				3067	bad_wq:
				3068	dm_kcopyd_client_destroy(pool->copier);
				3069	bad_kcopyd_client:
				3070	dm_bio_prison_destroy(pool->prison);
				3071	bad_prison:
				3072	kfree(pool);
				3073	bad_pool:
				3074	if (dm_pool_metadata_close(pmd))
				3075	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
				3076
				3077	return err_p;
				3078	}
				3079
				3080	static void __pool_inc(struct pool *pool)
				3081	{
				3082	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				3083	pool->ref_count++;
				3084	}
				3085
				3086	static void __pool_dec(struct pool *pool)
				3087	{
				3088	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				3089	BUG_ON(!pool->ref_count);
				3090	if (!--pool->ref_count)
				3091	__pool_destroy(pool);
				3092	}
				3093
				3094	static struct pool __pool_find(struct mapped_device pool_md,
				3095	struct block_device *metadata_dev,
				3096	unsigned long block_size, int read_only,
				3097	char *error, int created)
				3098	{
				3099	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
				3100
				3101	if (pool) {
				3102	if (pool->pool_md != pool_md) {
				3103	*error = "metadata device already in use by a pool";
				3104	return ERR_PTR(-EBUSY);
				3105	}
				3106	__pool_inc(pool);
				3107
				3108	} else {
				3109	pool = __pool_table_lookup(pool_md);
				3110	if (pool) {
				3111	if (pool->md_dev != metadata_dev) {
				3112	*error = "different pool cannot replace a pool";
				3113	return ERR_PTR(-EINVAL);
				3114	}
				3115	__pool_inc(pool);
				3116
				3117	} else {
				3118	pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
				3119	*created = 1;
				3120	}
				3121	}
				3122
				3123	return pool;
				3124	}
				3125
/*----------------------------------------------------------------
 * Pool target methods
 *--------------------------------------------------------------*/
				3129	static void pool_dtr(struct dm_target *ti)
				3130	{
				3131	struct pool_c *pt = ti->private;
				3132
				3133	mutex_lock(&dm_thin_pool_table.mutex);
				3134
				3135	unbind_control_target(pt->pool, ti);
				3136	__pool_dec(pt->pool);
				3137	dm_put_device(ti, pt->metadata_dev);
				3138	dm_put_device(ti, pt->data_dev);
				3139	kfree(pt);
				3140
				3141	mutex_unlock(&dm_thin_pool_table.mutex);
				3142	}
				3143
				3144	static int parse_pool_features(struct dm_arg_set as, struct pool_features pf,
				3145	struct dm_target *ti)
				3146	{
				3147	int r;
				3148	unsigned argc;
				3149	const char *arg_name;
				3150
				3151	static const struct dm_arg _args[] = {
				3152	{0, 4, "Invalid number of pool feature arguments"},
				3153	};
				3154
				3155	/*
				3156	* No feature arguments supplied.
				3157	*/
				3158	if (!as->argc)
				3159	return 0;
				3160
				3161	r = dm_read_arg_group(_args, as, &argc, &ti->error);
				3162	if (r)
				3163	return -EINVAL;
				3164
				3165	while (argc && !r) {
				3166	arg_name = dm_shift_arg(as);
				3167	argc--;
				3168
				3169	if (!strcasecmp(arg_name, "skip_block_zeroing"))
				3170	pf->zero_new_blocks = false;
				3171
				3172	else if (!strcasecmp(arg_name, "ignore_discard"))
				3173	pf->discard_enabled = false;
				3174
				3175	else if (!strcasecmp(arg_name, "no_discard_passdown"))
				3176	pf->discard_passdown = false;
				3177
				3178	else if (!strcasecmp(arg_name, "read_only"))
				3179	pf->mode = PM_READ_ONLY;
				3180
				3181	else if (!strcasecmp(arg_name, "error_if_no_space"))
				3182	pf->error_if_no_space = true;
				3183
				3184	else {
				3185	ti->error = "Unrecognised pool feature requested";
				3186	r = -EINVAL;
				3187	break;
				3188	}
				3189	}
				3190
				3191	return r;
				3192	}
				3193
				3194	static void metadata_low_callback(void *context)
				3195	{
				3196	struct pool *pool = context;
				3197
				3198	DMWARN("%s: reached low water mark for metadata device: sending event.",
				3199	dm_device_name(pool->pool_md));
				3200
				3201	dm_table_event(pool->ti->table);
				3202	}
				3203
				3204	static sector_t get_dev_size(struct block_device *bdev)
				3205	{
				3206	return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
				3207	}
				3208
				3209	static void warn_if_metadata_device_too_big(struct block_device *bdev)
				3210	{
				3211	sector_t metadata_dev_size = get_dev_size(bdev);
				3212	char buffer[BDEVNAME_SIZE];
				3213
				3214	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
				3215	DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
				3216	bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
				3217	}
				3218
				3219	static sector_t get_metadata_dev_size(struct block_device *bdev)
				3220	{
				3221	sector_t metadata_dev_size = get_dev_size(bdev);
				3222
				3223	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
				3224	metadata_dev_size = THIN_METADATA_MAX_SECTORS;
				3225
				3226	return metadata_dev_size;
				3227	}
				3228
				3229	static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
				3230	{
				3231	sector_t metadata_dev_size = get_metadata_dev_size(bdev);
				3232
				3233	sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
				3234
				3235	return metadata_dev_size;
				3236	}
				3237
				3238	/*
				3239	* When a metadata threshold is crossed a dm event is triggered, and
				3240	* userland should respond by growing the metadata device. We could let
				3241	* userland set the threshold, like we do with the data threshold, but I'm
				3242	* not sure they know enough to do this well.
				3243	*/
				3244	static dm_block_t calc_metadata_threshold(struct pool_c *pt)
				3245	{
				3246	/*
				3247	* 4M is ample for all ops with the possible exception of thin
				3248	* device deletion which is harmless if it fails (just retry the
				3249	* delete after you've grown the device).
				3250	*/
				3251	dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
				3252	return min((dm_block_t)1024ULL /* 4M */, quarter);
				3253	}
				3254
				3255	/*
				3256	* thin-pool <metadata dev> <data dev>
				3257	* <data block size (sectors)>
				3258	* <low water mark (blocks)>
				3259	* [<#feature args> [<arg>]*]
				3260	*
				3261	* Optional feature arguments are:
				3262	* skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
				3263	* ignore_discard: disable discard
				3264	* no_discard_passdown: don't pass discards down to the data device
				3265	* read_only: Don't allow any changes to be made to the pool metadata.
				3266	* error_if_no_space: error IOs, instead of queueing, if no space.
				3267	*/
				3268	static int pool_ctr(struct dm_target ti, unsigned argc, char *argv)
				3269	{
				3270	int r, pool_created = 0;
				3271	struct pool_c *pt;
				3272	struct pool *pool;
				3273	struct pool_features pf;
				3274	struct dm_arg_set as;
				3275	struct dm_dev *data_dev;
				3276	unsigned long block_size;
				3277	dm_block_t low_water_blocks;
				3278	struct dm_dev *metadata_dev;
				3279	fmode_t metadata_mode;
				3280
				3281	/*
				3282	* FIXME Remove validation from scope of lock.
				3283	*/
				3284	mutex_lock(&dm_thin_pool_table.mutex);
				3285
				3286	if (argc < 4) {
				3287	ti->error = "Invalid argument count";
				3288	r = -EINVAL;
				3289	goto out_unlock;
				3290	}
				3291
				3292	as.argc = argc;
				3293	as.argv = argv;
				3294
				3295	/* make sure metadata and data are different devices */
				3296	if (!strcmp(argv[0], argv[1])) {
				3297	ti->error = "Error setting metadata or data device";
				3298	r = -EINVAL;
				3299	goto out_unlock;
				3300	}
				3301
				3302	/*
				3303	* Set default pool features.
				3304	*/
				3305	pool_features_init(&pf);
				3306
				3307	dm_consume_args(&as, 4);
				3308	r = parse_pool_features(&as, &pf, ti);
				3309	if (r)
				3310	goto out_unlock;
				3311
				3312	metadata_mode = FMODE_READ \| ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
				3313	r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
				3314	if (r) {
				3315	ti->error = "Error opening metadata block device";
				3316	goto out_unlock;
				3317	}
				3318	warn_if_metadata_device_too_big(metadata_dev->bdev);
				3319
				3320	r = dm_get_device(ti, argv[1], FMODE_READ \| FMODE_WRITE, &data_dev);
				3321	if (r) {
				3322	ti->error = "Error getting data device";
				3323	goto out_metadata;
				3324	}
				3325
				3326	if (kstrtoul(argv[2], 10, &block_size) \|\| !block_size \|\|
				3327	block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
				3328	block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS \|\|
				3329	block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
				3330	ti->error = "Invalid block size";
				3331	r = -EINVAL;
				3332	goto out;
				3333	}
				3334
				3335	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
				3336	ti->error = "Invalid low water mark";
				3337	r = -EINVAL;
				3338	goto out;
				3339	}
				3340
				3341	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
				3342	if (!pt) {
				3343	r = -ENOMEM;
				3344	goto out;
				3345	}
				3346
				3347	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
				3348	block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
				3349	if (IS_ERR(pool)) {
				3350	r = PTR_ERR(pool);
				3351	goto out_free_pt;
				3352	}
				3353
				3354	/*
				3355	* 'pool_created' reflects whether this is the first table load.
				3356	* Top level discard support is not allowed to be changed after
				3357	* initial load. This would require a pool reload to trigger thin
				3358	* device changes.
				3359	*/
				3360	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
				3361	ti->error = "Discard support cannot be disabled once enabled";
				3362	r = -EINVAL;
				3363	goto out_flags_changed;
				3364	}
				3365
				3366	pt->pool = pool;
				3367	pt->ti = ti;
				3368	pt->metadata_dev = metadata_dev;
				3369	pt->data_dev = data_dev;
				3370	pt->low_water_blocks = low_water_blocks;
				3371	pt->adjusted_pf = pt->requested_pf = pf;
				3372	ti->num_flush_bios = 1;
				3373
				3374	/*
				3375	* Only need to enable discards if the pool should pass
				3376	* them down to the data device. The thin device's discard
				3377	* processing will cause mappings to be removed from the btree.
				3378	*/
				3379	if (pf.discard_enabled && pf.discard_passdown) {
				3380	ti->num_discard_bios = 1;
				3381
				3382	/*
				3383	* Setting 'discards_supported' circumvents the normal
				3384	* stacking of discard limits (this keeps the pool and
				3385	* thin devices' discard limits consistent).
				3386	*/
				3387	ti->discards_supported = true;
				3388	}
				3389	ti->private = pt;
				3390
				3391	r = dm_pool_register_metadata_threshold(pt->pool->pmd,
				3392	calc_metadata_threshold(pt),
				3393	metadata_low_callback,
				3394	pool);
				3395	if (r)
				3396	goto out_flags_changed;
				3397
				3398	pt->callbacks.congested_fn = pool_is_congested;
				3399	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
				3400
				3401	mutex_unlock(&dm_thin_pool_table.mutex);
				3402
				3403	return 0;
				3404
				3405	out_flags_changed:
				3406	__pool_dec(pool);
				3407	out_free_pt:
				3408	kfree(pt);
				3409	out:
				3410	dm_put_device(ti, data_dev);
				3411	out_metadata:
				3412	dm_put_device(ti, metadata_dev);
				3413	out_unlock:
				3414	mutex_unlock(&dm_thin_pool_table.mutex);
				3415
				3416	return r;
				3417	}
				3418
				3419	static int pool_map(struct dm_target ti, struct bio bio)
				3420	{
				3421	int r;
				3422	struct pool_c *pt = ti->private;
				3423	struct pool *pool = pt->pool;
				3424	unsigned long flags;
				3425
				3426	/*
				3427	* As this is a singleton target, ti->begin is always zero.
				3428	*/
				3429	spin_lock_irqsave(&pool->lock, flags);
				3430	bio_set_dev(bio, pt->data_dev->bdev);
				3431	r = DM_MAPIO_REMAPPED;
				3432	spin_unlock_irqrestore(&pool->lock, flags);
				3433
				3434	return r;
				3435	}
				3436
				3437	static int maybe_resize_data_dev(struct dm_target ti, bool need_commit)
				3438	{
				3439	int r;
				3440	struct pool_c *pt = ti->private;
				3441	struct pool *pool = pt->pool;
				3442	sector_t data_size = ti->len;
				3443	dm_block_t sb_data_size;
				3444
				3445	*need_commit = false;
				3446
				3447	(void) sector_div(data_size, pool->sectors_per_block);
				3448
				3449	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
				3450	if (r) {
				3451	DMERR("%s: failed to retrieve data device size",
				3452	dm_device_name(pool->pool_md));
				3453	return r;
				3454	}
				3455
				3456	if (data_size < sb_data_size) {
				3457	DMERR("%s: pool target (%llu blocks) too small: expected %llu",
				3458	dm_device_name(pool->pool_md),
				3459	(unsigned long long)data_size, sb_data_size);
				3460	return -EINVAL;
				3461
				3462	} else if (data_size > sb_data_size) {
				3463	if (dm_pool_metadata_needs_check(pool->pmd)) {
				3464	DMERR("%s: unable to grow the data device until repaired.",
				3465	dm_device_name(pool->pool_md));
				3466	return 0;
				3467	}
				3468
				3469	if (sb_data_size)
				3470	DMINFO("%s: growing the data device from %llu to %llu blocks",
				3471	dm_device_name(pool->pool_md),
				3472	sb_data_size, (unsigned long long)data_size);
				3473	r = dm_pool_resize_data_dev(pool->pmd, data_size);
				3474	if (r) {
				3475	metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
				3476	return r;
				3477	}
				3478
				3479	*need_commit = true;
				3480	}
				3481
				3482	return 0;
				3483	}
				3484
				3485	static int maybe_resize_metadata_dev(struct dm_target ti, bool need_commit)
				3486	{
				3487	int r;
				3488	struct pool_c *pt = ti->private;
				3489	struct pool *pool = pt->pool;
				3490	dm_block_t metadata_dev_size, sb_metadata_dev_size;
				3491
				3492	*need_commit = false;
				3493
				3494	metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
				3495
				3496	r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
				3497	if (r) {
				3498	DMERR("%s: failed to retrieve metadata device size",
				3499	dm_device_name(pool->pool_md));
				3500	return r;
				3501	}
				3502
				3503	if (metadata_dev_size < sb_metadata_dev_size) {
				3504	DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
				3505	dm_device_name(pool->pool_md),
				3506	metadata_dev_size, sb_metadata_dev_size);
				3507	return -EINVAL;
				3508
				3509	} else if (metadata_dev_size > sb_metadata_dev_size) {
				3510	if (dm_pool_metadata_needs_check(pool->pmd)) {
				3511	DMERR("%s: unable to grow the metadata device until repaired.",
				3512	dm_device_name(pool->pool_md));
				3513	return 0;
				3514	}
				3515
				3516	warn_if_metadata_device_too_big(pool->md_dev);
				3517	DMINFO("%s: growing the metadata device from %llu to %llu blocks",
				3518	dm_device_name(pool->pool_md),
				3519	sb_metadata_dev_size, metadata_dev_size);
				3520
				3521	if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
				3522	set_pool_mode(pool, PM_WRITE);
				3523
				3524	r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
				3525	if (r) {
				3526	metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
				3527	return r;
				3528	}
				3529
				3530	*need_commit = true;
				3531	}
				3532
				3533	return 0;
				3534	}
				3535
				3536	/*
				3537	* Retrieves the number of blocks of the data device from
				3538	* the superblock and compares it to the actual device size,
				3539	* thus resizing the data device in case it has grown.
				3540	*
				3541	* This both copes with opening preallocated data devices in the ctr
				3542	* being followed by a resume
				3543	* -and-
				3544	* calling the resume method individually after userspace has
				3545	* grown the data device in reaction to a table event.
				3546	*/
				3547	static int pool_preresume(struct dm_target *ti)
				3548	{
				3549	int r;
				3550	bool need_commit1, need_commit2;
				3551	struct pool_c *pt = ti->private;
				3552	struct pool *pool = pt->pool;
				3553
				3554	/*
				3555	* Take control of the pool object.
				3556	*/
				3557	r = bind_control_target(pool, ti);
				3558	if (r)
				3559	return r;
				3560
				3561	r = maybe_resize_data_dev(ti, &need_commit1);
				3562	if (r)
				3563	return r;
				3564
				3565	r = maybe_resize_metadata_dev(ti, &need_commit2);
				3566	if (r)
				3567	return r;
				3568
				3569	if (need_commit1 \|\| need_commit2)
				3570	(void) commit(pool);
				3571
				3572	return 0;
				3573	}
				3574
				3575	static void pool_suspend_active_thins(struct pool *pool)
				3576	{
				3577	struct thin_c *tc;
				3578
				3579	/* Suspend all active thin devices */
				3580	tc = get_first_thin(pool);
				3581	while (tc) {
				3582	dm_internal_suspend_noflush(tc->thin_md);
				3583	tc = get_next_thin(pool, tc);
				3584	}
				3585	}
				3586
				3587	static void pool_resume_active_thins(struct pool *pool)
				3588	{
				3589	struct thin_c *tc;
				3590
				3591	/* Resume all active thin devices */
				3592	tc = get_first_thin(pool);
				3593	while (tc) {
				3594	dm_internal_resume(tc->thin_md);
				3595	tc = get_next_thin(pool, tc);
				3596	}
				3597	}
				3598
				3599	static void pool_resume(struct dm_target *ti)
				3600	{
				3601	struct pool_c *pt = ti->private;
				3602	struct pool *pool = pt->pool;
				3603	unsigned long flags;
				3604
				3605	/*
				3606	* Must requeue active_thins' bios and then resume
				3607	* active_thins _before_ clearing 'suspend' flag.
				3608	*/
				3609	requeue_bios(pool);
				3610	pool_resume_active_thins(pool);
				3611
				3612	spin_lock_irqsave(&pool->lock, flags);
				3613	pool->low_water_triggered = false;
				3614	pool->suspended = false;
				3615	spin_unlock_irqrestore(&pool->lock, flags);
				3616
				3617	do_waker(&pool->waker.work);
				3618	}
				3619
				3620	static void pool_presuspend(struct dm_target *ti)
				3621	{
				3622	struct pool_c *pt = ti->private;
				3623	struct pool *pool = pt->pool;
				3624	unsigned long flags;
				3625
				3626	spin_lock_irqsave(&pool->lock, flags);
				3627	pool->suspended = true;
				3628	spin_unlock_irqrestore(&pool->lock, flags);
				3629
				3630	pool_suspend_active_thins(pool);
				3631	}
				3632
				3633	static void pool_presuspend_undo(struct dm_target *ti)
				3634	{
				3635	struct pool_c *pt = ti->private;
				3636	struct pool *pool = pt->pool;
				3637	unsigned long flags;
				3638
				3639	pool_resume_active_thins(pool);
				3640
				3641	spin_lock_irqsave(&pool->lock, flags);
				3642	pool->suspended = false;
				3643	spin_unlock_irqrestore(&pool->lock, flags);
				3644	}
				3645
				3646	static void pool_postsuspend(struct dm_target *ti)
				3647	{
				3648	struct pool_c *pt = ti->private;
				3649	struct pool *pool = pt->pool;
				3650
				3651	cancel_delayed_work_sync(&pool->waker);
				3652	cancel_delayed_work_sync(&pool->no_space_timeout);
				3653	flush_workqueue(pool->wq);
				3654	(void) commit(pool);
				3655	}
				3656
				3657	static int check_arg_count(unsigned argc, unsigned args_required)
				3658	{
				3659	if (argc != args_required) {
				3660	DMWARN("Message received with %u arguments instead of %u.",
				3661	argc, args_required);
				3662	return -EINVAL;
				3663	}
				3664
				3665	return 0;
				3666	}
				3667
				3668	static int read_dev_id(char arg, dm_thin_id dev_id, int warning)
				3669	{
				3670	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
				3671	*dev_id <= MAX_DEV_ID)
				3672	return 0;
				3673
				3674	if (warning)
				3675	DMWARN("Message received with invalid device id: %s", arg);
				3676
				3677	return -EINVAL;
				3678	}
				3679
				3680	static int process_create_thin_mesg(unsigned argc, char *argv, struct pool pool)
				3681	{
				3682	dm_thin_id dev_id;
				3683	int r;
				3684
				3685	r = check_arg_count(argc, 2);
				3686	if (r)
				3687	return r;
				3688
				3689	r = read_dev_id(argv[1], &dev_id, 1);
				3690	if (r)
				3691	return r;
				3692
				3693	r = dm_pool_create_thin(pool->pmd, dev_id);
				3694	if (r) {
				3695	DMWARN("Creation of new thinly-provisioned device with id %s failed.",
				3696	argv[1]);
				3697	return r;
				3698	}
				3699
				3700	return 0;
				3701	}
				3702
				3703	static int process_create_snap_mesg(unsigned argc, char *argv, struct pool pool)
				3704	{
				3705	dm_thin_id dev_id;
				3706	dm_thin_id origin_dev_id;
				3707	int r;
				3708
				3709	r = check_arg_count(argc, 3);
				3710	if (r)
				3711	return r;
				3712
				3713	r = read_dev_id(argv[1], &dev_id, 1);
				3714	if (r)
				3715	return r;
				3716
				3717	r = read_dev_id(argv[2], &origin_dev_id, 1);
				3718	if (r)
				3719	return r;
				3720
				3721	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
				3722	if (r) {
				3723	DMWARN("Creation of new snapshot %s of device %s failed.",
				3724	argv[1], argv[2]);
				3725	return r;
				3726	}
				3727
				3728	return 0;
				3729	}
				3730
				3731	static int process_delete_mesg(unsigned argc, char *argv, struct pool pool)
				3732	{
				3733	dm_thin_id dev_id;
				3734	int r;
				3735
				3736	r = check_arg_count(argc, 2);
				3737	if (r)
				3738	return r;
				3739
				3740	r = read_dev_id(argv[1], &dev_id, 1);
				3741	if (r)
				3742	return r;
				3743
				3744	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
				3745	if (r)
				3746	DMWARN("Deletion of thin device %s failed.", argv[1]);
				3747
				3748	return r;
				3749	}
				3750
				3751	static int process_set_transaction_id_mesg(unsigned argc, char *argv, struct pool pool)
				3752	{
				3753	dm_thin_id old_id, new_id;
				3754	int r;
				3755
				3756	r = check_arg_count(argc, 3);
				3757	if (r)
				3758	return r;
				3759
				3760	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
				3761	DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
				3762	return -EINVAL;
				3763	}
				3764
				3765	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
				3766	DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
				3767	return -EINVAL;
				3768	}
				3769
				3770	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
				3771	if (r) {
				3772	DMWARN("Failed to change transaction id from %s to %s.",
				3773	argv[1], argv[2]);
				3774	return r;
				3775	}
				3776
				3777	return 0;
				3778	}
				3779
				3780	static int process_reserve_metadata_snap_mesg(unsigned argc, char *argv, struct pool pool)
				3781	{
				3782	int r;
				3783
				3784	r = check_arg_count(argc, 1);
				3785	if (r)
				3786	return r;
				3787
				3788	(void) commit(pool);
				3789
				3790	r = dm_pool_reserve_metadata_snap(pool->pmd);
				3791	if (r)
				3792	DMWARN("reserve_metadata_snap message failed.");
				3793
				3794	return r;
				3795	}
				3796
				3797	static int process_release_metadata_snap_mesg(unsigned argc, char *argv, struct pool pool)
				3798	{
				3799	int r;
				3800
				3801	r = check_arg_count(argc, 1);
				3802	if (r)
				3803	return r;
				3804
				3805	r = dm_pool_release_metadata_snap(pool->pmd);
				3806	if (r)
				3807	DMWARN("release_metadata_snap message failed.");
				3808
				3809	return r;
				3810	}
				3811
				3812	/*
				3813	* Messages supported:
				3814	* create_thin <dev_id>
				3815	* create_snap <dev_id> <origin_id>
				3816	* delete <dev_id>
				3817	* set_transaction_id <current_trans_id> <new_trans_id>
				3818	* reserve_metadata_snap
				3819	* release_metadata_snap
				3820	*/
				3821	static int pool_message(struct dm_target ti, unsigned argc, char *argv)
				3822	{
				3823	int r = -EINVAL;
				3824	struct pool_c *pt = ti->private;
				3825	struct pool *pool = pt->pool;
				3826
				3827	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
				3828	DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
				3829	dm_device_name(pool->pool_md));
				3830	return -EOPNOTSUPP;
				3831	}
				3832
				3833	if (!strcasecmp(argv[0], "create_thin"))
				3834	r = process_create_thin_mesg(argc, argv, pool);
				3835
				3836	else if (!strcasecmp(argv[0], "create_snap"))
				3837	r = process_create_snap_mesg(argc, argv, pool);
				3838
				3839	else if (!strcasecmp(argv[0], "delete"))
				3840	r = process_delete_mesg(argc, argv, pool);
				3841
				3842	else if (!strcasecmp(argv[0], "set_transaction_id"))
				3843	r = process_set_transaction_id_mesg(argc, argv, pool);
				3844
				3845	else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
				3846	r = process_reserve_metadata_snap_mesg(argc, argv, pool);
				3847
				3848	else if (!strcasecmp(argv[0], "release_metadata_snap"))
				3849	r = process_release_metadata_snap_mesg(argc, argv, pool);
				3850
				3851	else
				3852	DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
				3853
				3854	if (!r)
				3855	(void) commit(pool);
				3856
				3857	return r;
				3858	}
				3859
				3860	static void emit_flags(struct pool_features pf, char result,
				3861	unsigned sz, unsigned maxlen)
				3862	{
				3863	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
				3864	!pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
				3865	pf->error_if_no_space;
				3866	DMEMIT("%u ", count);
				3867
				3868	if (!pf->zero_new_blocks)
				3869	DMEMIT("skip_block_zeroing ");
				3870
				3871	if (!pf->discard_enabled)
				3872	DMEMIT("ignore_discard ");
				3873
				3874	if (!pf->discard_passdown)
				3875	DMEMIT("no_discard_passdown ");
				3876
				3877	if (pf->mode == PM_READ_ONLY)
				3878	DMEMIT("read_only ");
				3879
				3880	if (pf->error_if_no_space)
				3881	DMEMIT("error_if_no_space ");
				3882	}
				3883
				3884	/*
				3885	* Status line is:
				3886	* <transaction id> <used metadata sectors>/<total metadata sectors>
				3887	* <used data sectors>/<total data sectors> <held metadata root>
				3888	* <pool mode> <discard config> <no space config> <needs_check>
				3889	*/
				3890	static void pool_status(struct dm_target *ti, status_type_t type,
				3891	unsigned status_flags, char *result, unsigned maxlen)
				3892	{
				3893	int r;
				3894	unsigned sz = 0;
				3895	uint64_t transaction_id;
				3896	dm_block_t nr_free_blocks_data;
				3897	dm_block_t nr_free_blocks_metadata;
				3898	dm_block_t nr_blocks_data;
				3899	dm_block_t nr_blocks_metadata;
				3900	dm_block_t held_root;
				3901	enum pool_mode mode;
				3902	char buf[BDEVNAME_SIZE];
				3903	char buf2[BDEVNAME_SIZE];
				3904	struct pool_c *pt = ti->private;
				3905	struct pool *pool = pt->pool;
				3906
				3907	switch (type) {
				3908	case STATUSTYPE_INFO:
				3909	if (get_pool_mode(pool) == PM_FAIL) {
				3910	DMEMIT("Fail");
				3911	break;
				3912	}
				3913
				3914	/* Commit to ensure statistics aren't out-of-date */
				3915	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
				3916	(void) commit(pool);
				3917
				3918	r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
				3919	if (r) {
				3920	DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
				3921	dm_device_name(pool->pool_md), r);
				3922	goto err;
				3923	}
				3924
				3925	r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
				3926	if (r) {
				3927	DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
				3928	dm_device_name(pool->pool_md), r);
				3929	goto err;
				3930	}
				3931
				3932	r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
				3933	if (r) {
				3934	DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
				3935	dm_device_name(pool->pool_md), r);
				3936	goto err;
				3937	}
				3938
				3939	r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
				3940	if (r) {
				3941	DMERR("%s: dm_pool_get_free_block_count returned %d",
				3942	dm_device_name(pool->pool_md), r);
				3943	goto err;
				3944	}
				3945
				3946	r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
				3947	if (r) {
				3948	DMERR("%s: dm_pool_get_data_dev_size returned %d",
				3949	dm_device_name(pool->pool_md), r);
				3950	goto err;
				3951	}
				3952
				3953	r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
				3954	if (r) {
				3955	DMERR("%s: dm_pool_get_metadata_snap returned %d",
				3956	dm_device_name(pool->pool_md), r);
				3957	goto err;
				3958	}
				3959
				3960	DMEMIT("%llu %llu/%llu %llu/%llu ",
				3961	(unsigned long long)transaction_id,
				3962	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
				3963	(unsigned long long)nr_blocks_metadata,
				3964	(unsigned long long)(nr_blocks_data - nr_free_blocks_data),
				3965	(unsigned long long)nr_blocks_data);
				3966
				3967	if (held_root)
				3968	DMEMIT("%llu ", held_root);
				3969	else
				3970	DMEMIT("- ");
				3971
				3972	mode = get_pool_mode(pool);
				3973	if (mode == PM_OUT_OF_DATA_SPACE)
				3974	DMEMIT("out_of_data_space ");
				3975	else if (is_read_only_pool_mode(mode))
				3976	DMEMIT("ro ");
				3977	else
				3978	DMEMIT("rw ");
				3979
				3980	if (!pool->pf.discard_enabled)
				3981	DMEMIT("ignore_discard ");
				3982	else if (pool->pf.discard_passdown)
				3983	DMEMIT("discard_passdown ");
				3984	else
				3985	DMEMIT("no_discard_passdown ");
				3986
				3987	if (pool->pf.error_if_no_space)
				3988	DMEMIT("error_if_no_space ");
				3989	else
				3990	DMEMIT("queue_if_no_space ");
				3991
				3992	if (dm_pool_metadata_needs_check(pool->pmd))
				3993	DMEMIT("needs_check ");
				3994	else
				3995	DMEMIT("- ");
				3996
				3997	break;
				3998
				3999	case STATUSTYPE_TABLE:
				4000	DMEMIT("%s %s %lu %llu ",
				4001	format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
				4002	format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
				4003	(unsigned long)pool->sectors_per_block,
				4004	(unsigned long long)pt->low_water_blocks);
				4005	emit_flags(&pt->requested_pf, result, sz, maxlen);
				4006	break;
				4007	}
				4008	return;
				4009
				4010	err:
				4011	DMEMIT("Error");
				4012	}
				4013
				4014	static int pool_iterate_devices(struct dm_target *ti,
				4015	iterate_devices_callout_fn fn, void *data)
				4016	{
				4017	struct pool_c *pt = ti->private;
				4018
				4019	return fn(ti, pt->data_dev, 0, ti->len, data);
				4020	}
				4021
				4022	static void pool_io_hints(struct dm_target ti, struct queue_limits limits)
				4023	{
				4024	struct pool_c *pt = ti->private;
				4025	struct pool *pool = pt->pool;
				4026	sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
				4027
				4028	/*
				4029	* If max_sectors is smaller than pool->sectors_per_block adjust it
				4030	* to the highest possible power-of-2 factor of pool->sectors_per_block.
				4031	* This is especially beneficial when the pool's data device is a RAID
				4032	* device that has a full stripe width that matches pool->sectors_per_block
				4033	* -- because even though partial RAID stripe-sized IOs will be issued to a
				4034	* single RAID stripe; when aggregated they will end on a full RAID stripe
				4035	* boundary.. which avoids additional partial RAID stripe writes cascading
				4036	*/
				4037	if (limits->max_sectors < pool->sectors_per_block) {
				4038	while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
				4039	if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
				4040	limits->max_sectors--;
				4041	limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
				4042	}
				4043	}
				4044
				4045	/*
				4046	* If the system-determined stacked limits are compatible with the
				4047	* pool's blocksize (io_opt is a factor) do not override them.
				4048	*/
				4049	if (io_opt_sectors < pool->sectors_per_block \|\|
				4050	!is_factor(io_opt_sectors, pool->sectors_per_block)) {
				4051	if (is_factor(pool->sectors_per_block, limits->max_sectors))
				4052	blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
				4053	else
				4054	blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
				4055	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
				4056	}
				4057
				4058	/*
				4059	* pt->adjusted_pf is a staging area for the actual features to use.
				4060	* They get transferred to the live pool in bind_control_target()
				4061	* called from pool_preresume().
				4062	*/
				4063	if (!pt->adjusted_pf.discard_enabled) {
				4064	/*
				4065	* Must explicitly disallow stacking discard limits otherwise the
				4066	* block layer will stack them if pool's data device has support.
				4067	* QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
				4068	* user to see that, so make sure to set all discard limits to 0.
				4069	*/
				4070	limits->discard_granularity = 0;
				4071	return;
				4072	}
				4073
				4074	disable_passdown_if_not_supported(pt);
				4075
				4076	/*
				4077	* The pool uses the same discard limits as the underlying data
				4078	* device. DM core has already set this up.
				4079	*/
				4080	}
				4081
				4082	static struct target_type pool_target = {
				4083	.name = "thin-pool",
				4084	.features = DM_TARGET_SINGLETON \| DM_TARGET_ALWAYS_WRITEABLE \|
				4085	DM_TARGET_IMMUTABLE,
				4086	.version = {1, 19, 0},
				4087	.module = THIS_MODULE,
				4088	.ctr = pool_ctr,
				4089	.dtr = pool_dtr,
				4090	.map = pool_map,
				4091	.presuspend = pool_presuspend,
				4092	.presuspend_undo = pool_presuspend_undo,
				4093	.postsuspend = pool_postsuspend,
				4094	.preresume = pool_preresume,
				4095	.resume = pool_resume,
				4096	.message = pool_message,
				4097	.status = pool_status,
				4098	.iterate_devices = pool_iterate_devices,
				4099	.io_hints = pool_io_hints,
				4100	};
				4101
				4102	/*----------------------------------------------------------------
				4103	* Thin target methods
				4104	*--------------------------------------------------------------*/
				4105	static void thin_get(struct thin_c *tc)
				4106	{
				4107	atomic_inc(&tc->refcount);
				4108	}
				4109
				4110	static void thin_put(struct thin_c *tc)
				4111	{
				4112	if (atomic_dec_and_test(&tc->refcount))
				4113	complete(&tc->can_destroy);
				4114	}
				4115
				4116	static void thin_dtr(struct dm_target *ti)
				4117	{
				4118	struct thin_c *tc = ti->private;
				4119	unsigned long flags;
				4120
				4121	spin_lock_irqsave(&tc->pool->lock, flags);
				4122	list_del_rcu(&tc->list);
				4123	spin_unlock_irqrestore(&tc->pool->lock, flags);
				4124	synchronize_rcu();
				4125
				4126	thin_put(tc);
				4127	wait_for_completion(&tc->can_destroy);
				4128
				4129	mutex_lock(&dm_thin_pool_table.mutex);
				4130
				4131	__pool_dec(tc->pool);
				4132	dm_pool_close_thin_device(tc->td);
				4133	dm_put_device(ti, tc->pool_dev);
				4134	if (tc->origin_dev)
				4135	dm_put_device(ti, tc->origin_dev);
				4136	kfree(tc);
				4137
				4138	mutex_unlock(&dm_thin_pool_table.mutex);
				4139	}
				4140
				4141	/*
				4142	* Thin target parameters:
				4143	*
				4144	* <pool_dev> <dev_id> [origin_dev]
				4145	*
				4146	* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
				4147	* dev_id: the internal device identifier
				4148	* origin_dev: a device external to the pool that should act as the origin
				4149	*
				4150	* If the pool device has discards disabled, they get disabled for the thin
				4151	* device as well.
				4152	*/
				4153	static int thin_ctr(struct dm_target ti, unsigned argc, char *argv)
				4154	{
				4155	int r;
				4156	struct thin_c *tc;
				4157	struct dm_dev pool_dev, origin_dev;
				4158	struct mapped_device *pool_md;
				4159	unsigned long flags;
				4160
				4161	mutex_lock(&dm_thin_pool_table.mutex);
				4162
				4163	if (argc != 2 && argc != 3) {
				4164	ti->error = "Invalid argument count";
				4165	r = -EINVAL;
				4166	goto out_unlock;
				4167	}
				4168
				4169	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
				4170	if (!tc) {
				4171	ti->error = "Out of memory";
				4172	r = -ENOMEM;
				4173	goto out_unlock;
				4174	}
				4175	tc->thin_md = dm_table_get_md(ti->table);
				4176	spin_lock_init(&tc->lock);
				4177	INIT_LIST_HEAD(&tc->deferred_cells);
				4178	bio_list_init(&tc->deferred_bio_list);
				4179	bio_list_init(&tc->retry_on_resume_list);
				4180	tc->sort_bio_list = RB_ROOT;
				4181
				4182	if (argc == 3) {
				4183	if (!strcmp(argv[0], argv[2])) {
				4184	ti->error = "Error setting origin device";
				4185	r = -EINVAL;
				4186	goto bad_origin_dev;
				4187	}
				4188
				4189	r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
				4190	if (r) {
				4191	ti->error = "Error opening origin device";
				4192	goto bad_origin_dev;
				4193	}
				4194	tc->origin_dev = origin_dev;
				4195	}
				4196
				4197	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
				4198	if (r) {
				4199	ti->error = "Error opening pool device";
				4200	goto bad_pool_dev;
				4201	}
				4202	tc->pool_dev = pool_dev;
				4203
				4204	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
				4205	ti->error = "Invalid device id";
				4206	r = -EINVAL;
				4207	goto bad_common;
				4208	}
				4209
				4210	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
				4211	if (!pool_md) {
				4212	ti->error = "Couldn't get pool mapped device";
				4213	r = -EINVAL;
				4214	goto bad_common;
				4215	}
				4216
				4217	tc->pool = __pool_table_lookup(pool_md);
				4218	if (!tc->pool) {
				4219	ti->error = "Couldn't find pool object";
				4220	r = -EINVAL;
				4221	goto bad_pool_lookup;
				4222	}
				4223	__pool_inc(tc->pool);
				4224
				4225	if (get_pool_mode(tc->pool) == PM_FAIL) {
				4226	ti->error = "Couldn't open thin device, Pool is in fail mode";
				4227	r = -EINVAL;
				4228	goto bad_pool;
				4229	}
				4230
				4231	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
				4232	if (r) {
				4233	ti->error = "Couldn't open thin internal device";
				4234	goto bad_pool;
				4235	}
				4236
				4237	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
				4238	if (r)
				4239	goto bad;
				4240
				4241	ti->num_flush_bios = 1;
				4242	ti->flush_supported = true;
				4243	ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
				4244
				4245	/* In case the pool supports discards, pass them on. */
				4246	if (tc->pool->pf.discard_enabled) {
				4247	ti->discards_supported = true;
				4248	ti->num_discard_bios = 1;
				4249	ti->split_discard_bios = false;
				4250	}
				4251
				4252	mutex_unlock(&dm_thin_pool_table.mutex);
				4253
				4254	spin_lock_irqsave(&tc->pool->lock, flags);
				4255	if (tc->pool->suspended) {
				4256	spin_unlock_irqrestore(&tc->pool->lock, flags);
				4257	mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
				4258	ti->error = "Unable to activate thin device while pool is suspended";
				4259	r = -EINVAL;
				4260	goto bad;
				4261	}
				4262	atomic_set(&tc->refcount, 1);
				4263	init_completion(&tc->can_destroy);
				4264	list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
				4265	spin_unlock_irqrestore(&tc->pool->lock, flags);
				4266	/*
				4267	* This synchronize_rcu() call is needed here otherwise we risk a
				4268	* wake_worker() call finding no bios to process (because the newly
				4269	* added tc isn't yet visible). So this reduces latency since we
				4270	* aren't then dependent on the periodic commit to wake_worker().
				4271	*/
				4272	synchronize_rcu();
				4273
				4274	dm_put(pool_md);
				4275
				4276	return 0;
				4277
				4278	bad:
				4279	dm_pool_close_thin_device(tc->td);
				4280	bad_pool:
				4281	__pool_dec(tc->pool);
				4282	bad_pool_lookup:
				4283	dm_put(pool_md);
				4284	bad_common:
				4285	dm_put_device(ti, tc->pool_dev);
				4286	bad_pool_dev:
				4287	if (tc->origin_dev)
				4288	dm_put_device(ti, tc->origin_dev);
				4289	bad_origin_dev:
				4290	kfree(tc);
				4291	out_unlock:
				4292	mutex_unlock(&dm_thin_pool_table.mutex);
				4293
				4294	return r;
				4295	}
				4296
				4297	static int thin_map(struct dm_target ti, struct bio bio)
				4298	{
				4299	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
				4300
				4301	return thin_bio_map(ti, bio);
				4302	}
				4303
				4304	static int thin_endio(struct dm_target ti, struct bio bio,
				4305	blk_status_t *err)
				4306	{
				4307	unsigned long flags;
				4308	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
				4309	struct list_head work;
				4310	struct dm_thin_new_mapping m, tmp;
				4311	struct pool *pool = h->tc->pool;
				4312
				4313	if (h->shared_read_entry) {
				4314	INIT_LIST_HEAD(&work);
				4315	dm_deferred_entry_dec(h->shared_read_entry, &work);
				4316
				4317	spin_lock_irqsave(&pool->lock, flags);
				4318	list_for_each_entry_safe(m, tmp, &work, list) {
				4319	list_del(&m->list);
				4320	__complete_mapping_preparation(m);
				4321	}
				4322	spin_unlock_irqrestore(&pool->lock, flags);
				4323	}
				4324
				4325	if (h->all_io_entry) {
				4326	INIT_LIST_HEAD(&work);
				4327	dm_deferred_entry_dec(h->all_io_entry, &work);
				4328	if (!list_empty(&work)) {
				4329	spin_lock_irqsave(&pool->lock, flags);
				4330	list_for_each_entry_safe(m, tmp, &work, list)
				4331	list_add_tail(&m->list, &pool->prepared_discards);
				4332	spin_unlock_irqrestore(&pool->lock, flags);
				4333	wake_worker(pool);
				4334	}
				4335	}
				4336
				4337	if (h->cell)
				4338	cell_defer_no_holder(h->tc, h->cell);
				4339
				4340	return DM_ENDIO_DONE;
				4341	}
				4342
				4343	static void thin_presuspend(struct dm_target *ti)
				4344	{
				4345	struct thin_c *tc = ti->private;
				4346
				4347	if (dm_noflush_suspending(ti))
				4348	noflush_work(tc, do_noflush_start);
				4349	}
				4350
				4351	static void thin_postsuspend(struct dm_target *ti)
				4352	{
				4353	struct thin_c *tc = ti->private;
				4354
				4355	/*
				4356	* The dm_noflush_suspending flag has been cleared by now, so
				4357	* unfortunately we must always run this.
				4358	*/
				4359	noflush_work(tc, do_noflush_stop);
				4360	}
				4361
				4362	static int thin_preresume(struct dm_target *ti)
				4363	{
				4364	struct thin_c *tc = ti->private;
				4365
				4366	if (tc->origin_dev)
				4367	tc->origin_size = get_dev_size(tc->origin_dev->bdev);
				4368
				4369	return 0;
				4370	}
				4371
				4372	/*
				4373	* <nr mapped sectors> <highest mapped sector>
				4374	*/
				4375	static void thin_status(struct dm_target *ti, status_type_t type,
				4376	unsigned status_flags, char *result, unsigned maxlen)
				4377	{
				4378	int r;
				4379	ssize_t sz = 0;
				4380	dm_block_t mapped, highest;
				4381	char buf[BDEVNAME_SIZE];
				4382	struct thin_c *tc = ti->private;
				4383
				4384	if (get_pool_mode(tc->pool) == PM_FAIL) {
				4385	DMEMIT("Fail");
				4386	return;
				4387	}
				4388
				4389	if (!tc->td)
				4390	DMEMIT("-");
				4391	else {
				4392	switch (type) {
				4393	case STATUSTYPE_INFO:
				4394	r = dm_thin_get_mapped_count(tc->td, &mapped);
				4395	if (r) {
				4396	DMERR("dm_thin_get_mapped_count returned %d", r);
				4397	goto err;
				4398	}
				4399
				4400	r = dm_thin_get_highest_mapped_block(tc->td, &highest);
				4401	if (r < 0) {
				4402	DMERR("dm_thin_get_highest_mapped_block returned %d", r);
				4403	goto err;
				4404	}
				4405
				4406	DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
				4407	if (r)
				4408	DMEMIT("%llu", ((highest + 1) *
				4409	tc->pool->sectors_per_block) - 1);
				4410	else
				4411	DMEMIT("-");
				4412	break;
				4413
				4414	case STATUSTYPE_TABLE:
				4415	DMEMIT("%s %lu",
				4416	format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
				4417	(unsigned long) tc->dev_id);
				4418	if (tc->origin_dev)
				4419	DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
				4420	break;
				4421	}
				4422	}
				4423
				4424	return;
				4425
				4426	err:
				4427	DMEMIT("Error");
				4428	}
				4429
				4430	static int thin_iterate_devices(struct dm_target *ti,
				4431	iterate_devices_callout_fn fn, void *data)
				4432	{
				4433	sector_t blocks;
				4434	struct thin_c *tc = ti->private;
				4435	struct pool *pool = tc->pool;
				4436
				4437	/*
				4438	* We can't call dm_pool_get_data_dev_size() since that blocks. So
				4439	* we follow a more convoluted path through to the pool's target.
				4440	*/
				4441	if (!pool->ti)
				4442	return 0; /* nothing is bound */
				4443
				4444	blocks = pool->ti->len;
				4445	(void) sector_div(blocks, pool->sectors_per_block);
				4446	if (blocks)
				4447	return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
				4448
				4449	return 0;
				4450	}
				4451
				4452	static void thin_io_hints(struct dm_target ti, struct queue_limits limits)
				4453	{
				4454	struct thin_c *tc = ti->private;
				4455	struct pool *pool = tc->pool;
				4456
				4457	if (!pool->pf.discard_enabled)
				4458	return;
				4459
				4460	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
				4461	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
				4462	}
				4463
				4464	static struct target_type thin_target = {
				4465	.name = "thin",
				4466	.version = {1, 19, 0},
				4467	.module = THIS_MODULE,
				4468	.ctr = thin_ctr,
				4469	.dtr = thin_dtr,
				4470	.map = thin_map,
				4471	.end_io = thin_endio,
				4472	.preresume = thin_preresume,
				4473	.presuspend = thin_presuspend,
				4474	.postsuspend = thin_postsuspend,
				4475	.status = thin_status,
				4476	.iterate_devices = thin_iterate_devices,
				4477	.io_hints = thin_io_hints,
				4478	};
				4479
				4480	/*----------------------------------------------------------------*/
				4481
				4482	static int __init dm_thin_init(void)
				4483	{
				4484	int r = -ENOMEM;
				4485
				4486	pool_table_init();
				4487
				4488	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
				4489	if (!_new_mapping_cache)
				4490	return r;
				4491
				4492	r = dm_register_target(&thin_target);
				4493	if (r)
				4494	goto bad_new_mapping_cache;
				4495
				4496	r = dm_register_target(&pool_target);
				4497	if (r)
				4498	goto bad_thin_target;
				4499
				4500	return 0;
				4501
				4502	bad_thin_target:
				4503	dm_unregister_target(&thin_target);
				4504	bad_new_mapping_cache:
				4505	kmem_cache_destroy(_new_mapping_cache);
				4506
				4507	return r;
				4508	}
				4509
				4510	static void dm_thin_exit(void)
				4511	{
				4512	dm_unregister_target(&thin_target);
				4513	dm_unregister_target(&pool_target);
				4514
				4515	kmem_cache_destroy(_new_mapping_cache);
				4516	}
				4517
				4518	module_init(dm_thin_init);
				4519	module_exit(dm_thin_exit);
				4520
				4521	module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO \| S_IWUSR);
				4522	MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
				4523
				4524	MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
				4525	MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
				4526	MODULE_LICENSE("GPL");