Blame - src/kernel/linux/v4.19/drivers/md/dm-snap.c - T800

blob: d3f28a9e3fd950f432f55c81387adda025dce574 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* dm-snapshot.c
				3	*
				4	* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
				5	*
				6	* This file is released under the GPL.
				7	*/
				8
				9	#include <linux/blkdev.h>
				10	#include <linux/device-mapper.h>
				11	#include <linux/delay.h>
				12	#include <linux/fs.h>
				13	#include <linux/init.h>
				14	#include <linux/kdev_t.h>
				15	#include <linux/list.h>
				16	#include <linux/mempool.h>
				17	#include <linux/module.h>
				18	#include <linux/slab.h>
				19	#include <linux/vmalloc.h>
				20	#include <linux/log2.h>
				21	#include <linux/dm-kcopyd.h>
				22
				23	#include "dm.h"
				24
				25	#include "dm-exception-store.h"
				26
				27	#define DM_MSG_PREFIX "snapshots"
				28
				29	static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
				30
				31	#define dm_target_is_snapshot_merge(ti) \
				32	((ti)->type->name == dm_snapshot_merge_target_name)
				33
				34	/*
				35	* The size of the mempool used to track chunks in use.
				36	*/
				37	#define MIN_IOS 256
				38
				39	#define DM_TRACKED_CHUNK_HASH_SIZE 16
				40	#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
				41	(DM_TRACKED_CHUNK_HASH_SIZE - 1))
				42
				43	struct dm_exception_table {
				44	uint32_t hash_mask;
				45	unsigned hash_shift;
				46	struct list_head *table;
				47	};
				48
				49	struct dm_snapshot {
				50	struct mutex lock;
				51
				52	struct dm_dev *origin;
				53	struct dm_dev *cow;
				54
				55	struct dm_target *ti;
				56
				57	/* List of snapshots per Origin */
				58	struct list_head list;
				59
				60	/*
				61	* You can't use a snapshot if this is 0 (e.g. if full).
				62	* A snapshot-merge target never clears this.
				63	*/
				64	int valid;
				65
				66	/*
				67	* The snapshot overflowed because of a write to the snapshot device.
				68	* We don't have to invalidate the snapshot in this case, but we need
				69	* to prevent further writes.
				70	*/
				71	int snapshot_overflowed;
				72
				73	/* Origin writes don't trigger exceptions until this is set */
				74	int active;
				75
				76	atomic_t pending_exceptions_count;
				77
				78	/* Protected by "lock" */
				79	sector_t exception_start_sequence;
				80
				81	/* Protected by kcopyd single-threaded callback */
				82	sector_t exception_complete_sequence;
				83
				84	/*
				85	* A list of pending exceptions that completed out of order.
				86	* Protected by kcopyd single-threaded callback.
				87	*/
				88	struct rb_root out_of_order_tree;
				89
				90	mempool_t pending_pool;
				91
				92	struct dm_exception_table pending;
				93	struct dm_exception_table complete;
				94
				95	/*
				96	* pe_lock protects all pending_exception operations and access
				97	* as well as the snapshot_bios list.
				98	*/
				99	spinlock_t pe_lock;
				100
				101	/* Chunks with outstanding reads */
				102	spinlock_t tracked_chunk_lock;
				103	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
				104
				105	/* The on disk metadata handler */
				106	struct dm_exception_store *store;
				107
				108	unsigned in_progress;
				109	struct wait_queue_head in_progress_wait;
				110
				111	struct dm_kcopyd_client *kcopyd_client;
				112
				113	/* Wait for events based on state_bits */
				114	unsigned long state_bits;
				115
				116	/* Range of chunks currently being merged. */
				117	chunk_t first_merging_chunk;
				118	int num_merging_chunks;
				119
				120	/*
				121	* The merge operation failed if this flag is set.
				122	* Failure modes are handled as follows:
				123	* - I/O error reading the header
				124	* => don't load the target; abort.
				125	* - Header does not have "valid" flag set
				126	* => use the origin; forget about the snapshot.
				127	* - I/O error when reading exceptions
				128	* => don't load the target; abort.
				129	* (We can't use the intermediate origin state.)
				130	* - I/O error while merging
				131	* => stop merging; set merge_failed; process I/O normally.
				132	*/
				133	int merge_failed;
				134
				135	/*
				136	* Incoming bios that overlap with chunks being merged must wait
				137	* for them to be committed.
				138	*/
				139	struct bio_list bios_queued_during_merge;
				140	};
				141
				142	/*
				143	* state_bits:
				144	* RUNNING_MERGE - Merge operation is in progress.
				145	* SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
				146	* cleared afterwards.
				147	*/
				148	#define RUNNING_MERGE 0
				149	#define SHUTDOWN_MERGE 1
				150
				151	/*
				152	* Maximum number of chunks being copied on write.
				153	*
				154	* The value was decided experimentally as a trade-off between memory
				155	* consumption, stalling the kernel's workqueues and maintaining a high enough
				156	* throughput.
				157	*/
				158	#define DEFAULT_COW_THRESHOLD 2048
				159
				160	static unsigned cow_threshold = DEFAULT_COW_THRESHOLD;
				161	module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
				162	MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
				163
				164	DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
				165	"A percentage of time allocated for copy on write");
				166
				167	struct dm_dev dm_snap_origin(struct dm_snapshot s)
				168	{
				169	return s->origin;
				170	}
				171	EXPORT_SYMBOL(dm_snap_origin);
				172
				173	struct dm_dev dm_snap_cow(struct dm_snapshot s)
				174	{
				175	return s->cow;
				176	}
				177	EXPORT_SYMBOL(dm_snap_cow);
				178
				179	static sector_t chunk_to_sector(struct dm_exception_store *store,
				180	chunk_t chunk)
				181	{
				182	return chunk << store->chunk_shift;
				183	}
				184
				185	static int bdev_equal(struct block_device lhs, struct block_device rhs)
				186	{
				187	/*
				188	* There is only ever one instance of a particular block
				189	* device so we can compare pointers safely.
				190	*/
				191	return lhs == rhs;
				192	}
				193
				194	struct dm_snap_pending_exception {
				195	struct dm_exception e;
				196
				197	/*
				198	* Origin buffers waiting for this to complete are held
				199	* in a bio list
				200	*/
				201	struct bio_list origin_bios;
				202	struct bio_list snapshot_bios;
				203
				204	/* Pointer back to snapshot context */
				205	struct dm_snapshot *snap;
				206
				207	/*
				208	* 1 indicates the exception has already been sent to
				209	* kcopyd.
				210	*/
				211	int started;
				212
				213	/* There was copying error. */
				214	int copy_error;
				215
				216	/* A sequence number, it is used for in-order completion. */
				217	sector_t exception_sequence;
				218
				219	struct rb_node out_of_order_node;
				220
				221	/*
				222	* For writing a complete chunk, bypassing the copy.
				223	*/
				224	struct bio *full_bio;
				225	bio_end_io_t *full_bio_end_io;
				226	};
				227
				228	/*
				229	* Hash table mapping origin volumes to lists of snapshots and
				230	* a lock to protect it
				231	*/
				232	static struct kmem_cache *exception_cache;
				233	static struct kmem_cache *pending_cache;
				234
				235	struct dm_snap_tracked_chunk {
				236	struct hlist_node node;
				237	chunk_t chunk;
				238	};
				239
				240	static void init_tracked_chunk(struct bio *bio)
				241	{
				242	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
				243	INIT_HLIST_NODE(&c->node);
				244	}
				245
				246	static bool is_bio_tracked(struct bio *bio)
				247	{
				248	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
				249	return !hlist_unhashed(&c->node);
				250	}
				251
				252	static void track_chunk(struct dm_snapshot s, struct bio bio, chunk_t chunk)
				253	{
				254	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
				255
				256	c->chunk = chunk;
				257
				258	spin_lock_irq(&s->tracked_chunk_lock);
				259	hlist_add_head(&c->node,
				260	&s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
				261	spin_unlock_irq(&s->tracked_chunk_lock);
				262	}
				263
				264	static void stop_tracking_chunk(struct dm_snapshot s, struct bio bio)
				265	{
				266	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
				267	unsigned long flags;
				268
				269	spin_lock_irqsave(&s->tracked_chunk_lock, flags);
				270	hlist_del(&c->node);
				271	spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
				272	}
				273
				274	static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
				275	{
				276	struct dm_snap_tracked_chunk *c;
				277	int found = 0;
				278
				279	spin_lock_irq(&s->tracked_chunk_lock);
				280
				281	hlist_for_each_entry(c,
				282	&s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
				283	if (c->chunk == chunk) {
				284	found = 1;
				285	break;
				286	}
				287	}
				288
				289	spin_unlock_irq(&s->tracked_chunk_lock);
				290
				291	return found;
				292	}
				293
				294	/*
				295	* This conflicting I/O is extremely improbable in the caller,
				296	* so msleep(1) is sufficient and there is no need for a wait queue.
				297	*/
				298	static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
				299	{
				300	while (__chunk_is_tracked(s, chunk))
				301	msleep(1);
				302	}
				303
				304	/*
				305	* One of these per registered origin, held in the snapshot_origins hash
				306	*/
				307	struct origin {
				308	/* The origin device */
				309	struct block_device *bdev;
				310
				311	struct list_head hash_list;
				312
				313	/* List of snapshots for this origin */
				314	struct list_head snapshots;
				315	};
				316
				317	/*
				318	* This structure is allocated for each origin target
				319	*/
				320	struct dm_origin {
				321	struct dm_dev *dev;
				322	struct dm_target *ti;
				323	unsigned split_boundary;
				324	struct list_head hash_list;
				325	};
				326
				327	/*
				328	* Size of the hash table for origin volumes. If we make this
				329	* the size of the minors list then it should be nearly perfect
				330	*/
				331	#define ORIGIN_HASH_SIZE 256
				332	#define ORIGIN_MASK 0xFF
				333	static struct list_head *_origins;
				334	static struct list_head *_dm_origins;
				335	static struct rw_semaphore _origins_lock;
				336
				337	static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
				338	static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
				339	static uint64_t _pending_exceptions_done_count;
				340
				341	static int init_origin_hash(void)
				342	{
				343	int i;
				344
				345	_origins = kmalloc_array(ORIGIN_HASH_SIZE, sizeof(struct list_head),
				346	GFP_KERNEL);
				347	if (!_origins) {
				348	DMERR("unable to allocate memory for _origins");
				349	return -ENOMEM;
				350	}
				351	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
				352	INIT_LIST_HEAD(_origins + i);
				353
				354	_dm_origins = kmalloc_array(ORIGIN_HASH_SIZE,
				355	sizeof(struct list_head),
				356	GFP_KERNEL);
				357	if (!_dm_origins) {
				358	DMERR("unable to allocate memory for _dm_origins");
				359	kfree(_origins);
				360	return -ENOMEM;
				361	}
				362	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
				363	INIT_LIST_HEAD(_dm_origins + i);
				364
				365	init_rwsem(&_origins_lock);
				366
				367	return 0;
				368	}
				369
				370	static void exit_origin_hash(void)
				371	{
				372	kfree(_origins);
				373	kfree(_dm_origins);
				374	}
				375
				376	static unsigned origin_hash(struct block_device *bdev)
				377	{
				378	return bdev->bd_dev & ORIGIN_MASK;
				379	}
				380
				381	static struct origin __lookup_origin(struct block_device origin)
				382	{
				383	struct list_head *ol;
				384	struct origin *o;
				385
				386	ol = &_origins[origin_hash(origin)];
				387	list_for_each_entry (o, ol, hash_list)
				388	if (bdev_equal(o->bdev, origin))
				389	return o;
				390
				391	return NULL;
				392	}
				393
				394	static void __insert_origin(struct origin *o)
				395	{
				396	struct list_head *sl = &_origins[origin_hash(o->bdev)];
				397	list_add_tail(&o->hash_list, sl);
				398	}
				399
				400	static struct dm_origin __lookup_dm_origin(struct block_device origin)
				401	{
				402	struct list_head *ol;
				403	struct dm_origin *o;
				404
				405	ol = &_dm_origins[origin_hash(origin)];
				406	list_for_each_entry (o, ol, hash_list)
				407	if (bdev_equal(o->dev->bdev, origin))
				408	return o;
				409
				410	return NULL;
				411	}
				412
				413	static void __insert_dm_origin(struct dm_origin *o)
				414	{
				415	struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)];
				416	list_add_tail(&o->hash_list, sl);
				417	}
				418
				419	static void __remove_dm_origin(struct dm_origin *o)
				420	{
				421	list_del(&o->hash_list);
				422	}
				423
				424	/*
				425	* _origins_lock must be held when calling this function.
				426	* Returns number of snapshots registered using the supplied cow device, plus:
				427	* snap_src - a snapshot suitable for use as a source of exception handover
				428	* snap_dest - a snapshot capable of receiving exception handover.
				429	* snap_merge - an existing snapshot-merge target linked to the same origin.
				430	* There can be at most one snapshot-merge target. The parameter is optional.
				431	*
				432	* Possible return values and states of snap_src and snap_dest.
				433	* 0: NULL, NULL - first new snapshot
				434	* 1: snap_src, NULL - normal snapshot
				435	* 2: snap_src, snap_dest - waiting for handover
				436	* 2: snap_src, NULL - handed over, waiting for old to be deleted
				437	* 1: NULL, snap_dest - source got destroyed without handover
				438	*/
				439	static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
				440	struct dm_snapshot **snap_src,
				441	struct dm_snapshot **snap_dest,
				442	struct dm_snapshot **snap_merge)
				443	{
				444	struct dm_snapshot *s;
				445	struct origin *o;
				446	int count = 0;
				447	int active;
				448
				449	o = __lookup_origin(snap->origin->bdev);
				450	if (!o)
				451	goto out;
				452
				453	list_for_each_entry(s, &o->snapshots, list) {
				454	if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
				455	*snap_merge = s;
				456	if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
				457	continue;
				458
				459	mutex_lock(&s->lock);
				460	active = s->active;
				461	mutex_unlock(&s->lock);
				462
				463	if (active) {
				464	if (snap_src)
				465	*snap_src = s;
				466	} else if (snap_dest)
				467	*snap_dest = s;
				468
				469	count++;
				470	}
				471
				472	out:
				473	return count;
				474	}
				475
				476	/*
				477	* On success, returns 1 if this snapshot is a handover destination,
				478	* otherwise returns 0.
				479	*/
				480	static int __validate_exception_handover(struct dm_snapshot *snap)
				481	{
				482	struct dm_snapshot snap_src = NULL, snap_dest = NULL;
				483	struct dm_snapshot *snap_merge = NULL;
				484
				485	/* Does snapshot need exceptions handed over to it? */
				486	if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
				487	&snap_merge) == 2) \|\|
				488	snap_dest) {
				489	snap->ti->error = "Snapshot cow pairing for exception "
				490	"table handover failed";
				491	return -EINVAL;
				492	}
				493
				494	/*
				495	* If no snap_src was found, snap cannot become a handover
				496	* destination.
				497	*/
				498	if (!snap_src)
				499	return 0;
				500
				501	/*
				502	* Non-snapshot-merge handover?
				503	*/
				504	if (!dm_target_is_snapshot_merge(snap->ti))
				505	return 1;
				506
				507	/*
				508	* Do not allow more than one merging snapshot.
				509	*/
				510	if (snap_merge) {
				511	snap->ti->error = "A snapshot is already merging.";
				512	return -EINVAL;
				513	}
				514
				515	if (!snap_src->store->type->prepare_merge \|\|
				516	!snap_src->store->type->commit_merge) {
				517	snap->ti->error = "Snapshot exception store does not "
				518	"support snapshot-merge.";
				519	return -EINVAL;
				520	}
				521
				522	return 1;
				523	}
				524
				525	static void __insert_snapshot(struct origin o, struct dm_snapshot s)
				526	{
				527	struct dm_snapshot *l;
				528
				529	/* Sort the list according to chunk size, largest-first smallest-last */
				530	list_for_each_entry(l, &o->snapshots, list)
				531	if (l->store->chunk_size < s->store->chunk_size)
				532	break;
				533	list_add_tail(&s->list, &l->list);
				534	}
				535
				536	/*
				537	* Make a note of the snapshot and its origin so we can look it
				538	* up when the origin has a write on it.
				539	*
				540	* Also validate snapshot exception store handovers.
				541	* On success, returns 1 if this registration is a handover destination,
				542	* otherwise returns 0.
				543	*/
				544	static int register_snapshot(struct dm_snapshot *snap)
				545	{
				546	struct origin o, new_o = NULL;
				547	struct block_device *bdev = snap->origin->bdev;
				548	int r = 0;
				549
				550	new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
				551	if (!new_o)
				552	return -ENOMEM;
				553
				554	down_write(&_origins_lock);
				555
				556	r = __validate_exception_handover(snap);
				557	if (r < 0) {
				558	kfree(new_o);
				559	goto out;
				560	}
				561
				562	o = __lookup_origin(bdev);
				563	if (o)
				564	kfree(new_o);
				565	else {
				566	/* New origin */
				567	o = new_o;
				568
				569	/* Initialise the struct */
				570	INIT_LIST_HEAD(&o->snapshots);
				571	o->bdev = bdev;
				572
				573	__insert_origin(o);
				574	}
				575
				576	__insert_snapshot(o, snap);
				577
				578	out:
				579	up_write(&_origins_lock);
				580
				581	return r;
				582	}
				583
				584	/*
				585	* Move snapshot to correct place in list according to chunk size.
				586	*/
				587	static void reregister_snapshot(struct dm_snapshot *s)
				588	{
				589	struct block_device *bdev = s->origin->bdev;
				590
				591	down_write(&_origins_lock);
				592
				593	list_del(&s->list);
				594	__insert_snapshot(__lookup_origin(bdev), s);
				595
				596	up_write(&_origins_lock);
				597	}
				598
				599	static void unregister_snapshot(struct dm_snapshot *s)
				600	{
				601	struct origin *o;
				602
				603	down_write(&_origins_lock);
				604	o = __lookup_origin(s->origin->bdev);
				605
				606	list_del(&s->list);
				607	if (o && list_empty(&o->snapshots)) {
				608	list_del(&o->hash_list);
				609	kfree(o);
				610	}
				611
				612	up_write(&_origins_lock);
				613	}
				614
				615	/*
				616	* Implementation of the exception hash tables.
				617	* The lowest hash_shift bits of the chunk number are ignored, allowing
				618	* some consecutive chunks to be grouped together.
				619	*/
				620	static int dm_exception_table_init(struct dm_exception_table *et,
				621	uint32_t size, unsigned hash_shift)
				622	{
				623	unsigned int i;
				624
				625	et->hash_shift = hash_shift;
				626	et->hash_mask = size - 1;
				627	et->table = dm_vcalloc(size, sizeof(struct list_head));
				628	if (!et->table)
				629	return -ENOMEM;
				630
				631	for (i = 0; i < size; i++)
				632	INIT_LIST_HEAD(et->table + i);
				633
				634	return 0;
				635	}
				636
				637	static void dm_exception_table_exit(struct dm_exception_table *et,
				638	struct kmem_cache *mem)
				639	{
				640	struct list_head *slot;
				641	struct dm_exception ex, next;
				642	int i, size;
				643
				644	size = et->hash_mask + 1;
				645	for (i = 0; i < size; i++) {
				646	slot = et->table + i;
				647
				648	list_for_each_entry_safe (ex, next, slot, hash_list)
				649	kmem_cache_free(mem, ex);
				650	}
				651
				652	vfree(et->table);
				653	}
				654
				655	static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
				656	{
				657	return (chunk >> et->hash_shift) & et->hash_mask;
				658	}
				659
				660	static void dm_remove_exception(struct dm_exception *e)
				661	{
				662	list_del(&e->hash_list);
				663	}
				664
				665	/*
				666	* Return the exception data for a sector, or NULL if not
				667	* remapped.
				668	*/
				669	static struct dm_exception dm_lookup_exception(struct dm_exception_table et,
				670	chunk_t chunk)
				671	{
				672	struct list_head *slot;
				673	struct dm_exception *e;
				674
				675	slot = &et->table[exception_hash(et, chunk)];
				676	list_for_each_entry (e, slot, hash_list)
				677	if (chunk >= e->old_chunk &&
				678	chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
				679	return e;
				680
				681	return NULL;
				682	}
				683
				684	static struct dm_exception *alloc_completed_exception(gfp_t gfp)
				685	{
				686	struct dm_exception *e;
				687
				688	e = kmem_cache_alloc(exception_cache, gfp);
				689	if (!e && gfp == GFP_NOIO)
				690	e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
				691
				692	return e;
				693	}
				694
				695	static void free_completed_exception(struct dm_exception *e)
				696	{
				697	kmem_cache_free(exception_cache, e);
				698	}
				699
				700	static struct dm_snap_pending_exception alloc_pending_exception(struct dm_snapshot s)
				701	{
				702	struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool,
				703	GFP_NOIO);
				704
				705	atomic_inc(&s->pending_exceptions_count);
				706	pe->snap = s;
				707
				708	return pe;
				709	}
				710
				711	static void free_pending_exception(struct dm_snap_pending_exception *pe)
				712	{
				713	struct dm_snapshot *s = pe->snap;
				714
				715	mempool_free(pe, &s->pending_pool);
				716	smp_mb__before_atomic();
				717	atomic_dec(&s->pending_exceptions_count);
				718	}
				719
				720	static void dm_insert_exception(struct dm_exception_table *eh,
				721	struct dm_exception *new_e)
				722	{
				723	struct list_head *l;
				724	struct dm_exception *e = NULL;
				725
				726	l = &eh->table[exception_hash(eh, new_e->old_chunk)];
				727
				728	/* Add immediately if this table doesn't support consecutive chunks */
				729	if (!eh->hash_shift)
				730	goto out;
				731
				732	/* List is ordered by old_chunk */
				733	list_for_each_entry_reverse(e, l, hash_list) {
				734	/* Insert after an existing chunk? */
				735	if (new_e->old_chunk == (e->old_chunk +
				736	dm_consecutive_chunk_count(e) + 1) &&
				737	new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
				738	dm_consecutive_chunk_count(e) + 1)) {
				739	dm_consecutive_chunk_count_inc(e);
				740	free_completed_exception(new_e);
				741	return;
				742	}
				743
				744	/* Insert before an existing chunk? */
				745	if (new_e->old_chunk == (e->old_chunk - 1) &&
				746	new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
				747	dm_consecutive_chunk_count_inc(e);
				748	e->old_chunk--;
				749	e->new_chunk--;
				750	free_completed_exception(new_e);
				751	return;
				752	}
				753
				754	if (new_e->old_chunk > e->old_chunk)
				755	break;
				756	}
				757
				758	out:
				759	list_add(&new_e->hash_list, e ? &e->hash_list : l);
				760	}
				761
				762	/*
				763	* Callback used by the exception stores to load exceptions when
				764	* initialising.
				765	*/
				766	static int dm_add_exception(void *context, chunk_t old, chunk_t new)
				767	{
				768	struct dm_snapshot *s = context;
				769	struct dm_exception *e;
				770
				771	e = alloc_completed_exception(GFP_KERNEL);
				772	if (!e)
				773	return -ENOMEM;
				774
				775	e->old_chunk = old;
				776
				777	/* Consecutive_count is implicitly initialised to zero */
				778	e->new_chunk = new;
				779
				780	dm_insert_exception(&s->complete, e);
				781
				782	return 0;
				783	}
				784
				785	/*
				786	* Return a minimum chunk size of all snapshots that have the specified origin.
				787	* Return zero if the origin has no snapshots.
				788	*/
				789	static uint32_t __minimum_chunk_size(struct origin *o)
				790	{
				791	struct dm_snapshot *snap;
				792	unsigned chunk_size = 0;
				793
				794	if (o)
				795	list_for_each_entry(snap, &o->snapshots, list)
				796	chunk_size = min_not_zero(chunk_size,
				797	snap->store->chunk_size);
				798
				799	return (uint32_t) chunk_size;
				800	}
				801
				802	/*
				803	* Hard coded magic.
				804	*/
				805	static int calc_max_buckets(void)
				806	{
				807	/* use a fixed size of 2MB */
				808	unsigned long mem = 2 * 1024 * 1024;
				809	mem /= sizeof(struct list_head);
				810
				811	return mem;
				812	}
				813
				814	/*
				815	* Allocate room for a suitable hash table.
				816	*/
				817	static int init_hash_tables(struct dm_snapshot *s)
				818	{
				819	sector_t hash_size, cow_dev_size, max_buckets;
				820
				821	/*
				822	* Calculate based on the size of the original volume or
				823	* the COW volume...
				824	*/
				825	cow_dev_size = get_dev_size(s->cow->bdev);
				826	max_buckets = calc_max_buckets();
				827
				828	hash_size = cow_dev_size >> s->store->chunk_shift;
				829	hash_size = min(hash_size, max_buckets);
				830
				831	if (hash_size < 64)
				832	hash_size = 64;
				833	hash_size = rounddown_pow_of_two(hash_size);
				834	if (dm_exception_table_init(&s->complete, hash_size,
				835	DM_CHUNK_CONSECUTIVE_BITS))
				836	return -ENOMEM;
				837
				838	/*
				839	* Allocate hash table for in-flight exceptions
				840	* Make this smaller than the real hash table
				841	*/
				842	hash_size >>= 3;
				843	if (hash_size < 64)
				844	hash_size = 64;
				845
				846	if (dm_exception_table_init(&s->pending, hash_size, 0)) {
				847	dm_exception_table_exit(&s->complete, exception_cache);
				848	return -ENOMEM;
				849	}
				850
				851	return 0;
				852	}
				853
				854	static void merge_shutdown(struct dm_snapshot *s)
				855	{
				856	clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
				857	smp_mb__after_atomic();
				858	wake_up_bit(&s->state_bits, RUNNING_MERGE);
				859	}
				860
				861	static struct bio __release_queued_bios_after_merge(struct dm_snapshot s)
				862	{
				863	s->first_merging_chunk = 0;
				864	s->num_merging_chunks = 0;
				865
				866	return bio_list_get(&s->bios_queued_during_merge);
				867	}
				868
				869	/*
				870	* Remove one chunk from the index of completed exceptions.
				871	*/
				872	static int __remove_single_exception_chunk(struct dm_snapshot *s,
				873	chunk_t old_chunk)
				874	{
				875	struct dm_exception *e;
				876
				877	e = dm_lookup_exception(&s->complete, old_chunk);
				878	if (!e) {
				879	DMERR("Corruption detected: exception for block %llu is "
				880	"on disk but not in memory",
				881	(unsigned long long)old_chunk);
				882	return -EINVAL;
				883	}
				884
				885	/*
				886	* If this is the only chunk using this exception, remove exception.
				887	*/
				888	if (!dm_consecutive_chunk_count(e)) {
				889	dm_remove_exception(e);
				890	free_completed_exception(e);
				891	return 0;
				892	}
				893
				894	/*
				895	* The chunk may be either at the beginning or the end of a
				896	* group of consecutive chunks - never in the middle. We are
				897	* removing chunks in the opposite order to that in which they
				898	* were added, so this should always be true.
				899	* Decrement the consecutive chunk counter and adjust the
				900	* starting point if necessary.
				901	*/
				902	if (old_chunk == e->old_chunk) {
				903	e->old_chunk++;
				904	e->new_chunk++;
				905	} else if (old_chunk != e->old_chunk +
				906	dm_consecutive_chunk_count(e)) {
				907	DMERR("Attempt to merge block %llu from the "
				908	"middle of a chunk range [%llu - %llu]",
				909	(unsigned long long)old_chunk,
				910	(unsigned long long)e->old_chunk,
				911	(unsigned long long)
				912	e->old_chunk + dm_consecutive_chunk_count(e));
				913	return -EINVAL;
				914	}
				915
				916	dm_consecutive_chunk_count_dec(e);
				917
				918	return 0;
				919	}
				920
				921	static void flush_bios(struct bio *bio);
				922
				923	static int remove_single_exception_chunk(struct dm_snapshot *s)
				924	{
				925	struct bio *b = NULL;
				926	int r;
				927	chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
				928
				929	mutex_lock(&s->lock);
				930
				931	/*
				932	* Process chunks (and associated exceptions) in reverse order
				933	* so that dm_consecutive_chunk_count_dec() accounting works.
				934	*/
				935	do {
				936	r = __remove_single_exception_chunk(s, old_chunk);
				937	if (r)
				938	goto out;
				939	} while (old_chunk-- > s->first_merging_chunk);
				940
				941	b = __release_queued_bios_after_merge(s);
				942
				943	out:
				944	mutex_unlock(&s->lock);
				945	if (b)
				946	flush_bios(b);
				947
				948	return r;
				949	}
				950
				951	static int origin_write_extent(struct dm_snapshot *merging_snap,
				952	sector_t sector, unsigned chunk_size);
				953
				954	static void merge_callback(int read_err, unsigned long write_err,
				955	void *context);
				956
				957	static uint64_t read_pending_exceptions_done_count(void)
				958	{
				959	uint64_t pending_exceptions_done;
				960
				961	spin_lock(&_pending_exceptions_done_spinlock);
				962	pending_exceptions_done = _pending_exceptions_done_count;
				963	spin_unlock(&_pending_exceptions_done_spinlock);
				964
				965	return pending_exceptions_done;
				966	}
				967
				968	static void increment_pending_exceptions_done_count(void)
				969	{
				970	spin_lock(&_pending_exceptions_done_spinlock);
				971	_pending_exceptions_done_count++;
				972	spin_unlock(&_pending_exceptions_done_spinlock);
				973
				974	wake_up_all(&_pending_exceptions_done);
				975	}
				976
				977	static void snapshot_merge_next_chunks(struct dm_snapshot *s)
				978	{
				979	int i, linear_chunks;
				980	chunk_t old_chunk, new_chunk;
				981	struct dm_io_region src, dest;
				982	sector_t io_size;
				983	uint64_t previous_count;
				984
				985	BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
				986	if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
				987	goto shut;
				988
				989	/*
				990	* valid flag never changes during merge, so no lock required.
				991	*/
				992	if (!s->valid) {
				993	DMERR("Snapshot is invalid: can't merge");
				994	goto shut;
				995	}
				996
				997	linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
				998	&new_chunk);
				999	if (linear_chunks <= 0) {
				1000	if (linear_chunks < 0) {
				1001	DMERR("Read error in exception store: "
				1002	"shutting down merge");
				1003	mutex_lock(&s->lock);
				1004	s->merge_failed = 1;
				1005	mutex_unlock(&s->lock);
				1006	}
				1007	goto shut;
				1008	}
				1009
				1010	/* Adjust old_chunk and new_chunk to reflect start of linear region */
				1011	old_chunk = old_chunk + 1 - linear_chunks;
				1012	new_chunk = new_chunk + 1 - linear_chunks;
				1013
				1014	/*
				1015	* Use one (potentially large) I/O to copy all 'linear_chunks'
				1016	* from the exception store to the origin
				1017	*/
				1018	io_size = linear_chunks * s->store->chunk_size;
				1019
				1020	dest.bdev = s->origin->bdev;
				1021	dest.sector = chunk_to_sector(s->store, old_chunk);
				1022	dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);
				1023
				1024	src.bdev = s->cow->bdev;
				1025	src.sector = chunk_to_sector(s->store, new_chunk);
				1026	src.count = dest.count;
				1027
				1028	/*
				1029	* Reallocate any exceptions needed in other snapshots then
				1030	* wait for the pending exceptions to complete.
				1031	* Each time any pending exception (globally on the system)
				1032	* completes we are woken and repeat the process to find out
				1033	* if we can proceed. While this may not seem a particularly
				1034	* efficient algorithm, it is not expected to have any
				1035	* significant impact on performance.
				1036	*/
				1037	previous_count = read_pending_exceptions_done_count();
				1038	while (origin_write_extent(s, dest.sector, io_size)) {
				1039	wait_event(_pending_exceptions_done,
				1040	(read_pending_exceptions_done_count() !=
				1041	previous_count));
				1042	/* Retry after the wait, until all exceptions are done. */
				1043	previous_count = read_pending_exceptions_done_count();
				1044	}
				1045
				1046	mutex_lock(&s->lock);
				1047	s->first_merging_chunk = old_chunk;
				1048	s->num_merging_chunks = linear_chunks;
				1049	mutex_unlock(&s->lock);
				1050
				1051	/* Wait until writes to all 'linear_chunks' drain */
				1052	for (i = 0; i < linear_chunks; i++)
				1053	__check_for_conflicting_io(s, old_chunk + i);
				1054
				1055	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
				1056	return;
				1057
				1058	shut:
				1059	merge_shutdown(s);
				1060	}
				1061
				1062	static void error_bios(struct bio *bio);
				1063
				1064	static void merge_callback(int read_err, unsigned long write_err, void *context)
				1065	{
				1066	struct dm_snapshot *s = context;
				1067	struct bio *b = NULL;
				1068
				1069	if (read_err \|\| write_err) {
				1070	if (read_err)
				1071	DMERR("Read error: shutting down merge.");
				1072	else
				1073	DMERR("Write error: shutting down merge.");
				1074	goto shut;
				1075	}
				1076
				1077	if (s->store->type->commit_merge(s->store,
				1078	s->num_merging_chunks) < 0) {
				1079	DMERR("Write error in exception store: shutting down merge");
				1080	goto shut;
				1081	}
				1082
				1083	if (remove_single_exception_chunk(s) < 0)
				1084	goto shut;
				1085
				1086	snapshot_merge_next_chunks(s);
				1087
				1088	return;
				1089
				1090	shut:
				1091	mutex_lock(&s->lock);
				1092	s->merge_failed = 1;
				1093	b = __release_queued_bios_after_merge(s);
				1094	mutex_unlock(&s->lock);
				1095	error_bios(b);
				1096
				1097	merge_shutdown(s);
				1098	}
				1099
				1100	static void start_merge(struct dm_snapshot *s)
				1101	{
				1102	if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
				1103	snapshot_merge_next_chunks(s);
				1104	}
				1105
				1106	/*
				1107	* Stop the merging process and wait until it finishes.
				1108	*/
				1109	static void stop_merge(struct dm_snapshot *s)
				1110	{
				1111	set_bit(SHUTDOWN_MERGE, &s->state_bits);
				1112	wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE);
				1113	clear_bit(SHUTDOWN_MERGE, &s->state_bits);
				1114	}
				1115
				1116	/*
				1117	* Construct a snapshot mapping: <origin_dev> <COW-dev> <p\|po\|n> <chunk-size>
				1118	*/
				1119	static int snapshot_ctr(struct dm_target ti, unsigned int argc, char *argv)
				1120	{
				1121	struct dm_snapshot *s;
				1122	int i;
				1123	int r = -EINVAL;
				1124	char origin_path, cow_path;
				1125	dev_t origin_dev, cow_dev;
				1126	unsigned args_used, num_flush_bios = 1;
				1127	fmode_t origin_mode = FMODE_READ;
				1128
				1129	if (argc != 4) {
				1130	ti->error = "requires exactly 4 arguments";
				1131	r = -EINVAL;
				1132	goto bad;
				1133	}
				1134
				1135	if (dm_target_is_snapshot_merge(ti)) {
				1136	num_flush_bios = 2;
				1137	origin_mode = FMODE_WRITE;
				1138	}
				1139
				1140	s = kzalloc(sizeof(*s), GFP_KERNEL);
				1141	if (!s) {
				1142	ti->error = "Cannot allocate private snapshot structure";
				1143	r = -ENOMEM;
				1144	goto bad;
				1145	}
				1146
				1147	origin_path = argv[0];
				1148	argv++;
				1149	argc--;
				1150
				1151	r = dm_get_device(ti, origin_path, origin_mode, &s->origin);
				1152	if (r) {
				1153	ti->error = "Cannot get origin device";
				1154	goto bad_origin;
				1155	}
				1156	origin_dev = s->origin->bdev->bd_dev;
				1157
				1158	cow_path = argv[0];
				1159	argv++;
				1160	argc--;
				1161
				1162	cow_dev = dm_get_dev_t(cow_path);
				1163	if (cow_dev && cow_dev == origin_dev) {
				1164	ti->error = "COW device cannot be the same as origin device";
				1165	r = -EINVAL;
				1166	goto bad_cow;
				1167	}
				1168
				1169	r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
				1170	if (r) {
				1171	ti->error = "Cannot get COW device";
				1172	goto bad_cow;
				1173	}
				1174
				1175	r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
				1176	if (r) {
				1177	ti->error = "Couldn't create exception store";
				1178	r = -EINVAL;
				1179	goto bad_store;
				1180	}
				1181
				1182	argv += args_used;
				1183	argc -= args_used;
				1184
				1185	s->ti = ti;
				1186	s->valid = 1;
				1187	s->snapshot_overflowed = 0;
				1188	s->active = 0;
				1189	atomic_set(&s->pending_exceptions_count, 0);
				1190	s->exception_start_sequence = 0;
				1191	s->exception_complete_sequence = 0;
				1192	s->out_of_order_tree = RB_ROOT;
				1193	mutex_init(&s->lock);
				1194	INIT_LIST_HEAD(&s->list);
				1195	spin_lock_init(&s->pe_lock);
				1196	s->state_bits = 0;
				1197	s->merge_failed = 0;
				1198	s->first_merging_chunk = 0;
				1199	s->num_merging_chunks = 0;
				1200	bio_list_init(&s->bios_queued_during_merge);
				1201
				1202	/* Allocate hash table for COW data */
				1203	if (init_hash_tables(s)) {
				1204	ti->error = "Unable to allocate hash table space";
				1205	r = -ENOMEM;
				1206	goto bad_hash_tables;
				1207	}
				1208
				1209	init_waitqueue_head(&s->in_progress_wait);
				1210
				1211	s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
				1212	if (IS_ERR(s->kcopyd_client)) {
				1213	r = PTR_ERR(s->kcopyd_client);
				1214	ti->error = "Could not create kcopyd client";
				1215	goto bad_kcopyd;
				1216	}
				1217
				1218	r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache);
				1219	if (r) {
				1220	ti->error = "Could not allocate mempool for pending exceptions";
				1221	goto bad_pending_pool;
				1222	}
				1223
				1224	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
				1225	INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
				1226
				1227	spin_lock_init(&s->tracked_chunk_lock);
				1228
				1229	ti->private = s;
				1230	ti->num_flush_bios = num_flush_bios;
				1231	ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk);
				1232
				1233	/* Add snapshot to the list of snapshots for this origin */
				1234	/* Exceptions aren't triggered till snapshot_resume() is called */
				1235	r = register_snapshot(s);
				1236	if (r == -ENOMEM) {
				1237	ti->error = "Snapshot origin struct allocation failed";
				1238	goto bad_load_and_register;
				1239	} else if (r < 0) {
				1240	/* invalid handover, register_snapshot has set ti->error */
				1241	goto bad_load_and_register;
				1242	}
				1243
				1244	/*
				1245	* Metadata must only be loaded into one table at once, so skip this
				1246	* if metadata will be handed over during resume.
				1247	* Chunk size will be set during the handover - set it to zero to
				1248	* ensure it's ignored.
				1249	*/
				1250	if (r > 0) {
				1251	s->store->chunk_size = 0;
				1252	return 0;
				1253	}
				1254
				1255	r = s->store->type->read_metadata(s->store, dm_add_exception,
				1256	(void *)s);
				1257	if (r < 0) {
				1258	ti->error = "Failed to read snapshot metadata";
				1259	goto bad_read_metadata;
				1260	} else if (r > 0) {
				1261	s->valid = 0;
				1262	DMWARN("Snapshot is marked invalid.");
				1263	}
				1264
				1265	if (!s->store->chunk_size) {
				1266	ti->error = "Chunk size not set";
				1267	goto bad_read_metadata;
				1268	}
				1269
				1270	r = dm_set_target_max_io_len(ti, s->store->chunk_size);
				1271	if (r)
				1272	goto bad_read_metadata;
				1273
				1274	return 0;
				1275
				1276	bad_read_metadata:
				1277	unregister_snapshot(s);
				1278
				1279	bad_load_and_register:
				1280	mempool_exit(&s->pending_pool);
				1281
				1282	bad_pending_pool:
				1283	dm_kcopyd_client_destroy(s->kcopyd_client);
				1284
				1285	bad_kcopyd:
				1286	dm_exception_table_exit(&s->pending, pending_cache);
				1287	dm_exception_table_exit(&s->complete, exception_cache);
				1288
				1289	bad_hash_tables:
				1290	dm_exception_store_destroy(s->store);
				1291
				1292	bad_store:
				1293	dm_put_device(ti, s->cow);
				1294
				1295	bad_cow:
				1296	dm_put_device(ti, s->origin);
				1297
				1298	bad_origin:
				1299	kfree(s);
				1300
				1301	bad:
				1302	return r;
				1303	}
				1304
				1305	static void __free_exceptions(struct dm_snapshot *s)
				1306	{
				1307	dm_kcopyd_client_destroy(s->kcopyd_client);
				1308	s->kcopyd_client = NULL;
				1309
				1310	dm_exception_table_exit(&s->pending, pending_cache);
				1311	dm_exception_table_exit(&s->complete, exception_cache);
				1312	}
				1313
				1314	static void __handover_exceptions(struct dm_snapshot *snap_src,
				1315	struct dm_snapshot *snap_dest)
				1316	{
				1317	union {
				1318	struct dm_exception_table table_swap;
				1319	struct dm_exception_store *store_swap;
				1320	} u;
				1321
				1322	/*
				1323	* Swap all snapshot context information between the two instances.
				1324	*/
				1325	u.table_swap = snap_dest->complete;
				1326	snap_dest->complete = snap_src->complete;
				1327	snap_src->complete = u.table_swap;
				1328
				1329	u.store_swap = snap_dest->store;
				1330	snap_dest->store = snap_src->store;
				1331	snap_dest->store->userspace_supports_overflow = u.store_swap->userspace_supports_overflow;
				1332	snap_src->store = u.store_swap;
				1333
				1334	snap_dest->store->snap = snap_dest;
				1335	snap_src->store->snap = snap_src;
				1336
				1337	snap_dest->ti->max_io_len = snap_dest->store->chunk_size;
				1338	snap_dest->valid = snap_src->valid;
				1339	snap_dest->snapshot_overflowed = snap_src->snapshot_overflowed;
				1340
				1341	/*
				1342	* Set source invalid to ensure it receives no further I/O.
				1343	*/
				1344	snap_src->valid = 0;
				1345	}
				1346
				1347	static void snapshot_dtr(struct dm_target *ti)
				1348	{
				1349	#ifdef CONFIG_DM_DEBUG
				1350	int i;
				1351	#endif
				1352	struct dm_snapshot *s = ti->private;
				1353	struct dm_snapshot snap_src = NULL, snap_dest = NULL;
				1354
				1355	down_read(&_origins_lock);
				1356	/* Check whether exception handover must be cancelled */
				1357	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
				1358	if (snap_src && snap_dest && (s == snap_src)) {
				1359	mutex_lock(&snap_dest->lock);
				1360	snap_dest->valid = 0;
				1361	mutex_unlock(&snap_dest->lock);
				1362	DMERR("Cancelling snapshot handover.");
				1363	}
				1364	up_read(&_origins_lock);
				1365
				1366	if (dm_target_is_snapshot_merge(ti))
				1367	stop_merge(s);
				1368
				1369	/* Prevent further origin writes from using this snapshot. */
				1370	/* After this returns there can be no new kcopyd jobs. */
				1371	unregister_snapshot(s);
				1372
				1373	while (atomic_read(&s->pending_exceptions_count))
				1374	msleep(1);
				1375	/*
				1376	* Ensure instructions in mempool_exit aren't reordered
				1377	* before atomic_read.
				1378	*/
				1379	smp_mb();
				1380
				1381	#ifdef CONFIG_DM_DEBUG
				1382	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
				1383	BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
				1384	#endif
				1385
				1386	__free_exceptions(s);
				1387
				1388	mempool_exit(&s->pending_pool);
				1389
				1390	dm_exception_store_destroy(s->store);
				1391
				1392	mutex_destroy(&s->lock);
				1393
				1394	dm_put_device(ti, s->cow);
				1395
				1396	dm_put_device(ti, s->origin);
				1397
				1398	WARN_ON(s->in_progress);
				1399
				1400	kfree(s);
				1401	}
				1402
				1403	static void account_start_copy(struct dm_snapshot *s)
				1404	{
				1405	spin_lock(&s->in_progress_wait.lock);
				1406	s->in_progress++;
				1407	spin_unlock(&s->in_progress_wait.lock);
				1408	}
				1409
				1410	static void account_end_copy(struct dm_snapshot *s)
				1411	{
				1412	spin_lock(&s->in_progress_wait.lock);
				1413	BUG_ON(!s->in_progress);
				1414	s->in_progress--;
				1415	if (likely(s->in_progress <= cow_threshold) &&
				1416	unlikely(waitqueue_active(&s->in_progress_wait)))
				1417	wake_up_locked(&s->in_progress_wait);
				1418	spin_unlock(&s->in_progress_wait.lock);
				1419	}
				1420
				1421	static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
				1422	{
				1423	if (unlikely(s->in_progress > cow_threshold)) {
				1424	spin_lock(&s->in_progress_wait.lock);
				1425	if (likely(s->in_progress > cow_threshold)) {
				1426	/*
				1427	* NOTE: this throttle doesn't account for whether
				1428	* the caller is servicing an IO that will trigger a COW
				1429	* so excess throttling may result for chunks not required
				1430	* to be COW'd. But if cow_threshold was reached, extra
				1431	* throttling is unlikely to negatively impact performance.
				1432	*/
				1433	DECLARE_WAITQUEUE(wait, current);
				1434	__add_wait_queue(&s->in_progress_wait, &wait);
				1435	__set_current_state(TASK_UNINTERRUPTIBLE);
				1436	spin_unlock(&s->in_progress_wait.lock);
				1437	if (unlock_origins)
				1438	up_read(&_origins_lock);
				1439	io_schedule();
				1440	remove_wait_queue(&s->in_progress_wait, &wait);
				1441	return false;
				1442	}
				1443	spin_unlock(&s->in_progress_wait.lock);
				1444	}
				1445	return true;
				1446	}
				1447
				1448	/*
				1449	* Flush a list of buffers.
				1450	*/
				1451	static void flush_bios(struct bio *bio)
				1452	{
				1453	struct bio *n;
				1454
				1455	while (bio) {
				1456	n = bio->bi_next;
				1457	bio->bi_next = NULL;
				1458	generic_make_request(bio);
				1459	bio = n;
				1460	}
				1461	}
				1462
				1463	static int do_origin(struct dm_dev origin, struct bio bio, bool limit);
				1464
				1465	/*
				1466	* Flush a list of buffers.
				1467	*/
				1468	static void retry_origin_bios(struct dm_snapshot s, struct bio bio)
				1469	{
				1470	struct bio *n;
				1471	int r;
				1472
				1473	while (bio) {
				1474	n = bio->bi_next;
				1475	bio->bi_next = NULL;
				1476	r = do_origin(s->origin, bio, false);
				1477	if (r == DM_MAPIO_REMAPPED)
				1478	generic_make_request(bio);
				1479	bio = n;
				1480	}
				1481	}
				1482
				1483	/*
				1484	* Error a list of buffers.
				1485	*/
				1486	static void error_bios(struct bio *bio)
				1487	{
				1488	struct bio *n;
				1489
				1490	while (bio) {
				1491	n = bio->bi_next;
				1492	bio->bi_next = NULL;
				1493	bio_io_error(bio);
				1494	bio = n;
				1495	}
				1496	}
				1497
				1498	static void __invalidate_snapshot(struct dm_snapshot *s, int err)
				1499	{
				1500	if (!s->valid)
				1501	return;
				1502
				1503	if (err == -EIO)
				1504	DMERR("Invalidating snapshot: Error reading/writing.");
				1505	else if (err == -ENOMEM)
				1506	DMERR("Invalidating snapshot: Unable to allocate exception.");
				1507
				1508	if (s->store->type->drop_snapshot)
				1509	s->store->type->drop_snapshot(s->store);
				1510
				1511	s->valid = 0;
				1512
				1513	dm_table_event(s->ti->table);
				1514	}
				1515
				1516	static void pending_complete(void *context, int success)
				1517	{
				1518	struct dm_snap_pending_exception *pe = context;
				1519	struct dm_exception *e;
				1520	struct dm_snapshot *s = pe->snap;
				1521	struct bio *origin_bios = NULL;
				1522	struct bio *snapshot_bios = NULL;
				1523	struct bio *full_bio = NULL;
				1524	int error = 0;
				1525
				1526	if (!success) {
				1527	/* Read/write error - snapshot is unusable */
				1528	mutex_lock(&s->lock);
				1529	__invalidate_snapshot(s, -EIO);
				1530	error = 1;
				1531	goto out;
				1532	}
				1533
				1534	e = alloc_completed_exception(GFP_NOIO);
				1535	if (!e) {
				1536	mutex_lock(&s->lock);
				1537	__invalidate_snapshot(s, -ENOMEM);
				1538	error = 1;
				1539	goto out;
				1540	}
				1541	*e = pe->e;
				1542
				1543	mutex_lock(&s->lock);
				1544	if (!s->valid) {
				1545	free_completed_exception(e);
				1546	error = 1;
				1547	goto out;
				1548	}
				1549
				1550	/* Check for conflicting reads */
				1551	__check_for_conflicting_io(s, pe->e.old_chunk);
				1552
				1553	/*
				1554	* Add a proper exception, and remove the
				1555	* in-flight exception from the list.
				1556	*/
				1557	dm_insert_exception(&s->complete, e);
				1558
				1559	out:
				1560	dm_remove_exception(&pe->e);
				1561	snapshot_bios = bio_list_get(&pe->snapshot_bios);
				1562	origin_bios = bio_list_get(&pe->origin_bios);
				1563	full_bio = pe->full_bio;
				1564	if (full_bio)
				1565	full_bio->bi_end_io = pe->full_bio_end_io;
				1566	increment_pending_exceptions_done_count();
				1567
				1568	mutex_unlock(&s->lock);
				1569
				1570	/* Submit any pending write bios */
				1571	if (error) {
				1572	if (full_bio)
				1573	bio_io_error(full_bio);
				1574	error_bios(snapshot_bios);
				1575	} else {
				1576	if (full_bio)
				1577	bio_endio(full_bio);
				1578	flush_bios(snapshot_bios);
				1579	}
				1580
				1581	retry_origin_bios(s, origin_bios);
				1582
				1583	free_pending_exception(pe);
				1584	}
				1585
				1586	static void complete_exception(struct dm_snap_pending_exception *pe)
				1587	{
				1588	struct dm_snapshot *s = pe->snap;
				1589
				1590	/* Update the metadata if we are persistent */
				1591	s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error,
				1592	pending_complete, pe);
				1593	}
				1594
				1595	/*
				1596	* Called when the copy I/O has finished. kcopyd actually runs
				1597	* this code so don't block.
				1598	*/
				1599	static void copy_callback(int read_err, unsigned long write_err, void *context)
				1600	{
				1601	struct dm_snap_pending_exception *pe = context;
				1602	struct dm_snapshot *s = pe->snap;
				1603
				1604	pe->copy_error = read_err \|\| write_err;
				1605
				1606	if (pe->exception_sequence == s->exception_complete_sequence) {
				1607	struct rb_node *next;
				1608
				1609	s->exception_complete_sequence++;
				1610	complete_exception(pe);
				1611
				1612	next = rb_first(&s->out_of_order_tree);
				1613	while (next) {
				1614	pe = rb_entry(next, struct dm_snap_pending_exception,
				1615	out_of_order_node);
				1616	if (pe->exception_sequence != s->exception_complete_sequence)
				1617	break;
				1618	next = rb_next(next);
				1619	s->exception_complete_sequence++;
				1620	rb_erase(&pe->out_of_order_node, &s->out_of_order_tree);
				1621	complete_exception(pe);
				1622	cond_resched();
				1623	}
				1624	} else {
				1625	struct rb_node *parent = NULL;
				1626	struct rb_node **p = &s->out_of_order_tree.rb_node;
				1627	struct dm_snap_pending_exception *pe2;
				1628
				1629	while (*p) {
				1630	pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node);
				1631	parent = *p;
				1632
				1633	BUG_ON(pe->exception_sequence == pe2->exception_sequence);
				1634	if (pe->exception_sequence < pe2->exception_sequence)
				1635	p = &((*p)->rb_left);
				1636	else
				1637	p = &((*p)->rb_right);
				1638	}
				1639
				1640	rb_link_node(&pe->out_of_order_node, parent, p);
				1641	rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree);
				1642	}
				1643	account_end_copy(s);
				1644	}
				1645
				1646	/*
				1647	* Dispatches the copy operation to kcopyd.
				1648	*/
				1649	static void start_copy(struct dm_snap_pending_exception *pe)
				1650	{
				1651	struct dm_snapshot *s = pe->snap;
				1652	struct dm_io_region src, dest;
				1653	struct block_device *bdev = s->origin->bdev;
				1654	sector_t dev_size;
				1655
				1656	dev_size = get_dev_size(bdev);
				1657
				1658	src.bdev = bdev;
				1659	src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
				1660	src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
				1661
				1662	dest.bdev = s->cow->bdev;
				1663	dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
				1664	dest.count = src.count;
				1665
				1666	/* Hand over to kcopyd */
				1667	account_start_copy(s);
				1668	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
				1669	}
				1670
				1671	static void full_bio_end_io(struct bio *bio)
				1672	{
				1673	void *callback_data = bio->bi_private;
				1674
				1675	dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0);
				1676	}
				1677
				1678	static void start_full_bio(struct dm_snap_pending_exception *pe,
				1679	struct bio *bio)
				1680	{
				1681	struct dm_snapshot *s = pe->snap;
				1682	void *callback_data;
				1683
				1684	pe->full_bio = bio;
				1685	pe->full_bio_end_io = bio->bi_end_io;
				1686
				1687	account_start_copy(s);
				1688	callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
				1689	copy_callback, pe);
				1690
				1691	bio->bi_end_io = full_bio_end_io;
				1692	bio->bi_private = callback_data;
				1693
				1694	generic_make_request(bio);
				1695	}
				1696
				1697	static struct dm_snap_pending_exception *
				1698	__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
				1699	{
				1700	struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
				1701
				1702	if (!e)
				1703	return NULL;
				1704
				1705	return container_of(e, struct dm_snap_pending_exception, e);
				1706	}
				1707
				1708	/*
				1709	* Looks to see if this snapshot already has a pending exception
				1710	* for this chunk, otherwise it allocates a new one and inserts
				1711	* it into the pending table.
				1712	*
				1713	* NOTE: a write lock must be held on snap->lock before calling
				1714	* this.
				1715	*/
				1716	static struct dm_snap_pending_exception *
				1717	__find_pending_exception(struct dm_snapshot *s,
				1718	struct dm_snap_pending_exception *pe, chunk_t chunk)
				1719	{
				1720	struct dm_snap_pending_exception *pe2;
				1721
				1722	pe2 = __lookup_pending_exception(s, chunk);
				1723	if (pe2) {
				1724	free_pending_exception(pe);
				1725	return pe2;
				1726	}
				1727
				1728	pe->e.old_chunk = chunk;
				1729	bio_list_init(&pe->origin_bios);
				1730	bio_list_init(&pe->snapshot_bios);
				1731	pe->started = 0;
				1732	pe->full_bio = NULL;
				1733
				1734	if (s->store->type->prepare_exception(s->store, &pe->e)) {
				1735	free_pending_exception(pe);
				1736	return NULL;
				1737	}
				1738
				1739	pe->exception_sequence = s->exception_start_sequence++;
				1740
				1741	dm_insert_exception(&s->pending, &pe->e);
				1742
				1743	return pe;
				1744	}
				1745
				1746	static void remap_exception(struct dm_snapshot s, struct dm_exception e,
				1747	struct bio *bio, chunk_t chunk)
				1748	{
				1749	bio_set_dev(bio, s->cow->bdev);
				1750	bio->bi_iter.bi_sector =
				1751	chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
				1752	(chunk - e->old_chunk)) +
				1753	(bio->bi_iter.bi_sector & s->store->chunk_mask);
				1754	}
				1755
				1756	static int snapshot_map(struct dm_target ti, struct bio bio)
				1757	{
				1758	struct dm_exception *e;
				1759	struct dm_snapshot *s = ti->private;
				1760	int r = DM_MAPIO_REMAPPED;
				1761	chunk_t chunk;
				1762	struct dm_snap_pending_exception *pe = NULL;
				1763
				1764	init_tracked_chunk(bio);
				1765
				1766	if (bio->bi_opf & REQ_PREFLUSH) {
				1767	bio_set_dev(bio, s->cow->bdev);
				1768	return DM_MAPIO_REMAPPED;
				1769	}
				1770
				1771	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
				1772
				1773	/* Full snapshots are not usable */
				1774	/* To get here the table must be live so s->active is always set. */
				1775	if (!s->valid)
				1776	return DM_MAPIO_KILL;
				1777
				1778	if (bio_data_dir(bio) == WRITE) {
				1779	while (unlikely(!wait_for_in_progress(s, false)))
				1780	; /* wait_for_in_progress() has slept */
				1781	}
				1782
				1783	mutex_lock(&s->lock);
				1784
				1785	if (!s->valid \|\| (unlikely(s->snapshot_overflowed) &&
				1786	bio_data_dir(bio) == WRITE)) {
				1787	r = DM_MAPIO_KILL;
				1788	goto out_unlock;
				1789	}
				1790
				1791	/* If the block is already remapped - use that, else remap it */
				1792	e = dm_lookup_exception(&s->complete, chunk);
				1793	if (e) {
				1794	remap_exception(s, e, bio, chunk);
				1795	goto out_unlock;
				1796	}
				1797
				1798	/*
				1799	* Write to snapshot - higher level takes care of RW/RO
				1800	* flags so we should only get this if we are
				1801	* writeable.
				1802	*/
				1803	if (bio_data_dir(bio) == WRITE) {
				1804	pe = __lookup_pending_exception(s, chunk);
				1805	if (!pe) {
				1806	mutex_unlock(&s->lock);
				1807	pe = alloc_pending_exception(s);
				1808	mutex_lock(&s->lock);
				1809
				1810	if (!s->valid \|\| s->snapshot_overflowed) {
				1811	free_pending_exception(pe);
				1812	r = DM_MAPIO_KILL;
				1813	goto out_unlock;
				1814	}
				1815
				1816	e = dm_lookup_exception(&s->complete, chunk);
				1817	if (e) {
				1818	free_pending_exception(pe);
				1819	remap_exception(s, e, bio, chunk);
				1820	goto out_unlock;
				1821	}
				1822
				1823	pe = __find_pending_exception(s, pe, chunk);
				1824	if (!pe) {
				1825	if (s->store->userspace_supports_overflow) {
				1826	s->snapshot_overflowed = 1;
				1827	DMERR("Snapshot overflowed: Unable to allocate exception.");
				1828	} else
				1829	__invalidate_snapshot(s, -ENOMEM);
				1830	r = DM_MAPIO_KILL;
				1831	goto out_unlock;
				1832	}
				1833	}
				1834
				1835	remap_exception(s, &pe->e, bio, chunk);
				1836
				1837	r = DM_MAPIO_SUBMITTED;
				1838
				1839	if (!pe->started &&
				1840	bio->bi_iter.bi_size ==
				1841	(s->store->chunk_size << SECTOR_SHIFT)) {
				1842	pe->started = 1;
				1843	mutex_unlock(&s->lock);
				1844	start_full_bio(pe, bio);
				1845	goto out;
				1846	}
				1847
				1848	bio_list_add(&pe->snapshot_bios, bio);
				1849
				1850	if (!pe->started) {
				1851	/* this is protected by snap->lock */
				1852	pe->started = 1;
				1853	mutex_unlock(&s->lock);
				1854	start_copy(pe);
				1855	goto out;
				1856	}
				1857	} else {
				1858	bio_set_dev(bio, s->origin->bdev);
				1859	track_chunk(s, bio, chunk);
				1860	}
				1861
				1862	out_unlock:
				1863	mutex_unlock(&s->lock);
				1864	out:
				1865	return r;
				1866	}
				1867
				1868	/*
				1869	* A snapshot-merge target behaves like a combination of a snapshot
				1870	* target and a snapshot-origin target. It only generates new
				1871	* exceptions in other snapshots and not in the one that is being
				1872	* merged.
				1873	*
				1874	* For each chunk, if there is an existing exception, it is used to
				1875	* redirect I/O to the cow device. Otherwise I/O is sent to the origin,
				1876	* which in turn might generate exceptions in other snapshots.
				1877	* If merging is currently taking place on the chunk in question, the
				1878	* I/O is deferred by adding it to s->bios_queued_during_merge.
				1879	*/
				1880	static int snapshot_merge_map(struct dm_target ti, struct bio bio)
				1881	{
				1882	struct dm_exception *e;
				1883	struct dm_snapshot *s = ti->private;
				1884	int r = DM_MAPIO_REMAPPED;
				1885	chunk_t chunk;
				1886
				1887	init_tracked_chunk(bio);
				1888
				1889	if (bio->bi_opf & REQ_PREFLUSH) {
				1890	if (!dm_bio_get_target_bio_nr(bio))
				1891	bio_set_dev(bio, s->origin->bdev);
				1892	else
				1893	bio_set_dev(bio, s->cow->bdev);
				1894	return DM_MAPIO_REMAPPED;
				1895	}
				1896
				1897	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
				1898
				1899	mutex_lock(&s->lock);
				1900
				1901	/* Full merging snapshots are redirected to the origin */
				1902	if (!s->valid)
				1903	goto redirect_to_origin;
				1904
				1905	/* If the block is already remapped - use that */
				1906	e = dm_lookup_exception(&s->complete, chunk);
				1907	if (e) {
				1908	/* Queue writes overlapping with chunks being merged */
				1909	if (bio_data_dir(bio) == WRITE &&
				1910	chunk >= s->first_merging_chunk &&
				1911	chunk < (s->first_merging_chunk +
				1912	s->num_merging_chunks)) {
				1913	bio_set_dev(bio, s->origin->bdev);
				1914	bio_list_add(&s->bios_queued_during_merge, bio);
				1915	r = DM_MAPIO_SUBMITTED;
				1916	goto out_unlock;
				1917	}
				1918
				1919	remap_exception(s, e, bio, chunk);
				1920
				1921	if (bio_data_dir(bio) == WRITE)
				1922	track_chunk(s, bio, chunk);
				1923	goto out_unlock;
				1924	}
				1925
				1926	redirect_to_origin:
				1927	bio_set_dev(bio, s->origin->bdev);
				1928
				1929	if (bio_data_dir(bio) == WRITE) {
				1930	mutex_unlock(&s->lock);
				1931	return do_origin(s->origin, bio, false);
				1932	}
				1933
				1934	out_unlock:
				1935	mutex_unlock(&s->lock);
				1936
				1937	return r;
				1938	}
				1939
				1940	static int snapshot_end_io(struct dm_target ti, struct bio bio,
				1941	blk_status_t *error)
				1942	{
				1943	struct dm_snapshot *s = ti->private;
				1944
				1945	if (is_bio_tracked(bio))
				1946	stop_tracking_chunk(s, bio);
				1947
				1948	return DM_ENDIO_DONE;
				1949	}
				1950
				1951	static void snapshot_merge_presuspend(struct dm_target *ti)
				1952	{
				1953	struct dm_snapshot *s = ti->private;
				1954
				1955	stop_merge(s);
				1956	}
				1957
				1958	static int snapshot_preresume(struct dm_target *ti)
				1959	{
				1960	int r = 0;
				1961	struct dm_snapshot *s = ti->private;
				1962	struct dm_snapshot snap_src = NULL, snap_dest = NULL;
				1963
				1964	down_read(&_origins_lock);
				1965	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
				1966	if (snap_src && snap_dest) {
				1967	mutex_lock(&snap_src->lock);
				1968	if (s == snap_src) {
				1969	DMERR("Unable to resume snapshot source until "
				1970	"handover completes.");
				1971	r = -EINVAL;
				1972	} else if (!dm_suspended(snap_src->ti)) {
				1973	DMERR("Unable to perform snapshot handover until "
				1974	"source is suspended.");
				1975	r = -EINVAL;
				1976	}
				1977	mutex_unlock(&snap_src->lock);
				1978	}
				1979	up_read(&_origins_lock);
				1980
				1981	return r;
				1982	}
				1983
				1984	static void snapshot_resume(struct dm_target *ti)
				1985	{
				1986	struct dm_snapshot *s = ti->private;
				1987	struct dm_snapshot snap_src = NULL, snap_dest = NULL, *snap_merging = NULL;
				1988	struct dm_origin *o;
				1989	struct mapped_device *origin_md = NULL;
				1990	bool must_restart_merging = false;
				1991
				1992	down_read(&_origins_lock);
				1993
				1994	o = __lookup_dm_origin(s->origin->bdev);
				1995	if (o)
				1996	origin_md = dm_table_get_md(o->ti->table);
				1997	if (!origin_md) {
				1998	(void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging);
				1999	if (snap_merging)
				2000	origin_md = dm_table_get_md(snap_merging->ti->table);
				2001	}
				2002	if (origin_md == dm_table_get_md(ti->table))
				2003	origin_md = NULL;
				2004	if (origin_md) {
				2005	if (dm_hold(origin_md))
				2006	origin_md = NULL;
				2007	}
				2008
				2009	up_read(&_origins_lock);
				2010
				2011	if (origin_md) {
				2012	dm_internal_suspend_fast(origin_md);
				2013	if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) {
				2014	must_restart_merging = true;
				2015	stop_merge(snap_merging);
				2016	}
				2017	}
				2018
				2019	down_read(&_origins_lock);
				2020
				2021	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
				2022	if (snap_src && snap_dest) {
				2023	mutex_lock(&snap_src->lock);
				2024	mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
				2025	__handover_exceptions(snap_src, snap_dest);
				2026	mutex_unlock(&snap_dest->lock);
				2027	mutex_unlock(&snap_src->lock);
				2028	}
				2029
				2030	up_read(&_origins_lock);
				2031
				2032	if (origin_md) {
				2033	if (must_restart_merging)
				2034	start_merge(snap_merging);
				2035	dm_internal_resume_fast(origin_md);
				2036	dm_put(origin_md);
				2037	}
				2038
				2039	/* Now we have correct chunk size, reregister */
				2040	reregister_snapshot(s);
				2041
				2042	mutex_lock(&s->lock);
				2043	s->active = 1;
				2044	mutex_unlock(&s->lock);
				2045	}
				2046
				2047	static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
				2048	{
				2049	uint32_t min_chunksize;
				2050
				2051	down_read(&_origins_lock);
				2052	min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
				2053	up_read(&_origins_lock);
				2054
				2055	return min_chunksize;
				2056	}
				2057
				2058	static void snapshot_merge_resume(struct dm_target *ti)
				2059	{
				2060	struct dm_snapshot *s = ti->private;
				2061
				2062	/*
				2063	* Handover exceptions from existing snapshot.
				2064	*/
				2065	snapshot_resume(ti);
				2066
				2067	/*
				2068	* snapshot-merge acts as an origin, so set ti->max_io_len
				2069	*/
				2070	ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev);
				2071
				2072	start_merge(s);
				2073	}
				2074
				2075	static void snapshot_status(struct dm_target *ti, status_type_t type,
				2076	unsigned status_flags, char *result, unsigned maxlen)
				2077	{
				2078	unsigned sz = 0;
				2079	struct dm_snapshot *snap = ti->private;
				2080
				2081	switch (type) {
				2082	case STATUSTYPE_INFO:
				2083
				2084	mutex_lock(&snap->lock);
				2085
				2086	if (!snap->valid)
				2087	DMEMIT("Invalid");
				2088	else if (snap->merge_failed)
				2089	DMEMIT("Merge failed");
				2090	else if (snap->snapshot_overflowed)
				2091	DMEMIT("Overflow");
				2092	else {
				2093	if (snap->store->type->usage) {
				2094	sector_t total_sectors, sectors_allocated,
				2095	metadata_sectors;
				2096	snap->store->type->usage(snap->store,
				2097	&total_sectors,
				2098	&sectors_allocated,
				2099	&metadata_sectors);
				2100	DMEMIT("%llu/%llu %llu",
				2101	(unsigned long long)sectors_allocated,
				2102	(unsigned long long)total_sectors,
				2103	(unsigned long long)metadata_sectors);
				2104	}
				2105	else
				2106	DMEMIT("Unknown");
				2107	}
				2108
				2109	mutex_unlock(&snap->lock);
				2110
				2111	break;
				2112
				2113	case STATUSTYPE_TABLE:
				2114	/*
				2115	* kdevname returns a static pointer so we need
				2116	* to make private copies if the output is to
				2117	* make sense.
				2118	*/
				2119	DMEMIT("%s %s", snap->origin->name, snap->cow->name);
				2120	snap->store->type->status(snap->store, type, result + sz,
				2121	maxlen - sz);
				2122	break;
				2123	}
				2124	}
				2125
				2126	static int snapshot_iterate_devices(struct dm_target *ti,
				2127	iterate_devices_callout_fn fn, void *data)
				2128	{
				2129	struct dm_snapshot *snap = ti->private;
				2130	int r;
				2131
				2132	r = fn(ti, snap->origin, 0, ti->len, data);
				2133
				2134	if (!r)
				2135	r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data);
				2136
				2137	return r;
				2138	}
				2139
				2140
				2141	/*-----------------------------------------------------------------
				2142	* Origin methods
				2143	*---------------------------------------------------------------*/
				2144
				2145	/*
				2146	* If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
				2147	* supplied bio was ignored. The caller may submit it immediately.
				2148	* (No remapping actually occurs as the origin is always a direct linear
				2149	* map.)
				2150	*
				2151	* If further exceptions are required, DM_MAPIO_SUBMITTED is returned
				2152	* and any supplied bio is added to a list to be submitted once all
				2153	* the necessary exceptions exist.
				2154	*/
				2155	static int __origin_write(struct list_head *snapshots, sector_t sector,
				2156	struct bio *bio)
				2157	{
				2158	int r = DM_MAPIO_REMAPPED;
				2159	struct dm_snapshot *snap;
				2160	struct dm_exception *e;
				2161	struct dm_snap_pending_exception *pe;
				2162	struct dm_snap_pending_exception *pe_to_start_now = NULL;
				2163	struct dm_snap_pending_exception *pe_to_start_last = NULL;
				2164	chunk_t chunk;
				2165
				2166	/* Do all the snapshots on this origin */
				2167	list_for_each_entry (snap, snapshots, list) {
				2168	/*
				2169	* Don't make new exceptions in a merging snapshot
				2170	* because it has effectively been deleted
				2171	*/
				2172	if (dm_target_is_snapshot_merge(snap->ti))
				2173	continue;
				2174
				2175	mutex_lock(&snap->lock);
				2176
				2177	/* Only deal with valid and active snapshots */
				2178	if (!snap->valid \|\| !snap->active)
				2179	goto next_snapshot;
				2180
				2181	/* Nothing to do if writing beyond end of snapshot */
				2182	if (sector >= dm_table_get_size(snap->ti->table))
				2183	goto next_snapshot;
				2184
				2185	/*
				2186	* Remember, different snapshots can have
				2187	* different chunk sizes.
				2188	*/
				2189	chunk = sector_to_chunk(snap->store, sector);
				2190
				2191	/*
				2192	* Check exception table to see if block
				2193	* is already remapped in this snapshot
				2194	* and trigger an exception if not.
				2195	*/
				2196	e = dm_lookup_exception(&snap->complete, chunk);
				2197	if (e)
				2198	goto next_snapshot;
				2199
				2200	pe = __lookup_pending_exception(snap, chunk);
				2201	if (!pe) {
				2202	mutex_unlock(&snap->lock);
				2203	pe = alloc_pending_exception(snap);
				2204	mutex_lock(&snap->lock);
				2205
				2206	if (!snap->valid) {
				2207	free_pending_exception(pe);
				2208	goto next_snapshot;
				2209	}
				2210
				2211	e = dm_lookup_exception(&snap->complete, chunk);
				2212	if (e) {
				2213	free_pending_exception(pe);
				2214	goto next_snapshot;
				2215	}
				2216
				2217	pe = __find_pending_exception(snap, pe, chunk);
				2218	if (!pe) {
				2219	__invalidate_snapshot(snap, -ENOMEM);
				2220	goto next_snapshot;
				2221	}
				2222	}
				2223
				2224	r = DM_MAPIO_SUBMITTED;
				2225
				2226	/*
				2227	* If an origin bio was supplied, queue it to wait for the
				2228	* completion of this exception, and start this one last,
				2229	* at the end of the function.
				2230	*/
				2231	if (bio) {
				2232	bio_list_add(&pe->origin_bios, bio);
				2233	bio = NULL;
				2234
				2235	if (!pe->started) {
				2236	pe->started = 1;
				2237	pe_to_start_last = pe;
				2238	}
				2239	}
				2240
				2241	if (!pe->started) {
				2242	pe->started = 1;
				2243	pe_to_start_now = pe;
				2244	}
				2245
				2246	next_snapshot:
				2247	mutex_unlock(&snap->lock);
				2248
				2249	if (pe_to_start_now) {
				2250	start_copy(pe_to_start_now);
				2251	pe_to_start_now = NULL;
				2252	}
				2253	}
				2254
				2255	/*
				2256	* Submit the exception against which the bio is queued last,
				2257	* to give the other exceptions a head start.
				2258	*/
				2259	if (pe_to_start_last)
				2260	start_copy(pe_to_start_last);
				2261
				2262	return r;
				2263	}
				2264
				2265	/*
				2266	* Called on a write from the origin driver.
				2267	*/
				2268	static int do_origin(struct dm_dev origin, struct bio bio, bool limit)
				2269	{
				2270	struct origin *o;
				2271	int r = DM_MAPIO_REMAPPED;
				2272
				2273	again:
				2274	down_read(&_origins_lock);
				2275	o = __lookup_origin(origin->bdev);
				2276	if (o) {
				2277	if (limit) {
				2278	struct dm_snapshot *s;
				2279	list_for_each_entry(s, &o->snapshots, list)
				2280	if (unlikely(!wait_for_in_progress(s, true)))
				2281	goto again;
				2282	}
				2283
				2284	r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
				2285	}
				2286	up_read(&_origins_lock);
				2287
				2288	return r;
				2289	}
				2290
				2291	/*
				2292	* Trigger exceptions in all non-merging snapshots.
				2293	*
				2294	* The chunk size of the merging snapshot may be larger than the chunk
				2295	* size of some other snapshot so we may need to reallocate multiple
				2296	* chunks in other snapshots.
				2297	*
				2298	* We scan all the overlapping exceptions in the other snapshots.
				2299	* Returns 1 if anything was reallocated and must be waited for,
				2300	* otherwise returns 0.
				2301	*
				2302	* size must be a multiple of merging_snap's chunk_size.
				2303	*/
				2304	static int origin_write_extent(struct dm_snapshot *merging_snap,
				2305	sector_t sector, unsigned size)
				2306	{
				2307	int must_wait = 0;
				2308	sector_t n;
				2309	struct origin *o;
				2310
				2311	/*
				2312	* The origin's __minimum_chunk_size() got stored in max_io_len
				2313	* by snapshot_merge_resume().
				2314	*/
				2315	down_read(&_origins_lock);
				2316	o = __lookup_origin(merging_snap->origin->bdev);
				2317	for (n = 0; n < size; n += merging_snap->ti->max_io_len)
				2318	if (__origin_write(&o->snapshots, sector + n, NULL) ==
				2319	DM_MAPIO_SUBMITTED)
				2320	must_wait = 1;
				2321	up_read(&_origins_lock);
				2322
				2323	return must_wait;
				2324	}
				2325
				2326	/*
				2327	* Origin: maps a linear range of a device, with hooks for snapshotting.
				2328	*/
				2329
				2330	/*
				2331	* Construct an origin mapping: <dev_path>
				2332	* The context for an origin is merely a 'struct dm_dev *'
				2333	* pointing to the real device.
				2334	*/
				2335	static int origin_ctr(struct dm_target ti, unsigned int argc, char *argv)
				2336	{
				2337	int r;
				2338	struct dm_origin *o;
				2339
				2340	if (argc != 1) {
				2341	ti->error = "origin: incorrect number of arguments";
				2342	return -EINVAL;
				2343	}
				2344
				2345	o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL);
				2346	if (!o) {
				2347	ti->error = "Cannot allocate private origin structure";
				2348	r = -ENOMEM;
				2349	goto bad_alloc;
				2350	}
				2351
				2352	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev);
				2353	if (r) {
				2354	ti->error = "Cannot get target device";
				2355	goto bad_open;
				2356	}
				2357
				2358	o->ti = ti;
				2359	ti->private = o;
				2360	ti->num_flush_bios = 1;
				2361
				2362	return 0;
				2363
				2364	bad_open:
				2365	kfree(o);
				2366	bad_alloc:
				2367	return r;
				2368	}
				2369
				2370	static void origin_dtr(struct dm_target *ti)
				2371	{
				2372	struct dm_origin *o = ti->private;
				2373
				2374	dm_put_device(ti, o->dev);
				2375	kfree(o);
				2376	}
				2377
				2378	static int origin_map(struct dm_target ti, struct bio bio)
				2379	{
				2380	struct dm_origin *o = ti->private;
				2381	unsigned available_sectors;
				2382
				2383	bio_set_dev(bio, o->dev->bdev);
				2384
				2385	if (unlikely(bio->bi_opf & REQ_PREFLUSH))
				2386	return DM_MAPIO_REMAPPED;
				2387
				2388	if (bio_data_dir(bio) != WRITE)
				2389	return DM_MAPIO_REMAPPED;
				2390
				2391	available_sectors = o->split_boundary -
				2392	((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1));
				2393
				2394	if (bio_sectors(bio) > available_sectors)
				2395	dm_accept_partial_bio(bio, available_sectors);
				2396
				2397	/* Only tell snapshots if this is a write */
				2398	return do_origin(o->dev, bio, true);
				2399	}
				2400
				2401	static long origin_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
				2402	long nr_pages, void *kaddr, pfn_t pfn)
				2403	{
				2404	DMWARN("device does not support dax.");
				2405	return -EIO;
				2406	}
				2407
				2408	/*
				2409	* Set the target "max_io_len" field to the minimum of all the snapshots'
				2410	* chunk sizes.
				2411	*/
				2412	static void origin_resume(struct dm_target *ti)
				2413	{
				2414	struct dm_origin *o = ti->private;
				2415
				2416	o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
				2417
				2418	down_write(&_origins_lock);
				2419	__insert_dm_origin(o);
				2420	up_write(&_origins_lock);
				2421	}
				2422
				2423	static void origin_postsuspend(struct dm_target *ti)
				2424	{
				2425	struct dm_origin *o = ti->private;
				2426
				2427	down_write(&_origins_lock);
				2428	__remove_dm_origin(o);
				2429	up_write(&_origins_lock);
				2430	}
				2431
				2432	static void origin_status(struct dm_target *ti, status_type_t type,
				2433	unsigned status_flags, char *result, unsigned maxlen)
				2434	{
				2435	struct dm_origin *o = ti->private;
				2436
				2437	switch (type) {
				2438	case STATUSTYPE_INFO:
				2439	result[0] = '\0';
				2440	break;
				2441
				2442	case STATUSTYPE_TABLE:
				2443	snprintf(result, maxlen, "%s", o->dev->name);
				2444	break;
				2445	}
				2446	}
				2447
				2448	static int origin_iterate_devices(struct dm_target *ti,
				2449	iterate_devices_callout_fn fn, void *data)
				2450	{
				2451	struct dm_origin *o = ti->private;
				2452
				2453	return fn(ti, o->dev, 0, ti->len, data);
				2454	}
				2455
				2456	static struct target_type origin_target = {
				2457	.name = "snapshot-origin",
				2458	.version = {1, 9, 0},
				2459	.module = THIS_MODULE,
				2460	.ctr = origin_ctr,
				2461	.dtr = origin_dtr,
				2462	.map = origin_map,
				2463	.resume = origin_resume,
				2464	.postsuspend = origin_postsuspend,
				2465	.status = origin_status,
				2466	.iterate_devices = origin_iterate_devices,
				2467	.direct_access = origin_dax_direct_access,
				2468	};
				2469
				2470	static struct target_type snapshot_target = {
				2471	.name = "snapshot",
				2472	.version = {1, 15, 0},
				2473	.module = THIS_MODULE,
				2474	.ctr = snapshot_ctr,
				2475	.dtr = snapshot_dtr,
				2476	.map = snapshot_map,
				2477	.end_io = snapshot_end_io,
				2478	.preresume = snapshot_preresume,
				2479	.resume = snapshot_resume,
				2480	.status = snapshot_status,
				2481	.iterate_devices = snapshot_iterate_devices,
				2482	};
				2483
				2484	static struct target_type merge_target = {
				2485	.name = dm_snapshot_merge_target_name,
				2486	.version = {1, 4, 0},
				2487	.module = THIS_MODULE,
				2488	.ctr = snapshot_ctr,
				2489	.dtr = snapshot_dtr,
				2490	.map = snapshot_merge_map,
				2491	.end_io = snapshot_end_io,
				2492	.presuspend = snapshot_merge_presuspend,
				2493	.preresume = snapshot_preresume,
				2494	.resume = snapshot_merge_resume,
				2495	.status = snapshot_status,
				2496	.iterate_devices = snapshot_iterate_devices,
				2497	};
				2498
				2499	static int __init dm_snapshot_init(void)
				2500	{
				2501	int r;
				2502
				2503	r = dm_exception_store_init();
				2504	if (r) {
				2505	DMERR("Failed to initialize exception stores");
				2506	return r;
				2507	}
				2508
				2509	r = init_origin_hash();
				2510	if (r) {
				2511	DMERR("init_origin_hash failed.");
				2512	goto bad_origin_hash;
				2513	}
				2514
				2515	exception_cache = KMEM_CACHE(dm_exception, 0);
				2516	if (!exception_cache) {
				2517	DMERR("Couldn't create exception cache.");
				2518	r = -ENOMEM;
				2519	goto bad_exception_cache;
				2520	}
				2521
				2522	pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
				2523	if (!pending_cache) {
				2524	DMERR("Couldn't create pending cache.");
				2525	r = -ENOMEM;
				2526	goto bad_pending_cache;
				2527	}
				2528
				2529	r = dm_register_target(&snapshot_target);
				2530	if (r < 0) {
				2531	DMERR("snapshot target register failed %d", r);
				2532	goto bad_register_snapshot_target;
				2533	}
				2534
				2535	r = dm_register_target(&origin_target);
				2536	if (r < 0) {
				2537	DMERR("Origin target register failed %d", r);
				2538	goto bad_register_origin_target;
				2539	}
				2540
				2541	r = dm_register_target(&merge_target);
				2542	if (r < 0) {
				2543	DMERR("Merge target register failed %d", r);
				2544	goto bad_register_merge_target;
				2545	}
				2546
				2547	return 0;
				2548
				2549	bad_register_merge_target:
				2550	dm_unregister_target(&origin_target);
				2551	bad_register_origin_target:
				2552	dm_unregister_target(&snapshot_target);
				2553	bad_register_snapshot_target:
				2554	kmem_cache_destroy(pending_cache);
				2555	bad_pending_cache:
				2556	kmem_cache_destroy(exception_cache);
				2557	bad_exception_cache:
				2558	exit_origin_hash();
				2559	bad_origin_hash:
				2560	dm_exception_store_exit();
				2561
				2562	return r;
				2563	}
				2564
				2565	static void __exit dm_snapshot_exit(void)
				2566	{
				2567	dm_unregister_target(&snapshot_target);
				2568	dm_unregister_target(&origin_target);
				2569	dm_unregister_target(&merge_target);
				2570
				2571	exit_origin_hash();
				2572	kmem_cache_destroy(pending_cache);
				2573	kmem_cache_destroy(exception_cache);
				2574
				2575	dm_exception_store_exit();
				2576	}
				2577
				2578	/* Module hooks */
				2579	module_init(dm_snapshot_init);
				2580	module_exit(dm_snapshot_exit);
				2581
				2582	MODULE_DESCRIPTION(DM_NAME " snapshot target");
				2583	MODULE_AUTHOR("Joe Thornber");
				2584	MODULE_LICENSE("GPL");
				2585	MODULE_ALIAS("dm-snapshot-origin");
				2586	MODULE_ALIAS("dm-snapshot-merge");