Blame - marvell/linux/drivers/md/dm-snap.c - T108

blob: d8902d2b6aa6696ccacab328f226b3fa419e0010 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include <linux/blkdev.h>
				8	#include <linux/device-mapper.h>
				9	#include <linux/delay.h>
				10	#include <linux/fs.h>
				11	#include <linux/init.h>
				12	#include <linux/kdev_t.h>
				13	#include <linux/list.h>
				14	#include <linux/list_bl.h>
				15	#include <linux/mempool.h>
				16	#include <linux/module.h>
				17	#include <linux/slab.h>
				18	#include <linux/vmalloc.h>
				19	#include <linux/log2.h>
				20	#include <linux/dm-kcopyd.h>
				21
				22	#include "dm.h"
				23
				24	#include "dm-exception-store.h"
				25
				/* Log prefix; presumably picked up by DMERR() and the other DM message macros. */
				#define DM_MSG_PREFIX "snapshots"

				/*
				 * Name of the snapshot-merge target type.  The macro below compares
				 * addresses rather than string contents, so a target only matches when
				 * its type name points at this very array.
				 */
				static const char dm_snapshot_merge_target_name[] = "snapshot-merge";

				#define dm_target_is_snapshot_merge(ti) \
					((ti)->type->name == dm_snapshot_merge_target_name)

				/*
				 * The size of the mempool used to track chunks in use.
				 */
				#define MIN_IOS 256

				/*
				 * Bucket count of the tracked-chunk hash and the hash itself, which
				 * masks the value with (size - 1).
				 */
				#define DM_TRACKED_CHUNK_HASH_SIZE 16
				#define DM_TRACKED_CHUNK_HASH(x) \
					((unsigned long)(x) & (DM_TRACKED_CHUNK_HASH_SIZE - 1))
				41
				42	struct dm_exception_table {
				43	uint32_t hash_mask;
				44	unsigned hash_shift;
				45	struct hlist_bl_head *table;
				46	};
				47
				48	struct dm_snapshot {
				49	struct rw_semaphore lock;
				50
				51	struct dm_dev *origin;
				52	struct dm_dev *cow;
				53
				54	struct dm_target *ti;
				55
				56	/* List of snapshots per Origin */
				57	struct list_head list;
				58
				59	/*
				60	* You can't use a snapshot if this is 0 (e.g. if full).
				61	* A snapshot-merge target never clears this.
				62	*/
				63	int valid;
				64
				65	/*
				66	* The snapshot overflowed because of a write to the snapshot device.
				67	* We don't have to invalidate the snapshot in this case, but we need
				68	* to prevent further writes.
				69	*/
				70	int snapshot_overflowed;
				71
				72	/* Origin writes don't trigger exceptions until this is set */
				73	int active;
				74
				75	atomic_t pending_exceptions_count;
				76
				77	spinlock_t pe_allocation_lock;
				78
				79	/* Protected by "pe_allocation_lock" */
				80	sector_t exception_start_sequence;
				81
				82	/* Protected by kcopyd single-threaded callback */
				83	sector_t exception_complete_sequence;
				84
				85	/*
				86	* A list of pending exceptions that completed out of order.
				87	* Protected by kcopyd single-threaded callback.
				88	*/
				89	struct rb_root out_of_order_tree;
				90
				91	mempool_t pending_pool;
				92
				93	struct dm_exception_table pending;
				94	struct dm_exception_table complete;
				95
				96	/*
				97	* pe_lock protects all pending_exception operations and access
				98	* as well as the snapshot_bios list.
				99	*/
				100	spinlock_t pe_lock;
				101
				102	/* Chunks with outstanding reads */
				103	spinlock_t tracked_chunk_lock;
				104	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
				105
				106	/* The on disk metadata handler */
				107	struct dm_exception_store *store;
				108
				109	unsigned in_progress;
				110	struct wait_queue_head in_progress_wait;
				111
				112	struct dm_kcopyd_client *kcopyd_client;
				113
				114	/* Wait for events based on state_bits */
				115	unsigned long state_bits;
				116
				117	/* Range of chunks currently being merged. */
				118	chunk_t first_merging_chunk;
				119	int num_merging_chunks;
				120
				121	/*
				122	* The merge operation failed if this flag is set.
				123	* Failure modes are handled as follows:
				124	* - I/O error reading the header
				125	* => don't load the target; abort.
				126	* - Header does not have "valid" flag set
				127	* => use the origin; forget about the snapshot.
				128	* - I/O error when reading exceptions
				129	* => don't load the target; abort.
				130	* (We can't use the intermediate origin state.)
				131	* - I/O error while merging
				132	* => stop merging; set merge_failed; process I/O normally.
				133	*/
				134	bool merge_failed:1;
				135
				136	bool discard_zeroes_cow:1;
				137	bool discard_passdown_origin:1;
				138
				139	/*
				140	* Incoming bios that overlap with chunks being merged must wait
				141	* for them to be committed.
				142	*/
				143	struct bio_list bios_queued_during_merge;
				144
				145	/*
				146	* Flush data after merge.
				147	*/
				148	struct bio flush_bio;
				149	};
				150
				151	/*
				152	* state_bits:
				153	* RUNNING_MERGE - Merge operation is in progress.
				154	* SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
				155	* cleared afterwards.
				156	*/
				157	#define RUNNING_MERGE 0
				158	#define SHUTDOWN_MERGE 1
				159
				160	/*
				161	* Maximum number of chunks being copied on write.
				162	*
				163	* The value was decided experimentally as a trade-off between memory
				164	* consumption, stalling the kernel's workqueues and maintaining a high enough
				165	* throughput.
				166	*/
				167	#define DEFAULT_COW_THRESHOLD 2048
				168
				169	static unsigned cow_threshold = DEFAULT_COW_THRESHOLD;
				170	module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
				171	MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
				172
				173	DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
				174	"A percentage of time allocated for copy on write");
				175
				176	struct dm_dev dm_snap_origin(struct dm_snapshot s)
				177	{
				178	return s->origin;
				179	}
				180	EXPORT_SYMBOL(dm_snap_origin);
				181
				182	struct dm_dev dm_snap_cow(struct dm_snapshot s)
				183	{
				184	return s->cow;
				185	}
				186	EXPORT_SYMBOL(dm_snap_cow);
				187
				188	static sector_t chunk_to_sector(struct dm_exception_store *store,
				189	chunk_t chunk)
				190	{
				191	return chunk << store->chunk_shift;
				192	}
				193
				/*
				 * Tell whether @lhs and @rhs refer to the same block device.
				 *
				 * Returns non-zero when they do, zero otherwise.  The arguments must be
				 * pointers: the comparison relies on pointer identity.
				 */
				static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
				{
					/*
					 * There is only ever one instance of a particular block
					 * device so we can compare pointers safely.
					 */
					return lhs == rhs;
				}
				202
				203	struct dm_snap_pending_exception {
				204	struct dm_exception e;
				205
				206	/*
				207	* Origin buffers waiting for this to complete are held
				208	* in a bio list
				209	*/
				210	struct bio_list origin_bios;
				211	struct bio_list snapshot_bios;
				212
				213	/* Pointer back to snapshot context */
				214	struct dm_snapshot *snap;
				215
				216	/*
				217	* 1 indicates the exception has already been sent to
				218	* kcopyd.
				219	*/
				220	int started;
				221
				222	/* There was copying error. */
				223	int copy_error;
				224
				225	/* A sequence number, it is used for in-order completion. */
				226	sector_t exception_sequence;
				227
				228	struct rb_node out_of_order_node;
				229
				230	/*
				231	* For writing a complete chunk, bypassing the copy.
				232	*/
				233	struct bio *full_bio;
				234	bio_end_io_t *full_bio_end_io;
				235	};
				236
				/*
				 * Slab caches.  exception_cache backs completed exceptions (see
				 * alloc_completed_exception()).  pending_cache presumably backs the
				 * pending-exception pool; its users are not in this part of the file,
				 * so confirm before relying on that.
				 */
				static struct kmem_cache *exception_cache;
				static struct kmem_cache *pending_cache;
				243
				244	struct dm_snap_tracked_chunk {
				245	struct hlist_node node;
				246	chunk_t chunk;
				247	};
				248
				249	static void init_tracked_chunk(struct bio *bio)
				250	{
				251	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
				252	INIT_HLIST_NODE(&c->node);
				253	}
				254
				255	static bool is_bio_tracked(struct bio *bio)
				256	{
				257	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
				258	return !hlist_unhashed(&c->node);
				259	}
				260
				261	static void track_chunk(struct dm_snapshot s, struct bio bio, chunk_t chunk)
				262	{
				263	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
				264
				265	c->chunk = chunk;
				266
				267	spin_lock_irq(&s->tracked_chunk_lock);
				268	hlist_add_head(&c->node,
				269	&s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
				270	spin_unlock_irq(&s->tracked_chunk_lock);
				271	}
				272
				273	static void stop_tracking_chunk(struct dm_snapshot s, struct bio bio)
				274	{
				275	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
				276	unsigned long flags;
				277
				278	spin_lock_irqsave(&s->tracked_chunk_lock, flags);
				279	hlist_del(&c->node);
				280	spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
				281	}
				282
				283	static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
				284	{
				285	struct dm_snap_tracked_chunk *c;
				286	int found = 0;
				287
				288	spin_lock_irq(&s->tracked_chunk_lock);
				289
				290	hlist_for_each_entry(c,
				291	&s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
				292	if (c->chunk == chunk) {
				293	found = 1;
				294	break;
				295	}
				296	}
				297
				298	spin_unlock_irq(&s->tracked_chunk_lock);
				299
				300	return found;
				301	}
				302
				303	/*
				304	* This conflicting I/O is extremely improbable in the caller,
				305	* so msleep(1) is sufficient and there is no need for a wait queue.
				306	*/
				307	static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
				308	{
				309	while (__chunk_is_tracked(s, chunk))
				310	msleep(1);
				311	}
				312
				313	/*
				314	* One of these per registered origin, held in the snapshot_origins hash
				315	*/
				316	struct origin {
				317	/* The origin device */
				318	struct block_device *bdev;
				319
				320	struct list_head hash_list;
				321
				322	/* List of snapshots for this origin */
				323	struct list_head snapshots;
				324	};
				325
				326	/*
				327	* This structure is allocated for each origin target
				328	*/
				329	struct dm_origin {
				330	struct dm_dev *dev;
				331	struct dm_target *ti;
				332	unsigned split_boundary;
				333	struct list_head hash_list;
				334	};
				335
				336	/*
				337	* Size of the hash table for origin volumes. If we make this
				338	* the size of the minors list then it should be nearly perfect
				339	*/
				340	#define ORIGIN_HASH_SIZE 256
				341	#define ORIGIN_MASK 0xFF
				342	static struct list_head *_origins;
				343	static struct list_head *_dm_origins;
				344	static struct rw_semaphore _origins_lock;
				345
				346	static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
				347	static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
				348	static uint64_t _pending_exceptions_done_count;
				349
				350	static int init_origin_hash(void)
				351	{
				352	int i;
				353
				354	_origins = kmalloc_array(ORIGIN_HASH_SIZE, sizeof(struct list_head),
				355	GFP_KERNEL);
				356	if (!_origins) {
				357	DMERR("unable to allocate memory for _origins");
				358	return -ENOMEM;
				359	}
				360	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
				361	INIT_LIST_HEAD(_origins + i);
				362
				363	_dm_origins = kmalloc_array(ORIGIN_HASH_SIZE,
				364	sizeof(struct list_head),
				365	GFP_KERNEL);
				366	if (!_dm_origins) {
				367	DMERR("unable to allocate memory for _dm_origins");
				368	kfree(_origins);
				369	return -ENOMEM;
				370	}
				371	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
				372	INIT_LIST_HEAD(_dm_origins + i);
				373
				374	init_rwsem(&_origins_lock);
				375
				376	return 0;
				377	}
				378
				379	static void exit_origin_hash(void)
				380	{
				381	kfree(_origins);
				382	kfree(_dm_origins);
				383	}
				384
				385	static unsigned origin_hash(struct block_device *bdev)
				386	{
				387	return bdev->bd_dev & ORIGIN_MASK;
				388	}
				389
				390	static struct origin __lookup_origin(struct block_device origin)
				391	{
				392	struct list_head *ol;
				393	struct origin *o;
				394
				395	ol = &_origins[origin_hash(origin)];
				396	list_for_each_entry (o, ol, hash_list)
				397	if (bdev_equal(o->bdev, origin))
				398	return o;
				399
				400	return NULL;
				401	}
				402
				403	static void __insert_origin(struct origin *o)
				404	{
				405	struct list_head *sl = &_origins[origin_hash(o->bdev)];
				406	list_add_tail(&o->hash_list, sl);
				407	}
				408
				409	static struct dm_origin __lookup_dm_origin(struct block_device origin)
				410	{
				411	struct list_head *ol;
				412	struct dm_origin *o;
				413
				414	ol = &_dm_origins[origin_hash(origin)];
				415	list_for_each_entry (o, ol, hash_list)
				416	if (bdev_equal(o->dev->bdev, origin))
				417	return o;
				418
				419	return NULL;
				420	}
				421
				422	static void __insert_dm_origin(struct dm_origin *o)
				423	{
				424	struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)];
				425	list_add_tail(&o->hash_list, sl);
				426	}
				427
				428	static void __remove_dm_origin(struct dm_origin *o)
				429	{
				430	list_del(&o->hash_list);
				431	}
				432
				433	/*
				434	* _origins_lock must be held when calling this function.
				435	* Returns number of snapshots registered using the supplied cow device, plus:
				436	* snap_src - a snapshot suitable for use as a source of exception handover
				437	* snap_dest - a snapshot capable of receiving exception handover.
				438	* snap_merge - an existing snapshot-merge target linked to the same origin.
				439	* There can be at most one snapshot-merge target. The parameter is optional.
				440	*
				441	* Possible return values and states of snap_src and snap_dest.
				442	* 0: NULL, NULL - first new snapshot
				443	* 1: snap_src, NULL - normal snapshot
				444	* 2: snap_src, snap_dest - waiting for handover
				445	* 2: snap_src, NULL - handed over, waiting for old to be deleted
				446	* 1: NULL, snap_dest - source got destroyed without handover
				447	*/
				448	static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
				449	struct dm_snapshot **snap_src,
				450	struct dm_snapshot **snap_dest,
				451	struct dm_snapshot **snap_merge)
				452	{
				453	struct dm_snapshot *s;
				454	struct origin *o;
				455	int count = 0;
				456	int active;
				457
				458	o = __lookup_origin(snap->origin->bdev);
				459	if (!o)
				460	goto out;
				461
				462	list_for_each_entry(s, &o->snapshots, list) {
				463	if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
				464	*snap_merge = s;
				465	if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
				466	continue;
				467
				468	down_read(&s->lock);
				469	active = s->active;
				470	up_read(&s->lock);
				471
				472	if (active) {
				473	if (snap_src)
				474	*snap_src = s;
				475	} else if (snap_dest)
				476	*snap_dest = s;
				477
				478	count++;
				479	}
				480
				481	out:
				482	return count;
				483	}
				484
				485	/*
				486	* On success, returns 1 if this snapshot is a handover destination,
				487	* otherwise returns 0.
				488	*/
				489	static int __validate_exception_handover(struct dm_snapshot *snap)
				490	{
				491	struct dm_snapshot snap_src = NULL, snap_dest = NULL;
				492	struct dm_snapshot *snap_merge = NULL;
				493
				494	/* Does snapshot need exceptions handed over to it? */
				495	if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
				496	&snap_merge) == 2) \|\|
				497	snap_dest) {
				498	snap->ti->error = "Snapshot cow pairing for exception "
				499	"table handover failed";
				500	return -EINVAL;
				501	}
				502
				503	/*
				504	* If no snap_src was found, snap cannot become a handover
				505	* destination.
				506	*/
				507	if (!snap_src)
				508	return 0;
				509
				510	/*
				511	* Non-snapshot-merge handover?
				512	*/
				513	if (!dm_target_is_snapshot_merge(snap->ti))
				514	return 1;
				515
				516	/*
				517	* Do not allow more than one merging snapshot.
				518	*/
				519	if (snap_merge) {
				520	snap->ti->error = "A snapshot is already merging.";
				521	return -EINVAL;
				522	}
				523
				524	if (!snap_src->store->type->prepare_merge \|\|
				525	!snap_src->store->type->commit_merge) {
				526	snap->ti->error = "Snapshot exception store does not "
				527	"support snapshot-merge.";
				528	return -EINVAL;
				529	}
				530
				531	return 1;
				532	}
				533
				534	static void __insert_snapshot(struct origin o, struct dm_snapshot s)
				535	{
				536	struct dm_snapshot *l;
				537
				538	/* Sort the list according to chunk size, largest-first smallest-last */
				539	list_for_each_entry(l, &o->snapshots, list)
				540	if (l->store->chunk_size < s->store->chunk_size)
				541	break;
				542	list_add_tail(&s->list, &l->list);
				543	}
				544
				545	/*
				546	* Make a note of the snapshot and its origin so we can look it
				547	* up when the origin has a write on it.
				548	*
				549	* Also validate snapshot exception store handovers.
				550	* On success, returns 1 if this registration is a handover destination,
				551	* otherwise returns 0.
				552	*/
				553	static int register_snapshot(struct dm_snapshot *snap)
				554	{
				555	struct origin o, new_o = NULL;
				556	struct block_device *bdev = snap->origin->bdev;
				557	int r = 0;
				558
				559	new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
				560	if (!new_o)
				561	return -ENOMEM;
				562
				563	down_write(&_origins_lock);
				564
				565	r = __validate_exception_handover(snap);
				566	if (r < 0) {
				567	kfree(new_o);
				568	goto out;
				569	}
				570
				571	o = __lookup_origin(bdev);
				572	if (o)
				573	kfree(new_o);
				574	else {
				575	/* New origin */
				576	o = new_o;
				577
				578	/* Initialise the struct */
				579	INIT_LIST_HEAD(&o->snapshots);
				580	o->bdev = bdev;
				581
				582	__insert_origin(o);
				583	}
				584
				585	__insert_snapshot(o, snap);
				586
				587	out:
				588	up_write(&_origins_lock);
				589
				590	return r;
				591	}
				592
				593	/*
				594	* Move snapshot to correct place in list according to chunk size.
				595	*/
				596	static void reregister_snapshot(struct dm_snapshot *s)
				597	{
				598	struct block_device *bdev = s->origin->bdev;
				599
				600	down_write(&_origins_lock);
				601
				602	list_del(&s->list);
				603	__insert_snapshot(__lookup_origin(bdev), s);
				604
				605	up_write(&_origins_lock);
				606	}
				607
				608	static void unregister_snapshot(struct dm_snapshot *s)
				609	{
				610	struct origin *o;
				611
				612	down_write(&_origins_lock);
				613	o = __lookup_origin(s->origin->bdev);
				614
				615	list_del(&s->list);
				616	if (o && list_empty(&o->snapshots)) {
				617	list_del(&o->hash_list);
				618	kfree(o);
				619	}
				620
				621	up_write(&_origins_lock);
				622	}
				623
				624	/*
				625	* Implementation of the exception hash tables.
				626	* The lowest hash_shift bits of the chunk number are ignored, allowing
				627	* some consecutive chunks to be grouped together.
				628	*/
				629	static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);
				630
				631	/* Lock to protect access to the completed and pending exception hash tables. */
				632	struct dm_exception_table_lock {
				633	struct hlist_bl_head *complete_slot;
				634	struct hlist_bl_head *pending_slot;
				635	};
				636
				637	static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
				638	struct dm_exception_table_lock *lock)
				639	{
				640	struct dm_exception_table *complete = &s->complete;
				641	struct dm_exception_table *pending = &s->pending;
				642
				643	lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
				644	lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
				645	}
				646
				647	static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
				648	{
				649	hlist_bl_lock(lock->complete_slot);
				650	hlist_bl_lock(lock->pending_slot);
				651	}
				652
				653	static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
				654	{
				655	hlist_bl_unlock(lock->pending_slot);
				656	hlist_bl_unlock(lock->complete_slot);
				657	}
				658
				659	static int dm_exception_table_init(struct dm_exception_table *et,
				660	uint32_t size, unsigned hash_shift)
				661	{
				662	unsigned int i;
				663
				664	et->hash_shift = hash_shift;
				665	et->hash_mask = size - 1;
				666	et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head));
				667	if (!et->table)
				668	return -ENOMEM;
				669
				670	for (i = 0; i < size; i++)
				671	INIT_HLIST_BL_HEAD(et->table + i);
				672
				673	return 0;
				674	}
				675
				676	static void dm_exception_table_exit(struct dm_exception_table *et,
				677	struct kmem_cache *mem)
				678	{
				679	struct hlist_bl_head *slot;
				680	struct dm_exception *ex;
				681	struct hlist_bl_node pos, n;
				682	int i, size;
				683
				684	size = et->hash_mask + 1;
				685	for (i = 0; i < size; i++) {
				686	slot = et->table + i;
				687
				688	hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) {
				689	kmem_cache_free(mem, ex);
				690	cond_resched();
				691	}
				692	}
				693
				694	vfree(et->table);
				695	}
				696
				697	static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
				698	{
				699	return (chunk >> et->hash_shift) & et->hash_mask;
				700	}
				701
				702	static void dm_remove_exception(struct dm_exception *e)
				703	{
				704	hlist_bl_del(&e->hash_list);
				705	}
				706
				707	/*
				708	* Return the exception data for a sector, or NULL if not
				709	* remapped.
				710	*/
				711	static struct dm_exception dm_lookup_exception(struct dm_exception_table et,
				712	chunk_t chunk)
				713	{
				714	struct hlist_bl_head *slot;
				715	struct hlist_bl_node *pos;
				716	struct dm_exception *e;
				717
				718	slot = &et->table[exception_hash(et, chunk)];
				719	hlist_bl_for_each_entry(e, pos, slot, hash_list)
				720	if (chunk >= e->old_chunk &&
				721	chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
				722	return e;
				723
				724	return NULL;
				725	}
				726
				727	static struct dm_exception *alloc_completed_exception(gfp_t gfp)
				728	{
				729	struct dm_exception *e;
				730
				731	e = kmem_cache_alloc(exception_cache, gfp);
				732	if (!e && gfp == GFP_NOIO)
				733	e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
				734
				735	return e;
				736	}
				737
				738	static void free_completed_exception(struct dm_exception *e)
				739	{
				740	kmem_cache_free(exception_cache, e);
				741	}
				742
				743	static struct dm_snap_pending_exception alloc_pending_exception(struct dm_snapshot s)
				744	{
				745	struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool,
				746	GFP_NOIO);
				747
				748	atomic_inc(&s->pending_exceptions_count);
				749	pe->snap = s;
				750
				751	return pe;
				752	}
				753
				754	static void free_pending_exception(struct dm_snap_pending_exception *pe)
				755	{
				756	struct dm_snapshot *s = pe->snap;
				757
				758	mempool_free(pe, &s->pending_pool);
				759	smp_mb__before_atomic();
				760	atomic_dec(&s->pending_exceptions_count);
				761	}
				762
				763	static void dm_insert_exception(struct dm_exception_table *eh,
				764	struct dm_exception *new_e)
				765	{
				766	struct hlist_bl_head *l;
				767	struct hlist_bl_node *pos;
				768	struct dm_exception *e = NULL;
				769
				770	l = &eh->table[exception_hash(eh, new_e->old_chunk)];
				771
				772	/* Add immediately if this table doesn't support consecutive chunks */
				773	if (!eh->hash_shift)
				774	goto out;
				775
				776	/* List is ordered by old_chunk */
				777	hlist_bl_for_each_entry(e, pos, l, hash_list) {
				778	/* Insert after an existing chunk? */
				779	if (new_e->old_chunk == (e->old_chunk +
				780	dm_consecutive_chunk_count(e) + 1) &&
				781	new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
				782	dm_consecutive_chunk_count(e) + 1)) {
				783	dm_consecutive_chunk_count_inc(e);
				784	free_completed_exception(new_e);
				785	return;
				786	}
				787
				788	/* Insert before an existing chunk? */
				789	if (new_e->old_chunk == (e->old_chunk - 1) &&
				790	new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
				791	dm_consecutive_chunk_count_inc(e);
				792	e->old_chunk--;
				793	e->new_chunk--;
				794	free_completed_exception(new_e);
				795	return;
				796	}
				797
				798	if (new_e->old_chunk < e->old_chunk)
				799	break;
				800	}
				801
				802	out:
				803	if (!e) {
				804	/*
				805	* Either the table doesn't support consecutive chunks or slot
				806	* l is empty.
				807	*/
				808	hlist_bl_add_head(&new_e->hash_list, l);
				809	} else if (new_e->old_chunk < e->old_chunk) {
				810	/* Add before an existing exception */
				811	hlist_bl_add_before(&new_e->hash_list, &e->hash_list);
				812	} else {
				813	/* Add to l's tail: e is the last exception in this slot */
				814	hlist_bl_add_behind(&new_e->hash_list, &e->hash_list);
				815	}
				816	}
				817
				818	/*
				819	* Callback used by the exception stores to load exceptions when
				820	* initialising.
				821	*/
				822	static int dm_add_exception(void *context, chunk_t old, chunk_t new)
				823	{
				824	struct dm_exception_table_lock lock;
				825	struct dm_snapshot *s = context;
				826	struct dm_exception *e;
				827
				828	e = alloc_completed_exception(GFP_KERNEL);
				829	if (!e)
				830	return -ENOMEM;
				831
				832	e->old_chunk = old;
				833
				834	/* Consecutive_count is implicitly initialised to zero */
				835	e->new_chunk = new;
				836
				837	/*
				838	* Although there is no need to lock access to the exception tables
				839	* here, if we don't then hlist_bl_add_head(), called by
				840	* dm_insert_exception(), will complain about accessing the
				841	* corresponding list without locking it first.
				842	*/
				843	dm_exception_table_lock_init(s, old, &lock);
				844
				845	dm_exception_table_lock(&lock);
				846	dm_insert_exception(&s->complete, e);
				847	dm_exception_table_unlock(&lock);
				848
				849	return 0;
				850	}
				851
				852	/*
				853	* Return a minimum chunk size of all snapshots that have the specified origin.
				854	* Return zero if the origin has no snapshots.
				855	*/
				856	static uint32_t __minimum_chunk_size(struct origin *o)
				857	{
				858	struct dm_snapshot *snap;
				859	unsigned chunk_size = rounddown_pow_of_two(UINT_MAX);
				860
				861	if (o)
				862	list_for_each_entry(snap, &o->snapshots, list)
				863	chunk_size = min_not_zero(chunk_size,
				864	snap->store->chunk_size);
				865
				866	return (uint32_t) chunk_size;
				867	}
				868
				869	/*
				870	* Hard coded magic.
				871	*/
				872	static int calc_max_buckets(void)
				873	{
				874	/* use a fixed size of 2MB */
				875	unsigned long mem = 2 * 1024 * 1024;
				876	mem /= sizeof(struct hlist_bl_head);
				877
				878	return mem;
				879	}
				880
				881	/*
				882	* Allocate room for a suitable hash table.
				883	*/
				884	static int init_hash_tables(struct dm_snapshot *s)
				885	{
				886	sector_t hash_size, cow_dev_size, max_buckets;
				887
				888	/*
				889	* Calculate based on the size of the original volume or
				890	* the COW volume...
				891	*/
				892	cow_dev_size = get_dev_size(s->cow->bdev);
				893	max_buckets = calc_max_buckets();
				894
				895	hash_size = cow_dev_size >> s->store->chunk_shift;
				896	hash_size = min(hash_size, max_buckets);
				897
				898	if (hash_size < 64)
				899	hash_size = 64;
				900	hash_size = rounddown_pow_of_two(hash_size);
				901	if (dm_exception_table_init(&s->complete, hash_size,
				902	DM_CHUNK_CONSECUTIVE_BITS))
				903	return -ENOMEM;
				904
				905	/*
				906	* Allocate hash table for in-flight exceptions
				907	* Make this smaller than the real hash table
				908	*/
				909	hash_size >>= 3;
				910	if (hash_size < 64)
				911	hash_size = 64;
				912
				913	if (dm_exception_table_init(&s->pending, hash_size, 0)) {
				914	dm_exception_table_exit(&s->complete, exception_cache);
				915	return -ENOMEM;
				916	}
				917
				918	return 0;
				919	}
				920
				921	static void merge_shutdown(struct dm_snapshot *s)
				922	{
				923	clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
				924	smp_mb__after_atomic();
				925	wake_up_bit(&s->state_bits, RUNNING_MERGE);
				926	}
				927
				928	static struct bio __release_queued_bios_after_merge(struct dm_snapshot s)
				929	{
				930	s->first_merging_chunk = 0;
				931	s->num_merging_chunks = 0;
				932
				933	return bio_list_get(&s->bios_queued_during_merge);
				934	}
				935
				936	/*
				937	* Remove one chunk from the index of completed exceptions.
				938	*/
				939	static int __remove_single_exception_chunk(struct dm_snapshot *s,
				940	chunk_t old_chunk)
				941	{
				942	struct dm_exception *e;
				943
				944	e = dm_lookup_exception(&s->complete, old_chunk);
				945	if (!e) {
				946	DMERR("Corruption detected: exception for block %llu is "
				947	"on disk but not in memory",
				948	(unsigned long long)old_chunk);
				949	return -EINVAL;
				950	}
				951
				952	/*
				953	* If this is the only chunk using this exception, remove exception.
				954	*/
				955	if (!dm_consecutive_chunk_count(e)) {
				956	dm_remove_exception(e);
				957	free_completed_exception(e);
				958	return 0;
				959	}
				960
				961	/*
				962	* The chunk may be either at the beginning or the end of a
				963	* group of consecutive chunks - never in the middle. We are
				964	* removing chunks in the opposite order to that in which they
				965	* were added, so this should always be true.
				966	* Decrement the consecutive chunk counter and adjust the
				967	* starting point if necessary.
				968	*/
				969	if (old_chunk == e->old_chunk) {
				970	e->old_chunk++;
				971	e->new_chunk++;
				972	} else if (old_chunk != e->old_chunk +
				973	dm_consecutive_chunk_count(e)) {
				974	DMERR("Attempt to merge block %llu from the "
				975	"middle of a chunk range [%llu - %llu]",
				976	(unsigned long long)old_chunk,
				977	(unsigned long long)e->old_chunk,
				978	(unsigned long long)
				979	e->old_chunk + dm_consecutive_chunk_count(e));
				980	return -EINVAL;
				981	}
				982
				983	dm_consecutive_chunk_count_dec(e);
				984
				985	return 0;
				986	}
				987
				988	static void flush_bios(struct bio *bio);
				989
				990	static int remove_single_exception_chunk(struct dm_snapshot *s)
				991	{
				992	struct bio *b = NULL;
				993	int r;
				994	chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
				995
				996	down_write(&s->lock);
				997
				998	/*
				999	* Process chunks (and associated exceptions) in reverse order
				1000	* so that dm_consecutive_chunk_count_dec() accounting works.
				1001	*/
				1002	do {
				1003	r = __remove_single_exception_chunk(s, old_chunk);
				1004	if (r)
				1005	goto out;
				1006	} while (old_chunk-- > s->first_merging_chunk);
				1007
				1008	b = __release_queued_bios_after_merge(s);
				1009
				1010	out:
				1011	up_write(&s->lock);
				1012	if (b)
				1013	flush_bios(b);
				1014
				1015	return r;
				1016	}
				1017
				1018	static int origin_write_extent(struct dm_snapshot *merging_snap,
				1019	sector_t sector, unsigned chunk_size);
				1020
				1021	static void merge_callback(int read_err, unsigned long write_err,
				1022	void *context);
				1023
				1024	static uint64_t read_pending_exceptions_done_count(void)
				1025	{
				1026	uint64_t pending_exceptions_done;
				1027
				1028	spin_lock(&_pending_exceptions_done_spinlock);
				1029	pending_exceptions_done = _pending_exceptions_done_count;
				1030	spin_unlock(&_pending_exceptions_done_spinlock);
				1031
				1032	return pending_exceptions_done;
				1033	}
				1034
				1035	static void increment_pending_exceptions_done_count(void)
				1036	{
				1037	spin_lock(&_pending_exceptions_done_spinlock);
				1038	_pending_exceptions_done_count++;
				1039	spin_unlock(&_pending_exceptions_done_spinlock);
				1040
				1041	wake_up_all(&_pending_exceptions_done);
				1042	}
				1043
				1044	static void snapshot_merge_next_chunks(struct dm_snapshot *s)
				1045	{
				1046	int i, linear_chunks;
				1047	chunk_t old_chunk, new_chunk;
				1048	struct dm_io_region src, dest;
				1049	sector_t io_size;
				1050	uint64_t previous_count;
				1051
				1052	BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
				1053	if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
				1054	goto shut;
				1055
				1056	/*
				1057	* valid flag never changes during merge, so no lock required.
				1058	*/
				1059	if (!s->valid) {
				1060	DMERR("Snapshot is invalid: can't merge");
				1061	goto shut;
				1062	}
				1063
				1064	linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
				1065	&new_chunk);
				1066	if (linear_chunks <= 0) {
				1067	if (linear_chunks < 0) {
				1068	DMERR("Read error in exception store: "
				1069	"shutting down merge");
				1070	down_write(&s->lock);
				1071	s->merge_failed = 1;
				1072	up_write(&s->lock);
				1073	}
				1074	goto shut;
				1075	}
				1076
				1077	/* Adjust old_chunk and new_chunk to reflect start of linear region */
				1078	old_chunk = old_chunk + 1 - linear_chunks;
				1079	new_chunk = new_chunk + 1 - linear_chunks;
				1080
				1081	/*
				1082	* Use one (potentially large) I/O to copy all 'linear_chunks'
				1083	* from the exception store to the origin
				1084	*/
				1085	io_size = linear_chunks * s->store->chunk_size;
				1086
				1087	dest.bdev = s->origin->bdev;
				1088	dest.sector = chunk_to_sector(s->store, old_chunk);
				1089	dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);
				1090
				1091	src.bdev = s->cow->bdev;
				1092	src.sector = chunk_to_sector(s->store, new_chunk);
				1093	src.count = dest.count;
				1094
				1095	/*
				1096	* Reallocate any exceptions needed in other snapshots then
				1097	* wait for the pending exceptions to complete.
				1098	* Each time any pending exception (globally on the system)
				1099	* completes we are woken and repeat the process to find out
				1100	* if we can proceed. While this may not seem a particularly
				1101	* efficient algorithm, it is not expected to have any
				1102	* significant impact on performance.
				1103	*/
				1104	previous_count = read_pending_exceptions_done_count();
				1105	while (origin_write_extent(s, dest.sector, io_size)) {
				1106	wait_event(_pending_exceptions_done,
				1107	(read_pending_exceptions_done_count() !=
				1108	previous_count));
				1109	/* Retry after the wait, until all exceptions are done. */
				1110	previous_count = read_pending_exceptions_done_count();
				1111	}
				1112
				1113	down_write(&s->lock);
				1114	s->first_merging_chunk = old_chunk;
				1115	s->num_merging_chunks = linear_chunks;
				1116	up_write(&s->lock);
				1117
				1118	/* Wait until writes to all 'linear_chunks' drain */
				1119	for (i = 0; i < linear_chunks; i++)
				1120	__check_for_conflicting_io(s, old_chunk + i);
				1121
				1122	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
				1123	return;
				1124
				1125	shut:
				1126	merge_shutdown(s);
				1127	}
				1128
				1129	static void error_bios(struct bio *bio);
				1130
				1131	static int flush_data(struct dm_snapshot *s)
				1132	{
				1133	struct bio *flush_bio = &s->flush_bio;
				1134
				1135	bio_reset(flush_bio);
				1136	bio_set_dev(flush_bio, s->origin->bdev);
				1137	flush_bio->bi_opf = REQ_OP_WRITE \| REQ_PREFLUSH;
				1138
				1139	return submit_bio_wait(flush_bio);
				1140	}
				1141
				1142	static void merge_callback(int read_err, unsigned long write_err, void *context)
				1143	{
				1144	struct dm_snapshot *s = context;
				1145	struct bio *b = NULL;
				1146
				1147	if (read_err \|\| write_err) {
				1148	if (read_err)
				1149	DMERR("Read error: shutting down merge.");
				1150	else
				1151	DMERR("Write error: shutting down merge.");
				1152	goto shut;
				1153	}
				1154
				1155	if (flush_data(s) < 0) {
				1156	DMERR("Flush after merge failed: shutting down merge");
				1157	goto shut;
				1158	}
				1159
				1160	if (s->store->type->commit_merge(s->store,
				1161	s->num_merging_chunks) < 0) {
				1162	DMERR("Write error in exception store: shutting down merge");
				1163	goto shut;
				1164	}
				1165
				1166	if (remove_single_exception_chunk(s) < 0)
				1167	goto shut;
				1168
				1169	snapshot_merge_next_chunks(s);
				1170
				1171	return;
				1172
				1173	shut:
				1174	down_write(&s->lock);
				1175	s->merge_failed = 1;
				1176	b = __release_queued_bios_after_merge(s);
				1177	up_write(&s->lock);
				1178	error_bios(b);
				1179
				1180	merge_shutdown(s);
				1181	}
				1182
				1183	static void start_merge(struct dm_snapshot *s)
				1184	{
				1185	if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
				1186	snapshot_merge_next_chunks(s);
				1187	}
				1188
				1189	/*
				1190	* Stop the merging process and wait until it finishes.
				1191	*/
				1192	static void stop_merge(struct dm_snapshot *s)
				1193	{
				1194	set_bit(SHUTDOWN_MERGE, &s->state_bits);
				1195	wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE);
				1196	clear_bit(SHUTDOWN_MERGE, &s->state_bits);
				1197	}
				1198
				1199	static int parse_snapshot_features(struct dm_arg_set as, struct dm_snapshot s,
				1200	struct dm_target *ti)
				1201	{
				1202	int r;
				1203	unsigned argc;
				1204	const char *arg_name;
				1205
				1206	static const struct dm_arg _args[] = {
				1207	{0, 2, "Invalid number of feature arguments"},
				1208	};
				1209
				1210	/*
				1211	* No feature arguments supplied.
				1212	*/
				1213	if (!as->argc)
				1214	return 0;
				1215
				1216	r = dm_read_arg_group(_args, as, &argc, &ti->error);
				1217	if (r)
				1218	return -EINVAL;
				1219
				1220	while (argc && !r) {
				1221	arg_name = dm_shift_arg(as);
				1222	argc--;
				1223
				1224	if (!strcasecmp(arg_name, "discard_zeroes_cow"))
				1225	s->discard_zeroes_cow = true;
				1226
				1227	else if (!strcasecmp(arg_name, "discard_passdown_origin"))
				1228	s->discard_passdown_origin = true;
				1229
				1230	else {
				1231	ti->error = "Unrecognised feature requested";
				1232	r = -EINVAL;
				1233	break;
				1234	}
				1235	}
				1236
				1237	if (!s->discard_zeroes_cow && s->discard_passdown_origin) {
				1238	/*
				1239	* TODO: really these are disjoint.. but ti->num_discard_bios
				1240	* and dm_bio_get_target_bio_nr() require rigid constraints.
				1241	*/
				1242	ti->error = "discard_passdown_origin feature depends on discard_zeroes_cow";
				1243	r = -EINVAL;
				1244	}
				1245
				1246	return r;
				1247	}
				1248
				1249	/*
				1250	* Construct a snapshot mapping:
				1251	* <origin_dev> <COW-dev> <p\|po\|n> <chunk-size> [<# feature args> [<arg>]*]
				1252	*/
				1253	static int snapshot_ctr(struct dm_target ti, unsigned int argc, char *argv)
				1254	{
				1255	struct dm_snapshot *s;
				1256	struct dm_arg_set as;
				1257	int i;
				1258	int r = -EINVAL;
				1259	char origin_path, cow_path;
				1260	dev_t origin_dev, cow_dev;
				1261	unsigned args_used, num_flush_bios = 1;
				1262	fmode_t origin_mode = FMODE_READ;
				1263
				1264	if (argc < 4) {
				1265	ti->error = "requires 4 or more arguments";
				1266	r = -EINVAL;
				1267	goto bad;
				1268	}
				1269
				1270	if (dm_target_is_snapshot_merge(ti)) {
				1271	num_flush_bios = 2;
				1272	origin_mode = FMODE_WRITE;
				1273	}
				1274
				1275	s = kzalloc(sizeof(*s), GFP_KERNEL);
				1276	if (!s) {
				1277	ti->error = "Cannot allocate private snapshot structure";
				1278	r = -ENOMEM;
				1279	goto bad;
				1280	}
				1281
				1282	as.argc = argc;
				1283	as.argv = argv;
				1284	dm_consume_args(&as, 4);
				1285	r = parse_snapshot_features(&as, s, ti);
				1286	if (r)
				1287	goto bad_features;
				1288
				1289	origin_path = argv[0];
				1290	argv++;
				1291	argc--;
				1292
				1293	r = dm_get_device(ti, origin_path, origin_mode, &s->origin);
				1294	if (r) {
				1295	ti->error = "Cannot get origin device";
				1296	goto bad_origin;
				1297	}
				1298	origin_dev = s->origin->bdev->bd_dev;
				1299
				1300	cow_path = argv[0];
				1301	argv++;
				1302	argc--;
				1303
				1304	cow_dev = dm_get_dev_t(cow_path);
				1305	if (cow_dev && cow_dev == origin_dev) {
				1306	ti->error = "COW device cannot be the same as origin device";
				1307	r = -EINVAL;
				1308	goto bad_cow;
				1309	}
				1310
				1311	r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
				1312	if (r) {
				1313	ti->error = "Cannot get COW device";
				1314	goto bad_cow;
				1315	}
				1316
				1317	r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
				1318	if (r) {
				1319	ti->error = "Couldn't create exception store";
				1320	r = -EINVAL;
				1321	goto bad_store;
				1322	}
				1323
				1324	argv += args_used;
				1325	argc -= args_used;
				1326
				1327	s->ti = ti;
				1328	s->valid = 1;
				1329	s->snapshot_overflowed = 0;
				1330	s->active = 0;
				1331	atomic_set(&s->pending_exceptions_count, 0);
				1332	spin_lock_init(&s->pe_allocation_lock);
				1333	s->exception_start_sequence = 0;
				1334	s->exception_complete_sequence = 0;
				1335	s->out_of_order_tree = RB_ROOT;
				1336	init_rwsem(&s->lock);
				1337	INIT_LIST_HEAD(&s->list);
				1338	spin_lock_init(&s->pe_lock);
				1339	s->state_bits = 0;
				1340	s->merge_failed = 0;
				1341	s->first_merging_chunk = 0;
				1342	s->num_merging_chunks = 0;
				1343	bio_list_init(&s->bios_queued_during_merge);
				1344	bio_init(&s->flush_bio, NULL, 0);
				1345
				1346	/* Allocate hash table for COW data */
				1347	if (init_hash_tables(s)) {
				1348	ti->error = "Unable to allocate hash table space";
				1349	r = -ENOMEM;
				1350	goto bad_hash_tables;
				1351	}
				1352
				1353	init_waitqueue_head(&s->in_progress_wait);
				1354
				1355	s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
				1356	if (IS_ERR(s->kcopyd_client)) {
				1357	r = PTR_ERR(s->kcopyd_client);
				1358	ti->error = "Could not create kcopyd client";
				1359	goto bad_kcopyd;
				1360	}
				1361
				1362	r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache);
				1363	if (r) {
				1364	ti->error = "Could not allocate mempool for pending exceptions";
				1365	goto bad_pending_pool;
				1366	}
				1367
				1368	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
				1369	INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
				1370
				1371	spin_lock_init(&s->tracked_chunk_lock);
				1372
				1373	ti->private = s;
				1374	ti->num_flush_bios = num_flush_bios;
				1375	if (s->discard_zeroes_cow)
				1376	ti->num_discard_bios = (s->discard_passdown_origin ? 2 : 1);
				1377	ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk);
				1378
				1379	/* Add snapshot to the list of snapshots for this origin */
				1380	/* Exceptions aren't triggered till snapshot_resume() is called */
				1381	r = register_snapshot(s);
				1382	if (r == -ENOMEM) {
				1383	ti->error = "Snapshot origin struct allocation failed";
				1384	goto bad_load_and_register;
				1385	} else if (r < 0) {
				1386	/* invalid handover, register_snapshot has set ti->error */
				1387	goto bad_load_and_register;
				1388	}
				1389
				1390	/*
				1391	* Metadata must only be loaded into one table at once, so skip this
				1392	* if metadata will be handed over during resume.
				1393	* Chunk size will be set during the handover - set it to zero to
				1394	* ensure it's ignored.
				1395	*/
				1396	if (r > 0) {
				1397	s->store->chunk_size = 0;
				1398	return 0;
				1399	}
				1400
				1401	r = s->store->type->read_metadata(s->store, dm_add_exception,
				1402	(void *)s);
				1403	if (r < 0) {
				1404	ti->error = "Failed to read snapshot metadata";
				1405	goto bad_read_metadata;
				1406	} else if (r > 0) {
				1407	s->valid = 0;
				1408	DMWARN("Snapshot is marked invalid.");
				1409	}
				1410
				1411	if (!s->store->chunk_size) {
				1412	ti->error = "Chunk size not set";
				1413	r = -EINVAL;
				1414	goto bad_read_metadata;
				1415	}
				1416
				1417	r = dm_set_target_max_io_len(ti, s->store->chunk_size);
				1418	if (r)
				1419	goto bad_read_metadata;
				1420
				1421	return 0;
				1422
				1423	bad_read_metadata:
				1424	unregister_snapshot(s);
				1425	bad_load_and_register:
				1426	mempool_exit(&s->pending_pool);
				1427	bad_pending_pool:
				1428	dm_kcopyd_client_destroy(s->kcopyd_client);
				1429	bad_kcopyd:
				1430	dm_exception_table_exit(&s->pending, pending_cache);
				1431	dm_exception_table_exit(&s->complete, exception_cache);
				1432	bad_hash_tables:
				1433	dm_exception_store_destroy(s->store);
				1434	bad_store:
				1435	dm_put_device(ti, s->cow);
				1436	bad_cow:
				1437	dm_put_device(ti, s->origin);
				1438	bad_origin:
				1439	bad_features:
				1440	kfree(s);
				1441	bad:
				1442	return r;
				1443	}
				1444
				1445	static void __free_exceptions(struct dm_snapshot *s)
				1446	{
				1447	dm_kcopyd_client_destroy(s->kcopyd_client);
				1448	s->kcopyd_client = NULL;
				1449
				1450	dm_exception_table_exit(&s->pending, pending_cache);
				1451	dm_exception_table_exit(&s->complete, exception_cache);
				1452	}
				1453
				1454	static void __handover_exceptions(struct dm_snapshot *snap_src,
				1455	struct dm_snapshot *snap_dest)
				1456	{
				1457	union {
				1458	struct dm_exception_table table_swap;
				1459	struct dm_exception_store *store_swap;
				1460	} u;
				1461
				1462	/*
				1463	* Swap all snapshot context information between the two instances.
				1464	*/
				1465	u.table_swap = snap_dest->complete;
				1466	snap_dest->complete = snap_src->complete;
				1467	snap_src->complete = u.table_swap;
				1468
				1469	u.store_swap = snap_dest->store;
				1470	snap_dest->store = snap_src->store;
				1471	snap_dest->store->userspace_supports_overflow = u.store_swap->userspace_supports_overflow;
				1472	snap_src->store = u.store_swap;
				1473
				1474	snap_dest->store->snap = snap_dest;
				1475	snap_src->store->snap = snap_src;
				1476
				1477	snap_dest->ti->max_io_len = snap_dest->store->chunk_size;
				1478	snap_dest->valid = snap_src->valid;
				1479	snap_dest->snapshot_overflowed = snap_src->snapshot_overflowed;
				1480
				1481	/*
				1482	* Set source invalid to ensure it receives no further I/O.
				1483	*/
				1484	snap_src->valid = 0;
				1485	}
				1486
				1487	static void snapshot_dtr(struct dm_target *ti)
				1488	{
				1489	#ifdef CONFIG_DM_DEBUG
				1490	int i;
				1491	#endif
				1492	struct dm_snapshot *s = ti->private;
				1493	struct dm_snapshot snap_src = NULL, snap_dest = NULL;
				1494
				1495	down_read(&_origins_lock);
				1496	/* Check whether exception handover must be cancelled */
				1497	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
				1498	if (snap_src && snap_dest && (s == snap_src)) {
				1499	down_write(&snap_dest->lock);
				1500	snap_dest->valid = 0;
				1501	up_write(&snap_dest->lock);
				1502	DMERR("Cancelling snapshot handover.");
				1503	}
				1504	up_read(&_origins_lock);
				1505
				1506	if (dm_target_is_snapshot_merge(ti))
				1507	stop_merge(s);
				1508
				1509	/* Prevent further origin writes from using this snapshot. */
				1510	/* After this returns there can be no new kcopyd jobs. */
				1511	unregister_snapshot(s);
				1512
				1513	while (atomic_read(&s->pending_exceptions_count))
				1514	msleep(1);
				1515	/*
				1516	* Ensure instructions in mempool_exit aren't reordered
				1517	* before atomic_read.
				1518	*/
				1519	smp_mb();
				1520
				1521	#ifdef CONFIG_DM_DEBUG
				1522	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
				1523	BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
				1524	#endif
				1525
				1526	__free_exceptions(s);
				1527
				1528	mempool_exit(&s->pending_pool);
				1529
				1530	dm_exception_store_destroy(s->store);
				1531
				1532	bio_uninit(&s->flush_bio);
				1533
				1534	dm_put_device(ti, s->cow);
				1535
				1536	dm_put_device(ti, s->origin);
				1537
				1538	WARN_ON(s->in_progress);
				1539
				1540	kfree(s);
				1541	}
				1542
				1543	static void account_start_copy(struct dm_snapshot *s)
				1544	{
				1545	spin_lock(&s->in_progress_wait.lock);
				1546	s->in_progress++;
				1547	spin_unlock(&s->in_progress_wait.lock);
				1548	}
				1549
				1550	static void account_end_copy(struct dm_snapshot *s)
				1551	{
				1552	spin_lock(&s->in_progress_wait.lock);
				1553	BUG_ON(!s->in_progress);
				1554	s->in_progress--;
				1555	if (likely(s->in_progress <= cow_threshold) &&
				1556	unlikely(waitqueue_active(&s->in_progress_wait)))
				1557	wake_up_locked(&s->in_progress_wait);
				1558	spin_unlock(&s->in_progress_wait.lock);
				1559	}
				1560
				1561	static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
				1562	{
				1563	if (unlikely(s->in_progress > cow_threshold)) {
				1564	spin_lock(&s->in_progress_wait.lock);
				1565	if (likely(s->in_progress > cow_threshold)) {
				1566	/*
				1567	* NOTE: this throttle doesn't account for whether
				1568	* the caller is servicing an IO that will trigger a COW
				1569	* so excess throttling may result for chunks not required
				1570	* to be COW'd. But if cow_threshold was reached, extra
				1571	* throttling is unlikely to negatively impact performance.
				1572	*/
				1573	DECLARE_WAITQUEUE(wait, current);
				1574	__add_wait_queue(&s->in_progress_wait, &wait);
				1575	__set_current_state(TASK_UNINTERRUPTIBLE);
				1576	spin_unlock(&s->in_progress_wait.lock);
				1577	if (unlock_origins)
				1578	up_read(&_origins_lock);
				1579	io_schedule();
				1580	remove_wait_queue(&s->in_progress_wait, &wait);
				1581	return false;
				1582	}
				1583	spin_unlock(&s->in_progress_wait.lock);
				1584	}
				1585	return true;
				1586	}
				1587
				1588	/*
				1589	* Flush a list of buffers.
				1590	*/
				1591	static void flush_bios(struct bio *bio)
				1592	{
				1593	struct bio *n;
				1594
				1595	while (bio) {
				1596	n = bio->bi_next;
				1597	bio->bi_next = NULL;
				1598	generic_make_request(bio);
				1599	bio = n;
				1600	}
				1601	}
				1602
				1603	static int do_origin(struct dm_dev origin, struct bio bio, bool limit);
				1604
				1605	/*
				1606	* Flush a list of buffers.
				1607	*/
				1608	static void retry_origin_bios(struct dm_snapshot s, struct bio bio)
				1609	{
				1610	struct bio *n;
				1611	int r;
				1612
				1613	while (bio) {
				1614	n = bio->bi_next;
				1615	bio->bi_next = NULL;
				1616	r = do_origin(s->origin, bio, false);
				1617	if (r == DM_MAPIO_REMAPPED)
				1618	generic_make_request(bio);
				1619	bio = n;
				1620	}
				1621	}
				1622
				1623	/*
				1624	* Error a list of buffers.
				1625	*/
				1626	static void error_bios(struct bio *bio)
				1627	{
				1628	struct bio *n;
				1629
				1630	while (bio) {
				1631	n = bio->bi_next;
				1632	bio->bi_next = NULL;
				1633	bio_io_error(bio);
				1634	bio = n;
				1635	}
				1636	}
				1637
				1638	static void __invalidate_snapshot(struct dm_snapshot *s, int err)
				1639	{
				1640	if (!s->valid)
				1641	return;
				1642
				1643	if (err == -EIO)
				1644	DMERR("Invalidating snapshot: Error reading/writing.");
				1645	else if (err == -ENOMEM)
				1646	DMERR("Invalidating snapshot: Unable to allocate exception.");
				1647
				1648	if (s->store->type->drop_snapshot)
				1649	s->store->type->drop_snapshot(s->store);
				1650
				1651	s->valid = 0;
				1652
				1653	dm_table_event(s->ti->table);
				1654	}
				1655
				1656	static void invalidate_snapshot(struct dm_snapshot *s, int err)
				1657	{
				1658	down_write(&s->lock);
				1659	__invalidate_snapshot(s, err);
				1660	up_write(&s->lock);
				1661	}
				1662
				1663	static void pending_complete(void *context, int success)
				1664	{
				1665	struct dm_snap_pending_exception *pe = context;
				1666	struct dm_exception *e;
				1667	struct dm_snapshot *s = pe->snap;
				1668	struct bio *origin_bios = NULL;
				1669	struct bio *snapshot_bios = NULL;
				1670	struct bio *full_bio = NULL;
				1671	struct dm_exception_table_lock lock;
				1672	int error = 0;
				1673
				1674	dm_exception_table_lock_init(s, pe->e.old_chunk, &lock);
				1675
				1676	if (!success) {
				1677	/* Read/write error - snapshot is unusable */
				1678	invalidate_snapshot(s, -EIO);
				1679	error = 1;
				1680
				1681	dm_exception_table_lock(&lock);
				1682	goto out;
				1683	}
				1684
				1685	e = alloc_completed_exception(GFP_NOIO);
				1686	if (!e) {
				1687	invalidate_snapshot(s, -ENOMEM);
				1688	error = 1;
				1689
				1690	dm_exception_table_lock(&lock);
				1691	goto out;
				1692	}
				1693	*e = pe->e;
				1694
				1695	down_read(&s->lock);
				1696	dm_exception_table_lock(&lock);
				1697	if (!s->valid) {
				1698	up_read(&s->lock);
				1699	free_completed_exception(e);
				1700	error = 1;
				1701
				1702	goto out;
				1703	}
				1704
				1705	/*
				1706	* Add a proper exception. After inserting the completed exception all
				1707	* subsequent snapshot reads to this chunk will be redirected to the
				1708	* COW device. This ensures that we do not starve. Moreover, as long
				1709	* as the pending exception exists, neither origin writes nor snapshot
				1710	* merging can overwrite the chunk in origin.
				1711	*/
				1712	dm_insert_exception(&s->complete, e);
				1713	up_read(&s->lock);
				1714
				1715	/* Wait for conflicting reads to drain */
				1716	if (__chunk_is_tracked(s, pe->e.old_chunk)) {
				1717	dm_exception_table_unlock(&lock);
				1718	__check_for_conflicting_io(s, pe->e.old_chunk);
				1719	dm_exception_table_lock(&lock);
				1720	}
				1721
				1722	out:
				1723	/* Remove the in-flight exception from the list */
				1724	dm_remove_exception(&pe->e);
				1725
				1726	dm_exception_table_unlock(&lock);
				1727
				1728	snapshot_bios = bio_list_get(&pe->snapshot_bios);
				1729	origin_bios = bio_list_get(&pe->origin_bios);
				1730	full_bio = pe->full_bio;
				1731	if (full_bio)
				1732	full_bio->bi_end_io = pe->full_bio_end_io;
				1733	increment_pending_exceptions_done_count();
				1734
				1735	/* Submit any pending write bios */
				1736	if (error) {
				1737	if (full_bio)
				1738	bio_io_error(full_bio);
				1739	error_bios(snapshot_bios);
				1740	} else {
				1741	if (full_bio)
				1742	bio_endio(full_bio);
				1743	flush_bios(snapshot_bios);
				1744	}
				1745
				1746	retry_origin_bios(s, origin_bios);
				1747
				1748	free_pending_exception(pe);
				1749	}
				1750
				1751	static void complete_exception(struct dm_snap_pending_exception *pe)
				1752	{
				1753	struct dm_snapshot *s = pe->snap;
				1754
				1755	/* Update the metadata if we are persistent */
				1756	s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error,
				1757	pending_complete, pe);
				1758	}
				1759
				1760	/*
				1761	* Called when the copy I/O has finished. kcopyd actually runs
				1762	* this code so don't block.
				1763	*/
				1764	static void copy_callback(int read_err, unsigned long write_err, void *context)
				1765	{
				1766	struct dm_snap_pending_exception *pe = context;
				1767	struct dm_snapshot *s = pe->snap;
				1768
				1769	pe->copy_error = read_err \|\| write_err;
				1770
				1771	if (pe->exception_sequence == s->exception_complete_sequence) {
				1772	struct rb_node *next;
				1773
				1774	s->exception_complete_sequence++;
				1775	complete_exception(pe);
				1776
				1777	next = rb_first(&s->out_of_order_tree);
				1778	while (next) {
				1779	pe = rb_entry(next, struct dm_snap_pending_exception,
				1780	out_of_order_node);
				1781	if (pe->exception_sequence != s->exception_complete_sequence)
				1782	break;
				1783	next = rb_next(next);
				1784	s->exception_complete_sequence++;
				1785	rb_erase(&pe->out_of_order_node, &s->out_of_order_tree);
				1786	complete_exception(pe);
				1787	cond_resched();
				1788	}
				1789	} else {
				1790	struct rb_node *parent = NULL;
				1791	struct rb_node **p = &s->out_of_order_tree.rb_node;
				1792	struct dm_snap_pending_exception *pe2;
				1793
				1794	while (*p) {
				1795	pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node);
				1796	parent = *p;
				1797
				1798	BUG_ON(pe->exception_sequence == pe2->exception_sequence);
				1799	if (pe->exception_sequence < pe2->exception_sequence)
				1800	p = &((*p)->rb_left);
				1801	else
				1802	p = &((*p)->rb_right);
				1803	}
				1804
				1805	rb_link_node(&pe->out_of_order_node, parent, p);
				1806	rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree);
				1807	}
				1808	account_end_copy(s);
				1809	}
				1810
				1811	/*
				1812	* Dispatches the copy operation to kcopyd.
				1813	*/
				1814	static void start_copy(struct dm_snap_pending_exception *pe)
				1815	{
				1816	struct dm_snapshot *s = pe->snap;
				1817	struct dm_io_region src, dest;
				1818	struct block_device *bdev = s->origin->bdev;
				1819	sector_t dev_size;
				1820
				1821	dev_size = get_dev_size(bdev);
				1822
				1823	src.bdev = bdev;
				1824	src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
				1825	src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
				1826
				1827	dest.bdev = s->cow->bdev;
				1828	dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
				1829	dest.count = src.count;
				1830
				1831	/* Hand over to kcopyd */
				1832	account_start_copy(s);
				1833	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
				1834	}
				1835
				1836	static void full_bio_end_io(struct bio *bio)
				1837	{
				1838	void *callback_data = bio->bi_private;
				1839
				1840	dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0);
				1841	}
				1842
				1843	static void start_full_bio(struct dm_snap_pending_exception *pe,
				1844	struct bio *bio)
				1845	{
				1846	struct dm_snapshot *s = pe->snap;
				1847	void *callback_data;
				1848
				1849	pe->full_bio = bio;
				1850	pe->full_bio_end_io = bio->bi_end_io;
				1851
				1852	account_start_copy(s);
				1853	callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
				1854	copy_callback, pe);
				1855
				1856	bio->bi_end_io = full_bio_end_io;
				1857	bio->bi_private = callback_data;
				1858
				1859	generic_make_request(bio);
				1860	}
				1861
				1862	static struct dm_snap_pending_exception *
				1863	__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
				1864	{
				1865	struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
				1866
				1867	if (!e)
				1868	return NULL;
				1869
				1870	return container_of(e, struct dm_snap_pending_exception, e);
				1871	}
				1872
				1873	/*
				1874	* Inserts a pending exception into the pending table.
				1875	*
				1876	* NOTE: a write lock must be held on the chunk's pending exception table slot
				1877	* before calling this.
				1878	*/
				1879	static struct dm_snap_pending_exception *
				1880	__insert_pending_exception(struct dm_snapshot *s,
				1881	struct dm_snap_pending_exception *pe, chunk_t chunk)
				1882	{
				1883	pe->e.old_chunk = chunk;
				1884	bio_list_init(&pe->origin_bios);
				1885	bio_list_init(&pe->snapshot_bios);
				1886	pe->started = 0;
				1887	pe->full_bio = NULL;
				1888
				1889	spin_lock(&s->pe_allocation_lock);
				1890	if (s->store->type->prepare_exception(s->store, &pe->e)) {
				1891	spin_unlock(&s->pe_allocation_lock);
				1892	free_pending_exception(pe);
				1893	return NULL;
				1894	}
				1895
				1896	pe->exception_sequence = s->exception_start_sequence++;
				1897	spin_unlock(&s->pe_allocation_lock);
				1898
				1899	dm_insert_exception(&s->pending, &pe->e);
				1900
				1901	return pe;
				1902	}
				1903
				1904	/*
				1905	* Looks to see if this snapshot already has a pending exception
				1906	* for this chunk, otherwise it allocates a new one and inserts
				1907	* it into the pending table.
				1908	*
				1909	* NOTE: a write lock must be held on the chunk's pending exception table slot
				1910	* before calling this.
				1911	*/
				1912	static struct dm_snap_pending_exception *
				1913	__find_pending_exception(struct dm_snapshot *s,
				1914	struct dm_snap_pending_exception *pe, chunk_t chunk)
				1915	{
				1916	struct dm_snap_pending_exception *pe2;
				1917
				1918	pe2 = __lookup_pending_exception(s, chunk);
				1919	if (pe2) {
				1920	free_pending_exception(pe);
				1921	return pe2;
				1922	}
				1923
				1924	return __insert_pending_exception(s, pe, chunk);
				1925	}
				1926
				1927	static void remap_exception(struct dm_snapshot s, struct dm_exception e,
				1928	struct bio *bio, chunk_t chunk)
				1929	{
				1930	bio_set_dev(bio, s->cow->bdev);
				1931	bio->bi_iter.bi_sector =
				1932	chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
				1933	(chunk - e->old_chunk)) +
				1934	(bio->bi_iter.bi_sector & s->store->chunk_mask);
				1935	}
				1936
				1937	static void zero_callback(int read_err, unsigned long write_err, void *context)
				1938	{
				1939	struct bio *bio = context;
				1940	struct dm_snapshot *s = bio->bi_private;
				1941
				1942	account_end_copy(s);
				1943	bio->bi_status = write_err ? BLK_STS_IOERR : 0;
				1944	bio_endio(bio);
				1945	}
				1946
				1947	static void zero_exception(struct dm_snapshot s, struct dm_exception e,
				1948	struct bio *bio, chunk_t chunk)
				1949	{
				1950	struct dm_io_region dest;
				1951
				1952	dest.bdev = s->cow->bdev;
				1953	dest.sector = bio->bi_iter.bi_sector;
				1954	dest.count = s->store->chunk_size;
				1955
				1956	account_start_copy(s);
				1957	WARN_ON_ONCE(bio->bi_private);
				1958	bio->bi_private = s;
				1959	dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio);
				1960	}
				1961
				1962	static bool io_overlaps_chunk(struct dm_snapshot s, struct bio bio)
				1963	{
				1964	return bio->bi_iter.bi_size ==
				1965	(s->store->chunk_size << SECTOR_SHIFT);
				1966	}
				1967
				1968	static int snapshot_map(struct dm_target ti, struct bio bio)
				1969	{
				1970	struct dm_exception *e;
				1971	struct dm_snapshot *s = ti->private;
				1972	int r = DM_MAPIO_REMAPPED;
				1973	chunk_t chunk;
				1974	struct dm_snap_pending_exception *pe = NULL;
				1975	struct dm_exception_table_lock lock;
				1976
				1977	init_tracked_chunk(bio);
				1978
				1979	if (bio->bi_opf & REQ_PREFLUSH) {
				1980	bio_set_dev(bio, s->cow->bdev);
				1981	return DM_MAPIO_REMAPPED;
				1982	}
				1983
				1984	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
				1985	dm_exception_table_lock_init(s, chunk, &lock);
				1986
				1987	/* Full snapshots are not usable */
				1988	/* To get here the table must be live so s->active is always set. */
				1989	if (!s->valid)
				1990	return DM_MAPIO_KILL;
				1991
				1992	if (bio_data_dir(bio) == WRITE) {
				1993	while (unlikely(!wait_for_in_progress(s, false)))
				1994	; /* wait_for_in_progress() has slept */
				1995	}
				1996
				1997	down_read(&s->lock);
				1998	dm_exception_table_lock(&lock);
				1999
				2000	if (!s->valid \|\| (unlikely(s->snapshot_overflowed) &&
				2001	bio_data_dir(bio) == WRITE)) {
				2002	r = DM_MAPIO_KILL;
				2003	goto out_unlock;
				2004	}
				2005
				2006	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
				2007	if (s->discard_passdown_origin && dm_bio_get_target_bio_nr(bio)) {
				2008	/*
				2009	* passdown discard to origin (without triggering
				2010	* snapshot exceptions via do_origin; doing so would
				2011	* defeat the goal of freeing space in origin that is
				2012	* implied by the "discard_passdown_origin" feature)
				2013	*/
				2014	bio_set_dev(bio, s->origin->bdev);
				2015	track_chunk(s, bio, chunk);
				2016	goto out_unlock;
				2017	}
				2018	/* discard to snapshot (target_bio_nr == 0) zeroes exceptions */
				2019	}
				2020
				2021	/* If the block is already remapped - use that, else remap it */
				2022	e = dm_lookup_exception(&s->complete, chunk);
				2023	if (e) {
				2024	remap_exception(s, e, bio, chunk);
				2025	if (unlikely(bio_op(bio) == REQ_OP_DISCARD) &&
				2026	io_overlaps_chunk(s, bio)) {
				2027	dm_exception_table_unlock(&lock);
				2028	up_read(&s->lock);
				2029	zero_exception(s, e, bio, chunk);
				2030	r = DM_MAPIO_SUBMITTED; /* discard is not issued */
				2031	goto out;
				2032	}
				2033	goto out_unlock;
				2034	}
				2035
				2036	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
				2037	/*
				2038	* If no exception exists, complete discard immediately
				2039	* otherwise it'll trigger copy-out.
				2040	*/
				2041	bio_endio(bio);
				2042	r = DM_MAPIO_SUBMITTED;
				2043	goto out_unlock;
				2044	}
				2045
				2046	/*
				2047	* Write to snapshot - higher level takes care of RW/RO
				2048	* flags so we should only get this if we are
				2049	* writeable.
				2050	*/
				2051	if (bio_data_dir(bio) == WRITE) {
				2052	pe = __lookup_pending_exception(s, chunk);
				2053	if (!pe) {
				2054	dm_exception_table_unlock(&lock);
				2055	pe = alloc_pending_exception(s);
				2056	dm_exception_table_lock(&lock);
				2057
				2058	e = dm_lookup_exception(&s->complete, chunk);
				2059	if (e) {
				2060	free_pending_exception(pe);
				2061	remap_exception(s, e, bio, chunk);
				2062	goto out_unlock;
				2063	}
				2064
				2065	pe = __find_pending_exception(s, pe, chunk);
				2066	if (!pe) {
				2067	dm_exception_table_unlock(&lock);
				2068	up_read(&s->lock);
				2069
				2070	down_write(&s->lock);
				2071
				2072	if (s->store->userspace_supports_overflow) {
				2073	if (s->valid && !s->snapshot_overflowed) {
				2074	s->snapshot_overflowed = 1;
				2075	DMERR("Snapshot overflowed: Unable to allocate exception.");
				2076	}
				2077	} else
				2078	__invalidate_snapshot(s, -ENOMEM);
				2079	up_write(&s->lock);
				2080
				2081	r = DM_MAPIO_KILL;
				2082	goto out;
				2083	}
				2084	}
				2085
				2086	remap_exception(s, &pe->e, bio, chunk);
				2087
				2088	r = DM_MAPIO_SUBMITTED;
				2089
				2090	if (!pe->started && io_overlaps_chunk(s, bio)) {
				2091	pe->started = 1;
				2092
				2093	dm_exception_table_unlock(&lock);
				2094	up_read(&s->lock);
				2095
				2096	start_full_bio(pe, bio);
				2097	goto out;
				2098	}
				2099
				2100	bio_list_add(&pe->snapshot_bios, bio);
				2101
				2102	if (!pe->started) {
				2103	/* this is protected by the exception table lock */
				2104	pe->started = 1;
				2105
				2106	dm_exception_table_unlock(&lock);
				2107	up_read(&s->lock);
				2108
				2109	start_copy(pe);
				2110	goto out;
				2111	}
				2112	} else {
				2113	bio_set_dev(bio, s->origin->bdev);
				2114	track_chunk(s, bio, chunk);
				2115	}
				2116
				2117	out_unlock:
				2118	dm_exception_table_unlock(&lock);
				2119	up_read(&s->lock);
				2120	out:
				2121	return r;
				2122	}
				2123
				2124	/*
				2125	* A snapshot-merge target behaves like a combination of a snapshot
				2126	* target and a snapshot-origin target. It only generates new
				2127	* exceptions in other snapshots and not in the one that is being
				2128	* merged.
				2129	*
				2130	* For each chunk, if there is an existing exception, it is used to
				2131	* redirect I/O to the cow device. Otherwise I/O is sent to the origin,
				2132	* which in turn might generate exceptions in other snapshots.
				2133	* If merging is currently taking place on the chunk in question, the
				2134	* I/O is deferred by adding it to s->bios_queued_during_merge.
				2135	*/
				2136	static int snapshot_merge_map(struct dm_target ti, struct bio bio)
				2137	{
				2138	struct dm_exception *e;
				2139	struct dm_snapshot *s = ti->private;
				2140	int r = DM_MAPIO_REMAPPED;
				2141	chunk_t chunk;
				2142
				2143	init_tracked_chunk(bio);
				2144
				2145	if (bio->bi_opf & REQ_PREFLUSH) {
				2146	if (!dm_bio_get_target_bio_nr(bio))
				2147	bio_set_dev(bio, s->origin->bdev);
				2148	else
				2149	bio_set_dev(bio, s->cow->bdev);
				2150	return DM_MAPIO_REMAPPED;
				2151	}
				2152
				2153	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
				2154	/* Once merging, discards no longer effect change */
				2155	bio_endio(bio);
				2156	return DM_MAPIO_SUBMITTED;
				2157	}
				2158
				2159	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
				2160
				2161	down_write(&s->lock);
				2162
				2163	/* Full merging snapshots are redirected to the origin */
				2164	if (!s->valid)
				2165	goto redirect_to_origin;
				2166
				2167	/* If the block is already remapped - use that */
				2168	e = dm_lookup_exception(&s->complete, chunk);
				2169	if (e) {
				2170	/* Queue writes overlapping with chunks being merged */
				2171	if (bio_data_dir(bio) == WRITE &&
				2172	chunk >= s->first_merging_chunk &&
				2173	chunk < (s->first_merging_chunk +
				2174	s->num_merging_chunks)) {
				2175	bio_set_dev(bio, s->origin->bdev);
				2176	bio_list_add(&s->bios_queued_during_merge, bio);
				2177	r = DM_MAPIO_SUBMITTED;
				2178	goto out_unlock;
				2179	}
				2180
				2181	remap_exception(s, e, bio, chunk);
				2182
				2183	if (bio_data_dir(bio) == WRITE)
				2184	track_chunk(s, bio, chunk);
				2185	goto out_unlock;
				2186	}
				2187
				2188	redirect_to_origin:
				2189	bio_set_dev(bio, s->origin->bdev);
				2190
				2191	if (bio_data_dir(bio) == WRITE) {
				2192	up_write(&s->lock);
				2193	return do_origin(s->origin, bio, false);
				2194	}
				2195
				2196	out_unlock:
				2197	up_write(&s->lock);
				2198
				2199	return r;
				2200	}
				2201
				2202	static int snapshot_end_io(struct dm_target ti, struct bio bio,
				2203	blk_status_t *error)
				2204	{
				2205	struct dm_snapshot *s = ti->private;
				2206
				2207	if (is_bio_tracked(bio))
				2208	stop_tracking_chunk(s, bio);
				2209
				2210	return DM_ENDIO_DONE;
				2211	}
				2212
				2213	static void snapshot_merge_presuspend(struct dm_target *ti)
				2214	{
				2215	struct dm_snapshot *s = ti->private;
				2216
				2217	stop_merge(s);
				2218	}
				2219
				2220	static int snapshot_preresume(struct dm_target *ti)
				2221	{
				2222	int r = 0;
				2223	struct dm_snapshot *s = ti->private;
				2224	struct dm_snapshot snap_src = NULL, snap_dest = NULL;
				2225
				2226	down_read(&_origins_lock);
				2227	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
				2228	if (snap_src && snap_dest) {
				2229	down_read(&snap_src->lock);
				2230	if (s == snap_src) {
				2231	DMERR("Unable to resume snapshot source until "
				2232	"handover completes.");
				2233	r = -EINVAL;
				2234	} else if (!dm_suspended(snap_src->ti)) {
				2235	DMERR("Unable to perform snapshot handover until "
				2236	"source is suspended.");
				2237	r = -EINVAL;
				2238	}
				2239	up_read(&snap_src->lock);
				2240	}
				2241	up_read(&_origins_lock);
				2242
				2243	return r;
				2244	}
				2245
				2246	static void snapshot_resume(struct dm_target *ti)
				2247	{
				2248	struct dm_snapshot *s = ti->private;
				2249	struct dm_snapshot snap_src = NULL, snap_dest = NULL, *snap_merging = NULL;
				2250	struct dm_origin *o;
				2251	struct mapped_device *origin_md = NULL;
				2252	bool must_restart_merging = false;
				2253
				2254	down_read(&_origins_lock);
				2255
				2256	o = __lookup_dm_origin(s->origin->bdev);
				2257	if (o)
				2258	origin_md = dm_table_get_md(o->ti->table);
				2259	if (!origin_md) {
				2260	(void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging);
				2261	if (snap_merging)
				2262	origin_md = dm_table_get_md(snap_merging->ti->table);
				2263	}
				2264	if (origin_md == dm_table_get_md(ti->table))
				2265	origin_md = NULL;
				2266	if (origin_md) {
				2267	if (dm_hold(origin_md))
				2268	origin_md = NULL;
				2269	}
				2270
				2271	up_read(&_origins_lock);
				2272
				2273	if (origin_md) {
				2274	dm_internal_suspend_fast(origin_md);
				2275	if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) {
				2276	must_restart_merging = true;
				2277	stop_merge(snap_merging);
				2278	}
				2279	}
				2280
				2281	down_read(&_origins_lock);
				2282
				2283	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
				2284	if (snap_src && snap_dest) {
				2285	down_write(&snap_src->lock);
				2286	down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
				2287	__handover_exceptions(snap_src, snap_dest);
				2288	up_write(&snap_dest->lock);
				2289	up_write(&snap_src->lock);
				2290	}
				2291
				2292	up_read(&_origins_lock);
				2293
				2294	if (origin_md) {
				2295	if (must_restart_merging)
				2296	start_merge(snap_merging);
				2297	dm_internal_resume_fast(origin_md);
				2298	dm_put(origin_md);
				2299	}
				2300
				2301	/* Now we have correct chunk size, reregister */
				2302	reregister_snapshot(s);
				2303
				2304	down_write(&s->lock);
				2305	s->active = 1;
				2306	up_write(&s->lock);
				2307	}
				2308
				2309	static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
				2310	{
				2311	uint32_t min_chunksize;
				2312
				2313	down_read(&_origins_lock);
				2314	min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
				2315	up_read(&_origins_lock);
				2316
				2317	return min_chunksize;
				2318	}
				2319
				2320	static void snapshot_merge_resume(struct dm_target *ti)
				2321	{
				2322	struct dm_snapshot *s = ti->private;
				2323
				2324	/*
				2325	* Handover exceptions from existing snapshot.
				2326	*/
				2327	snapshot_resume(ti);
				2328
				2329	/*
				2330	* snapshot-merge acts as an origin, so set ti->max_io_len
				2331	*/
				2332	ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev);
				2333
				2334	start_merge(s);
				2335	}
				2336
				2337	static void snapshot_status(struct dm_target *ti, status_type_t type,
				2338	unsigned status_flags, char *result, unsigned maxlen)
				2339	{
				2340	unsigned sz = 0;
				2341	struct dm_snapshot *snap = ti->private;
				2342	unsigned num_features;
				2343
				2344	switch (type) {
				2345	case STATUSTYPE_INFO:
				2346
				2347	down_write(&snap->lock);
				2348
				2349	if (!snap->valid)
				2350	DMEMIT("Invalid");
				2351	else if (snap->merge_failed)
				2352	DMEMIT("Merge failed");
				2353	else if (snap->snapshot_overflowed)
				2354	DMEMIT("Overflow");
				2355	else {
				2356	if (snap->store->type->usage) {
				2357	sector_t total_sectors, sectors_allocated,
				2358	metadata_sectors;
				2359	snap->store->type->usage(snap->store,
				2360	&total_sectors,
				2361	&sectors_allocated,
				2362	&metadata_sectors);
				2363	DMEMIT("%llu/%llu %llu",
				2364	(unsigned long long)sectors_allocated,
				2365	(unsigned long long)total_sectors,
				2366	(unsigned long long)metadata_sectors);
				2367	}
				2368	else
				2369	DMEMIT("Unknown");
				2370	}
				2371
				2372	up_write(&snap->lock);
				2373
				2374	break;
				2375
				2376	case STATUSTYPE_TABLE:
				2377	/*
				2378	* kdevname returns a static pointer so we need
				2379	* to make private copies if the output is to
				2380	* make sense.
				2381	*/
				2382	DMEMIT("%s %s", snap->origin->name, snap->cow->name);
				2383	sz += snap->store->type->status(snap->store, type, result + sz,
				2384	maxlen - sz);
				2385	num_features = snap->discard_zeroes_cow + snap->discard_passdown_origin;
				2386	if (num_features) {
				2387	DMEMIT(" %u", num_features);
				2388	if (snap->discard_zeroes_cow)
				2389	DMEMIT(" discard_zeroes_cow");
				2390	if (snap->discard_passdown_origin)
				2391	DMEMIT(" discard_passdown_origin");
				2392	}
				2393	break;
				2394	}
				2395	}
				2396
				2397	static int snapshot_iterate_devices(struct dm_target *ti,
				2398	iterate_devices_callout_fn fn, void *data)
				2399	{
				2400	struct dm_snapshot *snap = ti->private;
				2401	int r;
				2402
				2403	r = fn(ti, snap->origin, 0, ti->len, data);
				2404
				2405	if (!r)
				2406	r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data);
				2407
				2408	return r;
				2409	}
				2410
				2411	static void snapshot_io_hints(struct dm_target ti, struct queue_limits limits)
				2412	{
				2413	struct dm_snapshot *snap = ti->private;
				2414
				2415	if (snap->discard_zeroes_cow) {
				2416	struct dm_snapshot snap_src = NULL, snap_dest = NULL;
				2417
				2418	down_read(&_origins_lock);
				2419
				2420	(void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL);
				2421	if (snap_src && snap_dest)
				2422	snap = snap_src;
				2423
				2424	/* All discards are split on chunk_size boundary */
				2425	limits->discard_granularity = snap->store->chunk_size;
				2426	limits->max_discard_sectors = snap->store->chunk_size;
				2427
				2428	up_read(&_origins_lock);
				2429	}
				2430	}
				2431
/*-----------------------------------------------------------------
 * Origin methods
 *---------------------------------------------------------------*/
				2435
				2436	/*
				2437	* If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
				2438	* supplied bio was ignored. The caller may submit it immediately.
				2439	* (No remapping actually occurs as the origin is always a direct linear
				2440	* map.)
				2441	*
				2442	* If further exceptions are required, DM_MAPIO_SUBMITTED is returned
				2443	* and any supplied bio is added to a list to be submitted once all
				2444	* the necessary exceptions exist.
				2445	*/
				2446	static int __origin_write(struct list_head *snapshots, sector_t sector,
				2447	struct bio *bio)
				2448	{
				2449	int r = DM_MAPIO_REMAPPED;
				2450	struct dm_snapshot *snap;
				2451	struct dm_exception *e;
				2452	struct dm_snap_pending_exception pe, pe2;
				2453	struct dm_snap_pending_exception *pe_to_start_now = NULL;
				2454	struct dm_snap_pending_exception *pe_to_start_last = NULL;
				2455	struct dm_exception_table_lock lock;
				2456	chunk_t chunk;
				2457
				2458	/* Do all the snapshots on this origin */
				2459	list_for_each_entry (snap, snapshots, list) {
				2460	/*
				2461	* Don't make new exceptions in a merging snapshot
				2462	* because it has effectively been deleted
				2463	*/
				2464	if (dm_target_is_snapshot_merge(snap->ti))
				2465	continue;
				2466
				2467	/* Nothing to do if writing beyond end of snapshot */
				2468	if (sector >= dm_table_get_size(snap->ti->table))
				2469	continue;
				2470
				2471	/*
				2472	* Remember, different snapshots can have
				2473	* different chunk sizes.
				2474	*/
				2475	chunk = sector_to_chunk(snap->store, sector);
				2476	dm_exception_table_lock_init(snap, chunk, &lock);
				2477
				2478	down_read(&snap->lock);
				2479	dm_exception_table_lock(&lock);
				2480
				2481	/* Only deal with valid and active snapshots */
				2482	if (!snap->valid \|\| !snap->active)
				2483	goto next_snapshot;
				2484
				2485	pe = __lookup_pending_exception(snap, chunk);
				2486	if (!pe) {
				2487	/*
				2488	* Check exception table to see if block is already
				2489	* remapped in this snapshot and trigger an exception
				2490	* if not.
				2491	*/
				2492	e = dm_lookup_exception(&snap->complete, chunk);
				2493	if (e)
				2494	goto next_snapshot;
				2495
				2496	dm_exception_table_unlock(&lock);
				2497	pe = alloc_pending_exception(snap);
				2498	dm_exception_table_lock(&lock);
				2499
				2500	pe2 = __lookup_pending_exception(snap, chunk);
				2501
				2502	if (!pe2) {
				2503	e = dm_lookup_exception(&snap->complete, chunk);
				2504	if (e) {
				2505	free_pending_exception(pe);
				2506	goto next_snapshot;
				2507	}
				2508
				2509	pe = __insert_pending_exception(snap, pe, chunk);
				2510	if (!pe) {
				2511	dm_exception_table_unlock(&lock);
				2512	up_read(&snap->lock);
				2513
				2514	invalidate_snapshot(snap, -ENOMEM);
				2515	continue;
				2516	}
				2517	} else {
				2518	free_pending_exception(pe);
				2519	pe = pe2;
				2520	}
				2521	}
				2522
				2523	r = DM_MAPIO_SUBMITTED;
				2524
				2525	/*
				2526	* If an origin bio was supplied, queue it to wait for the
				2527	* completion of this exception, and start this one last,
				2528	* at the end of the function.
				2529	*/
				2530	if (bio) {
				2531	bio_list_add(&pe->origin_bios, bio);
				2532	bio = NULL;
				2533
				2534	if (!pe->started) {
				2535	pe->started = 1;
				2536	pe_to_start_last = pe;
				2537	}
				2538	}
				2539
				2540	if (!pe->started) {
				2541	pe->started = 1;
				2542	pe_to_start_now = pe;
				2543	}
				2544
				2545	next_snapshot:
				2546	dm_exception_table_unlock(&lock);
				2547	up_read(&snap->lock);
				2548
				2549	if (pe_to_start_now) {
				2550	start_copy(pe_to_start_now);
				2551	pe_to_start_now = NULL;
				2552	}
				2553	}
				2554
				2555	/*
				2556	* Submit the exception against which the bio is queued last,
				2557	* to give the other exceptions a head start.
				2558	*/
				2559	if (pe_to_start_last)
				2560	start_copy(pe_to_start_last);
				2561
				2562	return r;
				2563	}
				2564
				2565	/*
				2566	* Called on a write from the origin driver.
				2567	*/
				2568	static int do_origin(struct dm_dev origin, struct bio bio, bool limit)
				2569	{
				2570	struct origin *o;
				2571	int r = DM_MAPIO_REMAPPED;
				2572
				2573	again:
				2574	down_read(&_origins_lock);
				2575	o = __lookup_origin(origin->bdev);
				2576	if (o) {
				2577	if (limit) {
				2578	struct dm_snapshot *s;
				2579	list_for_each_entry(s, &o->snapshots, list)
				2580	if (unlikely(!wait_for_in_progress(s, true)))
				2581	goto again;
				2582	}
				2583
				2584	r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
				2585	}
				2586	up_read(&_origins_lock);
				2587
				2588	return r;
				2589	}
				2590
				2591	/*
				2592	* Trigger exceptions in all non-merging snapshots.
				2593	*
				2594	* The chunk size of the merging snapshot may be larger than the chunk
				2595	* size of some other snapshot so we may need to reallocate multiple
				2596	* chunks in other snapshots.
				2597	*
				2598	* We scan all the overlapping exceptions in the other snapshots.
				2599	* Returns 1 if anything was reallocated and must be waited for,
				2600	* otherwise returns 0.
				2601	*
				2602	* size must be a multiple of merging_snap's chunk_size.
				2603	*/
				2604	static int origin_write_extent(struct dm_snapshot *merging_snap,
				2605	sector_t sector, unsigned size)
				2606	{
				2607	int must_wait = 0;
				2608	sector_t n;
				2609	struct origin *o;
				2610
				2611	/*
				2612	* The origin's __minimum_chunk_size() got stored in max_io_len
				2613	* by snapshot_merge_resume().
				2614	*/
				2615	down_read(&_origins_lock);
				2616	o = __lookup_origin(merging_snap->origin->bdev);
				2617	for (n = 0; n < size; n += merging_snap->ti->max_io_len)
				2618	if (__origin_write(&o->snapshots, sector + n, NULL) ==
				2619	DM_MAPIO_SUBMITTED)
				2620	must_wait = 1;
				2621	up_read(&_origins_lock);
				2622
				2623	return must_wait;
				2624	}
				2625
				2626	/*
				2627	* Origin: maps a linear range of a device, with hooks for snapshotting.
				2628	*/
				2629
				2630	/*
				2631	* Construct an origin mapping: <dev_path>
				2632	* The context for an origin is merely a 'struct dm_dev *'
				2633	* pointing to the real device.
				2634	*/
				2635	static int origin_ctr(struct dm_target ti, unsigned int argc, char *argv)
				2636	{
				2637	int r;
				2638	struct dm_origin *o;
				2639
				2640	if (argc != 1) {
				2641	ti->error = "origin: incorrect number of arguments";
				2642	return -EINVAL;
				2643	}
				2644
				2645	o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL);
				2646	if (!o) {
				2647	ti->error = "Cannot allocate private origin structure";
				2648	r = -ENOMEM;
				2649	goto bad_alloc;
				2650	}
				2651
				2652	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev);
				2653	if (r) {
				2654	ti->error = "Cannot get target device";
				2655	goto bad_open;
				2656	}
				2657
				2658	o->ti = ti;
				2659	ti->private = o;
				2660	ti->num_flush_bios = 1;
				2661
				2662	return 0;
				2663
				2664	bad_open:
				2665	kfree(o);
				2666	bad_alloc:
				2667	return r;
				2668	}
				2669
				2670	static void origin_dtr(struct dm_target *ti)
				2671	{
				2672	struct dm_origin *o = ti->private;
				2673
				2674	dm_put_device(ti, o->dev);
				2675	kfree(o);
				2676	}
				2677
				2678	static int origin_map(struct dm_target ti, struct bio bio)
				2679	{
				2680	struct dm_origin *o = ti->private;
				2681	unsigned available_sectors;
				2682
				2683	bio_set_dev(bio, o->dev->bdev);
				2684
				2685	if (unlikely(bio->bi_opf & REQ_PREFLUSH))
				2686	return DM_MAPIO_REMAPPED;
				2687
				2688	if (bio_data_dir(bio) != WRITE)
				2689	return DM_MAPIO_REMAPPED;
				2690
				2691	available_sectors = o->split_boundary -
				2692	((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1));
				2693
				2694	if (bio_sectors(bio) > available_sectors)
				2695	dm_accept_partial_bio(bio, available_sectors);
				2696
				2697	/* Only tell snapshots if this is a write */
				2698	return do_origin(o->dev, bio, true);
				2699	}
				2700
				2701	/*
				2702	* Set the target "max_io_len" field to the minimum of all the snapshots'
				2703	* chunk sizes.
				2704	*/
				2705	static void origin_resume(struct dm_target *ti)
				2706	{
				2707	struct dm_origin *o = ti->private;
				2708
				2709	o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
				2710
				2711	down_write(&_origins_lock);
				2712	__insert_dm_origin(o);
				2713	up_write(&_origins_lock);
				2714	}
				2715
				2716	static void origin_postsuspend(struct dm_target *ti)
				2717	{
				2718	struct dm_origin *o = ti->private;
				2719
				2720	down_write(&_origins_lock);
				2721	__remove_dm_origin(o);
				2722	up_write(&_origins_lock);
				2723	}
				2724
				2725	static void origin_status(struct dm_target *ti, status_type_t type,
				2726	unsigned status_flags, char *result, unsigned maxlen)
				2727	{
				2728	struct dm_origin *o = ti->private;
				2729
				2730	switch (type) {
				2731	case STATUSTYPE_INFO:
				2732	result[0] = '\0';
				2733	break;
				2734
				2735	case STATUSTYPE_TABLE:
				2736	snprintf(result, maxlen, "%s", o->dev->name);
				2737	break;
				2738	}
				2739	}
				2740
				2741	static int origin_iterate_devices(struct dm_target *ti,
				2742	iterate_devices_callout_fn fn, void *data)
				2743	{
				2744	struct dm_origin *o = ti->private;
				2745
				2746	return fn(ti, o->dev, 0, ti->len, data);
				2747	}
				2748
				2749	static struct target_type origin_target = {
				2750	.name = "snapshot-origin",
				2751	.version = {1, 9, 0},
				2752	.module = THIS_MODULE,
				2753	.ctr = origin_ctr,
				2754	.dtr = origin_dtr,
				2755	.map = origin_map,
				2756	.resume = origin_resume,
				2757	.postsuspend = origin_postsuspend,
				2758	.status = origin_status,
				2759	.iterate_devices = origin_iterate_devices,
				2760	};
				2761
				2762	static struct target_type snapshot_target = {
				2763	.name = "snapshot",
				2764	.version = {1, 16, 0},
				2765	.module = THIS_MODULE,
				2766	.ctr = snapshot_ctr,
				2767	.dtr = snapshot_dtr,
				2768	.map = snapshot_map,
				2769	.end_io = snapshot_end_io,
				2770	.preresume = snapshot_preresume,
				2771	.resume = snapshot_resume,
				2772	.status = snapshot_status,
				2773	.iterate_devices = snapshot_iterate_devices,
				2774	.io_hints = snapshot_io_hints,
				2775	};
				2776
				2777	static struct target_type merge_target = {
				2778	.name = dm_snapshot_merge_target_name,
				2779	.version = {1, 5, 0},
				2780	.module = THIS_MODULE,
				2781	.ctr = snapshot_ctr,
				2782	.dtr = snapshot_dtr,
				2783	.map = snapshot_merge_map,
				2784	.end_io = snapshot_end_io,
				2785	.presuspend = snapshot_merge_presuspend,
				2786	.preresume = snapshot_preresume,
				2787	.resume = snapshot_merge_resume,
				2788	.status = snapshot_status,
				2789	.iterate_devices = snapshot_iterate_devices,
				2790	.io_hints = snapshot_io_hints,
				2791	};
				2792
				2793	static int __init dm_snapshot_init(void)
				2794	{
				2795	int r;
				2796
				2797	r = dm_exception_store_init();
				2798	if (r) {
				2799	DMERR("Failed to initialize exception stores");
				2800	return r;
				2801	}
				2802
				2803	r = init_origin_hash();
				2804	if (r) {
				2805	DMERR("init_origin_hash failed.");
				2806	goto bad_origin_hash;
				2807	}
				2808
				2809	exception_cache = KMEM_CACHE(dm_exception, 0);
				2810	if (!exception_cache) {
				2811	DMERR("Couldn't create exception cache.");
				2812	r = -ENOMEM;
				2813	goto bad_exception_cache;
				2814	}
				2815
				2816	pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
				2817	if (!pending_cache) {
				2818	DMERR("Couldn't create pending cache.");
				2819	r = -ENOMEM;
				2820	goto bad_pending_cache;
				2821	}
				2822
				2823	r = dm_register_target(&snapshot_target);
				2824	if (r < 0) {
				2825	DMERR("snapshot target register failed %d", r);
				2826	goto bad_register_snapshot_target;
				2827	}
				2828
				2829	r = dm_register_target(&origin_target);
				2830	if (r < 0) {
				2831	DMERR("Origin target register failed %d", r);
				2832	goto bad_register_origin_target;
				2833	}
				2834
				2835	r = dm_register_target(&merge_target);
				2836	if (r < 0) {
				2837	DMERR("Merge target register failed %d", r);
				2838	goto bad_register_merge_target;
				2839	}
				2840
				2841	return 0;
				2842
				2843	bad_register_merge_target:
				2844	dm_unregister_target(&origin_target);
				2845	bad_register_origin_target:
				2846	dm_unregister_target(&snapshot_target);
				2847	bad_register_snapshot_target:
				2848	kmem_cache_destroy(pending_cache);
				2849	bad_pending_cache:
				2850	kmem_cache_destroy(exception_cache);
				2851	bad_exception_cache:
				2852	exit_origin_hash();
				2853	bad_origin_hash:
				2854	dm_exception_store_exit();
				2855
				2856	return r;
				2857	}
				2858
				2859	static void __exit dm_snapshot_exit(void)
				2860	{
				2861	dm_unregister_target(&snapshot_target);
				2862	dm_unregister_target(&origin_target);
				2863	dm_unregister_target(&merge_target);
				2864
				2865	exit_origin_hash();
				2866	kmem_cache_destroy(pending_cache);
				2867	kmem_cache_destroy(exception_cache);
				2868
				2869	dm_exception_store_exit();
				2870	}
				2871
				2872	/* Module hooks */
				2873	module_init(dm_snapshot_init);
				2874	module_exit(dm_snapshot_exit);
				2875
				2876	MODULE_DESCRIPTION(DM_NAME " snapshot target");
				2877	MODULE_AUTHOR("Joe Thornber");
				2878	MODULE_LICENSE("GPL");
				2879	MODULE_ALIAS("dm-snapshot-origin");
				2880	MODULE_ALIAS("dm-snapshot-merge");