Blame - ap/os/linux/linux-3.4.x/drivers/md/dm.c - T106_DC

blob: a335e68aa1cdaa301f706bdcbded8736bd1103b9 [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/*
				2	* Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
				3	* Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
				4	*
				5	* This file is released under the GPL.
				6	*/
				7
				8	#include "dm.h"
				9	#include "dm-uevent.h"
				10
				11	#include <linux/init.h>
				12	#include <linux/module.h>
				13	#include <linux/mutex.h>
				14	#include <linux/moduleparam.h>
				15	#include <linux/blkpg.h>
				16	#include <linux/bio.h>
				17	#include <linux/mempool.h>
				18	#include <linux/slab.h>
				19	#include <linux/idr.h>
				20	#include <linux/hdreg.h>
				21	#include <linux/delay.h>
				22
				23	#include <trace/events/block.h>
				24
				25	#define DM_MSG_PREFIX "core"
				26
				27	#ifdef CONFIG_PRINTK
				28	/*
				29	* ratelimit state to be used in DMXXX_LIMIT().
				30	*/
				31	DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
				32	DEFAULT_RATELIMIT_INTERVAL,
				33	DEFAULT_RATELIMIT_BURST);
				34	EXPORT_SYMBOL(dm_ratelimit_state);
				35	#endif
				36
				37	/*
				38	* Cookies are numeric values sent with CHANGE and REMOVE
				39	* uevents while resuming, removing or renaming the device.
				40	*/
				41	#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
				42	#define DM_COOKIE_LENGTH 24
				43
				44	static const char *_name = DM_NAME;
				45
				46	static unsigned int major = 0;
				47	static unsigned int _major = 0;
				48
				49	static DEFINE_IDR(_minor_idr);
				50
				51	static DEFINE_SPINLOCK(_minor_lock);
				52	/*
				53	* For bio-based dm.
				54	* One of these is allocated per bio.
				55	*/
				56	struct dm_io {
				57	struct mapped_device *md;
				58	int error;
				59	atomic_t io_count;
				60	struct bio *bio;
				61	unsigned long start_time;
				62	spinlock_t endio_lock;
				63	};
				64
				65	/*
				66	* For bio-based dm.
				67	* One of these is allocated per target within a bio. Hopefully
				68	* this will be simplified out one day.
				69	*/
				70	struct dm_target_io {
				71	struct dm_io *io;
				72	struct dm_target *ti;
				73	union map_info info;
				74	};
				75
				76	/*
				77	* For request-based dm.
				78	* One of these is allocated per request.
				79	*/
				80	struct dm_rq_target_io {
				81	struct mapped_device *md;
				82	struct dm_target *ti;
				83	struct request *orig, clone;
				84	int error;
				85	union map_info info;
				86	};
				87
				88	/*
				89	* For request-based dm.
				90	* One of these is allocated per bio.
				91	*/
				92	struct dm_rq_clone_bio_info {
				93	struct bio *orig;
				94	struct dm_rq_target_io *tio;
				95	};
				96
				97	union map_info dm_get_mapinfo(struct bio bio)
				98	{
				99	if (bio && bio->bi_private)
				100	return &((struct dm_target_io *)bio->bi_private)->info;
				101	return NULL;
				102	}
				103
				104	union map_info dm_get_rq_mapinfo(struct request rq)
				105	{
				106	if (rq && rq->end_io_data)
				107	return &((struct dm_rq_target_io *)rq->end_io_data)->info;
				108	return NULL;
				109	}
				110	EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
				111
				112	#define MINOR_ALLOCED ((void *)-1)
				113
				114	/*
				115	* Bits for the md->flags field.
				116	*/
				117	#define DMF_BLOCK_IO_FOR_SUSPEND 0
				118	#define DMF_SUSPENDED 1
				119	#define DMF_FROZEN 2
				120	#define DMF_FREEING 3
				121	#define DMF_DELETING 4
				122	#define DMF_NOFLUSH_SUSPENDING 5
				123	#define DMF_MERGE_IS_OPTIONAL 6
				124
				125	/*
				126	* Work processed by per-device workqueue.
				127	*/
				128	struct mapped_device {
				129	struct rw_semaphore io_lock;
				130	struct mutex suspend_lock;
				131	rwlock_t map_lock;
				132	atomic_t holders;
				133	atomic_t open_count;
				134
				135	unsigned long flags;
				136
				137	struct request_queue *queue;
				138	unsigned type;
				139	/* Protect queue and type against concurrent access. */
				140	struct mutex type_lock;
				141
				142	struct target_type *immutable_target_type;
				143
				144	struct gendisk *disk;
				145	char name[16];
				146
				147	void *interface_ptr;
				148
				149	/*
				150	* A list of ios that arrived while we were suspended.
				151	*/
				152	atomic_t pending[2];
				153	wait_queue_head_t wait;
				154	struct work_struct work;
				155	struct bio_list deferred;
				156	spinlock_t deferred_lock;
				157
				158	/*
				159	* Processing queue (flush)
				160	*/
				161	struct workqueue_struct *wq;
				162
				163	/*
				164	* The current mapping.
				165	*/
				166	struct dm_table *map;
				167
				168	/*
				169	* io objects are allocated from here.
				170	*/
				171	mempool_t *io_pool;
				172	mempool_t *tio_pool;
				173
				174	struct bio_set *bs;
				175
				176	/*
				177	* Event handling.
				178	*/
				179	atomic_t event_nr;
				180	wait_queue_head_t eventq;
				181	atomic_t uevent_seq;
				182	struct list_head uevent_list;
				183	spinlock_t uevent_lock; /* Protect access to uevent_list */
				184
				185	/*
				186	* freeze/thaw support require holding onto a super block
				187	*/
				188	struct super_block *frozen_sb;
				189	struct block_device *bdev;
				190
				191	/* forced geometry settings */
				192	struct hd_geometry geometry;
				193
				194	/* kobject and completion */
				195	struct dm_kobject_holder kobj_holder;
				196
				197	/* zero-length flush that will be cloned and submitted to targets */
				198	struct bio flush_bio;
				199	};
				200
				201	/*
				202	* For mempools pre-allocation at the table loading time.
				203	*/
				204	struct dm_md_mempools {
				205	mempool_t *io_pool;
				206	mempool_t *tio_pool;
				207	struct bio_set *bs;
				208	};
				209
				210	#define MIN_IOS 256
				211	static struct kmem_cache *_io_cache;
				212	static struct kmem_cache *_tio_cache;
				213	static struct kmem_cache *_rq_tio_cache;
				214	static struct kmem_cache *_rq_bio_info_cache;
				215
				216	static int __init local_init(void)
				217	{
				218	int r = -ENOMEM;
				219
				220	/* allocate a slab for the dm_ios */
				221	_io_cache = KMEM_CACHE(dm_io, 0);
				222	if (!_io_cache)
				223	return r;
				224
				225	/* allocate a slab for the target ios */
				226	_tio_cache = KMEM_CACHE(dm_target_io, 0);
				227	if (!_tio_cache)
				228	goto out_free_io_cache;
				229
				230	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
				231	if (!_rq_tio_cache)
				232	goto out_free_tio_cache;
				233
				234	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
				235	if (!_rq_bio_info_cache)
				236	goto out_free_rq_tio_cache;
				237
				238	r = dm_uevent_init();
				239	if (r)
				240	goto out_free_rq_bio_info_cache;
				241
				242	_major = major;
				243	r = register_blkdev(_major, _name);
				244	if (r < 0)
				245	goto out_uevent_exit;
				246
				247	if (!_major)
				248	_major = r;
				249
				250	return 0;
				251
				252	out_uevent_exit:
				253	dm_uevent_exit();
				254	out_free_rq_bio_info_cache:
				255	kmem_cache_destroy(_rq_bio_info_cache);
				256	out_free_rq_tio_cache:
				257	kmem_cache_destroy(_rq_tio_cache);
				258	out_free_tio_cache:
				259	kmem_cache_destroy(_tio_cache);
				260	out_free_io_cache:
				261	kmem_cache_destroy(_io_cache);
				262
				263	return r;
				264	}
				265
				266	static void local_exit(void)
				267	{
				268	kmem_cache_destroy(_rq_bio_info_cache);
				269	kmem_cache_destroy(_rq_tio_cache);
				270	kmem_cache_destroy(_tio_cache);
				271	kmem_cache_destroy(_io_cache);
				272	unregister_blkdev(_major, _name);
				273	dm_uevent_exit();
				274
				275	_major = 0;
				276
				277	DMINFO("cleaned up");
				278	}
				279
				280	static int (*_inits[])(void) __initdata = {
				281	local_init,
				282	dm_target_init,
				283	dm_linear_init,
				284	dm_stripe_init,
				285	dm_io_init,
				286	dm_kcopyd_init,
				287	dm_interface_init,
				288	};
				289
				290	static void (*_exits[])(void) = {
				291	local_exit,
				292	dm_target_exit,
				293	dm_linear_exit,
				294	dm_stripe_exit,
				295	dm_io_exit,
				296	dm_kcopyd_exit,
				297	dm_interface_exit,
				298	};
				299
				300	static int __init dm_init(void)
				301	{
				302	const int count = ARRAY_SIZE(_inits);
				303
				304	int r, i;
				305
				306	for (i = 0; i < count; i++) {
				307	r = _inits[i]();
				308	if (r)
				309	goto bad;
				310	}
				311
				312	return 0;
				313
				314	bad:
				315	while (i--)
				316	_exits[i]();
				317
				318	return r;
				319	}
				320
				321	static void __exit dm_exit(void)
				322	{
				323	int i = ARRAY_SIZE(_exits);
				324
				325	while (i--)
				326	_exits[i]();
				327
				328	/*
				329	* Should be empty by this point.
				330	*/
				331	idr_remove_all(&_minor_idr);
				332	idr_destroy(&_minor_idr);
				333	}
				334
				335	/*
				336	* Block device functions
				337	*/
				338	int dm_deleting_md(struct mapped_device *md)
				339	{
				340	return test_bit(DMF_DELETING, &md->flags);
				341	}
				342
				343	static int dm_blk_open(struct block_device *bdev, fmode_t mode)
				344	{
				345	struct mapped_device *md;
				346
				347	spin_lock(&_minor_lock);
				348
				349	md = bdev->bd_disk->private_data;
				350	if (!md)
				351	goto out;
				352
				353	if (test_bit(DMF_FREEING, &md->flags) \|\|
				354	dm_deleting_md(md)) {
				355	md = NULL;
				356	goto out;
				357	}
				358
				359	dm_get(md);
				360	atomic_inc(&md->open_count);
				361
				362	out:
				363	spin_unlock(&_minor_lock);
				364
				365	return md ? 0 : -ENXIO;
				366	}
				367
				368	static int dm_blk_close(struct gendisk *disk, fmode_t mode)
				369	{
				370	struct mapped_device *md = disk->private_data;
				371
				372	spin_lock(&_minor_lock);
				373
				374	atomic_dec(&md->open_count);
				375	dm_put(md);
				376
				377	spin_unlock(&_minor_lock);
				378
				379	return 0;
				380	}
				381
				382	int dm_open_count(struct mapped_device *md)
				383	{
				384	return atomic_read(&md->open_count);
				385	}
				386
				387	/*
				388	* Guarantees nothing is using the device before it's deleted.
				389	*/
				390	int dm_lock_for_deletion(struct mapped_device *md)
				391	{
				392	int r = 0;
				393
				394	spin_lock(&_minor_lock);
				395
				396	if (dm_open_count(md))
				397	r = -EBUSY;
				398	else
				399	set_bit(DMF_DELETING, &md->flags);
				400
				401	spin_unlock(&_minor_lock);
				402
				403	return r;
				404	}
				405
				406	static int dm_blk_getgeo(struct block_device bdev, struct hd_geometry geo)
				407	{
				408	struct mapped_device *md = bdev->bd_disk->private_data;
				409
				410	return dm_get_geometry(md, geo);
				411	}
				412
				413	static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
				414	unsigned int cmd, unsigned long arg)
				415	{
				416	struct mapped_device *md = bdev->bd_disk->private_data;
				417	struct dm_table *map = dm_get_live_table(md);
				418	struct dm_target *tgt;
				419	int r = -ENOTTY;
				420
				421	if (!map \|\| !dm_table_get_size(map))
				422	goto out;
				423
				424	/* We only support devices that have a single target */
				425	if (dm_table_get_num_targets(map) != 1)
				426	goto out;
				427
				428	tgt = dm_table_get_target(map, 0);
				429
				430	if (dm_suspended_md(md)) {
				431	r = -EAGAIN;
				432	goto out;
				433	}
				434
				435	if (tgt->type->ioctl)
				436	r = tgt->type->ioctl(tgt, cmd, arg);
				437
				438	out:
				439	dm_table_put(map);
				440
				441	return r;
				442	}
				443
				444	static struct dm_io alloc_io(struct mapped_device md)
				445	{
				446	return mempool_alloc(md->io_pool, GFP_NOIO);
				447	}
				448
				449	static void free_io(struct mapped_device md, struct dm_io io)
				450	{
				451	mempool_free(io, md->io_pool);
				452	}
				453
				454	static void free_tio(struct mapped_device md, struct dm_target_io tio)
				455	{
				456	mempool_free(tio, md->tio_pool);
				457	}
				458
				459	static struct dm_rq_target_io alloc_rq_tio(struct mapped_device md,
				460	gfp_t gfp_mask)
				461	{
				462	return mempool_alloc(md->tio_pool, gfp_mask);
				463	}
				464
				465	static void free_rq_tio(struct dm_rq_target_io *tio)
				466	{
				467	mempool_free(tio, tio->md->tio_pool);
				468	}
				469
				470	static struct dm_rq_clone_bio_info alloc_bio_info(struct mapped_device md)
				471	{
				472	return mempool_alloc(md->io_pool, GFP_ATOMIC);
				473	}
				474
				475	static void free_bio_info(struct dm_rq_clone_bio_info *info)
				476	{
				477	mempool_free(info, info->tio->md->io_pool);
				478	}
				479
				480	static int md_in_flight(struct mapped_device *md)
				481	{
				482	return atomic_read(&md->pending[READ]) +
				483	atomic_read(&md->pending[WRITE]);
				484	}
				485
				486	static void start_io_acct(struct dm_io *io)
				487	{
				488	struct mapped_device *md = io->md;
				489	int cpu;
				490	int rw = bio_data_dir(io->bio);
				491
				492	io->start_time = jiffies;
				493
				494	cpu = part_stat_lock();
				495	part_round_stats(cpu, &dm_disk(md)->part0);
				496	part_stat_unlock();
				497	atomic_set(&dm_disk(md)->part0.in_flight[rw],
				498	atomic_inc_return(&md->pending[rw]));
				499	}
				500
				501	static void end_io_acct(struct dm_io *io)
				502	{
				503	struct mapped_device *md = io->md;
				504	struct bio *bio = io->bio;
				505	unsigned long duration = jiffies - io->start_time;
				506	int pending, cpu;
				507	int rw = bio_data_dir(bio);
				508
				509	cpu = part_stat_lock();
				510	part_round_stats(cpu, &dm_disk(md)->part0);
				511	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
				512	part_stat_unlock();
				513
				514	/*
				515	* After this is decremented the bio must not be touched if it is
				516	* a flush.
				517	*/
				518	pending = atomic_dec_return(&md->pending[rw]);
				519	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
				520	pending += atomic_read(&md->pending[rw^0x1]);
				521
				522	/* nudge anyone waiting on suspend queue */
				523	if (!pending)
				524	wake_up(&md->wait);
				525	}
				526
				527	/*
				528	* Add the bio to the list of deferred io.
				529	*/
				530	static void queue_io(struct mapped_device md, struct bio bio)
				531	{
				532	unsigned long flags;
				533
				534	spin_lock_irqsave(&md->deferred_lock, flags);
				535	bio_list_add(&md->deferred, bio);
				536	spin_unlock_irqrestore(&md->deferred_lock, flags);
				537	queue_work(md->wq, &md->work);
				538	}
				539
				540	/*
				541	* Everyone (including functions in this file), should use this
				542	* function to access the md->map field, and make sure they call
				543	* dm_table_put() when finished.
				544	*/
				545	struct dm_table dm_get_live_table(struct mapped_device md)
				546	{
				547	struct dm_table *t;
				548	unsigned long flags;
				549
				550	read_lock_irqsave(&md->map_lock, flags);
				551	t = md->map;
				552	if (t)
				553	dm_table_get(t);
				554	read_unlock_irqrestore(&md->map_lock, flags);
				555
				556	return t;
				557	}
				558
				559	/*
				560	* Get the geometry associated with a dm device
				561	*/
				562	int dm_get_geometry(struct mapped_device md, struct hd_geometry geo)
				563	{
				564	*geo = md->geometry;
				565
				566	return 0;
				567	}
				568
				569	/*
				570	* Set the geometry of a device.
				571	*/
				572	int dm_set_geometry(struct mapped_device md, struct hd_geometry geo)
				573	{
				574	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
				575
				576	if (geo->start > sz) {
				577	DMWARN("Start sector is beyond the geometry limits.");
				578	return -EINVAL;
				579	}
				580
				581	md->geometry = *geo;
				582
				583	return 0;
				584	}
				585
/*-----------------------------------------------------------------
 * CRUD START:
 *   A more elegant soln is in the works that uses the queue
 *   merge fn, unfortunately there are a couple of changes to
 *   the block layer that I want to make for this.  So in the
 *   interests of getting something for people to use I give
 *   you this clearly demarcated crap.
 *---------------------------------------------------------------*/
				594
				595	static int __noflush_suspending(struct mapped_device *md)
				596	{
				597	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
				598	}
				599
				600	/*
				601	* Decrements the number of outstanding ios that a bio has been
				602	* cloned into, completing the original io if necc.
				603	*/
				604	static void dec_pending(struct dm_io *io, int error)
				605	{
				606	unsigned long flags;
				607	int io_error;
				608	struct bio *bio;
				609	struct mapped_device *md = io->md;
				610
				611	/* Push-back supersedes any I/O errors */
				612	if (unlikely(error)) {
				613	spin_lock_irqsave(&io->endio_lock, flags);
				614	if (!(io->error > 0 && __noflush_suspending(md)))
				615	io->error = error;
				616	spin_unlock_irqrestore(&io->endio_lock, flags);
				617	}
				618
				619	if (atomic_dec_and_test(&io->io_count)) {
				620	if (io->error == DM_ENDIO_REQUEUE) {
				621	/*
				622	* Target requested pushing back the I/O.
				623	*/
				624	spin_lock_irqsave(&md->deferred_lock, flags);
				625	if (__noflush_suspending(md))
				626	bio_list_add_head(&md->deferred, io->bio);
				627	else
				628	/* noflush suspend was interrupted. */
				629	io->error = -EIO;
				630	spin_unlock_irqrestore(&md->deferred_lock, flags);
				631	}
				632
				633	io_error = io->error;
				634	bio = io->bio;
				635	end_io_acct(io);
				636	free_io(md, io);
				637
				638	if (io_error == DM_ENDIO_REQUEUE)
				639	return;
				640
				641	if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
				642	/*
				643	* Preflush done for flush with data, reissue
				644	* without REQ_FLUSH.
				645	*/
				646	bio->bi_rw &= ~REQ_FLUSH;
				647	queue_io(md, bio);
				648	} else {
				649	/* done with normal IO or empty flush */
				650	trace_block_bio_complete(md->queue, bio, io_error);
				651	bio_endio(bio, io_error);
				652	}
				653	}
				654	}
				655
				656	static void clone_endio(struct bio *bio, int error)
				657	{
				658	int r = 0;
				659	struct dm_target_io *tio = bio->bi_private;
				660	struct dm_io *io = tio->io;
				661	struct mapped_device *md = tio->io->md;
				662	dm_endio_fn endio = tio->ti->type->end_io;
				663
				664	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
				665	error = -EIO;
				666
				667	if (endio) {
				668	r = endio(tio->ti, bio, error, &tio->info);
				669	if (r < 0 \|\| r == DM_ENDIO_REQUEUE)
				670	/*
				671	* error and requeue request are handled
				672	* in dec_pending().
				673	*/
				674	error = r;
				675	else if (r == DM_ENDIO_INCOMPLETE)
				676	/* The target will handle the io */
				677	return;
				678	else if (r) {
				679	DMWARN("unimplemented target endio return value: %d", r);
				680	BUG();
				681	}
				682	}
				683
				684	/*
				685	* Store md for cleanup instead of tio which is about to get freed.
				686	*/
				687	bio->bi_private = md->bs;
				688
				689	free_tio(md, tio);
				690	bio_put(bio);
				691	dec_pending(io, error);
				692	}
				693
				694	/*
				695	* Partial completion handling for request-based dm
				696	*/
				697	static void end_clone_bio(struct bio *clone, int error)
				698	{
				699	struct dm_rq_clone_bio_info *info = clone->bi_private;
				700	struct dm_rq_target_io *tio = info->tio;
				701	struct bio *bio = info->orig;
				702	unsigned int nr_bytes = info->orig->bi_size;
				703
				704	bio_put(clone);
				705
				706	if (tio->error)
				707	/*
				708	* An error has already been detected on the request.
				709	* Once error occurred, just let clone->end_io() handle
				710	* the remainder.
				711	*/
				712	return;
				713	else if (error) {
				714	/*
				715	* Don't notice the error to the upper layer yet.
				716	* The error handling decision is made by the target driver,
				717	* when the request is completed.
				718	*/
				719	tio->error = error;
				720	return;
				721	}
				722
				723	/*
				724	* I/O for the bio successfully completed.
				725	* Notice the data completion to the upper layer.
				726	*/
				727
				728	/*
				729	* bios are processed from the head of the list.
				730	* So the completing bio should always be rq->bio.
				731	* If it's not, something wrong is happening.
				732	*/
				733	if (tio->orig->bio != bio)
				734	DMERR("bio completion is going in the middle of the request");
				735
				736	/*
				737	* Update the original request.
				738	* Do not use blk_end_request() here, because it may complete
				739	* the original request before the clone, and break the ordering.
				740	*/
				741	blk_update_request(tio->orig, 0, nr_bytes);
				742	}
				743
				744	/*
				745	* Don't touch any member of the md after calling this function because
				746	* the md may be freed in dm_put() at the end of this function.
				747	* Or do dm_get() before calling this function and dm_put() later.
				748	*/
				749	static void rq_completed(struct mapped_device *md, int rw, int run_queue)
				750	{
				751	atomic_dec(&md->pending[rw]);
				752
				753	/* nudge anyone waiting on suspend queue */
				754	if (!md_in_flight(md))
				755	wake_up(&md->wait);
				756
				757	/*
				758	* Run this off this callpath, as drivers could invoke end_io while
				759	* inside their request_fn (and holding the queue lock). Calling
				760	* back into ->request_fn() could deadlock attempting to grab the
				761	* queue lock again.
				762	*/
				763	if (run_queue)
				764	blk_run_queue_async(md->queue);
				765
				766	/*
				767	* dm_put() must be at the end of this function. See the comment above
				768	*/
				769	dm_put(md);
				770	}
				771
				772	static void free_rq_clone(struct request *clone)
				773	{
				774	struct dm_rq_target_io *tio = clone->end_io_data;
				775
				776	blk_rq_unprep_clone(clone);
				777	free_rq_tio(tio);
				778	}
				779
				780	/*
				781	* Complete the clone and the original request.
				782	* Must be called without queue lock.
				783	*/
				784	static void dm_end_request(struct request *clone, int error)
				785	{
				786	int rw = rq_data_dir(clone);
				787	struct dm_rq_target_io *tio = clone->end_io_data;
				788	struct mapped_device *md = tio->md;
				789	struct request *rq = tio->orig;
				790
				791	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
				792	rq->errors = clone->errors;
				793	rq->resid_len = clone->resid_len;
				794
				795	if (rq->sense)
				796	/*
				797	* We are using the sense buffer of the original
				798	* request.
				799	* So setting the length of the sense data is enough.
				800	*/
				801	rq->sense_len = clone->sense_len;
				802	}
				803
				804	free_rq_clone(clone);
				805	blk_end_request_all(rq, error);
				806	rq_completed(md, rw, true);
				807	}
				808
				809	static void dm_unprep_request(struct request *rq)
				810	{
				811	struct request *clone = rq->special;
				812
				813	rq->special = NULL;
				814	rq->cmd_flags &= ~REQ_DONTPREP;
				815
				816	free_rq_clone(clone);
				817	}
				818
				819	/*
				820	* Requeue the original request of a clone.
				821	*/
				822	void dm_requeue_unmapped_request(struct request *clone)
				823	{
				824	int rw = rq_data_dir(clone);
				825	struct dm_rq_target_io *tio = clone->end_io_data;
				826	struct mapped_device *md = tio->md;
				827	struct request *rq = tio->orig;
				828	struct request_queue *q = rq->q;
				829	unsigned long flags;
				830
				831	dm_unprep_request(rq);
				832
				833	spin_lock_irqsave(q->queue_lock, flags);
				834	blk_requeue_request(q, rq);
				835	spin_unlock_irqrestore(q->queue_lock, flags);
				836
				837	rq_completed(md, rw, 0);
				838	}
				839	EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
				840
				841	static void __stop_queue(struct request_queue *q)
				842	{
				843	blk_stop_queue(q);
				844	}
				845
				846	static void stop_queue(struct request_queue *q)
				847	{
				848	unsigned long flags;
				849
				850	spin_lock_irqsave(q->queue_lock, flags);
				851	__stop_queue(q);
				852	spin_unlock_irqrestore(q->queue_lock, flags);
				853	}
				854
				855	static void __start_queue(struct request_queue *q)
				856	{
				857	if (blk_queue_stopped(q))
				858	blk_start_queue(q);
				859	}
				860
				861	static void start_queue(struct request_queue *q)
				862	{
				863	unsigned long flags;
				864
				865	spin_lock_irqsave(q->queue_lock, flags);
				866	__start_queue(q);
				867	spin_unlock_irqrestore(q->queue_lock, flags);
				868	}
				869
				870	static void dm_done(struct request *clone, int error, bool mapped)
				871	{
				872	int r = error;
				873	struct dm_rq_target_io *tio = clone->end_io_data;
				874	dm_request_endio_fn rq_end_io = NULL;
				875
				876	if (tio->ti) {
				877	rq_end_io = tio->ti->type->rq_end_io;
				878
				879	if (mapped && rq_end_io)
				880	r = rq_end_io(tio->ti, clone, error, &tio->info);
				881	}
				882
				883	if (r <= 0)
				884	/* The target wants to complete the I/O */
				885	dm_end_request(clone, r);
				886	else if (r == DM_ENDIO_INCOMPLETE)
				887	/* The target will handle the I/O */
				888	return;
				889	else if (r == DM_ENDIO_REQUEUE)
				890	/* The target wants to requeue the I/O */
				891	dm_requeue_unmapped_request(clone);
				892	else {
				893	DMWARN("unimplemented target endio return value: %d", r);
				894	BUG();
				895	}
				896	}
				897
				898	/*
				899	* Request completion handler for request-based dm
				900	*/
				901	static void dm_softirq_done(struct request *rq)
				902	{
				903	bool mapped = true;
				904	struct request *clone = rq->completion_data;
				905	struct dm_rq_target_io *tio = clone->end_io_data;
				906
				907	if (rq->cmd_flags & REQ_FAILED)
				908	mapped = false;
				909
				910	dm_done(clone, tio->error, mapped);
				911	}
				912
				913	/*
				914	* Complete the clone and the original request with the error status
				915	* through softirq context.
				916	*/
				917	static void dm_complete_request(struct request *clone, int error)
				918	{
				919	struct dm_rq_target_io *tio = clone->end_io_data;
				920	struct request *rq = tio->orig;
				921
				922	tio->error = error;
				923	rq->completion_data = clone;
				924	blk_complete_request(rq);
				925	}
				926
				927	/*
				928	* Complete the not-mapped clone and the original request with the error status
				929	* through softirq context.
				930	* Target's rq_end_io() function isn't called.
				931	* This may be used when the target's map_rq() function fails.
				932	*/
				933	void dm_kill_unmapped_request(struct request *clone, int error)
				934	{
				935	struct dm_rq_target_io *tio = clone->end_io_data;
				936	struct request *rq = tio->orig;
				937
				938	rq->cmd_flags \|= REQ_FAILED;
				939	dm_complete_request(clone, error);
				940	}
				941	EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
				942
				943	/*
				944	* Called with the queue lock held
				945	*/
				946	static void end_clone_request(struct request *clone, int error)
				947	{
				948	/*
				949	* For just cleaning up the information of the queue in which
				950	* the clone was dispatched.
				951	* The clone is NOT freed actually here because it is alloced from
				952	* dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
				953	*/
				954	__blk_put_request(clone->q, clone);
				955
				956	/*
				957	* Actual request completion is done in a softirq context which doesn't
				958	* hold the queue lock. Otherwise, deadlock could occur because:
				959	* - another request may be submitted by the upper level driver
				960	* of the stacking during the completion
				961	* - the submission which requires queue lock may be done
				962	* against this queue
				963	*/
				964	dm_complete_request(clone, error);
				965	}
				966
				967	/*
				968	* Return maximum size of I/O possible at the supplied sector up to the current
				969	* target boundary.
				970	*/
				971	static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
				972	{
				973	sector_t target_offset = dm_target_offset(ti, sector);
				974
				975	return ti->len - target_offset;
				976	}
				977
				978	static sector_t max_io_len(sector_t sector, struct dm_target *ti)
				979	{
				980	sector_t len = max_io_len_target_boundary(sector, ti);
				981
				982	/*
				983	* Does the target need to split even further ?
				984	*/
				985	if (ti->split_io) {
				986	sector_t boundary;
				987	sector_t offset = dm_target_offset(ti, sector);
				988	boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
				989	- offset;
				990	if (len > boundary)
				991	len = boundary;
				992	}
				993
				994	return len;
				995	}
				996
				997	static void __map_bio(struct dm_target ti, struct bio clone,
				998	struct dm_target_io *tio)
				999	{
				1000	int r;
				1001	sector_t sector;
				1002	struct mapped_device *md;
				1003
				1004	clone->bi_end_io = clone_endio;
				1005	clone->bi_private = tio;
				1006
				1007	/*
				1008	* Map the clone. If r == 0 we don't need to do
				1009	* anything, the target has assumed ownership of
				1010	* this io.
				1011	*/
				1012	atomic_inc(&tio->io->io_count);
				1013	sector = clone->bi_sector;
				1014	r = ti->type->map(ti, clone, &tio->info);
				1015	if (r == DM_MAPIO_REMAPPED) {
				1016	/* the bio has been remapped so dispatch it */
				1017
				1018	trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
				1019	tio->io->bio->bi_bdev->bd_dev, sector);
				1020
				1021	generic_make_request(clone);
				1022	} else if (r < 0 \|\| r == DM_MAPIO_REQUEUE) {
				1023	/* error the io and bail out, or requeue it if needed */
				1024	md = tio->io->md;
				1025	dec_pending(tio->io, r);
				1026	/*
				1027	* Store bio_set for cleanup.
				1028	*/
				1029	clone->bi_end_io = NULL;
				1030	clone->bi_private = md->bs;
				1031	bio_put(clone);
				1032	free_tio(md, tio);
				1033	} else if (r) {
				1034	DMWARN("unimplemented target map return value: %d", r);
				1035	BUG();
				1036	}
				1037	}
				1038
				1039	struct clone_info {
				1040	struct mapped_device *md;
				1041	struct dm_table *map;
				1042	struct bio *bio;
				1043	struct dm_io *io;
				1044	sector_t sector;
				1045	sector_t sector_count;
				1046	unsigned short idx;
				1047	};
				1048
				1049	static void dm_bio_destructor(struct bio *bio)
				1050	{
				1051	struct bio_set *bs = bio->bi_private;
				1052
				1053	bio_free(bio, bs);
				1054	}
				1055
				1056	/*
				1057	* Creates a little bio that just does part of a bvec.
				1058	*/
				1059	static struct bio split_bvec(struct bio bio, sector_t sector,
				1060	unsigned short idx, unsigned int offset,
				1061	unsigned int len, struct bio_set *bs)
				1062	{
				1063	struct bio *clone;
				1064	struct bio_vec *bv = bio->bi_io_vec + idx;
				1065
				1066	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
				1067	clone->bi_destructor = dm_bio_destructor;
				1068	clone->bi_io_vec = bv;
				1069
				1070	clone->bi_sector = sector;
				1071	clone->bi_bdev = bio->bi_bdev;
				1072	clone->bi_rw = bio->bi_rw;
				1073	clone->bi_vcnt = 1;
				1074	clone->bi_size = to_bytes(len);
				1075	clone->bi_io_vec->bv_offset = offset;
				1076	clone->bi_io_vec->bv_len = clone->bi_size;
				1077	clone->bi_flags \|= 1 << BIO_CLONED;
				1078
				1079	if (bio_integrity(bio)) {
				1080	bio_integrity_clone(clone, bio, GFP_NOIO, bs);
				1081	bio_integrity_trim(clone,
				1082	bio_sector_offset(bio, idx, offset), len);
				1083	}
				1084
				1085	return clone;
				1086	}
				1087
				1088	/*
				1089	* Creates a bio that consists of range of complete bvecs.
				1090	*/
				1091	static struct bio clone_bio(struct bio bio, sector_t sector,
				1092	unsigned short idx, unsigned short bv_count,
				1093	unsigned int len, struct bio_set *bs)
				1094	{
				1095	struct bio *clone;
				1096
				1097	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
				1098	__bio_clone(clone, bio);
				1099	clone->bi_destructor = dm_bio_destructor;
				1100	clone->bi_sector = sector;
				1101	clone->bi_idx = idx;
				1102	clone->bi_vcnt = idx + bv_count;
				1103	clone->bi_size = to_bytes(len);
				1104	clone->bi_flags &= ~(1 << BIO_SEG_VALID);
				1105
				1106	if (bio_integrity(bio)) {
				1107	bio_integrity_clone(clone, bio, GFP_NOIO, bs);
				1108
				1109	if (idx != bio->bi_idx \|\| clone->bi_size < bio->bi_size)
				1110	bio_integrity_trim(clone,
				1111	bio_sector_offset(bio, idx, 0), len);
				1112	}
				1113
				1114	return clone;
				1115	}
				1116
				1117	static struct dm_target_io alloc_tio(struct clone_info ci,
				1118	struct dm_target *ti)
				1119	{
				1120	struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
				1121
				1122	tio->io = ci->io;
				1123	tio->ti = ti;
				1124	memset(&tio->info, 0, sizeof(tio->info));
				1125
				1126	return tio;
				1127	}
				1128
				1129	static void __issue_target_request(struct clone_info ci, struct dm_target ti,
				1130	unsigned request_nr, sector_t len)
				1131	{
				1132	struct dm_target_io *tio = alloc_tio(ci, ti);
				1133	struct bio *clone;
				1134
				1135	tio->info.target_request_nr = request_nr;
				1136
				1137	/*
				1138	* Discard requests require the bio's inline iovecs be initialized.
				1139	* ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
				1140	* and discard, so no need for concern about wasted bvec allocations.
				1141	*/
				1142	clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
				1143	__bio_clone(clone, ci->bio);
				1144	clone->bi_destructor = dm_bio_destructor;
				1145	if (len) {
				1146	clone->bi_sector = ci->sector;
				1147	clone->bi_size = to_bytes(len);
				1148	}
				1149
				1150	__map_bio(ti, clone, tio);
				1151	}
				1152
				1153	static void __issue_target_requests(struct clone_info ci, struct dm_target ti,
				1154	unsigned num_requests, sector_t len)
				1155	{
				1156	unsigned request_nr;
				1157
				1158	for (request_nr = 0; request_nr < num_requests; request_nr++)
				1159	__issue_target_request(ci, ti, request_nr, len);
				1160	}
				1161
				1162	static int __clone_and_map_empty_flush(struct clone_info *ci)
				1163	{
				1164	unsigned target_nr = 0;
				1165	struct dm_target *ti;
				1166
				1167	BUG_ON(bio_has_data(ci->bio));
				1168	while ((ti = dm_table_get_target(ci->map, target_nr++)))
				1169	__issue_target_requests(ci, ti, ti->num_flush_requests, 0);
				1170
				1171	return 0;
				1172	}
				1173
				1174	/*
				1175	* Perform all io with a single clone.
				1176	*/
				1177	static void __clone_and_map_simple(struct clone_info ci, struct dm_target ti)
				1178	{
				1179	struct bio clone, bio = ci->bio;
				1180	struct dm_target_io *tio;
				1181
				1182	tio = alloc_tio(ci, ti);
				1183	clone = clone_bio(bio, ci->sector, ci->idx,
				1184	bio->bi_vcnt - ci->idx, ci->sector_count,
				1185	ci->md->bs);
				1186	__map_bio(ti, clone, tio);
				1187	ci->sector_count = 0;
				1188	}
				1189
				1190	static int __clone_and_map_discard(struct clone_info *ci)
				1191	{
				1192	struct dm_target *ti;
				1193	sector_t len;
				1194
				1195	do {
				1196	ti = dm_table_find_target(ci->map, ci->sector);
				1197	if (!dm_target_is_valid(ti))
				1198	return -EIO;
				1199
				1200	/*
				1201	* Even though the device advertised discard support,
				1202	* that does not mean every target supports it, and
				1203	* reconfiguration might also have changed that since the
				1204	* check was performed.
				1205	*/
				1206	if (!ti->num_discard_requests)
				1207	return -EOPNOTSUPP;
				1208
				1209	len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
				1210
				1211	__issue_target_requests(ci, ti, ti->num_discard_requests, len);
				1212
				1213	ci->sector += len;
				1214	} while (ci->sector_count -= len);
				1215
				1216	return 0;
				1217	}
				1218
				1219	static int __clone_and_map(struct clone_info *ci)
				1220	{
				1221	struct bio clone, bio = ci->bio;
				1222	struct dm_target *ti;
				1223	sector_t len = 0, max;
				1224	struct dm_target_io *tio;
				1225
				1226	if (unlikely(bio->bi_rw & REQ_DISCARD))
				1227	return __clone_and_map_discard(ci);
				1228
				1229	ti = dm_table_find_target(ci->map, ci->sector);
				1230	if (!dm_target_is_valid(ti))
				1231	return -EIO;
				1232
				1233	max = max_io_len(ci->sector, ti);
				1234
				1235	if (ci->sector_count <= max) {
				1236	/*
				1237	* Optimise for the simple case where we can do all of
				1238	* the remaining io with a single clone.
				1239	*/
				1240	__clone_and_map_simple(ci, ti);
				1241
				1242	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
				1243	/*
				1244	* There are some bvecs that don't span targets.
				1245	* Do as many of these as possible.
				1246	*/
				1247	int i;
				1248	sector_t remaining = max;
				1249	sector_t bv_len;
				1250
				1251	for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
				1252	bv_len = to_sector(bio->bi_io_vec[i].bv_len);
				1253
				1254	if (bv_len > remaining)
				1255	break;
				1256
				1257	remaining -= bv_len;
				1258	len += bv_len;
				1259	}
				1260
				1261	tio = alloc_tio(ci, ti);
				1262	clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
				1263	ci->md->bs);
				1264	__map_bio(ti, clone, tio);
				1265
				1266	ci->sector += len;
				1267	ci->sector_count -= len;
				1268	ci->idx = i;
				1269
				1270	} else {
				1271	/*
				1272	* Handle a bvec that must be split between two or more targets.
				1273	*/
				1274	struct bio_vec *bv = bio->bi_io_vec + ci->idx;
				1275	sector_t remaining = to_sector(bv->bv_len);
				1276	unsigned int offset = 0;
				1277
				1278	do {
				1279	if (offset) {
				1280	ti = dm_table_find_target(ci->map, ci->sector);
				1281	if (!dm_target_is_valid(ti))
				1282	return -EIO;
				1283
				1284	max = max_io_len(ci->sector, ti);
				1285	}
				1286
				1287	len = min(remaining, max);
				1288
				1289	tio = alloc_tio(ci, ti);
				1290	clone = split_bvec(bio, ci->sector, ci->idx,
				1291	bv->bv_offset + offset, len,
				1292	ci->md->bs);
				1293
				1294	__map_bio(ti, clone, tio);
				1295
				1296	ci->sector += len;
				1297	ci->sector_count -= len;
				1298	offset += to_bytes(len);
				1299	} while (remaining -= len);
				1300
				1301	ci->idx++;
				1302	}
				1303
				1304	return 0;
				1305	}
				1306
				1307	/*
				1308	* Split the bio into several clones and submit it to targets.
				1309	*/
				1310	static void __split_and_process_bio(struct mapped_device md, struct bio bio)
				1311	{
				1312	struct clone_info ci;
				1313	int error = 0;
				1314
				1315	ci.map = dm_get_live_table(md);
				1316	if (unlikely(!ci.map)) {
				1317	bio_io_error(bio);
				1318	return;
				1319	}
				1320
				1321	ci.md = md;
				1322	ci.io = alloc_io(md);
				1323	ci.io->error = 0;
				1324	atomic_set(&ci.io->io_count, 1);
				1325	ci.io->bio = bio;
				1326	ci.io->md = md;
				1327	spin_lock_init(&ci.io->endio_lock);
				1328	ci.sector = bio->bi_sector;
				1329	ci.idx = bio->bi_idx;
				1330
				1331	start_io_acct(ci.io);
				1332	if (bio->bi_rw & REQ_FLUSH) {
				1333	ci.bio = &ci.md->flush_bio;
				1334	ci.sector_count = 0;
				1335	error = __clone_and_map_empty_flush(&ci);
				1336	/* dec_pending submits any data associated with flush */
				1337	} else {
				1338	ci.bio = bio;
				1339	ci.sector_count = bio_sectors(bio);
				1340	while (ci.sector_count && !error)
				1341	error = __clone_and_map(&ci);
				1342	}
				1343
				1344	/* drop the extra reference count */
				1345	dec_pending(ci.io, error);
				1346	dm_table_put(ci.map);
				1347	}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/
				1351
				1352	static int dm_merge_bvec(struct request_queue *q,
				1353	struct bvec_merge_data *bvm,
				1354	struct bio_vec *biovec)
				1355	{
				1356	struct mapped_device *md = q->queuedata;
				1357	struct dm_table *map = dm_get_live_table(md);
				1358	struct dm_target *ti;
				1359	sector_t max_sectors;
				1360	int max_size = 0;
				1361
				1362	if (unlikely(!map))
				1363	goto out;
				1364
				1365	ti = dm_table_find_target(map, bvm->bi_sector);
				1366	if (!dm_target_is_valid(ti))
				1367	goto out_table;
				1368
				1369	/*
				1370	* Find maximum amount of I/O that won't need splitting
				1371	*/
				1372	max_sectors = min(max_io_len(bvm->bi_sector, ti),
				1373	(sector_t) BIO_MAX_SECTORS);
				1374	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
				1375	if (max_size < 0)
				1376	max_size = 0;
				1377
				1378	/*
				1379	* merge_bvec_fn() returns number of bytes
				1380	* it can accept at this offset
				1381	* max is precomputed maximal io size
				1382	*/
				1383	if (max_size && ti->type->merge)
				1384	max_size = ti->type->merge(ti, bvm, biovec, max_size);
				1385	/*
				1386	* If the target doesn't support merge method and some of the devices
				1387	* provided their merge_bvec method (we know this by looking at
				1388	* queue_max_hw_sectors), then we can't allow bios with multiple vector
				1389	* entries. So always set max_size to 0, and the code below allows
				1390	* just one page.
				1391	*/
				1392	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
				1393
				1394	max_size = 0;
				1395
				1396	out_table:
				1397	dm_table_put(map);
				1398
				1399	out:
				1400	/*
				1401	* Always allow an entire first page
				1402	*/
				1403	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
				1404	max_size = biovec->bv_len;
				1405
				1406	return max_size;
				1407	}
				1408
				1409	/*
				1410	* The request function that just remaps the bio built up by
				1411	* dm_merge_bvec.
				1412	*/
				1413	static void _dm_request(struct request_queue q, struct bio bio)
				1414	{
				1415	int rw = bio_data_dir(bio);
				1416	struct mapped_device *md = q->queuedata;
				1417	int cpu;
				1418
				1419	down_read(&md->io_lock);
				1420
				1421	cpu = part_stat_lock();
				1422	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
				1423	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
				1424	part_stat_unlock();
				1425
				1426	/* if we're suspended, we have to queue this io for later */
				1427	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
				1428	up_read(&md->io_lock);
				1429
				1430	if (bio_rw(bio) != READA)
				1431	queue_io(md, bio);
				1432	else
				1433	bio_io_error(bio);
				1434	return;
				1435	}
				1436
				1437	__split_and_process_bio(md, bio);
				1438	up_read(&md->io_lock);
				1439	return;
				1440	}
				1441
				1442	static int dm_request_based(struct mapped_device *md)
				1443	{
				1444	return blk_queue_stackable(md->queue);
				1445	}
				1446
				1447	static void dm_request(struct request_queue q, struct bio bio)
				1448	{
				1449	struct mapped_device *md = q->queuedata;
				1450
				1451	if (dm_request_based(md))
				1452	blk_queue_bio(q, bio);
				1453	else
				1454	_dm_request(q, bio);
				1455	}
				1456
				1457	void dm_dispatch_request(struct request *rq)
				1458	{
				1459	int r;
				1460
				1461	if (blk_queue_io_stat(rq->q))
				1462	rq->cmd_flags \|= REQ_IO_STAT;
				1463
				1464	rq->start_time = jiffies;
				1465	r = blk_insert_cloned_request(rq->q, rq);
				1466	if (r)
				1467	dm_complete_request(rq, r);
				1468	}
				1469	EXPORT_SYMBOL_GPL(dm_dispatch_request);
				1470
				1471	static void dm_rq_bio_destructor(struct bio *bio)
				1472	{
				1473	struct dm_rq_clone_bio_info *info = bio->bi_private;
				1474	struct mapped_device *md = info->tio->md;
				1475
				1476	free_bio_info(info);
				1477	bio_free(bio, md->bs);
				1478	}
				1479
				1480	static int dm_rq_bio_constructor(struct bio bio, struct bio bio_orig,
				1481	void *data)
				1482	{
				1483	struct dm_rq_target_io *tio = data;
				1484	struct mapped_device *md = tio->md;
				1485	struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
				1486
				1487	if (!info)
				1488	return -ENOMEM;
				1489
				1490	info->orig = bio_orig;
				1491	info->tio = tio;
				1492	bio->bi_end_io = end_clone_bio;
				1493	bio->bi_private = info;
				1494	bio->bi_destructor = dm_rq_bio_destructor;
				1495
				1496	return 0;
				1497	}
				1498
				1499	static int setup_clone(struct request clone, struct request rq,
				1500	struct dm_rq_target_io *tio)
				1501	{
				1502	int r;
				1503
				1504	r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
				1505	dm_rq_bio_constructor, tio);
				1506	if (r)
				1507	return r;
				1508
				1509	clone->cmd = rq->cmd;
				1510	clone->cmd_len = rq->cmd_len;
				1511	clone->sense = rq->sense;
				1512	clone->buffer = rq->buffer;
				1513	clone->end_io = end_clone_request;
				1514	clone->end_io_data = tio;
				1515
				1516	return 0;
				1517	}
				1518
				1519	static struct request clone_rq(struct request rq, struct mapped_device *md,
				1520	gfp_t gfp_mask)
				1521	{
				1522	struct request *clone;
				1523	struct dm_rq_target_io *tio;
				1524
				1525	tio = alloc_rq_tio(md, gfp_mask);
				1526	if (!tio)
				1527	return NULL;
				1528
				1529	tio->md = md;
				1530	tio->ti = NULL;
				1531	tio->orig = rq;
				1532	tio->error = 0;
				1533	memset(&tio->info, 0, sizeof(tio->info));
				1534
				1535	clone = &tio->clone;
				1536	if (setup_clone(clone, rq, tio)) {
				1537	/* -ENOMEM */
				1538	free_rq_tio(tio);
				1539	return NULL;
				1540	}
				1541
				1542	return clone;
				1543	}
				1544
				1545	/*
				1546	* Called with the queue lock held.
				1547	*/
				1548	static int dm_prep_fn(struct request_queue q, struct request rq)
				1549	{
				1550	struct mapped_device *md = q->queuedata;
				1551	struct request *clone;
				1552
				1553	if (unlikely(rq->special)) {
				1554	DMWARN("Already has something in rq->special.");
				1555	return BLKPREP_KILL;
				1556	}
				1557
				1558	clone = clone_rq(rq, md, GFP_ATOMIC);
				1559	if (!clone)
				1560	return BLKPREP_DEFER;
				1561
				1562	rq->special = clone;
				1563	rq->cmd_flags \|= REQ_DONTPREP;
				1564
				1565	return BLKPREP_OK;
				1566	}
				1567
				1568	/*
				1569	* Returns:
				1570	* 0 : the request has been processed (not requeued)
				1571	* !0 : the request has been requeued
				1572	*/
				1573	static int map_request(struct dm_target ti, struct request clone,
				1574	struct mapped_device *md)
				1575	{
				1576	int r, requeued = 0;
				1577	struct dm_rq_target_io *tio = clone->end_io_data;
				1578
				1579	tio->ti = ti;
				1580	r = ti->type->map_rq(ti, clone, &tio->info);
				1581	switch (r) {
				1582	case DM_MAPIO_SUBMITTED:
				1583	/* The target has taken the I/O to submit by itself later */
				1584	break;
				1585	case DM_MAPIO_REMAPPED:
				1586	/* The target has remapped the I/O so dispatch it */
				1587	trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
				1588	blk_rq_pos(tio->orig));
				1589	dm_dispatch_request(clone);
				1590	break;
				1591	case DM_MAPIO_REQUEUE:
				1592	/* The target wants to requeue the I/O */
				1593	dm_requeue_unmapped_request(clone);
				1594	requeued = 1;
				1595	break;
				1596	default:
				1597	if (r > 0) {
				1598	DMWARN("unimplemented target map return value: %d", r);
				1599	BUG();
				1600	}
				1601
				1602	/* The target wants to complete the I/O */
				1603	dm_kill_unmapped_request(clone, r);
				1604	break;
				1605	}
				1606
				1607	return requeued;
				1608	}
				1609
				1610	static struct request dm_start_request(struct mapped_device md, struct request *orig)
				1611	{
				1612	struct request *clone;
				1613
				1614	blk_start_request(orig);
				1615	clone = orig->special;
				1616	atomic_inc(&md->pending[rq_data_dir(clone)]);
				1617
				1618	/*
				1619	* Hold the md reference here for the in-flight I/O.
				1620	* We can't rely on the reference count by device opener,
				1621	* because the device may be closed during the request completion
				1622	* when all bios are completed.
				1623	* See the comment in rq_completed() too.
				1624	*/
				1625	dm_get(md);
				1626
				1627	return clone;
				1628	}
				1629
				1630	/*
				1631	* q->request_fn for request-based dm.
				1632	* Called with the queue lock held.
				1633	*/
				1634	static void dm_request_fn(struct request_queue *q)
				1635	{
				1636	struct mapped_device *md = q->queuedata;
				1637	struct dm_table *map = dm_get_live_table(md);
				1638	struct dm_target *ti;
				1639	struct request rq, clone;
				1640	sector_t pos;
				1641
				1642	/*
				1643	* For suspend, check blk_queue_stopped() and increment
				1644	* ->pending within a single queue_lock not to increment the
				1645	* number of in-flight I/Os after the queue is stopped in
				1646	* dm_suspend().
				1647	*/
				1648	while (!blk_queue_stopped(q)) {
				1649	rq = blk_peek_request(q);
				1650	if (!rq)
				1651	goto delay_and_out;
				1652
				1653	/* always use block 0 to find the target for flushes for now */
				1654	pos = 0;
				1655	if (!(rq->cmd_flags & REQ_FLUSH))
				1656	pos = blk_rq_pos(rq);
				1657
				1658	ti = dm_table_find_target(map, pos);
				1659	if (!dm_target_is_valid(ti)) {
				1660	/*
				1661	* Must perform setup, that dm_done() requires,
				1662	* before calling dm_kill_unmapped_request
				1663	*/
				1664	DMERR_LIMIT("request attempted access beyond the end of device");
				1665	clone = dm_start_request(md, rq);
				1666	dm_kill_unmapped_request(clone, -EIO);
				1667	continue;
				1668	}
				1669
				1670	if (ti->type->busy && ti->type->busy(ti))
				1671	goto delay_and_out;
				1672
				1673	clone = dm_start_request(md, rq);
				1674
				1675	spin_unlock(q->queue_lock);
				1676	if (map_request(ti, clone, md))
				1677	goto requeued;
				1678
				1679	BUG_ON_NONRT(!irqs_disabled());
				1680	spin_lock(q->queue_lock);
				1681	}
				1682
				1683	goto out;
				1684
				1685	requeued:
				1686	BUG_ON_NONRT(!irqs_disabled());
				1687	spin_lock(q->queue_lock);
				1688
				1689	delay_and_out:
				1690	blk_delay_queue(q, HZ / 10);
				1691	out:
				1692	dm_table_put(map);
				1693	}
				1694
				1695	int dm_underlying_device_busy(struct request_queue *q)
				1696	{
				1697	return blk_lld_busy(q);
				1698	}
				1699	EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
				1700
				1701	static int dm_lld_busy(struct request_queue *q)
				1702	{
				1703	int r;
				1704	struct mapped_device *md = q->queuedata;
				1705	struct dm_table *map = dm_get_live_table(md);
				1706
				1707	if (!map \|\| test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
				1708	r = 1;
				1709	else
				1710	r = dm_table_any_busy_target(map);
				1711
				1712	dm_table_put(map);
				1713
				1714	return r;
				1715	}
				1716
				1717	static int dm_any_congested(void *congested_data, int bdi_bits)
				1718	{
				1719	int r = bdi_bits;
				1720	struct mapped_device *md = congested_data;
				1721	struct dm_table *map;
				1722
				1723	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
				1724	map = dm_get_live_table(md);
				1725	if (map) {
				1726	/*
				1727	* Request-based dm cares about only own queue for
				1728	* the query about congestion status of request_queue
				1729	*/
				1730	if (dm_request_based(md))
				1731	r = md->queue->backing_dev_info.state &
				1732	bdi_bits;
				1733	else
				1734	r = dm_table_any_congested(map, bdi_bits);
				1735
				1736	dm_table_put(map);
				1737	}
				1738	}
				1739
				1740	return r;
				1741	}
				1742
/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
				1746	static void free_minor(int minor)
				1747	{
				1748	spin_lock(&_minor_lock);
				1749	idr_remove(&_minor_idr, minor);
				1750	spin_unlock(&_minor_lock);
				1751	}
				1752
				1753	/*
				1754	* See if the device with a specific minor # is free.
				1755	*/
				1756	static int specific_minor(int minor)
				1757	{
				1758	int r, m;
				1759
				1760	if (minor >= (1 << MINORBITS))
				1761	return -EINVAL;
				1762
				1763	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
				1764	if (!r)
				1765	return -ENOMEM;
				1766
				1767	spin_lock(&_minor_lock);
				1768
				1769	if (idr_find(&_minor_idr, minor)) {
				1770	r = -EBUSY;
				1771	goto out;
				1772	}
				1773
				1774	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
				1775	if (r)
				1776	goto out;
				1777
				1778	if (m != minor) {
				1779	idr_remove(&_minor_idr, m);
				1780	r = -EBUSY;
				1781	goto out;
				1782	}
				1783
				1784	out:
				1785	spin_unlock(&_minor_lock);
				1786	return r;
				1787	}
				1788
				1789	static int next_free_minor(int *minor)
				1790	{
				1791	int r, m;
				1792
				1793	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
				1794	if (!r)
				1795	return -ENOMEM;
				1796
				1797	spin_lock(&_minor_lock);
				1798
				1799	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
				1800	if (r)
				1801	goto out;
				1802
				1803	if (m >= (1 << MINORBITS)) {
				1804	idr_remove(&_minor_idr, m);
				1805	r = -ENOSPC;
				1806	goto out;
				1807	}
				1808
				1809	*minor = m;
				1810
				1811	out:
				1812	spin_unlock(&_minor_lock);
				1813	return r;
				1814	}
				1815
				1816	static const struct block_device_operations dm_blk_dops;
				1817
				1818	static void dm_wq_work(struct work_struct *work);
				1819
				1820	static void dm_init_md_queue(struct mapped_device *md)
				1821	{
				1822	/*
				1823	* Request-based dm devices cannot be stacked on top of bio-based dm
				1824	* devices. The type of this dm device has not been decided yet.
				1825	* The type is decided at the first table loading time.
				1826	* To prevent problematic device stacking, clear the queue flag
				1827	* for request stacking support until then.
				1828	*
				1829	* This queue is new, so no concurrency on the queue_flags.
				1830	*/
				1831	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
				1832
				1833	md->queue->queuedata = md;
				1834	md->queue->backing_dev_info.congested_fn = dm_any_congested;
				1835	md->queue->backing_dev_info.congested_data = md;
				1836	blk_queue_make_request(md->queue, dm_request);
				1837	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
				1838	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
				1839	}
				1840
				1841	/*
				1842	* Allocate and initialise a blank device with a given minor.
				1843	*/
				1844	static struct mapped_device *alloc_dev(int minor)
				1845	{
				1846	int r;
				1847	struct mapped_device md = kzalloc(sizeof(md), GFP_KERNEL);
				1848	void *old_md;
				1849
				1850	if (!md) {
				1851	DMWARN("unable to allocate device, out of memory.");
				1852	return NULL;
				1853	}
				1854
				1855	if (!try_module_get(THIS_MODULE))
				1856	goto bad_module_get;
				1857
				1858	/* get a minor number for the dev */
				1859	if (minor == DM_ANY_MINOR)
				1860	r = next_free_minor(&minor);
				1861	else
				1862	r = specific_minor(minor);
				1863	if (r < 0)
				1864	goto bad_minor;
				1865
				1866	md->type = DM_TYPE_NONE;
				1867	init_rwsem(&md->io_lock);
				1868	mutex_init(&md->suspend_lock);
				1869	mutex_init(&md->type_lock);
				1870	spin_lock_init(&md->deferred_lock);
				1871	rwlock_init(&md->map_lock);
				1872	atomic_set(&md->holders, 1);
				1873	atomic_set(&md->open_count, 0);
				1874	atomic_set(&md->event_nr, 0);
				1875	atomic_set(&md->uevent_seq, 0);
				1876	INIT_LIST_HEAD(&md->uevent_list);
				1877	spin_lock_init(&md->uevent_lock);
				1878
				1879	md->queue = blk_alloc_queue(GFP_KERNEL);
				1880	if (!md->queue)
				1881	goto bad_queue;
				1882
				1883	dm_init_md_queue(md);
				1884
				1885	md->disk = alloc_disk(1);
				1886	if (!md->disk)
				1887	goto bad_disk;
				1888
				1889	atomic_set(&md->pending[0], 0);
				1890	atomic_set(&md->pending[1], 0);
				1891	init_waitqueue_head(&md->wait);
				1892	INIT_WORK(&md->work, dm_wq_work);
				1893	init_waitqueue_head(&md->eventq);
				1894	init_completion(&md->kobj_holder.completion);
				1895
				1896	md->disk->major = _major;
				1897	md->disk->first_minor = minor;
				1898	md->disk->fops = &dm_blk_dops;
				1899	md->disk->queue = md->queue;
				1900	md->disk->private_data = md;
				1901	sprintf(md->disk->disk_name, "dm-%d", minor);
				1902	add_disk(md->disk);
				1903	format_dev_t(md->name, MKDEV(_major, minor));
				1904
				1905	md->wq = alloc_workqueue("kdmflush",
				1906	WQ_NON_REENTRANT \| WQ_MEM_RECLAIM, 0);
				1907	if (!md->wq)
				1908	goto bad_thread;
				1909
				1910	md->bdev = bdget_disk(md->disk, 0);
				1911	if (!md->bdev)
				1912	goto bad_bdev;
				1913
				1914	bio_init(&md->flush_bio);
				1915	md->flush_bio.bi_bdev = md->bdev;
				1916	md->flush_bio.bi_rw = WRITE_FLUSH;
				1917
				1918	/* Populate the mapping, nobody knows we exist yet */
				1919	spin_lock(&_minor_lock);
				1920	old_md = idr_replace(&_minor_idr, md, minor);
				1921	spin_unlock(&_minor_lock);
				1922
				1923	BUG_ON(old_md != MINOR_ALLOCED);
				1924
				1925	return md;
				1926
				1927	bad_bdev:
				1928	destroy_workqueue(md->wq);
				1929	bad_thread:
				1930	del_gendisk(md->disk);
				1931	put_disk(md->disk);
				1932	bad_disk:
				1933	blk_cleanup_queue(md->queue);
				1934	bad_queue:
				1935	free_minor(minor);
				1936	bad_minor:
				1937	module_put(THIS_MODULE);
				1938	bad_module_get:
				1939	kfree(md);
				1940	return NULL;
				1941	}
				1942
				1943	static void unlock_fs(struct mapped_device *md);
				1944
				1945	static void free_dev(struct mapped_device *md)
				1946	{
				1947	int minor = MINOR(disk_devt(md->disk));
				1948
				1949	unlock_fs(md);
				1950	bdput(md->bdev);
				1951	destroy_workqueue(md->wq);
				1952	if (md->tio_pool)
				1953	mempool_destroy(md->tio_pool);
				1954	if (md->io_pool)
				1955	mempool_destroy(md->io_pool);
				1956	if (md->bs)
				1957	bioset_free(md->bs);
				1958	blk_integrity_unregister(md->disk);
				1959	del_gendisk(md->disk);
				1960	free_minor(minor);
				1961
				1962	spin_lock(&_minor_lock);
				1963	md->disk->private_data = NULL;
				1964	spin_unlock(&_minor_lock);
				1965
				1966	put_disk(md->disk);
				1967	blk_cleanup_queue(md->queue);
				1968	module_put(THIS_MODULE);
				1969	kfree(md);
				1970	}
				1971
				1972	static void __bind_mempools(struct mapped_device md, struct dm_table t)
				1973	{
				1974	struct dm_md_mempools *p;
				1975
				1976	if (md->io_pool && md->tio_pool && md->bs)
				1977	/* the md already has necessary mempools */
				1978	goto out;
				1979
				1980	p = dm_table_get_md_mempools(t);
				1981	BUG_ON(!p \|\| md->io_pool \|\| md->tio_pool \|\| md->bs);
				1982
				1983	md->io_pool = p->io_pool;
				1984	p->io_pool = NULL;
				1985	md->tio_pool = p->tio_pool;
				1986	p->tio_pool = NULL;
				1987	md->bs = p->bs;
				1988	p->bs = NULL;
				1989
				1990	out:
				1991	/* mempool bind completed, now no need any mempools in the table */
				1992	dm_table_free_md_mempools(t);
				1993	}
				1994
				1995	/*
				1996	* Bind a table to the device.
				1997	*/
				1998	static void event_callback(void *context)
				1999	{
				2000	unsigned long flags;
				2001	LIST_HEAD(uevents);
				2002	struct mapped_device md = (struct mapped_device ) context;
				2003
				2004	spin_lock_irqsave(&md->uevent_lock, flags);
				2005	list_splice_init(&md->uevent_list, &uevents);
				2006	spin_unlock_irqrestore(&md->uevent_lock, flags);
				2007
				2008	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
				2009
				2010	atomic_inc(&md->event_nr);
				2011	wake_up(&md->eventq);
				2012	}
				2013
				2014	/*
				2015	* Protected by md->suspend_lock obtained by dm_swap_table().
				2016	*/
				2017	static void __set_size(struct mapped_device *md, sector_t size)
				2018	{
				2019	set_capacity(md->disk, size);
				2020
				2021	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
				2022	}
				2023
				2024	/*
				2025	* Return 1 if the queue has a compulsory merge_bvec_fn function.
				2026	*
				2027	* If this function returns 0, then the device is either a non-dm
				2028	* device without a merge_bvec_fn, or it is a dm device that is
				2029	* able to split any bios it receives that are too big.
				2030	*/
				2031	int dm_queue_merge_is_compulsory(struct request_queue *q)
				2032	{
				2033	struct mapped_device *dev_md;
				2034
				2035	if (!q->merge_bvec_fn)
				2036	return 0;
				2037
				2038	if (q->make_request_fn == dm_request) {
				2039	dev_md = q->queuedata;
				2040	if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
				2041	return 0;
				2042	}
				2043
				2044	return 1;
				2045	}
				2046
				2047	static int dm_device_merge_is_compulsory(struct dm_target *ti,
				2048	struct dm_dev *dev, sector_t start,
				2049	sector_t len, void *data)
				2050	{
				2051	struct block_device *bdev = dev->bdev;
				2052	struct request_queue *q = bdev_get_queue(bdev);
				2053
				2054	return dm_queue_merge_is_compulsory(q);
				2055	}
				2056
				2057	/*
				2058	* Return 1 if it is acceptable to ignore merge_bvec_fn based
				2059	* on the properties of the underlying devices.
				2060	*/
				2061	static int dm_table_merge_is_optional(struct dm_table *table)
				2062	{
				2063	unsigned i = 0;
				2064	struct dm_target *ti;
				2065
				2066	while (i < dm_table_get_num_targets(table)) {
				2067	ti = dm_table_get_target(table, i++);
				2068
				2069	if (ti->type->iterate_devices &&
				2070	ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
				2071	return 0;
				2072	}
				2073
				2074	return 1;
				2075	}
				2076
				2077	/*
				2078	* Returns old map, which caller must destroy.
				2079	*/
				2080	static struct dm_table __bind(struct mapped_device md, struct dm_table *t,
				2081	struct queue_limits *limits)
				2082	{
				2083	struct dm_table *old_map;
				2084	struct request_queue *q = md->queue;
				2085	sector_t size;
				2086	unsigned long flags;
				2087	int merge_is_optional;
				2088
				2089	size = dm_table_get_size(t);
				2090
				2091	/*
				2092	* Wipe any geometry if the size of the table changed.
				2093	*/
				2094	if (size != get_capacity(md->disk))
				2095	memset(&md->geometry, 0, sizeof(md->geometry));
				2096
				2097	__set_size(md, size);
				2098
				2099	dm_table_event_callback(t, event_callback, md);
				2100
				2101	/*
				2102	* The queue hasn't been stopped yet, if the old table type wasn't
				2103	* for request-based during suspension. So stop it to prevent
				2104	* I/O mapping before resume.
				2105	* This must be done before setting the queue restrictions,
				2106	* because request-based dm may be run just after the setting.
				2107	*/
				2108	if (dm_table_request_based(t) && !blk_queue_stopped(q))
				2109	stop_queue(q);
				2110
				2111	__bind_mempools(md, t);
				2112
				2113	merge_is_optional = dm_table_merge_is_optional(t);
				2114
				2115	write_lock_irqsave(&md->map_lock, flags);
				2116	old_map = md->map;
				2117	md->map = t;
				2118	md->immutable_target_type = dm_table_get_immutable_target_type(t);
				2119
				2120	dm_table_set_restrictions(t, q, limits);
				2121	if (merge_is_optional)
				2122	set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
				2123	else
				2124	clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
				2125	write_unlock_irqrestore(&md->map_lock, flags);
				2126
				2127	return old_map;
				2128	}
				2129
				2130	/*
				2131	* Returns unbound table for the caller to free.
				2132	*/
				2133	static struct dm_table __unbind(struct mapped_device md)
				2134	{
				2135	struct dm_table *map = md->map;
				2136	unsigned long flags;
				2137
				2138	if (!map)
				2139	return NULL;
				2140
				2141	dm_table_event_callback(map, NULL, NULL);
				2142	write_lock_irqsave(&md->map_lock, flags);
				2143	md->map = NULL;
				2144	write_unlock_irqrestore(&md->map_lock, flags);
				2145
				2146	return map;
				2147	}
				2148
				2149	/*
				2150	* Constructor for a new device.
				2151	*/
				2152	int dm_create(int minor, struct mapped_device **result)
				2153	{
				2154	struct mapped_device *md;
				2155
				2156	md = alloc_dev(minor);
				2157	if (!md)
				2158	return -ENXIO;
				2159
				2160	dm_sysfs_init(md);
				2161
				2162	*result = md;
				2163	return 0;
				2164	}
				2165
				2166	/*
				2167	* Functions to manage md->type.
				2168	* All are required to hold md->type_lock.
				2169	*/
				2170	void dm_lock_md_type(struct mapped_device *md)
				2171	{
				2172	mutex_lock(&md->type_lock);
				2173	}
				2174
				2175	void dm_unlock_md_type(struct mapped_device *md)
				2176	{
				2177	mutex_unlock(&md->type_lock);
				2178	}
				2179
				2180	void dm_set_md_type(struct mapped_device *md, unsigned type)
				2181	{
				2182	md->type = type;
				2183	}
				2184
				2185	unsigned dm_get_md_type(struct mapped_device *md)
				2186	{
				2187	return md->type;
				2188	}
				2189
				2190	struct target_type dm_get_immutable_target_type(struct mapped_device md)
				2191	{
				2192	return md->immutable_target_type;
				2193	}
				2194
				2195	/*
				2196	* Fully initialize a request-based queue (->elevator, ->request_fn, etc).
				2197	*/
				2198	static int dm_init_request_based_queue(struct mapped_device *md)
				2199	{
				2200	struct request_queue *q = NULL;
				2201
				2202	if (md->queue->elevator)
				2203	return 1;
				2204
				2205	/* Fully initialize the queue */
				2206	q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
				2207	if (!q)
				2208	return 0;
				2209
				2210	md->queue = q;
				2211	dm_init_md_queue(md);
				2212	blk_queue_softirq_done(md->queue, dm_softirq_done);
				2213	blk_queue_prep_rq(md->queue, dm_prep_fn);
				2214	blk_queue_lld_busy(md->queue, dm_lld_busy);
				2215
				2216	elv_register_queue(md->queue);
				2217
				2218	return 1;
				2219	}
				2220
				2221	/*
				2222	* Setup the DM device's queue based on md's type
				2223	*/
				2224	int dm_setup_md_queue(struct mapped_device *md)
				2225	{
				2226	if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
				2227	!dm_init_request_based_queue(md)) {
				2228	DMWARN("Cannot initialize queue for request-based mapped device");
				2229	return -EINVAL;
				2230	}
				2231
				2232	return 0;
				2233	}
				2234
				2235	struct mapped_device *dm_get_md(dev_t dev)
				2236	{
				2237	struct mapped_device *md;
				2238	unsigned minor = MINOR(dev);
				2239
				2240	if (MAJOR(dev) != _major \|\| minor >= (1 << MINORBITS))
				2241	return NULL;
				2242
				2243	spin_lock(&_minor_lock);
				2244
				2245	md = idr_find(&_minor_idr, minor);
				2246	if (md) {
				2247	if ((md == MINOR_ALLOCED \|\|
				2248	(MINOR(disk_devt(dm_disk(md))) != minor) \|\|
				2249	dm_deleting_md(md) \|\|
				2250	test_bit(DMF_FREEING, &md->flags))) {
				2251	md = NULL;
				2252	goto out;
				2253	}
				2254	dm_get(md);
				2255	}
				2256
				2257	out:
				2258	spin_unlock(&_minor_lock);
				2259
				2260	return md;
				2261	}
				2262	EXPORT_SYMBOL_GPL(dm_get_md);
				2263
				2264	void dm_get_mdptr(struct mapped_device md)
				2265	{
				2266	return md->interface_ptr;
				2267	}
				2268
				2269	void dm_set_mdptr(struct mapped_device md, void ptr)
				2270	{
				2271	md->interface_ptr = ptr;
				2272	}
				2273
				2274	void dm_get(struct mapped_device *md)
				2275	{
				2276	atomic_inc(&md->holders);
				2277	BUG_ON(test_bit(DMF_FREEING, &md->flags));
				2278	}
				2279
				2280	const char dm_device_name(struct mapped_device md)
				2281	{
				2282	return md->name;
				2283	}
				2284	EXPORT_SYMBOL_GPL(dm_device_name);
				2285
				2286	static void __dm_destroy(struct mapped_device *md, bool wait)
				2287	{
				2288	struct dm_table *map;
				2289
				2290	might_sleep();
				2291
				2292	spin_lock(&_minor_lock);
				2293	map = dm_get_live_table(md);
				2294	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
				2295	set_bit(DMF_FREEING, &md->flags);
				2296	spin_unlock(&_minor_lock);
				2297
				2298	/*
				2299	* Take suspend_lock so that presuspend and postsuspend methods
				2300	* do not race with internal suspend.
				2301	*/
				2302	mutex_lock(&md->suspend_lock);
				2303	if (!dm_suspended_md(md)) {
				2304	dm_table_presuspend_targets(map);
				2305	dm_table_postsuspend_targets(map);
				2306	}
				2307	mutex_unlock(&md->suspend_lock);
				2308
				2309	/*
				2310	* Rare, but there may be I/O requests still going to complete,
				2311	* for example. Wait for all references to disappear.
				2312	* No one should increment the reference count of the mapped_device,
				2313	* after the mapped_device state becomes DMF_FREEING.
				2314	*/
				2315	if (wait)
				2316	while (atomic_read(&md->holders))
				2317	msleep(1);
				2318	else if (atomic_read(&md->holders))
				2319	DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
				2320	dm_device_name(md), atomic_read(&md->holders));
				2321
				2322	dm_sysfs_exit(md);
				2323	dm_table_put(map);
				2324	dm_table_destroy(__unbind(md));
				2325	free_dev(md);
				2326	}
				2327
				2328	void dm_destroy(struct mapped_device *md)
				2329	{
				2330	__dm_destroy(md, true);
				2331	}
				2332
				2333	void dm_destroy_immediate(struct mapped_device *md)
				2334	{
				2335	__dm_destroy(md, false);
				2336	}
				2337
				2338	void dm_put(struct mapped_device *md)
				2339	{
				2340	atomic_dec(&md->holders);
				2341	}
				2342	EXPORT_SYMBOL_GPL(dm_put);
				2343
				2344	static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
				2345	{
				2346	int r = 0;
				2347	DECLARE_WAITQUEUE(wait, current);
				2348
				2349	add_wait_queue(&md->wait, &wait);
				2350
				2351	while (1) {
				2352	set_current_state(interruptible);
				2353
				2354	if (!md_in_flight(md))
				2355	break;
				2356
				2357	if (interruptible == TASK_INTERRUPTIBLE &&
				2358	signal_pending(current)) {
				2359	r = -EINTR;
				2360	break;
				2361	}
				2362
				2363	io_schedule();
				2364	}
				2365	set_current_state(TASK_RUNNING);
				2366
				2367	remove_wait_queue(&md->wait, &wait);
				2368
				2369	return r;
				2370	}
				2371
				2372	/*
				2373	* Process the deferred bios
				2374	*/
				2375	static void dm_wq_work(struct work_struct *work)
				2376	{
				2377	struct mapped_device *md = container_of(work, struct mapped_device,
				2378	work);
				2379	struct bio *c;
				2380
				2381	down_read(&md->io_lock);
				2382
				2383	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
				2384	spin_lock_irq(&md->deferred_lock);
				2385	c = bio_list_pop(&md->deferred);
				2386	spin_unlock_irq(&md->deferred_lock);
				2387
				2388	if (!c)
				2389	break;
				2390
				2391	up_read(&md->io_lock);
				2392
				2393	if (dm_request_based(md))
				2394	generic_make_request(c);
				2395	else
				2396	__split_and_process_bio(md, c);
				2397
				2398	down_read(&md->io_lock);
				2399	}
				2400
				2401	up_read(&md->io_lock);
				2402	}
				2403
				2404	static void dm_queue_flush(struct mapped_device *md)
				2405	{
				2406	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
				2407	smp_mb__after_clear_bit();
				2408	queue_work(md->wq, &md->work);
				2409	}
				2410
				2411	/*
				2412	* Swap in a new table, returning the old one for the caller to destroy.
				2413	*/
				2414	struct dm_table dm_swap_table(struct mapped_device md, struct dm_table *table)
				2415	{
				2416	struct dm_table *map = ERR_PTR(-EINVAL);
				2417	struct queue_limits limits;
				2418	int r;
				2419
				2420	mutex_lock(&md->suspend_lock);
				2421
				2422	/* device must be suspended */
				2423	if (!dm_suspended_md(md))
				2424	goto out;
				2425
				2426	r = dm_calculate_queue_limits(table, &limits);
				2427	if (r) {
				2428	map = ERR_PTR(r);
				2429	goto out;
				2430	}
				2431
				2432	map = __bind(md, table, &limits);
				2433
				2434	out:
				2435	mutex_unlock(&md->suspend_lock);
				2436	return map;
				2437	}
				2438
				2439	/*
				2440	* Functions to lock and unlock any filesystem running on the
				2441	* device.
				2442	*/
				2443	static int lock_fs(struct mapped_device *md)
				2444	{
				2445	int r;
				2446
				2447	WARN_ON(md->frozen_sb);
				2448
				2449	md->frozen_sb = freeze_bdev(md->bdev);
				2450	if (IS_ERR(md->frozen_sb)) {
				2451	r = PTR_ERR(md->frozen_sb);
				2452	md->frozen_sb = NULL;
				2453	return r;
				2454	}
				2455
				2456	set_bit(DMF_FROZEN, &md->flags);
				2457
				2458	return 0;
				2459	}
				2460
				2461	static void unlock_fs(struct mapped_device *md)
				2462	{
				2463	if (!test_bit(DMF_FROZEN, &md->flags))
				2464	return;
				2465
				2466	thaw_bdev(md->bdev, md->frozen_sb);
				2467	md->frozen_sb = NULL;
				2468	clear_bit(DMF_FROZEN, &md->flags);
				2469	}
				2470
				2471	/*
				2472	* We need to be able to change a mapping table under a mounted
				2473	* filesystem. For example we might want to move some data in
				2474	* the background. Before the table can be swapped with
				2475	* dm_bind_table, dm_suspend must be called to flush any in
				2476	* flight bios and ensure that any further io gets deferred.
				2477	*/
				2478	/*
				2479	* Suspend mechanism in request-based dm.
				2480	*
				2481	* 1. Flush all I/Os by lock_fs() if needed.
				2482	* 2. Stop dispatching any I/O by stopping the request_queue.
				2483	* 3. Wait for all in-flight I/Os to be completed or requeued.
				2484	*
				2485	* To abort suspend, start the request_queue.
				2486	*/
				2487	int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
				2488	{
				2489	struct dm_table *map = NULL;
				2490	int r = 0;
				2491	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
				2492	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
				2493
				2494	mutex_lock(&md->suspend_lock);
				2495
				2496	if (dm_suspended_md(md)) {
				2497	r = -EINVAL;
				2498	goto out_unlock;
				2499	}
				2500
				2501	map = dm_get_live_table(md);
				2502
				2503	/*
				2504	* DMF_NOFLUSH_SUSPENDING must be set before presuspend.
				2505	* This flag is cleared before dm_suspend returns.
				2506	*/
				2507	if (noflush)
				2508	set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
				2509
				2510	/* This does not get reverted if there's an error later. */
				2511	dm_table_presuspend_targets(map);
				2512
				2513	/*
				2514	* Flush I/O to the device.
				2515	* Any I/O submitted after lock_fs() may not be flushed.
				2516	* noflush takes precedence over do_lockfs.
				2517	* (lock_fs() flushes I/Os and waits for them to complete.)
				2518	*/
				2519	if (!noflush && do_lockfs) {
				2520	r = lock_fs(md);
				2521	if (r)
				2522	goto out;
				2523	}
				2524
				2525	/*
				2526	* Here we must make sure that no processes are submitting requests
				2527	* to target drivers i.e. no one may be executing
				2528	* __split_and_process_bio. This is called from dm_request and
				2529	* dm_wq_work.
				2530	*
				2531	* To get all processes out of __split_and_process_bio in dm_request,
				2532	* we take the write lock. To prevent any process from reentering
				2533	* __split_and_process_bio from dm_request and quiesce the thread
				2534	* (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
				2535	* flush_workqueue(md->wq).
				2536	*/
				2537	down_write(&md->io_lock);
				2538	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
				2539	up_write(&md->io_lock);
				2540
				2541	/*
				2542	* Stop md->queue before flushing md->wq in case request-based
				2543	* dm defers requests to md->wq from md->queue.
				2544	*/
				2545	if (dm_request_based(md))
				2546	stop_queue(md->queue);
				2547
				2548	flush_workqueue(md->wq);
				2549
				2550	/*
				2551	* At this point no more requests are entering target request routines.
				2552	* We call dm_wait_for_completion to wait for all existing requests
				2553	* to finish.
				2554	*/
				2555	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
				2556
				2557	down_write(&md->io_lock);
				2558	if (noflush)
				2559	clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
				2560	up_write(&md->io_lock);
				2561
				2562	/* were we interrupted ? */
				2563	if (r < 0) {
				2564	dm_queue_flush(md);
				2565
				2566	if (dm_request_based(md))
				2567	start_queue(md->queue);
				2568
				2569	unlock_fs(md);
				2570	goto out; /* pushback list is already flushed, so skip flush */
				2571	}
				2572
				2573	/*
				2574	* If dm_wait_for_completion returned 0, the device is completely
				2575	* quiescent now. There is no request-processing activity. All new
				2576	* requests are being added to md->deferred list.
				2577	*/
				2578
				2579	set_bit(DMF_SUSPENDED, &md->flags);
				2580
				2581	dm_table_postsuspend_targets(map);
				2582
				2583	out:
				2584	dm_table_put(map);
				2585
				2586	out_unlock:
				2587	mutex_unlock(&md->suspend_lock);
				2588	return r;
				2589	}
				2590
				2591	int dm_resume(struct mapped_device *md)
				2592	{
				2593	int r = -EINVAL;
				2594	struct dm_table *map = NULL;
				2595
				2596	mutex_lock(&md->suspend_lock);
				2597	if (!dm_suspended_md(md))
				2598	goto out;
				2599
				2600	map = dm_get_live_table(md);
				2601	if (!map \|\| !dm_table_get_size(map))
				2602	goto out;
				2603
				2604	r = dm_table_resume_targets(map);
				2605	if (r)
				2606	goto out;
				2607
				2608	dm_queue_flush(md);
				2609
				2610	/*
				2611	* Flushing deferred I/Os must be done after targets are resumed
				2612	* so that mapping of targets can work correctly.
				2613	* Request-based dm is queueing the deferred I/Os in its request_queue.
				2614	*/
				2615	if (dm_request_based(md))
				2616	start_queue(md->queue);
				2617
				2618	unlock_fs(md);
				2619
				2620	clear_bit(DMF_SUSPENDED, &md->flags);
				2621
				2622	r = 0;
				2623	out:
				2624	dm_table_put(map);
				2625	mutex_unlock(&md->suspend_lock);
				2626
				2627	return r;
				2628	}
				2629
				/*-----------------------------------------------------------------
				 * Event notification.
				 *---------------------------------------------------------------*/
				2633	int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
				2634	unsigned cookie)
				2635	{
				2636	char udev_cookie[DM_COOKIE_LENGTH];
				2637	char *envp[] = { udev_cookie, NULL };
				2638
				2639	if (!cookie)
				2640	return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
				2641	else {
				2642	snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
				2643	DM_COOKIE_ENV_VAR_NAME, cookie);
				2644	return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
				2645	action, envp);
				2646	}
				2647	}
				2648
				2649	uint32_t dm_next_uevent_seq(struct mapped_device *md)
				2650	{
				2651	return atomic_add_return(1, &md->uevent_seq);
				2652	}
				2653
				2654	uint32_t dm_get_event_nr(struct mapped_device *md)
				2655	{
				2656	return atomic_read(&md->event_nr);
				2657	}
				2658
				2659	int dm_wait_event(struct mapped_device *md, int event_nr)
				2660	{
				2661	return wait_event_interruptible(md->eventq,
				2662	(event_nr != atomic_read(&md->event_nr)));
				2663	}
				2664
				2665	void dm_uevent_add(struct mapped_device md, struct list_head elist)
				2666	{
				2667	unsigned long flags;
				2668
				2669	spin_lock_irqsave(&md->uevent_lock, flags);
				2670	list_add(elist, &md->uevent_list);
				2671	spin_unlock_irqrestore(&md->uevent_lock, flags);
				2672	}
				2673
				2674	/*
				2675	* The gendisk is only valid as long as you have a reference
				2676	* count on 'md'.
				2677	*/
				2678	struct gendisk dm_disk(struct mapped_device md)
				2679	{
				2680	return md->disk;
				2681	}
				2682
				2683	struct kobject dm_kobject(struct mapped_device md)
				2684	{
				2685	return &md->kobj_holder.kobj;
				2686	}
				2687
				2688	struct mapped_device dm_get_from_kobject(struct kobject kobj)
				2689	{
				2690	struct mapped_device *md;
				2691
				2692	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
				2693
				2694	if (test_bit(DMF_FREEING, &md->flags) \|\|
				2695	dm_deleting_md(md))
				2696	return NULL;
				2697
				2698	dm_get(md);
				2699	return md;
				2700	}
				2701
				2702	int dm_suspended_md(struct mapped_device *md)
				2703	{
				2704	return test_bit(DMF_SUSPENDED, &md->flags);
				2705	}
				2706
				2707	int dm_suspended(struct dm_target *ti)
				2708	{
				2709	return dm_suspended_md(dm_table_get_md(ti->table));
				2710	}
				2711	EXPORT_SYMBOL_GPL(dm_suspended);
				2712
				2713	int dm_noflush_suspending(struct dm_target *ti)
				2714	{
				2715	return __noflush_suspending(dm_table_get_md(ti->table));
				2716	}
				2717	EXPORT_SYMBOL_GPL(dm_noflush_suspending);
				2718
				2719	struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
				2720	{
				2721	struct dm_md_mempools pools = kmalloc(sizeof(pools), GFP_KERNEL);
				2722	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
				2723
				2724	if (!pools)
				2725	return NULL;
				2726
				2727	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
				2728	mempool_create_slab_pool(MIN_IOS, _io_cache) :
				2729	mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
				2730	if (!pools->io_pool)
				2731	goto free_pools_and_out;
				2732
				2733	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
				2734	mempool_create_slab_pool(MIN_IOS, _tio_cache) :
				2735	mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
				2736	if (!pools->tio_pool)
				2737	goto free_io_pool_and_out;
				2738
				2739	pools->bs = bioset_create(pool_size, 0);
				2740	if (!pools->bs)
				2741	goto free_tio_pool_and_out;
				2742
				2743	if (integrity && bioset_integrity_create(pools->bs, pool_size))
				2744	goto free_bioset_and_out;
				2745
				2746	return pools;
				2747
				2748	free_bioset_and_out:
				2749	bioset_free(pools->bs);
				2750
				2751	free_tio_pool_and_out:
				2752	mempool_destroy(pools->tio_pool);
				2753
				2754	free_io_pool_and_out:
				2755	mempool_destroy(pools->io_pool);
				2756
				2757	free_pools_and_out:
				2758	kfree(pools);
				2759
				2760	return NULL;
				2761	}
				2762
				2763	void dm_free_md_mempools(struct dm_md_mempools *pools)
				2764	{
				2765	if (!pools)
				2766	return;
				2767
				2768	if (pools->io_pool)
				2769	mempool_destroy(pools->io_pool);
				2770
				2771	if (pools->tio_pool)
				2772	mempool_destroy(pools->tio_pool);
				2773
				2774	if (pools->bs)
				2775	bioset_free(pools->bs);
				2776
				2777	kfree(pools);
				2778	}
				2779
				2780	static const struct block_device_operations dm_blk_dops = {
				2781	.open = dm_blk_open,
				2782	.release = dm_blk_close,
				2783	.ioctl = dm_blk_ioctl,
				2784	.getgeo = dm_blk_getgeo,
				2785	.owner = THIS_MODULE
				2786	};
				2787
				2788	EXPORT_SYMBOL(dm_get_mapinfo);
				2789
				2790	/*
				2791	* module hooks
				2792	*/
				2793	module_init(dm_init);
				2794	module_exit(dm_exit);
				2795
				2796	module_param(major, uint, 0);
				2797	MODULE_PARM_DESC(major, "The major number of the device mapper");
				2798	MODULE_DESCRIPTION(DM_NAME " driver");
				2799	MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
				2800	MODULE_LICENSE("GPL");