Blame - src/kernel/linux/v4.14/drivers/md/dm.c - T103

blob: 6e741f19a732e6156b0e1fd834bc9f2dfbc08f68 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
				3	* Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
				4	*
				5	* This file is released under the GPL.
				6	*/
				7
				8	#include "dm-core.h"
				9	#include "dm-rq.h"
				10	#include "dm-uevent.h"
				11
				12	#include <linux/init.h>
				13	#include <linux/module.h>
				14	#include <linux/mutex.h>
				15	#include <linux/sched/mm.h>
				16	#include <linux/sched/signal.h>
				17	#include <linux/blkpg.h>
				18	#include <linux/bio.h>
				19	#include <linux/mempool.h>
				20	#include <linux/dax.h>
				21	#include <linux/slab.h>
				22	#include <linux/idr.h>
				23	#include <linux/uio.h>
				24	#include <linux/hdreg.h>
				25	#include <linux/delay.h>
				26	#include <linux/wait.h>
				27	#include <linux/pr.h>
				28
				29	#define DM_MSG_PREFIX "core"
				30
				31	/*
				32	* Cookies are numeric values sent with CHANGE and REMOVE
				33	* uevents while resuming, removing or renaming the device.
				34	*/
				35	#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
				36	#define DM_COOKIE_LENGTH 24
				37
				/* Name passed to register_blkdev()/unregister_blkdev() in this file. */
				38	static const char *_name = DM_NAME;
				39
				/*
				 * local_init() copies 'major' into '_major'; when that value is 0 the
				 * number returned by register_blkdev() is stored in '_major' instead.
				 */
				40	static unsigned int major = 0;
				41	static unsigned int _major = 0;
				42
				43	static DEFINE_IDR(_minor_idr);
				44
				/* Held by the open/close and deletion-state helpers in this file. */
				45	static DEFINE_SPINLOCK(_minor_lock);
				46
				47	static void do_deferred_remove(struct work_struct *w);
				48
				/* Work item running do_deferred_remove(); queued from dm_blk_close(). */
				49	static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
				50
				/* Allocated in local_init() and destroyed in local_exit(). */
				51	static struct workqueue_struct *deferred_remove_workqueue;
				52
				/* Counter bumped, and wait queue woken, by dm_issue_global_event(). */
				53	atomic_t dm_global_event_nr = ATOMIC_INIT(0);
				54	DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
				55
				56	void dm_issue_global_event(void)
				57	{
				58	atomic_inc(&dm_global_event_nr);
				59	wake_up(&dm_global_eventq);
				60	}
				61
				62	/*
				63	* One of these is allocated per bio.
				64	*/
				65	struct dm_io {
				66	struct mapped_device *md;	/* device this io was issued against */
				67	blk_status_t status;	/* result recorded by dec_pending() */
				68	atomic_t io_count;	/* dec_pending() completes the io when this hits 0 */
				69	struct bio *bio;	/* bio that is ended or requeued on completion */
				70	unsigned long start_time;	/* jiffies sampled in start_io_acct() */
				71	spinlock_t endio_lock;	/* taken around status updates in dec_pending() */
				72	struct dm_stats_aux stats_aux;	/* handed to dm_stats_account_io() */
				73	};
				74
				/* NOTE(review): presumably a placeholder stored in _minor_idr; its users are not in this part of the file - confirm. */
				75	#define MINOR_ALLOCED ((void *)-1)
				76
				77	/*
				78	* Bits for the md->flags field.
				79	*/
				80	#define DMF_BLOCK_IO_FOR_SUSPEND 0
				81	#define DMF_SUSPENDED 1
				82	#define DMF_FROZEN 2
				/* dm_blk_open() refuses to open a device with FREEING or DELETING set. */
				83	#define DMF_FREEING 3
				84	#define DMF_DELETING 4
				/* Tested by __noflush_suspending(). */
				85	#define DMF_NOFLUSH_SUSPENDING 5
				/* Set by dm_lock_for_deletion(), cleared by dm_cancel_deferred_remove(). */
				86	#define DMF_DEFERRED_REMOVE 6
				87	#define DMF_SUSPENDED_INTERNALLY 7
				88
				/* Lower bound and initial value of dm_numa_node, see dm_get_numa_node(). */
				89	#define DM_NUMA_NODE NUMA_NO_NODE
				90	static int dm_numa_node = DM_NUMA_NODE;
				91
				92	/*
				93	* For mempools pre-allocation at the table loading time.
				94	*/
				/* Pair of pools kept together; NOTE(review): presumably io_pool backs struct dm_io and bs the bio clones - confirm. */
				95	struct dm_md_mempools {
				96	mempool_t *io_pool;
				97	struct bio_set *bs;
				98	};
				99
				/* One opened underlying device, tracked on md->table_devices. */
				100	struct table_device {
				101	struct list_head list;	/* link in md->table_devices */
				102	atomic_t count;	/* references taken by dm_get_table_device() */
				103	struct dm_dev dm_dev;	/* the part handed back to callers */
				104	};
				105
				/* Slab caches created in local_init() and destroyed in local_exit(). */
				106	static struct kmem_cache *_io_cache;
				107	static struct kmem_cache *_rq_tio_cache;
				108	static struct kmem_cache *_rq_cache;
				109
				110	/*
				111	* Bio-based DM's mempools' reserved IOs set by the user.
				112	*/
				113	#define RESERVED_BIO_BASED_IOS 16
				/* Read (and sanitised) through dm_get_reserved_bio_based_ios(). */
				114	static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
				115
				/*
				 * Read a signed module parameter and clamp it to [min, max].
				 *
				 * A value already inside the range is returned untouched.  An
				 * out-of-range value is replaced by the nearest bound, and that bound
				 * is also written back with cmpxchg() (the write-back result is
				 * deliberately ignored).
				 */
				static int __dm_get_module_param_int(int *module_param, int min, int max)
				{
					int current_val = ACCESS_ONCE(*module_param);
					int clamped;

					if (current_val < min)
						clamped = min;
					else if (current_val > max)
						clamped = max;
					else
						return current_val;

					(void)cmpxchg(module_param, current_val, clamped);

					return clamped;
				}
				136
				/*
				 * Read an unsigned module parameter, substituting 'def' when it is
				 * zero and 'max' when it exceeds 'max'.
				 *
				 * The substitute is written back with cmpxchg() and returned.  A
				 * substitute of zero is never applied: the value read is returned
				 * unchanged in that case.
				 */
				unsigned __dm_get_module_param(unsigned *module_param,
							       unsigned def, unsigned max)
				{
					unsigned current_val = ACCESS_ONCE(*module_param);
					unsigned substitute;

					if (current_val == 0)
						substitute = def;
					else if (current_val > max)
						substitute = max;
					else
						return current_val;

					if (substitute == 0)
						return current_val;

					(void)cmpxchg(module_param, current_val, substitute);

					return substitute;
				}
				155
				156	unsigned dm_get_reserved_bio_based_ios(void)
				157	{
				158	return __dm_get_module_param(&reserved_bio_based_ios,
				159	RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
				160	}
				161	EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
				162
				163	static unsigned dm_get_numa_node(void)
				164	{
				165	return __dm_get_module_param_int(&dm_numa_node,
				166	DM_NUMA_NODE, num_online_nodes() - 1);
				167	}
				168
				169	static int __init local_init(void)
				170	{
				171	int r = -ENOMEM;
				172
				173	/* allocate a slab for the dm_ios */
				174	_io_cache = KMEM_CACHE(dm_io, 0);
				175	if (!_io_cache)
				176	return r;
				177
				178	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
				179	if (!_rq_tio_cache)
				180	goto out_free_io_cache;
				181
				182	_rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
				183	__alignof__(struct request), 0, NULL);
				184	if (!_rq_cache)
				185	goto out_free_rq_tio_cache;
				186
				187	r = dm_uevent_init();
				188	if (r)
				189	goto out_free_rq_cache;
				190
				191	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
				192	if (!deferred_remove_workqueue) {
				193	r = -ENOMEM;
				194	goto out_uevent_exit;
				195	}
				196
				197	_major = major;
				198	r = register_blkdev(_major, _name);
				199	if (r < 0)
				200	goto out_free_workqueue;
				201
				202	if (!_major)
				203	_major = r;
				204
				205	return 0;
				206
				207	out_free_workqueue:
				208	destroy_workqueue(deferred_remove_workqueue);
				209	out_uevent_exit:
				210	dm_uevent_exit();
				211	out_free_rq_cache:
				212	kmem_cache_destroy(_rq_cache);
				213	out_free_rq_tio_cache:
				214	kmem_cache_destroy(_rq_tio_cache);
				215	out_free_io_cache:
				216	kmem_cache_destroy(_io_cache);
				217
				218	return r;
				219	}
				220
				/*
				 * Undo local_init(): flush pending work, destroy the deferred-remove
				 * workqueue and the slab caches, unregister the block major and shut
				 * down uevent support.
				 */
				221	static void local_exit(void)
				222	{
				223	flush_scheduled_work();
				224	destroy_workqueue(deferred_remove_workqueue);
				225
				226	kmem_cache_destroy(_rq_cache);
				227	kmem_cache_destroy(_rq_tio_cache);
				228	kmem_cache_destroy(_io_cache);
				229	unregister_blkdev(_major, _name);
				230	dm_uevent_exit();
				231
				/* forget the major that was registered */
				232	_major = 0;
				233
				234	DMINFO("cleaned up");
				235	}
				236
				/* Init hooks run in order by dm_init(); entry i pairs with _exits[i]. */
				237	static int (*_inits[])(void) __initdata = {
				238	local_init,
				239	dm_target_init,
				240	dm_linear_init,
				241	dm_stripe_init,
				242	dm_io_init,
				243	dm_kcopyd_init,
				244	dm_interface_init,
				245	dm_statistics_init,
				246	};
				247
				/* Exit hooks; dm_init() (on failure) and dm_exit() call them last-to-first. */
				248	static void (*_exits[])(void) = {
				249	local_exit,
				250	dm_target_exit,
				251	dm_linear_exit,
				252	dm_stripe_exit,
				253	dm_io_exit,
				254	dm_kcopyd_exit,
				255	dm_interface_exit,
				256	dm_statistics_exit,
				257	};
				258
				259	static int __init dm_init(void)
				260	{
				261	const int count = ARRAY_SIZE(_inits);
				262
				263	int r, i;
				264
				265	for (i = 0; i < count; i++) {
				266	r = _inits[i]();
				267	if (r)
				268	goto bad;
				269	}
				270
				271	return 0;
				272
				273	bad:
				274	while (i--)
				275	_exits[i]();
				276
				277	return r;
				278	}
				279
				280	static void __exit dm_exit(void)
				281	{
				282	int i = ARRAY_SIZE(_exits);
				283
				284	while (i--)
				285	_exits[i]();
				286
				287	/*
				288	* Should be empty by this point.
				289	*/
				290	idr_destroy(&_minor_idr);
				291	}
				292
				293	/*
				294	* Block device functions
				295	*/
				296	int dm_deleting_md(struct mapped_device *md)
				297	{
				298	return test_bit(DMF_DELETING, &md->flags);
				299	}
				300
				301	static int dm_blk_open(struct block_device *bdev, fmode_t mode)
				302	{
				303	struct mapped_device *md;
				304
				305	spin_lock(&_minor_lock);
				306
				307	md = bdev->bd_disk->private_data;
				308	if (!md)
				309	goto out;
				310
				311	if (test_bit(DMF_FREEING, &md->flags) \|\|
				312	dm_deleting_md(md)) {
				313	md = NULL;
				314	goto out;
				315	}
				316
				317	dm_get(md);
				318	atomic_inc(&md->open_count);
				319	out:
				320	spin_unlock(&_minor_lock);
				321
				322	return md ? 0 : -ENXIO;
				323	}
				324
				325	static void dm_blk_close(struct gendisk *disk, fmode_t mode)
				326	{
				327	struct mapped_device *md;
				328
				329	spin_lock(&_minor_lock);
				330
				331	md = disk->private_data;
				332	if (WARN_ON(!md))
				333	goto out;
				334
				335	if (atomic_dec_and_test(&md->open_count) &&
				336	(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
				337	queue_work(deferred_remove_workqueue, &deferred_remove_work);
				338
				339	dm_put(md);
				340	out:
				341	spin_unlock(&_minor_lock);
				342	}
				343
				344	int dm_open_count(struct mapped_device *md)
				345	{
				346	return atomic_read(&md->open_count);
				347	}
				348
				349	/*
				350	* Guarantees nothing is using the device before it's deleted.
				351	*/
				352	int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
				353	{
				354	int r = 0;
				355
				356	spin_lock(&_minor_lock);
				357
				358	if (dm_open_count(md)) {
				359	r = -EBUSY;
				360	if (mark_deferred)
				361	set_bit(DMF_DEFERRED_REMOVE, &md->flags);
				362	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
				363	r = -EEXIST;
				364	else
				365	set_bit(DMF_DELETING, &md->flags);
				366
				367	spin_unlock(&_minor_lock);
				368
				369	return r;
				370	}
				371
				372	int dm_cancel_deferred_remove(struct mapped_device *md)
				373	{
				374	int r = 0;
				375
				376	spin_lock(&_minor_lock);
				377
				378	if (test_bit(DMF_DELETING, &md->flags))
				379	r = -EBUSY;
				380	else
				381	clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
				382
				383	spin_unlock(&_minor_lock);
				384
				385	return r;
				386	}
				387
				/* Work callback for deferred_remove_work; just calls dm_deferred_remove(). */
				static void do_deferred_remove(struct work_struct *w)
				{
					dm_deferred_remove();
				}
				392
				393	sector_t dm_get_size(struct mapped_device *md)
				394	{
				395	return get_capacity(md->disk);
				396	}
				397
				398	struct request_queue dm_get_md_queue(struct mapped_device md)
				399	{
				400	return md->queue;
				401	}
				402
				403	struct dm_stats dm_get_stats(struct mapped_device md)
				404	{
				405	return &md->stats;
				406	}
				407
				408	static int dm_blk_getgeo(struct block_device bdev, struct hd_geometry geo)
				409	{
				410	struct mapped_device *md = bdev->bd_disk->private_data;
				411
				412	return dm_get_geometry(md, geo);
				413	}
				414
				415	static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
				416	struct block_device **bdev,
				417	fmode_t *mode)
				418	{
				419	struct dm_target *tgt;
				420	struct dm_table *map;
				421	int srcu_idx, r;
				422
				423	retry:
				424	r = -ENOTTY;
				425	map = dm_get_live_table(md, &srcu_idx);
				426	if (!map \|\| !dm_table_get_size(map))
				427	goto out;
				428
				429	/* We only support devices that have a single target */
				430	if (dm_table_get_num_targets(map) != 1)
				431	goto out;
				432
				433	tgt = dm_table_get_target(map, 0);
				434	if (!tgt->type->prepare_ioctl)
				435	goto out;
				436
				437	if (dm_suspended_md(md)) {
				438	r = -EAGAIN;
				439	goto out;
				440	}
				441
				442	r = tgt->type->prepare_ioctl(tgt, bdev, mode);
				443	if (r < 0)
				444	goto out;
				445
				446	bdgrab(*bdev);
				447	dm_put_live_table(md, srcu_idx);
				448	return r;
				449
				450	out:
				451	dm_put_live_table(md, srcu_idx);
				452	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
				453	msleep(10);
				454	goto retry;
				455	}
				456	return r;
				457	}
				458
				459	static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
				460	unsigned int cmd, unsigned long arg)
				461	{
				462	struct mapped_device *md = bdev->bd_disk->private_data;
				463	int r;
				464
				465	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
				466	if (r < 0)
				467	return r;
				468
				469	if (r > 0) {
				470	/*
				471	* Target determined this ioctl is being issued against a
				472	* subset of the parent bdev; require extra privileges.
				473	*/
				474	if (!capable(CAP_SYS_RAWIO)) {
				475	DMWARN_LIMIT(
				476	"%s: sending ioctl %x to DM device without required privilege.",
				477	current->comm, cmd);
				478	r = -ENOIOCTLCMD;
				479	goto out;
				480	}
				481	}
				482
				483	r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
				484	out:
				485	bdput(bdev);
				486	return r;
				487	}
				488
				489	static struct dm_io alloc_io(struct mapped_device md)
				490	{
				491	return mempool_alloc(md->io_pool, GFP_NOIO);
				492	}
				493
				494	static void free_io(struct mapped_device md, struct dm_io io)
				495	{
				496	mempool_free(io, md->io_pool);
				497	}
				498
				499	static void free_tio(struct dm_target_io *tio)
				500	{
				501	bio_put(&tio->clone);
				502	}
				503
				504	int md_in_flight(struct mapped_device *md)
				505	{
				506	return atomic_read(&md->pending[READ]) +
				507	atomic_read(&md->pending[WRITE]);
				508	}
				509
				510	static void start_io_acct(struct dm_io *io)
				511	{
				512	struct mapped_device *md = io->md;
				513	struct bio *bio = io->bio;
				514	int cpu;
				515	int rw = bio_data_dir(bio);
				516
				517	io->start_time = jiffies;
				518
				519	cpu = part_stat_lock();
				520	part_round_stats(md->queue, cpu, &dm_disk(md)->part0);
				521	part_stat_unlock();
				522	atomic_set(&dm_disk(md)->part0.in_flight[rw],
				523	atomic_inc_return(&md->pending[rw]));
				524
				525	if (unlikely(dm_stats_used(&md->stats)))
				526	dm_stats_account_io(&md->stats, bio_data_dir(bio),
				527	bio->bi_iter.bi_sector, bio_sectors(bio),
				528	false, 0, &io->stats_aux);
				529	}
				530
				531	static void end_io_acct(struct dm_io *io)
				532	{
				533	struct mapped_device *md = io->md;
				534	struct bio *bio = io->bio;
				535	unsigned long duration = jiffies - io->start_time;
				536	int pending;
				537	int rw = bio_data_dir(bio);
				538
				539	generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
				540
				541	if (unlikely(dm_stats_used(&md->stats)))
				542	dm_stats_account_io(&md->stats, bio_data_dir(bio),
				543	bio->bi_iter.bi_sector, bio_sectors(bio),
				544	true, duration, &io->stats_aux);
				545
				546	/*
				547	* After this is decremented the bio must not be touched if it is
				548	* a flush.
				549	*/
				550	pending = atomic_dec_return(&md->pending[rw]);
				551	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
				552	pending += atomic_read(&md->pending[rw^0x1]);
				553
				554	/* nudge anyone waiting on suspend queue */
				555	if (!pending)
				556	wake_up(&md->wait);
				557	}
				558
				559	/*
				560	* Add the bio to the list of deferred io.
				561	*/
				562	static void queue_io(struct mapped_device md, struct bio bio)
				563	{
				564	unsigned long flags;
				565
				566	spin_lock_irqsave(&md->deferred_lock, flags);
				567	bio_list_add(&md->deferred, bio);
				568	spin_unlock_irqrestore(&md->deferred_lock, flags);
				569	queue_work(md->wq, &md->work);
				570	}
				571
				572	/*
				573	* Everyone (including functions in this file), should use this
				574	* function to access the md->map field, and make sure they call
				575	* dm_put_live_table() when finished.
				576	*/
				577	struct dm_table dm_get_live_table(struct mapped_device md, int *srcu_idx) __acquires(md->io_barrier)
				578	{
				579	*srcu_idx = srcu_read_lock(&md->io_barrier);
				580
				581	return srcu_dereference(md->map, &md->io_barrier);
				582	}
				583
				584	void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
				585	{
				586	srcu_read_unlock(&md->io_barrier, srcu_idx);
				587	}
				588
				589	void dm_sync_table(struct mapped_device *md)
				590	{
				591	synchronize_srcu(&md->io_barrier);
				592	synchronize_rcu_expedited();
				593	}
				594
				595	/*
				596	* A fast alternative to dm_get_live_table/dm_put_live_table.
				597	* The caller must not block between these two functions.
				598	*/
				599	static struct dm_table dm_get_live_table_fast(struct mapped_device md) __acquires(RCU)
				600	{
				601	rcu_read_lock();
				602	return rcu_dereference(md->map);
				603	}
				604
				605	static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
				606	{
				607	rcu_read_unlock();
				608	}
				609
				610	/*
				611	* Open a table device so we can use it as a map destination.
				612	*/
				613	static int open_table_device(struct table_device *td, dev_t dev,
				614	struct mapped_device *md)
				615	{
				616	static char *_claim_ptr = "I belong to device-mapper";
				617	struct block_device *bdev;
				618
				619	int r;
				620
				621	BUG_ON(td->dm_dev.bdev);
				622
				623	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode \| FMODE_EXCL, _claim_ptr);
				624	if (IS_ERR(bdev))
				625	return PTR_ERR(bdev);
				626
				627	r = bd_link_disk_holder(bdev, dm_disk(md));
				628	if (r) {
				629	blkdev_put(bdev, td->dm_dev.mode \| FMODE_EXCL);
				630	return r;
				631	}
				632
				633	td->dm_dev.bdev = bdev;
				634	td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
				635	return 0;
				636	}
				637
				638	/*
				639	* Close a table device that we've been using.
				640	*/
				641	static void close_table_device(struct table_device td, struct mapped_device md)
				642	{
				643	if (!td->dm_dev.bdev)
				644	return;
				645
				646	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
				647	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode \| FMODE_EXCL);
				648	put_dax(td->dm_dev.dax_dev);
				649	td->dm_dev.bdev = NULL;
				650	td->dm_dev.dax_dev = NULL;
				651	}
				652
				653	static struct table_device find_table_device(struct list_head l, dev_t dev,
				654	fmode_t mode) {
				655	struct table_device *td;
				656
				657	list_for_each_entry(td, l, list)
				658	if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
				659	return td;
				660
				661	return NULL;
				662	}
				663
				664	int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
				665	struct dm_dev **result) {
				666	int r;
				667	struct table_device *td;
				668
				669	mutex_lock(&md->table_devices_lock);
				670	td = find_table_device(&md->table_devices, dev, mode);
				671	if (!td) {
				672	td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
				673	if (!td) {
				674	mutex_unlock(&md->table_devices_lock);
				675	return -ENOMEM;
				676	}
				677
				678	td->dm_dev.mode = mode;
				679	td->dm_dev.bdev = NULL;
				680
				681	if ((r = open_table_device(td, dev, md))) {
				682	mutex_unlock(&md->table_devices_lock);
				683	kfree(td);
				684	return r;
				685	}
				686
				687	format_dev_t(td->dm_dev.name, dev);
				688
				689	atomic_set(&td->count, 0);
				690	list_add(&td->list, &md->table_devices);
				691	}
				692	atomic_inc(&td->count);
				693	mutex_unlock(&md->table_devices_lock);
				694
				695	*result = &td->dm_dev;
				696	return 0;
				697	}
				698	EXPORT_SYMBOL_GPL(dm_get_table_device);
				699
				700	void dm_put_table_device(struct mapped_device md, struct dm_dev d)
				701	{
				702	struct table_device *td = container_of(d, struct table_device, dm_dev);
				703
				704	mutex_lock(&md->table_devices_lock);
				705	if (atomic_dec_and_test(&td->count)) {
				706	close_table_device(td, md);
				707	list_del(&td->list);
				708	kfree(td);
				709	}
				710	mutex_unlock(&md->table_devices_lock);
				711	}
				712	EXPORT_SYMBOL(dm_put_table_device);
				713
				714	static void free_table_devices(struct list_head *devices)
				715	{
				716	struct list_head tmp, next;
				717
				718	list_for_each_safe(tmp, next, devices) {
				719	struct table_device *td = list_entry(tmp, struct table_device, list);
				720
				721	DMWARN("dm_destroy: %s still exists with %d references",
				722	td->dm_dev.name, atomic_read(&td->count));
				723	kfree(td);
				724	}
				725	}
				726
				727	/*
				728	* Get the geometry associated with a dm device
				729	*/
				730	int dm_get_geometry(struct mapped_device md, struct hd_geometry geo)
				731	{
				732	*geo = md->geometry;
				733
				734	return 0;
				735	}
				736
				737	/*
				738	* Set the geometry of a device.
				739	*/
				740	int dm_set_geometry(struct mapped_device md, struct hd_geometry geo)
				741	{
				742	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
				743
				744	if (geo->start > sz) {
				745	DMWARN("Start sector is beyond the geometry limits.");
				746	return -EINVAL;
				747	}
				748
				749	md->geometry = *geo;
				750
				751	return 0;
				752	}
				753
				754	/*-----------------------------------------------------------------
				755	* CRUD START:
				756	* A more elegant soln is in the works that uses the queue
				757	* merge fn, unfortunately there are a couple of changes to
				758	* the block layer that I want to make for this. So in the
				759	* interests of getting something for people to use I give
				760	* you this clearly demarcated crap.
				761	*---------------------------------------------------------------*/
				762
				763	static int __noflush_suspending(struct mapped_device *md)
				764	{
				765	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
				766	}
				767
				768	/*
				769	* Decrements the number of outstanding ios that a bio has been
				770	* cloned into, completing the original io if necc.
				771	*/
				/*
				 * io:    the dm_io whose outstanding count is dropped by one.
				 * error: status reported for the part that just finished; 0 means
				 *        no error and leaves io->status untouched.
				 */
				772	static void dec_pending(struct dm_io *io, blk_status_t error)
				773	{
				774	unsigned long flags;
				775	blk_status_t io_error;
				776	struct bio *bio;
				777	struct mapped_device *md = io->md;
				778
				779	/* Push-back supersedes any I/O errors */
				780	if (unlikely(error)) {
				781	spin_lock_irqsave(&io->endio_lock, flags);
				/* keep an already recorded requeue while a noflush suspend is in progress */
				782	if (!(io->status == BLK_STS_DM_REQUEUE &&
				783	__noflush_suspending(md)))
				784	io->status = error;
				785	spin_unlock_irqrestore(&io->endio_lock, flags);
				786	}
				787
				/* only the final decrement completes the io */
				788	if (atomic_dec_and_test(&io->io_count)) {
				789	if (io->status == BLK_STS_DM_REQUEUE) {
				790	/*
				791	* Target requested pushing back the I/O.
				792	*/
				793	spin_lock_irqsave(&md->deferred_lock, flags);
				794	if (__noflush_suspending(md))
				795	bio_list_add_head(&md->deferred, io->bio);
				796	else
				797	/* noflush suspend was interrupted. */
				798	io->status = BLK_STS_IOERR;
				799	spin_unlock_irqrestore(&md->deferred_lock, flags);
				800	}
				801
				/* copy what is still needed out of 'io' before it is freed */
				802	io_error = io->status;
				803	bio = io->bio;
				804	end_io_acct(io);
				805	free_io(md, io);
				806
				/* requeued above: the bio now sits on md->deferred, do not end it */
				807	if (io_error == BLK_STS_DM_REQUEUE)
				808	return;
				809
				810	if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
				811	/*
				812	* Preflush done for flush with data, reissue
				813	* without REQ_PREFLUSH.
				814	*/
				815	bio->bi_opf &= ~REQ_PREFLUSH;
				816	queue_io(md, bio);
				817	} else {
				818	/* done with normal IO or empty flush */
				819	if (io_error)
				820	bio->bi_status = io_error;
				821	bio_endio(bio);
				822	}
				823	}
				824	}
				825
				826	void disable_write_same(struct mapped_device *md)
				827	{
				828	struct queue_limits *limits = dm_get_queue_limits(md);
				829
				830	/* device doesn't really support WRITE SAME, disable it */
				831	limits->max_write_same_sectors = 0;
				832	}
				833
				834	void disable_write_zeroes(struct mapped_device *md)
				835	{
				836	struct queue_limits *limits = dm_get_queue_limits(md);
				837
				838	/* device doesn't really support WRITE ZEROES, disable it */
				839	limits->max_write_zeroes_sectors = 0;
				840	}
				841
				842	static void clone_endio(struct bio *bio)
				843	{
				844	blk_status_t error = bio->bi_status;
				845	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
				846	struct dm_io *io = tio->io;
				847	struct mapped_device *md = tio->io->md;
				848	dm_endio_fn endio = tio->ti->type->end_io;
				849
				850	if (unlikely(error == BLK_STS_TARGET)) {
				851	if (bio_op(bio) == REQ_OP_WRITE_SAME &&
				852	!bio->bi_disk->queue->limits.max_write_same_sectors)
				853	disable_write_same(md);
				854	if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
				855	!bio->bi_disk->queue->limits.max_write_zeroes_sectors)
				856	disable_write_zeroes(md);
				857	}
				858
				859	if (endio) {
				860	int r = endio(tio->ti, bio, &error);
				861	switch (r) {
				862	case DM_ENDIO_REQUEUE:
				863	error = BLK_STS_DM_REQUEUE;
				864	/FALLTHRU/
				865	case DM_ENDIO_DONE:
				866	break;
				867	case DM_ENDIO_INCOMPLETE:
				868	/* The target will handle the io */
				869	return;
				870	default:
				871	DMWARN("unimplemented target endio return value: %d", r);
				872	BUG();
				873	}
				874	}
				875
				876	free_tio(tio);
				877	dec_pending(io, error);
				878	}
				879
				880	/*
				881	* Return maximum size of I/O possible at the supplied sector up to the current
				882	* target boundary.
				883	*/
				884	static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
				885	{
				886	sector_t target_offset = dm_target_offset(ti, sector);
				887
				888	return ti->len - target_offset;
				889	}
				890
				891	static sector_t max_io_len(sector_t sector, struct dm_target *ti)
				892	{
				893	sector_t len = max_io_len_target_boundary(sector, ti);
				894	sector_t offset, max_len;
				895
				896	/*
				897	* Does the target need to split even further?
				898	*/
				899	if (ti->max_io_len) {
				900	offset = dm_target_offset(ti, sector);
				901	if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
				902	max_len = sector_div(offset, ti->max_io_len);
				903	else
				904	max_len = offset & (ti->max_io_len - 1);
				905	max_len = ti->max_io_len - max_len;
				906
				907	if (len > max_len)
				908	len = max_len;
				909	}
				910
				911	return len;
				912	}
				913
				914	int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
				915	{
				916	if (len > UINT_MAX) {
				917	DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
				918	(unsigned long long)len, UINT_MAX);
				919	ti->error = "Maximum size of target IO is too large";
				920	return -EINVAL;
				921	}
				922
				923	ti->max_io_len = (uint32_t) len;
				924
				925	return 0;
				926	}
				927	EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
				928
				929	static struct dm_target dm_dax_get_live_target(struct mapped_device md,
				930	sector_t sector, int *srcu_idx)
				931	{
				932	struct dm_table *map;
				933	struct dm_target *ti;
				934
				935	map = dm_get_live_table(md, srcu_idx);
				936	if (!map)
				937	return NULL;
				938
				939	ti = dm_table_find_target(map, sector);
				940	if (!dm_target_is_valid(ti))
				941	return NULL;
				942
				943	return ti;
				944	}
				945
				946	static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
				947	long nr_pages, void *kaddr, pfn_t pfn)
				948	{
				949	struct mapped_device *md = dax_get_private(dax_dev);
				950	sector_t sector = pgoff * PAGE_SECTORS;
				951	struct dm_target *ti;
				952	long len, ret = -EIO;
				953	int srcu_idx;
				954
				955	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
				956
				957	if (!ti)
				958	goto out;
				959	if (!ti->type->direct_access)
				960	goto out;
				961	len = max_io_len(sector, ti) / PAGE_SECTORS;
				962	if (len < 1)
				963	goto out;
				964	nr_pages = min(len, nr_pages);
				965	ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
				966
				967	out:
				968	dm_put_live_table(md, srcu_idx);
				969
				970	return ret;
				971	}
				972
				973	static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
				974	void addr, size_t bytes, struct iov_iter i)
				975	{
				976	struct mapped_device *md = dax_get_private(dax_dev);
				977	sector_t sector = pgoff * PAGE_SECTORS;
				978	struct dm_target *ti;
				979	long ret = 0;
				980	int srcu_idx;
				981
				982	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
				983
				984	if (!ti)
				985	goto out;
				986	if (!ti->type->dax_copy_from_iter) {
				987	ret = copy_from_iter(addr, bytes, i);
				988	goto out;
				989	}
				990	ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
				991	out:
				992	dm_put_live_table(md, srcu_idx);
				993
				994	return ret;
				995	}
				996
				997	/*
				998	* A target may call dm_accept_partial_bio only from the map routine. It is
				999	* allowed for all bio types except REQ_PREFLUSH.
				1000	*
				1001	* dm_accept_partial_bio informs the dm that the target only wants to process
				1002	* additional n_sectors sectors of the bio and the rest of the data should be
				1003	* sent in a next bio.
				1004	*
				1005	* A diagram that explains the arithmetics:
				1006	* +--------------------+---------------+-------+
				1007	* |         1          |       2       |   3   |
				1008	* +--------------------+---------------+-------+
				1009	*
				1010	* <-------------- *tio->len_ptr --------------->
				1011	*                      <------- bi_size ------->
				1012	*                      <-- n_sectors -->
				1013	*
				1014	* Region 1 was already iterated over with bio_advance or similar function.
				1015	* (it may be empty if the target doesn't use bio_advance)
				1016	* Region 2 is the remaining bio size that the target wants to process.
				1017	* (it may be empty if region 1 is non-empty, although there is no reason
				1018	* to make it empty)
				1019	* The target requires that region 3 is to be sent in the next bio.
				1020	*
				1021	* If the target wants to receive multiple copies of the bio (via num_*bios, etc),
				1022	* the partially processed part (the sum of regions 1+2) must be the same for all
				1023	* copies of the bio.
				1024	*/
				1025	void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
				1026	{
				1027	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
				1028	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
				1029	BUG_ON(bio->bi_opf & REQ_PREFLUSH);
				1030	BUG_ON(bi_size > *tio->len_ptr);
				1031	BUG_ON(n_sectors > bi_size);
				1032	*tio->len_ptr -= bi_size - n_sectors;
				1033	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
				1034	}
				1035	EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
				1036
				1037	/*
				1038	* The zone descriptors obtained with a zone report indicate zone positions
				1039	* within the target backing device, regardless of that device is a partition
				1040	* and regardless of the target mapping start sector on the device or partition.
				1041	* The zone descriptors start sector and write pointer position must be adjusted
				1042	* to match their relative position within the dm device.
				1043	* A target may call dm_remap_zone_report() after completion of a
				1044	* REQ_OP_ZONE_REPORT bio to remap the zone descriptors obtained from the
				1045	* backing device.
				1046	*/
				1047	void dm_remap_zone_report(struct dm_target ti, struct bio bio, sector_t start)
				1048	{
				1049	#ifdef CONFIG_BLK_DEV_ZONED
				1050	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
				1051	struct bio *report_bio = tio->io->bio;
				1052	struct blk_zone_report_hdr *hdr = NULL;
				1053	struct blk_zone *zone;
				1054	unsigned int nr_rep = 0;
				1055	unsigned int ofst;
				1056	sector_t part_offset;
				1057	struct bio_vec bvec;
				1058	struct bvec_iter iter;
				1059	void *addr;
				1060
				1061	if (bio->bi_status)
				1062	return;
				1063
				1064	/*
				1065	* bio sector was incremented by the request size on completion. Taking
				1066	* into account the original request sector, the target start offset on
				1067	* the backing device and the target mapping offset (ti->begin), the
				1068	* start sector of the backing device. The partition offset is always 0
				1069	* if the target uses a whole device.
				1070	*/
				1071	part_offset = bio->bi_iter.bi_sector + ti->begin - (start + bio_end_sector(report_bio));
				1072
				1073	/*
				1074	* Remap the start sector of the reported zones. For sequential zones,
				1075	* also remap the write pointer position.
				1076	*/
				1077	bio_for_each_segment(bvec, report_bio, iter) {
				1078	addr = kmap_atomic(bvec.bv_page);
				1079
				1080	/* Remember the report header in the first page */
				1081	if (!hdr) {
				1082	hdr = addr;
				1083	ofst = sizeof(struct blk_zone_report_hdr);
				1084	} else
				1085	ofst = 0;
				1086
				1087	/* Set zones start sector */
				1088	while (hdr->nr_zones && ofst < bvec.bv_len) {
				1089	zone = addr + ofst;
				1090	zone->start -= part_offset;
				1091	if (zone->start >= start + ti->len) {
				1092	hdr->nr_zones = 0;
				1093	break;
				1094	}
				1095	zone->start = zone->start + ti->begin - start;
				1096	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
				1097	if (zone->cond == BLK_ZONE_COND_FULL)
				1098	zone->wp = zone->start + zone->len;
				1099	else if (zone->cond == BLK_ZONE_COND_EMPTY)
				1100	zone->wp = zone->start;
				1101	else
				1102	zone->wp = zone->wp + ti->begin - start - part_offset;
				1103	}
				1104	ofst += sizeof(struct blk_zone);
				1105	hdr->nr_zones--;
				1106	nr_rep++;
				1107	}
				1108
				1109	if (addr != hdr)
				1110	kunmap_atomic(addr);
				1111
				1112	if (!hdr->nr_zones)
				1113	break;
				1114	}
				1115
				1116	if (hdr) {
				1117	hdr->nr_zones = nr_rep;
				1118	kunmap_atomic(hdr);
				1119	}
				1120
				1121	bio_advance(report_bio, report_bio->bi_iter.bi_size);
				1122
				1123	#else /* !CONFIG_BLK_DEV_ZONED */
				1124	bio->bi_status = BLK_STS_NOTSUPP;
				1125	#endif
				1126	}
				1127	EXPORT_SYMBOL_GPL(dm_remap_zone_report);
				1128
				1129	/*
				1130	* Flush current->bio_list when the target map method blocks.
				1131	* This fixes deadlocks in snapshot and possibly in other targets.
				1132	*/
				1133	struct dm_offload {
				1134	struct blk_plug plug;
				1135	struct blk_plug_cb cb;
				1136	};
				1137
				1138	static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
				1139	{
				1140	struct dm_offload *o = container_of(cb, struct dm_offload, cb);
				1141	struct bio_list list;
				1142	struct bio *bio;
				1143	int i;
				1144
				1145	INIT_LIST_HEAD(&o->cb.list);
				1146
				1147	if (unlikely(!current->bio_list))
				1148	return;
				1149
				1150	for (i = 0; i < 2; i++) {
				1151	list = current->bio_list[i];
				1152	bio_list_init(&current->bio_list[i]);
				1153
				1154	while ((bio = bio_list_pop(&list))) {
				1155	struct bio_set *bs = bio->bi_pool;
				1156	if (unlikely(!bs) \|\| bs == fs_bio_set \|\|
				1157	!bs->rescue_workqueue) {
				1158	bio_list_add(&current->bio_list[i], bio);
				1159	continue;
				1160	}
				1161
				1162	spin_lock(&bs->rescue_lock);
				1163	bio_list_add(&bs->rescue_list, bio);
				1164	queue_work(bs->rescue_workqueue, &bs->rescue_work);
				1165	spin_unlock(&bs->rescue_lock);
				1166	}
				1167	}
				1168	}
				1169
				1170	static void dm_offload_start(struct dm_offload *o)
				1171	{
				1172	blk_start_plug(&o->plug);
				1173	o->cb.callback = flush_current_bio_list;
				1174	list_add(&o->cb.list, &current->plug->cb_list);
				1175	}
				1176
				1177	static void dm_offload_end(struct dm_offload *o)
				1178	{
				1179	list_del(&o->cb.list);
				1180	blk_finish_plug(&o->plug);
				1181	}
				1182
				1183	static void __map_bio(struct dm_target_io *tio)
				1184	{
				1185	int r;
				1186	sector_t sector;
				1187	struct dm_offload o;
				1188	struct bio *clone = &tio->clone;
				1189	struct dm_target *ti = tio->ti;
				1190
				1191	clone->bi_end_io = clone_endio;
				1192
				1193	/*
				1194	* Map the clone. If r == 0 we don't need to do
				1195	* anything, the target has assumed ownership of
				1196	* this io.
				1197	*/
				1198	atomic_inc(&tio->io->io_count);
				1199	sector = clone->bi_iter.bi_sector;
				1200
				1201	dm_offload_start(&o);
				1202	r = ti->type->map(ti, clone);
				1203	dm_offload_end(&o);
				1204
				1205	switch (r) {
				1206	case DM_MAPIO_SUBMITTED:
				1207	break;
				1208	case DM_MAPIO_REMAPPED:
				1209	/* the bio has been remapped so dispatch it */
				1210	trace_block_bio_remap(clone->bi_disk->queue, clone,
				1211	bio_dev(tio->io->bio), sector);
				1212	generic_make_request(clone);
				1213	break;
				1214	case DM_MAPIO_KILL:
				1215	dec_pending(tio->io, BLK_STS_IOERR);
				1216	free_tio(tio);
				1217	break;
				1218	case DM_MAPIO_REQUEUE:
				1219	dec_pending(tio->io, BLK_STS_DM_REQUEUE);
				1220	free_tio(tio);
				1221	break;
				1222	default:
				1223	DMWARN("unimplemented target map return value: %d", r);
				1224	BUG();
				1225	}
				1226	}
				1227
				1228	struct clone_info {
				1229	struct mapped_device *md;
				1230	struct dm_table *map;
				1231	struct bio *bio;
				1232	struct dm_io *io;
				1233	sector_t sector;
				1234	unsigned sector_count;
				1235	};
				1236
				1237	static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
				1238	{
				1239	bio->bi_iter.bi_sector = sector;
				1240	bio->bi_iter.bi_size = to_bytes(len);
				1241	}
				1242
				1243	/*
				1244	* Creates a bio that consists of range of complete bvecs.
				1245	*/
				1246	static int clone_bio(struct dm_target_io tio, struct bio bio,
				1247	sector_t sector, unsigned len)
				1248	{
				1249	struct bio *clone = &tio->clone;
				1250
				1251	__bio_clone_fast(clone, bio);
				1252
				1253	if (unlikely(bio_integrity(bio) != NULL)) {
				1254	int r;
				1255
				1256	if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
				1257	!dm_target_passes_integrity(tio->ti->type))) {
				1258	DMWARN("%s: the target %s doesn't support integrity data.",
				1259	dm_device_name(tio->io->md),
				1260	tio->ti->type->name);
				1261	return -EIO;
				1262	}
				1263
				1264	r = bio_integrity_clone(clone, bio, GFP_NOIO);
				1265	if (r < 0)
				1266	return r;
				1267	}
				1268
				1269	if (bio_op(bio) != REQ_OP_ZONE_REPORT)
				1270	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
				1271	clone->bi_iter.bi_size = to_bytes(len);
				1272
				1273	if (unlikely(bio_integrity(bio) != NULL))
				1274	bio_integrity_trim(clone);
				1275
				1276	return 0;
				1277	}
				1278
				1279	static struct dm_target_io alloc_tio(struct clone_info ci,
				1280	struct dm_target *ti,
				1281	unsigned target_bio_nr)
				1282	{
				1283	struct dm_target_io *tio;
				1284	struct bio *clone;
				1285
				1286	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
				1287	tio = container_of(clone, struct dm_target_io, clone);
				1288
				1289	tio->io = ci->io;
				1290	tio->ti = ti;
				1291	tio->target_bio_nr = target_bio_nr;
				1292
				1293	return tio;
				1294	}
				1295
				1296	static void __clone_and_map_simple_bio(struct clone_info *ci,
				1297	struct dm_target *ti,
				1298	unsigned target_bio_nr, unsigned *len)
				1299	{
				1300	struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
				1301	struct bio *clone = &tio->clone;
				1302
				1303	tio->len_ptr = len;
				1304
				1305	__bio_clone_fast(clone, ci->bio);
				1306	if (len)
				1307	bio_setup_sector(clone, ci->sector, *len);
				1308
				1309	__map_bio(tio);
				1310	}
				1311
				/*
				 * Send @num_bios simple clones of ci->bio to @ti, numbered from 0.
				 * @len is passed through unchanged to every clone and may be NULL.
				 */
				static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
								  unsigned num_bios, unsigned *len)
				{
					unsigned target_bio_nr;

					for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
						__clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
				}
				1320
				1321	static int __send_empty_flush(struct clone_info *ci)
				1322	{
				1323	unsigned target_nr = 0;
				1324	struct dm_target *ti;
				1325
				1326	BUG_ON(bio_has_data(ci->bio));
				1327	while ((ti = dm_table_get_target(ci->map, target_nr++)))
				1328	__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
				1329
				1330	return 0;
				1331	}
				1332
				1333	static int __clone_and_map_data_bio(struct clone_info ci, struct dm_target ti,
				1334	sector_t sector, unsigned *len)
				1335	{
				1336	struct bio *bio = ci->bio;
				1337	struct dm_target_io *tio;
				1338	unsigned target_bio_nr;
				1339	unsigned num_target_bios = 1;
				1340	int r = 0;
				1341
				1342	/*
				1343	* Does the target want to receive duplicate copies of the bio?
				1344	*/
				1345	if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
				1346	num_target_bios = ti->num_write_bios(ti, bio);
				1347
				1348	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
				1349	tio = alloc_tio(ci, ti, target_bio_nr);
				1350	tio->len_ptr = len;
				1351	r = clone_bio(tio, bio, sector, *len);
				1352	if (r < 0) {
				1353	free_tio(tio);
				1354	break;
				1355	}
				1356	__map_bio(tio);
				1357	}
				1358
				1359	return r;
				1360	}
				1361
				1362	typedef unsigned (get_num_bios_fn)(struct dm_target ti);
				1363
				1364	static unsigned get_num_discard_bios(struct dm_target *ti)
				1365	{
				1366	return ti->num_discard_bios;
				1367	}
				1368
				1369	static unsigned get_num_write_same_bios(struct dm_target *ti)
				1370	{
				1371	return ti->num_write_same_bios;
				1372	}
				1373
				1374	static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
				1375	{
				1376	return ti->num_write_zeroes_bios;
				1377	}
				1378
				1379	typedef bool (is_split_required_fn)(struct dm_target ti);
				1380
				1381	static bool is_split_required_for_discard(struct dm_target *ti)
				1382	{
				1383	return ti->split_discard_bios;
				1384	}
				1385
				1386	static int __send_changing_extent_only(struct clone_info *ci,
				1387	get_num_bios_fn get_num_bios,
				1388	is_split_required_fn is_split_required)
				1389	{
				1390	struct dm_target *ti;
				1391	unsigned len;
				1392	unsigned num_bios;
				1393
				1394	do {
				1395	ti = dm_table_find_target(ci->map, ci->sector);
				1396	if (!dm_target_is_valid(ti))
				1397	return -EIO;
				1398
				1399	/*
				1400	* Even though the device advertised support for this type of
				1401	* request, that does not mean every target supports it, and
				1402	* reconfiguration might also have changed that since the
				1403	* check was performed.
				1404	*/
				1405	num_bios = get_num_bios ? get_num_bios(ti) : 0;
				1406	if (!num_bios)
				1407	return -EOPNOTSUPP;
				1408
				1409	if (is_split_required && !is_split_required(ti))
				1410	len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
				1411	else
				1412	len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
				1413
				1414	__send_duplicate_bios(ci, ti, num_bios, &len);
				1415
				1416	ci->sector += len;
				1417	} while (ci->sector_count -= len);
				1418
				1419	return 0;
				1420	}
				1421
				1422	static int __send_discard(struct clone_info *ci)
				1423	{
				1424	return __send_changing_extent_only(ci, get_num_discard_bios,
				1425	is_split_required_for_discard);
				1426	}
				1427
				1428	static int __send_write_same(struct clone_info *ci)
				1429	{
				1430	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
				1431	}
				1432
				1433	static int __send_write_zeroes(struct clone_info *ci)
				1434	{
				1435	return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
				1436	}
				1437
				1438	/*
				1439	* Select the correct strategy for processing a non-flush bio.
				1440	*/
				1441	static int __split_and_process_non_flush(struct clone_info *ci)
				1442	{
				1443	struct bio *bio = ci->bio;
				1444	struct dm_target *ti;
				1445	unsigned len;
				1446	int r;
				1447
				1448	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
				1449	return __send_discard(ci);
				1450	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
				1451	return __send_write_same(ci);
				1452	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
				1453	return __send_write_zeroes(ci);
				1454
				1455	ti = dm_table_find_target(ci->map, ci->sector);
				1456	if (!dm_target_is_valid(ti))
				1457	return -EIO;
				1458
				1459	if (bio_op(bio) == REQ_OP_ZONE_REPORT)
				1460	len = ci->sector_count;
				1461	else
				1462	len = min_t(sector_t, max_io_len(ci->sector, ti),
				1463	ci->sector_count);
				1464
				1465	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
				1466	if (r < 0)
				1467	return r;
				1468
				1469	ci->sector += len;
				1470	ci->sector_count -= len;
				1471
				1472	return 0;
				1473	}
				1474
				1475	/*
				1476	* Entry point to split a bio into clones and submit them to the targets.
				1477	*/
				1478	static void __split_and_process_bio(struct mapped_device *md,
				1479	struct dm_table map, struct bio bio)
				1480	{
				1481	struct clone_info ci;
				1482	int error = 0;
				1483
				1484	if (unlikely(!map)) {
				1485	bio_io_error(bio);
				1486	return;
				1487	}
				1488
				1489	ci.map = map;
				1490	ci.md = md;
				1491	ci.io = alloc_io(md);
				1492	ci.io->status = 0;
				1493	atomic_set(&ci.io->io_count, 1);
				1494	ci.io->bio = bio;
				1495	ci.io->md = md;
				1496	spin_lock_init(&ci.io->endio_lock);
				1497	ci.sector = bio->bi_iter.bi_sector;
				1498
				1499	start_io_acct(ci.io);
				1500
				1501	if (bio->bi_opf & REQ_PREFLUSH) {
				1502	ci.bio = &ci.md->flush_bio;
				1503	ci.sector_count = 0;
				1504	error = __send_empty_flush(&ci);
				1505	/* dec_pending submits any data associated with flush */
				1506	} else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
				1507	ci.bio = bio;
				1508	ci.sector_count = 0;
				1509	error = __split_and_process_non_flush(&ci);
				1510	} else {
				1511	ci.bio = bio;
				1512	ci.sector_count = bio_sectors(bio);
				1513	while (ci.sector_count && !error)
				1514	error = __split_and_process_non_flush(&ci);
				1515	}
				1516
				1517	/* drop the extra reference count */
				1518	dec_pending(ci.io, errno_to_blk_status(error));
				1519	}
				1520	/*-----------------------------------------------------------------
				1521	* CRUD END
				1522	 *---------------------------------------------------------------*/
				1523
				1524	/*
				1525	* The request function that just remaps the bio built up by
				1526	* dm_merge_bvec.
				1527	*/
				1528	static blk_qc_t dm_make_request(struct request_queue q, struct bio bio)
				1529	{
				1530	int rw = bio_data_dir(bio);
				1531	struct mapped_device *md = q->queuedata;
				1532	int srcu_idx;
				1533	struct dm_table *map;
				1534
				1535	map = dm_get_live_table(md, &srcu_idx);
				1536
				1537	generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0);
				1538
				1539	/* if we're suspended, we have to queue this io for later */
				1540	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
				1541	dm_put_live_table(md, srcu_idx);
				1542
				1543	if (!(bio->bi_opf & REQ_RAHEAD))
				1544	queue_io(md, bio);
				1545	else
				1546	bio_io_error(bio);
				1547	return BLK_QC_T_NONE;
				1548	}
				1549
				1550	__split_and_process_bio(md, map, bio);
				1551	dm_put_live_table(md, srcu_idx);
				1552	return BLK_QC_T_NONE;
				1553	}
				1554
				1555	static int dm_any_congested(void *congested_data, int bdi_bits)
				1556	{
				1557	int r = bdi_bits;
				1558	struct mapped_device *md = congested_data;
				1559	struct dm_table *map;
				1560
				1561	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
				1562	if (dm_request_based(md)) {
				1563	/*
				1564	* With request-based DM we only need to check the
				1565	* top-level queue for congestion.
				1566	*/
				1567	r = md->queue->backing_dev_info->wb.state & bdi_bits;
				1568	} else {
				1569	map = dm_get_live_table_fast(md);
				1570	if (map)
				1571	r = dm_table_any_congested(map, bdi_bits);
				1572	dm_put_live_table_fast(md);
				1573	}
				1574	}
				1575
				1576	return r;
				1577	}
				1578
				1579	/*-----------------------------------------------------------------
				1580	* An IDR is used to keep track of allocated minor numbers.
				1581	---------------------------------------------------------------/
				1582	static void free_minor(int minor)
				1583	{
				1584	spin_lock(&_minor_lock);
				1585	idr_remove(&_minor_idr, minor);
				1586	spin_unlock(&_minor_lock);
				1587	}
				1588
				1589	/*
				1590	* See if the device with a specific minor # is free.
				1591	*/
				1592	static int specific_minor(int minor)
				1593	{
				1594	int r;
				1595
				1596	if (minor >= (1 << MINORBITS))
				1597	return -EINVAL;
				1598
				1599	idr_preload(GFP_KERNEL);
				1600	spin_lock(&_minor_lock);
				1601
				1602	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
				1603
				1604	spin_unlock(&_minor_lock);
				1605	idr_preload_end();
				1606	if (r < 0)
				1607	return r == -ENOSPC ? -EBUSY : r;
				1608	return 0;
				1609	}
				1610
				1611	static int next_free_minor(int *minor)
				1612	{
				1613	int r;
				1614
				1615	idr_preload(GFP_KERNEL);
				1616	spin_lock(&_minor_lock);
				1617
				1618	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
				1619
				1620	spin_unlock(&_minor_lock);
				1621	idr_preload_end();
				1622	if (r < 0)
				1623	return r;
				1624	*minor = r;
				1625	return 0;
				1626	}
				1627
				1628	static const struct block_device_operations dm_blk_dops;
				1629	static const struct dax_operations dm_dax_ops;
				1630
				1631	static void dm_wq_work(struct work_struct *work);
				1632
				1633	void dm_init_md_queue(struct mapped_device *md)
				1634	{
				1635	/*
				1636	* Request-based dm devices cannot be stacked on top of bio-based dm
				1637	* devices. The type of this dm device may not have been decided yet.
				1638	* The type is decided at the first table loading time.
				1639	* To prevent problematic device stacking, clear the queue flag
				1640	* for request stacking support until then.
				1641	*
				1642	* This queue is new, so no concurrency on the queue_flags.
				1643	*/
				1644	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
				1645
				1646	/*
				1647	* Initialize data that will only be used by a non-blk-mq DM queue
				1648	* - must do so here (in alloc_dev callchain) before queue is used
				1649	*/
				1650	md->queue->queuedata = md;
				1651	}
				1652
				1653	void dm_init_normal_md_queue(struct mapped_device *md)
				1654	{
				1655	md->use_blk_mq = false;
				1656	dm_init_md_queue(md);
				1657
				1658	/*
				1659	* Initialize aspects of queue that aren't relevant for blk-mq
				1660	*/
				1661	md->queue->backing_dev_info->congested_data = md;
				1662	md->queue->backing_dev_info->congested_fn = dm_any_congested;
				1663	}
				1664
				1665	static void cleanup_mapped_device(struct mapped_device *md)
				1666	{
				1667	if (md->wq)
				1668	destroy_workqueue(md->wq);
				1669	if (md->kworker_task)
				1670	kthread_stop(md->kworker_task);
				1671	mempool_destroy(md->io_pool);
				1672	if (md->bs)
				1673	bioset_free(md->bs);
				1674
				1675	if (md->dax_dev) {
				1676	kill_dax(md->dax_dev);
				1677	put_dax(md->dax_dev);
				1678	md->dax_dev = NULL;
				1679	}
				1680
				1681	if (md->disk) {
				1682	spin_lock(&_minor_lock);
				1683	md->disk->private_data = NULL;
				1684	spin_unlock(&_minor_lock);
				1685	del_gendisk(md->disk);
				1686	put_disk(md->disk);
				1687	}
				1688
				1689	if (md->queue)
				1690	blk_cleanup_queue(md->queue);
				1691
				1692	cleanup_srcu_struct(&md->io_barrier);
				1693
				1694	if (md->bdev) {
				1695	bdput(md->bdev);
				1696	md->bdev = NULL;
				1697	}
				1698
				1699	dm_mq_cleanup_mapped_device(md);
				1700	}
				1701
				1702	/*
				1703	* Allocate and initialise a blank device with a given minor.
				1704	*/
				1705	static struct mapped_device *alloc_dev(int minor)
				1706	{
				1707	int r, numa_node_id = dm_get_numa_node();
				1708	struct dax_device *dax_dev;
				1709	struct mapped_device *md;
				1710	void *old_md;
				1711
				1712	md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
				1713	if (!md) {
				1714	DMWARN("unable to allocate device, out of memory.");
				1715	return NULL;
				1716	}
				1717
				1718	if (!try_module_get(THIS_MODULE))
				1719	goto bad_module_get;
				1720
				1721	/* get a minor number for the dev */
				1722	if (minor == DM_ANY_MINOR)
				1723	r = next_free_minor(&minor);
				1724	else
				1725	r = specific_minor(minor);
				1726	if (r < 0)
				1727	goto bad_minor;
				1728
				1729	r = init_srcu_struct(&md->io_barrier);
				1730	if (r < 0)
				1731	goto bad_io_barrier;
				1732
				1733	md->numa_node_id = numa_node_id;
				1734	md->use_blk_mq = dm_use_blk_mq_default();
				1735	md->init_tio_pdu = false;
				1736	md->type = DM_TYPE_NONE;
				1737	mutex_init(&md->suspend_lock);
				1738	mutex_init(&md->type_lock);
				1739	mutex_init(&md->table_devices_lock);
				1740	spin_lock_init(&md->deferred_lock);
				1741	atomic_set(&md->holders, 1);
				1742	atomic_set(&md->open_count, 0);
				1743	atomic_set(&md->event_nr, 0);
				1744	atomic_set(&md->uevent_seq, 0);
				1745	INIT_LIST_HEAD(&md->uevent_list);
				1746	INIT_LIST_HEAD(&md->table_devices);
				1747	spin_lock_init(&md->uevent_lock);
				1748
				1749	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
				1750	if (!md->queue)
				1751	goto bad;
				1752
				1753	dm_init_md_queue(md);
				1754	/*
				1755	* default to bio-based required ->make_request_fn until DM
				1756	* table is loaded and md->type established. If request-based
				1757	* table is loaded: blk-mq will override accordingly.
				1758	*/
				1759	blk_queue_make_request(md->queue, dm_make_request);
				1760
				1761	md->disk = alloc_disk_node(1, numa_node_id);
				1762	if (!md->disk)
				1763	goto bad;
				1764
				1765	atomic_set(&md->pending[0], 0);
				1766	atomic_set(&md->pending[1], 0);
				1767	init_waitqueue_head(&md->wait);
				1768	INIT_WORK(&md->work, dm_wq_work);
				1769	init_waitqueue_head(&md->eventq);
				1770	init_completion(&md->kobj_holder.completion);
				1771	md->kworker_task = NULL;
				1772
				1773	md->disk->major = _major;
				1774	md->disk->first_minor = minor;
				1775	md->disk->fops = &dm_blk_dops;
				1776	md->disk->queue = md->queue;
				1777	md->disk->private_data = md;
				1778	sprintf(md->disk->disk_name, "dm-%d", minor);
				1779
				1780	dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
				1781	if (!dax_dev)
				1782	goto bad;
				1783	md->dax_dev = dax_dev;
				1784
				1785	add_disk(md->disk);
				1786	format_dev_t(md->name, MKDEV(_major, minor));
				1787
				1788	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
				1789	if (!md->wq)
				1790	goto bad;
				1791
				1792	md->bdev = bdget_disk(md->disk, 0);
				1793	if (!md->bdev)
				1794	goto bad;
				1795
				1796	bio_init(&md->flush_bio, NULL, 0);
				1797	bio_set_dev(&md->flush_bio, md->bdev);
				1798	md->flush_bio.bi_opf = REQ_OP_WRITE \| REQ_PREFLUSH \| REQ_SYNC;
				1799
				1800	dm_stats_init(&md->stats);
				1801
				1802	/* Populate the mapping, nobody knows we exist yet */
				1803	spin_lock(&_minor_lock);
				1804	old_md = idr_replace(&_minor_idr, md, minor);
				1805	spin_unlock(&_minor_lock);
				1806
				1807	BUG_ON(old_md != MINOR_ALLOCED);
				1808
				1809	return md;
				1810
				1811	bad:
				1812	cleanup_mapped_device(md);
				1813	bad_io_barrier:
				1814	free_minor(minor);
				1815	bad_minor:
				1816	module_put(THIS_MODULE);
				1817	bad_module_get:
				1818	kvfree(md);
				1819	return NULL;
				1820	}
				1821
				1822	static void unlock_fs(struct mapped_device *md);
				1823
				1824	static void free_dev(struct mapped_device *md)
				1825	{
				1826	int minor = MINOR(disk_devt(md->disk));
				1827
				1828	unlock_fs(md);
				1829
				1830	cleanup_mapped_device(md);
				1831
				1832	free_table_devices(&md->table_devices);
				1833	dm_stats_cleanup(&md->stats);
				1834	free_minor(minor);
				1835
				1836	module_put(THIS_MODULE);
				1837	kvfree(md);
				1838	}
				1839
				1840	static void __bind_mempools(struct mapped_device md, struct dm_table t)
				1841	{
				1842	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
				1843
				1844	if (md->bs) {
				1845	/* The md already has necessary mempools. */
				1846	if (dm_table_bio_based(t)) {
				1847	/*
				1848	* Reload bioset because front_pad may have changed
				1849	* because a different table was loaded.
				1850	*/
				1851	bioset_free(md->bs);
				1852	md->bs = p->bs;
				1853	p->bs = NULL;
				1854	}
				1855	/*
				1856	* There's no need to reload with request-based dm
				1857	* because the size of front_pad doesn't change.
				1858	* Note for future: If you are to reload bioset,
				1859	* prep-ed requests in the queue may refer
				1860	* to bio from the old bioset, so you must walk
				1861	* through the queue to unprep.
				1862	*/
				1863	goto out;
				1864	}
				1865
				1866	BUG_ON(!p \|\| md->io_pool \|\| md->bs);
				1867
				1868	md->io_pool = p->io_pool;
				1869	p->io_pool = NULL;
				1870	md->bs = p->bs;
				1871	p->bs = NULL;
				1872
				1873	out:
				1874	/* mempool bind completed, no longer need any mempools in the table */
				1875	dm_table_free_md_mempools(t);
				1876	}
				1877
				1878	/*
				1879	* Bind a table to the device.
				1880	*/
				1881	static void event_callback(void *context)
				1882	{
				1883	unsigned long flags;
				1884	LIST_HEAD(uevents);
				1885	struct mapped_device md = (struct mapped_device ) context;
				1886
				1887	spin_lock_irqsave(&md->uevent_lock, flags);
				1888	list_splice_init(&md->uevent_list, &uevents);
				1889	spin_unlock_irqrestore(&md->uevent_lock, flags);
				1890
				1891	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
				1892
				1893	atomic_inc(&md->event_nr);
				1894	wake_up(&md->eventq);
				1895	dm_issue_global_event();
				1896	}
				1897
				1898	/*
				1899	* Protected by md->suspend_lock obtained by dm_swap_table().
				1900	*/
				1901	static void __set_size(struct mapped_device *md, sector_t size)
				1902	{
				1903	lockdep_assert_held(&md->suspend_lock);
				1904
				1905	set_capacity(md->disk, size);
				1906
				1907	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
				1908	}
				1909
				1910	/*
				1911	* Returns old map, which caller must destroy.
				1912	*/
				1913	static struct dm_table __bind(struct mapped_device md, struct dm_table *t,
				1914	struct queue_limits *limits)
				1915	{
				1916	struct dm_table *old_map;
				1917	struct request_queue *q = md->queue;
				1918	sector_t size;
				1919
				1920	lockdep_assert_held(&md->suspend_lock);
				1921
				1922	size = dm_table_get_size(t);
				1923
				1924	/*
				1925	* Wipe any geometry if the size of the table changed.
				1926	*/
				1927	if (size != dm_get_size(md))
				1928	memset(&md->geometry, 0, sizeof(md->geometry));
				1929
				1930	__set_size(md, size);
				1931
				1932	dm_table_event_callback(t, event_callback, md);
				1933
				1934	/*
				1935	* The queue hasn't been stopped yet, if the old table type wasn't
				1936	* for request-based during suspension. So stop it to prevent
				1937	* I/O mapping before resume.
				1938	* This must be done before setting the queue restrictions,
				1939	* because request-based dm may be run just after the setting.
				1940	*/
				1941	if (dm_table_request_based(t)) {
				1942	dm_stop_queue(q);
				1943	/*
				1944	* Leverage the fact that request-based DM targets are
				1945	* immutable singletons and establish md->immutable_target
				1946	* - used to optimize both dm_request_fn and dm_mq_queue_rq
				1947	*/
				1948	md->immutable_target = dm_table_get_immutable_target(t);
				1949	}
				1950
				1951	__bind_mempools(md, t);
				1952
				1953	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
				1954	rcu_assign_pointer(md->map, (void *)t);
				1955	md->immutable_target_type = dm_table_get_immutable_target_type(t);
				1956
				1957	dm_table_set_restrictions(t, q, limits);
				1958	if (old_map)
				1959	dm_sync_table(md);
				1960
				1961	return old_map;
				1962	}
				1963
				1964	/*
				1965	* Returns unbound table for the caller to free.
				1966	*/
				1967	static struct dm_table __unbind(struct mapped_device md)
				1968	{
				1969	struct dm_table *map = rcu_dereference_protected(md->map, 1);
				1970
				1971	if (!map)
				1972	return NULL;
				1973
				1974	dm_table_event_callback(map, NULL, NULL);
				1975	RCU_INIT_POINTER(md->map, NULL);
				1976	dm_sync_table(md);
				1977
				1978	return map;
				1979	}
				1980
				1981	/*
				1982	* Constructor for a new device.
				1983	*/
				1984	int dm_create(int minor, struct mapped_device **result)
				1985	{
				1986	struct mapped_device *md;
				1987
				1988	md = alloc_dev(minor);
				1989	if (!md)
				1990	return -ENXIO;
				1991
				1992	dm_sysfs_init(md);
				1993
				1994	*result = md;
				1995	return 0;
				1996	}
				1997
				1998	/*
				1999	* Functions to manage md->type.
				2000	* All are required to hold md->type_lock.
				2001	*/
				2002	void dm_lock_md_type(struct mapped_device *md)
				2003	{
				2004	mutex_lock(&md->type_lock);
				2005	}
				2006
				2007	void dm_unlock_md_type(struct mapped_device *md)
				2008	{
				2009	mutex_unlock(&md->type_lock);
				2010	}
				2011
				2012	void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
				2013	{
				2014	BUG_ON(!mutex_is_locked(&md->type_lock));
				2015	md->type = type;
				2016	}
				2017
				2018	enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
				2019	{
				2020	return md->type;
				2021	}
				2022
				2023	struct target_type dm_get_immutable_target_type(struct mapped_device md)
				2024	{
				2025	return md->immutable_target_type;
				2026	}
				2027
				2028	/*
				2029	* The queue_limits are only valid as long as you have a reference
				2030	* count on 'md'.
				2031	*/
				2032	struct queue_limits dm_get_queue_limits(struct mapped_device md)
				2033	{
				2034	BUG_ON(!atomic_read(&md->holders));
				2035	return &md->queue->limits;
				2036	}
				2037	EXPORT_SYMBOL_GPL(dm_get_queue_limits);
				2038
				2039	/*
				2040	* Setup the DM device's queue based on md's type
				2041	*/
				2042	int dm_setup_md_queue(struct mapped_device md, struct dm_table t)
				2043	{
				2044	int r;
				2045	enum dm_queue_mode type = dm_get_md_type(md);
				2046
				2047	switch (type) {
				2048	case DM_TYPE_REQUEST_BASED:
				2049	r = dm_old_init_request_queue(md, t);
				2050	if (r) {
				2051	DMERR("Cannot initialize queue for request-based mapped device");
				2052	return r;
				2053	}
				2054	break;
				2055	case DM_TYPE_MQ_REQUEST_BASED:
				2056	r = dm_mq_init_request_queue(md, t);
				2057	if (r) {
				2058	DMERR("Cannot initialize queue for request-based dm-mq mapped device");
				2059	return r;
				2060	}
				2061	break;
				2062	case DM_TYPE_BIO_BASED:
				2063	case DM_TYPE_DAX_BIO_BASED:
				2064	dm_init_normal_md_queue(md);
				2065	/*
				2066	* DM handles splitting bios as needed. Free the bio_split bioset
				2067	* since it won't be used (saves 1 process per bio-based DM device).
				2068	*/
				2069	bioset_free(md->queue->bio_split);
				2070	md->queue->bio_split = NULL;
				2071	break;
				2072	case DM_TYPE_NONE:
				2073	WARN_ON_ONCE(true);
				2074	break;
				2075	}
				2076
				2077	return 0;
				2078	}
				2079
				2080	struct mapped_device *dm_get_md(dev_t dev)
				2081	{
				2082	struct mapped_device *md;
				2083	unsigned minor = MINOR(dev);
				2084
				2085	if (MAJOR(dev) != _major \|\| minor >= (1 << MINORBITS))
				2086	return NULL;
				2087
				2088	spin_lock(&_minor_lock);
				2089
				2090	md = idr_find(&_minor_idr, minor);
				2091	if (md) {
				2092	if ((md == MINOR_ALLOCED \|\|
				2093	(MINOR(disk_devt(dm_disk(md))) != minor) \|\|
				2094	dm_deleting_md(md) \|\|
				2095	test_bit(DMF_FREEING, &md->flags))) {
				2096	md = NULL;
				2097	goto out;
				2098	}
				2099	dm_get(md);
				2100	}
				2101
				2102	out:
				2103	spin_unlock(&_minor_lock);
				2104
				2105	return md;
				2106	}
				2107	EXPORT_SYMBOL_GPL(dm_get_md);
				2108
				2109	void dm_get_mdptr(struct mapped_device md)
				2110	{
				2111	return md->interface_ptr;
				2112	}
				2113
				2114	void dm_set_mdptr(struct mapped_device md, void ptr)
				2115	{
				2116	md->interface_ptr = ptr;
				2117	}
				2118
				2119	void dm_get(struct mapped_device *md)
				2120	{
				2121	atomic_inc(&md->holders);
				2122	BUG_ON(test_bit(DMF_FREEING, &md->flags));
				2123	}
				2124
				2125	int dm_hold(struct mapped_device *md)
				2126	{
				2127	spin_lock(&_minor_lock);
				2128	if (test_bit(DMF_FREEING, &md->flags)) {
				2129	spin_unlock(&_minor_lock);
				2130	return -EBUSY;
				2131	}
				2132	dm_get(md);
				2133	spin_unlock(&_minor_lock);
				2134	return 0;
				2135	}
				2136	EXPORT_SYMBOL_GPL(dm_hold);
				2137
				2138	const char dm_device_name(struct mapped_device md)
				2139	{
				2140	return md->name;
				2141	}
				2142	EXPORT_SYMBOL_GPL(dm_device_name);
				2143
				2144	static void __dm_destroy(struct mapped_device *md, bool wait)
				2145	{
				2146	struct request_queue *q = dm_get_md_queue(md);
				2147	struct dm_table *map;
				2148	int srcu_idx;
				2149
				2150	might_sleep();
				2151
				2152	spin_lock(&_minor_lock);
				2153	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
				2154	set_bit(DMF_FREEING, &md->flags);
				2155	spin_unlock(&_minor_lock);
				2156
				2157	blk_set_queue_dying(q);
				2158
				2159	if (dm_request_based(md) && md->kworker_task)
				2160	kthread_flush_worker(&md->kworker);
				2161
				2162	/*
				2163	* Take suspend_lock so that presuspend and postsuspend methods
				2164	* do not race with internal suspend.
				2165	*/
				2166	mutex_lock(&md->suspend_lock);
				2167	map = dm_get_live_table(md, &srcu_idx);
				2168	if (!dm_suspended_md(md)) {
				2169	dm_table_presuspend_targets(map);
				2170	dm_table_postsuspend_targets(map);
				2171	}
				2172	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
				2173	dm_put_live_table(md, srcu_idx);
				2174	mutex_unlock(&md->suspend_lock);
				2175
				2176	/*
				2177	* Rare, but there may be I/O requests still going to complete,
				2178	* for example. Wait for all references to disappear.
				2179	* No one should increment the reference count of the mapped_device,
				2180	* after the mapped_device state becomes DMF_FREEING.
				2181	*/
				2182	if (wait)
				2183	while (atomic_read(&md->holders))
				2184	msleep(1);
				2185	else if (atomic_read(&md->holders))
				2186	DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
				2187	dm_device_name(md), atomic_read(&md->holders));
				2188
				2189	dm_sysfs_exit(md);
				2190	dm_table_destroy(__unbind(md));
				2191	free_dev(md);
				2192	}
				2193
				2194	void dm_destroy(struct mapped_device *md)
				2195	{
				2196	__dm_destroy(md, true);
				2197	}
				2198
				2199	void dm_destroy_immediate(struct mapped_device *md)
				2200	{
				2201	__dm_destroy(md, false);
				2202	}
				2203
				2204	void dm_put(struct mapped_device *md)
				2205	{
				2206	atomic_dec(&md->holders);
				2207	}
				2208	EXPORT_SYMBOL_GPL(dm_put);
				2209
				2210	static int dm_wait_for_completion(struct mapped_device *md, long task_state)
				2211	{
				2212	int r = 0;
				2213	DEFINE_WAIT(wait);
				2214
				2215	while (1) {
				2216	prepare_to_wait(&md->wait, &wait, task_state);
				2217
				2218	if (!md_in_flight(md))
				2219	break;
				2220
				2221	if (signal_pending_state(task_state, current)) {
				2222	r = -EINTR;
				2223	break;
				2224	}
				2225
				2226	io_schedule();
				2227	}
				2228	finish_wait(&md->wait, &wait);
				2229
				2230	return r;
				2231	}
				2232
				2233	/*
				2234	* Process the deferred bios
				2235	*/
				2236	static void dm_wq_work(struct work_struct *work)
				2237	{
				2238	struct mapped_device *md = container_of(work, struct mapped_device,
				2239	work);
				2240	struct bio *c;
				2241	int srcu_idx;
				2242	struct dm_table *map;
				2243
				2244	map = dm_get_live_table(md, &srcu_idx);
				2245
				2246	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
				2247	spin_lock_irq(&md->deferred_lock);
				2248	c = bio_list_pop(&md->deferred);
				2249	spin_unlock_irq(&md->deferred_lock);
				2250
				2251	if (!c)
				2252	break;
				2253
				2254	if (dm_request_based(md))
				2255	generic_make_request(c);
				2256	else
				2257	__split_and_process_bio(md, map, c);
				2258	}
				2259
				2260	dm_put_live_table(md, srcu_idx);
				2261	}
				2262
				2263	static void dm_queue_flush(struct mapped_device *md)
				2264	{
				2265	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
				2266	smp_mb__after_atomic();
				2267	queue_work(md->wq, &md->work);
				2268	}
				2269
				2270	/*
				2271	* Swap in a new table, returning the old one for the caller to destroy.
				2272	*/
				2273	struct dm_table dm_swap_table(struct mapped_device md, struct dm_table *table)
				2274	{
				2275	struct dm_table live_map = NULL, map = ERR_PTR(-EINVAL);
				2276	struct queue_limits limits;
				2277	int r;
				2278
				2279	mutex_lock(&md->suspend_lock);
				2280
				2281	/* device must be suspended */
				2282	if (!dm_suspended_md(md))
				2283	goto out;
				2284
				2285	/*
				2286	* If the new table has no data devices, retain the existing limits.
				2287	* This helps multipath with queue_if_no_path if all paths disappear,
				2288	* then new I/O is queued based on these limits, and then some paths
				2289	* reappear.
				2290	*/
				2291	if (dm_table_has_no_data_devices(table)) {
				2292	live_map = dm_get_live_table_fast(md);
				2293	if (live_map)
				2294	limits = md->queue->limits;
				2295	dm_put_live_table_fast(md);
				2296	}
				2297
				2298	if (!live_map) {
				2299	r = dm_calculate_queue_limits(table, &limits);
				2300	if (r) {
				2301	map = ERR_PTR(r);
				2302	goto out;
				2303	}
				2304	}
				2305
				2306	map = __bind(md, table, &limits);
				2307	dm_issue_global_event();
				2308
				2309	out:
				2310	mutex_unlock(&md->suspend_lock);
				2311	return map;
				2312	}
				2313
				2314	/*
				2315	* Functions to lock and unlock any filesystem running on the
				2316	* device.
				2317	*/
				2318	static int lock_fs(struct mapped_device *md)
				2319	{
				2320	int r;
				2321
				2322	WARN_ON(md->frozen_sb);
				2323
				2324	md->frozen_sb = freeze_bdev(md->bdev);
				2325	if (IS_ERR(md->frozen_sb)) {
				2326	r = PTR_ERR(md->frozen_sb);
				2327	md->frozen_sb = NULL;
				2328	return r;
				2329	}
				2330
				2331	set_bit(DMF_FROZEN, &md->flags);
				2332
				2333	return 0;
				2334	}
				2335
				2336	static void unlock_fs(struct mapped_device *md)
				2337	{
				2338	if (!test_bit(DMF_FROZEN, &md->flags))
				2339	return;
				2340
				2341	thaw_bdev(md->bdev, md->frozen_sb);
				2342	md->frozen_sb = NULL;
				2343	clear_bit(DMF_FROZEN, &md->flags);
				2344	}
				2345
				2346	/*
				2347	* @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
				2348	* @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
				2349	* @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
				2350	*
				2351	* If __dm_suspend returns 0, the device is completely quiescent
				2352	* now. There is no request-processing activity. All new requests
				2353	* are being added to md->deferred list.
				2354	*/
				2355	static int __dm_suspend(struct mapped_device md, struct dm_table map,
				2356	unsigned suspend_flags, long task_state,
				2357	int dmf_suspended_flag)
				2358	{
				2359	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
				2360	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
				2361	int r;
				2362
				2363	lockdep_assert_held(&md->suspend_lock);
				2364
				2365	/*
				2366	* DMF_NOFLUSH_SUSPENDING must be set before presuspend.
				2367	* This flag is cleared before dm_suspend returns.
				2368	*/
				2369	if (noflush)
				2370	set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
				2371	else
				2372	pr_debug("%s: suspending with flush\n", dm_device_name(md));
				2373
				2374	/*
				2375	* This gets reverted if there's an error later and the targets
				2376	* provide the .presuspend_undo hook.
				2377	*/
				2378	dm_table_presuspend_targets(map);
				2379
				2380	/*
				2381	* Flush I/O to the device.
				2382	* Any I/O submitted after lock_fs() may not be flushed.
				2383	* noflush takes precedence over do_lockfs.
				2384	* (lock_fs() flushes I/Os and waits for them to complete.)
				2385	*/
				2386	if (!noflush && do_lockfs) {
				2387	r = lock_fs(md);
				2388	if (r) {
				2389	dm_table_presuspend_undo_targets(map);
				2390	return r;
				2391	}
				2392	}
				2393
				2394	/*
				2395	* Here we must make sure that no processes are submitting requests
				2396	* to target drivers i.e. no one may be executing
				2397	* __split_and_process_bio. This is called from dm_request and
				2398	* dm_wq_work.
				2399	*
				2400	* To get all processes out of __split_and_process_bio in dm_request,
				2401	* we take the write lock. To prevent any process from reentering
				2402	* __split_and_process_bio from dm_request and quiesce the thread
				2403	* (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
				2404	* flush_workqueue(md->wq).
				2405	*/
				2406	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
				2407	if (map)
				2408	synchronize_srcu(&md->io_barrier);
				2409
				2410	/*
				2411	* Stop md->queue before flushing md->wq in case request-based
				2412	* dm defers requests to md->wq from md->queue.
				2413	*/
				2414	if (dm_request_based(md)) {
				2415	dm_stop_queue(md->queue);
				2416	if (md->kworker_task)
				2417	kthread_flush_worker(&md->kworker);
				2418	}
				2419
				2420	flush_workqueue(md->wq);
				2421
				2422	/*
				2423	* At this point no more requests are entering target request routines.
				2424	* We call dm_wait_for_completion to wait for all existing requests
				2425	* to finish.
				2426	*/
				2427	r = dm_wait_for_completion(md, task_state);
				2428	if (!r)
				2429	set_bit(dmf_suspended_flag, &md->flags);
				2430
				2431	if (noflush)
				2432	clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
				2433	if (map)
				2434	synchronize_srcu(&md->io_barrier);
				2435
				2436	/* were we interrupted ? */
				2437	if (r < 0) {
				2438	dm_queue_flush(md);
				2439
				2440	if (dm_request_based(md))
				2441	dm_start_queue(md->queue);
				2442
				2443	unlock_fs(md);
				2444	dm_table_presuspend_undo_targets(map);
				2445	/* pushback list is already flushed, so skip flush */
				2446	}
				2447
				2448	return r;
				2449	}
				2450
				2451	/*
				2452	* We need to be able to change a mapping table under a mounted
				2453	* filesystem. For example we might want to move some data in
				2454	* the background. Before the table can be swapped with
				2455	* dm_bind_table, dm_suspend must be called to flush any in
				2456	* flight bios and ensure that any further io gets deferred.
				2457	*/
				2458	/*
				2459	* Suspend mechanism in request-based dm.
				2460	*
				2461	* 1. Flush all I/Os by lock_fs() if needed.
				2462	* 2. Stop dispatching any I/O by stopping the request_queue.
				2463	* 3. Wait for all in-flight I/Os to be completed or requeued.
				2464	*
				2465	* To abort suspend, start the request_queue.
				2466	*/
				2467	int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
				2468	{
				2469	struct dm_table *map = NULL;
				2470	int r = 0;
				2471
				2472	retry:
				2473	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
				2474
				2475	if (dm_suspended_md(md)) {
				2476	r = -EINVAL;
				2477	goto out_unlock;
				2478	}
				2479
				2480	if (dm_suspended_internally_md(md)) {
				2481	/* already internally suspended, wait for internal resume */
				2482	mutex_unlock(&md->suspend_lock);
				2483	r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
				2484	if (r)
				2485	return r;
				2486	goto retry;
				2487	}
				2488
				2489	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
				2490
				2491	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
				2492	if (r)
				2493	goto out_unlock;
				2494
				2495	dm_table_postsuspend_targets(map);
				2496
				2497	out_unlock:
				2498	mutex_unlock(&md->suspend_lock);
				2499	return r;
				2500	}
				2501
				2502	static int __dm_resume(struct mapped_device md, struct dm_table map)
				2503	{
				2504	if (map) {
				2505	int r = dm_table_resume_targets(map);
				2506	if (r)
				2507	return r;
				2508	}
				2509
				2510	dm_queue_flush(md);
				2511
				2512	/*
				2513	* Flushing deferred I/Os must be done after targets are resumed
				2514	* so that mapping of targets can work correctly.
				2515	* Request-based dm is queueing the deferred I/Os in its request_queue.
				2516	*/
				2517	if (dm_request_based(md))
				2518	dm_start_queue(md->queue);
				2519
				2520	unlock_fs(md);
				2521
				2522	return 0;
				2523	}
				2524
				2525	int dm_resume(struct mapped_device *md)
				2526	{
				2527	int r;
				2528	struct dm_table *map = NULL;
				2529
				2530	retry:
				2531	r = -EINVAL;
				2532	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
				2533
				2534	if (!dm_suspended_md(md))
				2535	goto out;
				2536
				2537	if (dm_suspended_internally_md(md)) {
				2538	/* already internally suspended, wait for internal resume */
				2539	mutex_unlock(&md->suspend_lock);
				2540	r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
				2541	if (r)
				2542	return r;
				2543	goto retry;
				2544	}
				2545
				2546	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
				2547	if (!map \|\| !dm_table_get_size(map))
				2548	goto out;
				2549
				2550	r = __dm_resume(md, map);
				2551	if (r)
				2552	goto out;
				2553
				2554	clear_bit(DMF_SUSPENDED, &md->flags);
				2555	out:
				2556	mutex_unlock(&md->suspend_lock);
				2557
				2558	return r;
				2559	}
				2560
				2561	/*
				2562	* Internal suspend/resume works like userspace-driven suspend. It waits
				2563	* until all bios finish and prevents issuing new bios to the target drivers.
				2564	* It may be used only from the kernel.
				2565	*/
				2566
				2567	static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
				2568	{
				2569	struct dm_table *map = NULL;
				2570
				2571	lockdep_assert_held(&md->suspend_lock);
				2572
				2573	if (md->internal_suspend_count++)
				2574	return; /* nested internal suspend */
				2575
				2576	if (dm_suspended_md(md)) {
				2577	set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
				2578	return; /* nest suspend */
				2579	}
				2580
				2581	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
				2582
				2583	/*
				2584	* Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
				2585	* supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
				2586	* would require changing .presuspend to return an error -- avoid this
				2587	* until there is a need for more elaborate variants of internal suspend.
				2588	*/
				2589	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
				2590	DMF_SUSPENDED_INTERNALLY);
				2591
				2592	dm_table_postsuspend_targets(map);
				2593	}
				2594
				2595	static void __dm_internal_resume(struct mapped_device *md)
				2596	{
				2597	BUG_ON(!md->internal_suspend_count);
				2598
				2599	if (--md->internal_suspend_count)
				2600	return; /* resume from nested internal suspend */
				2601
				2602	if (dm_suspended_md(md))
				2603	goto done; /* resume from nested suspend */
				2604
				2605	/*
				2606	* NOTE: existing callers don't need to call dm_table_resume_targets
				2607	* (which may fail -- so best to avoid it for now by passing NULL map)
				2608	*/
				2609	(void) __dm_resume(md, NULL);
				2610
				2611	done:
				2612	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
				2613	smp_mb__after_atomic();
				2614	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
				2615	}
				2616
				2617	void dm_internal_suspend_noflush(struct mapped_device *md)
				2618	{
				2619	mutex_lock(&md->suspend_lock);
				2620	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
				2621	mutex_unlock(&md->suspend_lock);
				2622	}
				2623	EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
				2624
				2625	void dm_internal_resume(struct mapped_device *md)
				2626	{
				2627	mutex_lock(&md->suspend_lock);
				2628	__dm_internal_resume(md);
				2629	mutex_unlock(&md->suspend_lock);
				2630	}
				2631	EXPORT_SYMBOL_GPL(dm_internal_resume);
				2632
				2633	/*
				2634	* Fast variants of internal suspend/resume hold md->suspend_lock,
				2635	* which prevents interaction with userspace-driven suspend.
				2636	*/
				2637
				2638	void dm_internal_suspend_fast(struct mapped_device *md)
				2639	{
				2640	mutex_lock(&md->suspend_lock);
				2641	if (dm_suspended_md(md) \|\| dm_suspended_internally_md(md))
				2642	return;
				2643
				2644	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
				2645	synchronize_srcu(&md->io_barrier);
				2646	flush_workqueue(md->wq);
				2647	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
				2648	}
				2649	EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
				2650
				2651	void dm_internal_resume_fast(struct mapped_device *md)
				2652	{
				2653	if (dm_suspended_md(md) \|\| dm_suspended_internally_md(md))
				2654	goto done;
				2655
				2656	dm_queue_flush(md);
				2657
				2658	done:
				2659	mutex_unlock(&md->suspend_lock);
				2660	}
				2661	EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
				2662
				2663	/*-----------------------------------------------------------------
				2664	* Event notification.
				2665	---------------------------------------------------------------/
				2666	int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
				2667	unsigned cookie)
				2668	{
				2669	int r;
				2670	unsigned noio_flag;
				2671	char udev_cookie[DM_COOKIE_LENGTH];
				2672	char *envp[] = { udev_cookie, NULL };
				2673
				2674	noio_flag = memalloc_noio_save();
				2675
				2676	if (!cookie)
				2677	r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
				2678	else {
				2679	snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
				2680	DM_COOKIE_ENV_VAR_NAME, cookie);
				2681	r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
				2682	action, envp);
				2683	}
				2684
				2685	memalloc_noio_restore(noio_flag);
				2686
				2687	return r;
				2688	}
				2689
				2690	uint32_t dm_next_uevent_seq(struct mapped_device *md)
				2691	{
				2692	return atomic_add_return(1, &md->uevent_seq);
				2693	}
				2694
				2695	uint32_t dm_get_event_nr(struct mapped_device *md)
				2696	{
				2697	return atomic_read(&md->event_nr);
				2698	}
				2699
				2700	int dm_wait_event(struct mapped_device *md, int event_nr)
				2701	{
				2702	return wait_event_interruptible(md->eventq,
				2703	(event_nr != atomic_read(&md->event_nr)));
				2704	}
				2705
				2706	void dm_uevent_add(struct mapped_device md, struct list_head elist)
				2707	{
				2708	unsigned long flags;
				2709
				2710	spin_lock_irqsave(&md->uevent_lock, flags);
				2711	list_add(elist, &md->uevent_list);
				2712	spin_unlock_irqrestore(&md->uevent_lock, flags);
				2713	}
				2714
				2715	/*
				2716	* The gendisk is only valid as long as you have a reference
				2717	* count on 'md'.
				2718	*/
				2719	struct gendisk dm_disk(struct mapped_device md)
				2720	{
				2721	return md->disk;
				2722	}
				2723	EXPORT_SYMBOL_GPL(dm_disk);
				2724
				2725	struct kobject dm_kobject(struct mapped_device md)
				2726	{
				2727	return &md->kobj_holder.kobj;
				2728	}
				2729
				2730	struct mapped_device dm_get_from_kobject(struct kobject kobj)
				2731	{
				2732	struct mapped_device *md;
				2733
				2734	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
				2735
				2736	spin_lock(&_minor_lock);
				2737	if (test_bit(DMF_FREEING, &md->flags) \|\| dm_deleting_md(md)) {
				2738	md = NULL;
				2739	goto out;
				2740	}
				2741	dm_get(md);
				2742	out:
				2743	spin_unlock(&_minor_lock);
				2744
				2745	return md;
				2746	}
				2747
				2748	int dm_suspended_md(struct mapped_device *md)
				2749	{
				2750	return test_bit(DMF_SUSPENDED, &md->flags);
				2751	}
				2752
				2753	int dm_suspended_internally_md(struct mapped_device *md)
				2754	{
				2755	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
				2756	}
				2757
				2758	int dm_test_deferred_remove_flag(struct mapped_device *md)
				2759	{
				2760	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
				2761	}
				2762
				2763	int dm_suspended(struct dm_target *ti)
				2764	{
				2765	return dm_suspended_md(dm_table_get_md(ti->table));
				2766	}
				2767	EXPORT_SYMBOL_GPL(dm_suspended);
				2768
				2769	int dm_noflush_suspending(struct dm_target *ti)
				2770	{
				2771	return __noflush_suspending(dm_table_get_md(ti->table));
				2772	}
				2773	EXPORT_SYMBOL_GPL(dm_noflush_suspending);
				2774
				2775	struct dm_md_mempools dm_alloc_md_mempools(struct mapped_device md, enum dm_queue_mode type,
				2776	unsigned integrity, unsigned per_io_data_size)
				2777	{
				2778	struct dm_md_mempools pools = kzalloc_node(sizeof(pools), GFP_KERNEL, md->numa_node_id);
				2779	unsigned int pool_size = 0;
				2780	unsigned int front_pad;
				2781
				2782	if (!pools)
				2783	return NULL;
				2784
				2785	switch (type) {
				2786	case DM_TYPE_BIO_BASED:
				2787	case DM_TYPE_DAX_BIO_BASED:
				2788	pool_size = dm_get_reserved_bio_based_ios();
				2789	front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
				2790
				2791	pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
				2792	if (!pools->io_pool)
				2793	goto out;
				2794	break;
				2795	case DM_TYPE_REQUEST_BASED:
				2796	case DM_TYPE_MQ_REQUEST_BASED:
				2797	pool_size = dm_get_reserved_rq_based_ios();
				2798	front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
				2799	/* per_io_data_size is used for blk-mq pdu at queue allocation */
				2800	break;
				2801	default:
				2802	BUG();
				2803	}
				2804
				2805	pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER);
				2806	if (!pools->bs)
				2807	goto out;
				2808
				2809	if (integrity && bioset_integrity_create(pools->bs, pool_size))
				2810	goto out;
				2811
				2812	return pools;
				2813
				2814	out:
				2815	dm_free_md_mempools(pools);
				2816
				2817	return NULL;
				2818	}
				2819
				2820	void dm_free_md_mempools(struct dm_md_mempools *pools)
				2821	{
				2822	if (!pools)
				2823	return;
				2824
				2825	mempool_destroy(pools->io_pool);
				2826
				2827	if (pools->bs)
				2828	bioset_free(pools->bs);
				2829
				2830	kfree(pools);
				2831	}
				2832
				2833	struct dm_pr {
				2834	u64 old_key;
				2835	u64 new_key;
				2836	u32 flags;
				2837	bool fail_early;
				2838	};
				2839
				2840	static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
				2841	void *data)
				2842	{
				2843	struct mapped_device *md = bdev->bd_disk->private_data;
				2844	struct dm_table *table;
				2845	struct dm_target *ti;
				2846	int ret = -ENOTTY, srcu_idx;
				2847
				2848	table = dm_get_live_table(md, &srcu_idx);
				2849	if (!table \|\| !dm_table_get_size(table))
				2850	goto out;
				2851
				2852	/* We only support devices that have a single target */
				2853	if (dm_table_get_num_targets(table) != 1)
				2854	goto out;
				2855	ti = dm_table_get_target(table, 0);
				2856
				2857	ret = -EINVAL;
				2858	if (!ti->type->iterate_devices)
				2859	goto out;
				2860
				2861	ret = ti->type->iterate_devices(ti, fn, data);
				2862	out:
				2863	dm_put_live_table(md, srcu_idx);
				2864	return ret;
				2865	}
				2866
				2867	/*
				2868	* For register / unregister we need to manually call out to every path.
				2869	*/
				2870	static int __dm_pr_register(struct dm_target ti, struct dm_dev dev,
				2871	sector_t start, sector_t len, void *data)
				2872	{
				2873	struct dm_pr *pr = data;
				2874	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
				2875
				2876	if (!ops \|\| !ops->pr_register)
				2877	return -EOPNOTSUPP;
				2878	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
				2879	}
				2880
				2881	static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
				2882	u32 flags)
				2883	{
				2884	struct dm_pr pr = {
				2885	.old_key = old_key,
				2886	.new_key = new_key,
				2887	.flags = flags,
				2888	.fail_early = true,
				2889	};
				2890	int ret;
				2891
				2892	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
				2893	if (ret && new_key) {
				2894	/* unregister all paths if we failed to register any path */
				2895	pr.old_key = new_key;
				2896	pr.new_key = 0;
				2897	pr.flags = 0;
				2898	pr.fail_early = false;
				2899	dm_call_pr(bdev, __dm_pr_register, &pr);
				2900	}
				2901
				2902	return ret;
				2903	}
				2904
				2905	static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
				2906	u32 flags)
				2907	{
				2908	struct mapped_device *md = bdev->bd_disk->private_data;
				2909	const struct pr_ops *ops;
				2910	fmode_t mode;
				2911	int r;
				2912
				2913	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
				2914	if (r < 0)
				2915	return r;
				2916
				2917	ops = bdev->bd_disk->fops->pr_ops;
				2918	if (ops && ops->pr_reserve)
				2919	r = ops->pr_reserve(bdev, key, type, flags);
				2920	else
				2921	r = -EOPNOTSUPP;
				2922
				2923	bdput(bdev);
				2924	return r;
				2925	}
				2926
				2927	static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
				2928	{
				2929	struct mapped_device *md = bdev->bd_disk->private_data;
				2930	const struct pr_ops *ops;
				2931	fmode_t mode;
				2932	int r;
				2933
				2934	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
				2935	if (r < 0)
				2936	return r;
				2937
				2938	ops = bdev->bd_disk->fops->pr_ops;
				2939	if (ops && ops->pr_release)
				2940	r = ops->pr_release(bdev, key, type);
				2941	else
				2942	r = -EOPNOTSUPP;
				2943
				2944	bdput(bdev);
				2945	return r;
				2946	}
				2947
				2948	static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
				2949	enum pr_type type, bool abort)
				2950	{
				2951	struct mapped_device *md = bdev->bd_disk->private_data;
				2952	const struct pr_ops *ops;
				2953	fmode_t mode;
				2954	int r;
				2955
				2956	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
				2957	if (r < 0)
				2958	return r;
				2959
				2960	ops = bdev->bd_disk->fops->pr_ops;
				2961	if (ops && ops->pr_preempt)
				2962	r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
				2963	else
				2964	r = -EOPNOTSUPP;
				2965
				2966	bdput(bdev);
				2967	return r;
				2968	}
				2969
				2970	static int dm_pr_clear(struct block_device *bdev, u64 key)
				2971	{
				2972	struct mapped_device *md = bdev->bd_disk->private_data;
				2973	const struct pr_ops *ops;
				2974	fmode_t mode;
				2975	int r;
				2976
				2977	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
				2978	if (r < 0)
				2979	return r;
				2980
				2981	ops = bdev->bd_disk->fops->pr_ops;
				2982	if (ops && ops->pr_clear)
				2983	r = ops->pr_clear(bdev, key);
				2984	else
				2985	r = -EOPNOTSUPP;
				2986
				2987	bdput(bdev);
				2988	return r;
				2989	}
				2990
				2991	static const struct pr_ops dm_pr_ops = {
				2992	.pr_register = dm_pr_register,
				2993	.pr_reserve = dm_pr_reserve,
				2994	.pr_release = dm_pr_release,
				2995	.pr_preempt = dm_pr_preempt,
				2996	.pr_clear = dm_pr_clear,
				2997	};
				2998
				2999	static const struct block_device_operations dm_blk_dops = {
				3000	.open = dm_blk_open,
				3001	.release = dm_blk_close,
				3002	.ioctl = dm_blk_ioctl,
				3003	.getgeo = dm_blk_getgeo,
				3004	.pr_ops = &dm_pr_ops,
				3005	.owner = THIS_MODULE
				3006	};
				3007
				3008	static const struct dax_operations dm_dax_ops = {
				3009	.direct_access = dm_dax_direct_access,
				3010	.copy_from_iter = dm_dax_copy_from_iter,
				3011	};
				3012
				3013	/*
				3014	* module hooks
				3015	*/
				3016	module_init(dm_init);
				3017	module_exit(dm_exit);
				3018
				3019	module_param(major, uint, 0);
				3020	MODULE_PARM_DESC(major, "The major number of the device mapper");
				3021
				3022	module_param(reserved_bio_based_ios, uint, S_IRUGO \| S_IWUSR);
				3023	MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
				3024
				3025	module_param(dm_numa_node, int, S_IRUGO \| S_IWUSR);
				3026	MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
				3027
				3028	MODULE_DESCRIPTION(DM_NAME " driver");
				3029	MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
				3030	MODULE_LICENSE("GPL");