Blame - src/kernel/linux/v4.19/drivers/md/dm.c - T800

blob: 926eeef42608d5319616c8c346c5dcfaeb06b690 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
				3	* Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
				4	*
				5	* This file is released under the GPL.
				6	*/
				7
				8	#include "dm-core.h"
				9	#include "dm-rq.h"
				10	#include "dm-uevent.h"
				11
				12	#include <linux/init.h>
				13	#include <linux/module.h>
				14	#include <linux/mutex.h>
				15	#include <linux/sched/signal.h>
				16	#include <linux/blkpg.h>
				17	#include <linux/bio.h>
				18	#include <linux/mempool.h>
				19	#include <linux/dax.h>
				20	#include <linux/slab.h>
				21	#include <linux/idr.h>
				22	#include <linux/uio.h>
				23	#include <linux/hdreg.h>
				24	#include <linux/delay.h>
				25	#include <linux/wait.h>
				26	#include <linux/pr.h>
				27	#include <linux/refcount.h>
				28
				29	#define DM_MSG_PREFIX "core"
				30
				31	/*
				32	* Cookies are numeric values sent with CHANGE and REMOVE
				33	* uevents while resuming, removing or renaming the device.
				34	*/
				35	#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
				36	#define DM_COOKIE_LENGTH 24
				37
				38	static const char *_name = DM_NAME;
				39
				40	static unsigned int major = 0;
				41	static unsigned int _major = 0;
				42
				43	static DEFINE_IDR(_minor_idr);
				44
				45	static DEFINE_SPINLOCK(_minor_lock);
				46
				47	static void do_deferred_remove(struct work_struct *w);
				48
				49	static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
				50
				51	static struct workqueue_struct *deferred_remove_workqueue;
				52
				53	atomic_t dm_global_event_nr = ATOMIC_INIT(0);
				54	DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
				55
				56	void dm_issue_global_event(void)
				57	{
				58	atomic_inc(&dm_global_event_nr);
				59	wake_up(&dm_global_eventq);
				60	}
				61
				62	/*
				63	* One of these is allocated (on-stack) per original bio.
				64	*/
				65	struct clone_info {
				66	struct dm_table *map;
				67	struct bio *bio;
				68	struct dm_io *io;
				69	sector_t sector;
				70	unsigned sector_count;
				71	};
				72
				73	/*
				74	* One of these is allocated per clone bio.
				75	*/
				76	#define DM_TIO_MAGIC 7282014
				77	struct dm_target_io {
				78	unsigned magic;
				79	struct dm_io *io;
				80	struct dm_target *ti;
				81	unsigned target_bio_nr;
				82	unsigned *len_ptr;
				83	bool inside_dm_io;
				84	struct bio clone;
				85	};
				86
				87	/*
				88	* One of these is allocated per original bio.
				89	* It contains the first clone used for that original.
				90	*/
				91	#define DM_IO_MAGIC 5191977
				92	struct dm_io {
				93	unsigned magic;
				94	struct mapped_device *md;
				95	blk_status_t status;
				96	atomic_t io_count;
				97	struct bio *orig_bio;
				98	unsigned long start_time;
				99	spinlock_t endio_lock;
				100	struct dm_stats_aux stats_aux;
				101	/* last member of dm_target_io is 'struct bio' */
				102	struct dm_target_io tio;
				103	};
				104
				105	void dm_per_bio_data(struct bio bio, size_t data_size)
				106	{
				107	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
				108	if (!tio->inside_dm_io)
				109	return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
				110	return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
				111	}
				112	EXPORT_SYMBOL_GPL(dm_per_bio_data);
				113
				114	struct bio dm_bio_from_per_bio_data(void data, size_t data_size)
				115	{
				116	struct dm_io io = (struct dm_io )((char *)data + data_size);
				117	if (io->magic == DM_IO_MAGIC)
				118	return (struct bio )((char )io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
				119	BUG_ON(io->magic != DM_TIO_MAGIC);
				120	return (struct bio )((char )io + offsetof(struct dm_target_io, clone));
				121	}
				122	EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
				123
				124	unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
				125	{
				126	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
				127	}
				128	EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
				129
				130	#define MINOR_ALLOCED ((void *)-1)
				131
				132	/*
				133	* Bits for the md->flags field.
				134	*/
				135	#define DMF_BLOCK_IO_FOR_SUSPEND 0
				136	#define DMF_SUSPENDED 1
				137	#define DMF_FROZEN 2
				138	#define DMF_FREEING 3
				139	#define DMF_DELETING 4
				140	#define DMF_NOFLUSH_SUSPENDING 5
				141	#define DMF_DEFERRED_REMOVE 6
				142	#define DMF_SUSPENDED_INTERNALLY 7
				143
				144	#define DM_NUMA_NODE NUMA_NO_NODE
				145	static int dm_numa_node = DM_NUMA_NODE;
				146
				147	/*
				148	* For mempools pre-allocation at the table loading time.
				149	*/
				150	struct dm_md_mempools {
				151	struct bio_set bs;
				152	struct bio_set io_bs;
				153	};
				154
				155	struct table_device {
				156	struct list_head list;
				157	refcount_t count;
				158	struct dm_dev dm_dev;
				159	};
				160
				/* Slab caches created in local_init() and destroyed in local_exit(). */
				static struct kmem_cache *_rq_tio_cache;
				static struct kmem_cache *_rq_cache;

				/*
				 * Bio-based DM's mempools' reserved IOs set by the user.
				 */
				#define RESERVED_BIO_BASED_IOS 16
				static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
				169
				/*
				 * Read *@module_param and clamp it to [@min, @max].
				 *
				 * When the stored value is out of range it is replaced with the clamped
				 * value via cmpxchg(), so a concurrent writer wins, and the clamped value
				 * is returned.  An in-range value is returned untouched.
				 */
				static int __dm_get_module_param_int(int *module_param, int min, int max)
				{
					int seen = READ_ONCE(*module_param);
					int clamped;

					if (seen < min)
						clamped = min;
					else if (seen > max)
						clamped = max;
					else
						return seen;

					(void)cmpxchg(module_param, seen, clamped);
					return clamped;
				}
				190
				/*
				 * Read *@module_param, substituting @def for zero and capping at @max.
				 *
				 * A non-zero replacement is written back with cmpxchg() and returned;
				 * otherwise the value that was read is returned unchanged.
				 */
				unsigned __dm_get_module_param(unsigned *module_param,
							       unsigned def, unsigned max)
				{
					unsigned seen = READ_ONCE(*module_param);
					unsigned replacement = 0;

					if (seen == 0)
						replacement = def;
					else if (seen > max)
						replacement = max;

					/* zero means "nothing to fix" (also when @def itself is zero) */
					if (!replacement)
						return seen;

					(void)cmpxchg(module_param, seen, replacement);
					return replacement;
				}
				209
				210	unsigned dm_get_reserved_bio_based_ios(void)
				211	{
				212	return __dm_get_module_param(&reserved_bio_based_ios,
				213	RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
				214	}
				215	EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
				216
				217	static unsigned dm_get_numa_node(void)
				218	{
				219	return __dm_get_module_param_int(&dm_numa_node,
				220	DM_NUMA_NODE, num_online_nodes() - 1);
				221	}
				222
				223	static int __init local_init(void)
				224	{
				225	int r = -ENOMEM;
				226
				227	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
				228	if (!_rq_tio_cache)
				229	return r;
				230
				231	_rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
				232	__alignof__(struct request), 0, NULL);
				233	if (!_rq_cache)
				234	goto out_free_rq_tio_cache;
				235
				236	r = dm_uevent_init();
				237	if (r)
				238	goto out_free_rq_cache;
				239
				240	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
				241	if (!deferred_remove_workqueue) {
				242	r = -ENOMEM;
				243	goto out_uevent_exit;
				244	}
				245
				246	_major = major;
				247	r = register_blkdev(_major, _name);
				248	if (r < 0)
				249	goto out_free_workqueue;
				250
				251	if (!_major)
				252	_major = r;
				253
				254	return 0;
				255
				256	out_free_workqueue:
				257	destroy_workqueue(deferred_remove_workqueue);
				258	out_uevent_exit:
				259	dm_uevent_exit();
				260	out_free_rq_cache:
				261	kmem_cache_destroy(_rq_cache);
				262	out_free_rq_tio_cache:
				263	kmem_cache_destroy(_rq_tio_cache);
				264
				265	return r;
				266	}
				267
				268	static void local_exit(void)
				269	{
				270	flush_scheduled_work();
				271	destroy_workqueue(deferred_remove_workqueue);
				272
				273	kmem_cache_destroy(_rq_cache);
				274	kmem_cache_destroy(_rq_tio_cache);
				275	unregister_blkdev(_major, _name);
				276	dm_uevent_exit();
				277
				278	_major = 0;
				279
				280	DMINFO("cleaned up");
				281	}
				282
				283	static int (*_inits[])(void) __initdata = {
				284	local_init,
				285	dm_target_init,
				286	dm_linear_init,
				287	dm_stripe_init,
				288	dm_io_init,
				289	dm_kcopyd_init,
				290	dm_interface_init,
				291	dm_statistics_init,
				292	};
				293
				294	static void (*_exits[])(void) = {
				295	local_exit,
				296	dm_target_exit,
				297	dm_linear_exit,
				298	dm_stripe_exit,
				299	dm_io_exit,
				300	dm_kcopyd_exit,
				301	dm_interface_exit,
				302	dm_statistics_exit,
				303	};
				304
				305	static int __init dm_init(void)
				306	{
				307	const int count = ARRAY_SIZE(_inits);
				308
				309	int r, i;
				310
				311	for (i = 0; i < count; i++) {
				312	r = _inits[i]();
				313	if (r)
				314	goto bad;
				315	}
				316
				317	return 0;
				318
				319	bad:
				320	while (i--)
				321	_exits[i]();
				322
				323	return r;
				324	}
				325
				326	static void __exit dm_exit(void)
				327	{
				328	int i = ARRAY_SIZE(_exits);
				329
				330	while (i--)
				331	_exits[i]();
				332
				333	/*
				334	* Should be empty by this point.
				335	*/
				336	idr_destroy(&_minor_idr);
				337	}
				338
				339	/*
				340	* Block device functions
				341	*/
				342	int dm_deleting_md(struct mapped_device *md)
				343	{
				344	return test_bit(DMF_DELETING, &md->flags);
				345	}
				346
				347	static int dm_blk_open(struct block_device *bdev, fmode_t mode)
				348	{
				349	struct mapped_device *md;
				350
				351	spin_lock(&_minor_lock);
				352
				353	md = bdev->bd_disk->private_data;
				354	if (!md)
				355	goto out;
				356
				357	if (test_bit(DMF_FREEING, &md->flags) \|\|
				358	dm_deleting_md(md)) {
				359	md = NULL;
				360	goto out;
				361	}
				362
				363	dm_get(md);
				364	atomic_inc(&md->open_count);
				365	out:
				366	spin_unlock(&_minor_lock);
				367
				368	return md ? 0 : -ENXIO;
				369	}
				370
				371	static void dm_blk_close(struct gendisk *disk, fmode_t mode)
				372	{
				373	struct mapped_device *md;
				374
				375	spin_lock(&_minor_lock);
				376
				377	md = disk->private_data;
				378	if (WARN_ON(!md))
				379	goto out;
				380
				381	if (atomic_dec_and_test(&md->open_count) &&
				382	(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
				383	queue_work(deferred_remove_workqueue, &deferred_remove_work);
				384
				385	dm_put(md);
				386	out:
				387	spin_unlock(&_minor_lock);
				388	}
				389
				390	int dm_open_count(struct mapped_device *md)
				391	{
				392	return atomic_read(&md->open_count);
				393	}
				394
				395	/*
				396	* Guarantees nothing is using the device before it's deleted.
				397	*/
				398	int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
				399	{
				400	int r = 0;
				401
				402	spin_lock(&_minor_lock);
				403
				404	if (dm_open_count(md)) {
				405	r = -EBUSY;
				406	if (mark_deferred)
				407	set_bit(DMF_DEFERRED_REMOVE, &md->flags);
				408	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
				409	r = -EEXIST;
				410	else
				411	set_bit(DMF_DELETING, &md->flags);
				412
				413	spin_unlock(&_minor_lock);
				414
				415	return r;
				416	}
				417
				418	int dm_cancel_deferred_remove(struct mapped_device *md)
				419	{
				420	int r = 0;
				421
				422	spin_lock(&_minor_lock);
				423
				424	if (test_bit(DMF_DELETING, &md->flags))
				425	r = -EBUSY;
				426	else
				427	clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
				428
				429	spin_unlock(&_minor_lock);
				430
				431	return r;
				432	}
				433
				/* Work handler for deferred_remove_work; @w is unused. */
				static void do_deferred_remove(struct work_struct *w)
				{
					dm_deferred_remove();
				}
				438
				439	sector_t dm_get_size(struct mapped_device *md)
				440	{
				441	return get_capacity(md->disk);
				442	}
				443
				444	struct request_queue dm_get_md_queue(struct mapped_device md)
				445	{
				446	return md->queue;
				447	}
				448
				449	struct dm_stats dm_get_stats(struct mapped_device md)
				450	{
				451	return &md->stats;
				452	}
				453
				454	static int dm_blk_getgeo(struct block_device bdev, struct hd_geometry geo)
				455	{
				456	struct mapped_device *md = bdev->bd_disk->private_data;
				457
				458	return dm_get_geometry(md, geo);
				459	}
				460
				461	static int dm_prepare_ioctl(struct mapped_device md, int srcu_idx,
				462	struct block_device **bdev)
				463	__acquires(md->io_barrier)
				464	{
				465	struct dm_target *tgt;
				466	struct dm_table *map;
				467	int r;
				468
				469	retry:
				470	r = -ENOTTY;
				471	map = dm_get_live_table(md, srcu_idx);
				472	if (!map \|\| !dm_table_get_size(map))
				473	return r;
				474
				475	/* We only support devices that have a single target */
				476	if (dm_table_get_num_targets(map) != 1)
				477	return r;
				478
				479	tgt = dm_table_get_target(map, 0);
				480	if (!tgt->type->prepare_ioctl)
				481	return r;
				482
				483	if (dm_suspended_md(md))
				484	return -EAGAIN;
				485
				486	r = tgt->type->prepare_ioctl(tgt, bdev);
				487	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
				488	dm_put_live_table(md, *srcu_idx);
				489	msleep(10);
				490	goto retry;
				491	}
				492
				493	return r;
				494	}
				495
				496	static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
				497	__releases(md->io_barrier)
				498	{
				499	dm_put_live_table(md, srcu_idx);
				500	}
				501
				502	static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
				503	unsigned int cmd, unsigned long arg)
				504	{
				505	struct mapped_device *md = bdev->bd_disk->private_data;
				506	int r, srcu_idx;
				507
				508	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
				509	if (r < 0)
				510	goto out;
				511
				512	if (r > 0) {
				513	/*
				514	* Target determined this ioctl is being issued against a
				515	* subset of the parent bdev; require extra privileges.
				516	*/
				517	if (!capable(CAP_SYS_RAWIO)) {
				518	DMWARN_LIMIT(
				519	"%s: sending ioctl %x to DM device without required privilege.",
				520	current->comm, cmd);
				521	r = -ENOIOCTLCMD;
				522	goto out;
				523	}
				524	}
				525
				526	r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
				527	out:
				528	dm_unprepare_ioctl(md, srcu_idx);
				529	return r;
				530	}
				531
				532	static void start_io_acct(struct dm_io *io);
				533
				534	static struct dm_io alloc_io(struct mapped_device md, struct bio *bio)
				535	{
				536	struct dm_io *io;
				537	struct dm_target_io *tio;
				538	struct bio *clone;
				539
				540	clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
				541	if (!clone)
				542	return NULL;
				543
				544	tio = container_of(clone, struct dm_target_io, clone);
				545	tio->inside_dm_io = true;
				546	tio->io = NULL;
				547
				548	io = container_of(tio, struct dm_io, tio);
				549	io->magic = DM_IO_MAGIC;
				550	io->status = 0;
				551	atomic_set(&io->io_count, 1);
				552	io->orig_bio = bio;
				553	io->md = md;
				554	spin_lock_init(&io->endio_lock);
				555
				556	start_io_acct(io);
				557
				558	return io;
				559	}
				560
				561	static void free_io(struct mapped_device md, struct dm_io io)
				562	{
				563	bio_put(&io->tio.clone);
				564	}
				565
				566	static struct dm_target_io alloc_tio(struct clone_info ci, struct dm_target *ti,
				567	unsigned target_bio_nr, gfp_t gfp_mask)
				568	{
				569	struct dm_target_io *tio;
				570
				571	if (!ci->io->tio.io) {
				572	/* the dm_target_io embedded in ci->io is available */
				573	tio = &ci->io->tio;
				574	} else {
				575	struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
				576	if (!clone)
				577	return NULL;
				578
				579	tio = container_of(clone, struct dm_target_io, clone);
				580	tio->inside_dm_io = false;
				581	}
				582
				583	tio->magic = DM_TIO_MAGIC;
				584	tio->io = ci->io;
				585	tio->ti = ti;
				586	tio->target_bio_nr = target_bio_nr;
				587
				588	return tio;
				589	}
				590
				591	static void free_tio(struct dm_target_io *tio)
				592	{
				593	if (tio->inside_dm_io)
				594	return;
				595	bio_put(&tio->clone);
				596	}
				597
				598	int md_in_flight(struct mapped_device *md)
				599	{
				600	return atomic_read(&md->pending[READ]) +
				601	atomic_read(&md->pending[WRITE]);
				602	}
				603
				604	static void start_io_acct(struct dm_io *io)
				605	{
				606	struct mapped_device *md = io->md;
				607	struct bio *bio = io->orig_bio;
				608	int rw = bio_data_dir(bio);
				609
				610	io->start_time = jiffies;
				611
				612	generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
				613	&dm_disk(md)->part0);
				614
				615	atomic_set(&dm_disk(md)->part0.in_flight[rw],
				616	atomic_inc_return(&md->pending[rw]));
				617
				618	if (unlikely(dm_stats_used(&md->stats)))
				619	dm_stats_account_io(&md->stats, bio_data_dir(bio),
				620	bio->bi_iter.bi_sector, bio_sectors(bio),
				621	false, 0, &io->stats_aux);
				622	}
				623
				624	static void end_io_acct(struct dm_io *io)
				625	{
				626	struct mapped_device *md = io->md;
				627	struct bio *bio = io->orig_bio;
				628	unsigned long duration = jiffies - io->start_time;
				629	int pending;
				630	int rw = bio_data_dir(bio);
				631
				632	generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
				633	io->start_time);
				634
				635	if (unlikely(dm_stats_used(&md->stats)))
				636	dm_stats_account_io(&md->stats, bio_data_dir(bio),
				637	bio->bi_iter.bi_sector, bio_sectors(bio),
				638	true, duration, &io->stats_aux);
				639
				640	/*
				641	* After this is decremented the bio must not be touched if it is
				642	* a flush.
				643	*/
				644	pending = atomic_dec_return(&md->pending[rw]);
				645	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
				646	pending += atomic_read(&md->pending[rw^0x1]);
				647
				648	/* nudge anyone waiting on suspend queue */
				649	if (!pending)
				650	wake_up(&md->wait);
				651	}
				652
				653	/*
				654	* Add the bio to the list of deferred io.
				655	*/
				656	static void queue_io(struct mapped_device md, struct bio bio)
				657	{
				658	unsigned long flags;
				659
				660	spin_lock_irqsave(&md->deferred_lock, flags);
				661	bio_list_add(&md->deferred, bio);
				662	spin_unlock_irqrestore(&md->deferred_lock, flags);
				663	queue_work(md->wq, &md->work);
				664	}
				665
				666	/*
				667	* Everyone (including functions in this file), should use this
				668	* function to access the md->map field, and make sure they call
				669	* dm_put_live_table() when finished.
				670	*/
				671	struct dm_table dm_get_live_table(struct mapped_device md, int *srcu_idx) __acquires(md->io_barrier)
				672	{
				673	*srcu_idx = srcu_read_lock(&md->io_barrier);
				674
				675	return srcu_dereference(md->map, &md->io_barrier);
				676	}
				677
				678	void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
				679	{
				680	srcu_read_unlock(&md->io_barrier, srcu_idx);
				681	}
				682
				683	void dm_sync_table(struct mapped_device *md)
				684	{
				685	synchronize_srcu(&md->io_barrier);
				686	synchronize_rcu_expedited();
				687	}
				688
				689	/*
				690	* A fast alternative to dm_get_live_table/dm_put_live_table.
				691	* The caller must not block between these two functions.
				692	*/
				693	static struct dm_table dm_get_live_table_fast(struct mapped_device md) __acquires(RCU)
				694	{
				695	rcu_read_lock();
				696	return rcu_dereference(md->map);
				697	}
				698
				699	static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
				700	{
				701	rcu_read_unlock();
				702	}
				703
				704	static char *_dm_claim_ptr = "I belong to device-mapper";
				705
				706	/*
				707	* Open a table device so we can use it as a map destination.
				708	*/
				709	static int open_table_device(struct table_device *td, dev_t dev,
				710	struct mapped_device *md)
				711	{
				712	struct block_device *bdev;
				713
				714	int r;
				715
				716	BUG_ON(td->dm_dev.bdev);
				717
				718	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode \| FMODE_EXCL, _dm_claim_ptr);
				719	if (IS_ERR(bdev))
				720	return PTR_ERR(bdev);
				721
				722	r = bd_link_disk_holder(bdev, dm_disk(md));
				723	if (r) {
				724	blkdev_put(bdev, td->dm_dev.mode \| FMODE_EXCL);
				725	return r;
				726	}
				727
				728	td->dm_dev.bdev = bdev;
				729	td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
				730	return 0;
				731	}
				732
				733	/*
				734	* Close a table device that we've been using.
				735	*/
				736	static void close_table_device(struct table_device td, struct mapped_device md)
				737	{
				738	if (!td->dm_dev.bdev)
				739	return;
				740
				741	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
				742	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode \| FMODE_EXCL);
				743	put_dax(td->dm_dev.dax_dev);
				744	td->dm_dev.bdev = NULL;
				745	td->dm_dev.dax_dev = NULL;
				746	}
				747
				748	static struct table_device find_table_device(struct list_head l, dev_t dev,
				749	fmode_t mode) {
				750	struct table_device *td;
				751
				752	list_for_each_entry(td, l, list)
				753	if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
				754	return td;
				755
				756	return NULL;
				757	}
				758
				759	int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
				760	struct dm_dev **result) {
				761	int r;
				762	struct table_device *td;
				763
				764	mutex_lock(&md->table_devices_lock);
				765	td = find_table_device(&md->table_devices, dev, mode);
				766	if (!td) {
				767	td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
				768	if (!td) {
				769	mutex_unlock(&md->table_devices_lock);
				770	return -ENOMEM;
				771	}
				772
				773	td->dm_dev.mode = mode;
				774	td->dm_dev.bdev = NULL;
				775
				776	if ((r = open_table_device(td, dev, md))) {
				777	mutex_unlock(&md->table_devices_lock);
				778	kfree(td);
				779	return r;
				780	}
				781
				782	format_dev_t(td->dm_dev.name, dev);
				783
				784	refcount_set(&td->count, 1);
				785	list_add(&td->list, &md->table_devices);
				786	} else {
				787	refcount_inc(&td->count);
				788	}
				789	mutex_unlock(&md->table_devices_lock);
				790
				791	*result = &td->dm_dev;
				792	return 0;
				793	}
				794	EXPORT_SYMBOL_GPL(dm_get_table_device);
				795
				796	void dm_put_table_device(struct mapped_device md, struct dm_dev d)
				797	{
				798	struct table_device *td = container_of(d, struct table_device, dm_dev);
				799
				800	mutex_lock(&md->table_devices_lock);
				801	if (refcount_dec_and_test(&td->count)) {
				802	close_table_device(td, md);
				803	list_del(&td->list);
				804	kfree(td);
				805	}
				806	mutex_unlock(&md->table_devices_lock);
				807	}
				808	EXPORT_SYMBOL(dm_put_table_device);
				809
				810	static void free_table_devices(struct list_head *devices)
				811	{
				812	struct list_head tmp, next;
				813
				814	list_for_each_safe(tmp, next, devices) {
				815	struct table_device *td = list_entry(tmp, struct table_device, list);
				816
				817	DMWARN("dm_destroy: %s still exists with %d references",
				818	td->dm_dev.name, refcount_read(&td->count));
				819	kfree(td);
				820	}
				821	}
				822
				823	/*
				824	* Get the geometry associated with a dm device
				825	*/
				826	int dm_get_geometry(struct mapped_device md, struct hd_geometry geo)
				827	{
				828	*geo = md->geometry;
				829
				830	return 0;
				831	}
				832
				833	/*
				834	* Set the geometry of a device.
				835	*/
				836	int dm_set_geometry(struct mapped_device md, struct hd_geometry geo)
				837	{
				838	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
				839
				840	if (geo->start > sz) {
				841	DMWARN("Start sector is beyond the geometry limits.");
				842	return -EINVAL;
				843	}
				844
				845	md->geometry = *geo;
				846
				847	return 0;
				848	}
				849
				850	static int __noflush_suspending(struct mapped_device *md)
				851	{
				852	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
				853	}
				854
				855	/*
				856	* Decrements the number of outstanding ios that a bio has been
				857	* cloned into, completing the original io if necc.
				858	*/
				859	static void dec_pending(struct dm_io *io, blk_status_t error)
				860	{
				861	unsigned long flags;
				862	blk_status_t io_error;
				863	struct bio *bio;
				864	struct mapped_device *md = io->md;
				865
				866	/* Push-back supersedes any I/O errors */
				867	if (unlikely(error)) {
				868	spin_lock_irqsave(&io->endio_lock, flags);
				869	if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
				870	io->status = error;
				871	spin_unlock_irqrestore(&io->endio_lock, flags);
				872	}
				873
				874	if (atomic_dec_and_test(&io->io_count)) {
				875	if (io->status == BLK_STS_DM_REQUEUE) {
				876	/*
				877	* Target requested pushing back the I/O.
				878	*/
				879	spin_lock_irqsave(&md->deferred_lock, flags);
				880	if (__noflush_suspending(md))
				881	/* NOTE early return due to BLK_STS_DM_REQUEUE below */
				882	bio_list_add_head(&md->deferred, io->orig_bio);
				883	else
				884	/* noflush suspend was interrupted. */
				885	io->status = BLK_STS_IOERR;
				886	spin_unlock_irqrestore(&md->deferred_lock, flags);
				887	}
				888
				889	io_error = io->status;
				890	bio = io->orig_bio;
				891	end_io_acct(io);
				892	free_io(md, io);
				893
				894	if (io_error == BLK_STS_DM_REQUEUE)
				895	return;
				896
				897	if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
				898	/*
				899	* Preflush done for flush with data, reissue
				900	* without REQ_PREFLUSH.
				901	*/
				902	bio->bi_opf &= ~REQ_PREFLUSH;
				903	queue_io(md, bio);
				904	} else {
				905	/* done with normal IO or empty flush */
				906	if (io_error)
				907	bio->bi_status = io_error;
				908	bio_endio(bio);
				909	}
				910	}
				911	}
				912
				913	void disable_discard(struct mapped_device *md)
				914	{
				915	struct queue_limits *limits = dm_get_queue_limits(md);
				916
				917	/* device doesn't really support DISCARD, disable it */
				918	limits->max_discard_sectors = 0;
				919	blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
				920	}
				921
				922	void disable_write_same(struct mapped_device *md)
				923	{
				924	struct queue_limits *limits = dm_get_queue_limits(md);
				925
				926	/* device doesn't really support WRITE SAME, disable it */
				927	limits->max_write_same_sectors = 0;
				928	}
				929
				930	void disable_write_zeroes(struct mapped_device *md)
				931	{
				932	struct queue_limits *limits = dm_get_queue_limits(md);
				933
				934	/* device doesn't really support WRITE ZEROES, disable it */
				935	limits->max_write_zeroes_sectors = 0;
				936	}
				937
				938	static void clone_endio(struct bio *bio)
				939	{
				940	blk_status_t error = bio->bi_status;
				941	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
				942	struct dm_io *io = tio->io;
				943	struct mapped_device *md = tio->io->md;
				944	dm_endio_fn endio = tio->ti->type->end_io;
				945
				946	if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
				947	if (bio_op(bio) == REQ_OP_DISCARD &&
				948	!bio->bi_disk->queue->limits.max_discard_sectors)
				949	disable_discard(md);
				950	else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
				951	!bio->bi_disk->queue->limits.max_write_same_sectors)
				952	disable_write_same(md);
				953	else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
				954	!bio->bi_disk->queue->limits.max_write_zeroes_sectors)
				955	disable_write_zeroes(md);
				956	}
				957
				958	if (endio) {
				959	int r = endio(tio->ti, bio, &error);
				960	switch (r) {
				961	case DM_ENDIO_REQUEUE:
				962	error = BLK_STS_DM_REQUEUE;
				963	/FALLTHRU/
				964	case DM_ENDIO_DONE:
				965	break;
				966	case DM_ENDIO_INCOMPLETE:
				967	/* The target will handle the io */
				968	return;
				969	default:
				970	DMWARN("unimplemented target endio return value: %d", r);
				971	BUG();
				972	}
				973	}
				974
				975	free_tio(tio);
				976	dec_pending(io, error);
				977	}
				978
				979	/*
				980	* Return maximum size of I/O possible at the supplied sector up to the current
				981	* target boundary.
				982	*/
				983	static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
				984	{
				985	sector_t target_offset = dm_target_offset(ti, sector);
				986
				987	return ti->len - target_offset;
				988	}
				989
				990	static sector_t max_io_len(sector_t sector, struct dm_target *ti)
				991	{
				992	sector_t len = max_io_len_target_boundary(sector, ti);
				993	sector_t offset, max_len;
				994
				995	/*
				996	* Does the target need to split even further?
				997	*/
				998	if (ti->max_io_len) {
				999	offset = dm_target_offset(ti, sector);
				1000	if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
				1001	max_len = sector_div(offset, ti->max_io_len);
				1002	else
				1003	max_len = offset & (ti->max_io_len - 1);
				1004	max_len = ti->max_io_len - max_len;
				1005
				1006	if (len > max_len)
				1007	len = max_len;
				1008	}
				1009
				1010	return len;
				1011	}
				1012
				1013	int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
				1014	{
				1015	if (len > UINT_MAX) {
				1016	DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
				1017	(unsigned long long)len, UINT_MAX);
				1018	ti->error = "Maximum size of target IO is too large";
				1019	return -EINVAL;
				1020	}
				1021
				1022	ti->max_io_len = (uint32_t) len;
				1023
				1024	return 0;
				1025	}
				1026	EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
				1027
				1028	static struct dm_target dm_dax_get_live_target(struct mapped_device md,
				1029	sector_t sector, int *srcu_idx)
				1030	__acquires(md->io_barrier)
				1031	{
				1032	struct dm_table *map;
				1033	struct dm_target *ti;
				1034
				1035	map = dm_get_live_table(md, srcu_idx);
				1036	if (!map)
				1037	return NULL;
				1038
				1039	ti = dm_table_find_target(map, sector);
				1040	if (!dm_target_is_valid(ti))
				1041	return NULL;
				1042
				1043	return ti;
				1044	}
				1045
				1046	static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
				1047	long nr_pages, void *kaddr, pfn_t pfn)
				1048	{
				1049	struct mapped_device *md = dax_get_private(dax_dev);
				1050	sector_t sector = pgoff * PAGE_SECTORS;
				1051	struct dm_target *ti;
				1052	long len, ret = -EIO;
				1053	int srcu_idx;
				1054
				1055	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
				1056
				1057	if (!ti)
				1058	goto out;
				1059	if (!ti->type->direct_access)
				1060	goto out;
				1061	len = max_io_len(sector, ti) / PAGE_SECTORS;
				1062	if (len < 1)
				1063	goto out;
				1064	nr_pages = min(len, nr_pages);
				1065	ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
				1066
				1067	out:
				1068	dm_put_live_table(md, srcu_idx);
				1069
				1070	return ret;
				1071	}
				1072
				1073	static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
				1074	void addr, size_t bytes, struct iov_iter i)
				1075	{
				1076	struct mapped_device *md = dax_get_private(dax_dev);
				1077	sector_t sector = pgoff * PAGE_SECTORS;
				1078	struct dm_target *ti;
				1079	long ret = 0;
				1080	int srcu_idx;
				1081
				1082	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
				1083
				1084	if (!ti)
				1085	goto out;
				1086	if (!ti->type->dax_copy_from_iter) {
				1087	ret = copy_from_iter(addr, bytes, i);
				1088	goto out;
				1089	}
				1090	ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
				1091	out:
				1092	dm_put_live_table(md, srcu_idx);
				1093
				1094	return ret;
				1095	}
				1096
				1097	static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
				1098	void addr, size_t bytes, struct iov_iter i)
				1099	{
				1100	struct mapped_device *md = dax_get_private(dax_dev);
				1101	sector_t sector = pgoff * PAGE_SECTORS;
				1102	struct dm_target *ti;
				1103	long ret = 0;
				1104	int srcu_idx;
				1105
				1106	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
				1107
				1108	if (!ti)
				1109	goto out;
				1110	if (!ti->type->dax_copy_to_iter) {
				1111	ret = copy_to_iter(addr, bytes, i);
				1112	goto out;
				1113	}
				1114	ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
				1115	out:
				1116	dm_put_live_table(md, srcu_idx);
				1117
				1118	return ret;
				1119	}
				1120
				1121	/*
				1122	* A target may call dm_accept_partial_bio only from the map routine. It is
				1123	* allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
				1124	*
				1125	* dm_accept_partial_bio informs the dm that the target only wants to process
				1126	* additional n_sectors sectors of the bio and the rest of the data should be
				1127	* sent in a next bio.
				1128	*
				1129	* A diagram that explains the arithmetics:
				1130	* +--------------------+---------------+-------+
				1131	* |         1          |       2       |   3   |
				1132	* +--------------------+---------------+-------+
				1133	*
				1134	* <-------------- *tio->len_ptr --------------->
				1135	*                      <------- bi_size ------->
				1136	*                      <-- n_sectors -->
				1137	*
				1138	* Region 1 was already iterated over with bio_advance or similar function.
				1139	* (it may be empty if the target doesn't use bio_advance)
				1140	* Region 2 is the remaining bio size that the target wants to process.
				1141	* (it may be empty if region 1 is non-empty, although there is no reason
				1142	* to make it empty)
				1143	* The target requires that region 3 is to be sent in the next bio.
				1144	*
				1145	* If the target wants to receive multiple copies of the bio (via num_*bios, etc),
				1146	* the partially processed part (the sum of regions 1+2) must be the same for all
				1147	* copies of the bio.
				1148	*/
				1149	void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
				1150	{
				1151	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
				1152	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
				1153	BUG_ON(bio->bi_opf & REQ_PREFLUSH);
				1154	BUG_ON(bi_size > *tio->len_ptr);
				1155	BUG_ON(n_sectors > bi_size);
				1156	*tio->len_ptr -= bi_size - n_sectors;
				1157	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
				1158	}
				1159	EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
				1160
				1161	/*
				1162	* The zone descriptors obtained with a zone report indicate zone positions
				1163	* within the target backing device, regardless of that device is a partition
				1164	* and regardless of the target mapping start sector on the device or partition.
				1165	* The zone descriptors start sector and write pointer position must be adjusted
				1166	* to match their relative position within the dm device.
				1167	* A target may call dm_remap_zone_report() after completion of a
				1168	* REQ_OP_ZONE_REPORT bio to remap the zone descriptors obtained from the
				1169	* backing device.
				1170	*/
				1171	void dm_remap_zone_report(struct dm_target ti, struct bio bio, sector_t start)
				1172	{
				1173	#ifdef CONFIG_BLK_DEV_ZONED
				1174	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
				1175	struct bio *report_bio = tio->io->orig_bio;
				1176	struct blk_zone_report_hdr *hdr = NULL;
				1177	struct blk_zone *zone;
				1178	unsigned int nr_rep = 0;
				1179	unsigned int ofst;
				1180	sector_t part_offset;
				1181	struct bio_vec bvec;
				1182	struct bvec_iter iter;
				1183	void *addr;
				1184
				1185	if (bio->bi_status)
				1186	return;
				1187
				1188	/*
				1189	* bio sector was incremented by the request size on completion. Taking
				1190	* into account the original request sector, the target start offset on
				1191	* the backing device and the target mapping offset (ti->begin), the
				1192	* start sector of the backing device. The partition offset is always 0
				1193	* if the target uses a whole device.
				1194	*/
				1195	part_offset = bio->bi_iter.bi_sector + ti->begin - (start + bio_end_sector(report_bio));
				1196
				1197	/*
				1198	* Remap the start sector of the reported zones. For sequential zones,
				1199	* also remap the write pointer position.
				1200	*/
				1201	bio_for_each_segment(bvec, report_bio, iter) {
				1202	addr = kmap_atomic(bvec.bv_page);
				1203
				1204	/* Remember the report header in the first page */
				1205	if (!hdr) {
				1206	hdr = addr;
				1207	ofst = sizeof(struct blk_zone_report_hdr);
				1208	} else
				1209	ofst = 0;
				1210
				1211	/* Set zones start sector */
				1212	while (hdr->nr_zones && ofst < bvec.bv_len) {
				1213	zone = addr + ofst;
				1214	zone->start -= part_offset;
				1215	if (zone->start >= start + ti->len) {
				1216	hdr->nr_zones = 0;
				1217	break;
				1218	}
				1219	zone->start = zone->start + ti->begin - start;
				1220	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
				1221	if (zone->cond == BLK_ZONE_COND_FULL)
				1222	zone->wp = zone->start + zone->len;
				1223	else if (zone->cond == BLK_ZONE_COND_EMPTY)
				1224	zone->wp = zone->start;
				1225	else
				1226	zone->wp = zone->wp + ti->begin - start - part_offset;
				1227	}
				1228	ofst += sizeof(struct blk_zone);
				1229	hdr->nr_zones--;
				1230	nr_rep++;
				1231	}
				1232
				1233	if (addr != hdr)
				1234	kunmap_atomic(addr);
				1235
				1236	if (!hdr->nr_zones)
				1237	break;
				1238	}
				1239
				1240	if (hdr) {
				1241	hdr->nr_zones = nr_rep;
				1242	kunmap_atomic(hdr);
				1243	}
				1244
				1245	bio_advance(report_bio, report_bio->bi_iter.bi_size);
				1246
				1247	#else /* !CONFIG_BLK_DEV_ZONED */
				1248	bio->bi_status = BLK_STS_NOTSUPP;
				1249	#endif
				1250	}
				1251	EXPORT_SYMBOL_GPL(dm_remap_zone_report);
				1252
				1253	static blk_qc_t __map_bio(struct dm_target_io *tio)
				1254	{
				1255	int r;
				1256	sector_t sector;
				1257	struct bio *clone = &tio->clone;
				1258	struct dm_io *io = tio->io;
				1259	struct mapped_device *md = io->md;
				1260	struct dm_target *ti = tio->ti;
				1261	blk_qc_t ret = BLK_QC_T_NONE;
				1262
				1263	clone->bi_end_io = clone_endio;
				1264
				1265	/*
				1266	* Map the clone. If r == 0 we don't need to do
				1267	* anything, the target has assumed ownership of
				1268	* this io.
				1269	*/
				1270	atomic_inc(&io->io_count);
				1271	sector = clone->bi_iter.bi_sector;
				1272
				1273	r = ti->type->map(ti, clone);
				1274	switch (r) {
				1275	case DM_MAPIO_SUBMITTED:
				1276	break;
				1277	case DM_MAPIO_REMAPPED:
				1278	/* the bio has been remapped so dispatch it */
				1279	trace_block_bio_remap(clone->bi_disk->queue, clone,
				1280	bio_dev(io->orig_bio), sector);
				1281	if (md->type == DM_TYPE_NVME_BIO_BASED)
				1282	ret = direct_make_request(clone);
				1283	else
				1284	ret = generic_make_request(clone);
				1285	break;
				1286	case DM_MAPIO_KILL:
				1287	free_tio(tio);
				1288	dec_pending(io, BLK_STS_IOERR);
				1289	break;
				1290	case DM_MAPIO_REQUEUE:
				1291	free_tio(tio);
				1292	dec_pending(io, BLK_STS_DM_REQUEUE);
				1293	break;
				1294	default:
				1295	DMWARN("unimplemented target map return value: %d", r);
				1296	BUG();
				1297	}
				1298
				1299	return ret;
				1300	}
				1301
				1302	static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
				1303	{
				1304	bio->bi_iter.bi_sector = sector;
				1305	bio->bi_iter.bi_size = to_bytes(len);
				1306	}
				1307
				1308	/*
				1309	* Creates a bio that consists of range of complete bvecs.
				1310	*/
				1311	static int clone_bio(struct dm_target_io tio, struct bio bio,
				1312	sector_t sector, unsigned len)
				1313	{
				1314	struct bio *clone = &tio->clone;
				1315
				1316	__bio_clone_fast(clone, bio);
				1317
				1318	bio_crypt_clone(clone, bio, GFP_NOIO);
				1319
				1320	if (unlikely(bio_integrity(bio) != NULL)) {
				1321	int r;
				1322	if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
				1323	!dm_target_passes_integrity(tio->ti->type))) {
				1324	DMWARN("%s: the target %s doesn't support integrity data.",
				1325	dm_device_name(tio->io->md),
				1326	tio->ti->type->name);
				1327	return -EIO;
				1328	}
				1329
				1330	r = bio_integrity_clone(clone, bio, GFP_NOIO);
				1331	if (r < 0)
				1332	return r;
				1333	}
				1334
				1335	if (bio_op(bio) != REQ_OP_ZONE_REPORT)
				1336	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
				1337	clone->bi_iter.bi_size = to_bytes(len);
				1338
				1339	if (unlikely(bio_integrity(bio) != NULL))
				1340	bio_integrity_trim(clone);
				1341
				1342	return 0;
				1343	}
				1344
				1345	static void alloc_multiple_bios(struct bio_list blist, struct clone_info ci,
				1346	struct dm_target *ti, unsigned num_bios)
				1347	{
				1348	struct dm_target_io *tio;
				1349	int try;
				1350
				1351	if (!num_bios)
				1352	return;
				1353
				1354	if (num_bios == 1) {
				1355	tio = alloc_tio(ci, ti, 0, GFP_NOIO);
				1356	bio_list_add(blist, &tio->clone);
				1357	return;
				1358	}
				1359
				1360	for (try = 0; try < 2; try++) {
				1361	int bio_nr;
				1362	struct bio *bio;
				1363
				1364	if (try)
				1365	mutex_lock(&ci->io->md->table_devices_lock);
				1366	for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
				1367	tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
				1368	if (!tio)
				1369	break;
				1370
				1371	bio_list_add(blist, &tio->clone);
				1372	}
				1373	if (try)
				1374	mutex_unlock(&ci->io->md->table_devices_lock);
				1375	if (bio_nr == num_bios)
				1376	return;
				1377
				1378	while ((bio = bio_list_pop(blist))) {
				1379	tio = container_of(bio, struct dm_target_io, clone);
				1380	free_tio(tio);
				1381	}
				1382	}
				1383	}
				1384
				1385	static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
				1386	struct dm_target_io tio, unsigned len)
				1387	{
				1388	struct bio *clone = &tio->clone;
				1389
				1390	tio->len_ptr = len;
				1391
				1392	__bio_clone_fast(clone, ci->bio);
				1393	if (len)
				1394	bio_setup_sector(clone, ci->sector, *len);
				1395
				1396	return __map_bio(tio);
				1397	}
				1398
				1399	static void __send_duplicate_bios(struct clone_info ci, struct dm_target ti,
				1400	unsigned num_bios, unsigned *len)
				1401	{
				1402	struct bio_list blist = BIO_EMPTY_LIST;
				1403	struct bio *bio;
				1404	struct dm_target_io *tio;
				1405
				1406	alloc_multiple_bios(&blist, ci, ti, num_bios);
				1407
				1408	while ((bio = bio_list_pop(&blist))) {
				1409	tio = container_of(bio, struct dm_target_io, clone);
				1410	(void) __clone_and_map_simple_bio(ci, tio, len);
				1411	}
				1412	}
				1413
				1414	static int __send_empty_flush(struct clone_info *ci)
				1415	{
				1416	unsigned target_nr = 0;
				1417	struct dm_target *ti;
				1418
				1419	BUG_ON(bio_has_data(ci->bio));
				1420	while ((ti = dm_table_get_target(ci->map, target_nr++)))
				1421	__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
				1422
				1423	return 0;
				1424	}
				1425
				1426	static int __clone_and_map_data_bio(struct clone_info ci, struct dm_target ti,
				1427	sector_t sector, unsigned *len)
				1428	{
				1429	struct bio *bio = ci->bio;
				1430	struct dm_target_io *tio;
				1431	int r;
				1432
				1433	tio = alloc_tio(ci, ti, 0, GFP_NOIO);
				1434	tio->len_ptr = len;
				1435	r = clone_bio(tio, bio, sector, *len);
				1436	if (r < 0) {
				1437	free_tio(tio);
				1438	return r;
				1439	}
				1440	(void) __map_bio(tio);
				1441
				1442	return 0;
				1443	}
				1444
				1445	typedef unsigned (get_num_bios_fn)(struct dm_target ti);
				1446
				1447	static unsigned get_num_discard_bios(struct dm_target *ti)
				1448	{
				1449	return ti->num_discard_bios;
				1450	}
				1451
				1452	static unsigned get_num_secure_erase_bios(struct dm_target *ti)
				1453	{
				1454	return ti->num_secure_erase_bios;
				1455	}
				1456
				1457	static unsigned get_num_write_same_bios(struct dm_target *ti)
				1458	{
				1459	return ti->num_write_same_bios;
				1460	}
				1461
				1462	static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
				1463	{
				1464	return ti->num_write_zeroes_bios;
				1465	}
				1466
				1467	typedef bool (is_split_required_fn)(struct dm_target ti);
				1468
				1469	static bool is_split_required_for_discard(struct dm_target *ti)
				1470	{
				1471	return ti->split_discard_bios;
				1472	}
				1473
				1474	static int __send_changing_extent_only(struct clone_info ci, struct dm_target ti,
				1475	get_num_bios_fn get_num_bios,
				1476	is_split_required_fn is_split_required)
				1477	{
				1478	unsigned len;
				1479	unsigned num_bios;
				1480
				1481	/*
				1482	* Even though the device advertised support for this type of
				1483	* request, that does not mean every target supports it, and
				1484	* reconfiguration might also have changed that since the
				1485	* check was performed.
				1486	*/
				1487	num_bios = get_num_bios ? get_num_bios(ti) : 0;
				1488	if (!num_bios)
				1489	return -EOPNOTSUPP;
				1490
				1491	if (is_split_required && !is_split_required(ti))
				1492	len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
				1493	else
				1494	len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
				1495
				1496	__send_duplicate_bios(ci, ti, num_bios, &len);
				1497
				1498	ci->sector += len;
				1499	ci->sector_count -= len;
				1500
				1501	return 0;
				1502	}
				1503
				1504	static int __send_discard(struct clone_info ci, struct dm_target ti)
				1505	{
				1506	return __send_changing_extent_only(ci, ti, get_num_discard_bios,
				1507	is_split_required_for_discard);
				1508	}
				1509
				1510	static int __send_secure_erase(struct clone_info ci, struct dm_target ti)
				1511	{
				1512	return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL);
				1513	}
				1514
				1515	static int __send_write_same(struct clone_info ci, struct dm_target ti)
				1516	{
				1517	return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL);
				1518	}
				1519
				1520	static int __send_write_zeroes(struct clone_info ci, struct dm_target ti)
				1521	{
				1522	return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL);
				1523	}
				1524
				1525	static bool __process_abnormal_io(struct clone_info ci, struct dm_target ti,
				1526	int *result)
				1527	{
				1528	struct bio *bio = ci->bio;
				1529
				1530	if (bio_op(bio) == REQ_OP_DISCARD)
				1531	*result = __send_discard(ci, ti);
				1532	else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
				1533	*result = __send_secure_erase(ci, ti);
				1534	else if (bio_op(bio) == REQ_OP_WRITE_SAME)
				1535	*result = __send_write_same(ci, ti);
				1536	else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
				1537	*result = __send_write_zeroes(ci, ti);
				1538	else
				1539	return false;
				1540
				1541	return true;
				1542	}
				1543
				1544	/*
				1545	* Select the correct strategy for processing a non-flush bio.
				1546	*/
				1547	static int __split_and_process_non_flush(struct clone_info *ci)
				1548	{
				1549	struct bio *bio = ci->bio;
				1550	struct dm_target *ti;
				1551	unsigned len;
				1552	int r;
				1553
				1554	ti = dm_table_find_target(ci->map, ci->sector);
				1555	if (!dm_target_is_valid(ti))
				1556	return -EIO;
				1557
				1558	if (unlikely(__process_abnormal_io(ci, ti, &r)))
				1559	return r;
				1560
				1561	if (bio_op(bio) == REQ_OP_ZONE_REPORT)
				1562	len = ci->sector_count;
				1563	else
				1564	len = min_t(sector_t, max_io_len(ci->sector, ti),
				1565	ci->sector_count);
				1566
				1567	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
				1568	if (r < 0)
				1569	return r;
				1570
				1571	ci->sector += len;
				1572	ci->sector_count -= len;
				1573
				1574	return 0;
				1575	}
				1576
				1577	static void init_clone_info(struct clone_info ci, struct mapped_device md,
				1578	struct dm_table map, struct bio bio)
				1579	{
				1580	ci->map = map;
				1581	ci->io = alloc_io(md, bio);
				1582	ci->sector = bio->bi_iter.bi_sector;
				1583	}
				1584
				1585	/*
				1586	* Entry point to split a bio into clones and submit them to the targets.
				1587	*/
				1588	static blk_qc_t __split_and_process_bio(struct mapped_device *md,
				1589	struct dm_table map, struct bio bio)
				1590	{
				1591	struct clone_info ci;
				1592	blk_qc_t ret = BLK_QC_T_NONE;
				1593	int error = 0;
				1594
				1595	if (unlikely(!map)) {
				1596	bio_io_error(bio);
				1597	return ret;
				1598	}
				1599
				1600	blk_queue_split(md->queue, &bio);
				1601
				1602	init_clone_info(&ci, md, map, bio);
				1603
				1604	if (bio->bi_opf & REQ_PREFLUSH) {
				1605	ci.bio = &ci.io->md->flush_bio;
				1606	ci.sector_count = 0;
				1607	error = __send_empty_flush(&ci);
				1608	/* dec_pending submits any data associated with flush */
				1609	} else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
				1610	ci.bio = bio;
				1611	ci.sector_count = 0;
				1612	error = __split_and_process_non_flush(&ci);
				1613	} else {
				1614	ci.bio = bio;
				1615	ci.sector_count = bio_sectors(bio);
				1616	while (ci.sector_count && !error) {
				1617	error = __split_and_process_non_flush(&ci);
				1618	if (current->bio_list && ci.sector_count && !error) {
				1619	/*
				1620	* Remainder must be passed to generic_make_request()
				1621	* so that it gets handled after bios already submitted
				1622	* have been completely processed.
				1623	* We take a clone of the original to store in
				1624	* ci.io->orig_bio to be used by end_io_acct() and
				1625	* for dec_pending to use for completion handling.
				1626	* As this path is not used for REQ_OP_ZONE_REPORT,
				1627	* the usage of io->orig_bio in dm_remap_zone_report()
				1628	* won't be affected by this reassignment.
				1629	*/
				1630	struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
				1631	GFP_NOIO, &md->queue->bio_split);
				1632	ci.io->orig_bio = b;
				1633	bio_chain(b, bio);
				1634	ret = generic_make_request(bio);
				1635	break;
				1636	}
				1637	}
				1638	}
				1639
				1640	/* drop the extra reference count */
				1641	dec_pending(ci.io, errno_to_blk_status(error));
				1642	return ret;
				1643	}
				1644
				1645	/*
				1646	* Optimized variant of __split_and_process_bio that leverages the
				1647	* fact that targets that use it do _not_ have a need to split bios.
				1648	*/
				1649	static blk_qc_t __process_bio(struct mapped_device *md,
				1650	struct dm_table map, struct bio bio)
				1651	{
				1652	struct clone_info ci;
				1653	blk_qc_t ret = BLK_QC_T_NONE;
				1654	int error = 0;
				1655
				1656	if (unlikely(!map)) {
				1657	bio_io_error(bio);
				1658	return ret;
				1659	}
				1660
				1661	init_clone_info(&ci, md, map, bio);
				1662
				1663	if (bio->bi_opf & REQ_PREFLUSH) {
				1664	ci.bio = &ci.io->md->flush_bio;
				1665	ci.sector_count = 0;
				1666	error = __send_empty_flush(&ci);
				1667	/* dec_pending submits any data associated with flush */
				1668	} else {
				1669	struct dm_target *ti = md->immutable_target;
				1670	struct dm_target_io *tio;
				1671
				1672	/*
				1673	* Defend against IO still getting in during teardown
				1674	* - as was seen for a time with nvme-fcloop
				1675	*/
				1676	if (unlikely(WARN_ON_ONCE(!ti \|\| !dm_target_is_valid(ti)))) {
				1677	error = -EIO;
				1678	goto out;
				1679	}
				1680
				1681	ci.bio = bio;
				1682	ci.sector_count = bio_sectors(bio);
				1683	if (unlikely(__process_abnormal_io(&ci, ti, &error)))
				1684	goto out;
				1685
				1686	tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
				1687	ret = __clone_and_map_simple_bio(&ci, tio, NULL);
				1688	}
				1689	out:
				1690	/* drop the extra reference count */
				1691	dec_pending(ci.io, errno_to_blk_status(error));
				1692	return ret;
				1693	}
				1694
				1695	typedef blk_qc_t (process_bio_fn)(struct mapped_device , struct dm_table , struct bio *);
				1696
				1697	static blk_qc_t __dm_make_request(struct request_queue q, struct bio bio,
				1698	process_bio_fn process_bio)
				1699	{
				1700	struct mapped_device *md = q->queuedata;
				1701	blk_qc_t ret = BLK_QC_T_NONE;
				1702	int srcu_idx;
				1703	struct dm_table *map;
				1704
				1705	map = dm_get_live_table(md, &srcu_idx);
				1706
				1707	/* if we're suspended, we have to queue this io for later */
				1708	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
				1709	dm_put_live_table(md, srcu_idx);
				1710
				1711	if (!(bio->bi_opf & REQ_RAHEAD))
				1712	queue_io(md, bio);
				1713	else
				1714	bio_io_error(bio);
				1715	return ret;
				1716	}
				1717
				1718	ret = process_bio(md, map, bio);
				1719
				1720	dm_put_live_table(md, srcu_idx);
				1721	return ret;
				1722	}
				1723
				1724	/*
				1725	* The request function that remaps the bio to one target and
				1726	* splits off any remainder.
				1727	*/
				1728	static blk_qc_t dm_make_request(struct request_queue q, struct bio bio)
				1729	{
				1730	return __dm_make_request(q, bio, __split_and_process_bio);
				1731	}
				1732
				1733	static blk_qc_t dm_make_request_nvme(struct request_queue q, struct bio bio)
				1734	{
				1735	return __dm_make_request(q, bio, __process_bio);
				1736	}
				1737
				1738	static int dm_any_congested(void *congested_data, int bdi_bits)
				1739	{
				1740	int r = bdi_bits;
				1741	struct mapped_device *md = congested_data;
				1742	struct dm_table *map;
				1743
				1744	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
				1745	if (dm_request_based(md)) {
				1746	/*
				1747	* With request-based DM we only need to check the
				1748	* top-level queue for congestion.
				1749	*/
				1750	r = md->queue->backing_dev_info->wb.state & bdi_bits;
				1751	} else {
				1752	map = dm_get_live_table_fast(md);
				1753	if (map)
				1754	r = dm_table_any_congested(map, bdi_bits);
				1755	dm_put_live_table_fast(md);
				1756	}
				1757	}
				1758
				1759	return r;
				1760	}
				1761
				1762	/*-----------------------------------------------------------------
				1763	* An IDR is used to keep track of allocated minor numbers.
				1764	*---------------------------------------------------------------*/
				1765	static void free_minor(int minor)
				1766	{
				1767	spin_lock(&_minor_lock);
				1768	idr_remove(&_minor_idr, minor);
				1769	spin_unlock(&_minor_lock);
				1770	}
				1771
				1772	/*
				1773	* See if the device with a specific minor # is free.
				1774	*/
				1775	static int specific_minor(int minor)
				1776	{
				1777	int r;
				1778
				1779	if (minor >= (1 << MINORBITS))
				1780	return -EINVAL;
				1781
				1782	idr_preload(GFP_KERNEL);
				1783	spin_lock(&_minor_lock);
				1784
				1785	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
				1786
				1787	spin_unlock(&_minor_lock);
				1788	idr_preload_end();
				1789	if (r < 0)
				1790	return r == -ENOSPC ? -EBUSY : r;
				1791	return 0;
				1792	}
				1793
				1794	static int next_free_minor(int *minor)
				1795	{
				1796	int r;
				1797
				1798	idr_preload(GFP_KERNEL);
				1799	spin_lock(&_minor_lock);
				1800
				1801	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
				1802
				1803	spin_unlock(&_minor_lock);
				1804	idr_preload_end();
				1805	if (r < 0)
				1806	return r;
				1807	*minor = r;
				1808	return 0;
				1809	}
				1810
				1811	static const struct block_device_operations dm_blk_dops;
				1812	static const struct dax_operations dm_dax_ops;
				1813
				1814	static void dm_wq_work(struct work_struct *work);
				1815
				1816	static void dm_init_normal_md_queue(struct mapped_device *md)
				1817	{
				1818	md->use_blk_mq = false;
				1819
				1820	/*
				1821	* Initialize aspects of queue that aren't relevant for blk-mq
				1822	*/
				1823	md->queue->backing_dev_info->congested_fn = dm_any_congested;
				1824	}
				1825
				1826	static void cleanup_mapped_device(struct mapped_device *md)
				1827	{
				1828	if (md->wq)
				1829	destroy_workqueue(md->wq);
				1830	if (md->kworker_task)
				1831	kthread_stop(md->kworker_task);
				1832	bioset_exit(&md->bs);
				1833	bioset_exit(&md->io_bs);
				1834
				1835	if (md->dax_dev) {
				1836	kill_dax(md->dax_dev);
				1837	put_dax(md->dax_dev);
				1838	md->dax_dev = NULL;
				1839	}
				1840
				1841	if (md->disk) {
				1842	spin_lock(&_minor_lock);
				1843	md->disk->private_data = NULL;
				1844	spin_unlock(&_minor_lock);
				1845	del_gendisk(md->disk);
				1846	put_disk(md->disk);
				1847	}
				1848
				1849	if (md->queue)
				1850	blk_cleanup_queue(md->queue);
				1851
				1852	cleanup_srcu_struct(&md->io_barrier);
				1853
				1854	if (md->bdev) {
				1855	bdput(md->bdev);
				1856	md->bdev = NULL;
				1857	}
				1858
				1859	mutex_destroy(&md->suspend_lock);
				1860	mutex_destroy(&md->type_lock);
				1861	mutex_destroy(&md->table_devices_lock);
				1862
				1863	dm_mq_cleanup_mapped_device(md);
				1864	}
				1865
				1866	/*
				1867	* Allocate and initialise a blank device with a given minor.
				1868	*/
				1869	static struct mapped_device *alloc_dev(int minor)
				1870	{
				1871	int r, numa_node_id = dm_get_numa_node();
				1872	struct dax_device *dax_dev = NULL;
				1873	struct mapped_device *md;
				1874	void *old_md;
				1875
				1876	md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
				1877	if (!md) {
				1878	DMWARN("unable to allocate device, out of memory.");
				1879	return NULL;
				1880	}
				1881
				1882	if (!try_module_get(THIS_MODULE))
				1883	goto bad_module_get;
				1884
				1885	/* get a minor number for the dev */
				1886	if (minor == DM_ANY_MINOR)
				1887	r = next_free_minor(&minor);
				1888	else
				1889	r = specific_minor(minor);
				1890	if (r < 0)
				1891	goto bad_minor;
				1892
				1893	r = init_srcu_struct(&md->io_barrier);
				1894	if (r < 0)
				1895	goto bad_io_barrier;
				1896
				1897	md->numa_node_id = numa_node_id;
				1898	md->use_blk_mq = dm_use_blk_mq_default();
				1899	md->init_tio_pdu = false;
				1900	md->type = DM_TYPE_NONE;
				1901	mutex_init(&md->suspend_lock);
				1902	mutex_init(&md->type_lock);
				1903	mutex_init(&md->table_devices_lock);
				1904	spin_lock_init(&md->deferred_lock);
				1905	atomic_set(&md->holders, 1);
				1906	atomic_set(&md->open_count, 0);
				1907	atomic_set(&md->event_nr, 0);
				1908	atomic_set(&md->uevent_seq, 0);
				1909	INIT_LIST_HEAD(&md->uevent_list);
				1910	INIT_LIST_HEAD(&md->table_devices);
				1911	spin_lock_init(&md->uevent_lock);
				1912
				1913	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
				1914	if (!md->queue)
				1915	goto bad;
				1916	md->queue->queuedata = md;
				1917	md->queue->backing_dev_info->congested_data = md;
				1918
				1919	md->disk = alloc_disk_node(1, md->numa_node_id);
				1920	if (!md->disk)
				1921	goto bad;
				1922
				1923	atomic_set(&md->pending[0], 0);
				1924	atomic_set(&md->pending[1], 0);
				1925	init_waitqueue_head(&md->wait);
				1926	INIT_WORK(&md->work, dm_wq_work);
				1927	init_waitqueue_head(&md->eventq);
				1928	init_completion(&md->kobj_holder.completion);
				1929	md->kworker_task = NULL;
				1930
				1931	md->disk->major = _major;
				1932	md->disk->first_minor = minor;
				1933	md->disk->fops = &dm_blk_dops;
				1934	md->disk->queue = md->queue;
				1935	md->disk->private_data = md;
				1936	sprintf(md->disk->disk_name, "dm-%d", minor);
				1937
				1938	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
				1939	dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
				1940	if (!dax_dev)
				1941	goto bad;
				1942	}
				1943	md->dax_dev = dax_dev;
				1944
				1945	add_disk_no_queue_reg(md->disk);
				1946	format_dev_t(md->name, MKDEV(_major, minor));
				1947
				1948	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
				1949	if (!md->wq)
				1950	goto bad;
				1951
				1952	md->bdev = bdget_disk(md->disk, 0);
				1953	if (!md->bdev)
				1954	goto bad;
				1955
				1956	bio_init(&md->flush_bio, NULL, 0);
				1957	bio_set_dev(&md->flush_bio, md->bdev);
				1958	md->flush_bio.bi_opf = REQ_OP_WRITE \| REQ_PREFLUSH \| REQ_SYNC;
				1959
				1960	dm_stats_init(&md->stats);
				1961
				1962	/* Populate the mapping, nobody knows we exist yet */
				1963	spin_lock(&_minor_lock);
				1964	old_md = idr_replace(&_minor_idr, md, minor);
				1965	spin_unlock(&_minor_lock);
				1966
				1967	BUG_ON(old_md != MINOR_ALLOCED);
				1968
				1969	return md;
				1970
				1971	bad:
				1972	cleanup_mapped_device(md);
				1973	bad_io_barrier:
				1974	free_minor(minor);
				1975	bad_minor:
				1976	module_put(THIS_MODULE);
				1977	bad_module_get:
				1978	kvfree(md);
				1979	return NULL;
				1980	}
				1981
				1982	static void unlock_fs(struct mapped_device *md);
				1983
				1984	static void free_dev(struct mapped_device *md)
				1985	{
				1986	int minor = MINOR(disk_devt(md->disk));
				1987
				1988	unlock_fs(md);
				1989
				1990	cleanup_mapped_device(md);
				1991
				1992	free_table_devices(&md->table_devices);
				1993	dm_stats_cleanup(&md->stats);
				1994	free_minor(minor);
				1995
				1996	module_put(THIS_MODULE);
				1997	kvfree(md);
				1998	}
				1999
				2000	static int __bind_mempools(struct mapped_device md, struct dm_table t)
				2001	{
				2002	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
				2003	int ret = 0;
				2004
				2005	if (dm_table_bio_based(t)) {
				2006	/*
				2007	* The md may already have mempools that need changing.
				2008	* If so, reload bioset because front_pad may have changed
				2009	* because a different table was loaded.
				2010	*/
				2011	bioset_exit(&md->bs);
				2012	bioset_exit(&md->io_bs);
				2013
				2014	} else if (bioset_initialized(&md->bs)) {
				2015	/*
				2016	* There's no need to reload with request-based dm
				2017	* because the size of front_pad doesn't change.
				2018	* Note for future: If you are to reload bioset,
				2019	* prep-ed requests in the queue may refer
				2020	* to bio from the old bioset, so you must walk
				2021	* through the queue to unprep.
				2022	*/
				2023	goto out;
				2024	}
				2025
				2026	BUG_ON(!p \|\|
				2027	bioset_initialized(&md->bs) \|\|
				2028	bioset_initialized(&md->io_bs));
				2029
				2030	ret = bioset_init_from_src(&md->bs, &p->bs);
				2031	if (ret)
				2032	goto out;
				2033	ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
				2034	if (ret)
				2035	bioset_exit(&md->bs);
				2036	out:
				2037	/* mempool bind completed, no longer need any mempools in the table */
				2038	dm_table_free_md_mempools(t);
				2039	return ret;
				2040	}
				2041
				2042	/*
				2043	* Bind a table to the device.
				2044	*/
				2045	static void event_callback(void *context)
				2046	{
				2047	unsigned long flags;
				2048	LIST_HEAD(uevents);
				2049	struct mapped_device md = (struct mapped_device ) context;
				2050
				2051	spin_lock_irqsave(&md->uevent_lock, flags);
				2052	list_splice_init(&md->uevent_list, &uevents);
				2053	spin_unlock_irqrestore(&md->uevent_lock, flags);
				2054
				2055	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
				2056
				2057	atomic_inc(&md->event_nr);
				2058	wake_up(&md->eventq);
				2059	dm_issue_global_event();
				2060	}
				2061
				2062	/*
				2063	* Protected by md->suspend_lock obtained by dm_swap_table().
				2064	*/
				2065	static void __set_size(struct mapped_device *md, sector_t size)
				2066	{
				2067	lockdep_assert_held(&md->suspend_lock);
				2068
				2069	set_capacity(md->disk, size);
				2070
				2071	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
				2072	}
				2073
				2074	/*
				2075	* Returns old map, which caller must destroy.
				2076	*/
				2077	static struct dm_table __bind(struct mapped_device md, struct dm_table *t,
				2078	struct queue_limits *limits)
				2079	{
				2080	struct dm_table *old_map;
				2081	struct request_queue *q = md->queue;
				2082	bool request_based = dm_table_request_based(t);
				2083	sector_t size;
				2084	int ret;
				2085
				2086	lockdep_assert_held(&md->suspend_lock);
				2087
				2088	size = dm_table_get_size(t);
				2089
				2090	/*
				2091	* Wipe any geometry if the size of the table changed.
				2092	*/
				2093	if (size != dm_get_size(md))
				2094	memset(&md->geometry, 0, sizeof(md->geometry));
				2095
				2096	__set_size(md, size);
				2097
				2098	dm_table_event_callback(t, event_callback, md);
				2099
				2100	/*
				2101	* The queue hasn't been stopped yet, if the old table type wasn't
				2102	* for request-based during suspension. So stop it to prevent
				2103	* I/O mapping before resume.
				2104	* This must be done before setting the queue restrictions,
				2105	* because request-based dm may be run just after the setting.
				2106	*/
				2107	if (request_based)
				2108	dm_stop_queue(q);
				2109
				2110	if (request_based \|\| md->type == DM_TYPE_NVME_BIO_BASED) {
				2111	/*
				2112	* Leverage the fact that request-based DM targets and
				2113	* NVMe bio based targets are immutable singletons
				2114	* - used to optimize both dm_request_fn and dm_mq_queue_rq;
				2115	* and __process_bio.
				2116	*/
				2117	md->immutable_target = dm_table_get_immutable_target(t);
				2118	}
				2119
				2120	ret = __bind_mempools(md, t);
				2121	if (ret) {
				2122	old_map = ERR_PTR(ret);
				2123	goto out;
				2124	}
				2125
				2126	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
				2127	rcu_assign_pointer(md->map, (void *)t);
				2128	md->immutable_target_type = dm_table_get_immutable_target_type(t);
				2129
				2130	dm_table_set_restrictions(t, q, limits);
				2131	if (old_map)
				2132	dm_sync_table(md);
				2133
				2134	out:
				2135	return old_map;
				2136	}
				2137
				2138	/*
				2139	* Returns unbound table for the caller to free.
				2140	*/
				2141	static struct dm_table __unbind(struct mapped_device md)
				2142	{
				2143	struct dm_table *map = rcu_dereference_protected(md->map, 1);
				2144
				2145	if (!map)
				2146	return NULL;
				2147
				2148	dm_table_event_callback(map, NULL, NULL);
				2149	RCU_INIT_POINTER(md->map, NULL);
				2150	dm_sync_table(md);
				2151
				2152	return map;
				2153	}
				2154
				2155	/*
				2156	* Constructor for a new device.
				2157	*/
				2158	int dm_create(int minor, struct mapped_device **result)
				2159	{
				2160	int r;
				2161	struct mapped_device *md;
				2162
				2163	md = alloc_dev(minor);
				2164	if (!md)
				2165	return -ENXIO;
				2166
				2167	r = dm_sysfs_init(md);
				2168	if (r) {
				2169	free_dev(md);
				2170	return r;
				2171	}
				2172
				2173	*result = md;
				2174	return 0;
				2175	}
				2176
				2177	/*
				2178	* Functions to manage md->type.
				2179	* All are required to hold md->type_lock.
				2180	*/
				2181	void dm_lock_md_type(struct mapped_device *md)
				2182	{
				2183	mutex_lock(&md->type_lock);
				2184	}
				2185
				2186	void dm_unlock_md_type(struct mapped_device *md)
				2187	{
				2188	mutex_unlock(&md->type_lock);
				2189	}
				2190
				2191	void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
				2192	{
				2193	BUG_ON(!mutex_is_locked(&md->type_lock));
				2194	md->type = type;
				2195	}
				2196
				2197	enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
				2198	{
				2199	return md->type;
				2200	}
				2201
				2202	struct target_type dm_get_immutable_target_type(struct mapped_device md)
				2203	{
				2204	return md->immutable_target_type;
				2205	}
				2206
				2207	/*
				2208	* The queue_limits are only valid as long as you have a reference
				2209	* count on 'md'.
				2210	*/
				2211	struct queue_limits dm_get_queue_limits(struct mapped_device md)
				2212	{
				2213	BUG_ON(!atomic_read(&md->holders));
				2214	return &md->queue->limits;
				2215	}
				2216	EXPORT_SYMBOL_GPL(dm_get_queue_limits);
				2217
				2218	/*
				2219	* Setup the DM device's queue based on md's type
				2220	*/
				2221	int dm_setup_md_queue(struct mapped_device md, struct dm_table t)
				2222	{
				2223	int r;
				2224	struct queue_limits limits;
				2225	enum dm_queue_mode type = dm_get_md_type(md);
				2226
				2227	switch (type) {
				2228	case DM_TYPE_REQUEST_BASED:
				2229	dm_init_normal_md_queue(md);
				2230	r = dm_old_init_request_queue(md, t);
				2231	if (r) {
				2232	DMERR("Cannot initialize queue for request-based mapped device");
				2233	return r;
				2234	}
				2235	break;
				2236	case DM_TYPE_MQ_REQUEST_BASED:
				2237	r = dm_mq_init_request_queue(md, t);
				2238	if (r) {
				2239	DMERR("Cannot initialize queue for request-based dm-mq mapped device");
				2240	return r;
				2241	}
				2242	break;
				2243	case DM_TYPE_BIO_BASED:
				2244	case DM_TYPE_DAX_BIO_BASED:
				2245	dm_init_normal_md_queue(md);
				2246	blk_queue_make_request(md->queue, dm_make_request);
				2247	break;
				2248	case DM_TYPE_NVME_BIO_BASED:
				2249	dm_init_normal_md_queue(md);
				2250	blk_queue_make_request(md->queue, dm_make_request_nvme);
				2251	break;
				2252	case DM_TYPE_NONE:
				2253	WARN_ON_ONCE(true);
				2254	break;
				2255	}
				2256
				2257	r = dm_calculate_queue_limits(t, &limits);
				2258	if (r) {
				2259	DMERR("Cannot calculate initial queue limits");
				2260	return r;
				2261	}
				2262	dm_table_set_restrictions(t, md->queue, &limits);
				2263	blk_register_queue(md->disk);
				2264
				2265	return 0;
				2266	}
				2267
				2268	struct mapped_device *dm_get_md(dev_t dev)
				2269	{
				2270	struct mapped_device *md;
				2271	unsigned minor = MINOR(dev);
				2272
				2273	if (MAJOR(dev) != _major \|\| minor >= (1 << MINORBITS))
				2274	return NULL;
				2275
				2276	spin_lock(&_minor_lock);
				2277
				2278	md = idr_find(&_minor_idr, minor);
				2279	if (!md \|\| md == MINOR_ALLOCED \|\| (MINOR(disk_devt(dm_disk(md))) != minor) \|\|
				2280	test_bit(DMF_FREEING, &md->flags) \|\| dm_deleting_md(md)) {
				2281	md = NULL;
				2282	goto out;
				2283	}
				2284	dm_get(md);
				2285	out:
				2286	spin_unlock(&_minor_lock);
				2287
				2288	return md;
				2289	}
				2290	EXPORT_SYMBOL_GPL(dm_get_md);
				2291
				2292	void dm_get_mdptr(struct mapped_device md)
				2293	{
				2294	return md->interface_ptr;
				2295	}
				2296
				2297	void dm_set_mdptr(struct mapped_device md, void ptr)
				2298	{
				2299	md->interface_ptr = ptr;
				2300	}
				2301
				2302	void dm_get(struct mapped_device *md)
				2303	{
				2304	atomic_inc(&md->holders);
				2305	BUG_ON(test_bit(DMF_FREEING, &md->flags));
				2306	}
				2307
				2308	int dm_hold(struct mapped_device *md)
				2309	{
				2310	spin_lock(&_minor_lock);
				2311	if (test_bit(DMF_FREEING, &md->flags)) {
				2312	spin_unlock(&_minor_lock);
				2313	return -EBUSY;
				2314	}
				2315	dm_get(md);
				2316	spin_unlock(&_minor_lock);
				2317	return 0;
				2318	}
				2319	EXPORT_SYMBOL_GPL(dm_hold);
				2320
				2321	const char dm_device_name(struct mapped_device md)
				2322	{
				2323	return md->name;
				2324	}
				2325	EXPORT_SYMBOL_GPL(dm_device_name);
				2326
				2327	static void __dm_destroy(struct mapped_device *md, bool wait)
				2328	{
				2329	struct dm_table *map;
				2330	int srcu_idx;
				2331
				2332	might_sleep();
				2333
				2334	spin_lock(&_minor_lock);
				2335	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
				2336	set_bit(DMF_FREEING, &md->flags);
				2337	spin_unlock(&_minor_lock);
				2338
				2339	blk_set_queue_dying(md->queue);
				2340
				2341	if (dm_request_based(md) && md->kworker_task)
				2342	kthread_flush_worker(&md->kworker);
				2343
				2344	/*
				2345	* Take suspend_lock so that presuspend and postsuspend methods
				2346	* do not race with internal suspend.
				2347	*/
				2348	mutex_lock(&md->suspend_lock);
				2349	map = dm_get_live_table(md, &srcu_idx);
				2350	if (!dm_suspended_md(md)) {
				2351	dm_table_presuspend_targets(map);
				2352	dm_table_postsuspend_targets(map);
				2353	}
				2354	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
				2355	dm_put_live_table(md, srcu_idx);
				2356	mutex_unlock(&md->suspend_lock);
				2357
				2358	/*
				2359	* Rare, but there may be I/O requests still going to complete,
				2360	* for example. Wait for all references to disappear.
				2361	* No one should increment the reference count of the mapped_device,
				2362	* after the mapped_device state becomes DMF_FREEING.
				2363	*/
				2364	if (wait)
				2365	while (atomic_read(&md->holders))
				2366	msleep(1);
				2367	else if (atomic_read(&md->holders))
				2368	DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
				2369	dm_device_name(md), atomic_read(&md->holders));
				2370
				2371	dm_sysfs_exit(md);
				2372	dm_table_destroy(__unbind(md));
				2373	free_dev(md);
				2374	}
				2375
				2376	void dm_destroy(struct mapped_device *md)
				2377	{
				2378	__dm_destroy(md, true);
				2379	}
				2380
				2381	void dm_destroy_immediate(struct mapped_device *md)
				2382	{
				2383	__dm_destroy(md, false);
				2384	}
				2385
				2386	void dm_put(struct mapped_device *md)
				2387	{
				2388	atomic_dec(&md->holders);
				2389	}
				2390	EXPORT_SYMBOL_GPL(dm_put);
				2391
				2392	static int dm_wait_for_completion(struct mapped_device *md, long task_state)
				2393	{
				2394	int r = 0;
				2395	DEFINE_WAIT(wait);
				2396
				2397	while (1) {
				2398	prepare_to_wait(&md->wait, &wait, task_state);
				2399
				2400	if (!md_in_flight(md))
				2401	break;
				2402
				2403	if (signal_pending_state(task_state, current)) {
				2404	r = -EINTR;
				2405	break;
				2406	}
				2407
				2408	io_schedule();
				2409	}
				2410	finish_wait(&md->wait, &wait);
				2411
				2412	return r;
				2413	}
				2414
				2415	/*
				2416	* Process the deferred bios
				2417	*/
				2418	static void dm_wq_work(struct work_struct *work)
				2419	{
				2420	struct mapped_device *md = container_of(work, struct mapped_device,
				2421	work);
				2422	struct bio *c;
				2423	int srcu_idx;
				2424	struct dm_table *map;
				2425
				2426	map = dm_get_live_table(md, &srcu_idx);
				2427
				2428	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
				2429	spin_lock_irq(&md->deferred_lock);
				2430	c = bio_list_pop(&md->deferred);
				2431	spin_unlock_irq(&md->deferred_lock);
				2432
				2433	if (!c)
				2434	break;
				2435
				2436	if (dm_request_based(md))
				2437	generic_make_request(c);
				2438	else
				2439	__split_and_process_bio(md, map, c);
				2440	}
				2441
				2442	dm_put_live_table(md, srcu_idx);
				2443	}
				2444
				2445	static void dm_queue_flush(struct mapped_device *md)
				2446	{
				2447	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
				2448	smp_mb__after_atomic();
				2449	queue_work(md->wq, &md->work);
				2450	}
				2451
				2452	/*
				2453	* Swap in a new table, returning the old one for the caller to destroy.
				2454	*/
				2455	struct dm_table dm_swap_table(struct mapped_device md, struct dm_table *table)
				2456	{
				2457	struct dm_table live_map = NULL, map = ERR_PTR(-EINVAL);
				2458	struct queue_limits limits;
				2459	int r;
				2460
				2461	mutex_lock(&md->suspend_lock);
				2462
				2463	/* device must be suspended */
				2464	if (!dm_suspended_md(md))
				2465	goto out;
				2466
				2467	/*
				2468	* If the new table has no data devices, retain the existing limits.
				2469	* This helps multipath with queue_if_no_path if all paths disappear,
				2470	* then new I/O is queued based on these limits, and then some paths
				2471	* reappear.
				2472	*/
				2473	if (dm_table_has_no_data_devices(table)) {
				2474	live_map = dm_get_live_table_fast(md);
				2475	if (live_map)
				2476	limits = md->queue->limits;
				2477	dm_put_live_table_fast(md);
				2478	}
				2479
				2480	if (!live_map) {
				2481	r = dm_calculate_queue_limits(table, &limits);
				2482	if (r) {
				2483	map = ERR_PTR(r);
				2484	goto out;
				2485	}
				2486	}
				2487
				2488	map = __bind(md, table, &limits);
				2489	dm_issue_global_event();
				2490
				2491	out:
				2492	mutex_unlock(&md->suspend_lock);
				2493	return map;
				2494	}
				2495
				2496	/*
				2497	* Functions to lock and unlock any filesystem running on the
				2498	* device.
				2499	*/
				2500	static int lock_fs(struct mapped_device *md)
				2501	{
				2502	int r;
				2503
				2504	WARN_ON(md->frozen_sb);
				2505
				2506	md->frozen_sb = freeze_bdev(md->bdev);
				2507	if (IS_ERR(md->frozen_sb)) {
				2508	r = PTR_ERR(md->frozen_sb);
				2509	md->frozen_sb = NULL;
				2510	return r;
				2511	}
				2512
				2513	set_bit(DMF_FROZEN, &md->flags);
				2514
				2515	return 0;
				2516	}
				2517
				2518	static void unlock_fs(struct mapped_device *md)
				2519	{
				2520	if (!test_bit(DMF_FROZEN, &md->flags))
				2521	return;
				2522
				2523	thaw_bdev(md->bdev, md->frozen_sb);
				2524	md->frozen_sb = NULL;
				2525	clear_bit(DMF_FROZEN, &md->flags);
				2526	}
				2527
				2528	/*
				2529	* @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
				2530	* @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
				2531	* @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
				2532	*
				2533	* If __dm_suspend returns 0, the device is completely quiescent
				2534	* now. There is no request-processing activity. All new requests
				2535	* are being added to md->deferred list.
				2536	*/
				2537	static int __dm_suspend(struct mapped_device md, struct dm_table map,
				2538	unsigned suspend_flags, long task_state,
				2539	int dmf_suspended_flag)
				2540	{
				2541	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
				2542	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
				2543	int r;
				2544
				2545	lockdep_assert_held(&md->suspend_lock);
				2546
				2547	/*
				2548	* DMF_NOFLUSH_SUSPENDING must be set before presuspend.
				2549	* This flag is cleared before dm_suspend returns.
				2550	*/
				2551	if (noflush)
				2552	set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
				2553	else
				2554	pr_debug("%s: suspending with flush\n", dm_device_name(md));
				2555
				2556	/*
				2557	* This gets reverted if there's an error later and the targets
				2558	* provide the .presuspend_undo hook.
				2559	*/
				2560	dm_table_presuspend_targets(map);
				2561
				2562	/*
				2563	* Flush I/O to the device.
				2564	* Any I/O submitted after lock_fs() may not be flushed.
				2565	* noflush takes precedence over do_lockfs.
				2566	* (lock_fs() flushes I/Os and waits for them to complete.)
				2567	*/
				2568	if (!noflush && do_lockfs) {
				2569	r = lock_fs(md);
				2570	if (r) {
				2571	dm_table_presuspend_undo_targets(map);
				2572	return r;
				2573	}
				2574	}
				2575
				2576	/*
				2577	* Here we must make sure that no processes are submitting requests
				2578	* to target drivers i.e. no one may be executing
				2579	* __split_and_process_bio. This is called from dm_request and
				2580	* dm_wq_work.
				2581	*
				2582	* To get all processes out of __split_and_process_bio in dm_request,
				2583	* we take the write lock. To prevent any process from reentering
				2584	* __split_and_process_bio from dm_request and quiesce the thread
				2585	* (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
				2586	* flush_workqueue(md->wq).
				2587	*/
				2588	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
				2589	if (map)
				2590	synchronize_srcu(&md->io_barrier);
				2591
				2592	/*
				2593	* Stop md->queue before flushing md->wq in case request-based
				2594	* dm defers requests to md->wq from md->queue.
				2595	*/
				2596	if (dm_request_based(md)) {
				2597	dm_stop_queue(md->queue);
				2598	if (md->kworker_task)
				2599	kthread_flush_worker(&md->kworker);
				2600	}
				2601
				2602	flush_workqueue(md->wq);
				2603
				2604	/*
				2605	* At this point no more requests are entering target request routines.
				2606	* We call dm_wait_for_completion to wait for all existing requests
				2607	* to finish.
				2608	*/
				2609	r = dm_wait_for_completion(md, task_state);
				2610	if (!r)
				2611	set_bit(dmf_suspended_flag, &md->flags);
				2612
				2613	if (noflush)
				2614	clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
				2615	if (map)
				2616	synchronize_srcu(&md->io_barrier);
				2617
				2618	/* were we interrupted ? */
				2619	if (r < 0) {
				2620	dm_queue_flush(md);
				2621
				2622	if (dm_request_based(md))
				2623	dm_start_queue(md->queue);
				2624
				2625	unlock_fs(md);
				2626	dm_table_presuspend_undo_targets(map);
				2627	/* pushback list is already flushed, so skip flush */
				2628	}
				2629
				2630	return r;
				2631	}
				2632
				/*
				 * We need to be able to change a mapping table under a mounted
				 * filesystem.  For example we might want to move some data in
				 * the background.  Before the table can be swapped with
				 * dm_swap_table, dm_suspend must be called to flush any in-flight
				 * bios and ensure that any further I/O gets deferred.
				 */
				2640	/*
				2641	* Suspend mechanism in request-based dm.
				2642	*
				2643	* 1. Flush all I/Os by lock_fs() if needed.
				2644	* 2. Stop dispatching any I/O by stopping the request_queue.
				2645	* 3. Wait for all in-flight I/Os to be completed or requeued.
				2646	*
				2647	* To abort suspend, start the request_queue.
				2648	*/
				2649	int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
				2650	{
				2651	struct dm_table *map = NULL;
				2652	int r = 0;
				2653
				2654	retry:
				2655	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
				2656
				2657	if (dm_suspended_md(md)) {
				2658	r = -EINVAL;
				2659	goto out_unlock;
				2660	}
				2661
				2662	if (dm_suspended_internally_md(md)) {
				2663	/* already internally suspended, wait for internal resume */
				2664	mutex_unlock(&md->suspend_lock);
				2665	r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
				2666	if (r)
				2667	return r;
				2668	goto retry;
				2669	}
				2670
				2671	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
				2672
				2673	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
				2674	if (r)
				2675	goto out_unlock;
				2676
				2677	dm_table_postsuspend_targets(map);
				2678
				2679	out_unlock:
				2680	mutex_unlock(&md->suspend_lock);
				2681	return r;
				2682	}
				2683
				2684	static int __dm_resume(struct mapped_device md, struct dm_table map)
				2685	{
				2686	if (map) {
				2687	int r = dm_table_resume_targets(map);
				2688	if (r)
				2689	return r;
				2690	}
				2691
				2692	dm_queue_flush(md);
				2693
				2694	/*
				2695	* Flushing deferred I/Os must be done after targets are resumed
				2696	* so that mapping of targets can work correctly.
				2697	* Request-based dm is queueing the deferred I/Os in its request_queue.
				2698	*/
				2699	if (dm_request_based(md))
				2700	dm_start_queue(md->queue);
				2701
				2702	unlock_fs(md);
				2703
				2704	return 0;
				2705	}
				2706
				2707	int dm_resume(struct mapped_device *md)
				2708	{
				2709	int r;
				2710	struct dm_table *map = NULL;
				2711
				2712	retry:
				2713	r = -EINVAL;
				2714	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
				2715
				2716	if (!dm_suspended_md(md))
				2717	goto out;
				2718
				2719	if (dm_suspended_internally_md(md)) {
				2720	/* already internally suspended, wait for internal resume */
				2721	mutex_unlock(&md->suspend_lock);
				2722	r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
				2723	if (r)
				2724	return r;
				2725	goto retry;
				2726	}
				2727
				2728	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
				2729	if (!map \|\| !dm_table_get_size(map))
				2730	goto out;
				2731
				2732	r = __dm_resume(md, map);
				2733	if (r)
				2734	goto out;
				2735
				2736	clear_bit(DMF_SUSPENDED, &md->flags);
				2737	out:
				2738	mutex_unlock(&md->suspend_lock);
				2739
				2740	return r;
				2741	}
				2742
				2743	/*
				2744	* Internal suspend/resume works like userspace-driven suspend. It waits
				2745	* until all bios finish and prevents issuing new bios to the target drivers.
				2746	* It may be used only from the kernel.
				2747	*/
				2748
				2749	static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
				2750	{
				2751	struct dm_table *map = NULL;
				2752
				2753	lockdep_assert_held(&md->suspend_lock);
				2754
				2755	if (md->internal_suspend_count++)
				2756	return; /* nested internal suspend */
				2757
				2758	if (dm_suspended_md(md)) {
				2759	set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
				2760	return; /* nest suspend */
				2761	}
				2762
				2763	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
				2764
				2765	/*
				2766	* Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
				2767	* supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
				2768	* would require changing .presuspend to return an error -- avoid this
				2769	* until there is a need for more elaborate variants of internal suspend.
				2770	*/
				2771	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
				2772	DMF_SUSPENDED_INTERNALLY);
				2773
				2774	dm_table_postsuspend_targets(map);
				2775	}
				2776
				2777	static void __dm_internal_resume(struct mapped_device *md)
				2778	{
				2779	BUG_ON(!md->internal_suspend_count);
				2780
				2781	if (--md->internal_suspend_count)
				2782	return; /* resume from nested internal suspend */
				2783
				2784	if (dm_suspended_md(md))
				2785	goto done; /* resume from nested suspend */
				2786
				2787	/*
				2788	* NOTE: existing callers don't need to call dm_table_resume_targets
				2789	* (which may fail -- so best to avoid it for now by passing NULL map)
				2790	*/
				2791	(void) __dm_resume(md, NULL);
				2792
				2793	done:
				2794	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
				2795	smp_mb__after_atomic();
				2796	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
				2797	}
				2798
				2799	void dm_internal_suspend_noflush(struct mapped_device *md)
				2800	{
				2801	mutex_lock(&md->suspend_lock);
				2802	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
				2803	mutex_unlock(&md->suspend_lock);
				2804	}
				2805	EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
				2806
				2807	void dm_internal_resume(struct mapped_device *md)
				2808	{
				2809	mutex_lock(&md->suspend_lock);
				2810	__dm_internal_resume(md);
				2811	mutex_unlock(&md->suspend_lock);
				2812	}
				2813	EXPORT_SYMBOL_GPL(dm_internal_resume);
				2814
				2815	/*
				2816	* Fast variants of internal suspend/resume hold md->suspend_lock,
				2817	* which prevents interaction with userspace-driven suspend.
				2818	*/
				2819
				2820	void dm_internal_suspend_fast(struct mapped_device *md)
				2821	{
				2822	mutex_lock(&md->suspend_lock);
				2823	if (dm_suspended_md(md) \|\| dm_suspended_internally_md(md))
				2824	return;
				2825
				2826	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
				2827	synchronize_srcu(&md->io_barrier);
				2828	flush_workqueue(md->wq);
				2829	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
				2830	}
				2831	EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
				2832
				2833	void dm_internal_resume_fast(struct mapped_device *md)
				2834	{
				2835	if (dm_suspended_md(md) \|\| dm_suspended_internally_md(md))
				2836	goto done;
				2837
				2838	dm_queue_flush(md);
				2839
				2840	done:
				2841	mutex_unlock(&md->suspend_lock);
				2842	}
				2843	EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
				2844
				/*-----------------------------------------------------------------
				 * Event notification.
				 *---------------------------------------------------------------*/
				2848	int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
				2849	unsigned cookie)
				2850	{
				2851	char udev_cookie[DM_COOKIE_LENGTH];
				2852	char *envp[] = { udev_cookie, NULL };
				2853
				2854	if (!cookie)
				2855	return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
				2856	else {
				2857	snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
				2858	DM_COOKIE_ENV_VAR_NAME, cookie);
				2859	return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
				2860	action, envp);
				2861	}
				2862	}
				2863
				2864	uint32_t dm_next_uevent_seq(struct mapped_device *md)
				2865	{
				2866	return atomic_add_return(1, &md->uevent_seq);
				2867	}
				2868
				2869	uint32_t dm_get_event_nr(struct mapped_device *md)
				2870	{
				2871	return atomic_read(&md->event_nr);
				2872	}
				2873
				2874	int dm_wait_event(struct mapped_device *md, int event_nr)
				2875	{
				2876	return wait_event_interruptible(md->eventq,
				2877	(event_nr != atomic_read(&md->event_nr)));
				2878	}
				2879
				2880	void dm_uevent_add(struct mapped_device md, struct list_head elist)
				2881	{
				2882	unsigned long flags;
				2883
				2884	spin_lock_irqsave(&md->uevent_lock, flags);
				2885	list_add(elist, &md->uevent_list);
				2886	spin_unlock_irqrestore(&md->uevent_lock, flags);
				2887	}
				2888
				2889	/*
				2890	* The gendisk is only valid as long as you have a reference
				2891	* count on 'md'.
				2892	*/
				2893	struct gendisk dm_disk(struct mapped_device md)
				2894	{
				2895	return md->disk;
				2896	}
				2897	EXPORT_SYMBOL_GPL(dm_disk);
				2898
				2899	struct kobject dm_kobject(struct mapped_device md)
				2900	{
				2901	return &md->kobj_holder.kobj;
				2902	}
				2903
				2904	struct mapped_device dm_get_from_kobject(struct kobject kobj)
				2905	{
				2906	struct mapped_device *md;
				2907
				2908	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
				2909
				2910	spin_lock(&_minor_lock);
				2911	if (test_bit(DMF_FREEING, &md->flags) \|\| dm_deleting_md(md)) {
				2912	md = NULL;
				2913	goto out;
				2914	}
				2915	dm_get(md);
				2916	out:
				2917	spin_unlock(&_minor_lock);
				2918
				2919	return md;
				2920	}
				2921
				2922	int dm_suspended_md(struct mapped_device *md)
				2923	{
				2924	return test_bit(DMF_SUSPENDED, &md->flags);
				2925	}
				2926
				2927	int dm_suspended_internally_md(struct mapped_device *md)
				2928	{
				2929	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
				2930	}
				2931
				2932	int dm_test_deferred_remove_flag(struct mapped_device *md)
				2933	{
				2934	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
				2935	}
				2936
				2937	int dm_suspended(struct dm_target *ti)
				2938	{
				2939	return dm_suspended_md(dm_table_get_md(ti->table));
				2940	}
				2941	EXPORT_SYMBOL_GPL(dm_suspended);
				2942
				2943	int dm_noflush_suspending(struct dm_target *ti)
				2944	{
				2945	return __noflush_suspending(dm_table_get_md(ti->table));
				2946	}
				2947	EXPORT_SYMBOL_GPL(dm_noflush_suspending);
				2948
				2949	struct dm_md_mempools dm_alloc_md_mempools(struct mapped_device md, enum dm_queue_mode type,
				2950	unsigned integrity, unsigned per_io_data_size,
				2951	unsigned min_pool_size)
				2952	{
				2953	struct dm_md_mempools pools = kzalloc_node(sizeof(pools), GFP_KERNEL, md->numa_node_id);
				2954	unsigned int pool_size = 0;
				2955	unsigned int front_pad, io_front_pad;
				2956	int ret;
				2957
				2958	if (!pools)
				2959	return NULL;
				2960
				2961	switch (type) {
				2962	case DM_TYPE_BIO_BASED:
				2963	case DM_TYPE_DAX_BIO_BASED:
				2964	case DM_TYPE_NVME_BIO_BASED:
				2965	pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
				2966	front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
				2967	io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
				2968	ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
				2969	if (ret)
				2970	goto out;
				2971	if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
				2972	goto out;
				2973	break;
				2974	case DM_TYPE_REQUEST_BASED:
				2975	case DM_TYPE_MQ_REQUEST_BASED:
				2976	pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
				2977	front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
				2978	/* per_io_data_size is used for blk-mq pdu at queue allocation */
				2979	break;
				2980	default:
				2981	BUG();
				2982	}
				2983
				2984	ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
				2985	if (ret)
				2986	goto out;
				2987
				2988	if (integrity && bioset_integrity_create(&pools->bs, pool_size))
				2989	goto out;
				2990
				2991	return pools;
				2992
				2993	out:
				2994	dm_free_md_mempools(pools);
				2995
				2996	return NULL;
				2997	}
				2998
				2999	void dm_free_md_mempools(struct dm_md_mempools *pools)
				3000	{
				3001	if (!pools)
				3002	return;
				3003
				3004	bioset_exit(&pools->bs);
				3005	bioset_exit(&pools->io_bs);
				3006
				3007	kfree(pools);
				3008	}
				3009
				3010	struct dm_pr {
				3011	u64 old_key;
				3012	u64 new_key;
				3013	u32 flags;
				3014	bool fail_early;
				3015	};
				3016
				3017	static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
				3018	void *data)
				3019	{
				3020	struct mapped_device *md = bdev->bd_disk->private_data;
				3021	struct dm_table *table;
				3022	struct dm_target *ti;
				3023	int ret = -ENOTTY, srcu_idx;
				3024
				3025	table = dm_get_live_table(md, &srcu_idx);
				3026	if (!table \|\| !dm_table_get_size(table))
				3027	goto out;
				3028
				3029	/* We only support devices that have a single target */
				3030	if (dm_table_get_num_targets(table) != 1)
				3031	goto out;
				3032	ti = dm_table_get_target(table, 0);
				3033
				3034	ret = -EINVAL;
				3035	if (!ti->type->iterate_devices)
				3036	goto out;
				3037
				3038	ret = ti->type->iterate_devices(ti, fn, data);
				3039	out:
				3040	dm_put_live_table(md, srcu_idx);
				3041	return ret;
				3042	}
				3043
				3044	/*
				3045	* For register / unregister we need to manually call out to every path.
				3046	*/
				3047	static int __dm_pr_register(struct dm_target ti, struct dm_dev dev,
				3048	sector_t start, sector_t len, void *data)
				3049	{
				3050	struct dm_pr *pr = data;
				3051	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
				3052
				3053	if (!ops \|\| !ops->pr_register)
				3054	return -EOPNOTSUPP;
				3055	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
				3056	}
				3057
				3058	static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
				3059	u32 flags)
				3060	{
				3061	struct dm_pr pr = {
				3062	.old_key = old_key,
				3063	.new_key = new_key,
				3064	.flags = flags,
				3065	.fail_early = true,
				3066	};
				3067	int ret;
				3068
				3069	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
				3070	if (ret && new_key) {
				3071	/* unregister all paths if we failed to register any path */
				3072	pr.old_key = new_key;
				3073	pr.new_key = 0;
				3074	pr.flags = 0;
				3075	pr.fail_early = false;
				3076	dm_call_pr(bdev, __dm_pr_register, &pr);
				3077	}
				3078
				3079	return ret;
				3080	}
				3081
				3082	static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
				3083	u32 flags)
				3084	{
				3085	struct mapped_device *md = bdev->bd_disk->private_data;
				3086	const struct pr_ops *ops;
				3087	int r, srcu_idx;
				3088
				3089	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
				3090	if (r < 0)
				3091	goto out;
				3092
				3093	ops = bdev->bd_disk->fops->pr_ops;
				3094	if (ops && ops->pr_reserve)
				3095	r = ops->pr_reserve(bdev, key, type, flags);
				3096	else
				3097	r = -EOPNOTSUPP;
				3098	out:
				3099	dm_unprepare_ioctl(md, srcu_idx);
				3100	return r;
				3101	}
				3102
				3103	static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
				3104	{
				3105	struct mapped_device *md = bdev->bd_disk->private_data;
				3106	const struct pr_ops *ops;
				3107	int r, srcu_idx;
				3108
				3109	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
				3110	if (r < 0)
				3111	goto out;
				3112
				3113	ops = bdev->bd_disk->fops->pr_ops;
				3114	if (ops && ops->pr_release)
				3115	r = ops->pr_release(bdev, key, type);
				3116	else
				3117	r = -EOPNOTSUPP;
				3118	out:
				3119	dm_unprepare_ioctl(md, srcu_idx);
				3120	return r;
				3121	}
				3122
				3123	static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
				3124	enum pr_type type, bool abort)
				3125	{
				3126	struct mapped_device *md = bdev->bd_disk->private_data;
				3127	const struct pr_ops *ops;
				3128	int r, srcu_idx;
				3129
				3130	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
				3131	if (r < 0)
				3132	goto out;
				3133
				3134	ops = bdev->bd_disk->fops->pr_ops;
				3135	if (ops && ops->pr_preempt)
				3136	r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
				3137	else
				3138	r = -EOPNOTSUPP;
				3139	out:
				3140	dm_unprepare_ioctl(md, srcu_idx);
				3141	return r;
				3142	}
				3143
				3144	static int dm_pr_clear(struct block_device *bdev, u64 key)
				3145	{
				3146	struct mapped_device *md = bdev->bd_disk->private_data;
				3147	const struct pr_ops *ops;
				3148	int r, srcu_idx;
				3149
				3150	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
				3151	if (r < 0)
				3152	goto out;
				3153
				3154	ops = bdev->bd_disk->fops->pr_ops;
				3155	if (ops && ops->pr_clear)
				3156	r = ops->pr_clear(bdev, key);
				3157	else
				3158	r = -EOPNOTSUPP;
				3159	out:
				3160	dm_unprepare_ioctl(md, srcu_idx);
				3161	return r;
				3162	}
				3163
				3164	static const struct pr_ops dm_pr_ops = {
				3165	.pr_register = dm_pr_register,
				3166	.pr_reserve = dm_pr_reserve,
				3167	.pr_release = dm_pr_release,
				3168	.pr_preempt = dm_pr_preempt,
				3169	.pr_clear = dm_pr_clear,
				3170	};
				3171
				3172	static const struct block_device_operations dm_blk_dops = {
				3173	.open = dm_blk_open,
				3174	.release = dm_blk_close,
				3175	.ioctl = dm_blk_ioctl,
				3176	.getgeo = dm_blk_getgeo,
				3177	.pr_ops = &dm_pr_ops,
				3178	.owner = THIS_MODULE
				3179	};
				3180
				3181	static const struct dax_operations dm_dax_ops = {
				3182	.direct_access = dm_dax_direct_access,
				3183	.copy_from_iter = dm_dax_copy_from_iter,
				3184	.copy_to_iter = dm_dax_copy_to_iter,
				3185	};
				3186
				3187	/*
				3188	* module hooks
				3189	*/
				3190	module_init(dm_init);
				3191	module_exit(dm_exit);
				3192
				3193	module_param(major, uint, 0);
				3194	MODULE_PARM_DESC(major, "The major number of the device mapper");
				3195
				3196	module_param(reserved_bio_based_ios, uint, S_IRUGO \| S_IWUSR);
				3197	MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
				3198
				3199	module_param(dm_numa_node, int, S_IRUGO \| S_IWUSR);
				3200	MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
				3201
				3202	MODULE_DESCRIPTION(DM_NAME " driver");
				3203	MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
				3204	MODULE_LICENSE("GPL");