// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
 */

#include <linux/mm.h>
#include <linux/bio.h>
#include <linux/err.h>
#include <linux/hash.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/dm-io.h>
#include <linux/mutex.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/mempool.h>
#include <linux/spinlock.h>
#include <linux/blk_types.h>
#include <linux/dm-kcopyd.h>
#include <linux/workqueue.h>
#include <linux/backing-dev.h>
#include <linux/device-mapper.h>

#include "dm.h"
#include "dm-clone-metadata.h"

#define DM_MSG_PREFIX "clone"

/*
 * Minimum and maximum allowed region sizes
 */
#define MIN_REGION_SIZE (1 << 3)  /* 4KB */
#define MAX_REGION_SIZE (1 << 21) /* 1GB */

#define MIN_HYDRATIONS 256 /* Size of hydration mempool */
#define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */
#define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */

#define COMMIT_PERIOD HZ /* 1 sec */

/*
 * Hydration hash table size: 1 << HASH_TABLE_BITS
 */
#define HASH_TABLE_BITS 15

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle,
	"A percentage of time allocated for hydrating regions");

/* Slab cache for struct dm_clone_region_hydration */
static struct kmem_cache *_hydration_cache;

/* dm-clone metadata modes */
enum clone_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL,		/* all metadata I/O fails */
};

struct hash_table_bucket;

struct clone {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	struct dm_dev *metadata_dev;
	struct dm_dev *dest_dev;
	struct dm_dev *source_dev;

	unsigned long nr_regions;
	sector_t region_size;
	unsigned int region_shift;

	/*
	 * A metadata commit and the actions taken in case it fails should run
	 * as a single atomic step.
	 */
	struct mutex commit_lock;

	struct dm_clone_metadata *cmd;

	/*
	 * bio used to flush the destination device, before committing the
	 * metadata.
	 */
	struct bio flush_bio;

	/* Region hydration hash table */
	struct hash_table_bucket *ht;

	atomic_t ios_in_flight;

	wait_queue_head_t hydration_stopped;

	mempool_t hydration_pool;

	unsigned long last_commit_jiffies;

	/*
	 * We defer incoming WRITE bios for regions that are not hydrated,
	 * until after these regions have been hydrated.
	 *
	 * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the
	 * metadata have been committed.
	 */
	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_discard_bios;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_flush_completions;

	/* Maximum number of regions being copied during background hydration. */
	unsigned int hydration_threshold;

	/* Number of regions to batch together during background hydration. */
	unsigned int hydration_batch_size;

	/* Which region to hydrate next */
	unsigned long hydration_offset;

	atomic_t hydrations_in_flight;

	/*
	 * Save a copy of the table line rather than reconstructing it for the
	 * status.
	 */
	unsigned int nr_ctr_args;
	const char **ctr_args;

	struct workqueue_struct *wq;
	struct work_struct worker;
	struct delayed_work waker;

	struct dm_kcopyd_client *kcopyd_client;

	enum clone_metadata_mode mode;
	unsigned long flags;
};

/*
 * dm-clone flags
 */
#define DM_CLONE_DISCARD_PASSDOWN 0
#define DM_CLONE_HYDRATION_ENABLED 1
#define DM_CLONE_HYDRATION_SUSPENDED 2

/*---------------------------------------------------------------------------*/

/*
 * Metadata failure handling.
 */
static enum clone_metadata_mode get_clone_mode(struct clone *clone)
{
	return READ_ONCE(clone->mode);
}

static const char *clone_device_name(struct clone *clone)
{
	return dm_table_device_name(clone->ti->table);
}

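/*
 * Set the target's metadata mode. Once the mode has degraded to CM_FAIL it
 * never moves back to a writable mode; a table reload is required to recover.
 */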
static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode)
{
	const char *descs[] = {
		"read-write",
		"read-only",
		"fail"
	};

	enum clone_metadata_mode old_mode = get_clone_mode(clone);

	/* Never move out of fail mode */
	if (old_mode == CM_FAIL)
		new_mode = CM_FAIL;

	switch (new_mode) {
	case CM_FAIL:
	case CM_READ_ONLY:
		dm_clone_metadata_set_read_only(clone->cmd);
		break;

	case CM_WRITE:
		dm_clone_metadata_set_read_write(clone->cmd);
		break;
	}

	WRITE_ONCE(clone->mode, new_mode);

	if (new_mode != old_mode) {
		dm_table_event(clone->ti->table);
		DMINFO("%s: Switching to %s mode", clone_device_name(clone),
		       descs[(int)new_mode]);
	}
}

static void __abort_transaction(struct clone *clone)
{
	const char *dev_name = clone_device_name(clone);

	if (get_clone_mode(clone) >= CM_READ_ONLY)
		return;

	DMERR("%s: Aborting current metadata transaction", dev_name);
	if (dm_clone_metadata_abort(clone->cmd)) {
		DMERR("%s: Failed to abort metadata transaction", dev_name);
		__set_clone_mode(clone, CM_FAIL);
	}
}

static void __reload_in_core_bitset(struct clone *clone)
{
	const char *dev_name = clone_device_name(clone);

	if (get_clone_mode(clone) == CM_FAIL)
		return;

	/* Reload the on-disk bitset */
	DMINFO("%s: Reloading on-disk bitmap", dev_name);
	if (dm_clone_reload_in_core_bitset(clone->cmd)) {
		DMERR("%s: Failed to reload on-disk bitmap", dev_name);
		__set_clone_mode(clone, CM_FAIL);
	}
}

static void __metadata_operation_failed(struct clone *clone, const char *op, int r)
{
	DMERR("%s: Metadata operation `%s' failed: error = %d",
	      clone_device_name(clone), op, r);

	__abort_transaction(clone);
	__set_clone_mode(clone, CM_READ_ONLY);

	/*
	 * dm_clone_reload_in_core_bitset() may run concurrently with either
	 * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but
	 * it's safe as we have already set the metadata to read-only mode.
	 */
	__reload_in_core_bitset(clone);
}

/*---------------------------------------------------------------------------*/

/* Wake up anyone waiting for region hydrations to stop */
static inline void wakeup_hydration_waiters(struct clone *clone)
{
	wake_up_all(&clone->hydration_stopped);
}

static inline void wake_worker(struct clone *clone)
{
	queue_work(clone->wq, &clone->worker);
}

/*---------------------------------------------------------------------------*/

/*
 * bio helper functions.
 */
static inline void remap_to_source(struct clone *clone, struct bio *bio)
{
	bio_set_dev(bio, clone->source_dev->bdev);
}

static inline void remap_to_dest(struct clone *clone, struct bio *bio)
{
	bio_set_dev(bio, clone->dest_dev->bdev);
}

static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
{
	return op_is_flush(bio->bi_opf) &&
	       dm_clone_changed_this_transaction(clone->cmd);
}

/* Get the address of the region in sectors */
static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
{
	return ((sector_t)region_nr << clone->region_shift);
}

/* Get the region number of the bio */
static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
{
	return (bio->bi_iter.bi_sector >> clone->region_shift);
}

/* Get the region range covered by the bio */
static void bio_region_range(struct clone *clone, struct bio *bio,
			     unsigned long *rs, unsigned long *nr_regions)
{
	unsigned long end;

	*rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
	end = bio_end_sector(bio) >> clone->region_shift;

	if (*rs >= end)
		*nr_regions = 0;
	else
		*nr_regions = end - *rs;
}

/* Check whether a bio overwrites a region */
static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size);
}

static void fail_bios(struct bio_list *bios, blk_status_t status)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bios))) {
		bio->bi_status = status;
		bio_endio(bio);
	}
}

static void submit_bios(struct bio_list *bios)
{
	struct bio *bio;
	struct blk_plug plug;

	blk_start_plug(&plug);

	while ((bio = bio_list_pop(bios)))
		generic_make_request(bio);

	blk_finish_plug(&plug);
}

/*
 * Submit bio to the underlying device.
 *
 * If the bio triggers a commit, delay it, until after the metadata have been
 * committed.
 *
 * NOTE: The bio remapping must be performed by the caller.
 */
static void issue_bio(struct clone *clone, struct bio *bio)
{
	if (!bio_triggers_commit(clone, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL we won't be able to commit the
	 * metadata, so we complete the bio with an error.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a single
	 * commit for them in process_deferred_flush_bios().
	 */
	spin_lock_irq(&clone->lock);
	bio_list_add(&clone->deferred_flush_bios, bio);
	spin_unlock_irq(&clone->lock);

	wake_worker(clone);
}

/*
 * Remap bio to the destination device and submit it.
 *
 * If the bio triggers a commit, delay it, until after the metadata have been
 * committed.
 */
static void remap_and_issue(struct clone *clone, struct bio *bio)
{
	remap_to_dest(clone, bio);
	issue_bio(clone, bio);
}

/*
 * Issue bios that have been deferred until after their region has finished
 * hydrating.
 *
 * We delegate the bio submission to the worker thread, so this is safe to call
 * from interrupt context.
 */
static void issue_deferred_bios(struct clone *clone, struct bio_list *bios)
{
	struct bio *bio;
	unsigned long flags;
	struct bio_list flush_bios = BIO_EMPTY_LIST;
	struct bio_list normal_bios = BIO_EMPTY_LIST;

	if (bio_list_empty(bios))
		return;

	while ((bio = bio_list_pop(bios))) {
		if (bio_triggers_commit(clone, bio))
			bio_list_add(&flush_bios, bio);
		else
			bio_list_add(&normal_bios, bio);
	}

	spin_lock_irqsave(&clone->lock, flags);
	bio_list_merge(&clone->deferred_bios, &normal_bios);
	bio_list_merge(&clone->deferred_flush_bios, &flush_bios);
	spin_unlock_irqrestore(&clone->lock, flags);

	wake_worker(clone);
}

static void complete_overwrite_bio(struct clone *clone, struct bio *bio)
{
	unsigned long flags;

	/*
	 * If the bio has the REQ_FUA flag set we must commit the metadata
	 * before signaling its completion.
	 *
	 * complete_overwrite_bio() is only called by hydration_complete(),
	 * after having successfully updated the metadata. This means we don't
	 * need to call dm_clone_changed_this_transaction() to check if the
	 * metadata has changed and thus we can avoid taking the metadata spin
	 * lock.
	 */
	if (!(bio->bi_opf & REQ_FUA)) {
		bio_endio(bio);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL we won't be able to commit the
	 * metadata, so we complete the bio with an error.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a single
	 * commit for them in process_deferred_flush_bios().
	 */
	spin_lock_irqsave(&clone->lock, flags);
	bio_list_add(&clone->deferred_flush_completions, bio);
	spin_unlock_irqrestore(&clone->lock, flags);

	wake_worker(clone);
}

static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
{
	bio->bi_iter.bi_sector = sector;
	bio->bi_iter.bi_size = to_bytes(len);
}

static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
{
	unsigned long rs, nr_regions;

	/*
	 * If the destination device supports discards, remap and trim the
	 * discard bio and pass it down. Otherwise complete the bio
	 * immediately.
	 */
	if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
		remap_to_dest(clone, bio);
		bio_region_range(clone, bio, &rs, &nr_regions);
		trim_bio(bio, region_to_sector(clone, rs),
			 nr_regions << clone->region_shift);
		generic_make_request(bio);
	} else
		bio_endio(bio);
}

static void process_discard_bio(struct clone *clone, struct bio *bio)
{
	unsigned long rs, nr_regions;

	bio_region_range(clone, bio, &rs, &nr_regions);
	if (!nr_regions) {
		bio_endio(bio);
		return;
	}

	if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs ||
		    (rs + nr_regions) > clone->nr_regions)) {
		DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)",
		      clone_device_name(clone), rs, nr_regions,
		      clone->nr_regions,
		      (unsigned long long)bio->bi_iter.bi_sector,
		      bio_sectors(bio));
		bio_endio(bio);
		return;
	}

	/*
	 * The covered regions are already hydrated so we just need to pass
	 * down the discard.
	 */
	if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) {
		complete_discard_bio(clone, bio, true);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL we won't be able to update the
	 * metadata for the regions covered by the discard so we just ignore
	 * it.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		bio_endio(bio);
		return;
	}

	/*
	 * Defer discard processing.
	 */
	spin_lock_irq(&clone->lock);
	bio_list_add(&clone->deferred_discard_bios, bio);
	spin_unlock_irq(&clone->lock);

	wake_worker(clone);
}

/*---------------------------------------------------------------------------*/

/*
 * dm-clone region hydrations.
 */
struct dm_clone_region_hydration {
	struct clone *clone;
	unsigned long region_nr;

	struct bio *overwrite_bio;
	bio_end_io_t *overwrite_bio_end_io;

	struct bio_list deferred_bios;

	blk_status_t status;

	/* Used by hydration batching */
	struct list_head list;

	/* Used by hydration hash table */
	struct hlist_node h;
};

/*
 * Hydration hash table implementation.
 *
 * Ideally we would like to use list_bl, which uses bit spin locks and employs
 * the least significant bit of the list head to lock the corresponding bucket,
 * reducing the memory overhead for the locks. But, currently, list_bl and bit
 * spin locks don't support IRQ safe versions. Since we have to take the lock
 * in both process and interrupt context, we must fall back to using regular
 * spin locks; one per hash table bucket.
 */
struct hash_table_bucket {
	struct hlist_head head;

	/* Spinlock protecting the bucket */
	spinlock_t lock;
};

#define bucket_lock_irqsave(bucket, flags) \
	spin_lock_irqsave(&(bucket)->lock, flags)

#define bucket_unlock_irqrestore(bucket, flags) \
	spin_unlock_irqrestore(&(bucket)->lock, flags)

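/* Allocate the hydration hash table and initialize each bucket's list and lock. */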
static int hash_table_init(struct clone *clone)
{
	unsigned int i, sz;
	struct hash_table_bucket *bucket;

	sz = 1 << HASH_TABLE_BITS;

	clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL);
	if (!clone->ht)
		return -ENOMEM;

	for (i = 0; i < sz; i++) {
		bucket = clone->ht + i;

		INIT_HLIST_HEAD(&bucket->head);
		spin_lock_init(&bucket->lock);
	}

	return 0;
}

static void hash_table_exit(struct clone *clone)
{
	kvfree(clone->ht);
}

static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone,
							unsigned long region_nr)
{
	return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)];
}

/*
 * Search hash table for a hydration with hd->region_nr == region_nr
 *
 * NOTE: Must be called with the bucket lock held
 */
static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket,
						     unsigned long region_nr)
{
	struct dm_clone_region_hydration *hd;

	hlist_for_each_entry(hd, &bucket->head, h) {
		if (hd->region_nr == region_nr)
			return hd;
	}

	return NULL;
}

/*
 * Insert a hydration into the hash table.
 *
 * NOTE: Must be called with the bucket lock held.
 */
static inline void __insert_region_hydration(struct hash_table_bucket *bucket,
					     struct dm_clone_region_hydration *hd)
{
	hlist_add_head(&hd->h, &bucket->head);
}

/*
 * This function inserts a hydration into the hash table, unless someone else
 * managed to insert a hydration for the same region first. In the latter case
 * it returns the existing hydration descriptor for this region.
 *
 * NOTE: Must be called with the hydration hash table lock held.
 */
static struct dm_clone_region_hydration *
__find_or_insert_region_hydration(struct hash_table_bucket *bucket,
				  struct dm_clone_region_hydration *hd)
{
	struct dm_clone_region_hydration *hd2;

	hd2 = __hash_find(bucket, hd->region_nr);
	if (hd2)
		return hd2;

	__insert_region_hydration(bucket, hd);

	return hd;
}

/*---------------------------------------------------------------------------*/

/* Allocate a hydration */
static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone)
{
	struct dm_clone_region_hydration *hd;

	/*
	 * Allocate a hydration from the hydration mempool.
	 * This might block but it can't fail.
	 */
	hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO);
	hd->clone = clone;

	return hd;
}

static inline void free_hydration(struct dm_clone_region_hydration *hd)
{
	mempool_free(hd, &hd->clone->hydration_pool);
}

/* Initialize a hydration */
static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr)
{
	hd->region_nr = region_nr;
	hd->overwrite_bio = NULL;
	bio_list_init(&hd->deferred_bios);
	hd->status = 0;

	INIT_LIST_HEAD(&hd->list);
	INIT_HLIST_NODE(&hd->h);
}

/*---------------------------------------------------------------------------*/

/*
 * Update dm-clone's metadata after a region has finished hydrating and remove
 * hydration from the hash table.
 */
static int hydration_update_metadata(struct dm_clone_region_hydration *hd)
{
	int r = 0;
	unsigned long flags;
	struct hash_table_bucket *bucket;
	struct clone *clone = hd->clone;

	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
		r = -EPERM;

	/* Update the metadata */
	if (likely(!r) && hd->status == BLK_STS_OK)
		r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr);

	bucket = get_hash_table_bucket(clone, hd->region_nr);

	/* Remove hydration from hash table */
	bucket_lock_irqsave(bucket, flags);
	hlist_del(&hd->h);
	bucket_unlock_irqrestore(bucket, flags);

	return r;
}

/*
 * Complete a region's hydration:
 *
 *	1. Update dm-clone's metadata.
 *	2. Remove hydration from hash table.
 *	3. Complete overwrite bio.
 *	4. Issue deferred bios.
 *	5. If this was the last hydration, wake up anyone waiting for
 *	   hydrations to finish.
 */
static void hydration_complete(struct dm_clone_region_hydration *hd)
{
	int r;
	blk_status_t status;
	struct clone *clone = hd->clone;

	r = hydration_update_metadata(hd);

	if (hd->status == BLK_STS_OK && likely(!r)) {
		if (hd->overwrite_bio)
			complete_overwrite_bio(clone, hd->overwrite_bio);

		issue_deferred_bios(clone, &hd->deferred_bios);
	} else {
		status = r ? BLK_STS_IOERR : hd->status;

		if (hd->overwrite_bio)
			bio_list_add(&hd->deferred_bios, hd->overwrite_bio);

		fail_bios(&hd->deferred_bios, status);
	}

	free_hydration(hd);

	if (atomic_dec_and_test(&clone->hydrations_in_flight))
		wakeup_hydration_waiters(clone);
}

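/*
 * kcopyd completion callback: complete the head hydration and every hydration
 * batched behind it, then kick the worker to continue background hydration if
 * there is no I/O in flight.
 */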
static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context)
{
	blk_status_t status;

	struct dm_clone_region_hydration *tmp, *hd = context;
	struct clone *clone = hd->clone;

	LIST_HEAD(batched_hydrations);

	if (read_err || write_err) {
		DMERR_LIMIT("%s: hydration failed", clone_device_name(clone));
		status = BLK_STS_IOERR;
	} else {
		status = BLK_STS_OK;
	}
	list_splice_tail(&hd->list, &batched_hydrations);

	hd->status = status;
	hydration_complete(hd);

	/* Complete batched hydrations */
	list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) {
		hd->status = status;
		hydration_complete(hd);
	}

	/* Continue background hydration, if there is no I/O in-flight */
	if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
	    !atomic_read(&clone->ios_in_flight))
		wake_worker(clone);
}

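/*
 * Copy nr_regions regions, starting from hd->region_nr, from the source to
 * the destination device using kcopyd. The copy is trimmed when it includes
 * the last region of the target, which may be shorter than region_size.
 */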
static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions)
{
	unsigned long region_start, region_end;
	sector_t tail_size, region_size, total_size;
	struct dm_io_region from, to;
	struct clone *clone = hd->clone;

	if (WARN_ON(!nr_regions))
		return;

	region_size = clone->region_size;
	region_start = hd->region_nr;
	region_end = region_start + nr_regions - 1;

	total_size = region_to_sector(clone, nr_regions - 1);

	if (region_end == clone->nr_regions - 1) {
		/*
		 * The last region of the target might be smaller than
		 * region_size.
		 */
		tail_size = clone->ti->len & (region_size - 1);
		if (!tail_size)
			tail_size = region_size;
	} else {
		tail_size = region_size;
	}

	total_size += tail_size;

	from.bdev = clone->source_dev->bdev;
	from.sector = region_to_sector(clone, region_start);
	from.count = total_size;

	to.bdev = clone->dest_dev->bdev;
	to.sector = from.sector;
	to.count = from.count;

	/* Issue copy */
	atomic_add(nr_regions, &clone->hydrations_in_flight);
	dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0,
		       hydration_kcopyd_callback, hd);
}

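/*
 * Completion routine for overwrite bios: restore the bio's original end_io,
 * record the I/O status and complete the region's hydration.
 */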
static void overwrite_endio(struct bio *bio)
{
	struct dm_clone_region_hydration *hd = bio->bi_private;

	bio->bi_end_io = hd->overwrite_bio_end_io;
	hd->status = bio->bi_status;

	hydration_complete(hd);
}

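/*
 * Hydrate a region by letting a region-sized WRITE bio overwrite it directly
 * on the destination device, instead of copying it from the source device.
 */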
static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio)
{
	/*
	 * We don't need to save and restore bio->bi_private because device
	 * mapper core generates a new bio for us to use, with clean
	 * bi_private.
	 */
	hd->overwrite_bio = bio;
	hd->overwrite_bio_end_io = bio->bi_end_io;

	bio->bi_end_io = overwrite_endio;
	bio->bi_private = hd;

	atomic_inc(&hd->clone->hydrations_in_flight);
	generic_make_request(bio);
}

/*
 * Hydrate the bio's region.
 *
 * This function starts the hydration of the bio's region and puts the bio in
 * the list of deferred bios for this region. If, by the time this function is
 * called, the region has already finished hydrating, the bio is submitted to
 * the destination device instead.
 *
 * NOTE: The bio remapping must be performed by the caller.
 */
static void hydrate_bio_region(struct clone *clone, struct bio *bio)
{
	unsigned long flags;
	unsigned long region_nr;
	struct hash_table_bucket *bucket;
	struct dm_clone_region_hydration *hd, *hd2;

	region_nr = bio_to_region(clone, bio);
	bucket = get_hash_table_bucket(clone, region_nr);

	bucket_lock_irqsave(bucket, flags);

	hd = __hash_find(bucket, region_nr);
	if (hd) {
		/* Someone else is hydrating the region */
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irqrestore(bucket, flags);
		return;
	}

	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		/* The region has been hydrated */
		bucket_unlock_irqrestore(bucket, flags);
		issue_bio(clone, bio);
		return;
	}

	/*
	 * We must allocate a hydration descriptor and start the hydration of
	 * the corresponding region.
	 */
	bucket_unlock_irqrestore(bucket, flags);

	hd = alloc_hydration(clone);
	hydration_init(hd, region_nr);

	bucket_lock_irqsave(bucket, flags);

	/* Check if the region has been hydrated in the meantime. */
	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		bucket_unlock_irqrestore(bucket, flags);
		free_hydration(hd);
		issue_bio(clone, bio);
		return;
	}

	hd2 = __find_or_insert_region_hydration(bucket, hd);
	if (hd2 != hd) {
		/* Someone else started the region's hydration. */
		bio_list_add(&hd2->deferred_bios, bio);
		bucket_unlock_irqrestore(bucket, flags);
		free_hydration(hd);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL then there is no point starting a
	 * hydration, since we will not be able to update the metadata when the
	 * hydration finishes.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		hlist_del(&hd->h);
		bucket_unlock_irqrestore(bucket, flags);
		free_hydration(hd);
		bio_io_error(bio);
		return;
	}

	/*
	 * Start region hydration.
	 *
	 * If a bio overwrites a region, i.e., its size is equal to the
	 * region's size, then we don't need to copy the region from the source
	 * to the destination device.
	 */
	if (is_overwrite_bio(clone, bio)) {
		bucket_unlock_irqrestore(bucket, flags);
		hydration_overwrite(hd, bio);
	} else {
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irqrestore(bucket, flags);
		hydration_copy(hd, 1);
	}
}

/*---------------------------------------------------------------------------*/

/*
 * Background hydrations.
 */

/*
 * Batch region hydrations.
 *
 * To better utilize device bandwidth we batch together the hydration of
 * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which
 * is good for small, random write performance (because of the overwriting of
 * un-hydrated regions) and at the same time issue big copy requests to kcopyd
 * to achieve high hydration bandwidth.
 */
struct batch_info {
	struct dm_clone_region_hydration *head;
	unsigned int nr_batched_regions;
};

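/*
 * Add hd to the current batch if it is adjacent to it. Otherwise, or when the
 * batch reaches the maximum size, issue the batch to kcopyd and start a new
 * one.
 */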
static void __batch_hydration(struct batch_info *batch,
			      struct dm_clone_region_hydration *hd)
{
	struct clone *clone = hd->clone;
	unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size);

	if (batch->head) {
		/* Try to extend the current batch */
		if (batch->nr_batched_regions < max_batch_size &&
		    (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) {
			list_add_tail(&hd->list, &batch->head->list);
			batch->nr_batched_regions++;
			hd = NULL;
		}

		/* Check if we should issue the current batch */
		if (batch->nr_batched_regions >= max_batch_size || hd) {
			hydration_copy(batch->head, batch->nr_batched_regions);
			batch->head = NULL;
			batch->nr_batched_regions = 0;
		}
	}

	if (!hd)
		return;

	/* We treat max batch sizes of zero and one equivalently */
	if (max_batch_size <= 1) {
		hydration_copy(hd, 1);
		return;
	}

	/* Start a new batch */
	BUG_ON(!list_empty(&hd->list));
	batch->head = hd;
	batch->nr_batched_regions = 1;
}

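/*
 * Find the next unhydrated region, starting from 'offset', and start its
 * hydration, batching it with adjacent hydrations when possible. Returns the
 * offset from which the search should continue.
 */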
static unsigned long __start_next_hydration(struct clone *clone,
					    unsigned long offset,
					    struct batch_info *batch)
{
	unsigned long flags;
	struct hash_table_bucket *bucket;
	struct dm_clone_region_hydration *hd;
	unsigned long nr_regions = clone->nr_regions;

	hd = alloc_hydration(clone);

	/* Try to find a region to hydrate. */
	do {
		offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset);
		if (offset == nr_regions)
			break;

		bucket = get_hash_table_bucket(clone, offset);
		bucket_lock_irqsave(bucket, flags);

		if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
		    !__hash_find(bucket, offset)) {
			hydration_init(hd, offset);
			__insert_region_hydration(bucket, hd);
			bucket_unlock_irqrestore(bucket, flags);

			/* Batch hydration */
			__batch_hydration(batch, hd);

			return (offset + 1);
		}

		bucket_unlock_irqrestore(bucket, flags);

	} while (++offset < nr_regions);

	if (hd)
		free_hydration(hd);

	return offset;
}

/*
 * This function searches for regions that still reside in the source device
 * and starts their hydration.
 */
static void do_hydration(struct clone *clone)
{
	unsigned int current_volume;
	unsigned long offset, nr_regions = clone->nr_regions;

	struct batch_info batch = {
		.head = NULL,
		.nr_batched_regions = 0,
	};

	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
		return;

	if (dm_clone_is_hydration_done(clone->cmd))
		return;

	/*
	 * Avoid race with device suspension.
	 */
	atomic_inc(&clone->hydrations_in_flight);

	/*
	 * Make sure atomic_inc() is ordered before test_bit(), otherwise we
	 * might race with clone_postsuspend() and start a region hydration
	 * after the target has been suspended.
	 *
	 * This is paired with the smp_mb__after_atomic() in
	 * clone_postsuspend().
	 */
	smp_mb__after_atomic();

	offset = clone->hydration_offset;
	while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) &&
	       !atomic_read(&clone->ios_in_flight) &&
	       test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
	       offset < nr_regions) {
		current_volume = atomic_read(&clone->hydrations_in_flight);
		current_volume += batch.nr_batched_regions;

		if (current_volume > READ_ONCE(clone->hydration_threshold))
			break;

		offset = __start_next_hydration(clone, offset, &batch);
	}

	if (batch.head)
		hydration_copy(batch.head, batch.nr_batched_regions);

	if (offset >= nr_regions)
		offset = 0;

	clone->hydration_offset = offset;

	if (atomic_dec_and_test(&clone->hydrations_in_flight))
		wakeup_hydration_waiters(clone);
}

/*---------------------------------------------------------------------------*/

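/* Check whether the periodic commit interval (COMMIT_PERIOD) has elapsed. */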
static bool need_commit_due_to_time(struct clone *clone)
{
	return !time_in_range(jiffies, clone->last_commit_jiffies,
			      clone->last_commit_jiffies + COMMIT_PERIOD);
}

/*
 * A non-zero return indicates read-only or fail mode.
 */
static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
{
	int r = 0;

	if (dest_dev_flushed)
		*dest_dev_flushed = false;

	mutex_lock(&clone->commit_lock);

	if (!dm_clone_changed_this_transaction(clone->cmd))
		goto out;

	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		r = -EPERM;
		goto out;
	}

	r = dm_clone_metadata_pre_commit(clone->cmd);
	if (unlikely(r)) {
		__metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r);
		goto out;
	}

	bio_reset(&clone->flush_bio);
	bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
	clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;

	r = submit_bio_wait(&clone->flush_bio);
	if (unlikely(r)) {
		__metadata_operation_failed(clone, "flush destination device", r);
		goto out;
	}

	if (dest_dev_flushed)
		*dest_dev_flushed = true;

	r = dm_clone_metadata_commit(clone->cmd);
	if (unlikely(r)) {
		__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
		goto out;
	}

	if (dm_clone_is_hydration_done(clone->cmd))
		dm_table_event(clone->ti->table);
out:
	mutex_unlock(&clone->commit_lock);

	return r;
}

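/*
 * Update the metadata for the regions covered by the deferred discards and
 * then complete them, passing each discard down to the destination device
 * only if the metadata update succeeded.
 */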
static void process_deferred_discards(struct clone *clone)
{
	int r = -EPERM;
	struct bio *bio;
	struct blk_plug plug;
	unsigned long rs, nr_regions;
	struct bio_list discards = BIO_EMPTY_LIST;

	spin_lock_irq(&clone->lock);
	bio_list_merge(&discards, &clone->deferred_discard_bios);
	bio_list_init(&clone->deferred_discard_bios);
	spin_unlock_irq(&clone->lock);

	if (bio_list_empty(&discards))
		return;

	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
		goto out;

	/* Update the metadata */
	bio_list_for_each(bio, &discards) {
		bio_region_range(clone, bio, &rs, &nr_regions);
		/*
		 * A discard request might cover regions that have been already
		 * hydrated. There is no need to update the metadata for these
		 * regions.
		 */
		r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions);
		if (unlikely(r))
			break;
	}
out:
	blk_start_plug(&plug);
	while ((bio = bio_list_pop(&discards)))
		complete_discard_bio(clone, bio, r == 0);
	blk_finish_plug(&plug);
}

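/* Submit the bios that were deferred while their regions were hydrating. */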
static void process_deferred_bios(struct clone *clone)
{
	struct bio_list bios = BIO_EMPTY_LIST;

	spin_lock_irq(&clone->lock);
	bio_list_merge(&bios, &clone->deferred_bios);
	bio_list_init(&clone->deferred_bios);
	spin_unlock_irq(&clone->lock);

	if (bio_list_empty(&bios))
		return;

	submit_bios(&bios);
}

static void process_deferred_flush_bios(struct clone *clone)
{
	struct bio *bio;
	bool dest_dev_flushed;
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio_list bio_completions = BIO_EMPTY_LIST;

	/*
	 * If there are any deferred flush bios, we must commit the metadata
	 * before issuing them or signaling their completion.
	 */
	spin_lock_irq(&clone->lock);
	bio_list_merge(&bios, &clone->deferred_flush_bios);
	bio_list_init(&clone->deferred_flush_bios);

	bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
	bio_list_init(&clone->deferred_flush_completions);
	spin_unlock_irq(&clone->lock);

	if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
	    !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
		return;

	if (commit_metadata(clone, &dest_dev_flushed)) {
		bio_list_merge(&bios, &bio_completions);

		while ((bio = bio_list_pop(&bios)))
			bio_io_error(bio);

		return;
	}

	clone->last_commit_jiffies = jiffies;

	while ((bio = bio_list_pop(&bio_completions)))
		bio_endio(bio);

	while ((bio = bio_list_pop(&bios))) {
		if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
			/* We just flushed the destination device as part of
			 * the metadata commit, so there is no reason to send
			 * another flush.
			 */
			bio_endio(bio);
		} else {
			generic_make_request(bio);
		}
	}
}

static void do_worker(struct work_struct *work)
{
	struct clone *clone = container_of(work, typeof(*clone), worker);

	process_deferred_bios(clone);
	process_deferred_discards(clone);

	/*
	 * process_deferred_flush_bios():
	 *
	 *   - Commit metadata
	 *
	 *   - Process deferred REQ_FUA completions
	 *
	 *   - Process deferred REQ_PREFLUSH bios
	 */
	process_deferred_flush_bios(clone);

	/* Background hydration */
	do_hydration(clone);
}

/*
 * Commit periodically so that not too much unwritten data builds up.
 *
 * Also, restart background hydration, if it has been stopped by in-flight I/O.
 */
static void do_waker(struct work_struct *work)
{
	struct clone *clone = container_of(to_delayed_work(work), struct clone, waker);

	wake_worker(clone);
	queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD);
}

/*---------------------------------------------------------------------------*/

/*
 * Target methods
 */
static int clone_map(struct dm_target *ti, struct bio *bio)
{
	struct clone *clone = ti->private;
	unsigned long region_nr;

	atomic_inc(&clone->ios_in_flight);

	if (unlikely(get_clone_mode(clone) == CM_FAIL))
		return DM_MAPIO_KILL;

	/*
	 * REQ_PREFLUSH bios carry no data:
	 *
	 * - Commit metadata, if changed
	 *
	 * - Pass down to destination device
	 */
	if (bio->bi_opf & REQ_PREFLUSH) {
		remap_and_issue(clone, bio);
		return DM_MAPIO_SUBMITTED;
	}

	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

	/*
	 * dm-clone interprets discards and performs a fast hydration of the
	 * discarded regions, i.e., we skip the copy from the source device and
	 * just mark the regions as hydrated.
	 */
	if (bio_op(bio) == REQ_OP_DISCARD) {
		process_discard_bio(clone, bio);
		return DM_MAPIO_SUBMITTED;
	}

	/*
	 * If the bio's region is hydrated, redirect it to the destination
	 * device.
	 *
	 * If the region is not hydrated and the bio is a READ, redirect it to
	 * the source device.
	 *
	 * Else, defer WRITE bio until after its region has been hydrated and
	 * start the region's hydration immediately.
	 */
	region_nr = bio_to_region(clone, bio);
	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		remap_and_issue(clone, bio);
		return DM_MAPIO_SUBMITTED;
	} else if (bio_data_dir(bio) == READ) {
		remap_to_source(clone, bio);
		return DM_MAPIO_REMAPPED;
	}

	remap_to_dest(clone, bio);
	hydrate_bio_region(clone, bio);

	return DM_MAPIO_SUBMITTED;
}

static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error)
{
	struct clone *clone = ti->private;

	atomic_dec(&clone->ios_in_flight);

	return DM_ENDIO_DONE;
}

static void emit_flags(struct clone *clone, char *result, unsigned int maxlen,
		       ssize_t *sz_ptr)
{
	ssize_t sz = *sz_ptr;
	unsigned int count;

	count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
	count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);

	DMEMIT("%u ", count);

	if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
		DMEMIT("no_hydration ");

	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
		DMEMIT("no_discard_passdown ");

	*sz_ptr = sz;
}

static void emit_core_args(struct clone *clone, char *result,
			   unsigned int maxlen, ssize_t *sz_ptr)
{
	ssize_t sz = *sz_ptr;
	unsigned int count = 4;

	DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count,
	       READ_ONCE(clone->hydration_threshold),
	       READ_ONCE(clone->hydration_batch_size));

	*sz_ptr = sz;
}

/*
 * Status format:
 *
 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
 * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions>
 * <#features> <features>* <#core args> <core args>* <clone metadata mode>
 */
static void clone_status(struct dm_target *ti, status_type_t type,
			 unsigned int status_flags, char *result,
			 unsigned int maxlen)
{
	int r;
	unsigned int i;
	ssize_t sz = 0;
	dm_block_t nr_free_metadata_blocks = 0;
	dm_block_t nr_metadata_blocks = 0;
	char buf[BDEVNAME_SIZE];
	struct clone *clone = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		if (get_clone_mode(clone) == CM_FAIL) {
			DMEMIT("Fail");
			break;
		}

		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
			(void) commit_metadata(clone, NULL);

		r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);

		if (r) {
			DMERR("%s: dm_clone_get_free_metadata_block_count returned %d",
			      clone_device_name(clone), r);
			goto error;
		}

		r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks);

		if (r) {
			DMERR("%s: dm_clone_get_metadata_dev_size returned %d",
			      clone_device_name(clone), r);
			goto error;
		}

		DMEMIT("%u %llu/%llu %llu %u/%lu %u ",
		       DM_CLONE_METADATA_BLOCK_SIZE,
		       (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
		       (unsigned long long)nr_metadata_blocks,
		       (unsigned long long)clone->region_size,
		       dm_clone_nr_of_hydrated_regions(clone->cmd),
		       clone->nr_regions,
		       atomic_read(&clone->hydrations_in_flight));

		emit_flags(clone, result, maxlen, &sz);
		emit_core_args(clone, result, maxlen, &sz);

		switch (get_clone_mode(clone)) {
		case CM_WRITE:
			DMEMIT("rw");
			break;
		case CM_READ_ONLY:
			DMEMIT("ro");
			break;
		case CM_FAIL:
			DMEMIT("Fail");
		}

		break;

	case STATUSTYPE_TABLE:
		format_dev_t(buf, clone->metadata_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);

		format_dev_t(buf, clone->dest_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);

		format_dev_t(buf, clone->source_dev->bdev->bd_dev);
		DMEMIT("%s", buf);

		for (i = 0; i < clone->nr_ctr_args; i++)
			DMEMIT(" %s", clone->ctr_args[i]);
	}

	return;

error:
	DMEMIT("Error");
}

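/* Report congestion if either underlying device's backing device is congested. */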
static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	struct request_queue *dest_q, *source_q;
	struct clone *clone = container_of(cb, struct clone, callbacks);

	source_q = bdev_get_queue(clone->source_dev->bdev);
	dest_q = bdev_get_queue(clone->dest_dev->bdev);

	return (bdi_congested(dest_q->backing_dev_info, bdi_bits) |
		bdi_congested(source_q->backing_dev_info, bdi_bits));
}

static sector_t get_dev_size(struct dm_dev *dev)
{
	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
}

/*---------------------------------------------------------------------------*/

/*
 * Construct a clone device mapping:
 *
 * clone <metadata dev> <destination dev> <source dev> <region size>
 *	[<#feature args> [<feature arg>]* [<#core args> [key value]*]]
 *
 * metadata dev: Fast device holding the persistent metadata
 * destination dev: The destination device, which will become a clone of the
 *                  source device
 * source dev: The read-only source device that gets cloned
 * region size: dm-clone unit size in sectors
 *
 * #feature args: Number of feature arguments passed
 * feature args: E.g. no_hydration, no_discard_passdown
 *
 * #core arguments: An even number of core arguments
 * core arguments: Key/value pairs for tuning the core
 *		   E.g. 'hydration_threshold 256'
 */
static int parse_feature_args(struct dm_arg_set *as, struct clone *clone)
{
	int r;
	unsigned int argc;
	const char *arg_name;
	struct dm_target *ti = clone->ti;

	const struct dm_arg args = {
		.min = 0,
		.max = 2,
		.error = "Invalid number of feature arguments"
	};

	/* No feature arguments supplied */
	if (!as->argc)
		return 0;

	r = dm_read_arg_group(&args, as, &argc, &ti->error);
	if (r)
		return r;

	while (argc) {
		arg_name = dm_shift_arg(as);
		argc--;

		if (!strcasecmp(arg_name, "no_hydration")) {
			__clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
		} else if (!strcasecmp(arg_name, "no_discard_passdown")) {
			__clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
		} else {
			ti->error = "Invalid feature argument";
			return -EINVAL;
		}
	}

	return 0;
}

static int parse_core_args(struct dm_arg_set *as, struct clone *clone)
{
	int r;
	unsigned int argc;
	unsigned int value;
	const char *arg_name;
	struct dm_target *ti = clone->ti;

	const struct dm_arg args = {
		.min = 0,
		.max = 4,
		.error = "Invalid number of core arguments"
	};

	/* Initialize core arguments */
	clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE;
	clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD;

	/* No core arguments supplied */
	if (!as->argc)
		return 0;

	r = dm_read_arg_group(&args, as, &argc, &ti->error);
	if (r)
		return r;

	if (argc & 1) {
		ti->error = "Number of core arguments must be even";
		return -EINVAL;
	}

	while (argc) {
		arg_name = dm_shift_arg(as);
		argc -= 2;

		if (!strcasecmp(arg_name, "hydration_threshold")) {
			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
				ti->error = "Invalid value for argument `hydration_threshold'";
				return -EINVAL;
			}
			clone->hydration_threshold = value;
		} else if (!strcasecmp(arg_name, "hydration_batch_size")) {
			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
				ti->error = "Invalid value for argument `hydration_batch_size'";
				return -EINVAL;
			}
			clone->hydration_batch_size = value;
		} else {
			ti->error = "Invalid core argument";
			return -EINVAL;
		}
	}

	return 0;
}

static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error)
{
	int r;
	unsigned int region_size;
	struct dm_arg arg;

	arg.min = MIN_REGION_SIZE;
	arg.max = MAX_REGION_SIZE;
	arg.error = "Invalid region size";

	r = dm_read_arg(&arg, as, &region_size, error);
	if (r)
		return r;

	/* Check region size is a power of 2 */
	if (!is_power_of_2(region_size)) {
		*error = "Region size is not a power of 2";
		return -EINVAL;
	}

	/* Validate the region size against the device logical block size */
	if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) ||
	    region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) {
		*error = "Region size is not a multiple of device logical block size";
		return -EINVAL;
	}

	clone->region_size = region_size;

	return 0;
}

static int validate_nr_regions(unsigned long n, char **error)
{
	/*
	 * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us
	 * further to 2^31 regions.
	 */
	if (n > (1UL << 31)) {
		*error = "Too many regions. Consider increasing the region size";
		return -EINVAL;
	}

	return 0;
}

static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error)
{
	int r;
	sector_t metadata_dev_size;
	char b[BDEVNAME_SIZE];

	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &clone->metadata_dev);
	if (r) {
		*error = "Error opening metadata device";
		return r;
	}

	metadata_dev_size = get_dev_size(clone->metadata_dev);
	if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING)
		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
		       bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS);

	return 0;
}

static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error)
{
	int r;
	sector_t dest_dev_size;

	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &clone->dest_dev);
	if (r) {
		*error = "Error opening destination device";
		return r;
	}

	dest_dev_size = get_dev_size(clone->dest_dev);
	if (dest_dev_size < clone->ti->len) {
		dm_put_device(clone->ti, clone->dest_dev);
		*error = "Device size larger than destination device";
		return -EINVAL;
	}

	return 0;
}

static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error)
{
	int r;
	sector_t source_dev_size;

	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ,
			  &clone->source_dev);
	if (r) {
		*error = "Error opening source device";
		return r;
	}

	source_dev_size = get_dev_size(clone->source_dev);
	if (source_dev_size < clone->ti->len) {
		dm_put_device(clone->ti, clone->source_dev);
		*error = "Device size larger than source device";
		return -EINVAL;
	}

	return 0;
}

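/*
 * Duplicate the constructor arguments, so that clone_status() can emit the
 * original table line.
 */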
static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error)
{
	unsigned int i;
	const char **copy;

	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
	if (!copy)
		goto error;

	for (i = 0; i < argc; i++) {
		copy[i] = kstrdup(argv[i], GFP_KERNEL);

		if (!copy[i]) {
			while (i--)
				kfree(copy[i]);
			kfree(copy);
			goto error;
		}
	}

	clone->nr_ctr_args = argc;
	clone->ctr_args = copy;
	return 0;

error:
	*error = "Failed to allocate memory for table line";
	return -ENOMEM;
}

static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	sector_t nr_regions;
	struct clone *clone;
	struct dm_arg_set as;

	if (argc < 4) {
		ti->error = "Invalid number of arguments";
		return -EINVAL;
	}

	as.argc = argc;
	as.argv = argv;

	clone = kzalloc(sizeof(*clone), GFP_KERNEL);
	if (!clone) {
		ti->error = "Failed to allocate clone structure";
		return -ENOMEM;
	}

	clone->ti = ti;

	/* Initialize dm-clone flags */
	__set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
	__set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
	__set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);

	r = parse_metadata_dev(clone, &as, &ti->error);
	if (r)
		goto out_with_clone;

	r = parse_dest_dev(clone, &as, &ti->error);
	if (r)
		goto out_with_meta_dev;

	r = parse_source_dev(clone, &as, &ti->error);
	if (r)
		goto out_with_dest_dev;

	r = parse_region_size(clone, &as, &ti->error);
	if (r)
		goto out_with_source_dev;

	clone->region_shift = __ffs(clone->region_size);
	nr_regions = dm_sector_div_up(ti->len, clone->region_size);

	/* Check for overflow */
	if (nr_regions != (unsigned long)nr_regions) {
		ti->error = "Too many regions. Consider increasing the region size";
		r = -EOVERFLOW;
		goto out_with_source_dev;
	}

	clone->nr_regions = nr_regions;

	r = validate_nr_regions(clone->nr_regions, &ti->error);
	if (r)
		goto out_with_source_dev;

	r = dm_set_target_max_io_len(ti, clone->region_size);
	if (r) {
		ti->error = "Failed to set max io len";
		goto out_with_source_dev;
	}

	r = parse_feature_args(&as, clone);
	if (r)
		goto out_with_source_dev;

	r = parse_core_args(&as, clone);
	if (r)
		goto out_with_source_dev;

	/* Load metadata */
	clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len,
					    clone->region_size);
	if (IS_ERR(clone->cmd)) {
		ti->error = "Failed to load metadata";
		r = PTR_ERR(clone->cmd);
		goto out_with_source_dev;
	}

	__set_clone_mode(clone, CM_WRITE);

	if (get_clone_mode(clone) != CM_WRITE) {
		ti->error = "Unable to get write access to metadata, please check/repair metadata";
		r = -EPERM;
		goto out_with_metadata;
	}

	clone->last_commit_jiffies = jiffies;

	/* Allocate hydration hash table */
	r = hash_table_init(clone);
	if (r) {
		ti->error = "Failed to allocate hydration hash table";
		goto out_with_metadata;
	}

	atomic_set(&clone->ios_in_flight, 0);
	init_waitqueue_head(&clone->hydration_stopped);
	spin_lock_init(&clone->lock);
	bio_list_init(&clone->deferred_bios);
	bio_list_init(&clone->deferred_discard_bios);
	bio_list_init(&clone->deferred_flush_bios);
	bio_list_init(&clone->deferred_flush_completions);
	clone->hydration_offset = 0;
	atomic_set(&clone->hydrations_in_flight, 0);
	bio_init(&clone->flush_bio, NULL, 0);

	clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
	if (!clone->wq) {
		ti->error = "Failed to allocate workqueue";
		r = -ENOMEM;
		goto out_with_ht;
	}

	INIT_WORK(&clone->worker, do_worker);
	INIT_DELAYED_WORK(&clone->waker, do_waker);

	clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
	if (IS_ERR(clone->kcopyd_client)) {
		r = PTR_ERR(clone->kcopyd_client);
		goto out_with_wq;
	}

	r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS,
				   _hydration_cache);
	if (r) {
		ti->error = "Failed to create dm_clone_region_hydration memory pool";
		goto out_with_kcopyd;
	}

	/* Save a copy of the table line */
	r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error);
	if (r)
		goto out_with_mempool;

	mutex_init(&clone->commit_lock);
	clone->callbacks.congested_fn = clone_is_congested;
	dm_table_add_target_callbacks(ti->table, &clone->callbacks);

	/* Enable flushes */
	ti->num_flush_bios = 1;
	ti->flush_supported = true;

	/* Enable discards */
	ti->discards_supported = true;
	ti->num_discard_bios = 1;

	ti->private = clone;

	return 0;

out_with_mempool:
	mempool_exit(&clone->hydration_pool);
out_with_kcopyd:
	dm_kcopyd_client_destroy(clone->kcopyd_client);
out_with_wq:
	destroy_workqueue(clone->wq);
out_with_ht:
	hash_table_exit(clone);
out_with_metadata:
	dm_clone_metadata_close(clone->cmd);
out_with_source_dev:
	dm_put_device(ti, clone->source_dev);
out_with_dest_dev:
	dm_put_device(ti, clone->dest_dev);
out_with_meta_dev:
	dm_put_device(ti, clone->metadata_dev);
out_with_clone:
	kfree(clone);

	return r;
}

static void clone_dtr(struct dm_target *ti)
{
	unsigned int i;
	struct clone *clone = ti->private;

	mutex_destroy(&clone->commit_lock);
	bio_uninit(&clone->flush_bio);

	for (i = 0; i < clone->nr_ctr_args; i++)
		kfree(clone->ctr_args[i]);
	kfree(clone->ctr_args);

	mempool_exit(&clone->hydration_pool);
	dm_kcopyd_client_destroy(clone->kcopyd_client);
	cancel_delayed_work_sync(&clone->waker);
	destroy_workqueue(clone->wq);
	hash_table_exit(clone);
	dm_clone_metadata_close(clone->cmd);
	dm_put_device(ti, clone->source_dev);
	dm_put_device(ti, clone->dest_dev);
	dm_put_device(ti, clone->metadata_dev);

	kfree(clone);
}

/*---------------------------------------------------------------------------*/

static void clone_postsuspend(struct dm_target *ti)
{
	struct clone *clone = ti->private;

	/*
	 * To successfully suspend the device:
	 *
	 *	- We cancel the delayed work for periodic commits and wait for
	 *	  it to finish.
	 *
	 *	- We stop the background hydration, i.e. we prevent new region
	 *	  hydrations from starting.
	 *
	 *	- We wait for any in-flight hydrations to finish.
	 *
	 *	- We flush the workqueue.
	 *
	 *	- We commit the metadata.
	 */
	cancel_delayed_work_sync(&clone->waker);

	set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);

	/*
	 * Make sure set_bit() is ordered before atomic_read(), otherwise we
	 * might race with do_hydration() and miss some started region
	 * hydrations.
	 *
	 * This is paired with smp_mb__after_atomic() in do_hydration().
	 */
	smp_mb__after_atomic();

	wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
	flush_workqueue(clone->wq);

	(void) commit_metadata(clone, NULL);
}

static void clone_resume(struct dm_target *ti)
{
	struct clone *clone = ti->private;

	clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
	do_waker(&clone->waker.work);
}

static bool bdev_supports_discards(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);

	return (q && blk_queue_discard(q));
}

/*
 * If discard_passdown was enabled verify that the destination device supports
 * discards. Disable discard_passdown if not.
 */
static void disable_passdown_if_not_supported(struct clone *clone)
{
	struct block_device *dest_dev = clone->dest_dev->bdev;
	struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits;
	const char *reason = NULL;
	char buf[BDEVNAME_SIZE];

	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
		return;

	if (!bdev_supports_discards(dest_dev))
		reason = "discard unsupported";
	else if (dest_limits->max_discard_sectors < clone->region_size)
		reason = "max discard sectors smaller than a region";

	if (reason) {
		DMWARN("Destination device (%s) %s: Disabling discard passdown.",
		       bdevname(dest_dev, buf), reason);
		clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
	}
}

static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
{
	struct block_device *dest_bdev = clone->dest_dev->bdev;
	struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits;

	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
		/* No passdown is done so we set our own virtual limits */
		limits->discard_granularity = clone->region_size << SECTOR_SHIFT;
		limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size);
		return;
	}

	/*
	 * clone_iterate_devices() is stacking both the source and destination
	 * device limits but discards aren't passed to the source device, so
	 * inherit destination's limits.
	 */
	limits->max_discard_sectors = dest_limits->max_discard_sectors;
	limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors;
	limits->discard_granularity = dest_limits->discard_granularity;
	limits->discard_alignment = dest_limits->discard_alignment;
	limits->discard_misaligned = dest_limits->discard_misaligned;
	limits->max_discard_segments = dest_limits->max_discard_segments;
}

static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct clone *clone = ti->private;
	u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;

	/*
	 * If the system-determined stacked limits are compatible with
	 * dm-clone's region size (io_opt is a factor) do not override them.
	 */
	if (io_opt_sectors < clone->region_size ||
	    do_div(io_opt_sectors, clone->region_size)) {
		blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT);
		blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT);
	}

	disable_passdown_if_not_supported(clone);
	set_discard_limits(clone, limits);
}

static int clone_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	int ret;
	struct clone *clone = ti->private;
	struct dm_dev *dest_dev = clone->dest_dev;
	struct dm_dev *source_dev = clone->source_dev;

	ret = fn(ti, source_dev, 0, ti->len, data);
	if (!ret)
		ret = fn(ti, dest_dev, 0, ti->len, data);
	return ret;
}

/*
 * dm-clone message functions.
 */
static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions)
{
	WRITE_ONCE(clone->hydration_threshold, nr_regions);

	/*
	 * If user space sets hydration_threshold to zero then the hydration
	 * will stop. If at a later time the hydration_threshold is increased
	 * we must restart the hydration process by waking up the worker.
	 */
	wake_worker(clone);
}

static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions)
{
	WRITE_ONCE(clone->hydration_batch_size, nr_regions);
}

static void enable_hydration(struct clone *clone)
{
	if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
		wake_worker(clone);
}

static void disable_hydration(struct clone *clone)
{
	clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
}

static int clone_message(struct dm_target *ti, unsigned int argc, char **argv,
			 char *result, unsigned int maxlen)
{
	struct clone *clone = ti->private;
	unsigned int value;

	if (!argc)
		return -EINVAL;

	if (!strcasecmp(argv[0], "enable_hydration")) {
		enable_hydration(clone);
		return 0;
	}

	if (!strcasecmp(argv[0], "disable_hydration")) {
		disable_hydration(clone);
		return 0;
	}

	if (argc != 2)
		return -EINVAL;

	if (!strcasecmp(argv[0], "hydration_threshold")) {
		if (kstrtouint(argv[1], 10, &value))
			return -EINVAL;

		set_hydration_threshold(clone, value);

		return 0;
	}

	if (!strcasecmp(argv[0], "hydration_batch_size")) {
		if (kstrtouint(argv[1], 10, &value))
			return -EINVAL;

		set_hydration_batch_size(clone, value);

		return 0;
	}

	DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]);
	return -EINVAL;
}

static struct target_type clone_target = {
	.name = "clone",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = clone_ctr,
	.dtr = clone_dtr,
	.map = clone_map,
	.end_io = clone_endio,
	.postsuspend = clone_postsuspend,
	.resume = clone_resume,
	.status = clone_status,
	.message = clone_message,
	.io_hints = clone_io_hints,
	.iterate_devices = clone_iterate_devices,
};

/*---------------------------------------------------------------------------*/

/* Module functions */
static int __init dm_clone_init(void)
{
	int r;

	_hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0);
	if (!_hydration_cache)
		return -ENOMEM;

	r = dm_register_target(&clone_target);
	if (r < 0) {
		DMERR("Failed to register clone target");
		kmem_cache_destroy(_hydration_cache);
		return r;
	}

	return 0;
}

static void __exit dm_clone_exit(void)
{
	dm_unregister_target(&clone_target);

	kmem_cache_destroy(_hydration_cache);
	_hydration_cache = NULL;
}

/* Module hooks */
module_init(dm_clone_init);
module_exit(dm_clone_exit);

MODULE_DESCRIPTION(DM_NAME " clone target");
MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>");
MODULE_LICENSE("GPL");