/*
 * Copyright (C) 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 *
 * Kcopyd provides a simple interface for copying an area of one
 * block-device to one or more other block-devices, with an asynchronous
 * completion notification.
 */
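
/*
 * A minimal usage sketch of the client API (illustrative only: 'src_bdev',
 * 'dst_bdev', 'copy_done' and 'ctx' are assumed to exist, and error
 * handling is omitted):
 *
 *	struct dm_kcopyd_client *kc = dm_kcopyd_client_create(NULL);
 *	struct dm_io_region from = {
 *		.bdev = src_bdev, .sector = 0, .count = 1024,
 *	};
 *	struct dm_io_region to = {
 *		.bdev = dst_bdev, .sector = 0, .count = 1024,
 *	};
 *
 *	dm_kcopyd_copy(kc, &from, 1, &to, 0, copy_done, ctx);
 *	...
 *	dm_kcopyd_client_destroy(kc);
 */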

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/device-mapper.h>
#include <linux/dm-kcopyd.h>

#include "dm-core.h"

#define SUB_JOB_SIZE	128
#define SPLIT_COUNT	8
#define MIN_JOBS	8
#define RESERVE_PAGES	(DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE))
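
/*
 * Sizing sketch: SUB_JOB_SIZE is in 512-byte sectors, so each sub job
 * covers 128 << 9 = 64KiB.  On a system with 4KiB pages, RESERVE_PAGES
 * therefore works out to DIV_ROUND_UP(65536, 4096) = 16 preallocated
 * pages, enough for one sub job's worth of data.
 */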

/*-----------------------------------------------------------------
 * Each kcopyd client has its own little pool of preallocated
 * pages for kcopyd io.
 *---------------------------------------------------------------*/
struct dm_kcopyd_client {
	struct page_list *pages;
	unsigned nr_reserved_pages;
	unsigned nr_free_pages;

	struct dm_io_client *io_client;

	wait_queue_head_t destroyq;
	atomic_t nr_jobs;

	mempool_t *job_pool;

	struct workqueue_struct *kcopyd_wq;
	struct work_struct kcopyd_work;

	struct dm_kcopyd_throttle *throttle;

	/*
	 * We maintain four lists of jobs:
	 *
	 * i)   jobs waiting for pages
	 * ii)  jobs that have pages, and are waiting for the io to be issued.
	 * iii) jobs that don't need to do any IO and just run a callback
	 * iv)  jobs that have completed.
	 *
	 * All four of these are protected by job_lock.
	 */
	spinlock_t job_lock;
	struct list_head callback_jobs;
	struct list_head complete_jobs;
	struct list_head io_jobs;
	struct list_head pages_jobs;
};

static struct page_list zero_page_list;

static DEFINE_SPINLOCK(throttle_spinlock);

/*
 * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period.
 * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided
 * by 2.
 */
#define ACCOUNT_INTERVAL_SHIFT		SHIFT_HZ

/*
 * Sleep this number of milliseconds.
 *
 * The value was decided experimentally.
 * Smaller values seem to cause an increased copy rate above the limit.
 * The reason for this is unknown, but it is possibly due to jiffies
 * rounding errors or read/write cache inside the disk.
 */
#define SLEEP_MSEC			100

/*
 * Maximum number of sleep events.  Without this limit there is a
 * theoretical livelock when multiple kcopyd clients do work
 * simultaneously.
 */
#define MAX_SLEEPS			10
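
/*
 * Worked example of the throttling arithmetic below (illustrative
 * numbers): with t->throttle = 25 and, over the accounting window,
 * io_period = 40 and total_period = 100 jiffies, the skew is
 * 40 - 25 * 100 / 100 = 15 > 0, so the caller sleeps SLEEP_MSEC and
 * re-checks until I/O time falls to at most 25% of the total period.
 */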

static void io_job_start(struct dm_kcopyd_throttle *t)
{
	unsigned throttle, now, difference;
	int slept = 0, skew;

	if (unlikely(!t))
		return;

try_again:
	spin_lock_irq(&throttle_spinlock);

	throttle = READ_ONCE(t->throttle);

	if (likely(throttle >= 100))
		goto skip_limit;

	now = jiffies;
	difference = now - t->last_jiffies;
	t->last_jiffies = now;
	if (t->num_io_jobs)
		t->io_period += difference;
	t->total_period += difference;

	/*
	 * Maintain sane values if we got a temporary overflow.
	 */
	if (unlikely(t->io_period > t->total_period))
		t->io_period = t->total_period;

	if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) {
		int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT);
		t->total_period >>= shift;
		t->io_period >>= shift;
	}

	skew = t->io_period - throttle * t->total_period / 100;

	if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
		slept++;
		spin_unlock_irq(&throttle_spinlock);
		msleep(SLEEP_MSEC);
		goto try_again;
	}

skip_limit:
	t->num_io_jobs++;

	spin_unlock_irq(&throttle_spinlock);
}

static void io_job_finish(struct dm_kcopyd_throttle *t)
{
	unsigned long flags;

	if (unlikely(!t))
		return;

	spin_lock_irqsave(&throttle_spinlock, flags);

	t->num_io_jobs--;

	if (likely(READ_ONCE(t->throttle) >= 100))
		goto skip_limit;

	if (!t->num_io_jobs) {
		unsigned now, difference;

		now = jiffies;
		difference = now - t->last_jiffies;
		t->last_jiffies = now;

		t->io_period += difference;
		t->total_period += difference;

		/*
		 * Maintain sane values if we got a temporary overflow.
		 */
		if (unlikely(t->io_period > t->total_period))
			t->io_period = t->total_period;
	}

skip_limit:
	spin_unlock_irqrestore(&throttle_spinlock, flags);
}

static void wake(struct dm_kcopyd_client *kc)
{
	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
}

/*
 * Obtain one page for the use of kcopyd.
 */
static struct page_list *alloc_pl(gfp_t gfp)
{
	struct page_list *pl;

	pl = kmalloc(sizeof(*pl), gfp);
	if (!pl)
		return NULL;

	pl->page = alloc_page(gfp);
	if (!pl->page) {
		kfree(pl);
		return NULL;
	}

	return pl;
}

static void free_pl(struct page_list *pl)
{
	__free_page(pl->page);
	kfree(pl);
}

/*
 * Add the provided pages to a client's free page list, releasing
 * back to the system any beyond the reserved_pages limit.
 */
static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl)
{
	struct page_list *next;

	do {
		next = pl->next;

		if (kc->nr_free_pages >= kc->nr_reserved_pages)
			free_pl(pl);
		else {
			pl->next = kc->pages;
			kc->pages = pl;
			kc->nr_free_pages++;
		}

		pl = next;
	} while (pl);
}

static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
			    unsigned int nr, struct page_list **pages)
{
	struct page_list *pl;

	*pages = NULL;

	do {
		pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
		if (unlikely(!pl)) {
			/* Use reserved pages */
			pl = kc->pages;
			if (unlikely(!pl))
				goto out_of_memory;
			kc->pages = pl->next;
			kc->nr_free_pages--;
		}
		pl->next = *pages;
		*pages = pl;
	} while (--nr);

	return 0;

out_of_memory:
	if (*pages)
		kcopyd_put_pages(kc, *pages);
	return -ENOMEM;
}

/*
 * These three functions resize the page pool.
 */
static void drop_pages(struct page_list *pl)
{
	struct page_list *next;

	while (pl) {
		next = pl->next;
		free_pl(pl);
		pl = next;
	}
}

/*
 * Allocate and reserve nr_pages for the use of a specific client.
 */
static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned nr_pages)
{
	unsigned i;
	struct page_list *pl = NULL, *next;

	for (i = 0; i < nr_pages; i++) {
		next = alloc_pl(GFP_KERNEL);
		if (!next) {
			if (pl)
				drop_pages(pl);
			return -ENOMEM;
		}
		next->next = pl;
		pl = next;
	}

	kc->nr_reserved_pages += nr_pages;
	kcopyd_put_pages(kc, pl);

	return 0;
}

static void client_free_pages(struct dm_kcopyd_client *kc)
{
	BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages);
	drop_pages(kc->pages);
	kc->pages = NULL;
	kc->nr_free_pages = kc->nr_reserved_pages = 0;
}

/*-----------------------------------------------------------------
 * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
 * for this reason we use a mempool to prevent the client from
 * ever having to do io (which could cause a deadlock).
 *---------------------------------------------------------------*/
struct kcopyd_job {
	struct dm_kcopyd_client *kc;
	struct list_head list;
	unsigned long flags;

	/*
	 * Error state of the job.
	 */
	int read_err;
	unsigned long write_err;

	/*
	 * Either READ, WRITE or REQ_OP_WRITE_ZEROES.
	 */
	int rw;
	struct dm_io_region source;

	/*
	 * The destinations for the transfer.
	 */
	unsigned int num_dests;
	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];

	struct page_list *pages;

	/*
	 * Set this to ensure you are notified when the job has
	 * completed.  'context' is for callback to use.
	 */
	dm_kcopyd_notify_fn fn;
	void *context;

	/*
	 * These fields are only used if the job has been split
	 * into more manageable parts.
	 */
	struct mutex lock;
	atomic_t sub_jobs;
	sector_t progress;
	sector_t write_offset;

	struct kcopyd_job *master_job;
};
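
/*
 * Note on allocation layout: each mempool element is an array of
 * SPLIT_COUNT + 1 jobs (see dm_kcopyd_init() below), the master job at
 * index 0 followed by its sub jobs, so split_job() can reach sub job i
 * as master_job[i + 1] without any further allocation.
 */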

static struct kmem_cache *_job_cache;

int __init dm_kcopyd_init(void)
{
	_job_cache = kmem_cache_create("kcopyd_job",
				sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1),
				__alignof__(struct kcopyd_job), 0, NULL);
	if (!_job_cache)
		return -ENOMEM;

	zero_page_list.next = &zero_page_list;
	zero_page_list.page = ZERO_PAGE(0);

	return 0;
}

void dm_kcopyd_exit(void)
{
	kmem_cache_destroy(_job_cache);
	_job_cache = NULL;
}

/*
 * Functions to push and pop a job onto the head of a given job
 * list.
 */
static struct kcopyd_job *pop_io_job(struct list_head *jobs,
				     struct dm_kcopyd_client *kc)
{
	struct kcopyd_job *job;

	/*
	 * For I/O jobs, pop any read, any write without the sequential-write
	 * constraint, and any sequential write that is at the right position.
	 */
	list_for_each_entry(job, jobs, list) {
		if (job->rw == READ || !test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
			list_del(&job->list);
			return job;
		}

		if (job->write_offset == job->master_job->write_offset) {
			job->master_job->write_offset += job->source.count;
			list_del(&job->list);
			return job;
		}
	}

	return NULL;
}

static struct kcopyd_job *pop(struct list_head *jobs,
			      struct dm_kcopyd_client *kc)
{
	struct kcopyd_job *job = NULL;
	unsigned long flags;

	spin_lock_irqsave(&kc->job_lock, flags);

	if (!list_empty(jobs)) {
		if (jobs == &kc->io_jobs)
			job = pop_io_job(jobs, kc);
		else {
			job = list_entry(jobs->next, struct kcopyd_job, list);
			list_del(&job->list);
		}
	}
	spin_unlock_irqrestore(&kc->job_lock, flags);

	return job;
}

static void push(struct list_head *jobs, struct kcopyd_job *job)
{
	unsigned long flags;
	struct dm_kcopyd_client *kc = job->kc;

	spin_lock_irqsave(&kc->job_lock, flags);
	list_add_tail(&job->list, jobs);
	spin_unlock_irqrestore(&kc->job_lock, flags);
}

static void push_head(struct list_head *jobs, struct kcopyd_job *job)
{
	unsigned long flags;
	struct dm_kcopyd_client *kc = job->kc;

	spin_lock_irqsave(&kc->job_lock, flags);
	list_add(&job->list, jobs);
	spin_unlock_irqrestore(&kc->job_lock, flags);
}

/*
 * These three functions process 1 item from the corresponding
 * job list.
 *
 * They return:
 * < 0: error
 *   0: success
 * > 0: can't process yet.
 */
static int run_complete_job(struct kcopyd_job *job)
{
	void *context = job->context;
	int read_err = job->read_err;
	unsigned long write_err = job->write_err;
	dm_kcopyd_notify_fn fn = job->fn;
	struct dm_kcopyd_client *kc = job->kc;

	if (job->pages && job->pages != &zero_page_list)
		kcopyd_put_pages(kc, job->pages);
	/*
	 * If this is the master job, the sub jobs have already
	 * completed so we can free everything.
	 */
	if (job->master_job == job)
		mempool_free(job, kc->job_pool);
	fn(read_err, write_err, context);

	if (atomic_dec_and_test(&kc->nr_jobs))
		wake_up(&kc->destroyq);

	cond_resched();

	return 0;
}

static void complete_io(unsigned long error, void *context)
{
	struct kcopyd_job *job = (struct kcopyd_job *) context;
	struct dm_kcopyd_client *kc = job->kc;

	io_job_finish(kc->throttle);

	if (error) {
		if (op_is_write(job->rw))
			job->write_err |= error;
		else
			job->read_err = 1;

		if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
			push(&kc->complete_jobs, job);
			wake(kc);
			return;
		}
	}

	if (op_is_write(job->rw))
		push(&kc->complete_jobs, job);
	else {
		job->rw = WRITE;
		push(&kc->io_jobs, job);
	}

	wake(kc);
}

/*
 * Issue the I/O for a particular job: read into its pages, or write
 * them out to the destinations.
 */
static int run_io_job(struct kcopyd_job *job)
{
	int r;
	struct dm_io_request io_req = {
		.bi_op = job->rw,
		.bi_op_flags = 0,
		.mem.type = DM_IO_PAGE_LIST,
		.mem.ptr.pl = job->pages,
		.mem.offset = 0,
		.notify.fn = complete_io,
		.notify.context = job,
		.client = job->kc->io_client,
	};

	/*
	 * If we need to write sequentially and some reads or writes failed,
	 * no point in continuing.
	 */
	if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
	    job->master_job->write_err) {
		job->write_err = job->master_job->write_err;
		return -EIO;
	}

	io_job_start(job->kc->throttle);

	if (job->rw == READ)
		r = dm_io(&io_req, 1, &job->source, NULL);
	else
		r = dm_io(&io_req, job->num_dests, job->dests, NULL);

	return r;
}

static int run_pages_job(struct kcopyd_job *job)
{
	int r;
	unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);

	r = kcopyd_get_pages(job->kc, nr_pages, &job->pages);
	if (!r) {
		/* this job is ready for io */
		push(&job->kc->io_jobs, job);
		return 0;
	}

	if (r == -ENOMEM)
		/* can't complete now */
		return 1;

	return r;
}
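
/*
 * Page-count example: dests[0].count is in 512-byte sectors and
 * PAGE_SIZE >> 9 is the number of sectors per page, so with 4KiB pages
 * a 128-sector sub job needs dm_div_up(128, 8) = 16 pages.
 */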

/*
 * Run through a list for as long as possible.  Returns the count
 * of successful jobs.
 */
static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
			int (*fn) (struct kcopyd_job *))
{
	struct kcopyd_job *job;
	int r, count = 0;

	while ((job = pop(jobs, kc))) {

		r = fn(job);

		if (r < 0) {
			/* error this rogue job */
			if (op_is_write(job->rw))
				job->write_err = (unsigned long) -1L;
			else
				job->read_err = 1;
			push(&kc->complete_jobs, job);
			wake(kc);
			break;
		}

		if (r > 0) {
			/*
			 * We couldn't service this job ATM, so
			 * push this job back onto the list.
			 */
			push_head(jobs, job);
			break;
		}

		count++;
	}

	return count;
}

/*
 * kcopyd does this every time it's woken up.
 */
static void do_work(struct work_struct *work)
{
	struct dm_kcopyd_client *kc = container_of(work,
					struct dm_kcopyd_client, kcopyd_work);
	struct blk_plug plug;
	unsigned long flags;

	/*
	 * The order that these are called is *very* important.
	 * complete jobs can free some pages for pages jobs.
	 * Pages jobs when successful will jump onto the io jobs
	 * list.  io jobs call wake when they complete and it all
	 * starts again.
	 */
	spin_lock_irqsave(&kc->job_lock, flags);
	list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs);
	spin_unlock_irqrestore(&kc->job_lock, flags);

	blk_start_plug(&plug);
	process_jobs(&kc->complete_jobs, kc, run_complete_job);
	process_jobs(&kc->pages_jobs, kc, run_pages_job);
	process_jobs(&kc->io_jobs, kc, run_io_job);
	blk_finish_plug(&plug);
}

/*
 * If we are copying a small region we just dispatch a single job
 * to do the copy, otherwise the io has to be split up into many
 * jobs.
 */
static void dispatch_job(struct kcopyd_job *job)
{
	struct dm_kcopyd_client *kc = job->kc;

	atomic_inc(&kc->nr_jobs);
	if (unlikely(!job->source.count))
		push(&kc->callback_jobs, job);
	else if (job->pages == &zero_page_list)
		push(&kc->io_jobs, job);
	else
		push(&kc->pages_jobs, job);
	wake(kc);
}

static void segment_complete(int read_err, unsigned long write_err,
			     void *context)
{
	/* FIXME: tidy this function */
	sector_t progress = 0;
	sector_t count = 0;
	struct kcopyd_job *sub_job = (struct kcopyd_job *) context;
	struct kcopyd_job *job = sub_job->master_job;
	struct dm_kcopyd_client *kc = job->kc;

	mutex_lock(&job->lock);

	/* update the error */
	if (read_err)
		job->read_err = 1;

	if (write_err)
		job->write_err |= write_err;

	/*
	 * Only dispatch more work if there hasn't been an error.
	 */
	if ((!job->read_err && !job->write_err) ||
	    test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
		/* get the next chunk of work */
		progress = job->progress;
		count = job->source.count - progress;
		if (count) {
			if (count > SUB_JOB_SIZE)
				count = SUB_JOB_SIZE;

			job->progress += count;
		}
	}
	mutex_unlock(&job->lock);

	if (count) {
		int i;

		*sub_job = *job;
		sub_job->write_offset = progress;
		sub_job->source.sector += progress;
		sub_job->source.count = count;

		for (i = 0; i < job->num_dests; i++) {
			sub_job->dests[i].sector += progress;
			sub_job->dests[i].count = count;
		}

		sub_job->fn = segment_complete;
		sub_job->context = sub_job;
		dispatch_job(sub_job);

	} else if (atomic_dec_and_test(&job->sub_jobs)) {

		/*
		 * Queue the completion callback to the kcopyd thread.
		 *
		 * Some callers assume that all the completions are called
		 * from a single thread and don't race with each other.
		 *
		 * We must not call the callback directly here because this
		 * code may not be executing in the thread.
		 */
		push(&kc->complete_jobs, job);
		wake(kc);
	}
}

/*
 * Create some sub jobs to share the work between them.
 */
static void split_job(struct kcopyd_job *master_job)
{
	int i;

	atomic_inc(&master_job->kc->nr_jobs);

	atomic_set(&master_job->sub_jobs, SPLIT_COUNT);
	for (i = 0; i < SPLIT_COUNT; i++) {
		master_job[i + 1].master_job = master_job;
		segment_complete(0, 0u, &master_job[i + 1]);
	}
}

int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
		   unsigned int num_dests, struct dm_io_region *dests,
		   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
{
	struct kcopyd_job *job;
	int i;

	/*
	 * Allocate an array of jobs consisting of one master job
	 * followed by SPLIT_COUNT sub jobs.
	 */
	job = mempool_alloc(kc->job_pool, GFP_NOIO);

	/*
	 * set up for the read.
	 */
	job->kc = kc;
	job->flags = flags;
	job->read_err = 0;
	job->write_err = 0;

	job->num_dests = num_dests;
	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);

	/*
	 * If one of the destinations is a host-managed zoned block device,
	 * we need to write sequentially. If one of the destinations is a
	 * host-aware device, then leave it to the caller to choose what to do.
	 */
	if (!test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
		for (i = 0; i < job->num_dests; i++) {
			if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) {
				set_bit(DM_KCOPYD_WRITE_SEQ, &job->flags);
				break;
			}
		}
	}

	/*
	 * If we need to write sequentially, errors cannot be ignored.
	 */
	if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
	    test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags))
		clear_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags);

	if (from) {
		job->source = *from;
		job->pages = NULL;
		job->rw = READ;
	} else {
		memset(&job->source, 0, sizeof job->source);
		job->source.count = job->dests[0].count;
		job->pages = &zero_page_list;

		/*
		 * Use WRITE ZEROES to optimize zeroing if all dests support it.
		 */
		job->rw = REQ_OP_WRITE_ZEROES;
		for (i = 0; i < job->num_dests; i++)
			if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) {
				job->rw = WRITE;
				break;
			}
	}

	job->fn = fn;
	job->context = context;
	job->master_job = job;
	job->write_offset = 0;

	if (job->source.count <= SUB_JOB_SIZE)
		dispatch_job(job);
	else {
		mutex_init(&job->lock);
		job->progress = 0;
		split_job(job);
	}

	return 0;
}
EXPORT_SYMBOL(dm_kcopyd_copy);
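
/*
 * A mirroring sketch (illustrative; 'src', 'dst0', 'dst1', 'copy_done'
 * and 'ctx' are assumed to exist): one source copied to two
 * destinations.  With a count of 8192 sectors (> SUB_JOB_SIZE) the job
 * is split into sub jobs behind the scenes:
 *
 *	struct dm_io_region from = { .bdev = src, .sector = 0, .count = 8192 };
 *	struct dm_io_region to[2] = {
 *		{ .bdev = dst0, .sector = 0, .count = 8192 },
 *		{ .bdev = dst1, .sector = 0, .count = 8192 },
 *	};
 *
 *	dm_kcopyd_copy(kc, &from, 2, to, 0, copy_done, ctx);
 */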

int dm_kcopyd_zero(struct dm_kcopyd_client *kc,
		   unsigned num_dests, struct dm_io_region *dests,
		   unsigned flags, dm_kcopyd_notify_fn fn, void *context)
{
	return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
}
EXPORT_SYMBOL(dm_kcopyd_zero);

void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
				 dm_kcopyd_notify_fn fn, void *context)
{
	struct kcopyd_job *job;

	job = mempool_alloc(kc->job_pool, GFP_NOIO);

	memset(job, 0, sizeof(struct kcopyd_job));
	job->kc = kc;
	job->fn = fn;
	job->context = context;
	job->master_job = job;

	atomic_inc(&kc->nr_jobs);

	return job;
}
EXPORT_SYMBOL(dm_kcopyd_prepare_callback);

void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
{
	struct kcopyd_job *job = j;
	struct dm_kcopyd_client *kc = job->kc;

	job->read_err = read_err;
	job->write_err = write_err;

	push(&kc->callback_jobs, job);
	wake(kc);
}
EXPORT_SYMBOL(dm_kcopyd_do_callback);
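
/*
 * Typical use of the prepare/do pair (a sketch; 'my_fn' and 'my_ctx' are
 * placeholders): allocate the callback job up front, where a GFP_NOIO
 * allocation is safe, then fire it later; the callback still runs from
 * the kcopyd thread because do_callback only queues the job:
 *
 *	void *j = dm_kcopyd_prepare_callback(kc, my_fn, my_ctx);
 *	...
 *	dm_kcopyd_do_callback(j, 0, 0);
 */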

/*
 * Cancels a kcopyd job, e.g. someone might be deactivating a
 * mirror.
 */
#if 0
int kcopyd_cancel(struct kcopyd_job *job, int block)
{
	/* FIXME: finish */
	return -1;
}
#endif  /*  0  */

/*-----------------------------------------------------------------
 * Client setup
 *---------------------------------------------------------------*/
struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
{
	int r = -ENOMEM;
	struct dm_kcopyd_client *kc;

	kc = kzalloc(sizeof(*kc), GFP_KERNEL);
	if (!kc)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&kc->job_lock);
	INIT_LIST_HEAD(&kc->callback_jobs);
	INIT_LIST_HEAD(&kc->complete_jobs);
	INIT_LIST_HEAD(&kc->io_jobs);
	INIT_LIST_HEAD(&kc->pages_jobs);
	kc->throttle = throttle;

	kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
	if (!kc->job_pool)
		goto bad_slab;

	INIT_WORK(&kc->kcopyd_work, do_work);
	kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
	if (!kc->kcopyd_wq)
		goto bad_workqueue;

	kc->pages = NULL;
	kc->nr_reserved_pages = kc->nr_free_pages = 0;
	r = client_reserve_pages(kc, RESERVE_PAGES);
	if (r)
		goto bad_client_pages;

	kc->io_client = dm_io_client_create();
	if (IS_ERR(kc->io_client)) {
		r = PTR_ERR(kc->io_client);
		goto bad_io_client;
	}

	init_waitqueue_head(&kc->destroyq);
	atomic_set(&kc->nr_jobs, 0);

	return kc;

bad_io_client:
	client_free_pages(kc);
bad_client_pages:
	destroy_workqueue(kc->kcopyd_wq);
bad_workqueue:
	mempool_destroy(kc->job_pool);
bad_slab:
	kfree(kc);

	return ERR_PTR(r);
}
EXPORT_SYMBOL(dm_kcopyd_client_create);
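
/*
 * Creation sketch: a caller that does not want throttling passes NULL;
 * a throttled caller passes a struct dm_kcopyd_throttle whose ->throttle
 * member is a percentage below 100 (values >= 100 disable the limit, as
 * io_job_start() above shows):
 *
 *	struct dm_kcopyd_client *kc = dm_kcopyd_client_create(NULL);
 *	if (IS_ERR(kc))
 *		return PTR_ERR(kc);
 */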

void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
{
	/* Wait for completion of all jobs submitted by this client. */
	wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));

	BUG_ON(!list_empty(&kc->callback_jobs));
	BUG_ON(!list_empty(&kc->complete_jobs));
	BUG_ON(!list_empty(&kc->io_jobs));
	BUG_ON(!list_empty(&kc->pages_jobs));
	destroy_workqueue(kc->kcopyd_wq);
	dm_io_client_destroy(kc->io_client);
	client_free_pages(kc);
	mempool_destroy(kc->job_pool);
	kfree(kc);
}
EXPORT_SYMBOL(dm_kcopyd_client_destroy);