Blame - src/kernel/linux/v4.19/drivers/md/bcache/writeback.c - T800

blob: b5fc3c6c7178e00212cb5d78d9f2acb3e4e8ab69 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* background writeback - scan btree for dirty data and write it to the backing
				4	* device
				5	*
				6	* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
				7	* Copyright 2012 Google, Inc.
				8	*/
				9
				10	#include "bcache.h"
				11	#include "btree.h"
				12	#include "debug.h"
				13	#include "writeback.h"
				14
				15	#include <linux/delay.h>
				16	#include <linux/kthread.h>
				17	#include <linux/sched/clock.h>
				18	#include <trace/events/bcache.h>
				19
				20	/* Rate limiting */
				21	static uint64_t __calc_target_rate(struct cached_dev *dc)
				22	{
				23	struct cache_set *c = dc->disk.c;
				24
				25	/*
				26	* This is the size of the cache, minus the amount used for
				27	* flash-only devices
				28	*/
				29	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
				30	atomic_long_read(&c->flash_dev_dirty_sectors);
				31
				32	/*
				33	* Unfortunately there is no control of global dirty data. If the
				34	* user states that they want 10% dirty data in the cache, and has,
				35	* e.g., 5 backing volumes of equal size, we try and ensure each
				36	* backing volume uses about 2% of the cache for dirty data.
				37	*/
				38	uint32_t bdev_share =
				39	div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
				40	c->cached_dev_sectors);
				41
				42	uint64_t cache_dirty_target =
				43	div_u64(cache_sectors * dc->writeback_percent, 100);
				44
				45	/* Ensure each backing dev gets at least one dirty share */
				46	if (bdev_share < 1)
				47	bdev_share = 1;
				48
				49	return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
				50	}
				51
				52	static void __update_writeback_rate(struct cached_dev *dc)
				53	{
				54	/*
				55	* PI controller:
				56	* Figures out the amount that should be written per second.
				57	*
				58	* First, the error (number of sectors that are dirty beyond our
				59	* target) is calculated. The error is accumulated (numerically
				60	* integrated).
				61	*
				62	* Then, the proportional value and integral value are scaled
				63	* based on configured values. These are stored as inverses to
				64	* avoid fixed point math and to make configuration easy-- e.g.
				65	* the default value of 40 for writeback_rate_p_term_inverse
				66	* attempts to write at a rate that would retire all the dirty
				67	* blocks in 40 seconds.
				68	*
				69	* The writeback_rate_i_inverse value of 10000 means that 1/10000th
				70	* of the error is accumulated in the integral term per second.
				71	* This acts as a slow, long-term average that is not subject to
				72	* variations in usage like the p term.
				73	*/
				74	int64_t target = __calc_target_rate(dc);
				75	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
				76	int64_t error = dirty - target;
				77	int64_t proportional_scaled =
				78	div_s64(error, dc->writeback_rate_p_term_inverse);
				79	int64_t integral_scaled;
				80	uint32_t new_rate;
				81
				82	if ((error < 0 && dc->writeback_rate_integral > 0) \|\|
				83	(error > 0 && time_before64(local_clock(),
				84	dc->writeback_rate.next + NSEC_PER_MSEC))) {
				85	/*
				86	* Only decrease the integral term if it's more than
				87	* zero. Only increase the integral term if the device
				88	* is keeping up. (Don't wind up the integral
				89	* ineffectively in either case).
				90	*
				91	* It's necessary to scale this by
				92	* writeback_rate_update_seconds to keep the integral
				93	* term dimensioned properly.
				94	*/
				95	dc->writeback_rate_integral += error *
				96	dc->writeback_rate_update_seconds;
				97	}
				98
				99	integral_scaled = div_s64(dc->writeback_rate_integral,
				100	dc->writeback_rate_i_term_inverse);
				101
				102	new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
				103	dc->writeback_rate_minimum, NSEC_PER_SEC);
				104
				105	dc->writeback_rate_proportional = proportional_scaled;
				106	dc->writeback_rate_integral_scaled = integral_scaled;
				107	dc->writeback_rate_change = new_rate -
				108	atomic_long_read(&dc->writeback_rate.rate);
				109	atomic_long_set(&dc->writeback_rate.rate, new_rate);
				110	dc->writeback_rate_target = target;
				111	}
				112
				113	static bool set_at_max_writeback_rate(struct cache_set *c,
				114	struct cached_dev *dc)
				115	{
				116	/*
				117	* Idle_counter is increased everytime when update_writeback_rate() is
				118	* called. If all backing devices attached to the same cache set have
				119	* identical dc->writeback_rate_update_seconds values, it is about 6
				120	* rounds of update_writeback_rate() on each backing device before
				121	* c->at_max_writeback_rate is set to 1, and then max wrteback rate set
				122	* to each dc->writeback_rate.rate.
				123	* In order to avoid extra locking cost for counting exact dirty cached
				124	* devices number, c->attached_dev_nr is used to calculate the idle
				125	* throushold. It might be bigger if not all cached device are in write-
				126	* back mode, but it still works well with limited extra rounds of
				127	* update_writeback_rate().
				128	*/
				129	if (atomic_inc_return(&c->idle_counter) <
				130	atomic_read(&c->attached_dev_nr) * 6)
				131	return false;
				132
				133	if (atomic_read(&c->at_max_writeback_rate) != 1)
				134	atomic_set(&c->at_max_writeback_rate, 1);
				135
				136	atomic_long_set(&dc->writeback_rate.rate, INT_MAX);
				137
				138	/* keep writeback_rate_target as existing value */
				139	dc->writeback_rate_proportional = 0;
				140	dc->writeback_rate_integral_scaled = 0;
				141	dc->writeback_rate_change = 0;
				142
				143	/*
				144	* Check c->idle_counter and c->at_max_writeback_rate agagain in case
				145	* new I/O arrives during before set_at_max_writeback_rate() returns.
				146	* Then the writeback rate is set to 1, and its new value should be
				147	* decided via __update_writeback_rate().
				148	*/
				149	if ((atomic_read(&c->idle_counter) <
				150	atomic_read(&c->attached_dev_nr) * 6) \|\|
				151	!atomic_read(&c->at_max_writeback_rate))
				152	return false;
				153
				154	return true;
				155	}
				156
				157	static void update_writeback_rate(struct work_struct *work)
				158	{
				159	struct cached_dev *dc = container_of(to_delayed_work(work),
				160	struct cached_dev,
				161	writeback_rate_update);
				162	struct cache_set *c = dc->disk.c;
				163
				164	/*
				165	* should check BCACHE_DEV_RATE_DW_RUNNING before calling
				166	* cancel_delayed_work_sync().
				167	*/
				168	set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
				169	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
				170	smp_mb();
				171
				172	/*
				173	* CACHE_SET_IO_DISABLE might be set via sysfs interface,
				174	* check it here too.
				175	*/
				176	if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) \|\|
				177	test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
				178	clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
				179	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
				180	smp_mb();
				181	return;
				182	}
				183
				184	if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
				185	/*
				186	* If the whole cache set is idle, set_at_max_writeback_rate()
				187	* will set writeback rate to a max number. Then it is
				188	* unncessary to update writeback rate for an idle cache set
				189	* in maximum writeback rate number(s).
				190	*/
				191	if (!set_at_max_writeback_rate(c, dc)) {
				192	down_read(&dc->writeback_lock);
				193	__update_writeback_rate(dc);
				194	up_read(&dc->writeback_lock);
				195	}
				196	}
				197
				198
				199	/*
				200	* CACHE_SET_IO_DISABLE might be set via sysfs interface,
				201	* check it here too.
				202	*/
				203	if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) &&
				204	!test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
				205	schedule_delayed_work(&dc->writeback_rate_update,
				206	dc->writeback_rate_update_seconds * HZ);
				207	}
				208
				209	/*
				210	* should check BCACHE_DEV_RATE_DW_RUNNING before calling
				211	* cancel_delayed_work_sync().
				212	*/
				213	clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
				214	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
				215	smp_mb();
				216	}
				217
				218	static unsigned int writeback_delay(struct cached_dev *dc,
				219	unsigned int sectors)
				220	{
				221	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) \|\|
				222	!dc->writeback_percent)
				223	return 0;
				224
				225	return bch_next_delay(&dc->writeback_rate, sectors);
				226	}
				227
				228	struct dirty_io {
				229	struct closure cl;
				230	struct cached_dev *dc;
				231	uint16_t sequence;
				232	struct bio bio;
				233	};
				234
				235	static void dirty_init(struct keybuf_key *w)
				236	{
				237	struct dirty_io *io = w->private;
				238	struct bio *bio = &io->bio;
				239
				240	bio_init(bio, bio->bi_inline_vecs,
				241	DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS));
				242	if (!io->dc->writeback_percent)
				243	bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
				244
				245	bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
				246	bio->bi_private = w;
				247	bch_bio_map(bio, NULL);
				248	}
				249
				250	static void dirty_io_destructor(struct closure *cl)
				251	{
				252	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
				253
				254	kfree(io);
				255	}
				256
				257	static void write_dirty_finish(struct closure *cl)
				258	{
				259	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
				260	struct keybuf_key *w = io->bio.bi_private;
				261	struct cached_dev *dc = io->dc;
				262
				263	bio_free_pages(&io->bio);
				264
				265	/* This is kind of a dumb way of signalling errors. */
				266	if (KEY_DIRTY(&w->key)) {
				267	int ret;
				268	unsigned int i;
				269	struct keylist keys;
				270
				271	bch_keylist_init(&keys);
				272
				273	bkey_copy(keys.top, &w->key);
				274	SET_KEY_DIRTY(keys.top, false);
				275	bch_keylist_push(&keys);
				276
				277	for (i = 0; i < KEY_PTRS(&w->key); i++)
				278	atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
				279
				280	ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);
				281
				282	if (ret)
				283	trace_bcache_writeback_collision(&w->key);
				284
				285	atomic_long_inc(ret
				286	? &dc->disk.c->writeback_keys_failed
				287	: &dc->disk.c->writeback_keys_done);
				288	}
				289
				290	bch_keybuf_del(&dc->writeback_keys, w);
				291	up(&dc->in_flight);
				292
				293	closure_return_with_destructor(cl, dirty_io_destructor);
				294	}
				295
				296	static void dirty_endio(struct bio *bio)
				297	{
				298	struct keybuf_key *w = bio->bi_private;
				299	struct dirty_io *io = w->private;
				300
				301	if (bio->bi_status) {
				302	SET_KEY_DIRTY(&w->key, false);
				303	bch_count_backing_io_errors(io->dc, bio);
				304	}
				305
				306	closure_put(&io->cl);
				307	}
				308
				309	static void write_dirty(struct closure *cl)
				310	{
				311	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
				312	struct keybuf_key *w = io->bio.bi_private;
				313	struct cached_dev *dc = io->dc;
				314
				315	uint16_t next_sequence;
				316
				317	if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
				318	/* Not our turn to write; wait for a write to complete */
				319	closure_wait(&dc->writeback_ordering_wait, cl);
				320
				321	if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
				322	/*
				323	* Edge case-- it happened in indeterminate order
				324	* relative to when we were added to wait list..
				325	*/
				326	closure_wake_up(&dc->writeback_ordering_wait);
				327	}
				328
				329	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
				330	return;
				331	}
				332
				333	next_sequence = io->sequence + 1;
				334
				335	/*
				336	* IO errors are signalled using the dirty bit on the key.
				337	* If we failed to read, we should not attempt to write to the
				338	* backing device. Instead, immediately go to write_dirty_finish
				339	* to clean up.
				340	*/
				341	if (KEY_DIRTY(&w->key)) {
				342	dirty_init(w);
				343	bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
				344	io->bio.bi_iter.bi_sector = KEY_START(&w->key);
				345	bio_set_dev(&io->bio, io->dc->bdev);
				346	io->bio.bi_end_io = dirty_endio;
				347
				348	/* I/O request sent to backing device */
				349	closure_bio_submit(io->dc->disk.c, &io->bio, cl);
				350	}
				351
				352	atomic_set(&dc->writeback_sequence_next, next_sequence);
				353	closure_wake_up(&dc->writeback_ordering_wait);
				354
				355	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
				356	}
				357
				358	static void read_dirty_endio(struct bio *bio)
				359	{
				360	struct keybuf_key *w = bio->bi_private;
				361	struct dirty_io *io = w->private;
				362
				363	/* is_read = 1 */
				364	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
				365	bio->bi_status, 1,
				366	"reading dirty data from cache");
				367
				368	dirty_endio(bio);
				369	}
				370
				371	static void read_dirty_submit(struct closure *cl)
				372	{
				373	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
				374
				375	closure_bio_submit(io->dc->disk.c, &io->bio, cl);
				376
				377	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
				378	}
				379
				380	static void read_dirty(struct cached_dev *dc)
				381	{
				382	unsigned int delay = 0;
				383	struct keybuf_key next, keys[MAX_WRITEBACKS_IN_PASS], *w;
				384	size_t size;
				385	int nk, i;
				386	struct dirty_io *io;
				387	struct closure cl;
				388	uint16_t sequence = 0;
				389
				390	BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
				391	atomic_set(&dc->writeback_sequence_next, sequence);
				392	closure_init_stack(&cl);
				393
				394	/*
				395	* XXX: if we error, background writeback just spins. Should use some
				396	* mempools.
				397	*/
				398
				399	next = bch_keybuf_next(&dc->writeback_keys);
				400
				401	while (!kthread_should_stop() &&
				402	!test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
				403	next) {
				404	size = 0;
				405	nk = 0;
				406
				407	do {
				408	BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
				409
				410	/*
				411	* Don't combine too many operations, even if they
				412	* are all small.
				413	*/
				414	if (nk >= MAX_WRITEBACKS_IN_PASS)
				415	break;
				416
				417	/*
				418	* If the current operation is very large, don't
				419	* further combine operations.
				420	*/
				421	if (size >= MAX_WRITESIZE_IN_PASS)
				422	break;
				423
				424	/*
				425	* Operations are only eligible to be combined
				426	* if they are contiguous.
				427	*
				428	* TODO: add a heuristic willing to fire a
				429	* certain amount of non-contiguous IO per pass,
				430	* so that we can benefit from backing device
				431	* command queueing.
				432	*/
				433	if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
				434	&START_KEY(&next->key)))
				435	break;
				436
				437	size += KEY_SIZE(&next->key);
				438	keys[nk++] = next;
				439	} while ((next = bch_keybuf_next(&dc->writeback_keys)));
				440
				441	/* Now we have gathered a set of 1..5 keys to write back. */
				442	for (i = 0; i < nk; i++) {
				443	w = keys[i];
				444
				445	io = kzalloc(sizeof(struct dirty_io) +
				446	sizeof(struct bio_vec) *
				447	DIV_ROUND_UP(KEY_SIZE(&w->key),
				448	PAGE_SECTORS),
				449	GFP_KERNEL);
				450	if (!io)
				451	goto err;
				452
				453	w->private = io;
				454	io->dc = dc;
				455	io->sequence = sequence++;
				456
				457	dirty_init(w);
				458	bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
				459	io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
				460	bio_set_dev(&io->bio,
				461	PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
				462	io->bio.bi_end_io = read_dirty_endio;
				463
				464	if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
				465	goto err_free;
				466
				467	trace_bcache_writeback(&w->key);
				468
				469	down(&dc->in_flight);
				470
				471	/*
				472	* We've acquired a semaphore for the maximum
				473	* simultaneous number of writebacks; from here
				474	* everything happens asynchronously.
				475	*/
				476	closure_call(&io->cl, read_dirty_submit, NULL, &cl);
				477	}
				478
				479	delay = writeback_delay(dc, size);
				480
				481	while (!kthread_should_stop() &&
				482	!test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
				483	delay) {
				484	schedule_timeout_interruptible(delay);
				485	delay = writeback_delay(dc, 0);
				486	}
				487	}
				488
				489	if (0) {
				490	err_free:
				491	kfree(w->private);
				492	err:
				493	bch_keybuf_del(&dc->writeback_keys, w);
				494	}
				495
				496	/*
				497	* Wait for outstanding writeback IOs to finish (and keybuf slots to be
				498	* freed) before refilling again
				499	*/
				500	closure_sync(&cl);
				501	}
				502
				503	/* Scan for dirty data */
				504
				505	void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode,
				506	uint64_t offset, int nr_sectors)
				507	{
				508	struct bcache_device *d = c->devices[inode];
				509	unsigned int stripe_offset, stripe, sectors_dirty;
				510
				511	if (!d)
				512	return;
				513
				514	if (UUID_FLASH_ONLY(&c->uuids[inode]))
				515	atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);
				516
				517	stripe = offset_to_stripe(d, offset);
				518	stripe_offset = offset & (d->stripe_size - 1);
				519
				520	while (nr_sectors) {
				521	int s = min_t(unsigned int, abs(nr_sectors),
				522	d->stripe_size - stripe_offset);
				523
				524	if (nr_sectors < 0)
				525	s = -s;
				526
				527	if (stripe >= d->nr_stripes)
				528	return;
				529
				530	sectors_dirty = atomic_add_return(s,
				531	d->stripe_sectors_dirty + stripe);
				532	if (sectors_dirty == d->stripe_size)
				533	set_bit(stripe, d->full_dirty_stripes);
				534	else
				535	clear_bit(stripe, d->full_dirty_stripes);
				536
				537	nr_sectors -= s;
				538	stripe_offset = 0;
				539	stripe++;
				540	}
				541	}
				542
				543	static bool dirty_pred(struct keybuf buf, struct bkey k)
				544	{
				545	struct cached_dev *dc = container_of(buf,
				546	struct cached_dev,
				547	writeback_keys);
				548
				549	BUG_ON(KEY_INODE(k) != dc->disk.id);
				550
				551	return KEY_DIRTY(k);
				552	}
				553
				554	static void refill_full_stripes(struct cached_dev *dc)
				555	{
				556	struct keybuf *buf = &dc->writeback_keys;
				557	unsigned int start_stripe, stripe, next_stripe;
				558	bool wrapped = false;
				559
				560	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
				561
				562	if (stripe >= dc->disk.nr_stripes)
				563	stripe = 0;
				564
				565	start_stripe = stripe;
				566
				567	while (1) {
				568	stripe = find_next_bit(dc->disk.full_dirty_stripes,
				569	dc->disk.nr_stripes, stripe);
				570
				571	if (stripe == dc->disk.nr_stripes)
				572	goto next;
				573
				574	next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
				575	dc->disk.nr_stripes, stripe);
				576
				577	buf->last_scanned = KEY(dc->disk.id,
				578	stripe * dc->disk.stripe_size, 0);
				579
				580	bch_refill_keybuf(dc->disk.c, buf,
				581	&KEY(dc->disk.id,
				582	next_stripe * dc->disk.stripe_size, 0),
				583	dirty_pred);
				584
				585	if (array_freelist_empty(&buf->freelist))
				586	return;
				587
				588	stripe = next_stripe;
				589	next:
				590	if (wrapped && stripe > start_stripe)
				591	return;
				592
				593	if (stripe == dc->disk.nr_stripes) {
				594	stripe = 0;
				595	wrapped = true;
				596	}
				597	}
				598	}
				599
				600	/*
				601	* Returns true if we scanned the entire disk
				602	*/
				603	static bool refill_dirty(struct cached_dev *dc)
				604	{
				605	struct keybuf *buf = &dc->writeback_keys;
				606	struct bkey start = KEY(dc->disk.id, 0, 0);
				607	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
				608	struct bkey start_pos;
				609
				610	/*
				611	* make sure keybuf pos is inside the range for this disk - at bringup
				612	* we might not be attached yet so this disk's inode nr isn't
				613	* initialized then
				614	*/
				615	if (bkey_cmp(&buf->last_scanned, &start) < 0 \|\|
				616	bkey_cmp(&buf->last_scanned, &end) > 0)
				617	buf->last_scanned = start;
				618
				619	if (dc->partial_stripes_expensive) {
				620	refill_full_stripes(dc);
				621	if (array_freelist_empty(&buf->freelist))
				622	return false;
				623	}
				624
				625	start_pos = buf->last_scanned;
				626	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
				627
				628	if (bkey_cmp(&buf->last_scanned, &end) < 0)
				629	return false;
				630
				631	/*
				632	* If we get to the end start scanning again from the beginning, and
				633	* only scan up to where we initially started scanning from:
				634	*/
				635	buf->last_scanned = start;
				636	bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);
				637
				638	return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
				639	}
				640
				641	static int bch_writeback_thread(void *arg)
				642	{
				643	struct cached_dev *dc = arg;
				644	struct cache_set *c = dc->disk.c;
				645	bool searched_full_index;
				646
				647	bch_ratelimit_reset(&dc->writeback_rate);
				648
				649	while (!kthread_should_stop() &&
				650	!test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
				651	down_write(&dc->writeback_lock);
				652	set_current_state(TASK_INTERRUPTIBLE);
				653	/*
				654	* If the bache device is detaching, skip here and continue
				655	* to perform writeback. Otherwise, if no dirty data on cache,
				656	* or there is dirty data on cache but writeback is disabled,
				657	* the writeback thread should sleep here and wait for others
				658	* to wake up it.
				659	*/
				660	if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
				661	(!atomic_read(&dc->has_dirty) \|\| !dc->writeback_running)) {
				662	up_write(&dc->writeback_lock);
				663
				664	if (kthread_should_stop() \|\|
				665	test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
				666	set_current_state(TASK_RUNNING);
				667	break;
				668	}
				669
				670	schedule();
				671	continue;
				672	}
				673	set_current_state(TASK_RUNNING);
				674
				675	searched_full_index = refill_dirty(dc);
				676
				677	if (searched_full_index &&
				678	RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
				679	atomic_set(&dc->has_dirty, 0);
				680	SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
				681	bch_write_bdev_super(dc, NULL);
				682	/*
				683	* If bcache device is detaching via sysfs interface,
				684	* writeback thread should stop after there is no dirty
				685	* data on cache. BCACHE_DEV_DETACHING flag is set in
				686	* bch_cached_dev_detach().
				687	*/
				688	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) {
				689	up_write(&dc->writeback_lock);
				690	break;
				691	}
				692	}
				693
				694	up_write(&dc->writeback_lock);
				695
				696	read_dirty(dc);
				697
				698	if (searched_full_index) {
				699	unsigned int delay = dc->writeback_delay * HZ;
				700
				701	while (delay &&
				702	!kthread_should_stop() &&
				703	!test_bit(CACHE_SET_IO_DISABLE, &c->flags) &&
				704	!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
				705	delay = schedule_timeout_interruptible(delay);
				706
				707	bch_ratelimit_reset(&dc->writeback_rate);
				708	}
				709	}
				710
				711	if (dc->writeback_write_wq) {
				712	flush_workqueue(dc->writeback_write_wq);
				713	destroy_workqueue(dc->writeback_write_wq);
				714	}
				715	cached_dev_put(dc);
				716	wait_for_kthread_stop();
				717
				718	return 0;
				719	}
				720
				721	/* Init */
				722	#define INIT_KEYS_EACH_TIME 500000
				723	#define INIT_KEYS_SLEEP_MS 100
				724
				725	struct sectors_dirty_init {
				726	struct btree_op op;
				727	unsigned int inode;
				728	size_t count;
				729	struct bkey start;
				730	};
				731
				732	static int sectors_dirty_init_fn(struct btree_op _op, struct btree b,
				733	struct bkey *k)
				734	{
				735	struct sectors_dirty_init *op = container_of(_op,
				736	struct sectors_dirty_init, op);
				737	if (KEY_INODE(k) > op->inode)
				738	return MAP_DONE;
				739
				740	if (KEY_DIRTY(k))
				741	bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
				742	KEY_START(k), KEY_SIZE(k));
				743
				744	op->count++;
				745	if (atomic_read(&b->c->search_inflight) &&
				746	!(op->count % INIT_KEYS_EACH_TIME)) {
				747	bkey_copy_key(&op->start, k);
				748	return -EAGAIN;
				749	}
				750
				751	return MAP_CONTINUE;
				752	}
				753
				754	void bch_sectors_dirty_init(struct bcache_device *d)
				755	{
				756	struct sectors_dirty_init op;
				757	int ret;
				758
				759	bch_btree_op_init(&op.op, -1);
				760	op.inode = d->id;
				761	op.count = 0;
				762	op.start = KEY(op.inode, 0, 0);
				763
				764	do {
				765	ret = bch_btree_map_keys(&op.op, d->c, &op.start,
				766	sectors_dirty_init_fn, 0);
				767	if (ret == -EAGAIN)
				768	schedule_timeout_interruptible(
				769	msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
				770	else if (ret < 0) {
				771	pr_warn("sectors dirty init failed, ret=%d!", ret);
				772	break;
				773	}
				774	} while (ret == -EAGAIN);
				775	}
				776
				777	void bch_cached_dev_writeback_init(struct cached_dev *dc)
				778	{
				779	sema_init(&dc->in_flight, 64);
				780	init_rwsem(&dc->writeback_lock);
				781	bch_keybuf_init(&dc->writeback_keys);
				782
				783	dc->writeback_metadata = true;
				784	dc->writeback_running = false;
				785	dc->writeback_percent = 10;
				786	dc->writeback_delay = 30;
				787	atomic_long_set(&dc->writeback_rate.rate, 1024);
				788	dc->writeback_rate_minimum = 8;
				789
				790	dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
				791	dc->writeback_rate_p_term_inverse = 40;
				792	dc->writeback_rate_i_term_inverse = 10000;
				793
				794	WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
				795	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
				796	}
				797
				798	int bch_cached_dev_writeback_start(struct cached_dev *dc)
				799	{
				800	dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
				801	WQ_MEM_RECLAIM, 0);
				802	if (!dc->writeback_write_wq)
				803	return -ENOMEM;
				804
				805	cached_dev_get(dc);
				806	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
				807	"bcache_writeback");
				808	if (IS_ERR(dc->writeback_thread)) {
				809	cached_dev_put(dc);
				810	destroy_workqueue(dc->writeback_write_wq);
				811	return PTR_ERR(dc->writeback_thread);
				812	}
				813	dc->writeback_running = true;
				814
				815	WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
				816	schedule_delayed_work(&dc->writeback_rate_update,
				817	dc->writeback_rate_update_seconds * HZ);
				818
				819	bch_writeback_queue(dc);
				820
				821	return 0;
				822	}