// SPDX-License-Identifier: GPL-2.0
/*
 * bcache journalling code, for btree insertions
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"

#include <trace/events/bcache.h>

/*
 * Journal replay/recovery:
 *
 * This code is all driven from run_cache_set(); we first read the journal
 * entries, do some other stuff, then we mark all the keys in the journal
 * entries (same as garbage collection would), then we replay them - reinserting
 * them into the cache in precisely the same order as they appear in the
 * journal.
 *
 * We only journal keys that go in leaf nodes, which simplifies things quite a
 * bit.
 */

static void journal_read_endio(struct bio *bio)
{
        struct closure *cl = bio->bi_private;

        closure_put(cl);
}

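/*
 * Read every journal entry found in journal bucket @bucket_index on @ca,
 * verify its magic and checksum, and splice it into @list in sequence order
 * (dropping entries already superseded by a newer last_seq).  Returns 1 if
 * at least one entry was added, 0 if the bucket held nothing new, or
 * -ENOMEM on allocation failure.
 */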
static int journal_read_bucket(struct cache *ca, struct list_head *list,
                               unsigned int bucket_index)
{
        struct journal_device *ja = &ca->journal;
        struct bio *bio = &ja->bio;

        struct journal_replay *i;
        struct jset *j, *data = ca->set->journal.w[0].data;
        struct closure cl;
        unsigned int len, left, offset = 0;
        int ret = 0;
        sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);

        closure_init_stack(&cl);

        pr_debug("reading %u", bucket_index);

        while (offset < ca->sb.bucket_size) {
reread:         left = ca->sb.bucket_size - offset;
                len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS);

                bio_reset(bio);
                bio->bi_iter.bi_sector = bucket + offset;
                bio_set_dev(bio, ca->bdev);
                bio->bi_iter.bi_size = len << 9;

                bio->bi_end_io = journal_read_endio;
                bio->bi_private = &cl;
                bio_set_op_attrs(bio, REQ_OP_READ, 0);
                bch_bio_map(bio, data);

                closure_bio_submit(ca->set, bio, &cl);
                closure_sync(&cl);

                /* This function could be simpler now since we no longer write
                 * journal entries that overlap bucket boundaries; this means
                 * the start of a bucket will always have a valid journal entry
                 * if it has any journal entries at all.
                 */

                j = data;
                while (len) {
                        struct list_head *where;
                        size_t blocks, bytes = set_bytes(j);

                        if (j->magic != jset_magic(&ca->sb)) {
                                pr_debug("%u: bad magic", bucket_index);
                                return ret;
                        }

                        if (bytes > left << 9 ||
                            bytes > PAGE_SIZE << JSET_BITS) {
                                pr_info("%u: too big, %zu bytes, offset %u",
                                        bucket_index, bytes, offset);
                                return ret;
                        }

                        if (bytes > len << 9)
                                goto reread;

                        if (j->csum != csum_set(j)) {
                                pr_info("%u: bad csum, %zu bytes, offset %u",
                                        bucket_index, bytes, offset);
                                return ret;
                        }

                        blocks = set_blocks(j, block_bytes(ca->set));

                        while (!list_empty(list)) {
                                i = list_first_entry(list,
                                        struct journal_replay, list);
                                if (i->j.seq >= j->last_seq)
                                        break;
                                list_del(&i->list);
                                kfree(i);
                        }

                        list_for_each_entry_reverse(i, list, list) {
                                if (j->seq == i->j.seq)
                                        goto next_set;

                                if (j->seq < i->j.last_seq)
                                        goto next_set;

                                if (j->seq > i->j.seq) {
                                        where = &i->list;
                                        goto add;
                                }
                        }

                        where = list;
add:
                        i = kmalloc(offsetof(struct journal_replay, j) +
                                    bytes, GFP_KERNEL);
                        if (!i)
                                return -ENOMEM;
                        memcpy(&i->j, j, bytes);
                        list_add(&i->list, where);
                        ret = 1;

                        ja->seq[bucket_index] = j->seq;
next_set:
                        offset += blocks * ca->sb.block_size;
                        len -= blocks * ca->sb.block_size;
                        j = ((void *) j) + blocks * block_bytes(ca);
                }
        }

        return ret;
}

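/*
 * Find and read every journal entry on every cache device in the set.  For
 * each device we probe buckets in golden-ratio-hash order until we hit one
 * with valid entries (falling back to a linear scan), binary search for the
 * most recently written bucket, then walk backwards from it so no valid
 * entry is missed.  ja->cur_idx/last_idx/discard_idx are initialized from
 * the newest sequence number seen on that device.
 */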
int bch_journal_read(struct cache_set *c, struct list_head *list)
{
#define read_bucket(b)                                                  \
        ({                                                              \
                int ret = journal_read_bucket(ca, list, b);             \
                __set_bit(b, bitmap);                                   \
                if (ret < 0)                                            \
                        return ret;                                     \
                ret;                                                    \
        })

        struct cache *ca;
        unsigned int iter;

        for_each_cache(ca, c, iter) {
                struct journal_device *ja = &ca->journal;
                DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS);
                unsigned int i, l, r, m;
                uint64_t seq;

                bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
                pr_debug("%u journal buckets", ca->sb.njournal_buckets);

                /*
                 * Read journal buckets ordered by golden ratio hash to quickly
                 * find a sequence of buckets with valid journal entries
                 */
                for (i = 0; i < ca->sb.njournal_buckets; i++) {
                        /*
                         * We must try index l == 0 first for correctness:
                         * the journal buckets form a circular buffer which
                         * may have wrapped around.
                         */
                        l = (i * 2654435769U) % ca->sb.njournal_buckets;

                        if (test_bit(l, bitmap))
                                break;

                        if (read_bucket(l))
                                goto bsearch;
                }

                /*
                 * If that fails, check all the buckets we haven't checked
                 * already
                 */
                pr_debug("falling back to linear search");

                for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
                     l < ca->sb.njournal_buckets;
                     l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets,
                                            l + 1))
                        if (read_bucket(l))
                                goto bsearch;

                /* no journal entries on this device? */
                if (l == ca->sb.njournal_buckets)
                        continue;
bsearch:
                BUG_ON(list_empty(list));

                /* Binary search */
                m = l;
                r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
                pr_debug("starting binary search, l %u r %u", l, r);

                while (l + 1 < r) {
                        seq = list_entry(list->prev, struct journal_replay,
                                         list)->j.seq;

                        m = (l + r) >> 1;
                        read_bucket(m);

                        if (seq != list_entry(list->prev, struct journal_replay,
                                              list)->j.seq)
                                l = m;
                        else
                                r = m;
                }

                /*
                 * Read buckets in reverse order until we stop finding more
                 * journal entries
                 */
                pr_debug("finishing up: m %u njournal_buckets %u",
                         m, ca->sb.njournal_buckets);
                l = m;

                while (1) {
                        if (!l--)
                                l = ca->sb.njournal_buckets - 1;

                        if (l == m)
                                break;

                        if (test_bit(l, bitmap))
                                continue;

                        if (!read_bucket(l))
                                break;
                }

                seq = 0;

                for (i = 0; i < ca->sb.njournal_buckets; i++)
                        if (ja->seq[i] > seq) {
                                seq = ja->seq[i];
                                /*
                                 * When journal_reclaim() goes to allocate for
                                 * the first time, it'll use the bucket after
                                 * ja->cur_idx
                                 */
                                ja->cur_idx = i;
                                ja->last_idx = ja->discard_idx = (i + 1) %
                                        ca->sb.njournal_buckets;
                        }
        }

        if (!list_empty(list))
                c->journal.seq = list_entry(list->prev,
                                            struct journal_replay,
                                            list)->j.seq;

        return 0;
#undef read_bucket
}

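/*
 * Re-establish the journal pin refcounts for the entries we just read, and
 * mark every bucket referenced by a journalled key as in use so the
 * allocator will not hand those buckets out before the keys are replayed.
 */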
void bch_journal_mark(struct cache_set *c, struct list_head *list)
{
        atomic_t p = { 0 };
        struct bkey *k;
        struct journal_replay *i;
        struct journal *j = &c->journal;
        uint64_t last = j->seq;

        /*
         * journal.pin should never fill up - we never write a journal
         * entry when it would fill up. But if for some reason it does, we
         * iterate over the list in reverse order so that we can just skip that
         * refcount instead of bugging.
         */

        list_for_each_entry_reverse(i, list, list) {
                BUG_ON(last < i->j.seq);
                i->pin = NULL;

                while (last-- != i->j.seq)
                        if (fifo_free(&j->pin) > 1) {
                                fifo_push_front(&j->pin, p);
                                atomic_set(&fifo_front(&j->pin), 0);
                        }

                if (fifo_free(&j->pin) > 1) {
                        fifo_push_front(&j->pin, p);
                        i->pin = &fifo_front(&j->pin);
                        atomic_set(i->pin, 1);
                }

                for (k = i->j.start;
                     k < bset_bkey_last(&i->j);
                     k = bkey_next(k))
                        if (!__bch_extent_invalid(c, k)) {
                                unsigned int j;

                                for (j = 0; j < KEY_PTRS(k); j++)
                                        if (ptr_available(c, k, j))
                                                atomic_inc(&PTR_BUCKET(c, k, j)->pin);

                                bch_initial_mark_key(c, 0, k);
                        }
        }
}

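/* Returns true if any cache device in the set has discard enabled. */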
bool is_discard_enabled(struct cache_set *s)
{
        struct cache *ca;
        unsigned int i;

        for_each_cache(ca, s, i)
                if (ca->discard)
                        return true;

        return false;
}

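/*
 * Re-insert every journalled key into the btree, in the order the entries
 * were written.  A gap at the start of the sequence is tolerated (with a
 * warning) when discard is enabled, since the oldest journal buckets may
 * already have been discarded; any other gap is treated as corruption and
 * replay fails with -EIO.
 */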
int bch_journal_replay(struct cache_set *s, struct list_head *list)
{
        int ret = 0, keys = 0, entries = 0;
        struct bkey *k;
        struct journal_replay *i =
                list_entry(list->prev, struct journal_replay, list);

        uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
        struct keylist keylist;

        list_for_each_entry(i, list, list) {
                BUG_ON(i->pin && atomic_read(i->pin) != 1);

                if (n != i->j.seq) {
                        if (n == start && is_discard_enabled(s))
                                pr_info("bcache: journal entries %llu-%llu may be discarded! (replaying %llu-%llu)",
                                        n, i->j.seq - 1, start, end);
                        else {
                                pr_err("bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
                                       n, i->j.seq - 1, start, end);
                                ret = -EIO;
                                goto err;
                        }
                }

                for (k = i->j.start;
                     k < bset_bkey_last(&i->j);
                     k = bkey_next(k)) {
                        trace_bcache_journal_replay_key(k);

                        bch_keylist_init_single(&keylist, k);

                        ret = bch_btree_insert(s, &keylist, i->pin, NULL);
                        if (ret)
                                goto err;

                        BUG_ON(!bch_keylist_empty(&keylist));
                        keys++;

                        cond_resched();
                }

                if (i->pin)
                        atomic_dec(i->pin);
                n = i->j.seq + 1;
                entries++;
        }

        pr_info("journal replay done, %i keys in %i entries, seq %llu",
                keys, entries, end);
err:
        while (!list_empty(list)) {
                i = list_first_entry(list, struct journal_replay, list);
                list_del(&i->list);
                kfree(i);
        }

        return ret;
}

/* Journalling */

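/*
 * Write out the dirty btree node that holds the oldest journal pin, so that
 * the oldest journal entry no longer has anything depending on it and its
 * bucket can be reclaimed.
 */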
static void btree_flush_write(struct cache_set *c)
{
        /*
         * Try to find the btree node that references the oldest journal
         * entry; best is our current candidate and is locked if non-NULL:
         */
        struct btree *b, *best;
        unsigned int i;

        atomic_long_inc(&c->flush_write);
retry:
        best = NULL;

        mutex_lock(&c->bucket_lock);
        for_each_cached_btree(b, c, i)
                if (btree_current_write(b)->journal) {
                        if (!best)
                                best = b;
                        else if (journal_pin_cmp(c,
                                        btree_current_write(best)->journal,
                                        btree_current_write(b)->journal)) {
                                best = b;
                        }
                }

        b = best;
        if (b)
                set_btree_node_journal_flush(b);
        mutex_unlock(&c->bucket_lock);

        if (b) {
                mutex_lock(&b->write_lock);
                if (!btree_current_write(b)->journal) {
                        clear_bit(BTREE_NODE_journal_flush, &b->flags);
                        mutex_unlock(&b->write_lock);
                        /* We raced */
                        atomic_long_inc(&c->retry_flush_write);
                        goto retry;
                }

                __bch_btree_node_write(b, NULL);
                clear_bit(BTREE_NODE_journal_flush, &b->flags);
                mutex_unlock(&b->write_lock);
        }
}

#define last_seq(j)     ((j)->seq - fifo_used(&(j)->pin) + 1)

static void journal_discard_endio(struct bio *bio)
{
        struct journal_device *ja =
                container_of(bio, struct journal_device, discard_bio);
        struct cache *ca = container_of(ja, struct cache, journal);

        atomic_set(&ja->discard_in_flight, DISCARD_DONE);

        closure_wake_up(&ca->set->journal.wait);
        closure_put(&ca->set->cl);
}

static void journal_discard_work(struct work_struct *work)
{
        struct journal_device *ja =
                container_of(work, struct journal_device, discard_work);

        submit_bio(&ja->discard_bio);
}

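/*
 * Advance the discard state machine for one cache device: once a discard
 * completes, move discard_idx forward and, if more reclaimed journal buckets
 * are waiting (discard_idx != last_idx), queue a discard for the next one.
 * Devices without discard enabled simply jump discard_idx to last_idx.
 */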
static void do_journal_discard(struct cache *ca)
{
        struct journal_device *ja = &ca->journal;
        struct bio *bio = &ja->discard_bio;

        if (!ca->discard) {
                ja->discard_idx = ja->last_idx;
                return;
        }

        switch (atomic_read(&ja->discard_in_flight)) {
        case DISCARD_IN_FLIGHT:
                return;

        case DISCARD_DONE:
                ja->discard_idx = (ja->discard_idx + 1) %
                        ca->sb.njournal_buckets;

                atomic_set(&ja->discard_in_flight, DISCARD_READY);
                /* fallthrough */

        case DISCARD_READY:
                if (ja->discard_idx == ja->last_idx)
                        return;

                atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);

                bio_init(bio, bio->bi_inline_vecs, 1);
                bio_set_op_attrs(bio, REQ_OP_DISCARD, 0);
                bio->bi_iter.bi_sector = bucket_to_sector(ca->set,
                                                ca->sb.d[ja->discard_idx]);
                bio_set_dev(bio, ca->bdev);
                bio->bi_iter.bi_size = bucket_bytes(ca);
                bio->bi_end_io = journal_discard_endio;

                closure_get(&ca->set->cl);
                INIT_WORK(&ja->discard_work, journal_discard_work);
                queue_work(bch_journal_wq, &ja->discard_work);
        }
}

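/*
 * Free up journal space: pop fully-unpinned entries off the front of the
 * pin fifo, advance each device's last_idx past buckets whose entries are
 * all older than the new last_seq, kick off discards for them, and if the
 * journal has run out of space allocate the next bucket on each device for
 * the journal key.
 */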
static void journal_reclaim(struct cache_set *c)
{
        struct bkey *k = &c->journal.key;
        struct cache *ca;
        uint64_t last_seq;
        unsigned int iter, n = 0;
        atomic_t p __maybe_unused;

        atomic_long_inc(&c->reclaim);

        while (!atomic_read(&fifo_front(&c->journal.pin)))
                fifo_pop(&c->journal.pin, p);

        last_seq = last_seq(&c->journal);

        /* Update last_idx */

        for_each_cache(ca, c, iter) {
                struct journal_device *ja = &ca->journal;

                while (ja->last_idx != ja->cur_idx &&
                       ja->seq[ja->last_idx] < last_seq)
                        ja->last_idx = (ja->last_idx + 1) %
                                ca->sb.njournal_buckets;
        }

        for_each_cache(ca, c, iter)
                do_journal_discard(ca);

        if (c->journal.blocks_free)
                goto out;

        /*
         * Allocate:
         * XXX: Sort by free journal space
         */

        for_each_cache(ca, c, iter) {
                struct journal_device *ja = &ca->journal;
                unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;

                /* No space available on this device */
                if (next == ja->discard_idx)
                        continue;

                ja->cur_idx = next;
                k->ptr[n++] = MAKE_PTR(0,
                                bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
                                ca->sb.nr_this_dev);
        }

        if (n) {
                bkey_init(k);
                SET_KEY_PTRS(k, n);
                c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
        }
out:
        if (!journal_full(&c->journal))
                __closure_wake_up(&c->journal.wait);
}

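/*
 * Make the other journal_write buffer current and open a new journal entry:
 * push a fresh pin (refcount 1) onto the fifo and bump the sequence number.
 */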
void bch_journal_next(struct journal *j)
{
        atomic_t p = { 1 };

        j->cur = (j->cur == j->w)
                ? &j->w[1]
                : &j->w[0];

        /*
         * The fifo_push() needs to happen at the same time as j->seq is
         * incremented for last_seq() to be calculated correctly
         */
        BUG_ON(!fifo_push(&j->pin, p));
        atomic_set(&fifo_back(&j->pin), 1);

        j->cur->data->seq = ++j->seq;
        j->cur->dirty = false;
        j->cur->need_write = false;
        j->cur->data->keys = 0;

        if (fifo_full(&j->pin))
                pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
}

static void journal_write_endio(struct bio *bio)
{
        struct journal_write *w = bio->bi_private;

        cache_set_err_on(bio->bi_status, w->c, "journal io error");
        closure_put(&w->c->journal.io);
}

static void journal_write(struct closure *cl);

static void journal_write_done(struct closure *cl)
{
        struct journal *j = container_of(cl, struct journal, io);
        struct journal_write *w = (j->cur == j->w)
                ? &j->w[1]
                : &j->w[0];

        __closure_wake_up(&w->wait);
        continue_at_nobarrier(cl, journal_write, bch_journal_wq);
}

static void journal_write_unlock(struct closure *cl)
        __releases(&c->journal.lock)
{
        struct cache_set *c = container_of(cl, struct cache_set, journal.io);

        c->journal.io_in_flight = 0;
        spin_unlock(&c->journal.lock);
}

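/*
 * Actually write out the current journal entry (called with journal.lock
 * held, which it drops): fill in the jset header and checksum, then submit
 * one bio per pointer in the journal key, i.e. one copy per cache device.
 * If the journal is full, reclaim and flush a btree node first and retry.
 */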
static void journal_write_unlocked(struct closure *cl)
        __releases(c->journal.lock)
{
        struct cache_set *c = container_of(cl, struct cache_set, journal.io);
        struct cache *ca;
        struct journal_write *w = c->journal.cur;
        struct bkey *k = &c->journal.key;
        unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) *
                c->sb.block_size;

        struct bio *bio;
        struct bio_list list;

        bio_list_init(&list);

        if (!w->need_write) {
                closure_return_with_destructor(cl, journal_write_unlock);
                return;
        } else if (journal_full(&c->journal)) {
                journal_reclaim(c);
                spin_unlock(&c->journal.lock);

                btree_flush_write(c);
                continue_at(cl, journal_write, bch_journal_wq);
                return;
        }

        c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));

        w->data->btree_level = c->root->level;

        bkey_copy(&w->data->btree_root, &c->root->key);
        bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);

        for_each_cache(ca, c, i)
                w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];

        w->data->magic = jset_magic(&c->sb);
        w->data->version = BCACHE_JSET_VERSION;
        w->data->last_seq = last_seq(&c->journal);
        w->data->csum = csum_set(w->data);

        for (i = 0; i < KEY_PTRS(k); i++) {
                ca = PTR_CACHE(c, k, i);
                bio = &ca->journal.bio;

                atomic_long_add(sectors, &ca->meta_sectors_written);

                bio_reset(bio);
                bio->bi_iter.bi_sector = PTR_OFFSET(k, i);
                bio_set_dev(bio, ca->bdev);
                bio->bi_iter.bi_size = sectors << 9;

                bio->bi_end_io = journal_write_endio;
                bio->bi_private = w;
                bio_set_op_attrs(bio, REQ_OP_WRITE,
                                 REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
                bch_bio_map(bio, w->data);

                trace_bcache_journal_write(bio);
                bio_list_add(&list, bio);

                SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);

                ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
        }

        /* If KEY_PTRS(k) == 0, this jset would be lost without ever being written */
        BUG_ON(i == 0);

        atomic_dec_bug(&fifo_back(&c->journal.pin));
        bch_journal_next(&c->journal);
        journal_reclaim(c);

        spin_unlock(&c->journal.lock);

        while ((bio = bio_list_pop(&list)))
                closure_bio_submit(c, bio, cl);

        continue_at(cl, journal_write_done, NULL);
}

static void journal_write(struct closure *cl)
{
        struct cache_set *c = container_of(cl, struct cache_set, journal.io);

        spin_lock(&c->journal.lock);
        journal_write_unlocked(cl);
}

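/*
 * Mark the current journal entry as needing a write and start the write if
 * no journal I/O is already in flight; either way journal.lock is released.
 */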
static void journal_try_write(struct cache_set *c)
        __releases(c->journal.lock)
{
        struct closure *cl = &c->journal.io;
        struct journal_write *w = c->journal.cur;

        w->need_write = true;

        if (!c->journal.io_in_flight) {
                c->journal.io_in_flight = 1;
                closure_call(cl, journal_write_unlocked, NULL, &c->cl);
        } else {
                spin_unlock(&c->journal.lock);
        }
}

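/*
 * Return the current journal entry once it has room for nkeys more keys,
 * with journal.lock held.  If the entry is too full we write it out (or
 * reclaim and flush btree nodes when the whole journal is full) and wait
 * for space to open up.
 */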
static struct journal_write *journal_wait_for_write(struct cache_set *c,
                                                    unsigned int nkeys)
        __acquires(&c->journal.lock)
{
        size_t sectors;
        struct closure cl;
        bool wait = false;

        closure_init_stack(&cl);

        spin_lock(&c->journal.lock);

        while (1) {
                struct journal_write *w = c->journal.cur;

                sectors = __set_blocks(w->data, w->data->keys + nkeys,
                                       block_bytes(c)) * c->sb.block_size;

                if (sectors <= min_t(size_t,
                                     c->journal.blocks_free * c->sb.block_size,
                                     PAGE_SECTORS << JSET_BITS))
                        return w;

                if (wait)
                        closure_wait(&c->journal.wait, &cl);

                if (!journal_full(&c->journal)) {
                        if (wait)
                                trace_bcache_journal_entry_full(c);

                        /*
                         * XXX: If we were inserting so many keys that they
                         * won't fit in an _empty_ journal write, we'll
                         * deadlock. For now, handle this in
                         * bch_keylist_realloc() - but something to think about.
                         */
                        BUG_ON(!w->data->keys);

                        journal_try_write(c); /* unlocks */
                } else {
                        if (wait)
                                trace_bcache_journal_full(c);

                        journal_reclaim(c);
                        spin_unlock(&c->journal.lock);

                        btree_flush_write(c);
                }

                closure_sync(&cl);
                spin_lock(&c->journal.lock);
                wait = true;
        }
}

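/*
 * Delayed-work callback: flushes the current journal entry if it is still
 * dirty once journal_delay_ms has elapsed since it was first marked dirty.
 */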
static void journal_write_work(struct work_struct *work)
{
        struct cache_set *c = container_of(to_delayed_work(work),
                                           struct cache_set,
                                           journal.work);
        spin_lock(&c->journal.lock);
        if (c->journal.cur->dirty)
                journal_try_write(c);
        else
                spin_unlock(&c->journal.lock);
}

/*
 * Entry point to the journalling code - bio_insert() and btree_invalidate()
 * pass bch_journal() a list of keys to be journalled, and then
 * bch_journal() hands those same keys off to btree_insert_async()
 */

atomic_t *bch_journal(struct cache_set *c,
                      struct keylist *keys,
                      struct closure *parent)
{
        struct journal_write *w;
        atomic_t *ret;

        /* No journalling if CACHE_SET_IO_DISABLE has already been set */
        if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
                return NULL;

        if (!CACHE_SYNC(&c->sb))
                return NULL;

        w = journal_wait_for_write(c, bch_keylist_nkeys(keys));

        memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
        w->data->keys += bch_keylist_nkeys(keys);

        ret = &fifo_back(&c->journal.pin);
        atomic_inc(ret);

        if (parent) {
                closure_wait(&w->wait, parent);
                journal_try_write(c);
        } else if (!w->dirty) {
                w->dirty = true;
                schedule_delayed_work(&c->journal.work,
                                      msecs_to_jiffies(c->journal_delay_ms));
                spin_unlock(&c->journal.lock);
        } else {
                spin_unlock(&c->journal.lock);
        }

        return ret;
}

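/*
 * Journal an entry with no keys; used to persist metadata-only changes
 * (e.g. a new btree root) and to flush the current journal state.
 */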
void bch_journal_meta(struct cache_set *c, struct closure *cl)
{
        struct keylist keys;
        atomic_t *ref;

        bch_keylist_init(&keys);

        ref = bch_journal(c, &keys, cl);
        if (ref)
                atomic_dec_bug(ref);
}

void bch_journal_free(struct cache_set *c)
{
        free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
        free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
        free_fifo(&c->journal.pin);
}

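/*
 * Allocate the journal pin fifo and the two in-memory jset buffers at cache
 * set startup; returns -ENOMEM if any allocation fails.
 */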
int bch_journal_alloc(struct cache_set *c)
{
        struct journal *j = &c->journal;

        spin_lock_init(&j->lock);
        INIT_DELAYED_WORK(&j->work, journal_write_work);

        c->journal_delay_ms = 100;

        j->w[0].c = c;
        j->w[1].c = c;

        if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
            !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
            !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
                return -ENOMEM;

        return 0;
}