/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHE_BTREE_H
#define _BCACHE_BTREE_H

/*
 * THE BTREE:
 *
 * At a high level, bcache's btree is a relatively standard b+ tree. All keys
 * and pointers are in the leaves; interior nodes only have pointers to the
 * child nodes.
 *
 * In the interior nodes, a struct bkey always points to a child btree node,
 * and the key is the highest key in the child node - except that the highest
 * key in an interior node is always MAX_KEY. The size field refers to the
 * size on disk of the child node - this would allow us to have variable
 * sized btree nodes (handy for keeping the depth of the btree at 1 by
 * expanding just the root).
 *
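 * For example, a lookup descends by picking, in each interior node, the
 * first key greater than or equal to the search key (an illustrative sketch,
 * not actual helper names):
 *
 *	while (b->level) {
 *		k = first bkey in b with search <= k;
 *		b = child node that k points to;
 *	}
 *
 * Such a key always exists, because the highest key in an interior node is
 * MAX_KEY.
 *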
 * Btree nodes are themselves log structured, but this is hidden fairly
 * thoroughly. Btree nodes on disk will in practice have extents that overlap
 * (because they were written at different times), but in memory we never
 * have overlapping extents - when we read in a btree node from disk, the
 * first thing we do is resort all the sets of keys with a mergesort, and in
 * the same pass we check for overlapping extents and adjust them
 * appropriately.
 *
 * struct btree_op is a central interface to the btree code. It's used for
 * specifying read vs. write locking, and the embedded wait queue entry is
 * used for waiting on IO or on a memory reserve.
 *
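 * A typical user stack-allocates a btree_op and initializes it before
 * walking the btree. A minimal sketch using bch_btree_op_init() and
 * bch_btree_map_keys() from below (the callback fn is hypothetical):
 *
 *	struct btree_op op;
 *
 *	bch_btree_op_init(&op, -1);
 *	ret = bch_btree_map_keys(&op, c, &KEY(inode, 0, 0), fn, 0);
 *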
 * BTREE CACHE:
 *
 * Btree nodes are cached in memory; traversing the btree might require
 * reading in btree nodes which is handled mostly transparently.
 *
 * bch_btree_node_get() looks up a btree node in the cache and reads it in
 * from disk if necessary. This function is almost never called directly
 * though - the btree() macro is used to get a btree node, call some function
 * on it, and unlock the node after the function returns.
 *
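 * A sketch of the pattern the btree() macro wraps (error handling
 * abbreviated; "write" is whether a write lock is wanted, fn is a
 * hypothetical callback):
 *
 *	struct btree *b;
 *
 *	b = bch_btree_node_get(c, op, k, level, write, parent);
 *	if (IS_ERR(b))
 *		return PTR_ERR(b);
 *	ret = fn(b, ...);
 *	rw_unlock(write, b);
 *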
 * The root is special cased - it's taken out of the cache's LRU (thus
 * pinning it in memory), so we can find the root of the btree by just
 * dereferencing a pointer instead of looking it up in the cache. This makes
 * locking a bit tricky, since the root pointer is protected by the lock in
 * the btree node it points to - the btree_root() macro handles this.
 *
 * In various places we must be able to allocate memory for multiple btree
 * nodes in order to make forward progress. To do this we use the btree cache
 * itself as a reserve; if __get_free_pages() fails, we'll find a node in the
 * btree cache we can reuse. We can't allow more than one thread to be doing
 * this at a time, so there's a lock, implemented by a pointer to the
 * btree_op closure - this allows the btree_root() macro to implicitly
 * release this lock.
 *
 * BTREE IO:
 *
 * Btree nodes never have to be explicitly read in; bch_btree_node_get()
 * handles this.
 *
 * For writing, we have two btree_write structs embedded in struct btree -
 * one for the write in flight and one being set up; we toggle between them.
 *
 * Writing is done with a single function, bch_btree_node_write(). Merely
 * marking a node dirty ensures that its dirty keys will be written out at
 * some point in the future; calling bch_btree_node_write() causes a write to
 * happen "immediately" (if there was already a write in flight, it'll cause
 * the write to happen as soon as the previous write completes). It returns
 * immediately though - but it takes a refcount on the parent closure you
 * pass to it, so a closure_sync() later can be used to wait for the write
 * to complete.
 *
 * This is handy because btree_split() and garbage collection can issue
 * writes in parallel, reducing the amount of time they have to hold write
 * locks.
 *
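 * For example, to push out a node's dirty keys and wait for the write to
 * finish (a sketch; closure_init_stack() comes from closure.h, and callers
 * serialize on b->write_lock):
 *
 *	struct closure cl;
 *
 *	closure_init_stack(&cl);
 *	mutex_lock(&b->write_lock);
 *	bch_btree_node_write(b, &cl);
 *	mutex_unlock(&b->write_lock);
 *	closure_sync(&cl);
 *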
 * LOCKING:
 *
 * When traversing the btree, we may need write locks starting at some
 * level - inserting a key into the btree will typically only require a write
 * lock on the leaf node.
 *
 * This is specified with the lock field in struct btree_op; lock = 0 means
 * we take write locks at level <= 0, i.e. only leaf nodes.
 * bch_btree_node_get() checks this field and returns the node with the
 * appropriate lock held.
 *
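 * For example, lock = 0 is what a simple insertion uses (write lock the leaf
 * only), while lock = -1 gives a read-only traversal that never takes a
 * write lock at any level.
 *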
 * If, after traversing the btree, the insertion code discovers it has to
 * split, then it must restart from the root and take new locks - to do this
 * it changes the lock field and returns -EINTR, which causes the
 * btree_root() macro to loop.
 *
 * Handling cache misses requires a different mechanism for upgrading to a
 * write lock. We do cache lookups with only a read lock held, but if we get
 * a cache miss and we wish to insert this data into the cache, we have to
 * insert a placeholder key to detect races - otherwise, we could race with a
 * write and overwrite the data that was just written to the cache with stale
 * data from the backing device.
 *
 * For this we use a sequence number that write locks and unlocks increment -
 * to insert the check key, the lookup code unlocks the btree node, then
 * takes a write lock and fails if the sequence number doesn't match.
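 *
 * Roughly, the upgrade looks like this (a sketch, not the exact code in
 * bch_btree_insert_check_key()):
 *
 *	seq = b->seq;
 *	rw_unlock(false, b);
 *	rw_lock(true, b, b->level);
 *	if (b->seq != seq + 1)
 *		goto retry;
 *
 * rw_lock() itself bumps b->seq when taking the write lock, hence the
 * "+ 1"; any other writer in the window makes the numbers disagree.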
 */

#include "bset.h"
#include "debug.h"

struct btree_write {
	atomic_t		*journal;

	/* If btree_split() frees a btree node, it writes a new pointer to that
	 * btree node indicating it was freed; it takes a refcount on
	 * c->prio_blocked because we can't write the gens until the new
	 * pointer is on disk. This allows btree_write_endio() to release the
	 * refcount that btree_split() took.
	 */
	int			prio_blocked;
};

struct btree {
	/* Hottest entries first */
	struct hlist_node	hash;

	/* Key/pointer for this btree node */
	BKEY_PADDED(key);

	/* Single bit - set when accessed, cleared by shrinker */
	unsigned long		accessed;
	unsigned long		seq;
	struct rw_semaphore	lock;
	struct cache_set	*c;
	struct btree		*parent;

	struct mutex		write_lock;

	unsigned long		flags;
	uint16_t		written;	/* would be nice to kill */
	uint8_t			level;

	struct btree_keys	keys;

	/* For outstanding btree writes, used as a lock - protects write_idx */
	struct closure		io;
	struct semaphore	io_mutex;

	struct list_head	list;
	struct delayed_work	work;

	struct btree_write	writes[2];
	struct bio		*bio;
};

#define BTREE_FLAG(flag)						\
static inline bool btree_node_ ## flag(struct btree *b)		\
{	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
									\
static inline void set_btree_node_ ## flag(struct btree *b)		\
{	set_bit(BTREE_NODE_ ## flag, &b->flags); }

enum btree_flags {
	BTREE_NODE_io_error,
	BTREE_NODE_dirty,
	BTREE_NODE_write_idx,
	BTREE_NODE_journal_flush,
};

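/*
 * Each BTREE_FLAG() invocation below expands to a pair of helpers: e.g.
 * BTREE_FLAG(dirty) generates btree_node_dirty(b) and
 * set_btree_node_dirty(b), which test and set BTREE_NODE_dirty in b->flags.
 */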
BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
BTREE_FLAG(journal_flush);

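/*
 * b->writes[] holds the two btree_write structs described at the top of this
 * file: the write_idx flag selects the one currently being set up, and
 * write_idx ^ 1 selects the previous one (the write in flight, if any).
 */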
static inline struct btree_write *btree_current_write(struct btree *b)
{
	return b->writes + btree_node_write_idx(b);
}

static inline struct btree_write *btree_prev_write(struct btree *b)
{
	return b->writes + (btree_node_write_idx(b) ^ 1);
}

static inline struct bset *btree_bset_first(struct btree *b)
{
	return b->keys.set->data;
}

static inline struct bset *btree_bset_last(struct btree *b)
{
	return bset_tree_last(&b->keys)->data;
}

static inline unsigned int bset_block_offset(struct btree *b, struct bset *i)
{
	return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
}

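/*
 * Rearm the garbage collection trigger: c->sectors_to_gc counts down as
 * sectors are written, and GC is woken once roughly 1/16th of the cache's
 * capacity has been written since the last reset.
 */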
static inline void set_gc_sectors(struct cache_set *c)
{
	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
}

void bkey_put(struct cache_set *c, struct bkey *k);

/* Looping macros */

#define for_each_cached_btree(b, c, iter)				\
	for (iter = 0;							\
	     iter < ARRAY_SIZE((c)->bucket_hash);			\
	     iter++)							\
		hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)

/* Recursing down the btree */

struct btree_op {
	/* for waiting on btree reserve in btree_split() */
	wait_queue_entry_t	wait;

	/* Btree level at which we start taking write locks */
	short			lock;

	unsigned int		insert_collision:1;
};

static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
{
	memset(op, 0, sizeof(struct btree_op));
	init_wait(&op->wait);
	op->lock = write_lock_level;
}

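/*
 * b->seq is incremented on every write lock and write unlock; the check key
 * insertion described at the top of this file compares it across a
 * read-to-write upgrade to detect intervening writers.
 */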
static inline void rw_lock(bool w, struct btree *b, int level)
{
	w ? down_write_nested(&b->lock, level + 1)
	  : down_read_nested(&b->lock, level + 1);
	if (w)
		b->seq++;
}

static inline void rw_unlock(bool w, struct btree *b)
{
	if (w)
		b->seq++;
	(w ? up_write : up_read)(&b->lock);
}

void bch_btree_node_read_done(struct btree *b);
void __bch_btree_node_write(struct btree *b, struct closure *parent);
void bch_btree_node_write(struct btree *b, struct closure *parent);

void bch_btree_set_root(struct btree *b);
struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
				     int level, bool wait,
				     struct btree *parent);
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
				 struct bkey *k, int level, bool write,
				 struct btree *parent);

int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
			       struct bkey *check_key);
int bch_btree_insert(struct cache_set *c, struct keylist *keys,
		     atomic_t *journal_ref, struct bkey *replace_key);

int bch_gc_thread_start(struct cache_set *c);
void bch_initial_gc_finish(struct cache_set *c);
void bch_moving_gc(struct cache_set *c);
int bch_btree_check(struct cache_set *c);
void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k);

static inline void wake_up_gc(struct cache_set *c)
{
	wake_up(&c->gc_wait);
}

#define MAP_DONE	0
#define MAP_CONTINUE	1

#define MAP_ALL_NODES	0
#define MAP_LEAF_NODES	1

#define MAP_END_KEY	1

typedef int (btree_map_nodes_fn)(struct btree_op *b_op, struct btree *b);
int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
			  struct bkey *from, btree_map_nodes_fn *fn,
			  int flags);

static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
				      struct bkey *from, btree_map_nodes_fn *fn)
{
	return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES);
}

static inline int bch_btree_map_leaf_nodes(struct btree_op *op,
					   struct cache_set *c,
					   struct bkey *from,
					   btree_map_nodes_fn *fn)
{
	return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES);
}
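
/*
 * A map callback returns MAP_CONTINUE to keep walking or MAP_DONE to stop
 * early; a negative error is propagated back to the caller. A minimal sketch
 * of a node callback (count_op and count_fn are hypothetical):
 *
 *	struct count_op {
 *		struct btree_op	op;
 *		unsigned int	nodes;
 *	};
 *
 *	static int count_fn(struct btree_op *b_op, struct btree *b)
 *	{
 *		struct count_op *co = container_of(b_op, struct count_op, op);
 *
 *		co->nodes++;
 *		return MAP_CONTINUE;
 *	}
 */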

typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b,
				struct bkey *k);
int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
		       struct bkey *from, btree_map_keys_fn *fn, int flags);

typedef bool (keybuf_pred_fn)(struct keybuf *buf, struct bkey *k);

void bch_keybuf_init(struct keybuf *buf);
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end, keybuf_pred_fn *pred);
bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end);
void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w);
struct keybuf_key *bch_keybuf_next(struct keybuf *buf);
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
					  struct keybuf *buf,
					  struct bkey *end,
					  keybuf_pred_fn *pred);
void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats);
#endif