Blame - marvell/linux/fs/xfs/libxfs/xfs_rmap_btree.c - T108

blob: fc78efa52c94ed45d7ae25a2aa256bab5e70bb72 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (c) 2014 Red Hat, Inc.
				4	* All Rights Reserved.
				5	*/
				6	#include "xfs.h"
				7	#include "xfs_fs.h"
				8	#include "xfs_shared.h"
				9	#include "xfs_format.h"
				10	#include "xfs_log_format.h"
				11	#include "xfs_trans_resv.h"
				12	#include "xfs_sb.h"
				13	#include "xfs_mount.h"
				14	#include "xfs_trans.h"
				15	#include "xfs_alloc.h"
				16	#include "xfs_btree.h"
				17	#include "xfs_rmap.h"
				18	#include "xfs_rmap_btree.h"
				19	#include "xfs_trace.h"
				20	#include "xfs_error.h"
				21	#include "xfs_extent_busy.h"
				22	#include "xfs_ag_resv.h"
				23
				24	/*
				25	* Reverse map btree.
				26	*
				27	* This is a per-ag tree used to track the owner(s) of a given extent. With
				28	* reflink it is possible for there to be multiple owners, which is a departure
				29	* from classic XFS. Owner records for data extents are inserted when the
				30	* extent is mapped and removed when an extent is unmapped. Owner records for
				31	* all other block types (i.e. metadata) are inserted when an extent is
				32	* allocated and removed when an extent is freed. There can only be one owner
				33	* of a metadata extent, usually an inode or some other metadata structure like
				34	* an AG btree.
				35	*
				36	* The rmap btree is part of the free space management, so blocks for the tree
				37	* are sourced from the agfl. Hence we need transaction reservation support for
				38	* this tree so that the freelist is always large enough. This also impacts on
				39	* the minimum space we need to leave free in the AG.
				40	*
				41	* The tree is ordered by [ag block, owner, offset]. This is a large key size,
				42	* but it is the only way to enforce unique keys when a block can be owned by
				43	* multiple files at any offset. There's no need to order/search by extent
				44	* size for online updating/management of the tree. It is intended that most
				45	* reverse lookups will be to find the owner(s) of a particular block, or to
				46	* try to recover tree and file data from corrupt primary metadata.
				47	*/
				48
				49	static struct xfs_btree_cur *
				50	xfs_rmapbt_dup_cursor(
				51	struct xfs_btree_cur *cur)
				52	{
				53	return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
				54	cur->bc_private.a.agbp, cur->bc_private.a.agno);
				55	}
				56
				57	STATIC void
				58	xfs_rmapbt_set_root(
				59	struct xfs_btree_cur *cur,
				60	union xfs_btree_ptr *ptr,
				61	int inc)
				62	{
				63	struct xfs_buf *agbp = cur->bc_private.a.agbp;
				64	struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
				65	xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
				66	int btnum = cur->bc_btnum;
				67	struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
				68
				69	ASSERT(ptr->s != 0);
				70
				71	agf->agf_roots[btnum] = ptr->s;
				72	be32_add_cpu(&agf->agf_levels[btnum], inc);
				73	pag->pagf_levels[btnum] += inc;
				74	xfs_perag_put(pag);
				75
				76	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS \| XFS_AGF_LEVELS);
				77	}
				78
				79	STATIC int
				80	xfs_rmapbt_alloc_block(
				81	struct xfs_btree_cur *cur,
				82	union xfs_btree_ptr *start,
				83	union xfs_btree_ptr *new,
				84	int *stat)
				85	{
				86	struct xfs_buf *agbp = cur->bc_private.a.agbp;
				87	struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
				88	int error;
				89	xfs_agblock_t bno;
				90
				91	/* Allocate the new block from the freelist. If we can't, give up. */
				92	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
				93	&bno, 1);
				94	if (error)
				95	return error;
				96
				97	trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
				98	bno, 1);
				99	if (bno == NULLAGBLOCK) {
				100	*stat = 0;
				101	return 0;
				102	}
				103
				104	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
				105	false);
				106
				107	xfs_trans_agbtree_delta(cur->bc_tp, 1);
				108	new->s = cpu_to_be32(bno);
				109	be32_add_cpu(&agf->agf_rmap_blocks, 1);
				110	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
				111
				112	xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno);
				113
				114	*stat = 1;
				115	return 0;
				116	}
				117
				118	STATIC int
				119	xfs_rmapbt_free_block(
				120	struct xfs_btree_cur *cur,
				121	struct xfs_buf *bp)
				122	{
				123	struct xfs_buf *agbp = cur->bc_private.a.agbp;
				124	struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
				125	xfs_agblock_t bno;
				126	int error;
				127
				128	bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
				129	trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
				130	bno, 1);
				131	be32_add_cpu(&agf->agf_rmap_blocks, -1);
				132	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
				133	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
				134	if (error)
				135	return error;
				136
				137	xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
				138	XFS_EXTENT_BUSY_SKIP_DISCARD);
				139	xfs_trans_agbtree_delta(cur->bc_tp, -1);
				140
				141	xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno);
				142
				143	return 0;
				144	}
				145
				146	STATIC int
				147	xfs_rmapbt_get_minrecs(
				148	struct xfs_btree_cur *cur,
				149	int level)
				150	{
				151	return cur->bc_mp->m_rmap_mnr[level != 0];
				152	}
				153
				154	STATIC int
				155	xfs_rmapbt_get_maxrecs(
				156	struct xfs_btree_cur *cur,
				157	int level)
				158	{
				159	return cur->bc_mp->m_rmap_mxr[level != 0];
				160	}
				161
				162	STATIC void
				163	xfs_rmapbt_init_key_from_rec(
				164	union xfs_btree_key *key,
				165	union xfs_btree_rec *rec)
				166	{
				167	key->rmap.rm_startblock = rec->rmap.rm_startblock;
				168	key->rmap.rm_owner = rec->rmap.rm_owner;
				169	key->rmap.rm_offset = rec->rmap.rm_offset;
				170	}
				171
				172	/*
				173	* The high key for a reverse mapping record can be computed by shifting
				174	* the startblock and offset to the highest value that would still map
				175	* to that record. In practice this means that we add blockcount-1 to
				176	* the startblock for all records, and if the record is for a data/attr
				177	* fork mapping, we add blockcount-1 to the offset too.
				178	*/
				179	STATIC void
				180	xfs_rmapbt_init_high_key_from_rec(
				181	union xfs_btree_key *key,
				182	union xfs_btree_rec *rec)
				183	{
				184	uint64_t off;
				185	int adj;
				186
				187	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
				188
				189	key->rmap.rm_startblock = rec->rmap.rm_startblock;
				190	be32_add_cpu(&key->rmap.rm_startblock, adj);
				191	key->rmap.rm_owner = rec->rmap.rm_owner;
				192	key->rmap.rm_offset = rec->rmap.rm_offset;
				193	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) \|\|
				194	XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
				195	return;
				196	off = be64_to_cpu(key->rmap.rm_offset);
				197	off = (XFS_RMAP_OFF(off) + adj) \| (off & ~XFS_RMAP_OFF_MASK);
				198	key->rmap.rm_offset = cpu_to_be64(off);
				199	}
				200
				201	STATIC void
				202	xfs_rmapbt_init_rec_from_cur(
				203	struct xfs_btree_cur *cur,
				204	union xfs_btree_rec *rec)
				205	{
				206	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
				207	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
				208	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
				209	rec->rmap.rm_offset = cpu_to_be64(
				210	xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
				211	}
				212
				213	STATIC void
				214	xfs_rmapbt_init_ptr_from_cur(
				215	struct xfs_btree_cur *cur,
				216	union xfs_btree_ptr *ptr)
				217	{
				218	struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
				219
				220	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
				221
				222	ptr->s = agf->agf_roots[cur->bc_btnum];
				223	}
				224
				225	STATIC int64_t
				226	xfs_rmapbt_key_diff(
				227	struct xfs_btree_cur *cur,
				228	union xfs_btree_key *key)
				229	{
				230	struct xfs_rmap_irec *rec = &cur->bc_rec.r;
				231	struct xfs_rmap_key *kp = &key->rmap;
				232	__u64 x, y;
				233	int64_t d;
				234
				235	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
				236	if (d)
				237	return d;
				238
				239	x = be64_to_cpu(kp->rm_owner);
				240	y = rec->rm_owner;
				241	if (x > y)
				242	return 1;
				243	else if (y > x)
				244	return -1;
				245
				246	x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
				247	y = rec->rm_offset;
				248	if (x > y)
				249	return 1;
				250	else if (y > x)
				251	return -1;
				252	return 0;
				253	}
				254
				255	STATIC int64_t
				256	xfs_rmapbt_diff_two_keys(
				257	struct xfs_btree_cur *cur,
				258	union xfs_btree_key *k1,
				259	union xfs_btree_key *k2)
				260	{
				261	struct xfs_rmap_key *kp1 = &k1->rmap;
				262	struct xfs_rmap_key *kp2 = &k2->rmap;
				263	int64_t d;
				264	__u64 x, y;
				265
				266	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
				267	be32_to_cpu(kp2->rm_startblock);
				268	if (d)
				269	return d;
				270
				271	x = be64_to_cpu(kp1->rm_owner);
				272	y = be64_to_cpu(kp2->rm_owner);
				273	if (x > y)
				274	return 1;
				275	else if (y > x)
				276	return -1;
				277
				278	x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
				279	y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
				280	if (x > y)
				281	return 1;
				282	else if (y > x)
				283	return -1;
				284	return 0;
				285	}
				286
				287	static xfs_failaddr_t
				288	xfs_rmapbt_verify(
				289	struct xfs_buf *bp)
				290	{
				291	struct xfs_mount *mp = bp->b_mount;
				292	struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
				293	struct xfs_perag *pag = bp->b_pag;
				294	xfs_failaddr_t fa;
				295	unsigned int level;
				296
				297	/*
				298	* magic number and level verification
				299	*
				300	* During growfs operations, we can't verify the exact level or owner as
				301	* the perag is not fully initialised and hence not attached to the
				302	* buffer. In this case, check against the maximum tree depth.
				303	*
				304	* Similarly, during log recovery we will have a perag structure
				305	* attached, but the agf information will not yet have been initialised
				306	* from the on disk AGF. Again, we can only check against maximum limits
				307	* in this case.
				308	*/
				309	if (!xfs_verify_magic(bp, block->bb_magic))
				310	return __this_address;
				311
				312	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
				313	return __this_address;
				314	fa = xfs_btree_sblock_v5hdr_verify(bp);
				315	if (fa)
				316	return fa;
				317
				318	level = be16_to_cpu(block->bb_level);
				319	if (pag && pag->pagf_init) {
				320	if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
				321	return __this_address;
				322	} else if (level >= mp->m_rmap_maxlevels)
				323	return __this_address;
				324
				325	return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
				326	}
				327
				328	static void
				329	xfs_rmapbt_read_verify(
				330	struct xfs_buf *bp)
				331	{
				332	xfs_failaddr_t fa;
				333
				334	if (!xfs_btree_sblock_verify_crc(bp))
				335	xfs_verifier_error(bp, -EFSBADCRC, __this_address);
				336	else {
				337	fa = xfs_rmapbt_verify(bp);
				338	if (fa)
				339	xfs_verifier_error(bp, -EFSCORRUPTED, fa);
				340	}
				341
				342	if (bp->b_error)
				343	trace_xfs_btree_corrupt(bp, _RET_IP_);
				344	}
				345
				346	static void
				347	xfs_rmapbt_write_verify(
				348	struct xfs_buf *bp)
				349	{
				350	xfs_failaddr_t fa;
				351
				352	fa = xfs_rmapbt_verify(bp);
				353	if (fa) {
				354	trace_xfs_btree_corrupt(bp, _RET_IP_);
				355	xfs_verifier_error(bp, -EFSCORRUPTED, fa);
				356	return;
				357	}
				358	xfs_btree_sblock_calc_crc(bp);
				359
				360	}
				361
				362	const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
				363	.name = "xfs_rmapbt",
				364	.magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
				365	.verify_read = xfs_rmapbt_read_verify,
				366	.verify_write = xfs_rmapbt_write_verify,
				367	.verify_struct = xfs_rmapbt_verify,
				368	};
				369
				370	STATIC int
				371	xfs_rmapbt_keys_inorder(
				372	struct xfs_btree_cur *cur,
				373	union xfs_btree_key *k1,
				374	union xfs_btree_key *k2)
				375	{
				376	uint32_t x;
				377	uint32_t y;
				378	uint64_t a;
				379	uint64_t b;
				380
				381	x = be32_to_cpu(k1->rmap.rm_startblock);
				382	y = be32_to_cpu(k2->rmap.rm_startblock);
				383	if (x < y)
				384	return 1;
				385	else if (x > y)
				386	return 0;
				387	a = be64_to_cpu(k1->rmap.rm_owner);
				388	b = be64_to_cpu(k2->rmap.rm_owner);
				389	if (a < b)
				390	return 1;
				391	else if (a > b)
				392	return 0;
				393	a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
				394	b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
				395	if (a <= b)
				396	return 1;
				397	return 0;
				398	}
				399
				400	STATIC int
				401	xfs_rmapbt_recs_inorder(
				402	struct xfs_btree_cur *cur,
				403	union xfs_btree_rec *r1,
				404	union xfs_btree_rec *r2)
				405	{
				406	uint32_t x;
				407	uint32_t y;
				408	uint64_t a;
				409	uint64_t b;
				410
				411	x = be32_to_cpu(r1->rmap.rm_startblock);
				412	y = be32_to_cpu(r2->rmap.rm_startblock);
				413	if (x < y)
				414	return 1;
				415	else if (x > y)
				416	return 0;
				417	a = be64_to_cpu(r1->rmap.rm_owner);
				418	b = be64_to_cpu(r2->rmap.rm_owner);
				419	if (a < b)
				420	return 1;
				421	else if (a > b)
				422	return 0;
				423	a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
				424	b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
				425	if (a <= b)
				426	return 1;
				427	return 0;
				428	}
				429
				430	static const struct xfs_btree_ops xfs_rmapbt_ops = {
				431	.rec_len = sizeof(struct xfs_rmap_rec),
				432	.key_len = 2 * sizeof(struct xfs_rmap_key),
				433
				434	.dup_cursor = xfs_rmapbt_dup_cursor,
				435	.set_root = xfs_rmapbt_set_root,
				436	.alloc_block = xfs_rmapbt_alloc_block,
				437	.free_block = xfs_rmapbt_free_block,
				438	.get_minrecs = xfs_rmapbt_get_minrecs,
				439	.get_maxrecs = xfs_rmapbt_get_maxrecs,
				440	.init_key_from_rec = xfs_rmapbt_init_key_from_rec,
				441	.init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
				442	.init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
				443	.init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
				444	.key_diff = xfs_rmapbt_key_diff,
				445	.buf_ops = &xfs_rmapbt_buf_ops,
				446	.diff_two_keys = xfs_rmapbt_diff_two_keys,
				447	.keys_inorder = xfs_rmapbt_keys_inorder,
				448	.recs_inorder = xfs_rmapbt_recs_inorder,
				449	};
				450
				451	/*
				452	* Allocate a new allocation btree cursor.
				453	*/
				454	struct xfs_btree_cur *
				455	xfs_rmapbt_init_cursor(
				456	struct xfs_mount *mp,
				457	struct xfs_trans *tp,
				458	struct xfs_buf *agbp,
				459	xfs_agnumber_t agno)
				460	{
				461	struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
				462	struct xfs_btree_cur *cur;
				463
				464	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
				465	cur->bc_tp = tp;
				466	cur->bc_mp = mp;
				467	/* Overlapping btree; 2 keys per pointer. */
				468	cur->bc_btnum = XFS_BTNUM_RMAP;
				469	cur->bc_flags = XFS_BTREE_CRC_BLOCKS \| XFS_BTREE_OVERLAPPING;
				470	cur->bc_blocklog = mp->m_sb.sb_blocklog;
				471	cur->bc_ops = &xfs_rmapbt_ops;
				472	cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
				473	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
				474
				475	cur->bc_private.a.agbp = agbp;
				476	cur->bc_private.a.agno = agno;
				477
				478	return cur;
				479	}
				480
				481	/*
				482	* Calculate number of records in an rmap btree block.
				483	*/
				484	int
				485	xfs_rmapbt_maxrecs(
				486	int blocklen,
				487	int leaf)
				488	{
				489	blocklen -= XFS_RMAP_BLOCK_LEN;
				490
				491	if (leaf)
				492	return blocklen / sizeof(struct xfs_rmap_rec);
				493	return blocklen /
				494	(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
				495	}
				496
				497	/* Compute the maximum height of an rmap btree. */
				498	void
				499	xfs_rmapbt_compute_maxlevels(
				500	struct xfs_mount *mp)
				501	{
				502	/*
				503	* On a non-reflink filesystem, the maximum number of rmap
				504	* records is the number of blocks in the AG, hence the max
				505	* rmapbt height is log_$maxrecs($agblocks). However, with
				506	* reflink each AG block can have up to 2^32 (per the refcount
				507	* record format) owners, which means that theoretically we
				508	* could face up to 2^64 rmap records.
				509	*
				510	* That effectively means that the max rmapbt height must be
				511	* XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
				512	* blocks to feed the rmapbt long before the rmapbt reaches
				513	* maximum height. The reflink code uses ag_resv_critical to
				514	* disallow reflinking when less than 10% of the per-AG metadata
				515	* block reservation since the fallback is a regular file copy.
				516	*/
				517	if (xfs_sb_version_hasreflink(&mp->m_sb))
				518	mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
				519	else
				520	mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
				521	mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
				522	}
				523
				524	/* Calculate the refcount btree size for some records. */
				525	xfs_extlen_t
				526	xfs_rmapbt_calc_size(
				527	struct xfs_mount *mp,
				528	unsigned long long len)
				529	{
				530	return xfs_btree_calc_size(mp->m_rmap_mnr, len);
				531	}
				532
				533	/*
				534	* Calculate the maximum refcount btree size.
				535	*/
				536	xfs_extlen_t
				537	xfs_rmapbt_max_size(
				538	struct xfs_mount *mp,
				539	xfs_agblock_t agblocks)
				540	{
				541	/* Bail out if we're uninitialized, which can happen in mkfs. */
				542	if (mp->m_rmap_mxr[0] == 0)
				543	return 0;
				544
				545	return xfs_rmapbt_calc_size(mp, agblocks);
				546	}
				547
				548	/*
				549	* Figure out how many blocks to reserve and how many are used by this btree.
				550	*/
				551	int
				552	xfs_rmapbt_calc_reserves(
				553	struct xfs_mount *mp,
				554	struct xfs_trans *tp,
				555	xfs_agnumber_t agno,
				556	xfs_extlen_t *ask,
				557	xfs_extlen_t *used)
				558	{
				559	struct xfs_buf *agbp;
				560	struct xfs_agf *agf;
				561	xfs_agblock_t agblocks;
				562	xfs_extlen_t tree_len;
				563	int error;
				564
				565	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
				566	return 0;
				567
				568	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
				569	if (error)
				570	return error;
				571
				572	agf = XFS_BUF_TO_AGF(agbp);
				573	agblocks = be32_to_cpu(agf->agf_length);
				574	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
				575	xfs_trans_brelse(tp, agbp);
				576
				577	/*
				578	* The log is permanently allocated, so the space it occupies will
				579	* never be available for the kinds of things that would require btree
				580	* expansion. We therefore can pretend the space isn't there.
				581	*/
				582	if (mp->m_sb.sb_logstart &&
				583	XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
				584	agblocks -= mp->m_sb.sb_logblocks;
				585
				586	/* Reserve 1% of the AG or enough for 1 block per record. */
				587	*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
				588	*used += tree_len;
				589
				590	return error;
				591	}