Blame - src/kernel/linux/v4.14/fs/xfs/xfs_reflink.c - T103

blob: 4d37f1b59436ccffcaca2fc9935701bb77d41e92 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2016 Oracle. All Rights Reserved.
				3	*
				4	* Author: Darrick J. Wong <darrick.wong@oracle.com>
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public License
				8	* as published by the Free Software Foundation; either version 2
				9	* of the License, or (at your option) any later version.
				10	*
				11	* This program is distributed in the hope that it would be useful,
				12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				14	* GNU General Public License for more details.
				15	*
				16	* You should have received a copy of the GNU General Public License
				17	* along with this program; if not, write the Free Software Foundation,
				18	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
				19	*/
				20	#include "xfs.h"
				21	#include "xfs_fs.h"
				22	#include "xfs_shared.h"
				23	#include "xfs_format.h"
				24	#include "xfs_log_format.h"
				25	#include "xfs_trans_resv.h"
				26	#include "xfs_mount.h"
				27	#include "xfs_defer.h"
				28	#include "xfs_da_format.h"
				29	#include "xfs_da_btree.h"
				30	#include "xfs_inode.h"
				31	#include "xfs_trans.h"
				32	#include "xfs_inode_item.h"
				33	#include "xfs_bmap.h"
				34	#include "xfs_bmap_util.h"
				35	#include "xfs_error.h"
				36	#include "xfs_dir2.h"
				37	#include "xfs_dir2_priv.h"
				38	#include "xfs_ioctl.h"
				39	#include "xfs_trace.h"
				40	#include "xfs_log.h"
				41	#include "xfs_icache.h"
				42	#include "xfs_pnfs.h"
				43	#include "xfs_btree.h"
				44	#include "xfs_refcount_btree.h"
				45	#include "xfs_refcount.h"
				46	#include "xfs_bmap_btree.h"
				47	#include "xfs_trans_space.h"
				48	#include "xfs_bit.h"
				49	#include "xfs_alloc.h"
				50	#include "xfs_quota_defs.h"
				51	#include "xfs_quota.h"
				52	#include "xfs_btree.h"
				53	#include "xfs_bmap_btree.h"
				54	#include "xfs_reflink.h"
				55	#include "xfs_iomap.h"
				56	#include "xfs_rmap_btree.h"
				57	#include "xfs_sb.h"
				58	#include "xfs_ag_resv.h"
				59
				60	/*
				61	* Copy on Write of Shared Blocks
				62	*
				63	* XFS must preserve "the usual" file semantics even when two files share
				64	* the same physical blocks. This means that a write to one file must not
				65	* alter the blocks in a different file; the way that we'll do that is
				66	* through the use of a copy-on-write mechanism. At a high level, that
				67	* means that when we want to write to a shared block, we allocate a new
				68	* block, write the data to the new block, and if that succeeds we map the
				69	* new block into the file.
				70	*
				71	* XFS provides a "delayed allocation" mechanism that defers the allocation
				72	* of disk blocks to dirty-but-not-yet-mapped file blocks as long as
				73	* possible. This reduces fragmentation by enabling the filesystem to ask
				74	* for bigger chunks less often, which is exactly what we want for CoW.
				75	*
				76	* The delalloc mechanism begins when the kernel wants to make a block
				77	* writable (write_begin or page_mkwrite). If the offset is not mapped, we
				78	* create a delalloc mapping, which is a regular in-core extent, but without
				79	* a real startblock. (For delalloc mappings, the startblock encodes both
				80	* a flag that this is a delalloc mapping, and a worst-case estimate of how
				81	* many blocks might be required to put the mapping into the BMBT.) delalloc
				82	* mappings are a reservation against the free space in the filesystem;
				83	* adjacent mappings can also be combined into fewer larger mappings.
				84	*
				85	* As an optimization, the CoW extent size hint (cowextsz) creates
				86	* outsized aligned delalloc reservations in the hope of landing out of
				87	* order nearby CoW writes in a single extent on disk, thereby reducing
				88	* fragmentation and improving future performance.
				89	*
				90	* D: --RRRRRRSSSRRRRRRRR--- (data fork)
				91	* C: ------DDDDDDD--------- (CoW fork)
				92	*
				93	* When dirty pages are being written out (typically in writepage), the
				94	* delalloc reservations are converted into unwritten mappings by
				95	* allocating blocks and replacing the delalloc mapping with real ones.
				96	* A delalloc mapping can be replaced by several unwritten ones if the
				97	* free space is fragmented.
				98	*
				99	* D: --RRRRRRSSSRRRRRRRR---
				100	* C: ------UUUUUUU---------
				101	*
				102	* We want to adapt the delalloc mechanism for copy-on-write, since the
				103	* write paths are similar. The first two steps (creating the reservation
				104	* and allocating the blocks) are exactly the same as delalloc except that
				105	* the mappings must be stored in a separate CoW fork because we do not want
				106	* to disturb the mapping in the data fork until we're sure that the write
				107	* succeeded. IO completion in this case is the process of removing the old
				108	* mapping from the data fork and moving the new mapping from the CoW fork to
				109	* the data fork. This will be discussed shortly.
				110	*
				111	* For now, unaligned directio writes will be bounced back to the page cache.
				112	* Block-aligned directio writes will use the same mechanism as buffered
				113	* writes.
				114	*
				115	* Just prior to submitting the actual disk write requests, we convert
				116	* the extents representing the range of the file actually being written
				117	* (as opposed to extra pieces created for the cowextsize hint) to real
				118	* extents. This will become important in the next step:
				119	*
				120	* D: --RRRRRRSSSRRRRRRRR---
				121	* C: ------UUrrUUU---------
				122	*
				123	* CoW remapping must be done after the data block write completes,
				124	* because we don't want to destroy the old data fork map until we're sure
				125	* the new block has been written. Since the new mappings are kept in a
				126	* separate fork, we can simply iterate these mappings to find the ones
				127	* that cover the file blocks that we just CoW'd. For each extent, simply
				128	* unmap the corresponding range in the data fork, map the new range into
				129	* the data fork, and remove the extent from the CoW fork. Because of
				130	* the presence of the cowextsize hint, however, we must be careful
				131	* only to remap the blocks that we've actually written out -- we must
				132	* never remap delalloc reservations nor CoW staging blocks that have
				133	* yet to be written. This corresponds exactly to the real extents in
				134	* the CoW fork:
				135	*
				136	* D: --RRRRRRrrSRRRRRRRR---
				137	* C: ------UU--UUU---------
				138	*
				139	* Since the remapping operation can be applied to an arbitrary file
				140	* range, we record the need for the remap step as a flag in the ioend
				141	* instead of declaring a new IO type. This is required for direct io
				142	* because we only have ioend for the whole dio, and we have to be able to
				143	* remember the presence of unwritten blocks and CoW blocks with a single
				144	* ioend structure. Better yet, the more ground we can cover with one
				145	* ioend, the better.
				146	*/
				147
				148	/*
				149	* Given an AG extent, find the lowest-numbered run of shared blocks
				150	* within that range and return the range in fbno/flen. If
				151	* find_end_of_shared is true, return the longest contiguous extent of
				152	* shared blocks. If there are no shared extents, fbno and flen will
				153	* be set to NULLAGBLOCK and 0, respectively.
				154	*/
				155	int
				156	xfs_reflink_find_shared(
				157	struct xfs_mount *mp,
				158	struct xfs_trans *tp,
				159	xfs_agnumber_t agno,
				160	xfs_agblock_t agbno,
				161	xfs_extlen_t aglen,
				162	xfs_agblock_t *fbno,
				163	xfs_extlen_t *flen,
				164	bool find_end_of_shared)
				165	{
				166	struct xfs_buf *agbp;
				167	struct xfs_btree_cur *cur;
				168	int error;
				169
				170	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
				171	if (error)
				172	return error;
				173	if (!agbp)
				174	return -ENOMEM;
				175
				176	cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL);
				177
				178	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
				179	find_end_of_shared);
				180
				181	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
				182
				183	xfs_trans_brelse(tp, agbp);
				184	return error;
				185	}
				186
				187	/*
				188	* Trim the mapping to the next block where there's a change in the
				189	* shared/unshared status. More specifically, this means that we
				190	* find the lowest-numbered extent of shared blocks that coincides with
				191	* the given block mapping. If the shared extent overlaps the start of
				192	* the mapping, trim the mapping to the end of the shared extent. If
				193	* the shared region intersects the mapping, trim the mapping to the
				194	* start of the shared extent. If there are no shared regions that
				195	* overlap, just return the original extent.
				196	*/
				197	int
				198	xfs_reflink_trim_around_shared(
				199	struct xfs_inode *ip,
				200	struct xfs_bmbt_irec *irec,
				201	bool *shared,
				202	bool *trimmed)
				203	{
				204	xfs_agnumber_t agno;
				205	xfs_agblock_t agbno;
				206	xfs_extlen_t aglen;
				207	xfs_agblock_t fbno;
				208	xfs_extlen_t flen;
				209	int error = 0;
				210
				211	/* Holes, unwritten, and delalloc extents cannot be shared */
				212	if (!xfs_is_reflink_inode(ip) \|\| !xfs_bmap_is_real_extent(irec)) {
				213	*shared = false;
				214	return 0;
				215	}
				216
				217	trace_xfs_reflink_trim_around_shared(ip, irec);
				218
				219	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
				220	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
				221	aglen = irec->br_blockcount;
				222
				223	error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
				224	aglen, &fbno, &flen, true);
				225	if (error)
				226	return error;
				227
				228	shared = trimmed = false;
				229	if (fbno == NULLAGBLOCK) {
				230	/* No shared blocks at all. */
				231	return 0;
				232	} else if (fbno == agbno) {
				233	/*
				234	* The start of this extent is shared. Truncate the
				235	* mapping at the end of the shared region so that a
				236	* subsequent iteration starts at the start of the
				237	* unshared region.
				238	*/
				239	irec->br_blockcount = flen;
				240	*shared = true;
				241	if (flen != aglen)
				242	*trimmed = true;
				243	return 0;
				244	} else {
				245	/*
				246	* There's a shared extent midway through this extent.
				247	* Truncate the mapping at the start of the shared
				248	* extent so that a subsequent iteration starts at the
				249	* start of the shared region.
				250	*/
				251	irec->br_blockcount = fbno - agbno;
				252	*trimmed = true;
				253	return 0;
				254	}
				255	}
				256
				257	/*
				258	* Trim the passed in imap to the next shared/unshared extent boundary, and
				259	* if imap->br_startoff points to a shared extent reserve space for it in the
				260	* COW fork. In this case *shared is set to true, else to false.
				261	*
				262	* Note that imap will always contain the block numbers for the existing blocks
				263	* in the data fork, as the upper layers need them for read-modify-write
				264	* operations.
				265	*/
				266	int
				267	xfs_reflink_reserve_cow(
				268	struct xfs_inode *ip,
				269	struct xfs_bmbt_irec *imap,
				270	bool *shared)
				271	{
				272	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				273	struct xfs_bmbt_irec got;
				274	int error = 0;
				275	bool eof = false, trimmed;
				276	xfs_extnum_t idx;
				277
				278	/*
				279	* Search the COW fork extent list first. This serves two purposes:
				280	* first this implement the speculative preallocation using cowextisze,
				281	* so that we also unshared block adjacent to shared blocks instead
				282	* of just the shared blocks themselves. Second the lookup in the
				283	* extent list is generally faster than going out to the shared extent
				284	* tree.
				285	*/
				286
				287	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got))
				288	eof = true;
				289	if (!eof && got.br_startoff <= imap->br_startoff) {
				290	trace_xfs_reflink_cow_found(ip, imap);
				291	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
				292
				293	*shared = true;
				294	return 0;
				295	}
				296
				297	/* Trim the mapping to the nearest shared extent boundary. */
				298	error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
				299	if (error)
				300	return error;
				301
				302	/* Not shared? Just report the (potentially capped) extent. */
				303	if (!*shared)
				304	return 0;
				305
				306	/*
				307	* Fork all the shared blocks from our write offset until the end of
				308	* the extent.
				309	*/
				310	error = xfs_qm_dqattach_locked(ip, 0);
				311	if (error)
				312	return error;
				313
				314	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
				315	imap->br_blockcount, 0, &got, &idx, eof);
				316	if (error == -ENOSPC \|\| error == -EDQUOT)
				317	trace_xfs_reflink_cow_enospc(ip, imap);
				318	if (error)
				319	return error;
				320
				321	trace_xfs_reflink_cow_alloc(ip, &got);
				322	return 0;
				323	}
				324
				325	/* Convert part of an unwritten CoW extent to a real one. */
				326	STATIC int
				327	xfs_reflink_convert_cow_extent(
				328	struct xfs_inode *ip,
				329	struct xfs_bmbt_irec *imap,
				330	xfs_fileoff_t offset_fsb,
				331	xfs_filblks_t count_fsb,
				332	struct xfs_defer_ops *dfops)
				333	{
				334	xfs_fsblock_t first_block = NULLFSBLOCK;
				335	int nimaps = 1;
				336
				337	if (imap->br_state == XFS_EXT_NORM)
				338	return 0;
				339
				340	xfs_trim_extent(imap, offset_fsb, count_fsb);
				341	trace_xfs_reflink_convert_cow(ip, imap);
				342	if (imap->br_blockcount == 0)
				343	return 0;
				344	return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
				345	XFS_BMAPI_COWFORK \| XFS_BMAPI_CONVERT, &first_block,
				346	0, imap, &nimaps, dfops);
				347	}
				348
				349	/* Convert all of the unwritten CoW extents in a file's range to real ones. */
				350	int
				351	xfs_reflink_convert_cow(
				352	struct xfs_inode *ip,
				353	xfs_off_t offset,
				354	xfs_off_t count)
				355	{
				356	struct xfs_bmbt_irec got;
				357	struct xfs_defer_ops dfops;
				358	struct xfs_mount *mp = ip->i_mount;
				359	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				360	xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
				361	xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
				362	xfs_extnum_t idx;
				363	bool found;
				364	int error = 0;
				365
				366	xfs_ilock(ip, XFS_ILOCK_EXCL);
				367
				368	/* Convert all the extents to real from unwritten. */
				369	for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
				370	found && got.br_startoff < end_fsb;
				371	found = xfs_iext_get_extent(ifp, ++idx, &got)) {
				372	error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb,
				373	end_fsb - offset_fsb, &dfops);
				374	if (error)
				375	break;
				376	}
				377
				378	/* Finish up. */
				379	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				380	return error;
				381	}
				382
				383	/* Allocate all CoW reservations covering a range of blocks in a file. */
				384	int
				385	xfs_reflink_allocate_cow(
				386	struct xfs_inode *ip,
				387	struct xfs_bmbt_irec *imap,
				388	bool *shared,
				389	uint *lockmode)
				390	{
				391	struct xfs_mount *mp = ip->i_mount;
				392	xfs_fileoff_t offset_fsb = imap->br_startoff;
				393	xfs_filblks_t count_fsb = imap->br_blockcount;
				394	struct xfs_bmbt_irec got;
				395	struct xfs_defer_ops dfops;
				396	struct xfs_trans *tp = NULL;
				397	xfs_fsblock_t first_block;
				398	int nimaps, error = 0;
				399	bool trimmed;
				400	xfs_filblks_t resaligned;
				401	xfs_extlen_t resblks = 0;
				402	xfs_extnum_t idx;
				403
				404	retry:
				405	ASSERT(xfs_is_reflink_inode(ip));
				406	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL \| XFS_ILOCK_SHARED));
				407
				408	/*
				409	* Even if the extent is not shared we might have a preallocation for
				410	* it in the COW fork. If so use it.
				411	*/
				412	if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &idx, &got) &&
				413	got.br_startoff <= offset_fsb) {
				414	*shared = true;
				415
				416	/* If we have a real allocation in the COW fork we're done. */
				417	if (!isnullstartblock(got.br_startblock)) {
				418	xfs_trim_extent(&got, offset_fsb, count_fsb);
				419	*imap = got;
				420	goto convert;
				421	}
				422
				423	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
				424	} else {
				425	error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
				426	if (error \|\| !*shared)
				427	goto out;
				428	}
				429
				430	if (!tp) {
				431	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
				432	imap->br_blockcount, xfs_get_cowextsz_hint(ip));
				433	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
				434
				435	xfs_iunlock(ip, *lockmode);
				436	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
				437	*lockmode = XFS_ILOCK_EXCL;
				438	xfs_ilock(ip, *lockmode);
				439
				440	if (error)
				441	return error;
				442
				443	error = xfs_qm_dqattach_locked(ip, 0);
				444	if (error)
				445	goto out;
				446	goto retry;
				447	}
				448
				449	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
				450	XFS_QMOPT_RES_REGBLKS);
				451	if (error)
				452	goto out;
				453
				454	xfs_trans_ijoin(tp, ip, 0);
				455
				456	xfs_defer_init(&dfops, &first_block);
				457	nimaps = 1;
				458
				459	/* Allocate the entire reservation as unwritten blocks. */
				460	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
				461	XFS_BMAPI_COWFORK \| XFS_BMAPI_PREALLOC, &first_block,
				462	resblks, imap, &nimaps, &dfops);
				463	if (error)
				464	goto out_bmap_cancel;
				465
				466	/* Finish up. */
				467	error = xfs_defer_finish(&tp, &dfops);
				468	if (error)
				469	goto out_bmap_cancel;
				470
				471	error = xfs_trans_commit(tp);
				472	if (error)
				473	return error;
				474	convert:
				475	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb,
				476	&dfops);
				477	out_bmap_cancel:
				478	xfs_defer_cancel(&dfops);
				479	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
				480	XFS_QMOPT_RES_REGBLKS);
				481	out:
				482	if (tp)
				483	xfs_trans_cancel(tp);
				484	return error;
				485	}
				486
				487	/*
				488	* Find the CoW reservation for a given byte offset of a file.
				489	*/
				490	bool
				491	xfs_reflink_find_cow_mapping(
				492	struct xfs_inode *ip,
				493	xfs_off_t offset,
				494	struct xfs_bmbt_irec *imap)
				495	{
				496	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				497	xfs_fileoff_t offset_fsb;
				498	struct xfs_bmbt_irec got;
				499	xfs_extnum_t idx;
				500
				501	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL \| XFS_ILOCK_SHARED));
				502	ASSERT(xfs_is_reflink_inode(ip));
				503
				504	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
				505	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
				506	return false;
				507	if (got.br_startoff > offset_fsb)
				508	return false;
				509
				510	trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
				511	&got);
				512	*imap = got;
				513	return true;
				514	}
				515
				516	/*
				517	* Trim an extent to end at the next CoW reservation past offset_fsb.
				518	*/
				519	void
				520	xfs_reflink_trim_irec_to_next_cow(
				521	struct xfs_inode *ip,
				522	xfs_fileoff_t offset_fsb,
				523	struct xfs_bmbt_irec *imap)
				524	{
				525	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				526	struct xfs_bmbt_irec got;
				527	xfs_extnum_t idx;
				528
				529	if (!xfs_is_reflink_inode(ip))
				530	return;
				531
				532	/* Find the extent in the CoW fork. */
				533	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
				534	return;
				535
				536	/* This is the extent before; try sliding up one. */
				537	if (got.br_startoff < offset_fsb) {
				538	if (!xfs_iext_get_extent(ifp, idx + 1, &got))
				539	return;
				540	}
				541
				542	if (got.br_startoff >= imap->br_startoff + imap->br_blockcount)
				543	return;
				544
				545	imap->br_blockcount = got.br_startoff - imap->br_startoff;
				546	trace_xfs_reflink_trim_irec(ip, imap);
				547	}
				548
				549	/*
				550	* Cancel CoW reservations for some block range of an inode.
				551	*
				552	* If cancel_real is true this function cancels all COW fork extents for the
				553	* inode; if cancel_real is false, real extents are not cleared.
				554	*/
				555	int
				556	xfs_reflink_cancel_cow_blocks(
				557	struct xfs_inode *ip,
				558	struct xfs_trans **tpp,
				559	xfs_fileoff_t offset_fsb,
				560	xfs_fileoff_t end_fsb,
				561	bool cancel_real)
				562	{
				563	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				564	struct xfs_bmbt_irec got, del;
				565	xfs_extnum_t idx;
				566	xfs_fsblock_t firstfsb;
				567	struct xfs_defer_ops dfops;
				568	int error = 0;
				569
				570	if (!xfs_is_reflink_inode(ip))
				571	return 0;
				572	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
				573	return 0;
				574
				575	while (got.br_startoff < end_fsb) {
				576	del = got;
				577	xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
				578	trace_xfs_reflink_cancel_cow(ip, &del);
				579
				580	if (isnullstartblock(del.br_startblock)) {
				581	error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
				582	&idx, &got, &del);
				583	if (error)
				584	break;
				585	} else if (del.br_state == XFS_EXT_UNWRITTEN \|\| cancel_real) {
				586	xfs_trans_ijoin(*tpp, ip, 0);
				587	xfs_defer_init(&dfops, &firstfsb);
				588
				589	/* Free the CoW orphan record. */
				590	error = xfs_refcount_free_cow_extent(ip->i_mount,
				591	&dfops, del.br_startblock,
				592	del.br_blockcount);
				593	if (error)
				594	break;
				595
				596	xfs_bmap_add_free(ip->i_mount, &dfops,
				597	del.br_startblock, del.br_blockcount,
				598	NULL);
				599
				600	/* Update quota accounting */
				601	xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT,
				602	-(long)del.br_blockcount);
				603
				604	/* Roll the transaction */
				605	xfs_defer_ijoin(&dfops, ip);
				606	error = xfs_defer_finish(tpp, &dfops);
				607	if (error) {
				608	xfs_defer_cancel(&dfops);
				609	break;
				610	}
				611
				612	/* Remove the mapping from the CoW fork. */
				613	xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
				614	}
				615
				616	if (!xfs_iext_get_extent(ifp, ++idx, &got))
				617	break;
				618	}
				619
				620	/* clear tag if cow fork is emptied */
				621	if (!ifp->if_bytes)
				622	xfs_inode_clear_cowblocks_tag(ip);
				623
				624	return error;
				625	}
				626
				627	/*
				628	* Cancel CoW reservations for some byte range of an inode.
				629	*
				630	* If cancel_real is true this function cancels all COW fork extents for the
				631	* inode; if cancel_real is false, real extents are not cleared.
				632	*/
				633	int
				634	xfs_reflink_cancel_cow_range(
				635	struct xfs_inode *ip,
				636	xfs_off_t offset,
				637	xfs_off_t count,
				638	bool cancel_real)
				639	{
				640	struct xfs_trans *tp;
				641	xfs_fileoff_t offset_fsb;
				642	xfs_fileoff_t end_fsb;
				643	int error;
				644
				645	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
				646	ASSERT(xfs_is_reflink_inode(ip));
				647
				648	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
				649	if (count == NULLFILEOFF)
				650	end_fsb = NULLFILEOFF;
				651	else
				652	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
				653
				654	/* Start a rolling transaction to remove the mappings */
				655	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
				656	0, 0, 0, &tp);
				657	if (error)
				658	goto out;
				659
				660	xfs_ilock(ip, XFS_ILOCK_EXCL);
				661	xfs_trans_ijoin(tp, ip, 0);
				662
				663	/* Scrape out the old CoW reservations */
				664	error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
				665	cancel_real);
				666	if (error)
				667	goto out_cancel;
				668
				669	error = xfs_trans_commit(tp);
				670
				671	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				672	return error;
				673
				674	out_cancel:
				675	xfs_trans_cancel(tp);
				676	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				677	out:
				678	trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
				679	return error;
				680	}
				681
				682	/*
				683	* Remap parts of a file's data fork after a successful CoW.
				684	*/
				685	int
				686	xfs_reflink_end_cow(
				687	struct xfs_inode *ip,
				688	xfs_off_t offset,
				689	xfs_off_t count)
				690	{
				691	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				692	struct xfs_bmbt_irec got, del;
				693	struct xfs_trans *tp;
				694	xfs_fileoff_t offset_fsb;
				695	xfs_fileoff_t end_fsb;
				696	xfs_fsblock_t firstfsb;
				697	struct xfs_defer_ops dfops;
				698	int error;
				699	unsigned int resblks;
				700	xfs_filblks_t rlen;
				701	xfs_extnum_t idx;
				702
				703	trace_xfs_reflink_end_cow(ip, offset, count);
				704
				705	/* No COW extents? That's easy! */
				706	if (ifp->if_bytes == 0)
				707	return 0;
				708
				709	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
				710	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
				711
				712	/*
				713	* Start a rolling transaction to switch the mappings. We're
				714	* unlikely ever to have to remap 16T worth of single-block
				715	* extents, so just cap the worst case extent count to 2^32-1.
				716	* Stick a warning in just in case, and avoid 64-bit division.
				717	*/
				718	BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
				719	if (end_fsb - offset_fsb > UINT_MAX) {
				720	error = -EFSCORRUPTED;
				721	xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
				722	ASSERT(0);
				723	goto out;
				724	}
				725	resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
				726	(unsigned int)(end_fsb - offset_fsb),
				727	XFS_DATA_FORK);
				728	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
				729	resblks, 0, 0, &tp);
				730	if (error)
				731	goto out;
				732
				733	xfs_ilock(ip, XFS_ILOCK_EXCL);
				734	xfs_trans_ijoin(tp, ip, 0);
				735
				736	/* If there is a hole at end_fsb - 1 go to the previous extent */
				737	if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) \|\|
				738	got.br_startoff > end_fsb) {
				739	/*
				740	* In case of racing, overlapping AIO writes no COW extents
				741	* might be left by the time I/O completes for the loser of
				742	* the race. In that case we are done.
				743	*/
				744	if (idx <= 0)
				745	goto out_cancel;
				746	xfs_iext_get_extent(ifp, --idx, &got);
				747	}
				748
				749	/* Walk backwards until we're out of the I/O range... */
				750	while (got.br_startoff + got.br_blockcount > offset_fsb) {
				751	del = got;
				752	xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
				753
				754	/* Extent delete may have bumped idx forward */
				755	if (!del.br_blockcount) {
				756	idx--;
				757	goto next_extent;
				758	}
				759
				760	ASSERT(!isnullstartblock(got.br_startblock));
				761
				762	/*
				763	* Don't remap unwritten extents; these are
				764	* speculatively preallocated CoW extents that have been
				765	* allocated but have not yet been involved in a write.
				766	*/
				767	if (got.br_state == XFS_EXT_UNWRITTEN) {
				768	idx--;
				769	goto next_extent;
				770	}
				771
				772	/* Unmap the old blocks in the data fork. */
				773	xfs_defer_init(&dfops, &firstfsb);
				774	rlen = del.br_blockcount;
				775	error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1,
				776	&firstfsb, &dfops);
				777	if (error)
				778	goto out_defer;
				779
				780	/* Trim the extent to whatever got unmapped. */
				781	if (rlen) {
				782	xfs_trim_extent(&del, del.br_startoff + rlen,
				783	del.br_blockcount - rlen);
				784	}
				785	trace_xfs_reflink_cow_remap(ip, &del);
				786
				787	/* Free the CoW orphan record. */
				788	error = xfs_refcount_free_cow_extent(tp->t_mountp, &dfops,
				789	del.br_startblock, del.br_blockcount);
				790	if (error)
				791	goto out_defer;
				792
				793	/* Map the new blocks into the data fork. */
				794	error = xfs_bmap_map_extent(tp->t_mountp, &dfops, ip, &del);
				795	if (error)
				796	goto out_defer;
				797
				798	/* Remove the mapping from the CoW fork. */
				799	xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
				800
				801	xfs_defer_ijoin(&dfops, ip);
				802	error = xfs_defer_finish(&tp, &dfops);
				803	if (error)
				804	goto out_defer;
				805	next_extent:
				806	if (!xfs_iext_get_extent(ifp, idx, &got))
				807	break;
				808	}
				809
				810	error = xfs_trans_commit(tp);
				811	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				812	if (error)
				813	goto out;
				814	return 0;
				815
				816	out_defer:
				817	xfs_defer_cancel(&dfops);
				818	out_cancel:
				819	xfs_trans_cancel(tp);
				820	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				821	out:
				822	trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
				823	return error;
				824	}
				825
				826	/*
				827	* Free leftover CoW reservations that didn't get cleaned out.
				828	*/
				829	int
				830	xfs_reflink_recover_cow(
				831	struct xfs_mount *mp)
				832	{
				833	xfs_agnumber_t agno;
				834	int error = 0;
				835
				836	if (!xfs_sb_version_hasreflink(&mp->m_sb))
				837	return 0;
				838
				839	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
				840	error = xfs_refcount_recover_cow_leftovers(mp, agno);
				841	if (error)
				842	break;
				843	}
				844
				845	return error;
				846	}
				847
				848	/*
				849	* Reflinking (Block) Ranges of Two Files Together
				850	*
				851	* First, ensure that the reflink flag is set on both inodes. The flag is an
				852	* optimization to avoid unnecessary refcount btree lookups in the write path.
				853	*
				854	* Now we can iteratively remap the range of extents (and holes) in src to the
				855	* corresponding ranges in dest. Let drange and srange denote the ranges of
				856	* logical blocks in dest and src touched by the reflink operation.
				857	*
				858	* While the length of drange is greater than zero,
				859	* - Read src's bmbt at the start of srange ("imap")
				860	* - If imap doesn't exist, make imap appear to start at the end of srange
				861	* with zero length.
				862	* - If imap starts before srange, advance imap to start at srange.
				863	* - If imap goes beyond srange, truncate imap to end at the end of srange.
				864	* - Punch (imap start - srange start + imap len) blocks from dest at
				865	* offset (drange start).
				866	* - If imap points to a real range of pblks,
				867	* > Increase the refcount of the imap's pblks
				868	* > Map imap's pblks into dest at the offset
				869	* (drange start + imap start - srange start)
				870	* - Advance drange and srange by (imap start - srange start + imap len)
				871	*
				872	* Finally, if the reflink made dest longer, update both the in-core and
				873	* on-disk file sizes.
				874	*
				875	* ASCII Art Demonstration:
				876	*
				877	* Let's say we want to reflink this source file:
				878	*
				879	* ----SSSSSSS-SSSSS----SSSSSS (src file)
				880	*   <-------------------->
				881	*
				882	* into this destination file:
				883	*
				884	* --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
				885	*        <-------------------->
				886	* '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
				887	* Observe that the range has different logical offsets in either file.
				888	*
				889	* Consider that the first extent in the source file doesn't line up with our
				890	* reflink range. Unmapping and remapping are separate operations, so we can
				891	* unmap more blocks from the destination file than we remap.
				892	*
				893	* ----SSSSSSS-SSSSS----SSSSSS
				894	* <------->
				895	* --DDDDD---------DDDDD--DDD
				896	* <------->
				897	*
				898	* Now remap the source extent into the destination file:
				899	*
				900	* ----SSSSSSS-SSSSS----SSSSSS
				901	* <------->
				902	* --DDDDD--SSSSSSSDDDDD--DDD
				903	* <------->
				904	*
				905	* Do likewise with the second hole and extent in our range. Holes in the
				906	* unmap range don't affect our operation.
				907	*
				908	* ----SSSSSSS-SSSSS----SSSSSS
				909	* <---->
				910	* --DDDDD--SSSSSSS-SSSSS-DDD
				911	* <---->
				912	*
				913	* Finally, unmap and remap part of the third extent. This will increase the
				914	* size of the destination file.
				915	*
				916	* ----SSSSSSS-SSSSS----SSSSSS
				917	* <----->
				918	* --DDDDD--SSSSSSS-SSSSS----SSS
				919	* <----->
				920	*
				921	* Once we update the destination file's i_size, we're done.
				922	*/
				923
				924	/*
				925	* Ensure the reflink bit is set in both inodes.
				926	*/
				927	STATIC int
				928	xfs_reflink_set_inode_flag(
				929	struct xfs_inode *src,
				930	struct xfs_inode *dest)
				931	{
				932	struct xfs_mount *mp = src->i_mount;
				933	int error;
				934	struct xfs_trans *tp;
				935
				936	if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
				937	return 0;
				938
				939	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
				940	if (error)
				941	goto out_error;
				942
				943	/* Lock both files against IO */
				944	if (src->i_ino == dest->i_ino)
				945	xfs_ilock(src, XFS_ILOCK_EXCL);
				946	else
				947	xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL);
				948
				949	if (!xfs_is_reflink_inode(src)) {
				950	trace_xfs_reflink_set_inode_flag(src);
				951	xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
				952	src->i_d.di_flags2 \|= XFS_DIFLAG2_REFLINK;
				953	xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
				954	xfs_ifork_init_cow(src);
				955	} else
				956	xfs_iunlock(src, XFS_ILOCK_EXCL);
				957
				958	if (src->i_ino == dest->i_ino)
				959	goto commit_flags;
				960
				961	if (!xfs_is_reflink_inode(dest)) {
				962	trace_xfs_reflink_set_inode_flag(dest);
				963	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
				964	dest->i_d.di_flags2 \|= XFS_DIFLAG2_REFLINK;
				965	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
				966	xfs_ifork_init_cow(dest);
				967	} else
				968	xfs_iunlock(dest, XFS_ILOCK_EXCL);
				969
				970	commit_flags:
				971	error = xfs_trans_commit(tp);
				972	if (error)
				973	goto out_error;
				974	return error;
				975
				976	out_error:
				977	trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
				978	return error;
				979	}
				980
				981	/*
				982	* Update destination inode size & cowextsize hint, if necessary.
				983	*/
				984	STATIC int
				985	xfs_reflink_update_dest(
				986	struct xfs_inode *dest,
				987	xfs_off_t newlen,
				988	xfs_extlen_t cowextsize,
				989	bool is_dedupe)
				990	{
				991	struct xfs_mount *mp = dest->i_mount;
				992	struct xfs_trans *tp;
				993	int error;
				994
				995	if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
				996	return 0;
				997
				998	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
				999	if (error)
				1000	goto out_error;
				1001
				1002	xfs_ilock(dest, XFS_ILOCK_EXCL);
				1003	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
				1004
				1005	if (newlen > i_size_read(VFS_I(dest))) {
				1006	trace_xfs_reflink_update_inode_size(dest, newlen);
				1007	i_size_write(VFS_I(dest), newlen);
				1008	dest->i_d.di_size = newlen;
				1009	}
				1010
				1011	if (cowextsize) {
				1012	dest->i_d.di_cowextsize = cowextsize;
				1013	dest->i_d.di_flags2 \|= XFS_DIFLAG2_COWEXTSIZE;
				1014	}
				1015
				1016	if (!is_dedupe) {
				1017	xfs_trans_ichgtime(tp, dest,
				1018	XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				1019	}
				1020	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
				1021
				1022	error = xfs_trans_commit(tp);
				1023	if (error)
				1024	goto out_error;
				1025	return error;
				1026
				1027	out_error:
				1028	trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
				1029	return error;
				1030	}
				1031
				1032	/*
				1033	* Do we have enough reserve in this AG to handle a reflink? The refcount
				1034	* btree already reserved all the space it needs, but the rmap btree can grow
				1035	* infinitely, so we won't allow more reflinks when the AG is down to the
				1036	* btree reserves.
				1037	*/
				1038	static int
				1039	xfs_reflink_ag_has_free_space(
				1040	struct xfs_mount *mp,
				1041	xfs_agnumber_t agno)
				1042	{
				1043	struct xfs_perag *pag;
				1044	int error = 0;
				1045
				1046	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
				1047	return 0;
				1048
				1049	pag = xfs_perag_get(mp, agno);
				1050	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) \|\|
				1051	xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
				1052	error = -ENOSPC;
				1053	xfs_perag_put(pag);
				1054	return error;
				1055	}
				1056
				1057	/*
				1058	* Unmap a range of blocks from a file, then map other blocks into the hole.
				1059	* The range to unmap is (destoff : irec->br_startoff + irec->br_blockcount).
				1060	* The extent irec is mapped into dest at irec->br_startoff.
					*
					* ip:        inode whose data fork is punched and remapped
					* irec:      extent to map in; br_startoff is already expressed as an
					*            offset in ip, and a non-real extent only causes the punch
					* destoff:   first file block of ip to unmap
					* new_isize: upper bound (in bytes) for any i_size increase done here
					*
					* Returns 0 on success or a negative errno; every failure is traced.
				1061	*/
				1062	STATIC int
				1063	xfs_reflink_remap_extent(
				1064	struct xfs_inode *ip,
				1065	struct xfs_bmbt_irec *irec,
				1066	xfs_fileoff_t destoff,
				1067	xfs_off_t new_isize)
				1068	{
				1069	struct xfs_mount *mp = ip->i_mount;
				1070	bool real_extent = xfs_bmap_is_real_extent(irec);
				1071	struct xfs_trans *tp;
				1072	xfs_fsblock_t firstfsb;
				1073	unsigned int resblks;
				1074	struct xfs_defer_ops dfops;
				1075	struct xfs_bmbt_irec uirec;
				1076	xfs_filblks_t rlen;
				1077	xfs_filblks_t unmap_len;
				1078	xfs_off_t newlen;
				1079	int64_t qres;
				1080	int error;
				1081
					/* Punch from destoff through the last block covered by irec. */
				1082	unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
				1083	trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
				1084
				1085	/* No reflinking if we're low on space */
				1086	if (real_extent) {
				1087	error = xfs_reflink_ag_has_free_space(mp,
				1088	XFS_FSB_TO_AGNO(mp, irec->br_startblock));
				1089	if (error)
				1090	goto out;
				1091	}
				1092
				1093	/* Start a rolling transaction to switch the mappings */
				1094	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
				1095	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
				1096	if (error)
				1097	goto out;
				1098
					/*
					* The inode is joined with lock flags 0; this function releases the
					* ILOCK itself on every exit path after this point.
					*/
				1099	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1100	xfs_trans_ijoin(tp, ip, 0);
				1101
				1102	/*
				1103	* Reserve quota for this operation. We don't know if the first unmap
				1104	* in the dest file will cause a bmap btree split, so we always reserve
				1105	* at least enough blocks for that split. If the extent being mapped
				1106	* in is written, we need to reserve quota for that too.
				1107	*/
				1108	qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
				1109	if (real_extent)
				1110	qres += irec->br_blockcount;
				1111	error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
				1112	XFS_QMOPT_RES_REGBLKS);
				1113	if (error)
				1114	goto out_cancel;
				1115
				1116	trace_xfs_reflink_remap(ip, irec->br_startoff,
				1117	irec->br_blockcount, irec->br_startblock);
				1118
				1119	/* Unmap the old blocks in the data fork. */
				1120	rlen = unmap_len;
				1121	while (rlen) {
					/* Each pass unmaps at most one extent (nexts == 1), then remaps. */
				1122	xfs_defer_init(&dfops, &firstfsb);
				1123	error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1,
				1124	&firstfsb, &dfops);
				1125	if (error)
				1126	goto out_defer;
				1127
				1128	/*
				1129	* Trim the extent to whatever got unmapped.
				1130	* Remember, bunmapi works backwards.
				1131	*/
				1132	uirec.br_startblock = irec->br_startblock + rlen;
				1133	uirec.br_startoff = irec->br_startoff + rlen;
				1134	uirec.br_blockcount = unmap_len - rlen;
				1135	uirec.br_state = irec->br_state;
					/* What is still left to unmap becomes the baseline for the next pass. */
				1136	unmap_len = rlen;
				1137
				1138	/* If this isn't a real mapping, we're done. */
				1139	if (!real_extent \|\| uirec.br_blockcount == 0)
				1140	goto next_extent;
				1141
				1142	trace_xfs_reflink_remap(ip, uirec.br_startoff,
				1143	uirec.br_blockcount, uirec.br_startblock);
				1144
				1145	/* Update the refcount tree */
				1146	error = xfs_refcount_increase_extent(mp, &dfops, &uirec);
				1147	if (error)
				1148	goto out_defer;
				1149
				1150	/* Map the new blocks into the data fork. */
				1151	error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec);
				1152	if (error)
				1153	goto out_defer;
				1154
				1155	/* Update quota accounting. */
				1156	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
				1157	uirec.br_blockcount);
				1158
				1159	/* Update dest isize if needed. */
				1160	newlen = XFS_FSB_TO_B(mp,
				1161	uirec.br_startoff + uirec.br_blockcount);
					/* Never grow the file past the size the caller asked for. */
				1162	newlen = min_t(xfs_off_t, newlen, new_isize);
				1163	if (newlen > i_size_read(VFS_I(ip))) {
				1164	trace_xfs_reflink_update_inode_size(ip, newlen);
				1165	i_size_write(VFS_I(ip), newlen);
				1166	ip->i_d.di_size = newlen;
				1167	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				1168	}
				1169
				1170	next_extent:
				1171	/* Process all the deferred stuff. */
				1172	xfs_defer_ijoin(&dfops, ip);
				1173	error = xfs_defer_finish(&tp, &dfops);
				1174	if (error)
				1175	goto out_defer;
				1176	}
				1177
				1178	error = xfs_trans_commit(tp);
				1179	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1180	if (error)
				1181	goto out;
				1182	return 0;
				1183
					/* Error unwinding: drop deferred ops, cancel the transaction, unlock. */
				1184	out_defer:
				1185	xfs_defer_cancel(&dfops);
				1186	out_cancel:
				1187	xfs_trans_cancel(tp);
				1188	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1189	out:
				1190	trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
				1191	return error;
				1192	}
				1193
				1194	/*
				1195	* Iteratively remap one file's extents (and holes) to another's.
				1196	*/
				1197	STATIC int
				1198	xfs_reflink_remap_blocks(
				1199	struct xfs_inode *src,
				1200	xfs_fileoff_t srcoff,
				1201	struct xfs_inode *dest,
				1202	xfs_fileoff_t destoff,
				1203	xfs_filblks_t len,
				1204	xfs_off_t new_isize)
				1205	{
				1206	struct xfs_bmbt_irec imap;
				1207	int nimaps;
				1208	int error = 0;
				1209	xfs_filblks_t range_len;
				1210
				1211	/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
				1212	while (len) {
				1213	trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
				1214	dest, destoff);
				1215	/* Read extent from the source file */
				1216	nimaps = 1;
				1217	xfs_ilock(src, XFS_ILOCK_EXCL);
				1218	error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
				1219	xfs_iunlock(src, XFS_ILOCK_EXCL);
				1220	if (error)
				1221	goto err;
				1222	ASSERT(nimaps == 1);
				1223
				1224	trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
				1225	&imap);
				1226
				1227	/* Translate imap into the destination file. */
				1228	range_len = imap.br_startoff + imap.br_blockcount - srcoff;
				1229	imap.br_startoff += destoff - srcoff;
				1230
				1231	/* Clear dest from destoff to the end of imap and map it in. */
				1232	error = xfs_reflink_remap_extent(dest, &imap, destoff,
				1233	new_isize);
				1234	if (error)
				1235	goto err;
				1236
				1237	if (fatal_signal_pending(current)) {
				1238	error = -EINTR;
				1239	goto err;
				1240	}
				1241
				1242	/* Advance drange/srange */
				1243	srcoff += range_len;
				1244	destoff += range_len;
				1245	len -= range_len;
				1246	}
				1247
				1248	return 0;
				1249
				1250	err:
				1251	trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
				1252	return error;
				1253	}
				1254
				1255	/*
				1256	* Link a range of blocks from one file to another.
				1257	*/
				1258	int
				1259	xfs_reflink_remap_range(
				1260	struct file *file_in,
				1261	loff_t pos_in,
				1262	struct file *file_out,
				1263	loff_t pos_out,
				1264	u64 len,
				1265	bool is_dedupe)
				1266	{
				1267	struct inode *inode_in = file_inode(file_in);
				1268	struct xfs_inode *src = XFS_I(inode_in);
				1269	struct inode *inode_out = file_inode(file_out);
				1270	struct xfs_inode *dest = XFS_I(inode_out);
				1271	struct xfs_mount *mp = src->i_mount;
				1272	bool same_inode = (inode_in == inode_out);
				1273	xfs_fileoff_t sfsbno, dfsbno;
				1274	xfs_filblks_t fsblen;
				1275	xfs_extlen_t cowextsize;
				1276	ssize_t ret;
				1277
				1278	if (!xfs_sb_version_hasreflink(&mp->m_sb))
				1279	return -EOPNOTSUPP;
				1280
				1281	if (XFS_FORCED_SHUTDOWN(mp))
				1282	return -EIO;
				1283
				1284	/* Lock both files against IO */
				1285	lock_two_nondirectories(inode_in, inode_out);
				1286	if (same_inode)
				1287	xfs_ilock(src, XFS_MMAPLOCK_EXCL);
				1288	else
				1289	xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
				1290
				1291	/* Check file eligibility and prepare for block sharing. */
				1292	ret = -EINVAL;
				1293	/* Don't reflink realtime inodes */
				1294	if (XFS_IS_REALTIME_INODE(src) \|\| XFS_IS_REALTIME_INODE(dest))
				1295	goto out_unlock;
				1296
				1297	/* Don't share DAX file data for now. */
				1298	if (IS_DAX(inode_in) \|\| IS_DAX(inode_out))
				1299	goto out_unlock;
				1300
				1301	ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
				1302	&len, is_dedupe);
				1303	if (ret <= 0)
				1304	goto out_unlock;
				1305
				1306	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
				1307
				1308	/* Set flags and remap blocks. */
				1309	ret = xfs_reflink_set_inode_flag(src, dest);
				1310	if (ret)
				1311	goto out_unlock;
				1312
				1313	dfsbno = XFS_B_TO_FSBT(mp, pos_out);
				1314	sfsbno = XFS_B_TO_FSBT(mp, pos_in);
				1315	fsblen = XFS_B_TO_FSB(mp, len);
				1316	ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
				1317	pos_out + len);
				1318	if (ret)
				1319	goto out_unlock;
				1320
				1321	/* Zap any page cache for the destination file's range. */
				1322	truncate_inode_pages_range(&inode_out->i_data, pos_out,
				1323	PAGE_ALIGN(pos_out + len) - 1);
				1324
				1325	/*
				1326	* Carry the cowextsize hint from src to dest if we're sharing the
				1327	* entire source file to the entire destination file, the source file
				1328	* has a cowextsize hint, and the destination file does not.
				1329	*/
				1330	cowextsize = 0;
				1331	if (pos_in == 0 && len == i_size_read(inode_in) &&
				1332	(src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
				1333	pos_out == 0 && len >= i_size_read(inode_out) &&
				1334	!(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
				1335	cowextsize = src->i_d.di_cowextsize;
				1336
				1337	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
				1338	is_dedupe);
				1339
				1340	out_unlock:
				1341	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
				1342	if (!same_inode)
				1343	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
				1344	unlock_two_nondirectories(inode_in, inode_out);
				1345	if (ret)
				1346	trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
				1347	return ret;
				1348	}
				1349
				1350	/*
				1351	* The user wants to preemptively CoW all shared blocks in this file,
				1352	* which enables us to turn off the reflink flag. Iterate all
				1353	* extents which are not prealloc/delalloc to see which ranges are
				1354	* mentioned in the refcount tree, then read those blocks into the
				1355	* pagecache, dirty them, fsync them back out, and then we can update
				1356	* the inode flag. What happens if we run out of memory? :)
					*
					* ip:    inode to scan; the visible caller, xfs_reflink_unshare(), holds
					*        XFS_ILOCK_EXCL on entry, and the lock is dropped and retaken
					*        around each call to iomap_file_dirty()
					* fbno:  first file block to examine
					* end:   file block at which the scan stops (exclusive)
					* isize: file size in bytes; dirtied ranges are clamped to it
					*
					* Returns 0 or the first negative errno from a callee.
				1357	*/
				1358	STATIC int
				1359	xfs_reflink_dirty_extents(
				1360	struct xfs_inode *ip,
				1361	xfs_fileoff_t fbno,
				1362	xfs_filblks_t end,
				1363	xfs_off_t isize)
				1364	{
				1365	struct xfs_mount *mp = ip->i_mount;
				1366	xfs_agnumber_t agno;
				1367	xfs_agblock_t agbno;
				1368	xfs_extlen_t aglen;
				1369	xfs_agblock_t rbno;
				1370	xfs_extlen_t rlen;
				1371	xfs_off_t fpos;
				1372	xfs_off_t flen;
					/* map[0] is the mapping as read; map[1] is a cursor consumed below. */
				1373	struct xfs_bmbt_irec map[2];
				1374	int nmaps;
				1375	int error = 0;
				1376
				1377	while (end - fbno > 0) {
				1378	nmaps = 1;
				1379	/*
				1380	* Look for extents in the file. Skip holes, delalloc, or
				1381	* unwritten extents; they can't be reflinked.
				1382	*/
				1383	error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
				1384	if (error)
				1385	goto out;
				1386	if (nmaps == 0)
				1387	break;
				1388	if (!xfs_bmap_is_real_extent(&map[0]))
				1389	goto next;
				1390
				1391	map[1] = map[0];
					/* Walk the mapping, one shared range at a time. */
				1392	while (map[1].br_blockcount) {
				1393	agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
				1394	agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
				1395	aglen = map[1].br_blockcount;
				1396
				1397	error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
				1398	aglen, &rbno, &rlen, true);
				1399	if (error)
				1400	goto out;
					/* NOTE(review): presumably means no shared blocks remain -- confirm. */
				1401	if (rbno == NULLAGBLOCK)
				1402	break;
				1403
				1404	/* Dirty the pages */
				1405	xfs_iunlock(ip, XFS_ILOCK_EXCL);
					/* Byte position of the shared range: skip the unshared lead-in. */
				1406	fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
				1407	(rbno - agbno));
				1408	flen = XFS_FSB_TO_B(mp, rlen);
					/* Do not dirty anything past the file size passed in. */
				1409	if (fpos + flen > isize)
				1410	flen = isize - fpos;
				1411	error = iomap_file_dirty(VFS_I(ip), fpos, flen,
				1412	&xfs_iomap_ops);
				1413	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1414	if (error)
				1415	goto out;
				1416
					/* Step the cursor past the lead-in and the range just dirtied. */
				1417	map[1].br_blockcount -= (rbno - agbno + rlen);
				1418	map[1].br_startoff += (rbno - agbno + rlen);
				1419	map[1].br_startblock += (rbno - agbno + rlen);
				1420	}
				1421
				1422	next:
					/* Continue the scan right after the mapping just examined. */
				1423	fbno = map[0].br_startoff + map[0].br_blockcount;
				1424	}
				1425	out:
				1426	return error;
				1427	}
				1428
				1429	/* Does this inode need the reflink flag? */
				1430	int
				1431	xfs_reflink_inode_has_shared_extents(
				1432	struct xfs_trans *tp,
				1433	struct xfs_inode *ip,
				1434	bool *has_shared)
				1435	{
				1436	struct xfs_bmbt_irec got;
				1437	struct xfs_mount *mp = ip->i_mount;
				1438	struct xfs_ifork *ifp;
				1439	xfs_agnumber_t agno;
				1440	xfs_agblock_t agbno;
				1441	xfs_extlen_t aglen;
				1442	xfs_agblock_t rbno;
				1443	xfs_extlen_t rlen;
				1444	xfs_extnum_t idx;
				1445	bool found;
				1446	int error;
				1447
				1448	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
				1449	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
				1450	error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
				1451	if (error)
				1452	return error;
				1453	}
				1454
				1455	*has_shared = false;
				1456	found = xfs_iext_lookup_extent(ip, ifp, 0, &idx, &got);
				1457	while (found) {
				1458	if (isnullstartblock(got.br_startblock) \|\|
				1459	got.br_state != XFS_EXT_NORM)
				1460	goto next;
				1461	agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
				1462	agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
				1463	aglen = got.br_blockcount;
				1464
				1465	error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
				1466	&rbno, &rlen, false);
				1467	if (error)
				1468	return error;
				1469	/* Is there still a shared block here? */
				1470	if (rbno != NULLAGBLOCK) {
				1471	*has_shared = true;
				1472	return 0;
				1473	}
				1474	next:
				1475	found = xfs_iext_get_extent(ifp, ++idx, &got);
				1476	}
				1477
				1478	return 0;
				1479	}
				1480
				1481	/* Clear the inode reflink flag if there are no shared extents. */
				1482	int
				1483	xfs_reflink_clear_inode_flag(
				1484	struct xfs_inode *ip,
				1485	struct xfs_trans **tpp)
				1486	{
				1487	bool needs_flag;
				1488	int error = 0;
				1489
				1490	ASSERT(xfs_is_reflink_inode(ip));
				1491
				1492	error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
				1493	if (error \|\| needs_flag)
				1494	return error;
				1495
				1496	/*
				1497	* We didn't find any shared blocks so turn off the reflink flag.
				1498	* First, get rid of any leftover CoW mappings.
				1499	*/
				1500	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
				1501	if (error)
				1502	return error;
				1503
				1504	/* Clear the inode flag. */
				1505	trace_xfs_reflink_unset_inode_flag(ip);
				1506	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
				1507	xfs_inode_clear_cowblocks_tag(ip);
				1508	xfs_trans_ijoin(*tpp, ip, 0);
				1509	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
				1510
				1511	return error;
				1512	}
				1513
				1514	/*
				1515	* Clear the inode reflink flag if there are no shared extents and the size
				1516	* hasn't changed.
				1517	*/
				1518	STATIC int
				1519	xfs_reflink_try_clear_inode_flag(
				1520	struct xfs_inode *ip)
				1521	{
				1522	struct xfs_mount *mp = ip->i_mount;
				1523	struct xfs_trans *tp;
				1524	int error = 0;
				1525
				1526	/* Start a rolling transaction to remove the mappings */
				1527	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
				1528	if (error)
				1529	return error;
				1530
				1531	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1532	xfs_trans_ijoin(tp, ip, 0);
				1533
				1534	error = xfs_reflink_clear_inode_flag(ip, &tp);
				1535	if (error)
				1536	goto cancel;
				1537
				1538	error = xfs_trans_commit(tp);
				1539	if (error)
				1540	goto out;
				1541
				1542	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1543	return 0;
				1544	cancel:
				1545	xfs_trans_cancel(tp);
				1546	out:
				1547	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1548	return error;
				1549	}
				1550
				1551	/*
				1552	* Pre-COW all shared blocks within a given byte range of a file and turn off
				1553	* the reflink flag if we unshare all of the file's blocks.
				1554	*/
				1555	int
				1556	xfs_reflink_unshare(
				1557	struct xfs_inode *ip,
				1558	xfs_off_t offset,
				1559	xfs_off_t len)
				1560	{
				1561	struct xfs_mount *mp = ip->i_mount;
				1562	xfs_fileoff_t fbno;
				1563	xfs_filblks_t end;
				1564	xfs_off_t isize;
				1565	int error;
				1566
				1567	if (!xfs_is_reflink_inode(ip))
				1568	return 0;
				1569
				1570	trace_xfs_reflink_unshare(ip, offset, len);
				1571
				1572	inode_dio_wait(VFS_I(ip));
				1573
				1574	/* Try to CoW the selected ranges */
				1575	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1576	fbno = XFS_B_TO_FSBT(mp, offset);
				1577	isize = i_size_read(VFS_I(ip));
				1578	end = XFS_B_TO_FSB(mp, offset + len);
				1579	error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
				1580	if (error)
				1581	goto out_unlock;
				1582	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1583
				1584	/* Wait for the IO to finish */
				1585	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
				1586	if (error)
				1587	goto out;
				1588
				1589	/* Turn off the reflink flag if possible. */
				1590	error = xfs_reflink_try_clear_inode_flag(ip);
				1591	if (error)
				1592	goto out;
				1593
				1594	return 0;
				1595
				1596	out_unlock:
				1597	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1598	out:
				1599	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
				1600	return error;
				1601	}