Blame - src/kernel/linux/v4.19/fs/xfs/xfs_reflink.c - T800

blob: f3c393f309e19c8b99e6f2c683c84766efa6c1a8 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0+
				2	/*
				3	* Copyright (C) 2016 Oracle. All Rights Reserved.
				4	* Author: Darrick J. Wong <darrick.wong@oracle.com>
				5	*/
				6	#include "xfs.h"
				7	#include "xfs_fs.h"
				8	#include "xfs_shared.h"
				9	#include "xfs_format.h"
				10	#include "xfs_log_format.h"
				11	#include "xfs_trans_resv.h"
				12	#include "xfs_mount.h"
				13	#include "xfs_defer.h"
				14	#include "xfs_da_format.h"
				15	#include "xfs_da_btree.h"
				16	#include "xfs_inode.h"
				17	#include "xfs_trans.h"
				18	#include "xfs_inode_item.h"
				19	#include "xfs_bmap.h"
				20	#include "xfs_bmap_util.h"
				21	#include "xfs_error.h"
				22	#include "xfs_dir2.h"
				23	#include "xfs_dir2_priv.h"
				24	#include "xfs_ioctl.h"
				25	#include "xfs_trace.h"
				26	#include "xfs_log.h"
				27	#include "xfs_icache.h"
				28	#include "xfs_pnfs.h"
				29	#include "xfs_btree.h"
				30	#include "xfs_refcount_btree.h"
				31	#include "xfs_refcount.h"
				32	#include "xfs_bmap_btree.h"
				33	#include "xfs_trans_space.h"
				34	#include "xfs_bit.h"
				35	#include "xfs_alloc.h"
				36	#include "xfs_quota_defs.h"
				37	#include "xfs_quota.h"
				38	#include "xfs_reflink.h"
				39	#include "xfs_iomap.h"
				40	#include "xfs_rmap_btree.h"
				41	#include "xfs_sb.h"
				42	#include "xfs_ag_resv.h"
				43
				44	/*
				45	* Copy on Write of Shared Blocks
				46	*
				47	* XFS must preserve "the usual" file semantics even when two files share
				48	* the same physical blocks. This means that a write to one file must not
				49	* alter the blocks in a different file; the way that we'll do that is
				50	* through the use of a copy-on-write mechanism. At a high level, that
				51	* means that when we want to write to a shared block, we allocate a new
				52	* block, write the data to the new block, and if that succeeds we map the
				53	* new block into the file.
				54	*
				55	* XFS provides a "delayed allocation" mechanism that defers the allocation
				56	* of disk blocks to dirty-but-not-yet-mapped file blocks as long as
				57	* possible. This reduces fragmentation by enabling the filesystem to ask
				58	* for bigger chunks less often, which is exactly what we want for CoW.
				59	*
				60	* The delalloc mechanism begins when the kernel wants to make a block
				61	* writable (write_begin or page_mkwrite). If the offset is not mapped, we
				62	* create a delalloc mapping, which is a regular in-core extent, but without
				63	* a real startblock. (For delalloc mappings, the startblock encodes both
				64	* a flag that this is a delalloc mapping, and a worst-case estimate of how
				65	* many blocks might be required to put the mapping into the BMBT.) delalloc
				66	* mappings are a reservation against the free space in the filesystem;
				67	* adjacent mappings can also be combined into fewer larger mappings.
				68	*
				69	* As an optimization, the CoW extent size hint (cowextsz) creates
				70	* outsized aligned delalloc reservations in the hope of landing out of
				71	* order nearby CoW writes in a single extent on disk, thereby reducing
				72	* fragmentation and improving future performance.
				73	*
				74	* D: --RRRRRRSSSRRRRRRRR--- (data fork)
				75	* C: ------DDDDDDD--------- (CoW fork)
				76	*
				77	* When dirty pages are being written out (typically in writepage), the
				78	* delalloc reservations are converted into unwritten mappings by
				79	* allocating blocks and replacing the delalloc mapping with real ones.
				80	* A delalloc mapping can be replaced by several unwritten ones if the
				81	* free space is fragmented.
				82	*
				83	* D: --RRRRRRSSSRRRRRRRR---
				84	* C: ------UUUUUUU---------
				85	*
				86	* We want to adapt the delalloc mechanism for copy-on-write, since the
				87	* write paths are similar. The first two steps (creating the reservation
				88	* and allocating the blocks) are exactly the same as delalloc except that
				89	* the mappings must be stored in a separate CoW fork because we do not want
				90	* to disturb the mapping in the data fork until we're sure that the write
				91	* succeeded. IO completion in this case is the process of removing the old
				92	* mapping from the data fork and moving the new mapping from the CoW fork to
				93	* the data fork. This will be discussed shortly.
				94	*
				95	* For now, unaligned directio writes will be bounced back to the page cache.
				96	* Block-aligned directio writes will use the same mechanism as buffered
				97	* writes.
				98	*
				99	* Just prior to submitting the actual disk write requests, we convert
				100	* the extents representing the range of the file actually being written
				101	* (as opposed to extra pieces created for the cowextsize hint) to real
				102	* extents. This will become important in the next step:
				103	*
				104	* D: --RRRRRRSSSRRRRRRRR---
				105	* C: ------UUrrUUU---------
				106	*
				107	* CoW remapping must be done after the data block write completes,
				108	* because we don't want to destroy the old data fork map until we're sure
				109	* the new block has been written. Since the new mappings are kept in a
				110	* separate fork, we can simply iterate these mappings to find the ones
				111	* that cover the file blocks that we just CoW'd. For each extent, simply
				112	* unmap the corresponding range in the data fork, map the new range into
				113	* the data fork, and remove the extent from the CoW fork. Because of
				114	* the presence of the cowextsize hint, however, we must be careful
				115	* only to remap the blocks that we've actually written out -- we must
				116	* never remap delalloc reservations nor CoW staging blocks that have
				117	* yet to be written. This corresponds exactly to the real extents in
				118	* the CoW fork:
				119	*
				120	* D: --RRRRRRrrSRRRRRRRR---
				121	* C: ------UU--UUU---------
				122	*
				123	* Since the remapping operation can be applied to an arbitrary file
				124	* range, we record the need for the remap step as a flag in the ioend
				125	* instead of declaring a new IO type. This is required for direct io
				126	* because we only have ioend for the whole dio, and we have to be able to
				127	* remember the presence of unwritten blocks and CoW blocks with a single
				128	* ioend structure. Better yet, the more ground we can cover with one
				129	* ioend, the better.
				130	*/
				131
				132	/*
				133	* Given an AG extent, find the lowest-numbered run of shared blocks
				134	* within that range and return the range in fbno/flen. If
				135	* find_end_of_shared is true, return the longest contiguous extent of
				136	* shared blocks. If there are no shared extents, fbno and flen will
				137	* be set to NULLAGBLOCK and 0, respectively.
				138	*/
				139	int
				140	xfs_reflink_find_shared(
				141	struct xfs_mount *mp,
				142	struct xfs_trans *tp,
				143	xfs_agnumber_t agno,
				144	xfs_agblock_t agbno,
				145	xfs_extlen_t aglen,
				146	xfs_agblock_t *fbno,
				147	xfs_extlen_t *flen,
				148	bool find_end_of_shared)
				149	{
				150	struct xfs_buf *agbp;
				151	struct xfs_btree_cur *cur;
				152	int error;
				153
				154	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
				155	if (error)
				156	return error;
				157	if (!agbp)
				158	return -ENOMEM;
				159
				160	cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
				161
				162	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
				163	find_end_of_shared);
				164
				165	xfs_btree_del_cursor(cur, error);
				166
				167	xfs_trans_brelse(tp, agbp);
				168	return error;
				169	}
				170
				171	/*
				172	* Trim the mapping to the next block where there's a change in the
				173	* shared/unshared status. More specifically, this means that we
				174	* find the lowest-numbered extent of shared blocks that coincides with
				175	* the given block mapping. If the shared extent overlaps the start of
				176	* the mapping, trim the mapping to the end of the shared extent. If
				177	* the shared region intersects the mapping, trim the mapping to the
				178	* start of the shared extent. If there are no shared regions that
				179	* overlap, just return the original extent.
				180	*/
				181	int
				182	xfs_reflink_trim_around_shared(
				183	struct xfs_inode *ip,
				184	struct xfs_bmbt_irec *irec,
				185	bool *shared,
				186	bool *trimmed)
				187	{
				188	xfs_agnumber_t agno;
				189	xfs_agblock_t agbno;
				190	xfs_extlen_t aglen;
				191	xfs_agblock_t fbno;
				192	xfs_extlen_t flen;
				193	int error = 0;
				194
				195	/* Holes, unwritten, and delalloc extents cannot be shared */
				196	if (!xfs_is_reflink_inode(ip) \|\| !xfs_bmap_is_real_extent(irec)) {
				197	*shared = false;
				198	return 0;
				199	}
				200
				201	trace_xfs_reflink_trim_around_shared(ip, irec);
				202
				203	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
				204	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
				205	aglen = irec->br_blockcount;
				206
				207	error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
				208	aglen, &fbno, &flen, true);
				209	if (error)
				210	return error;
				211
				212	shared = trimmed = false;
				213	if (fbno == NULLAGBLOCK) {
				214	/* No shared blocks at all. */
				215	return 0;
				216	} else if (fbno == agbno) {
				217	/*
				218	* The start of this extent is shared. Truncate the
				219	* mapping at the end of the shared region so that a
				220	* subsequent iteration starts at the start of the
				221	* unshared region.
				222	*/
				223	irec->br_blockcount = flen;
				224	*shared = true;
				225	if (flen != aglen)
				226	*trimmed = true;
				227	return 0;
				228	} else {
				229	/*
				230	* There's a shared extent midway through this extent.
				231	* Truncate the mapping at the start of the shared
				232	* extent so that a subsequent iteration starts at the
				233	* start of the shared region.
				234	*/
				235	irec->br_blockcount = fbno - agbno;
				236	*trimmed = true;
				237	return 0;
				238	}
				239	}
				240
				241	/*
				242	* Trim the passed in imap to the next shared/unshared extent boundary, and
				243	* if imap->br_startoff points to a shared extent reserve space for it in the
				244	* COW fork. In this case *shared is set to true, else to false.
				245	*
				246	* Note that imap will always contain the block numbers for the existing blocks
				247	* in the data fork, as the upper layers need them for read-modify-write
				248	* operations.
				249	*/
				250	int
				251	xfs_reflink_reserve_cow(
				252	struct xfs_inode *ip,
				253	struct xfs_bmbt_irec *imap,
				254	bool *shared)
				255	{
				256	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				257	struct xfs_bmbt_irec got;
				258	int error = 0;
				259	bool eof = false, trimmed;
				260	struct xfs_iext_cursor icur;
				261
				262	/*
				263	* Search the COW fork extent list first. This serves two purposes:
				264	* first this implement the speculative preallocation using cowextisze,
				265	* so that we also unshared block adjacent to shared blocks instead
				266	* of just the shared blocks themselves. Second the lookup in the
				267	* extent list is generally faster than going out to the shared extent
				268	* tree.
				269	*/
				270
				271	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
				272	eof = true;
				273	if (!eof && got.br_startoff <= imap->br_startoff) {
				274	trace_xfs_reflink_cow_found(ip, imap);
				275	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
				276
				277	*shared = true;
				278	return 0;
				279	}
				280
				281	/* Trim the mapping to the nearest shared extent boundary. */
				282	error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
				283	if (error)
				284	return error;
				285
				286	/* Not shared? Just report the (potentially capped) extent. */
				287	if (!*shared)
				288	return 0;
				289
				290	/*
				291	* Fork all the shared blocks from our write offset until the end of
				292	* the extent.
				293	*/
				294	error = xfs_qm_dqattach_locked(ip, false);
				295	if (error)
				296	return error;
				297
				298	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
				299	imap->br_blockcount, 0, &got, &icur, eof);
				300	if (error == -ENOSPC \|\| error == -EDQUOT)
				301	trace_xfs_reflink_cow_enospc(ip, imap);
				302	if (error)
				303	return error;
				304
				305	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
				306	trace_xfs_reflink_cow_alloc(ip, &got);
				307	return 0;
				308	}
				309
				310	/* Convert part of an unwritten CoW extent to a real one. */
				311	STATIC int
				312	xfs_reflink_convert_cow_extent(
				313	struct xfs_inode *ip,
				314	struct xfs_bmbt_irec *imap,
				315	xfs_fileoff_t offset_fsb,
				316	xfs_filblks_t count_fsb)
				317	{
				318	int nimaps = 1;
				319
				320	if (imap->br_state == XFS_EXT_NORM)
				321	return 0;
				322
				323	xfs_trim_extent(imap, offset_fsb, count_fsb);
				324	trace_xfs_reflink_convert_cow(ip, imap);
				325	if (imap->br_blockcount == 0)
				326	return 0;
				327	return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
				328	XFS_BMAPI_COWFORK \| XFS_BMAPI_CONVERT, 0, imap,
				329	&nimaps);
				330	}
				331
				332	/* Convert all of the unwritten CoW extents in a file's range to real ones. */
				333	int
				334	xfs_reflink_convert_cow(
				335	struct xfs_inode *ip,
				336	xfs_off_t offset,
				337	xfs_off_t count)
				338	{
				339	struct xfs_mount *mp = ip->i_mount;
				340	xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
				341	xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
				342	xfs_filblks_t count_fsb = end_fsb - offset_fsb;
				343	struct xfs_bmbt_irec imap;
				344	int nimaps = 1, error = 0;
				345
				346	ASSERT(count != 0);
				347
				348	xfs_ilock(ip, XFS_ILOCK_EXCL);
				349	error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
				350	XFS_BMAPI_COWFORK \| XFS_BMAPI_CONVERT \|
				351	XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
				352	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				353	return error;
				354	}
				355
				356	/*
				357	* Find the extent that maps the given range in the COW fork. Even if the extent
				358	* is not shared we might have a preallocation for it in the COW fork. If so we
				359	* use it that rather than trigger a new allocation.
				360	*/
				361	static int
				362	xfs_find_trim_cow_extent(
				363	struct xfs_inode *ip,
				364	struct xfs_bmbt_irec *imap,
				365	bool *shared,
				366	bool *found)
				367	{
				368	xfs_fileoff_t offset_fsb = imap->br_startoff;
				369	xfs_filblks_t count_fsb = imap->br_blockcount;
				370	struct xfs_iext_cursor icur;
				371	struct xfs_bmbt_irec got;
				372	bool trimmed;
				373
				374	*found = false;
				375
				376	/*
				377	* If we don't find an overlapping extent, trim the range we need to
				378	* allocate to fit the hole we found.
				379	*/
				380	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) \|\|
				381	got.br_startoff > offset_fsb)
				382	return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
				383
				384	*shared = true;
				385	if (isnullstartblock(got.br_startblock)) {
				386	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
				387	return 0;
				388	}
				389
				390	/* real extent found - no need to allocate */
				391	xfs_trim_extent(&got, offset_fsb, count_fsb);
				392	*imap = got;
				393	*found = true;
				394	return 0;
				395	}
				396
				397	/* Allocate all CoW reservations covering a range of blocks in a file. */
				398	int
				399	xfs_reflink_allocate_cow(
				400	struct xfs_inode *ip,
				401	struct xfs_bmbt_irec *imap,
				402	bool *shared,
				403	uint *lockmode)
				404	{
				405	struct xfs_mount *mp = ip->i_mount;
				406	xfs_fileoff_t offset_fsb = imap->br_startoff;
				407	xfs_filblks_t count_fsb = imap->br_blockcount;
				408	struct xfs_trans *tp;
				409	int nimaps, error = 0;
				410	bool found;
				411	xfs_filblks_t resaligned;
				412	xfs_extlen_t resblks = 0;
				413
				414	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
				415	ASSERT(xfs_is_reflink_inode(ip));
				416
				417	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
				418	if (error \|\| !*shared)
				419	return error;
				420	if (found)
				421	goto convert;
				422
				423	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
				424	imap->br_blockcount, xfs_get_cowextsz_hint(ip));
				425	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
				426
				427	xfs_iunlock(ip, *lockmode);
				428	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
				429	*lockmode = XFS_ILOCK_EXCL;
				430	xfs_ilock(ip, *lockmode);
				431
				432	if (error)
				433	return error;
				434
				435	error = xfs_qm_dqattach_locked(ip, false);
				436	if (error)
				437	goto out_trans_cancel;
				438
				439	/*
				440	* Check for an overlapping extent again now that we dropped the ilock.
				441	*/
				442	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
				443	if (error \|\| !*shared)
				444	goto out_trans_cancel;
				445	if (found) {
				446	xfs_trans_cancel(tp);
				447	goto convert;
				448	}
				449
				450	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
				451	XFS_QMOPT_RES_REGBLKS);
				452	if (error)
				453	goto out_trans_cancel;
				454
				455	xfs_trans_ijoin(tp, ip, 0);
				456
				457	/* Allocate the entire reservation as unwritten blocks. */
				458	nimaps = 1;
				459	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
				460	XFS_BMAPI_COWFORK \| XFS_BMAPI_PREALLOC,
				461	resblks, imap, &nimaps);
				462	if (error)
				463	goto out_unreserve;
				464
				465	xfs_inode_set_cowblocks_tag(ip);
				466	error = xfs_trans_commit(tp);
				467	if (error)
				468	return error;
				469
				470	/*
				471	* Allocation succeeded but the requested range was not even partially
				472	* satisfied? Bail out!
				473	*/
				474	if (nimaps == 0)
				475	return -ENOSPC;
				476	convert:
				477	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
				478
				479	out_unreserve:
				480	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
				481	XFS_QMOPT_RES_REGBLKS);
				482	out_trans_cancel:
				483	xfs_trans_cancel(tp);
				484	return error;
				485	}
				486
				487	/*
				488	* Cancel CoW reservations for some block range of an inode.
				489	*
				490	* If cancel_real is true this function cancels all COW fork extents for the
				491	* inode; if cancel_real is false, real extents are not cleared.
				492	*
				493	* Caller must have already joined the inode to the current transaction. The
				494	* inode will be joined to the transaction returned to the caller.
				495	*/
				496	int
				497	xfs_reflink_cancel_cow_blocks(
				498	struct xfs_inode *ip,
				499	struct xfs_trans **tpp,
				500	xfs_fileoff_t offset_fsb,
				501	xfs_fileoff_t end_fsb,
				502	bool cancel_real)
				503	{
				504	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				505	struct xfs_bmbt_irec got, del;
				506	struct xfs_iext_cursor icur;
				507	int error = 0;
				508
				509	if (!xfs_inode_has_cow_data(ip))
				510	return 0;
				511	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
				512	return 0;
				513
				514	/* Walk backwards until we're out of the I/O range... */
				515	while (got.br_startoff + got.br_blockcount > offset_fsb) {
				516	del = got;
				517	xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
				518
				519	/* Extent delete may have bumped ext forward */
				520	if (!del.br_blockcount) {
				521	xfs_iext_prev(ifp, &icur);
				522	goto next_extent;
				523	}
				524
				525	trace_xfs_reflink_cancel_cow(ip, &del);
				526
				527	if (isnullstartblock(del.br_startblock)) {
				528	error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
				529	&icur, &got, &del);
				530	if (error)
				531	break;
				532	} else if (del.br_state == XFS_EXT_UNWRITTEN \|\| cancel_real) {
				533	ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
				534
				535	/* Free the CoW orphan record. */
				536	error = xfs_refcount_free_cow_extent(*tpp,
				537	del.br_startblock, del.br_blockcount);
				538	if (error)
				539	break;
				540
				541	xfs_bmap_add_free(*tpp, del.br_startblock,
				542	del.br_blockcount, NULL);
				543
				544	/* Roll the transaction */
				545	error = xfs_defer_finish(tpp);
				546	if (error)
				547	break;
				548
				549	/* Remove the mapping from the CoW fork. */
				550	xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
				551
				552	/* Remove the quota reservation */
				553	error = xfs_trans_reserve_quota_nblks(NULL, ip,
				554	-(long)del.br_blockcount, 0,
				555	XFS_QMOPT_RES_REGBLKS);
				556	if (error)
				557	break;
				558	} else {
				559	/* Didn't do anything, push cursor back. */
				560	xfs_iext_prev(ifp, &icur);
				561	}
				562	next_extent:
				563	if (!xfs_iext_get_extent(ifp, &icur, &got))
				564	break;
				565	}
				566
				567	/* clear tag if cow fork is emptied */
				568	if (!ifp->if_bytes)
				569	xfs_inode_clear_cowblocks_tag(ip);
				570	return error;
				571	}
				572
				573	/*
				574	* Cancel CoW reservations for some byte range of an inode.
				575	*
				576	* If cancel_real is true this function cancels all COW fork extents for the
				577	* inode; if cancel_real is false, real extents are not cleared.
				578	*/
				579	int
				580	xfs_reflink_cancel_cow_range(
				581	struct xfs_inode *ip,
				582	xfs_off_t offset,
				583	xfs_off_t count,
				584	bool cancel_real)
				585	{
				586	struct xfs_trans *tp;
				587	xfs_fileoff_t offset_fsb;
				588	xfs_fileoff_t end_fsb;
				589	int error;
				590
				591	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
				592	ASSERT(xfs_is_reflink_inode(ip));
				593
				594	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
				595	if (count == NULLFILEOFF)
				596	end_fsb = NULLFILEOFF;
				597	else
				598	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
				599
				600	/* Start a rolling transaction to remove the mappings */
				601	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
				602	0, 0, XFS_TRANS_NOFS, &tp);
				603	if (error)
				604	goto out;
				605
				606	xfs_ilock(ip, XFS_ILOCK_EXCL);
				607	xfs_trans_ijoin(tp, ip, 0);
				608
				609	/* Scrape out the old CoW reservations */
				610	error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
				611	cancel_real);
				612	if (error)
				613	goto out_cancel;
				614
				615	error = xfs_trans_commit(tp);
				616
				617	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				618	return error;
				619
				620	out_cancel:
				621	xfs_trans_cancel(tp);
				622	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				623	out:
				624	trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
				625	return error;
				626	}
				627
				628	/*
				629	* Remap parts of a file's data fork after a successful CoW.
				630	*/
				631	int
				632	xfs_reflink_end_cow(
				633	struct xfs_inode *ip,
				634	xfs_off_t offset,
				635	xfs_off_t count)
				636	{
				637	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				638	struct xfs_bmbt_irec got, del;
				639	struct xfs_trans *tp;
				640	xfs_fileoff_t offset_fsb;
				641	xfs_fileoff_t end_fsb;
				642	int error;
				643	unsigned int resblks;
				644	xfs_filblks_t rlen;
				645	struct xfs_iext_cursor icur;
				646
				647	trace_xfs_reflink_end_cow(ip, offset, count);
				648
				649	/* No COW extents? That's easy! */
				650	if (ifp->if_bytes == 0)
				651	return 0;
				652
				653	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
				654	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
				655
				656	/*
				657	* Start a rolling transaction to switch the mappings. We're
				658	* unlikely ever to have to remap 16T worth of single-block
				659	* extents, so just cap the worst case extent count to 2^32-1.
				660	* Stick a warning in just in case, and avoid 64-bit division.
				661	*/
				662	BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
				663	if (end_fsb - offset_fsb > UINT_MAX) {
				664	error = -EFSCORRUPTED;
				665	xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
				666	ASSERT(0);
				667	goto out;
				668	}
				669	resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
				670	(unsigned int)(end_fsb - offset_fsb),
				671	XFS_DATA_FORK);
				672	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
				673	resblks, 0, XFS_TRANS_RESERVE \| XFS_TRANS_NOFS, &tp);
				674	if (error)
				675	goto out;
				676
				677	xfs_ilock(ip, XFS_ILOCK_EXCL);
				678	xfs_trans_ijoin(tp, ip, 0);
				679
				680	/*
				681	* In case of racing, overlapping AIO writes no COW extents might be
				682	* left by the time I/O completes for the loser of the race. In that
				683	* case we are done.
				684	*/
				685	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
				686	goto out_cancel;
				687
				688	/* Walk backwards until we're out of the I/O range... */
				689	while (got.br_startoff + got.br_blockcount > offset_fsb) {
				690	del = got;
				691	xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
				692
				693	/* Extent delete may have bumped ext forward */
				694	if (!del.br_blockcount)
				695	goto prev_extent;
				696
				697	/*
				698	* Only remap real extent that contain data. With AIO
				699	* speculatively preallocations can leak into the range we
				700	* are called upon, and we need to skip them.
				701	*/
				702	if (!xfs_bmap_is_real_extent(&got))
				703	goto prev_extent;
				704
				705	/* Unmap the old blocks in the data fork. */
				706	ASSERT(tp->t_firstblock == NULLFSBLOCK);
				707	rlen = del.br_blockcount;
				708	error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
				709	if (error)
				710	goto out_cancel;
				711
				712	/* Trim the extent to whatever got unmapped. */
				713	if (rlen) {
				714	xfs_trim_extent(&del, del.br_startoff + rlen,
				715	del.br_blockcount - rlen);
				716	}
				717	trace_xfs_reflink_cow_remap(ip, &del);
				718
				719	/* Free the CoW orphan record. */
				720	error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
				721	del.br_blockcount);
				722	if (error)
				723	goto out_cancel;
				724
				725	/* Map the new blocks into the data fork. */
				726	error = xfs_bmap_map_extent(tp, ip, &del);
				727	if (error)
				728	goto out_cancel;
				729
				730	/* Charge this new data fork mapping to the on-disk quota. */
				731	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
				732	(long)del.br_blockcount);
				733
				734	/* Remove the mapping from the CoW fork. */
				735	xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
				736
				737	error = xfs_defer_finish(&tp);
				738	if (error)
				739	goto out_cancel;
				740	if (!xfs_iext_get_extent(ifp, &icur, &got))
				741	break;
				742	continue;
				743	prev_extent:
				744	if (!xfs_iext_prev_extent(ifp, &icur, &got))
				745	break;
				746	}
				747
				748	error = xfs_trans_commit(tp);
				749	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				750	if (error)
				751	goto out;
				752	return 0;
				753
				754	out_cancel:
				755	xfs_trans_cancel(tp);
				756	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				757	out:
				758	trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
				759	return error;
				760	}
				761
				762	/*
				763	* Free leftover CoW reservations that didn't get cleaned out.
				764	*/
				765	int
				766	xfs_reflink_recover_cow(
				767	struct xfs_mount *mp)
				768	{
				769	xfs_agnumber_t agno;
				770	int error = 0;
				771
				772	if (!xfs_sb_version_hasreflink(&mp->m_sb))
				773	return 0;
				774
				775	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
				776	error = xfs_refcount_recover_cow_leftovers(mp, agno);
				777	if (error)
				778	break;
				779	}
				780
				781	return error;
				782	}
				783
				784	/*
				785	* Reflinking (Block) Ranges of Two Files Together
				786	*
				787	* First, ensure that the reflink flag is set on both inodes. The flag is an
				788	* optimization to avoid unnecessary refcount btree lookups in the write path.
				789	*
				790	* Now we can iteratively remap the range of extents (and holes) in src to the
				791	* corresponding ranges in dest. Let drange and srange denote the ranges of
				792	* logical blocks in dest and src touched by the reflink operation.
				793	*
				794	* While the length of drange is greater than zero,
				795	* - Read src's bmbt at the start of srange ("imap")
				796	* - If imap doesn't exist, make imap appear to start at the end of srange
				797	* with zero length.
				798	* - If imap starts before srange, advance imap to start at srange.
				799	* - If imap goes beyond srange, truncate imap to end at the end of srange.
				800	* - Punch (imap start - srange start + imap len) blocks from dest at
				801	* offset (drange start).
				802	* - If imap points to a real range of pblks,
				803	* > Increase the refcount of the imap's pblks
				804	* > Map imap's pblks into dest at the offset
				805	* (drange start + imap start - srange start)
				806	* - Advance drange and srange by (imap start - srange start + imap len)
				807	*
				808	* Finally, if the reflink made dest longer, update both the in-core and
				809	* on-disk file sizes.
				810	*
				811	* ASCII Art Demonstration:
				812	*
				813	* Let's say we want to reflink this source file:
				814	*
				815	* ----SSSSSSS-SSSSS----SSSSSS (src file)
				816	* <-------------------->
				817	*
				818	* into this destination file:
				819	*
				820	* --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
				821	* <-------------------->
				822	* '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
				823	* Observe that the range has different logical offsets in either file.
				824	*
				825	* Consider that the first extent in the source file doesn't line up with our
				826	* reflink range. Unmapping and remapping are separate operations, so we can
				827	* unmap more blocks from the destination file than we remap.
				828	*
				829	* ----SSSSSSS-SSSSS----SSSSSS
				830	* <------->
				831	* --DDDDD---------DDDDD--DDD
				832	* <------->
				833	*
				834	* Now remap the source extent into the destination file:
				835	*
				836	* ----SSSSSSS-SSSSS----SSSSSS
				837	* <------->
				838	* --DDDDD--SSSSSSSDDDDD--DDD
				839	* <------->
				840	*
				841	* Do likewise with the second hole and extent in our range. Holes in the
				842	* unmap range don't affect our operation.
				843	*
				844	* ----SSSSSSS-SSSSS----SSSSSS
				845	* <---->
				846	* --DDDDD--SSSSSSS-SSSSS-DDD
				847	* <---->
				848	*
				849	* Finally, unmap and remap part of the third extent. This will increase the
				850	* size of the destination file.
				851	*
				852	* ----SSSSSSS-SSSSS----SSSSSS
				853	* <----->
				854	* --DDDDD--SSSSSSS-SSSSS----SSS
				855	* <----->
				856	*
				857	* Once we update the destination file's i_size, we're done.
				858	*/
				859
				860	/*
				861	* Ensure the reflink bit is set in both inodes.
				862	*/
				863	STATIC int
				864	xfs_reflink_set_inode_flag(
				865	struct xfs_inode *src,
				866	struct xfs_inode *dest)
				867	{
				868	struct xfs_mount *mp = src->i_mount;
				869	int error;
				870	struct xfs_trans *tp;
				871
				872	if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
				873	return 0;
				874
				875	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
				876	if (error)
				877	goto out_error;
				878
				879	/* Lock both files against IO */
				880	if (src->i_ino == dest->i_ino)
				881	xfs_ilock(src, XFS_ILOCK_EXCL);
				882	else
				883	xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);
				884
				885	if (!xfs_is_reflink_inode(src)) {
				886	trace_xfs_reflink_set_inode_flag(src);
				887	xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
				888	src->i_d.di_flags2 \|= XFS_DIFLAG2_REFLINK;
				889	xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
				890	xfs_ifork_init_cow(src);
				891	} else
				892	xfs_iunlock(src, XFS_ILOCK_EXCL);
				893
				894	if (src->i_ino == dest->i_ino)
				895	goto commit_flags;
				896
				897	if (!xfs_is_reflink_inode(dest)) {
				898	trace_xfs_reflink_set_inode_flag(dest);
				899	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
				900	dest->i_d.di_flags2 \|= XFS_DIFLAG2_REFLINK;
				901	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
				902	xfs_ifork_init_cow(dest);
				903	} else
				904	xfs_iunlock(dest, XFS_ILOCK_EXCL);
				905
				906	commit_flags:
				907	error = xfs_trans_commit(tp);
				908	if (error)
				909	goto out_error;
				910	return error;
				911
				912	out_error:
				913	trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
				914	return error;
				915	}
				916
				917	/*
				918	* Update destination inode size & cowextsize hint, if necessary.
				919	*/
				920	STATIC int
				921	xfs_reflink_update_dest(
				922	struct xfs_inode *dest,
				923	xfs_off_t newlen,
				924	xfs_extlen_t cowextsize,
				925	bool is_dedupe)
				926	{
				927	struct xfs_mount *mp = dest->i_mount;
				928	struct xfs_trans *tp;
				929	int error;
				930
				931	if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
				932	return 0;
				933
				934	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
				935	if (error)
				936	goto out_error;
				937
				938	xfs_ilock(dest, XFS_ILOCK_EXCL);
				939	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
				940
				941	if (newlen > i_size_read(VFS_I(dest))) {
				942	trace_xfs_reflink_update_inode_size(dest, newlen);
				943	i_size_write(VFS_I(dest), newlen);
				944	dest->i_d.di_size = newlen;
				945	}
				946
				947	if (cowextsize) {
				948	dest->i_d.di_cowextsize = cowextsize;
				949	dest->i_d.di_flags2 \|= XFS_DIFLAG2_COWEXTSIZE;
				950	}
				951
				952	if (!is_dedupe) {
				953	xfs_trans_ichgtime(tp, dest,
				954	XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				955	}
				956	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
				957
				958	error = xfs_trans_commit(tp);
				959	if (error)
				960	goto out_error;
				961	return error;
				962
				963	out_error:
				964	trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
				965	return error;
				966	}
				967
				968	/*
				969	* Do we have enough reserve in this AG to handle a reflink? The refcount
				970	* btree already reserved all the space it needs, but the rmap btree can grow
				971	* infinitely, so we won't allow more reflinks when the AG is down to the
				972	* btree reserves.
				973	*/
				974	static int
				975	xfs_reflink_ag_has_free_space(
				976	struct xfs_mount *mp,
				977	xfs_agnumber_t agno)
				978	{
				979	struct xfs_perag *pag;
				980	int error = 0;
				981
				982	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
				983	return 0;
				984
				985	pag = xfs_perag_get(mp, agno);
				986	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) \|\|
				987	xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
				988	error = -ENOSPC;
				989	xfs_perag_put(pag);
				990	return error;
				991	}
				992
				993	/*
				994	* Unmap a range of blocks from a file, then map other blocks into the hole.
				995	* The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
				996	* The extent irec is mapped into dest at irec->br_startoff.
				997	*/
				998	STATIC int
				999	xfs_reflink_remap_extent(
				1000	struct xfs_inode *ip,
				1001	struct xfs_bmbt_irec *irec,
				1002	xfs_fileoff_t destoff,
				1003	xfs_off_t new_isize)
				1004	{
				1005	struct xfs_mount *mp = ip->i_mount;
				1006	bool real_extent = xfs_bmap_is_real_extent(irec);
				1007	struct xfs_trans *tp;
				1008	unsigned int resblks;
				1009	struct xfs_bmbt_irec uirec;
				1010	xfs_filblks_t rlen;
				1011	xfs_filblks_t unmap_len;
				1012	xfs_off_t newlen;
				1013	int error;
				1014
				1015	unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
				1016	trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
				1017
				1018	/* No reflinking if we're low on space */
				1019	if (real_extent) {
				1020	error = xfs_reflink_ag_has_free_space(mp,
				1021	XFS_FSB_TO_AGNO(mp, irec->br_startblock));
				1022	if (error)
				1023	goto out;
				1024	}
				1025
				1026	/* Start a rolling transaction to switch the mappings */
				1027	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
				1028	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
				1029	if (error)
				1030	goto out;
				1031
				1032	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1033	xfs_trans_ijoin(tp, ip, 0);
				1034
				1035	/* If we're not just clearing space, then do we have enough quota? */
				1036	if (real_extent) {
				1037	error = xfs_trans_reserve_quota_nblks(tp, ip,
				1038	irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
				1039	if (error)
				1040	goto out_cancel;
				1041	}
				1042
				1043	trace_xfs_reflink_remap(ip, irec->br_startoff,
				1044	irec->br_blockcount, irec->br_startblock);
				1045
				1046	/* Unmap the old blocks in the data fork. */
				1047	rlen = unmap_len;
				1048	while (rlen) {
				1049	ASSERT(tp->t_firstblock == NULLFSBLOCK);
				1050	error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
				1051	if (error)
				1052	goto out_cancel;
				1053
				1054	/*
				1055	* Trim the extent to whatever got unmapped.
				1056	* Remember, bunmapi works backwards.
				1057	*/
				1058	uirec.br_startblock = irec->br_startblock + rlen;
				1059	uirec.br_startoff = irec->br_startoff + rlen;
				1060	uirec.br_blockcount = unmap_len - rlen;
				1061	unmap_len = rlen;
				1062
				1063	/* If this isn't a real mapping, we're done. */
				1064	if (!real_extent \|\| uirec.br_blockcount == 0)
				1065	goto next_extent;
				1066
				1067	trace_xfs_reflink_remap(ip, uirec.br_startoff,
				1068	uirec.br_blockcount, uirec.br_startblock);
				1069
				1070	/* Update the refcount tree */
				1071	error = xfs_refcount_increase_extent(tp, &uirec);
				1072	if (error)
				1073	goto out_cancel;
				1074
				1075	/* Map the new blocks into the data fork. */
				1076	error = xfs_bmap_map_extent(tp, ip, &uirec);
				1077	if (error)
				1078	goto out_cancel;
				1079
				1080	/* Update quota accounting. */
				1081	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
				1082	uirec.br_blockcount);
				1083
				1084	/* Update dest isize if needed. */
				1085	newlen = XFS_FSB_TO_B(mp,
				1086	uirec.br_startoff + uirec.br_blockcount);
				1087	newlen = min_t(xfs_off_t, newlen, new_isize);
				1088	if (newlen > i_size_read(VFS_I(ip))) {
				1089	trace_xfs_reflink_update_inode_size(ip, newlen);
				1090	i_size_write(VFS_I(ip), newlen);
				1091	ip->i_d.di_size = newlen;
				1092	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				1093	}
				1094
				1095	next_extent:
				1096	/* Process all the deferred stuff. */
				1097	error = xfs_defer_finish(&tp);
				1098	if (error)
				1099	goto out_cancel;
				1100	}
				1101
				1102	error = xfs_trans_commit(tp);
				1103	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1104	if (error)
				1105	goto out;
				1106	return 0;
				1107
				1108	out_cancel:
				1109	xfs_trans_cancel(tp);
				1110	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1111	out:
				1112	trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
				1113	return error;
				1114	}
				1115
				1116	/*
				1117	* Iteratively remap one file's extents (and holes) to another's.
				1118	*/
				1119	STATIC int
				1120	xfs_reflink_remap_blocks(
				1121	struct xfs_inode *src,
				1122	xfs_fileoff_t srcoff,
				1123	struct xfs_inode *dest,
				1124	xfs_fileoff_t destoff,
				1125	xfs_filblks_t len,
				1126	xfs_off_t new_isize)
				1127	{
				1128	struct xfs_bmbt_irec imap;
				1129	int nimaps;
				1130	int error = 0;
				1131	xfs_filblks_t range_len;
				1132
				1133	/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
				1134	while (len) {
				1135	uint lock_mode;
				1136
				1137	trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
				1138	dest, destoff);
				1139
				1140	/* Read extent from the source file */
				1141	nimaps = 1;
				1142	lock_mode = xfs_ilock_data_map_shared(src);
				1143	error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
				1144	xfs_iunlock(src, lock_mode);
				1145	if (error)
				1146	goto err;
				1147	ASSERT(nimaps == 1);
				1148
				1149	trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
				1150	&imap);
				1151
				1152	/* Translate imap into the destination file. */
				1153	range_len = imap.br_startoff + imap.br_blockcount - srcoff;
				1154	imap.br_startoff += destoff - srcoff;
				1155
				1156	/* Clear dest from destoff to the end of imap and map it in. */
				1157	error = xfs_reflink_remap_extent(dest, &imap, destoff,
				1158	new_isize);
				1159	if (error)
				1160	goto err;
				1161
				1162	if (fatal_signal_pending(current)) {
				1163	error = -EINTR;
				1164	goto err;
				1165	}
				1166
				1167	/* Advance drange/srange */
				1168	srcoff += range_len;
				1169	destoff += range_len;
				1170	len -= range_len;
				1171	}
				1172
				1173	return 0;
				1174
				1175	err:
				1176	trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
				1177	return error;
				1178	}
				1179
				1180	/*
				1181	* Grab the exclusive iolock for a data copy from src to dest, making
				1182	* sure to abide vfs locking order (lowest pointer value goes first) and
				1183	* breaking the pnfs layout leases on dest before proceeding. The loop
				1184	* is needed because we cannot call the blocking break_layout() with the
				1185	* src iolock held, and therefore have to back out both locks.
				1186	*/
				1187	static int
				1188	xfs_iolock_two_inodes_and_break_layout(
				1189	struct inode *src,
				1190	struct inode *dest)
				1191	{
				1192	int error;
				1193
				1194	retry:
				1195	if (src < dest) {
				1196	inode_lock_shared(src);
				1197	inode_lock_nested(dest, I_MUTEX_NONDIR2);
				1198	} else {
				1199	/* src >= dest */
				1200	inode_lock(dest);
				1201	}
				1202
				1203	error = break_layout(dest, false);
				1204	if (error == -EWOULDBLOCK) {
				1205	inode_unlock(dest);
				1206	if (src < dest)
				1207	inode_unlock_shared(src);
				1208	error = break_layout(dest, true);
				1209	if (error)
				1210	return error;
				1211	goto retry;
				1212	}
				1213	if (error) {
				1214	inode_unlock(dest);
				1215	if (src < dest)
				1216	inode_unlock_shared(src);
				1217	return error;
				1218	}
				1219	if (src > dest)
				1220	inode_lock_shared_nested(src, I_MUTEX_NONDIR2);
				1221	return 0;
				1222	}
				1223
				1224	/* Unlock both inodes after they've been prepped for a range clone. */
				1225	STATIC void
				1226	xfs_reflink_remap_unlock(
				1227	struct file *file_in,
				1228	struct file *file_out)
				1229	{
				1230	struct inode *inode_in = file_inode(file_in);
				1231	struct xfs_inode *src = XFS_I(inode_in);
				1232	struct inode *inode_out = file_inode(file_out);
				1233	struct xfs_inode *dest = XFS_I(inode_out);
				1234	bool same_inode = (inode_in == inode_out);
				1235
				1236	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
				1237	if (!same_inode)
				1238	xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
				1239	inode_unlock(inode_out);
				1240	if (!same_inode)
				1241	inode_unlock_shared(inode_in);
				1242	}
				1243
				1244	/*
				1245	* If we're reflinking to a point past the destination file's EOF, we must
				1246	* zero any speculative post-EOF preallocations that sit between the old EOF
				1247	* and the destination file offset.
				1248	*/
				1249	static int
				1250	xfs_reflink_zero_posteof(
				1251	struct xfs_inode *ip,
				1252	loff_t pos)
				1253	{
				1254	loff_t isize = i_size_read(VFS_I(ip));
				1255
				1256	if (pos <= isize)
				1257	return 0;
				1258
				1259	trace_xfs_zero_eof(ip, isize, pos - isize);
				1260	return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
				1261	&xfs_iomap_ops);
				1262	}
				1263
				1264	/*
				1265	* Prepare two files for range cloning. Upon a successful return both inodes
				1266	* will have the iolock and mmaplock held, the page cache of the out file will
				1267	* be truncated, and any leases on the out file will have been broken. This
				1268	* function borrows heavily from xfs_file_aio_write_checks.
				1269	*
				1270	* The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
				1271	* checked that the bytes beyond EOF physically match. Hence we cannot use the
				1272	* EOF block in the source dedupe range because it's not a complete block match,
				1273	* hence can introduce a corruption into the file that has it's block replaced.
				1274	*
				1275	* In similar fashion, the VFS file cloning also allows partial EOF blocks to be
				1276	* "block aligned" for the purposes of cloning entire files. However, if the
				1277	* source file range includes the EOF block and it lands within the existing EOF
				1278	* of the destination file, then we can expose stale data from beyond the source
				1279	* file EOF in the destination file.
				1280	*
				1281	* XFS doesn't support partial block sharing, so in both cases we have check
				1282	* these cases ourselves. For dedupe, we can simply round the length to dedupe
				1283	* down to the previous whole block and ignore the partial EOF block. While this
				1284	* means we can't dedupe the last block of a file, this is an acceptible
				1285	* tradeoff for simplicity on implementation.
				1286	*
				1287	* For cloning, we want to share the partial EOF block if it is also the new EOF
				1288	* block of the destination file. If the partial EOF block lies inside the
				1289	* existing destination EOF, then we have to abort the clone to avoid exposing
				1290	* stale data in the destination file. Hence we reject these clone attempts with
				1291	* -EINVAL in this case.
				1292	*/
				1293	STATIC int
				1294	xfs_reflink_remap_prep(
				1295	struct file *file_in,
				1296	loff_t pos_in,
				1297	struct file *file_out,
				1298	loff_t pos_out,
				1299	u64 *len,
				1300	bool is_dedupe)
				1301	{
				1302	struct inode *inode_in = file_inode(file_in);
				1303	struct xfs_inode *src = XFS_I(inode_in);
				1304	struct inode *inode_out = file_inode(file_out);
				1305	struct xfs_inode *dest = XFS_I(inode_out);
				1306	bool same_inode = (inode_in == inode_out);
				1307	u64 blkmask = i_blocksize(inode_in) - 1;
				1308	ssize_t ret;
				1309
				1310	/* Lock both files against IO */
				1311	ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
				1312	if (ret)
				1313	return ret;
				1314	if (same_inode)
				1315	xfs_ilock(src, XFS_MMAPLOCK_EXCL);
				1316	else
				1317	xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest,
				1318	XFS_MMAPLOCK_EXCL);
				1319
				1320	/* Check file eligibility and prepare for block sharing. */
				1321	ret = -EINVAL;
				1322	/* Don't reflink realtime inodes */
				1323	if (XFS_IS_REALTIME_INODE(src) \|\| XFS_IS_REALTIME_INODE(dest))
				1324	goto out_unlock;
				1325
				1326	/* Don't share DAX file data for now. */
				1327	if (IS_DAX(inode_in) \|\| IS_DAX(inode_out))
				1328	goto out_unlock;
				1329
				1330	ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
				1331	len, is_dedupe);
				1332	if (ret <= 0)
				1333	goto out_unlock;
				1334
				1335	/*
				1336	* If the dedupe data matches, chop off the partial EOF block
				1337	* from the source file so we don't try to dedupe the partial
				1338	* EOF block.
				1339	*/
				1340	if (is_dedupe) {
				1341	*len &= ~blkmask;
				1342	} else if (*len & blkmask) {
				1343	/*
				1344	* The user is attempting to share a partial EOF block,
				1345	* if it's inside the destination EOF then reject it.
				1346	*/
				1347	if (pos_out + *len < i_size_read(inode_out)) {
				1348	ret = -EINVAL;
				1349	goto out_unlock;
				1350	}
				1351	}
				1352
				1353	/* Attach dquots to dest inode before changing block map */
				1354	ret = xfs_qm_dqattach(dest);
				1355	if (ret)
				1356	goto out_unlock;
				1357
				1358	/*
				1359	* Zero existing post-eof speculative preallocations in the destination
				1360	* file.
				1361	*/
				1362	ret = xfs_reflink_zero_posteof(dest, pos_out);
				1363	if (ret)
				1364	goto out_unlock;
				1365
				1366	/* Set flags and remap blocks. */
				1367	ret = xfs_reflink_set_inode_flag(src, dest);
				1368	if (ret)
				1369	goto out_unlock;
				1370
				1371	/*
				1372	* If pos_out > EOF, we may have dirtied blocks between EOF and
				1373	* pos_out. In that case, we need to extend the flush and unmap to cover
				1374	* from EOF to the end of the copy length.
				1375	*/
				1376	if (pos_out > XFS_ISIZE(dest)) {
				1377	loff_t flen = *len + (pos_out - XFS_ISIZE(dest));
				1378	ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
				1379	} else {
				1380	ret = xfs_flush_unmap_range(dest, pos_out, *len);
				1381	}
				1382	if (ret)
				1383	goto out_unlock;
				1384
				1385	/* If we're altering the file contents... */
				1386	if (!is_dedupe) {
				1387	/*
				1388	* ...update the timestamps (which will grab the ilock again
				1389	* from xfs_fs_dirty_inode, so we have to call it before we
				1390	* take the ilock).
				1391	*/
				1392	if (!(file_out->f_mode & FMODE_NOCMTIME)) {
				1393	ret = file_update_time(file_out);
				1394	if (ret)
				1395	goto out_unlock;
				1396	}
				1397
				1398	/*
				1399	* ...clear the security bits if the process is not being run
				1400	* by root. This keeps people from modifying setuid and setgid
				1401	* binaries.
				1402	*/
				1403	ret = file_remove_privs(file_out);
				1404	if (ret)
				1405	goto out_unlock;
				1406	}
				1407
				1408	return 1;
				1409	out_unlock:
				1410	xfs_reflink_remap_unlock(file_in, file_out);
				1411	return ret;
				1412	}
				1413
				1414	/*
				1415	* Link a range of blocks from one file to another.
				1416	*/
				1417	int
				1418	xfs_reflink_remap_range(
				1419	struct file *file_in,
				1420	loff_t pos_in,
				1421	struct file *file_out,
				1422	loff_t pos_out,
				1423	u64 len,
				1424	bool is_dedupe)
				1425	{
				1426	struct inode *inode_in = file_inode(file_in);
				1427	struct xfs_inode *src = XFS_I(inode_in);
				1428	struct inode *inode_out = file_inode(file_out);
				1429	struct xfs_inode *dest = XFS_I(inode_out);
				1430	struct xfs_mount *mp = src->i_mount;
				1431	xfs_fileoff_t sfsbno, dfsbno;
				1432	xfs_filblks_t fsblen;
				1433	xfs_extlen_t cowextsize;
				1434	ssize_t ret;
				1435
				1436	if (!xfs_sb_version_hasreflink(&mp->m_sb))
				1437	return -EOPNOTSUPP;
				1438
				1439	if (XFS_FORCED_SHUTDOWN(mp))
				1440	return -EIO;
				1441
				1442	/* Prepare and then clone file data. */
				1443	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
				1444	&len, is_dedupe);
				1445	if (ret <= 0)
				1446	return ret;
				1447
				1448	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
				1449
				1450	dfsbno = XFS_B_TO_FSBT(mp, pos_out);
				1451	sfsbno = XFS_B_TO_FSBT(mp, pos_in);
				1452	fsblen = XFS_B_TO_FSB(mp, len);
				1453	ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
				1454	pos_out + len);
				1455	if (ret)
				1456	goto out_unlock;
				1457
				1458	/*
				1459	* Carry the cowextsize hint from src to dest if we're sharing the
				1460	* entire source file to the entire destination file, the source file
				1461	* has a cowextsize hint, and the destination file does not.
				1462	*/
				1463	cowextsize = 0;
				1464	if (pos_in == 0 && len == i_size_read(inode_in) &&
				1465	(src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
				1466	pos_out == 0 && len >= i_size_read(inode_out) &&
				1467	!(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
				1468	cowextsize = src->i_d.di_cowextsize;
				1469
				1470	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
				1471	is_dedupe);
				1472
				1473	out_unlock:
				1474	xfs_reflink_remap_unlock(file_in, file_out);
				1475	if (ret)
				1476	trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
				1477	return ret;
				1478	}
				1479
				1480	/*
				1481	* The user wants to preemptively CoW all shared blocks in this file,
				1482	* which enables us to turn off the reflink flag. Iterate all
				1483	* extents which are not prealloc/delalloc to see which ranges are
				1484	* mentioned in the refcount tree, then read those blocks into the
				1485	* pagecache, dirty them, fsync them back out, and then we can update
				1486	* the inode flag. What happens if we run out of memory? :)
				1487	*/
				1488	STATIC int
				1489	xfs_reflink_dirty_extents(
				1490	struct xfs_inode *ip,
				1491	xfs_fileoff_t fbno,
				1492	xfs_filblks_t end,
				1493	xfs_off_t isize)
				1494	{
				1495	struct xfs_mount *mp = ip->i_mount;
				1496	xfs_agnumber_t agno;
				1497	xfs_agblock_t agbno;
				1498	xfs_extlen_t aglen;
				1499	xfs_agblock_t rbno;
				1500	xfs_extlen_t rlen;
				1501	xfs_off_t fpos;
				1502	xfs_off_t flen;
				1503	struct xfs_bmbt_irec map[2];
				1504	int nmaps;
				1505	int error = 0;
				1506
				1507	while (end - fbno > 0) {
				1508	nmaps = 1;
				1509	/*
				1510	* Look for extents in the file. Skip holes, delalloc, or
				1511	* unwritten extents; they can't be reflinked.
				1512	*/
				1513	error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
				1514	if (error)
				1515	goto out;
				1516	if (nmaps == 0)
				1517	break;
				1518	if (!xfs_bmap_is_real_extent(&map[0]))
				1519	goto next;
				1520
				1521	map[1] = map[0];
				1522	while (map[1].br_blockcount) {
				1523	agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
				1524	agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
				1525	aglen = map[1].br_blockcount;
				1526
				1527	error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
				1528	aglen, &rbno, &rlen, true);
				1529	if (error)
				1530	goto out;
				1531	if (rbno == NULLAGBLOCK)
				1532	break;
				1533
				1534	/* Dirty the pages */
				1535	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1536	fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
				1537	(rbno - agbno));
				1538	flen = XFS_FSB_TO_B(mp, rlen);
				1539	if (fpos + flen > isize)
				1540	flen = isize - fpos;
				1541	error = iomap_file_dirty(VFS_I(ip), fpos, flen,
				1542	&xfs_iomap_ops);
				1543	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1544	if (error)
				1545	goto out;
				1546
				1547	map[1].br_blockcount -= (rbno - agbno + rlen);
				1548	map[1].br_startoff += (rbno - agbno + rlen);
				1549	map[1].br_startblock += (rbno - agbno + rlen);
				1550	}
				1551
				1552	next:
				1553	fbno = map[0].br_startoff + map[0].br_blockcount;
				1554	}
				1555	out:
				1556	return error;
				1557	}
				1558
				1559	/* Does this inode need the reflink flag? */
				1560	int
				1561	xfs_reflink_inode_has_shared_extents(
				1562	struct xfs_trans *tp,
				1563	struct xfs_inode *ip,
				1564	bool *has_shared)
				1565	{
				1566	struct xfs_bmbt_irec got;
				1567	struct xfs_mount *mp = ip->i_mount;
				1568	struct xfs_ifork *ifp;
				1569	xfs_agnumber_t agno;
				1570	xfs_agblock_t agbno;
				1571	xfs_extlen_t aglen;
				1572	xfs_agblock_t rbno;
				1573	xfs_extlen_t rlen;
				1574	struct xfs_iext_cursor icur;
				1575	bool found;
				1576	int error;
				1577
				1578	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
				1579	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
				1580	error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
				1581	if (error)
				1582	return error;
				1583	}
				1584
				1585	*has_shared = false;
				1586	found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
				1587	while (found) {
				1588	if (isnullstartblock(got.br_startblock) \|\|
				1589	got.br_state != XFS_EXT_NORM)
				1590	goto next;
				1591	agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
				1592	agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
				1593	aglen = got.br_blockcount;
				1594
				1595	error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
				1596	&rbno, &rlen, false);
				1597	if (error)
				1598	return error;
				1599	/* Is there still a shared block here? */
				1600	if (rbno != NULLAGBLOCK) {
				1601	*has_shared = true;
				1602	return 0;
				1603	}
				1604	next:
				1605	found = xfs_iext_next_extent(ifp, &icur, &got);
				1606	}
				1607
				1608	return 0;
				1609	}
				1610
				1611	/*
				1612	* Clear the inode reflink flag if there are no shared extents.
				1613	*
				1614	* The caller is responsible for joining the inode to the transaction passed in.
				1615	* The inode will be joined to the transaction that is returned to the caller.
				1616	*/
				1617	int
				1618	xfs_reflink_clear_inode_flag(
				1619	struct xfs_inode *ip,
				1620	struct xfs_trans **tpp)
				1621	{
				1622	bool needs_flag;
				1623	int error = 0;
				1624
				1625	ASSERT(xfs_is_reflink_inode(ip));
				1626
				1627	error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
				1628	if (error \|\| needs_flag)
				1629	return error;
				1630
				1631	/*
				1632	* We didn't find any shared blocks so turn off the reflink flag.
				1633	* First, get rid of any leftover CoW mappings.
				1634	*/
				1635	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
				1636	if (error)
				1637	return error;
				1638
				1639	/* Clear the inode flag. */
				1640	trace_xfs_reflink_unset_inode_flag(ip);
				1641	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
				1642	xfs_inode_clear_cowblocks_tag(ip);
				1643	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
				1644
				1645	return error;
				1646	}
				1647
				1648	/*
				1649	* Clear the inode reflink flag if there are no shared extents and the size
				1650	* hasn't changed.
				1651	*/
				1652	STATIC int
				1653	xfs_reflink_try_clear_inode_flag(
				1654	struct xfs_inode *ip)
				1655	{
				1656	struct xfs_mount *mp = ip->i_mount;
				1657	struct xfs_trans *tp;
				1658	int error = 0;
				1659
				1660	/* Start a rolling transaction to remove the mappings */
				1661	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
				1662	if (error)
				1663	return error;
				1664
				1665	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1666	xfs_trans_ijoin(tp, ip, 0);
				1667
				1668	error = xfs_reflink_clear_inode_flag(ip, &tp);
				1669	if (error)
				1670	goto cancel;
				1671
				1672	error = xfs_trans_commit(tp);
				1673	if (error)
				1674	goto out;
				1675
				1676	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1677	return 0;
				1678	cancel:
				1679	xfs_trans_cancel(tp);
				1680	out:
				1681	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1682	return error;
				1683	}
				1684
				1685	/*
				1686	* Pre-COW all shared blocks within a given byte range of a file and turn off
				1687	* the reflink flag if we unshare all of the file's blocks.
				1688	*/
				1689	int
				1690	xfs_reflink_unshare(
				1691	struct xfs_inode *ip,
				1692	xfs_off_t offset,
				1693	xfs_off_t len)
				1694	{
				1695	struct xfs_mount *mp = ip->i_mount;
				1696	xfs_fileoff_t fbno;
				1697	xfs_filblks_t end;
				1698	xfs_off_t isize;
				1699	int error;
				1700
				1701	if (!xfs_is_reflink_inode(ip))
				1702	return 0;
				1703
				1704	trace_xfs_reflink_unshare(ip, offset, len);
				1705
				1706	inode_dio_wait(VFS_I(ip));
				1707
				1708	/* Try to CoW the selected ranges */
				1709	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1710	fbno = XFS_B_TO_FSBT(mp, offset);
				1711	isize = i_size_read(VFS_I(ip));
				1712	end = XFS_B_TO_FSB(mp, offset + len);
				1713	error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
				1714	if (error)
				1715	goto out_unlock;
				1716	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1717
				1718	/* Wait for the IO to finish */
				1719	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
				1720	if (error)
				1721	goto out;
				1722
				1723	/* Turn off the reflink flag if possible. */
				1724	error = xfs_reflink_try_clear_inode_flag(ip);
				1725	if (error)
				1726	goto out;
				1727
				1728	return 0;
				1729
				1730	out_unlock:
				1731	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1732	out:
				1733	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
				1734	return error;
				1735	}