Blame - src/kernel/linux/v4.19/fs/xfs/xfs_inode.c - T800

blob: 5ed84d6c70597f02e0accd3bc907e1773d1e2293 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (c) 2000-2006 Silicon Graphics, Inc.
				4	* All Rights Reserved.
				5	*/
				6	#include <linux/log2.h>
				7	#include <linux/iversion.h>
				8
				9	#include "xfs.h"
				10	#include "xfs_fs.h"
				11	#include "xfs_shared.h"
				12	#include "xfs_format.h"
				13	#include "xfs_log_format.h"
				14	#include "xfs_trans_resv.h"
				15	#include "xfs_sb.h"
				16	#include "xfs_mount.h"
				17	#include "xfs_defer.h"
				18	#include "xfs_inode.h"
				19	#include "xfs_da_format.h"
				20	#include "xfs_da_btree.h"
				21	#include "xfs_dir2.h"
				22	#include "xfs_attr_sf.h"
				23	#include "xfs_attr.h"
				24	#include "xfs_trans_space.h"
				25	#include "xfs_trans.h"
				26	#include "xfs_buf_item.h"
				27	#include "xfs_inode_item.h"
				28	#include "xfs_ialloc.h"
				29	#include "xfs_bmap.h"
				30	#include "xfs_bmap_util.h"
				31	#include "xfs_errortag.h"
				32	#include "xfs_error.h"
				33	#include "xfs_quota.h"
				34	#include "xfs_filestream.h"
				35	#include "xfs_cksum.h"
				36	#include "xfs_trace.h"
				37	#include "xfs_icache.h"
				38	#include "xfs_symlink.h"
				39	#include "xfs_trans_priv.h"
				40	#include "xfs_log.h"
				41	#include "xfs_bmap_btree.h"
				42	#include "xfs_reflink.h"
				43	#include "xfs_dir2_priv.h"
				44
				45	kmem_zone_t *xfs_inode_zone;
				46
				47	/*
				48	* Used in xfs_itruncate_extents(). This is the maximum number of extents
				49	* freed from a file in a single transaction.
				50	*/
				51	#define XFS_ITRUNC_MAX_EXTENTS 2
				52
				53	STATIC int xfs_iflush_int(struct xfs_inode , struct xfs_buf );
				54	STATIC int xfs_iunlink(struct xfs_trans , struct xfs_inode );
				55	STATIC int xfs_iunlink_remove(struct xfs_trans , struct xfs_inode );
				56
				57	/*
				58	* helper function to extract extent size hint from inode
				59	*/
				60	xfs_extlen_t
				61	xfs_get_extsz_hint(
				62	struct xfs_inode *ip)
				63	{
				64	if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
				65	return ip->i_d.di_extsize;
				66	if (XFS_IS_REALTIME_INODE(ip))
				67	return ip->i_mount->m_sb.sb_rextsize;
				68	return 0;
				69	}
				70
				71	/*
				72	* Helper function to extract CoW extent size hint from inode.
				73	* Between the extent size hint and the CoW extent size hint, we
				74	* return the greater of the two. If the value is zero (automatic),
				75	* use the default size.
				76	*/
				77	xfs_extlen_t
				78	xfs_get_cowextsz_hint(
				79	struct xfs_inode *ip)
				80	{
				81	xfs_extlen_t a, b;
				82
				83	a = 0;
				84	if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
				85	a = ip->i_d.di_cowextsize;
				86	b = xfs_get_extsz_hint(ip);
				87
				88	a = max(a, b);
				89	if (a == 0)
				90	return XFS_DEFAULT_COWEXTSZ_HINT;
				91	return a;
				92	}
				93
				94	/*
				95	* These two are wrapper routines around the xfs_ilock() routine used to
				96	* centralize some grungy code. They are used in places that wish to lock the
				97	* inode solely for reading the extents. The reason these places can't just
				98	* call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to
				99	* bringing in of the extents from disk for a file in b-tree format. If the
				100	* inode is in b-tree format, then we need to lock the inode exclusively until
				101	* the extents are read in. Locking it exclusively all the time would limit
				102	* our parallelism unnecessarily, though. What we do instead is check to see
				103	* if the extents have been read in yet, and only lock the inode exclusively
				104	* if they have not.
				105	*
				106	* The functions return a value which should be given to the corresponding
				107	* xfs_iunlock() call.
				108	*/
				109	uint
				110	xfs_ilock_data_map_shared(
				111	struct xfs_inode *ip)
				112	{
				113	uint lock_mode = XFS_ILOCK_SHARED;
				114
				115	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
				116	(ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
				117	lock_mode = XFS_ILOCK_EXCL;
				118	xfs_ilock(ip, lock_mode);
				119	return lock_mode;
				120	}
				121
				122	uint
				123	xfs_ilock_attr_map_shared(
				124	struct xfs_inode *ip)
				125	{
				126	uint lock_mode = XFS_ILOCK_SHARED;
				127
				128	if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
				129	(ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
				130	lock_mode = XFS_ILOCK_EXCL;
				131	xfs_ilock(ip, lock_mode);
				132	return lock_mode;
				133	}
				134
				135	/*
				136	* In addition to i_rwsem in the VFS inode, the xfs inode contains 2
				137	* multi-reader locks: i_mmap_lock and the i_lock. This routine allows
				138	* various combinations of the locks to be obtained.
				139	*
				140	* The 3 locks should always be ordered so that the IO lock is obtained first,
				141	* the mmap lock second and the ilock last in order to prevent deadlock.
				142	*
				143	* Basic locking order:
				144	*
				145	* i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
				146	*
				147	* mmap_sem locking order:
				148	*
				149	* i_rwsem -> page lock -> mmap_sem
				150	* mmap_sem -> i_mmap_lock -> page_lock
				151	*
				152	* The difference in mmap_sem locking order mean that we cannot hold the
				153	* i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
				154	* fault in pages during copy in/out (for buffered IO) or require the mmap_sem
				155	* in get_user_pages() to map the user pages into the kernel address space for
				156	* direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
				157	* page faults already hold the mmap_sem.
				158	*
				159	* Hence to serialise fully against both syscall and mmap based IO, we need to
				160	* take both the i_rwsem and the i_mmap_lock. These locks should only be both
				161	* taken in places where we need to invalidate the page cache in a race
				162	* free manner (e.g. truncate, hole punch and other extent manipulation
				163	* functions).
				164	*/
				165	void
				166	xfs_ilock(
				167	xfs_inode_t *ip,
				168	uint lock_flags)
				169	{
				170	trace_xfs_ilock(ip, lock_flags, _RET_IP_);
				171
				172	/*
				173	* You can't set both SHARED and EXCL for the same lock,
				174	* and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
				175	* and XFS_ILOCK_EXCL are valid values to set in lock_flags.
				176	*/
				177	ASSERT((lock_flags & (XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL)) !=
				178	(XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL));
				179	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL)) !=
				180	(XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL));
				181	ASSERT((lock_flags & (XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL)) !=
				182	(XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL));
				183	ASSERT((lock_flags & ~(XFS_LOCK_MASK \| XFS_LOCK_SUBCLASS_MASK)) == 0);
				184
				185	if (lock_flags & XFS_IOLOCK_EXCL) {
				186	down_write_nested(&VFS_I(ip)->i_rwsem,
				187	XFS_IOLOCK_DEP(lock_flags));
				188	} else if (lock_flags & XFS_IOLOCK_SHARED) {
				189	down_read_nested(&VFS_I(ip)->i_rwsem,
				190	XFS_IOLOCK_DEP(lock_flags));
				191	}
				192
				193	if (lock_flags & XFS_MMAPLOCK_EXCL)
				194	mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
				195	else if (lock_flags & XFS_MMAPLOCK_SHARED)
				196	mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
				197
				198	if (lock_flags & XFS_ILOCK_EXCL)
				199	mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
				200	else if (lock_flags & XFS_ILOCK_SHARED)
				201	mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
				202	}
				203
				204	/*
				205	* This is just like xfs_ilock(), except that the caller
				206	* is guaranteed not to sleep. It returns 1 if it gets
				207	* the requested locks and 0 otherwise. If the IO lock is
				208	* obtained but the inode lock cannot be, then the IO lock
				209	* is dropped before returning.
				210	*
				211	* ip -- the inode being locked
				212	* lock_flags -- this parameter indicates the inode's locks to be
				213	* to be locked. See the comment for xfs_ilock() for a list
				214	* of valid values.
				215	*/
				216	int
				217	xfs_ilock_nowait(
				218	xfs_inode_t *ip,
				219	uint lock_flags)
				220	{
				221	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
				222
				223	/*
				224	* You can't set both SHARED and EXCL for the same lock,
				225	* and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
				226	* and XFS_ILOCK_EXCL are valid values to set in lock_flags.
				227	*/
				228	ASSERT((lock_flags & (XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL)) !=
				229	(XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL));
				230	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL)) !=
				231	(XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL));
				232	ASSERT((lock_flags & (XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL)) !=
				233	(XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL));
				234	ASSERT((lock_flags & ~(XFS_LOCK_MASK \| XFS_LOCK_SUBCLASS_MASK)) == 0);
				235
				236	if (lock_flags & XFS_IOLOCK_EXCL) {
				237	if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
				238	goto out;
				239	} else if (lock_flags & XFS_IOLOCK_SHARED) {
				240	if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
				241	goto out;
				242	}
				243
				244	if (lock_flags & XFS_MMAPLOCK_EXCL) {
				245	if (!mrtryupdate(&ip->i_mmaplock))
				246	goto out_undo_iolock;
				247	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
				248	if (!mrtryaccess(&ip->i_mmaplock))
				249	goto out_undo_iolock;
				250	}
				251
				252	if (lock_flags & XFS_ILOCK_EXCL) {
				253	if (!mrtryupdate(&ip->i_lock))
				254	goto out_undo_mmaplock;
				255	} else if (lock_flags & XFS_ILOCK_SHARED) {
				256	if (!mrtryaccess(&ip->i_lock))
				257	goto out_undo_mmaplock;
				258	}
				259	return 1;
				260
				261	out_undo_mmaplock:
				262	if (lock_flags & XFS_MMAPLOCK_EXCL)
				263	mrunlock_excl(&ip->i_mmaplock);
				264	else if (lock_flags & XFS_MMAPLOCK_SHARED)
				265	mrunlock_shared(&ip->i_mmaplock);
				266	out_undo_iolock:
				267	if (lock_flags & XFS_IOLOCK_EXCL)
				268	up_write(&VFS_I(ip)->i_rwsem);
				269	else if (lock_flags & XFS_IOLOCK_SHARED)
				270	up_read(&VFS_I(ip)->i_rwsem);
				271	out:
				272	return 0;
				273	}
				274
				275	/*
				276	* xfs_iunlock() is used to drop the inode locks acquired with
				277	* xfs_ilock() and xfs_ilock_nowait(). The caller must pass
				278	* in the flags given to xfs_ilock() or xfs_ilock_nowait() so
				279	* that we know which locks to drop.
				280	*
				281	* ip -- the inode being unlocked
				282	* lock_flags -- this parameter indicates the inode's locks to be
				283	* to be unlocked. See the comment for xfs_ilock() for a list
				284	* of valid values for this parameter.
				285	*
				286	*/
				287	void
				288	xfs_iunlock(
				289	xfs_inode_t *ip,
				290	uint lock_flags)
				291	{
				292	/*
				293	* You can't set both SHARED and EXCL for the same lock,
				294	* and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
				295	* and XFS_ILOCK_EXCL are valid values to set in lock_flags.
				296	*/
				297	ASSERT((lock_flags & (XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL)) !=
				298	(XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL));
				299	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL)) !=
				300	(XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL));
				301	ASSERT((lock_flags & (XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL)) !=
				302	(XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL));
				303	ASSERT((lock_flags & ~(XFS_LOCK_MASK \| XFS_LOCK_SUBCLASS_MASK)) == 0);
				304	ASSERT(lock_flags != 0);
				305
				306	if (lock_flags & XFS_IOLOCK_EXCL)
				307	up_write(&VFS_I(ip)->i_rwsem);
				308	else if (lock_flags & XFS_IOLOCK_SHARED)
				309	up_read(&VFS_I(ip)->i_rwsem);
				310
				311	if (lock_flags & XFS_MMAPLOCK_EXCL)
				312	mrunlock_excl(&ip->i_mmaplock);
				313	else if (lock_flags & XFS_MMAPLOCK_SHARED)
				314	mrunlock_shared(&ip->i_mmaplock);
				315
				316	if (lock_flags & XFS_ILOCK_EXCL)
				317	mrunlock_excl(&ip->i_lock);
				318	else if (lock_flags & XFS_ILOCK_SHARED)
				319	mrunlock_shared(&ip->i_lock);
				320
				321	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
				322	}
				323
				324	/*
				325	* give up write locks. the i/o lock cannot be held nested
				326	* if it is being demoted.
				327	*/
				328	void
				329	xfs_ilock_demote(
				330	xfs_inode_t *ip,
				331	uint lock_flags)
				332	{
				333	ASSERT(lock_flags & (XFS_IOLOCK_EXCL\|XFS_MMAPLOCK_EXCL\|XFS_ILOCK_EXCL));
				334	ASSERT((lock_flags &
				335	~(XFS_IOLOCK_EXCL\|XFS_MMAPLOCK_EXCL\|XFS_ILOCK_EXCL)) == 0);
				336
				337	if (lock_flags & XFS_ILOCK_EXCL)
				338	mrdemote(&ip->i_lock);
				339	if (lock_flags & XFS_MMAPLOCK_EXCL)
				340	mrdemote(&ip->i_mmaplock);
				341	if (lock_flags & XFS_IOLOCK_EXCL)
				342	downgrade_write(&VFS_I(ip)->i_rwsem);
				343
				344	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
				345	}
				346
				347	#if defined(DEBUG) \|\| defined(XFS_WARN)
				348	int
				349	xfs_isilocked(
				350	xfs_inode_t *ip,
				351	uint lock_flags)
				352	{
				353	if (lock_flags & (XFS_ILOCK_EXCL\|XFS_ILOCK_SHARED)) {
				354	if (!(lock_flags & XFS_ILOCK_SHARED))
				355	return !!ip->i_lock.mr_writer;
				356	return rwsem_is_locked(&ip->i_lock.mr_lock);
				357	}
				358
				359	if (lock_flags & (XFS_MMAPLOCK_EXCL\|XFS_MMAPLOCK_SHARED)) {
				360	if (!(lock_flags & XFS_MMAPLOCK_SHARED))
				361	return !!ip->i_mmaplock.mr_writer;
				362	return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
				363	}
				364
				365	if (lock_flags & (XFS_IOLOCK_EXCL\|XFS_IOLOCK_SHARED)) {
				366	if (!(lock_flags & XFS_IOLOCK_SHARED))
				367	return !debug_locks \|\|
				368	lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
				369	return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
				370	}
				371
				372	ASSERT(0);
				373	return 0;
				374	}
				375	#endif
				376
				377	/*
				378	* xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
				379	* DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
				380	* when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
				381	* errors and warnings.
				382	*/
				383	#if (defined(DEBUG) \|\| defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
				384	static bool
				385	xfs_lockdep_subclass_ok(
				386	int subclass)
				387	{
				388	return subclass < MAX_LOCKDEP_SUBCLASSES;
				389	}
				390	#else
				391	#define xfs_lockdep_subclass_ok(subclass) (true)
				392	#endif
				393
				394	/*
				395	* Bump the subclass so xfs_lock_inodes() acquires each lock with a different
				396	* value. This can be called for any type of inode lock combination, including
				397	* parent locking. Care must be taken to ensure we don't overrun the subclass
				398	* storage fields in the class mask we build.
				399	*/
				400	static inline int
				401	xfs_lock_inumorder(int lock_mode, int subclass)
				402	{
				403	int class = 0;
				404
				405	ASSERT(!(lock_mode & (XFS_ILOCK_PARENT \| XFS_ILOCK_RTBITMAP \|
				406	XFS_ILOCK_RTSUM)));
				407	ASSERT(xfs_lockdep_subclass_ok(subclass));
				408
				409	if (lock_mode & (XFS_IOLOCK_SHARED\|XFS_IOLOCK_EXCL)) {
				410	ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
				411	class += subclass << XFS_IOLOCK_SHIFT;
				412	}
				413
				414	if (lock_mode & (XFS_MMAPLOCK_SHARED\|XFS_MMAPLOCK_EXCL)) {
				415	ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
				416	class += subclass << XFS_MMAPLOCK_SHIFT;
				417	}
				418
				419	if (lock_mode & (XFS_ILOCK_SHARED\|XFS_ILOCK_EXCL)) {
				420	ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
				421	class += subclass << XFS_ILOCK_SHIFT;
				422	}
				423
				424	return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) \| class;
				425	}
				426
				427	/*
				428	* The following routine will lock n inodes in exclusive mode. We assume the
				429	* caller calls us with the inodes in i_ino order.
				430	*
				431	* We need to detect deadlock where an inode that we lock is in the AIL and we
				432	* start waiting for another inode that is locked by a thread in a long running
				433	* transaction (such as truncate). This can result in deadlock since the long
				434	* running trans might need to wait for the inode we just locked in order to
				435	* push the tail and free space in the log.
				436	*
				437	* xfs_lock_inodes() can only be used to lock one type of lock at a time -
				438	* the iolock, the mmaplock or the ilock, but not more than one at a time. If we
				439	* lock more than one at a time, lockdep will report false positives saying we
				440	* have violated locking orders.
				441	*/
				442	static void
				443	xfs_lock_inodes(
				444	xfs_inode_t **ips,
				445	int inodes,
				446	uint lock_mode)
				447	{
				448	int attempts = 0, i, j, try_lock;
				449	xfs_log_item_t *lp;
				450
				451	/*
				452	* Currently supports between 2 and 5 inodes with exclusive locking. We
				453	* support an arbitrary depth of locking here, but absolute limits on
				454	* inodes depend on the the type of locking and the limits placed by
				455	* lockdep annotations in xfs_lock_inumorder. These are all checked by
				456	* the asserts.
				457	*/
				458	ASSERT(ips && inodes >= 2 && inodes <= 5);
				459	ASSERT(lock_mode & (XFS_IOLOCK_EXCL \| XFS_MMAPLOCK_EXCL \|
				460	XFS_ILOCK_EXCL));
				461	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED \| XFS_MMAPLOCK_SHARED \|
				462	XFS_ILOCK_SHARED)));
				463	ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) \|\|
				464	inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
				465	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) \|\|
				466	inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
				467
				468	if (lock_mode & XFS_IOLOCK_EXCL) {
				469	ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL \| XFS_ILOCK_EXCL)));
				470	} else if (lock_mode & XFS_MMAPLOCK_EXCL)
				471	ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
				472
				473	try_lock = 0;
				474	i = 0;
				475	again:
				476	for (; i < inodes; i++) {
				477	ASSERT(ips[i]);
				478
				479	if (i && (ips[i] == ips[i - 1])) /* Already locked */
				480	continue;
				481
				482	/*
				483	* If try_lock is not set yet, make sure all locked inodes are
				484	* not in the AIL. If any are, set try_lock to be used later.
				485	*/
				486	if (!try_lock) {
				487	for (j = (i - 1); j >= 0 && !try_lock; j--) {
				488	lp = (xfs_log_item_t *)ips[j]->i_itemp;
				489	if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
				490	try_lock++;
				491	}
				492	}
				493
				494	/*
				495	* If any of the previous locks we have locked is in the AIL,
				496	* we must TRY to get the second and subsequent locks. If
				497	* we can't get any, we must release all we have
				498	* and try again.
				499	*/
				500	if (!try_lock) {
				501	xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
				502	continue;
				503	}
				504
				505	/* try_lock means we have an inode locked that is in the AIL. */
				506	ASSERT(i != 0);
				507	if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
				508	continue;
				509
				510	/*
				511	* Unlock all previous guys and try again. xfs_iunlock will try
				512	* to push the tail if the inode is in the AIL.
				513	*/
				514	attempts++;
				515	for (j = i - 1; j >= 0; j--) {
				516	/*
				517	* Check to see if we've already unlocked this one. Not
				518	* the first one going back, and the inode ptr is the
				519	* same.
				520	*/
				521	if (j != (i - 1) && ips[j] == ips[j + 1])
				522	continue;
				523
				524	xfs_iunlock(ips[j], lock_mode);
				525	}
				526
				527	if ((attempts % 5) == 0) {
				528	delay(1); /* Don't just spin the CPU */
				529	}
				530	i = 0;
				531	try_lock = 0;
				532	goto again;
				533	}
				534	}
				535
				536	/*
				537	* xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
				538	* the mmaplock or the ilock, but not more than one type at a time. If we lock
				539	* more than one at a time, lockdep will report false positives saying we have
				540	* violated locking orders. The iolock must be double-locked separately since
				541	* we use i_rwsem for that. We now support taking one lock EXCL and the other
				542	* SHARED.
				543	*/
				544	void
				545	xfs_lock_two_inodes(
				546	struct xfs_inode *ip0,
				547	uint ip0_mode,
				548	struct xfs_inode *ip1,
				549	uint ip1_mode)
				550	{
				551	struct xfs_inode *temp;
				552	uint mode_temp;
				553	int attempts = 0;
				554	xfs_log_item_t *lp;
				555
				556	ASSERT(hweight32(ip0_mode) == 1);
				557	ASSERT(hweight32(ip1_mode) == 1);
				558	ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED\|XFS_IOLOCK_EXCL)));
				559	ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED\|XFS_IOLOCK_EXCL)));
				560	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED\|XFS_MMAPLOCK_EXCL)) \|\|
				561	!(ip0_mode & (XFS_ILOCK_SHARED\|XFS_ILOCK_EXCL)));
				562	ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED\|XFS_MMAPLOCK_EXCL)) \|\|
				563	!(ip1_mode & (XFS_ILOCK_SHARED\|XFS_ILOCK_EXCL)));
				564	ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED\|XFS_MMAPLOCK_EXCL)) \|\|
				565	!(ip0_mode & (XFS_ILOCK_SHARED\|XFS_ILOCK_EXCL)));
				566	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED\|XFS_MMAPLOCK_EXCL)) \|\|
				567	!(ip1_mode & (XFS_ILOCK_SHARED\|XFS_ILOCK_EXCL)));
				568
				569	ASSERT(ip0->i_ino != ip1->i_ino);
				570
				571	if (ip0->i_ino > ip1->i_ino) {
				572	temp = ip0;
				573	ip0 = ip1;
				574	ip1 = temp;
				575	mode_temp = ip0_mode;
				576	ip0_mode = ip1_mode;
				577	ip1_mode = mode_temp;
				578	}
				579
				580	again:
				581	xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));
				582
				583	/*
				584	* If the first lock we have locked is in the AIL, we must TRY to get
				585	* the second lock. If we can't get it, we must release the first one
				586	* and try again.
				587	*/
				588	lp = (xfs_log_item_t *)ip0->i_itemp;
				589	if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
				590	if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
				591	xfs_iunlock(ip0, ip0_mode);
				592	if ((++attempts % 5) == 0)
				593	delay(1); /* Don't just spin the CPU */
				594	goto again;
				595	}
				596	} else {
				597	xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
				598	}
				599	}
				600
				601	void
				602	__xfs_iflock(
				603	struct xfs_inode *ip)
				604	{
				605	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
				606	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
				607
				608	do {
				609	prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
				610	if (xfs_isiflocked(ip))
				611	io_schedule();
				612	} while (!xfs_iflock_nowait(ip));
				613
				614	finish_wait(wq, &wait.wq_entry);
				615	}
				616
				617	STATIC uint
				618	_xfs_dic2xflags(
				619	uint16_t di_flags,
				620	uint64_t di_flags2,
				621	bool has_attr)
				622	{
				623	uint flags = 0;
				624
				625	if (di_flags & XFS_DIFLAG_ANY) {
				626	if (di_flags & XFS_DIFLAG_REALTIME)
				627	flags \|= FS_XFLAG_REALTIME;
				628	if (di_flags & XFS_DIFLAG_PREALLOC)
				629	flags \|= FS_XFLAG_PREALLOC;
				630	if (di_flags & XFS_DIFLAG_IMMUTABLE)
				631	flags \|= FS_XFLAG_IMMUTABLE;
				632	if (di_flags & XFS_DIFLAG_APPEND)
				633	flags \|= FS_XFLAG_APPEND;
				634	if (di_flags & XFS_DIFLAG_SYNC)
				635	flags \|= FS_XFLAG_SYNC;
				636	if (di_flags & XFS_DIFLAG_NOATIME)
				637	flags \|= FS_XFLAG_NOATIME;
				638	if (di_flags & XFS_DIFLAG_NODUMP)
				639	flags \|= FS_XFLAG_NODUMP;
				640	if (di_flags & XFS_DIFLAG_RTINHERIT)
				641	flags \|= FS_XFLAG_RTINHERIT;
				642	if (di_flags & XFS_DIFLAG_PROJINHERIT)
				643	flags \|= FS_XFLAG_PROJINHERIT;
				644	if (di_flags & XFS_DIFLAG_NOSYMLINKS)
				645	flags \|= FS_XFLAG_NOSYMLINKS;
				646	if (di_flags & XFS_DIFLAG_EXTSIZE)
				647	flags \|= FS_XFLAG_EXTSIZE;
				648	if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
				649	flags \|= FS_XFLAG_EXTSZINHERIT;
				650	if (di_flags & XFS_DIFLAG_NODEFRAG)
				651	flags \|= FS_XFLAG_NODEFRAG;
				652	if (di_flags & XFS_DIFLAG_FILESTREAM)
				653	flags \|= FS_XFLAG_FILESTREAM;
				654	}
				655
				656	if (di_flags2 & XFS_DIFLAG2_ANY) {
				657	if (di_flags2 & XFS_DIFLAG2_DAX)
				658	flags \|= FS_XFLAG_DAX;
				659	if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
				660	flags \|= FS_XFLAG_COWEXTSIZE;
				661	}
				662
				663	if (has_attr)
				664	flags \|= FS_XFLAG_HASATTR;
				665
				666	return flags;
				667	}
				668
				669	uint
				670	xfs_ip2xflags(
				671	struct xfs_inode *ip)
				672	{
				673	struct xfs_icdinode *dic = &ip->i_d;
				674
				675	return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
				676	}
				677
				678	/*
				679	* Lookups up an inode from "name". If ci_name is not NULL, then a CI match
				680	* is allowed, otherwise it has to be an exact match. If a CI match is found,
				681	* ci_name->name will point to a the actual name (caller must free) or
				682	* will be set to NULL if an exact match is found.
				683	*/
				684	int
				685	xfs_lookup(
				686	xfs_inode_t *dp,
				687	struct xfs_name *name,
				688	xfs_inode_t **ipp,
				689	struct xfs_name *ci_name)
				690	{
				691	xfs_ino_t inum;
				692	int error;
				693
				694	trace_xfs_lookup(dp, name);
				695
				696	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
				697	return -EIO;
				698
				699	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
				700	if (error)
				701	goto out_unlock;
				702
				703	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
				704	if (error)
				705	goto out_free_name;
				706
				707	return 0;
				708
				709	out_free_name:
				710	if (ci_name)
				711	kmem_free(ci_name->name);
				712	out_unlock:
				713	*ipp = NULL;
				714	return error;
				715	}
				716
				717	/*
				718	* Allocate an inode on disk and return a copy of its in-core version.
				719	* The in-core inode is locked exclusively. Set mode, nlink, and rdev
				720	* appropriately within the inode. The uid and gid for the inode are
				721	* set according to the contents of the given cred structure.
				722	*
				723	* Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
				724	* has a free inode available, call xfs_iget() to obtain the in-core
				725	* version of the allocated inode. Finally, fill in the inode and
				726	* log its initial contents. In this case, ialloc_context would be
				727	* set to NULL.
				728	*
				729	* If xfs_dialloc() does not have an available inode, it will replenish
				730	* its supply by doing an allocation. Since we can only do one
				731	* allocation within a transaction without deadlocks, we must commit
				732	* the current transaction before returning the inode itself.
				733	* In this case, therefore, we will set ialloc_context and return.
				734	* The caller should then commit the current transaction, start a new
				735	* transaction, and call xfs_ialloc() again to actually get the inode.
				736	*
				737	* To ensure that some other process does not grab the inode that
				738	* was allocated during the first call to xfs_ialloc(), this routine
				739	* also returns the [locked] bp pointing to the head of the freelist
				740	* as ialloc_context. The caller should hold this buffer across
				741	* the commit and pass it back into this routine on the second call.
				742	*
				743	* If we are allocating quota inodes, we do not have a parent inode
				744	* to attach to or associate with (i.e. pip == NULL) because they
				745	* are not linked into the directory structure - they are attached
				746	* directly to the superblock - and so have no parent.
				747	*/
				748	static int
				749	xfs_ialloc(
				750	xfs_trans_t *tp,
				751	xfs_inode_t *pip,
				752	umode_t mode,
				753	xfs_nlink_t nlink,
				754	dev_t rdev,
				755	prid_t prid,
				756	xfs_buf_t **ialloc_context,
				757	xfs_inode_t **ipp)
				758	{
				759	struct xfs_mount *mp = tp->t_mountp;
				760	xfs_ino_t ino;
				761	xfs_inode_t *ip;
				762	uint flags;
				763	int error;
				764	struct timespec64 tv;
				765	struct inode *inode;
				766
				767	/*
				768	* Call the space management code to pick
				769	* the on-disk inode to be allocated.
				770	*/
				771	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode,
				772	ialloc_context, &ino);
				773	if (error)
				774	return error;
				775	if (*ialloc_context \|\| ino == NULLFSINO) {
				776	*ipp = NULL;
				777	return 0;
				778	}
				779	ASSERT(*ialloc_context == NULL);
				780
				781	/*
				782	* Protect against obviously corrupt allocation btree records. Later
				783	* xfs_iget checks will catch re-allocation of other active in-memory
				784	* and on-disk inodes. If we don't catch reallocating the parent inode
				785	* here we will deadlock in xfs_iget() so we have to do these checks
				786	* first.
				787	*/
				788	if ((pip && ino == pip->i_ino) \|\| !xfs_verify_dir_ino(mp, ino)) {
				789	xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
				790	return -EFSCORRUPTED;
				791	}
				792
				793	/*
				794	* Get the in-core inode with the lock held exclusively.
				795	* This is because we're setting fields here we need
				796	* to prevent others from looking at until we're done.
				797	*/
				798	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
				799	XFS_ILOCK_EXCL, &ip);
				800	if (error)
				801	return error;
				802	ASSERT(ip != NULL);
				803	inode = VFS_I(ip);
				804
				805	/*
				806	* We always convert v1 inodes to v2 now - we only support filesystems
				807	* with >= v2 inode capability, so there is no reason for ever leaving
				808	* an inode in v1 format.
				809	*/
				810	if (ip->i_d.di_version == 1)
				811	ip->i_d.di_version = 2;
				812
				813	inode->i_mode = mode;
				814	set_nlink(inode, nlink);
				815	ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
				816	ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
				817	inode->i_rdev = rdev;
				818	xfs_set_projid(ip, prid);
				819
				820	if (pip && XFS_INHERIT_GID(pip)) {
				821	ip->i_d.di_gid = pip->i_d.di_gid;
				822	if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
				823	inode->i_mode \|= S_ISGID;
				824	}
				825
				826	/*
				827	* If the group ID of the new file does not match the effective group
				828	* ID or one of the supplementary group IDs, the S_ISGID bit is cleared
				829	* (and only if the irix_sgid_inherit compatibility variable is set).
				830	*/
				831	if ((irix_sgid_inherit) &&
				832	(inode->i_mode & S_ISGID) &&
				833	(!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
				834	inode->i_mode &= ~S_ISGID;
				835
				836	ip->i_d.di_size = 0;
				837	ip->i_d.di_nextents = 0;
				838	ASSERT(ip->i_d.di_nblocks == 0);
				839
				840	tv = current_time(inode);
				841	inode->i_mtime = tv;
				842	inode->i_atime = tv;
				843	inode->i_ctime = tv;
				844
				845	ip->i_d.di_extsize = 0;
				846	ip->i_d.di_dmevmask = 0;
				847	ip->i_d.di_dmstate = 0;
				848	ip->i_d.di_flags = 0;
				849
				850	if (ip->i_d.di_version == 3) {
				851	inode_set_iversion(inode, 1);
				852	ip->i_d.di_flags2 = 0;
				853	ip->i_d.di_cowextsize = 0;
				854	ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
				855	ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
				856	}
				857
				858
				859	flags = XFS_ILOG_CORE;
				860	switch (mode & S_IFMT) {
				861	case S_IFIFO:
				862	case S_IFCHR:
				863	case S_IFBLK:
				864	case S_IFSOCK:
				865	ip->i_d.di_format = XFS_DINODE_FMT_DEV;
				866	ip->i_df.if_flags = 0;
				867	flags \|= XFS_ILOG_DEV;
				868	break;
				869	case S_IFREG:
				870	case S_IFDIR:
				871	if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
				872	uint di_flags = 0;
				873
				874	if (S_ISDIR(mode)) {
				875	if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
				876	di_flags \|= XFS_DIFLAG_RTINHERIT;
				877	if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
				878	di_flags \|= XFS_DIFLAG_EXTSZINHERIT;
				879	ip->i_d.di_extsize = pip->i_d.di_extsize;
				880	}
				881	if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
				882	di_flags \|= XFS_DIFLAG_PROJINHERIT;
				883	} else if (S_ISREG(mode)) {
				884	if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
				885	di_flags \|= XFS_DIFLAG_REALTIME;
				886	if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
				887	di_flags \|= XFS_DIFLAG_EXTSIZE;
				888	ip->i_d.di_extsize = pip->i_d.di_extsize;
				889	}
				890	}
				891	if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
				892	xfs_inherit_noatime)
				893	di_flags \|= XFS_DIFLAG_NOATIME;
				894	if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
				895	xfs_inherit_nodump)
				896	di_flags \|= XFS_DIFLAG_NODUMP;
				897	if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
				898	xfs_inherit_sync)
				899	di_flags \|= XFS_DIFLAG_SYNC;
				900	if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
				901	xfs_inherit_nosymlinks)
				902	di_flags \|= XFS_DIFLAG_NOSYMLINKS;
				903	if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
				904	xfs_inherit_nodefrag)
				905	di_flags \|= XFS_DIFLAG_NODEFRAG;
				906	if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
				907	di_flags \|= XFS_DIFLAG_FILESTREAM;
				908
				909	ip->i_d.di_flags \|= di_flags;
				910	}
				911	if (pip &&
				912	(pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
				913	pip->i_d.di_version == 3 &&
				914	ip->i_d.di_version == 3) {
				915	uint64_t di_flags2 = 0;
				916
				917	if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
				918	di_flags2 \|= XFS_DIFLAG2_COWEXTSIZE;
				919	ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
				920	}
				921	if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
				922	di_flags2 \|= XFS_DIFLAG2_DAX;
				923
				924	ip->i_d.di_flags2 \|= di_flags2;
				925	}
				926	/* FALLTHROUGH */
				927	case S_IFLNK:
				928	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
				929	ip->i_df.if_flags = XFS_IFEXTENTS;
				930	ip->i_df.if_bytes = 0;
				931	ip->i_df.if_u1.if_root = NULL;
				932	break;
				933	default:
				934	ASSERT(0);
				935	}
				936	/*
				937	* Attribute fork settings for new inode.
				938	*/
				939	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
				940	ip->i_d.di_anextents = 0;
				941
				942	/*
				943	* Log the new values stuffed into the inode.
				944	*/
				945	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
				946	xfs_trans_log_inode(tp, ip, flags);
				947
				948	/* now that we have an i_mode we can setup the inode structure */
				949	xfs_setup_inode(ip);
				950
				951	*ipp = ip;
				952	return 0;
				953	}
				954
				955	/*
				956	* Allocates a new inode from disk and return a pointer to the
				957	* incore copy. This routine will internally commit the current
				958	* transaction and allocate a new one if the Space Manager needed
				959	* to do an allocation to replenish the inode free-list.
				960	*
				961	* This routine is designed to be called from xfs_create and
				962	* xfs_create_dir.
				963	*
				964	*/
				965	int
				966	xfs_dir_ialloc(
				967	xfs_trans_t *tpp, / input: current transaction;
				968	output: may be a new transaction. */
				969	xfs_inode_t dp, / directory within whose allocate
				970	the inode. */
				971	umode_t mode,
				972	xfs_nlink_t nlink,
				973	dev_t rdev,
				974	prid_t prid, /* project id */
				975	xfs_inode_t *ipp) / pointer to inode; it will be
				976	locked. */
				977	{
				978	xfs_trans_t *tp;
				979	xfs_inode_t *ip;
				980	xfs_buf_t *ialloc_context = NULL;
				981	int code;
				982	void *dqinfo;
				983	uint tflags;
				984
				985	tp = *tpp;
				986	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
				987
				988	/*
				989	* xfs_ialloc will return a pointer to an incore inode if
				990	* the Space Manager has an available inode on the free
				991	* list. Otherwise, it will do an allocation and replenish
				992	* the freelist. Since we can only do one allocation per
				993	* transaction without deadlocks, we will need to commit the
				994	* current transaction and start a new one. We will then
				995	* need to call xfs_ialloc again to get the inode.
				996	*
				997	* If xfs_ialloc did an allocation to replenish the freelist,
				998	* it returns the bp containing the head of the freelist as
				999	* ialloc_context. We will hold a lock on it across the
				1000	* transaction commit so that no other process can steal
				1001	* the inode(s) that we've just allocated.
				1002	*/
				1003	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context,
				1004	&ip);
				1005
				1006	/*
				1007	* Return an error if we were unable to allocate a new inode.
				1008	* This should only happen if we run out of space on disk or
				1009	* encounter a disk error.
				1010	*/
				1011	if (code) {
				1012	*ipp = NULL;
				1013	return code;
				1014	}
				1015	if (!ialloc_context && !ip) {
				1016	*ipp = NULL;
				1017	return -ENOSPC;
				1018	}
				1019
				1020	/*
				1021	* If the AGI buffer is non-NULL, then we were unable to get an
				1022	* inode in one operation. We need to commit the current
				1023	* transaction and call xfs_ialloc() again. It is guaranteed
				1024	* to succeed the second time.
				1025	*/
				1026	if (ialloc_context) {
				1027	/*
				1028	* Normally, xfs_trans_commit releases all the locks.
				1029	* We call bhold to hang on to the ialloc_context across
				1030	* the commit. Holding this buffer prevents any other
				1031	* processes from doing any allocations in this
				1032	* allocation group.
				1033	*/
				1034	xfs_trans_bhold(tp, ialloc_context);
				1035
				1036	/*
				1037	* We want the quota changes to be associated with the next
				1038	* transaction, NOT this one. So, detach the dqinfo from this
				1039	* and attach it to the next transaction.
				1040	*/
				1041	dqinfo = NULL;
				1042	tflags = 0;
				1043	if (tp->t_dqinfo) {
				1044	dqinfo = (void *)tp->t_dqinfo;
				1045	tp->t_dqinfo = NULL;
				1046	tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
				1047	tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
				1048	}
				1049
				1050	code = xfs_trans_roll(&tp);
				1051
				1052	/*
				1053	* Re-attach the quota info that we detached from prev trx.
				1054	*/
				1055	if (dqinfo) {
				1056	tp->t_dqinfo = dqinfo;
				1057	tp->t_flags \|= tflags;
				1058	}
				1059
				1060	if (code) {
				1061	xfs_buf_relse(ialloc_context);
				1062	*tpp = tp;
				1063	*ipp = NULL;
				1064	return code;
				1065	}
				1066	xfs_trans_bjoin(tp, ialloc_context);
				1067
				1068	/*
				1069	* Call ialloc again. Since we've locked out all
				1070	* other allocations in this allocation group,
				1071	* this call should always succeed.
				1072	*/
				1073	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
				1074	&ialloc_context, &ip);
				1075
				1076	/*
				1077	* If we get an error at this point, return to the caller
				1078	* so that the current transaction can be aborted.
				1079	*/
				1080	if (code) {
				1081	*tpp = tp;
				1082	*ipp = NULL;
				1083	return code;
				1084	}
				1085	ASSERT(!ialloc_context && ip);
				1086
				1087	}
				1088
				1089	*ipp = ip;
				1090	*tpp = tp;
				1091
				1092	return 0;
				1093	}
				1094
				1095	/*
				1096	* Decrement the link count on an inode & log the change. If this causes the
				1097	* link count to go to zero, move the inode to AGI unlinked list so that it can
				1098	* be freed when the last active reference goes away via xfs_inactive().
				1099	*/
				1100	static int /* error */
				1101	xfs_droplink(
				1102	xfs_trans_t *tp,
				1103	xfs_inode_t *ip)
				1104	{
				1105	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
				1106
				1107	drop_nlink(VFS_I(ip));
				1108	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				1109
				1110	if (VFS_I(ip)->i_nlink)
				1111	return 0;
				1112
				1113	return xfs_iunlink(tp, ip);
				1114	}
				1115
				1116	/*
				1117	* Increment the link count on an inode & log the change.
				1118	*/
				1119	static int
				1120	xfs_bumplink(
				1121	xfs_trans_t *tp,
				1122	xfs_inode_t *ip)
				1123	{
				1124	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
				1125
				1126	ASSERT(ip->i_d.di_version > 1);
				1127	inc_nlink(VFS_I(ip));
				1128	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				1129	return 0;
				1130	}
				1131
				1132	int
				1133	xfs_create(
				1134	xfs_inode_t *dp,
				1135	struct xfs_name *name,
				1136	umode_t mode,
				1137	dev_t rdev,
				1138	xfs_inode_t **ipp)
				1139	{
				1140	int is_dir = S_ISDIR(mode);
				1141	struct xfs_mount *mp = dp->i_mount;
				1142	struct xfs_inode *ip = NULL;
				1143	struct xfs_trans *tp = NULL;
				1144	int error;
				1145	bool unlock_dp_on_error = false;
				1146	prid_t prid;
				1147	struct xfs_dquot *udqp = NULL;
				1148	struct xfs_dquot *gdqp = NULL;
				1149	struct xfs_dquot *pdqp = NULL;
				1150	struct xfs_trans_res *tres;
				1151	uint resblks;
				1152
				1153	trace_xfs_create(dp, name);
				1154
				1155	if (XFS_FORCED_SHUTDOWN(mp))
				1156	return -EIO;
				1157
				1158	prid = xfs_get_initial_prid(dp);
				1159
				1160	/*
				1161	* Make sure that we have allocated dquot(s) on disk.
				1162	*/
				1163	error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
				1164	xfs_kgid_to_gid(current_fsgid()), prid,
				1165	XFS_QMOPT_QUOTALL \| XFS_QMOPT_INHERIT,
				1166	&udqp, &gdqp, &pdqp);
				1167	if (error)
				1168	return error;
				1169
				1170	if (is_dir) {
				1171	resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
				1172	tres = &M_RES(mp)->tr_mkdir;
				1173	} else {
				1174	resblks = XFS_CREATE_SPACE_RES(mp, name->len);
				1175	tres = &M_RES(mp)->tr_create;
				1176	}
				1177
				1178	/*
				1179	* Initially assume that the file does not exist and
				1180	* reserve the resources for that case. If that is not
				1181	* the case we'll drop the one we have and get a more
				1182	* appropriate transaction later.
				1183	*/
				1184	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
				1185	if (error == -ENOSPC) {
				1186	/* flush outstanding delalloc blocks and retry */
				1187	xfs_flush_inodes(mp);
				1188	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
				1189	}
				1190	if (error)
				1191	goto out_release_inode;
				1192
				1193	xfs_ilock(dp, XFS_ILOCK_EXCL \| XFS_ILOCK_PARENT);
				1194	unlock_dp_on_error = true;
				1195
				1196	/*
				1197	* Reserve disk quota and the inode.
				1198	*/
				1199	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
				1200	pdqp, resblks, 1, 0);
				1201	if (error)
				1202	goto out_trans_cancel;
				1203
				1204	/*
				1205	* A newly created regular or special file just has one directory
				1206	* entry pointing to them, but a directory also the "." entry
				1207	* pointing to itself.
				1208	*/
				1209	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip);
				1210	if (error)
				1211	goto out_trans_cancel;
				1212
				1213	/*
				1214	* Now we join the directory inode to the transaction. We do not do it
				1215	* earlier because xfs_dir_ialloc might commit the previous transaction
				1216	* (and release all the locks). An error from here on will result in
				1217	* the transaction cancel unlocking dp so don't do it explicitly in the
				1218	* error path.
				1219	*/
				1220	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
				1221	unlock_dp_on_error = false;
				1222
				1223	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
				1224	resblks ?
				1225	resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
				1226	if (error) {
				1227	ASSERT(error != -ENOSPC);
				1228	goto out_trans_cancel;
				1229	}
				1230	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				1231	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
				1232
				1233	if (is_dir) {
				1234	error = xfs_dir_init(tp, ip, dp);
				1235	if (error)
				1236	goto out_trans_cancel;
				1237
				1238	error = xfs_bumplink(tp, dp);
				1239	if (error)
				1240	goto out_trans_cancel;
				1241	}
				1242
				1243	/*
				1244	* If this is a synchronous mount, make sure that the
				1245	* create transaction goes to disk before returning to
				1246	* the user.
				1247	*/
				1248	if (mp->m_flags & (XFS_MOUNT_WSYNC\|XFS_MOUNT_DIRSYNC))
				1249	xfs_trans_set_sync(tp);
				1250
				1251	/*
				1252	* Attach the dquot(s) to the inodes and modify them incore.
				1253	* These ids of the inode couldn't have changed since the new
				1254	* inode has been locked ever since it was created.
				1255	*/
				1256	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
				1257
				1258	error = xfs_trans_commit(tp);
				1259	if (error)
				1260	goto out_release_inode;
				1261
				1262	xfs_qm_dqrele(udqp);
				1263	xfs_qm_dqrele(gdqp);
				1264	xfs_qm_dqrele(pdqp);
				1265
				1266	*ipp = ip;
				1267	return 0;
				1268
				1269	out_trans_cancel:
				1270	xfs_trans_cancel(tp);
				1271	out_release_inode:
				1272	/*
				1273	* Wait until after the current transaction is aborted to finish the
				1274	* setup of the inode and release the inode. This prevents recursive
				1275	* transactions and deadlocks from xfs_inactive.
				1276	*/
				1277	if (ip) {
				1278	xfs_finish_inode_setup(ip);
				1279	xfs_irele(ip);
				1280	}
				1281
				1282	xfs_qm_dqrele(udqp);
				1283	xfs_qm_dqrele(gdqp);
				1284	xfs_qm_dqrele(pdqp);
				1285
				1286	if (unlock_dp_on_error)
				1287	xfs_iunlock(dp, XFS_ILOCK_EXCL);
				1288	return error;
				1289	}
				1290
				1291	int
				1292	xfs_create_tmpfile(
				1293	struct xfs_inode *dp,
				1294	umode_t mode,
				1295	struct xfs_inode **ipp)
				1296	{
				1297	struct xfs_mount *mp = dp->i_mount;
				1298	struct xfs_inode *ip = NULL;
				1299	struct xfs_trans *tp = NULL;
				1300	int error;
				1301	prid_t prid;
				1302	struct xfs_dquot *udqp = NULL;
				1303	struct xfs_dquot *gdqp = NULL;
				1304	struct xfs_dquot *pdqp = NULL;
				1305	struct xfs_trans_res *tres;
				1306	uint resblks;
				1307
				1308	if (XFS_FORCED_SHUTDOWN(mp))
				1309	return -EIO;
				1310
				1311	prid = xfs_get_initial_prid(dp);
				1312
				1313	/*
				1314	* Make sure that we have allocated dquot(s) on disk.
				1315	*/
				1316	error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
				1317	xfs_kgid_to_gid(current_fsgid()), prid,
				1318	XFS_QMOPT_QUOTALL \| XFS_QMOPT_INHERIT,
				1319	&udqp, &gdqp, &pdqp);
				1320	if (error)
				1321	return error;
				1322
				1323	resblks = XFS_IALLOC_SPACE_RES(mp);
				1324	tres = &M_RES(mp)->tr_create_tmpfile;
				1325
				1326	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
				1327	if (error)
				1328	goto out_release_inode;
				1329
				1330	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
				1331	pdqp, resblks, 1, 0);
				1332	if (error)
				1333	goto out_trans_cancel;
				1334
				1335	error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
				1336	if (error)
				1337	goto out_trans_cancel;
				1338
				1339	if (mp->m_flags & XFS_MOUNT_WSYNC)
				1340	xfs_trans_set_sync(tp);
				1341
				1342	/*
				1343	* Attach the dquot(s) to the inodes and modify them incore.
				1344	* These ids of the inode couldn't have changed since the new
				1345	* inode has been locked ever since it was created.
				1346	*/
				1347	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
				1348
				1349	error = xfs_iunlink(tp, ip);
				1350	if (error)
				1351	goto out_trans_cancel;
				1352
				1353	error = xfs_trans_commit(tp);
				1354	if (error)
				1355	goto out_release_inode;
				1356
				1357	xfs_qm_dqrele(udqp);
				1358	xfs_qm_dqrele(gdqp);
				1359	xfs_qm_dqrele(pdqp);
				1360
				1361	*ipp = ip;
				1362	return 0;
				1363
				1364	out_trans_cancel:
				1365	xfs_trans_cancel(tp);
				1366	out_release_inode:
				1367	/*
				1368	* Wait until after the current transaction is aborted to finish the
				1369	* setup of the inode and release the inode. This prevents recursive
				1370	* transactions and deadlocks from xfs_inactive.
				1371	*/
				1372	if (ip) {
				1373	xfs_finish_inode_setup(ip);
				1374	xfs_irele(ip);
				1375	}
				1376
				1377	xfs_qm_dqrele(udqp);
				1378	xfs_qm_dqrele(gdqp);
				1379	xfs_qm_dqrele(pdqp);
				1380
				1381	return error;
				1382	}
				1383
				1384	int
				1385	xfs_link(
				1386	xfs_inode_t *tdp,
				1387	xfs_inode_t *sip,
				1388	struct xfs_name *target_name)
				1389	{
				1390	xfs_mount_t *mp = tdp->i_mount;
				1391	xfs_trans_t *tp;
				1392	int error;
				1393	int resblks;
				1394
				1395	trace_xfs_link(tdp, target_name);
				1396
				1397	ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
				1398
				1399	if (XFS_FORCED_SHUTDOWN(mp))
				1400	return -EIO;
				1401
				1402	error = xfs_qm_dqattach(sip);
				1403	if (error)
				1404	goto std_return;
				1405
				1406	error = xfs_qm_dqattach(tdp);
				1407	if (error)
				1408	goto std_return;
				1409
				1410	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
				1411	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
				1412	if (error == -ENOSPC) {
				1413	resblks = 0;
				1414	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
				1415	}
				1416	if (error)
				1417	goto std_return;
				1418
				1419	xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
				1420
				1421	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
				1422	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
				1423
				1424	/*
				1425	* If we are using project inheritance, we only allow hard link
				1426	* creation in our tree when the project IDs are the same; else
				1427	* the tree quota mechanism could be circumvented.
				1428	*/
				1429	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
				1430	(xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
				1431	error = -EXDEV;
				1432	goto error_return;
				1433	}
				1434
				1435	if (!resblks) {
				1436	error = xfs_dir_canenter(tp, tdp, target_name);
				1437	if (error)
				1438	goto error_return;
				1439	}
				1440
				1441	/*
				1442	* Handle initial link state of O_TMPFILE inode
				1443	*/
				1444	if (VFS_I(sip)->i_nlink == 0) {
				1445	error = xfs_iunlink_remove(tp, sip);
				1446	if (error)
				1447	goto error_return;
				1448	}
				1449
				1450	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
				1451	resblks);
				1452	if (error)
				1453	goto error_return;
				1454	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				1455	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
				1456
				1457	error = xfs_bumplink(tp, sip);
				1458	if (error)
				1459	goto error_return;
				1460
				1461	/*
				1462	* If this is a synchronous mount, make sure that the
				1463	* link transaction goes to disk before returning to
				1464	* the user.
				1465	*/
				1466	if (mp->m_flags & (XFS_MOUNT_WSYNC\|XFS_MOUNT_DIRSYNC))
				1467	xfs_trans_set_sync(tp);
				1468
				1469	return xfs_trans_commit(tp);
				1470
				1471	error_return:
				1472	xfs_trans_cancel(tp);
				1473	std_return:
				1474	return error;
				1475	}
				1476
				1477	/* Clear the reflink flag and the cowblocks tag if possible. */
				1478	static void
				1479	xfs_itruncate_clear_reflink_flags(
				1480	struct xfs_inode *ip)
				1481	{
				1482	struct xfs_ifork *dfork;
				1483	struct xfs_ifork *cfork;
				1484
				1485	if (!xfs_is_reflink_inode(ip))
				1486	return;
				1487	dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
				1488	cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				1489	if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
				1490	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
				1491	if (cfork->if_bytes == 0)
				1492	xfs_inode_clear_cowblocks_tag(ip);
				1493	}
				1494
				1495	/*
				1496	* Free up the underlying blocks past new_size. The new size must be smaller
				1497	* than the current size. This routine can be used both for the attribute and
				1498	* data fork, and does not modify the inode size, which is left to the caller.
				1499	*
				1500	* The transaction passed to this routine must have made a permanent log
				1501	* reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
				1502	* given transaction and start new ones, so make sure everything involved in
				1503	* the transaction is tidy before calling here. Some transaction will be
				1504	* returned to the caller to be committed. The incoming transaction must
				1505	* already include the inode, and both inode locks must be held exclusively.
				1506	* The inode must also be "held" within the transaction. On return the inode
				1507	* will be "held" within the returned transaction. This routine does NOT
				1508	* require any disk space to be reserved for it within the transaction.
				1509	*
				1510	* If we get an error, we must return with the inode locked and linked into the
				1511	* current transaction. This keeps things simple for the higher level code,
				1512	* because it always knows that the inode is locked and held in the transaction
				1513	* that returns to it whether errors occur or not. We don't mark the inode
				1514	* dirty on error so that transactions can be easily aborted if possible.
				1515	*/
				1516	int
				1517	xfs_itruncate_extents_flags(
				1518	struct xfs_trans **tpp,
				1519	struct xfs_inode *ip,
				1520	int whichfork,
				1521	xfs_fsize_t new_size,
				1522	int flags)
				1523	{
				1524	struct xfs_mount *mp = ip->i_mount;
				1525	struct xfs_trans tp = tpp;
				1526	xfs_fileoff_t first_unmap_block;
				1527	xfs_fileoff_t last_block;
				1528	xfs_filblks_t unmap_len;
				1529	int error = 0;
				1530	int done = 0;
				1531
				1532	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
				1533	ASSERT(!atomic_read(&VFS_I(ip)->i_count) \|\|
				1534	xfs_isilocked(ip, XFS_IOLOCK_EXCL));
				1535	ASSERT(new_size <= XFS_ISIZE(ip));
				1536	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
				1537	ASSERT(ip->i_itemp != NULL);
				1538	ASSERT(ip->i_itemp->ili_lock_flags == 0);
				1539	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
				1540
				1541	trace_xfs_itruncate_extents_start(ip, new_size);
				1542
				1543	flags \|= xfs_bmapi_aflag(whichfork);
				1544
				1545	/*
				1546	* Since it is possible for space to become allocated beyond
				1547	* the end of the file (in a crash where the space is allocated
				1548	* but the inode size is not yet updated), simply remove any
				1549	* blocks which show up between the new EOF and the maximum
				1550	* possible file size. If the first block to be removed is
				1551	* beyond the maximum file size (ie it is the same as last_block),
				1552	* then there is nothing to do.
				1553	*/
				1554	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
				1555	last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
				1556	if (first_unmap_block == last_block)
				1557	return 0;
				1558
				1559	ASSERT(first_unmap_block < last_block);
				1560	unmap_len = last_block - first_unmap_block + 1;
				1561	while (!done) {
				1562	ASSERT(tp->t_firstblock == NULLFSBLOCK);
				1563	error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
				1564	XFS_ITRUNC_MAX_EXTENTS, &done);
				1565	if (error)
				1566	goto out;
				1567
				1568	/*
				1569	* Duplicate the transaction that has the permanent
				1570	* reservation and commit the old transaction.
				1571	*/
				1572	error = xfs_defer_finish(&tp);
				1573	if (error)
				1574	goto out;
				1575
				1576	error = xfs_trans_roll_inode(&tp, ip);
				1577	if (error)
				1578	goto out;
				1579	}
				1580
				1581	if (whichfork == XFS_DATA_FORK) {
				1582	/* Remove all pending CoW reservations. */
				1583	error = xfs_reflink_cancel_cow_blocks(ip, &tp,
				1584	first_unmap_block, last_block, true);
				1585	if (error)
				1586	goto out;
				1587
				1588	xfs_itruncate_clear_reflink_flags(ip);
				1589	}
				1590
				1591	/*
				1592	* Always re-log the inode so that our permanent transaction can keep
				1593	* on rolling it forward in the log.
				1594	*/
				1595	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				1596
				1597	trace_xfs_itruncate_extents_end(ip, new_size);
				1598
				1599	out:
				1600	*tpp = tp;
				1601	return error;
				1602	}
				1603
				1604	int
				1605	xfs_release(
				1606	xfs_inode_t *ip)
				1607	{
				1608	xfs_mount_t *mp = ip->i_mount;
				1609	int error;
				1610
				1611	if (!S_ISREG(VFS_I(ip)->i_mode) \|\| (VFS_I(ip)->i_mode == 0))
				1612	return 0;
				1613
				1614	/* If this is a read-only mount, don't do this (would generate I/O) */
				1615	if (mp->m_flags & XFS_MOUNT_RDONLY)
				1616	return 0;
				1617
				1618	if (!XFS_FORCED_SHUTDOWN(mp)) {
				1619	int truncated;
				1620
				1621	/*
				1622	* If we previously truncated this file and removed old data
				1623	* in the process, we want to initiate "early" writeout on
				1624	* the last close. This is an attempt to combat the notorious
				1625	* NULL files problem which is particularly noticeable from a
				1626	* truncate down, buffered (re-)write (delalloc), followed by
				1627	* a crash. What we are effectively doing here is
				1628	* significantly reducing the time window where we'd otherwise
				1629	* be exposed to that problem.
				1630	*/
				1631	truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
				1632	if (truncated) {
				1633	xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
				1634	if (ip->i_delayed_blks > 0) {
				1635	error = filemap_flush(VFS_I(ip)->i_mapping);
				1636	if (error)
				1637	return error;
				1638	}
				1639	}
				1640	}
				1641
				1642	if (VFS_I(ip)->i_nlink == 0)
				1643	return 0;
				1644
				1645	if (xfs_can_free_eofblocks(ip, false)) {
				1646
				1647	/*
				1648	* Check if the inode is being opened, written and closed
				1649	* frequently and we have delayed allocation blocks outstanding
				1650	* (e.g. streaming writes from the NFS server), truncating the
				1651	* blocks past EOF will cause fragmentation to occur.
				1652	*
				1653	* In this case don't do the truncation, but we have to be
				1654	* careful how we detect this case. Blocks beyond EOF show up as
				1655	* i_delayed_blks even when the inode is clean, so we need to
				1656	* truncate them away first before checking for a dirty release.
				1657	* Hence on the first dirty close we will still remove the
				1658	* speculative allocation, but after that we will leave it in
				1659	* place.
				1660	*/
				1661	if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
				1662	return 0;
				1663	/*
				1664	* If we can't get the iolock just skip truncating the blocks
				1665	* past EOF because we could deadlock with the mmap_sem
				1666	* otherwise. We'll get another chance to drop them once the
				1667	* last reference to the inode is dropped, so we'll never leak
				1668	* blocks permanently.
				1669	*/
				1670	if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
				1671	error = xfs_free_eofblocks(ip);
				1672	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
				1673	if (error)
				1674	return error;
				1675	}
				1676
				1677	/* delalloc blocks after truncation means it really is dirty */
				1678	if (ip->i_delayed_blks)
				1679	xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
				1680	}
				1681	return 0;
				1682	}
				1683
				1684	/*
				1685	* xfs_inactive_truncate
				1686	*
				1687	* Called to perform a truncate when an inode becomes unlinked.
				1688	*/
				1689	STATIC int
				1690	xfs_inactive_truncate(
				1691	struct xfs_inode *ip)
				1692	{
				1693	struct xfs_mount *mp = ip->i_mount;
				1694	struct xfs_trans *tp;
				1695	int error;
				1696
				1697	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
				1698	if (error) {
				1699	ASSERT(XFS_FORCED_SHUTDOWN(mp));
				1700	return error;
				1701	}
				1702	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1703	xfs_trans_ijoin(tp, ip, 0);
				1704
				1705	/*
				1706	* Log the inode size first to prevent stale data exposure in the event
				1707	* of a system crash before the truncate completes. See the related
				1708	* comment in xfs_vn_setattr_size() for details.
				1709	*/
				1710	ip->i_d.di_size = 0;
				1711	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				1712
				1713	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
				1714	if (error)
				1715	goto error_trans_cancel;
				1716
				1717	ASSERT(ip->i_d.di_nextents == 0);
				1718
				1719	error = xfs_trans_commit(tp);
				1720	if (error)
				1721	goto error_unlock;
				1722
				1723	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1724	return 0;
				1725
				1726	error_trans_cancel:
				1727	xfs_trans_cancel(tp);
				1728	error_unlock:
				1729	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1730	return error;
				1731	}
				1732
				1733	/*
				1734	* xfs_inactive_ifree()
				1735	*
				1736	* Perform the inode free when an inode is unlinked.
				1737	*/
				1738	STATIC int
				1739	xfs_inactive_ifree(
				1740	struct xfs_inode *ip)
				1741	{
				1742	struct xfs_mount *mp = ip->i_mount;
				1743	struct xfs_trans *tp;
				1744	int error;
				1745
				1746	/*
				1747	* We try to use a per-AG reservation for any block needed by the finobt
				1748	* tree, but as the finobt feature predates the per-AG reservation
				1749	* support a degraded file system might not have enough space for the
				1750	* reservation at mount time. In that case try to dip into the reserved
				1751	* pool and pray.
				1752	*
				1753	* Send a warning if the reservation does happen to fail, as the inode
				1754	* now remains allocated and sits on the unlinked list until the fs is
				1755	* repaired.
				1756	*/
				1757	if (unlikely(mp->m_finobt_nores)) {
				1758	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
				1759	XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
				1760	&tp);
				1761	} else {
				1762	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
				1763	}
				1764	if (error) {
				1765	if (error == -ENOSPC) {
				1766	xfs_warn_ratelimited(mp,
				1767	"Failed to remove inode(s) from unlinked list. "
				1768	"Please free space, unmount and run xfs_repair.");
				1769	} else {
				1770	ASSERT(XFS_FORCED_SHUTDOWN(mp));
				1771	}
				1772	return error;
				1773	}
				1774
				1775	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1776	xfs_trans_ijoin(tp, ip, 0);
				1777
				1778	error = xfs_ifree(tp, ip);
				1779	if (error) {
				1780	/*
				1781	* If we fail to free the inode, shut down. The cancel
				1782	* might do that, we need to make sure. Otherwise the
				1783	* inode might be lost for a long time or forever.
				1784	*/
				1785	if (!XFS_FORCED_SHUTDOWN(mp)) {
				1786	xfs_notice(mp, "%s: xfs_ifree returned error %d",
				1787	__func__, error);
				1788	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
				1789	}
				1790	xfs_trans_cancel(tp);
				1791	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1792	return error;
				1793	}
				1794
				1795	/*
				1796	* Credit the quota account(s). The inode is gone.
				1797	*/
				1798	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
				1799
				1800	/*
				1801	* Just ignore errors at this point. There is nothing we can do except
				1802	* to try to keep going. Make sure it's not a silent error.
				1803	*/
				1804	error = xfs_trans_commit(tp);
				1805	if (error)
				1806	xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
				1807	__func__, error);
				1808
				1809	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1810	return 0;
				1811	}
				1812
				1813	/*
				1814	* xfs_inactive
				1815	*
				1816	* This is called when the vnode reference count for the vnode
				1817	* goes to zero. If the file has been unlinked, then it must
				1818	* now be truncated. Also, we clear all of the read-ahead state
				1819	* kept for the inode here since the file is now closed.
				1820	*/
				1821	void
				1822	xfs_inactive(
				1823	xfs_inode_t *ip)
				1824	{
				1825	struct xfs_mount *mp;
				1826	int error;
				1827	int truncate = 0;
				1828
				1829	/*
				1830	* If the inode is already free, then there can be nothing
				1831	* to clean up here.
				1832	*/
				1833	if (VFS_I(ip)->i_mode == 0) {
				1834	ASSERT(ip->i_df.if_broot_bytes == 0);
				1835	return;
				1836	}
				1837
				1838	mp = ip->i_mount;
				1839	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
				1840
				1841	/* If this is a read-only mount, don't do this (would generate I/O) */
				1842	if (mp->m_flags & XFS_MOUNT_RDONLY)
				1843	return;
				1844
				1845	/* Try to clean out the cow blocks if there are any. */
				1846	if (xfs_inode_has_cow_data(ip))
				1847	xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
				1848
				1849	if (VFS_I(ip)->i_nlink != 0) {
				1850	/*
				1851	* force is true because we are evicting an inode from the
				1852	* cache. Post-eof blocks must be freed, lest we end up with
				1853	* broken free space accounting.
				1854	*
				1855	* Note: don't bother with iolock here since lockdep complains
				1856	* about acquiring it in reclaim context. We have the only
				1857	* reference to the inode at this point anyways.
				1858	*/
				1859	if (xfs_can_free_eofblocks(ip, true))
				1860	xfs_free_eofblocks(ip);
				1861
				1862	return;
				1863	}
				1864
				1865	if (S_ISREG(VFS_I(ip)->i_mode) &&
				1866	(ip->i_d.di_size != 0 \|\| XFS_ISIZE(ip) != 0 \|\|
				1867	ip->i_d.di_nextents > 0 \|\| ip->i_delayed_blks > 0))
				1868	truncate = 1;
				1869
				1870	error = xfs_qm_dqattach(ip);
				1871	if (error)
				1872	return;
				1873
				1874	if (S_ISLNK(VFS_I(ip)->i_mode))
				1875	error = xfs_inactive_symlink(ip);
				1876	else if (truncate)
				1877	error = xfs_inactive_truncate(ip);
				1878	if (error)
				1879	return;
				1880
				1881	/*
				1882	* If there are attributes associated with the file then blow them away
				1883	* now. The code calls a routine that recursively deconstructs the
				1884	* attribute fork. If also blows away the in-core attribute fork.
				1885	*/
				1886	if (XFS_IFORK_Q(ip)) {
				1887	error = xfs_attr_inactive(ip);
				1888	if (error)
				1889	return;
				1890	}
				1891
				1892	ASSERT(!ip->i_afp);
				1893	ASSERT(ip->i_d.di_anextents == 0);
				1894	ASSERT(ip->i_d.di_forkoff == 0);
				1895
				1896	/*
				1897	* Free the inode.
				1898	*/
				1899	error = xfs_inactive_ifree(ip);
				1900	if (error)
				1901	return;
				1902
				1903	/*
				1904	* Release the dquots held by inode, if any.
				1905	*/
				1906	xfs_qm_dqdetach(ip);
				1907	}
				1908
				1909	/*
				1910	* This is called when the inode's link count has gone to 0 or we are creating
				1911	* a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
				1912	*
				1913	* We place the on-disk inode on a list in the AGI. It will be pulled from this
				1914	* list when the inode is freed.
				1915	*/
				1916	STATIC int
				1917	xfs_iunlink(
				1918	struct xfs_trans *tp,
				1919	struct xfs_inode *ip)
				1920	{
				1921	xfs_mount_t *mp = tp->t_mountp;
				1922	xfs_agi_t *agi;
				1923	xfs_dinode_t *dip;
				1924	xfs_buf_t *agibp;
				1925	xfs_buf_t *ibp;
				1926	xfs_agino_t agino;
				1927	short bucket_index;
				1928	int offset;
				1929	int error;
				1930
				1931	ASSERT(VFS_I(ip)->i_nlink == 0);
				1932	ASSERT(VFS_I(ip)->i_mode != 0);
				1933
				1934	/*
				1935	* Get the agi buffer first. It ensures lock ordering
				1936	* on the list.
				1937	*/
				1938	error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
				1939	if (error)
				1940	return error;
				1941	agi = XFS_BUF_TO_AGI(agibp);
				1942
				1943	/*
				1944	* Get the index into the agi hash table for the
				1945	* list this inode will go on.
				1946	*/
				1947	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
				1948	ASSERT(agino != 0);
				1949	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
				1950	ASSERT(agi->agi_unlinked[bucket_index]);
				1951	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
				1952
				1953	if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
				1954	/*
				1955	* There is already another inode in the bucket we need
				1956	* to add ourselves to. Add us at the front of the list.
				1957	* Here we put the head pointer into our next pointer,
				1958	* and then we fall through to point the head at us.
				1959	*/
				1960	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
				1961	0, 0);
				1962	if (error)
				1963	return error;
				1964
				1965	ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
				1966	dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
				1967	offset = ip->i_imap.im_boffset +
				1968	offsetof(xfs_dinode_t, di_next_unlinked);
				1969
				1970	/* need to recalc the inode CRC if appropriate */
				1971	xfs_dinode_calc_crc(mp, dip);
				1972
				1973	xfs_trans_inode_buf(tp, ibp);
				1974	xfs_trans_log_buf(tp, ibp, offset,
				1975	(offset + sizeof(xfs_agino_t) - 1));
				1976	xfs_inobp_check(mp, ibp);
				1977	}
				1978
				1979	/*
				1980	* Point the bucket head pointer at the inode being inserted.
				1981	*/
				1982	ASSERT(agino != 0);
				1983	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
				1984	offset = offsetof(xfs_agi_t, agi_unlinked) +
				1985	(sizeof(xfs_agino_t) * bucket_index);
				1986	xfs_trans_log_buf(tp, agibp, offset,
				1987	(offset + sizeof(xfs_agino_t) - 1));
				1988	return 0;
				1989	}
				1990
				1991	/*
				1992	* Pull the on-disk inode from the AGI unlinked list.
				1993	*/
				1994	STATIC int
				1995	xfs_iunlink_remove(
				1996	xfs_trans_t *tp,
				1997	xfs_inode_t *ip)
				1998	{
				1999	xfs_ino_t next_ino;
				2000	xfs_mount_t *mp;
				2001	xfs_agi_t *agi;
				2002	xfs_dinode_t *dip;
				2003	xfs_buf_t *agibp;
				2004	xfs_buf_t *ibp;
				2005	xfs_agnumber_t agno;
				2006	xfs_agino_t agino;
				2007	xfs_agino_t next_agino;
				2008	xfs_buf_t *last_ibp;
				2009	xfs_dinode_t *last_dip = NULL;
				2010	short bucket_index;
				2011	int offset, last_offset = 0;
				2012	int error;
				2013
				2014	mp = tp->t_mountp;
				2015	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
				2016
				2017	/*
				2018	* Get the agi buffer first. It ensures lock ordering
				2019	* on the list.
				2020	*/
				2021	error = xfs_read_agi(mp, tp, agno, &agibp);
				2022	if (error)
				2023	return error;
				2024
				2025	agi = XFS_BUF_TO_AGI(agibp);
				2026
				2027	/*
				2028	* Get the index into the agi hash table for the
				2029	* list this inode will go on.
				2030	*/
				2031	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
				2032	if (!xfs_verify_agino(mp, agno, agino))
				2033	return -EFSCORRUPTED;
				2034	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
				2035	if (!xfs_verify_agino(mp, agno,
				2036	be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
				2037	XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
				2038	agi, sizeof(*agi));
				2039	return -EFSCORRUPTED;
				2040	}
				2041
				2042	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
				2043	/*
				2044	* We're at the head of the list. Get the inode's on-disk
				2045	* buffer to see if there is anyone after us on the list.
				2046	* Only modify our next pointer if it is not already NULLAGINO.
				2047	* This saves us the overhead of dealing with the buffer when
				2048	* there is no need to change it.
				2049	*/
				2050	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
				2051	0, 0);
				2052	if (error) {
				2053	xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
				2054	__func__, error);
				2055	return error;
				2056	}
				2057	next_agino = be32_to_cpu(dip->di_next_unlinked);
				2058	ASSERT(next_agino != 0);
				2059	if (next_agino != NULLAGINO) {
				2060	dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
				2061	offset = ip->i_imap.im_boffset +
				2062	offsetof(xfs_dinode_t, di_next_unlinked);
				2063
				2064	/* need to recalc the inode CRC if appropriate */
				2065	xfs_dinode_calc_crc(mp, dip);
				2066
				2067	xfs_trans_inode_buf(tp, ibp);
				2068	xfs_trans_log_buf(tp, ibp, offset,
				2069	(offset + sizeof(xfs_agino_t) - 1));
				2070	xfs_inobp_check(mp, ibp);
				2071	} else {
				2072	xfs_trans_brelse(tp, ibp);
				2073	}
				2074	/*
				2075	* Point the bucket head pointer at the next inode.
				2076	*/
				2077	ASSERT(next_agino != 0);
				2078	ASSERT(next_agino != agino);
				2079	agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
				2080	offset = offsetof(xfs_agi_t, agi_unlinked) +
				2081	(sizeof(xfs_agino_t) * bucket_index);
				2082	xfs_trans_log_buf(tp, agibp, offset,
				2083	(offset + sizeof(xfs_agino_t) - 1));
				2084	} else {
				2085	/*
				2086	* We need to search the list for the inode being freed.
				2087	*/
				2088	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
				2089	last_ibp = NULL;
				2090	while (next_agino != agino) {
				2091	struct xfs_imap imap;
				2092
				2093	if (last_ibp)
				2094	xfs_trans_brelse(tp, last_ibp);
				2095
				2096	imap.im_blkno = 0;
				2097	next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
				2098
				2099	error = xfs_imap(mp, tp, next_ino, &imap, 0);
				2100	if (error) {
				2101	xfs_warn(mp,
				2102	"%s: xfs_imap returned error %d.",
				2103	__func__, error);
				2104	return error;
				2105	}
				2106
				2107	error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
				2108	&last_ibp, 0, 0);
				2109	if (error) {
				2110	xfs_warn(mp,
				2111	"%s: xfs_imap_to_bp returned error %d.",
				2112	__func__, error);
				2113	return error;
				2114	}
				2115
				2116	last_offset = imap.im_boffset;
				2117	next_agino = be32_to_cpu(last_dip->di_next_unlinked);
				2118	if (!xfs_verify_agino(mp, agno, next_agino)) {
				2119	XFS_CORRUPTION_ERROR(__func__,
				2120	XFS_ERRLEVEL_LOW, mp,
				2121	last_dip, sizeof(*last_dip));
				2122	return -EFSCORRUPTED;
				2123	}
				2124	}
				2125
				2126	/*
				2127	* Now last_ibp points to the buffer previous to us on the
				2128	* unlinked list. Pull us from the list.
				2129	*/
				2130	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
				2131	0, 0);
				2132	if (error) {
				2133	xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
				2134	__func__, error);
				2135	return error;
				2136	}
				2137	next_agino = be32_to_cpu(dip->di_next_unlinked);
				2138	ASSERT(next_agino != 0);
				2139	ASSERT(next_agino != agino);
				2140	if (next_agino != NULLAGINO) {
				2141	dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
				2142	offset = ip->i_imap.im_boffset +
				2143	offsetof(xfs_dinode_t, di_next_unlinked);
				2144
				2145	/* need to recalc the inode CRC if appropriate */
				2146	xfs_dinode_calc_crc(mp, dip);
				2147
				2148	xfs_trans_inode_buf(tp, ibp);
				2149	xfs_trans_log_buf(tp, ibp, offset,
				2150	(offset + sizeof(xfs_agino_t) - 1));
				2151	xfs_inobp_check(mp, ibp);
				2152	} else {
				2153	xfs_trans_brelse(tp, ibp);
				2154	}
				2155	/*
				2156	* Point the previous inode on the list to the next inode.
				2157	*/
				2158	last_dip->di_next_unlinked = cpu_to_be32(next_agino);
				2159	ASSERT(next_agino != 0);
				2160	offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
				2161
				2162	/* need to recalc the inode CRC if appropriate */
				2163	xfs_dinode_calc_crc(mp, last_dip);
				2164
				2165	xfs_trans_inode_buf(tp, last_ibp);
				2166	xfs_trans_log_buf(tp, last_ibp, offset,
				2167	(offset + sizeof(xfs_agino_t) - 1));
				2168	xfs_inobp_check(mp, last_ibp);
				2169	}
				2170	return 0;
				2171	}
				2172
				2173	/*
				2174	* A big issue when freeing the inode cluster is that we _cannot_ skip any
				2175	* inodes that are in memory - they all must be marked stale and attached to
				2176	* the cluster buffer.
				2177	*/
				2178	STATIC int
				2179	xfs_ifree_cluster(
				2180	xfs_inode_t *free_ip,
				2181	xfs_trans_t *tp,
				2182	struct xfs_icluster *xic)
				2183	{
				2184	xfs_mount_t *mp = free_ip->i_mount;
				2185	int blks_per_cluster;
				2186	int inodes_per_cluster;
				2187	int nbufs;
				2188	int i, j;
				2189	int ioffset;
				2190	xfs_daddr_t blkno;
				2191	xfs_buf_t *bp;
				2192	xfs_inode_t *ip;
				2193	xfs_inode_log_item_t *iip;
				2194	struct xfs_log_item *lip;
				2195	struct xfs_perag *pag;
				2196	xfs_ino_t inum;
				2197
				2198	inum = xic->first_ino;
				2199	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
				2200	blks_per_cluster = xfs_icluster_size_fsb(mp);
				2201	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
				2202	nbufs = mp->m_ialloc_blks / blks_per_cluster;
				2203
				2204	for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
				2205	/*
				2206	* The allocation bitmap tells us which inodes of the chunk were
				2207	* physically allocated. Skip the cluster if an inode falls into
				2208	* a sparse region.
				2209	*/
				2210	ioffset = inum - xic->first_ino;
				2211	if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
				2212	ASSERT(ioffset % inodes_per_cluster == 0);
				2213	continue;
				2214	}
				2215
				2216	blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
				2217	XFS_INO_TO_AGBNO(mp, inum));
				2218
				2219	/*
				2220	* We obtain and lock the backing buffer first in the process
				2221	* here, as we have to ensure that any dirty inode that we
				2222	* can't get the flush lock on is attached to the buffer.
				2223	* If we scan the in-memory inodes first, then buffer IO can
				2224	* complete before we get a lock on it, and hence we may fail
				2225	* to mark all the active inodes on the buffer stale.
				2226	*/
				2227	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
				2228	mp->m_bsize * blks_per_cluster,
				2229	XBF_UNMAPPED);
				2230
				2231	if (!bp)
				2232	return -ENOMEM;
				2233
				2234	/*
				2235	* This buffer may not have been correctly initialised as we
				2236	* didn't read it from disk. That's not important because we are
				2237	* only using to mark the buffer as stale in the log, and to
				2238	* attach stale cached inodes on it. That means it will never be
				2239	* dispatched for IO. If it is, we want to know about it, and we
				2240	* want it to fail. We can acheive this by adding a write
				2241	* verifier to the buffer.
				2242	*/
				2243	bp->b_ops = &xfs_inode_buf_ops;
				2244
				2245	/*
				2246	* Walk the inodes already attached to the buffer and mark them
				2247	* stale. These will all have the flush locks held, so an
				2248	* in-memory inode walk can't lock them. By marking them all
				2249	* stale first, we will not attempt to lock them in the loop
				2250	* below as the XFS_ISTALE flag will be set.
				2251	*/
				2252	list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
				2253	if (lip->li_type == XFS_LI_INODE) {
				2254	iip = (xfs_inode_log_item_t *)lip;
				2255	ASSERT(iip->ili_logged == 1);
				2256	lip->li_cb = xfs_istale_done;
				2257	xfs_trans_ail_copy_lsn(mp->m_ail,
				2258	&iip->ili_flush_lsn,
				2259	&iip->ili_item.li_lsn);
				2260	xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
				2261	}
				2262	}
				2263
				2264
				2265	/*
				2266	* For each inode in memory attempt to add it to the inode
				2267	* buffer and set it up for being staled on buffer IO
				2268	* completion. This is safe as we've locked out tail pushing
				2269	* and flushing by locking the buffer.
				2270	*
				2271	* We have already marked every inode that was part of a
				2272	* transaction stale above, which means there is no point in
				2273	* even trying to lock them.
				2274	*/
				2275	for (i = 0; i < inodes_per_cluster; i++) {
				2276	retry:
				2277	rcu_read_lock();
				2278	ip = radix_tree_lookup(&pag->pag_ici_root,
				2279	XFS_INO_TO_AGINO(mp, (inum + i)));
				2280
				2281	/* Inode not in memory, nothing to do */
				2282	if (!ip) {
				2283	rcu_read_unlock();
				2284	continue;
				2285	}
				2286
				2287	/*
				2288	* because this is an RCU protected lookup, we could
				2289	* find a recently freed or even reallocated inode
				2290	* during the lookup. We need to check under the
				2291	* i_flags_lock for a valid inode here. Skip it if it
				2292	* is not valid, the wrong inode or stale.
				2293	*/
				2294	spin_lock(&ip->i_flags_lock);
				2295	if (ip->i_ino != inum + i \|\|
				2296	__xfs_iflags_test(ip, XFS_ISTALE)) {
				2297	spin_unlock(&ip->i_flags_lock);
				2298	rcu_read_unlock();
				2299	continue;
				2300	}
				2301	spin_unlock(&ip->i_flags_lock);
				2302
				2303	/*
				2304	* Don't try to lock/unlock the current inode, but we
				2305	* _cannot_ skip the other inodes that we did not find
				2306	* in the list attached to the buffer and are not
				2307	* already marked stale. If we can't lock it, back off
				2308	* and retry.
				2309	*/
				2310	if (ip != free_ip) {
				2311	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
				2312	rcu_read_unlock();
				2313	delay(1);
				2314	goto retry;
				2315	}
				2316
				2317	/*
				2318	* Check the inode number again in case we're
				2319	* racing with freeing in xfs_reclaim_inode().
				2320	* See the comments in that function for more
				2321	* information as to why the initial check is
				2322	* not sufficient.
				2323	*/
				2324	if (ip->i_ino != inum + i) {
				2325	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				2326	rcu_read_unlock();
				2327	continue;
				2328	}
				2329	}
				2330	rcu_read_unlock();
				2331
				2332	xfs_iflock(ip);
				2333	xfs_iflags_set(ip, XFS_ISTALE);
				2334
				2335	/*
				2336	* we don't need to attach clean inodes or those only
				2337	* with unlogged changes (which we throw away, anyway).
				2338	*/
				2339	iip = ip->i_itemp;
				2340	if (!iip \|\| xfs_inode_clean(ip)) {
				2341	ASSERT(ip != free_ip);
				2342	xfs_ifunlock(ip);
				2343	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				2344	continue;
				2345	}
				2346
				2347	iip->ili_last_fields = iip->ili_fields;
				2348	iip->ili_fields = 0;
				2349	iip->ili_fsync_fields = 0;
				2350	iip->ili_logged = 1;
				2351	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
				2352	&iip->ili_item.li_lsn);
				2353
				2354	xfs_buf_attach_iodone(bp, xfs_istale_done,
				2355	&iip->ili_item);
				2356
				2357	if (ip != free_ip)
				2358	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				2359	}
				2360
				2361	xfs_trans_stale_inode_buf(tp, bp);
				2362	xfs_trans_binval(tp, bp);
				2363	}
				2364
				2365	xfs_perag_put(pag);
				2366	return 0;
				2367	}
				2368
				2369	/*
				2370	* Free any local-format buffers sitting around before we reset to
				2371	* extents format.
				2372	*/
				2373	static inline void
				2374	xfs_ifree_local_data(
				2375	struct xfs_inode *ip,
				2376	int whichfork)
				2377	{
				2378	struct xfs_ifork *ifp;
				2379
				2380	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
				2381	return;
				2382
				2383	ifp = XFS_IFORK_PTR(ip, whichfork);
				2384	xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
				2385	}
				2386
				2387	/*
				2388	* This is called to return an inode to the inode free list.
				2389	* The inode should already be truncated to 0 length and have
				2390	* no pages associated with it. This routine also assumes that
				2391	* the inode is already a part of the transaction.
				2392	*
				2393	* The on-disk copy of the inode will have been added to the list
				2394	* of unlinked inodes in the AGI. We need to remove the inode from
				2395	* that list atomically with respect to freeing it here.
				2396	*/
				2397	int
				2398	xfs_ifree(
				2399	struct xfs_trans *tp,
				2400	struct xfs_inode *ip)
				2401	{
				2402	int error;
				2403	struct xfs_icluster xic = { 0 };
				2404
				2405	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
				2406	ASSERT(VFS_I(ip)->i_nlink == 0);
				2407	ASSERT(ip->i_d.di_nextents == 0);
				2408	ASSERT(ip->i_d.di_anextents == 0);
				2409	ASSERT(ip->i_d.di_size == 0 \|\| !S_ISREG(VFS_I(ip)->i_mode));
				2410	ASSERT(ip->i_d.di_nblocks == 0);
				2411
				2412	/*
				2413	* Pull the on-disk inode from the AGI unlinked list.
				2414	*/
				2415	error = xfs_iunlink_remove(tp, ip);
				2416	if (error)
				2417	return error;
				2418
				2419	error = xfs_difree(tp, ip->i_ino, &xic);
				2420	if (error)
				2421	return error;
				2422
				2423	xfs_ifree_local_data(ip, XFS_DATA_FORK);
				2424	xfs_ifree_local_data(ip, XFS_ATTR_FORK);
				2425
				2426	VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
				2427	ip->i_d.di_flags = 0;
				2428	ip->i_d.di_flags2 = 0;
				2429	ip->i_d.di_dmevmask = 0;
				2430	ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
				2431	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
				2432	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
				2433
				2434	/* Don't attempt to replay owner changes for a deleted inode */
				2435	ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER\|XFS_ILOG_DOWNER);
				2436
				2437	/*
				2438	* Bump the generation count so no one will be confused
				2439	* by reincarnations of this inode.
				2440	*/
				2441	VFS_I(ip)->i_generation++;
				2442	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				2443
				2444	if (xic.deleted)
				2445	error = xfs_ifree_cluster(ip, tp, &xic);
				2446
				2447	return error;
				2448	}
				2449
				2450	/*
				2451	* This is called to unpin an inode. The caller must have the inode locked
				2452	* in at least shared mode so that the buffer cannot be subsequently pinned
				2453	* once someone is waiting for it to be unpinned.
				2454	*/
				2455	static void
				2456	xfs_iunpin(
				2457	struct xfs_inode *ip)
				2458	{
				2459	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL\|XFS_ILOCK_SHARED));
				2460
				2461	trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
				2462
				2463	/* Give the log a push to start the unpinning I/O */
				2464	xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL);
				2465
				2466	}
				2467
				2468	static void
				2469	__xfs_iunpin_wait(
				2470	struct xfs_inode *ip)
				2471	{
				2472	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
				2473	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
				2474
				2475	xfs_iunpin(ip);
				2476
				2477	do {
				2478	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
				2479	if (xfs_ipincount(ip))
				2480	io_schedule();
				2481	} while (xfs_ipincount(ip));
				2482	finish_wait(wq, &wait.wq_entry);
				2483	}
				2484
				/*
				 * Wait for the inode to become unpinned.  Returns immediately when the pin
				 * count is already zero.
				 */
				void
				xfs_iunpin_wait(
					struct xfs_inode	*ip)
				{
					if (!xfs_ipincount(ip))
						return;

					__xfs_iunpin_wait(ip);
				}
				2492
				2493	/*
				2494	* Removing an inode from the namespace involves removing the directory entry
				2495	* and dropping the link count on the inode. Removing the directory entry can
				2496	* result in locking an AGF (directory blocks were freed) and removing a link
				2497	* count can result in placing the inode on an unlinked list which results in
				2498	* locking an AGI.
				2499	*
				2500	* The big problem here is that we have an ordering constraint on AGF and AGI
				2501	* locking - inode allocation locks the AGI, then can allocate a new extent for
				2502	* new inodes, locking the AGF after the AGI. Similarly, freeing the inode
				2503	* removes the inode from the unlinked list, requiring that we lock the AGI
				2504	* first, and then freeing the inode can result in an inode chunk being freed
				2505	* and hence freeing disk space requiring that we lock an AGF.
				2506	*
				2507	* Hence the ordering that is imposed by other parts of the code is AGI before
				2508	* AGF. This means we cannot remove the directory entry before we drop the inode
				2509	* reference count and put it on the unlinked list as this results in a lock
				2510	* order of AGF then AGI, and this can deadlock against inode allocation and
				2511	* freeing. Therefore we must drop the link counts before we remove the
				2512	* directory entry.
				2513	*
				2514	* This is still safe from a transactional point of view - it is not until we
				2515	* get to xfs_defer_finish() that we have the possibility of multiple
				2516	* transactions in this operation. Hence as long as we remove the directory
				2517	* entry and drop the link count in the first transaction of the remove
				2518	* operation, there are no transactional constraints on the ordering here.
				2519	*/
				2520	int
				2521	xfs_remove(
				2522	xfs_inode_t *dp,
				2523	struct xfs_name *name,
				2524	xfs_inode_t *ip)
				2525	{
				2526	xfs_mount_t *mp = dp->i_mount;
				2527	xfs_trans_t *tp = NULL;
				2528	int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
				2529	int error = 0;
				2530	uint resblks;
				2531
				2532	trace_xfs_remove(dp, name);
				2533
				2534	if (XFS_FORCED_SHUTDOWN(mp))
				2535	return -EIO;
				2536
				2537	error = xfs_qm_dqattach(dp);
				2538	if (error)
				2539	goto std_return;
				2540
				2541	error = xfs_qm_dqattach(ip);
				2542	if (error)
				2543	goto std_return;
				2544
				2545	/*
				2546	* We try to get the real space reservation first,
				2547	* allowing for directory btree deletion(s) implying
				2548	* possible bmap insert(s). If we can't get the space
				2549	* reservation then we use 0 instead, and avoid the bmap
				2550	* btree insert(s) in the directory code by, if the bmap
				2551	* insert tries to happen, instead trimming the LAST
				2552	* block from the directory.
				2553	*/
				2554	resblks = XFS_REMOVE_SPACE_RES(mp);
				2555	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
				2556	if (error == -ENOSPC) {
				2557	resblks = 0;
				2558	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
				2559	&tp);
				2560	}
				2561	if (error) {
				2562	ASSERT(error != -ENOSPC);
				2563	goto std_return;
				2564	}
				2565
				2566	xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
				2567
				2568	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
				2569	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
				2570
				2571	/*
				2572	* If we're removing a directory perform some additional validation.
				2573	*/
				2574	if (is_dir) {
				2575	ASSERT(VFS_I(ip)->i_nlink >= 2);
				2576	if (VFS_I(ip)->i_nlink != 2) {
				2577	error = -ENOTEMPTY;
				2578	goto out_trans_cancel;
				2579	}
				2580	if (!xfs_dir_isempty(ip)) {
				2581	error = -ENOTEMPTY;
				2582	goto out_trans_cancel;
				2583	}
				2584
				2585	/* Drop the link from ip's "..". */
				2586	error = xfs_droplink(tp, dp);
				2587	if (error)
				2588	goto out_trans_cancel;
				2589
				2590	/* Drop the "." link from ip to self. */
				2591	error = xfs_droplink(tp, ip);
				2592	if (error)
				2593	goto out_trans_cancel;
				2594	} else {
				2595	/*
				2596	* When removing a non-directory we need to log the parent
				2597	* inode here. For a directory this is done implicitly
				2598	* by the xfs_droplink call for the ".." entry.
				2599	*/
				2600	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
				2601	}
				2602	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				2603
				2604	/* Drop the link from dp to ip. */
				2605	error = xfs_droplink(tp, ip);
				2606	if (error)
				2607	goto out_trans_cancel;
				2608
				2609	error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
				2610	if (error) {
				2611	ASSERT(error != -ENOENT);
				2612	goto out_trans_cancel;
				2613	}
				2614
				2615	/*
				2616	* If this is a synchronous mount, make sure that the
				2617	* remove transaction goes to disk before returning to
				2618	* the user.
				2619	*/
				2620	if (mp->m_flags & (XFS_MOUNT_WSYNC\|XFS_MOUNT_DIRSYNC))
				2621	xfs_trans_set_sync(tp);
				2622
				2623	error = xfs_trans_commit(tp);
				2624	if (error)
				2625	goto std_return;
				2626
				2627	if (is_dir && xfs_inode_is_filestream(ip))
				2628	xfs_filestream_deassociate(ip);
				2629
				2630	return 0;
				2631
				2632	out_trans_cancel:
				2633	xfs_trans_cancel(tp);
				2634	std_return:
				2635	return error;
				2636	}
				2637
				2638	/*
				2639	* Enter all inodes for a rename transaction into a sorted array.
				2640	*/
				2641	#define __XFS_SORT_INODES 5
				2642	STATIC void
				2643	xfs_sort_for_rename(
				2644	struct xfs_inode dp1, / in: old (source) directory inode */
				2645	struct xfs_inode dp2, / in: new (target) directory inode */
				2646	struct xfs_inode ip1, / in: inode of old entry */
				2647	struct xfs_inode ip2, / in: inode of new entry */
				2648	struct xfs_inode wip, / in: whiteout inode */
				2649	struct xfs_inode *i_tab,/ out: sorted array of inodes */
				2650	int num_inodes) / in/out: inodes in array */
				2651	{
				2652	int i, j;
				2653
				2654	ASSERT(*num_inodes == __XFS_SORT_INODES);
				2655	memset(i_tab, 0, num_inodes sizeof(struct xfs_inode *));
				2656
				2657	/*
				2658	* i_tab contains a list of pointers to inodes. We initialize
				2659	* the table here & we'll sort it. We will then use it to
				2660	* order the acquisition of the inode locks.
				2661	*
				2662	* Note that the table may contain duplicates. e.g., dp1 == dp2.
				2663	*/
				2664	i = 0;
				2665	i_tab[i++] = dp1;
				2666	i_tab[i++] = dp2;
				2667	i_tab[i++] = ip1;
				2668	if (ip2)
				2669	i_tab[i++] = ip2;
				2670	if (wip)
				2671	i_tab[i++] = wip;
				2672	*num_inodes = i;
				2673
				2674	/*
				2675	* Sort the elements via bubble sort. (Remember, there are at
				2676	* most 5 elements to sort, so this is adequate.)
				2677	*/
				2678	for (i = 0; i < *num_inodes; i++) {
				2679	for (j = 1; j < *num_inodes; j++) {
				2680	if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
				2681	struct xfs_inode *temp = i_tab[j];
				2682	i_tab[j] = i_tab[j-1];
				2683	i_tab[j-1] = temp;
				2684	}
				2685	}
				2686	}
				2687	}
				2688
				2689	static int
				2690	xfs_finish_rename(
				2691	struct xfs_trans *tp)
				2692	{
				2693	/*
				2694	* If this is a synchronous mount, make sure that the rename transaction
				2695	* goes to disk before returning to the user.
				2696	*/
				2697	if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC\|XFS_MOUNT_DIRSYNC))
				2698	xfs_trans_set_sync(tp);
				2699
				2700	return xfs_trans_commit(tp);
				2701	}
				2702
				2703	/*
				2704	* xfs_cross_rename()
				2705	*
				2706	* responsible for handling RENAME_EXCHANGE flag in renameat2() sytemcall
				2707	*/
				2708	STATIC int
				2709	xfs_cross_rename(
				2710	struct xfs_trans *tp,
				2711	struct xfs_inode *dp1,
				2712	struct xfs_name *name1,
				2713	struct xfs_inode *ip1,
				2714	struct xfs_inode *dp2,
				2715	struct xfs_name *name2,
				2716	struct xfs_inode *ip2,
				2717	int spaceres)
				2718	{
				2719	int error = 0;
				2720	int ip1_flags = 0;
				2721	int ip2_flags = 0;
				2722	int dp2_flags = 0;
				2723
				2724	/* Swap inode number for dirent in first parent */
				2725	error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
				2726	if (error)
				2727	goto out_trans_abort;
				2728
				2729	/* Swap inode number for dirent in second parent */
				2730	error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
				2731	if (error)
				2732	goto out_trans_abort;
				2733
				2734	/*
				2735	* If we're renaming one or more directories across different parents,
				2736	* update the respective ".." entries (and link counts) to match the new
				2737	* parents.
				2738	*/
				2739	if (dp1 != dp2) {
				2740	dp2_flags = XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG;
				2741
				2742	if (S_ISDIR(VFS_I(ip2)->i_mode)) {
				2743	error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
				2744	dp1->i_ino, spaceres);
				2745	if (error)
				2746	goto out_trans_abort;
				2747
				2748	/* transfer ip2 ".." reference to dp1 */
				2749	if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
				2750	error = xfs_droplink(tp, dp2);
				2751	if (error)
				2752	goto out_trans_abort;
				2753	error = xfs_bumplink(tp, dp1);
				2754	if (error)
				2755	goto out_trans_abort;
				2756	}
				2757
				2758	/*
				2759	* Although ip1 isn't changed here, userspace needs
				2760	* to be warned about the change, so that applications
				2761	* relying on it (like backup ones), will properly
				2762	* notify the change
				2763	*/
				2764	ip1_flags \|= XFS_ICHGTIME_CHG;
				2765	ip2_flags \|= XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG;
				2766	}
				2767
				2768	if (S_ISDIR(VFS_I(ip1)->i_mode)) {
				2769	error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
				2770	dp2->i_ino, spaceres);
				2771	if (error)
				2772	goto out_trans_abort;
				2773
				2774	/* transfer ip1 ".." reference to dp2 */
				2775	if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
				2776	error = xfs_droplink(tp, dp1);
				2777	if (error)
				2778	goto out_trans_abort;
				2779	error = xfs_bumplink(tp, dp2);
				2780	if (error)
				2781	goto out_trans_abort;
				2782	}
				2783
				2784	/*
				2785	* Although ip2 isn't changed here, userspace needs
				2786	* to be warned about the change, so that applications
				2787	* relying on it (like backup ones), will properly
				2788	* notify the change
				2789	*/
				2790	ip1_flags \|= XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG;
				2791	ip2_flags \|= XFS_ICHGTIME_CHG;
				2792	}
				2793	}
				2794
				2795	if (ip1_flags) {
				2796	xfs_trans_ichgtime(tp, ip1, ip1_flags);
				2797	xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
				2798	}
				2799	if (ip2_flags) {
				2800	xfs_trans_ichgtime(tp, ip2, ip2_flags);
				2801	xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
				2802	}
				2803	if (dp2_flags) {
				2804	xfs_trans_ichgtime(tp, dp2, dp2_flags);
				2805	xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
				2806	}
				2807	xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				2808	xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
				2809	return xfs_finish_rename(tp);
				2810
				2811	out_trans_abort:
				2812	xfs_trans_cancel(tp);
				2813	return error;
				2814	}
				2815
				2816	/*
				2817	* xfs_rename_alloc_whiteout()
				2818	*
				2819	* Return a referenced, unlinked, unlocked inode that that can be used as a
				2820	* whiteout in a rename transaction. We use a tmpfile inode here so that if we
				2821	* crash between allocating the inode and linking it into the rename transaction
				2822	* recovery will free the inode and we won't leak it.
				2823	*/
				2824	static int
				2825	xfs_rename_alloc_whiteout(
				2826	struct xfs_inode *dp,
				2827	struct xfs_inode **wip)
				2828	{
				2829	struct xfs_inode *tmpfile;
				2830	int error;
				2831
				2832	error = xfs_create_tmpfile(dp, S_IFCHR \| WHITEOUT_MODE, &tmpfile);
				2833	if (error)
				2834	return error;
				2835
				2836	/*
				2837	* Prepare the tmpfile inode as if it were created through the VFS.
				2838	* Complete the inode setup and flag it as linkable. nlink is already
				2839	* zero, so we can skip the drop_nlink.
				2840	*/
				2841	xfs_setup_iops(tmpfile);
				2842	xfs_finish_inode_setup(tmpfile);
				2843	VFS_I(tmpfile)->i_state \|= I_LINKABLE;
				2844
				2845	*wip = tmpfile;
				2846	return 0;
				2847	}
				2848
				2849	/*
				2850	* xfs_rename
				2851	*/
				2852	int
				2853	xfs_rename(
				2854	struct xfs_inode *src_dp,
				2855	struct xfs_name *src_name,
				2856	struct xfs_inode *src_ip,
				2857	struct xfs_inode *target_dp,
				2858	struct xfs_name *target_name,
				2859	struct xfs_inode *target_ip,
				2860	unsigned int flags)
				2861	{
				2862	struct xfs_mount *mp = src_dp->i_mount;
				2863	struct xfs_trans *tp;
				2864	struct xfs_inode wip = NULL; / whiteout inode */
				2865	struct xfs_inode *inodes[__XFS_SORT_INODES];
				2866	int num_inodes = __XFS_SORT_INODES;
				2867	bool new_parent = (src_dp != target_dp);
				2868	bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
				2869	int spaceres;
				2870	int error;
				2871
				2872	trace_xfs_rename(src_dp, target_dp, src_name, target_name);
				2873
				2874	if ((flags & RENAME_EXCHANGE) && !target_ip)
				2875	return -EINVAL;
				2876
				2877	/*
				2878	* If we are doing a whiteout operation, allocate the whiteout inode
				2879	* we will be placing at the target and ensure the type is set
				2880	* appropriately.
				2881	*/
				2882	if (flags & RENAME_WHITEOUT) {
				2883	ASSERT(!(flags & (RENAME_NOREPLACE \| RENAME_EXCHANGE)));
				2884	error = xfs_rename_alloc_whiteout(target_dp, &wip);
				2885	if (error)
				2886	return error;
				2887
				2888	/* setup target dirent info as whiteout */
				2889	src_name->type = XFS_DIR3_FT_CHRDEV;
				2890	}
				2891
				2892	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
				2893	inodes, &num_inodes);
				2894
				2895	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
				2896	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
				2897	if (error == -ENOSPC) {
				2898	spaceres = 0;
				2899	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
				2900	&tp);
				2901	}
				2902	if (error)
				2903	goto out_release_wip;
				2904
				2905	/*
				2906	* Attach the dquots to the inodes
				2907	*/
				2908	error = xfs_qm_vop_rename_dqattach(inodes);
				2909	if (error)
				2910	goto out_trans_cancel;
				2911
				2912	/*
				2913	* Lock all the participating inodes. Depending upon whether
				2914	* the target_name exists in the target directory, and
				2915	* whether the target directory is the same as the source
				2916	* directory, we can lock from 2 to 4 inodes.
				2917	*/
				2918	xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
				2919
				2920	/*
				2921	* Join all the inodes to the transaction. From this point on,
				2922	* we can rely on either trans_commit or trans_cancel to unlock
				2923	* them.
				2924	*/
				2925	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
				2926	if (new_parent)
				2927	xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
				2928	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
				2929	if (target_ip)
				2930	xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
				2931	if (wip)
				2932	xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
				2933
				2934	/*
				2935	* If we are using project inheritance, we only allow renames
				2936	* into our tree when the project IDs are the same; else the
				2937	* tree quota mechanism would be circumvented.
				2938	*/
				2939	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
				2940	(xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
				2941	error = -EXDEV;
				2942	goto out_trans_cancel;
				2943	}
				2944
				2945	/* RENAME_EXCHANGE is unique from here on. */
				2946	if (flags & RENAME_EXCHANGE)
				2947	return xfs_cross_rename(tp, src_dp, src_name, src_ip,
				2948	target_dp, target_name, target_ip,
				2949	spaceres);
				2950
				2951	/*
				2952	* Set up the target.
				2953	*/
				2954	if (target_ip == NULL) {
				2955	/*
				2956	* If there's no space reservation, check the entry will
				2957	* fit before actually inserting it.
				2958	*/
				2959	if (!spaceres) {
				2960	error = xfs_dir_canenter(tp, target_dp, target_name);
				2961	if (error)
				2962	goto out_trans_cancel;
				2963	}
				2964	/*
				2965	* If target does not exist and the rename crosses
				2966	* directories, adjust the target directory link count
				2967	* to account for the ".." reference from the new entry.
				2968	*/
				2969	error = xfs_dir_createname(tp, target_dp, target_name,
				2970	src_ip->i_ino, spaceres);
				2971	if (error)
				2972	goto out_trans_cancel;
				2973
				2974	xfs_trans_ichgtime(tp, target_dp,
				2975	XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				2976
				2977	if (new_parent && src_is_directory) {
				2978	error = xfs_bumplink(tp, target_dp);
				2979	if (error)
				2980	goto out_trans_cancel;
				2981	}
				2982	} else { /* target_ip != NULL */
				2983	/*
				2984	* If target exists and it's a directory, check that both
				2985	* target and source are directories and that target can be
				2986	* destroyed, or that neither is a directory.
				2987	*/
				2988	if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
				2989	/*
				2990	* Make sure target dir is empty.
				2991	*/
				2992	if (!(xfs_dir_isempty(target_ip)) \|\|
				2993	(VFS_I(target_ip)->i_nlink > 2)) {
				2994	error = -EEXIST;
				2995	goto out_trans_cancel;
				2996	}
				2997	}
				2998
				2999	/*
				3000	* Link the source inode under the target name.
				3001	* If the source inode is a directory and we are moving
				3002	* it across directories, its ".." entry will be
				3003	* inconsistent until we replace that down below.
				3004	*
				3005	* In case there is already an entry with the same
				3006	* name at the destination directory, remove it first.
				3007	*/
				3008	error = xfs_dir_replace(tp, target_dp, target_name,
				3009	src_ip->i_ino, spaceres);
				3010	if (error)
				3011	goto out_trans_cancel;
				3012
				3013	xfs_trans_ichgtime(tp, target_dp,
				3014	XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				3015
				3016	/*
				3017	* Decrement the link count on the target since the target
				3018	* dir no longer points to it.
				3019	*/
				3020	error = xfs_droplink(tp, target_ip);
				3021	if (error)
				3022	goto out_trans_cancel;
				3023
				3024	if (src_is_directory) {
				3025	/*
				3026	* Drop the link from the old "." entry.
				3027	*/
				3028	error = xfs_droplink(tp, target_ip);
				3029	if (error)
				3030	goto out_trans_cancel;
				3031	}
				3032	} /* target_ip != NULL */
				3033
				3034	/*
				3035	* Remove the source.
				3036	*/
				3037	if (new_parent && src_is_directory) {
				3038	/*
				3039	* Rewrite the ".." entry to point to the new
				3040	* directory.
				3041	*/
				3042	error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
				3043	target_dp->i_ino, spaceres);
				3044	ASSERT(error != -EEXIST);
				3045	if (error)
				3046	goto out_trans_cancel;
				3047	}
				3048
				3049	/*
				3050	* We always want to hit the ctime on the source inode.
				3051	*
				3052	* This isn't strictly required by the standards since the source
				3053	* inode isn't really being changed, but old unix file systems did
				3054	* it and some incremental backup programs won't work without it.
				3055	*/
				3056	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
				3057	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
				3058
				3059	/*
				3060	* Adjust the link count on src_dp. This is necessary when
				3061	* renaming a directory, either within one parent when
				3062	* the target existed, or across two parent directories.
				3063	*/
				3064	if (src_is_directory && (new_parent \|\| target_ip != NULL)) {
				3065
				3066	/*
				3067	* Decrement link count on src_directory since the
				3068	* entry that's moved no longer points to it.
				3069	*/
				3070	error = xfs_droplink(tp, src_dp);
				3071	if (error)
				3072	goto out_trans_cancel;
				3073	}
				3074
				3075	/*
				3076	* For whiteouts, we only need to update the source dirent with the
				3077	* inode number of the whiteout inode rather than removing it
				3078	* altogether.
				3079	*/
				3080	if (wip) {
				3081	error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
				3082	spaceres);
				3083	} else
				3084	error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
				3085	spaceres);
				3086	if (error)
				3087	goto out_trans_cancel;
				3088
				3089	/*
				3090	* For whiteouts, we need to bump the link count on the whiteout inode.
				3091	* This means that failures all the way up to this point leave the inode
				3092	* on the unlinked list and so cleanup is a simple matter of dropping
				3093	* the remaining reference to it. If we fail here after bumping the link
				3094	* count, we're shutting down the filesystem so we'll never see the
				3095	* intermediate state on disk.
				3096	*/
				3097	if (wip) {
				3098	ASSERT(VFS_I(wip)->i_nlink == 0);
				3099	error = xfs_bumplink(tp, wip);
				3100	if (error)
				3101	goto out_trans_cancel;
				3102	error = xfs_iunlink_remove(tp, wip);
				3103	if (error)
				3104	goto out_trans_cancel;
				3105	xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
				3106
				3107	/*
				3108	* Now we have a real link, clear the "I'm a tmpfile" state
				3109	* flag from the inode so it doesn't accidentally get misused in
				3110	* future.
				3111	*/
				3112	VFS_I(wip)->i_state &= ~I_LINKABLE;
				3113	}
				3114
				3115	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				3116	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
				3117	if (new_parent)
				3118	xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
				3119
				3120	error = xfs_finish_rename(tp);
				3121	if (wip)
				3122	xfs_irele(wip);
				3123	return error;
				3124
				3125	out_trans_cancel:
				3126	xfs_trans_cancel(tp);
				3127	out_release_wip:
				3128	if (wip)
				3129	xfs_irele(wip);
				3130	return error;
				3131	}
				3132
				3133	STATIC int
				3134	xfs_iflush_cluster(
				3135	struct xfs_inode *ip,
				3136	struct xfs_buf *bp)
				3137	{
				3138	struct xfs_mount *mp = ip->i_mount;
				3139	struct xfs_perag *pag;
				3140	unsigned long first_index, mask;
				3141	unsigned long inodes_per_cluster;
				3142	int cilist_size;
				3143	struct xfs_inode **cilist;
				3144	struct xfs_inode *cip;
				3145	int nr_found;
				3146	int clcount = 0;
				3147	int i;
				3148
				3149	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
				3150
				3151	inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
				3152	cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
				3153	cilist = kmem_alloc(cilist_size, KM_MAYFAIL\|KM_NOFS);
				3154	if (!cilist)
				3155	goto out_put;
				3156
				3157	mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
				3158	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
				3159	rcu_read_lock();
				3160	/* really need a gang lookup range call here */
				3161	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
				3162	first_index, inodes_per_cluster);
				3163	if (nr_found == 0)
				3164	goto out_free;
				3165
				3166	for (i = 0; i < nr_found; i++) {
				3167	cip = cilist[i];
				3168	if (cip == ip)
				3169	continue;
				3170
				3171	/*
				3172	* because this is an RCU protected lookup, we could find a
				3173	* recently freed or even reallocated inode during the lookup.
				3174	* We need to check under the i_flags_lock for a valid inode
				3175	* here. Skip it if it is not valid or the wrong inode.
				3176	*/
				3177	spin_lock(&cip->i_flags_lock);
				3178	if (!cip->i_ino \|\|
				3179	__xfs_iflags_test(cip, XFS_ISTALE)) {
				3180	spin_unlock(&cip->i_flags_lock);
				3181	continue;
				3182	}
				3183
				3184	/*
				3185	* Once we fall off the end of the cluster, no point checking
				3186	* any more inodes in the list because they will also all be
				3187	* outside the cluster.
				3188	*/
				3189	if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
				3190	spin_unlock(&cip->i_flags_lock);
				3191	break;
				3192	}
				3193	spin_unlock(&cip->i_flags_lock);
				3194
				3195	/*
				3196	* Do an un-protected check to see if the inode is dirty and
				3197	* is a candidate for flushing. These checks will be repeated
				3198	* later after the appropriate locks are acquired.
				3199	*/
				3200	if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
				3201	continue;
				3202
				3203	/*
				3204	* Try to get locks. If any are unavailable or it is pinned,
				3205	* then this inode cannot be flushed and is skipped.
				3206	*/
				3207
				3208	if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
				3209	continue;
				3210	if (!xfs_iflock_nowait(cip)) {
				3211	xfs_iunlock(cip, XFS_ILOCK_SHARED);
				3212	continue;
				3213	}
				3214	if (xfs_ipincount(cip)) {
				3215	xfs_ifunlock(cip);
				3216	xfs_iunlock(cip, XFS_ILOCK_SHARED);
				3217	continue;
				3218	}
				3219
				3220
				3221	/*
				3222	* Check the inode number again, just to be certain we are not
				3223	* racing with freeing in xfs_reclaim_inode(). See the comments
				3224	* in that function for more information as to why the initial
				3225	* check is not sufficient.
				3226	*/
				3227	if (!cip->i_ino) {
				3228	xfs_ifunlock(cip);
				3229	xfs_iunlock(cip, XFS_ILOCK_SHARED);
				3230	continue;
				3231	}
				3232
				3233	/*
				3234	* arriving here means that this inode can be flushed. First
				3235	* re-check that it's dirty before flushing.
				3236	*/
				3237	if (!xfs_inode_clean(cip)) {
				3238	int error;
				3239	error = xfs_iflush_int(cip, bp);
				3240	if (error) {
				3241	xfs_iunlock(cip, XFS_ILOCK_SHARED);
				3242	goto cluster_corrupt_out;
				3243	}
				3244	clcount++;
				3245	} else {
				3246	xfs_ifunlock(cip);
				3247	}
				3248	xfs_iunlock(cip, XFS_ILOCK_SHARED);
				3249	}
				3250
				3251	if (clcount) {
				3252	XFS_STATS_INC(mp, xs_icluster_flushcnt);
				3253	XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
				3254	}
				3255
				3256	out_free:
				3257	rcu_read_unlock();
				3258	kmem_free(cilist);
				3259	out_put:
				3260	xfs_perag_put(pag);
				3261	return 0;
				3262
				3263
				3264	cluster_corrupt_out:
				3265	/*
				3266	* Corruption detected in the clustering loop. Invalidate the
				3267	* inode buffer and shut down the filesystem.
				3268	*/
				3269	rcu_read_unlock();
				3270	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
				3271
				3272	/*
				3273	* We'll always have an inode attached to the buffer for completion
				3274	* process by the time we are called from xfs_iflush(). Hence we have
				3275	* always need to do IO completion processing to abort the inodes
				3276	* attached to the buffer. handle them just like the shutdown case in
				3277	* xfs_buf_submit().
				3278	*/
				3279	ASSERT(bp->b_iodone);
				3280	bp->b_flags &= ~XBF_DONE;
				3281	xfs_buf_stale(bp);
				3282	xfs_buf_ioerror(bp, -EIO);
				3283	xfs_buf_ioend(bp);
				3284
				3285	/* abort the corrupt inode, as it was not attached to the buffer */
				3286	xfs_iflush_abort(cip, false);
				3287	kmem_free(cilist);
				3288	xfs_perag_put(pag);
				3289	return -EFSCORRUPTED;
				3290	}
				3291
				3292	/*
				3293	* Flush dirty inode metadata into the backing buffer.
				3294	*
				3295	* The caller must have the inode lock and the inode flush lock held. The
				3296	* inode lock will still be held upon return to the caller, and the inode
				3297	* flush lock will be released after the inode has reached the disk.
				3298	*
				3299	* The caller must write out the buffer returned in *bpp and release it.
				3300	*/
				3301	int
				3302	xfs_iflush(
				3303	struct xfs_inode *ip,
				3304	struct xfs_buf **bpp)
				3305	{
				3306	struct xfs_mount *mp = ip->i_mount;
				3307	struct xfs_buf *bp = NULL;
				3308	struct xfs_dinode *dip;
				3309	int error;
				3310
				3311	XFS_STATS_INC(mp, xs_iflush_count);
				3312
				3313	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL\|XFS_ILOCK_SHARED));
				3314	ASSERT(xfs_isiflocked(ip));
				3315	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE \|\|
				3316	ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
				3317
				3318	*bpp = NULL;
				3319
				3320	xfs_iunpin_wait(ip);
				3321
				3322	/*
				3323	* For stale inodes we cannot rely on the backing buffer remaining
				3324	* stale in cache for the remaining life of the stale inode and so
				3325	* xfs_imap_to_bp() below may give us a buffer that no longer contains
				3326	* inodes below. We have to check this after ensuring the inode is
				3327	* unpinned so that it is safe to reclaim the stale inode after the
				3328	* flush call.
				3329	*/
				3330	if (xfs_iflags_test(ip, XFS_ISTALE)) {
				3331	xfs_ifunlock(ip);
				3332	return 0;
				3333	}
				3334
				3335	/*
				3336	* This may have been unpinned because the filesystem is shutting
				3337	* down forcibly. If that's the case we must not write this inode
				3338	* to disk, because the log record didn't make it to disk.
				3339	*
				3340	* We also have to remove the log item from the AIL in this case,
				3341	* as we wait for an empty AIL as part of the unmount process.
				3342	*/
				3343	if (XFS_FORCED_SHUTDOWN(mp)) {
				3344	error = -EIO;
				3345	goto abort_out;
				3346	}
				3347
				3348	/*
				3349	* Get the buffer containing the on-disk inode. We are doing a try-lock
				3350	* operation here, so we may get an EAGAIN error. In that case, we
				3351	* simply want to return with the inode still dirty.
				3352	*
				3353	* If we get any other error, we effectively have a corruption situation
				3354	* and we cannot flush the inode, so we treat it the same as failing
				3355	* xfs_iflush_int().
				3356	*/
				3357	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
				3358	0);
				3359	if (error == -EAGAIN) {
				3360	xfs_ifunlock(ip);
				3361	return error;
				3362	}
				3363	if (error)
				3364	goto corrupt_out;
				3365
				3366	/*
				3367	* First flush out the inode that xfs_iflush was called with.
				3368	*/
				3369	error = xfs_iflush_int(ip, bp);
				3370	if (error)
				3371	goto corrupt_out;
				3372
				3373	/*
				3374	* If the buffer is pinned then push on the log now so we won't
				3375	* get stuck waiting in the write for too long.
				3376	*/
				3377	if (xfs_buf_ispinned(bp))
				3378	xfs_log_force(mp, 0);
				3379
				3380	/*
				3381	* inode clustering: try to gather other inodes into this write
				3382	*
				3383	* Note: Any error during clustering will result in the filesystem
				3384	* being shut down and completion callbacks run on the cluster buffer.
				3385	* As we have already flushed and attached this inode to the buffer,
				3386	* it has already been aborted and released by xfs_iflush_cluster() and
				3387	* so we have no further error handling to do here.
				3388	*/
				3389	error = xfs_iflush_cluster(ip, bp);
				3390	if (error)
				3391	return error;
				3392
				3393	*bpp = bp;
				3394	return 0;
				3395
				3396	corrupt_out:
				3397	if (bp)
				3398	xfs_buf_relse(bp);
				3399	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
				3400	abort_out:
				3401	/* abort the corrupt inode, as it was not attached to the buffer */
				3402	xfs_iflush_abort(ip, false);
				3403	return error;
				3404	}
				3405
				3406	/*
				3407	* If there are inline format data / attr forks attached to this inode,
				3408	* make sure they're not corrupt.
				3409	*/
				3410	bool
				3411	xfs_inode_verify_forks(
				3412	struct xfs_inode *ip)
				3413	{
				3414	struct xfs_ifork *ifp;
				3415	xfs_failaddr_t fa;
				3416
				3417	fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops);
				3418	if (fa) {
				3419	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
				3420	xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
				3421	ifp->if_u1.if_data, ifp->if_bytes, fa);
				3422	return false;
				3423	}
				3424
				3425	fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops);
				3426	if (fa) {
				3427	ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
				3428	xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
				3429	ifp ? ifp->if_u1.if_data : NULL,
				3430	ifp ? ifp->if_bytes : 0, fa);
				3431	return false;
				3432	}
				3433	return true;
				3434	}
				3435
				3436	STATIC int
				3437	xfs_iflush_int(
				3438	struct xfs_inode *ip,
				3439	struct xfs_buf *bp)
				3440	{
				3441	struct xfs_inode_log_item *iip = ip->i_itemp;
				3442	struct xfs_dinode *dip;
				3443	struct xfs_mount *mp = ip->i_mount;
				3444
				3445	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL\|XFS_ILOCK_SHARED));
				3446	ASSERT(xfs_isiflocked(ip));
				3447	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE \|\|
				3448	ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
				3449	ASSERT(iip != NULL && iip->ili_fields != 0);
				3450	ASSERT(ip->i_d.di_version > 1);
				3451
				3452	/* set dip = inode's place in the buffer /
				3453	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
				3454
				3455	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
				3456	mp, XFS_ERRTAG_IFLUSH_1)) {
				3457	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
				3458	"%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT,
				3459	__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
				3460	goto corrupt_out;
				3461	}
				3462	if (S_ISREG(VFS_I(ip)->i_mode)) {
				3463	if (XFS_TEST_ERROR(
				3464	(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
				3465	(ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
				3466	mp, XFS_ERRTAG_IFLUSH_3)) {
				3467	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
				3468	"%s: Bad regular inode %Lu, ptr "PTR_FMT,
				3469	__func__, ip->i_ino, ip);
				3470	goto corrupt_out;
				3471	}
				3472	} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
				3473	if (XFS_TEST_ERROR(
				3474	(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
				3475	(ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
				3476	(ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
				3477	mp, XFS_ERRTAG_IFLUSH_4)) {
				3478	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
				3479	"%s: Bad directory inode %Lu, ptr "PTR_FMT,
				3480	__func__, ip->i_ino, ip);
				3481	goto corrupt_out;
				3482	}
				3483	}
				3484	if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
				3485	ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
				3486	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
				3487	"%s: detected corrupt incore inode %Lu, "
				3488	"total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
				3489	__func__, ip->i_ino,
				3490	ip->i_d.di_nextents + ip->i_d.di_anextents,
				3491	ip->i_d.di_nblocks, ip);
				3492	goto corrupt_out;
				3493	}
				3494	if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
				3495	mp, XFS_ERRTAG_IFLUSH_6)) {
				3496	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
				3497	"%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT,
				3498	__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
				3499	goto corrupt_out;
				3500	}
				3501
				3502	/*
				3503	* Inode item log recovery for v2 inodes are dependent on the
				3504	* di_flushiter count for correct sequencing. We bump the flush
				3505	* iteration count so we can detect flushes which postdate a log record
				3506	* during recovery. This is redundant as we now log every change and
				3507	* hence this can't happen but we need to still do it to ensure
				3508	* backwards compatibility with old kernels that predate logging all
				3509	* inode changes.
				3510	*/
				3511	if (ip->i_d.di_version < 3)
				3512	ip->i_d.di_flushiter++;
				3513
				3514	/* Check the inline fork data before we write out. */
				3515	if (!xfs_inode_verify_forks(ip))
				3516	goto corrupt_out;
				3517
				3518	/*
				3519	* Copy the dirty parts of the inode into the on-disk inode. We always
				3520	* copy out the core of the inode, because if the inode is dirty at all
				3521	* the core must be.
				3522	*/
				3523	xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
				3524
				3525	/* Wrap, we never let the log put out DI_MAX_FLUSH */
				3526	if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
				3527	ip->i_d.di_flushiter = 0;
				3528
				3529	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
				3530	if (XFS_IFORK_Q(ip))
				3531	xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
				3532	xfs_inobp_check(mp, bp);
				3533
				3534	/*
				3535	* We've recorded everything logged in the inode, so we'd like to clear
				3536	* the ili_fields bits so we don't log and flush things unnecessarily.
				3537	* However, we can't stop logging all this information until the data
				3538	* we've copied into the disk buffer is written to disk. If we did we
				3539	* might overwrite the copy of the inode in the log with all the data
				3540	* after re-logging only part of it, and in the face of a crash we
				3541	* wouldn't have all the data we need to recover.
				3542	*
				3543	* What we do is move the bits to the ili_last_fields field. When
				3544	* logging the inode, these bits are moved back to the ili_fields field.
				3545	* In the xfs_iflush_done() routine we clear ili_last_fields, since we
				3546	* know that the information those bits represent is permanently on
				3547	* disk. As long as the flush completes before the inode is logged
				3548	* again, then both ili_fields and ili_last_fields will be cleared.
				3549	*
				3550	* We can play with the ili_fields bits here, because the inode lock
				3551	* must be held exclusively in order to set bits there and the flush
				3552	* lock protects the ili_last_fields bits. Set ili_logged so the flush
				3553	* done routine can tell whether or not to look in the AIL. Also, store
				3554	* the current LSN of the inode so that we can tell whether the item has
				3555	* moved in the AIL from xfs_iflush_done(). In order to read the lsn we
				3556	* need the AIL lock, because it is a 64 bit value that cannot be read
				3557	* atomically.
				3558	*/
				3559	iip->ili_last_fields = iip->ili_fields;
				3560	iip->ili_fields = 0;
				3561	iip->ili_fsync_fields = 0;
				3562	iip->ili_logged = 1;
				3563
				3564	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
				3565	&iip->ili_item.li_lsn);
				3566
				3567	/*
				3568	* Attach the function xfs_iflush_done to the inode's
				3569	* buffer. This will remove the inode from the AIL
				3570	* and unlock the inode's flush lock when the inode is
				3571	* completely written to disk.
				3572	*/
				3573	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
				3574
				3575	/* generate the checksum. */
				3576	xfs_dinode_calc_crc(mp, dip);
				3577
				3578	ASSERT(!list_empty(&bp->b_li_list));
				3579	ASSERT(bp->b_iodone != NULL);
				3580	return 0;
				3581
				3582	corrupt_out:
				3583	return -EFSCORRUPTED;
				3584	}
				3585
				3586	/* Release an inode. */
				3587	void
				3588	xfs_irele(
				3589	struct xfs_inode *ip)
				3590	{
				3591	trace_xfs_irele(ip, _RET_IP_);
				3592	iput(VFS_I(ip));
				3593	}