Blame - src/kernel/linux/v4.14/fs/xfs/xfs_inode.c - T103

blob: 7cfbe2b0f8867ba060ee9c1f5dd2eb90f6f73d8b [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Copyright (c) 2000-2006 Silicon Graphics, Inc.
				3	* All Rights Reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public License as
				7	* published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it would be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				12	* GNU General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public License
				15	* along with this program; if not, write the Free Software Foundation,
				16	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
				17	*/
				18	#include <linux/log2.h>
				19
				20	#include "xfs.h"
				21	#include "xfs_fs.h"
				22	#include "xfs_shared.h"
				23	#include "xfs_format.h"
				24	#include "xfs_log_format.h"
				25	#include "xfs_trans_resv.h"
				26	#include "xfs_sb.h"
				27	#include "xfs_mount.h"
				28	#include "xfs_defer.h"
				29	#include "xfs_inode.h"
				30	#include "xfs_da_format.h"
				31	#include "xfs_da_btree.h"
				32	#include "xfs_dir2.h"
				33	#include "xfs_attr_sf.h"
				34	#include "xfs_attr.h"
				35	#include "xfs_trans_space.h"
				36	#include "xfs_trans.h"
				37	#include "xfs_buf_item.h"
				38	#include "xfs_inode_item.h"
				39	#include "xfs_ialloc.h"
				40	#include "xfs_bmap.h"
				41	#include "xfs_bmap_util.h"
				42	#include "xfs_error.h"
				43	#include "xfs_quota.h"
				44	#include "xfs_filestream.h"
				45	#include "xfs_cksum.h"
				46	#include "xfs_trace.h"
				47	#include "xfs_icache.h"
				48	#include "xfs_symlink.h"
				49	#include "xfs_trans_priv.h"
				50	#include "xfs_log.h"
				51	#include "xfs_bmap_btree.h"
				52	#include "xfs_reflink.h"
				53	#include "xfs_dir2_priv.h"
				54
				55	kmem_zone_t *xfs_inode_zone;
				56
				57	/*
				58	* Used in xfs_itruncate_extents(). This is the maximum number of extents
				59	* freed from a file in a single transaction.
				60	*/
				61	#define XFS_ITRUNC_MAX_EXTENTS 2
				62
				63	STATIC int xfs_iflush_int(struct xfs_inode , struct xfs_buf );
				64	STATIC int xfs_iunlink(struct xfs_trans , struct xfs_inode );
				65	STATIC int xfs_iunlink_remove(struct xfs_trans , struct xfs_inode );
				66
				67	/*
				68	* helper function to extract extent size hint from inode
				69	*/
				70	xfs_extlen_t
				71	xfs_get_extsz_hint(
				72	struct xfs_inode *ip)
				73	{
				74	if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
				75	return ip->i_d.di_extsize;
				76	if (XFS_IS_REALTIME_INODE(ip))
				77	return ip->i_mount->m_sb.sb_rextsize;
				78	return 0;
				79	}
				80
				81	/*
				82	* Helper function to extract CoW extent size hint from inode.
				83	* Between the extent size hint and the CoW extent size hint, we
				84	* return the greater of the two. If the value is zero (automatic),
				85	* use the default size.
				86	*/
				87	xfs_extlen_t
				88	xfs_get_cowextsz_hint(
				89	struct xfs_inode *ip)
				90	{
				91	xfs_extlen_t a, b;
				92
				93	a = 0;
				94	if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
				95	a = ip->i_d.di_cowextsize;
				96	b = xfs_get_extsz_hint(ip);
				97
				98	a = max(a, b);
				99	if (a == 0)
				100	return XFS_DEFAULT_COWEXTSZ_HINT;
				101	return a;
				102	}
				103
				104	/*
				105	* These two are wrapper routines around the xfs_ilock() routine used to
				106	* centralize some grungy code. They are used in places that wish to lock the
				107	* inode solely for reading the extents. The reason these places can't just
				108	* call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
				109	* bringing in of the extents from disk for a file in b-tree format. If the
				110	* inode is in b-tree format, then we need to lock the inode exclusively until
				111	* the extents are read in. Locking it exclusively all the time would limit
				112	* our parallelism unnecessarily, though. What we do instead is check to see
				113	* if the extents have been read in yet, and only lock the inode exclusively
				114	* if they have not.
				115	*
				116	* The functions return a value which should be given to the corresponding
				117	* xfs_iunlock() call.
				118	*/
				119	uint
				120	xfs_ilock_data_map_shared(
				121	struct xfs_inode *ip)
				122	{
				123	uint lock_mode = XFS_ILOCK_SHARED;
				124
				125	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
				126	(ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
				127	lock_mode = XFS_ILOCK_EXCL;
				128	xfs_ilock(ip, lock_mode);
				129	return lock_mode;
				130	}
				131
				132	uint
				133	xfs_ilock_attr_map_shared(
				134	struct xfs_inode *ip)
				135	{
				136	uint lock_mode = XFS_ILOCK_SHARED;
				137
				138	if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
				139	(ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
				140	lock_mode = XFS_ILOCK_EXCL;
				141	xfs_ilock(ip, lock_mode);
				142	return lock_mode;
				143	}
				144
				145	/*
				146	* In addition to i_rwsem in the VFS inode, the xfs inode contains 2
				147	* multi-reader locks: i_mmap_lock and the i_lock. This routine allows
				148	* various combinations of the locks to be obtained.
				149	*
				150	* The 3 locks should always be ordered so that the IO lock is obtained first,
				151	* the mmap lock second and the ilock last in order to prevent deadlock.
				152	*
				153	* Basic locking order:
				154	*
				155	* i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
				156	*
				157	* mmap_sem locking order:
				158	*
				159	* i_rwsem -> page lock -> mmap_sem
				160	* mmap_sem -> i_mmap_lock -> page_lock
				161	*
				162	* The difference in mmap_sem locking order means that we cannot hold the
				163	* i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
				164	* fault in pages during copy in/out (for buffered IO) or require the mmap_sem
				165	* in get_user_pages() to map the user pages into the kernel address space for
				166	* direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
				167	* page faults already hold the mmap_sem.
				168	*
				169	* Hence to serialise fully against both syscall and mmap based IO, we need to
				170	* take both the i_rwsem and the i_mmap_lock. These locks should only be both
				171	* taken in places where we need to invalidate the page cache in a race
				172	* free manner (e.g. truncate, hole punch and other extent manipulation
				173	* functions).
				174	*/
				175	void
				176	xfs_ilock(
				177	xfs_inode_t *ip,
				178	uint lock_flags)
				179	{
				180	trace_xfs_ilock(ip, lock_flags, _RET_IP_);
				181
				182	/*
				183	* You can't set both SHARED and EXCL for the same lock,
				184	* and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
				185	* and XFS_ILOCK_EXCL are valid values to set in lock_flags.
				186	*/
				187	ASSERT((lock_flags & (XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL)) !=
				188	(XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL));
				189	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL)) !=
				190	(XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL));
				191	ASSERT((lock_flags & (XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL)) !=
				192	(XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL));
				193	ASSERT((lock_flags & ~(XFS_LOCK_MASK \| XFS_LOCK_SUBCLASS_MASK)) == 0);
				194
				195	if (lock_flags & XFS_IOLOCK_EXCL) {
				196	down_write_nested(&VFS_I(ip)->i_rwsem,
				197	XFS_IOLOCK_DEP(lock_flags));
				198	} else if (lock_flags & XFS_IOLOCK_SHARED) {
				199	down_read_nested(&VFS_I(ip)->i_rwsem,
				200	XFS_IOLOCK_DEP(lock_flags));
				201	}
				202
				203	if (lock_flags & XFS_MMAPLOCK_EXCL)
				204	mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
				205	else if (lock_flags & XFS_MMAPLOCK_SHARED)
				206	mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
				207
				208	if (lock_flags & XFS_ILOCK_EXCL)
				209	mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
				210	else if (lock_flags & XFS_ILOCK_SHARED)
				211	mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
				212	}
				213
				214	/*
				215	* This is just like xfs_ilock(), except that the caller
				216	* is guaranteed not to sleep. It returns 1 if it gets
				217	* the requested locks and 0 otherwise. If the IO lock is
				218	* obtained but the inode lock cannot be, then the IO lock
				219	* is dropped before returning.
				220	*
				221	* ip -- the inode being locked
				222	* lock_flags -- this parameter indicates the inode's locks to be
				223	* locked. See the comment for xfs_ilock() for a list
				224	* of valid values.
				225	*/
				226	int
				227	xfs_ilock_nowait(
				228	xfs_inode_t *ip,
				229	uint lock_flags)
				230	{
				231	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
				232
				233	/*
				234	* You can't set both SHARED and EXCL for the same lock,
				235	* and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
				236	* and XFS_ILOCK_EXCL are valid values to set in lock_flags.
				237	*/
				238	ASSERT((lock_flags & (XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL)) !=
				239	(XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL));
				240	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL)) !=
				241	(XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL));
				242	ASSERT((lock_flags & (XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL)) !=
				243	(XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL));
				244	ASSERT((lock_flags & ~(XFS_LOCK_MASK \| XFS_LOCK_SUBCLASS_MASK)) == 0);
				245
				246	if (lock_flags & XFS_IOLOCK_EXCL) {
				247	if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
				248	goto out;
				249	} else if (lock_flags & XFS_IOLOCK_SHARED) {
				250	if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
				251	goto out;
				252	}
				253
				254	if (lock_flags & XFS_MMAPLOCK_EXCL) {
				255	if (!mrtryupdate(&ip->i_mmaplock))
				256	goto out_undo_iolock;
				257	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
				258	if (!mrtryaccess(&ip->i_mmaplock))
				259	goto out_undo_iolock;
				260	}
				261
				262	if (lock_flags & XFS_ILOCK_EXCL) {
				263	if (!mrtryupdate(&ip->i_lock))
				264	goto out_undo_mmaplock;
				265	} else if (lock_flags & XFS_ILOCK_SHARED) {
				266	if (!mrtryaccess(&ip->i_lock))
				267	goto out_undo_mmaplock;
				268	}
				269	return 1;
				270
				271	out_undo_mmaplock:
				272	if (lock_flags & XFS_MMAPLOCK_EXCL)
				273	mrunlock_excl(&ip->i_mmaplock);
				274	else if (lock_flags & XFS_MMAPLOCK_SHARED)
				275	mrunlock_shared(&ip->i_mmaplock);
				276	out_undo_iolock:
				277	if (lock_flags & XFS_IOLOCK_EXCL)
				278	up_write(&VFS_I(ip)->i_rwsem);
				279	else if (lock_flags & XFS_IOLOCK_SHARED)
				280	up_read(&VFS_I(ip)->i_rwsem);
				281	out:
				282	return 0;
				283	}
				284
				285	/*
				286	* xfs_iunlock() is used to drop the inode locks acquired with
				287	* xfs_ilock() and xfs_ilock_nowait(). The caller must pass
				288	* in the flags given to xfs_ilock() or xfs_ilock_nowait() so
				289	* that we know which locks to drop.
				290	*
				291	* ip -- the inode being unlocked
				292	* lock_flags -- this parameter indicates the inode's locks to be
				293	* unlocked. See the comment for xfs_ilock() for a list
				294	* of valid values for this parameter.
				295	*
				296	*/
				297	void
				298	xfs_iunlock(
				299	xfs_inode_t *ip,
				300	uint lock_flags)
				301	{
				302	/*
				303	* You can't set both SHARED and EXCL for the same lock,
				304	* and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
				305	* and XFS_ILOCK_EXCL are valid values to set in lock_flags.
				306	*/
				307	ASSERT((lock_flags & (XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL)) !=
				308	(XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL));
				309	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL)) !=
				310	(XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL));
				311	ASSERT((lock_flags & (XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL)) !=
				312	(XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL));
				313	ASSERT((lock_flags & ~(XFS_LOCK_MASK \| XFS_LOCK_SUBCLASS_MASK)) == 0);
				314	ASSERT(lock_flags != 0);
				315
				316	if (lock_flags & XFS_IOLOCK_EXCL)
				317	up_write(&VFS_I(ip)->i_rwsem);
				318	else if (lock_flags & XFS_IOLOCK_SHARED)
				319	up_read(&VFS_I(ip)->i_rwsem);
				320
				321	if (lock_flags & XFS_MMAPLOCK_EXCL)
				322	mrunlock_excl(&ip->i_mmaplock);
				323	else if (lock_flags & XFS_MMAPLOCK_SHARED)
				324	mrunlock_shared(&ip->i_mmaplock);
				325
				326	if (lock_flags & XFS_ILOCK_EXCL)
				327	mrunlock_excl(&ip->i_lock);
				328	else if (lock_flags & XFS_ILOCK_SHARED)
				329	mrunlock_shared(&ip->i_lock);
				330
				331	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
				332	}
				333
				334	/*
				335	* give up write locks. the i/o lock cannot be held nested
				336	* if it is being demoted.
				337	*/
				338	void
				339	xfs_ilock_demote(
				340	xfs_inode_t *ip,
				341	uint lock_flags)
				342	{
				343	ASSERT(lock_flags & (XFS_IOLOCK_EXCL\|XFS_MMAPLOCK_EXCL\|XFS_ILOCK_EXCL));
				344	ASSERT((lock_flags &
				345	~(XFS_IOLOCK_EXCL\|XFS_MMAPLOCK_EXCL\|XFS_ILOCK_EXCL)) == 0);
				346
				347	if (lock_flags & XFS_ILOCK_EXCL)
				348	mrdemote(&ip->i_lock);
				349	if (lock_flags & XFS_MMAPLOCK_EXCL)
				350	mrdemote(&ip->i_mmaplock);
				351	if (lock_flags & XFS_IOLOCK_EXCL)
				352	downgrade_write(&VFS_I(ip)->i_rwsem);
				353
				354	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
				355	}
				356
				#if defined(DEBUG) || defined(XFS_WARN)
				/*
				 * Debug helper: report whether an inode lock is held.
				 *
				 * Only one lock class is examined per call: the ilock if any ILOCK flag is
				 * set, else the mmaplock, else the iolock.  If the SHARED flag of that class
				 * is absent, the check is for an exclusive holder; otherwise any holder
				 * satisfies it.  Returns non-zero if the lock is considered held.
				 */
				int
				xfs_isilocked(
					xfs_inode_t		*ip,
					uint			lock_flags)
				{
					if (lock_flags & (XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)) {
						if (!(lock_flags & XFS_ILOCK_SHARED))
							return !!ip->i_lock.mr_writer;
						return rwsem_is_locked(&ip->i_lock.mr_lock);
					}

					if (lock_flags & (XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)) {
						if (!(lock_flags & XFS_MMAPLOCK_SHARED))
							return !!ip->i_mmaplock.mr_writer;
						return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
					}

					if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
						/*
						 * The exclusive check is answered by lockdep; when
						 * debug_locks is off it is reported as held.
						 */
						if (!(lock_flags & XFS_IOLOCK_SHARED))
							return !debug_locks ||
								lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
						return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
					}

					/* No recognised lock flag was passed. */
					ASSERT(0);
					return 0;
				}
				#endif
				386
				#ifdef DEBUG
				/*
				 * Statistics maintained by xfs_lock_inodes():
				 *   xfs_locked_n       - calls that completed without any retry
				 *   xfs_small_retries  - calls that needed fewer than 5 retries
				 *   xfs_middle_retries - calls that needed 5 to 99 retries
				 *   xfs_lots_retries   - calls that needed 100 or more retries
				 *   xfs_lock_delays    - number of back-off delays taken while retrying
				 */
				int xfs_locked_n;
				int xfs_small_retries;
				int xfs_middle_retries;
				int xfs_lots_retries;
				int xfs_lock_delays;
				#endif
				394
				/*
				 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
				 * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
				 * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
				 * errors and warnings.
				 *
				 * Returns true when @subclass is below MAX_LOCKDEP_SUBCLASSES; the fallback
				 * macro always evaluates to true.
				 */
				#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
				static bool
				xfs_lockdep_subclass_ok(
					int			subclass)
				{
					return subclass < MAX_LOCKDEP_SUBCLASSES;
				}
				#else
				#define xfs_lockdep_subclass_ok(subclass)	(true)
				#endif
				411
				412	/*
				413	* Bump the subclass so xfs_lock_inodes() acquires each lock with a different
				414	* value. This can be called for any type of inode lock combination, including
				415	* parent locking. Care must be taken to ensure we don't overrun the subclass
				416	* storage fields in the class mask we build.
				417	*/
				418	static inline int
				419	xfs_lock_inumorder(int lock_mode, int subclass)
				420	{
				421	int class = 0;
				422
				423	ASSERT(!(lock_mode & (XFS_ILOCK_PARENT \| XFS_ILOCK_RTBITMAP \|
				424	XFS_ILOCK_RTSUM)));
				425	ASSERT(xfs_lockdep_subclass_ok(subclass));
				426
				427	if (lock_mode & (XFS_IOLOCK_SHARED\|XFS_IOLOCK_EXCL)) {
				428	ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
				429	class += subclass << XFS_IOLOCK_SHIFT;
				430	}
				431
				432	if (lock_mode & (XFS_MMAPLOCK_SHARED\|XFS_MMAPLOCK_EXCL)) {
				433	ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
				434	class += subclass << XFS_MMAPLOCK_SHIFT;
				435	}
				436
				437	if (lock_mode & (XFS_ILOCK_SHARED\|XFS_ILOCK_EXCL)) {
				438	ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
				439	class += subclass << XFS_ILOCK_SHIFT;
				440	}
				441
				442	return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) \| class;
				443	}
				444
				445	/*
				446	* The following routine will lock n inodes in exclusive mode. We assume the
				447	* caller calls us with the inodes in i_ino order.
				448	*
				449	* We need to detect deadlock where an inode that we lock is in the AIL and we
				450	* start waiting for another inode that is locked by a thread in a long running
				451	* transaction (such as truncate). This can result in deadlock since the long
				452	* running trans might need to wait for the inode we just locked in order to
				453	* push the tail and free space in the log.
				454	*
				455	* xfs_lock_inodes() can only be used to lock one type of lock at a time -
				456	* the iolock, the mmaplock or the ilock, but not more than one at a time. If we
				457	* lock more than one at a time, lockdep will report false positives saying we
				458	* have violated locking orders.
				459	*/
				460	static void
				461	xfs_lock_inodes(
				462	xfs_inode_t **ips,
				463	int inodes,
				464	uint lock_mode)
				465	{
				466	int attempts = 0, i, j, try_lock;
				467	xfs_log_item_t *lp;
				468
				469	/*
				470	* Currently supports between 2 and 5 inodes with exclusive locking. We
				471	* support an arbitrary depth of locking here, but absolute limits on
				472	* inodes depend on the the type of locking and the limits placed by
				473	* lockdep annotations in xfs_lock_inumorder. These are all checked by
				474	* the asserts.
				475	*/
				476	ASSERT(ips && inodes >= 2 && inodes <= 5);
				477	ASSERT(lock_mode & (XFS_IOLOCK_EXCL \| XFS_MMAPLOCK_EXCL \|
				478	XFS_ILOCK_EXCL));
				479	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED \| XFS_MMAPLOCK_SHARED \|
				480	XFS_ILOCK_SHARED)));
				481	ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) \|\|
				482	inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
				483	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) \|\|
				484	inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
				485
				486	if (lock_mode & XFS_IOLOCK_EXCL) {
				487	ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL \| XFS_ILOCK_EXCL)));
				488	} else if (lock_mode & XFS_MMAPLOCK_EXCL)
				489	ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
				490
				491	try_lock = 0;
				492	i = 0;
				493	again:
				494	for (; i < inodes; i++) {
				495	ASSERT(ips[i]);
				496
				497	if (i && (ips[i] == ips[i - 1])) /* Already locked */
				498	continue;
				499
				500	/*
				501	* If try_lock is not set yet, make sure all locked inodes are
				502	* not in the AIL. If any are, set try_lock to be used later.
				503	*/
				504	if (!try_lock) {
				505	for (j = (i - 1); j >= 0 && !try_lock; j--) {
				506	lp = (xfs_log_item_t *)ips[j]->i_itemp;
				507	if (lp && (lp->li_flags & XFS_LI_IN_AIL))
				508	try_lock++;
				509	}
				510	}
				511
				512	/*
				513	* If any of the previous locks we have locked is in the AIL,
				514	* we must TRY to get the second and subsequent locks. If
				515	* we can't get any, we must release all we have
				516	* and try again.
				517	*/
				518	if (!try_lock) {
				519	xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
				520	continue;
				521	}
				522
				523	/* try_lock means we have an inode locked that is in the AIL. */
				524	ASSERT(i != 0);
				525	if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
				526	continue;
				527
				528	/*
				529	* Unlock all previous guys and try again. xfs_iunlock will try
				530	* to push the tail if the inode is in the AIL.
				531	*/
				532	attempts++;
				533	for (j = i - 1; j >= 0; j--) {
				534	/*
				535	* Check to see if we've already unlocked this one. Not
				536	* the first one going back, and the inode ptr is the
				537	* same.
				538	*/
				539	if (j != (i - 1) && ips[j] == ips[j + 1])
				540	continue;
				541
				542	xfs_iunlock(ips[j], lock_mode);
				543	}
				544
				545	if ((attempts % 5) == 0) {
				546	delay(1); /* Don't just spin the CPU */
				547	#ifdef DEBUG
				548	xfs_lock_delays++;
				549	#endif
				550	}
				551	i = 0;
				552	try_lock = 0;
				553	goto again;
				554	}
				555
				556	#ifdef DEBUG
				557	if (attempts) {
				558	if (attempts < 5) xfs_small_retries++;
				559	else if (attempts < 100) xfs_middle_retries++;
				560	else xfs_lots_retries++;
				561	} else {
				562	xfs_locked_n++;
				563	}
				564	#endif
				565	}
				566
				567	/*
				568	* xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
				569	* the iolock, the mmaplock or the ilock, but not more than one at a time. If we
				570	* lock more than one at a time, lockdep will report false positives saying we
				571	* have violated locking orders.
				572	*/
				573	void
				574	xfs_lock_two_inodes(
				575	xfs_inode_t *ip0,
				576	xfs_inode_t *ip1,
				577	uint lock_mode)
				578	{
				579	xfs_inode_t *temp;
				580	int attempts = 0;
				581	xfs_log_item_t *lp;
				582
				583	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED\|XFS_IOLOCK_EXCL)));
				584	if (lock_mode & (XFS_MMAPLOCK_SHARED\|XFS_MMAPLOCK_EXCL))
				585	ASSERT(!(lock_mode & (XFS_ILOCK_SHARED\|XFS_ILOCK_EXCL)));
				586
				587	ASSERT(ip0->i_ino != ip1->i_ino);
				588
				589	if (ip0->i_ino > ip1->i_ino) {
				590	temp = ip0;
				591	ip0 = ip1;
				592	ip1 = temp;
				593	}
				594
				595	again:
				596	xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
				597
				598	/*
				599	* If the first lock we have locked is in the AIL, we must TRY to get
				600	* the second lock. If we can't get it, we must release the first one
				601	* and try again.
				602	*/
				603	lp = (xfs_log_item_t *)ip0->i_itemp;
				604	if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
				605	if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
				606	xfs_iunlock(ip0, lock_mode);
				607	if ((++attempts % 5) == 0)
				608	delay(1); /* Don't just spin the CPU */
				609	goto again;
				610	}
				611	} else {
				612	xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
				613	}
				614	}
				615
				616
				617	void
				618	__xfs_iflock(
				619	struct xfs_inode *ip)
				620	{
				621	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
				622	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
				623
				624	do {
				625	prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
				626	if (xfs_isiflocked(ip))
				627	io_schedule();
				628	} while (!xfs_iflock_nowait(ip));
				629
				630	finish_wait(wq, &wait.wq_entry);
				631	}
				632
				633	STATIC uint
				634	_xfs_dic2xflags(
				635	uint16_t di_flags,
				636	uint64_t di_flags2,
				637	bool has_attr)
				638	{
				639	uint flags = 0;
				640
				641	if (di_flags & XFS_DIFLAG_ANY) {
				642	if (di_flags & XFS_DIFLAG_REALTIME)
				643	flags \|= FS_XFLAG_REALTIME;
				644	if (di_flags & XFS_DIFLAG_PREALLOC)
				645	flags \|= FS_XFLAG_PREALLOC;
				646	if (di_flags & XFS_DIFLAG_IMMUTABLE)
				647	flags \|= FS_XFLAG_IMMUTABLE;
				648	if (di_flags & XFS_DIFLAG_APPEND)
				649	flags \|= FS_XFLAG_APPEND;
				650	if (di_flags & XFS_DIFLAG_SYNC)
				651	flags \|= FS_XFLAG_SYNC;
				652	if (di_flags & XFS_DIFLAG_NOATIME)
				653	flags \|= FS_XFLAG_NOATIME;
				654	if (di_flags & XFS_DIFLAG_NODUMP)
				655	flags \|= FS_XFLAG_NODUMP;
				656	if (di_flags & XFS_DIFLAG_RTINHERIT)
				657	flags \|= FS_XFLAG_RTINHERIT;
				658	if (di_flags & XFS_DIFLAG_PROJINHERIT)
				659	flags \|= FS_XFLAG_PROJINHERIT;
				660	if (di_flags & XFS_DIFLAG_NOSYMLINKS)
				661	flags \|= FS_XFLAG_NOSYMLINKS;
				662	if (di_flags & XFS_DIFLAG_EXTSIZE)
				663	flags \|= FS_XFLAG_EXTSIZE;
				664	if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
				665	flags \|= FS_XFLAG_EXTSZINHERIT;
				666	if (di_flags & XFS_DIFLAG_NODEFRAG)
				667	flags \|= FS_XFLAG_NODEFRAG;
				668	if (di_flags & XFS_DIFLAG_FILESTREAM)
				669	flags \|= FS_XFLAG_FILESTREAM;
				670	}
				671
				672	if (di_flags2 & XFS_DIFLAG2_ANY) {
				673	if (di_flags2 & XFS_DIFLAG2_DAX)
				674	flags \|= FS_XFLAG_DAX;
				675	if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
				676	flags \|= FS_XFLAG_COWEXTSIZE;
				677	}
				678
				679	if (has_attr)
				680	flags \|= FS_XFLAG_HASATTR;
				681
				682	return flags;
				683	}
				684
				685	uint
				686	xfs_ip2xflags(
				687	struct xfs_inode *ip)
				688	{
				689	struct xfs_icdinode *dic = &ip->i_d;
				690
				691	return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
				692	}
				693
				694	/*
				695	* Lookups up an inode from "name". If ci_name is not NULL, then a CI match
				696	* is allowed, otherwise it has to be an exact match. If a CI match is found,
				697	* ci_name->name will point to a the actual name (caller must free) or
				698	* will be set to NULL if an exact match is found.
				699	*/
				700	int
				701	xfs_lookup(
				702	xfs_inode_t *dp,
				703	struct xfs_name *name,
				704	xfs_inode_t **ipp,
				705	struct xfs_name *ci_name)
				706	{
				707	xfs_ino_t inum;
				708	int error;
				709
				710	trace_xfs_lookup(dp, name);
				711
				712	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
				713	return -EIO;
				714
				715	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
				716	if (error)
				717	goto out_unlock;
				718
				719	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
				720	if (error)
				721	goto out_free_name;
				722
				723	return 0;
				724
				725	out_free_name:
				726	if (ci_name)
				727	kmem_free(ci_name->name);
				728	out_unlock:
				729	*ipp = NULL;
				730	return error;
				731	}
				732
				733	/*
				734	* Allocate an inode on disk and return a copy of its in-core version.
				735	* The in-core inode is locked exclusively. Set mode, nlink, and rdev
				736	* appropriately within the inode. The uid and gid for the inode are
				737	* set according to the contents of the given cred structure.
				738	*
				739	* Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
				740	* has a free inode available, call xfs_iget() to obtain the in-core
				741	* version of the allocated inode. Finally, fill in the inode and
				742	* log its initial contents. In this case, ialloc_context would be
				743	* set to NULL.
				744	*
				745	* If xfs_dialloc() does not have an available inode, it will replenish
				746	* its supply by doing an allocation. Since we can only do one
				747	* allocation within a transaction without deadlocks, we must commit
				748	* the current transaction before returning the inode itself.
				749	* In this case, therefore, we will set ialloc_context and return.
				750	* The caller should then commit the current transaction, start a new
				751	* transaction, and call xfs_ialloc() again to actually get the inode.
				752	*
				753	* To ensure that some other process does not grab the inode that
				754	* was allocated during the first call to xfs_ialloc(), this routine
				755	* also returns the [locked] bp pointing to the head of the freelist
				756	* as ialloc_context. The caller should hold this buffer across
				757	* the commit and pass it back into this routine on the second call.
				758	*
				759	* If we are allocating quota inodes, we do not have a parent inode
				760	* to attach to or associate with (i.e. pip == NULL) because they
				761	* are not linked into the directory structure - they are attached
				762	* directly to the superblock - and so have no parent.
				763	*/
				764	static int
				765	xfs_ialloc(
				766	xfs_trans_t *tp,
				767	xfs_inode_t *pip,
				768	umode_t mode,
				769	xfs_nlink_t nlink,
				770	xfs_dev_t rdev,
				771	prid_t prid,
				772	int okalloc,
				773	xfs_buf_t **ialloc_context,
				774	xfs_inode_t **ipp)
				775	{
				776	struct xfs_mount *mp = tp->t_mountp;
				777	xfs_ino_t ino;
				778	xfs_inode_t *ip;
				779	uint flags;
				780	int error;
				781	struct timespec tv;
				782	struct inode *inode;
				783
				784	/*
				785	* Call the space management code to pick
				786	* the on-disk inode to be allocated.
				787	*/
				788	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
				789	ialloc_context, &ino);
				790	if (error)
				791	return error;
				792	if (*ialloc_context \|\| ino == NULLFSINO) {
				793	*ipp = NULL;
				794	return 0;
				795	}
				796	ASSERT(*ialloc_context == NULL);
				797
				798	/*
				799	* Get the in-core inode with the lock held exclusively.
				800	* This is because we're setting fields here we need
				801	* to prevent others from looking at until we're done.
				802	*/
				803	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
				804	XFS_ILOCK_EXCL, &ip);
				805	if (error)
				806	return error;
				807	ASSERT(ip != NULL);
				808	inode = VFS_I(ip);
				809
				810	/*
				811	* We always convert v1 inodes to v2 now - we only support filesystems
				812	* with >= v2 inode capability, so there is no reason for ever leaving
				813	* an inode in v1 format.
				814	*/
				815	if (ip->i_d.di_version == 1)
				816	ip->i_d.di_version = 2;
				817
				818	inode->i_mode = mode;
				819	set_nlink(inode, nlink);
				820	ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
				821	ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
				822	xfs_set_projid(ip, prid);
				823
				824	if (pip && XFS_INHERIT_GID(pip)) {
				825	ip->i_d.di_gid = pip->i_d.di_gid;
				826	if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
				827	inode->i_mode \|= S_ISGID;
				828	}
				829
				830	/*
				831	* If the group ID of the new file does not match the effective group
				832	* ID or one of the supplementary group IDs, the S_ISGID bit is cleared
				833	* (and only if the irix_sgid_inherit compatibility variable is set).
				834	*/
				835	if ((irix_sgid_inherit) &&
				836	(inode->i_mode & S_ISGID) &&
				837	(!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
				838	inode->i_mode &= ~S_ISGID;
				839
				840	ip->i_d.di_size = 0;
				841	ip->i_d.di_nextents = 0;
				842	ASSERT(ip->i_d.di_nblocks == 0);
				843
				844	tv = current_time(inode);
				845	inode->i_mtime = tv;
				846	inode->i_atime = tv;
				847	inode->i_ctime = tv;
				848
				849	ip->i_d.di_extsize = 0;
				850	ip->i_d.di_dmevmask = 0;
				851	ip->i_d.di_dmstate = 0;
				852	ip->i_d.di_flags = 0;
				853
				854	if (ip->i_d.di_version == 3) {
				855	inode->i_version = 1;
				856	ip->i_d.di_flags2 = 0;
				857	ip->i_d.di_cowextsize = 0;
				858	ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
				859	ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
				860	}
				861
				862
				863	flags = XFS_ILOG_CORE;
				864	switch (mode & S_IFMT) {
				865	case S_IFIFO:
				866	case S_IFCHR:
				867	case S_IFBLK:
				868	case S_IFSOCK:
				869	ip->i_d.di_format = XFS_DINODE_FMT_DEV;
				870	ip->i_df.if_u2.if_rdev = rdev;
				871	ip->i_df.if_flags = 0;
				872	flags \|= XFS_ILOG_DEV;
				873	break;
				874	case S_IFREG:
				875	case S_IFDIR:
				876	if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
				877	uint di_flags = 0;
				878
				879	if (S_ISDIR(mode)) {
				880	if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
				881	di_flags \|= XFS_DIFLAG_RTINHERIT;
				882	if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
				883	di_flags \|= XFS_DIFLAG_EXTSZINHERIT;
				884	ip->i_d.di_extsize = pip->i_d.di_extsize;
				885	}
				886	if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
				887	di_flags \|= XFS_DIFLAG_PROJINHERIT;
				888	} else if (S_ISREG(mode)) {
				889	if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
				890	di_flags \|= XFS_DIFLAG_REALTIME;
				891	if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
				892	di_flags \|= XFS_DIFLAG_EXTSIZE;
				893	ip->i_d.di_extsize = pip->i_d.di_extsize;
				894	}
				895	}
				896	if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
				897	xfs_inherit_noatime)
				898	di_flags \|= XFS_DIFLAG_NOATIME;
				899	if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
				900	xfs_inherit_nodump)
				901	di_flags \|= XFS_DIFLAG_NODUMP;
				902	if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
				903	xfs_inherit_sync)
				904	di_flags \|= XFS_DIFLAG_SYNC;
				905	if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
				906	xfs_inherit_nosymlinks)
				907	di_flags \|= XFS_DIFLAG_NOSYMLINKS;
				908	if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
				909	xfs_inherit_nodefrag)
				910	di_flags \|= XFS_DIFLAG_NODEFRAG;
				911	if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
				912	di_flags \|= XFS_DIFLAG_FILESTREAM;
				913
				914	ip->i_d.di_flags \|= di_flags;
				915	}
				916	if (pip &&
				917	(pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
				918	pip->i_d.di_version == 3 &&
				919	ip->i_d.di_version == 3) {
				920	uint64_t di_flags2 = 0;
				921
				922	if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
				923	di_flags2 \|= XFS_DIFLAG2_COWEXTSIZE;
				924	ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
				925	}
				926	if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
				927	di_flags2 \|= XFS_DIFLAG2_DAX;
				928
				929	ip->i_d.di_flags2 \|= di_flags2;
				930	}
				931	/* FALLTHROUGH */
				932	case S_IFLNK:
				933	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
				934	ip->i_df.if_flags = XFS_IFEXTENTS;
				935	ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
				936	ip->i_df.if_u1.if_extents = NULL;
				937	break;
				938	default:
				939	ASSERT(0);
				940	}
				941	/*
				942	* Attribute fork settings for new inode.
				943	*/
				944	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
				945	ip->i_d.di_anextents = 0;
				946
				947	/*
				948	* Log the new values stuffed into the inode.
				949	*/
				950	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
				951	xfs_trans_log_inode(tp, ip, flags);
				952
				953	/* now that we have an i_mode we can setup the inode structure */
				954	xfs_setup_inode(ip);
				955
				956	*ipp = ip;
				957	return 0;
				958	}
				959
				960	/*
				961	* Allocates a new inode from disk and return a pointer to the
				962	* incore copy. This routine will internally commit the current
				963	* transaction and allocate a new one if the Space Manager needed
				964	* to do an allocation to replenish the inode free-list.
				965	*
				966	* This routine is designed to be called from xfs_create and
				967	* xfs_create_dir.
				968	*
				969	*/
				970	int
				971	xfs_dir_ialloc(
				972	xfs_trans_t *tpp, / input: current transaction;
				973	output: may be a new transaction. */
				974	xfs_inode_t dp, / directory within whose allocate
				975	the inode. */
				976	umode_t mode,
				977	xfs_nlink_t nlink,
				978	xfs_dev_t rdev,
				979	prid_t prid, /* project id */
				980	int okalloc, /* ok to allocate new space */
				981	xfs_inode_t *ipp, / pointer to inode; it will be
				982	locked. */
				983	int *committed)
				984
				985	{
				986	xfs_trans_t *tp;
				987	xfs_inode_t *ip;
				988	xfs_buf_t *ialloc_context = NULL;
				989	int code;
				990	void *dqinfo;
				991	uint tflags;
				992
				993	tp = *tpp;
				994	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
				995
				996	/*
				997	* xfs_ialloc will return a pointer to an incore inode if
				998	* the Space Manager has an available inode on the free
				999	* list. Otherwise, it will do an allocation and replenish
				1000	* the freelist. Since we can only do one allocation per
				1001	* transaction without deadlocks, we will need to commit the
				1002	* current transaction and start a new one. We will then
				1003	* need to call xfs_ialloc again to get the inode.
				1004	*
				1005	* If xfs_ialloc did an allocation to replenish the freelist,
				1006	* it returns the bp containing the head of the freelist as
				1007	* ialloc_context. We will hold a lock on it across the
				1008	* transaction commit so that no other process can steal
				1009	* the inode(s) that we've just allocated.
				1010	*/
				1011	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
				1012	&ialloc_context, &ip);
				1013
				1014	/*
				1015	* Return an error if we were unable to allocate a new inode.
				1016	* This should only happen if we run out of space on disk or
				1017	* encounter a disk error.
				1018	*/
				1019	if (code) {
				1020	*ipp = NULL;
				1021	return code;
				1022	}
				1023	if (!ialloc_context && !ip) {
				1024	*ipp = NULL;
				1025	return -ENOSPC;
				1026	}
				1027
				1028	/*
				1029	* If the AGI buffer is non-NULL, then we were unable to get an
				1030	* inode in one operation. We need to commit the current
				1031	* transaction and call xfs_ialloc() again. It is guaranteed
				1032	* to succeed the second time.
				1033	*/
				1034	if (ialloc_context) {
				1035	/*
				1036	* Normally, xfs_trans_commit releases all the locks.
				1037	* We call bhold to hang on to the ialloc_context across
				1038	* the commit. Holding this buffer prevents any other
				1039	* processes from doing any allocations in this
				1040	* allocation group.
				1041	*/
				1042	xfs_trans_bhold(tp, ialloc_context);
				1043
				1044	/*
				1045	* We want the quota changes to be associated with the next
				1046	* transaction, NOT this one. So, detach the dqinfo from this
				1047	* and attach it to the next transaction.
				1048	*/
				1049	dqinfo = NULL;
				1050	tflags = 0;
				1051	if (tp->t_dqinfo) {
				1052	dqinfo = (void *)tp->t_dqinfo;
				1053	tp->t_dqinfo = NULL;
				1054	tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
				1055	tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
				1056	}
				1057
				1058	code = xfs_trans_roll(&tp);
				1059	if (committed != NULL)
				1060	*committed = 1;
				1061
				1062	/*
				1063	* Re-attach the quota info that we detached from prev trx.
				1064	*/
				1065	if (dqinfo) {
				1066	tp->t_dqinfo = dqinfo;
				1067	tp->t_flags \|= tflags;
				1068	}
				1069
				1070	if (code) {
				1071	xfs_buf_relse(ialloc_context);
				1072	*tpp = tp;
				1073	*ipp = NULL;
				1074	return code;
				1075	}
				1076	xfs_trans_bjoin(tp, ialloc_context);
				1077
				1078	/*
				1079	* Call ialloc again. Since we've locked out all
				1080	* other allocations in this allocation group,
				1081	* this call should always succeed.
				1082	*/
				1083	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
				1084	okalloc, &ialloc_context, &ip);
				1085
				1086	/*
				1087	* If we get an error at this point, return to the caller
				1088	* so that the current transaction can be aborted.
				1089	*/
				1090	if (code) {
				1091	*tpp = tp;
				1092	*ipp = NULL;
				1093	return code;
				1094	}
				1095	ASSERT(!ialloc_context && ip);
				1096
				1097	} else {
				1098	if (committed != NULL)
				1099	*committed = 0;
				1100	}
				1101
				1102	*ipp = ip;
				1103	*tpp = tp;
				1104
				1105	return 0;
				1106	}
				1107
				1108	/*
				1109	* Decrement the link count on an inode & log the change. If this causes the
				1110	* link count to go to zero, move the inode to AGI unlinked list so that it can
				1111	* be freed when the last active reference goes away via xfs_inactive().
				1112	*/
				1113	static int /* error */
				1114	xfs_droplink(
				1115	xfs_trans_t *tp,
				1116	xfs_inode_t *ip)
				1117	{
				1118	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
				1119
				1120	drop_nlink(VFS_I(ip));
				1121	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				1122
				1123	if (VFS_I(ip)->i_nlink)
				1124	return 0;
				1125
				1126	return xfs_iunlink(tp, ip);
				1127	}
				1128
				1129	/*
				1130	* Increment the link count on an inode & log the change.
				1131	*/
				1132	static int
				1133	xfs_bumplink(
				1134	xfs_trans_t *tp,
				1135	xfs_inode_t *ip)
				1136	{
				1137	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
				1138
				1139	ASSERT(ip->i_d.di_version > 1);
				1140	inc_nlink(VFS_I(ip));
				1141	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				1142	return 0;
				1143	}
				1144
				1145	int
				1146	xfs_create(
				1147	xfs_inode_t *dp,
				1148	struct xfs_name *name,
				1149	umode_t mode,
				1150	xfs_dev_t rdev,
				1151	xfs_inode_t **ipp)
				1152	{
				1153	int is_dir = S_ISDIR(mode);
				1154	struct xfs_mount *mp = dp->i_mount;
				1155	struct xfs_inode *ip = NULL;
				1156	struct xfs_trans *tp = NULL;
				1157	int error;
				1158	struct xfs_defer_ops dfops;
				1159	xfs_fsblock_t first_block;
				1160	bool unlock_dp_on_error = false;
				1161	prid_t prid;
				1162	struct xfs_dquot *udqp = NULL;
				1163	struct xfs_dquot *gdqp = NULL;
				1164	struct xfs_dquot *pdqp = NULL;
				1165	struct xfs_trans_res *tres;
				1166	uint resblks;
				1167
				1168	trace_xfs_create(dp, name);
				1169
				1170	if (XFS_FORCED_SHUTDOWN(mp))
				1171	return -EIO;
				1172
				1173	prid = xfs_get_initial_prid(dp);
				1174
				1175	/*
				1176	* Make sure that we have allocated dquot(s) on disk.
				1177	*/
				1178	error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
				1179	xfs_kgid_to_gid(current_fsgid()), prid,
				1180	XFS_QMOPT_QUOTALL \| XFS_QMOPT_INHERIT,
				1181	&udqp, &gdqp, &pdqp);
				1182	if (error)
				1183	return error;
				1184
				1185	if (is_dir) {
				1186	rdev = 0;
				1187	resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
				1188	tres = &M_RES(mp)->tr_mkdir;
				1189	} else {
				1190	resblks = XFS_CREATE_SPACE_RES(mp, name->len);
				1191	tres = &M_RES(mp)->tr_create;
				1192	}
				1193
				1194	/*
				1195	* Initially assume that the file does not exist and
				1196	* reserve the resources for that case. If that is not
				1197	* the case we'll drop the one we have and get a more
				1198	* appropriate transaction later.
				1199	*/
				1200	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
				1201	if (error == -ENOSPC) {
				1202	/* flush outstanding delalloc blocks and retry */
				1203	xfs_flush_inodes(mp);
				1204	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
				1205	}
				1206	if (error == -ENOSPC) {
				1207	/* No space at all so try a "no-allocation" reservation */
				1208	resblks = 0;
				1209	error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
				1210	}
				1211	if (error)
				1212	goto out_release_inode;
				1213
				1214	xfs_ilock(dp, XFS_ILOCK_EXCL \| XFS_ILOCK_PARENT);
				1215	unlock_dp_on_error = true;
				1216
				1217	xfs_defer_init(&dfops, &first_block);
				1218
				1219	/*
				1220	* Reserve disk quota and the inode.
				1221	*/
				1222	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
				1223	pdqp, resblks, 1, 0);
				1224	if (error)
				1225	goto out_trans_cancel;
				1226
				1227	if (!resblks) {
				1228	error = xfs_dir_canenter(tp, dp, name);
				1229	if (error)
				1230	goto out_trans_cancel;
				1231	}
				1232
				1233	/*
				1234	* A newly created regular or special file just has one directory
				1235	* entry pointing to them, but a directory also the "." entry
				1236	* pointing to itself.
				1237	*/
				1238	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
				1239	prid, resblks > 0, &ip, NULL);
				1240	if (error)
				1241	goto out_trans_cancel;
				1242
				1243	/*
				1244	* Now we join the directory inode to the transaction. We do not do it
				1245	* earlier because xfs_dir_ialloc might commit the previous transaction
				1246	* (and release all the locks). An error from here on will result in
				1247	* the transaction cancel unlocking dp so don't do it explicitly in the
				1248	* error path.
				1249	*/
				1250	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
				1251	unlock_dp_on_error = false;
				1252
				1253	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
				1254	&first_block, &dfops, resblks ?
				1255	resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
				1256	if (error) {
				1257	ASSERT(error != -ENOSPC);
				1258	goto out_trans_cancel;
				1259	}
				1260	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				1261	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
				1262
				1263	if (is_dir) {
				1264	error = xfs_dir_init(tp, ip, dp);
				1265	if (error)
				1266	goto out_bmap_cancel;
				1267
				1268	error = xfs_bumplink(tp, dp);
				1269	if (error)
				1270	goto out_bmap_cancel;
				1271	}
				1272
				1273	/*
				1274	* If this is a synchronous mount, make sure that the
				1275	* create transaction goes to disk before returning to
				1276	* the user.
				1277	*/
				1278	if (mp->m_flags & (XFS_MOUNT_WSYNC\|XFS_MOUNT_DIRSYNC))
				1279	xfs_trans_set_sync(tp);
				1280
				1281	/*
				1282	* Attach the dquot(s) to the inodes and modify them incore.
				1283	* These ids of the inode couldn't have changed since the new
				1284	* inode has been locked ever since it was created.
				1285	*/
				1286	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
				1287
				1288	error = xfs_defer_finish(&tp, &dfops);
				1289	if (error)
				1290	goto out_bmap_cancel;
				1291
				1292	error = xfs_trans_commit(tp);
				1293	if (error)
				1294	goto out_release_inode;
				1295
				1296	xfs_qm_dqrele(udqp);
				1297	xfs_qm_dqrele(gdqp);
				1298	xfs_qm_dqrele(pdqp);
				1299
				1300	*ipp = ip;
				1301	return 0;
				1302
				1303	out_bmap_cancel:
				1304	xfs_defer_cancel(&dfops);
				1305	out_trans_cancel:
				1306	xfs_trans_cancel(tp);
				1307	out_release_inode:
				1308	/*
				1309	* Wait until after the current transaction is aborted to finish the
				1310	* setup of the inode and release the inode. This prevents recursive
				1311	* transactions and deadlocks from xfs_inactive.
				1312	*/
				1313	if (ip) {
				1314	xfs_finish_inode_setup(ip);
				1315	IRELE(ip);
				1316	}
				1317
				1318	xfs_qm_dqrele(udqp);
				1319	xfs_qm_dqrele(gdqp);
				1320	xfs_qm_dqrele(pdqp);
				1321
				1322	if (unlock_dp_on_error)
				1323	xfs_iunlock(dp, XFS_ILOCK_EXCL);
				1324	return error;
				1325	}
				1326
				1327	int
				1328	xfs_create_tmpfile(
				1329	struct xfs_inode *dp,
				1330	struct dentry *dentry,
				1331	umode_t mode,
				1332	struct xfs_inode **ipp)
				1333	{
				1334	struct xfs_mount *mp = dp->i_mount;
				1335	struct xfs_inode *ip = NULL;
				1336	struct xfs_trans *tp = NULL;
				1337	int error;
				1338	prid_t prid;
				1339	struct xfs_dquot *udqp = NULL;
				1340	struct xfs_dquot *gdqp = NULL;
				1341	struct xfs_dquot *pdqp = NULL;
				1342	struct xfs_trans_res *tres;
				1343	uint resblks;
				1344
				1345	if (XFS_FORCED_SHUTDOWN(mp))
				1346	return -EIO;
				1347
				1348	prid = xfs_get_initial_prid(dp);
				1349
				1350	/*
				1351	* Make sure that we have allocated dquot(s) on disk.
				1352	*/
				1353	error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
				1354	xfs_kgid_to_gid(current_fsgid()), prid,
				1355	XFS_QMOPT_QUOTALL \| XFS_QMOPT_INHERIT,
				1356	&udqp, &gdqp, &pdqp);
				1357	if (error)
				1358	return error;
				1359
				1360	resblks = XFS_IALLOC_SPACE_RES(mp);
				1361	tres = &M_RES(mp)->tr_create_tmpfile;
				1362
				1363	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
				1364	if (error == -ENOSPC) {
				1365	/* No space at all so try a "no-allocation" reservation */
				1366	resblks = 0;
				1367	error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
				1368	}
				1369	if (error)
				1370	goto out_release_inode;
				1371
				1372	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
				1373	pdqp, resblks, 1, 0);
				1374	if (error)
				1375	goto out_trans_cancel;
				1376
				1377	error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
				1378	prid, resblks > 0, &ip, NULL);
				1379	if (error)
				1380	goto out_trans_cancel;
				1381
				1382	if (mp->m_flags & XFS_MOUNT_WSYNC)
				1383	xfs_trans_set_sync(tp);
				1384
				1385	/*
				1386	* Attach the dquot(s) to the inodes and modify them incore.
				1387	* These ids of the inode couldn't have changed since the new
				1388	* inode has been locked ever since it was created.
				1389	*/
				1390	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
				1391
				1392	error = xfs_iunlink(tp, ip);
				1393	if (error)
				1394	goto out_trans_cancel;
				1395
				1396	error = xfs_trans_commit(tp);
				1397	if (error)
				1398	goto out_release_inode;
				1399
				1400	xfs_qm_dqrele(udqp);
				1401	xfs_qm_dqrele(gdqp);
				1402	xfs_qm_dqrele(pdqp);
				1403
				1404	*ipp = ip;
				1405	return 0;
				1406
				1407	out_trans_cancel:
				1408	xfs_trans_cancel(tp);
				1409	out_release_inode:
				1410	/*
				1411	* Wait until after the current transaction is aborted to finish the
				1412	* setup of the inode and release the inode. This prevents recursive
				1413	* transactions and deadlocks from xfs_inactive.
				1414	*/
				1415	if (ip) {
				1416	xfs_finish_inode_setup(ip);
				1417	IRELE(ip);
				1418	}
				1419
				1420	xfs_qm_dqrele(udqp);
				1421	xfs_qm_dqrele(gdqp);
				1422	xfs_qm_dqrele(pdqp);
				1423
				1424	return error;
				1425	}
				1426
				1427	int
				1428	xfs_link(
				1429	xfs_inode_t *tdp,
				1430	xfs_inode_t *sip,
				1431	struct xfs_name *target_name)
				1432	{
				1433	xfs_mount_t *mp = tdp->i_mount;
				1434	xfs_trans_t *tp;
				1435	int error;
				1436	struct xfs_defer_ops dfops;
				1437	xfs_fsblock_t first_block;
				1438	int resblks;
				1439
				1440	trace_xfs_link(tdp, target_name);
				1441
				1442	ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
				1443
				1444	if (XFS_FORCED_SHUTDOWN(mp))
				1445	return -EIO;
				1446
				1447	error = xfs_qm_dqattach(sip, 0);
				1448	if (error)
				1449	goto std_return;
				1450
				1451	error = xfs_qm_dqattach(tdp, 0);
				1452	if (error)
				1453	goto std_return;
				1454
				1455	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
				1456	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
				1457	if (error == -ENOSPC) {
				1458	resblks = 0;
				1459	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
				1460	}
				1461	if (error)
				1462	goto std_return;
				1463
				1464	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
				1465
				1466	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
				1467	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
				1468
				1469	/*
				1470	* If we are using project inheritance, we only allow hard link
				1471	* creation in our tree when the project IDs are the same; else
				1472	* the tree quota mechanism could be circumvented.
				1473	*/
				1474	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
				1475	(xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
				1476	error = -EXDEV;
				1477	goto error_return;
				1478	}
				1479
				1480	if (!resblks) {
				1481	error = xfs_dir_canenter(tp, tdp, target_name);
				1482	if (error)
				1483	goto error_return;
				1484	}
				1485
				1486	xfs_defer_init(&dfops, &first_block);
				1487
				1488	/*
				1489	* Handle initial link state of O_TMPFILE inode
				1490	*/
				1491	if (VFS_I(sip)->i_nlink == 0) {
				1492	error = xfs_iunlink_remove(tp, sip);
				1493	if (error)
				1494	goto error_return;
				1495	}
				1496
				1497	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
				1498	&first_block, &dfops, resblks);
				1499	if (error)
				1500	goto error_return;
				1501	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				1502	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
				1503
				1504	error = xfs_bumplink(tp, sip);
				1505	if (error)
				1506	goto error_return;
				1507
				1508	/*
				1509	* If this is a synchronous mount, make sure that the
				1510	* link transaction goes to disk before returning to
				1511	* the user.
				1512	*/
				1513	if (mp->m_flags & (XFS_MOUNT_WSYNC\|XFS_MOUNT_DIRSYNC))
				1514	xfs_trans_set_sync(tp);
				1515
				1516	error = xfs_defer_finish(&tp, &dfops);
				1517	if (error) {
				1518	xfs_defer_cancel(&dfops);
				1519	goto error_return;
				1520	}
				1521
				1522	return xfs_trans_commit(tp);
				1523
				1524	error_return:
				1525	xfs_trans_cancel(tp);
				1526	std_return:
				1527	return error;
				1528	}
				1529
				1530	/*
				1531	* Free up the underlying blocks past new_size. The new size must be smaller
				1532	* than the current size. This routine can be used both for the attribute and
				1533	* data fork, and does not modify the inode size, which is left to the caller.
				1534	*
				1535	* The transaction passed to this routine must have made a permanent log
				1536	* reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
				1537	* given transaction and start new ones, so make sure everything involved in
				1538	* the transaction is tidy before calling here. Some transaction will be
				1539	* returned to the caller to be committed. The incoming transaction must
				1540	* already include the inode, and both inode locks must be held exclusively.
				1541	* The inode must also be "held" within the transaction. On return the inode
				1542	* will be "held" within the returned transaction. This routine does NOT
				1543	* require any disk space to be reserved for it within the transaction.
				1544	*
				1545	* If we get an error, we must return with the inode locked and linked into the
				1546	* current transaction. This keeps things simple for the higher level code,
				1547	* because it always knows that the inode is locked and held in the transaction
				1548	* that returns to it whether errors occur or not. We don't mark the inode
				1549	* dirty on error so that transactions can be easily aborted if possible.
				1550	*/
				1551	int
				1552	xfs_itruncate_extents(
				1553	struct xfs_trans **tpp,
				1554	struct xfs_inode *ip,
				1555	int whichfork,
				1556	xfs_fsize_t new_size)
				1557	{
				1558	struct xfs_mount *mp = ip->i_mount;
				1559	struct xfs_trans tp = tpp;
				1560	struct xfs_defer_ops dfops;
				1561	xfs_fsblock_t first_block;
				1562	xfs_fileoff_t first_unmap_block;
				1563	xfs_fileoff_t last_block;
				1564	xfs_filblks_t unmap_len;
				1565	int error = 0;
				1566	int done = 0;
				1567
				1568	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
				1569	ASSERT(!atomic_read(&VFS_I(ip)->i_count) \|\|
				1570	xfs_isilocked(ip, XFS_IOLOCK_EXCL));
				1571	ASSERT(new_size <= XFS_ISIZE(ip));
				1572	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
				1573	ASSERT(ip->i_itemp != NULL);
				1574	ASSERT(ip->i_itemp->ili_lock_flags == 0);
				1575	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
				1576
				1577	trace_xfs_itruncate_extents_start(ip, new_size);
				1578
				1579	/*
				1580	* Since it is possible for space to become allocated beyond
				1581	* the end of the file (in a crash where the space is allocated
				1582	* but the inode size is not yet updated), simply remove any
				1583	* blocks which show up between the new EOF and the maximum
				1584	* possible file size. If the first block to be removed is
				1585	* beyond the maximum file size (ie it is the same as last_block),
				1586	* then there is nothing to do.
				1587	*/
				1588	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
				1589	last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
				1590	if (first_unmap_block == last_block)
				1591	return 0;
				1592
				1593	ASSERT(first_unmap_block < last_block);
				1594	unmap_len = last_block - first_unmap_block + 1;
				1595	while (!done) {
				1596	xfs_defer_init(&dfops, &first_block);
				1597	error = xfs_bunmapi(tp, ip,
				1598	first_unmap_block, unmap_len,
				1599	xfs_bmapi_aflag(whichfork),
				1600	XFS_ITRUNC_MAX_EXTENTS,
				1601	&first_block, &dfops,
				1602	&done);
				1603	if (error)
				1604	goto out_bmap_cancel;
				1605
				1606	/*
				1607	* Duplicate the transaction that has the permanent
				1608	* reservation and commit the old transaction.
				1609	*/
				1610	xfs_defer_ijoin(&dfops, ip);
				1611	error = xfs_defer_finish(&tp, &dfops);
				1612	if (error)
				1613	goto out_bmap_cancel;
				1614
				1615	error = xfs_trans_roll_inode(&tp, ip);
				1616	if (error)
				1617	goto out;
				1618	}
				1619
				1620	/* Remove all pending CoW reservations. */
				1621	error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block,
				1622	last_block, true);
				1623	if (error)
				1624	goto out;
				1625
				1626	/*
				1627	* Clear the reflink flag if there are no data fork blocks and
				1628	* there are no extents staged in the cow fork.
				1629	*/
				1630	if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) {
				1631	if (ip->i_d.di_nblocks == 0)
				1632	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
				1633	xfs_inode_clear_cowblocks_tag(ip);
				1634	}
				1635
				1636	/*
				1637	* Always re-log the inode so that our permanent transaction can keep
				1638	* on rolling it forward in the log.
				1639	*/
				1640	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				1641
				1642	trace_xfs_itruncate_extents_end(ip, new_size);
				1643
				1644	out:
				1645	*tpp = tp;
				1646	return error;
				1647	out_bmap_cancel:
				1648	/*
				1649	* If the bunmapi call encounters an error, return to the caller where
				1650	* the transaction can be properly aborted. We just need to make sure
				1651	* we're not holding any resources that we were not when we came in.
				1652	*/
				1653	xfs_defer_cancel(&dfops);
				1654	goto out;
				1655	}
				1656
				1657	int
				1658	xfs_release(
				1659	xfs_inode_t *ip)
				1660	{
				1661	xfs_mount_t *mp = ip->i_mount;
				1662	int error;
				1663
				1664	if (!S_ISREG(VFS_I(ip)->i_mode) \|\| (VFS_I(ip)->i_mode == 0))
				1665	return 0;
				1666
				1667	/* If this is a read-only mount, don't do this (would generate I/O) */
				1668	if (mp->m_flags & XFS_MOUNT_RDONLY)
				1669	return 0;
				1670
				1671	if (!XFS_FORCED_SHUTDOWN(mp)) {
				1672	int truncated;
				1673
				1674	/*
				1675	* If we previously truncated this file and removed old data
				1676	* in the process, we want to initiate "early" writeout on
				1677	* the last close. This is an attempt to combat the notorious
				1678	* NULL files problem which is particularly noticeable from a
				1679	* truncate down, buffered (re-)write (delalloc), followed by
				1680	* a crash. What we are effectively doing here is
				1681	* significantly reducing the time window where we'd otherwise
				1682	* be exposed to that problem.
				1683	*/
				1684	truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
				1685	if (truncated) {
				1686	xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
				1687	if (ip->i_delayed_blks > 0) {
				1688	error = filemap_flush(VFS_I(ip)->i_mapping);
				1689	if (error)
				1690	return error;
				1691	}
				1692	}
				1693	}
				1694
				1695	if (VFS_I(ip)->i_nlink == 0)
				1696	return 0;
				1697
				1698	if (xfs_can_free_eofblocks(ip, false)) {
				1699
				1700	/*
				1701	* Check if the inode is being opened, written and closed
				1702	* frequently and we have delayed allocation blocks outstanding
				1703	* (e.g. streaming writes from the NFS server), truncating the
				1704	* blocks past EOF will cause fragmentation to occur.
				1705	*
				1706	* In this case don't do the truncation, but we have to be
				1707	* careful how we detect this case. Blocks beyond EOF show up as
				1708	* i_delayed_blks even when the inode is clean, so we need to
				1709	* truncate them away first before checking for a dirty release.
				1710	* Hence on the first dirty close we will still remove the
				1711	* speculative allocation, but after that we will leave it in
				1712	* place.
				1713	*/
				1714	if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
				1715	return 0;
				1716	/*
				1717	* If we can't get the iolock just skip truncating the blocks
				1718	* past EOF because we could deadlock with the mmap_sem
				1719	* otherwise. We'll get another chance to drop them once the
				1720	* last reference to the inode is dropped, so we'll never leak
				1721	* blocks permanently.
				1722	*/
				1723	if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
				1724	error = xfs_free_eofblocks(ip);
				1725	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
				1726	if (error)
				1727	return error;
				1728	}
				1729
				1730	/* delalloc blocks after truncation means it really is dirty */
				1731	if (ip->i_delayed_blks)
				1732	xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
				1733	}
				1734	return 0;
				1735	}
				1736
				1737	/*
				1738	* xfs_inactive_truncate
				1739	*
				1740	* Called to perform a truncate when an inode becomes unlinked.
				1741	*/
				1742	STATIC int
				1743	xfs_inactive_truncate(
				1744	struct xfs_inode *ip)
				1745	{
				1746	struct xfs_mount *mp = ip->i_mount;
				1747	struct xfs_trans *tp;
				1748	int error;
				1749
				1750	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
				1751	if (error) {
				1752	ASSERT(XFS_FORCED_SHUTDOWN(mp));
				1753	return error;
				1754	}
				1755
				1756	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1757	xfs_trans_ijoin(tp, ip, 0);
				1758
				1759	/*
				1760	* Log the inode size first to prevent stale data exposure in the event
				1761	* of a system crash before the truncate completes. See the related
				1762	* comment in xfs_vn_setattr_size() for details.
				1763	*/
				1764	ip->i_d.di_size = 0;
				1765	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				1766
				1767	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
				1768	if (error)
				1769	goto error_trans_cancel;
				1770
				1771	ASSERT(ip->i_d.di_nextents == 0);
				1772
				1773	error = xfs_trans_commit(tp);
				1774	if (error)
				1775	goto error_unlock;
				1776
				1777	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1778	return 0;
				1779
				1780	error_trans_cancel:
				1781	xfs_trans_cancel(tp);
				1782	error_unlock:
				1783	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1784	return error;
				1785	}
				1786
				1787	/*
				1788	* xfs_inactive_ifree()
				1789	*
				1790	* Perform the inode free when an inode is unlinked.
				1791	*/
				1792	STATIC int
				1793	xfs_inactive_ifree(
				1794	struct xfs_inode *ip)
				1795	{
				1796	struct xfs_defer_ops dfops;
				1797	xfs_fsblock_t first_block;
				1798	struct xfs_mount *mp = ip->i_mount;
				1799	struct xfs_trans *tp;
				1800	int error;
				1801
				1802	/*
				1803	* We try to use a per-AG reservation for any block needed by the finobt
				1804	* tree, but as the finobt feature predates the per-AG reservation
				1805	* support a degraded file system might not have enough space for the
				1806	* reservation at mount time. In that case try to dip into the reserved
				1807	* pool and pray.
				1808	*
				1809	* Send a warning if the reservation does happen to fail, as the inode
				1810	* now remains allocated and sits on the unlinked list until the fs is
				1811	* repaired.
				1812	*/
				1813	if (unlikely(mp->m_inotbt_nores)) {
				1814	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
				1815	XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
				1816	&tp);
				1817	} else {
				1818	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
				1819	}
				1820	if (error) {
				1821	if (error == -ENOSPC) {
				1822	xfs_warn_ratelimited(mp,
				1823	"Failed to remove inode(s) from unlinked list. "
				1824	"Please free space, unmount and run xfs_repair.");
				1825	} else {
				1826	ASSERT(XFS_FORCED_SHUTDOWN(mp));
				1827	}
				1828	return error;
				1829	}
				1830
				1831	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1832	xfs_trans_ijoin(tp, ip, 0);
				1833
				1834	xfs_defer_init(&dfops, &first_block);
				1835	error = xfs_ifree(tp, ip, &dfops);
				1836	if (error) {
				1837	/*
				1838	* If we fail to free the inode, shut down. The cancel
				1839	* might do that, we need to make sure. Otherwise the
				1840	* inode might be lost for a long time or forever.
				1841	*/
				1842	if (!XFS_FORCED_SHUTDOWN(mp)) {
				1843	xfs_notice(mp, "%s: xfs_ifree returned error %d",
				1844	__func__, error);
				1845	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
				1846	}
				1847	xfs_trans_cancel(tp);
				1848	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1849	return error;
				1850	}
				1851
				1852	/*
				1853	* Credit the quota account(s). The inode is gone.
				1854	*/
				1855	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
				1856
				1857	/*
				1858	* Just ignore errors at this point. There is nothing we can do except
				1859	* to try to keep going. Make sure it's not a silent error.
				1860	*/
				1861	error = xfs_defer_finish(&tp, &dfops);
				1862	if (error) {
				1863	xfs_notice(mp, "%s: xfs_defer_finish returned error %d",
				1864	__func__, error);
				1865	xfs_defer_cancel(&dfops);
				1866	}
				1867	error = xfs_trans_commit(tp);
				1868	if (error)
				1869	xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
				1870	__func__, error);
				1871
				1872	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1873	return 0;
				1874	}
				1875
				1876	/*
				1877	* xfs_inactive
				1878	*
				1879	* This is called when the vnode reference count for the vnode
				1880	* goes to zero. If the file has been unlinked, then it must
				1881	* now be truncated. Also, we clear all of the read-ahead state
				1882	* kept for the inode here since the file is now closed.
				1883	*/
				1884	void
				1885	xfs_inactive(
				1886	xfs_inode_t *ip)
				1887	{
				1888	struct xfs_mount *mp;
				1889	int error;
				1890	int truncate = 0;
				1891
				1892	/*
				1893	* If the inode is already free, then there can be nothing
				1894	* to clean up here.
				1895	*/
				1896	if (VFS_I(ip)->i_mode == 0) {
				1897	ASSERT(ip->i_df.if_real_bytes == 0);
				1898	ASSERT(ip->i_df.if_broot_bytes == 0);
				1899	return;
				1900	}
				1901
				1902	mp = ip->i_mount;
				1903	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
				1904
				1905	/* If this is a read-only mount, don't do this (would generate I/O) */
				1906	if (mp->m_flags & XFS_MOUNT_RDONLY)
				1907	return;
				1908
				1909	if (VFS_I(ip)->i_nlink != 0) {
				1910	/*
				1911	* force is true because we are evicting an inode from the
				1912	* cache. Post-eof blocks must be freed, lest we end up with
				1913	* broken free space accounting.
				1914	*
				1915	* Note: don't bother with iolock here since lockdep complains
				1916	* about acquiring it in reclaim context. We have the only
				1917	* reference to the inode at this point anyways.
				1918	*/
				1919	if (xfs_can_free_eofblocks(ip, true))
				1920	xfs_free_eofblocks(ip);
				1921
				1922	return;
				1923	}
				1924
				1925	if (S_ISREG(VFS_I(ip)->i_mode) &&
				1926	(ip->i_d.di_size != 0 \|\| XFS_ISIZE(ip) != 0 \|\|
				1927	ip->i_d.di_nextents > 0 \|\| ip->i_delayed_blks > 0))
				1928	truncate = 1;
				1929
				1930	error = xfs_qm_dqattach(ip, 0);
				1931	if (error)
				1932	return;
				1933
				1934	if (S_ISLNK(VFS_I(ip)->i_mode))
				1935	error = xfs_inactive_symlink(ip);
				1936	else if (truncate)
				1937	error = xfs_inactive_truncate(ip);
				1938	if (error)
				1939	return;
				1940
				1941	/*
				1942	* If there are attributes associated with the file then blow them away
				1943	* now. The code calls a routine that recursively deconstructs the
				1944	* attribute fork. If also blows away the in-core attribute fork.
				1945	*/
				1946	if (XFS_IFORK_Q(ip)) {
				1947	error = xfs_attr_inactive(ip);
				1948	if (error)
				1949	return;
				1950	}
				1951
				1952	ASSERT(!ip->i_afp);
				1953	ASSERT(ip->i_d.di_anextents == 0);
				1954	ASSERT(ip->i_d.di_forkoff == 0);
				1955
				1956	/*
				1957	* Free the inode.
				1958	*/
				1959	error = xfs_inactive_ifree(ip);
				1960	if (error)
				1961	return;
				1962
				1963	/*
				1964	* Release the dquots held by inode, if any.
				1965	*/
				1966	xfs_qm_dqdetach(ip);
				1967	}
				1968
				1969	/*
				1970	* This is called when the inode's link count goes to 0 or we are creating a
				1971	* tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
				1972	* set to true as the link count is dropped to zero by the VFS after we've
				1973	* created the file successfully, so we have to add it to the unlinked list
				1974	* while the link count is non-zero.
				1975	*
				1976	* We place the on-disk inode on a list in the AGI. It will be pulled from this
				1977	* list when the inode is freed.
				1978	*/
				1979	STATIC int
				1980	xfs_iunlink(
				1981	struct xfs_trans *tp,
				1982	struct xfs_inode *ip)
				1983	{
				1984	xfs_mount_t *mp = tp->t_mountp;
				1985	xfs_agi_t *agi;
				1986	xfs_dinode_t *dip;
				1987	xfs_buf_t *agibp;
				1988	xfs_buf_t *ibp;
				1989	xfs_agino_t agino;
				1990	short bucket_index;
				1991	int offset;
				1992	int error;
				1993
				1994	ASSERT(VFS_I(ip)->i_mode != 0);
				1995
				1996	/*
				1997	* Get the agi buffer first. It ensures lock ordering
				1998	* on the list.
				1999	*/
				2000	error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
				2001	if (error)
				2002	return error;
				2003	agi = XFS_BUF_TO_AGI(agibp);
				2004
				2005	/*
				2006	* Get the index into the agi hash table for the
				2007	* list this inode will go on.
				2008	*/
				2009	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
				2010	ASSERT(agino != 0);
				2011	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
				2012	ASSERT(agi->agi_unlinked[bucket_index]);
				2013	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
				2014
				2015	if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
				2016	/*
				2017	* There is already another inode in the bucket we need
				2018	* to add ourselves to. Add us at the front of the list.
				2019	* Here we put the head pointer into our next pointer,
				2020	* and then we fall through to point the head at us.
				2021	*/
				2022	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
				2023	0, 0);
				2024	if (error)
				2025	return error;
				2026
				2027	ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
				2028	dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
				2029	offset = ip->i_imap.im_boffset +
				2030	offsetof(xfs_dinode_t, di_next_unlinked);
				2031
				2032	/* need to recalc the inode CRC if appropriate */
				2033	xfs_dinode_calc_crc(mp, dip);
				2034
				2035	xfs_trans_inode_buf(tp, ibp);
				2036	xfs_trans_log_buf(tp, ibp, offset,
				2037	(offset + sizeof(xfs_agino_t) - 1));
				2038	xfs_inobp_check(mp, ibp);
				2039	}
				2040
				2041	/*
				2042	* Point the bucket head pointer at the inode being inserted.
				2043	*/
				2044	ASSERT(agino != 0);
				2045	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
				2046	offset = offsetof(xfs_agi_t, agi_unlinked) +
				2047	(sizeof(xfs_agino_t) * bucket_index);
				2048	xfs_trans_log_buf(tp, agibp, offset,
				2049	(offset + sizeof(xfs_agino_t) - 1));
				2050	return 0;
				2051	}
				2052
				2053	/*
				2054	* Pull the on-disk inode from the AGI unlinked list.
				2055	*/
				2056	STATIC int
				2057	xfs_iunlink_remove(
				2058	xfs_trans_t *tp,
				2059	xfs_inode_t *ip)
				2060	{
				2061	xfs_ino_t next_ino;
				2062	xfs_mount_t *mp;
				2063	xfs_agi_t *agi;
				2064	xfs_dinode_t *dip;
				2065	xfs_buf_t *agibp;
				2066	xfs_buf_t *ibp;
				2067	xfs_agnumber_t agno;
				2068	xfs_agino_t agino;
				2069	xfs_agino_t next_agino;
				2070	xfs_buf_t *last_ibp;
				2071	xfs_dinode_t *last_dip = NULL;
				2072	short bucket_index;
				2073	int offset, last_offset = 0;
				2074	int error;
				2075
				2076	mp = tp->t_mountp;
				2077	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
				2078
				2079	/*
				2080	* Get the agi buffer first. It ensures lock ordering
				2081	* on the list.
				2082	*/
				2083	error = xfs_read_agi(mp, tp, agno, &agibp);
				2084	if (error)
				2085	return error;
				2086
				2087	agi = XFS_BUF_TO_AGI(agibp);
				2088
				2089	/*
				2090	* Get the index into the agi hash table for the
				2091	* list this inode will go on.
				2092	*/
				2093	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
				2094	ASSERT(agino != 0);
				2095	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
				2096	ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
				2097	ASSERT(agi->agi_unlinked[bucket_index]);
				2098
				2099	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
				2100	/*
				2101	* We're at the head of the list. Get the inode's on-disk
				2102	* buffer to see if there is anyone after us on the list.
				2103	* Only modify our next pointer if it is not already NULLAGINO.
				2104	* This saves us the overhead of dealing with the buffer when
				2105	* there is no need to change it.
				2106	*/
				2107	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
				2108	0, 0);
				2109	if (error) {
				2110	xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
				2111	__func__, error);
				2112	return error;
				2113	}
				2114	next_agino = be32_to_cpu(dip->di_next_unlinked);
				2115	ASSERT(next_agino != 0);
				2116	if (next_agino != NULLAGINO) {
				2117	dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
				2118	offset = ip->i_imap.im_boffset +
				2119	offsetof(xfs_dinode_t, di_next_unlinked);
				2120
				2121	/* need to recalc the inode CRC if appropriate */
				2122	xfs_dinode_calc_crc(mp, dip);
				2123
				2124	xfs_trans_inode_buf(tp, ibp);
				2125	xfs_trans_log_buf(tp, ibp, offset,
				2126	(offset + sizeof(xfs_agino_t) - 1));
				2127	xfs_inobp_check(mp, ibp);
				2128	} else {
				2129	xfs_trans_brelse(tp, ibp);
				2130	}
				2131	/*
				2132	* Point the bucket head pointer at the next inode.
				2133	*/
				2134	ASSERT(next_agino != 0);
				2135	ASSERT(next_agino != agino);
				2136	agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
				2137	offset = offsetof(xfs_agi_t, agi_unlinked) +
				2138	(sizeof(xfs_agino_t) * bucket_index);
				2139	xfs_trans_log_buf(tp, agibp, offset,
				2140	(offset + sizeof(xfs_agino_t) - 1));
				2141	} else {
				2142	/*
				2143	* We need to search the list for the inode being freed.
				2144	*/
				2145	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
				2146	last_ibp = NULL;
				2147	while (next_agino != agino) {
				2148	struct xfs_imap imap;
				2149
				2150	if (last_ibp)
				2151	xfs_trans_brelse(tp, last_ibp);
				2152
				2153	imap.im_blkno = 0;
				2154	next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
				2155
				2156	error = xfs_imap(mp, tp, next_ino, &imap, 0);
				2157	if (error) {
				2158	xfs_warn(mp,
				2159	"%s: xfs_imap returned error %d.",
				2160	__func__, error);
				2161	return error;
				2162	}
				2163
				2164	error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
				2165	&last_ibp, 0, 0);
				2166	if (error) {
				2167	xfs_warn(mp,
				2168	"%s: xfs_imap_to_bp returned error %d.",
				2169	__func__, error);
				2170	return error;
				2171	}
				2172
				2173	last_offset = imap.im_boffset;
				2174	next_agino = be32_to_cpu(last_dip->di_next_unlinked);
				2175	ASSERT(next_agino != NULLAGINO);
				2176	ASSERT(next_agino != 0);
				2177	}
				2178
				2179	/*
				2180	* Now last_ibp points to the buffer previous to us on the
				2181	* unlinked list. Pull us from the list.
				2182	*/
				2183	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
				2184	0, 0);
				2185	if (error) {
				2186	xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
				2187	__func__, error);
				2188	return error;
				2189	}
				2190	next_agino = be32_to_cpu(dip->di_next_unlinked);
				2191	ASSERT(next_agino != 0);
				2192	ASSERT(next_agino != agino);
				2193	if (next_agino != NULLAGINO) {
				2194	dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
				2195	offset = ip->i_imap.im_boffset +
				2196	offsetof(xfs_dinode_t, di_next_unlinked);
				2197
				2198	/* need to recalc the inode CRC if appropriate */
				2199	xfs_dinode_calc_crc(mp, dip);
				2200
				2201	xfs_trans_inode_buf(tp, ibp);
				2202	xfs_trans_log_buf(tp, ibp, offset,
				2203	(offset + sizeof(xfs_agino_t) - 1));
				2204	xfs_inobp_check(mp, ibp);
				2205	} else {
				2206	xfs_trans_brelse(tp, ibp);
				2207	}
				2208	/*
				2209	* Point the previous inode on the list to the next inode.
				2210	*/
				2211	last_dip->di_next_unlinked = cpu_to_be32(next_agino);
				2212	ASSERT(next_agino != 0);
				2213	offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
				2214
				2215	/* need to recalc the inode CRC if appropriate */
				2216	xfs_dinode_calc_crc(mp, last_dip);
				2217
				2218	xfs_trans_inode_buf(tp, last_ibp);
				2219	xfs_trans_log_buf(tp, last_ibp, offset,
				2220	(offset + sizeof(xfs_agino_t) - 1));
				2221	xfs_inobp_check(mp, last_ibp);
				2222	}
				2223	return 0;
				2224	}
				2225
				2226	/*
				2227	* A big issue when freeing the inode cluster is that we _cannot_ skip any
				2228	* inodes that are in memory - they all must be marked stale and attached to
				2229	* the cluster buffer.
				2230	*/
				2231	STATIC int
				2232	xfs_ifree_cluster(
				2233	xfs_inode_t *free_ip,
				2234	xfs_trans_t *tp,
				2235	struct xfs_icluster *xic)
				2236	{
				2237	xfs_mount_t *mp = free_ip->i_mount;
				2238	int blks_per_cluster;
				2239	int inodes_per_cluster;
				2240	int nbufs;
				2241	int i, j;
				2242	int ioffset;
				2243	xfs_daddr_t blkno;
				2244	xfs_buf_t *bp;
				2245	xfs_inode_t *ip;
				2246	xfs_inode_log_item_t *iip;
				2247	xfs_log_item_t *lip;
				2248	struct xfs_perag *pag;
				2249	xfs_ino_t inum;
				2250
				2251	inum = xic->first_ino;
				2252	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
				2253	blks_per_cluster = xfs_icluster_size_fsb(mp);
				2254	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
				2255	nbufs = mp->m_ialloc_blks / blks_per_cluster;
				2256
				2257	for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
				2258	/*
				2259	* The allocation bitmap tells us which inodes of the chunk were
				2260	* physically allocated. Skip the cluster if an inode falls into
				2261	* a sparse region.
				2262	*/
				2263	ioffset = inum - xic->first_ino;
				2264	if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
				2265	ASSERT(do_mod(ioffset, inodes_per_cluster) == 0);
				2266	continue;
				2267	}
				2268
				2269	blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
				2270	XFS_INO_TO_AGBNO(mp, inum));
				2271
				2272	/*
				2273	* We obtain and lock the backing buffer first in the process
				2274	* here, as we have to ensure that any dirty inode that we
				2275	* can't get the flush lock on is attached to the buffer.
				2276	* If we scan the in-memory inodes first, then buffer IO can
				2277	* complete before we get a lock on it, and hence we may fail
				2278	* to mark all the active inodes on the buffer stale.
				2279	*/
				2280	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
				2281	mp->m_bsize * blks_per_cluster,
				2282	XBF_UNMAPPED);
				2283
				2284	if (!bp)
				2285	return -ENOMEM;
				2286
				2287	/*
				2288	* This buffer may not have been correctly initialised as we
				2289	* didn't read it from disk. That's not important because we are
				2290	* only using to mark the buffer as stale in the log, and to
				2291	* attach stale cached inodes on it. That means it will never be
				2292	* dispatched for IO. If it is, we want to know about it, and we
				2293	* want it to fail. We can acheive this by adding a write
				2294	* verifier to the buffer.
				2295	*/
				2296	bp->b_ops = &xfs_inode_buf_ops;
				2297
				2298	/*
				2299	* Walk the inodes already attached to the buffer and mark them
				2300	* stale. These will all have the flush locks held, so an
				2301	* in-memory inode walk can't lock them. By marking them all
				2302	* stale first, we will not attempt to lock them in the loop
				2303	* below as the XFS_ISTALE flag will be set.
				2304	*/
				2305	lip = bp->b_fspriv;
				2306	while (lip) {
				2307	if (lip->li_type == XFS_LI_INODE) {
				2308	iip = (xfs_inode_log_item_t *)lip;
				2309	ASSERT(iip->ili_logged == 1);
				2310	lip->li_cb = xfs_istale_done;
				2311	xfs_trans_ail_copy_lsn(mp->m_ail,
				2312	&iip->ili_flush_lsn,
				2313	&iip->ili_item.li_lsn);
				2314	xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
				2315	}
				2316	lip = lip->li_bio_list;
				2317	}
				2318
				2319
				2320	/*
				2321	* For each inode in memory attempt to add it to the inode
				2322	* buffer and set it up for being staled on buffer IO
				2323	* completion. This is safe as we've locked out tail pushing
				2324	* and flushing by locking the buffer.
				2325	*
				2326	* We have already marked every inode that was part of a
				2327	* transaction stale above, which means there is no point in
				2328	* even trying to lock them.
				2329	*/
				2330	for (i = 0; i < inodes_per_cluster; i++) {
				2331	retry:
				2332	rcu_read_lock();
				2333	ip = radix_tree_lookup(&pag->pag_ici_root,
				2334	XFS_INO_TO_AGINO(mp, (inum + i)));
				2335
				2336	/* Inode not in memory, nothing to do */
				2337	if (!ip) {
				2338	rcu_read_unlock();
				2339	continue;
				2340	}
				2341
				2342	/*
				2343	* because this is an RCU protected lookup, we could
				2344	* find a recently freed or even reallocated inode
				2345	* during the lookup. We need to check under the
				2346	* i_flags_lock for a valid inode here. Skip it if it
				2347	* is not valid, the wrong inode or stale.
				2348	*/
				2349	spin_lock(&ip->i_flags_lock);
				2350	if (ip->i_ino != inum + i \|\|
				2351	__xfs_iflags_test(ip, XFS_ISTALE)) {
				2352	spin_unlock(&ip->i_flags_lock);
				2353	rcu_read_unlock();
				2354	continue;
				2355	}
				2356	spin_unlock(&ip->i_flags_lock);
				2357
				2358	/*
				2359	* Don't try to lock/unlock the current inode, but we
				2360	* _cannot_ skip the other inodes that we did not find
				2361	* in the list attached to the buffer and are not
				2362	* already marked stale. If we can't lock it, back off
				2363	* and retry.
				2364	*/
				2365	if (ip != free_ip) {
				2366	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
				2367	rcu_read_unlock();
				2368	delay(1);
				2369	goto retry;
				2370	}
				2371
				2372	/*
				2373	* Check the inode number again in case we're
				2374	* racing with freeing in xfs_reclaim_inode().
				2375	* See the comments in that function for more
				2376	* information as to why the initial check is
				2377	* not sufficient.
				2378	*/
				2379	if (ip->i_ino != inum + i) {
				2380	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				2381	rcu_read_unlock();
				2382	continue;
				2383	}
				2384	}
				2385	rcu_read_unlock();
				2386
				2387	xfs_iflock(ip);
				2388	xfs_iflags_set(ip, XFS_ISTALE);
				2389
				2390	/*
				2391	* we don't need to attach clean inodes or those only
				2392	* with unlogged changes (which we throw away, anyway).
				2393	*/
				2394	iip = ip->i_itemp;
				2395	if (!iip \|\| xfs_inode_clean(ip)) {
				2396	ASSERT(ip != free_ip);
				2397	xfs_ifunlock(ip);
				2398	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				2399	continue;
				2400	}
				2401
				2402	iip->ili_last_fields = iip->ili_fields;
				2403	iip->ili_fields = 0;
				2404	iip->ili_fsync_fields = 0;
				2405	iip->ili_logged = 1;
				2406	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
				2407	&iip->ili_item.li_lsn);
				2408
				2409	xfs_buf_attach_iodone(bp, xfs_istale_done,
				2410	&iip->ili_item);
				2411
				2412	if (ip != free_ip)
				2413	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				2414	}
				2415
				2416	xfs_trans_stale_inode_buf(tp, bp);
				2417	xfs_trans_binval(tp, bp);
				2418	}
				2419
				2420	xfs_perag_put(pag);
				2421	return 0;
				2422	}
				2423
				2424	/*
				2425	* Free any local-format buffers sitting around before we reset to
				2426	* extents format.
				2427	*/
				2428	static inline void
				2429	xfs_ifree_local_data(
				2430	struct xfs_inode *ip,
				2431	int whichfork)
				2432	{
				2433	struct xfs_ifork *ifp;
				2434
				2435	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
				2436	return;
				2437
				2438	ifp = XFS_IFORK_PTR(ip, whichfork);
				2439	xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
				2440	}
				2441
				2442	/*
				2443	* This is called to return an inode to the inode free list.
				2444	* The inode should already be truncated to 0 length and have
				2445	* no pages associated with it. This routine also assumes that
				2446	* the inode is already a part of the transaction.
				2447	*
				2448	* The on-disk copy of the inode will have been added to the list
				2449	* of unlinked inodes in the AGI. We need to remove the inode from
				2450	* that list atomically with respect to freeing it here.
				2451	*/
				2452	int
				2453	xfs_ifree(
				2454	xfs_trans_t *tp,
				2455	xfs_inode_t *ip,
				2456	struct xfs_defer_ops *dfops)
				2457	{
				2458	int error;
				2459	struct xfs_icluster xic = { 0 };
				2460
				2461	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
				2462	ASSERT(VFS_I(ip)->i_nlink == 0);
				2463	ASSERT(ip->i_d.di_nextents == 0);
				2464	ASSERT(ip->i_d.di_anextents == 0);
				2465	ASSERT(ip->i_d.di_size == 0 \|\| !S_ISREG(VFS_I(ip)->i_mode));
				2466	ASSERT(ip->i_d.di_nblocks == 0);
				2467
				2468	/*
				2469	* Pull the on-disk inode from the AGI unlinked list.
				2470	*/
				2471	error = xfs_iunlink_remove(tp, ip);
				2472	if (error)
				2473	return error;
				2474
				2475	error = xfs_difree(tp, ip->i_ino, dfops, &xic);
				2476	if (error)
				2477	return error;
				2478
				2479	xfs_ifree_local_data(ip, XFS_DATA_FORK);
				2480	xfs_ifree_local_data(ip, XFS_ATTR_FORK);
				2481
				2482	VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
				2483	ip->i_d.di_flags = 0;
				2484	ip->i_d.di_dmevmask = 0;
				2485	ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
				2486	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
				2487	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
				2488	/*
				2489	* Bump the generation count so no one will be confused
				2490	* by reincarnations of this inode.
				2491	*/
				2492	VFS_I(ip)->i_generation++;
				2493	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				2494
				2495	if (xic.deleted)
				2496	error = xfs_ifree_cluster(ip, tp, &xic);
				2497
				2498	return error;
				2499	}
				2500
				2501	/*
				2502	* This is called to unpin an inode. The caller must have the inode locked
				2503	* in at least shared mode so that the buffer cannot be subsequently pinned
				2504	* once someone is waiting for it to be unpinned.
				2505	*/
				2506	static void
				2507	xfs_iunpin(
				2508	struct xfs_inode *ip)
				2509	{
				2510	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL\|XFS_ILOCK_SHARED));
				2511
				2512	trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
				2513
				2514	/* Give the log a push to start the unpinning I/O */
				2515	xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
				2516
				2517	}
				2518
				2519	static void
				2520	__xfs_iunpin_wait(
				2521	struct xfs_inode *ip)
				2522	{
				2523	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
				2524	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
				2525
				2526	xfs_iunpin(ip);
				2527
				2528	do {
				2529	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
				2530	if (xfs_ipincount(ip))
				2531	io_schedule();
				2532	} while (xfs_ipincount(ip));
				2533	finish_wait(wq, &wait.wq_entry);
				2534	}
				2535
				2536	void
				2537	xfs_iunpin_wait(
				2538	struct xfs_inode *ip)
				2539	{
				2540	if (xfs_ipincount(ip))
				2541	__xfs_iunpin_wait(ip);
				2542	}
				2543
				2544	/*
				2545	* Removing an inode from the namespace involves removing the directory entry
				2546	* and dropping the link count on the inode. Removing the directory entry can
				2547	* result in locking an AGF (directory blocks were freed) and removing a link
				2548	* count can result in placing the inode on an unlinked list which results in
				2549	* locking an AGI.
				2550	*
				2551	* The big problem here is that we have an ordering constraint on AGF and AGI
				2552	* locking - inode allocation locks the AGI, then can allocate a new extent for
				2553	* new inodes, locking the AGF after the AGI. Similarly, freeing the inode
				2554	* removes the inode from the unlinked list, requiring that we lock the AGI
				2555	* first, and then freeing the inode can result in an inode chunk being freed
				2556	* and hence freeing disk space requiring that we lock an AGF.
				2557	*
				2558	* Hence the ordering that is imposed by other parts of the code is AGI before
				2559	* AGF. This means we cannot remove the directory entry before we drop the inode
				2560	* reference count and put it on the unlinked list as this results in a lock
				2561	* order of AGF then AGI, and this can deadlock against inode allocation and
				2562	* freeing. Therefore we must drop the link counts before we remove the
				2563	* directory entry.
				2564	*
				2565	* This is still safe from a transactional point of view - it is not until we
				2566	* get to xfs_defer_finish() that we have the possibility of multiple
				2567	* transactions in this operation. Hence as long as we remove the directory
				2568	* entry and drop the link count in the first transaction of the remove
				2569	* operation, there are no transactional constraints on the ordering here.
				2570	*/
				2571	int
				2572	xfs_remove(
				2573	xfs_inode_t *dp,
				2574	struct xfs_name *name,
				2575	xfs_inode_t *ip)
				2576	{
				2577	xfs_mount_t *mp = dp->i_mount;
				2578	xfs_trans_t *tp = NULL;
				2579	int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
				2580	int error = 0;
				2581	struct xfs_defer_ops dfops;
				2582	xfs_fsblock_t first_block;
				2583	uint resblks;
				2584
				2585	trace_xfs_remove(dp, name);
				2586
				2587	if (XFS_FORCED_SHUTDOWN(mp))
				2588	return -EIO;
				2589
				2590	error = xfs_qm_dqattach(dp, 0);
				2591	if (error)
				2592	goto std_return;
				2593
				2594	error = xfs_qm_dqattach(ip, 0);
				2595	if (error)
				2596	goto std_return;
				2597
				2598	/*
				2599	* We try to get the real space reservation first,
				2600	* allowing for directory btree deletion(s) implying
				2601	* possible bmap insert(s). If we can't get the space
				2602	* reservation then we use 0 instead, and avoid the bmap
				2603	* btree insert(s) in the directory code by, if the bmap
				2604	* insert tries to happen, instead trimming the LAST
				2605	* block from the directory.
				2606	*/
				2607	resblks = XFS_REMOVE_SPACE_RES(mp);
				2608	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
				2609	if (error == -ENOSPC) {
				2610	resblks = 0;
				2611	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
				2612	&tp);
				2613	}
				2614	if (error) {
				2615	ASSERT(error != -ENOSPC);
				2616	goto std_return;
				2617	}
				2618
				2619	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
				2620
				2621	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
				2622	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
				2623
				2624	/*
				2625	* If we're removing a directory perform some additional validation.
				2626	*/
				2627	if (is_dir) {
				2628	ASSERT(VFS_I(ip)->i_nlink >= 2);
				2629	if (VFS_I(ip)->i_nlink != 2) {
				2630	error = -ENOTEMPTY;
				2631	goto out_trans_cancel;
				2632	}
				2633	if (!xfs_dir_isempty(ip)) {
				2634	error = -ENOTEMPTY;
				2635	goto out_trans_cancel;
				2636	}
				2637
				2638	/* Drop the link from ip's "..". */
				2639	error = xfs_droplink(tp, dp);
				2640	if (error)
				2641	goto out_trans_cancel;
				2642
				2643	/* Drop the "." link from ip to self. */
				2644	error = xfs_droplink(tp, ip);
				2645	if (error)
				2646	goto out_trans_cancel;
				2647	} else {
				2648	/*
				2649	* When removing a non-directory we need to log the parent
				2650	* inode here. For a directory this is done implicitly
				2651	* by the xfs_droplink call for the ".." entry.
				2652	*/
				2653	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
				2654	}
				2655	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				2656
				2657	/* Drop the link from dp to ip. */
				2658	error = xfs_droplink(tp, ip);
				2659	if (error)
				2660	goto out_trans_cancel;
				2661
				2662	xfs_defer_init(&dfops, &first_block);
				2663	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
				2664	&first_block, &dfops, resblks);
				2665	if (error) {
				2666	ASSERT(error != -ENOENT);
				2667	goto out_bmap_cancel;
				2668	}
				2669
				2670	/*
				2671	* If this is a synchronous mount, make sure that the
				2672	* remove transaction goes to disk before returning to
				2673	* the user.
				2674	*/
				2675	if (mp->m_flags & (XFS_MOUNT_WSYNC\|XFS_MOUNT_DIRSYNC))
				2676	xfs_trans_set_sync(tp);
				2677
				2678	error = xfs_defer_finish(&tp, &dfops);
				2679	if (error)
				2680	goto out_bmap_cancel;
				2681
				2682	error = xfs_trans_commit(tp);
				2683	if (error)
				2684	goto std_return;
				2685
				2686	if (is_dir && xfs_inode_is_filestream(ip))
				2687	xfs_filestream_deassociate(ip);
				2688
				2689	return 0;
				2690
				2691	out_bmap_cancel:
				2692	xfs_defer_cancel(&dfops);
				2693	out_trans_cancel:
				2694	xfs_trans_cancel(tp);
				2695	std_return:
				2696	return error;
				2697	}
				2698
				2699	/*
				2700	* Enter all inodes for a rename transaction into a sorted array.
				2701	*/
				2702	#define __XFS_SORT_INODES 5
				2703	STATIC void
				2704	xfs_sort_for_rename(
				2705	struct xfs_inode dp1, / in: old (source) directory inode */
				2706	struct xfs_inode dp2, / in: new (target) directory inode */
				2707	struct xfs_inode ip1, / in: inode of old entry */
				2708	struct xfs_inode ip2, / in: inode of new entry */
				2709	struct xfs_inode wip, / in: whiteout inode */
				2710	struct xfs_inode *i_tab,/ out: sorted array of inodes */
				2711	int num_inodes) / in/out: inodes in array */
				2712	{
				2713	int i, j;
				2714
				2715	ASSERT(*num_inodes == __XFS_SORT_INODES);
				2716	memset(i_tab, 0, num_inodes sizeof(struct xfs_inode *));
				2717
				2718	/*
				2719	* i_tab contains a list of pointers to inodes. We initialize
				2720	* the table here & we'll sort it. We will then use it to
				2721	* order the acquisition of the inode locks.
				2722	*
				2723	* Note that the table may contain duplicates. e.g., dp1 == dp2.
				2724	*/
				2725	i = 0;
				2726	i_tab[i++] = dp1;
				2727	i_tab[i++] = dp2;
				2728	i_tab[i++] = ip1;
				2729	if (ip2)
				2730	i_tab[i++] = ip2;
				2731	if (wip)
				2732	i_tab[i++] = wip;
				2733	*num_inodes = i;
				2734
				2735	/*
				2736	* Sort the elements via bubble sort. (Remember, there are at
				2737	* most 5 elements to sort, so this is adequate.)
				2738	*/
				2739	for (i = 0; i < *num_inodes; i++) {
				2740	for (j = 1; j < *num_inodes; j++) {
				2741	if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
				2742	struct xfs_inode *temp = i_tab[j];
				2743	i_tab[j] = i_tab[j-1];
				2744	i_tab[j-1] = temp;
				2745	}
				2746	}
				2747	}
				2748	}
				2749
				2750	static int
				2751	xfs_finish_rename(
				2752	struct xfs_trans *tp,
				2753	struct xfs_defer_ops *dfops)
				2754	{
				2755	int error;
				2756
				2757	/*
				2758	* If this is a synchronous mount, make sure that the rename transaction
				2759	* goes to disk before returning to the user.
				2760	*/
				2761	if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC\|XFS_MOUNT_DIRSYNC))
				2762	xfs_trans_set_sync(tp);
				2763
				2764	error = xfs_defer_finish(&tp, dfops);
				2765	if (error) {
				2766	xfs_defer_cancel(dfops);
				2767	xfs_trans_cancel(tp);
				2768	return error;
				2769	}
				2770
				2771	return xfs_trans_commit(tp);
				2772	}
				2773
				2774	/*
				2775	* xfs_cross_rename()
				2776	*
				2777	* responsible for handling the RENAME_EXCHANGE flag in the renameat2() system call
				2778	*/
				2779	STATIC int
				2780	xfs_cross_rename(
				2781	struct xfs_trans *tp,
				2782	struct xfs_inode *dp1,
				2783	struct xfs_name *name1,
				2784	struct xfs_inode *ip1,
				2785	struct xfs_inode *dp2,
				2786	struct xfs_name *name2,
				2787	struct xfs_inode *ip2,
				2788	struct xfs_defer_ops *dfops,
				2789	xfs_fsblock_t *first_block,
				2790	int spaceres)
				2791	{
				2792	int error = 0;
				2793	int ip1_flags = 0;
				2794	int ip2_flags = 0;
				2795	int dp2_flags = 0;
				2796
				2797	/* Swap inode number for dirent in first parent */
				2798	error = xfs_dir_replace(tp, dp1, name1,
				2799	ip2->i_ino,
				2800	first_block, dfops, spaceres);
				2801	if (error)
				2802	goto out_trans_abort;
				2803
				2804	/* Swap inode number for dirent in second parent */
				2805	error = xfs_dir_replace(tp, dp2, name2,
				2806	ip1->i_ino,
				2807	first_block, dfops, spaceres);
				2808	if (error)
				2809	goto out_trans_abort;
				2810
				2811	/*
				2812	* If we're renaming one or more directories across different parents,
				2813	* update the respective ".." entries (and link counts) to match the new
				2814	* parents.
				2815	*/
				2816	if (dp1 != dp2) {
				2817	dp2_flags = XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG;
				2818
				2819	if (S_ISDIR(VFS_I(ip2)->i_mode)) {
				2820	error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
				2821	dp1->i_ino, first_block,
				2822	dfops, spaceres);
				2823	if (error)
				2824	goto out_trans_abort;
				2825
				2826	/* transfer ip2 ".." reference to dp1 */
				2827	if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
				2828	error = xfs_droplink(tp, dp2);
				2829	if (error)
				2830	goto out_trans_abort;
				2831	error = xfs_bumplink(tp, dp1);
				2832	if (error)
				2833	goto out_trans_abort;
				2834	}
				2835
				2836	/*
				2837	* Although ip1 isn't changed here, userspace needs
				2838	* to be warned about the change, so that applications
				2839	* relying on it (like backup ones), will properly
				2840	* notify the change
				2841	*/
				2842	ip1_flags \|= XFS_ICHGTIME_CHG;
				2843	ip2_flags \|= XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG;
				2844	}
				2845
				2846	if (S_ISDIR(VFS_I(ip1)->i_mode)) {
				2847	error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
				2848	dp2->i_ino, first_block,
				2849	dfops, spaceres);
				2850	if (error)
				2851	goto out_trans_abort;
				2852
				2853	/* transfer ip1 ".." reference to dp2 */
				2854	if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
				2855	error = xfs_droplink(tp, dp1);
				2856	if (error)
				2857	goto out_trans_abort;
				2858	error = xfs_bumplink(tp, dp2);
				2859	if (error)
				2860	goto out_trans_abort;
				2861	}
				2862
				2863	/*
				2864	* Although ip2 isn't changed here, userspace needs
				2865	* to be warned about the change, so that applications
				2866	* relying on it (like backup ones), will properly
				2867	* notify the change
				2868	*/
				2869	ip1_flags \|= XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG;
				2870	ip2_flags \|= XFS_ICHGTIME_CHG;
				2871	}
				2872	}
				2873
				2874	if (ip1_flags) {
				2875	xfs_trans_ichgtime(tp, ip1, ip1_flags);
				2876	xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
				2877	}
				2878	if (ip2_flags) {
				2879	xfs_trans_ichgtime(tp, ip2, ip2_flags);
				2880	xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
				2881	}
				2882	if (dp2_flags) {
				2883	xfs_trans_ichgtime(tp, dp2, dp2_flags);
				2884	xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
				2885	}
				2886	xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				2887	xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
				2888	return xfs_finish_rename(tp, dfops);
				2889
				2890	out_trans_abort:
				2891	xfs_defer_cancel(dfops);
				2892	xfs_trans_cancel(tp);
				2893	return error;
				2894	}
				2895
				2896	/*
				2897	* xfs_rename_alloc_whiteout()
				2898	*
				2899	* Return a referenced, unlinked, unlocked inode that that can be used as a
				2900	* whiteout in a rename transaction. We use a tmpfile inode here so that if we
				2901	* crash between allocating the inode and linking it into the rename transaction
				2902	* recovery will free the inode and we won't leak it.
				2903	*/
				2904	static int
				2905	xfs_rename_alloc_whiteout(
				2906	struct xfs_inode *dp,
				2907	struct xfs_inode **wip)
				2908	{
				2909	struct xfs_inode *tmpfile;
				2910	int error;
				2911
				2912	error = xfs_create_tmpfile(dp, NULL, S_IFCHR \| WHITEOUT_MODE, &tmpfile);
				2913	if (error)
				2914	return error;
				2915
				2916	/*
				2917	* Prepare the tmpfile inode as if it were created through the VFS.
				2918	* Otherwise, the link increment paths will complain about nlink 0->1.
				2919	* Drop the link count as done by d_tmpfile(), complete the inode setup
				2920	* and flag it as linkable.
				2921	*/
				2922	drop_nlink(VFS_I(tmpfile));
				2923	xfs_setup_iops(tmpfile);
				2924	xfs_finish_inode_setup(tmpfile);
				2925	VFS_I(tmpfile)->i_state \|= I_LINKABLE;
				2926
				2927	*wip = tmpfile;
				2928	return 0;
				2929	}
				2930
				2931	/*
				2932	* xfs_rename
				2933	*/
				2934	int
				2935	xfs_rename(
				2936	struct xfs_inode *src_dp,
				2937	struct xfs_name *src_name,
				2938	struct xfs_inode *src_ip,
				2939	struct xfs_inode *target_dp,
				2940	struct xfs_name *target_name,
				2941	struct xfs_inode *target_ip,
				2942	unsigned int flags)
				2943	{
				2944	struct xfs_mount *mp = src_dp->i_mount;
				2945	struct xfs_trans *tp;
				2946	struct xfs_defer_ops dfops;
				2947	xfs_fsblock_t first_block;
				2948	struct xfs_inode wip = NULL; / whiteout inode */
				2949	struct xfs_inode *inodes[__XFS_SORT_INODES];
				2950	int num_inodes = __XFS_SORT_INODES;
				2951	bool new_parent = (src_dp != target_dp);
				2952	bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
				2953	int spaceres;
				2954	int error;
				2955
				2956	trace_xfs_rename(src_dp, target_dp, src_name, target_name);
				2957
				2958	if ((flags & RENAME_EXCHANGE) && !target_ip)
				2959	return -EINVAL;
				2960
				2961	/*
				2962	* If we are doing a whiteout operation, allocate the whiteout inode
				2963	* we will be placing at the target and ensure the type is set
				2964	* appropriately.
				2965	*/
				2966	if (flags & RENAME_WHITEOUT) {
				2967	ASSERT(!(flags & (RENAME_NOREPLACE \| RENAME_EXCHANGE)));
				2968	error = xfs_rename_alloc_whiteout(target_dp, &wip);
				2969	if (error)
				2970	return error;
				2971
				2972	/* setup target dirent info as whiteout */
				2973	src_name->type = XFS_DIR3_FT_CHRDEV;
				2974	}
				2975
				2976	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
				2977	inodes, &num_inodes);
				2978
				2979	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
				2980	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
				2981	if (error == -ENOSPC) {
				2982	spaceres = 0;
				2983	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
				2984	&tp);
				2985	}
				2986	if (error)
				2987	goto out_release_wip;
				2988
				2989	/*
				2990	* Attach the dquots to the inodes
				2991	*/
				2992	error = xfs_qm_vop_rename_dqattach(inodes);
				2993	if (error)
				2994	goto out_trans_cancel;
				2995
				2996	/*
				2997	* Lock all the participating inodes. Depending upon whether
				2998	* the target_name exists in the target directory, and
				2999	* whether the target directory is the same as the source
				3000	* directory, we can lock from 2 to 4 inodes.
				3001	*/
				3002	xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
				3003
				3004	/*
				3005	* Join all the inodes to the transaction. From this point on,
				3006	* we can rely on either trans_commit or trans_cancel to unlock
				3007	* them.
				3008	*/
				3009	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
				3010	if (new_parent)
				3011	xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
				3012	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
				3013	if (target_ip)
				3014	xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
				3015	if (wip)
				3016	xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
				3017
				3018	/*
				3019	* If we are using project inheritance, we only allow renames
				3020	* into our tree when the project IDs are the same; else the
				3021	* tree quota mechanism would be circumvented.
				3022	*/
				3023	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
				3024	(xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
				3025	error = -EXDEV;
				3026	goto out_trans_cancel;
				3027	}
				3028
				3029	xfs_defer_init(&dfops, &first_block);
				3030
				3031	/* RENAME_EXCHANGE is unique from here on. */
				3032	if (flags & RENAME_EXCHANGE)
				3033	return xfs_cross_rename(tp, src_dp, src_name, src_ip,
				3034	target_dp, target_name, target_ip,
				3035	&dfops, &first_block, spaceres);
				3036
				3037	/*
				3038	* Check for expected errors before we dirty the transaction
				3039	* so we can return an error without a transaction abort.
				3040	*/
				3041	if (target_ip == NULL) {
				3042	/*
				3043	* If there's no space reservation, check the entry will
				3044	* fit before actually inserting it.
				3045	*/
				3046	if (!spaceres) {
				3047	error = xfs_dir_canenter(tp, target_dp, target_name);
				3048	if (error)
				3049	goto out_trans_cancel;
				3050	}
				3051	} else {
				3052	/*
				3053	* If target exists and it's a directory, check that whether
				3054	* it can be destroyed.
				3055	*/
				3056	if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
				3057	(!xfs_dir_isempty(target_ip) \|\|
				3058	(VFS_I(target_ip)->i_nlink > 2))) {
				3059	error = -EEXIST;
				3060	goto out_trans_cancel;
				3061	}
				3062	}
				3063
				3064	/*
				3065	* Directory entry creation below may acquire the AGF. Remove
				3066	* the whiteout from the unlinked list first to preserve correct
				3067	* AGI/AGF locking order. This dirties the transaction so failures
				3068	* after this point will abort and log recovery will clean up the
				3069	* mess.
				3070	*
				3071	* For whiteouts, we need to bump the link count on the whiteout
				3072	* inode. After this point, we have a real link, clear the tmpfile
				3073	* state flag from the inode so it doesn't accidentally get misused
				3074	* in future.
				3075	*/
				3076	if (wip) {
				3077	ASSERT(VFS_I(wip)->i_nlink == 0);
				3078	error = xfs_iunlink_remove(tp, wip);
				3079	if (error)
				3080	goto out_trans_cancel;
				3081
				3082	xfs_bumplink(tp, wip);
				3083	xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
				3084	VFS_I(wip)->i_state &= ~I_LINKABLE;
				3085	}
				3086
				3087	/*
				3088	* Set up the target.
				3089	*/
				3090	if (target_ip == NULL) {
				3091	/*
				3092	* If target does not exist and the rename crosses
				3093	* directories, adjust the target directory link count
				3094	* to account for the ".." reference from the new entry.
				3095	*/
				3096	error = xfs_dir_createname(tp, target_dp, target_name,
				3097	src_ip->i_ino, &first_block,
				3098	&dfops, spaceres);
				3099	if (error)
				3100	goto out_bmap_cancel;
				3101
				3102	xfs_trans_ichgtime(tp, target_dp,
				3103	XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				3104
				3105	if (new_parent && src_is_directory) {
				3106	error = xfs_bumplink(tp, target_dp);
				3107	if (error)
				3108	goto out_bmap_cancel;
				3109	}
				3110	} else { /* target_ip != NULL */
				3111	/*
				3112	* Link the source inode under the target name.
				3113	* If the source inode is a directory and we are moving
				3114	* it across directories, its ".." entry will be
				3115	* inconsistent until we replace that down below.
				3116	*
				3117	* In case there is already an entry with the same
				3118	* name at the destination directory, remove it first.
				3119	*/
				3120	error = xfs_dir_replace(tp, target_dp, target_name,
				3121	src_ip->i_ino,
				3122	&first_block, &dfops, spaceres);
				3123	if (error)
				3124	goto out_bmap_cancel;
				3125
				3126	xfs_trans_ichgtime(tp, target_dp,
				3127	XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				3128
				3129	/*
				3130	* Decrement the link count on the target since the target
				3131	* dir no longer points to it.
				3132	*/
				3133	error = xfs_droplink(tp, target_ip);
				3134	if (error)
				3135	goto out_bmap_cancel;
				3136
				3137	if (src_is_directory) {
				3138	/*
				3139	* Drop the link from the old "." entry.
				3140	*/
				3141	error = xfs_droplink(tp, target_ip);
				3142	if (error)
				3143	goto out_bmap_cancel;
				3144	}
				3145	} /* target_ip != NULL */
				3146
				3147	/*
				3148	* Remove the source.
				3149	*/
				3150	if (new_parent && src_is_directory) {
				3151	/*
				3152	* Rewrite the ".." entry to point to the new
				3153	* directory.
				3154	*/
				3155	error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
				3156	target_dp->i_ino,
				3157	&first_block, &dfops, spaceres);
				3158	ASSERT(error != -EEXIST);
				3159	if (error)
				3160	goto out_bmap_cancel;
				3161	}
				3162
				3163	/*
				3164	* We always want to hit the ctime on the source inode.
				3165	*
				3166	* This isn't strictly required by the standards since the source
				3167	* inode isn't really being changed, but old unix file systems did
				3168	* it and some incremental backup programs won't work without it.
				3169	*/
				3170	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
				3171	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
				3172
				3173	/*
				3174	* Adjust the link count on src_dp. This is necessary when
				3175	* renaming a directory, either within one parent when
				3176	* the target existed, or across two parent directories.
				3177	*/
				3178	if (src_is_directory && (new_parent \|\| target_ip != NULL)) {
				3179
				3180	/*
				3181	* Decrement link count on src_directory since the
				3182	* entry that's moved no longer points to it.
				3183	*/
				3184	error = xfs_droplink(tp, src_dp);
				3185	if (error)
				3186	goto out_bmap_cancel;
				3187	}
				3188
				3189	/*
				3190	* For whiteouts, we only need to update the source dirent with the
				3191	* inode number of the whiteout inode rather than removing it
				3192	* altogether.
				3193	*/
				3194	if (wip) {
				3195	error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
				3196	&first_block, &dfops, spaceres);
				3197	} else
				3198	error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
				3199	&first_block, &dfops, spaceres);
				3200	if (error)
				3201	goto out_bmap_cancel;
				3202
				3203	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				3204	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
				3205	if (new_parent)
				3206	xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
				3207
				3208	error = xfs_finish_rename(tp, &dfops);
				3209	if (wip)
				3210	IRELE(wip);
				3211	return error;
				3212
				3213	out_bmap_cancel:
				3214	xfs_defer_cancel(&dfops);
				3215	out_trans_cancel:
				3216	xfs_trans_cancel(tp);
				3217	out_release_wip:
				3218	if (wip)
				3219	IRELE(wip);
				3220	return error;
				3221	}
				3222
				3223	STATIC int
				3224	xfs_iflush_cluster(
				3225	struct xfs_inode *ip,
				3226	struct xfs_buf *bp)
				3227	{
				3228	struct xfs_mount *mp = ip->i_mount;
				3229	struct xfs_perag *pag;
				3230	unsigned long first_index, mask;
				3231	unsigned long inodes_per_cluster;
				3232	int cilist_size;
				3233	struct xfs_inode **cilist;
				3234	struct xfs_inode *cip;
				3235	int nr_found;
				3236	int clcount = 0;
				3237	int bufwasdelwri;
				3238	int i;
				3239
				3240	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
				3241
				3242	inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
				3243	cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
				3244	cilist = kmem_alloc(cilist_size, KM_MAYFAIL\|KM_NOFS);
				3245	if (!cilist)
				3246	goto out_put;
				3247
				3248	mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
				3249	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
				3250	rcu_read_lock();
				3251	/* really need a gang lookup range call here */
				3252	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
				3253	first_index, inodes_per_cluster);
				3254	if (nr_found == 0)
				3255	goto out_free;
				3256
				3257	for (i = 0; i < nr_found; i++) {
				3258	cip = cilist[i];
				3259	if (cip == ip)
				3260	continue;
				3261
				3262	/*
				3263	* because this is an RCU protected lookup, we could find a
				3264	* recently freed or even reallocated inode during the lookup.
				3265	* We need to check under the i_flags_lock for a valid inode
				3266	* here. Skip it if it is not valid or the wrong inode.
				3267	*/
				3268	spin_lock(&cip->i_flags_lock);
				3269	if (!cip->i_ino \|\|
				3270	__xfs_iflags_test(cip, XFS_ISTALE)) {
				3271	spin_unlock(&cip->i_flags_lock);
				3272	continue;
				3273	}
				3274
				3275	/*
				3276	* Once we fall off the end of the cluster, no point checking
				3277	* any more inodes in the list because they will also all be
				3278	* outside the cluster.
				3279	*/
				3280	if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
				3281	spin_unlock(&cip->i_flags_lock);
				3282	break;
				3283	}
				3284	spin_unlock(&cip->i_flags_lock);
				3285
				3286	/*
				3287	* Do an un-protected check to see if the inode is dirty and
				3288	* is a candidate for flushing. These checks will be repeated
				3289	* later after the appropriate locks are acquired.
				3290	*/
				3291	if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
				3292	continue;
				3293
				3294	/*
				3295	* Try to get locks. If any are unavailable or it is pinned,
				3296	* then this inode cannot be flushed and is skipped.
				3297	*/
				3298
				3299	if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
				3300	continue;
				3301	if (!xfs_iflock_nowait(cip)) {
				3302	xfs_iunlock(cip, XFS_ILOCK_SHARED);
				3303	continue;
				3304	}
				3305	if (xfs_ipincount(cip)) {
				3306	xfs_ifunlock(cip);
				3307	xfs_iunlock(cip, XFS_ILOCK_SHARED);
				3308	continue;
				3309	}
				3310
				3311
				3312	/*
				3313	* Check the inode number again, just to be certain we are not
				3314	* racing with freeing in xfs_reclaim_inode(). See the comments
				3315	* in that function for more information as to why the initial
				3316	* check is not sufficient.
				3317	*/
				3318	if (!cip->i_ino) {
				3319	xfs_ifunlock(cip);
				3320	xfs_iunlock(cip, XFS_ILOCK_SHARED);
				3321	continue;
				3322	}
				3323
				3324	/*
				3325	* arriving here means that this inode can be flushed. First
				3326	* re-check that it's dirty before flushing.
				3327	*/
				3328	if (!xfs_inode_clean(cip)) {
				3329	int error;
				3330	error = xfs_iflush_int(cip, bp);
				3331	if (error) {
				3332	xfs_iunlock(cip, XFS_ILOCK_SHARED);
				3333	goto cluster_corrupt_out;
				3334	}
				3335	clcount++;
				3336	} else {
				3337	xfs_ifunlock(cip);
				3338	}
				3339	xfs_iunlock(cip, XFS_ILOCK_SHARED);
				3340	}
				3341
				3342	if (clcount) {
				3343	XFS_STATS_INC(mp, xs_icluster_flushcnt);
				3344	XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
				3345	}
				3346
				3347	out_free:
				3348	rcu_read_unlock();
				3349	kmem_free(cilist);
				3350	out_put:
				3351	xfs_perag_put(pag);
				3352	return 0;
				3353
				3354
				3355	cluster_corrupt_out:
				3356	/*
				3357	* Corruption detected in the clustering loop. Invalidate the
				3358	* inode buffer and shut down the filesystem.
				3359	*/
				3360	rcu_read_unlock();
				3361	/*
				3362	* Clean up the buffer. If it was delwri, just release it --
				3363	* brelse can handle it with no problems. If not, shut down the
				3364	* filesystem before releasing the buffer.
				3365	*/
				3366	bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
				3367	if (bufwasdelwri)
				3368	xfs_buf_relse(bp);
				3369
				3370	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
				3371
				3372	if (!bufwasdelwri) {
				3373	/*
				3374	* Just like incore_relse: if we have b_iodone functions,
				3375	* mark the buffer as an error and call them. Otherwise
				3376	* mark it as stale and brelse.
				3377	*/
				3378	if (bp->b_iodone) {
				3379	bp->b_flags &= ~XBF_DONE;
				3380	xfs_buf_stale(bp);
				3381	xfs_buf_ioerror(bp, -EIO);
				3382	xfs_buf_ioend(bp);
				3383	} else {
				3384	xfs_buf_stale(bp);
				3385	xfs_buf_relse(bp);
				3386	}
				3387	}
				3388
				3389	/*
				3390	* Unlocks the flush lock
				3391	*/
				3392	xfs_iflush_abort(cip, false);
				3393	kmem_free(cilist);
				3394	xfs_perag_put(pag);
				3395	return -EFSCORRUPTED;
				3396	}
				3397
				3398	/*
				3399	* Flush dirty inode metadata into the backing buffer.
				3400	*
				3401	* The caller must have the inode lock and the inode flush lock held. The
				3402	* inode lock will still be held upon return to the caller, and the inode
				3403	* flush lock will be released after the inode has reached the disk.
				3404	*
				3405	* The caller must write out the buffer returned in *bpp and release it.
				3406	*/
				3407	int
				3408	xfs_iflush(
				3409	struct xfs_inode *ip,
				3410	struct xfs_buf **bpp)
				3411	{
				3412	struct xfs_mount *mp = ip->i_mount;
				3413	struct xfs_buf *bp = NULL;
				3414	struct xfs_dinode *dip;
				3415	int error;
				3416
				3417	XFS_STATS_INC(mp, xs_iflush_count);
				3418
				3419	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL\|XFS_ILOCK_SHARED));
				3420	ASSERT(xfs_isiflocked(ip));
				3421	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE \|\|
				3422	ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
				3423
				3424	*bpp = NULL;
				3425
				3426	xfs_iunpin_wait(ip);
				3427
				3428	/*
				3429	* For stale inodes we cannot rely on the backing buffer remaining
				3430	* stale in cache for the remaining life of the stale inode and so
				3431	* xfs_imap_to_bp() below may give us a buffer that no longer contains
				3432	* inodes below. We have to check this after ensuring the inode is
				3433	* unpinned so that it is safe to reclaim the stale inode after the
				3434	* flush call.
				3435	*/
				3436	if (xfs_iflags_test(ip, XFS_ISTALE)) {
				3437	xfs_ifunlock(ip);
				3438	return 0;
				3439	}
				3440
				3441	/*
				3442	* This may have been unpinned because the filesystem is shutting
				3443	* down forcibly. If that's the case we must not write this inode
				3444	* to disk, because the log record didn't make it to disk.
				3445	*
				3446	* We also have to remove the log item from the AIL in this case,
				3447	* as we wait for an empty AIL as part of the unmount process.
				3448	*/
				3449	if (XFS_FORCED_SHUTDOWN(mp)) {
				3450	error = -EIO;
				3451	goto abort_out;
				3452	}
				3453
				3454	/*
				3455	* Get the buffer containing the on-disk inode. We are doing a try-lock
				3456	* operation here, so we may get an EAGAIN error. In that case, we
				3457	* simply want to return with the inode still dirty.
				3458	*
				3459	* If we get any other error, we effectively have a corruption situation
				3460	* and we cannot flush the inode, so we treat it the same as failing
				3461	* xfs_iflush_int().
				3462	*/
				3463	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
				3464	0);
				3465	if (error == -EAGAIN) {
				3466	xfs_ifunlock(ip);
				3467	return error;
				3468	}
				3469	if (error)
				3470	goto corrupt_out;
				3471
				3472	/*
				3473	* First flush out the inode that xfs_iflush was called with.
				3474	*/
				3475	error = xfs_iflush_int(ip, bp);
				3476	if (error)
				3477	goto corrupt_out;
				3478
				3479	/*
				3480	* If the buffer is pinned then push on the log now so we won't
				3481	* get stuck waiting in the write for too long.
				3482	*/
				3483	if (xfs_buf_ispinned(bp))
				3484	xfs_log_force(mp, 0);
				3485
				3486	/*
				3487	* inode clustering:
				3488	* see if other inodes can be gathered into this write
				3489	*/
				3490	error = xfs_iflush_cluster(ip, bp);
				3491	if (error)
				3492	goto cluster_corrupt_out;
				3493
				3494	*bpp = bp;
				3495	return 0;
				3496
				3497	corrupt_out:
				3498	if (bp)
				3499	xfs_buf_relse(bp);
				3500	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
				3501	cluster_corrupt_out:
				3502	error = -EFSCORRUPTED;
				3503	abort_out:
				3504	/*
				3505	* Unlocks the flush lock
				3506	*/
				3507	xfs_iflush_abort(ip, false);
				3508	return error;
				3509	}
				3510
				3511	STATIC int
				3512	xfs_iflush_int(
				3513	struct xfs_inode *ip,
				3514	struct xfs_buf *bp)
				3515	{
				3516	struct xfs_inode_log_item *iip = ip->i_itemp;
				3517	struct xfs_dinode *dip;
				3518	struct xfs_mount *mp = ip->i_mount;
				3519
				3520	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL\|XFS_ILOCK_SHARED));
				3521	ASSERT(xfs_isiflocked(ip));
				3522	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE \|\|
				3523	ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
				3524	ASSERT(iip != NULL && iip->ili_fields != 0);
				3525	ASSERT(ip->i_d.di_version > 1);
				3526
				3527	/* set dip = inode's place in the buffer /
				3528	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
				3529
				3530	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
				3531	mp, XFS_ERRTAG_IFLUSH_1)) {
				3532	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
				3533	"%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
				3534	__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
				3535	goto corrupt_out;
				3536	}
				3537	if (S_ISREG(VFS_I(ip)->i_mode)) {
				3538	if (XFS_TEST_ERROR(
				3539	(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
				3540	(ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
				3541	mp, XFS_ERRTAG_IFLUSH_3)) {
				3542	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
				3543	"%s: Bad regular inode %Lu, ptr 0x%p",
				3544	__func__, ip->i_ino, ip);
				3545	goto corrupt_out;
				3546	}
				3547	} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
				3548	if (XFS_TEST_ERROR(
				3549	(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
				3550	(ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
				3551	(ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
				3552	mp, XFS_ERRTAG_IFLUSH_4)) {
				3553	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
				3554	"%s: Bad directory inode %Lu, ptr 0x%p",
				3555	__func__, ip->i_ino, ip);
				3556	goto corrupt_out;
				3557	}
				3558	}
				3559	if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
				3560	ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
				3561	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
				3562	"%s: detected corrupt incore inode %Lu, "
				3563	"total extents = %d, nblocks = %Ld, ptr 0x%p",
				3564	__func__, ip->i_ino,
				3565	ip->i_d.di_nextents + ip->i_d.di_anextents,
				3566	ip->i_d.di_nblocks, ip);
				3567	goto corrupt_out;
				3568	}
				3569	if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
				3570	mp, XFS_ERRTAG_IFLUSH_6)) {
				3571	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
				3572	"%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
				3573	__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
				3574	goto corrupt_out;
				3575	}
				3576
				3577	/*
				3578	* Inode item log recovery for v2 inodes are dependent on the
				3579	* di_flushiter count for correct sequencing. We bump the flush
				3580	* iteration count so we can detect flushes which postdate a log record
				3581	* during recovery. This is redundant as we now log every change and
				3582	* hence this can't happen but we need to still do it to ensure
				3583	* backwards compatibility with old kernels that predate logging all
				3584	* inode changes.
				3585	*/
				3586	if (ip->i_d.di_version < 3)
				3587	ip->i_d.di_flushiter++;
				3588
				3589	/* Check the inline directory data. */
				3590	if (S_ISDIR(VFS_I(ip)->i_mode) &&
				3591	ip->i_d.di_format == XFS_DINODE_FMT_LOCAL &&
				3592	xfs_dir2_sf_verify(ip))
				3593	goto corrupt_out;
				3594
				3595	/*
				3596	* Copy the dirty parts of the inode into the on-disk inode. We always
				3597	* copy out the core of the inode, because if the inode is dirty at all
				3598	* the core must be.
				3599	*/
				3600	xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
				3601
				3602	/* Wrap, we never let the log put out DI_MAX_FLUSH */
				3603	if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
				3604	ip->i_d.di_flushiter = 0;
				3605
				3606	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
				3607	if (XFS_IFORK_Q(ip))
				3608	xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
				3609	xfs_inobp_check(mp, bp);
				3610
				3611	/*
				3612	* We've recorded everything logged in the inode, so we'd like to clear
				3613	* the ili_fields bits so we don't log and flush things unnecessarily.
				3614	* However, we can't stop logging all this information until the data
				3615	* we've copied into the disk buffer is written to disk. If we did we
				3616	* might overwrite the copy of the inode in the log with all the data
				3617	* after re-logging only part of it, and in the face of a crash we
				3618	* wouldn't have all the data we need to recover.
				3619	*
				3620	* What we do is move the bits to the ili_last_fields field. When
				3621	* logging the inode, these bits are moved back to the ili_fields field.
				3622	* In the xfs_iflush_done() routine we clear ili_last_fields, since we
				3623	* know that the information those bits represent is permanently on
				3624	* disk. As long as the flush completes before the inode is logged
				3625	* again, then both ili_fields and ili_last_fields will be cleared.
				3626	*
				3627	* We can play with the ili_fields bits here, because the inode lock
				3628	* must be held exclusively in order to set bits there and the flush
				3629	* lock protects the ili_last_fields bits. Set ili_logged so the flush
				3630	* done routine can tell whether or not to look in the AIL. Also, store
				3631	* the current LSN of the inode so that we can tell whether the item has
				3632	* moved in the AIL from xfs_iflush_done(). In order to read the lsn we
				3633	* need the AIL lock, because it is a 64 bit value that cannot be read
				3634	* atomically.
				3635	*/
				3636	iip->ili_last_fields = iip->ili_fields;
				3637	iip->ili_fields = 0;
				3638	iip->ili_fsync_fields = 0;
				3639	iip->ili_logged = 1;
				3640
				3641	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
				3642	&iip->ili_item.li_lsn);
				3643
				3644	/*
				3645	* Attach the function xfs_iflush_done to the inode's
				3646	* buffer. This will remove the inode from the AIL
				3647	* and unlock the inode's flush lock when the inode is
				3648	* completely written to disk.
				3649	*/
				3650	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
				3651
				3652	/* generate the checksum. */
				3653	xfs_dinode_calc_crc(mp, dip);
				3654
				3655	ASSERT(bp->b_fspriv != NULL);
				3656	ASSERT(bp->b_iodone != NULL);
				3657	return 0;
				3658
				3659	corrupt_out:
				3660	return -EFSCORRUPTED;
				3661	}