Blame - src/kernel/linux/v4.19/fs/xfs/xfs_file.c - T800

blob: 259549698ba7e607b105360f36ececb7e92fff51 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (c) 2000-2005 Silicon Graphics, Inc.
				4	* All Rights Reserved.
				5	*/
				6	#include "xfs.h"
				7	#include "xfs_fs.h"
				8	#include "xfs_shared.h"
				9	#include "xfs_format.h"
				10	#include "xfs_log_format.h"
				11	#include "xfs_trans_resv.h"
				12	#include "xfs_mount.h"
				13	#include "xfs_da_format.h"
				14	#include "xfs_da_btree.h"
				15	#include "xfs_inode.h"
				16	#include "xfs_trans.h"
				17	#include "xfs_inode_item.h"
				18	#include "xfs_bmap.h"
				19	#include "xfs_bmap_util.h"
				20	#include "xfs_error.h"
				21	#include "xfs_dir2.h"
				22	#include "xfs_dir2_priv.h"
				23	#include "xfs_ioctl.h"
				24	#include "xfs_trace.h"
				25	#include "xfs_log.h"
				26	#include "xfs_icache.h"
				27	#include "xfs_pnfs.h"
				28	#include "xfs_iomap.h"
				29	#include "xfs_reflink.h"
				30
				31	#include <linux/dcache.h>
				32	#include <linux/falloc.h>
				33	#include <linux/pagevec.h>
				34	#include <linux/backing-dev.h>
				35	#include <linux/mman.h>
				36
				37	static const struct vm_operations_struct xfs_file_vm_ops;
				38
				39	int
				40	xfs_update_prealloc_flags(
				41	struct xfs_inode *ip,
				42	enum xfs_prealloc_flags flags)
				43	{
				44	struct xfs_trans *tp;
				45	int error;
				46
				47	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
				48	0, 0, 0, &tp);
				49	if (error)
				50	return error;
				51
				52	xfs_ilock(ip, XFS_ILOCK_EXCL);
				53	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
				54
				55	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
				56	VFS_I(ip)->i_mode &= ~S_ISUID;
				57	if (VFS_I(ip)->i_mode & S_IXGRP)
				58	VFS_I(ip)->i_mode &= ~S_ISGID;
				59	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				60	}
				61
				62	if (flags & XFS_PREALLOC_SET)
				63	ip->i_d.di_flags \|= XFS_DIFLAG_PREALLOC;
				64	if (flags & XFS_PREALLOC_CLEAR)
				65	ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
				66
				67	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				68	if (flags & XFS_PREALLOC_SYNC)
				69	xfs_trans_set_sync(tp);
				70	return xfs_trans_commit(tp);
				71	}
				72
				73	/*
				74	* Fsync operations on directories are much simpler than on regular files,
				75	* as there is no file data to flush, and thus also no need for explicit
				76	* cache flush operations, and there are no non-transaction metadata updates
				77	* on directories either.
				78	*/
				79	STATIC int
				80	xfs_dir_fsync(
				81	struct file *file,
				82	loff_t start,
				83	loff_t end,
				84	int datasync)
				85	{
				86	struct xfs_inode *ip = XFS_I(file->f_mapping->host);
				87	struct xfs_mount *mp = ip->i_mount;
				88	xfs_lsn_t lsn = 0;
				89
				90	trace_xfs_dir_fsync(ip);
				91
				92	xfs_ilock(ip, XFS_ILOCK_SHARED);
				93	if (xfs_ipincount(ip))
				94	lsn = ip->i_itemp->ili_last_lsn;
				95	xfs_iunlock(ip, XFS_ILOCK_SHARED);
				96
				97	if (!lsn)
				98	return 0;
				99	return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
				100	}
				101
				102	STATIC int
				103	xfs_file_fsync(
				104	struct file *file,
				105	loff_t start,
				106	loff_t end,
				107	int datasync)
				108	{
				109	struct inode *inode = file->f_mapping->host;
				110	struct xfs_inode *ip = XFS_I(inode);
				111	struct xfs_mount *mp = ip->i_mount;
				112	int error = 0;
				113	int log_flushed = 0;
				114	xfs_lsn_t lsn = 0;
				115
				116	trace_xfs_file_fsync(ip);
				117
				118	error = file_write_and_wait_range(file, start, end);
				119	if (error)
				120	return error;
				121
				122	if (XFS_FORCED_SHUTDOWN(mp))
				123	return -EIO;
				124
				125	xfs_iflags_clear(ip, XFS_ITRUNCATED);
				126
				127	/*
				128	* If we have an RT and/or log subvolume we need to make sure to flush
				129	* the write cache the device used for file data first. This is to
				130	* ensure newly written file data make it to disk before logging the new
				131	* inode size in case of an extending write.
				132	*/
				133	if (XFS_IS_REALTIME_INODE(ip))
				134	xfs_blkdev_issue_flush(mp->m_rtdev_targp);
				135	else if (mp->m_logdev_targp != mp->m_ddev_targp)
				136	xfs_blkdev_issue_flush(mp->m_ddev_targp);
				137
				138	/*
				139	* All metadata updates are logged, which means that we just have to
				140	* flush the log up to the latest LSN that touched the inode. If we have
				141	* concurrent fsync/fdatasync() calls, we need them to all block on the
				142	* log force before we clear the ili_fsync_fields field. This ensures
				143	* that we don't get a racing sync operation that does not wait for the
				144	* metadata to hit the journal before returning. If we race with
				145	* clearing the ili_fsync_fields, then all that will happen is the log
				146	* force will do nothing as the lsn will already be on disk. We can't
				147	* race with setting ili_fsync_fields because that is done under
				148	* XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
				149	* until after the ili_fsync_fields is cleared.
				150	*/
				151	xfs_ilock(ip, XFS_ILOCK_SHARED);
				152	if (xfs_ipincount(ip)) {
				153	if (!datasync \|\|
				154	(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
				155	lsn = ip->i_itemp->ili_last_lsn;
				156	}
				157
				158	if (lsn) {
				159	error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
				160	ip->i_itemp->ili_fsync_fields = 0;
				161	}
				162	xfs_iunlock(ip, XFS_ILOCK_SHARED);
				163
				164	/*
				165	* If we only have a single device, and the log force about was
				166	* a no-op we might have to flush the data device cache here.
				167	* This can only happen for fdatasync/O_DSYNC if we were overwriting
				168	* an already allocated file and thus do not have any metadata to
				169	* commit.
				170	*/
				171	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
				172	mp->m_logdev_targp == mp->m_ddev_targp)
				173	xfs_blkdev_issue_flush(mp->m_ddev_targp);
				174
				175	return error;
				176	}
				177
				178	STATIC ssize_t
				179	xfs_file_dio_aio_read(
				180	struct kiocb *iocb,
				181	struct iov_iter *to)
				182	{
				183	struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
				184	size_t count = iov_iter_count(to);
				185	ssize_t ret;
				186
				187	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
				188
				189	if (!count)
				190	return 0; /* skip atime */
				191
				192	file_accessed(iocb->ki_filp);
				193
				194	xfs_ilock(ip, XFS_IOLOCK_SHARED);
				195	ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
				196	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
				197
				198	return ret;
				199	}
				200
				201	static noinline ssize_t
				202	xfs_file_dax_read(
				203	struct kiocb *iocb,
				204	struct iov_iter *to)
				205	{
				206	struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
				207	size_t count = iov_iter_count(to);
				208	ssize_t ret = 0;
				209
				210	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
				211
				212	if (!count)
				213	return 0; /* skip atime */
				214
				215	if (iocb->ki_flags & IOCB_NOWAIT) {
				216	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
				217	return -EAGAIN;
				218	} else {
				219	xfs_ilock(ip, XFS_IOLOCK_SHARED);
				220	}
				221
				222	ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
				223	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
				224
				225	file_accessed(iocb->ki_filp);
				226	return ret;
				227	}
				228
				229	STATIC ssize_t
				230	xfs_file_buffered_aio_read(
				231	struct kiocb *iocb,
				232	struct iov_iter *to)
				233	{
				234	struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
				235	ssize_t ret;
				236
				237	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
				238
				239	if (iocb->ki_flags & IOCB_NOWAIT) {
				240	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
				241	return -EAGAIN;
				242	} else {
				243	xfs_ilock(ip, XFS_IOLOCK_SHARED);
				244	}
				245	ret = generic_file_read_iter(iocb, to);
				246	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
				247
				248	return ret;
				249	}
				250
				251	STATIC ssize_t
				252	xfs_file_read_iter(
				253	struct kiocb *iocb,
				254	struct iov_iter *to)
				255	{
				256	struct inode *inode = file_inode(iocb->ki_filp);
				257	struct xfs_mount *mp = XFS_I(inode)->i_mount;
				258	ssize_t ret = 0;
				259
				260	XFS_STATS_INC(mp, xs_read_calls);
				261
				262	if (XFS_FORCED_SHUTDOWN(mp))
				263	return -EIO;
				264
				265	if (IS_DAX(inode))
				266	ret = xfs_file_dax_read(iocb, to);
				267	else if (iocb->ki_flags & IOCB_DIRECT)
				268	ret = xfs_file_dio_aio_read(iocb, to);
				269	else
				270	ret = xfs_file_buffered_aio_read(iocb, to);
				271
				272	if (ret > 0)
				273	XFS_STATS_ADD(mp, xs_read_bytes, ret);
				274	return ret;
				275	}
				276
				277	/*
				278	* Common pre-write limit and setup checks.
				279	*
				280	* Called with the iolocked held either shared and exclusive according to
				281	* @iolock, and returns with it held. Might upgrade the iolock to exclusive
				282	* if called for a direct write beyond i_size.
				283	*/
				284	STATIC ssize_t
				285	xfs_file_aio_write_checks(
				286	struct kiocb *iocb,
				287	struct iov_iter *from,
				288	int *iolock)
				289	{
				290	struct file *file = iocb->ki_filp;
				291	struct inode *inode = file->f_mapping->host;
				292	struct xfs_inode *ip = XFS_I(inode);
				293	ssize_t error = 0;
				294	size_t count = iov_iter_count(from);
				295	bool drained_dio = false;
				296	loff_t isize;
				297
				298	restart:
				299	error = generic_write_checks(iocb, from);
				300	if (error <= 0)
				301	return error;
				302
				303	error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
				304	if (error)
				305	return error;
				306
				307	/*
				308	* For changing security info in file_remove_privs() we need i_rwsem
				309	* exclusively.
				310	*/
				311	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
				312	xfs_iunlock(ip, *iolock);
				313	*iolock = XFS_IOLOCK_EXCL;
				314	xfs_ilock(ip, *iolock);
				315	goto restart;
				316	}
				317	/*
				318	* If the offset is beyond the size of the file, we need to zero any
				319	* blocks that fall between the existing EOF and the start of this
				320	* write. If zeroing is needed and we are currently holding the
				321	* iolock shared, we need to update it to exclusive which implies
				322	* having to redo all checks before.
				323	*
				324	* We need to serialise against EOF updates that occur in IO
				325	* completions here. We want to make sure that nobody is changing the
				326	* size while we do this check until we have placed an IO barrier (i.e.
				327	* hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
				328	* The spinlock effectively forms a memory barrier once we have the
				329	* XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
				330	* and hence be able to correctly determine if we need to run zeroing.
				331	*/
				332	spin_lock(&ip->i_flags_lock);
				333	isize = i_size_read(inode);
				334	if (iocb->ki_pos > isize) {
				335	spin_unlock(&ip->i_flags_lock);
				336	if (!drained_dio) {
				337	if (*iolock == XFS_IOLOCK_SHARED) {
				338	xfs_iunlock(ip, *iolock);
				339	*iolock = XFS_IOLOCK_EXCL;
				340	xfs_ilock(ip, *iolock);
				341	iov_iter_reexpand(from, count);
				342	}
				343	/*
				344	* We now have an IO submission barrier in place, but
				345	* AIO can do EOF updates during IO completion and hence
				346	* we now need to wait for all of them to drain. Non-AIO
				347	* DIO will have drained before we are given the
				348	* XFS_IOLOCK_EXCL, and so for most cases this wait is a
				349	* no-op.
				350	*/
				351	inode_dio_wait(inode);
				352	drained_dio = true;
				353	goto restart;
				354	}
				355
				356	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
				357	error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
				358	NULL, &xfs_iomap_ops);
				359	if (error)
				360	return error;
				361	} else
				362	spin_unlock(&ip->i_flags_lock);
				363
				364	/*
				365	* Updating the timestamps will grab the ilock again from
				366	* xfs_fs_dirty_inode, so we have to call it after dropping the
				367	* lock above. Eventually we should look into a way to avoid
				368	* the pointless lock roundtrip.
				369	*/
				370	if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
				371	error = file_update_time(file);
				372	if (error)
				373	return error;
				374	}
				375
				376	/*
				377	* If we're writing the file then make sure to clear the setuid and
				378	* setgid bits if the process is not being run by root. This keeps
				379	* people from modifying setuid and setgid binaries.
				380	*/
				381	if (!IS_NOSEC(inode))
				382	return file_remove_privs(file);
				383	return 0;
				384	}
				385
				386	static int
				387	xfs_dio_write_end_io(
				388	struct kiocb *iocb,
				389	ssize_t size,
				390	unsigned flags)
				391	{
				392	struct inode *inode = file_inode(iocb->ki_filp);
				393	struct xfs_inode *ip = XFS_I(inode);
				394	loff_t offset = iocb->ki_pos;
				395	int error = 0;
				396
				397	trace_xfs_end_io_direct_write(ip, offset, size);
				398
				399	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
				400	return -EIO;
				401
				402	if (size <= 0)
				403	return size;
				404
				405	/*
				406	* Capture amount written on completion as we can't reliably account
				407	* for it on submission.
				408	*/
				409	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
				410
				411	if (flags & IOMAP_DIO_COW) {
				412	error = xfs_reflink_end_cow(ip, offset, size);
				413	if (error)
				414	return error;
				415	}
				416
				417	/*
				418	* Unwritten conversion updates the in-core isize after extent
				419	* conversion but before updating the on-disk size. Updating isize any
				420	* earlier allows a racing dio read to find unwritten extents before
				421	* they are converted.
				422	*/
				423	if (flags & IOMAP_DIO_UNWRITTEN)
				424	return xfs_iomap_write_unwritten(ip, offset, size, true);
				425
				426	/*
				427	* We need to update the in-core inode size here so that we don't end up
				428	* with the on-disk inode size being outside the in-core inode size. We
				429	* have no other method of updating EOF for AIO, so always do it here
				430	* if necessary.
				431	*
				432	* We need to lock the test/set EOF update as we can be racing with
				433	* other IO completions here to update the EOF. Failing to serialise
				434	* here can result in EOF moving backwards and Bad Things Happen when
				435	* that occurs.
				436	*/
				437	spin_lock(&ip->i_flags_lock);
				438	if (offset + size > i_size_read(inode)) {
				439	i_size_write(inode, offset + size);
				440	spin_unlock(&ip->i_flags_lock);
				441	error = xfs_setfilesize(ip, offset, size);
				442	} else {
				443	spin_unlock(&ip->i_flags_lock);
				444	}
				445
				446	return error;
				447	}
				448
				449	/*
				450	* xfs_file_dio_aio_write - handle direct IO writes
				451	*
				452	* Lock the inode appropriately to prepare for and issue a direct IO write.
				453	* By separating it from the buffered write path we remove all the tricky to
				454	* follow locking changes and looping.
				455	*
				456	* If there are cached pages or we're extending the file, we need IOLOCK_EXCL
				457	* until we're sure the bytes at the new EOF have been zeroed and/or the cached
				458	* pages are flushed out.
				459	*
				460	* In most cases the direct IO writes will be done holding IOLOCK_SHARED
				461	* allowing them to be done in parallel with reads and other direct IO writes.
				462	* However, if the IO is not aligned to filesystem blocks, the direct IO layer
				463	* needs to do sub-block zeroing and that requires serialisation against other
				464	* direct IOs to the same block. In this case we need to serialise the
				465	* submission of the unaligned IOs so that we don't get racing block zeroing in
				466	* the dio layer. To avoid the problem with aio, we also need to wait for
				467	* outstanding IOs to complete so that unwritten extent conversion is completed
				468	* before we try to map the overlapping block. This is currently implemented by
				469	* hitting it with a big hammer (i.e. inode_dio_wait()).
				470	*
				471	* Returns with locks held indicated by @iolock and errors indicated by
				472	* negative return values.
				473	*/
				474	STATIC ssize_t
				475	xfs_file_dio_aio_write(
				476	struct kiocb *iocb,
				477	struct iov_iter *from)
				478	{
				479	struct file *file = iocb->ki_filp;
				480	struct address_space *mapping = file->f_mapping;
				481	struct inode *inode = mapping->host;
				482	struct xfs_inode *ip = XFS_I(inode);
				483	struct xfs_mount *mp = ip->i_mount;
				484	ssize_t ret = 0;
				485	int unaligned_io = 0;
				486	int iolock;
				487	size_t count = iov_iter_count(from);
				488	struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
				489	mp->m_rtdev_targp : mp->m_ddev_targp;
				490
				491	/* DIO must be aligned to device logical sector size */
				492	if ((iocb->ki_pos \| count) & target->bt_logical_sectormask)
				493	return -EINVAL;
				494
				495	/*
				496	* Don't take the exclusive iolock here unless the I/O is unaligned to
				497	* the file system block size. We don't need to consider the EOF
				498	* extension case here because xfs_file_aio_write_checks() will relock
				499	* the inode as necessary for EOF zeroing cases and fill out the new
				500	* inode size as appropriate.
				501	*/
				502	if ((iocb->ki_pos & mp->m_blockmask) \|\|
				503	((iocb->ki_pos + count) & mp->m_blockmask)) {
				504	unaligned_io = 1;
				505
				506	/*
				507	* We can't properly handle unaligned direct I/O to reflink
				508	* files yet, as we can't unshare a partial block.
				509	*/
				510	if (xfs_is_reflink_inode(ip)) {
				511	trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
				512	return -EREMCHG;
				513	}
				514	iolock = XFS_IOLOCK_EXCL;
				515	} else {
				516	iolock = XFS_IOLOCK_SHARED;
				517	}
				518
				519	if (iocb->ki_flags & IOCB_NOWAIT) {
				520	/* unaligned dio always waits, bail */
				521	if (unaligned_io)
				522	return -EAGAIN;
				523	if (!xfs_ilock_nowait(ip, iolock))
				524	return -EAGAIN;
				525	} else {
				526	xfs_ilock(ip, iolock);
				527	}
				528
				529	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
				530	if (ret)
				531	goto out;
				532	count = iov_iter_count(from);
				533
				534	/*
				535	* If we are doing unaligned IO, we can't allow any other overlapping IO
				536	* in-flight at the same time or we risk data corruption. Wait for all
				537	* other IO to drain before we submit. If the IO is aligned, demote the
				538	* iolock if we had to take the exclusive lock in
				539	* xfs_file_aio_write_checks() for other reasons.
				540	*/
				541	if (unaligned_io) {
				542	inode_dio_wait(inode);
				543	} else if (iolock == XFS_IOLOCK_EXCL) {
				544	xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
				545	iolock = XFS_IOLOCK_SHARED;
				546	}
				547
				548	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
				549	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
				550
				551	/*
				552	* If unaligned, this is the only IO in-flight. If it has not yet
				553	* completed, wait on it before we release the iolock to prevent
				554	* subsequent overlapping IO.
				555	*/
				556	if (ret == -EIOCBQUEUED && unaligned_io)
				557	inode_dio_wait(inode);
				558	out:
				559	xfs_iunlock(ip, iolock);
				560
				561	/*
				562	* No fallback to buffered IO on errors for XFS, direct IO will either
				563	* complete fully or fail.
				564	*/
				565	ASSERT(ret < 0 \|\| ret == count);
				566	return ret;
				567	}
				568
				569	static noinline ssize_t
				570	xfs_file_dax_write(
				571	struct kiocb *iocb,
				572	struct iov_iter *from)
				573	{
				574	struct inode *inode = iocb->ki_filp->f_mapping->host;
				575	struct xfs_inode *ip = XFS_I(inode);
				576	int iolock = XFS_IOLOCK_EXCL;
				577	ssize_t ret, error = 0;
				578	size_t count;
				579	loff_t pos;
				580
				581	if (iocb->ki_flags & IOCB_NOWAIT) {
				582	if (!xfs_ilock_nowait(ip, iolock))
				583	return -EAGAIN;
				584	} else {
				585	xfs_ilock(ip, iolock);
				586	}
				587
				588	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
				589	if (ret)
				590	goto out;
				591
				592	pos = iocb->ki_pos;
				593	count = iov_iter_count(from);
				594
				595	trace_xfs_file_dax_write(ip, count, pos);
				596	ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
				597	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
				598	i_size_write(inode, iocb->ki_pos);
				599	error = xfs_setfilesize(ip, pos, ret);
				600	}
				601	out:
				602	xfs_iunlock(ip, iolock);
				603	if (error)
				604	return error;
				605
				606	if (ret > 0) {
				607	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
				608
				609	/* Handle various SYNC-type writes */
				610	ret = generic_write_sync(iocb, ret);
				611	}
				612	return ret;
				613	}
				614
				615	STATIC ssize_t
				616	xfs_file_buffered_aio_write(
				617	struct kiocb *iocb,
				618	struct iov_iter *from)
				619	{
				620	struct file *file = iocb->ki_filp;
				621	struct address_space *mapping = file->f_mapping;
				622	struct inode *inode = mapping->host;
				623	struct xfs_inode *ip = XFS_I(inode);
				624	ssize_t ret;
				625	int enospc = 0;
				626	int iolock;
				627
				628	if (iocb->ki_flags & IOCB_NOWAIT)
				629	return -EOPNOTSUPP;
				630
				631	write_retry:
				632	iolock = XFS_IOLOCK_EXCL;
				633	xfs_ilock(ip, iolock);
				634
				635	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
				636	if (ret)
				637	goto out;
				638
				639	/* We can write back this queue in page reclaim */
				640	current->backing_dev_info = inode_to_bdi(inode);
				641
				642	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
				643	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
				644	if (likely(ret >= 0))
				645	iocb->ki_pos += ret;
				646
				647	/*
				648	* If we hit a space limit, try to free up some lingering preallocated
				649	* space before returning an error. In the case of ENOSPC, first try to
				650	* write back all dirty inodes to free up some of the excess reserved
				651	* metadata space. This reduces the chances that the eofblocks scan
				652	* waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
				653	* also behaves as a filter to prevent too many eofblocks scans from
				654	* running at the same time.
				655	*/
				656	if (ret == -EDQUOT && !enospc) {
				657	xfs_iunlock(ip, iolock);
				658	enospc = xfs_inode_free_quota_eofblocks(ip);
				659	if (enospc)
				660	goto write_retry;
				661	enospc = xfs_inode_free_quota_cowblocks(ip);
				662	if (enospc)
				663	goto write_retry;
				664	iolock = 0;
				665	} else if (ret == -ENOSPC && !enospc) {
				666	struct xfs_eofblocks eofb = {0};
				667
				668	enospc = 1;
				669	xfs_flush_inodes(ip->i_mount);
				670
				671	xfs_iunlock(ip, iolock);
				672	eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
				673	xfs_icache_free_eofblocks(ip->i_mount, &eofb);
				674	xfs_icache_free_cowblocks(ip->i_mount, &eofb);
				675	goto write_retry;
				676	}
				677
				678	current->backing_dev_info = NULL;
				679	out:
				680	if (iolock)
				681	xfs_iunlock(ip, iolock);
				682
				683	if (ret > 0) {
				684	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
				685	/* Handle various SYNC-type writes */
				686	ret = generic_write_sync(iocb, ret);
				687	}
				688	return ret;
				689	}
				690
				691	STATIC ssize_t
				692	xfs_file_write_iter(
				693	struct kiocb *iocb,
				694	struct iov_iter *from)
				695	{
				696	struct file *file = iocb->ki_filp;
				697	struct address_space *mapping = file->f_mapping;
				698	struct inode *inode = mapping->host;
				699	struct xfs_inode *ip = XFS_I(inode);
				700	ssize_t ret;
				701	size_t ocount = iov_iter_count(from);
				702
				703	XFS_STATS_INC(ip->i_mount, xs_write_calls);
				704
				705	if (ocount == 0)
				706	return 0;
				707
				708	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
				709	return -EIO;
				710
				711	if (IS_DAX(inode))
				712	return xfs_file_dax_write(iocb, from);
				713
				714	if (iocb->ki_flags & IOCB_DIRECT) {
				715	/*
				716	* Allow a directio write to fall back to a buffered
				717	* write only in the case that we're doing a reflink
				718	* CoW. In all other directio scenarios we do not
				719	* allow an operation to fall back to buffered mode.
				720	*/
				721	ret = xfs_file_dio_aio_write(iocb, from);
				722	if (ret != -EREMCHG)
				723	return ret;
				724	}
				725
				726	return xfs_file_buffered_aio_write(iocb, from);
				727	}
				728
				729	static void
				730	xfs_wait_dax_page(
				731	struct inode *inode)
				732	{
				733	struct xfs_inode *ip = XFS_I(inode);
				734
				735	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
				736	schedule();
				737	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
				738	}
				739
				740	static int
				741	xfs_break_dax_layouts(
				742	struct inode *inode,
				743	bool *retry)
				744	{
				745	struct page *page;
				746
				747	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
				748
				749	page = dax_layout_busy_page(inode->i_mapping);
				750	if (!page)
				751	return 0;
				752
				753	*retry = true;
				754	return ___wait_var_event(&page->_refcount,
				755	atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
				756	0, 0, xfs_wait_dax_page(inode));
				757	}
				758
				759	int
				760	xfs_break_layouts(
				761	struct inode *inode,
				762	uint *iolock,
				763	enum layout_break_reason reason)
				764	{
				765	bool retry;
				766	int error;
				767
				768	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED\|XFS_IOLOCK_EXCL));
				769
				770	do {
				771	retry = false;
				772	switch (reason) {
				773	case BREAK_UNMAP:
				774	error = xfs_break_dax_layouts(inode, &retry);
				775	if (error \|\| retry)
				776	break;
				777	/* fall through */
				778	case BREAK_WRITE:
				779	error = xfs_break_leased_layouts(inode, iolock, &retry);
				780	break;
				781	default:
				782	WARN_ON_ONCE(1);
				783	error = -EINVAL;
				784	}
				785	} while (error == 0 && retry);
				786
				787	return error;
				788	}
				789
				790	#define XFS_FALLOC_FL_SUPPORTED \
				791	(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE \| \
				792	FALLOC_FL_COLLAPSE_RANGE \| FALLOC_FL_ZERO_RANGE \| \
				793	FALLOC_FL_INSERT_RANGE \| FALLOC_FL_UNSHARE_RANGE)
				794
				795	STATIC long
				796	xfs_file_fallocate(
				797	struct file *file,
				798	int mode,
				799	loff_t offset,
				800	loff_t len)
				801	{
				802	struct inode *inode = file_inode(file);
				803	struct xfs_inode *ip = XFS_I(inode);
				804	long error;
				805	enum xfs_prealloc_flags flags = 0;
				806	uint iolock = XFS_IOLOCK_EXCL \| XFS_MMAPLOCK_EXCL;
				807	loff_t new_size = 0;
				808	bool do_file_insert = false;
				809
				810	if (!S_ISREG(inode->i_mode))
				811	return -EINVAL;
				812	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
				813	return -EOPNOTSUPP;
				814
				815	xfs_ilock(ip, iolock);
				816	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
				817	if (error)
				818	goto out_unlock;
				819
				820	if (mode & FALLOC_FL_PUNCH_HOLE) {
				821	error = xfs_free_file_space(ip, offset, len);
				822	if (error)
				823	goto out_unlock;
				824	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
				825	unsigned int blksize_mask = i_blocksize(inode) - 1;
				826
				827	if (offset & blksize_mask \|\| len & blksize_mask) {
				828	error = -EINVAL;
				829	goto out_unlock;
				830	}
				831
				832	/*
				833	* There is no need to overlap collapse range with EOF,
				834	* in which case it is effectively a truncate operation
				835	*/
				836	if (offset + len >= i_size_read(inode)) {
				837	error = -EINVAL;
				838	goto out_unlock;
				839	}
				840
				841	new_size = i_size_read(inode) - len;
				842
				843	error = xfs_collapse_file_space(ip, offset, len);
				844	if (error)
				845	goto out_unlock;
				846	} else if (mode & FALLOC_FL_INSERT_RANGE) {
				847	unsigned int blksize_mask = i_blocksize(inode) - 1;
				848	loff_t isize = i_size_read(inode);
				849
				850	if (offset & blksize_mask \|\| len & blksize_mask) {
				851	error = -EINVAL;
				852	goto out_unlock;
				853	}
				854
				855	/*
				856	* New inode size must not exceed ->s_maxbytes, accounting for
				857	* possible signed overflow.
				858	*/
				859	if (inode->i_sb->s_maxbytes - isize < len) {
				860	error = -EFBIG;
				861	goto out_unlock;
				862	}
				863	new_size = isize + len;
				864
				865	/* Offset should be less than i_size */
				866	if (offset >= isize) {
				867	error = -EINVAL;
				868	goto out_unlock;
				869	}
				870	do_file_insert = true;
				871	} else {
				872	flags \|= XFS_PREALLOC_SET;
				873
				874	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
				875	offset + len > i_size_read(inode)) {
				876	new_size = offset + len;
				877	error = inode_newsize_ok(inode, new_size);
				878	if (error)
				879	goto out_unlock;
				880	}
				881
				882	if (mode & FALLOC_FL_ZERO_RANGE)
				883	error = xfs_zero_file_space(ip, offset, len);
				884	else {
				885	if (mode & FALLOC_FL_UNSHARE_RANGE) {
				886	error = xfs_reflink_unshare(ip, offset, len);
				887	if (error)
				888	goto out_unlock;
				889	}
				890	error = xfs_alloc_file_space(ip, offset, len,
				891	XFS_BMAPI_PREALLOC);
				892	}
				893	if (error)
				894	goto out_unlock;
				895	}
				896
				897	if (file->f_flags & O_DSYNC)
				898	flags \|= XFS_PREALLOC_SYNC;
				899
				900	error = xfs_update_prealloc_flags(ip, flags);
				901	if (error)
				902	goto out_unlock;
				903
				904	/* Change file size if needed */
				905	if (new_size) {
				906	struct iattr iattr;
				907
				908	iattr.ia_valid = ATTR_SIZE;
				909	iattr.ia_size = new_size;
				910	error = xfs_vn_setattr_size(file_dentry(file), &iattr);
				911	if (error)
				912	goto out_unlock;
				913	}
				914
				915	/*
				916	* Perform hole insertion now that the file size has been
				917	* updated so that if we crash during the operation we don't
				918	* leave shifted extents past EOF and hence losing access to
				919	* the data that is contained within them.
				920	*/
				921	if (do_file_insert)
				922	error = xfs_insert_file_space(ip, offset, len);
				923
				924	out_unlock:
				925	xfs_iunlock(ip, iolock);
				926	return error;
				927	}
				928
				929	STATIC int
				930	xfs_file_clone_range(
				931	struct file *file_in,
				932	loff_t pos_in,
				933	struct file *file_out,
				934	loff_t pos_out,
				935	u64 len)
				936	{
				937	return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
				938	len, false);
				939	}
				940
				941	STATIC int
				942	xfs_file_dedupe_range(
				943	struct file *file_in,
				944	loff_t pos_in,
				945	struct file *file_out,
				946	loff_t pos_out,
				947	u64 len)
				948	{
				949	return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
				950	len, true);
				951	}
				952
				953	STATIC int
				954	xfs_file_open(
				955	struct inode *inode,
				956	struct file *file)
				957	{
				958	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
				959	return -EFBIG;
				960	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
				961	return -EIO;
				962	file->f_mode \|= FMODE_NOWAIT;
				963	return 0;
				964	}
				965
				966	STATIC int
				967	xfs_dir_open(
				968	struct inode *inode,
				969	struct file *file)
				970	{
				971	struct xfs_inode *ip = XFS_I(inode);
				972	int mode;
				973	int error;
				974
				975	error = xfs_file_open(inode, file);
				976	if (error)
				977	return error;
				978
				979	/*
				980	* If there are any blocks, read-ahead block 0 as we're almost
				981	* certain to have the next operation be a read there.
				982	*/
				983	mode = xfs_ilock_data_map_shared(ip);
				984	if (ip->i_d.di_nextents > 0)
				985	error = xfs_dir3_data_readahead(ip, 0, -1);
				986	xfs_iunlock(ip, mode);
				987	return error;
				988	}
				989
				990	STATIC int
				991	xfs_file_release(
				992	struct inode *inode,
				993	struct file *filp)
				994	{
				995	return xfs_release(XFS_I(inode));
				996	}
				997
				998	STATIC int
				999	xfs_file_readdir(
				1000	struct file *file,
				1001	struct dir_context *ctx)
				1002	{
				1003	struct inode *inode = file_inode(file);
				1004	xfs_inode_t *ip = XFS_I(inode);
				1005	size_t bufsize;
				1006
				1007	/*
				1008	* The Linux API doesn't pass down the total size of the buffer
				1009	* we read into down to the filesystem. With the filldir concept
				1010	* it's not needed for correct information, but the XFS dir2 leaf
				1011	* code wants an estimate of the buffer size to calculate it's
				1012	* readahead window and size the buffers used for mapping to
				1013	* physical blocks.
				1014	*
				1015	* Try to give it an estimate that's good enough, maybe at some
				1016	* point we can change the ->readdir prototype to include the
				1017	* buffer size. For now we use the current glibc buffer size.
				1018	*/
				1019	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);
				1020
				1021	return xfs_readdir(NULL, ip, ctx, bufsize);
				1022	}
				1023
				1024	STATIC loff_t
				1025	xfs_file_llseek(
				1026	struct file *file,
				1027	loff_t offset,
				1028	int whence)
				1029	{
				1030	struct inode *inode = file->f_mapping->host;
				1031
				1032	if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
				1033	return -EIO;
				1034
				1035	switch (whence) {
				1036	default:
				1037	return generic_file_llseek(file, offset, whence);
				1038	case SEEK_HOLE:
				1039	offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
				1040	break;
				1041	case SEEK_DATA:
				1042	offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
				1043	break;
				1044	}
				1045
				1046	if (offset < 0)
				1047	return offset;
				1048	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
				1049	}
				1050
				1051	/*
				1052	* Locking for serialisation of IO during page faults. This results in a lock
				1053	* ordering of:
				1054	*
				1055	* mmap_sem (MM)
				1056	* sb_start_pagefault(vfs, freeze)
				1057	* i_mmaplock (XFS - truncate serialisation)
				1058	* page_lock (MM)
				1059	* i_lock (XFS - extent map serialisation)
				1060	*/
				1061	static vm_fault_t
				1062	__xfs_filemap_fault(
				1063	struct vm_fault *vmf,
				1064	enum page_entry_size pe_size,
				1065	bool write_fault)
				1066	{
				1067	struct inode *inode = file_inode(vmf->vma->vm_file);
				1068	struct xfs_inode *ip = XFS_I(inode);
				1069	vm_fault_t ret;
				1070
				1071	trace_xfs_filemap_fault(ip, pe_size, write_fault);
				1072
				1073	if (write_fault) {
				1074	sb_start_pagefault(inode->i_sb);
				1075	file_update_time(vmf->vma->vm_file);
				1076	}
				1077
				1078	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
				1079	if (IS_DAX(inode)) {
				1080	pfn_t pfn;
				1081
				1082	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops);
				1083	if (ret & VM_FAULT_NEEDDSYNC)
				1084	ret = dax_finish_sync_fault(vmf, pe_size, pfn);
				1085	} else {
				1086	if (write_fault)
				1087	ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
				1088	else
				1089	ret = filemap_fault(vmf);
				1090	}
				1091	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
				1092
				1093	if (write_fault)
				1094	sb_end_pagefault(inode->i_sb);
				1095	return ret;
				1096	}
				1097
				1098	static vm_fault_t
				1099	xfs_filemap_fault(
				1100	struct vm_fault *vmf)
				1101	{
				1102	/* DAX can shortcut the normal fault path on write faults! */
				1103	return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
				1104	IS_DAX(file_inode(vmf->vma->vm_file)) &&
				1105	(vmf->flags & FAULT_FLAG_WRITE));
				1106	}
				1107
				1108	static vm_fault_t
				1109	xfs_filemap_huge_fault(
				1110	struct vm_fault *vmf,
				1111	enum page_entry_size pe_size)
				1112	{
				1113	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
				1114	return VM_FAULT_FALLBACK;
				1115
				1116	/* DAX can shortcut the normal fault path on write faults! */
				1117	return __xfs_filemap_fault(vmf, pe_size,
				1118	(vmf->flags & FAULT_FLAG_WRITE));
				1119	}
				1120
				1121	static vm_fault_t
				1122	xfs_filemap_page_mkwrite(
				1123	struct vm_fault *vmf)
				1124	{
				1125	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
				1126	}
				1127
				1128	/*
				1129	* pfn_mkwrite was originally intended to ensure we capture time stamp updates
				1130	* on write faults. In reality, it needs to serialise against truncate and
				1131	* prepare memory for writing so handle is as standard write fault.
				1132	*/
				1133	static vm_fault_t
				1134	xfs_filemap_pfn_mkwrite(
				1135	struct vm_fault *vmf)
				1136	{
				1137
				1138	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
				1139	}
				1140
				1141	static const struct vm_operations_struct xfs_file_vm_ops = {
				1142	.fault = xfs_filemap_fault,
				1143	.huge_fault = xfs_filemap_huge_fault,
				1144	.map_pages = filemap_map_pages,
				1145	.page_mkwrite = xfs_filemap_page_mkwrite,
				1146	.pfn_mkwrite = xfs_filemap_pfn_mkwrite,
				1147	};
				1148
				1149	STATIC int
				1150	xfs_file_mmap(
				1151	struct file *filp,
				1152	struct vm_area_struct *vma)
				1153	{
				1154	/*
				1155	* We don't support synchronous mappings for non-DAX files. At least
				1156	* until someone comes with a sensible use case.
				1157	*/
				1158	if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
				1159	return -EOPNOTSUPP;
				1160
				1161	file_accessed(filp);
				1162	vma->vm_ops = &xfs_file_vm_ops;
				1163	if (IS_DAX(file_inode(filp)))
				1164	vma->vm_flags \|= VM_HUGEPAGE;
				1165	return 0;
				1166	}
				1167
				1168	const struct file_operations xfs_file_operations = {
				1169	.llseek = xfs_file_llseek,
				1170	.read_iter = xfs_file_read_iter,
				1171	.write_iter = xfs_file_write_iter,
				1172	.splice_read = generic_file_splice_read,
				1173	.splice_write = iter_file_splice_write,
				1174	.unlocked_ioctl = xfs_file_ioctl,
				1175	#ifdef CONFIG_COMPAT
				1176	.compat_ioctl = xfs_file_compat_ioctl,
				1177	#endif
				1178	.mmap = xfs_file_mmap,
				1179	.mmap_supported_flags = MAP_SYNC,
				1180	.open = xfs_file_open,
				1181	.release = xfs_file_release,
				1182	.fsync = xfs_file_fsync,
				1183	.get_unmapped_area = thp_get_unmapped_area,
				1184	.fallocate = xfs_file_fallocate,
				1185	.clone_file_range = xfs_file_clone_range,
				1186	.dedupe_file_range = xfs_file_dedupe_range,
				1187	};
				1188
				1189	const struct file_operations xfs_dir_file_operations = {
				1190	.open = xfs_dir_open,
				1191	.read = generic_read_dir,
				1192	.iterate_shared = xfs_file_readdir,
				1193	.llseek = generic_file_llseek,
				1194	.unlocked_ioctl = xfs_file_ioctl,
				1195	#ifdef CONFIG_COMPAT
				1196	.compat_ioctl = xfs_file_compat_ioctl,
				1197	#endif
				1198	.fsync = xfs_dir_fsync,
				1199	};