// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"

#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>

static const struct vm_operations_struct xfs_file_vm_ops;

int
xfs_update_prealloc_flags(
	struct xfs_inode	*ip,
	enum xfs_prealloc_flags	flags)
{
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
			0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

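	/*
	 * Unless the caller asked for an invisible update, strip the setuid
	 * bit (and the setgid bit if group-exec is set) and bump the
	 * timestamps, just as an ordinary write to the file would.
	 */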
	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
		VFS_I(ip)->i_mode &= ~S_ISUID;
		if (VFS_I(ip)->i_mode & S_IXGRP)
			VFS_I(ip)->i_mode &= ~S_ISGID;
		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}

	if (flags & XFS_PREALLOC_SET)
		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
	if (flags & XFS_PREALLOC_CLEAR)
		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	if (flags & XFS_PREALLOC_SYNC)
		xfs_trans_set_sync(tp);
	return xfs_trans_commit(tp);
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
 * cache flush operations, and there are no non-transaction metadata updates
 * on directories either.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);

	trace_xfs_dir_fsync(ip);
	return xfs_log_force_inode(ip);
}

STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	int			error = 0;
	int			log_flushed = 0;
	xfs_lsn_t		lsn = 0;

	trace_xfs_file_fsync(ip);

	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache of the device used for file data first.  This is to
	 * ensure newly written file data makes it to disk before logging the
	 * new inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip))
		xfs_blkdev_issue_flush(mp->m_rtdev_targp);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		xfs_blkdev_issue_flush(mp->m_ddev_targp);

	/*
	 * All metadata updates are logged, which means that we just have to
	 * flush the log up to the latest LSN that touched the inode. If we have
	 * concurrent fsync/fdatasync() calls, we need them to all block on the
	 * log force before we clear the ili_fsync_fields field. This ensures
	 * that we don't get a racing sync operation that does not wait for the
	 * metadata to hit the journal before returning. If we race with
	 * clearing the ili_fsync_fields, then all that will happen is the log
	 * force will do nothing as the lsn will already be on disk. We can't
	 * race with setting ili_fsync_fields because that is done under
	 * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
	 * until after the ili_fsync_fields is cleared.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip)) {
		if (!datasync ||
		    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
			lsn = ip->i_itemp->ili_last_lsn;
	}

	if (lsn) {
		error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
		ip->i_itemp->ili_fsync_fields = 0;
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op, we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp)
		xfs_blkdev_issue_flush(mp->m_ddev_targp);

	return error;
}

STATIC ssize_t
xfs_file_dio_aio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	size_t			count = iov_iter_count(to);
	ssize_t			ret;

	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);

	if (!count)
		return 0; /* skip atime */

	file_accessed(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, XFS_IOLOCK_SHARED);
	}
	ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

static noinline ssize_t
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	size_t			count = iov_iter_count(to);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);

	if (!count)
		return 0; /* skip atime */

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, XFS_IOLOCK_SHARED);
	}

	ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	file_accessed(iocb->ki_filp);
	return ret;
}

STATIC ssize_t
xfs_file_buffered_aio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, XFS_IOLOCK_SHARED);
	}
	ret = generic_file_read_iter(iocb, to);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

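	/* DAX I/O bypasses the page cache, so it takes precedence over O_DIRECT. */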
	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_aio_read(iocb, to);
	else
		ret = xfs_file_buffered_aio_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	int			*iolock)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			error = 0;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;
	loff_t			isize;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(ip, *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		xfs_ilock(ip, *iolock);
		goto restart;
	}
	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write.  If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive which implies
	 * having to redo all checks before.
	 *
	 * We need to serialise against EOF updates that occur in IO
	 * completions here. We want to make sure that nobody is changing the
	 * size while we do this check until we have placed an IO barrier (i.e.
	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
	 * The spinlock effectively forms a memory barrier once we have the
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
	 * and hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(inode);
	if (iocb->ki_pos > isize) {
		spin_unlock(&ip->i_flags_lock);
		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				xfs_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_ilock(ip, *iolock);
				iov_iter_reexpand(from, count);
			}
			/*
			 * We now have an IO submission barrier in place, but
			 * AIO can do EOF updates during IO completion and hence
			 * we now need to wait for all of them to drain. Non-AIO
			 * DIO will have drained before we are given the
			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
			 * no-op.
			 */
			inode_dio_wait(inode);
			drained_dio = true;
			goto restart;
		}

		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
		error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
				NULL, &xfs_iomap_ops);
		if (error)
			return error;
	} else
		spin_unlock(&ip->i_flags_lock);

	/*
	 * Updating the timestamps will grab the ilock again from
	 * xfs_fs_dirty_inode, so we have to call it after dropping the
	 * lock above.  Eventually we should look into a way to avoid
	 * the pointless lock roundtrip.
	 */
	return file_modified(file);
}

static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	int			error,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	unsigned int		nofs_flag;

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (error)
		return error;
	if (!size)
		return 0;

	/*
	 * Capture amount written on completion as we can't reliably account
	 * for it on submission.
	 */
	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim.  To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	if (flags & IOMAP_DIO_COW) {
		error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			goto out;
	}

	/*
	 * Unwritten conversion updates the in-core isize after extent
	 * conversion but before updating the on-disk size. Updating isize any
	 * earlier allows a racing dio read to find unwritten extents before
	 * they are converted.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, offset, size, true);
		goto out;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 */
	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		spin_unlock(&ip->i_flags_lock);
		error = xfs_setfilesize(ip, offset, size);
	} else {
		spin_unlock(&ip->i_flags_lock);
	}

out:
	memalloc_nofs_restore(nofs_flag);
	return error;
}

static const struct iomap_dio_ops xfs_dio_write_ops = {
	.end_io		= xfs_dio_write_end_io,
};

/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the
 * tricky-to-follow locking changes and looping.
 *
 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 * pages are flushed out.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED,
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing in
 * the dio layer. To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is completed
 * before we try to map the overlapping block. This is currently implemented by
 * hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * Returns with locks held indicated by @iolock and errors indicated by
 * negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;
	int			unaligned_io = 0;
	int			iolock;
	size_t			count = iov_iter_count(from);
	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
					mp->m_rtdev_targp : mp->m_ddev_targp;

	/* DIO must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;

	/*
	 * Don't take the exclusive iolock here unless the I/O is unaligned to
	 * the file system block size.  We don't need to consider the EOF
	 * extension case here because xfs_file_aio_write_checks() will relock
	 * the inode as necessary for EOF zeroing cases and fill out the new
	 * inode size as appropriate.
	 */
	if ((iocb->ki_pos & mp->m_blockmask) ||
	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
		unaligned_io = 1;

		/*
		 * We can't properly handle unaligned direct I/O to reflink
		 * files yet, as we can't unshare a partial block.
		 */
		if (xfs_is_cow_inode(ip)) {
			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
			return -EREMCHG;
		}
		iolock = XFS_IOLOCK_EXCL;
	} else {
		iolock = XFS_IOLOCK_SHARED;
	}

	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* unaligned dio always waits, bail */
		if (unaligned_io)
			return -EAGAIN;
		if (!xfs_ilock_nowait(ip, iolock))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, iolock);
	}

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;
	count = iov_iter_count(from);

	/*
	 * If we are doing unaligned IO, we can't allow any other overlapping IO
	 * in-flight at the same time or we risk data corruption. Wait for all
	 * other IO to drain before we submit. If the IO is aligned, demote the
	 * iolock if we had to take the exclusive lock in
	 * xfs_file_aio_write_checks() for other reasons.
	 */
	if (unaligned_io) {
		inode_dio_wait(inode);
	} else if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops);

	/*
	 * If unaligned, this is the only IO in-flight. If it has not yet
	 * completed, wait on it before we release the iolock to prevent
	 * subsequent overlapping IO.
	 */
	if (ret == -EIOCBQUEUED && unaligned_io)
		inode_dio_wait(inode);
out:
	xfs_iunlock(ip, iolock);

	/*
	 * No fallback to buffered IO on errors for XFS, direct IO will either
	 * complete fully or fail.
	 */
	ASSERT(ret < 0 || ret == count);
	return ret;
}

static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	int			iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	size_t			count;
	loff_t			pos;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, iolock))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, iolock);
	}

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	pos = iocb->ki_pos;
	count = iov_iter_count(from);

	trace_xfs_file_dax_write(ip, count, pos);
	ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	xfs_iunlock(ip, iolock);
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_buffered_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	int			enospc = 0;
	int			iolock;

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	xfs_ilock(ip, iolock);

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
	if (likely(ret >= 0))
		iocb->ki_pos += ret;

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time.
	 */
	if (ret == -EDQUOT && !enospc) {
		xfs_iunlock(ip, iolock);
		enospc = xfs_inode_free_quota_eofblocks(ip);
		if (enospc)
			goto write_retry;
		enospc = xfs_inode_free_quota_cowblocks(ip);
		if (enospc)
			goto write_retry;
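		/*
		 * The iolock has already been dropped above, so make sure the
		 * out label below does not try to unlock it again.
		 */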
		iolock = 0;
	} else if (ret == -ENOSPC && !enospc) {
		struct xfs_eofblocks eofb = {0};

		enospc = 1;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
		xfs_icache_free_cowblocks(ip->i_mount, &eofb);
		goto write_retry;
	}

	current->backing_dev_info = NULL;
out:
	if (iolock)
		xfs_iunlock(ip, iolock);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (IS_DAX(inode))
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW.  In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_aio_write(iocb, from);
		if (ret != -EREMCHG)
			return ret;
	}

	return xfs_file_buffered_aio_write(iocb, from);
}

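/*
 * Drop the MMAPLOCK while we sleep waiting for busy DAX page references to
 * be released, then retake it so xfs_break_dax_layouts() can recheck and
 * retry.
 */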
static void
xfs_wait_dax_page(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	schedule();
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}

static int
xfs_break_dax_layouts(
	struct inode		*inode,
	bool			*retry)
{
	struct page		*page;

	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));

	page = dax_layout_busy_page(inode->i_mapping);
	if (!page)
		return 0;

	*retry = true;
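	/*
	 * Wait until every outstanding reference to the page (e.g. from
	 * get_user_pages() for DMA) has been dropped before allowing the
	 * layout to change.
	 */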
	return ___wait_var_event(&page->_refcount,
			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
			0, 0, xfs_wait_dax_page(inode));
}

int
xfs_break_layouts(
	struct inode		*inode,
	uint			*iolock,
	enum layout_break_reason reason)
{
	bool			retry;
	int			error;

	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));

	do {
		retry = false;
		switch (reason) {
		case BREAK_UNMAP:
			error = xfs_break_dax_layouts(inode, &retry);
			if (error || retry)
				break;
			/* fall through */
		case BREAK_WRITE:
			error = xfs_break_leased_layouts(inode, iolock, &retry);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EINVAL;
		}
	} while (error == 0 && retry);

	return error;
}

#define XFS_FALLOC_FL_SUPPORTED					\
	(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
	 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
	 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

STATIC long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	enum xfs_prealloc_flags	flags = 0;
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	loff_t			new_size = 0;
	bool			do_file_insert = false;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
	if (error)
		goto out_unlock;

	/*
	 * Must wait for all AIO to complete before we continue as AIO can
	 * change the file size on completion without holding any locks we
	 * currently hold. We must do this first because AIO can update both
	 * the on disk and in memory inode sizes, and the operations that follow
	 * require the in-memory size to be fully up-to-date.
	 */
	inode_dio_wait(inode);

	/*
	 * Now that AIO and DIO have drained, we flush and (if necessary)
	 * invalidate the cached range over the first operation we are about
	 * to run.
	 *
	 * We care about zero and collapse here because they both run a hole
	 * punch over the range first. Because that can zero data, and the range
	 * of invalidation for the shift operations is much larger, we still do
	 * the required flush for collapse in xfs_prepare_shift().
	 *
	 * Insert has the same range requirements as collapse, and we extend the
	 * file first which can zero data. Hence insert has the same
	 * flush/invalidate requirements as collapse and so they are both
	 * handled at the right time by xfs_prepare_shift().
	 */
	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
		    FALLOC_FL_COLLAPSE_RANGE)) {
		error = xfs_flush_unmap_range(ip, offset, len);
		if (error)
			goto out_unlock;
	}

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		error = xfs_free_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
		unsigned int	blksize_mask = i_blocksize(inode) - 1;

		if (offset & blksize_mask || len & blksize_mask) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * There is no need to overlap collapse range with EOF;
		 * in that case it is effectively a truncate operation.
		 */
		if (offset + len >= i_size_read(inode)) {
			error = -EINVAL;
			goto out_unlock;
		}

		new_size = i_size_read(inode) - len;

		error = xfs_collapse_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_INSERT_RANGE) {
		unsigned int	blksize_mask = i_blocksize(inode) - 1;
		loff_t		isize = i_size_read(inode);

		if (offset & blksize_mask || len & blksize_mask) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * New inode size must not exceed ->s_maxbytes, accounting for
		 * possible signed overflow.
		 */
		if (inode->i_sb->s_maxbytes - isize < len) {
			error = -EFBIG;
			goto out_unlock;
		}
		new_size = isize + len;

		/* Offset should be less than i_size */
		if (offset >= isize) {
			error = -EINVAL;
			goto out_unlock;
		}
		do_file_insert = true;
	} else {
		flags |= XFS_PREALLOC_SET;

		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    offset + len > i_size_read(inode)) {
			new_size = offset + len;
			error = inode_newsize_ok(inode, new_size);
			if (error)
				goto out_unlock;
		}

		if (mode & FALLOC_FL_ZERO_RANGE) {
			error = xfs_zero_file_space(ip, offset, len);
		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
			error = xfs_reflink_unshare(ip, offset, len);
			if (error)
				goto out_unlock;

			if (!xfs_is_always_cow_inode(ip)) {
				error = xfs_alloc_file_space(ip, offset, len,
						XFS_BMAPI_PREALLOC);
			}
		} else {
			/*
			 * In always_cow mode we can't use preallocations and
			 * thus should not create them.
			 */
			if (xfs_is_always_cow_inode(ip)) {
				error = -EOPNOTSUPP;
				goto out_unlock;
			}

			error = xfs_alloc_file_space(ip, offset, len,
					XFS_BMAPI_PREALLOC);
		}
		if (error)
			goto out_unlock;
	}

	if (file->f_flags & O_DSYNC)
		flags |= XFS_PREALLOC_SYNC;

	error = xfs_update_prealloc_flags(ip, flags);
	if (error)
		goto out_unlock;

	/* Change file size if needed */
	if (new_size) {
		struct iattr iattr;

		iattr.ia_valid = ATTR_SIZE;
		iattr.ia_size = new_size;
		error = xfs_vn_setattr_size(file_dentry(file), &iattr);
		if (error)
			goto out_unlock;
	}

	/*
	 * Perform hole insertion now that the file size has been updated so
	 * that if we crash during the operation we don't leave shifted
	 * extents past EOF and hence lose access to the data that is
	 * contained within them.
	 */
	if (do_file_insert)
		error = xfs_insert_file_space(ip, offset, len);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}

STATIC int
xfs_file_fadvise(
	struct file	*file,
	loff_t		start,
	loff_t		end,
	int		advice)
{
	struct xfs_inode *ip = XFS_I(file_inode(file));
	int ret;
	int lockflags = 0;

	/*
	 * Operations creating pages in page cache need protection from hole
	 * punching and similar ops
	 */
	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}
	ret = generic_fadvise(file, start, end, advice);
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	return ret;
}

/* Does this file, inode, or mount want synchronous writes? */
static inline bool xfs_file_sync_writes(struct file *filp)
{
	struct xfs_inode	*ip = XFS_I(file_inode(filp));

	if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC)
		return true;
	if (filp->f_flags & (__O_SYNC | O_DSYNC))
		return true;
	if (IS_SYNC(file_inode(filp)))
		return true;

	return false;
}

STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_sb_version_hasreflink(&mp->m_sb))
		return -EOPNOTSUPP;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret < 0 || len == 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_d.di_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			remap_flags);
	if (ret)
		goto out_unlock;

	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
		xfs_log_force_inode(dest);
out_unlock:
	xfs_reflink_remap_unlock(file_in, file_out);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	/*
	 * If the caller did not set CAN_SHORTEN, then it is not prepared to
	 * handle partial results -- either the whole remap succeeds, or we
	 * must say why it did not. In this case, any error should be returned
	 * to the caller.
	 */
	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
		return ret;
	return remapped > 0 ? remapped : ret;
}

STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
		return -EFBIG;
	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
		return -EIO;
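	/* Tell the VFS that this file can handle IOCB_NOWAIT (RWF_NOWAIT) I/O. */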
	file->f_mode |= FMODE_NOWAIT;
	return 0;
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	int		mode;
	int		error;

	error = xfs_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_d.di_nextents > 0)
		error = xfs_dir3_data_readahead(ip, 0, -1);
	xfs_iunlock(ip, mode);
	return error;
}

STATIC int
xfs_file_release(
	struct inode	*inode,
	struct file	*filp)
{
	return xfs_release(XFS_I(inode));
}

STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer we read
	 * into down to the filesystem.  With the filldir concept it's not
	 * needed for correct information, but the XFS dir2 leaf code wants
	 * an estimate of the buffer size to calculate its readahead window
	 * and size the buffers used for mapping to physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size.  For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);

	return xfs_readdir(NULL, ip, ctx, bufsize);
}

STATIC loff_t
xfs_file_llseek(
	struct file	*file,
	loff_t		offset,
	int		whence)
{
	struct inode		*inode = file->f_mapping->host;

	if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
		return -EIO;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_sem (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     i_mmaplock (XFS - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_filemap_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size,
	bool			write_fault)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	vm_fault_t		ret;

	trace_xfs_filemap_fault(ip, pe_size, write_fault);

	if (write_fault) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}

	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	if (IS_DAX(inode)) {
		pfn_t pfn;

		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops);
		if (ret & VM_FAULT_NEEDDSYNC)
			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
	} else {
		if (write_fault)
			ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
		else
			ret = filemap_fault(vmf);
	}
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	if (write_fault)
		sb_end_pagefault(inode->i_sb);
	return ret;
}

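/*
 * A fault only needs the full write fault path (freeze protection, timestamp
 * update, ->page_mkwrite semantics) when it is a write to a shared mapping;
 * writes to private mappings are satisfied by copy-on-write of the page and
 * never reach the file.
 */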
static inline bool
xfs_is_write_fault(
	struct vm_fault		*vmf)
{
	return (vmf->flags & FAULT_FLAG_WRITE) &&
	       (vmf->vma->vm_flags & VM_SHARED);
}

static vm_fault_t
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
			IS_DAX(file_inode(vmf->vma->vm_file)) &&
			xfs_is_write_fault(vmf));
}

static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, pe_size,
			xfs_is_write_fault(vmf));
}

static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults. In reality, it needs to serialise against truncate and
 * prepare memory for writing, so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{
	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}

static void
xfs_filemap_map_pages(
	struct vm_fault		*vmf,
	pgoff_t			start_pgoff,
	pgoff_t			end_pgoff)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);

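	/*
	 * Take the MMAPLOCK shared so the pages we are about to map cannot be
	 * truncated or punched out from under us while we map them.
	 */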
	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	filemap_map_pages(vmf, start_pgoff, end_pgoff);
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= xfs_filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap(
	struct file	*filp,
	struct vm_area_struct *vma)
{
	struct dax_device	*dax_dev;

	dax_dev = xfs_find_daxdev_for_inode(file_inode(filp));
	/*
	 * We don't support synchronous mappings for non-DAX files, nor for
	 * DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(vma, dax_dev))
		return -EOPNOTSUPP;

	file_accessed(filp);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(file_inode(filp)))
		vma->vm_flags |= VM_HUGEPAGE;
	return 0;
}

const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iomap_dio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap		= xfs_file_mmap,
	.mmap_supported_flags = MAP_SYNC,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
};