Blame - src/kernel/linux/v4.19/fs/xfs/xfs_aops.c - T800

blob: b697866946d26e49fe6cbbd0753f24d8c3b46964 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (c) 2000-2005 Silicon Graphics, Inc.
				4	* Copyright (c) 2016-2018 Christoph Hellwig.
				5	* All Rights Reserved.
				6	*/
				7	#include "xfs.h"
				8	#include "xfs_shared.h"
				9	#include "xfs_format.h"
				10	#include "xfs_log_format.h"
				11	#include "xfs_trans_resv.h"
				12	#include "xfs_mount.h"
				13	#include "xfs_inode.h"
				14	#include "xfs_trans.h"
				15	#include "xfs_inode_item.h"
				16	#include "xfs_alloc.h"
				17	#include "xfs_error.h"
				18	#include "xfs_iomap.h"
				19	#include "xfs_trace.h"
				20	#include "xfs_bmap.h"
				21	#include "xfs_bmap_util.h"
				22	#include "xfs_bmap_btree.h"
				23	#include "xfs_reflink.h"
				24	#include <linux/writeback.h>
				25
				26	/*
				27	* structure owned by writepages passed to individual writepage calls
				28	*/
				29	struct xfs_writepage_ctx {
				30	struct xfs_bmbt_irec imap;
				31	unsigned int io_type;
				32	unsigned int cow_seq;
				33	struct xfs_ioend *ioend;
				34	};
				35
				36	struct block_device *
				37	xfs_find_bdev_for_inode(
				38	struct inode *inode)
				39	{
				40	struct xfs_inode *ip = XFS_I(inode);
				41	struct xfs_mount *mp = ip->i_mount;
				42
				43	if (XFS_IS_REALTIME_INODE(ip))
				44	return mp->m_rtdev_targp->bt_bdev;
				45	else
				46	return mp->m_ddev_targp->bt_bdev;
				47	}
				48
				49	struct dax_device *
				50	xfs_find_daxdev_for_inode(
				51	struct inode *inode)
				52	{
				53	struct xfs_inode *ip = XFS_I(inode);
				54	struct xfs_mount *mp = ip->i_mount;
				55
				56	if (XFS_IS_REALTIME_INODE(ip))
				57	return mp->m_rtdev_targp->bt_daxdev;
				58	else
				59	return mp->m_ddev_targp->bt_daxdev;
				60	}
				61
				62	static void
				63	xfs_finish_page_writeback(
				64	struct inode *inode,
				65	struct bio_vec *bvec,
				66	int error)
				67	{
				68	struct iomap_page *iop = to_iomap_page(bvec->bv_page);
				69
				70	if (error) {
				71	SetPageError(bvec->bv_page);
				72	mapping_set_error(inode->i_mapping, -EIO);
				73	}
				74
				75	ASSERT(iop \|\| i_blocksize(inode) == PAGE_SIZE);
				76	ASSERT(!iop \|\| atomic_read(&iop->write_count) > 0);
				77
				78	if (!iop \|\| atomic_dec_and_test(&iop->write_count))
				79	end_page_writeback(bvec->bv_page);
				80	}
				81
				82	/*
				83	* We're now finished for good with this ioend structure. Update the page
				84	* state, release holds on bios, and finally free up memory. Do not use the
				85	* ioend after this.
				86	*/
				87	STATIC void
				88	xfs_destroy_ioend(
				89	struct xfs_ioend *ioend,
				90	int error)
				91	{
				92	struct inode *inode = ioend->io_inode;
				93	struct bio *bio = &ioend->io_inline_bio;
				94	struct bio last = ioend->io_bio, next;
				95	u64 start = bio->bi_iter.bi_sector;
				96	bool quiet = bio_flagged(bio, BIO_QUIET);
				97
				98	for (bio = &ioend->io_inline_bio; bio; bio = next) {
				99	struct bio_vec *bvec;
				100	int i;
				101
				102	/*
				103	* For the last bio, bi_private points to the ioend, so we
				104	* need to explicitly end the iteration here.
				105	*/
				106	if (bio == last)
				107	next = NULL;
				108	else
				109	next = bio->bi_private;
				110
				111	/* walk each page on bio, ending page IO on them */
				112	bio_for_each_segment_all(bvec, bio, i)
				113	xfs_finish_page_writeback(inode, bvec, error);
				114	bio_put(bio);
				115	}
				116
				117	if (unlikely(error && !quiet)) {
				118	xfs_err_ratelimited(XFS_I(inode)->i_mount,
				119	"writeback error on sector %llu", start);
				120	}
				121	}
				122
				123	/*
				124	* Fast and loose check if this write could update the on-disk inode size.
				125	*/
				126	static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
				127	{
				128	return ioend->io_offset + ioend->io_size >
				129	XFS_I(ioend->io_inode)->i_d.di_size;
				130	}
				131
				132	STATIC int
				133	xfs_setfilesize_trans_alloc(
				134	struct xfs_ioend *ioend)
				135	{
				136	struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
				137	struct xfs_trans *tp;
				138	int error;
				139
				140	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
				141	XFS_TRANS_NOFS, &tp);
				142	if (error)
				143	return error;
				144
				145	ioend->io_append_trans = tp;
				146
				147	/*
				148	* We may pass freeze protection with a transaction. So tell lockdep
				149	* we released it.
				150	*/
				151	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
				152	/*
				153	* We hand off the transaction to the completion thread now, so
				154	* clear the flag here.
				155	*/
				156	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
				157	return 0;
				158	}
				159
				160	/*
				161	* Update on-disk file size now that data has been written to disk.
				162	*/
				163	STATIC int
				164	__xfs_setfilesize(
				165	struct xfs_inode *ip,
				166	struct xfs_trans *tp,
				167	xfs_off_t offset,
				168	size_t size)
				169	{
				170	xfs_fsize_t isize;
				171
				172	xfs_ilock(ip, XFS_ILOCK_EXCL);
				173	isize = xfs_new_eof(ip, offset + size);
				174	if (!isize) {
				175	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				176	xfs_trans_cancel(tp);
				177	return 0;
				178	}
				179
				180	trace_xfs_setfilesize(ip, offset, size);
				181
				182	ip->i_d.di_size = isize;
				183	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
				184	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				185
				186	return xfs_trans_commit(tp);
				187	}
				188
				189	int
				190	xfs_setfilesize(
				191	struct xfs_inode *ip,
				192	xfs_off_t offset,
				193	size_t size)
				194	{
				195	struct xfs_mount *mp = ip->i_mount;
				196	struct xfs_trans *tp;
				197	int error;
				198
				199	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
				200	if (error)
				201	return error;
				202
				203	return __xfs_setfilesize(ip, tp, offset, size);
				204	}
				205
				206	STATIC int
				207	xfs_setfilesize_ioend(
				208	struct xfs_ioend *ioend,
				209	int error)
				210	{
				211	struct xfs_inode *ip = XFS_I(ioend->io_inode);
				212	struct xfs_trans *tp = ioend->io_append_trans;
				213
				214	/*
				215	* The transaction may have been allocated in the I/O submission thread,
				216	* thus we need to mark ourselves as being in a transaction manually.
				217	* Similarly for freeze protection.
				218	*/
				219	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
				220	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
				221
				222	/* we abort the update if there was an IO error */
				223	if (error) {
				224	xfs_trans_cancel(tp);
				225	return error;
				226	}
				227
				228	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
				229	}
				230
				231	/*
				232	* IO write completion.
				233	*/
				234	STATIC void
				235	xfs_end_io(
				236	struct work_struct *work)
				237	{
				238	struct xfs_ioend *ioend =
				239	container_of(work, struct xfs_ioend, io_work);
				240	struct xfs_inode *ip = XFS_I(ioend->io_inode);
				241	xfs_off_t offset = ioend->io_offset;
				242	size_t size = ioend->io_size;
				243	int error;
				244
				245	/*
				246	* Just clean up the in-memory strutures if the fs has been shut down.
				247	*/
				248	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				249	error = -EIO;
				250	goto done;
				251	}
				252
				253	/*
				254	* Clean up any COW blocks on an I/O error.
				255	*/
				256	error = blk_status_to_errno(ioend->io_bio->bi_status);
				257	if (unlikely(error)) {
				258	switch (ioend->io_type) {
				259	case XFS_IO_COW:
				260	xfs_reflink_cancel_cow_range(ip, offset, size, true);
				261	break;
				262	}
				263
				264	goto done;
				265	}
				266
				267	/*
				268	* Success: commit the COW or unwritten blocks if needed.
				269	*/
				270	switch (ioend->io_type) {
				271	case XFS_IO_COW:
				272	error = xfs_reflink_end_cow(ip, offset, size);
				273	break;
				274	case XFS_IO_UNWRITTEN:
				275	/* writeback should never update isize */
				276	error = xfs_iomap_write_unwritten(ip, offset, size, false);
				277	break;
				278	default:
				279	ASSERT(!xfs_ioend_is_append(ioend) \|\| ioend->io_append_trans);
				280	break;
				281	}
				282
				283	done:
				284	if (ioend->io_append_trans)
				285	error = xfs_setfilesize_ioend(ioend, error);
				286	xfs_destroy_ioend(ioend, error);
				287	}
				288
				289	STATIC void
				290	xfs_end_bio(
				291	struct bio *bio)
				292	{
				293	struct xfs_ioend *ioend = bio->bi_private;
				294	struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
				295
				296	if (ioend->io_type == XFS_IO_UNWRITTEN \|\| ioend->io_type == XFS_IO_COW)
				297	queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
				298	else if (ioend->io_append_trans)
				299	queue_work(mp->m_data_workqueue, &ioend->io_work);
				300	else
				301	xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
				302	}
				303
				304	STATIC int
				305	xfs_map_blocks(
				306	struct xfs_writepage_ctx *wpc,
				307	struct inode *inode,
				308	loff_t offset)
				309	{
				310	struct xfs_inode *ip = XFS_I(inode);
				311	struct xfs_mount *mp = ip->i_mount;
				312	ssize_t count = i_blocksize(inode);
				313	xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
				314	xfs_fileoff_t cow_fsb = NULLFILEOFF;
				315	struct xfs_bmbt_irec imap;
				316	int whichfork = XFS_DATA_FORK;
				317	struct xfs_iext_cursor icur;
				318	bool imap_valid;
				319	int error = 0;
				320
				321	/*
				322	* We have to make sure the cached mapping is within EOF to protect
				323	* against eofblocks trimming on file release leaving us with a stale
				324	* mapping. Otherwise, a page for a subsequent file extending buffered
				325	* write could get picked up by this writeback cycle and written to the
				326	* wrong blocks.
				327	*
				328	* Note that what we really want here is a generic mapping invalidation
				329	* mechanism to protect us from arbitrary extent modifying contexts, not
				330	* just eofblocks.
				331	*/
				332	xfs_trim_extent_eof(&wpc->imap, ip);
				333
				334	/*
				335	* COW fork blocks can overlap data fork blocks even if the blocks
				336	* aren't shared. COW I/O always takes precedent, so we must always
				337	* check for overlap on reflink inodes unless the mapping is already a
				338	* COW one, or the COW fork hasn't changed from the last time we looked
				339	* at it.
				340	*
				341	* It's safe to check the COW fork if_seq here without the ILOCK because
				342	* we've indirectly protected against concurrent updates: writeback has
				343	* the page locked, which prevents concurrent invalidations by reflink
				344	* and directio and prevents concurrent buffered writes to the same
				345	* page. Changes to if_seq always happen under i_lock, which protects
				346	* against concurrent updates and provides a memory barrier on the way
				347	* out that ensures that we always see the current value.
				348	*/
				349	imap_valid = offset_fsb >= wpc->imap.br_startoff &&
				350	offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
				351	if (imap_valid &&
				352	(!xfs_inode_has_cow_data(ip) \|\|
				353	wpc->io_type == XFS_IO_COW \|\|
				354	wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
				355	return 0;
				356
				357	if (XFS_FORCED_SHUTDOWN(mp))
				358	return -EIO;
				359
				360	/*
				361	* If we don't have a valid map, now it's time to get a new one for this
				362	* offset. This will convert delayed allocations (including COW ones)
				363	* into real extents. If we return without a valid map, it means we
				364	* landed in a hole and we skip the block.
				365	*/
				366	xfs_ilock(ip, XFS_ILOCK_SHARED);
				367	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE \|\|
				368	(ip->i_df.if_flags & XFS_IFEXTENTS));
				369	ASSERT(offset <= mp->m_super->s_maxbytes);
				370
				371	if (offset > mp->m_super->s_maxbytes - count)
				372	count = mp->m_super->s_maxbytes - offset;
				373	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
				374
				375	/*
				376	* Check if this is offset is covered by a COW extents, and if yes use
				377	* it directly instead of looking up anything in the data fork.
				378	*/
				379	if (xfs_inode_has_cow_data(ip) &&
				380	xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
				381	cow_fsb = imap.br_startoff;
				382	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
				383	wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
				384	xfs_iunlock(ip, XFS_ILOCK_SHARED);
				385	/*
				386	* Truncate can race with writeback since writeback doesn't
				387	* take the iolock and truncate decreases the file size before
				388	* it starts truncating the pages between new_size and old_size.
				389	* Therefore, we can end up in the situation where writeback
				390	* gets a CoW fork mapping but the truncate makes the mapping
				391	* invalid and we end up in here trying to get a new mapping.
				392	* bail out here so that we simply never get a valid mapping
				393	* and so we drop the write altogether. The page truncation
				394	* will kill the contents anyway.
				395	*/
				396	if (offset > i_size_read(inode)) {
				397	wpc->io_type = XFS_IO_HOLE;
				398	return 0;
				399	}
				400	whichfork = XFS_COW_FORK;
				401	wpc->io_type = XFS_IO_COW;
				402	goto allocate_blocks;
				403	}
				404
				405	/*
				406	* Map valid and no COW extent in the way? We're done.
				407	*/
				408	if (imap_valid) {
				409	xfs_iunlock(ip, XFS_ILOCK_SHARED);
				410	return 0;
				411	}
				412
				413	/*
				414	* If we don't have a valid map, now it's time to get a new one for this
				415	* offset. This will convert delayed allocations (including COW ones)
				416	* into real extents.
				417	*/
				418	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
				419	imap.br_startoff = end_fsb; /* fake a hole past EOF */
				420	xfs_iunlock(ip, XFS_ILOCK_SHARED);
				421
				422	if (imap.br_startoff > offset_fsb) {
				423	/* landed in a hole or beyond EOF */
				424	imap.br_blockcount = imap.br_startoff - offset_fsb;
				425	imap.br_startoff = offset_fsb;
				426	imap.br_startblock = HOLESTARTBLOCK;
				427	wpc->io_type = XFS_IO_HOLE;
				428	} else {
				429	/*
				430	* Truncate to the next COW extent if there is one. This is the
				431	* only opportunity to do this because we can skip COW fork
				432	* lookups for the subsequent blocks in the mapping; however,
				433	* the requirement to treat the COW range separately remains.
				434	*/
				435	if (cow_fsb != NULLFILEOFF &&
				436	cow_fsb < imap.br_startoff + imap.br_blockcount)
				437	imap.br_blockcount = cow_fsb - imap.br_startoff;
				438
				439	if (isnullstartblock(imap.br_startblock)) {
				440	/* got a delalloc extent */
				441	wpc->io_type = XFS_IO_DELALLOC;
				442	goto allocate_blocks;
				443	}
				444
				445	if (imap.br_state == XFS_EXT_UNWRITTEN)
				446	wpc->io_type = XFS_IO_UNWRITTEN;
				447	else
				448	wpc->io_type = XFS_IO_OVERWRITE;
				449	}
				450
				451	wpc->imap = imap;
				452	xfs_trim_extent_eof(&wpc->imap, ip);
				453	trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
				454	return 0;
				455	allocate_blocks:
				456	error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
				457	&wpc->cow_seq);
				458	if (error)
				459	return error;
				460	ASSERT(whichfork == XFS_COW_FORK \|\| cow_fsb == NULLFILEOFF \|\|
				461	imap.br_startoff + imap.br_blockcount <= cow_fsb);
				462	wpc->imap = imap;
				463	xfs_trim_extent_eof(&wpc->imap, ip);
				464	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
				465	return 0;
				466	}
				467
				468	/*
				469	* Submit the bio for an ioend. We are passed an ioend with a bio attached to
				470	* it, and we submit that bio. The ioend may be used for multiple bio
				471	* submissions, so we only want to allocate an append transaction for the ioend
				472	* once. In the case of multiple bio submission, each bio will take an IO
				473	* reference to the ioend to ensure that the ioend completion is only done once
				474	* all bios have been submitted and the ioend is really done.
				475	*
				476	* If @fail is non-zero, it means that we have a situation where some part of
				477	* the submission process has failed after we have marked paged for writeback
				478	* and unlocked them. In this situation, we need to fail the bio and ioend
				479	* rather than submit it to IO. This typically only happens on a filesystem
				480	* shutdown.
				481	*/
				482	STATIC int
				483	xfs_submit_ioend(
				484	struct writeback_control *wbc,
				485	struct xfs_ioend *ioend,
				486	int status)
				487	{
				488	/* Convert CoW extents to regular */
				489	if (!status && ioend->io_type == XFS_IO_COW) {
				490	/*
				491	* Yuk. This can do memory allocation, but is not a
				492	* transactional operation so everything is done in GFP_KERNEL
				493	* context. That can deadlock, because we hold pages in
				494	* writeback state and GFP_KERNEL allocations can block on them.
				495	* Hence we must operate in nofs conditions here.
				496	*/
				497	unsigned nofs_flag;
				498
				499	nofs_flag = memalloc_nofs_save();
				500	status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
				501	ioend->io_offset, ioend->io_size);
				502	memalloc_nofs_restore(nofs_flag);
				503	}
				504
				505	/* Reserve log space if we might write beyond the on-disk inode size. */
				506	if (!status &&
				507	ioend->io_type != XFS_IO_UNWRITTEN &&
				508	xfs_ioend_is_append(ioend) &&
				509	!ioend->io_append_trans)
				510	status = xfs_setfilesize_trans_alloc(ioend);
				511
				512	ioend->io_bio->bi_private = ioend;
				513	ioend->io_bio->bi_end_io = xfs_end_bio;
				514	ioend->io_bio->bi_opf = REQ_OP_WRITE \| wbc_to_write_flags(wbc);
				515
				516	/*
				517	* If we are failing the IO now, just mark the ioend with an
				518	* error and finish it. This will run IO completion immediately
				519	* as there is only one reference to the ioend at this point in
				520	* time.
				521	*/
				522	if (status) {
				523	ioend->io_bio->bi_status = errno_to_blk_status(status);
				524	bio_endio(ioend->io_bio);
				525	return status;
				526	}
				527
				528	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
				529	submit_bio(ioend->io_bio);
				530	return 0;
				531	}
				532
				533	static struct xfs_ioend *
				534	xfs_alloc_ioend(
				535	struct inode *inode,
				536	unsigned int type,
				537	xfs_off_t offset,
				538	struct block_device *bdev,
				539	sector_t sector)
				540	{
				541	struct xfs_ioend *ioend;
				542	struct bio *bio;
				543
				544	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
				545	bio_set_dev(bio, bdev);
				546	bio->bi_iter.bi_sector = sector;
				547
				548	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
				549	INIT_LIST_HEAD(&ioend->io_list);
				550	ioend->io_type = type;
				551	ioend->io_inode = inode;
				552	ioend->io_size = 0;
				553	ioend->io_offset = offset;
				554	INIT_WORK(&ioend->io_work, xfs_end_io);
				555	ioend->io_append_trans = NULL;
				556	ioend->io_bio = bio;
				557	return ioend;
				558	}
				559
				560	/*
				561	* Allocate a new bio, and chain the old bio to the new one.
				562	*
				563	* Note that we have to do perform the chaining in this unintuitive order
				564	* so that the bi_private linkage is set up in the right direction for the
				565	* traversal in xfs_destroy_ioend().
				566	*/
				567	static void
				568	xfs_chain_bio(
				569	struct xfs_ioend *ioend,
				570	struct writeback_control *wbc,
				571	struct block_device *bdev,
				572	sector_t sector)
				573	{
				574	struct bio *new;
				575
				576	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
				577	bio_set_dev(new, bdev);
				578	new->bi_iter.bi_sector = sector;
				579	bio_chain(ioend->io_bio, new);
				580	bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
				581	ioend->io_bio->bi_opf = REQ_OP_WRITE \| wbc_to_write_flags(wbc);
				582	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
				583	submit_bio(ioend->io_bio);
				584	ioend->io_bio = new;
				585	}
				586
				587	/*
				588	* Test to see if we have an existing ioend structure that we could append to
				589	* first, otherwise finish off the current ioend and start another.
				590	*/
				591	STATIC void
				592	xfs_add_to_ioend(
				593	struct inode *inode,
				594	xfs_off_t offset,
				595	struct page *page,
				596	struct iomap_page *iop,
				597	struct xfs_writepage_ctx *wpc,
				598	struct writeback_control *wbc,
				599	struct list_head *iolist)
				600	{
				601	struct xfs_inode *ip = XFS_I(inode);
				602	struct xfs_mount *mp = ip->i_mount;
				603	struct block_device *bdev = xfs_find_bdev_for_inode(inode);
				604	unsigned len = i_blocksize(inode);
				605	unsigned poff = offset & (PAGE_SIZE - 1);
				606	sector_t sector;
				607
				608	sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
				609	((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
				610
				611	if (!wpc->ioend \|\| wpc->io_type != wpc->ioend->io_type \|\|
				612	sector != bio_end_sector(wpc->ioend->io_bio) \|\|
				613	offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
				614	if (wpc->ioend)
				615	list_add(&wpc->ioend->io_list, iolist);
				616	wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
				617	bdev, sector);
				618	}
				619
				620	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
				621	if (iop)
				622	atomic_inc(&iop->write_count);
				623	if (bio_full(wpc->ioend->io_bio))
				624	xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
				625	__bio_add_page(wpc->ioend->io_bio, page, len, poff);
				626	}
				627
				628	wpc->ioend->io_size += len;
				629	}
				630
				631	STATIC void
				632	xfs_vm_invalidatepage(
				633	struct page *page,
				634	unsigned int offset,
				635	unsigned int length)
				636	{
				637	trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
				638	iomap_invalidatepage(page, offset, length);
				639	}
				640
				641	/*
				642	* If the page has delalloc blocks on it, we need to punch them out before we
				643	* invalidate the page. If we don't, we leave a stale delalloc mapping on the
				644	* inode that can trip up a later direct I/O read operation on the same region.
				645	*
				646	* We prevent this by truncating away the delalloc regions on the page. Because
				647	* they are delalloc, we can do this without needing a transaction. Indeed - if
				648	* we get ENOSPC errors, we have to be able to do this truncation without a
				649	* transaction as there is no space left for block reservation (typically why we
				650	* see a ENOSPC in writeback).
				651	*/
				652	STATIC void
				653	xfs_aops_discard_page(
				654	struct page *page)
				655	{
				656	struct inode *inode = page->mapping->host;
				657	struct xfs_inode *ip = XFS_I(inode);
				658	struct xfs_mount *mp = ip->i_mount;
				659	loff_t offset = page_offset(page);
				660	xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset);
				661	int error;
				662
				663	if (XFS_FORCED_SHUTDOWN(mp))
				664	goto out_invalidate;
				665
				666	xfs_alert(mp,
				667	"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
				668	page, ip->i_ino, offset);
				669
				670	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
				671	PAGE_SIZE / i_blocksize(inode));
				672	if (error && !XFS_FORCED_SHUTDOWN(mp))
				673	xfs_alert(mp, "page discard unable to remove delalloc mapping.");
				674	out_invalidate:
				675	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
				676	}
				677
				678	/*
				679	* We implement an immediate ioend submission policy here to avoid needing to
				680	* chain multiple ioends and hence nest mempool allocations which can violate
				681	* forward progress guarantees we need to provide. The current ioend we are
				682	* adding blocks to is cached on the writepage context, and if the new block
				683	* does not append to the cached ioend it will create a new ioend and cache that
				684	* instead.
				685	*
				686	* If a new ioend is created and cached, the old ioend is returned and queued
				687	* locally for submission once the entire page is processed or an error has been
				688	* detected. While ioends are submitted immediately after they are completed,
				689	* batching optimisations are provided by higher level block plugging.
				690	*
				691	* At the end of a writeback pass, there will be a cached ioend remaining on the
				692	* writepage context that the caller will need to submit.
				693	*/
				694	static int
				695	xfs_writepage_map(
				696	struct xfs_writepage_ctx *wpc,
				697	struct writeback_control *wbc,
				698	struct inode *inode,
				699	struct page *page,
				700	uint64_t end_offset)
				701	{
				702	LIST_HEAD(submit_list);
				703	struct iomap_page *iop = to_iomap_page(page);
				704	unsigned len = i_blocksize(inode);
				705	struct xfs_ioend ioend, next;
				706	uint64_t file_offset; /* file offset of page */
				707	int error = 0, count = 0, i;
				708
				709	ASSERT(iop \|\| i_blocksize(inode) == PAGE_SIZE);
				710	ASSERT(!iop \|\| atomic_read(&iop->write_count) == 0);
				711
				712	/*
				713	* Walk through the page to find areas to write back. If we run off the
				714	* end of the current map or find the current map invalid, grab a new
				715	* one.
				716	*/
				717	for (i = 0, file_offset = page_offset(page);
				718	i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
				719	i++, file_offset += len) {
				720	if (iop && !test_bit(i, iop->uptodate))
				721	continue;
				722
				723	error = xfs_map_blocks(wpc, inode, file_offset);
				724	if (error)
				725	break;
				726	if (wpc->io_type == XFS_IO_HOLE)
				727	continue;
				728	xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
				729	&submit_list);
				730	count++;
				731	}
				732
				733	ASSERT(wpc->ioend \|\| list_empty(&submit_list));
				734	ASSERT(PageLocked(page));
				735	ASSERT(!PageWriteback(page));
				736
				737	/*
				738	* On error, we have to fail the ioend here because we may have set
				739	* pages under writeback, we have to make sure we run IO completion to
				740	* mark the error state of the IO appropriately, so we can't cancel the
				741	* ioend directly here. That means we have to mark this page as under
				742	* writeback if we included any blocks from it in the ioend chain so
				743	* that completion treats it correctly.
				744	*
				745	* If we didn't include the page in the ioend, the on error we can
				746	* simply discard and unlock it as there are no other users of the page
				747	* now. The caller will still need to trigger submission of outstanding
				748	* ioends on the writepage context so they are treated correctly on
				749	* error.
				750	*/
				751	if (unlikely(error)) {
				752	if (!count) {
				753	xfs_aops_discard_page(page);
				754	ClearPageUptodate(page);
				755	unlock_page(page);
				756	goto done;
				757	}
				758
				759	/*
				760	* If the page was not fully cleaned, we need to ensure that the
				761	* higher layers come back to it correctly. That means we need
				762	* to keep the page dirty, and for WB_SYNC_ALL writeback we need
				763	* to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
				764	* so another attempt to write this page in this writeback sweep
				765	* will be made.
				766	*/
				767	set_page_writeback_keepwrite(page);
				768	} else {
				769	clear_page_dirty_for_io(page);
				770	set_page_writeback(page);
				771	}
				772
				773	unlock_page(page);
				774
				775	/*
				776	* Preserve the original error if there was one, otherwise catch
				777	* submission errors here and propagate into subsequent ioend
				778	* submissions.
				779	*/
				780	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
				781	int error2;
				782
				783	list_del_init(&ioend->io_list);
				784	error2 = xfs_submit_ioend(wbc, ioend, error);
				785	if (error2 && !error)
				786	error = error2;
				787	}
				788
				789	/*
				790	* We can end up here with no error and nothing to write only if we race
				791	* with a partial page truncate on a sub-page block sized filesystem.
				792	*/
				793	if (!count)
				794	end_page_writeback(page);
				795	done:
				796	mapping_set_error(page->mapping, error);
				797	return error;
				798	}
				799
				800	/*
				801	* Write out a dirty page.
				802	*
				803	* For delalloc space on the page we need to allocate space and flush it.
				804	* For unwritten space on the page we need to start the conversion to
				805	* regular allocated space.
				806	*/
				807	STATIC int
				808	xfs_do_writepage(
				809	struct page *page,
				810	struct writeback_control *wbc,
				811	void *data)
				812	{
				813	struct xfs_writepage_ctx *wpc = data;
				814	struct inode *inode = page->mapping->host;
				815	loff_t offset;
				816	uint64_t end_offset;
				817	pgoff_t end_index;
				818
				819	trace_xfs_writepage(inode, page, 0, 0);
				820
				821	/*
				822	* Refuse to write the page out if we are called from reclaim context.
				823	*
				824	* This avoids stack overflows when called from deeply used stacks in
				825	* random callers for direct reclaim or memcg reclaim. We explicitly
				826	* allow reclaim from kswapd as the stack usage there is relatively low.
				827	*
				828	* This should never happen except in the case of a VM regression so
				829	* warn about it.
				830	*/
				831	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC\|PF_KSWAPD)) ==
				832	PF_MEMALLOC))
				833	goto redirty;
				834
				835	/*
				836	* Given that we do not allow direct reclaim to call us, we should
				837	* never be called while in a filesystem transaction.
				838	*/
				839	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
				840	goto redirty;
				841
				842	/*
				843	* Is this page beyond the end of the file?
				844	*
				845	* The page index is less than the end_index, adjust the end_offset
				846	* to the highest offset that this page should represent.
				847	* -----------------------------------------------------
				848	* \| file mapping \| <EOF> \|
				849	* -----------------------------------------------------
				850	* \| Page ... \| Page N-2 \| Page N-1 \| Page N \| \|
				851	* ^--------------------------------^----------\|--------
				852	* \| desired writeback range \| see else \|
				853	* ---------------------------------^------------------\|
				854	*/
				855	offset = i_size_read(inode);
				856	end_index = offset >> PAGE_SHIFT;
				857	if (page->index < end_index)
				858	end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
				859	else {
				860	/*
				861	* Check whether the page to write out is beyond or straddles
				862	* i_size or not.
				863	* -------------------------------------------------------
				864	* \| file mapping \| <EOF> \|
				865	* -------------------------------------------------------
				866	* \| Page ... \| Page N-2 \| Page N-1 \| Page N \| Beyond \|
				867	* ^--------------------------------^-----------\|---------
				868	* \| \| Straddles \|
				869	* ---------------------------------^-----------\|--------\|
				870	*/
				871	unsigned offset_into_page = offset & (PAGE_SIZE - 1);
				872
				873	/*
				874	* Skip the page if it is fully outside i_size, e.g. due to a
				875	* truncate operation that is in progress. We must redirty the
				876	* page so that reclaim stops reclaiming it. Otherwise
				877	* xfs_vm_releasepage() is called on it and gets confused.
				878	*
				879	* Note that the end_index is unsigned long, it would overflow
				880	* if the given offset is greater than 16TB on 32-bit system
				881	* and if we do check the page is fully outside i_size or not
				882	* via "if (page->index >= end_index + 1)" as "end_index + 1"
				883	* will be evaluated to 0. Hence this page will be redirtied
				884	* and be written out repeatedly which would result in an
				885	* infinite loop, the user program that perform this operation
				886	* will hang. Instead, we can verify this situation by checking
				887	* if the page to write is totally beyond the i_size or if it's
				888	* offset is just equal to the EOF.
				889	*/
				890	if (page->index > end_index \|\|
				891	(page->index == end_index && offset_into_page == 0))
				892	goto redirty;
				893
				894	/*
				895	* The page straddles i_size. It must be zeroed out on each
				896	* and every writepage invocation because it may be mmapped.
				897	* "A file is mapped in multiples of the page size. For a file
				898	* that is not a multiple of the page size, the remaining
				899	* memory is zeroed when mapped, and writes to that region are
				900	* not written out to the file."
				901	*/
				902	zero_user_segment(page, offset_into_page, PAGE_SIZE);
				903
				904	/* Adjust the end_offset to the end of file */
				905	end_offset = offset;
				906	}
				907
				908	return xfs_writepage_map(wpc, wbc, inode, page, end_offset);
				909
				910	redirty:
				911	redirty_page_for_writepage(wbc, page);
				912	unlock_page(page);
				913	return 0;
				914	}
				915
				916	STATIC int
				917	xfs_vm_writepage(
				918	struct page *page,
				919	struct writeback_control *wbc)
				920	{
				921	struct xfs_writepage_ctx wpc = {
				922	.io_type = XFS_IO_INVALID,
				923	};
				924	int ret;
				925
				926	ret = xfs_do_writepage(page, wbc, &wpc);
				927	if (wpc.ioend)
				928	ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
				929	return ret;
				930	}
				931
				932	STATIC int
				933	xfs_vm_writepages(
				934	struct address_space *mapping,
				935	struct writeback_control *wbc)
				936	{
				937	struct xfs_writepage_ctx wpc = {
				938	.io_type = XFS_IO_INVALID,
				939	};
				940	int ret;
				941
				942	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
				943	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
				944	if (wpc.ioend)
				945	ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
				946	return ret;
				947	}
				948
				949	STATIC int
				950	xfs_dax_writepages(
				951	struct address_space *mapping,
				952	struct writeback_control *wbc)
				953	{
				954	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
				955	return dax_writeback_mapping_range(mapping,
				956	xfs_find_bdev_for_inode(mapping->host), wbc);
				957	}
				958
				959	STATIC int
				960	xfs_vm_releasepage(
				961	struct page *page,
				962	gfp_t gfp_mask)
				963	{
				964	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
				965	return iomap_releasepage(page, gfp_mask);
				966	}
				967
				968	STATIC sector_t
				969	xfs_vm_bmap(
				970	struct address_space *mapping,
				971	sector_t block)
				972	{
				973	struct xfs_inode *ip = XFS_I(mapping->host);
				974
				975	trace_xfs_vm_bmap(ip);
				976
				977	/*
				978	* The swap code (ab-)uses ->bmap to get a block mapping and then
				979	* bypasses the file system for actual I/O. We really can't allow
				980	* that on reflinks inodes, so we have to skip out here. And yes,
				981	* 0 is the magic code for a bmap error.
				982	*
				983	* Since we don't pass back blockdev info, we can't return bmap
				984	* information for rt files either.
				985	*/
				986	if (xfs_is_reflink_inode(ip) \|\| XFS_IS_REALTIME_INODE(ip))
				987	return 0;
				988	return iomap_bmap(mapping, block, &xfs_iomap_ops);
				989	}
				990
				991	STATIC int
				992	xfs_vm_readpage(
				993	struct file *unused,
				994	struct page *page)
				995	{
				996	trace_xfs_vm_readpage(page->mapping->host, 1);
				997	return iomap_readpage(page, &xfs_iomap_ops);
				998	}
				999
				1000	STATIC int
				1001	xfs_vm_readpages(
				1002	struct file *unused,
				1003	struct address_space *mapping,
				1004	struct list_head *pages,
				1005	unsigned nr_pages)
				1006	{
				1007	trace_xfs_vm_readpages(mapping->host, nr_pages);
				1008	return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
				1009	}
				1010
				1011	static int
				1012	xfs_iomap_swapfile_activate(
				1013	struct swap_info_struct *sis,
				1014	struct file *swap_file,
				1015	sector_t *span)
				1016	{
				1017	sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
				1018	return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
				1019	}
				1020
				1021	const struct address_space_operations xfs_address_space_operations = {
				1022	.readpage = xfs_vm_readpage,
				1023	.readpages = xfs_vm_readpages,
				1024	.writepage = xfs_vm_writepage,
				1025	.writepages = xfs_vm_writepages,
				1026	.set_page_dirty = iomap_set_page_dirty,
				1027	.releasepage = xfs_vm_releasepage,
				1028	.invalidatepage = xfs_vm_invalidatepage,
				1029	.bmap = xfs_vm_bmap,
				1030	.direct_IO = noop_direct_IO,
				1031	.migratepage = iomap_migrate_page,
				1032	.is_partially_uptodate = iomap_is_partially_uptodate,
				1033	.error_remove_page = generic_error_remove_page,
				1034	.swap_activate = xfs_iomap_swapfile_activate,
				1035	};
				1036
				1037	const struct address_space_operations xfs_dax_aops = {
				1038	.writepages = xfs_dax_writepages,
				1039	.direct_IO = noop_direct_IO,
				1040	.set_page_dirty = noop_set_page_dirty,
				1041	.invalidatepage = noop_invalidatepage,
				1042	.swap_activate = xfs_iomap_swapfile_activate,
				1043	};