1// SPDX-License-Identifier: GPL-2.0-or-later
2/* -*- mode: c; c-basic-offset: 8; -*-
3 * vim: noexpandtab sw=8 ts=8 sts=0:
4 *
5 * file.c
6 *
7 * File open, close, extend, truncate
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 */
11
12#include <linux/capability.h>
13#include <linux/fs.h>
14#include <linux/types.h>
15#include <linux/slab.h>
16#include <linux/highmem.h>
17#include <linux/pagemap.h>
18#include <linux/uio.h>
19#include <linux/sched.h>
20#include <linux/splice.h>
21#include <linux/mount.h>
22#include <linux/writeback.h>
23#include <linux/falloc.h>
24#include <linux/quotaops.h>
25#include <linux/blkdev.h>
26#include <linux/backing-dev.h>
27
28#include <cluster/masklog.h>
29
30#include "ocfs2.h"
31
32#include "alloc.h"
33#include "aops.h"
34#include "dir.h"
35#include "dlmglue.h"
36#include "extent_map.h"
37#include "file.h"
38#include "sysfile.h"
39#include "inode.h"
40#include "ioctl.h"
41#include "journal.h"
42#include "locks.h"
43#include "mmap.h"
44#include "suballoc.h"
45#include "super.h"
46#include "xattr.h"
47#include "acl.h"
48#include "quota.h"
49#include "refcounttree.h"
50#include "ocfs2_trace.h"
51
52#include "buffer_head_io.h"
53
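/*
 * Allocate the per-open ocfs2_file_private structure, initialize its
 * fp_flock file-lock resource, and attach it to file->private_data.
 */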
54static int ocfs2_init_file_private(struct inode *inode, struct file *file)
55{
56 struct ocfs2_file_private *fp;
57
58 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
59 if (!fp)
60 return -ENOMEM;
61
62 fp->fp_file = file;
63 mutex_init(&fp->fp_mutex);
64 ocfs2_file_lock_res_init(&fp->fp_flock, fp);
65 file->private_data = fp;
66
67 return 0;
68}
69
70static void ocfs2_free_file_private(struct inode *inode, struct file *file)
71{
72 struct ocfs2_file_private *fp = file->private_data;
73 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
74
75 if (fp) {
76 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
77 ocfs2_lock_res_free(&fp->fp_flock);
78 kfree(fp);
79 file->private_data = NULL;
80 }
81}
82
83static int ocfs2_file_open(struct inode *inode, struct file *file)
84{
85 int status;
86 int mode = file->f_flags;
87 struct ocfs2_inode_info *oi = OCFS2_I(inode);
88
89 trace_ocfs2_file_open(inode, file, file->f_path.dentry,
90 (unsigned long long)oi->ip_blkno,
91 file->f_path.dentry->d_name.len,
92 file->f_path.dentry->d_name.name, mode);
93
94 if (file->f_mode & FMODE_WRITE) {
95 status = dquot_initialize(inode);
96 if (status)
97 goto leave;
98 }
99
100 spin_lock(&oi->ip_lock);
101
102 /* Check that the inode hasn't been wiped from disk by another
103 * node. If it hasn't then we're safe as long as we hold the
104 * spin lock until our increment of open count. */
105 if (oi->ip_flags & OCFS2_INODE_DELETED) {
106 spin_unlock(&oi->ip_lock);
107
108 status = -ENOENT;
109 goto leave;
110 }
111
112 if (mode & O_DIRECT)
113 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
114
115 oi->ip_open_count++;
116 spin_unlock(&oi->ip_lock);
117
118 status = ocfs2_init_file_private(inode, file);
119 if (status) {
120 /*
121 * We want to set open count back if we're failing the
122 * open.
123 */
124 spin_lock(&oi->ip_lock);
125 oi->ip_open_count--;
126 spin_unlock(&oi->ip_lock);
127 }
128
129 file->f_mode |= FMODE_NOWAIT;
130
131leave:
132 return status;
133}
134
135static int ocfs2_file_release(struct inode *inode, struct file *file)
136{
137 struct ocfs2_inode_info *oi = OCFS2_I(inode);
138
139 spin_lock(&oi->ip_lock);
140 if (!--oi->ip_open_count)
141 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
142
143 trace_ocfs2_file_release(inode, file, file->f_path.dentry,
144 oi->ip_blkno,
145 file->f_path.dentry->d_name.len,
146 file->f_path.dentry->d_name.name,
147 oi->ip_open_count);
148 spin_unlock(&oi->ip_lock);
149
150 ocfs2_free_file_private(inode, file);
151
152 return 0;
153}
154
155static int ocfs2_dir_open(struct inode *inode, struct file *file)
156{
157 return ocfs2_init_file_private(inode, file);
158}
159
160static int ocfs2_dir_release(struct inode *inode, struct file *file)
161{
162 ocfs2_free_file_private(inode, file);
163 return 0;
164}
165
166static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
167 int datasync)
168{
169 int err = 0;
170 struct inode *inode = file->f_mapping->host;
171 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
172 struct ocfs2_inode_info *oi = OCFS2_I(inode);
173 journal_t *journal = osb->journal->j_journal;
174 int ret;
175 tid_t commit_tid;
176 bool needs_barrier = false;
177
178 trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
179 oi->ip_blkno,
180 file->f_path.dentry->d_name.len,
181 file->f_path.dentry->d_name.name,
182 (unsigned long long)datasync);
183
184 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
185 return -EROFS;
186
187 err = file_write_and_wait_range(file, start, end);
188 if (err)
189 return err;
190
191 commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
192 if (journal->j_flags & JBD2_BARRIER &&
193 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
194 needs_barrier = true;
195 err = jbd2_complete_transaction(journal, commit_tid);
196 if (needs_barrier) {
197 ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
198 if (!err)
199 err = ret;
200 }
201
202 if (err)
203 mlog_errno(err);
204
205 return (err < 0) ? -EIO : 0;
206}
207
208int ocfs2_should_update_atime(struct inode *inode,
209 struct vfsmount *vfsmnt)
210{
211 struct timespec64 now;
212 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
213
214 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
215 return 0;
216
217 if ((inode->i_flags & S_NOATIME) ||
218 ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
219 return 0;
220
221 /*
222 * We can be called with no vfsmnt structure - NFSD will
223 * sometimes do this.
224 *
225 * Note that our action here is different than touch_atime() -
226 * if we can't tell whether this is a noatime mount, then we
227 * don't know whether to trust the value of s_atime_quantum.
228 */
229 if (vfsmnt == NULL)
230 return 0;
231
232 if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
233 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
234 return 0;
235
236 if (vfsmnt->mnt_flags & MNT_RELATIME) {
237 if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
238 (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0))
239 return 1;
240
241 return 0;
242 }
243
244 now = current_time(inode);
245 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
246 return 0;
247 else
248 return 1;
249}
250
251int ocfs2_update_inode_atime(struct inode *inode,
252 struct buffer_head *bh)
253{
254 int ret;
255 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
256 handle_t *handle;
257 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
258
259 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
260 if (IS_ERR(handle)) {
261 ret = PTR_ERR(handle);
262 mlog_errno(ret);
263 goto out;
264 }
265
266 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
267 OCFS2_JOURNAL_ACCESS_WRITE);
268 if (ret) {
269 mlog_errno(ret);
270 goto out_commit;
271 }
272
273 /*
274 * Don't use ocfs2_mark_inode_dirty() here as we don't always
275 * have i_mutex to guard against concurrent changes to other
276 * inode fields.
277 */
278 inode->i_atime = current_time(inode);
279 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
280 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
281 ocfs2_update_inode_fsync_trans(handle, inode, 0);
282 ocfs2_journal_dirty(handle, bh);
283
284out_commit:
285 ocfs2_commit_trans(osb, handle);
286out:
287 return ret;
288}
289
290int ocfs2_set_inode_size(handle_t *handle,
291 struct inode *inode,
292 struct buffer_head *fe_bh,
293 u64 new_i_size)
294{
295 int status;
296
297 i_size_write(inode, new_i_size);
298 inode->i_blocks = ocfs2_inode_sector_count(inode);
299 inode->i_ctime = inode->i_mtime = current_time(inode);
300
301 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
302 if (status < 0) {
303 mlog_errno(status);
304 goto bail;
305 }
306
307bail:
308 return status;
309}
310
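/*
 * Update i_size to new_i_size and mark the inode dirty in a small,
 * self-contained transaction.
 */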
311int ocfs2_simple_size_update(struct inode *inode,
312 struct buffer_head *di_bh,
313 u64 new_i_size)
314{
315 int ret;
316 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
317 handle_t *handle = NULL;
318
319 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
320 if (IS_ERR(handle)) {
321 ret = PTR_ERR(handle);
322 mlog_errno(ret);
323 goto out;
324 }
325
326 ret = ocfs2_set_inode_size(handle, inode, di_bh,
327 new_i_size);
328 if (ret < 0)
329 mlog_errno(ret);
330
331 ocfs2_update_inode_fsync_trans(handle, inode, 0);
332 ocfs2_commit_trans(osb, handle);
333out:
334 return ret;
335}
336
337static int ocfs2_cow_file_pos(struct inode *inode,
338 struct buffer_head *fe_bh,
339 u64 offset)
340{
341 int status;
342 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
343 unsigned int num_clusters = 0;
344 unsigned int ext_flags = 0;
345
346 /*
347 * If the new offset is aligned to a cluster boundary, there is
348 * no space for ocfs2_zero_range_for_truncate to fill, so no need to
349 * CoW either.
350 */
351 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
352 return 0;
353
354 status = ocfs2_get_clusters(inode, cpos, &phys,
355 &num_clusters, &ext_flags);
356 if (status) {
357 mlog_errno(status);
358 goto out;
359 }
360
361 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
362 goto out;
363
364 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
365
366out:
367 return status;
368}
369
370static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
371 struct inode *inode,
372 struct buffer_head *fe_bh,
373 u64 new_i_size)
374{
375 int status;
376 handle_t *handle;
377 struct ocfs2_dinode *di;
378 u64 cluster_bytes;
379
380 /*
381 * We need to CoW the cluster containing the offset if it is reflinked,
382 * since we will call ocfs2_zero_range_for_truncate later, which will
383 * write "0" from the offset to the end of the cluster.
384 */
385 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
386 if (status) {
387 mlog_errno(status);
388 return status;
389 }
390
391 /* TODO: This needs to actually orphan the inode in this
392 * transaction. */
393
394 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
395 if (IS_ERR(handle)) {
396 status = PTR_ERR(handle);
397 mlog_errno(status);
398 goto out;
399 }
400
401 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
402 OCFS2_JOURNAL_ACCESS_WRITE);
403 if (status < 0) {
404 mlog_errno(status);
405 goto out_commit;
406 }
407
408 /*
409 * Do this before setting i_size.
410 */
411 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
412 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
413 cluster_bytes);
414 if (status) {
415 mlog_errno(status);
416 goto out_commit;
417 }
418
419 i_size_write(inode, new_i_size);
420 inode->i_ctime = inode->i_mtime = current_time(inode);
421
422 di = (struct ocfs2_dinode *) fe_bh->b_data;
423 di->i_size = cpu_to_le64(new_i_size);
424 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
425 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
426 ocfs2_update_inode_fsync_trans(handle, inode, 0);
427
428 ocfs2_journal_dirty(handle, fe_bh);
429
430out_commit:
431 ocfs2_commit_trans(osb, handle);
432out:
433 return status;
434}
435
436int ocfs2_truncate_file(struct inode *inode,
437 struct buffer_head *di_bh,
438 u64 new_i_size)
439{
440 int status = 0;
441 struct ocfs2_dinode *fe = NULL;
442 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
443
444 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
445 * already validated it */
446 fe = (struct ocfs2_dinode *) di_bh->b_data;
447
448 trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
449 (unsigned long long)le64_to_cpu(fe->i_size),
450 (unsigned long long)new_i_size);
451
452 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
453 "Inode %llu, inode i_size = %lld != di "
454 "i_size = %llu, i_flags = 0x%x\n",
455 (unsigned long long)OCFS2_I(inode)->ip_blkno,
456 i_size_read(inode),
457 (unsigned long long)le64_to_cpu(fe->i_size),
458 le32_to_cpu(fe->i_flags));
459
460 if (new_i_size > le64_to_cpu(fe->i_size)) {
461 trace_ocfs2_truncate_file_error(
462 (unsigned long long)le64_to_cpu(fe->i_size),
463 (unsigned long long)new_i_size);
464 status = -EINVAL;
465 mlog_errno(status);
466 goto bail;
467 }
468
469 down_write(&OCFS2_I(inode)->ip_alloc_sem);
470
471 ocfs2_resv_discard(&osb->osb_la_resmap,
472 &OCFS2_I(inode)->ip_la_data_resv);
473
474 /*
475 * The inode lock forced other nodes to sync and drop their
476 * pages, which (correctly) happens even if we have a truncate
477 * without allocation change - ocfs2 cluster sizes can be much
478 * greater than page size, so we have to truncate them
479 * anyway.
480 */
481
482 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
483 unmap_mapping_range(inode->i_mapping,
484 new_i_size + PAGE_SIZE - 1, 0, 1);
485 truncate_inode_pages(inode->i_mapping, new_i_size);
486 status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
487 i_size_read(inode), 1);
488 if (status)
489 mlog_errno(status);
490
491 goto bail_unlock_sem;
492 }
493
494 /* Alright, we're going to need to do a full-blown alloc size
495 * change. Orphan the inode so that recovery can complete the
496 * truncate if necessary. This does the task of marking
497 * i_size. */
498 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
499 if (status < 0) {
500 mlog_errno(status);
501 goto bail_unlock_sem;
502 }
503
504 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
505 truncate_inode_pages(inode->i_mapping, new_i_size);
506
507 status = ocfs2_commit_truncate(osb, inode, di_bh);
508 if (status < 0) {
509 mlog_errno(status);
510 goto bail_unlock_sem;
511 }
512
513 /* TODO: orphan dir cleanup here. */
514bail_unlock_sem:
515 up_write(&OCFS2_I(inode)->ip_alloc_sem);
516
517bail:
518 if (!status && OCFS2_I(inode)->ip_clusters == 0)
519 status = ocfs2_try_remove_refcount_tree(inode, di_bh);
520
521 return status;
522}
523
524/*
525 * Extend the file allocation only here.
526 * We'll update all the on-disk structures and oip->alloc_size.
527 *
528 * The caller is expected to hold the locks, have a transaction started,
529 * and have enough data / metadata reservations in the contexts.
530 *
531 * Will return -EAGAIN, and a reason, if a restart is needed.
532 * If passed in, *reason_ret will always be set, even on error.
533 */
534int ocfs2_add_inode_data(struct ocfs2_super *osb,
535 struct inode *inode,
536 u32 *logical_offset,
537 u32 clusters_to_add,
538 int mark_unwritten,
539 struct buffer_head *fe_bh,
540 handle_t *handle,
541 struct ocfs2_alloc_context *data_ac,
542 struct ocfs2_alloc_context *meta_ac,
543 enum ocfs2_alloc_restarted *reason_ret)
544{
545 int ret;
546 struct ocfs2_extent_tree et;
547
548 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
549 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
550 clusters_to_add, mark_unwritten,
551 data_ac, meta_ac, reason_ret);
552
553 return ret;
554}
555
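/*
 * Extend the inode's data allocation by 'clusters_to_add' clusters
 * starting at logical cluster 'logical_start', restarting the
 * transaction (or the whole pass) when the allocators require it.
 */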
556static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
557 u32 clusters_to_add, int mark_unwritten)
558{
559 int status = 0;
560 int restart_func = 0;
561 int credits;
562 u32 prev_clusters;
563 struct buffer_head *bh = NULL;
564 struct ocfs2_dinode *fe = NULL;
565 handle_t *handle = NULL;
566 struct ocfs2_alloc_context *data_ac = NULL;
567 struct ocfs2_alloc_context *meta_ac = NULL;
568 enum ocfs2_alloc_restarted why = RESTART_NONE;
569 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
570 struct ocfs2_extent_tree et;
571 int did_quota = 0;
572
573 /*
574 * Unwritten extents only exist for file systems which
575 * support holes.
576 */
577 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
578
579 status = ocfs2_read_inode_block(inode, &bh);
580 if (status < 0) {
581 mlog_errno(status);
582 goto leave;
583 }
584 fe = (struct ocfs2_dinode *) bh->b_data;
585
586restart_all:
587 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
588
589 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
590 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
591 &data_ac, &meta_ac);
592 if (status) {
593 mlog_errno(status);
594 goto leave;
595 }
596
597 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
598 handle = ocfs2_start_trans(osb, credits);
599 if (IS_ERR(handle)) {
600 status = PTR_ERR(handle);
601 handle = NULL;
602 mlog_errno(status);
603 goto leave;
604 }
605
606restarted_transaction:
607 trace_ocfs2_extend_allocation(
608 (unsigned long long)OCFS2_I(inode)->ip_blkno,
609 (unsigned long long)i_size_read(inode),
610 le32_to_cpu(fe->i_clusters), clusters_to_add,
611 why, restart_func);
612
613 status = dquot_alloc_space_nodirty(inode,
614 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
615 if (status)
616 goto leave;
617 did_quota = 1;
618
619 /* reserve a write to the file entry early on - that way if we
620 * run out of credits in the allocation path, we can still
621 * update i_size. */
622 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
623 OCFS2_JOURNAL_ACCESS_WRITE);
624 if (status < 0) {
625 mlog_errno(status);
626 goto leave;
627 }
628
629 prev_clusters = OCFS2_I(inode)->ip_clusters;
630
631 status = ocfs2_add_inode_data(osb,
632 inode,
633 &logical_start,
634 clusters_to_add,
635 mark_unwritten,
636 bh,
637 handle,
638 data_ac,
639 meta_ac,
640 &why);
641 if ((status < 0) && (status != -EAGAIN)) {
642 if (status != -ENOSPC)
643 mlog_errno(status);
644 goto leave;
645 }
646 ocfs2_update_inode_fsync_trans(handle, inode, 1);
647 ocfs2_journal_dirty(handle, bh);
648
649 spin_lock(&OCFS2_I(inode)->ip_lock);
650 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
651 spin_unlock(&OCFS2_I(inode)->ip_lock);
652 /* Release unused quota reservation */
653 dquot_free_space(inode,
654 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
655 did_quota = 0;
656
657 if (why != RESTART_NONE && clusters_to_add) {
658 if (why == RESTART_META) {
659 restart_func = 1;
660 status = 0;
661 } else {
662 BUG_ON(why != RESTART_TRANS);
663
664 status = ocfs2_allocate_extend_trans(handle, 1);
665 if (status < 0) {
666 /* handle still has to be committed at
667 * this point. */
668 status = -ENOMEM;
669 mlog_errno(status);
670 goto leave;
671 }
672 goto restarted_transaction;
673 }
674 }
675
676 trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
677 le32_to_cpu(fe->i_clusters),
678 (unsigned long long)le64_to_cpu(fe->i_size),
679 OCFS2_I(inode)->ip_clusters,
680 (unsigned long long)i_size_read(inode));
681
682leave:
683 if (status < 0 && did_quota)
684 dquot_free_space(inode,
685 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
686 if (handle) {
687 ocfs2_commit_trans(osb, handle);
688 handle = NULL;
689 }
690 if (data_ac) {
691 ocfs2_free_alloc_context(data_ac);
692 data_ac = NULL;
693 }
694 if (meta_ac) {
695 ocfs2_free_alloc_context(meta_ac);
696 meta_ac = NULL;
697 }
698 if ((!status) && restart_func) {
699 restart_func = 0;
700 goto restart_all;
701 }
702 brelse(bh);
703 bh = NULL;
704
705 return status;
706}
707
708/*
709 * While a write will already be ordering the data, a truncate will not.
710 * Thus, we need to explicitly order the zeroed pages.
711 */
712static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
713 struct buffer_head *di_bh,
714 loff_t start_byte,
715 loff_t length)
716{
717 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
718 handle_t *handle = NULL;
719 int ret = 0;
720
721 if (!ocfs2_should_order_data(inode))
722 goto out;
723
724 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
725 if (IS_ERR(handle)) {
726 ret = -ENOMEM;
727 mlog_errno(ret);
728 goto out;
729 }
730
731 ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
732 if (ret < 0) {
733 mlog_errno(ret);
734 goto out;
735 }
736
737 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
738 OCFS2_JOURNAL_ACCESS_WRITE);
739 if (ret)
740 mlog_errno(ret);
741 ocfs2_update_inode_fsync_trans(handle, inode, 1);
742
743out:
744 if (ret) {
745 if (!IS_ERR(handle))
746 ocfs2_commit_trans(osb, handle);
747 handle = ERR_PTR(ret);
748 }
749 return handle;
750}
751
752/* Some parts of this taken from generic_cont_expand, which turned out
753 * to be too fragile to do exactly what we need without us having to
754 * worry about recursive locking in ->write_begin() and ->write_end(). */
755static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
756 u64 abs_to, struct buffer_head *di_bh)
757{
758 struct address_space *mapping = inode->i_mapping;
759 struct page *page;
760 unsigned long index = abs_from >> PAGE_SHIFT;
761 handle_t *handle;
762 int ret = 0;
763 unsigned zero_from, zero_to, block_start, block_end;
764 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
765
766 BUG_ON(abs_from >= abs_to);
767 BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
768 BUG_ON(abs_from & (inode->i_blkbits - 1));
769
770 handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
771 abs_from,
772 abs_to - abs_from);
773 if (IS_ERR(handle)) {
774 ret = PTR_ERR(handle);
775 goto out;
776 }
777
778 page = find_or_create_page(mapping, index, GFP_NOFS);
779 if (!page) {
780 ret = -ENOMEM;
781 mlog_errno(ret);
782 goto out_commit_trans;
783 }
784
785 /* Get the offsets within the page that we want to zero */
786 zero_from = abs_from & (PAGE_SIZE - 1);
787 zero_to = abs_to & (PAGE_SIZE - 1);
788 if (!zero_to)
789 zero_to = PAGE_SIZE;
790
791 trace_ocfs2_write_zero_page(
792 (unsigned long long)OCFS2_I(inode)->ip_blkno,
793 (unsigned long long)abs_from,
794 (unsigned long long)abs_to,
795 index, zero_from, zero_to);
796
797 /* We know that zero_from is block aligned */
798 for (block_start = zero_from; block_start < zero_to;
799 block_start = block_end) {
800 block_end = block_start + i_blocksize(inode);
801
802 /*
803 * block_start is block-aligned. Bump it by one to force
804 * __block_write_begin and block_commit_write to zero the
805 * whole block.
806 */
807 ret = __block_write_begin(page, block_start + 1, 0,
808 ocfs2_get_block);
809 if (ret < 0) {
810 mlog_errno(ret);
811 goto out_unlock;
812 }
813
814
815 /* must not update i_size! */
816 ret = block_commit_write(page, block_start + 1,
817 block_start + 1);
818 if (ret < 0)
819 mlog_errno(ret);
820 else
821 ret = 0;
822 }
823
824 /*
825 * fs-writeback will release dirty pages whose offsets are beyond the
826 * inode size without taking the page lock; the release happens at
827 * block_write_full_page().
828 */
829 i_size_write(inode, abs_to);
830 inode->i_blocks = ocfs2_inode_sector_count(inode);
831 di->i_size = cpu_to_le64((u64)i_size_read(inode));
832 inode->i_mtime = inode->i_ctime = current_time(inode);
833 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
834 di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
835 di->i_mtime_nsec = di->i_ctime_nsec;
836 if (handle) {
837 ocfs2_journal_dirty(handle, di_bh);
838 ocfs2_update_inode_fsync_trans(handle, inode, 1);
839 }
840
841out_unlock:
842 unlock_page(page);
843 put_page(page);
844out_commit_trans:
845 if (handle)
846 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
847out:
848 return ret;
849}
850
851/*
852 * Find the next range to zero. We do this in terms of bytes because
853 * that's what ocfs2_zero_extend() wants, and it is dealing with the
854 * pagecache. We may return multiple extents.
855 *
856 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what
857 * needs to be zeroed. range_start and range_end return the next zeroing
858 * range. A subsequent call should pass the previous range_end as its
859 * zero_start. If range_end is 0, there's nothing to do.
860 *
861 * Unwritten extents are skipped over. Refcounted extents are CoW'd.
862 */
863static int ocfs2_zero_extend_get_range(struct inode *inode,
864 struct buffer_head *di_bh,
865 u64 zero_start, u64 zero_end,
866 u64 *range_start, u64 *range_end)
867{
868 int rc = 0, needs_cow = 0;
869 u32 p_cpos, zero_clusters = 0;
870 u32 zero_cpos =
871 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
872 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
873 unsigned int num_clusters = 0;
874 unsigned int ext_flags = 0;
875
876 while (zero_cpos < last_cpos) {
877 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
878 &num_clusters, &ext_flags);
879 if (rc) {
880 mlog_errno(rc);
881 goto out;
882 }
883
884 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
885 zero_clusters = num_clusters;
886 if (ext_flags & OCFS2_EXT_REFCOUNTED)
887 needs_cow = 1;
888 break;
889 }
890
891 zero_cpos += num_clusters;
892 }
893 if (!zero_clusters) {
894 *range_end = 0;
895 goto out;
896 }
897
898 while ((zero_cpos + zero_clusters) < last_cpos) {
899 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
900 &p_cpos, &num_clusters,
901 &ext_flags);
902 if (rc) {
903 mlog_errno(rc);
904 goto out;
905 }
906
907 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
908 break;
909 if (ext_flags & OCFS2_EXT_REFCOUNTED)
910 needs_cow = 1;
911 zero_clusters += num_clusters;
912 }
913 if ((zero_cpos + zero_clusters) > last_cpos)
914 zero_clusters = last_cpos - zero_cpos;
915
916 if (needs_cow) {
917 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
918 zero_clusters, UINT_MAX);
919 if (rc) {
920 mlog_errno(rc);
921 goto out;
922 }
923 }
924
925 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
926 *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
927 zero_cpos + zero_clusters);
928
929out:
930 return rc;
931}
932
933/*
934 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
935 * has made sure that the entire range needs zeroing.
936 */
937static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
938 u64 range_end, struct buffer_head *di_bh)
939{
940 int rc = 0;
941 u64 next_pos;
942 u64 zero_pos = range_start;
943
944 trace_ocfs2_zero_extend_range(
945 (unsigned long long)OCFS2_I(inode)->ip_blkno,
946 (unsigned long long)range_start,
947 (unsigned long long)range_end);
948 BUG_ON(range_start >= range_end);
949
950 while (zero_pos < range_end) {
951 next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
952 if (next_pos > range_end)
953 next_pos = range_end;
954 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
955 if (rc < 0) {
956 mlog_errno(rc);
957 break;
958 }
959 zero_pos = next_pos;
960
961 /*
962 * Very large extends have the potential to lock up
963 * the cpu for extended periods of time.
964 */
965 cond_resched();
966 }
967
968 return rc;
969}
970
971int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
972 loff_t zero_to_size)
973{
974 int ret = 0;
975 u64 zero_start, range_start = 0, range_end = 0;
976 struct super_block *sb = inode->i_sb;
977
978 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
979 trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
980 (unsigned long long)zero_start,
981 (unsigned long long)i_size_read(inode));
982 while (zero_start < zero_to_size) {
983 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
984 zero_to_size,
985 &range_start,
986 &range_end);
987 if (ret) {
988 mlog_errno(ret);
989 break;
990 }
991 if (!range_end)
992 break;
993 /* Trim the ends */
994 if (range_start < zero_start)
995 range_start = zero_start;
996 if (range_end > zero_to_size)
997 range_end = zero_to_size;
998
999 ret = ocfs2_zero_extend_range(inode, range_start,
1000 range_end, di_bh);
1001 if (ret) {
1002 mlog_errno(ret);
1003 break;
1004 }
1005 zero_start = range_end;
1006 }
1007
1008 return ret;
1009}
1010
1011int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
1012 u64 new_i_size, u64 zero_to)
1013{
1014 int ret;
1015 u32 clusters_to_add;
1016 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1017
1018 /*
1019 * Only quota files call this without a bh, and they can't be
1020 * refcounted.
1021 */
1022 BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode));
1023 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1024
1025 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
1026 if (clusters_to_add < oi->ip_clusters)
1027 clusters_to_add = 0;
1028 else
1029 clusters_to_add -= oi->ip_clusters;
1030
1031 if (clusters_to_add) {
1032 ret = ocfs2_extend_allocation(inode, oi->ip_clusters,
1033 clusters_to_add, 0);
1034 if (ret) {
1035 mlog_errno(ret);
1036 goto out;
1037 }
1038 }
1039
1040 /*
1041 * Call this even if we don't add any clusters to the tree. We
1042 * still need to zero the area between the old i_size and the
1043 * new i_size.
1044 */
1045 ret = ocfs2_zero_extend(inode, di_bh, zero_to);
1046 if (ret < 0)
1047 mlog_errno(ret);
1048
1049out:
1050 return ret;
1051}
1052
1053static int ocfs2_extend_file(struct inode *inode,
1054 struct buffer_head *di_bh,
1055 u64 new_i_size)
1056{
1057 int ret = 0;
1058 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1059
1060 BUG_ON(!di_bh);
1061
1062 /* setattr sometimes calls us like this. */
1063 if (new_i_size == 0)
1064 goto out;
1065
1066 if (i_size_read(inode) == new_i_size)
1067 goto out;
1068 BUG_ON(new_i_size < i_size_read(inode));
1069
1070 /*
1071 * The alloc sem blocks people in read/write from reading our
1072 * allocation until we're done changing it. We depend on
1073 * i_mutex to block other extend/truncate calls while we're
1074 * here. We even have to hold it for sparse files because there
1075 * might be some tail zeroing.
1076 */
1077 down_write(&oi->ip_alloc_sem);
1078
1079 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1080 /*
1081 * We can optimize small extends by keeping the inode's
1082 * inline data.
1083 */
1084 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
1085 up_write(&oi->ip_alloc_sem);
1086 goto out_update_size;
1087 }
1088
1089 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1090 if (ret) {
1091 up_write(&oi->ip_alloc_sem);
1092 mlog_errno(ret);
1093 goto out;
1094 }
1095 }
1096
1097 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
1098 ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1099 else
1100 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1101 new_i_size);
1102
1103 up_write(&oi->ip_alloc_sem);
1104
1105 if (ret < 0) {
1106 mlog_errno(ret);
1107 goto out;
1108 }
1109
1110out_update_size:
1111 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
1112 if (ret < 0)
1113 mlog_errno(ret);
1114
1115out:
1116 return ret;
1117}
1118
1119int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1120{
1121 int status = 0, size_change;
1122 int inode_locked = 0;
1123 struct inode *inode = d_inode(dentry);
1124 struct super_block *sb = inode->i_sb;
1125 struct ocfs2_super *osb = OCFS2_SB(sb);
1126 struct buffer_head *bh = NULL;
1127 handle_t *handle = NULL;
1128 struct dquot *transfer_to[MAXQUOTAS] = { };
1129 int qtype;
1130 int had_lock;
1131 struct ocfs2_lock_holder oh;
1132
1133 trace_ocfs2_setattr(inode, dentry,
1134 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1135 dentry->d_name.len, dentry->d_name.name,
1136 attr->ia_valid,
1137 attr->ia_valid & ATTR_MODE ? attr->ia_mode : 0,
1138 attr->ia_valid & ATTR_UID ?
1139 from_kuid(&init_user_ns, attr->ia_uid) : 0,
1140 attr->ia_valid & ATTR_GID ?
1141 from_kgid(&init_user_ns, attr->ia_gid) : 0);
1142
1143 /* ensuring we don't even attempt to truncate a symlink */
1144 if (S_ISLNK(inode->i_mode))
1145 attr->ia_valid &= ~ATTR_SIZE;
1146
1147#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
1148 | ATTR_GID | ATTR_UID | ATTR_MODE)
1149 if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1150 return 0;
1151
1152 status = setattr_prepare(dentry, attr);
1153 if (status)
1154 return status;
1155
1156 if (is_quota_modification(inode, attr)) {
1157 status = dquot_initialize(inode);
1158 if (status)
1159 return status;
1160 }
1161 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
1162 if (size_change) {
1163 /*
1164 * Here we should wait for dio to finish before taking the inode
1165 * lock, to avoid a deadlock between ocfs2_setattr() and
1166 * ocfs2_dio_end_io_write()
1167 */
1168 inode_dio_wait(inode);
1169
1170 status = ocfs2_rw_lock(inode, 1);
1171 if (status < 0) {
1172 mlog_errno(status);
1173 goto bail;
1174 }
1175 }
1176
1177 had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
1178 if (had_lock < 0) {
1179 status = had_lock;
1180 goto bail_unlock_rw;
1181 } else if (had_lock) {
1182 /*
1183 * As far as we know, ocfs2_setattr() could only be the first
1184 * VFS entry point in the call chain of recursive cluster
1185 * locking issue.
1186 *
1187 * For instance:
1188 * chmod_common()
1189 * notify_change()
1190 * ocfs2_setattr()
1191 * posix_acl_chmod()
1192 * ocfs2_iop_get_acl()
1193 *
1194 * But, we're not 100% sure if it's always true, because the
1195 * ordering of the VFS entry points in the call chain is out
1196 * of our control. So, we'd better dump the stack here to
1197 * catch the other cases of recursive locking.
1198 */
1199 mlog(ML_ERROR, "Another case of recursive locking:\n");
1200 dump_stack();
1201 }
1202 inode_locked = 1;
1203
1204 if (size_change) {
1205 status = inode_newsize_ok(inode, attr->ia_size);
1206 if (status)
1207 goto bail_unlock;
1208
1209 if (i_size_read(inode) >= attr->ia_size) {
1210 if (ocfs2_should_order_data(inode)) {
1211 status = ocfs2_begin_ordered_truncate(inode,
1212 attr->ia_size);
1213 if (status)
1214 goto bail_unlock;
1215 }
1216 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1217 } else
1218 status = ocfs2_extend_file(inode, bh, attr->ia_size);
1219 if (status < 0) {
1220 if (status != -ENOSPC)
1221 mlog_errno(status);
1222 status = -ENOSPC;
1223 goto bail_unlock;
1224 }
1225 }
1226
1227 if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
1228 (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
1229 /*
1230 * Gather pointers to quota structures so that allocation /
1231 * freeing of quota structures happens here and not inside
1232 * dquot_transfer() where we have problems with lock ordering
1233 */
1234 if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
1235 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1236 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1237 transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
1238 if (IS_ERR(transfer_to[USRQUOTA])) {
1239 status = PTR_ERR(transfer_to[USRQUOTA]);
1240 transfer_to[USRQUOTA] = NULL;
1241 goto bail_unlock;
1242 }
1243 }
1244 if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
1245 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1246 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1247 transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
1248 if (IS_ERR(transfer_to[GRPQUOTA])) {
1249 status = PTR_ERR(transfer_to[GRPQUOTA]);
1250 transfer_to[GRPQUOTA] = NULL;
1251 goto bail_unlock;
1252 }
1253 }
1254 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1255 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1256 2 * ocfs2_quota_trans_credits(sb));
1257 if (IS_ERR(handle)) {
1258 status = PTR_ERR(handle);
1259 mlog_errno(status);
1260 goto bail_unlock_alloc;
1261 }
1262 status = __dquot_transfer(inode, transfer_to);
1263 if (status < 0)
1264 goto bail_commit;
1265 } else {
1266 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1267 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1268 if (IS_ERR(handle)) {
1269 status = PTR_ERR(handle);
1270 mlog_errno(status);
1271 goto bail_unlock_alloc;
1272 }
1273 }
1274
1275 setattr_copy(inode, attr);
1276 mark_inode_dirty(inode);
1277
1278 status = ocfs2_mark_inode_dirty(handle, inode, bh);
1279 if (status < 0)
1280 mlog_errno(status);
1281
1282bail_commit:
1283 ocfs2_commit_trans(osb, handle);
1284bail_unlock_alloc:
1285 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1286bail_unlock:
1287 if (status && inode_locked) {
1288 ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
1289 inode_locked = 0;
1290 }
1291bail_unlock_rw:
1292 if (size_change)
1293 ocfs2_rw_unlock(inode, 1);
1294bail:
1295
1296 /* Release quota pointers in case we acquired them */
1297 for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
1298 dqput(transfer_to[qtype]);
1299
1300 if (!status && attr->ia_valid & ATTR_MODE) {
1301 status = ocfs2_acl_chmod(inode, bh);
1302 if (status < 0)
1303 mlog_errno(status);
1304 }
1305 if (inode_locked)
1306 ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
1307
1308 brelse(bh);
1309 return status;
1310}
1311
1312int ocfs2_getattr(const struct path *path, struct kstat *stat,
1313 u32 request_mask, unsigned int flags)
1314{
1315 struct inode *inode = d_inode(path->dentry);
1316 struct super_block *sb = path->dentry->d_sb;
1317 struct ocfs2_super *osb = sb->s_fs_info;
1318 int err;
1319
1320 err = ocfs2_inode_revalidate(path->dentry);
1321 if (err) {
1322 if (err != -ENOENT)
1323 mlog_errno(err);
1324 goto bail;
1325 }
1326
1327 generic_fillattr(inode, stat);
1328 /*
1329 * If there is inline data in the inode, the inode will normally not
1330 * have data blocks allocated (it may have an external xattr block).
1331 * Report at least one sector for such files, so tools like tar, rsync,
1332 * others don't incorrectly think the file is completely sparse.
1333 */
1334 if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
1335 stat->blocks += (stat->size + 511)>>9;
1336
1337 /* We set the blksize from the cluster size for performance */
1338 stat->blksize = osb->s_clustersize;
1339
1340bail:
1341 return err;
1342}
1343
1344int ocfs2_permission(struct inode *inode, int mask)
1345{
1346 int ret, had_lock;
1347 struct ocfs2_lock_holder oh;
1348
1349 if (mask & MAY_NOT_BLOCK)
1350 return -ECHILD;
1351
1352 had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh);
1353 if (had_lock < 0) {
1354 ret = had_lock;
1355 goto out;
1356 } else if (had_lock) {
1357 /* See comments in ocfs2_setattr() for details.
1358 * The call chain of this case could be:
1359 * do_sys_open()
1360 * may_open()
1361 * inode_permission()
1362 * ocfs2_permission()
1363 * ocfs2_iop_get_acl()
1364 */
1365 mlog(ML_ERROR, "Another case of recursive locking:\n");
1366 dump_stack();
1367 }
1368
1369 ret = generic_permission(inode, mask);
1370
1371 ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
1372out:
1373 return ret;
1374}
1375
1376static int __ocfs2_write_remove_suid(struct inode *inode,
1377 struct buffer_head *bh)
1378{
1379 int ret;
1380 handle_t *handle;
1381 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1382 struct ocfs2_dinode *di;
1383
1384 trace_ocfs2_write_remove_suid(
1385 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1386 inode->i_mode);
1387
1388 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1389 if (IS_ERR(handle)) {
1390 ret = PTR_ERR(handle);
1391 mlog_errno(ret);
1392 goto out;
1393 }
1394
1395 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1396 OCFS2_JOURNAL_ACCESS_WRITE);
1397 if (ret < 0) {
1398 mlog_errno(ret);
1399 goto out_trans;
1400 }
1401
1402 inode->i_mode &= ~S_ISUID;
1403 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1404 inode->i_mode &= ~S_ISGID;
1405
1406 di = (struct ocfs2_dinode *) bh->b_data;
1407 di->i_mode = cpu_to_le16(inode->i_mode);
1408 ocfs2_update_inode_fsync_trans(handle, inode, 0);
1409
1410 ocfs2_journal_dirty(handle, bh);
1411
1412out_trans:
1413 ocfs2_commit_trans(osb, handle);
1414out:
1415 return ret;
1416}
1417
1418static int ocfs2_write_remove_suid(struct inode *inode)
1419{
1420 int ret;
1421 struct buffer_head *bh = NULL;
1422
1423 ret = ocfs2_read_inode_block(inode, &bh);
1424 if (ret < 0) {
1425 mlog_errno(ret);
1426 goto out;
1427 }
1428
1429 ret = __ocfs2_write_remove_suid(inode, bh);
1430out:
1431 brelse(bh);
1432 return ret;
1433}
1434
1435/*
1436 * Allocate enough extents to cover the region starting at byte offset
1437 * start for len bytes. Existing extents are skipped, any extents
1438 * added are marked as "unwritten".
1439 */
1440static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1441 u64 start, u64 len)
1442{
1443 int ret;
1444 u32 cpos, phys_cpos, clusters, alloc_size;
1445 u64 end = start + len;
1446 struct buffer_head *di_bh = NULL;
1447
1448 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1449 ret = ocfs2_read_inode_block(inode, &di_bh);
1450 if (ret) {
1451 mlog_errno(ret);
1452 goto out;
1453 }
1454
1455 /*
1456 * Nothing to do if the requested reservation range
1457 * fits within the inode.
1458 */
1459 if (ocfs2_size_fits_inline_data(di_bh, end))
1460 goto out;
1461
1462 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1463 if (ret) {
1464 mlog_errno(ret);
1465 goto out;
1466 }
1467 }
1468
1469 /*
1470 * We consider both start and len to be inclusive.
1471 */
1472 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1473 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1474 clusters -= cpos;
1475
1476 while (clusters) {
1477 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1478 &alloc_size, NULL);
1479 if (ret) {
1480 mlog_errno(ret);
1481 goto out;
1482 }
1483
1484 /*
1485 * Hole or existing extent len can be arbitrary, so
1486 * cap it to our own allocation request.
1487 */
1488 if (alloc_size > clusters)
1489 alloc_size = clusters;
1490
1491 if (phys_cpos) {
1492 /*
1493 * We already have an allocation at this
1494 * region so we can safely skip it.
1495 */
1496 goto next;
1497 }
1498
1499 ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1500 if (ret) {
1501 if (ret != -ENOSPC)
1502 mlog_errno(ret);
1503 goto out;
1504 }
1505
1506next:
1507 cpos += alloc_size;
1508 clusters -= alloc_size;
1509 }
1510
1511 ret = 0;
1512out:
1513
1514 brelse(di_bh);
1515 return ret;
1516}
1517
1518/*
1519 * Truncate a byte range, avoiding pages within partial clusters. This
1520 * preserves those pages for the zeroing code to write to.
1521 */
1522static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1523 u64 byte_len)
1524{
1525 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1526 loff_t start, end;
1527 struct address_space *mapping = inode->i_mapping;
1528
1529 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1530 end = byte_start + byte_len;
1531 end = end & ~(osb->s_clustersize - 1);
1532
1533 if (start < end) {
1534 unmap_mapping_range(mapping, start, end - start, 0);
1535 truncate_inode_pages_range(mapping, start, end - 1);
1536 }
1537}
1538
1539/*
1540 * zero out partial blocks of one cluster.
1541 *
1542 * start: file offset where zeroing starts; it will be rounded up to a block boundary.
1543 * len: it will be trimmed to the end of the current cluster if "start + len"
1544 * extends past it.
1545 */
1546static int ocfs2_zeroout_partial_cluster(struct inode *inode,
1547 u64 start, u64 len)
1548{
1549 int ret;
1550 u64 start_block, end_block, nr_blocks;
1551 u64 p_block, offset;
1552 u32 cluster, p_cluster, nr_clusters;
1553 struct super_block *sb = inode->i_sb;
1554 u64 end = ocfs2_align_bytes_to_clusters(sb, start);
1555
1556 if (start + len < end)
1557 end = start + len;
1558
1559 start_block = ocfs2_blocks_for_bytes(sb, start);
1560 end_block = ocfs2_blocks_for_bytes(sb, end);
1561 nr_blocks = end_block - start_block;
1562 if (!nr_blocks)
1563 return 0;
1564
1565 cluster = ocfs2_bytes_to_clusters(sb, start);
1566 ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
1567 &nr_clusters, NULL);
1568 if (ret)
1569 return ret;
1570 if (!p_cluster)
1571 return 0;
1572
1573 offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
1574 p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
1575 return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
1576}
1577
1578static int ocfs2_zero_partial_clusters(struct inode *inode,
1579 u64 start, u64 len)
1580{
1581 int ret = 0;
1582 u64 tmpend = 0;
1583 u64 end = start + len;
1584 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1585 unsigned int csize = osb->s_clustersize;
1586 handle_t *handle;
1587 loff_t isize = i_size_read(inode);
1588
1589 /*
1590 * The "start" and "end" values are NOT necessarily part of
1591 * the range whose allocation is being deleted. Rather, this
1592 * is what the user passed in with the request. We must zero
1593 * partial clusters here. There's no need to worry about
1594 * physical allocation - the zeroing code knows to skip holes.
1595 */
1596 trace_ocfs2_zero_partial_clusters(
1597 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1598 (unsigned long long)start, (unsigned long long)end);
1599
1600 /*
1601 * If both edges are on a cluster boundary then there's no
1602 * zeroing required as the region is part of the allocation to
1603 * be truncated.
1604 */
1605 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1606 goto out;
1607
1608 /* No page cache for EOF blocks, issue zero out to disk. */
1609 if (end > isize) {
1610 /*
1611 * Zero out the eof blocks in the last cluster starting from
1612 * "isize", even when "start" > "isize", because it is
1613 * complicated to zero out starting exactly at "start": "start"
1614 * may not be aligned with the block size, a buffer write
1615 * would be required to do that, and buffered writes beyond
1616 * eof are not supported.
1617 */
1618 ret = ocfs2_zeroout_partial_cluster(inode, isize,
1619 end - isize);
1620 if (ret) {
1621 mlog_errno(ret);
1622 goto out;
1623 }
1624 if (start >= isize)
1625 goto out;
1626 end = isize;
1627 }
1628 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1629 if (IS_ERR(handle)) {
1630 ret = PTR_ERR(handle);
1631 mlog_errno(ret);
1632 goto out;
1633 }
1634
1635 /*
1636 * If start is on a cluster boundary and end is somewhere in another
1637 * cluster, we have not COWed the cluster starting at start, unless
1638 * end is also within the same cluster. So, in this case, we skip this
1639 * first call to ocfs2_zero_range_for_truncate() and move on
1640 * to the next one.
1641 */
1642 if ((start & (csize - 1)) != 0) {
1643 /*
1644 * We want to get the byte offset of the end of the 1st
1645 * cluster.
1646 */
1647 tmpend = (u64)osb->s_clustersize +
1648 (start & ~(osb->s_clustersize - 1));
1649 if (tmpend > end)
1650 tmpend = end;
1651
1652 trace_ocfs2_zero_partial_clusters_range1(
1653 (unsigned long long)start,
1654 (unsigned long long)tmpend);
1655
1656 ret = ocfs2_zero_range_for_truncate(inode, handle, start,
1657 tmpend);
1658 if (ret)
1659 mlog_errno(ret);
1660 }
1661
1662 if (tmpend < end) {
1663 /*
1664 * This may make start and end equal, but the zeroing
1665 * code will skip any work in that case so there's no
1666 * need to catch it up here.
1667 */
1668 start = end & ~(osb->s_clustersize - 1);
1669
1670 trace_ocfs2_zero_partial_clusters_range2(
1671 (unsigned long long)start, (unsigned long long)end);
1672
1673 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1674 if (ret)
1675 mlog_errno(ret);
1676 }
1677 ocfs2_update_inode_fsync_trans(handle, inode, 1);
1678
1679 ocfs2_commit_trans(osb, handle);
1680out:
1681 return ret;
1682}
1683
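/*
 * Return the index of the last extent record in 'el' whose e_cpos is
 * below 'pos', or -1 if no such record exists.
 */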
1684static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1685{
1686 int i;
1687 struct ocfs2_extent_rec *rec = NULL;
1688
1689 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1690
1691 rec = &el->l_recs[i];
1692
1693 if (le32_to_cpu(rec->e_cpos) < pos)
1694 break;
1695 }
1696
1697 return i;
1698}
1699
1700/*
1701 * Helper to calculate the punching pos and length in one run. We handle the
1702 * following three cases in order:
1703 *
1704 * - remove the entire record
1705 * - remove a partial record
1706 * - no record needs to be removed (hole-punching completed)
1707*/
1708static void ocfs2_calc_trunc_pos(struct inode *inode,
1709 struct ocfs2_extent_list *el,
1710 struct ocfs2_extent_rec *rec,
1711 u32 trunc_start, u32 *trunc_cpos,
1712 u32 *trunc_len, u32 *trunc_end,
1713 u64 *blkno, int *done)
1714{
1715 int ret = 0;
1716 u32 coff, range;
1717
1718 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1719
1720 if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1721 /*
1722 * remove an entire extent record.
1723 */
1724 *trunc_cpos = le32_to_cpu(rec->e_cpos);
1725 /*
1726 * Skip holes if any.
1727 */
1728 if (range < *trunc_end)
1729 *trunc_end = range;
1730 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1731 *blkno = le64_to_cpu(rec->e_blkno);
1732 *trunc_end = le32_to_cpu(rec->e_cpos);
1733 } else if (range > trunc_start) {
1734 /*
1735 * remove a partial extent record, which means we're
1736 * removing the last extent record.
1737 */
1738 *trunc_cpos = trunc_start;
1739 /*
1740 * skip hole if any.
1741 */
1742 if (range < *trunc_end)
1743 *trunc_end = range;
1744 *trunc_len = *trunc_end - trunc_start;
1745 coff = trunc_start - le32_to_cpu(rec->e_cpos);
1746 *blkno = le64_to_cpu(rec->e_blkno) +
1747 ocfs2_clusters_to_blocks(inode->i_sb, coff);
1748 *trunc_end = trunc_start;
1749 } else {
1750 /*
1751 * There are two possibilities here:
1752 *
1753 * - last record has been removed
1754 * - trunc_start was within a hole
1755 *
1756 * Either case means that hole punching is complete.
1757 */
1758 ret = 1;
1759 }
1760
1761 *done = ret;
1762}
1763
1764int ocfs2_remove_inode_range(struct inode *inode,
1765 struct buffer_head *di_bh, u64 byte_start,
1766 u64 byte_len)
1767{
1768 int ret = 0, flags = 0, done = 0, i;
1769 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1770 u32 cluster_in_el;
1771 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1772 struct ocfs2_cached_dealloc_ctxt dealloc;
1773 struct address_space *mapping = inode->i_mapping;
1774 struct ocfs2_extent_tree et;
1775 struct ocfs2_path *path = NULL;
1776 struct ocfs2_extent_list *el = NULL;
1777 struct ocfs2_extent_rec *rec = NULL;
1778 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1779 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1780
1781 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1782 ocfs2_init_dealloc_ctxt(&dealloc);
1783
1784 trace_ocfs2_remove_inode_range(
1785 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1786 (unsigned long long)byte_start,
1787 (unsigned long long)byte_len);
1788
1789 if (byte_len == 0)
1790 return 0;
1791
1792 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1793 int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);
1794
1795 if (byte_start > id_count || byte_start + byte_len > id_count) {
1796 ret = -EINVAL;
1797 mlog_errno(ret);
1798 goto out;
1799 }
1800
1801 ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1802 byte_start + byte_len, 0);
1803 if (ret) {
1804 mlog_errno(ret);
1805 goto out;
1806 }
1807 /*
1808 * There's no need to get fancy with the page cache
1809 * truncate of an inline-data inode. We're talking
1810 * about less than a page here, which will be cached
1811 * in the dinode buffer anyway.
1812 */
1813 unmap_mapping_range(mapping, 0, 0, 0);
1814 truncate_inode_pages(mapping, 0);
1815 goto out;
1816 }
1817
1818 /*
1819 * For reflinks, we may need to CoW two clusters which might be
1820 * partially zeroed later, if the hole's start and end offsets fall
1821 * within one cluster (i.e. are not exactly aligned to the cluster size).
1822 */
1823
1824 if (ocfs2_is_refcount_inode(inode)) {
1825 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1826 if (ret) {
1827 mlog_errno(ret);
1828 goto out;
1829 }
1830
1831 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1832 if (ret) {
1833 mlog_errno(ret);
1834 goto out;
1835 }
1836 }
1837
1838 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1839 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1840 cluster_in_el = trunc_end;
1841
1842 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1843 if (ret) {
1844 mlog_errno(ret);
1845 goto out;
1846 }
1847
1848 path = ocfs2_new_path_from_et(&et);
1849 if (!path) {
1850 ret = -ENOMEM;
1851 mlog_errno(ret);
1852 goto out;
1853 }
1854
1855 while (trunc_end > trunc_start) {
1856
1857 ret = ocfs2_find_path(INODE_CACHE(inode), path,
1858 cluster_in_el);
1859 if (ret) {
1860 mlog_errno(ret);
1861 goto out;
1862 }
1863
1864 el = path_leaf_el(path);
1865
1866 i = ocfs2_find_rec(el, trunc_end);
1867 /*
1868 * Need to go to previous extent block.
1869 */
1870 if (i < 0) {
1871 if (path->p_tree_depth == 0)
1872 break;
1873
1874 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1875 path,
1876 &cluster_in_el);
1877 if (ret) {
1878 mlog_errno(ret);
1879 goto out;
1880 }
1881
1882 /*
1883 * We've reached the leftmost extent block,
1884 * it's safe to leave.
1885 */
1886 if (cluster_in_el == 0)
1887 break;
1888
1889 /*
1890 * The 'pos' searched for previous extent block is
1891 * always one cluster less than actual trunc_end.
1892 */
1893 trunc_end = cluster_in_el + 1;
1894
1895 ocfs2_reinit_path(path, 1);
1896
1897 continue;
1898
1899 } else
1900 rec = &el->l_recs[i];
1901
1902 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1903 &trunc_len, &trunc_end, &blkno, &done);
1904 if (done)
1905 break;
1906
1907 flags = rec->e_flags;
1908 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1909
1910 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1911 phys_cpos, trunc_len, flags,
1912 &dealloc, refcount_loc, false);
1913 if (ret < 0) {
1914 mlog_errno(ret);
1915 goto out;
1916 }
1917
1918 cluster_in_el = trunc_end;
1919
1920 ocfs2_reinit_path(path, 1);
1921 }
1922
1923 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1924
1925out:
1926 ocfs2_free_path(path);
1927 ocfs2_schedule_truncate_log_flush(osb, 1);
1928 ocfs2_run_deallocs(osb, &dealloc);
1929
1930 return ret;
1931}
1932
1933/*
1934 * Parts of this function taken from xfs_change_file_space()
1935 */
1936static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1937 loff_t f_pos, unsigned int cmd,
1938 struct ocfs2_space_resv *sr,
1939 int change_size)
1940{
1941 int ret;
1942 s64 llen;
1943 loff_t size, orig_isize;
1944 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1945 struct buffer_head *di_bh = NULL;
1946 handle_t *handle;
1947 unsigned long long max_off = inode->i_sb->s_maxbytes;
1948
1949 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1950 return -EROFS;
1951
1952 inode_lock(inode);
1953
1954 /* Wait for all existing dio workers; newcomers will block on i_rwsem */
1955 inode_dio_wait(inode);
1956 /*
1957 * This prevents concurrent writes on other nodes
1958 */
1959 ret = ocfs2_rw_lock(inode, 1);
1960 if (ret) {
1961 mlog_errno(ret);
1962 goto out;
1963 }
1964
1965 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1966 if (ret) {
1967 mlog_errno(ret);
1968 goto out_rw_unlock;
1969 }
1970
1971 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1972 ret = -EPERM;
1973 goto out_inode_unlock;
1974 }
1975
1976 switch (sr->l_whence) {
1977 case 0: /*SEEK_SET*/
1978 break;
1979 case 1: /*SEEK_CUR*/
1980 sr->l_start += f_pos;
1981 break;
1982 case 2: /*SEEK_END*/
1983 sr->l_start += i_size_read(inode);
1984 break;
1985 default:
1986 ret = -EINVAL;
1987 goto out_inode_unlock;
1988 }
1989 sr->l_whence = 0;
1990
1991 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1992
1993 if (sr->l_start < 0
1994 || sr->l_start > max_off
1995 || (sr->l_start + llen) < 0
1996 || (sr->l_start + llen) > max_off) {
1997 ret = -EINVAL;
1998 goto out_inode_unlock;
1999 }
2000 size = sr->l_start + sr->l_len;
2001
2002 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
2003 cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
2004 if (sr->l_len <= 0) {
2005 ret = -EINVAL;
2006 goto out_inode_unlock;
2007 }
2008 }
2009
2010 if (file && should_remove_suid(file->f_path.dentry)) {
2011 ret = __ocfs2_write_remove_suid(inode, di_bh);
2012 if (ret) {
2013 mlog_errno(ret);
2014 goto out_inode_unlock;
2015 }
2016 }
2017
2018 down_write(&OCFS2_I(inode)->ip_alloc_sem);
2019 switch (cmd) {
2020 case OCFS2_IOC_RESVSP:
2021 case OCFS2_IOC_RESVSP64:
2022 /*
2023 * This takes unsigned offsets, but the signed ones we
2024 * pass have been checked against overflow above.
2025 */
2026 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
2027 sr->l_len);
2028 break;
2029 case OCFS2_IOC_UNRESVSP:
2030 case OCFS2_IOC_UNRESVSP64:
2031 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
2032 sr->l_len);
2033 break;
2034 default:
2035 ret = -EINVAL;
2036 }
2037
2038 orig_isize = i_size_read(inode);
2039 /* zeroout eof blocks in the cluster. */
2040 if (!ret && change_size && orig_isize < size) {
2041 ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
2042 size - orig_isize);
2043 if (!ret)
2044 i_size_write(inode, size);
2045 }
2046 up_write(&OCFS2_I(inode)->ip_alloc_sem);
2047 if (ret) {
2048 mlog_errno(ret);
2049 goto out_inode_unlock;
2050 }
2051
2052 /*
2053 * We update c/mtime for these changes
2054 */
2055 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
2056 if (IS_ERR(handle)) {
2057 ret = PTR_ERR(handle);
2058 mlog_errno(ret);
2059 goto out_inode_unlock;
2060 }
2061
2062 inode->i_ctime = inode->i_mtime = current_time(inode);
2063 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
2064 if (ret < 0)
2065 mlog_errno(ret);
2066
2067 if (file && (file->f_flags & O_SYNC))
2068 handle->h_sync = 1;
2069
2070 ocfs2_commit_trans(osb, handle);
2071
2072out_inode_unlock:
2073 brelse(di_bh);
2074 ocfs2_inode_unlock(inode, 1);
2075out_rw_unlock:
2076 ocfs2_rw_unlock(inode, 1);
2077
2078out:
2079 inode_unlock(inode);
2080 return ret;
2081}
2082
2083int ocfs2_change_file_space(struct file *file, unsigned int cmd,
2084 struct ocfs2_space_resv *sr)
2085{
2086 struct inode *inode = file_inode(file);
2087 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2088 int ret;
2089
2090 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
2091 !ocfs2_writes_unwritten_extents(osb))
2092 return -ENOTTY;
2093 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
2094 !ocfs2_sparse_alloc(osb))
2095 return -ENOTTY;
2096
2097 if (!S_ISREG(inode->i_mode))
2098 return -EINVAL;
2099
2100 if (!(file->f_mode & FMODE_WRITE))
2101 return -EBADF;
2102
2103 ret = mnt_want_write_file(file);
2104 if (ret)
2105 return ret;
2106 ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
2107 mnt_drop_write_file(file);
2108 return ret;
2109}
2110
2111static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
2112 loff_t len)
2113{
2114 struct inode *inode = file_inode(file);
2115 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2116 struct ocfs2_space_resv sr;
2117 int change_size = 1;
2118 int cmd = OCFS2_IOC_RESVSP64;
2119 int ret = 0;
2120
2121 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2122 return -EOPNOTSUPP;
2123 if (!ocfs2_writes_unwritten_extents(osb))
2124 return -EOPNOTSUPP;
2125
2126 if (mode & FALLOC_FL_KEEP_SIZE) {
2127 change_size = 0;
2128 } else {
2129 ret = inode_newsize_ok(inode, offset + len);
2130 if (ret)
2131 return ret;
2132 }
2133
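 /*
  * Plain preallocation maps to the RESVSP64 path (allocate unwritten
  * extents); FALLOC_FL_PUNCH_HOLE maps to UNRESVSP64, which removes
  * the byte range instead.
  */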
2134 if (mode & FALLOC_FL_PUNCH_HOLE)
2135 cmd = OCFS2_IOC_UNRESVSP64;
2136
2137 sr.l_whence = 0;
2138 sr.l_start = (s64)offset;
2139 sr.l_len = (s64)len;
2140
2141 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2142 change_size);
2143}
2144
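/*
 * Return 1 if any cluster in the byte range [pos, pos + count) is marked
 * refcounted (shared, so it must be CoWed before it can be written to),
 * 0 if none is, or a negative errno on failure.
 */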
2145int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
2146 size_t count)
2147{
2148 int ret = 0;
2149 unsigned int extent_flags;
2150 u32 cpos, clusters, extent_len, phys_cpos;
2151 struct super_block *sb = inode->i_sb;
2152
2153 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
2154 !ocfs2_is_refcount_inode(inode) ||
2155 OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2156 return 0;
2157
2158 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
2159 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
2160
2161 while (clusters) {
2162 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
2163 &extent_flags);
2164 if (ret < 0) {
2165 mlog_errno(ret);
2166 goto out;
2167 }
2168
2169 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
2170 ret = 1;
2171 break;
2172 }
2173
2174 if (extent_len > clusters)
2175 extent_len = clusters;
2176
2177 clusters -= extent_len;
2178 cpos += extent_len;
2179 }
2180out:
2181 return ret;
2182}
2183
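/*
 * Return 1 when either the start or the end of the I/O does not fall on
 * a filesystem block boundary.
 */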
2184static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2185{
2186 int blockmask = inode->i_sb->s_blocksize - 1;
2187 loff_t final_size = pos + count;
2188
2189 if ((pos & blockmask) || (final_size & blockmask))
2190 return 1;
2191 return 0;
2192}
2193
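/*
 * Take the cluster inode lock at the given meta_level and then ip_alloc_sem
 * (shared or exclusive, depending on write_sem). When wait is zero both are
 * only trylocked, and -EAGAIN is returned if ip_alloc_sem would block.
 */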
2194static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
2195 struct buffer_head **di_bh,
2196 int meta_level,
2197 int write_sem,
2198 int wait)
2199{
2200 int ret = 0;
2201
2202 if (wait)
2203 ret = ocfs2_inode_lock(inode, di_bh, meta_level);
2204 else
2205 ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
2206 if (ret < 0)
2207 goto out;
2208
2209 if (wait) {
2210 if (write_sem)
2211 down_write(&OCFS2_I(inode)->ip_alloc_sem);
2212 else
2213 down_read(&OCFS2_I(inode)->ip_alloc_sem);
2214 } else {
2215 if (write_sem)
2216 ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
2217 else
2218 ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
2219
2220 if (!ret) {
2221 ret = -EAGAIN;
2222 goto out_unlock;
2223 }
2224 }
2225
2226 return ret;
2227
2228out_unlock:
2229 brelse(*di_bh);
2230 *di_bh = NULL;
2231 ocfs2_inode_unlock(inode, meta_level);
2232out:
2233 return ret;
2234}
2235
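/*
 * Counterpart of ocfs2_inode_lock_for_extent_tree(): drop ip_alloc_sem,
 * release the dinode buffer head, and drop the cluster lock when one is
 * held (meta_level >= 0).
 */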
2236static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
2237 struct buffer_head **di_bh,
2238 int meta_level,
2239 int write_sem)
2240{
2241 if (write_sem)
2242 up_write(&OCFS2_I(inode)->ip_alloc_sem);
2243 else
2244 up_read(&OCFS2_I(inode)->ip_alloc_sem);
2245
2246 brelse(*di_bh);
2247 *di_bh = NULL;
2248
2249 if (meta_level >= 0)
2250 ocfs2_inode_unlock(inode, meta_level);
2251}
2252
2253static int ocfs2_prepare_inode_for_write(struct file *file,
2254 loff_t pos, size_t count, int wait)
2255{
2256 int ret = 0, meta_level = 0, overwrite_io = 0;
2257 int write_sem = 0;
2258 struct dentry *dentry = file->f_path.dentry;
2259 struct inode *inode = d_inode(dentry);
2260 struct buffer_head *di_bh = NULL;
2261 u32 cpos;
2262 u32 clusters;
2263
2264 /*
2265 * We start with a read-level meta lock and only upgrade to an
2266 * exclusive (EX) lock if we need to make modifications here.
2267 */
2268 for(;;) {
2269 ret = ocfs2_inode_lock_for_extent_tree(inode,
2270 &di_bh,
2271 meta_level,
2272 write_sem,
2273 wait);
2274 if (ret < 0) {
2275 if (ret != -EAGAIN)
2276 mlog_errno(ret);
2277 goto out;
2278 }
2279
2280 /*
2281 * Check if IO will overwrite allocated blocks in case
2282 * IOCB_NOWAIT flag is set.
2283 */
2284 if (!wait && !overwrite_io) {
2285 overwrite_io = 1;
2286
2287 ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
2288 if (ret < 0) {
2289 if (ret != -EAGAIN)
2290 mlog_errno(ret);
2291 goto out_unlock;
2292 }
2293 }
2294
2295 /* Clear suid / sgid if necessary. We do this here
2296 * instead of later in the write path because
2297 * remove_suid() calls ->setattr without any hint that
2298 * we may have already done our cluster locking. Since
2299 * ocfs2_setattr() *must* take cluster locks to
2300 * proceed, this will lead us to recursively lock the
2301 * inode. There's also the dinode i_size state which
2302 * can be lost via setattr during extending writes (we
2303 * set inode->i_size at the end of a write). */
2304 if (should_remove_suid(dentry)) {
2305 if (meta_level == 0) {
2306 ocfs2_inode_unlock_for_extent_tree(inode,
2307 &di_bh,
2308 meta_level,
2309 write_sem);
2310 meta_level = 1;
2311 continue;
2312 }
2313
2314 ret = ocfs2_write_remove_suid(inode);
2315 if (ret < 0) {
2316 mlog_errno(ret);
2317 goto out_unlock;
2318 }
2319 }
2320
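 /*
  * If part of the range is refcounted, retake the locks with
  * ip_alloc_sem held for writing and CoW the affected clusters
  * before letting the write proceed.
  */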
2321 ret = ocfs2_check_range_for_refcount(inode, pos, count);
2322 if (ret == 1) {
2323 ocfs2_inode_unlock_for_extent_tree(inode,
2324 &di_bh,
2325 meta_level,
2326 write_sem);
2327 meta_level = 1;
2328 write_sem = 1;
2329 ret = ocfs2_inode_lock_for_extent_tree(inode,
2330 &di_bh,
2331 meta_level,
2332 write_sem,
2333 wait);
2334 if (ret < 0) {
2335 if (ret != -EAGAIN)
2336 mlog_errno(ret);
2337 goto out;
2338 }
2339
2340 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2341 clusters =
2342 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2343 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
2344 }
2345
2346 if (ret < 0) {
2347 if (ret != -EAGAIN)
2348 mlog_errno(ret);
2349 goto out_unlock;
2350 }
2351
2352 break;
2353 }
2354
2355out_unlock:
2356 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2357 pos, count, wait);
2358
2359 ocfs2_inode_unlock_for_extent_tree(inode,
2360 &di_bh,
2361 meta_level,
2362 write_sem);
2363
2364out:
2365 return ret;
2366}
2367
2368static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2369 struct iov_iter *from)
2370{
2371 int rw_level;
2372 ssize_t written = 0;
2373 ssize_t ret;
2374 size_t count = iov_iter_count(from);
2375 struct file *file = iocb->ki_filp;
2376 struct inode *inode = file_inode(file);
2377 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2378 int full_coherency = !(osb->s_mount_opt &
2379 OCFS2_MOUNT_COHERENCY_BUFFERED);
2380 void *saved_ki_complete = NULL;
2381 int append_write = ((iocb->ki_pos + count) >=
2382 i_size_read(inode) ? 1 : 0);
2383 int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2384 int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2385
2386 trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
2387 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2388 file->f_path.dentry->d_name.len,
2389 file->f_path.dentry->d_name.name,
2390 (unsigned int)from->nr_segs); /* GRRRRR */
2391
2392 if (!direct_io && nowait)
2393 return -EOPNOTSUPP;
2394
2395 if (count == 0)
2396 return 0;
2397
2398 if (nowait) {
2399 if (!inode_trylock(inode))
2400 return -EAGAIN;
2401 } else
2402 inode_lock(inode);
2403
2404 ocfs2_iocb_init_rw_locked(iocb);
2405
2406 /*
2407 * Concurrent O_DIRECT writes are allowed with
2408 * the mount option "coherency=buffered".
2409 * For append writes, we must take the rw lock EX.
2410 */
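 /* rw_level 1 takes the rw lock EX; 0 takes it shared, which lets
  * O_DIRECT writers on other nodes run concurrently. */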
2411 rw_level = (!direct_io || full_coherency || append_write);
2412
2413 if (nowait)
2414 ret = ocfs2_try_rw_lock(inode, rw_level);
2415 else
2416 ret = ocfs2_rw_lock(inode, rw_level);
2417 if (ret < 0) {
2418 if (ret != -EAGAIN)
2419 mlog_errno(ret);
2420 goto out_mutex;
2421 }
2422
2423 /*
2424 * O_DIRECT writes with "coherency=full" need to take EX cluster
2425 * inode_lock to guarantee coherency.
2426 */
2427 if (direct_io && full_coherency) {
2428 /*
2429 * We need to take and drop the inode lock to force
2430 * other nodes to drop their caches. Buffered I/O
2431 * already does this in write_begin().
2432 */
2433 if (nowait)
2434 ret = ocfs2_try_inode_lock(inode, NULL, 1);
2435 else
2436 ret = ocfs2_inode_lock(inode, NULL, 1);
2437 if (ret < 0) {
2438 if (ret != -EAGAIN)
2439 mlog_errno(ret);
2440 goto out;
2441 }
2442
2443 ocfs2_inode_unlock(inode, 1);
2444 }
2445
2446 ret = generic_write_checks(iocb, from);
2447 if (ret <= 0) {
2448 if (ret)
2449 mlog_errno(ret);
2450 goto out;
2451 }
2452 count = ret;
2453
2454 ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait);
2455 if (ret < 0) {
2456 if (ret != -EAGAIN)
2457 mlog_errno(ret);
2458 goto out;
2459 }
2460
2461 if (direct_io && !is_sync_kiocb(iocb) &&
2462 ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
2463 /*
2464 * Make it a sync io if it's an unaligned aio.
2465 */
2466 saved_ki_complete = xchg(&iocb->ki_complete, NULL);
2467 }
2468
2469 /* communicate with ocfs2_dio_end_io */
2470 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2471
2472 written = __generic_file_write_iter(iocb, from);
2473 /* buffered aio wouldn't have proper lock coverage today */
2474 BUG_ON(written == -EIOCBQUEUED && !direct_io);
2475
2476 /*
2477 * Deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
2478 * function pointer which is called when o_direct io completes so that
2479 * it can unlock our rw lock.
2480 * Unfortunately there are error cases which call end_io and others
2481 * that don't, so we don't have to unlock the rw_lock if either an
2482 * async dio is going to do it in the future or an end_io after an
2483 * error has already done it.
2484 */
2485 if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2486 rw_level = -1;
2487 }
2488
2489 if (unlikely(written <= 0))
2490 goto out;
2491
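 /*
  * For O_DSYNC buffered writes and sync inodes, write back the range
  * just written, force a journal commit, and wait for the data so the
  * write is stable before returning.
  */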
2492 if (((file->f_flags & O_DSYNC) && !direct_io) ||
2493 IS_SYNC(inode)) {
2494 ret = filemap_fdatawrite_range(file->f_mapping,
2495 iocb->ki_pos - written,
2496 iocb->ki_pos - 1);
2497 if (ret < 0)
2498 written = ret;
2499
2500 if (!ret) {
2501 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2502 if (ret < 0)
2503 written = ret;
2504 }
2505
2506 if (!ret)
2507 ret = filemap_fdatawait_range(file->f_mapping,
2508 iocb->ki_pos - written,
2509 iocb->ki_pos - 1);
2510 }
2511
2512out:
2513 if (saved_ki_complete)
2514 xchg(&iocb->ki_complete, saved_ki_complete);
2515
2516 if (rw_level != -1)
2517 ocfs2_rw_unlock(inode, rw_level);
2518
2519out_mutex:
2520 inode_unlock(inode);
2521
2522 if (written)
2523 ret = written;
2524 return ret;
2525}
2526
2527static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2528 struct iov_iter *to)
2529{
2530 int ret = 0, rw_level = -1, lock_level = 0;
2531 struct file *filp = iocb->ki_filp;
2532 struct inode *inode = file_inode(filp);
2533 int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2534 int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2535
2536 trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry,
2537 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2538 filp->f_path.dentry->d_name.len,
2539 filp->f_path.dentry->d_name.name,
2540 to->nr_segs); /* GRRRRR */
2541
2542
2543 if (!inode) {
2544 ret = -EINVAL;
2545 mlog_errno(ret);
2546 goto bail;
2547 }
2548
2549 if (!direct_io && nowait)
2550 return -EOPNOTSUPP;
2551
2552 ocfs2_iocb_init_rw_locked(iocb);
2553
2554 /*
2555 * Buffered reads protect themselves in ->readpage(). O_DIRECT reads
2556 * need locks to protect pending reads from racing with truncate.
2557 */
2558 if (direct_io) {
2559 if (nowait)
2560 ret = ocfs2_try_rw_lock(inode, 0);
2561 else
2562 ret = ocfs2_rw_lock(inode, 0);
2563
2564 if (ret < 0) {
2565 if (ret != -EAGAIN)
2566 mlog_errno(ret);
2567 goto bail;
2568 }
2569 rw_level = 0;
2570 /* communicate with ocfs2_dio_end_io */
2571 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2572 }
2573
2574 /*
2575 * We're fine letting folks race truncates and extending
2576 * writes with reads across the cluster, just like they can
2577 * locally. Hence no rw_lock during buffered reads.
2578 *
2579 * Take and drop the meta data lock to update inode fields
2580 * like i_size. This gives the checks down in
2581 * generic_file_read_iter() a chance of actually working.
2582 */
2583 ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
2584 !nowait);
2585 if (ret < 0) {
2586 if (ret != -EAGAIN)
2587 mlog_errno(ret);
2588 goto bail;
2589 }
2590 ocfs2_inode_unlock(inode, lock_level);
2591
2592 ret = generic_file_read_iter(iocb, to);
2593 trace_generic_file_read_iter_ret(ret);
2594
2595 /* buffered aio wouldn't have proper lock coverage today */
2596 BUG_ON(ret == -EIOCBQUEUED && !direct_io);
2597
2598 /* see ocfs2_file_write_iter */
2599 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2600 rw_level = -1;
2601 }
2602
2603bail:
2604 if (rw_level != -1)
2605 ocfs2_rw_unlock(inode, rw_level);
2606
2607 return ret;
2608}
2609
2610/* Refer to generic_file_llseek_unlocked() */
2611static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2612{
2613 struct inode *inode = file->f_mapping->host;
2614 int ret = 0;
2615
2616 inode_lock(inode);
2617
2618 switch (whence) {
2619 case SEEK_SET:
2620 break;
2621 case SEEK_END:
2622 /* SEEK_END requires the OCFS2 inode lock for the file
2623 * because it references the file's size.
2624 */
2625 ret = ocfs2_inode_lock(inode, NULL, 0);
2626 if (ret < 0) {
2627 mlog_errno(ret);
2628 goto out;
2629 }
2630 offset += i_size_read(inode);
2631 ocfs2_inode_unlock(inode, 0);
2632 break;
2633 case SEEK_CUR:
2634 if (offset == 0) {
2635 offset = file->f_pos;
2636 goto out;
2637 }
2638 offset += file->f_pos;
2639 break;
2640 case SEEK_DATA:
2641 case SEEK_HOLE:
2642 ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
2643 if (ret)
2644 goto out;
2645 break;
2646 default:
2647 ret = -EINVAL;
2648 goto out;
2649 }
2650
2651 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2652
2653out:
2654 inode_unlock(inode);
2655 if (ret)
2656 return ret;
2657 return offset;
2658}
2659
2660static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
2661 struct file *file_out, loff_t pos_out,
2662 loff_t len, unsigned int remap_flags)
2663{
2664 struct inode *inode_in = file_inode(file_in);
2665 struct inode *inode_out = file_inode(file_out);
2666 struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
2667 struct buffer_head *in_bh = NULL, *out_bh = NULL;
2668 bool same_inode = (inode_in == inode_out);
2669 loff_t remapped = 0;
2670 ssize_t ret;
2671
2672 if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
2673 return -EINVAL;
2674 if (!ocfs2_refcount_tree(osb))
2675 return -EOPNOTSUPP;
2676 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
2677 return -EROFS;
2678
2679 /* Lock both files against IO */
2680 ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
2681 if (ret)
2682 return ret;
2683
2684 /* Check file eligibility and prepare for block sharing. */
2685 ret = -EINVAL;
2686 if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
2687 (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
2688 goto out_unlock;
2689
2690 ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
2691 &len, remap_flags);
2692 if (ret < 0 || len == 0)
2693 goto out_unlock;
2694
2695 /* Lock out changes to the allocation maps and remap. */
2696 down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
2697 if (!same_inode)
2698 down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
2699 SINGLE_DEPTH_NESTING);
2700
2701 /* Zap any page cache for the destination file's range. */
2702 truncate_inode_pages_range(&inode_out->i_data,
2703 round_down(pos_out, PAGE_SIZE),
2704 round_up(pos_out + len, PAGE_SIZE) - 1);
2705
2706 remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
2707 inode_out, out_bh, pos_out, len);
2708 up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
2709 if (!same_inode)
2710 up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
2711 if (remapped < 0) {
2712 ret = remapped;
2713 mlog_errno(ret);
2714 goto out_unlock;
2715 }
2716
2717 /*
2718 * Empty the extent map so that we may get the right extent
2719 * record from the disk.
2720 */
2721 ocfs2_extent_map_trunc(inode_in, 0);
2722 ocfs2_extent_map_trunc(inode_out, 0);
2723
2724 ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
2725 if (ret) {
2726 mlog_errno(ret);
2727 goto out_unlock;
2728 }
2729
2730out_unlock:
2731 ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
2732 return remapped > 0 ? remapped : ret;
2733}
2734
2735const struct inode_operations ocfs2_file_iops = {
2736 .setattr = ocfs2_setattr,
2737 .getattr = ocfs2_getattr,
2738 .permission = ocfs2_permission,
2739 .listxattr = ocfs2_listxattr,
2740 .fiemap = ocfs2_fiemap,
2741 .get_acl = ocfs2_iop_get_acl,
2742 .set_acl = ocfs2_iop_set_acl,
2743};
2744
2745const struct inode_operations ocfs2_special_file_iops = {
2746 .setattr = ocfs2_setattr,
2747 .getattr = ocfs2_getattr,
2748 .permission = ocfs2_permission,
2749 .get_acl = ocfs2_iop_get_acl,
2750 .set_acl = ocfs2_iop_set_acl,
2751};
2752
2753/*
2754 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2755 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2756 */
2757const struct file_operations ocfs2_fops = {
2758 .llseek = ocfs2_file_llseek,
2759 .mmap = ocfs2_mmap,
2760 .fsync = ocfs2_sync_file,
2761 .release = ocfs2_file_release,
2762 .open = ocfs2_file_open,
2763 .read_iter = ocfs2_file_read_iter,
2764 .write_iter = ocfs2_file_write_iter,
2765 .unlocked_ioctl = ocfs2_ioctl,
2766#ifdef CONFIG_COMPAT
2767 .compat_ioctl = ocfs2_compat_ioctl,
2768#endif
2769 .lock = ocfs2_lock,
2770 .flock = ocfs2_flock,
2771 .splice_read = generic_file_splice_read,
2772 .splice_write = iter_file_splice_write,
2773 .fallocate = ocfs2_fallocate,
2774 .remap_file_range = ocfs2_remap_file_range,
2775};
2776
2777const struct file_operations ocfs2_dops = {
2778 .llseek = generic_file_llseek,
2779 .read = generic_read_dir,
2780 .iterate = ocfs2_readdir,
2781 .fsync = ocfs2_sync_file,
2782 .release = ocfs2_dir_release,
2783 .open = ocfs2_dir_open,
2784 .unlocked_ioctl = ocfs2_ioctl,
2785#ifdef CONFIG_COMPAT
2786 .compat_ioctl = ocfs2_compat_ioctl,
2787#endif
2788 .lock = ocfs2_lock,
2789 .flock = ocfs2_flock,
2790};
2791
2792/*
2793 * POSIX-lockless variants of our file_operations.
2794 *
2795 * These will be used if the underlying cluster stack does not support
2796 * posix file locking, if the user passes the "localflocks" mount
2797 * option, or if we have a local-only fs.
2798 *
2799 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2800 * so we still want it in the case of no stack support for
2801 * plocks. Internally, it will do the right thing when asked to ignore
2802 * the cluster.
2803 */
2804const struct file_operations ocfs2_fops_no_plocks = {
2805 .llseek = ocfs2_file_llseek,
2806 .mmap = ocfs2_mmap,
2807 .fsync = ocfs2_sync_file,
2808 .release = ocfs2_file_release,
2809 .open = ocfs2_file_open,
2810 .read_iter = ocfs2_file_read_iter,
2811 .write_iter = ocfs2_file_write_iter,
2812 .unlocked_ioctl = ocfs2_ioctl,
2813#ifdef CONFIG_COMPAT
2814 .compat_ioctl = ocfs2_compat_ioctl,
2815#endif
2816 .flock = ocfs2_flock,
2817 .splice_read = generic_file_splice_read,
2818 .splice_write = iter_file_splice_write,
2819 .fallocate = ocfs2_fallocate,
2820 .remap_file_range = ocfs2_remap_file_range,
2821};
2822
2823const struct file_operations ocfs2_dops_no_plocks = {
2824 .llseek = generic_file_llseek,
2825 .read = generic_read_dir,
2826 .iterate = ocfs2_readdir,
2827 .fsync = ocfs2_sync_file,
2828 .release = ocfs2_dir_release,
2829 .open = ocfs2_dir_open,
2830 .unlocked_ioctl = ocfs2_ioctl,
2831#ifdef CONFIG_COMPAT
2832 .compat_ioctl = ocfs2_compat_ioctl,
2833#endif
2834 .flock = ocfs2_flock,
2835};