Blame - marvell/linux/fs/ceph/file.c - T108

blob: 83122fc5f8130b8128018658029b280d9e7c4258 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	#include <linux/ceph/ceph_debug.h>
				3	#include <linux/ceph/striper.h>
				4
				5	#include <linux/module.h>
				6	#include <linux/sched.h>
				7	#include <linux/slab.h>
				8	#include <linux/file.h>
				9	#include <linux/mount.h>
				10	#include <linux/namei.h>
				11	#include <linux/writeback.h>
				12	#include <linux/falloc.h>
				13	#include <linux/iversion.h>
				14
				15	#include "super.h"
				16	#include "mds_client.h"
				17	#include "cache.h"
				18	#include "io.h"
				19
				20	static __le32 ceph_flags_sys2wire(u32 flags)
				21	{
				22	u32 wire_flags = 0;
				23
				24	switch (flags & O_ACCMODE) {
				25	case O_RDONLY:
				26	wire_flags \|= CEPH_O_RDONLY;
				27	break;
				28	case O_WRONLY:
				29	wire_flags \|= CEPH_O_WRONLY;
				30	break;
				31	case O_RDWR:
				32	wire_flags \|= CEPH_O_RDWR;
				33	break;
				34	}
				35
				36	flags &= ~O_ACCMODE;
				37
				38	#define ceph_sys2wire(a) if (flags & a) { wire_flags \|= CEPH_##a; flags &= ~a; }
				39
				40	ceph_sys2wire(O_CREAT);
				41	ceph_sys2wire(O_EXCL);
				42	ceph_sys2wire(O_TRUNC);
				43	ceph_sys2wire(O_DIRECTORY);
				44	ceph_sys2wire(O_NOFOLLOW);
				45
				46	#undef ceph_sys2wire
				47
				48	if (flags)
				49	dout("unused open flags: %x\n", flags);
				50
				51	return cpu_to_le32(wire_flags);
				52	}
				53
				54	/*
				55	* Ceph file operations
				56	*
				57	* Implement basic open/close functionality, and implement
				58	* read/write.
				59	*
				60	* We implement three modes of file I/O:
				61	* - buffered uses the generic_file_aio_{read,write} helpers
				62	*
				63	* - synchronous is used when there is multi-client read/write
				64	* sharing, avoids the page cache, and synchronously waits for an
				65	* ack from the OSD.
				66	*
				67	* - direct io takes the variant of the sync path that references
				68	* user pages directly.
				69	*
				70	* fsync() flushes and waits on dirty pages, but just queues metadata
				71	* for writeback: since the MDS can recover size and mtime there is no
				72	* need to wait for MDS acknowledgement.
				73	*/
				74
				75	/*
				76	* How many pages to get in one call to iov_iter_get_pages(). This
				77	* determines the size of the on-stack array used as a buffer.
				78	*/
				79	#define ITER_GET_BVECS_PAGES 64
				80
				81	static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
				82	struct bio_vec *bvecs)
				83	{
				84	size_t size = 0;
				85	int bvec_idx = 0;
				86
				87	if (maxsize > iov_iter_count(iter))
				88	maxsize = iov_iter_count(iter);
				89
				90	while (size < maxsize) {
				91	struct page *pages[ITER_GET_BVECS_PAGES];
				92	ssize_t bytes;
				93	size_t start;
				94	int idx = 0;
				95
				96	bytes = iov_iter_get_pages(iter, pages, maxsize - size,
				97	ITER_GET_BVECS_PAGES, &start);
				98	if (bytes < 0)
				99	return size ?: bytes;
				100
				101	iov_iter_advance(iter, bytes);
				102	size += bytes;
				103
				104	for ( ; bytes; idx++, bvec_idx++) {
				105	struct bio_vec bv = {
				106	.bv_page = pages[idx],
				107	.bv_len = min_t(int, bytes, PAGE_SIZE - start),
				108	.bv_offset = start,
				109	};
				110
				111	bvecs[bvec_idx] = bv;
				112	bytes -= bv.bv_len;
				113	start = 0;
				114	}
				115	}
				116
				117	return size;
				118	}
				119
				120	/*
				121	* iov_iter_get_pages() only considers one iov_iter segment, no matter
				122	* what maxsize or maxpages are given. For ITER_BVEC that is a single
				123	* page.
				124	*
				125	* Attempt to get up to @maxsize bytes worth of pages from @iter.
				126	* Return the number of bytes in the created bio_vec array, or an error.
				127	*/
				128	static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
				129	struct bio_vec *bvecs, int num_bvecs)
				130	{
				131	struct bio_vec *bv;
				132	size_t orig_count = iov_iter_count(iter);
				133	ssize_t bytes;
				134	int npages;
				135
				136	iov_iter_truncate(iter, maxsize);
				137	npages = iov_iter_npages(iter, INT_MAX);
				138	iov_iter_reexpand(iter, orig_count);
				139
				140	/*
				141	* __iter_get_bvecs() may populate only part of the array -- zero it
				142	* out.
				143	*/
				144	bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL \| __GFP_ZERO);
				145	if (!bv)
				146	return -ENOMEM;
				147
				148	bytes = __iter_get_bvecs(iter, maxsize, bv);
				149	if (bytes < 0) {
				150	/*
				151	* No pages were pinned -- just free the array.
				152	*/
				153	kvfree(bv);
				154	return bytes;
				155	}
				156
				157	*bvecs = bv;
				158	*num_bvecs = npages;
				159	return bytes;
				160	}
				161
				162	static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
				163	{
				164	int i;
				165
				166	for (i = 0; i < num_bvecs; i++) {
				167	if (bvecs[i].bv_page) {
				168	if (should_dirty)
				169	set_page_dirty_lock(bvecs[i].bv_page);
				170	put_page(bvecs[i].bv_page);
				171	}
				172	}
				173	kvfree(bvecs);
				174	}
				175
				176	/*
				177	* Prepare an open request. Preallocate ceph_cap to avoid an
				178	* inopportune ENOMEM later.
				179	*/
				180	static struct ceph_mds_request *
				181	prepare_open_request(struct super_block *sb, int flags, int create_mode)
				182	{
				183	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
				184	struct ceph_mds_client *mdsc = fsc->mdsc;
				185	struct ceph_mds_request *req;
				186	int want_auth = USE_ANY_MDS;
				187	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
				188
				189	if (flags & (O_WRONLY\|O_RDWR\|O_CREAT\|O_TRUNC))
				190	want_auth = USE_AUTH_MDS;
				191
				192	req = ceph_mdsc_create_request(mdsc, op, want_auth);
				193	if (IS_ERR(req))
				194	goto out;
				195	req->r_fmode = ceph_flags_to_mode(flags);
				196	req->r_args.open.flags = ceph_flags_sys2wire(flags);
				197	req->r_args.open.mode = cpu_to_le32(create_mode);
				198	out:
				199	return req;
				200	}
				201
				202	static int ceph_init_file_info(struct inode inode, struct file file,
				203	int fmode, bool isdir)
				204	{
				205	struct ceph_inode_info *ci = ceph_inode(inode);
				206	struct ceph_file_info *fi;
				207
				208	dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
				209	inode->i_mode, isdir ? "dir" : "regular");
				210	BUG_ON(inode->i_fop->release != ceph_release);
				211
				212	if (isdir) {
				213	struct ceph_dir_file_info *dfi =
				214	kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
				215	if (!dfi) {
				216	ceph_put_fmode(ci, fmode); /* clean up */
				217	return -ENOMEM;
				218	}
				219
				220	file->private_data = dfi;
				221	fi = &dfi->file_info;
				222	dfi->next_offset = 2;
				223	dfi->readdir_cache_idx = -1;
				224	} else {
				225	fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
				226	if (!fi) {
				227	ceph_put_fmode(ci, fmode); /* clean up */
				228	return -ENOMEM;
				229	}
				230
				231	file->private_data = fi;
				232	}
				233
				234	fi->fmode = fmode;
				235	spin_lock_init(&fi->rw_contexts_lock);
				236	INIT_LIST_HEAD(&fi->rw_contexts);
				237	fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
				238
				239	return 0;
				240	}
				241
				242	/*
				243	* initialize private struct file data.
				244	* if we fail, clean up by dropping fmode reference on the ceph_inode
				245	*/
				246	static int ceph_init_file(struct inode inode, struct file file, int fmode)
				247	{
				248	int ret = 0;
				249
				250	switch (inode->i_mode & S_IFMT) {
				251	case S_IFREG:
				252	ceph_fscache_register_inode_cookie(inode);
				253	ceph_fscache_file_set_cookie(inode, file);
				254	/* fall through */
				255	case S_IFDIR:
				256	ret = ceph_init_file_info(inode, file, fmode,
				257	S_ISDIR(inode->i_mode));
				258	if (ret)
				259	return ret;
				260	break;
				261
				262	case S_IFLNK:
				263	dout("init_file %p %p 0%o (symlink)\n", inode, file,
				264	inode->i_mode);
				265	ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
				266	break;
				267
				268	default:
				269	dout("init_file %p %p 0%o (special)\n", inode, file,
				270	inode->i_mode);
				271	/*
				272	* we need to drop the open ref now, since we don't
				273	* have .release set to ceph_release.
				274	*/
				275	ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
				276	BUG_ON(inode->i_fop->release == ceph_release);
				277
				278	/* call the proper open fop */
				279	ret = inode->i_fop->open(inode, file);
				280	}
				281	return ret;
				282	}
				283
				284	/*
				285	* try renew caps after session gets killed.
				286	*/
				287	int ceph_renew_caps(struct inode *inode)
				288	{
				289	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
				290	struct ceph_inode_info *ci = ceph_inode(inode);
				291	struct ceph_mds_request *req;
				292	int err, flags, wanted;
				293
				294	spin_lock(&ci->i_ceph_lock);
				295	wanted = __ceph_caps_file_wanted(ci);
				296	if (__ceph_is_any_real_caps(ci) &&
				297	(!(wanted & CEPH_CAP_ANY_WR) \|\| ci->i_auth_cap)) {
				298	int issued = __ceph_caps_issued(ci, NULL);
				299	spin_unlock(&ci->i_ceph_lock);
				300	dout("renew caps %p want %s issued %s updating mds_wanted\n",
				301	inode, ceph_cap_string(wanted), ceph_cap_string(issued));
				302	ceph_check_caps(ci, 0, NULL);
				303	return 0;
				304	}
				305	spin_unlock(&ci->i_ceph_lock);
				306
				307	flags = 0;
				308	if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
				309	flags = O_RDWR;
				310	else if (wanted & CEPH_CAP_FILE_RD)
				311	flags = O_RDONLY;
				312	else if (wanted & CEPH_CAP_FILE_WR)
				313	flags = O_WRONLY;
				314	#ifdef O_LAZY
				315	if (wanted & CEPH_CAP_FILE_LAZYIO)
				316	flags \|= O_LAZY;
				317	#endif
				318
				319	req = prepare_open_request(inode->i_sb, flags, 0);
				320	if (IS_ERR(req)) {
				321	err = PTR_ERR(req);
				322	goto out;
				323	}
				324
				325	req->r_inode = inode;
				326	ihold(inode);
				327	req->r_num_caps = 1;
				328	req->r_fmode = -1;
				329
				330	err = ceph_mdsc_do_request(mdsc, NULL, req);
				331	ceph_mdsc_put_request(req);
				332	out:
				333	dout("renew caps %p open result=%d\n", inode, err);
				334	return err < 0 ? err : 0;
				335	}
				336
				337	/*
				338	* If we already have the requisite capabilities, we can satisfy
				339	* the open request locally (no need to request new caps from the
				340	* MDS). We do, however, need to inform the MDS (asynchronously)
				341	* if our wanted caps set expands.
				342	*/
				343	int ceph_open(struct inode inode, struct file file)
				344	{
				345	struct ceph_inode_info *ci = ceph_inode(inode);
				346	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
				347	struct ceph_mds_client *mdsc = fsc->mdsc;
				348	struct ceph_mds_request *req;
				349	struct ceph_file_info *fi = file->private_data;
				350	int err;
				351	int flags, fmode, wanted;
				352
				353	if (fi) {
				354	dout("open file %p is already opened\n", file);
				355	return 0;
				356	}
				357
				358	/* filter out O_CREAT\|O_EXCL; vfs did that already. yuck. */
				359	flags = file->f_flags & ~(O_CREAT\|O_EXCL);
				360	if (S_ISDIR(inode->i_mode))
				361	flags = O_DIRECTORY; /* mds likes to know */
				362
				363	dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
				364	ceph_vinop(inode), file, flags, file->f_flags);
				365	fmode = ceph_flags_to_mode(flags);
				366	wanted = ceph_caps_for_mode(fmode);
				367
				368	/* snapped files are read-only */
				369	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
				370	return -EROFS;
				371
				372	/* trivially open snapdir */
				373	if (ceph_snap(inode) == CEPH_SNAPDIR) {
				374	spin_lock(&ci->i_ceph_lock);
				375	__ceph_get_fmode(ci, fmode);
				376	spin_unlock(&ci->i_ceph_lock);
				377	return ceph_init_file(inode, file, fmode);
				378	}
				379
				380	/*
				381	* No need to block if we have caps on the auth MDS (for
				382	* write) or any MDS (for read). Update wanted set
				383	* asynchronously.
				384	*/
				385	spin_lock(&ci->i_ceph_lock);
				386	if (__ceph_is_any_real_caps(ci) &&
				387	(((fmode & CEPH_FILE_MODE_WR) == 0) \|\| ci->i_auth_cap)) {
				388	int mds_wanted = __ceph_caps_mds_wanted(ci, true);
				389	int issued = __ceph_caps_issued(ci, NULL);
				390
				391	dout("open %p fmode %d want %s issued %s using existing\n",
				392	inode, fmode, ceph_cap_string(wanted),
				393	ceph_cap_string(issued));
				394	__ceph_get_fmode(ci, fmode);
				395	spin_unlock(&ci->i_ceph_lock);
				396
				397	/* adjust wanted? */
				398	if ((issued & wanted) != wanted &&
				399	(mds_wanted & wanted) != wanted &&
				400	ceph_snap(inode) != CEPH_SNAPDIR)
				401	ceph_check_caps(ci, 0, NULL);
				402
				403	return ceph_init_file(inode, file, fmode);
				404	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
				405	(ci->i_snap_caps & wanted) == wanted) {
				406	__ceph_get_fmode(ci, fmode);
				407	spin_unlock(&ci->i_ceph_lock);
				408	return ceph_init_file(inode, file, fmode);
				409	}
				410
				411	spin_unlock(&ci->i_ceph_lock);
				412
				413	dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
				414	req = prepare_open_request(inode->i_sb, flags, 0);
				415	if (IS_ERR(req)) {
				416	err = PTR_ERR(req);
				417	goto out;
				418	}
				419	req->r_inode = inode;
				420	ihold(inode);
				421
				422	req->r_num_caps = 1;
				423	err = ceph_mdsc_do_request(mdsc, NULL, req);
				424	if (!err)
				425	err = ceph_init_file(inode, file, req->r_fmode);
				426	ceph_mdsc_put_request(req);
				427	dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
				428	out:
				429	return err;
				430	}
				431
				432
				433	/*
				434	* Do a lookup + open with a single request. If we get a non-existent
				435	* file or symlink, return 1 so the VFS can retry.
				436	*/
				437	int ceph_atomic_open(struct inode dir, struct dentry dentry,
				438	struct file *file, unsigned flags, umode_t mode)
				439	{
				440	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
				441	struct ceph_mds_client *mdsc = fsc->mdsc;
				442	struct ceph_mds_request *req;
				443	struct dentry *dn;
				444	struct ceph_acl_sec_ctx as_ctx = {};
				445	int mask;
				446	int err;
				447
				448	dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
				449	dir, dentry, dentry,
				450	d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
				451
				452	if (dentry->d_name.len > NAME_MAX)
				453	return -ENAMETOOLONG;
				454
				455	/*
				456	* Do not truncate the file, since atomic_open is called before the
				457	* permission check. The caller will do the truncation afterward.
				458	*/
				459	flags &= ~O_TRUNC;
				460
				461	if (flags & O_CREAT) {
				462	if (ceph_quota_is_max_files_exceeded(dir))
				463	return -EDQUOT;
				464	err = ceph_pre_init_acls(dir, &mode, &as_ctx);
				465	if (err < 0)
				466	return err;
				467	err = ceph_security_init_secctx(dentry, mode, &as_ctx);
				468	if (err < 0)
				469	goto out_ctx;
				470	} else if (!d_in_lookup(dentry)) {
				471	/* If it's not being looked up, it's negative */
				472	return -ENOENT;
				473	}
				474
				475	/* do the open */
				476	req = prepare_open_request(dir->i_sb, flags, mode);
				477	if (IS_ERR(req)) {
				478	err = PTR_ERR(req);
				479	goto out_ctx;
				480	}
				481	req->r_dentry = dget(dentry);
				482	req->r_num_caps = 2;
				483	if (flags & O_CREAT) {
				484	req->r_dentry_drop = CEPH_CAP_FILE_SHARED \| CEPH_CAP_AUTH_EXCL;
				485	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
				486	if (as_ctx.pagelist) {
				487	req->r_pagelist = as_ctx.pagelist;
				488	as_ctx.pagelist = NULL;
				489	}
				490	}
				491
				492	mask = CEPH_STAT_CAP_INODE \| CEPH_CAP_AUTH_SHARED;
				493	if (ceph_security_xattr_wanted(dir))
				494	mask \|= CEPH_CAP_XATTR_SHARED;
				495	req->r_args.open.mask = cpu_to_le32(mask);
				496
				497	req->r_parent = dir;
				498	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
				499	err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req);
				500	err = ceph_handle_snapdir(req, dentry, err);
				501	if (err)
				502	goto out_req;
				503
				504	if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
				505	err = ceph_handle_notrace_create(dir, dentry);
				506
				507	if (d_in_lookup(dentry)) {
				508	dn = ceph_finish_lookup(req, dentry, err);
				509	if (IS_ERR(dn))
				510	err = PTR_ERR(dn);
				511	} else {
				512	/* we were given a hashed negative dentry */
				513	dn = NULL;
				514	}
				515	if (err)
				516	goto out_req;
				517	if (dn \|\| d_really_is_negative(dentry) \|\| d_is_symlink(dentry)) {
				518	/* make vfs retry on splice, ENOENT, or symlink */
				519	dout("atomic_open finish_no_open on dn %p\n", dn);
				520	err = finish_no_open(file, dn);
				521	} else {
				522	dout("atomic_open finish_open on dn %p\n", dn);
				523	if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
				524	ceph_init_inode_acls(d_inode(dentry), &as_ctx);
				525	file->f_mode \|= FMODE_CREATED;
				526	}
				527	err = finish_open(file, dentry, ceph_open);
				528	}
				529	out_req:
				530	if (!req->r_err && req->r_target_inode)
				531	ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
				532	ceph_mdsc_put_request(req);
				533	out_ctx:
				534	ceph_release_acl_sec_ctx(&as_ctx);
				535	dout("atomic_open result=%d\n", err);
				536	return err;
				537	}
				538
				539	int ceph_release(struct inode inode, struct file file)
				540	{
				541	struct ceph_inode_info *ci = ceph_inode(inode);
				542
				543	if (S_ISDIR(inode->i_mode)) {
				544	struct ceph_dir_file_info *dfi = file->private_data;
				545	dout("release inode %p dir file %p\n", inode, file);
				546	WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
				547
				548	ceph_put_fmode(ci, dfi->file_info.fmode);
				549
				550	if (dfi->last_readdir)
				551	ceph_mdsc_put_request(dfi->last_readdir);
				552	kfree(dfi->last_name);
				553	kfree(dfi->dir_info);
				554	kmem_cache_free(ceph_dir_file_cachep, dfi);
				555	} else {
				556	struct ceph_file_info *fi = file->private_data;
				557	dout("release inode %p regular file %p\n", inode, file);
				558	WARN_ON(!list_empty(&fi->rw_contexts));
				559
				560	ceph_put_fmode(ci, fi->fmode);
				561	kmem_cache_free(ceph_file_cachep, fi);
				562	}
				563
				564	/* wake up anyone waiting for caps on this inode */
				565	wake_up_all(&ci->i_cap_wq);
				566	return 0;
				567	}
				568
				569	enum {
				570	HAVE_RETRIED = 1,
				571	CHECK_EOF = 2,
				572	READ_INLINE = 3,
				573	};
				574
				575	/*
				576	* Completely synchronous read and write methods. Direct from __user
				577	* buffer to osd, or directly to user pages (if O_DIRECT).
				578	*
				579	* If the read spans object boundary, just do multiple reads. (That's not
				580	* atomic, but good enough for now.)
				581	*
				582	* If we get a short result from the OSD, check against i_size; we need to
				583	* only return a short read to the caller if we hit EOF.
				584	*/
				585	static ssize_t ceph_sync_read(struct kiocb iocb, struct iov_iter to,
				586	int *retry_op)
				587	{
				588	struct file *file = iocb->ki_filp;
				589	struct inode *inode = file_inode(file);
				590	struct ceph_inode_info *ci = ceph_inode(inode);
				591	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
				592	struct ceph_osd_client *osdc = &fsc->client->osdc;
				593	ssize_t ret;
				594	u64 off = iocb->ki_pos;
				595	u64 len = iov_iter_count(to);
				596
				597	dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
				598	(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
				599
				600	if (!len)
				601	return 0;
				602	/*
				603	* flush any page cache pages in this range. this
				604	* will make concurrent normal and sync io slow,
				605	* but it will at least behave sensibly when they are
				606	* in sequence.
				607	*/
				608	ret = filemap_write_and_wait_range(inode->i_mapping,
				609	off, off + len - 1);
				610	if (ret < 0)
				611	return ret;
				612
				613	ret = 0;
				614	while ((len = iov_iter_count(to)) > 0) {
				615	struct ceph_osd_request *req;
				616	struct page **pages;
				617	int num_pages;
				618	size_t page_off;
				619	u64 i_size;
				620	bool more;
				621
				622	req = ceph_osdc_new_request(osdc, &ci->i_layout,
				623	ci->i_vino, off, &len, 0, 1,
				624	CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				625	NULL, ci->i_truncate_seq,
				626	ci->i_truncate_size, false);
				627	if (IS_ERR(req)) {
				628	ret = PTR_ERR(req);
				629	break;
				630	}
				631
				632	more = len < iov_iter_count(to);
				633
				634	if (unlikely(iov_iter_is_pipe(to))) {
				635	ret = iov_iter_get_pages_alloc(to, &pages, len,
				636	&page_off);
				637	if (ret <= 0) {
				638	ceph_osdc_put_request(req);
				639	ret = -ENOMEM;
				640	break;
				641	}
				642	num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
				643	if (ret < len) {
				644	len = ret;
				645	osd_req_op_extent_update(req, 0, len);
				646	more = false;
				647	}
				648	} else {
				649	num_pages = calc_pages_for(off, len);
				650	page_off = off & ~PAGE_MASK;
				651	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
				652	if (IS_ERR(pages)) {
				653	ceph_osdc_put_request(req);
				654	ret = PTR_ERR(pages);
				655	break;
				656	}
				657	}
				658
				659	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
				660	false, false);
				661	ret = ceph_osdc_start_request(osdc, req, false);
				662	if (!ret)
				663	ret = ceph_osdc_wait_request(osdc, req);
				664	ceph_osdc_put_request(req);
				665
				666	i_size = i_size_read(inode);
				667	dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
				668	off, len, ret, i_size, (more ? " MORE" : ""));
				669
				670	if (ret == -ENOENT)
				671	ret = 0;
				672	if (ret >= 0 && ret < len && (off + ret < i_size)) {
				673	int zlen = min(len - ret, i_size - off - ret);
				674	int zoff = page_off + ret;
				675	dout("sync_read zero gap %llu~%llu\n",
				676	off + ret, off + ret + zlen);
				677	ceph_zero_page_vector_range(zoff, zlen, pages);
				678	ret += zlen;
				679	}
				680
				681	if (unlikely(iov_iter_is_pipe(to))) {
				682	if (ret > 0) {
				683	iov_iter_advance(to, ret);
				684	off += ret;
				685	} else {
				686	iov_iter_advance(to, 0);
				687	}
				688	ceph_put_page_vector(pages, num_pages, false);
				689	} else {
				690	int idx = 0;
				691	size_t left = ret > 0 ? ret : 0;
				692	while (left > 0) {
				693	size_t len, copied;
				694	page_off = off & ~PAGE_MASK;
				695	len = min_t(size_t, left, PAGE_SIZE - page_off);
				696	copied = copy_page_to_iter(pages[idx++],
				697	page_off, len, to);
				698	off += copied;
				699	left -= copied;
				700	if (copied < len) {
				701	ret = -EFAULT;
				702	break;
				703	}
				704	}
				705	ceph_release_page_vector(pages, num_pages);
				706	}
				707
				708	if (ret < 0) {
				709	if (ret == -EBLACKLISTED)
				710	fsc->blacklisted = true;
				711	break;
				712	}
				713
				714	if (off >= i_size \|\| !more)
				715	break;
				716	}
				717
				718	if (off > iocb->ki_pos) {
				719	if (ret >= 0 &&
				720	iov_iter_count(to) > 0 && off >= i_size_read(inode))
				721	*retry_op = CHECK_EOF;
				722	ret = off - iocb->ki_pos;
				723	iocb->ki_pos = off;
				724	}
				725
				726	dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
				727	return ret;
				728	}
				729
				730	struct ceph_aio_request {
				731	struct kiocb *iocb;
				732	size_t total_len;
				733	bool write;
				734	bool should_dirty;
				735	int error;
				736	struct list_head osd_reqs;
				737	unsigned num_reqs;
				738	atomic_t pending_reqs;
				739	struct timespec64 mtime;
				740	struct ceph_cap_flush *prealloc_cf;
				741	};
				742
				743	struct ceph_aio_work {
				744	struct work_struct work;
				745	struct ceph_osd_request *req;
				746	};
				747
				748	static void ceph_aio_retry_work(struct work_struct *work);
				749
				750	static void ceph_aio_complete(struct inode *inode,
				751	struct ceph_aio_request *aio_req)
				752	{
				753	struct ceph_inode_info *ci = ceph_inode(inode);
				754	int ret;
				755
				756	if (!atomic_dec_and_test(&aio_req->pending_reqs))
				757	return;
				758
				759	if (aio_req->iocb->ki_flags & IOCB_DIRECT)
				760	inode_dio_end(inode);
				761
				762	ret = aio_req->error;
				763	if (!ret)
				764	ret = aio_req->total_len;
				765
				766	dout("ceph_aio_complete %p rc %d\n", inode, ret);
				767
				768	if (ret >= 0 && aio_req->write) {
				769	int dirty;
				770
				771	loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
				772	if (endoff > i_size_read(inode)) {
				773	if (ceph_inode_set_size(inode, endoff))
				774	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				775	}
				776
				777	spin_lock(&ci->i_ceph_lock);
				778	ci->i_inline_version = CEPH_INLINE_NONE;
				779	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
				780	&aio_req->prealloc_cf);
				781	spin_unlock(&ci->i_ceph_lock);
				782	if (dirty)
				783	__mark_inode_dirty(inode, dirty);
				784
				785	}
				786
				787	ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
				788	CEPH_CAP_FILE_RD));
				789
				790	aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
				791
				792	ceph_free_cap_flush(aio_req->prealloc_cf);
				793	kfree(aio_req);
				794	}
				795
				796	static void ceph_aio_complete_req(struct ceph_osd_request *req)
				797	{
				798	int rc = req->r_result;
				799	struct inode *inode = req->r_inode;
				800	struct ceph_aio_request *aio_req = req->r_priv;
				801	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
				802
				803	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
				804	BUG_ON(!osd_data->num_bvecs);
				805
				806	dout("ceph_aio_complete_req %p rc %d bytes %u\n",
				807	inode, rc, osd_data->bvec_pos.iter.bi_size);
				808
				809	if (rc == -EOLDSNAPC) {
				810	struct ceph_aio_work *aio_work;
				811	BUG_ON(!aio_req->write);
				812
				813	aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
				814	if (aio_work) {
				815	INIT_WORK(&aio_work->work, ceph_aio_retry_work);
				816	aio_work->req = req;
				817	queue_work(ceph_inode_to_client(inode)->inode_wq,
				818	&aio_work->work);
				819	return;
				820	}
				821	rc = -ENOMEM;
				822	} else if (!aio_req->write) {
				823	if (rc == -ENOENT)
				824	rc = 0;
				825	if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) {
				826	struct iov_iter i;
				827	int zlen = osd_data->bvec_pos.iter.bi_size - rc;
				828
				829	/*
				830	* If read is satisfied by single OSD request,
				831	* it can pass EOF. Otherwise read is within
				832	* i_size.
				833	*/
				834	if (aio_req->num_reqs == 1) {
				835	loff_t i_size = i_size_read(inode);
				836	loff_t endoff = aio_req->iocb->ki_pos + rc;
				837	if (endoff < i_size)
				838	zlen = min_t(size_t, zlen,
				839	i_size - endoff);
				840	aio_req->total_len = rc + zlen;
				841	}
				842
				843	iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs,
				844	osd_data->num_bvecs,
				845	osd_data->bvec_pos.iter.bi_size);
				846	iov_iter_advance(&i, rc);
				847	iov_iter_zero(zlen, &i);
				848	}
				849	}
				850
				851	put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
				852	aio_req->should_dirty);
				853	ceph_osdc_put_request(req);
				854
				855	if (rc < 0)
				856	cmpxchg(&aio_req->error, 0, rc);
				857
				858	ceph_aio_complete(inode, aio_req);
				859	return;
				860	}
				861
				862	static void ceph_aio_retry_work(struct work_struct *work)
				863	{
				864	struct ceph_aio_work *aio_work =
				865	container_of(work, struct ceph_aio_work, work);
				866	struct ceph_osd_request *orig_req = aio_work->req;
				867	struct ceph_aio_request *aio_req = orig_req->r_priv;
				868	struct inode *inode = orig_req->r_inode;
				869	struct ceph_inode_info *ci = ceph_inode(inode);
				870	struct ceph_snap_context *snapc;
				871	struct ceph_osd_request *req;
				872	int ret;
				873
				874	spin_lock(&ci->i_ceph_lock);
				875	if (__ceph_have_pending_cap_snap(ci)) {
				876	struct ceph_cap_snap *capsnap =
				877	list_last_entry(&ci->i_cap_snaps,
				878	struct ceph_cap_snap,
				879	ci_item);
				880	snapc = ceph_get_snap_context(capsnap->context);
				881	} else {
				882	BUG_ON(!ci->i_head_snapc);
				883	snapc = ceph_get_snap_context(ci->i_head_snapc);
				884	}
				885	spin_unlock(&ci->i_ceph_lock);
				886
				887	req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1,
				888	false, GFP_NOFS);
				889	if (!req) {
				890	ret = -ENOMEM;
				891	req = orig_req;
				892	goto out;
				893	}
				894
				895	req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP \| */ CEPH_OSD_FLAG_WRITE;
				896	ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
				897	ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
				898
				899	req->r_ops[0] = orig_req->r_ops[0];
				900
				901	req->r_mtime = aio_req->mtime;
				902	req->r_data_offset = req->r_ops[0].extent.offset;
				903
				904	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
				905	if (ret) {
				906	ceph_osdc_put_request(req);
				907	req = orig_req;
				908	goto out;
				909	}
				910
				911	ceph_osdc_put_request(orig_req);
				912
				913	req->r_callback = ceph_aio_complete_req;
				914	req->r_inode = inode;
				915	req->r_priv = aio_req;
				916
				917	ret = ceph_osdc_start_request(req->r_osdc, req, false);
				918	out:
				919	if (ret < 0) {
				920	req->r_result = ret;
				921	ceph_aio_complete_req(req);
				922	}
				923
				924	ceph_put_snap_context(snapc);
				925	kfree(aio_work);
				926	}
				927
				928	static ssize_t
				929	ceph_direct_read_write(struct kiocb iocb, struct iov_iter iter,
				930	struct ceph_snap_context *snapc,
				931	struct ceph_cap_flush **pcf)
				932	{
				933	struct file *file = iocb->ki_filp;
				934	struct inode *inode = file_inode(file);
				935	struct ceph_inode_info *ci = ceph_inode(inode);
				936	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
				937	struct ceph_vino vino;
				938	struct ceph_osd_request *req;
				939	struct bio_vec *bvecs;
				940	struct ceph_aio_request *aio_req = NULL;
				941	int num_pages = 0;
				942	int flags;
				943	int ret = 0;
				944	struct timespec64 mtime = current_time(inode);
				945	size_t count = iov_iter_count(iter);
				946	loff_t pos = iocb->ki_pos;
				947	bool write = iov_iter_rw(iter) == WRITE;
				948	bool should_dirty = !write && iter_is_iovec(iter);
				949
				950	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
				951	return -EROFS;
				952
				953	dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
				954	(write ? "write" : "read"), file, pos, (unsigned)count,
				955	snapc, snapc ? snapc->seq : 0);
				956
				957	if (write) {
				958	int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
				959	pos >> PAGE_SHIFT,
				960	(pos + count - 1) >> PAGE_SHIFT);
				961	if (ret2 < 0)
				962	dout("invalidate_inode_pages2_range returned %d\n", ret2);
				963
				964	flags = /* CEPH_OSD_FLAG_ORDERSNAP \| */ CEPH_OSD_FLAG_WRITE;
				965	} else {
				966	flags = CEPH_OSD_FLAG_READ;
				967	}
				968
				969	while (iov_iter_count(iter) > 0) {
				970	u64 size = iov_iter_count(iter);
				971	ssize_t len;
				972
				973	if (write)
				974	size = min_t(u64, size, fsc->mount_options->wsize);
				975	else
				976	size = min_t(u64, size, fsc->mount_options->rsize);
				977
				978	vino = ceph_vino(inode);
				979	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				980	vino, pos, &size, 0,
				981	1,
				982	write ? CEPH_OSD_OP_WRITE :
				983	CEPH_OSD_OP_READ,
				984	flags, snapc,
				985	ci->i_truncate_seq,
				986	ci->i_truncate_size,
				987	false);
				988	if (IS_ERR(req)) {
				989	ret = PTR_ERR(req);
				990	break;
				991	}
				992
				993	len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages);
				994	if (len < 0) {
				995	ceph_osdc_put_request(req);
				996	ret = len;
				997	break;
				998	}
				999	if (len != size)
				1000	osd_req_op_extent_update(req, 0, len);
				1001
				1002	/*
				1003	* To simplify error handling, allow AIO when IO within i_size
				1004	* or IO can be satisfied by single OSD request.
				1005	*/
				1006	if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
				1007	(len == count \|\| pos + count <= i_size_read(inode))) {
				1008	aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
				1009	if (aio_req) {
				1010	aio_req->iocb = iocb;
				1011	aio_req->write = write;
				1012	aio_req->should_dirty = should_dirty;
				1013	INIT_LIST_HEAD(&aio_req->osd_reqs);
				1014	if (write) {
				1015	aio_req->mtime = mtime;
				1016	swap(aio_req->prealloc_cf, *pcf);
				1017	}
				1018	}
				1019	/* ignore error */
				1020	}
				1021
				1022	if (write) {
				1023	/*
				1024	* throw out any page cache pages in this range. this
				1025	* may block.
				1026	*/
				1027	truncate_inode_pages_range(inode->i_mapping, pos,
				1028	PAGE_ALIGN(pos + len) - 1);
				1029
				1030	req->r_mtime = mtime;
				1031	}
				1032
				1033	osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
				1034
				1035	if (aio_req) {
				1036	aio_req->total_len += len;
				1037	aio_req->num_reqs++;
				1038	atomic_inc(&aio_req->pending_reqs);
				1039
				1040	req->r_callback = ceph_aio_complete_req;
				1041	req->r_inode = inode;
				1042	req->r_priv = aio_req;
				1043	list_add_tail(&req->r_private_item, &aio_req->osd_reqs);
				1044
				1045	pos += len;
				1046	continue;
				1047	}
				1048
				1049	ret = ceph_osdc_start_request(req->r_osdc, req, false);
				1050	if (!ret)
				1051	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
				1052
				1053	size = i_size_read(inode);
				1054	if (!write) {
				1055	if (ret == -ENOENT)
				1056	ret = 0;
				1057	if (ret >= 0 && ret < len && pos + ret < size) {
				1058	struct iov_iter i;
				1059	int zlen = min_t(size_t, len - ret,
				1060	size - pos - ret);
				1061
				1062	iov_iter_bvec(&i, READ, bvecs, num_pages, len);
				1063	iov_iter_advance(&i, ret);
				1064	iov_iter_zero(zlen, &i);
				1065	ret += zlen;
				1066	}
				1067	if (ret >= 0)
				1068	len = ret;
				1069	}
				1070
				1071	put_bvecs(bvecs, num_pages, should_dirty);
				1072	ceph_osdc_put_request(req);
				1073	if (ret < 0)
				1074	break;
				1075
				1076	pos += len;
				1077	if (!write && pos >= size)
				1078	break;
				1079
				1080	if (write && pos > size) {
				1081	if (ceph_inode_set_size(inode, pos))
				1082	ceph_check_caps(ceph_inode(inode),
				1083	CHECK_CAPS_AUTHONLY,
				1084	NULL);
				1085	}
				1086	}
				1087
				1088	if (aio_req) {
				1089	LIST_HEAD(osd_reqs);
				1090
				1091	if (aio_req->num_reqs == 0) {
				1092	kfree(aio_req);
				1093	return ret;
				1094	}
				1095
				1096	ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
				1097	CEPH_CAP_FILE_RD);
				1098
				1099	list_splice(&aio_req->osd_reqs, &osd_reqs);
				1100	inode_dio_begin(inode);
				1101	while (!list_empty(&osd_reqs)) {
				1102	req = list_first_entry(&osd_reqs,
				1103	struct ceph_osd_request,
				1104	r_private_item);
				1105	list_del_init(&req->r_private_item);
				1106	if (ret >= 0)
				1107	ret = ceph_osdc_start_request(req->r_osdc,
				1108	req, false);
				1109	if (ret < 0) {
				1110	req->r_result = ret;
				1111	ceph_aio_complete_req(req);
				1112	}
				1113	}
				1114	return -EIOCBQUEUED;
				1115	}
				1116
				1117	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
				1118	ret = pos - iocb->ki_pos;
				1119	iocb->ki_pos = pos;
				1120	}
				1121	return ret;
				1122	}
				1123
				1124	/*
				1125	* Synchronous write, straight from __user pointer or user pages.
				1126	*
				1127	* If write spans object boundary, just do multiple writes. (For a
				1128	* correct atomic write, we should e.g. take write locks on all
				1129	* objects, rollback on failure, etc.)
				1130	*/
				1131	static ssize_t
				1132	ceph_sync_write(struct kiocb iocb, struct iov_iter from, loff_t pos,
				1133	struct ceph_snap_context *snapc)
				1134	{
				1135	struct file *file = iocb->ki_filp;
				1136	struct inode *inode = file_inode(file);
				1137	struct ceph_inode_info *ci = ceph_inode(inode);
				1138	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
				1139	struct ceph_vino vino;
				1140	struct ceph_osd_request *req;
				1141	struct page **pages;
				1142	u64 len;
				1143	int num_pages;
				1144	int written = 0;
				1145	int flags;
				1146	int ret;
				1147	bool check_caps = false;
				1148	struct timespec64 mtime = current_time(inode);
				1149	size_t count = iov_iter_count(from);
				1150
				1151	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
				1152	return -EROFS;
				1153
				1154	dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
				1155	file, pos, (unsigned)count, snapc, snapc->seq);
				1156
				1157	ret = filemap_write_and_wait_range(inode->i_mapping,
				1158	pos, pos + count - 1);
				1159	if (ret < 0)
				1160	return ret;
				1161
				1162	ret = invalidate_inode_pages2_range(inode->i_mapping,
				1163	pos >> PAGE_SHIFT,
				1164	(pos + count - 1) >> PAGE_SHIFT);
				1165	if (ret < 0)
				1166	dout("invalidate_inode_pages2_range returned %d\n", ret);
				1167
				1168	flags = /* CEPH_OSD_FLAG_ORDERSNAP \| */ CEPH_OSD_FLAG_WRITE;
				1169
				1170	while ((len = iov_iter_count(from)) > 0) {
				1171	size_t left;
				1172	int n;
				1173
				1174	vino = ceph_vino(inode);
				1175	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				1176	vino, pos, &len, 0, 1,
				1177	CEPH_OSD_OP_WRITE, flags, snapc,
				1178	ci->i_truncate_seq,
				1179	ci->i_truncate_size,
				1180	false);
				1181	if (IS_ERR(req)) {
				1182	ret = PTR_ERR(req);
				1183	break;
				1184	}
				1185
				1186	/*
				1187	* write from beginning of first page,
				1188	* regardless of io alignment
				1189	*/
				1190	num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
				1191
				1192	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
				1193	if (IS_ERR(pages)) {
				1194	ret = PTR_ERR(pages);
				1195	goto out;
				1196	}
				1197
				1198	left = len;
				1199	for (n = 0; n < num_pages; n++) {
				1200	size_t plen = min_t(size_t, left, PAGE_SIZE);
				1201	ret = copy_page_from_iter(pages[n], 0, plen, from);
				1202	if (ret != plen) {
				1203	ret = -EFAULT;
				1204	break;
				1205	}
				1206	left -= ret;
				1207	}
				1208
				1209	if (ret < 0) {
				1210	ceph_release_page_vector(pages, num_pages);
				1211	goto out;
				1212	}
				1213
				1214	req->r_inode = inode;
				1215
				1216	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
				1217	false, true);
				1218
				1219	req->r_mtime = mtime;
				1220	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
				1221	if (!ret)
				1222	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
				1223
				1224	out:
				1225	ceph_osdc_put_request(req);
				1226	if (ret != 0) {
				1227	ceph_set_error_write(ci);
				1228	break;
				1229	}
				1230
				1231	ceph_clear_error_write(ci);
				1232	pos += len;
				1233	written += len;
				1234	if (pos > i_size_read(inode)) {
				1235	check_caps = ceph_inode_set_size(inode, pos);
				1236	if (check_caps)
				1237	ceph_check_caps(ceph_inode(inode),
				1238	CHECK_CAPS_AUTHONLY,
				1239	NULL);
				1240	}
				1241
				1242	}
				1243
				1244	if (ret != -EOLDSNAPC && written > 0) {
				1245	ret = written;
				1246	iocb->ki_pos = pos;
				1247	}
				1248	return ret;
				1249	}
				1250
				1251	/*
				1252	* Wrap generic_file_aio_read with checks for cap bits on the inode.
				1253	* Atomically grab references, so that those bits are not released
				1254	* back to the MDS mid-read.
				1255	*
				1256	* Hmm, the sync read case isn't actually async... should it be?
				1257	*/
				1258	static ssize_t ceph_read_iter(struct kiocb iocb, struct iov_iter to)
				1259	{
				1260	struct file *filp = iocb->ki_filp;
				1261	struct ceph_file_info *fi = filp->private_data;
				1262	size_t len = iov_iter_count(to);
				1263	struct inode *inode = file_inode(filp);
				1264	struct ceph_inode_info *ci = ceph_inode(inode);
				1265	struct page *pinned_page = NULL;
				1266	bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
				1267	ssize_t ret;
				1268	int want, got = 0;
				1269	int retry_op = 0, read = 0;
				1270
				1271	again:
				1272	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
				1273	inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
				1274
				1275	if (direct_lock)
				1276	ceph_start_io_direct(inode);
				1277	else
				1278	ceph_start_io_read(inode);
				1279
				1280	if (fi->fmode & CEPH_FILE_MODE_LAZY)
				1281	want = CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_LAZYIO;
				1282	else
				1283	want = CEPH_CAP_FILE_CACHE;
				1284	ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1,
				1285	&got, &pinned_page);
				1286	if (ret < 0) {
				1287	if (iocb->ki_flags & IOCB_DIRECT)
				1288	ceph_end_io_direct(inode);
				1289	else
				1290	ceph_end_io_read(inode);
				1291	return ret;
				1292	}
				1293
				1294	if ((got & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) == 0 \|\|
				1295	(iocb->ki_flags & IOCB_DIRECT) \|\|
				1296	(fi->flags & CEPH_F_SYNC)) {
				1297
				1298	dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
				1299	inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
				1300	ceph_cap_string(got));
				1301
				1302	if (ci->i_inline_version == CEPH_INLINE_NONE) {
				1303	if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
				1304	ret = ceph_direct_read_write(iocb, to,
				1305	NULL, NULL);
				1306	if (ret >= 0 && ret < len)
				1307	retry_op = CHECK_EOF;
				1308	} else {
				1309	ret = ceph_sync_read(iocb, to, &retry_op);
				1310	}
				1311	} else {
				1312	retry_op = READ_INLINE;
				1313	}
				1314	} else {
				1315	CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
				1316	dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
				1317	inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
				1318	ceph_cap_string(got));
				1319	ceph_add_rw_context(fi, &rw_ctx);
				1320	ret = generic_file_read_iter(iocb, to);
				1321	ceph_del_rw_context(fi, &rw_ctx);
				1322	}
				1323
				1324	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
				1325	inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
				1326	if (pinned_page) {
				1327	put_page(pinned_page);
				1328	pinned_page = NULL;
				1329	}
				1330	ceph_put_cap_refs(ci, got);
				1331
				1332	if (direct_lock)
				1333	ceph_end_io_direct(inode);
				1334	else
				1335	ceph_end_io_read(inode);
				1336
				1337	if (retry_op > HAVE_RETRIED && ret >= 0) {
				1338	int statret;
				1339	struct page *page = NULL;
				1340	loff_t i_size;
				1341	if (retry_op == READ_INLINE) {
				1342	page = __page_cache_alloc(GFP_KERNEL);
				1343	if (!page)
				1344	return -ENOMEM;
				1345	}
				1346
				1347	statret = __ceph_do_getattr(inode, page,
				1348	CEPH_STAT_CAP_INLINE_DATA, !!page);
				1349	if (statret < 0) {
				1350	if (page)
				1351	__free_page(page);
				1352	if (statret == -ENODATA) {
				1353	BUG_ON(retry_op != READ_INLINE);
				1354	goto again;
				1355	}
				1356	return statret;
				1357	}
				1358
				1359	i_size = i_size_read(inode);
				1360	if (retry_op == READ_INLINE) {
				1361	BUG_ON(ret > 0 \|\| read > 0);
				1362	if (iocb->ki_pos < i_size &&
				1363	iocb->ki_pos < PAGE_SIZE) {
				1364	loff_t end = min_t(loff_t, i_size,
				1365	iocb->ki_pos + len);
				1366	end = min_t(loff_t, end, PAGE_SIZE);
				1367	if (statret < end)
				1368	zero_user_segment(page, statret, end);
				1369	ret = copy_page_to_iter(page,
				1370	iocb->ki_pos & ~PAGE_MASK,
				1371	end - iocb->ki_pos, to);
				1372	iocb->ki_pos += ret;
				1373	read += ret;
				1374	}
				1375	if (iocb->ki_pos < i_size && read < len) {
				1376	size_t zlen = min_t(size_t, len - read,
				1377	i_size - iocb->ki_pos);
				1378	ret = iov_iter_zero(zlen, to);
				1379	iocb->ki_pos += ret;
				1380	read += ret;
				1381	}
				1382	__free_pages(page, 0);
				1383	return read;
				1384	}
				1385
				1386	/* hit EOF or hole? */
				1387	if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
				1388	ret < len) {
				1389	dout("sync_read hit hole, ppos %lld < size %lld"
				1390	", reading more\n", iocb->ki_pos, i_size);
				1391
				1392	read += ret;
				1393	len -= ret;
				1394	retry_op = HAVE_RETRIED;
				1395	goto again;
				1396	}
				1397	}
				1398
				1399	if (ret >= 0)
				1400	ret += read;
				1401
				1402	return ret;
				1403	}
				1404
				1405	/*
				1406	* Take cap references to avoid releasing caps to MDS mid-write.
				1407	*
				1408	* If we are synchronous, and write with an old snap context, the OSD
				1409	* may return EOLDSNAPC. In that case, retry the write.. _after_
				1410	* dropping our cap refs and allowing the pending snap to logically
				1411	* complete _before_ this write occurs.
				1412	*
				1413	* If we are near ENOSPC, write synchronously.
				1414	*/
				1415	static ssize_t ceph_write_iter(struct kiocb iocb, struct iov_iter from)
				1416	{
				1417	struct file *file = iocb->ki_filp;
				1418	struct ceph_file_info *fi = file->private_data;
				1419	struct inode *inode = file_inode(file);
				1420	struct ceph_inode_info *ci = ceph_inode(inode);
				1421	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
				1422	struct ceph_osd_client *osdc = &fsc->client->osdc;
				1423	struct ceph_cap_flush *prealloc_cf;
				1424	ssize_t count, written = 0;
				1425	int err, want, got;
				1426	bool direct_lock = false;
				1427	u32 map_flags;
				1428	u64 pool_flags;
				1429	loff_t pos;
				1430	loff_t limit = max(i_size_read(inode), fsc->max_file_size);
				1431
				1432	if (ceph_snap(inode) != CEPH_NOSNAP)
				1433	return -EROFS;
				1434
				1435	prealloc_cf = ceph_alloc_cap_flush();
				1436	if (!prealloc_cf)
				1437	return -ENOMEM;
				1438
				1439	if ((iocb->ki_flags & (IOCB_DIRECT \| IOCB_APPEND)) == IOCB_DIRECT)
				1440	direct_lock = true;
				1441
				1442	retry_snap:
				1443	if (direct_lock)
				1444	ceph_start_io_direct(inode);
				1445	else
				1446	ceph_start_io_write(inode);
				1447
				1448	/* We can write back this queue in page reclaim */
				1449	current->backing_dev_info = inode_to_bdi(inode);
				1450
				1451	if (iocb->ki_flags & IOCB_APPEND) {
				1452	err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
				1453	if (err < 0)
				1454	goto out;
				1455	}
				1456
				1457	err = generic_write_checks(iocb, from);
				1458	if (err <= 0)
				1459	goto out;
				1460
				1461	pos = iocb->ki_pos;
				1462	if (unlikely(pos >= limit)) {
				1463	err = -EFBIG;
				1464	goto out;
				1465	} else {
				1466	iov_iter_truncate(from, limit - pos);
				1467	}
				1468
				1469	count = iov_iter_count(from);
				1470	if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) {
				1471	err = -EDQUOT;
				1472	goto out;
				1473	}
				1474
				1475	down_read(&osdc->lock);
				1476	map_flags = osdc->osdmap->flags;
				1477	pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
				1478	up_read(&osdc->lock);
				1479	if ((map_flags & CEPH_OSDMAP_FULL) \|\|
				1480	(pool_flags & CEPH_POOL_FLAG_FULL)) {
				1481	err = -ENOSPC;
				1482	goto out;
				1483	}
				1484
				1485	err = file_remove_privs(file);
				1486	if (err)
				1487	goto out;
				1488
				1489	if (ci->i_inline_version != CEPH_INLINE_NONE) {
				1490	err = ceph_uninline_data(file, NULL);
				1491	if (err < 0)
				1492	goto out;
				1493	}
				1494
				1495	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
				1496	inode, ceph_vinop(inode), pos, count, i_size_read(inode));
				1497	if (fi->fmode & CEPH_FILE_MODE_LAZY)
				1498	want = CEPH_CAP_FILE_BUFFER \| CEPH_CAP_FILE_LAZYIO;
				1499	else
				1500	want = CEPH_CAP_FILE_BUFFER;
				1501	got = 0;
				1502	err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count,
				1503	&got, NULL);
				1504	if (err < 0)
				1505	goto out;
				1506
				1507	err = file_update_time(file);
				1508	if (err)
				1509	goto out_caps;
				1510
				1511	inode_inc_iversion_raw(inode);
				1512
				1513	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
				1514	inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
				1515
				1516	if ((got & (CEPH_CAP_FILE_BUFFER\|CEPH_CAP_FILE_LAZYIO)) == 0 \|\|
				1517	(iocb->ki_flags & IOCB_DIRECT) \|\| (fi->flags & CEPH_F_SYNC) \|\|
				1518	(ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
				1519	struct ceph_snap_context *snapc;
				1520	struct iov_iter data;
				1521
				1522	spin_lock(&ci->i_ceph_lock);
				1523	if (__ceph_have_pending_cap_snap(ci)) {
				1524	struct ceph_cap_snap *capsnap =
				1525	list_last_entry(&ci->i_cap_snaps,
				1526	struct ceph_cap_snap,
				1527	ci_item);
				1528	snapc = ceph_get_snap_context(capsnap->context);
				1529	} else {
				1530	BUG_ON(!ci->i_head_snapc);
				1531	snapc = ceph_get_snap_context(ci->i_head_snapc);
				1532	}
				1533	spin_unlock(&ci->i_ceph_lock);
				1534
				1535	/* we might need to revert back to that point */
				1536	data = *from;
				1537	if (iocb->ki_flags & IOCB_DIRECT)
				1538	written = ceph_direct_read_write(iocb, &data, snapc,
				1539	&prealloc_cf);
				1540	else
				1541	written = ceph_sync_write(iocb, &data, pos, snapc);
				1542	if (direct_lock)
				1543	ceph_end_io_direct(inode);
				1544	else
				1545	ceph_end_io_write(inode);
				1546	if (written > 0)
				1547	iov_iter_advance(from, written);
				1548	ceph_put_snap_context(snapc);
				1549	} else {
				1550	/*
				1551	* No need to acquire the i_truncate_mutex. Because
				1552	* the MDS revokes Fwb caps before sending truncate
				1553	* message to us. We can't get Fwb cap while there
				1554	* are pending vmtruncate. So write and vmtruncate
				1555	* can not run at the same time
				1556	*/
				1557	written = generic_perform_write(file, from, pos);
				1558	if (likely(written >= 0))
				1559	iocb->ki_pos = pos + written;
				1560	ceph_end_io_write(inode);
				1561	}
				1562
				1563	if (written >= 0) {
				1564	int dirty;
				1565
				1566	spin_lock(&ci->i_ceph_lock);
				1567	ci->i_inline_version = CEPH_INLINE_NONE;
				1568	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
				1569	&prealloc_cf);
				1570	spin_unlock(&ci->i_ceph_lock);
				1571	if (dirty)
				1572	__mark_inode_dirty(inode, dirty);
				1573	if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
				1574	ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
				1575	}
				1576
				1577	dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
				1578	inode, ceph_vinop(inode), pos, (unsigned)count,
				1579	ceph_cap_string(got));
				1580	ceph_put_cap_refs(ci, got);
				1581
				1582	if (written == -EOLDSNAPC) {
				1583	dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
				1584	inode, ceph_vinop(inode), pos, (unsigned)count);
				1585	goto retry_snap;
				1586	}
				1587
				1588	if (written >= 0) {
				1589	if ((map_flags & CEPH_OSDMAP_NEARFULL) \|\|
				1590	(pool_flags & CEPH_POOL_FLAG_NEARFULL))
				1591	iocb->ki_flags \|= IOCB_DSYNC;
				1592	written = generic_write_sync(iocb, written);
				1593	}
				1594
				1595	goto out_unlocked;
				1596	out_caps:
				1597	ceph_put_cap_refs(ci, got);
				1598	out:
				1599	if (direct_lock)
				1600	ceph_end_io_direct(inode);
				1601	else
				1602	ceph_end_io_write(inode);
				1603	out_unlocked:
				1604	ceph_free_cap_flush(prealloc_cf);
				1605	current->backing_dev_info = NULL;
				1606	return written ? written : err;
				1607	}
				1608
				1609	/*
				1610	* llseek. be sure to verify file size on SEEK_END.
				1611	*/
				1612	static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
				1613	{
				1614	struct inode *inode = file->f_mapping->host;
				1615	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
				1616	loff_t i_size;
				1617	loff_t ret;
				1618
				1619	inode_lock(inode);
				1620
				1621	if (whence == SEEK_END \|\| whence == SEEK_DATA \|\| whence == SEEK_HOLE) {
				1622	ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
				1623	if (ret < 0)
				1624	goto out;
				1625	}
				1626
				1627	i_size = i_size_read(inode);
				1628	switch (whence) {
				1629	case SEEK_END:
				1630	offset += i_size;
				1631	break;
				1632	case SEEK_CUR:
				1633	/*
				1634	* Here we special-case the lseek(fd, 0, SEEK_CUR)
				1635	* position-querying operation. Avoid rewriting the "same"
				1636	* f_pos value back to the file because a concurrent read(),
				1637	* write() or lseek() might have altered it
				1638	*/
				1639	if (offset == 0) {
				1640	ret = file->f_pos;
				1641	goto out;
				1642	}
				1643	offset += file->f_pos;
				1644	break;
				1645	case SEEK_DATA:
				1646	if (offset < 0 \|\| offset >= i_size) {
				1647	ret = -ENXIO;
				1648	goto out;
				1649	}
				1650	break;
				1651	case SEEK_HOLE:
				1652	if (offset < 0 \|\| offset >= i_size) {
				1653	ret = -ENXIO;
				1654	goto out;
				1655	}
				1656	offset = i_size;
				1657	break;
				1658	}
				1659
				1660	ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size));
				1661
				1662	out:
				1663	inode_unlock(inode);
				1664	return ret;
				1665	}
				1666
				1667	static inline void ceph_zero_partial_page(
				1668	struct inode *inode, loff_t offset, unsigned size)
				1669	{
				1670	struct page *page;
				1671	pgoff_t index = offset >> PAGE_SHIFT;
				1672
				1673	page = find_lock_page(inode->i_mapping, index);
				1674	if (page) {
				1675	wait_on_page_writeback(page);
				1676	zero_user(page, offset & (PAGE_SIZE - 1), size);
				1677	unlock_page(page);
				1678	put_page(page);
				1679	}
				1680	}
				1681
				1682	static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
				1683	loff_t length)
				1684	{
				1685	loff_t nearly = round_up(offset, PAGE_SIZE);
				1686	if (offset < nearly) {
				1687	loff_t size = nearly - offset;
				1688	if (length < size)
				1689	size = length;
				1690	ceph_zero_partial_page(inode, offset, size);
				1691	offset += size;
				1692	length -= size;
				1693	}
				1694	if (length >= PAGE_SIZE) {
				1695	loff_t size = round_down(length, PAGE_SIZE);
				1696	truncate_pagecache_range(inode, offset, offset + size - 1);
				1697	offset += size;
				1698	length -= size;
				1699	}
				1700	if (length)
				1701	ceph_zero_partial_page(inode, offset, length);
				1702	}
				1703
				1704	static int ceph_zero_partial_object(struct inode *inode,
				1705	loff_t offset, loff_t *length)
				1706	{
				1707	struct ceph_inode_info *ci = ceph_inode(inode);
				1708	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
				1709	struct ceph_osd_request *req;
				1710	int ret = 0;
				1711	loff_t zero = 0;
				1712	int op;
				1713
				1714	if (!length) {
				1715	op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
				1716	length = &zero;
				1717	} else {
				1718	op = CEPH_OSD_OP_ZERO;
				1719	}
				1720
				1721	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				1722	ceph_vino(inode),
				1723	offset, length,
				1724	0, 1, op,
				1725	CEPH_OSD_FLAG_WRITE,
				1726	NULL, 0, 0, false);
				1727	if (IS_ERR(req)) {
				1728	ret = PTR_ERR(req);
				1729	goto out;
				1730	}
				1731
				1732	req->r_mtime = inode->i_mtime;
				1733	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
				1734	if (!ret) {
				1735	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
				1736	if (ret == -ENOENT)
				1737	ret = 0;
				1738	}
				1739	ceph_osdc_put_request(req);
				1740
				1741	out:
				1742	return ret;
				1743	}
				1744
				1745	static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
				1746	{
				1747	int ret = 0;
				1748	struct ceph_inode_info *ci = ceph_inode(inode);
				1749	s32 stripe_unit = ci->i_layout.stripe_unit;
				1750	s32 stripe_count = ci->i_layout.stripe_count;
				1751	s32 object_size = ci->i_layout.object_size;
				1752	u64 object_set_size = object_size * stripe_count;
				1753	u64 nearly, t;
				1754
				1755	/* round offset up to next period boundary */
				1756	nearly = offset + object_set_size - 1;
				1757	t = nearly;
				1758	nearly -= do_div(t, object_set_size);
				1759
				1760	while (length && offset < nearly) {
				1761	loff_t size = length;
				1762	ret = ceph_zero_partial_object(inode, offset, &size);
				1763	if (ret < 0)
				1764	return ret;
				1765	offset += size;
				1766	length -= size;
				1767	}
				1768	while (length >= object_set_size) {
				1769	int i;
				1770	loff_t pos = offset;
				1771	for (i = 0; i < stripe_count; ++i) {
				1772	ret = ceph_zero_partial_object(inode, pos, NULL);
				1773	if (ret < 0)
				1774	return ret;
				1775	pos += stripe_unit;
				1776	}
				1777	offset += object_set_size;
				1778	length -= object_set_size;
				1779	}
				1780	while (length) {
				1781	loff_t size = length;
				1782	ret = ceph_zero_partial_object(inode, offset, &size);
				1783	if (ret < 0)
				1784	return ret;
				1785	offset += size;
				1786	length -= size;
				1787	}
				1788	return ret;
				1789	}
				1790
				1791	static long ceph_fallocate(struct file *file, int mode,
				1792	loff_t offset, loff_t length)
				1793	{
				1794	struct ceph_file_info *fi = file->private_data;
				1795	struct inode *inode = file_inode(file);
				1796	struct ceph_inode_info *ci = ceph_inode(inode);
				1797	struct ceph_cap_flush *prealloc_cf;
				1798	int want, got = 0;
				1799	int dirty;
				1800	int ret = 0;
				1801	loff_t endoff = 0;
				1802	loff_t size;
				1803
				1804	if (mode != (FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE))
				1805	return -EOPNOTSUPP;
				1806
				1807	if (!S_ISREG(inode->i_mode))
				1808	return -EOPNOTSUPP;
				1809
				1810	prealloc_cf = ceph_alloc_cap_flush();
				1811	if (!prealloc_cf)
				1812	return -ENOMEM;
				1813
				1814	inode_lock(inode);
				1815
				1816	if (ceph_snap(inode) != CEPH_NOSNAP) {
				1817	ret = -EROFS;
				1818	goto unlock;
				1819	}
				1820
				1821	if (ci->i_inline_version != CEPH_INLINE_NONE) {
				1822	ret = ceph_uninline_data(file, NULL);
				1823	if (ret < 0)
				1824	goto unlock;
				1825	}
				1826
				1827	size = i_size_read(inode);
				1828
				1829	/* Are we punching a hole beyond EOF? */
				1830	if (offset >= size)
				1831	goto unlock;
				1832	if ((offset + length) > size)
				1833	length = size - offset;
				1834
				1835	if (fi->fmode & CEPH_FILE_MODE_LAZY)
				1836	want = CEPH_CAP_FILE_BUFFER \| CEPH_CAP_FILE_LAZYIO;
				1837	else
				1838	want = CEPH_CAP_FILE_BUFFER;
				1839
				1840	ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
				1841	if (ret < 0)
				1842	goto unlock;
				1843
				1844	ceph_zero_pagecache_range(inode, offset, length);
				1845	ret = ceph_zero_objects(inode, offset, length);
				1846
				1847	if (!ret) {
				1848	spin_lock(&ci->i_ceph_lock);
				1849	ci->i_inline_version = CEPH_INLINE_NONE;
				1850	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
				1851	&prealloc_cf);
				1852	spin_unlock(&ci->i_ceph_lock);
				1853	if (dirty)
				1854	__mark_inode_dirty(inode, dirty);
				1855	}
				1856
				1857	ceph_put_cap_refs(ci, got);
				1858	unlock:
				1859	inode_unlock(inode);
				1860	ceph_free_cap_flush(prealloc_cf);
				1861	return ret;
				1862	}
				1863
				1864	/*
				1865	* This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for
				1866	* src_ci. Two attempts are made to obtain both caps, and an error is return if
				1867	* this fails; zero is returned on success.
				1868	*/
				1869	static int get_rd_wr_caps(struct file src_filp, int src_got,
				1870	struct file *dst_filp,
				1871	loff_t dst_endoff, int *dst_got)
				1872	{
				1873	int ret = 0;
				1874	bool retrying = false;
				1875
				1876	retry_caps:
				1877	ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
				1878	dst_endoff, dst_got, NULL);
				1879	if (ret < 0)
				1880	return ret;
				1881
				1882	/*
				1883	* Since we're already holding the FILE_WR capability for the dst file,
				1884	* we would risk a deadlock by using ceph_get_caps. Thus, we'll do some
				1885	* retry dance instead to try to get both capabilities.
				1886	*/
				1887	ret = ceph_try_get_caps(file_inode(src_filp),
				1888	CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
				1889	false, src_got);
				1890	if (ret <= 0) {
				1891	/* Start by dropping dst_ci caps and getting src_ci caps */
				1892	ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got);
				1893	if (retrying) {
				1894	if (!ret)
				1895	/* ceph_try_get_caps masks EAGAIN */
				1896	ret = -EAGAIN;
				1897	return ret;
				1898	}
				1899	ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
				1900	CEPH_CAP_FILE_SHARED, -1, src_got, NULL);
				1901	if (ret < 0)
				1902	return ret;
				1903	/... drop src_ci caps too, and retry /
				1904	ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got);
				1905	retrying = true;
				1906	goto retry_caps;
				1907	}
				1908	return ret;
				1909	}
				1910
				1911	static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
				1912	struct ceph_inode_info *dst_ci, int dst_got)
				1913	{
				1914	ceph_put_cap_refs(src_ci, src_got);
				1915	ceph_put_cap_refs(dst_ci, dst_got);
				1916	}
				1917
				1918	/*
				1919	* This function does several size-related checks, returning an error if:
				1920	* - source file is smaller than off+len
				1921	* - destination file size is not OK (inode_newsize_ok())
				1922	* - max bytes quotas is exceeded
				1923	*/
				1924	static int is_file_size_ok(struct inode src_inode, struct inode dst_inode,
				1925	loff_t src_off, loff_t dst_off, size_t len)
				1926	{
				1927	loff_t size, endoff;
				1928
				1929	size = i_size_read(src_inode);
				1930	/*
				1931	* Don't copy beyond source file EOF. Instead of simply setting length
				1932	* to (size - src_off), just drop to VFS default implementation, as the
				1933	* local i_size may be stale due to other clients writing to the source
				1934	* inode.
				1935	*/
				1936	if (src_off + len > size) {
				1937	dout("Copy beyond EOF (%llu + %zu > %llu)\n",
				1938	src_off, len, size);
				1939	return -EOPNOTSUPP;
				1940	}
				1941	size = i_size_read(dst_inode);
				1942
				1943	endoff = dst_off + len;
				1944	if (inode_newsize_ok(dst_inode, endoff))
				1945	return -EOPNOTSUPP;
				1946
				1947	if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff))
				1948	return -EDQUOT;
				1949
				1950	return 0;
				1951	}
				1952
				1953	static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
				1954	struct file *dst_file, loff_t dst_off,
				1955	size_t len, unsigned int flags)
				1956	{
				1957	struct inode *src_inode = file_inode(src_file);
				1958	struct inode *dst_inode = file_inode(dst_file);
				1959	struct ceph_inode_info *src_ci = ceph_inode(src_inode);
				1960	struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
				1961	struct ceph_cap_flush *prealloc_cf;
				1962	struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
				1963	struct ceph_object_locator src_oloc, dst_oloc;
				1964	struct ceph_object_id src_oid, dst_oid;
				1965	loff_t endoff = 0, size;
				1966	ssize_t ret = -EIO;
				1967	u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
				1968	u32 src_objlen, dst_objlen, object_size;
				1969	int src_got = 0, dst_got = 0, err, dirty;
				1970	bool do_final_copy = false;
				1971
				1972	if (src_inode->i_sb != dst_inode->i_sb) {
				1973	struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
				1974
				1975	if (ceph_fsid_compare(&src_fsc->client->fsid,
				1976	&dst_fsc->client->fsid)) {
				1977	dout("Copying files across clusters: src: %pU dst: %pU\n",
				1978	&src_fsc->client->fsid, &dst_fsc->client->fsid);
				1979	return -EXDEV;
				1980	}
				1981	}
				1982	if (ceph_snap(dst_inode) != CEPH_NOSNAP)
				1983	return -EROFS;
				1984
				1985	/*
				1986	* Some of the checks below will return -EOPNOTSUPP, which will force a
				1987	* fallback to the default VFS copy_file_range implementation. This is
				1988	* desirable in several cases (for ex, the 'len' is smaller than the
				1989	* size of the objects, or in cases where that would be more
				1990	* efficient).
				1991	*/
				1992
				1993	if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
				1994	return -EOPNOTSUPP;
				1995
				1996	/*
				1997	* Striped file layouts require that we copy partial objects, but the
				1998	* OSD copy-from operation only supports full-object copies. Limit
				1999	* this to non-striped file layouts for now.
				2000	*/
				2001	if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) \|\|
				2002	(src_ci->i_layout.stripe_count != 1) \|\|
				2003	(dst_ci->i_layout.stripe_count != 1) \|\|
				2004	(src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
				2005	dout("Invalid src/dst files layout\n");
				2006	return -EOPNOTSUPP;
				2007	}
				2008
				2009	if (len < src_ci->i_layout.object_size)
				2010	return -EOPNOTSUPP; /* no remote copy will be done */
				2011
				2012	prealloc_cf = ceph_alloc_cap_flush();
				2013	if (!prealloc_cf)
				2014	return -ENOMEM;
				2015
				2016	/* Start by sync'ing the source and destination files */
				2017	ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
				2018	if (ret < 0) {
				2019	dout("failed to write src file (%zd)\n", ret);
				2020	goto out;
				2021	}
				2022	ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len));
				2023	if (ret < 0) {
				2024	dout("failed to write dst file (%zd)\n", ret);
				2025	goto out;
				2026	}
				2027
				2028	/*
				2029	* We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other
				2030	* clients may have dirty data in their caches. And OSDs know nothing
				2031	* about caps, so they can't safely do the remote object copies.
				2032	*/
				2033	err = get_rd_wr_caps(src_file, &src_got,
				2034	dst_file, (dst_off + len), &dst_got);
				2035	if (err < 0) {
				2036	dout("get_rd_wr_caps returned %d\n", err);
				2037	ret = -EOPNOTSUPP;
				2038	goto out;
				2039	}
				2040
				2041	ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len);
				2042	if (ret < 0)
				2043	goto out_caps;
				2044
				2045	size = i_size_read(dst_inode);
				2046	endoff = dst_off + len;
				2047
				2048	/* Drop dst file cached pages */
				2049	ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
				2050	dst_off >> PAGE_SHIFT,
				2051	endoff >> PAGE_SHIFT);
				2052	if (ret < 0) {
				2053	dout("Failed to invalidate inode pages (%zd)\n", ret);
				2054	ret = 0; /* XXX */
				2055	}
				2056	src_oloc.pool = src_ci->i_layout.pool_id;
				2057	src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
				2058	dst_oloc.pool = dst_ci->i_layout.pool_id;
				2059	dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
				2060
				2061	ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
				2062	src_ci->i_layout.object_size,
				2063	&src_objnum, &src_objoff, &src_objlen);
				2064	ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
				2065	dst_ci->i_layout.object_size,
				2066	&dst_objnum, &dst_objoff, &dst_objlen);
				2067	/* object-level offsets need to the same */
				2068	if (src_objoff != dst_objoff) {
				2069	ret = -EOPNOTSUPP;
				2070	goto out_caps;
				2071	}
				2072
				2073	/*
				2074	* Do a manual copy if the object offset isn't object aligned.
				2075	* 'src_objlen' contains the bytes left until the end of the object,
				2076	* starting at the src_off
				2077	*/
				2078	if (src_objoff) {
				2079	/*
				2080	* we need to temporarily drop all caps as we'll be calling
				2081	* {read,write}_iter, which will get caps again.
				2082	*/
				2083	put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
				2084	ret = do_splice_direct(src_file, &src_off, dst_file,
				2085	&dst_off, src_objlen, flags);
				2086	if (ret < 0) {
				2087	dout("do_splice_direct returned %d\n", err);
				2088	goto out;
				2089	}
				2090	len -= ret;
				2091	err = get_rd_wr_caps(src_file, &src_got,
				2092	dst_file, (dst_off + len), &dst_got);
				2093	if (err < 0)
				2094	goto out;
				2095	err = is_file_size_ok(src_inode, dst_inode,
				2096	src_off, dst_off, len);
				2097	if (err < 0)
				2098	goto out_caps;
				2099	}
				2100	object_size = src_ci->i_layout.object_size;
				2101	while (len >= object_size) {
				2102	ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
				2103	object_size, &src_objnum,
				2104	&src_objoff, &src_objlen);
				2105	ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
				2106	object_size, &dst_objnum,
				2107	&dst_objoff, &dst_objlen);
				2108	ceph_oid_init(&src_oid);
				2109	ceph_oid_printf(&src_oid, "%llx.%08llx",
				2110	src_ci->i_vino.ino, src_objnum);
				2111	ceph_oid_init(&dst_oid);
				2112	ceph_oid_printf(&dst_oid, "%llx.%08llx",
				2113	dst_ci->i_vino.ino, dst_objnum);
				2114	/* Do an object remote copy */
				2115	err = ceph_osdc_copy_from(
				2116	&src_fsc->client->osdc,
				2117	src_ci->i_vino.snap, 0,
				2118	&src_oid, &src_oloc,
				2119	CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL \|
				2120	CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
				2121	&dst_oid, &dst_oloc,
				2122	CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL \|
				2123	CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0);
				2124	if (err) {
				2125	dout("ceph_osdc_copy_from returned %d\n", err);
				2126	if (!ret)
				2127	ret = err;
				2128	goto out_caps;
				2129	}
				2130	len -= object_size;
				2131	src_off += object_size;
				2132	dst_off += object_size;
				2133	ret += object_size;
				2134	}
				2135
				2136	if (len)
				2137	/* We still need one final local copy */
				2138	do_final_copy = true;
				2139
				2140	file_update_time(dst_file);
				2141	inode_inc_iversion_raw(dst_inode);
				2142
				2143	if (endoff > size) {
				2144	int caps_flags = 0;
				2145
				2146	/* Let the MDS know about dst file size change */
				2147	if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
				2148	caps_flags \|= CHECK_CAPS_NODELAY;
				2149	if (ceph_inode_set_size(dst_inode, endoff))
				2150	caps_flags \|= CHECK_CAPS_AUTHONLY;
				2151	if (caps_flags)
				2152	ceph_check_caps(dst_ci, caps_flags, NULL);
				2153	}
				2154	/* Mark Fw dirty */
				2155	spin_lock(&dst_ci->i_ceph_lock);
				2156	dst_ci->i_inline_version = CEPH_INLINE_NONE;
				2157	dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
				2158	spin_unlock(&dst_ci->i_ceph_lock);
				2159	if (dirty)
				2160	__mark_inode_dirty(dst_inode, dirty);
				2161
				2162	out_caps:
				2163	put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
				2164
				2165	if (do_final_copy) {
				2166	err = do_splice_direct(src_file, &src_off, dst_file,
				2167	&dst_off, len, flags);
				2168	if (err < 0) {
				2169	dout("do_splice_direct returned %d\n", err);
				2170	goto out;
				2171	}
				2172	len -= err;
				2173	ret += err;
				2174	}
				2175
				2176	out:
				2177	ceph_free_cap_flush(prealloc_cf);
				2178
				2179	return ret;
				2180	}
				2181
				2182	static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
				2183	struct file *dst_file, loff_t dst_off,
				2184	size_t len, unsigned int flags)
				2185	{
				2186	ssize_t ret;
				2187
				2188	ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
				2189	len, flags);
				2190
				2191	if (ret == -EOPNOTSUPP \|\| ret == -EXDEV)
				2192	ret = generic_copy_file_range(src_file, src_off, dst_file,
				2193	dst_off, len, flags);
				2194	return ret;
				2195	}
				2196
				2197	const struct file_operations ceph_file_fops = {
				2198	.open = ceph_open,
				2199	.release = ceph_release,
				2200	.llseek = ceph_llseek,
				2201	.read_iter = ceph_read_iter,
				2202	.write_iter = ceph_write_iter,
				2203	.mmap = ceph_mmap,
				2204	.fsync = ceph_fsync,
				2205	.lock = ceph_lock,
				2206	.setlease = simple_nosetlease,
				2207	.flock = ceph_flock,
				2208	.splice_read = generic_file_splice_read,
				2209	.splice_write = iter_file_splice_write,
				2210	.unlocked_ioctl = ceph_ioctl,
				2211	.compat_ioctl = compat_ptr_ioctl,
				2212	.fallocate = ceph_fallocate,
				2213	.copy_file_range = ceph_copy_file_range,
				2214	};