// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/falloc.h>

#include "super.h"
#include "mds_client.h"
#include "cache.h"

static __le32 ceph_flags_sys2wire(u32 flags)
{
	u32 wire_flags = 0;

	switch (flags & O_ACCMODE) {
	case O_RDONLY:
		wire_flags |= CEPH_O_RDONLY;
		break;
	case O_WRONLY:
		wire_flags |= CEPH_O_WRONLY;
		break;
	case O_RDWR:
		wire_flags |= CEPH_O_RDWR;
		break;
	}

	flags &= ~O_ACCMODE;

#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }

	ceph_sys2wire(O_CREAT);
	ceph_sys2wire(O_EXCL);
	ceph_sys2wire(O_TRUNC);
	ceph_sys2wire(O_DIRECTORY);
	ceph_sys2wire(O_NOFOLLOW);

#undef ceph_sys2wire

	if (flags)
		dout("unused open flags: %x\n", flags);

	return cpu_to_le32(wire_flags);
}

/*
 * Ceph file operations
 *
 * Implement basic open/close functionality, and implement
 * read/write.
 *
 * We implement three modes of file I/O:
 *  - buffered uses the generic_file_aio_{read,write} helpers
 *
 *  - synchronous is used when there is multi-client read/write
 *    sharing, avoids the page cache, and synchronously waits for an
 *    ack from the OSD.
 *
 *  - direct io takes the variant of the sync path that references
 *    user pages directly.
 *
 * fsync() flushes and waits on dirty pages, but just queues metadata
 * for writeback: since the MDS can recover size and mtime there is no
 * need to wait for MDS acknowledgement.
 */

/*
 * How many pages to get in one call to iov_iter_get_pages().  This
 * determines the size of the on-stack array used as a buffer.
 */
#define ITER_GET_BVECS_PAGES	64

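/*
 * Pin pages from @iter and fill @bvecs with one bio_vec per page, up
 * to @maxsize bytes.  @iter is advanced past the pinned pages.  Returns
 * the number of bytes covered, or, if nothing was pinned, the error
 * from iov_iter_get_pages().
 */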
static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
				struct bio_vec *bvecs)
{
	size_t size = 0;
	int bvec_idx = 0;

	if (maxsize > iov_iter_count(iter))
		maxsize = iov_iter_count(iter);

	while (size < maxsize) {
		struct page *pages[ITER_GET_BVECS_PAGES];
		ssize_t bytes;
		size_t start;
		int idx = 0;

		bytes = iov_iter_get_pages(iter, pages, maxsize - size,
					   ITER_GET_BVECS_PAGES, &start);
		if (bytes < 0)
			return size ?: bytes;

		iov_iter_advance(iter, bytes);
		size += bytes;

		for ( ; bytes; idx++, bvec_idx++) {
			struct bio_vec bv = {
				.bv_page = pages[idx],
				.bv_len = min_t(int, bytes, PAGE_SIZE - start),
				.bv_offset = start,
			};

			bvecs[bvec_idx] = bv;
			bytes -= bv.bv_len;
			start = 0;
		}
	}

	return size;
}

/*
 * iov_iter_get_pages() only considers one iov_iter segment, no matter
 * what maxsize or maxpages are given.  For ITER_BVEC that is a single
 * page.
 *
 * Attempt to get up to @maxsize bytes worth of pages from @iter.
 * Return the number of bytes in the created bio_vec array, or an error.
 */
static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
				    struct bio_vec **bvecs, int *num_bvecs)
{
	struct bio_vec *bv;
	size_t orig_count = iov_iter_count(iter);
	ssize_t bytes;
	int npages;

	iov_iter_truncate(iter, maxsize);
	npages = iov_iter_npages(iter, INT_MAX);
	iov_iter_reexpand(iter, orig_count);

	/*
	 * __iter_get_bvecs() may populate only part of the array -- zero it
	 * out.
	 */
	bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO);
	if (!bv)
		return -ENOMEM;

	bytes = __iter_get_bvecs(iter, maxsize, bv);
	if (bytes < 0) {
		/*
		 * No pages were pinned -- just free the array.
		 */
		kvfree(bv);
		return bytes;
	}

	*bvecs = bv;
	*num_bvecs = npages;
	return bytes;
}

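/*
 * Undo __iter_get_bvecs(): unpin the pages (dirtying them first when
 * they received read data) and free the bio_vec array.
 */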
static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
{
	int i;

	for (i = 0; i < num_bvecs; i++) {
		if (bvecs[i].bv_page) {
			if (should_dirty)
				set_page_dirty_lock(bvecs[i].bv_page);
			put_page(bvecs[i].bv_page);
		}
	}
	kvfree(bvecs);
}

/*
 * Prepare an open request.  Preallocate ceph_cap to avoid an
 * inopportune ENOMEM later.
 */
static struct ceph_mds_request *
prepare_open_request(struct super_block *sb, int flags, int create_mode)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int want_auth = USE_ANY_MDS;
	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;

	if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
		want_auth = USE_AUTH_MDS;

	req = ceph_mdsc_create_request(mdsc, op, want_auth);
	if (IS_ERR(req))
		goto out;
	req->r_fmode = ceph_flags_to_mode(flags);
	req->r_args.open.flags = ceph_flags_sys2wire(flags);
	req->r_args.open.mode = cpu_to_le32(create_mode);
out:
	return req;
}

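/*
 * Allocate and initialize the per-file private data: a ceph_file_info
 * for regular files, or the larger ceph_dir_file_info for directories.
 * On allocation failure, drop the fmode reference taken by the caller.
 */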
static int ceph_init_file_info(struct inode *inode, struct file *file,
			       int fmode, bool isdir)
{
	struct ceph_file_info *fi;

	dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
	     inode->i_mode, isdir ? "dir" : "regular");
	BUG_ON(inode->i_fop->release != ceph_release);

	if (isdir) {
		struct ceph_dir_file_info *dfi =
			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
		if (!dfi) {
			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
			return -ENOMEM;
		}

		file->private_data = dfi;
		fi = &dfi->file_info;
		dfi->next_offset = 2;
		dfi->readdir_cache_idx = -1;
	} else {
		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
		if (!fi) {
			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
			return -ENOMEM;
		}

		file->private_data = fi;
	}

	fi->fmode = fmode;
	spin_lock_init(&fi->rw_contexts_lock);
	INIT_LIST_HEAD(&fi->rw_contexts);

	return 0;
}

/*
 * initialize private struct file data.
 * if we fail, clean up by dropping fmode reference on the ceph_inode
 */
static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
	int ret = 0;

	switch (inode->i_mode & S_IFMT) {
	case S_IFREG:
		ceph_fscache_register_inode_cookie(inode);
		ceph_fscache_file_set_cookie(inode, file);
		/* fall through */
	case S_IFDIR:
		ret = ceph_init_file_info(inode, file, fmode,
					  S_ISDIR(inode->i_mode));
		if (ret)
			return ret;
		break;

	case S_IFLNK:
		dout("init_file %p %p 0%o (symlink)\n", inode, file,
		     inode->i_mode);
		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
		break;

	default:
		dout("init_file %p %p 0%o (special)\n", inode, file,
		     inode->i_mode);
		/*
		 * we need to drop the open ref now, since we don't
		 * have .release set to ceph_release.
		 */
		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
		BUG_ON(inode->i_fop->release == ceph_release);

		/* call the proper open fop */
		ret = inode->i_fop->open(inode, file);
	}
	return ret;
}

/*
 * try to renew caps after session gets killed.
 */
int ceph_renew_caps(struct inode *inode)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_request *req;
	int err, flags, wanted;

	spin_lock(&ci->i_ceph_lock);
	wanted = __ceph_caps_file_wanted(ci);
	if (__ceph_is_any_real_caps(ci) &&
	    (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
		int issued = __ceph_caps_issued(ci, NULL);
		spin_unlock(&ci->i_ceph_lock);
		dout("renew caps %p want %s issued %s updating mds_wanted\n",
		     inode, ceph_cap_string(wanted), ceph_cap_string(issued));
		ceph_check_caps(ci, 0, NULL);
		return 0;
	}
	spin_unlock(&ci->i_ceph_lock);

	flags = 0;
	if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
		flags = O_RDWR;
	else if (wanted & CEPH_CAP_FILE_RD)
		flags = O_RDONLY;
	else if (wanted & CEPH_CAP_FILE_WR)
		flags = O_WRONLY;
#ifdef O_LAZY
	if (wanted & CEPH_CAP_FILE_LAZYIO)
		flags |= O_LAZY;
#endif

	req = prepare_open_request(inode->i_sb, flags, 0);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;
	req->r_fmode = -1;

	err = ceph_mdsc_do_request(mdsc, NULL, req);
	ceph_mdsc_put_request(req);
out:
	dout("renew caps %p open result=%d\n", inode, err);
	return err < 0 ? err : 0;
}

/*
 * If we already have the requisite capabilities, we can satisfy
 * the open request locally (no need to request new caps from the
 * MDS).  We do, however, need to inform the MDS (asynchronously)
 * if our wanted caps set expands.
 */
int ceph_open(struct inode *inode, struct file *file)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	struct ceph_file_info *fi = file->private_data;
	int err;
	int flags, fmode, wanted;

	if (fi) {
		dout("open file %p is already opened\n", file);
		return 0;
	}

	/* filter out O_CREAT|O_EXCL; vfs did that already.  yuck. */
	flags = file->f_flags & ~(O_CREAT|O_EXCL);
	if (S_ISDIR(inode->i_mode))
		flags = O_DIRECTORY;  /* mds likes to know */

	dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
	     ceph_vinop(inode), file, flags, file->f_flags);
	fmode = ceph_flags_to_mode(flags);
	wanted = ceph_caps_for_mode(fmode);

	/* snapped files are read-only */
	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
		return -EROFS;

	/* trivially open snapdir */
	if (ceph_snap(inode) == CEPH_SNAPDIR) {
		spin_lock(&ci->i_ceph_lock);
		__ceph_get_fmode(ci, fmode);
		spin_unlock(&ci->i_ceph_lock);
		return ceph_init_file(inode, file, fmode);
	}

	/*
	 * No need to block if we have caps on the auth MDS (for
	 * write) or any MDS (for read).  Update wanted set
	 * asynchronously.
	 */
	spin_lock(&ci->i_ceph_lock);
	if (__ceph_is_any_real_caps(ci) &&
	    (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
		int mds_wanted = __ceph_caps_mds_wanted(ci, true);
		int issued = __ceph_caps_issued(ci, NULL);

		dout("open %p fmode %d want %s issued %s using existing\n",
		     inode, fmode, ceph_cap_string(wanted),
		     ceph_cap_string(issued));
		__ceph_get_fmode(ci, fmode);
		spin_unlock(&ci->i_ceph_lock);

		/* adjust wanted? */
		if ((issued & wanted) != wanted &&
		    (mds_wanted & wanted) != wanted &&
		    ceph_snap(inode) != CEPH_SNAPDIR)
			ceph_check_caps(ci, 0, NULL);

		return ceph_init_file(inode, file, fmode);
	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
		   (ci->i_snap_caps & wanted) == wanted) {
		__ceph_get_fmode(ci, fmode);
		spin_unlock(&ci->i_ceph_lock);
		return ceph_init_file(inode, file, fmode);
	}

	spin_unlock(&ci->i_ceph_lock);

	dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
	req = prepare_open_request(inode->i_sb, flags, 0);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_inode = inode;
	ihold(inode);

	req->r_num_caps = 1;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (!err)
		err = ceph_init_file(inode, file, req->r_fmode);
	ceph_mdsc_put_request(req);
	dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
out:
	return err;
}


/*
 * Do a lookup + open with a single request.  If we get a non-existent
 * file or symlink, return 1 so the VFS can retry.
 */
int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
		     struct file *file, unsigned flags, umode_t mode)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	struct dentry *dn;
	struct ceph_acls_info acls = {};
	int mask;
	int err;

	dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
	     dir, dentry, dentry,
	     d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);

	if (dentry->d_name.len > NAME_MAX)
		return -ENAMETOOLONG;

	if (flags & O_CREAT) {
		if (ceph_quota_is_max_files_exceeded(dir))
			return -EDQUOT;
		err = ceph_pre_init_acls(dir, &mode, &acls);
		if (err < 0)
			return err;
	}

	/* do the open */
	req = prepare_open_request(dir->i_sb, flags, mode);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out_acl;
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	if (flags & O_CREAT) {
		req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
		if (acls.pagelist) {
			req->r_pagelist = acls.pagelist;
			acls.pagelist = NULL;
		}
	}

	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
	if (ceph_security_xattr_wanted(dir))
		mask |= CEPH_CAP_XATTR_SHARED;
	req->r_args.open.mask = cpu_to_le32(mask);

	req->r_parent = dir;
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	err = ceph_mdsc_do_request(mdsc,
				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
				   req);
	err = ceph_handle_snapdir(req, dentry, err);
	if (err)
		goto out_req;

	if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);

	if (d_in_lookup(dentry)) {
		dn = ceph_finish_lookup(req, dentry, err);
		if (IS_ERR(dn))
			err = PTR_ERR(dn);
	} else {
		/* we were given a hashed negative dentry */
		dn = NULL;
	}
	if (err)
		goto out_req;
	if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
		/* make vfs retry on splice, ENOENT, or symlink */
		dout("atomic_open finish_no_open on dn %p\n", dn);
		err = finish_no_open(file, dn);
	} else {
		dout("atomic_open finish_open on dn %p\n", dn);
		if (req->r_op == CEPH_MDS_OP_CREATE &&
		    req->r_reply_info.has_create_ino) {
			ceph_init_inode_acls(d_inode(dentry), &acls);
			file->f_mode |= FMODE_CREATED;
		}
		err = finish_open(file, dentry, ceph_open);
	}
out_req:
	if (!req->r_err && req->r_target_inode)
		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
	ceph_mdsc_put_request(req);
out_acl:
	ceph_release_acls_info(&acls);
	dout("atomic_open result=%d\n", err);
	return err;
}

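/*
 * Tear down the per-file private data allocated in ceph_init_file_info()
 * and drop the corresponding fmode reference on the inode.
 */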
int ceph_release(struct inode *inode, struct file *file)
{
	struct ceph_inode_info *ci = ceph_inode(inode);

	if (S_ISDIR(inode->i_mode)) {
		struct ceph_dir_file_info *dfi = file->private_data;
		dout("release inode %p dir file %p\n", inode, file);
		WARN_ON(!list_empty(&dfi->file_info.rw_contexts));

		ceph_put_fmode(ci, dfi->file_info.fmode);

		if (dfi->last_readdir)
			ceph_mdsc_put_request(dfi->last_readdir);
		kfree(dfi->last_name);
		kfree(dfi->dir_info);
		kmem_cache_free(ceph_dir_file_cachep, dfi);
	} else {
		struct ceph_file_info *fi = file->private_data;
		dout("release inode %p regular file %p\n", inode, file);
		WARN_ON(!list_empty(&fi->rw_contexts));

		ceph_put_fmode(ci, fi->fmode);
		kmem_cache_free(ceph_file_cachep, fi);
	}

	/* wake up anyone waiting for caps on this inode */
	wake_up_all(&ci->i_cap_wq);
	return 0;
}

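/*
 * retry_op states for ceph_read_iter(): HAVE_RETRIED marks a read that
 * has already been retried once, CHECK_EOF asks for an i_size re-check
 * (a short read may have hit EOF or a hole), and READ_INLINE falls
 * back to fetching inline data from the MDS.
 */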
enum {
	HAVE_RETRIED = 1,
	CHECK_EOF =    2,
	READ_INLINE =  3,
};

/*
 * Read a range of bytes striped over one or more objects.  Iterate over
 * objects we stripe over.  (That's not atomic, but good enough for now.)
 *
 * If we get a short result from the OSD, check against i_size; we need to
 * only return a short read to the caller if we hit EOF.
 */
static int striped_read(struct inode *inode,
			u64 pos, u64 len,
			struct page **pages, int num_pages,
			int page_align, int *checkeof)
{
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 this_len;
	loff_t i_size;
	int page_idx;
	int ret, read = 0;
	bool hit_stripe, was_short;

	/*
	 * we may need to do multiple reads.  not atomic, unfortunately.
	 */
more:
	this_len = len;
	page_idx = (page_align + read) >> PAGE_SHIFT;
	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
				  &ci->i_layout, pos, &this_len,
				  ci->i_truncate_seq, ci->i_truncate_size,
				  pages + page_idx, num_pages - page_idx,
				  ((page_align + read) & ~PAGE_MASK));
	if (ret == -ENOENT)
		ret = 0;
	hit_stripe = this_len < len;
	was_short = ret >= 0 && ret < this_len;
	dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");

	i_size = i_size_read(inode);
	if (ret >= 0) {
		if (was_short && (pos + ret < i_size)) {
			int zlen = min(this_len - ret, i_size - pos - ret);
			int zoff = page_align + read + ret;
			dout(" zero gap %llu to %llu\n",
			     pos + ret, pos + ret + zlen);
			ceph_zero_page_vector_range(zoff, zlen, pages);
			ret += zlen;
		}

		read += ret;
		pos += ret;
		len -= ret;

		/* hit a stripe boundary and need to continue */
		if (len && hit_stripe && pos < i_size)
			goto more;
	}

	if (read > 0) {
		ret = read;
		/* did we bounce off eof? */
		if (pos + len > i_size)
			*checkeof = CHECK_EOF;
	}

	dout("striped_read returns %d\n", ret);
	return ret;
}

/*
 * Completely synchronous read and write methods.  Direct from __user
 * buffer to osd, or directly to user pages (if O_DIRECT).
 *
 * If the read spans object boundary, just do multiple reads.
 */
static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
			      int *checkeof)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct page **pages;
	u64 off = iocb->ki_pos;
	int num_pages;
	ssize_t ret;
	size_t len = iov_iter_count(to);

	dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");

	if (!len)
		return 0;
	/*
	 * flush any page cache pages in this range.  this
	 * will make concurrent normal and sync io slow,
	 * but it will at least behave sensibly when they are
	 * in sequence.
	 */
	ret = filemap_write_and_wait_range(inode->i_mapping, off,
					   off + len);
	if (ret < 0)
		return ret;

	if (unlikely(to->type & ITER_PIPE)) {
		size_t page_off;
		ret = iov_iter_get_pages_alloc(to, &pages, len,
					       &page_off);
		if (ret <= 0)
			return -ENOMEM;
		num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);

		ret = striped_read(inode, off, ret, pages, num_pages,
				   page_off, checkeof);
		if (ret > 0) {
			iov_iter_advance(to, ret);
			off += ret;
		} else {
			iov_iter_advance(to, 0);
		}
		ceph_put_page_vector(pages, num_pages, false);
	} else {
		num_pages = calc_pages_for(off, len);
		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
		if (IS_ERR(pages))
			return PTR_ERR(pages);

		ret = striped_read(inode, off, len, pages, num_pages,
				   (off & ~PAGE_MASK), checkeof);
		if (ret > 0) {
			int l, k = 0;
			size_t left = ret;

			while (left) {
				size_t page_off = off & ~PAGE_MASK;
				size_t copy = min_t(size_t, left,
						    PAGE_SIZE - page_off);
				l = copy_page_to_iter(pages[k++], page_off,
						      copy, to);
				off += l;
				left -= l;
				if (l < copy)
					break;
			}
		}
		ceph_release_page_vector(pages, num_pages);
	}

	if (off > iocb->ki_pos) {
		ret = off - iocb->ki_pos;
		iocb->ki_pos = off;
	}

	dout("sync_read result %zd\n", ret);
	return ret;
}

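/*
 * State for an O_DIRECT AIO that may be split across several OSD
 * requests; ceph_aio_complete() runs when pending_reqs reaches zero.
 */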
struct ceph_aio_request {
	struct kiocb *iocb;
	size_t total_len;
	bool write;
	bool should_dirty;
	int error;
	struct list_head osd_reqs;
	unsigned num_reqs;
	atomic_t pending_reqs;
	struct timespec64 mtime;
	struct ceph_cap_flush *prealloc_cf;
};

struct ceph_aio_work {
	struct work_struct work;
	struct ceph_osd_request *req;
};

static void ceph_aio_retry_work(struct work_struct *work);

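/*
 * Called as each OSD request for an AIO finishes.  The last completion
 * updates i_size and dirties Fw caps for writes, drops the cap
 * references taken in ceph_direct_read_write(), and completes the iocb.
 */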
static void ceph_aio_complete(struct inode *inode,
			      struct ceph_aio_request *aio_req)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int ret;

	if (!atomic_dec_and_test(&aio_req->pending_reqs))
		return;

	ret = aio_req->error;
	if (!ret)
		ret = aio_req->total_len;

	dout("ceph_aio_complete %p rc %d\n", inode, ret);

	if (ret >= 0 && aio_req->write) {
		int dirty;

		loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
		if (endoff > i_size_read(inode)) {
			if (ceph_inode_set_size(inode, endoff))
				ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
		}

		spin_lock(&ci->i_ceph_lock);
		ci->i_inline_version = CEPH_INLINE_NONE;
		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
					       &aio_req->prealloc_cf);
		spin_unlock(&ci->i_ceph_lock);
		if (dirty)
			__mark_inode_dirty(inode, dirty);

	}

	ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
						CEPH_CAP_FILE_RD));

	aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);

	ceph_free_cap_flush(aio_req->prealloc_cf);
	kfree(aio_req);
}

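/*
 * Completion callback for a single OSD request belonging to an AIO.
 * A write that raced with a snapshot (-EOLDSNAPC) is requeued via the
 * writeback workqueue; a short read has its tail zeroed.  The pinned
 * pages are released and any error is recorded in the parent request.
 */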
static void ceph_aio_complete_req(struct ceph_osd_request *req)
{
	int rc = req->r_result;
	struct inode *inode = req->r_inode;
	struct ceph_aio_request *aio_req = req->r_priv;
	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);

	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
	BUG_ON(!osd_data->num_bvecs);

	dout("ceph_aio_complete_req %p rc %d bytes %u\n",
	     inode, rc, osd_data->bvec_pos.iter.bi_size);

	if (rc == -EOLDSNAPC) {
		struct ceph_aio_work *aio_work;
		BUG_ON(!aio_req->write);

		aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
		if (aio_work) {
			INIT_WORK(&aio_work->work, ceph_aio_retry_work);
			aio_work->req = req;
			queue_work(ceph_inode_to_client(inode)->wb_wq,
				   &aio_work->work);
			return;
		}
		rc = -ENOMEM;
	} else if (!aio_req->write) {
		if (rc == -ENOENT)
			rc = 0;
		if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) {
			struct iov_iter i;
			int zlen = osd_data->bvec_pos.iter.bi_size - rc;

			/*
			 * If read is satisfied by single OSD request,
			 * it can pass EOF. Otherwise read is within
			 * i_size.
			 */
			if (aio_req->num_reqs == 1) {
				loff_t i_size = i_size_read(inode);
				loff_t endoff = aio_req->iocb->ki_pos + rc;
				if (endoff < i_size)
					zlen = min_t(size_t, zlen,
						     i_size - endoff);
				aio_req->total_len = rc + zlen;
			}

			iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs,
				      osd_data->num_bvecs,
				      osd_data->bvec_pos.iter.bi_size);
			iov_iter_advance(&i, rc);
			iov_iter_zero(zlen, &i);
		}
	}

	put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
		  aio_req->should_dirty);
	ceph_osdc_put_request(req);

	if (rc < 0)
		cmpxchg(&aio_req->error, 0, rc);

	ceph_aio_complete(inode, aio_req);
	return;
}

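/*
 * Workqueue handler for the -EOLDSNAPC case above: clone the failed
 * write request, attach the latest snap context, and resubmit it.
 */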
static void ceph_aio_retry_work(struct work_struct *work)
{
	struct ceph_aio_work *aio_work =
		container_of(work, struct ceph_aio_work, work);
	struct ceph_osd_request *orig_req = aio_work->req;
	struct ceph_aio_request *aio_req = orig_req->r_priv;
	struct inode *inode = orig_req->r_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc;
	struct ceph_osd_request *req;
	int ret;

	spin_lock(&ci->i_ceph_lock);
	if (__ceph_have_pending_cap_snap(ci)) {
		struct ceph_cap_snap *capsnap =
			list_last_entry(&ci->i_cap_snaps,
					struct ceph_cap_snap,
					ci_item);
		snapc = ceph_get_snap_context(capsnap->context);
	} else {
		BUG_ON(!ci->i_head_snapc);
		snapc = ceph_get_snap_context(ci->i_head_snapc);
	}
	spin_unlock(&ci->i_ceph_lock);

	req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
				      false, GFP_NOFS);
	if (!req) {
		ret = -ENOMEM;
		req = orig_req;
		goto out;
	}

	req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
	ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
	ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);

	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
	if (ret) {
		ceph_osdc_put_request(req);
		req = orig_req;
		goto out;
	}

	req->r_ops[0] = orig_req->r_ops[0];

	req->r_mtime = aio_req->mtime;
	req->r_data_offset = req->r_ops[0].extent.offset;

	ceph_osdc_put_request(orig_req);

	req->r_callback = ceph_aio_complete_req;
	req->r_inode = inode;
	req->r_priv = aio_req;

	ret = ceph_osdc_start_request(req->r_osdc, req, false);
out:
	if (ret < 0) {
		req->r_result = ret;
		ceph_aio_complete_req(req);
	}

	ceph_put_snap_context(snapc);
	kfree(aio_work);
}

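/*
 * O_DIRECT read/write: pin the user pages and issue one OSD request
 * per object extent.  Runs synchronously unless the iocb is async and
 * the I/O qualifies for AIO (see the i_size check below).
 */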
static ssize_t
ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
		       struct ceph_snap_context *snapc,
		       struct ceph_cap_flush **pcf)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_vino vino;
	struct ceph_osd_request *req;
	struct bio_vec *bvecs;
	struct ceph_aio_request *aio_req = NULL;
	int num_pages = 0;
	int flags;
	int ret;
	struct timespec64 mtime = current_time(inode);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos;
	bool write = iov_iter_rw(iter) == WRITE;
	bool should_dirty = !write && iter_is_iovec(iter);

	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
		return -EROFS;

	dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
	     (write ? "write" : "read"), file, pos, (unsigned)count,
	     snapc, snapc->seq);

	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
	if (ret < 0)
		return ret;

	if (write) {
		int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
					pos >> PAGE_SHIFT,
					(pos + count) >> PAGE_SHIFT);
		if (ret2 < 0)
			dout("invalidate_inode_pages2_range returned %d\n", ret2);

		flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
	} else {
		flags = CEPH_OSD_FLAG_READ;
	}

	while (iov_iter_count(iter) > 0) {
		u64 size = iov_iter_count(iter);
		ssize_t len;

		if (write)
			size = min_t(u64, size, fsc->mount_options->wsize);
		else
			size = min_t(u64, size, fsc->mount_options->rsize);

		vino = ceph_vino(inode);
		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
					    vino, pos, &size, 0,
					    1,
					    write ? CEPH_OSD_OP_WRITE :
						    CEPH_OSD_OP_READ,
					    flags, snapc,
					    ci->i_truncate_seq,
					    ci->i_truncate_size,
					    false);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			break;
		}

		len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages);
		if (len < 0) {
			ceph_osdc_put_request(req);
			ret = len;
			break;
		}
		if (len != size)
			osd_req_op_extent_update(req, 0, len);

		/*
		 * To simplify error handling, allow AIO when the IO is
		 * within i_size or can be satisfied by a single OSD request.
		 */
		if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
		    (len == count || pos + count <= i_size_read(inode))) {
			aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
			if (aio_req) {
				aio_req->iocb = iocb;
				aio_req->write = write;
				aio_req->should_dirty = should_dirty;
				INIT_LIST_HEAD(&aio_req->osd_reqs);
				if (write) {
					aio_req->mtime = mtime;
					swap(aio_req->prealloc_cf, *pcf);
				}
			}
			/* ignore error */
		}

		if (write) {
			/*
			 * throw out any page cache pages in this range. this
			 * may block.
			 */
			truncate_inode_pages_range(inode->i_mapping, pos,
						   (pos+len) | (PAGE_SIZE - 1));

			req->r_mtime = mtime;
		}

		osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);

		if (aio_req) {
			aio_req->total_len += len;
			aio_req->num_reqs++;
			atomic_inc(&aio_req->pending_reqs);

			req->r_callback = ceph_aio_complete_req;
			req->r_inode = inode;
			req->r_priv = aio_req;
			list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);

			pos += len;
			continue;
		}

		ret = ceph_osdc_start_request(req->r_osdc, req, false);
		if (!ret)
			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);

		size = i_size_read(inode);
		if (!write) {
			if (ret == -ENOENT)
				ret = 0;
			if (ret >= 0 && ret < len && pos + ret < size) {
				struct iov_iter i;
				int zlen = min_t(size_t, len - ret,
						 size - pos - ret);

				iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages,
					      len);
				iov_iter_advance(&i, ret);
				iov_iter_zero(zlen, &i);
				ret += zlen;
			}
			if (ret >= 0)
				len = ret;
		}

		put_bvecs(bvecs, num_pages, should_dirty);
		ceph_osdc_put_request(req);
		if (ret < 0)
			break;

		pos += len;
		if (!write && pos >= size)
			break;

		if (write && pos > size) {
			if (ceph_inode_set_size(inode, pos))
				ceph_check_caps(ceph_inode(inode),
						CHECK_CAPS_AUTHONLY,
						NULL);
		}
	}

	if (aio_req) {
		LIST_HEAD(osd_reqs);

		if (aio_req->num_reqs == 0) {
			kfree(aio_req);
			return ret;
		}

		ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
					      CEPH_CAP_FILE_RD);

		list_splice(&aio_req->osd_reqs, &osd_reqs);
		while (!list_empty(&osd_reqs)) {
			req = list_first_entry(&osd_reqs,
					       struct ceph_osd_request,
					       r_unsafe_item);
			list_del_init(&req->r_unsafe_item);
			if (ret >= 0)
				ret = ceph_osdc_start_request(req->r_osdc,
							      req, false);
			if (ret < 0) {
				req->r_result = ret;
				ceph_aio_complete_req(req);
			}
		}
		return -EIOCBQUEUED;
	}

	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
		ret = pos - iocb->ki_pos;
		iocb->ki_pos = pos;
	}
	return ret;
}

/*
 * Synchronous write, straight from __user pointer or user pages.
 *
 * If write spans object boundary, just do multiple writes.  (For a
 * correct atomic write, we should e.g. take write locks on all
 * objects, rollback on failure, etc.)
 */
static ssize_t
ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
		struct ceph_snap_context *snapc)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_vino vino;
	struct ceph_osd_request *req;
	struct page **pages;
	u64 len;
	int num_pages;
	int written = 0;
	int flags;
	int ret;
	bool check_caps = false;
	struct timespec64 mtime = current_time(inode);
	size_t count = iov_iter_count(from);

	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
		return -EROFS;

	dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
	     file, pos, (unsigned)count, snapc, snapc->seq);

	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
	if (ret < 0)
		return ret;

	ret = invalidate_inode_pages2_range(inode->i_mapping,
					    pos >> PAGE_SHIFT,
					    (pos + count) >> PAGE_SHIFT);
	if (ret < 0)
		dout("invalidate_inode_pages2_range returned %d\n", ret);

	flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;

	while ((len = iov_iter_count(from)) > 0) {
		size_t left;
		int n;

		vino = ceph_vino(inode);
		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
					    vino, pos, &len, 0, 1,
					    CEPH_OSD_OP_WRITE, flags, snapc,
					    ci->i_truncate_seq,
					    ci->i_truncate_size,
					    false);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			break;
		}

		/*
		 * write from beginning of first page,
		 * regardless of io alignment
		 */
		num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;

		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
		if (IS_ERR(pages)) {
			ret = PTR_ERR(pages);
			goto out;
		}

		left = len;
		for (n = 0; n < num_pages; n++) {
			size_t plen = min_t(size_t, left, PAGE_SIZE);
			ret = copy_page_from_iter(pages[n], 0, plen, from);
			if (ret != plen) {
				ret = -EFAULT;
				break;
			}
			left -= ret;
		}

		if (ret < 0) {
			ceph_release_page_vector(pages, num_pages);
			goto out;
		}

		req->r_inode = inode;

		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
						 false, true);

		req->r_mtime = mtime;
		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
		if (!ret)
			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);

out:
		ceph_osdc_put_request(req);
		if (ret != 0) {
			ceph_set_error_write(ci);
			break;
		}

		ceph_clear_error_write(ci);
		pos += len;
		written += len;
		if (pos > i_size_read(inode)) {
			check_caps = ceph_inode_set_size(inode, pos);
			if (check_caps)
				ceph_check_caps(ceph_inode(inode),
						CHECK_CAPS_AUTHONLY,
						NULL);
		}

	}

	if (ret != -EOLDSNAPC && written > 0) {
		ret = written;
		iocb->ki_pos = pos;
	}
	return ret;
}

/*
 * Wrap generic_file_aio_read with checks for cap bits on the inode.
 * Atomically grab references, so that those bits are not released
 * back to the MDS mid-read.
 *
 * Hmm, the sync read case isn't actually async... should it be?
 */
static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *filp = iocb->ki_filp;
	struct ceph_file_info *fi = filp->private_data;
	size_t len = iov_iter_count(to);
	struct inode *inode = file_inode(filp);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct page *pinned_page = NULL;
	ssize_t ret;
	int want, got = 0;
	int retry_op = 0, read = 0;

again:
	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
	     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);

	if (fi->fmode & CEPH_FILE_MODE_LAZY)
		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
	else
		want = CEPH_CAP_FILE_CACHE;
	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
	if (ret < 0)
		return ret;

	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
	    (iocb->ki_flags & IOCB_DIRECT) ||
	    (fi->flags & CEPH_F_SYNC)) {

		dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
		     ceph_cap_string(got));

		if (ci->i_inline_version == CEPH_INLINE_NONE) {
			if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
				ret = ceph_direct_read_write(iocb, to,
							     NULL, NULL);
				if (ret >= 0 && ret < len)
					retry_op = CHECK_EOF;
			} else {
				ret = ceph_sync_read(iocb, to, &retry_op);
			}
		} else {
			retry_op = READ_INLINE;
		}
	} else {
		CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
		dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
		     ceph_cap_string(got));
		ceph_add_rw_context(fi, &rw_ctx);
		ret = generic_file_read_iter(iocb, to);
		ceph_del_rw_context(fi, &rw_ctx);
	}
	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
	if (pinned_page) {
		put_page(pinned_page);
		pinned_page = NULL;
	}
	ceph_put_cap_refs(ci, got);
	if (retry_op > HAVE_RETRIED && ret >= 0) {
		int statret;
		struct page *page = NULL;
		loff_t i_size;
		if (retry_op == READ_INLINE) {
			page = __page_cache_alloc(GFP_KERNEL);
			if (!page)
				return -ENOMEM;
		}

		statret = __ceph_do_getattr(inode, page,
					    CEPH_STAT_CAP_INLINE_DATA, !!page);
		if (statret < 0) {
			if (page)
				__free_page(page);
			if (statret == -ENODATA) {
				BUG_ON(retry_op != READ_INLINE);
				goto again;
			}
			return statret;
		}

		i_size = i_size_read(inode);
		if (retry_op == READ_INLINE) {
			BUG_ON(ret > 0 || read > 0);
			if (iocb->ki_pos < i_size &&
			    iocb->ki_pos < PAGE_SIZE) {
				loff_t end = min_t(loff_t, i_size,
						   iocb->ki_pos + len);
				end = min_t(loff_t, end, PAGE_SIZE);
				if (statret < end)
					zero_user_segment(page, statret, end);
				ret = copy_page_to_iter(page,
						iocb->ki_pos & ~PAGE_MASK,
						end - iocb->ki_pos, to);
				iocb->ki_pos += ret;
				read += ret;
			}
			if (iocb->ki_pos < i_size && read < len) {
				size_t zlen = min_t(size_t, len - read,
						    i_size - iocb->ki_pos);
				ret = iov_iter_zero(zlen, to);
				iocb->ki_pos += ret;
				read += ret;
			}
			__free_pages(page, 0);
			return read;
		}

		/* hit EOF or hole? */
		if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
		    ret < len) {
			dout("sync_read hit hole, ppos %lld < size %lld"
			     ", reading more\n", iocb->ki_pos, i_size);

			read += ret;
			len -= ret;
			retry_op = HAVE_RETRIED;
			goto again;
		}
	}

	if (ret >= 0)
		ret += read;

	return ret;
}

|  | 1370 | /* | 
|  | 1371 | * Take cap references to avoid releasing caps to MDS mid-write. | 
|  | 1372 | * | 
|  | 1373 | * If we are synchronous, and write with an old snap context, the OSD | 
|  | 1374 | * may return EOLDSNAPC.  In that case, retry the write.. _after_ | 
|  | 1375 | * dropping our cap refs and allowing the pending snap to logically | 
|  | 1376 | * complete _before_ this write occurs. | 
|  | 1377 | * | 
|  | 1378 | * If we are near ENOSPC, write synchronously. | 
|  | 1379 | */ | 
|  | 1380 | static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) | 
|  | 1381 | { | 
|  | 1382 | struct file *file = iocb->ki_filp; | 
|  | 1383 | struct ceph_file_info *fi = file->private_data; | 
|  | 1384 | struct inode *inode = file_inode(file); | 
|  | 1385 | struct ceph_inode_info *ci = ceph_inode(inode); | 
|  | 1386 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 
|  | 1387 | struct ceph_cap_flush *prealloc_cf; | 
|  | 1388 | ssize_t count, written = 0; | 
|  | 1389 | int err, want, got; | 
|  | 1390 | loff_t pos; | 
|  | 1391 | loff_t limit = max(i_size_read(inode), fsc->max_file_size); | 
|  | 1392 |  | 
|  | 1393 | if (ceph_snap(inode) != CEPH_NOSNAP) | 
|  | 1394 | return -EROFS; | 
|  | 1395 |  | 
|  | 1396 | prealloc_cf = ceph_alloc_cap_flush(); | 
|  | 1397 | if (!prealloc_cf) | 
|  | 1398 | return -ENOMEM; | 
|  | 1399 |  | 
|  | 1400 | retry_snap: | 
|  | 1401 | inode_lock(inode); | 
|  | 1402 |  | 
|  | 1403 | /* We can write back this queue in page reclaim */ | 
|  | 1404 | current->backing_dev_info = inode_to_bdi(inode); | 
|  | 1405 |  | 
|  | 1406 | if (iocb->ki_flags & IOCB_APPEND) { | 
|  | 1407 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); | 
|  | 1408 | if (err < 0) | 
|  | 1409 | goto out; | 
|  | 1410 | } | 
|  | 1411 |  | 
|  | 1412 | err = generic_write_checks(iocb, from); | 
|  | 1413 | if (err <= 0) | 
|  | 1414 | goto out; | 
|  | 1415 |  | 
|  | 1416 | pos = iocb->ki_pos; | 
|  | 1417 | if (unlikely(pos >= limit)) { | 
|  | 1418 | err = -EFBIG; | 
|  | 1419 | goto out; | 
|  | 1420 | } else { | 
|  | 1421 | iov_iter_truncate(from, limit - pos); | 
|  | 1422 | } | 
|  | 1423 |  | 
|  | 1424 | count = iov_iter_count(from); | 
|  | 1425 | if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) { | 
|  | 1426 | err = -EDQUOT; | 
|  | 1427 | goto out; | 
|  | 1428 | } | 
|  | 1429 |  | 
|  | 1430 | err = file_remove_privs(file); | 
|  | 1431 | if (err) | 
|  | 1432 | goto out; | 
|  | 1433 |  | 
|  | 1434 | err = file_update_time(file); | 
|  | 1435 | if (err) | 
|  | 1436 | goto out; | 
|  | 1437 |  | 
|  | 1438 | if (ci->i_inline_version != CEPH_INLINE_NONE) { | 
|  | 1439 | err = ceph_uninline_data(file, NULL); | 
|  | 1440 | if (err < 0) | 
|  | 1441 | goto out; | 
|  | 1442 | } | 
|  | 1443 |  | 
|  | 1444 | /* FIXME: not complete since it doesn't account for being at quota */ | 
|  | 1445 | if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL)) { | 
|  | 1446 | err = -ENOSPC; | 
|  | 1447 | goto out; | 
|  | 1448 | } | 
|  | 1449 |  | 
|  | 1450 | dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", | 
|  | 1451 | inode, ceph_vinop(inode), pos, count, i_size_read(inode)); | 
|  | 1452 | if (fi->fmode & CEPH_FILE_MODE_LAZY) | 
|  | 1453 | want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; | 
|  | 1454 | else | 
|  | 1455 | want = CEPH_CAP_FILE_BUFFER; | 
|  | 1456 | got = 0; | 
|  | 1457 | err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, | 
|  | 1458 | &got, NULL); | 
|  | 1459 | if (err < 0) | 
|  | 1460 | goto out; | 
|  | 1461 |  | 
|  | 1462 | dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", | 
|  | 1463 | inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); | 
|  | 1464 |  | 
|  | 1465 | if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || | 
|  | 1466 | (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) || | 
|  | 1467 | (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { | 
|  | 1468 | struct ceph_snap_context *snapc; | 
|  | 1469 | struct iov_iter data; | 
|  | 1470 | inode_unlock(inode); | 
|  | 1471 |  | 
|  | 1472 | spin_lock(&ci->i_ceph_lock); | 
|  | 1473 | if (__ceph_have_pending_cap_snap(ci)) { | 
|  | 1474 | struct ceph_cap_snap *capsnap = | 
|  | 1475 | list_last_entry(&ci->i_cap_snaps, | 
|  | 1476 | struct ceph_cap_snap, | 
|  | 1477 | ci_item); | 
|  | 1478 | snapc = ceph_get_snap_context(capsnap->context); | 
|  | 1479 | } else { | 
|  | 1480 | BUG_ON(!ci->i_head_snapc); | 
|  | 1481 | snapc = ceph_get_snap_context(ci->i_head_snapc); | 
|  | 1482 | } | 
|  | 1483 | spin_unlock(&ci->i_ceph_lock); | 

                /* the write may have to be restarted, so work on a copy */
                data = *from;
                if (iocb->ki_flags & IOCB_DIRECT)
                        written = ceph_direct_read_write(iocb, &data, snapc,
                                                         &prealloc_cf);
                else
                        written = ceph_sync_write(iocb, &data, pos, snapc);
                if (written > 0)
                        iov_iter_advance(from, written);
                ceph_put_snap_context(snapc);
        } else {
                /*
                 * No need to acquire the i_truncate_mutex: the MDS
                 * revokes Fwb caps before sending a truncate message
                 * to us, and we cannot have been granted Fwb while a
                 * vmtruncate is pending, so a write and a vmtruncate
                 * cannot run at the same time.
                 */
                written = generic_perform_write(file, from, pos);
                if (likely(written >= 0))
                        iocb->ki_pos = pos + written;
                inode_unlock(inode);
        }

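        /*
         * Mark Fw dirty so that the updated size/mtime eventually
         * get flushed back to the MDS in a cap message.
         */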
        if (written >= 0) {
                int dirty;

                spin_lock(&ci->i_ceph_lock);
                ci->i_inline_version = CEPH_INLINE_NONE;
                dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
                                               &prealloc_cf);
                spin_unlock(&ci->i_ceph_lock);
                if (dirty)
                        __mark_inode_dirty(inode, dirty);
                if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
                        ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
        }

        dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
             inode, ceph_vinop(inode), pos, (unsigned)count,
             ceph_cap_string(got));
        ceph_put_cap_refs(ci, got);

        if (written == -EOLDSNAPC) {
                dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
                     inode, ceph_vinop(inode), pos, (unsigned)count);
                goto retry_snap;
        }

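        /*
         * If the OSDs are getting close to full, force O_DSYNC
         * behaviour so the data is flushed out promptly, presumably
         * so that any resulting -ENOSPC surfaces to the caller right
         * away rather than at some later writeback.
         */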
        if (written >= 0) {
                if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_NEARFULL))
                        iocb->ki_flags |= IOCB_DSYNC;
                written = generic_write_sync(iocb, written);
        }

        goto out_unlocked;

out:
        inode_unlock(inode);
out_unlocked:
        ceph_free_cap_flush(prealloc_cf);
        current->backing_dev_info = NULL;
        return written ? written : err;
}

/*
 * llseek: be sure to verify the file size on SEEK_END, and fetch an
 * up-to-date size from the MDS for SEEK_DATA and SEEK_HOLE as well.
 */
static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file->f_mapping->host;
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        loff_t i_size;
        loff_t ret;

        inode_lock(inode);

        if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
                ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
                if (ret < 0)
                        goto out;
        }

        i_size = i_size_read(inode);
        switch (whence) {
        case SEEK_END:
                offset += i_size;
                break;
        case SEEK_CUR:
                /*
                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
                 * position-querying operation.  Avoid rewriting the
                 * "same" f_pos value back to the file because a
                 * concurrent read(), write() or lseek() might have
                 * altered it.
                 */
                if (offset == 0) {
                        ret = file->f_pos;
                        goto out;
                }
                offset += file->f_pos;
                break;
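        /*
         * The file is treated as all data with one implicit hole at
         * EOF: SEEK_DATA succeeds at any offset before i_size, and
         * SEEK_HOLE always lands on i_size.
         */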
        case SEEK_DATA:
                if (offset < 0 || offset >= i_size) {
                        ret = -ENXIO;
                        goto out;
                }
                break;
        case SEEK_HOLE:
                if (offset < 0 || offset >= i_size) {
                        ret = -ENXIO;
                        goto out;
                }
                offset = i_size;
                break;
        }

        ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size));

out:
        inode_unlock(inode);
        return ret;
}

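/*
 * Zero a sub-page range in the page cache.  Only a page that is
 * already resident is touched; on-disk zeroing of anything not
 * cached is left to the object-level operations below.
 */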
static inline void ceph_zero_partial_page(struct inode *inode,
                                          loff_t offset, unsigned size)
{
        struct page *page;
        pgoff_t index = offset >> PAGE_SHIFT;

        page = find_lock_page(inode->i_mapping, index);
        if (page) {
                wait_on_page_writeback(page);
                zero_user(page, offset & (PAGE_SIZE - 1), size);
                unlock_page(page);
                put_page(page);
        }
}

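/*
 * Zero the page-cache view of [offset, offset+length): a partial head
 * page, then any fully covered pages (simply truncated away), then a
 * partial tail page.  For example, with 4K pages, offset 1000 and
 * length 10000 zero bytes 1000-4095 of page 0, drop page 1 entirely,
 * and zero bytes 0-2807 of page 2 (3096 + 4096 + 2808 = 10000).
 */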
static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
                                      loff_t length)
{
        loff_t nearly = round_up(offset, PAGE_SIZE);

        if (offset < nearly) {
                loff_t size = nearly - offset;

                if (length < size)
                        size = length;
                ceph_zero_partial_page(inode, offset, size);
                offset += size;
                length -= size;
        }
        if (length >= PAGE_SIZE) {
                loff_t size = round_down(length, PAGE_SIZE);

                truncate_pagecache_range(inode, offset, offset + size - 1);
                offset += size;
                length -= size;
        }
        if (length)
                ceph_zero_partial_page(inode, offset, length);
}

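/*
 * Zero part of a single RADOS object via an OSD request.  With a NULL
 * length the whole object is affected: it is deleted, or truncated to
 * zero size if it is the first object of the file.  Note that
 * ceph_osdc_new_request() trims *length to the object boundary, so
 * the caller learns how much of the range was actually covered.
 */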
static int ceph_zero_partial_object(struct inode *inode,
                                    loff_t offset, loff_t *length)
{
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_osd_request *req;
        int ret = 0;
        loff_t zero = 0;
        int op;

        if (!length) {
                op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
                length = &zero;
        } else {
                op = CEPH_OSD_OP_ZERO;
        }

        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode),
                                    offset, length,
                                    0, 1, op,
                                    CEPH_OSD_FLAG_WRITE,
                                    NULL, 0, 0, false);
        if (IS_ERR(req)) {
                ret = PTR_ERR(req);
                goto out;
        }

        req->r_mtime = inode->i_mtime;
        ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
        if (!ret) {
                ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
                if (ret == -ENOENT)
                        ret = 0;
        }
        ceph_osdc_put_request(req);

out:
        return ret;
}

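/*
 * Zero a file range object by object.  One "period" (object set) is
 * object_size * stripe_count bytes, after which striping wraps around
 * to the first object.  E.g. with 4M objects and a stripe count of 2,
 * the object set size is 8M: zeroing from offset 5M first handles the
 * partial span up to the 8M boundary, then whole 8M sets, then the
 * tail.
 */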
static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
{
        int ret = 0;
        struct ceph_inode_info *ci = ceph_inode(inode);
        s32 stripe_unit = ci->i_layout.stripe_unit;
        s32 stripe_count = ci->i_layout.stripe_count;
        s32 object_size = ci->i_layout.object_size;
        u64 object_set_size = (u64)object_size * stripe_count;
        u64 nearly, t;

        /* round offset up to the next period boundary */
        nearly = offset + object_set_size - 1;
        t = nearly;
        nearly -= do_div(t, object_set_size);

        while (length && offset < nearly) {
                loff_t size = length;

                ret = ceph_zero_partial_object(inode, offset, &size);
                if (ret < 0)
                        return ret;
                offset += size;
                length -= size;
        }
        while (length >= object_set_size) {
                int i;
                loff_t pos = offset;

                for (i = 0; i < stripe_count; ++i) {
                        ret = ceph_zero_partial_object(inode, pos, NULL);
                        if (ret < 0)
                                return ret;
                        pos += stripe_unit;
                }
                offset += object_set_size;
                length -= object_set_size;
        }
        while (length) {
                loff_t size = length;

                ret = ceph_zero_partial_object(inode, offset, &size);
                if (ret < 0)
                        return ret;
                offset += size;
                length -= size;
        }
        return ret;
}

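/*
 * fallocate: only hole punching is supported here, and only together
 * with FALLOC_FL_KEEP_SIZE (which the VFS requires for punch hole
 * anyway); any other mode gets -EOPNOTSUPP.
 */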
static long ceph_fallocate(struct file *file, int mode,
                           loff_t offset, loff_t length)
{
        struct ceph_file_info *fi = file->private_data;
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_cap_flush *prealloc_cf;
        int want, got = 0;
        int dirty;
        int ret = 0;
        loff_t endoff = 0;
        loff_t size;

        if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;

        if (!S_ISREG(inode->i_mode))
                return -EOPNOTSUPP;

        prealloc_cf = ceph_alloc_cap_flush();
        if (!prealloc_cf)
                return -ENOMEM;

        inode_lock(inode);

        if (ceph_snap(inode) != CEPH_NOSNAP) {
                ret = -EROFS;
                goto unlock;
        }

        if (ci->i_inline_version != CEPH_INLINE_NONE) {
                ret = ceph_uninline_data(file, NULL);
                if (ret < 0)
                        goto unlock;
        }

        size = i_size_read(inode);

        /* Are we punching a hole beyond EOF? */
        if (offset >= size)
                goto unlock;
        if ((offset + length) > size)
                length = size - offset;

        if (fi->fmode & CEPH_FILE_MODE_LAZY)
                want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
        else
                want = CEPH_CAP_FILE_BUFFER;

        ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
        if (ret < 0)
                goto unlock;

        ceph_zero_pagecache_range(inode, offset, length);
        ret = ceph_zero_objects(inode, offset, length);

        if (!ret) {
                spin_lock(&ci->i_ceph_lock);
                ci->i_inline_version = CEPH_INLINE_NONE;
                dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
                                               &prealloc_cf);
                spin_unlock(&ci->i_ceph_lock);
                if (dirty)
                        __mark_inode_dirty(inode, dirty);
        }

        ceph_put_cap_refs(ci, got);
unlock:
        inode_unlock(inode);
        ceph_free_cap_flush(prealloc_cf);
        return ret;
}

const struct file_operations ceph_file_fops = {
        .open = ceph_open,
        .release = ceph_release,
        .llseek = ceph_llseek,
        .read_iter = ceph_read_iter,
        .write_iter = ceph_write_iter,
        .mmap = ceph_mmap,
        .fsync = ceph_fsync,
        .lock = ceph_lock,
        .flock = ceph_flock,
        .splice_read = generic_file_splice_read,
        .splice_write = iter_file_splice_write,
        .unlocked_ioctl = ceph_ioctl,
        .compat_ioctl = ceph_ioctl,
        .fallocate = ceph_fallocate,
};