Blame - src/kernel/linux/v4.14/fs/ceph/file.c - T103

blob: 1f873034f46913c2e33f1693972eb16d65e25bee [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	#include <linux/ceph/ceph_debug.h>
				3
				4	#include <linux/module.h>
				5	#include <linux/sched.h>
				6	#include <linux/slab.h>
				7	#include <linux/file.h>
				8	#include <linux/mount.h>
				9	#include <linux/namei.h>
				10	#include <linux/writeback.h>
				11	#include <linux/falloc.h>
				12
				13	#include "super.h"
				14	#include "mds_client.h"
				15	#include "cache.h"
				16
				17	static __le32 ceph_flags_sys2wire(u32 flags)
				18	{
				19	u32 wire_flags = 0;
				20
				21	switch (flags & O_ACCMODE) {
				22	case O_RDONLY:
				23	wire_flags \|= CEPH_O_RDONLY;
				24	break;
				25	case O_WRONLY:
				26	wire_flags \|= CEPH_O_WRONLY;
				27	break;
				28	case O_RDWR:
				29	wire_flags \|= CEPH_O_RDWR;
				30	break;
				31	}
				32
				33	#define ceph_sys2wire(a) if (flags & a) { wire_flags \|= CEPH_##a; flags &= ~a; }
				34
				35	ceph_sys2wire(O_CREAT);
				36	ceph_sys2wire(O_EXCL);
				37	ceph_sys2wire(O_TRUNC);
				38	ceph_sys2wire(O_DIRECTORY);
				39	ceph_sys2wire(O_NOFOLLOW);
				40
				41	#undef ceph_sys2wire
				42
				43	if (flags)
				44	dout("unused open flags: %x", flags);
				45
				46	return cpu_to_le32(wire_flags);
				47	}
				48
				49	/*
				50	* Ceph file operations
				51	*
				52	* Implement basic open/close functionality, and implement
				53	* read/write.
				54	*
				55	* We implement three modes of file I/O:
				56	* - buffered uses the generic_file_aio_{read,write} helpers
				57	*
				58	* - synchronous is used when there is multi-client read/write
				59	* sharing, avoids the page cache, and synchronously waits for an
				60	* ack from the OSD.
				61	*
				62	* - direct io takes the variant of the sync path that references
				63	* user pages directly.
				64	*
				65	* fsync() flushes and waits on dirty pages, but just queues metadata
				66	* for writeback: since the MDS can recover size and mtime there is no
				67	* need to wait for MDS acknowledgement.
				68	*/
				69
				70	/*
				71	* Calculate the length sum of direct io vectors that can
				72	* be combined into one page vector.
				73	*/
				74	static size_t dio_get_pagev_size(const struct iov_iter *it)
				75	{
				76	const struct iovec *iov = it->iov;
				77	const struct iovec *iovend = iov + it->nr_segs;
				78	size_t size;
				79
				80	size = iov->iov_len - it->iov_offset;
				81	/*
				82	* An iov can be page vectored when both the current tail
				83	* and the next base are page aligned.
				84	*/
				85	while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
				86	(++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
				87	size += iov->iov_len;
				88	}
				89	dout("dio_get_pagevlen len = %zu\n", size);
				90	return size;
				91	}
				92
				93	/*
				94	* Allocate a page vector based on (@it, @nbytes).
				95	* The return value is the tuple describing a page vector,
				96	* that is (@pages, @page_align, @num_pages).
				97	*/
				98	static struct page **
				99	dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
				100	size_t page_align, int num_pages)
				101	{
				102	struct iov_iter tmp_it = *it;
				103	size_t align;
				104	struct page **pages;
				105	int ret = 0, idx, npages;
				106
				107	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
				108	(PAGE_SIZE - 1);
				109	npages = calc_pages_for(align, nbytes);
				110	pages = kvmalloc(sizeof(pages) npages, GFP_KERNEL);
				111	if (!pages)
				112	return ERR_PTR(-ENOMEM);
				113
				114	for (idx = 0; idx < npages; ) {
				115	size_t start;
				116	ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
				117	npages - idx, &start);
				118	if (ret < 0)
				119	goto fail;
				120
				121	iov_iter_advance(&tmp_it, ret);
				122	nbytes -= ret;
				123	idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
				124	}
				125
				126	BUG_ON(nbytes != 0);
				127	*num_pages = npages;
				128	*page_align = align;
				129	dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
				130	return pages;
				131	fail:
				132	ceph_put_page_vector(pages, idx, false);
				133	return ERR_PTR(ret);
				134	}
				135
				136	/*
				137	* Prepare an open request. Preallocate ceph_cap to avoid an
				138	* inopportune ENOMEM later.
				139	*/
				140	static struct ceph_mds_request *
				141	prepare_open_request(struct super_block *sb, int flags, int create_mode)
				142	{
				143	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
				144	struct ceph_mds_client *mdsc = fsc->mdsc;
				145	struct ceph_mds_request *req;
				146	int want_auth = USE_ANY_MDS;
				147	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
				148
				149	if (flags & (O_WRONLY\|O_RDWR\|O_CREAT\|O_TRUNC))
				150	want_auth = USE_AUTH_MDS;
				151
				152	req = ceph_mdsc_create_request(mdsc, op, want_auth);
				153	if (IS_ERR(req))
				154	goto out;
				155	req->r_fmode = ceph_flags_to_mode(flags);
				156	req->r_args.open.flags = ceph_flags_sys2wire(flags);
				157	req->r_args.open.mode = cpu_to_le32(create_mode);
				158	out:
				159	return req;
				160	}
				161
				162	/*
				163	* initialize private struct file data.
				164	* if we fail, clean up by dropping fmode reference on the ceph_inode
				165	*/
				166	static int ceph_init_file(struct inode inode, struct file file, int fmode)
				167	{
				168	struct ceph_file_info *cf;
				169	int ret = 0;
				170
				171	switch (inode->i_mode & S_IFMT) {
				172	case S_IFREG:
				173	ceph_fscache_register_inode_cookie(inode);
				174	ceph_fscache_file_set_cookie(inode, file);
				175	case S_IFDIR:
				176	dout("init_file %p %p 0%o (regular)\n", inode, file,
				177	inode->i_mode);
				178	cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
				179	if (!cf) {
				180	ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
				181	return -ENOMEM;
				182	}
				183	cf->fmode = fmode;
				184	cf->next_offset = 2;
				185	cf->readdir_cache_idx = -1;
				186	file->private_data = cf;
				187	BUG_ON(inode->i_fop->release != ceph_release);
				188	break;
				189
				190	case S_IFLNK:
				191	dout("init_file %p %p 0%o (symlink)\n", inode, file,
				192	inode->i_mode);
				193	ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
				194	break;
				195
				196	default:
				197	dout("init_file %p %p 0%o (special)\n", inode, file,
				198	inode->i_mode);
				199	/*
				200	* we need to drop the open ref now, since we don't
				201	* have .release set to ceph_release.
				202	*/
				203	ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
				204	BUG_ON(inode->i_fop->release == ceph_release);
				205
				206	/* call the proper open fop */
				207	ret = inode->i_fop->open(inode, file);
				208	}
				209	return ret;
				210	}
				211
				212	/*
				213	* try renew caps after session gets killed.
				214	*/
				215	int ceph_renew_caps(struct inode *inode)
				216	{
				217	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
				218	struct ceph_inode_info *ci = ceph_inode(inode);
				219	struct ceph_mds_request *req;
				220	int err, flags, wanted;
				221
				222	spin_lock(&ci->i_ceph_lock);
				223	wanted = __ceph_caps_file_wanted(ci);
				224	if (__ceph_is_any_real_caps(ci) &&
				225	(!(wanted & CEPH_CAP_ANY_WR) \|\| ci->i_auth_cap)) {
				226	int issued = __ceph_caps_issued(ci, NULL);
				227	spin_unlock(&ci->i_ceph_lock);
				228	dout("renew caps %p want %s issued %s updating mds_wanted\n",
				229	inode, ceph_cap_string(wanted), ceph_cap_string(issued));
				230	ceph_check_caps(ci, 0, NULL);
				231	return 0;
				232	}
				233	spin_unlock(&ci->i_ceph_lock);
				234
				235	flags = 0;
				236	if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
				237	flags = O_RDWR;
				238	else if (wanted & CEPH_CAP_FILE_RD)
				239	flags = O_RDONLY;
				240	else if (wanted & CEPH_CAP_FILE_WR)
				241	flags = O_WRONLY;
				242	#ifdef O_LAZY
				243	if (wanted & CEPH_CAP_FILE_LAZYIO)
				244	flags \|= O_LAZY;
				245	#endif
				246
				247	req = prepare_open_request(inode->i_sb, flags, 0);
				248	if (IS_ERR(req)) {
				249	err = PTR_ERR(req);
				250	goto out;
				251	}
				252
				253	req->r_inode = inode;
				254	ihold(inode);
				255	req->r_num_caps = 1;
				256	req->r_fmode = -1;
				257
				258	err = ceph_mdsc_do_request(mdsc, NULL, req);
				259	ceph_mdsc_put_request(req);
				260	out:
				261	dout("renew caps %p open result=%d\n", inode, err);
				262	return err < 0 ? err : 0;
				263	}
				264
				265	/*
				266	* If we already have the requisite capabilities, we can satisfy
				267	* the open request locally (no need to request new caps from the
				268	* MDS). We do, however, need to inform the MDS (asynchronously)
				269	* if our wanted caps set expands.
				270	*/
				271	int ceph_open(struct inode inode, struct file file)
				272	{
				273	struct ceph_inode_info *ci = ceph_inode(inode);
				274	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
				275	struct ceph_mds_client *mdsc = fsc->mdsc;
				276	struct ceph_mds_request *req;
				277	struct ceph_file_info *cf = file->private_data;
				278	int err;
				279	int flags, fmode, wanted;
				280
				281	if (cf) {
				282	dout("open file %p is already opened\n", file);
				283	return 0;
				284	}
				285
				286	/* filter out O_CREAT\|O_EXCL; vfs did that already. yuck. */
				287	flags = file->f_flags & ~(O_CREAT\|O_EXCL);
				288	if (S_ISDIR(inode->i_mode))
				289	flags = O_DIRECTORY; /* mds likes to know */
				290
				291	dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
				292	ceph_vinop(inode), file, flags, file->f_flags);
				293	fmode = ceph_flags_to_mode(flags);
				294	wanted = ceph_caps_for_mode(fmode);
				295
				296	/* snapped files are read-only */
				297	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
				298	return -EROFS;
				299
				300	/* trivially open snapdir */
				301	if (ceph_snap(inode) == CEPH_SNAPDIR) {
				302	spin_lock(&ci->i_ceph_lock);
				303	__ceph_get_fmode(ci, fmode);
				304	spin_unlock(&ci->i_ceph_lock);
				305	return ceph_init_file(inode, file, fmode);
				306	}
				307
				308	/*
				309	* No need to block if we have caps on the auth MDS (for
				310	* write) or any MDS (for read). Update wanted set
				311	* asynchronously.
				312	*/
				313	spin_lock(&ci->i_ceph_lock);
				314	if (__ceph_is_any_real_caps(ci) &&
				315	(((fmode & CEPH_FILE_MODE_WR) == 0) \|\| ci->i_auth_cap)) {
				316	int mds_wanted = __ceph_caps_mds_wanted(ci, true);
				317	int issued = __ceph_caps_issued(ci, NULL);
				318
				319	dout("open %p fmode %d want %s issued %s using existing\n",
				320	inode, fmode, ceph_cap_string(wanted),
				321	ceph_cap_string(issued));
				322	__ceph_get_fmode(ci, fmode);
				323	spin_unlock(&ci->i_ceph_lock);
				324
				325	/* adjust wanted? */
				326	if ((issued & wanted) != wanted &&
				327	(mds_wanted & wanted) != wanted &&
				328	ceph_snap(inode) != CEPH_SNAPDIR)
				329	ceph_check_caps(ci, 0, NULL);
				330
				331	return ceph_init_file(inode, file, fmode);
				332	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
				333	(ci->i_snap_caps & wanted) == wanted) {
				334	__ceph_get_fmode(ci, fmode);
				335	spin_unlock(&ci->i_ceph_lock);
				336	return ceph_init_file(inode, file, fmode);
				337	}
				338
				339	spin_unlock(&ci->i_ceph_lock);
				340
				341	dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
				342	req = prepare_open_request(inode->i_sb, flags, 0);
				343	if (IS_ERR(req)) {
				344	err = PTR_ERR(req);
				345	goto out;
				346	}
				347	req->r_inode = inode;
				348	ihold(inode);
				349
				350	req->r_num_caps = 1;
				351	err = ceph_mdsc_do_request(mdsc, NULL, req);
				352	if (!err)
				353	err = ceph_init_file(inode, file, req->r_fmode);
				354	ceph_mdsc_put_request(req);
				355	dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
				356	out:
				357	return err;
				358	}
				359
				360
				361	/*
				362	* Do a lookup + open with a single request. If we get a non-existent
				363	* file or symlink, return 1 so the VFS can retry.
				364	*/
				365	int ceph_atomic_open(struct inode dir, struct dentry dentry,
				366	struct file *file, unsigned flags, umode_t mode,
				367	int *opened)
				368	{
				369	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
				370	struct ceph_mds_client *mdsc = fsc->mdsc;
				371	struct ceph_mds_request *req;
				372	struct dentry *dn;
				373	struct ceph_acls_info acls = {};
				374	int mask;
				375	int err;
				376
				377	dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
				378	dir, dentry, dentry,
				379	d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
				380
				381	if (dentry->d_name.len > NAME_MAX)
				382	return -ENAMETOOLONG;
				383
				384	if (flags & O_CREAT) {
				385	err = ceph_pre_init_acls(dir, &mode, &acls);
				386	if (err < 0)
				387	return err;
				388	}
				389
				390	/* do the open */
				391	req = prepare_open_request(dir->i_sb, flags, mode);
				392	if (IS_ERR(req)) {
				393	err = PTR_ERR(req);
				394	goto out_acl;
				395	}
				396	req->r_dentry = dget(dentry);
				397	req->r_num_caps = 2;
				398	if (flags & O_CREAT) {
				399	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
				400	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
				401	if (acls.pagelist) {
				402	req->r_pagelist = acls.pagelist;
				403	acls.pagelist = NULL;
				404	}
				405	}
				406
				407	mask = CEPH_STAT_CAP_INODE \| CEPH_CAP_AUTH_SHARED;
				408	if (ceph_security_xattr_wanted(dir))
				409	mask \|= CEPH_CAP_XATTR_SHARED;
				410	req->r_args.open.mask = cpu_to_le32(mask);
				411
				412	req->r_parent = dir;
				413	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
				414	err = ceph_mdsc_do_request(mdsc,
				415	(flags & (O_CREAT\|O_TRUNC)) ? dir : NULL,
				416	req);
				417	err = ceph_handle_snapdir(req, dentry, err);
				418	if (err)
				419	goto out_req;
				420
				421	if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
				422	err = ceph_handle_notrace_create(dir, dentry);
				423
				424	if (d_in_lookup(dentry)) {
				425	dn = ceph_finish_lookup(req, dentry, err);
				426	if (IS_ERR(dn))
				427	err = PTR_ERR(dn);
				428	} else {
				429	/* we were given a hashed negative dentry */
				430	dn = NULL;
				431	}
				432	if (err)
				433	goto out_req;
				434	if (dn \|\| d_really_is_negative(dentry) \|\| d_is_symlink(dentry)) {
				435	/* make vfs retry on splice, ENOENT, or symlink */
				436	dout("atomic_open finish_no_open on dn %p\n", dn);
				437	err = finish_no_open(file, dn);
				438	} else {
				439	dout("atomic_open finish_open on dn %p\n", dn);
				440	if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
				441	ceph_init_inode_acls(d_inode(dentry), &acls);
				442	*opened \|= FILE_CREATED;
				443	}
				444	err = finish_open(file, dentry, ceph_open, opened);
				445	}
				446	out_req:
				447	if (!req->r_err && req->r_target_inode)
				448	ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
				449	ceph_mdsc_put_request(req);
				450	out_acl:
				451	ceph_release_acls_info(&acls);
				452	dout("atomic_open result=%d\n", err);
				453	return err;
				454	}
				455
				456	int ceph_release(struct inode inode, struct file file)
				457	{
				458	struct ceph_inode_info *ci = ceph_inode(inode);
				459	struct ceph_file_info *cf = file->private_data;
				460
				461	dout("release inode %p file %p\n", inode, file);
				462	ceph_put_fmode(ci, cf->fmode);
				463	if (cf->last_readdir)
				464	ceph_mdsc_put_request(cf->last_readdir);
				465	kfree(cf->last_name);
				466	kfree(cf->dir_info);
				467	kmem_cache_free(ceph_file_cachep, cf);
				468
				469	/* wake up anyone waiting for caps on this inode */
				470	wake_up_all(&ci->i_cap_wq);
				471	return 0;
				472	}
				473
				474	enum {
				475	HAVE_RETRIED = 1,
				476	CHECK_EOF = 2,
				477	READ_INLINE = 3,
				478	};
				479
				480	/*
				481	* Read a range of bytes striped over one or more objects. Iterate over
				482	* objects we stripe over. (That's not atomic, but good enough for now.)
				483	*
				484	* If we get a short result from the OSD, check against i_size; we need to
				485	* only return a short read to the caller if we hit EOF.
				486	*/
				487	static int striped_read(struct inode *inode,
				488	u64 pos, u64 len,
				489	struct page **pages, int num_pages,
				490	int page_align, int *checkeof)
				491	{
				492	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
				493	struct ceph_inode_info *ci = ceph_inode(inode);
				494	u64 this_len;
				495	loff_t i_size;
				496	int page_idx;
				497	int ret, read = 0;
				498	bool hit_stripe, was_short;
				499
				500	/*
				501	* we may need to do multiple reads. not atomic, unfortunately.
				502	*/
				503	more:
				504	this_len = len;
				505	page_idx = (page_align + read) >> PAGE_SHIFT;
				506	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
				507	&ci->i_layout, pos, &this_len,
				508	ci->i_truncate_seq, ci->i_truncate_size,
				509	pages + page_idx, num_pages - page_idx,
				510	((page_align + read) & ~PAGE_MASK));
				511	if (ret == -ENOENT)
				512	ret = 0;
				513	hit_stripe = this_len < len;
				514	was_short = ret >= 0 && ret < this_len;
				515	dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
				516	ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
				517
				518	i_size = i_size_read(inode);
				519	if (ret >= 0) {
				520	if (was_short && (pos + ret < i_size)) {
				521	int zlen = min(this_len - ret, i_size - pos - ret);
				522	int zoff = page_align + read + ret;
				523	dout(" zero gap %llu to %llu\n",
				524	pos + ret, pos + ret + zlen);
				525	ceph_zero_page_vector_range(zoff, zlen, pages);
				526	ret += zlen;
				527	}
				528
				529	read += ret;
				530	pos += ret;
				531	len -= ret;
				532
				533	/* hit stripe and need continue*/
				534	if (len && hit_stripe && pos < i_size)
				535	goto more;
				536	}
				537
				538	if (read > 0) {
				539	ret = read;
				540	/* did we bounce off eof? */
				541	if (pos + len > i_size)
				542	*checkeof = CHECK_EOF;
				543	}
				544
				545	dout("striped_read returns %d\n", ret);
				546	return ret;
				547	}
				548
				549	/*
				550	* Completely synchronous read and write methods. Direct from __user
				551	* buffer to osd, or directly to user pages (if O_DIRECT).
				552	*
				553	* If the read spans object boundary, just do multiple reads.
				554	*/
				555	static ssize_t ceph_sync_read(struct kiocb iocb, struct iov_iter to,
				556	int *checkeof)
				557	{
				558	struct file *file = iocb->ki_filp;
				559	struct inode *inode = file_inode(file);
				560	struct page **pages;
				561	u64 off = iocb->ki_pos;
				562	int num_pages;
				563	ssize_t ret;
				564	size_t len = iov_iter_count(to);
				565
				566	dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
				567	(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
				568
				569	if (!len)
				570	return 0;
				571	/*
				572	* flush any page cache pages in this range. this
				573	* will make concurrent normal and sync io slow,
				574	* but it will at least behave sensibly when they are
				575	* in sequence.
				576	*/
				577	ret = filemap_write_and_wait_range(inode->i_mapping, off,
				578	off + len);
				579	if (ret < 0)
				580	return ret;
				581
				582	if (unlikely(to->type & ITER_PIPE)) {
				583	size_t page_off;
				584	ret = iov_iter_get_pages_alloc(to, &pages, len,
				585	&page_off);
				586	if (ret <= 0)
				587	return -ENOMEM;
				588	num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
				589
				590	ret = striped_read(inode, off, ret, pages, num_pages,
				591	page_off, checkeof);
				592	if (ret > 0) {
				593	iov_iter_advance(to, ret);
				594	off += ret;
				595	} else {
				596	iov_iter_advance(to, 0);
				597	}
				598	ceph_put_page_vector(pages, num_pages, false);
				599	} else {
				600	num_pages = calc_pages_for(off, len);
				601	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
				602	if (IS_ERR(pages))
				603	return PTR_ERR(pages);
				604
				605	ret = striped_read(inode, off, len, pages, num_pages,
				606	(off & ~PAGE_MASK), checkeof);
				607	if (ret > 0) {
				608	int l, k = 0;
				609	size_t left = ret;
				610
				611	while (left) {
				612	size_t page_off = off & ~PAGE_MASK;
				613	size_t copy = min_t(size_t, left,
				614	PAGE_SIZE - page_off);
				615	l = copy_page_to_iter(pages[k++], page_off,
				616	copy, to);
				617	off += l;
				618	left -= l;
				619	if (l < copy)
				620	break;
				621	}
				622	}
				623	ceph_release_page_vector(pages, num_pages);
				624	}
				625
				626	if (off > iocb->ki_pos) {
				627	ret = off - iocb->ki_pos;
				628	iocb->ki_pos = off;
				629	}
				630
				631	dout("sync_read result %zd\n", ret);
				632	return ret;
				633	}
				634
				635	struct ceph_aio_request {
				636	struct kiocb *iocb;
				637	size_t total_len;
				638	bool write;
				639	bool should_dirty;
				640	int error;
				641	struct list_head osd_reqs;
				642	unsigned num_reqs;
				643	atomic_t pending_reqs;
				644	struct timespec mtime;
				645	struct ceph_cap_flush *prealloc_cf;
				646	};
				647
				648	struct ceph_aio_work {
				649	struct work_struct work;
				650	struct ceph_osd_request *req;
				651	};
				652
				653	static void ceph_aio_retry_work(struct work_struct *work);
				654
				655	static void ceph_aio_complete(struct inode *inode,
				656	struct ceph_aio_request *aio_req)
				657	{
				658	struct ceph_inode_info *ci = ceph_inode(inode);
				659	int ret;
				660
				661	if (!atomic_dec_and_test(&aio_req->pending_reqs))
				662	return;
				663
				664	ret = aio_req->error;
				665	if (!ret)
				666	ret = aio_req->total_len;
				667
				668	dout("ceph_aio_complete %p rc %d\n", inode, ret);
				669
				670	if (ret >= 0 && aio_req->write) {
				671	int dirty;
				672
				673	loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
				674	if (endoff > i_size_read(inode)) {
				675	if (ceph_inode_set_size(inode, endoff))
				676	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				677	}
				678
				679	spin_lock(&ci->i_ceph_lock);
				680	ci->i_inline_version = CEPH_INLINE_NONE;
				681	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
				682	&aio_req->prealloc_cf);
				683	spin_unlock(&ci->i_ceph_lock);
				684	if (dirty)
				685	__mark_inode_dirty(inode, dirty);
				686
				687	}
				688
				689	ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
				690	CEPH_CAP_FILE_RD));
				691
				692	aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
				693
				694	ceph_free_cap_flush(aio_req->prealloc_cf);
				695	kfree(aio_req);
				696	}
				697
				698	static void ceph_aio_complete_req(struct ceph_osd_request *req)
				699	{
				700	int rc = req->r_result;
				701	struct inode *inode = req->r_inode;
				702	struct ceph_aio_request *aio_req = req->r_priv;
				703	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
				704	int num_pages = calc_pages_for((u64)osd_data->alignment,
				705	osd_data->length);
				706
				707	dout("ceph_aio_complete_req %p rc %d bytes %llu\n",
				708	inode, rc, osd_data->length);
				709
				710	if (rc == -EOLDSNAPC) {
				711	struct ceph_aio_work *aio_work;
				712	BUG_ON(!aio_req->write);
				713
				714	aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
				715	if (aio_work) {
				716	INIT_WORK(&aio_work->work, ceph_aio_retry_work);
				717	aio_work->req = req;
				718	queue_work(ceph_inode_to_client(inode)->wb_wq,
				719	&aio_work->work);
				720	return;
				721	}
				722	rc = -ENOMEM;
				723	} else if (!aio_req->write) {
				724	if (rc == -ENOENT)
				725	rc = 0;
				726	if (rc >= 0 && osd_data->length > rc) {
				727	int zoff = osd_data->alignment + rc;
				728	int zlen = osd_data->length - rc;
				729	/*
				730	* If read is satisfied by single OSD request,
				731	* it can pass EOF. Otherwise read is within
				732	* i_size.
				733	*/
				734	if (aio_req->num_reqs == 1) {
				735	loff_t i_size = i_size_read(inode);
				736	loff_t endoff = aio_req->iocb->ki_pos + rc;
				737	if (endoff < i_size)
				738	zlen = min_t(size_t, zlen,
				739	i_size - endoff);
				740	aio_req->total_len = rc + zlen;
				741	}
				742
				743	if (zlen > 0)
				744	ceph_zero_page_vector_range(zoff, zlen,
				745	osd_data->pages);
				746	}
				747	}
				748
				749	ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty);
				750	ceph_osdc_put_request(req);
				751
				752	if (rc < 0)
				753	cmpxchg(&aio_req->error, 0, rc);
				754
				755	ceph_aio_complete(inode, aio_req);
				756	return;
				757	}
				758
				759	static void ceph_aio_retry_work(struct work_struct *work)
				760	{
				761	struct ceph_aio_work *aio_work =
				762	container_of(work, struct ceph_aio_work, work);
				763	struct ceph_osd_request *orig_req = aio_work->req;
				764	struct ceph_aio_request *aio_req = orig_req->r_priv;
				765	struct inode *inode = orig_req->r_inode;
				766	struct ceph_inode_info *ci = ceph_inode(inode);
				767	struct ceph_snap_context *snapc;
				768	struct ceph_osd_request *req;
				769	int ret;
				770
				771	spin_lock(&ci->i_ceph_lock);
				772	if (__ceph_have_pending_cap_snap(ci)) {
				773	struct ceph_cap_snap *capsnap =
				774	list_last_entry(&ci->i_cap_snaps,
				775	struct ceph_cap_snap,
				776	ci_item);
				777	snapc = ceph_get_snap_context(capsnap->context);
				778	} else {
				779	BUG_ON(!ci->i_head_snapc);
				780	snapc = ceph_get_snap_context(ci->i_head_snapc);
				781	}
				782	spin_unlock(&ci->i_ceph_lock);
				783
				784	req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
				785	false, GFP_NOFS);
				786	if (!req) {
				787	ret = -ENOMEM;
				788	req = orig_req;
				789	goto out;
				790	}
				791
				792	req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP \| */ CEPH_OSD_FLAG_WRITE;
				793	ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
				794	ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
				795
				796	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
				797	if (ret) {
				798	ceph_osdc_put_request(req);
				799	req = orig_req;
				800	goto out;
				801	}
				802
				803	req->r_ops[0] = orig_req->r_ops[0];
				804
				805	req->r_mtime = aio_req->mtime;
				806	req->r_data_offset = req->r_ops[0].extent.offset;
				807
				808	ceph_osdc_put_request(orig_req);
				809
				810	req->r_callback = ceph_aio_complete_req;
				811	req->r_inode = inode;
				812	req->r_priv = aio_req;
				813	req->r_abort_on_full = true;
				814
				815	ret = ceph_osdc_start_request(req->r_osdc, req, false);
				816	out:
				817	if (ret < 0) {
				818	req->r_result = ret;
				819	ceph_aio_complete_req(req);
				820	}
				821
				822	ceph_put_snap_context(snapc);
				823	kfree(aio_work);
				824	}
				825
				826	static ssize_t
				827	ceph_direct_read_write(struct kiocb iocb, struct iov_iter iter,
				828	struct ceph_snap_context *snapc,
				829	struct ceph_cap_flush **pcf)
				830	{
				831	struct file *file = iocb->ki_filp;
				832	struct inode *inode = file_inode(file);
				833	struct ceph_inode_info *ci = ceph_inode(inode);
				834	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
				835	struct ceph_vino vino;
				836	struct ceph_osd_request *req;
				837	struct page **pages;
				838	struct ceph_aio_request *aio_req = NULL;
				839	int num_pages = 0;
				840	int flags;
				841	int ret;
				842	struct timespec mtime = current_time(inode);
				843	size_t count = iov_iter_count(iter);
				844	loff_t pos = iocb->ki_pos;
				845	bool write = iov_iter_rw(iter) == WRITE;
				846	bool should_dirty = !write && iter_is_iovec(iter);
				847
				848	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
				849	return -EROFS;
				850
				851	dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
				852	(write ? "write" : "read"), file, pos, (unsigned)count,
				853	snapc, snapc->seq);
				854
				855	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
				856	if (ret < 0)
				857	return ret;
				858
				859	if (write) {
				860	int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
				861	pos >> PAGE_SHIFT,
				862	(pos + count) >> PAGE_SHIFT);
				863	if (ret2 < 0)
				864	dout("invalidate_inode_pages2_range returned %d\n", ret2);
				865
				866	flags = /* CEPH_OSD_FLAG_ORDERSNAP \| */ CEPH_OSD_FLAG_WRITE;
				867	} else {
				868	flags = CEPH_OSD_FLAG_READ;
				869	}
				870
				871	while (iov_iter_count(iter) > 0) {
				872	u64 size = dio_get_pagev_size(iter);
				873	size_t start = 0;
				874	ssize_t len;
				875
				876	if (write)
				877	size = min_t(u64, size, fsc->mount_options->wsize);
				878	else
				879	size = min_t(u64, size, fsc->mount_options->rsize);
				880
				881	vino = ceph_vino(inode);
				882	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				883	vino, pos, &size, 0,
				884	1,
				885	write ? CEPH_OSD_OP_WRITE :
				886	CEPH_OSD_OP_READ,
				887	flags, snapc,
				888	ci->i_truncate_seq,
				889	ci->i_truncate_size,
				890	false);
				891	if (IS_ERR(req)) {
				892	ret = PTR_ERR(req);
				893	break;
				894	}
				895
				896	len = size;
				897	pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
				898	if (IS_ERR(pages)) {
				899	ceph_osdc_put_request(req);
				900	ret = PTR_ERR(pages);
				901	break;
				902	}
				903
				904	/*
				905	* To simplify error handling, allow AIO when IO within i_size
				906	* or IO can be satisfied by single OSD request.
				907	*/
				908	if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
				909	(len == count \|\| pos + count <= i_size_read(inode))) {
				910	aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
				911	if (aio_req) {
				912	aio_req->iocb = iocb;
				913	aio_req->write = write;
				914	aio_req->should_dirty = should_dirty;
				915	INIT_LIST_HEAD(&aio_req->osd_reqs);
				916	if (write) {
				917	aio_req->mtime = mtime;
				918	swap(aio_req->prealloc_cf, *pcf);
				919	}
				920	}
				921	/* ignore error */
				922	}
				923
				924	if (write) {
				925	/*
				926	* throw out any page cache pages in this range. this
				927	* may block.
				928	*/
				929	truncate_inode_pages_range(inode->i_mapping, pos,
				930	(pos+len) \| (PAGE_SIZE - 1));
				931
				932	req->r_mtime = mtime;
				933	}
				934
				935	osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
				936	false, false);
				937
				938	if (aio_req) {
				939	aio_req->total_len += len;
				940	aio_req->num_reqs++;
				941	atomic_inc(&aio_req->pending_reqs);
				942
				943	req->r_callback = ceph_aio_complete_req;
				944	req->r_inode = inode;
				945	req->r_priv = aio_req;
				946	list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
				947
				948	pos += len;
				949	iov_iter_advance(iter, len);
				950	continue;
				951	}
				952
				953	ret = ceph_osdc_start_request(req->r_osdc, req, false);
				954	if (!ret)
				955	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
				956
				957	size = i_size_read(inode);
				958	if (!write) {
				959	if (ret == -ENOENT)
				960	ret = 0;
				961	if (ret >= 0 && ret < len && pos + ret < size) {
				962	int zlen = min_t(size_t, len - ret,
				963	size - pos - ret);
				964	ceph_zero_page_vector_range(start + ret, zlen,
				965	pages);
				966	ret += zlen;
				967	}
				968	if (ret >= 0)
				969	len = ret;
				970	}
				971
				972	ceph_put_page_vector(pages, num_pages, should_dirty);
				973
				974	ceph_osdc_put_request(req);
				975	if (ret < 0)
				976	break;
				977
				978	pos += len;
				979	iov_iter_advance(iter, len);
				980
				981	if (!write && pos >= size)
				982	break;
				983
				984	if (write && pos > size) {
				985	if (ceph_inode_set_size(inode, pos))
				986	ceph_check_caps(ceph_inode(inode),
				987	CHECK_CAPS_AUTHONLY,
				988	NULL);
				989	}
				990	}
				991
				992	if (aio_req) {
				993	LIST_HEAD(osd_reqs);
				994
				995	if (aio_req->num_reqs == 0) {
				996	kfree(aio_req);
				997	return ret;
				998	}
				999
				1000	ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
				1001	CEPH_CAP_FILE_RD);
				1002
				1003	list_splice(&aio_req->osd_reqs, &osd_reqs);
				1004	while (!list_empty(&osd_reqs)) {
				1005	req = list_first_entry(&osd_reqs,
				1006	struct ceph_osd_request,
				1007	r_unsafe_item);
				1008	list_del_init(&req->r_unsafe_item);
				1009	if (ret >= 0)
				1010	ret = ceph_osdc_start_request(req->r_osdc,
				1011	req, false);
				1012	if (ret < 0) {
				1013	req->r_result = ret;
				1014	ceph_aio_complete_req(req);
				1015	}
				1016	}
				1017	return -EIOCBQUEUED;
				1018	}
				1019
				1020	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
				1021	ret = pos - iocb->ki_pos;
				1022	iocb->ki_pos = pos;
				1023	}
				1024	return ret;
				1025	}
				1026
				1027	/*
				1028	* Synchronous write, straight from __user pointer or user pages.
				1029	*
				1030	* If write spans object boundary, just do multiple writes. (For a
				1031	* correct atomic write, we should e.g. take write locks on all
				1032	* objects, rollback on failure, etc.)
				1033	*/
				1034	static ssize_t
				1035	ceph_sync_write(struct kiocb iocb, struct iov_iter from, loff_t pos,
				1036	struct ceph_snap_context *snapc)
				1037	{
				1038	struct file *file = iocb->ki_filp;
				1039	struct inode *inode = file_inode(file);
				1040	struct ceph_inode_info *ci = ceph_inode(inode);
				1041	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
				1042	struct ceph_vino vino;
				1043	struct ceph_osd_request *req;
				1044	struct page **pages;
				1045	u64 len;
				1046	int num_pages;
				1047	int written = 0;
				1048	int flags;
				1049	int ret;
				1050	bool check_caps = false;
				1051	struct timespec mtime = current_time(inode);
				1052	size_t count = iov_iter_count(from);
				1053
				1054	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
				1055	return -EROFS;
				1056
				1057	dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
				1058	file, pos, (unsigned)count, snapc, snapc->seq);
				1059
				1060	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
				1061	if (ret < 0)
				1062	return ret;
				1063
				1064	ret = invalidate_inode_pages2_range(inode->i_mapping,
				1065	pos >> PAGE_SHIFT,
				1066	(pos + count) >> PAGE_SHIFT);
				1067	if (ret < 0)
				1068	dout("invalidate_inode_pages2_range returned %d\n", ret);
				1069
				1070	flags = /* CEPH_OSD_FLAG_ORDERSNAP \| */ CEPH_OSD_FLAG_WRITE;
				1071
				1072	while ((len = iov_iter_count(from)) > 0) {
				1073	size_t left;
				1074	int n;
				1075
				1076	vino = ceph_vino(inode);
				1077	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				1078	vino, pos, &len, 0, 1,
				1079	CEPH_OSD_OP_WRITE, flags, snapc,
				1080	ci->i_truncate_seq,
				1081	ci->i_truncate_size,
				1082	false);
				1083	if (IS_ERR(req)) {
				1084	ret = PTR_ERR(req);
				1085	break;
				1086	}
				1087
				1088	/*
				1089	* write from beginning of first page,
				1090	* regardless of io alignment
				1091	*/
				1092	num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
				1093
				1094	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
				1095	if (IS_ERR(pages)) {
				1096	ret = PTR_ERR(pages);
				1097	goto out;
				1098	}
				1099
				1100	left = len;
				1101	for (n = 0; n < num_pages; n++) {
				1102	size_t plen = min_t(size_t, left, PAGE_SIZE);
				1103	ret = copy_page_from_iter(pages[n], 0, plen, from);
				1104	if (ret != plen) {
				1105	ret = -EFAULT;
				1106	break;
				1107	}
				1108	left -= ret;
				1109	}
				1110
				1111	if (ret < 0) {
				1112	ceph_release_page_vector(pages, num_pages);
				1113	goto out;
				1114	}
				1115
				1116	req->r_inode = inode;
				1117
				1118	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
				1119	false, true);
				1120
				1121	req->r_mtime = mtime;
				1122	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
				1123	if (!ret)
				1124	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
				1125
				1126	out:
				1127	ceph_osdc_put_request(req);
				1128	if (ret != 0) {
				1129	ceph_set_error_write(ci);
				1130	break;
				1131	}
				1132
				1133	ceph_clear_error_write(ci);
				1134	pos += len;
				1135	written += len;
				1136	if (pos > i_size_read(inode)) {
				1137	check_caps = ceph_inode_set_size(inode, pos);
				1138	if (check_caps)
				1139	ceph_check_caps(ceph_inode(inode),
				1140	CHECK_CAPS_AUTHONLY,
				1141	NULL);
				1142	}
				1143
				1144	}
				1145
				1146	if (ret != -EOLDSNAPC && written > 0) {
				1147	ret = written;
				1148	iocb->ki_pos = pos;
				1149	}
				1150	return ret;
				1151	}
				1152
				1153	/*
				1154	* Wrap generic_file_aio_read with checks for cap bits on the inode.
				1155	* Atomically grab references, so that those bits are not released
				1156	* back to the MDS mid-read.
				1157	*
				1158	* Hmm, the sync read case isn't actually async... should it be?
				1159	*/
				1160	static ssize_t ceph_read_iter(struct kiocb iocb, struct iov_iter to)
				1161	{
				1162	struct file *filp = iocb->ki_filp;
				1163	struct ceph_file_info *fi = filp->private_data;
				1164	size_t len = iov_iter_count(to);
				1165	struct inode *inode = file_inode(filp);
				1166	struct ceph_inode_info *ci = ceph_inode(inode);
				1167	struct page *pinned_page = NULL;
				1168	ssize_t ret;
				1169	int want, got = 0;
				1170	int retry_op = 0, read = 0;
				1171
				1172	again:
				1173	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
				1174	inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
				1175
				1176	if (fi->fmode & CEPH_FILE_MODE_LAZY)
				1177	want = CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_LAZYIO;
				1178	else
				1179	want = CEPH_CAP_FILE_CACHE;
				1180	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
				1181	if (ret < 0)
				1182	return ret;
				1183
				1184	if ((got & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) == 0 \|\|
				1185	(iocb->ki_flags & IOCB_DIRECT) \|\|
				1186	(fi->flags & CEPH_F_SYNC)) {
				1187
				1188	dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
				1189	inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
				1190	ceph_cap_string(got));
				1191
				1192	if (ci->i_inline_version == CEPH_INLINE_NONE) {
				1193	if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
				1194	ret = ceph_direct_read_write(iocb, to,
				1195	NULL, NULL);
				1196	if (ret >= 0 && ret < len)
				1197	retry_op = CHECK_EOF;
				1198	} else {
				1199	ret = ceph_sync_read(iocb, to, &retry_op);
				1200	}
				1201	} else {
				1202	retry_op = READ_INLINE;
				1203	}
				1204	} else {
				1205	dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
				1206	inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
				1207	ceph_cap_string(got));
				1208	current->journal_info = filp;
				1209	ret = generic_file_read_iter(iocb, to);
				1210	current->journal_info = NULL;
				1211	}
				1212	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
				1213	inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
				1214	if (pinned_page) {
				1215	put_page(pinned_page);
				1216	pinned_page = NULL;
				1217	}
				1218	ceph_put_cap_refs(ci, got);
				1219	if (retry_op > HAVE_RETRIED && ret >= 0) {
				1220	int statret;
				1221	struct page *page = NULL;
				1222	loff_t i_size;
				1223	if (retry_op == READ_INLINE) {
				1224	page = __page_cache_alloc(GFP_KERNEL);
				1225	if (!page)
				1226	return -ENOMEM;
				1227	}
				1228
				1229	statret = __ceph_do_getattr(inode, page,
				1230	CEPH_STAT_CAP_INLINE_DATA, !!page);
				1231	if (statret < 0) {
				1232	if (page)
				1233	__free_page(page);
				1234	if (statret == -ENODATA) {
				1235	BUG_ON(retry_op != READ_INLINE);
				1236	goto again;
				1237	}
				1238	return statret;
				1239	}
				1240
				1241	i_size = i_size_read(inode);
				1242	if (retry_op == READ_INLINE) {
				1243	BUG_ON(ret > 0 \|\| read > 0);
				1244	if (iocb->ki_pos < i_size &&
				1245	iocb->ki_pos < PAGE_SIZE) {
				1246	loff_t end = min_t(loff_t, i_size,
				1247	iocb->ki_pos + len);
				1248	end = min_t(loff_t, end, PAGE_SIZE);
				1249	if (statret < end)
				1250	zero_user_segment(page, statret, end);
				1251	ret = copy_page_to_iter(page,
				1252	iocb->ki_pos & ~PAGE_MASK,
				1253	end - iocb->ki_pos, to);
				1254	iocb->ki_pos += ret;
				1255	read += ret;
				1256	}
				1257	if (iocb->ki_pos < i_size && read < len) {
				1258	size_t zlen = min_t(size_t, len - read,
				1259	i_size - iocb->ki_pos);
				1260	ret = iov_iter_zero(zlen, to);
				1261	iocb->ki_pos += ret;
				1262	read += ret;
				1263	}
				1264	__free_pages(page, 0);
				1265	return read;
				1266	}
				1267
				1268	/* hit EOF or hole? */
				1269	if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
				1270	ret < len) {
				1271	dout("sync_read hit hole, ppos %lld < size %lld"
				1272	", reading more\n", iocb->ki_pos, i_size);
				1273
				1274	read += ret;
				1275	len -= ret;
				1276	retry_op = HAVE_RETRIED;
				1277	goto again;
				1278	}
				1279	}
				1280
				1281	if (ret >= 0)
				1282	ret += read;
				1283
				1284	return ret;
				1285	}
				1286
				1287	/*
				1288	* Take cap references to avoid releasing caps to MDS mid-write.
				1289	*
				1290	* If we are synchronous, and write with an old snap context, the OSD
				1291	* may return EOLDSNAPC. In that case, retry the write.. _after_
				1292	* dropping our cap refs and allowing the pending snap to logically
				1293	* complete _before_ this write occurs.
				1294	*
				1295	* If we are near ENOSPC, write synchronously.
				1296	*/
				1297	static ssize_t ceph_write_iter(struct kiocb iocb, struct iov_iter from)
				1298	{
				1299	struct file *file = iocb->ki_filp;
				1300	struct ceph_file_info *fi = file->private_data;
				1301	struct inode *inode = file_inode(file);
				1302	struct ceph_inode_info *ci = ceph_inode(inode);
				1303	struct ceph_osd_client *osdc =
				1304	&ceph_sb_to_client(inode->i_sb)->client->osdc;
				1305	struct ceph_cap_flush *prealloc_cf;
				1306	ssize_t count, written = 0;
				1307	int err, want, got;
				1308	loff_t pos;
				1309
				1310	if (ceph_snap(inode) != CEPH_NOSNAP)
				1311	return -EROFS;
				1312
				1313	prealloc_cf = ceph_alloc_cap_flush();
				1314	if (!prealloc_cf)
				1315	return -ENOMEM;
				1316
				1317	retry_snap:
				1318	inode_lock(inode);
				1319
				1320	/* We can write back this queue in page reclaim */
				1321	current->backing_dev_info = inode_to_bdi(inode);
				1322
				1323	if (iocb->ki_flags & IOCB_APPEND) {
				1324	err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
				1325	if (err < 0)
				1326	goto out;
				1327	}
				1328
				1329	err = generic_write_checks(iocb, from);
				1330	if (err <= 0)
				1331	goto out;
				1332
				1333	pos = iocb->ki_pos;
				1334	count = iov_iter_count(from);
				1335	err = file_remove_privs(file);
				1336	if (err)
				1337	goto out;
				1338
				1339	err = file_update_time(file);
				1340	if (err)
				1341	goto out;
				1342
				1343	if (ci->i_inline_version != CEPH_INLINE_NONE) {
				1344	err = ceph_uninline_data(file, NULL);
				1345	if (err < 0)
				1346	goto out;
				1347	}
				1348
				1349	/* FIXME: not complete since it doesn't account for being at quota */
				1350	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) {
				1351	err = -ENOSPC;
				1352	goto out;
				1353	}
				1354
				1355	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
				1356	inode, ceph_vinop(inode), pos, count, i_size_read(inode));
				1357	if (fi->fmode & CEPH_FILE_MODE_LAZY)
				1358	want = CEPH_CAP_FILE_BUFFER \| CEPH_CAP_FILE_LAZYIO;
				1359	else
				1360	want = CEPH_CAP_FILE_BUFFER;
				1361	got = 0;
				1362	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count,
				1363	&got, NULL);
				1364	if (err < 0)
				1365	goto out;
				1366
				1367	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
				1368	inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
				1369
				1370	if ((got & (CEPH_CAP_FILE_BUFFER\|CEPH_CAP_FILE_LAZYIO)) == 0 \|\|
				1371	(iocb->ki_flags & IOCB_DIRECT) \|\| (fi->flags & CEPH_F_SYNC) \|\|
				1372	(ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
				1373	struct ceph_snap_context *snapc;
				1374	struct iov_iter data;
				1375	inode_unlock(inode);
				1376
				1377	spin_lock(&ci->i_ceph_lock);
				1378	if (__ceph_have_pending_cap_snap(ci)) {
				1379	struct ceph_cap_snap *capsnap =
				1380	list_last_entry(&ci->i_cap_snaps,
				1381	struct ceph_cap_snap,
				1382	ci_item);
				1383	snapc = ceph_get_snap_context(capsnap->context);
				1384	} else {
				1385	BUG_ON(!ci->i_head_snapc);
				1386	snapc = ceph_get_snap_context(ci->i_head_snapc);
				1387	}
				1388	spin_unlock(&ci->i_ceph_lock);
				1389
				1390	/* we might need to revert back to that point */
				1391	data = *from;
				1392	if (iocb->ki_flags & IOCB_DIRECT)
				1393	written = ceph_direct_read_write(iocb, &data, snapc,
				1394	&prealloc_cf);
				1395	else
				1396	written = ceph_sync_write(iocb, &data, pos, snapc);
				1397	if (written > 0)
				1398	iov_iter_advance(from, written);
				1399	ceph_put_snap_context(snapc);
				1400	} else {
				1401	/*
				1402	* No need to acquire the i_truncate_mutex. Because
				1403	* the MDS revokes Fwb caps before sending truncate
				1404	* message to us. We can't get Fwb cap while there
				1405	* are pending vmtruncate. So write and vmtruncate
				1406	* can not run at the same time
				1407	*/
				1408	written = generic_perform_write(file, from, pos);
				1409	if (likely(written >= 0))
				1410	iocb->ki_pos = pos + written;
				1411	inode_unlock(inode);
				1412	}
				1413
				1414	if (written >= 0) {
				1415	int dirty;
				1416	spin_lock(&ci->i_ceph_lock);
				1417	ci->i_inline_version = CEPH_INLINE_NONE;
				1418	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
				1419	&prealloc_cf);
				1420	spin_unlock(&ci->i_ceph_lock);
				1421	if (dirty)
				1422	__mark_inode_dirty(inode, dirty);
				1423	}
				1424
				1425	dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
				1426	inode, ceph_vinop(inode), pos, (unsigned)count,
				1427	ceph_cap_string(got));
				1428	ceph_put_cap_refs(ci, got);
				1429
				1430	if (written == -EOLDSNAPC) {
				1431	dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
				1432	inode, ceph_vinop(inode), pos, (unsigned)count);
				1433	goto retry_snap;
				1434	}
				1435
				1436	if (written >= 0) {
				1437	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL))
				1438	iocb->ki_flags \|= IOCB_DSYNC;
				1439	written = generic_write_sync(iocb, written);
				1440	}
				1441
				1442	goto out_unlocked;
				1443
				1444	out:
				1445	inode_unlock(inode);
				1446	out_unlocked:
				1447	ceph_free_cap_flush(prealloc_cf);
				1448	current->backing_dev_info = NULL;
				1449	return written ? written : err;
				1450	}
				1451
				1452	/*
				1453	* llseek. be sure to verify file size on SEEK_END.
				1454	*/
				1455	static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
				1456	{
				1457	struct inode *inode = file->f_mapping->host;
				1458	loff_t i_size;
				1459	loff_t ret;
				1460
				1461	inode_lock(inode);
				1462
				1463	if (whence == SEEK_END \|\| whence == SEEK_DATA \|\| whence == SEEK_HOLE) {
				1464	ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
				1465	if (ret < 0)
				1466	goto out;
				1467	}
				1468
				1469	i_size = i_size_read(inode);
				1470	switch (whence) {
				1471	case SEEK_END:
				1472	offset += i_size;
				1473	break;
				1474	case SEEK_CUR:
				1475	/*
				1476	* Here we special-case the lseek(fd, 0, SEEK_CUR)
				1477	* position-querying operation. Avoid rewriting the "same"
				1478	* f_pos value back to the file because a concurrent read(),
				1479	* write() or lseek() might have altered it
				1480	*/
				1481	if (offset == 0) {
				1482	ret = file->f_pos;
				1483	goto out;
				1484	}
				1485	offset += file->f_pos;
				1486	break;
				1487	case SEEK_DATA:
				1488	if (offset < 0 \|\| offset >= i_size) {
				1489	ret = -ENXIO;
				1490	goto out;
				1491	}
				1492	break;
				1493	case SEEK_HOLE:
				1494	if (offset < 0 \|\| offset >= i_size) {
				1495	ret = -ENXIO;
				1496	goto out;
				1497	}
				1498	offset = i_size;
				1499	break;
				1500	}
				1501
				1502	ret = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
				1503
				1504	out:
				1505	inode_unlock(inode);
				1506	return ret;
				1507	}
				1508
				1509	static inline void ceph_zero_partial_page(
				1510	struct inode *inode, loff_t offset, unsigned size)
				1511	{
				1512	struct page *page;
				1513	pgoff_t index = offset >> PAGE_SHIFT;
				1514
				1515	page = find_lock_page(inode->i_mapping, index);
				1516	if (page) {
				1517	wait_on_page_writeback(page);
				1518	zero_user(page, offset & (PAGE_SIZE - 1), size);
				1519	unlock_page(page);
				1520	put_page(page);
				1521	}
				1522	}
				1523
				1524	static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
				1525	loff_t length)
				1526	{
				1527	loff_t nearly = round_up(offset, PAGE_SIZE);
				1528	if (offset < nearly) {
				1529	loff_t size = nearly - offset;
				1530	if (length < size)
				1531	size = length;
				1532	ceph_zero_partial_page(inode, offset, size);
				1533	offset += size;
				1534	length -= size;
				1535	}
				1536	if (length >= PAGE_SIZE) {
				1537	loff_t size = round_down(length, PAGE_SIZE);
				1538	truncate_pagecache_range(inode, offset, offset + size - 1);
				1539	offset += size;
				1540	length -= size;
				1541	}
				1542	if (length)
				1543	ceph_zero_partial_page(inode, offset, length);
				1544	}
				1545
				1546	static int ceph_zero_partial_object(struct inode *inode,
				1547	loff_t offset, loff_t *length)
				1548	{
				1549	struct ceph_inode_info *ci = ceph_inode(inode);
				1550	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
				1551	struct ceph_osd_request *req;
				1552	int ret = 0;
				1553	loff_t zero = 0;
				1554	int op;
				1555
				1556	if (!length) {
				1557	op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
				1558	length = &zero;
				1559	} else {
				1560	op = CEPH_OSD_OP_ZERO;
				1561	}
				1562
				1563	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				1564	ceph_vino(inode),
				1565	offset, length,
				1566	0, 1, op,
				1567	CEPH_OSD_FLAG_WRITE,
				1568	NULL, 0, 0, false);
				1569	if (IS_ERR(req)) {
				1570	ret = PTR_ERR(req);
				1571	goto out;
				1572	}
				1573
				1574	req->r_mtime = inode->i_mtime;
				1575	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
				1576	if (!ret) {
				1577	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
				1578	if (ret == -ENOENT)
				1579	ret = 0;
				1580	}
				1581	ceph_osdc_put_request(req);
				1582
				1583	out:
				1584	return ret;
				1585	}
				1586
				1587	static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
				1588	{
				1589	int ret = 0;
				1590	struct ceph_inode_info *ci = ceph_inode(inode);
				1591	s32 stripe_unit = ci->i_layout.stripe_unit;
				1592	s32 stripe_count = ci->i_layout.stripe_count;
				1593	s32 object_size = ci->i_layout.object_size;
				1594	u64 object_set_size = object_size * stripe_count;
				1595	u64 nearly, t;
				1596
				1597	/* round offset up to next period boundary */
				1598	nearly = offset + object_set_size - 1;
				1599	t = nearly;
				1600	nearly -= do_div(t, object_set_size);
				1601
				1602	while (length && offset < nearly) {
				1603	loff_t size = length;
				1604	ret = ceph_zero_partial_object(inode, offset, &size);
				1605	if (ret < 0)
				1606	return ret;
				1607	offset += size;
				1608	length -= size;
				1609	}
				1610	while (length >= object_set_size) {
				1611	int i;
				1612	loff_t pos = offset;
				1613	for (i = 0; i < stripe_count; ++i) {
				1614	ret = ceph_zero_partial_object(inode, pos, NULL);
				1615	if (ret < 0)
				1616	return ret;
				1617	pos += stripe_unit;
				1618	}
				1619	offset += object_set_size;
				1620	length -= object_set_size;
				1621	}
				1622	while (length) {
				1623	loff_t size = length;
				1624	ret = ceph_zero_partial_object(inode, offset, &size);
				1625	if (ret < 0)
				1626	return ret;
				1627	offset += size;
				1628	length -= size;
				1629	}
				1630	return ret;
				1631	}
				1632
				1633	static long ceph_fallocate(struct file *file, int mode,
				1634	loff_t offset, loff_t length)
				1635	{
				1636	struct ceph_file_info *fi = file->private_data;
				1637	struct inode *inode = file_inode(file);
				1638	struct ceph_inode_info *ci = ceph_inode(inode);
				1639	struct ceph_osd_client *osdc =
				1640	&ceph_inode_to_client(inode)->client->osdc;
				1641	struct ceph_cap_flush *prealloc_cf;
				1642	int want, got = 0;
				1643	int dirty;
				1644	int ret = 0;
				1645	loff_t endoff = 0;
				1646	loff_t size;
				1647
				1648	if (mode & ~(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE))
				1649	return -EOPNOTSUPP;
				1650
				1651	if (!S_ISREG(inode->i_mode))
				1652	return -EOPNOTSUPP;
				1653
				1654	prealloc_cf = ceph_alloc_cap_flush();
				1655	if (!prealloc_cf)
				1656	return -ENOMEM;
				1657
				1658	inode_lock(inode);
				1659
				1660	if (ceph_snap(inode) != CEPH_NOSNAP) {
				1661	ret = -EROFS;
				1662	goto unlock;
				1663	}
				1664
				1665	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) &&
				1666	!(mode & FALLOC_FL_PUNCH_HOLE)) {
				1667	ret = -ENOSPC;
				1668	goto unlock;
				1669	}
				1670
				1671	if (ci->i_inline_version != CEPH_INLINE_NONE) {
				1672	ret = ceph_uninline_data(file, NULL);
				1673	if (ret < 0)
				1674	goto unlock;
				1675	}
				1676
				1677	size = i_size_read(inode);
				1678	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
				1679	endoff = offset + length;
				1680	ret = inode_newsize_ok(inode, endoff);
				1681	if (ret)
				1682	goto unlock;
				1683	}
				1684
				1685	if (fi->fmode & CEPH_FILE_MODE_LAZY)
				1686	want = CEPH_CAP_FILE_BUFFER \| CEPH_CAP_FILE_LAZYIO;
				1687	else
				1688	want = CEPH_CAP_FILE_BUFFER;
				1689
				1690	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
				1691	if (ret < 0)
				1692	goto unlock;
				1693
				1694	if (mode & FALLOC_FL_PUNCH_HOLE) {
				1695	if (offset < size)
				1696	ceph_zero_pagecache_range(inode, offset, length);
				1697	ret = ceph_zero_objects(inode, offset, length);
				1698	} else if (endoff > size) {
				1699	truncate_pagecache_range(inode, size, -1);
				1700	if (ceph_inode_set_size(inode, endoff))
				1701	ceph_check_caps(ceph_inode(inode),
				1702	CHECK_CAPS_AUTHONLY, NULL);
				1703	}
				1704
				1705	if (!ret) {
				1706	spin_lock(&ci->i_ceph_lock);
				1707	ci->i_inline_version = CEPH_INLINE_NONE;
				1708	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
				1709	&prealloc_cf);
				1710	spin_unlock(&ci->i_ceph_lock);
				1711	if (dirty)
				1712	__mark_inode_dirty(inode, dirty);
				1713	}
				1714
				1715	ceph_put_cap_refs(ci, got);
				1716	unlock:
				1717	inode_unlock(inode);
				1718	ceph_free_cap_flush(prealloc_cf);
				1719	return ret;
				1720	}
				1721
				1722	const struct file_operations ceph_file_fops = {
				1723	.open = ceph_open,
				1724	.release = ceph_release,
				1725	.llseek = ceph_llseek,
				1726	.read_iter = ceph_read_iter,
				1727	.write_iter = ceph_write_iter,
				1728	.mmap = ceph_mmap,
				1729	.fsync = ceph_fsync,
				1730	.lock = ceph_lock,
				1731	.setlease = simple_nosetlease,
				1732	.flock = ceph_flock,
				1733	.splice_read = generic_file_splice_read,
				1734	.splice_write = iter_file_splice_write,
				1735	.unlocked_ioctl = ceph_ioctl,
				1736	.compat_ioctl = ceph_ioctl,
				1737	.fallocate = ceph_fallocate,
				1738	};
				1739