Blame - src/kernel/linux/v4.14/fs/ceph/caps.c - T103

blob: ce94d09f6abf9b95930fb092e21fc364f2c8c22d [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	#include <linux/ceph/ceph_debug.h>
				3
				4	#include <linux/fs.h>
				5	#include <linux/kernel.h>
				6	#include <linux/sched/signal.h>
				7	#include <linux/slab.h>
				8	#include <linux/vmalloc.h>
				9	#include <linux/wait.h>
				10	#include <linux/writeback.h>
				11
				12	#include "super.h"
				13	#include "mds_client.h"
				14	#include "cache.h"
				15	#include <linux/ceph/decode.h>
				16	#include <linux/ceph/messenger.h>
				17
				18	/*
				19	* Capability management
				20	*
				21	* The Ceph metadata servers control client access to inode metadata
				22	* and file data by issuing capabilities, granting clients permission
				23	* to read and/or write both inode field and file data to OSDs
				24	* (storage nodes). Each capability consists of a set of bits
				25	* indicating which operations are allowed.
				26	*
				27	* If the client holds a *_SHARED cap, the client has a coherent value
				28	* that can be safely read from the cached inode.
				29	*
				30	* In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
				31	* client is allowed to change inode attributes (e.g., file size,
				32	* mtime), note its dirty state in the ceph_cap, and asynchronously
				33	* flush that metadata change to the MDS.
				34	*
				35	* In the event of a conflicting operation (perhaps by another
				36	* client), the MDS will revoke the conflicting client capabilities.
				37	*
				38	* In order for a client to cache an inode, it must hold a capability
				39	* with at least one MDS server. When inodes are released, release
				40	* notifications are batched and periodically sent en masse to the MDS
				41	* cluster to release server state.
				42	*/
				43
				44	static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
				45	static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
				46	struct ceph_mds_session *session,
				47	struct ceph_inode_info *ci,
				48	u64 oldest_flush_tid);
				49
				/*
				 * Pool of buffers used by ceph_cap_string() to render cap bitmasks as
				 * short human-readable strings for debugging output.
				 *
				 * last_cap_str is the index of the next buffer to hand out; it is
				 * advanced under cap_str_lock and wraps at MAX_CAP_STR.
				 */
				#define MAX_CAP_STR 20
				static char cap_str[MAX_CAP_STR][40];
				static DEFINE_SPINLOCK(cap_str_lock);
				static int last_cap_str;
				57
				58	static char gcap_string(char s, int c)
				59	{
				60	if (c & CEPH_CAP_GSHARED)
				61	*s++ = 's';
				62	if (c & CEPH_CAP_GEXCL)
				63	*s++ = 'x';
				64	if (c & CEPH_CAP_GCACHE)
				65	*s++ = 'c';
				66	if (c & CEPH_CAP_GRD)
				67	*s++ = 'r';
				68	if (c & CEPH_CAP_GWR)
				69	*s++ = 'w';
				70	if (c & CEPH_CAP_GBUFFER)
				71	*s++ = 'b';
				72	if (c & CEPH_CAP_GLAZYIO)
				73	*s++ = 'l';
				74	return s;
				75	}
				76
				77	const char *ceph_cap_string(int caps)
				78	{
				79	int i;
				80	char *s;
				81	int c;
				82
				83	spin_lock(&cap_str_lock);
				84	i = last_cap_str++;
				85	if (last_cap_str == MAX_CAP_STR)
				86	last_cap_str = 0;
				87	spin_unlock(&cap_str_lock);
				88
				89	s = cap_str[i];
				90
				91	if (caps & CEPH_CAP_PIN)
				92	*s++ = 'p';
				93
				94	c = (caps >> CEPH_CAP_SAUTH) & 3;
				95	if (c) {
				96	*s++ = 'A';
				97	s = gcap_string(s, c);
				98	}
				99
				100	c = (caps >> CEPH_CAP_SLINK) & 3;
				101	if (c) {
				102	*s++ = 'L';
				103	s = gcap_string(s, c);
				104	}
				105
				106	c = (caps >> CEPH_CAP_SXATTR) & 3;
				107	if (c) {
				108	*s++ = 'X';
				109	s = gcap_string(s, c);
				110	}
				111
				112	c = caps >> CEPH_CAP_SFILE;
				113	if (c) {
				114	*s++ = 'F';
				115	s = gcap_string(s, c);
				116	}
				117
				118	if (s == cap_str[i])
				119	*s++ = '-';
				120	*s = 0;
				121	return cap_str[i];
				122	}
				123
				124	void ceph_caps_init(struct ceph_mds_client *mdsc)
				125	{
				126	INIT_LIST_HEAD(&mdsc->caps_list);
				127	spin_lock_init(&mdsc->caps_list_lock);
				128	}
				129
				130	void ceph_caps_finalize(struct ceph_mds_client *mdsc)
				131	{
				132	struct ceph_cap *cap;
				133
				134	spin_lock(&mdsc->caps_list_lock);
				135	while (!list_empty(&mdsc->caps_list)) {
				136	cap = list_first_entry(&mdsc->caps_list,
				137	struct ceph_cap, caps_item);
				138	list_del(&cap->caps_item);
				139	kmem_cache_free(ceph_cap_cachep, cap);
				140	}
				141	mdsc->caps_total_count = 0;
				142	mdsc->caps_avail_count = 0;
				143	mdsc->caps_use_count = 0;
				144	mdsc->caps_reserve_count = 0;
				145	mdsc->caps_min_count = 0;
				146	spin_unlock(&mdsc->caps_list_lock);
				147	}
				148
				149	void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
				150	{
				151	spin_lock(&mdsc->caps_list_lock);
				152	mdsc->caps_min_count += delta;
				153	BUG_ON(mdsc->caps_min_count < 0);
				154	spin_unlock(&mdsc->caps_list_lock);
				155	}
				156
				157	void ceph_reserve_caps(struct ceph_mds_client *mdsc,
				158	struct ceph_cap_reservation *ctx, int need)
				159	{
				160	int i;
				161	struct ceph_cap *cap;
				162	int have;
				163	int alloc = 0;
				164	LIST_HEAD(newcaps);
				165
				166	dout("reserve caps ctx=%p need=%d\n", ctx, need);
				167
				168	/* first reserve any caps that are already allocated */
				169	spin_lock(&mdsc->caps_list_lock);
				170	if (mdsc->caps_avail_count >= need)
				171	have = need;
				172	else
				173	have = mdsc->caps_avail_count;
				174	mdsc->caps_avail_count -= have;
				175	mdsc->caps_reserve_count += have;
				176	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
				177	mdsc->caps_reserve_count +
				178	mdsc->caps_avail_count);
				179	spin_unlock(&mdsc->caps_list_lock);
				180
				181	for (i = have; i < need; i++) {
				182	cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
				183	if (!cap)
				184	break;
				185	list_add(&cap->caps_item, &newcaps);
				186	alloc++;
				187	}
				188	/* we didn't manage to reserve as much as we needed */
				189	if (have + alloc != need)
				190	pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
				191	ctx, need, have + alloc);
				192
				193	spin_lock(&mdsc->caps_list_lock);
				194	mdsc->caps_total_count += alloc;
				195	mdsc->caps_reserve_count += alloc;
				196	list_splice(&newcaps, &mdsc->caps_list);
				197
				198	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
				199	mdsc->caps_reserve_count +
				200	mdsc->caps_avail_count);
				201	spin_unlock(&mdsc->caps_list_lock);
				202
				203	ctx->count = need;
				204	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
				205	ctx, mdsc->caps_total_count, mdsc->caps_use_count,
				206	mdsc->caps_reserve_count, mdsc->caps_avail_count);
				207	}
				208
				209	int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
				210	struct ceph_cap_reservation *ctx)
				211	{
				212	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
				213	if (ctx->count) {
				214	spin_lock(&mdsc->caps_list_lock);
				215	BUG_ON(mdsc->caps_reserve_count < ctx->count);
				216	mdsc->caps_reserve_count -= ctx->count;
				217	mdsc->caps_avail_count += ctx->count;
				218	ctx->count = 0;
				219	dout("unreserve caps %d = %d used + %d resv + %d avail\n",
				220	mdsc->caps_total_count, mdsc->caps_use_count,
				221	mdsc->caps_reserve_count, mdsc->caps_avail_count);
				222	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
				223	mdsc->caps_reserve_count +
				224	mdsc->caps_avail_count);
				225	spin_unlock(&mdsc->caps_list_lock);
				226	}
				227	return 0;
				228	}
				229
				230	struct ceph_cap ceph_get_cap(struct ceph_mds_client mdsc,
				231	struct ceph_cap_reservation *ctx)
				232	{
				233	struct ceph_cap *cap = NULL;
				234
				235	/* temporary, until we do something about cap import/export */
				236	if (!ctx) {
				237	cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
				238	if (cap) {
				239	spin_lock(&mdsc->caps_list_lock);
				240	mdsc->caps_use_count++;
				241	mdsc->caps_total_count++;
				242	spin_unlock(&mdsc->caps_list_lock);
				243	}
				244	return cap;
				245	}
				246
				247	spin_lock(&mdsc->caps_list_lock);
				248	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
				249	ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
				250	mdsc->caps_reserve_count, mdsc->caps_avail_count);
				251	BUG_ON(!ctx->count);
				252	BUG_ON(ctx->count > mdsc->caps_reserve_count);
				253	BUG_ON(list_empty(&mdsc->caps_list));
				254
				255	ctx->count--;
				256	mdsc->caps_reserve_count--;
				257	mdsc->caps_use_count++;
				258
				259	cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
				260	list_del(&cap->caps_item);
				261
				262	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
				263	mdsc->caps_reserve_count + mdsc->caps_avail_count);
				264	spin_unlock(&mdsc->caps_list_lock);
				265	return cap;
				266	}
				267
				268	void ceph_put_cap(struct ceph_mds_client mdsc, struct ceph_cap cap)
				269	{
				270	spin_lock(&mdsc->caps_list_lock);
				271	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
				272	cap, mdsc->caps_total_count, mdsc->caps_use_count,
				273	mdsc->caps_reserve_count, mdsc->caps_avail_count);
				274	mdsc->caps_use_count--;
				275	/*
				276	* Keep some preallocated caps around (ceph_min_count), to
				277	* avoid lots of free/alloc churn.
				278	*/
				279	if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
				280	mdsc->caps_min_count) {
				281	mdsc->caps_total_count--;
				282	kmem_cache_free(ceph_cap_cachep, cap);
				283	} else {
				284	mdsc->caps_avail_count++;
				285	list_add(&cap->caps_item, &mdsc->caps_list);
				286	}
				287
				288	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
				289	mdsc->caps_reserve_count + mdsc->caps_avail_count);
				290	spin_unlock(&mdsc->caps_list_lock);
				291	}
				292
				293	void ceph_reservation_status(struct ceph_fs_client *fsc,
				294	int total, int avail, int used, int reserved,
				295	int *min)
				296	{
				297	struct ceph_mds_client *mdsc = fsc->mdsc;
				298
				299	if (total)
				300	*total = mdsc->caps_total_count;
				301	if (avail)
				302	*avail = mdsc->caps_avail_count;
				303	if (used)
				304	*used = mdsc->caps_use_count;
				305	if (reserved)
				306	*reserved = mdsc->caps_reserve_count;
				307	if (min)
				308	*min = mdsc->caps_min_count;
				309	}
				310
				311	/*
				312	* Find ceph_cap for given mds, if any.
				313	*
				314	* Called with i_ceph_lock held.
				315	*/
				316	static struct ceph_cap __get_cap_for_mds(struct ceph_inode_info ci, int mds)
				317	{
				318	struct ceph_cap *cap;
				319	struct rb_node *n = ci->i_caps.rb_node;
				320
				321	while (n) {
				322	cap = rb_entry(n, struct ceph_cap, ci_node);
				323	if (mds < cap->mds)
				324	n = n->rb_left;
				325	else if (mds > cap->mds)
				326	n = n->rb_right;
				327	else
				328	return cap;
				329	}
				330	return NULL;
				331	}
				332
				333	struct ceph_cap ceph_get_cap_for_mds(struct ceph_inode_info ci, int mds)
				334	{
				335	struct ceph_cap *cap;
				336
				337	spin_lock(&ci->i_ceph_lock);
				338	cap = __get_cap_for_mds(ci, mds);
				339	spin_unlock(&ci->i_ceph_lock);
				340	return cap;
				341	}
				342
				343	/*
				344	* Return id of any MDS with a cap, preferably FILE_WR\|BUFFER\|EXCL, else -1.
				345	*/
				346	static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
				347	{
				348	struct ceph_cap *cap;
				349	int mds = -1;
				350	struct rb_node *p;
				351
				352	/* prefer mds with WR\|BUFFER\|EXCL caps */
				353	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				354	cap = rb_entry(p, struct ceph_cap, ci_node);
				355	mds = cap->mds;
				356	if (cap->issued & (CEPH_CAP_FILE_WR \|
				357	CEPH_CAP_FILE_BUFFER \|
				358	CEPH_CAP_FILE_EXCL))
				359	break;
				360	}
				361	return mds;
				362	}
				363
				364	int ceph_get_cap_mds(struct inode *inode)
				365	{
				366	struct ceph_inode_info *ci = ceph_inode(inode);
				367	int mds;
				368	spin_lock(&ci->i_ceph_lock);
				369	mds = __ceph_get_cap_mds(ceph_inode(inode));
				370	spin_unlock(&ci->i_ceph_lock);
				371	return mds;
				372	}
				373
				374	/*
				375	* Called under i_ceph_lock.
				376	*/
				377	static void __insert_cap_node(struct ceph_inode_info *ci,
				378	struct ceph_cap *new)
				379	{
				380	struct rb_node **p = &ci->i_caps.rb_node;
				381	struct rb_node *parent = NULL;
				382	struct ceph_cap *cap = NULL;
				383
				384	while (*p) {
				385	parent = *p;
				386	cap = rb_entry(parent, struct ceph_cap, ci_node);
				387	if (new->mds < cap->mds)
				388	p = &(*p)->rb_left;
				389	else if (new->mds > cap->mds)
				390	p = &(*p)->rb_right;
				391	else
				392	BUG();
				393	}
				394
				395	rb_link_node(&new->ci_node, parent, p);
				396	rb_insert_color(&new->ci_node, &ci->i_caps);
				397	}
				398
				399	/*
				400	* (re)set cap hold timeouts, which control the delayed release
				401	* of unused caps back to the MDS. Should be called on cap use.
				402	*/
				403	static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
				404	struct ceph_inode_info *ci)
				405	{
				406	struct ceph_mount_options *ma = mdsc->fsc->mount_options;
				407
				408	ci->i_hold_caps_min = round_jiffies(jiffies +
				409	ma->caps_wanted_delay_min * HZ);
				410	ci->i_hold_caps_max = round_jiffies(jiffies +
				411	ma->caps_wanted_delay_max * HZ);
				412	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
				413	ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
				414	}
				415
				416	/*
				417	* (Re)queue cap at the end of the delayed cap release list.
				418	*
				419	* If I_FLUSH is set, leave the inode at the front of the list.
				420	*
				421	* Caller holds i_ceph_lock
				422	* -> we take mdsc->cap_delay_lock
				423	*/
				424	static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
				425	struct ceph_inode_info *ci)
				426	{
				427	__cap_set_timeouts(mdsc, ci);
				428	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
				429	ci->i_ceph_flags, ci->i_hold_caps_max);
				430	if (!mdsc->stopping) {
				431	spin_lock(&mdsc->cap_delay_lock);
				432	if (!list_empty(&ci->i_cap_delay_list)) {
				433	if (ci->i_ceph_flags & CEPH_I_FLUSH)
				434	goto no_change;
				435	list_del_init(&ci->i_cap_delay_list);
				436	}
				437	list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
				438	no_change:
				439	spin_unlock(&mdsc->cap_delay_lock);
				440	}
				441	}
				442
				443	/*
				444	* Queue an inode for immediate writeback. Mark inode with I_FLUSH,
				445	* indicating we should send a cap message to flush dirty metadata
				446	* asap, and move to the front of the delayed cap list.
				447	*/
				448	static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
				449	struct ceph_inode_info *ci)
				450	{
				451	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
				452	spin_lock(&mdsc->cap_delay_lock);
				453	ci->i_ceph_flags \|= CEPH_I_FLUSH;
				454	if (!list_empty(&ci->i_cap_delay_list))
				455	list_del_init(&ci->i_cap_delay_list);
				456	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
				457	spin_unlock(&mdsc->cap_delay_lock);
				458	}
				459
				460	/*
				461	* Cancel delayed work on cap.
				462	*
				463	* Caller must hold i_ceph_lock.
				464	*/
				465	static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
				466	struct ceph_inode_info *ci)
				467	{
				468	dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
				469	if (list_empty(&ci->i_cap_delay_list))
				470	return;
				471	spin_lock(&mdsc->cap_delay_lock);
				472	list_del_init(&ci->i_cap_delay_list);
				473	spin_unlock(&mdsc->cap_delay_lock);
				474	}
				475
				476	/*
				477	* Common issue checks for add_cap, handle_cap_grant.
				478	*/
				479	static void __check_cap_issue(struct ceph_inode_info ci, struct ceph_cap cap,
				480	unsigned issued)
				481	{
				482	unsigned had = __ceph_caps_issued(ci, NULL);
				483
				484	/*
				485	* Each time we receive FILE_CACHE anew, we increment
				486	* i_rdcache_gen.
				487	*/
				488	if ((issued & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) &&
				489	(had & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) == 0) {
				490	ci->i_rdcache_gen++;
				491	}
				492
				493	/*
				494	* If FILE_SHARED is newly issued, mark dir not complete. We don't
				495	* know what happened to this directory while we didn't have the cap.
				496	* If FILE_SHARED is being revoked, also mark dir not complete. It
				497	* stops on-going cached readdir.
				498	*/
				499	if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
				500	if (issued & CEPH_CAP_FILE_SHARED)
				501	ci->i_shared_gen++;
				502	if (S_ISDIR(ci->vfs_inode.i_mode)) {
				503	dout(" marking %p NOT complete\n", &ci->vfs_inode);
				504	__ceph_dir_clear_complete(ci);
				505	}
				506	}
				507	}
				508
				509	/*
				510	* Add a capability under the given MDS session.
				511	*
				512	* Caller should hold session snap_rwsem (read) and s_mutex.
				513	*
				514	* @fmode is the open file mode, if we are opening a file, otherwise
				515	* it is < 0. (This is so we can atomically add the cap and add an
				516	* open file reference to it.)
				517	*/
				518	void ceph_add_cap(struct inode *inode,
				519	struct ceph_mds_session *session, u64 cap_id,
				520	int fmode, unsigned issued, unsigned wanted,
				521	unsigned seq, unsigned mseq, u64 realmino, int flags,
				522	struct ceph_cap **new_cap)
				523	{
				524	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
				525	struct ceph_inode_info *ci = ceph_inode(inode);
				526	struct ceph_cap *cap;
				527	int mds = session->s_mds;
				528	int actual_wanted;
				529
				530	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
				531	session->s_mds, cap_id, ceph_cap_string(issued), seq);
				532
				533	/*
				534	* If we are opening the file, include file mode wanted bits
				535	* in wanted.
				536	*/
				537	if (fmode >= 0)
				538	wanted \|= ceph_caps_for_mode(fmode);
				539
				540	cap = __get_cap_for_mds(ci, mds);
				541	if (!cap) {
				542	cap = *new_cap;
				543	*new_cap = NULL;
				544
				545	cap->issued = 0;
				546	cap->implemented = 0;
				547	cap->mds = mds;
				548	cap->mds_wanted = 0;
				549	cap->mseq = 0;
				550
				551	cap->ci = ci;
				552	__insert_cap_node(ci, cap);
				553
				554	/* add to session cap list */
				555	cap->session = session;
				556	spin_lock(&session->s_cap_lock);
				557	list_add_tail(&cap->session_caps, &session->s_caps);
				558	session->s_nr_caps++;
				559	spin_unlock(&session->s_cap_lock);
				560	} else {
				561	/*
				562	* auth mds of the inode changed. we received the cap export
				563	* message, but still haven't received the cap import message.
				564	* handle_cap_export() updated the new auth MDS' cap.
				565	*
				566	* "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
				567	* a message that was send before the cap import message. So
				568	* don't remove caps.
				569	*/
				570	if (ceph_seq_cmp(seq, cap->seq) <= 0) {
				571	WARN_ON(cap != ci->i_auth_cap);
				572	WARN_ON(cap->cap_id != cap_id);
				573	seq = cap->seq;
				574	mseq = cap->mseq;
				575	issued \|= cap->issued;
				576	flags \|= CEPH_CAP_FLAG_AUTH;
				577	}
				578	}
				579
				580	if (!ci->i_snap_realm) {
				581	/*
				582	* add this inode to the appropriate snap realm
				583	*/
				584	struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
				585	realmino);
				586	if (realm) {
				587	spin_lock(&realm->inodes_with_caps_lock);
				588	ci->i_snap_realm = realm;
				589	list_add(&ci->i_snap_realm_item,
				590	&realm->inodes_with_caps);
				591	spin_unlock(&realm->inodes_with_caps_lock);
				592	} else {
				593	pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
				594	realmino);
				595	WARN_ON(!realm);
				596	}
				597	}
				598
				599	__check_cap_issue(ci, cap, issued);
				600
				601	/*
				602	* If we are issued caps we don't want, or the mds' wanted
				603	* value appears to be off, queue a check so we'll release
				604	* later and/or update the mds wanted value.
				605	*/
				606	actual_wanted = __ceph_caps_wanted(ci);
				607	if ((wanted & ~actual_wanted) \|\|
				608	(issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
				609	dout(" issued %s, mds wanted %s, actual %s, queueing\n",
				610	ceph_cap_string(issued), ceph_cap_string(wanted),
				611	ceph_cap_string(actual_wanted));
				612	__cap_delay_requeue(mdsc, ci);
				613	}
				614
				615	if (flags & CEPH_CAP_FLAG_AUTH) {
				616	if (!ci->i_auth_cap \|\|
				617	ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
				618	ci->i_auth_cap = cap;
				619	cap->mds_wanted = wanted;
				620	}
				621	} else {
				622	WARN_ON(ci->i_auth_cap == cap);
				623	}
				624
				625	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
				626	inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
				627	ceph_cap_string(issued\|cap->issued), seq, mds);
				628	cap->cap_id = cap_id;
				629	cap->issued = issued;
				630	cap->implemented \|= issued;
				631	if (ceph_seq_cmp(mseq, cap->mseq) > 0)
				632	cap->mds_wanted = wanted;
				633	else
				634	cap->mds_wanted \|= wanted;
				635	cap->seq = seq;
				636	cap->issue_seq = seq;
				637	cap->mseq = mseq;
				638	cap->cap_gen = session->s_cap_gen;
				639
				640	if (fmode >= 0)
				641	__ceph_get_fmode(ci, fmode);
				642	}
				643
				644	/*
				645	* Return true if cap has not timed out and belongs to the current
				646	* generation of the MDS session (i.e. has not gone 'stale' due to
				647	* us losing touch with the mds).
				648	*/
				649	static int __cap_is_valid(struct ceph_cap *cap)
				650	{
				651	unsigned long ttl;
				652	u32 gen;
				653
				654	spin_lock(&cap->session->s_gen_ttl_lock);
				655	gen = cap->session->s_cap_gen;
				656	ttl = cap->session->s_cap_ttl;
				657	spin_unlock(&cap->session->s_gen_ttl_lock);
				658
				659	if (cap->cap_gen < gen \|\| time_after_eq(jiffies, ttl)) {
				660	dout("__cap_is_valid %p cap %p issued %s "
				661	"but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
				662	cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
				663	return 0;
				664	}
				665
				666	return 1;
				667	}
				668
				669	/*
				670	* Return set of valid cap bits issued to us. Note that caps time
				671	* out, and may be invalidated in bulk if the client session times out
				672	* and session->s_cap_gen is bumped.
				673	*/
				674	int __ceph_caps_issued(struct ceph_inode_info ci, int implemented)
				675	{
				676	int have = ci->i_snap_caps;
				677	struct ceph_cap *cap;
				678	struct rb_node *p;
				679
				680	if (implemented)
				681	*implemented = 0;
				682	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				683	cap = rb_entry(p, struct ceph_cap, ci_node);
				684	if (!__cap_is_valid(cap))
				685	continue;
				686	dout("__ceph_caps_issued %p cap %p issued %s\n",
				687	&ci->vfs_inode, cap, ceph_cap_string(cap->issued));
				688	have \|= cap->issued;
				689	if (implemented)
				690	*implemented \|= cap->implemented;
				691	}
				692	/*
				693	* exclude caps issued by non-auth MDS, but are been revoking
				694	* by the auth MDS. The non-auth MDS should be revoking/exporting
				695	* these caps, but the message is delayed.
				696	*/
				697	if (ci->i_auth_cap) {
				698	cap = ci->i_auth_cap;
				699	have &= ~cap->implemented \| cap->issued;
				700	}
				701	return have;
				702	}
				703
				704	/*
				705	* Get cap bits issued by caps other than @ocap
				706	*/
				707	int __ceph_caps_issued_other(struct ceph_inode_info ci, struct ceph_cap ocap)
				708	{
				709	int have = ci->i_snap_caps;
				710	struct ceph_cap *cap;
				711	struct rb_node *p;
				712
				713	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				714	cap = rb_entry(p, struct ceph_cap, ci_node);
				715	if (cap == ocap)
				716	continue;
				717	if (!__cap_is_valid(cap))
				718	continue;
				719	have \|= cap->issued;
				720	}
				721	return have;
				722	}
				723
				724	/*
				725	* Move a cap to the end of the LRU (oldest caps at list head, newest
				726	* at list tail).
				727	*/
				728	static void __touch_cap(struct ceph_cap *cap)
				729	{
				730	struct ceph_mds_session *s = cap->session;
				731
				732	spin_lock(&s->s_cap_lock);
				733	if (!s->s_cap_iterator) {
				734	dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
				735	s->s_mds);
				736	list_move_tail(&cap->session_caps, &s->s_caps);
				737	} else {
				738	dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
				739	&cap->ci->vfs_inode, cap, s->s_mds);
				740	}
				741	spin_unlock(&s->s_cap_lock);
				742	}
				743
				744	/*
				745	* Check if we hold the given mask. If so, move the cap(s) to the
				746	* front of their respective LRUs. (This is the preferred way for
				747	* callers to check for caps they want.)
				748	*/
				749	int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
				750	{
				751	struct ceph_cap *cap;
				752	struct rb_node *p;
				753	int have = ci->i_snap_caps;
				754
				755	if ((have & mask) == mask) {
				756	dout("__ceph_caps_issued_mask %p snap issued %s"
				757	" (mask %s)\n", &ci->vfs_inode,
				758	ceph_cap_string(have),
				759	ceph_cap_string(mask));
				760	return 1;
				761	}
				762
				763	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				764	cap = rb_entry(p, struct ceph_cap, ci_node);
				765	if (!__cap_is_valid(cap))
				766	continue;
				767	if ((cap->issued & mask) == mask) {
				768	dout("__ceph_caps_issued_mask %p cap %p issued %s"
				769	" (mask %s)\n", &ci->vfs_inode, cap,
				770	ceph_cap_string(cap->issued),
				771	ceph_cap_string(mask));
				772	if (touch)
				773	__touch_cap(cap);
				774	return 1;
				775	}
				776
				777	/* does a combination of caps satisfy mask? */
				778	have \|= cap->issued;
				779	if ((have & mask) == mask) {
				780	dout("__ceph_caps_issued_mask %p combo issued %s"
				781	" (mask %s)\n", &ci->vfs_inode,
				782	ceph_cap_string(cap->issued),
				783	ceph_cap_string(mask));
				784	if (touch) {
				785	struct rb_node *q;
				786
				787	/* touch this + preceding caps */
				788	__touch_cap(cap);
				789	for (q = rb_first(&ci->i_caps); q != p;
				790	q = rb_next(q)) {
				791	cap = rb_entry(q, struct ceph_cap,
				792	ci_node);
				793	if (!__cap_is_valid(cap))
				794	continue;
				795	__touch_cap(cap);
				796	}
				797	}
				798	return 1;
				799	}
				800	}
				801
				802	return 0;
				803	}
				804
				805	/*
				806	* Return true if mask caps are currently being revoked by an MDS.
				807	*/
				808	int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
				809	struct ceph_cap *ocap, int mask)
				810	{
				811	struct ceph_cap *cap;
				812	struct rb_node *p;
				813
				814	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				815	cap = rb_entry(p, struct ceph_cap, ci_node);
				816	if (cap != ocap &&
				817	(cap->implemented & ~cap->issued & mask))
				818	return 1;
				819	}
				820	return 0;
				821	}
				822
				823	int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
				824	{
				825	struct inode *inode = &ci->vfs_inode;
				826	int ret;
				827
				828	spin_lock(&ci->i_ceph_lock);
				829	ret = __ceph_caps_revoking_other(ci, NULL, mask);
				830	spin_unlock(&ci->i_ceph_lock);
				831	dout("ceph_caps_revoking %p %s = %d\n", inode,
				832	ceph_cap_string(mask), ret);
				833	return ret;
				834	}
				835
				836	int __ceph_caps_used(struct ceph_inode_info *ci)
				837	{
				838	int used = 0;
				839	if (ci->i_pin_ref)
				840	used \|= CEPH_CAP_PIN;
				841	if (ci->i_rd_ref)
				842	used \|= CEPH_CAP_FILE_RD;
				843	if (ci->i_rdcache_ref \|\|
				844	(!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
				845	ci->vfs_inode.i_data.nrpages))
				846	used \|= CEPH_CAP_FILE_CACHE;
				847	if (ci->i_wr_ref)
				848	used \|= CEPH_CAP_FILE_WR;
				849	if (ci->i_wb_ref \|\| ci->i_wrbuffer_ref)
				850	used \|= CEPH_CAP_FILE_BUFFER;
				851	return used;
				852	}
				853
				854	/*
				855	* wanted, by virtue of open file modes
				856	*/
				857	int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
				858	{
				859	int i, bits = 0;
				860	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
				861	if (ci->i_nr_by_mode[i])
				862	bits \|= 1 << i;
				863	}
				864	if (bits == 0)
				865	return 0;
				866	return ceph_caps_for_mode(bits >> 1);
				867	}
				868
				869	/*
				870	* Return caps we have registered with the MDS(s) as 'wanted'.
				871	*/
				872	int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
				873	{
				874	struct ceph_cap *cap;
				875	struct rb_node *p;
				876	int mds_wanted = 0;
				877
				878	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				879	cap = rb_entry(p, struct ceph_cap, ci_node);
				880	if (check && !__cap_is_valid(cap))
				881	continue;
				882	if (cap == ci->i_auth_cap)
				883	mds_wanted \|= cap->mds_wanted;
				884	else
				885	mds_wanted \|= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
				886	}
				887	return mds_wanted;
				888	}
				889
				890	/*
				891	* called under i_ceph_lock
				892	*/
				893	static int __ceph_is_any_caps(struct ceph_inode_info *ci)
				894	{
				895	return !RB_EMPTY_ROOT(&ci->i_caps);
				896	}
				897
				898	int ceph_is_any_caps(struct inode *inode)
				899	{
				900	struct ceph_inode_info *ci = ceph_inode(inode);
				901	int ret;
				902
				903	spin_lock(&ci->i_ceph_lock);
				904	ret = __ceph_is_any_caps(ci);
				905	spin_unlock(&ci->i_ceph_lock);
				906
				907	return ret;
				908	}
				909
				910	static void drop_inode_snap_realm(struct ceph_inode_info *ci)
				911	{
				912	struct ceph_snap_realm *realm = ci->i_snap_realm;
				913	spin_lock(&realm->inodes_with_caps_lock);
				914	list_del_init(&ci->i_snap_realm_item);
				915	ci->i_snap_realm_counter++;
				916	ci->i_snap_realm = NULL;
				917	spin_unlock(&realm->inodes_with_caps_lock);
				918	ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
				919	realm);
				920	}
				921
				922	/*
				923	* Remove a cap. Take steps to deal with a racing iterate_session_caps.
				924	*
				925	* caller should hold i_ceph_lock.
				926	* caller will not hold session s_mutex if called from destroy_inode.
				927	*/
				928	void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
				929	{
				930	struct ceph_mds_session *session = cap->session;
				931	struct ceph_inode_info *ci = cap->ci;
				932	struct ceph_mds_client *mdsc =
				933	ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
				934	int removed = 0;
				935
				936	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
				937
				938	/* remove from inode's cap rbtree, and clear auth cap */
				939	rb_erase(&cap->ci_node, &ci->i_caps);
				940	if (ci->i_auth_cap == cap)
				941	ci->i_auth_cap = NULL;
				942
				943	/* remove from session list */
				944	spin_lock(&session->s_cap_lock);
				945	if (session->s_cap_iterator == cap) {
				946	/* not yet, we are iterating over this very cap */
				947	dout("__ceph_remove_cap delaying %p removal from session %p\n",
				948	cap, cap->session);
				949	} else {
				950	list_del_init(&cap->session_caps);
				951	session->s_nr_caps--;
				952	cap->session = NULL;
				953	removed = 1;
				954	}
				955	/* protect backpointer with s_cap_lock: see iterate_session_caps */
				956	cap->ci = NULL;
				957
				958	/*
				959	* s_cap_reconnect is protected by s_cap_lock. no one changes
				960	* s_cap_gen while session is in the reconnect state.
				961	*/
				962	if (queue_release &&
				963	(!session->s_cap_reconnect \|\| cap->cap_gen == session->s_cap_gen)) {
				964	cap->queue_release = 1;
				965	if (removed) {
				966	list_add_tail(&cap->session_caps,
				967	&session->s_cap_releases);
				968	session->s_num_cap_releases++;
				969	removed = 0;
				970	}
				971	} else {
				972	cap->queue_release = 0;
				973	}
				974	cap->cap_ino = ci->i_vino.ino;
				975
				976	spin_unlock(&session->s_cap_lock);
				977
				978	if (removed)
				979	ceph_put_cap(mdsc, cap);
				980
				981	/* when reconnect denied, we remove session caps forcibly,
				982	* i_wr_ref can be non-zero. If there are ongoing write,
				983	* keep i_snap_realm.
				984	*/
				985	if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
				986	drop_inode_snap_realm(ci);
				987
				988	if (!__ceph_is_any_real_caps(ci))
				989	__cap_delay_cancel(mdsc, ci);
				990	}
				991
				992	struct cap_msg_args {
				993	struct ceph_mds_session *session;
				994	u64 ino, cid, follows;
				995	u64 flush_tid, oldest_flush_tid, size, max_size;
				996	u64 xattr_version;
				997	struct ceph_buffer *xattr_buf;
				998	struct timespec atime, mtime, ctime;
				999	int op, caps, wanted, dirty;
				1000	u32 seq, issue_seq, mseq, time_warp_seq;
				1001	u32 flags;
				1002	kuid_t uid;
				1003	kgid_t gid;
				1004	umode_t mode;
				1005	bool inline_data;
				1006	};
				1007
				1008	/*
				1009	* Build and send a cap message to the given MDS.
				1010	*
				1011	* Caller should be holding s_mutex.
				1012	*/
				1013	static int send_cap_msg(struct cap_msg_args *arg)
				1014	{
				1015	struct ceph_mds_caps *fc;
				1016	struct ceph_msg *msg;
				1017	void *p;
				1018	size_t extra_len;
				1019	struct timespec zerotime = {0};
				1020	struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
				1021
				1022	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
				1023	" seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
				1024	" xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
				1025	arg->cid, arg->ino, ceph_cap_string(arg->caps),
				1026	ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
				1027	arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
				1028	arg->mseq, arg->follows, arg->size, arg->max_size,
				1029	arg->xattr_version,
				1030	arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
				1031
				1032	/* flock buffer size + inline version + inline data size +
				1033	* osd_epoch_barrier + oldest_flush_tid */
				1034	extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
				1035	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
				1036	GFP_NOFS, false);
				1037	if (!msg)
				1038	return -ENOMEM;
				1039
				1040	msg->hdr.version = cpu_to_le16(10);
				1041	msg->hdr.tid = cpu_to_le64(arg->flush_tid);
				1042
				1043	fc = msg->front.iov_base;
				1044	memset(fc, 0, sizeof(*fc));
				1045
				1046	fc->cap_id = cpu_to_le64(arg->cid);
				1047	fc->op = cpu_to_le32(arg->op);
				1048	fc->seq = cpu_to_le32(arg->seq);
				1049	fc->issue_seq = cpu_to_le32(arg->issue_seq);
				1050	fc->migrate_seq = cpu_to_le32(arg->mseq);
				1051	fc->caps = cpu_to_le32(arg->caps);
				1052	fc->wanted = cpu_to_le32(arg->wanted);
				1053	fc->dirty = cpu_to_le32(arg->dirty);
				1054	fc->ino = cpu_to_le64(arg->ino);
				1055	fc->snap_follows = cpu_to_le64(arg->follows);
				1056
				1057	fc->size = cpu_to_le64(arg->size);
				1058	fc->max_size = cpu_to_le64(arg->max_size);
				1059	ceph_encode_timespec(&fc->mtime, &arg->mtime);
				1060	ceph_encode_timespec(&fc->atime, &arg->atime);
				1061	ceph_encode_timespec(&fc->ctime, &arg->ctime);
				1062	fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);
				1063
				1064	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
				1065	fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
				1066	fc->mode = cpu_to_le32(arg->mode);
				1067
				1068	fc->xattr_version = cpu_to_le64(arg->xattr_version);
				1069	if (arg->xattr_buf) {
				1070	msg->middle = ceph_buffer_get(arg->xattr_buf);
				1071	fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
				1072	msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
				1073	}
				1074
				1075	p = fc + 1;
				1076	/* flock buffer size (version 2) */
				1077	ceph_encode_32(&p, 0);
				1078	/* inline version (version 4) */
				1079	ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
				1080	/* inline data size */
				1081	ceph_encode_32(&p, 0);
				1082	/*
				1083	* osd_epoch_barrier (version 5)
				1084	* The epoch_barrier is protected osdc->lock, so READ_ONCE here in
				1085	* case it was recently changed
				1086	*/
				1087	ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
				1088	/* oldest_flush_tid (version 6) */
				1089	ceph_encode_64(&p, arg->oldest_flush_tid);
				1090
				1091	/*
				1092	* caller_uid/caller_gid (version 7)
				1093	*
				1094	* Currently, we don't properly track which caller dirtied the caps
				1095	* last, and force a flush of them when there is a conflict. For now,
				1096	* just set this to 0:0, to emulate how the MDS has worked up to now.
				1097	*/
				1098	ceph_encode_32(&p, 0);
				1099	ceph_encode_32(&p, 0);
				1100
				1101	/* pool namespace (version 8) (mds always ignores this) */
				1102	ceph_encode_32(&p, 0);
				1103
				1104	/*
				1105	* btime and change_attr (version 9)
				1106	*
				1107	* We just zero these out for now, as the MDS ignores them unless
				1108	* the requisite feature flags are set (which we don't do yet).
				1109	*/
				1110	ceph_encode_timespec(p, &zerotime);
				1111	p += sizeof(struct ceph_timespec);
				1112	ceph_encode_64(&p, 0);
				1113
				1114	/* Advisory flags (version 10) */
				1115	ceph_encode_32(&p, arg->flags);
				1116
				1117	ceph_con_send(&arg->session->s_con, msg);
				1118	return 0;
				1119	}
				1120
				1121	/*
				1122	* Queue cap releases when an inode is dropped from our cache.
				1123	*/
				1124	void ceph_queue_caps_release(struct inode *inode)
				1125	{
				1126	struct ceph_inode_info *ci = ceph_inode(inode);
				1127	struct rb_node *p;
				1128
				1129	/* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
				1130	* may call __ceph_caps_issued_mask() on a freeing inode. */
				1131	spin_lock(&ci->i_ceph_lock);
				1132	p = rb_first(&ci->i_caps);
				1133	while (p) {
				1134	struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
				1135	p = rb_next(p);
				1136	__ceph_remove_cap(cap, true);
				1137	}
				1138	spin_unlock(&ci->i_ceph_lock);
				1139	}
				1140
				1141	/*
				1142	* Send a cap msg on the given inode. Update our caps state, then
				1143	* drop i_ceph_lock and send the message.
				1144	*
				1145	* Make note of max_size reported/requested from mds, revoked caps
				1146	* that have now been implemented.
				1147	*
				1148	* Make half-hearted attempt ot to invalidate page cache if we are
				1149	* dropping RDCACHE. Note that this will leave behind locked pages
				1150	* that we'll then need to deal with elsewhere.
				1151	*
				1152	* Return non-zero if delayed release, or we experienced an error
				1153	* such that the caller should requeue + retry later.
				1154	*
				1155	* called with i_ceph_lock, then drops it.
				1156	* caller should hold snap_rwsem (read), s_mutex.
				1157	*/
				1158	static int __send_cap(struct ceph_mds_client mdsc, struct ceph_cap cap,
				1159	int op, bool sync, int used, int want, int retain,
				1160	int flushing, u64 flush_tid, u64 oldest_flush_tid)
				1161	__releases(cap->ci->i_ceph_lock)
				1162	{
				1163	struct ceph_inode_info *ci = cap->ci;
				1164	struct inode *inode = &ci->vfs_inode;
				1165	struct ceph_buffer *old_blob = NULL;
				1166	struct cap_msg_args arg;
				1167	int held, revoking, dropping;
				1168	int wake = 0;
				1169	int delayed = 0;
				1170	int ret;
				1171
				1172	held = cap->issued \| cap->implemented;
				1173	revoking = cap->implemented & ~cap->issued;
				1174	retain &= ~revoking;
				1175	dropping = cap->issued & ~retain;
				1176
				1177	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
				1178	inode, cap, cap->session,
				1179	ceph_cap_string(held), ceph_cap_string(held & retain),
				1180	ceph_cap_string(revoking));
				1181	BUG_ON((retain & CEPH_CAP_PIN) == 0);
				1182
				1183	arg.session = cap->session;
				1184
				1185	/* don't release wanted unless we've waited a bit. */
				1186	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
				1187	time_before(jiffies, ci->i_hold_caps_min)) {
				1188	dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
				1189	ceph_cap_string(cap->issued),
				1190	ceph_cap_string(cap->issued & retain),
				1191	ceph_cap_string(cap->mds_wanted),
				1192	ceph_cap_string(want));
				1193	want \|= cap->mds_wanted;
				1194	retain \|= cap->issued;
				1195	delayed = 1;
				1196	}
				1197	ci->i_ceph_flags &= ~(CEPH_I_NODELAY \| CEPH_I_FLUSH);
				1198	if (want & ~cap->mds_wanted) {
				1199	/* user space may open/close single file frequently.
				1200	* This avoids droping mds_wanted immediately after
				1201	* requesting new mds_wanted.
				1202	*/
				1203	__cap_set_timeouts(mdsc, ci);
				1204	}
				1205
				1206	cap->issued &= retain; /* drop bits we don't want */
				1207	if (cap->implemented & ~cap->issued) {
				1208	/*
				1209	* Wake up any waiters on wanted -> needed transition.
				1210	* This is due to the weird transition from buffered
				1211	* to sync IO... we need to flush dirty pages _before_
				1212	* allowing sync writes to avoid reordering.
				1213	*/
				1214	wake = 1;
				1215	}
				1216	cap->implemented &= cap->issued \| used;
				1217	cap->mds_wanted = want;
				1218
				1219	arg.ino = ceph_vino(inode).ino;
				1220	arg.cid = cap->cap_id;
				1221	arg.follows = flushing ? ci->i_head_snapc->seq : 0;
				1222	arg.flush_tid = flush_tid;
				1223	arg.oldest_flush_tid = oldest_flush_tid;
				1224
				1225	arg.size = inode->i_size;
				1226	ci->i_reported_size = arg.size;
				1227	arg.max_size = ci->i_wanted_max_size;
				1228	ci->i_requested_max_size = arg.max_size;
				1229
				1230	if (flushing & CEPH_CAP_XATTR_EXCL) {
				1231	old_blob = __ceph_build_xattrs_blob(ci);
				1232	arg.xattr_version = ci->i_xattrs.version;
				1233	arg.xattr_buf = ci->i_xattrs.blob;
				1234	} else {
				1235	arg.xattr_buf = NULL;
				1236	}
				1237
				1238	arg.mtime = inode->i_mtime;
				1239	arg.atime = inode->i_atime;
				1240	arg.ctime = inode->i_ctime;
				1241
				1242	arg.op = op;
				1243	arg.caps = cap->implemented;
				1244	arg.wanted = want;
				1245	arg.dirty = flushing;
				1246
				1247	arg.seq = cap->seq;
				1248	arg.issue_seq = cap->issue_seq;
				1249	arg.mseq = cap->mseq;
				1250	arg.time_warp_seq = ci->i_time_warp_seq;
				1251
				1252	arg.uid = inode->i_uid;
				1253	arg.gid = inode->i_gid;
				1254	arg.mode = inode->i_mode;
				1255
				1256	arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
				1257	if (list_empty(&ci->i_cap_snaps))
				1258	arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP;
				1259	else
				1260	arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
				1261	if (sync)
				1262	arg.flags \|= CEPH_CLIENT_CAPS_SYNC;
				1263
				1264	spin_unlock(&ci->i_ceph_lock);
				1265
				1266	ceph_buffer_put(old_blob);
				1267
				1268	ret = send_cap_msg(&arg);
				1269	if (ret < 0) {
				1270	dout("error sending cap msg, must requeue %p\n", inode);
				1271	delayed = 1;
				1272	}
				1273
				1274	if (wake)
				1275	wake_up_all(&ci->i_cap_wq);
				1276
				1277	return delayed;
				1278	}
				1279
				1280	static inline int __send_flush_snap(struct inode *inode,
				1281	struct ceph_mds_session *session,
				1282	struct ceph_cap_snap *capsnap,
				1283	u32 mseq, u64 oldest_flush_tid)
				1284	{
				1285	struct cap_msg_args arg;
				1286
				1287	arg.session = session;
				1288	arg.ino = ceph_vino(inode).ino;
				1289	arg.cid = 0;
				1290	arg.follows = capsnap->follows;
				1291	arg.flush_tid = capsnap->cap_flush.tid;
				1292	arg.oldest_flush_tid = oldest_flush_tid;
				1293
				1294	arg.size = capsnap->size;
				1295	arg.max_size = 0;
				1296	arg.xattr_version = capsnap->xattr_version;
				1297	arg.xattr_buf = capsnap->xattr_blob;
				1298
				1299	arg.atime = capsnap->atime;
				1300	arg.mtime = capsnap->mtime;
				1301	arg.ctime = capsnap->ctime;
				1302
				1303	arg.op = CEPH_CAP_OP_FLUSHSNAP;
				1304	arg.caps = capsnap->issued;
				1305	arg.wanted = 0;
				1306	arg.dirty = capsnap->dirty;
				1307
				1308	arg.seq = 0;
				1309	arg.issue_seq = 0;
				1310	arg.mseq = mseq;
				1311	arg.time_warp_seq = capsnap->time_warp_seq;
				1312
				1313	arg.uid = capsnap->uid;
				1314	arg.gid = capsnap->gid;
				1315	arg.mode = capsnap->mode;
				1316
				1317	arg.inline_data = capsnap->inline_data;
				1318	arg.flags = 0;
				1319
				1320	return send_cap_msg(&arg);
				1321	}
				1322
				1323	/*
				1324	* When a snapshot is taken, clients accumulate dirty metadata on
				1325	* inodes with capabilities in ceph_cap_snaps to describe the file
				1326	* state at the time the snapshot was taken. This must be flushed
				1327	* asynchronously back to the MDS once sync writes complete and dirty
				1328	* data is written out.
				1329	*
				1330	* Called under i_ceph_lock. Takes s_mutex as needed.
				1331	*/
				1332	static void __ceph_flush_snaps(struct ceph_inode_info *ci,
				1333	struct ceph_mds_session *session)
				1334	__releases(ci->i_ceph_lock)
				1335	__acquires(ci->i_ceph_lock)
				1336	{
				1337	struct inode *inode = &ci->vfs_inode;
				1338	struct ceph_mds_client *mdsc = session->s_mdsc;
				1339	struct ceph_cap_snap *capsnap;
				1340	u64 oldest_flush_tid = 0;
				1341	u64 first_tid = 1, last_tid = 0;
				1342
				1343	dout("__flush_snaps %p session %p\n", inode, session);
				1344
				1345	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				1346	/*
				1347	* we need to wait for sync writes to complete and for dirty
				1348	* pages to be written out.
				1349	*/
				1350	if (capsnap->dirty_pages \|\| capsnap->writing)
				1351	break;
				1352
				1353	/* should be removed by ceph_try_drop_cap_snap() */
				1354	BUG_ON(!capsnap->need_flush);
				1355
				1356	/* only flush each capsnap once */
				1357	if (capsnap->cap_flush.tid > 0) {
				1358	dout(" already flushed %p, skipping\n", capsnap);
				1359	continue;
				1360	}
				1361
				1362	spin_lock(&mdsc->cap_dirty_lock);
				1363	capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
				1364	list_add_tail(&capsnap->cap_flush.g_list,
				1365	&mdsc->cap_flush_list);
				1366	if (oldest_flush_tid == 0)
				1367	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
				1368	if (list_empty(&ci->i_flushing_item)) {
				1369	list_add_tail(&ci->i_flushing_item,
				1370	&session->s_cap_flushing);
				1371	}
				1372	spin_unlock(&mdsc->cap_dirty_lock);
				1373
				1374	list_add_tail(&capsnap->cap_flush.i_list,
				1375	&ci->i_cap_flush_list);
				1376
				1377	if (first_tid == 1)
				1378	first_tid = capsnap->cap_flush.tid;
				1379	last_tid = capsnap->cap_flush.tid;
				1380	}
				1381
				1382	ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
				1383
				1384	while (first_tid <= last_tid) {
				1385	struct ceph_cap *cap = ci->i_auth_cap;
				1386	struct ceph_cap_flush *cf;
				1387	int ret;
				1388
				1389	if (!(cap && cap->session == session)) {
				1390	dout("__flush_snaps %p auth cap %p not mds%d, "
				1391	"stop\n", inode, cap, session->s_mds);
				1392	break;
				1393	}
				1394
				1395	ret = -ENOENT;
				1396	list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
				1397	if (cf->tid >= first_tid) {
				1398	ret = 0;
				1399	break;
				1400	}
				1401	}
				1402	if (ret < 0)
				1403	break;
				1404
				1405	first_tid = cf->tid + 1;
				1406
				1407	capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
				1408	refcount_inc(&capsnap->nref);
				1409	spin_unlock(&ci->i_ceph_lock);
				1410
				1411	dout("__flush_snaps %p capsnap %p tid %llu %s\n",
				1412	inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
				1413
				1414	ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
				1415	oldest_flush_tid);
				1416	if (ret < 0) {
				1417	pr_err("__flush_snaps: error sending cap flushsnap, "
				1418	"ino (%llx.%llx) tid %llu follows %llu\n",
				1419	ceph_vinop(inode), cf->tid, capsnap->follows);
				1420	}
				1421
				1422	ceph_put_cap_snap(capsnap);
				1423	spin_lock(&ci->i_ceph_lock);
				1424	}
				1425	}
				1426
				1427	void ceph_flush_snaps(struct ceph_inode_info *ci,
				1428	struct ceph_mds_session **psession)
				1429	{
				1430	struct inode *inode = &ci->vfs_inode;
				1431	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
				1432	struct ceph_mds_session *session = NULL;
				1433	int mds;
				1434
				1435	dout("ceph_flush_snaps %p\n", inode);
				1436	if (psession)
				1437	session = *psession;
				1438	retry:
				1439	spin_lock(&ci->i_ceph_lock);
				1440	if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
				1441	dout(" no capsnap needs flush, doing nothing\n");
				1442	goto out;
				1443	}
				1444	if (!ci->i_auth_cap) {
				1445	dout(" no auth cap (migrating?), doing nothing\n");
				1446	goto out;
				1447	}
				1448
				1449	mds = ci->i_auth_cap->session->s_mds;
				1450	if (session && session->s_mds != mds) {
				1451	dout(" oops, wrong session %p mutex\n", session);
				1452	mutex_unlock(&session->s_mutex);
				1453	ceph_put_mds_session(session);
				1454	session = NULL;
				1455	}
				1456	if (!session) {
				1457	spin_unlock(&ci->i_ceph_lock);
				1458	mutex_lock(&mdsc->mutex);
				1459	session = __ceph_lookup_mds_session(mdsc, mds);
				1460	mutex_unlock(&mdsc->mutex);
				1461	if (session) {
				1462	dout(" inverting session/ino locks on %p\n", session);
				1463	mutex_lock(&session->s_mutex);
				1464	}
				1465	goto retry;
				1466	}
				1467
				1468	// make sure flushsnap messages are sent in proper order.
				1469	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
				1470	__kick_flushing_caps(mdsc, session, ci, 0);
				1471	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
				1472	}
				1473
				1474	__ceph_flush_snaps(ci, session);
				1475	out:
				1476	spin_unlock(&ci->i_ceph_lock);
				1477
				1478	if (psession) {
				1479	*psession = session;
				1480	} else if (session) {
				1481	mutex_unlock(&session->s_mutex);
				1482	ceph_put_mds_session(session);
				1483	}
				1484	/* we flushed them all; remove this inode from the queue */
				1485	spin_lock(&mdsc->snap_flush_lock);
				1486	list_del_init(&ci->i_snap_flush_item);
				1487	spin_unlock(&mdsc->snap_flush_lock);
				1488	}
				1489
				1490	/*
				1491	* Mark caps dirty. If inode is newly dirty, return the dirty flags.
				1492	* Caller is then responsible for calling __mark_inode_dirty with the
				1493	* returned flags value.
				1494	*/
				1495	int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
				1496	struct ceph_cap_flush **pcf)
				1497	{
				1498	struct ceph_mds_client *mdsc =
				1499	ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
				1500	struct inode *inode = &ci->vfs_inode;
				1501	int was = ci->i_dirty_caps;
				1502	int dirty = 0;
				1503
				1504	if (!ci->i_auth_cap) {
				1505	pr_warn("__mark_dirty_caps %p %llx mask %s, "
				1506	"but no auth cap (session was closed?)\n",
				1507	inode, ceph_ino(inode), ceph_cap_string(mask));
				1508	return 0;
				1509	}
				1510
				1511	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
				1512	ceph_cap_string(mask), ceph_cap_string(was),
				1513	ceph_cap_string(was \| mask));
				1514	ci->i_dirty_caps \|= mask;
				1515	if (was == 0) {
				1516	WARN_ON_ONCE(ci->i_prealloc_cap_flush);
				1517	swap(ci->i_prealloc_cap_flush, *pcf);
				1518
				1519	if (!ci->i_head_snapc) {
				1520	WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
				1521	ci->i_head_snapc = ceph_get_snap_context(
				1522	ci->i_snap_realm->cached_context);
				1523	}
				1524	dout(" inode %p now dirty snapc %p auth cap %p\n",
				1525	&ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
				1526	BUG_ON(!list_empty(&ci->i_dirty_item));
				1527	spin_lock(&mdsc->cap_dirty_lock);
				1528	list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
				1529	spin_unlock(&mdsc->cap_dirty_lock);
				1530	if (ci->i_flushing_caps == 0) {
				1531	ihold(inode);
				1532	dirty \|= I_DIRTY_SYNC;
				1533	}
				1534	} else {
				1535	WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
				1536	}
				1537	BUG_ON(list_empty(&ci->i_dirty_item));
				1538	if (((was \| ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
				1539	(mask & CEPH_CAP_FILE_BUFFER))
				1540	dirty \|= I_DIRTY_DATASYNC;
				1541	__cap_delay_requeue(mdsc, ci);
				1542	return dirty;
				1543	}
				1544
				1545	struct ceph_cap_flush *ceph_alloc_cap_flush(void)
				1546	{
				1547	return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
				1548	}
				1549
				1550	void ceph_free_cap_flush(struct ceph_cap_flush *cf)
				1551	{
				1552	if (cf)
				1553	kmem_cache_free(ceph_cap_flush_cachep, cf);
				1554	}
				1555
				1556	static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
				1557	{
				1558	if (!list_empty(&mdsc->cap_flush_list)) {
				1559	struct ceph_cap_flush *cf =
				1560	list_first_entry(&mdsc->cap_flush_list,
				1561	struct ceph_cap_flush, g_list);
				1562	return cf->tid;
				1563	}
				1564	return 0;
				1565	}
				1566
				1567	/*
				1568	* Remove cap_flush from the mdsc's or inode's flushing cap list.
				1569	* Return true if caller needs to wake up flush waiters.
				1570	*/
				1571	static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
				1572	struct ceph_inode_info *ci,
				1573	struct ceph_cap_flush *cf)
				1574	{
				1575	struct ceph_cap_flush *prev;
				1576	bool wake = cf->wake;
				1577	if (mdsc) {
				1578	/* are there older pending cap flushes? */
				1579	if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
				1580	prev = list_prev_entry(cf, g_list);
				1581	prev->wake = true;
				1582	wake = false;
				1583	}
				1584	list_del(&cf->g_list);
				1585	} else if (ci) {
				1586	if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
				1587	prev = list_prev_entry(cf, i_list);
				1588	prev->wake = true;
				1589	wake = false;
				1590	}
				1591	list_del(&cf->i_list);
				1592	} else {
				1593	BUG_ON(1);
				1594	}
				1595	return wake;
				1596	}
				1597
				1598	/*
				1599	* Add dirty inode to the flushing list. Assigned a seq number so we
				1600	* can wait for caps to flush without starving.
				1601	*
				1602	* Called under i_ceph_lock.
				1603	*/
				1604	static int __mark_caps_flushing(struct inode *inode,
				1605	struct ceph_mds_session *session, bool wake,
				1606	u64 flush_tid, u64 oldest_flush_tid)
				1607	{
				1608	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
				1609	struct ceph_inode_info *ci = ceph_inode(inode);
				1610	struct ceph_cap_flush *cf = NULL;
				1611	int flushing;
				1612
				1613	BUG_ON(ci->i_dirty_caps == 0);
				1614	BUG_ON(list_empty(&ci->i_dirty_item));
				1615	BUG_ON(!ci->i_prealloc_cap_flush);
				1616
				1617	flushing = ci->i_dirty_caps;
				1618	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
				1619	ceph_cap_string(flushing),
				1620	ceph_cap_string(ci->i_flushing_caps),
				1621	ceph_cap_string(ci->i_flushing_caps \| flushing));
				1622	ci->i_flushing_caps \|= flushing;
				1623	ci->i_dirty_caps = 0;
				1624	dout(" inode %p now !dirty\n", inode);
				1625
				1626	swap(cf, ci->i_prealloc_cap_flush);
				1627	cf->caps = flushing;
				1628	cf->wake = wake;
				1629
				1630	spin_lock(&mdsc->cap_dirty_lock);
				1631	list_del_init(&ci->i_dirty_item);
				1632
				1633	cf->tid = ++mdsc->last_cap_flush_tid;
				1634	list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
				1635	*oldest_flush_tid = __get_oldest_flush_tid(mdsc);
				1636
				1637	if (list_empty(&ci->i_flushing_item)) {
				1638	list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
				1639	mdsc->num_cap_flushing++;
				1640	}
				1641	spin_unlock(&mdsc->cap_dirty_lock);
				1642
				1643	list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
				1644
				1645	*flush_tid = cf->tid;
				1646	return flushing;
				1647	}
				1648
				1649	/*
				1650	* try to invalidate mapping pages without blocking.
				1651	*/
				1652	static int try_nonblocking_invalidate(struct inode *inode)
				1653	{
				1654	struct ceph_inode_info *ci = ceph_inode(inode);
				1655	u32 invalidating_gen = ci->i_rdcache_gen;
				1656
				1657	spin_unlock(&ci->i_ceph_lock);
				1658	invalidate_mapping_pages(&inode->i_data, 0, -1);
				1659	spin_lock(&ci->i_ceph_lock);
				1660
				1661	if (inode->i_data.nrpages == 0 &&
				1662	invalidating_gen == ci->i_rdcache_gen) {
				1663	/* success. */
				1664	dout("try_nonblocking_invalidate %p success\n", inode);
				1665	/* save any racing async invalidate some trouble */
				1666	ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
				1667	return 0;
				1668	}
				1669	dout("try_nonblocking_invalidate %p failed\n", inode);
				1670	return -1;
				1671	}
				1672
				1673	bool __ceph_should_report_size(struct ceph_inode_info *ci)
				1674	{
				1675	loff_t size = ci->vfs_inode.i_size;
				1676	/* mds will adjust max size according to the reported size */
				1677	if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
				1678	return false;
				1679	if (size >= ci->i_max_size)
				1680	return true;
				1681	/* half of previous max_size increment has been used */
				1682	if (ci->i_max_size > ci->i_reported_size &&
				1683	(size << 1) >= ci->i_max_size + ci->i_reported_size)
				1684	return true;
				1685	return false;
				1686	}
				1687
				1688	/*
				1689	* Swiss army knife function to examine currently used and wanted
				1690	* versus held caps. Release, flush, ack revoked caps to mds as
				1691	* appropriate.
				1692	*
				1693	* CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
				1694	* cap release further.
				1695	* CHECK_CAPS_AUTHONLY - we should only check the auth cap
				1696	* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
				1697	* further delay.
				1698	*/
				1699	void ceph_check_caps(struct ceph_inode_info *ci, int flags,
				1700	struct ceph_mds_session *session)
				1701	{
				1702	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
				1703	struct ceph_mds_client *mdsc = fsc->mdsc;
				1704	struct inode *inode = &ci->vfs_inode;
				1705	struct ceph_cap *cap;
				1706	u64 flush_tid, oldest_flush_tid;
				1707	int file_wanted, used, cap_used;
				1708	int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
				1709	int issued, implemented, want, retain, revoking, flushing = 0;
				1710	int mds = -1; /* keep track of how far we've gone through i_caps list
				1711	to avoid an infinite loop on retry */
				1712	struct rb_node *p;
				1713	int delayed = 0, sent = 0, num;
				1714	bool is_delayed = flags & CHECK_CAPS_NODELAY;
				1715	bool queue_invalidate = false;
				1716	bool force_requeue = false;
				1717	bool tried_invalidate = false;
				1718
				1719	/* if we are unmounting, flush any unused caps immediately. */
				1720	if (mdsc->stopping)
				1721	is_delayed = 1;
				1722
				1723	spin_lock(&ci->i_ceph_lock);
				1724
				1725	if (ci->i_ceph_flags & CEPH_I_FLUSH)
				1726	flags \|= CHECK_CAPS_FLUSH;
				1727
				1728	goto retry_locked;
				1729	retry:
				1730	spin_lock(&ci->i_ceph_lock);
				1731	retry_locked:
				1732	file_wanted = __ceph_caps_file_wanted(ci);
				1733	used = __ceph_caps_used(ci);
				1734	issued = __ceph_caps_issued(ci, &implemented);
				1735	revoking = implemented & ~issued;
				1736
				1737	want = file_wanted;
				1738	retain = file_wanted \| used \| CEPH_CAP_PIN;
				1739	if (!mdsc->stopping && inode->i_nlink > 0) {
				1740	if (file_wanted) {
				1741	retain \|= CEPH_CAP_ANY; /* be greedy */
				1742	} else if (S_ISDIR(inode->i_mode) &&
				1743	(issued & CEPH_CAP_FILE_SHARED) &&
				1744	__ceph_dir_is_complete(ci)) {
				1745	/*
				1746	* If a directory is complete, we want to keep
				1747	* the exclusive cap. So that MDS does not end up
				1748	* revoking the shared cap on every create/unlink
				1749	* operation.
				1750	*/
				1751	want = CEPH_CAP_ANY_SHARED \| CEPH_CAP_FILE_EXCL;
				1752	retain \|= want;
				1753	} else {
				1754
				1755	retain \|= CEPH_CAP_ANY_SHARED;
				1756	/*
				1757	* keep RD only if we didn't have the file open RW,
				1758	* because then the mds would revoke it anyway to
				1759	* journal max_size=0.
				1760	*/
				1761	if (ci->i_max_size == 0)
				1762	retain \|= CEPH_CAP_ANY_RD;
				1763	}
				1764	}
				1765
				1766	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
				1767	" issued %s revoking %s retain %s %s%s%s\n", inode,
				1768	ceph_cap_string(file_wanted),
				1769	ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
				1770	ceph_cap_string(ci->i_flushing_caps),
				1771	ceph_cap_string(issued), ceph_cap_string(revoking),
				1772	ceph_cap_string(retain),
				1773	(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
				1774	(flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
				1775	(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
				1776
				1777	/*
				1778	* If we no longer need to hold onto old our caps, and we may
				1779	* have cached pages, but don't want them, then try to invalidate.
				1780	* If we fail, it's because pages are locked.... try again later.
				1781	*/
				1782	if ((!is_delayed \|\| mdsc->stopping) &&
				1783	!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
				1784	!(ci->i_wb_ref \|\| ci->i_wrbuffer_ref) && /* no dirty pages... */
				1785	inode->i_data.nrpages && /* have cached pages */
				1786	(revoking & (CEPH_CAP_FILE_CACHE\|
				1787	CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
				1788	!tried_invalidate) {
				1789	dout("check_caps trying to invalidate on %p\n", inode);
				1790	if (try_nonblocking_invalidate(inode) < 0) {
				1791	if (revoking & (CEPH_CAP_FILE_CACHE\|
				1792	CEPH_CAP_FILE_LAZYIO)) {
				1793	dout("check_caps queuing invalidate\n");
				1794	queue_invalidate = true;
				1795	ci->i_rdcache_revoking = ci->i_rdcache_gen;
				1796	} else {
				1797	dout("check_caps failed to invalidate pages\n");
				1798	/* we failed to invalidate pages. check these
				1799	caps again later. */
				1800	force_requeue = true;
				1801	__cap_set_timeouts(mdsc, ci);
				1802	}
				1803	}
				1804	tried_invalidate = true;
				1805	goto retry_locked;
				1806	}
				1807
				1808	num = 0;
				1809	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				1810	cap = rb_entry(p, struct ceph_cap, ci_node);
				1811	num++;
				1812
				1813	/* avoid looping forever */
				1814	if (mds >= cap->mds \|\|
				1815	((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
				1816	continue;
				1817
				1818	/* NOTE: no side-effects allowed, until we take s_mutex */
				1819
				1820	cap_used = used;
				1821	if (ci->i_auth_cap && cap != ci->i_auth_cap)
				1822	cap_used &= ~ci->i_auth_cap->issued;
				1823
				1824	revoking = cap->implemented & ~cap->issued;
				1825	dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
				1826	cap->mds, cap, ceph_cap_string(cap_used),
				1827	ceph_cap_string(cap->issued),
				1828	ceph_cap_string(cap->implemented),
				1829	ceph_cap_string(revoking));
				1830
				1831	if (cap == ci->i_auth_cap &&
				1832	(cap->issued & CEPH_CAP_FILE_WR)) {
				1833	/* request larger max_size from MDS? */
				1834	if (ci->i_wanted_max_size > ci->i_max_size &&
				1835	ci->i_wanted_max_size > ci->i_requested_max_size) {
				1836	dout("requesting new max_size\n");
				1837	goto ack;
				1838	}
				1839
				1840	/* approaching file_max? */
				1841	if (__ceph_should_report_size(ci)) {
				1842	dout("i_size approaching max_size\n");
				1843	goto ack;
				1844	}
				1845	}
				1846	/* flush anything dirty? */
				1847	if (cap == ci->i_auth_cap) {
				1848	if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
				1849	dout("flushing dirty caps\n");
				1850	goto ack;
				1851	}
				1852	if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
				1853	dout("flushing snap caps\n");
				1854	goto ack;
				1855	}
				1856	}
				1857
				1858	/* completed revocation? going down and there are no caps? */
				1859	if (revoking && (revoking & cap_used) == 0) {
				1860	dout("completed revocation of %s\n",
				1861	ceph_cap_string(cap->implemented & ~cap->issued));
				1862	goto ack;
				1863	}
				1864
				1865	/* want more caps from mds? */
				1866	if (want & ~cap->mds_wanted) {
				1867	if (want & ~(cap->mds_wanted \| cap->issued))
				1868	goto ack;
				1869	if (!__cap_is_valid(cap))
				1870	goto ack;
				1871	}
				1872
				1873	/* things we might delay */
				1874	if ((cap->issued & ~retain) == 0 &&
				1875	cap->mds_wanted == want)
				1876	continue; /* nope, all good */
				1877
				1878	if (is_delayed)
				1879	goto ack;
				1880
				1881	/* delay? */
				1882	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
				1883	time_before(jiffies, ci->i_hold_caps_max)) {
				1884	dout(" delaying issued %s -> %s, wanted %s -> %s\n",
				1885	ceph_cap_string(cap->issued),
				1886	ceph_cap_string(cap->issued & retain),
				1887	ceph_cap_string(cap->mds_wanted),
				1888	ceph_cap_string(want));
				1889	delayed++;
				1890	continue;
				1891	}
				1892
				1893	ack:
				1894	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
				1895	dout(" skipping %p I_NOFLUSH set\n", inode);
				1896	continue;
				1897	}
				1898
				1899	if (session && session != cap->session) {
				1900	dout("oops, wrong session %p mutex\n", session);
				1901	mutex_unlock(&session->s_mutex);
				1902	session = NULL;
				1903	}
				1904	if (!session) {
				1905	session = cap->session;
				1906	if (mutex_trylock(&session->s_mutex) == 0) {
				1907	dout("inverting session/ino locks on %p\n",
				1908	session);
				1909	session = ceph_get_mds_session(session);
				1910	spin_unlock(&ci->i_ceph_lock);
				1911	if (took_snap_rwsem) {
				1912	up_read(&mdsc->snap_rwsem);
				1913	took_snap_rwsem = 0;
				1914	}
				1915	if (session) {
				1916	mutex_lock(&session->s_mutex);
				1917	ceph_put_mds_session(session);
				1918	} else {
				1919	/*
				1920	* Because we take the reference while
				1921	* holding the i_ceph_lock, it should
				1922	* never be NULL. Throw a warning if it
				1923	* ever is.
				1924	*/
				1925	WARN_ON_ONCE(true);
				1926	}
				1927	goto retry;
				1928	}
				1929	}
				1930
				1931	/* kick flushing and flush snaps before sending normal
				1932	* cap message */
				1933	if (cap == ci->i_auth_cap &&
				1934	(ci->i_ceph_flags &
				1935	(CEPH_I_KICK_FLUSH \| CEPH_I_FLUSH_SNAPS))) {
				1936	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
				1937	__kick_flushing_caps(mdsc, session, ci, 0);
				1938	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
				1939	}
				1940	if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
				1941	__ceph_flush_snaps(ci, session);
				1942
				1943	goto retry_locked;
				1944	}
				1945
				1946	/* take snap_rwsem after session mutex */
				1947	if (!took_snap_rwsem) {
				1948	if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
				1949	dout("inverting snap/in locks on %p\n",
				1950	inode);
				1951	spin_unlock(&ci->i_ceph_lock);
				1952	down_read(&mdsc->snap_rwsem);
				1953	took_snap_rwsem = 1;
				1954	goto retry;
				1955	}
				1956	took_snap_rwsem = 1;
				1957	}
				1958
				1959	if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
				1960	flushing = __mark_caps_flushing(inode, session, false,
				1961	&flush_tid,
				1962	&oldest_flush_tid);
				1963	} else {
				1964	flushing = 0;
				1965	flush_tid = 0;
				1966	spin_lock(&mdsc->cap_dirty_lock);
				1967	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
				1968	spin_unlock(&mdsc->cap_dirty_lock);
				1969	}
				1970
				1971	mds = cap->mds; /* remember mds, so we don't repeat */
				1972	sent++;
				1973
				1974	/* __send_cap drops i_ceph_lock */
				1975	delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false,
				1976	cap_used, want, retain, flushing,
				1977	flush_tid, oldest_flush_tid);
				1978	goto retry; /* retake i_ceph_lock and restart our cap scan. */
				1979	}
				1980
				1981	/*
				1982	* Reschedule delayed caps release if we delayed anything,
				1983	* otherwise cancel.
				1984	*/
				1985	if (delayed && is_delayed)
				1986	force_requeue = true; /* __send_cap delayed release; requeue */
				1987	if (!delayed && !is_delayed)
				1988	__cap_delay_cancel(mdsc, ci);
				1989	else if (!is_delayed \|\| force_requeue)
				1990	__cap_delay_requeue(mdsc, ci);
				1991
				1992	spin_unlock(&ci->i_ceph_lock);
				1993
				1994	if (queue_invalidate)
				1995	ceph_queue_invalidate(inode);
				1996
				1997	if (session)
				1998	mutex_unlock(&session->s_mutex);
				1999	if (took_snap_rwsem)
				2000	up_read(&mdsc->snap_rwsem);
				2001	}
				2002
				2003	/*
				2004	* Try to flush dirty caps back to the auth mds.
				2005	*/
				2006	static int try_flush_caps(struct inode inode, u64 ptid)
				2007	{
				2008	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
				2009	struct ceph_inode_info *ci = ceph_inode(inode);
				2010	struct ceph_mds_session *session = NULL;
				2011	int flushing = 0;
				2012	u64 flush_tid = 0, oldest_flush_tid = 0;
				2013
				2014	retry:
				2015	spin_lock(&ci->i_ceph_lock);
				2016	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
				2017	spin_unlock(&ci->i_ceph_lock);
				2018	dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
				2019	goto out;
				2020	}
				2021	if (ci->i_dirty_caps && ci->i_auth_cap) {
				2022	struct ceph_cap *cap = ci->i_auth_cap;
				2023	int used = __ceph_caps_used(ci);
				2024	int want = __ceph_caps_wanted(ci);
				2025	int delayed;
				2026
				2027	if (!session \|\| session != cap->session) {
				2028	spin_unlock(&ci->i_ceph_lock);
				2029	if (session)
				2030	mutex_unlock(&session->s_mutex);
				2031	session = cap->session;
				2032	mutex_lock(&session->s_mutex);
				2033	goto retry;
				2034	}
				2035	if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) {
				2036	spin_unlock(&ci->i_ceph_lock);
				2037	goto out;
				2038	}
				2039
				2040	flushing = __mark_caps_flushing(inode, session, true,
				2041	&flush_tid, &oldest_flush_tid);
				2042
				2043	/* __send_cap drops i_ceph_lock */
				2044	delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true,
				2045	used, want, (cap->issued \| cap->implemented),
				2046	flushing, flush_tid, oldest_flush_tid);
				2047
				2048	if (delayed) {
				2049	spin_lock(&ci->i_ceph_lock);
				2050	__cap_delay_requeue(mdsc, ci);
				2051	spin_unlock(&ci->i_ceph_lock);
				2052	}
				2053	} else {
				2054	if (!list_empty(&ci->i_cap_flush_list)) {
				2055	struct ceph_cap_flush *cf =
				2056	list_last_entry(&ci->i_cap_flush_list,
				2057	struct ceph_cap_flush, i_list);
				2058	cf->wake = true;
				2059	flush_tid = cf->tid;
				2060	}
				2061	flushing = ci->i_flushing_caps;
				2062	spin_unlock(&ci->i_ceph_lock);
				2063	}
				2064	out:
				2065	if (session)
				2066	mutex_unlock(&session->s_mutex);
				2067
				2068	*ptid = flush_tid;
				2069	return flushing;
				2070	}
				2071
				2072	/*
				2073	* Return true if we've flushed caps through the given flush_tid.
				2074	*/
				2075	static int caps_are_flushed(struct inode *inode, u64 flush_tid)
				2076	{
				2077	struct ceph_inode_info *ci = ceph_inode(inode);
				2078	int ret = 1;
				2079
				2080	spin_lock(&ci->i_ceph_lock);
				2081	if (!list_empty(&ci->i_cap_flush_list)) {
				2082	struct ceph_cap_flush * cf =
				2083	list_first_entry(&ci->i_cap_flush_list,
				2084	struct ceph_cap_flush, i_list);
				2085	if (cf->tid <= flush_tid)
				2086	ret = 0;
				2087	}
				2088	spin_unlock(&ci->i_ceph_lock);
				2089	return ret;
				2090	}
				2091
				2092	/*
				2093	* wait for any unsafe requests to complete.
				2094	*/
				2095	static int unsafe_request_wait(struct inode *inode)
				2096	{
				2097	struct ceph_inode_info *ci = ceph_inode(inode);
				2098	struct ceph_mds_request req1 = NULL, req2 = NULL;
				2099	int ret, err = 0;
				2100
				2101	spin_lock(&ci->i_unsafe_lock);
				2102	if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
				2103	req1 = list_last_entry(&ci->i_unsafe_dirops,
				2104	struct ceph_mds_request,
				2105	r_unsafe_dir_item);
				2106	ceph_mdsc_get_request(req1);
				2107	}
				2108	if (!list_empty(&ci->i_unsafe_iops)) {
				2109	req2 = list_last_entry(&ci->i_unsafe_iops,
				2110	struct ceph_mds_request,
				2111	r_unsafe_target_item);
				2112	ceph_mdsc_get_request(req2);
				2113	}
				2114	spin_unlock(&ci->i_unsafe_lock);
				2115
				2116	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
				2117	inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
				2118	if (req1) {
				2119	ret = !wait_for_completion_timeout(&req1->r_safe_completion,
				2120	ceph_timeout_jiffies(req1->r_timeout));
				2121	if (ret)
				2122	err = -EIO;
				2123	ceph_mdsc_put_request(req1);
				2124	}
				2125	if (req2) {
				2126	ret = !wait_for_completion_timeout(&req2->r_safe_completion,
				2127	ceph_timeout_jiffies(req2->r_timeout));
				2128	if (ret)
				2129	err = -EIO;
				2130	ceph_mdsc_put_request(req2);
				2131	}
				2132	return err;
				2133	}
				2134
				2135	int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
				2136	{
				2137	struct inode *inode = file->f_mapping->host;
				2138	struct ceph_inode_info *ci = ceph_inode(inode);
				2139	u64 flush_tid;
				2140	int ret;
				2141	int dirty;
				2142
				2143	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
				2144
				2145	ret = file_write_and_wait_range(file, start, end);
				2146	if (ret < 0)
				2147	goto out;
				2148
				2149	if (datasync)
				2150	goto out;
				2151
				2152	inode_lock(inode);
				2153
				2154	dirty = try_flush_caps(inode, &flush_tid);
				2155	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
				2156
				2157	ret = unsafe_request_wait(inode);
				2158
				2159	/*
				2160	* only wait on non-file metadata writeback (the mds
				2161	* can recover size and mtime, so we don't need to
				2162	* wait for that)
				2163	*/
				2164	if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
				2165	ret = wait_event_interruptible(ci->i_cap_wq,
				2166	caps_are_flushed(inode, flush_tid));
				2167	}
				2168	inode_unlock(inode);
				2169	out:
				2170	dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
				2171	return ret;
				2172	}
				2173
				2174	/*
				2175	* Flush any dirty caps back to the mds. If we aren't asked to wait,
				2176	* queue inode for flush but don't do so immediately, because we can
				2177	* get by with fewer MDS messages if we wait for data writeback to
				2178	* complete first.
				2179	*/
				2180	int ceph_write_inode(struct inode inode, struct writeback_control wbc)
				2181	{
				2182	struct ceph_inode_info *ci = ceph_inode(inode);
				2183	u64 flush_tid;
				2184	int err = 0;
				2185	int dirty;
				2186	int wait = wbc->sync_mode == WB_SYNC_ALL;
				2187
				2188	dout("write_inode %p wait=%d\n", inode, wait);
				2189	if (wait) {
				2190	dirty = try_flush_caps(inode, &flush_tid);
				2191	if (dirty)
				2192	err = wait_event_interruptible(ci->i_cap_wq,
				2193	caps_are_flushed(inode, flush_tid));
				2194	} else {
				2195	struct ceph_mds_client *mdsc =
				2196	ceph_sb_to_client(inode->i_sb)->mdsc;
				2197
				2198	spin_lock(&ci->i_ceph_lock);
				2199	if (__ceph_caps_dirty(ci))
				2200	__cap_delay_requeue_front(mdsc, ci);
				2201	spin_unlock(&ci->i_ceph_lock);
				2202	}
				2203	return err;
				2204	}
				2205
				2206	static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
				2207	struct ceph_mds_session *session,
				2208	struct ceph_inode_info *ci,
				2209	u64 oldest_flush_tid)
				2210	__releases(ci->i_ceph_lock)
				2211	__acquires(ci->i_ceph_lock)
				2212	{
				2213	struct inode *inode = &ci->vfs_inode;
				2214	struct ceph_cap *cap;
				2215	struct ceph_cap_flush *cf;
				2216	int ret;
				2217	u64 first_tid = 0;
				2218
				2219	list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
				2220	if (cf->tid < first_tid)
				2221	continue;
				2222
				2223	cap = ci->i_auth_cap;
				2224	if (!(cap && cap->session == session)) {
				2225	pr_err("%p auth cap %p not mds%d ???\n",
				2226	inode, cap, session->s_mds);
				2227	break;
				2228	}
				2229
				2230	first_tid = cf->tid + 1;
				2231
				2232	if (cf->caps) {
				2233	dout("kick_flushing_caps %p cap %p tid %llu %s\n",
				2234	inode, cap, cf->tid, ceph_cap_string(cf->caps));
				2235	ci->i_ceph_flags \|= CEPH_I_NODELAY;
				2236	ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
				2237	false, __ceph_caps_used(ci),
				2238	__ceph_caps_wanted(ci),
				2239	cap->issued \| cap->implemented,
				2240	cf->caps, cf->tid, oldest_flush_tid);
				2241	if (ret) {
				2242	pr_err("kick_flushing_caps: error sending "
				2243	"cap flush, ino (%llx.%llx) "
				2244	"tid %llu flushing %s\n",
				2245	ceph_vinop(inode), cf->tid,
				2246	ceph_cap_string(cf->caps));
				2247	}
				2248	} else {
				2249	struct ceph_cap_snap *capsnap =
				2250	container_of(cf, struct ceph_cap_snap,
				2251	cap_flush);
				2252	dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
				2253	inode, capsnap, cf->tid,
				2254	ceph_cap_string(capsnap->dirty));
				2255
				2256	refcount_inc(&capsnap->nref);
				2257	spin_unlock(&ci->i_ceph_lock);
				2258
				2259	ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
				2260	oldest_flush_tid);
				2261	if (ret < 0) {
				2262	pr_err("kick_flushing_caps: error sending "
				2263	"cap flushsnap, ino (%llx.%llx) "
				2264	"tid %llu follows %llu\n",
				2265	ceph_vinop(inode), cf->tid,
				2266	capsnap->follows);
				2267	}
				2268
				2269	ceph_put_cap_snap(capsnap);
				2270	}
				2271
				2272	spin_lock(&ci->i_ceph_lock);
				2273	}
				2274	}
				2275
				2276	void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
				2277	struct ceph_mds_session *session)
				2278	{
				2279	struct ceph_inode_info *ci;
				2280	struct ceph_cap *cap;
				2281	u64 oldest_flush_tid;
				2282
				2283	dout("early_kick_flushing_caps mds%d\n", session->s_mds);
				2284
				2285	spin_lock(&mdsc->cap_dirty_lock);
				2286	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
				2287	spin_unlock(&mdsc->cap_dirty_lock);
				2288
				2289	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
				2290	spin_lock(&ci->i_ceph_lock);
				2291	cap = ci->i_auth_cap;
				2292	if (!(cap && cap->session == session)) {
				2293	pr_err("%p auth cap %p not mds%d ???\n",
				2294	&ci->vfs_inode, cap, session->s_mds);
				2295	spin_unlock(&ci->i_ceph_lock);
				2296	continue;
				2297	}
				2298
				2299
				2300	/*
				2301	* if flushing caps were revoked, we re-send the cap flush
				2302	* in client reconnect stage. This guarantees MDS * processes
				2303	* the cap flush message before issuing the flushing caps to
				2304	* other client.
				2305	*/
				2306	if ((cap->issued & ci->i_flushing_caps) !=
				2307	ci->i_flushing_caps) {
				2308	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
				2309	__kick_flushing_caps(mdsc, session, ci,
				2310	oldest_flush_tid);
				2311	} else {
				2312	ci->i_ceph_flags \|= CEPH_I_KICK_FLUSH;
				2313	}
				2314
				2315	spin_unlock(&ci->i_ceph_lock);
				2316	}
				2317	}
				2318
				2319	void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
				2320	struct ceph_mds_session *session)
				2321	{
				2322	struct ceph_inode_info *ci;
				2323	struct ceph_cap *cap;
				2324	u64 oldest_flush_tid;
				2325
				2326	dout("kick_flushing_caps mds%d\n", session->s_mds);
				2327
				2328	spin_lock(&mdsc->cap_dirty_lock);
				2329	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
				2330	spin_unlock(&mdsc->cap_dirty_lock);
				2331
				2332	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
				2333	spin_lock(&ci->i_ceph_lock);
				2334	cap = ci->i_auth_cap;
				2335	if (!(cap && cap->session == session)) {
				2336	pr_err("%p auth cap %p not mds%d ???\n",
				2337	&ci->vfs_inode, cap, session->s_mds);
				2338	spin_unlock(&ci->i_ceph_lock);
				2339	continue;
				2340	}
				2341	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
				2342	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
				2343	__kick_flushing_caps(mdsc, session, ci,
				2344	oldest_flush_tid);
				2345	}
				2346	spin_unlock(&ci->i_ceph_lock);
				2347	}
				2348	}
				2349
				2350	static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
				2351	struct ceph_mds_session *session,
				2352	struct inode *inode)
				2353	__releases(ci->i_ceph_lock)
				2354	{
				2355	struct ceph_inode_info *ci = ceph_inode(inode);
				2356	struct ceph_cap *cap;
				2357
				2358	cap = ci->i_auth_cap;
				2359	dout("kick_flushing_inode_caps %p flushing %s\n", inode,
				2360	ceph_cap_string(ci->i_flushing_caps));
				2361
				2362	if (!list_empty(&ci->i_cap_flush_list)) {
				2363	u64 oldest_flush_tid;
				2364	spin_lock(&mdsc->cap_dirty_lock);
				2365	list_move_tail(&ci->i_flushing_item,
				2366	&cap->session->s_cap_flushing);
				2367	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
				2368	spin_unlock(&mdsc->cap_dirty_lock);
				2369
				2370	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
				2371	__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
				2372	spin_unlock(&ci->i_ceph_lock);
				2373	} else {
				2374	spin_unlock(&ci->i_ceph_lock);
				2375	}
				2376	}
				2377
				2378
				2379	/*
				2380	* Take references to capabilities we hold, so that we don't release
				2381	* them to the MDS prematurely.
				2382	*
				2383	* Protected by i_ceph_lock.
				2384	*/
				2385	static void __take_cap_refs(struct ceph_inode_info *ci, int got,
				2386	bool snap_rwsem_locked)
				2387	{
				2388	if (got & CEPH_CAP_PIN)
				2389	ci->i_pin_ref++;
				2390	if (got & CEPH_CAP_FILE_RD)
				2391	ci->i_rd_ref++;
				2392	if (got & CEPH_CAP_FILE_CACHE)
				2393	ci->i_rdcache_ref++;
				2394	if (got & CEPH_CAP_FILE_WR) {
				2395	if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
				2396	BUG_ON(!snap_rwsem_locked);
				2397	ci->i_head_snapc = ceph_get_snap_context(
				2398	ci->i_snap_realm->cached_context);
				2399	}
				2400	ci->i_wr_ref++;
				2401	}
				2402	if (got & CEPH_CAP_FILE_BUFFER) {
				2403	if (ci->i_wb_ref == 0)
				2404	ihold(&ci->vfs_inode);
				2405	ci->i_wb_ref++;
				2406	dout("__take_cap_refs %p wb %d -> %d (?)\n",
				2407	&ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
				2408	}
				2409	}
				2410
				2411	/*
				2412	* Try to grab cap references. Specify those refs we @want, and the
				2413	* minimal set we @need. Also include the larger offset we are writing
				2414	* to (when applicable), and check against max_size here as well.
				2415	* Note that caller is responsible for ensuring max_size increases are
				2416	* requested from the MDS.
				2417	*/
				2418	static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
				2419	loff_t endoff, bool nonblock, int got, int err)
				2420	{
				2421	struct inode *inode = &ci->vfs_inode;
				2422	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
				2423	int ret = 0;
				2424	int have, implemented;
				2425	int file_wanted;
				2426	bool snap_rwsem_locked = false;
				2427
				2428	dout("get_cap_refs %p need %s want %s\n", inode,
				2429	ceph_cap_string(need), ceph_cap_string(want));
				2430
				2431	again:
				2432	spin_lock(&ci->i_ceph_lock);
				2433
				2434	/* make sure file is actually open */
				2435	file_wanted = __ceph_caps_file_wanted(ci);
				2436	if ((file_wanted & need) != need) {
				2437	dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
				2438	ceph_cap_string(need), ceph_cap_string(file_wanted));
				2439	*err = -EBADF;
				2440	ret = 1;
				2441	goto out_unlock;
				2442	}
				2443
				2444	/* finish pending truncate */
				2445	while (ci->i_truncate_pending) {
				2446	spin_unlock(&ci->i_ceph_lock);
				2447	if (snap_rwsem_locked) {
				2448	up_read(&mdsc->snap_rwsem);
				2449	snap_rwsem_locked = false;
				2450	}
				2451	__ceph_do_pending_vmtruncate(inode);
				2452	spin_lock(&ci->i_ceph_lock);
				2453	}
				2454
				2455	have = __ceph_caps_issued(ci, &implemented);
				2456
				2457	if (have & need & CEPH_CAP_FILE_WR) {
				2458	if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
				2459	dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
				2460	inode, endoff, ci->i_max_size);
				2461	if (endoff > ci->i_requested_max_size) {
				2462	*err = -EAGAIN;
				2463	ret = 1;
				2464	}
				2465	goto out_unlock;
				2466	}
				2467	/*
				2468	* If a sync write is in progress, we must wait, so that we
				2469	* can get a final snapshot value for size+mtime.
				2470	*/
				2471	if (__ceph_have_pending_cap_snap(ci)) {
				2472	dout("get_cap_refs %p cap_snap_pending\n", inode);
				2473	goto out_unlock;
				2474	}
				2475	}
				2476
				2477	if ((have & need) == need) {
				2478	/*
				2479	* Look at (implemented & ~have & not) so that we keep waiting
				2480	* on transition from wanted -> needed caps. This is needed
				2481	* for WRBUFFER\|WR -> WR to avoid a new WR sync write from
				2482	* going before a prior buffered writeback happens.
				2483	*/
				2484	int not = want & ~(have & need);
				2485	int revoking = implemented & ~have;
				2486	dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
				2487	inode, ceph_cap_string(have), ceph_cap_string(not),
				2488	ceph_cap_string(revoking));
				2489	if ((revoking & not) == 0) {
				2490	if (!snap_rwsem_locked &&
				2491	!ci->i_head_snapc &&
				2492	(need & CEPH_CAP_FILE_WR)) {
				2493	if (!down_read_trylock(&mdsc->snap_rwsem)) {
				2494	/*
				2495	* we can not call down_read() when
				2496	* task isn't in TASK_RUNNING state
				2497	*/
				2498	if (nonblock) {
				2499	*err = -EAGAIN;
				2500	ret = 1;
				2501	goto out_unlock;
				2502	}
				2503
				2504	spin_unlock(&ci->i_ceph_lock);
				2505	down_read(&mdsc->snap_rwsem);
				2506	snap_rwsem_locked = true;
				2507	goto again;
				2508	}
				2509	snap_rwsem_locked = true;
				2510	}
				2511	*got = need \| (have & want);
				2512	if ((need & CEPH_CAP_FILE_RD) &&
				2513	!(*got & CEPH_CAP_FILE_CACHE))
				2514	ceph_disable_fscache_readpage(ci);
				2515	__take_cap_refs(ci, *got, true);
				2516	ret = 1;
				2517	}
				2518	} else {
				2519	int session_readonly = false;
				2520	if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
				2521	struct ceph_mds_session *s = ci->i_auth_cap->session;
				2522	spin_lock(&s->s_cap_lock);
				2523	session_readonly = s->s_readonly;
				2524	spin_unlock(&s->s_cap_lock);
				2525	}
				2526	if (session_readonly) {
				2527	dout("get_cap_refs %p needed %s but mds%d readonly\n",
				2528	inode, ceph_cap_string(need), ci->i_auth_cap->mds);
				2529	*err = -EROFS;
				2530	ret = 1;
				2531	goto out_unlock;
				2532	}
				2533
				2534	if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
				2535	int mds_wanted;
				2536	if (READ_ONCE(mdsc->fsc->mount_state) ==
				2537	CEPH_MOUNT_SHUTDOWN) {
				2538	dout("get_cap_refs %p forced umount\n", inode);
				2539	*err = -EIO;
				2540	ret = 1;
				2541	goto out_unlock;
				2542	}
				2543	mds_wanted = __ceph_caps_mds_wanted(ci, false);
				2544	if (need & ~(mds_wanted & need)) {
				2545	dout("get_cap_refs %p caps were dropped"
				2546	" (session killed?)\n", inode);
				2547	*err = -ESTALE;
				2548	ret = 1;
				2549	goto out_unlock;
				2550	}
				2551	if (!(file_wanted & ~mds_wanted))
				2552	ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
				2553	}
				2554
				2555	dout("get_cap_refs %p have %s needed %s\n", inode,
				2556	ceph_cap_string(have), ceph_cap_string(need));
				2557	}
				2558	out_unlock:
				2559	spin_unlock(&ci->i_ceph_lock);
				2560	if (snap_rwsem_locked)
				2561	up_read(&mdsc->snap_rwsem);
				2562
				2563	dout("get_cap_refs %p ret %d got %s\n", inode,
				2564	ret, ceph_cap_string(*got));
				2565	return ret;
				2566	}
				2567
				2568	/*
				2569	* Check the offset we are writing up to against our current
				2570	* max_size. If necessary, tell the MDS we want to write to
				2571	* a larger offset.
				2572	*/
				2573	static void check_max_size(struct inode *inode, loff_t endoff)
				2574	{
				2575	struct ceph_inode_info *ci = ceph_inode(inode);
				2576	int check = 0;
				2577
				2578	/* do we need to explicitly request a larger max_size? */
				2579	spin_lock(&ci->i_ceph_lock);
				2580	if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
				2581	dout("write %p at large endoff %llu, req max_size\n",
				2582	inode, endoff);
				2583	ci->i_wanted_max_size = endoff;
				2584	}
				2585	/* duplicate ceph_check_caps()'s logic */
				2586	if (ci->i_auth_cap &&
				2587	(ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
				2588	ci->i_wanted_max_size > ci->i_max_size &&
				2589	ci->i_wanted_max_size > ci->i_requested_max_size)
				2590	check = 1;
				2591	spin_unlock(&ci->i_ceph_lock);
				2592	if (check)
				2593	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				2594	}
				2595
				2596	int ceph_try_get_caps(struct ceph_inode_info ci, int need, int want, int got)
				2597	{
				2598	int ret, err = 0;
				2599
				2600	BUG_ON(need & ~CEPH_CAP_FILE_RD);
				2601	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO));
				2602	ret = ceph_pool_perm_check(ci, need);
				2603	if (ret < 0)
				2604	return ret;
				2605
				2606	ret = try_get_cap_refs(ci, need, want, 0, true, got, &err);
				2607	if (ret) {
				2608	if (err == -EAGAIN) {
				2609	ret = 0;
				2610	} else if (err < 0) {
				2611	ret = err;
				2612	}
				2613	}
				2614	return ret;
				2615	}
				2616
				2617	/*
				2618	* Wait for caps, and take cap references. If we can't get a WR cap
				2619	* due to a small max_size, make sure we check_max_size (and possibly
				2620	* ask the mds) so we don't get hung up indefinitely.
				2621	*/
				2622	int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
				2623	loff_t endoff, int got, struct page *pinned_page)
				2624	{
				2625	int _got, ret, err = 0;
				2626
				2627	ret = ceph_pool_perm_check(ci, need);
				2628	if (ret < 0)
				2629	return ret;
				2630
				2631	while (true) {
				2632	if (endoff > 0)
				2633	check_max_size(&ci->vfs_inode, endoff);
				2634
				2635	err = 0;
				2636	_got = 0;
				2637	ret = try_get_cap_refs(ci, need, want, endoff,
				2638	false, &_got, &err);
				2639	if (ret) {
				2640	if (err == -EAGAIN)
				2641	continue;
				2642	if (err < 0)
				2643	ret = err;
				2644	} else {
				2645	DEFINE_WAIT_FUNC(wait, woken_wake_function);
				2646	add_wait_queue(&ci->i_cap_wq, &wait);
				2647
				2648	while (!try_get_cap_refs(ci, need, want, endoff,
				2649	true, &_got, &err)) {
				2650	if (signal_pending(current)) {
				2651	ret = -ERESTARTSYS;
				2652	break;
				2653	}
				2654	wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
				2655	}
				2656
				2657	remove_wait_queue(&ci->i_cap_wq, &wait);
				2658
				2659	if (err == -EAGAIN)
				2660	continue;
				2661	if (err < 0)
				2662	ret = err;
				2663	}
				2664	if (ret < 0) {
				2665	if (err == -ESTALE) {
				2666	/* session was killed, try renew caps */
				2667	ret = ceph_renew_caps(&ci->vfs_inode);
				2668	if (ret == 0)
				2669	continue;
				2670	}
				2671	return ret;
				2672	}
				2673
				2674	if (ci->i_inline_version != CEPH_INLINE_NONE &&
				2675	(_got & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) &&
				2676	i_size_read(&ci->vfs_inode) > 0) {
				2677	struct page *page =
				2678	find_get_page(ci->vfs_inode.i_mapping, 0);
				2679	if (page) {
				2680	if (PageUptodate(page)) {
				2681	*pinned_page = page;
				2682	break;
				2683	}
				2684	put_page(page);
				2685	}
				2686	/*
				2687	* drop cap refs first because getattr while
				2688	* holding * caps refs can cause deadlock.
				2689	*/
				2690	ceph_put_cap_refs(ci, _got);
				2691	_got = 0;
				2692
				2693	/*
				2694	* getattr request will bring inline data into
				2695	* page cache
				2696	*/
				2697	ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
				2698	CEPH_STAT_CAP_INLINE_DATA,
				2699	true);
				2700	if (ret < 0)
				2701	return ret;
				2702	continue;
				2703	}
				2704	break;
				2705	}
				2706
				2707	if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
				2708	ceph_fscache_revalidate_cookie(ci);
				2709
				2710	*got = _got;
				2711	return 0;
				2712	}
				2713
				2714	/*
				2715	* Take cap refs. Caller must already know we hold at least one ref
				2716	* on the caps in question or we don't know this is safe.
				2717	*/
				2718	void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
				2719	{
				2720	spin_lock(&ci->i_ceph_lock);
				2721	__take_cap_refs(ci, caps, false);
				2722	spin_unlock(&ci->i_ceph_lock);
				2723	}
				2724
				2725
				2726	/*
				2727	* drop cap_snap that is not associated with any snapshot.
				2728	* we don't need to send FLUSHSNAP message for it.
				2729	*/
				2730	static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
				2731	struct ceph_cap_snap *capsnap)
				2732	{
				2733	if (!capsnap->need_flush &&
				2734	!capsnap->writing && !capsnap->dirty_pages) {
				2735	dout("dropping cap_snap %p follows %llu\n",
				2736	capsnap, capsnap->follows);
				2737	BUG_ON(capsnap->cap_flush.tid > 0);
				2738	ceph_put_snap_context(capsnap->context);
				2739	if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
				2740	ci->i_ceph_flags \|= CEPH_I_FLUSH_SNAPS;
				2741
				2742	list_del(&capsnap->ci_item);
				2743	ceph_put_cap_snap(capsnap);
				2744	return 1;
				2745	}
				2746	return 0;
				2747	}
				2748
				2749	/*
				2750	* Release cap refs.
				2751	*
				2752	* If we released the last ref on any given cap, call ceph_check_caps
				2753	* to release (or schedule a release).
				2754	*
				2755	* If we are releasing a WR cap (from a sync write), finalize any affected
				2756	* cap_snap, and wake up any waiters.
				2757	*/
				2758	void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
				2759	{
				2760	struct inode *inode = &ci->vfs_inode;
				2761	int last = 0, put = 0, flushsnaps = 0, wake = 0;
				2762
				2763	spin_lock(&ci->i_ceph_lock);
				2764	if (had & CEPH_CAP_PIN)
				2765	--ci->i_pin_ref;
				2766	if (had & CEPH_CAP_FILE_RD)
				2767	if (--ci->i_rd_ref == 0)
				2768	last++;
				2769	if (had & CEPH_CAP_FILE_CACHE)
				2770	if (--ci->i_rdcache_ref == 0)
				2771	last++;
				2772	if (had & CEPH_CAP_FILE_BUFFER) {
				2773	if (--ci->i_wb_ref == 0) {
				2774	last++;
				2775	put++;
				2776	}
				2777	dout("put_cap_refs %p wb %d -> %d (?)\n",
				2778	inode, ci->i_wb_ref+1, ci->i_wb_ref);
				2779	}
				2780	if (had & CEPH_CAP_FILE_WR)
				2781	if (--ci->i_wr_ref == 0) {
				2782	last++;
				2783	if (__ceph_have_pending_cap_snap(ci)) {
				2784	struct ceph_cap_snap *capsnap =
				2785	list_last_entry(&ci->i_cap_snaps,
				2786	struct ceph_cap_snap,
				2787	ci_item);
				2788	capsnap->writing = 0;
				2789	if (ceph_try_drop_cap_snap(ci, capsnap))
				2790	put++;
				2791	else if (__ceph_finish_cap_snap(ci, capsnap))
				2792	flushsnaps = 1;
				2793	wake = 1;
				2794	}
				2795	if (ci->i_wrbuffer_ref_head == 0 &&
				2796	ci->i_dirty_caps == 0 &&
				2797	ci->i_flushing_caps == 0) {
				2798	BUG_ON(!ci->i_head_snapc);
				2799	ceph_put_snap_context(ci->i_head_snapc);
				2800	ci->i_head_snapc = NULL;
				2801	}
				2802	/* see comment in __ceph_remove_cap() */
				2803	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
				2804	drop_inode_snap_realm(ci);
				2805	}
				2806	spin_unlock(&ci->i_ceph_lock);
				2807
				2808	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
				2809	last ? " last" : "", put ? " put" : "");
				2810
				2811	if (last && !flushsnaps)
				2812	ceph_check_caps(ci, 0, NULL);
				2813	else if (flushsnaps)
				2814	ceph_flush_snaps(ci, NULL);
				2815	if (wake)
				2816	wake_up_all(&ci->i_cap_wq);
				2817	while (put-- > 0)
				2818	iput(inode);
				2819	}
				2820
				2821	/*
				2822	* Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
				2823	* context. Adjust per-snap dirty page accounting as appropriate.
				2824	* Once all dirty data for a cap_snap is flushed, flush snapped file
				2825	* metadata back to the MDS. If we dropped the last ref, call
				2826	* ceph_check_caps.
				2827	*/
				2828	void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
				2829	struct ceph_snap_context *snapc)
				2830	{
				2831	struct inode *inode = &ci->vfs_inode;
				2832	struct ceph_cap_snap *capsnap = NULL;
				2833	int put = 0;
				2834	bool last = false;
				2835	bool found = false;
				2836	bool flush_snaps = false;
				2837	bool complete_capsnap = false;
				2838
				2839	spin_lock(&ci->i_ceph_lock);
				2840	ci->i_wrbuffer_ref -= nr;
				2841	if (ci->i_wrbuffer_ref == 0) {
				2842	last = true;
				2843	put++;
				2844	}
				2845
				2846	if (ci->i_head_snapc == snapc) {
				2847	ci->i_wrbuffer_ref_head -= nr;
				2848	if (ci->i_wrbuffer_ref_head == 0 &&
				2849	ci->i_wr_ref == 0 &&
				2850	ci->i_dirty_caps == 0 &&
				2851	ci->i_flushing_caps == 0) {
				2852	BUG_ON(!ci->i_head_snapc);
				2853	ceph_put_snap_context(ci->i_head_snapc);
				2854	ci->i_head_snapc = NULL;
				2855	}
				2856	dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
				2857	inode,
				2858	ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
				2859	ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
				2860	last ? " LAST" : "");
				2861	} else {
				2862	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				2863	if (capsnap->context == snapc) {
				2864	found = true;
				2865	break;
				2866	}
				2867	}
				2868	BUG_ON(!found);
				2869	capsnap->dirty_pages -= nr;
				2870	if (capsnap->dirty_pages == 0) {
				2871	complete_capsnap = true;
				2872	if (!capsnap->writing) {
				2873	if (ceph_try_drop_cap_snap(ci, capsnap)) {
				2874	put++;
				2875	} else {
				2876	ci->i_ceph_flags \|= CEPH_I_FLUSH_SNAPS;
				2877	flush_snaps = true;
				2878	}
				2879	}
				2880	}
				2881	dout("put_wrbuffer_cap_refs on %p cap_snap %p "
				2882	" snap %lld %d/%d -> %d/%d %s%s\n",
				2883	inode, capsnap, capsnap->context->seq,
				2884	ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
				2885	ci->i_wrbuffer_ref, capsnap->dirty_pages,
				2886	last ? " (wrbuffer last)" : "",
				2887	complete_capsnap ? " (complete capsnap)" : "");
				2888	}
				2889
				2890	spin_unlock(&ci->i_ceph_lock);
				2891
				2892	if (last) {
				2893	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				2894	} else if (flush_snaps) {
				2895	ceph_flush_snaps(ci, NULL);
				2896	}
				2897	if (complete_capsnap)
				2898	wake_up_all(&ci->i_cap_wq);
				2899	while (put-- > 0)
				2900	iput(inode);
				2901	}
				2902
				2903	/*
				2904	* Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
				2905	*/
				2906	static void invalidate_aliases(struct inode *inode)
				2907	{
				2908	struct dentry dn, prev = NULL;
				2909
				2910	dout("invalidate_aliases inode %p\n", inode);
				2911	d_prune_aliases(inode);
				2912	/*
				2913	* For non-directory inode, d_find_alias() only returns
				2914	* hashed dentry. After calling d_invalidate(), the
				2915	* dentry becomes unhashed.
				2916	*
				2917	* For directory inode, d_find_alias() can return
				2918	* unhashed dentry. But directory inode should have
				2919	* one alias at most.
				2920	*/
				2921	while ((dn = d_find_alias(inode))) {
				2922	if (dn == prev) {
				2923	dput(dn);
				2924	break;
				2925	}
				2926	d_invalidate(dn);
				2927	if (prev)
				2928	dput(prev);
				2929	prev = dn;
				2930	}
				2931	if (prev)
				2932	dput(prev);
				2933	}
				2934
				2935	/*
				2936	* Handle a cap GRANT message from the MDS. (Note that a GRANT may
				2937	* actually be a revocation if it specifies a smaller cap set.)
				2938	*
				2939	* caller holds s_mutex and i_ceph_lock, we drop both.
				2940	*/
				2941	static void handle_cap_grant(struct ceph_mds_client *mdsc,
				2942	struct inode inode, struct ceph_mds_caps grant,
				2943	struct ceph_string **pns, u64 inline_version,
				2944	void *inline_data, u32 inline_len,
				2945	struct ceph_buffer *xattr_buf,
				2946	struct ceph_mds_session *session,
				2947	struct ceph_cap *cap, int issued)
				2948	__releases(ci->i_ceph_lock)
				2949	__releases(mdsc->snap_rwsem)
				2950	{
				2951	struct ceph_inode_info *ci = ceph_inode(inode);
				2952	int mds = session->s_mds;
				2953	int seq = le32_to_cpu(grant->seq);
				2954	int newcaps = le32_to_cpu(grant->caps);
				2955	int used, wanted, dirty;
				2956	u64 size = le64_to_cpu(grant->size);
				2957	u64 max_size = le64_to_cpu(grant->max_size);
				2958	struct timespec mtime, atime, ctime;
				2959	int check_caps = 0;
				2960	bool wake = false;
				2961	bool writeback = false;
				2962	bool queue_trunc = false;
				2963	bool queue_invalidate = false;
				2964	bool deleted_inode = false;
				2965	bool fill_inline = false;
				2966
				2967	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
				2968	inode, cap, mds, seq, ceph_cap_string(newcaps));
				2969	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
				2970	inode->i_size);
				2971
				2972
				2973	/*
				2974	* auth mds of the inode changed. we received the cap export message,
				2975	* but still haven't received the cap import message. handle_cap_export
				2976	* updated the new auth MDS' cap.
				2977	*
				2978	* "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
				2979	* that was sent before the cap import message. So don't remove caps.
				2980	*/
				2981	if (ceph_seq_cmp(seq, cap->seq) <= 0) {
				2982	WARN_ON(cap != ci->i_auth_cap);
				2983	WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
				2984	seq = cap->seq;
				2985	newcaps \|= cap->issued;
				2986	}
				2987
				2988	/*
				2989	* If CACHE is being revoked, and we have no dirty buffers,
				2990	* try to invalidate (once). (If there are dirty buffers, we
				2991	* will invalidate _after_ writeback.)
				2992	*/
				2993	if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
				2994	((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
				2995	(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
				2996	!(ci->i_wrbuffer_ref \|\| ci->i_wb_ref)) {
				2997	if (try_nonblocking_invalidate(inode)) {
				2998	/* there were locked pages.. invalidate later
				2999	in a separate thread. */
				3000	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
				3001	queue_invalidate = true;
				3002	ci->i_rdcache_revoking = ci->i_rdcache_gen;
				3003	}
				3004	}
				3005	}
				3006
				3007	/* side effects now are allowed */
				3008	cap->cap_gen = session->s_cap_gen;
				3009	cap->seq = seq;
				3010
				3011	__check_cap_issue(ci, cap, newcaps);
				3012
				3013	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
				3014	(issued & CEPH_CAP_AUTH_EXCL) == 0) {
				3015	inode->i_mode = le32_to_cpu(grant->mode);
				3016	inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
				3017	inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
				3018	dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
				3019	from_kuid(&init_user_ns, inode->i_uid),
				3020	from_kgid(&init_user_ns, inode->i_gid));
				3021	}
				3022
				3023	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
				3024	(issued & CEPH_CAP_LINK_EXCL) == 0) {
				3025	set_nlink(inode, le32_to_cpu(grant->nlink));
				3026	if (inode->i_nlink == 0 &&
				3027	(newcaps & (CEPH_CAP_LINK_SHARED \| CEPH_CAP_LINK_EXCL)))
				3028	deleted_inode = true;
				3029	}
				3030
				3031	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
				3032	int len = le32_to_cpu(grant->xattr_len);
				3033	u64 version = le64_to_cpu(grant->xattr_version);
				3034
				3035	if (version > ci->i_xattrs.version) {
				3036	dout(" got new xattrs v%llu on %p len %d\n",
				3037	version, inode, len);
				3038	if (ci->i_xattrs.blob)
				3039	ceph_buffer_put(ci->i_xattrs.blob);
				3040	ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
				3041	ci->i_xattrs.version = version;
				3042	ceph_forget_all_cached_acls(inode);
				3043	}
				3044	}
				3045
				3046	if (newcaps & CEPH_CAP_ANY_RD) {
				3047	/* ctime/mtime/atime? */
				3048	ceph_decode_timespec(&mtime, &grant->mtime);
				3049	ceph_decode_timespec(&atime, &grant->atime);
				3050	ceph_decode_timespec(&ctime, &grant->ctime);
				3051	ceph_fill_file_time(inode, issued,
				3052	le32_to_cpu(grant->time_warp_seq),
				3053	&ctime, &mtime, &atime);
				3054	}
				3055
				3056	if (newcaps & (CEPH_CAP_ANY_FILE_RD \| CEPH_CAP_ANY_FILE_WR)) {
				3057	/* file layout may have changed */
				3058	s64 old_pool = ci->i_layout.pool_id;
				3059	struct ceph_string *old_ns;
				3060
				3061	ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
				3062	old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
				3063	lockdep_is_held(&ci->i_ceph_lock));
				3064	rcu_assign_pointer(ci->i_layout.pool_ns, *pns);
				3065
				3066	if (ci->i_layout.pool_id != old_pool \|\| *pns != old_ns)
				3067	ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
				3068
				3069	*pns = old_ns;
				3070
				3071	/* size/truncate_seq? */
				3072	queue_trunc = ceph_fill_file_size(inode, issued,
				3073	le32_to_cpu(grant->truncate_seq),
				3074	le64_to_cpu(grant->truncate_size),
				3075	size);
				3076	}
				3077
				3078	if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
				3079	if (max_size != ci->i_max_size) {
				3080	dout("max_size %lld -> %llu\n",
				3081	ci->i_max_size, max_size);
				3082	ci->i_max_size = max_size;
				3083	if (max_size >= ci->i_wanted_max_size) {
				3084	ci->i_wanted_max_size = 0; /* reset */
				3085	ci->i_requested_max_size = 0;
				3086	}
				3087	wake = true;
				3088	} else if (ci->i_wanted_max_size > ci->i_max_size &&
				3089	ci->i_wanted_max_size > ci->i_requested_max_size) {
				3090	/* CEPH_CAP_OP_IMPORT */
				3091	wake = true;
				3092	}
				3093	}
				3094
				3095	/* check cap bits */
				3096	wanted = __ceph_caps_wanted(ci);
				3097	used = __ceph_caps_used(ci);
				3098	dirty = __ceph_caps_dirty(ci);
				3099	dout(" my wanted = %s, used = %s, dirty %s\n",
				3100	ceph_cap_string(wanted),
				3101	ceph_cap_string(used),
				3102	ceph_cap_string(dirty));
				3103	if (wanted != le32_to_cpu(grant->wanted)) {
				3104	dout("mds wanted %s -> %s\n",
				3105	ceph_cap_string(le32_to_cpu(grant->wanted)),
				3106	ceph_cap_string(wanted));
				3107	/* imported cap may not have correct mds_wanted */
				3108	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
				3109	check_caps = 1;
				3110	}
				3111
				3112	/* revocation, grant, or no-op? */
				3113	if (cap->issued & ~newcaps) {
				3114	int revoking = cap->issued & ~newcaps;
				3115
				3116	dout("revocation: %s -> %s (revoking %s)\n",
				3117	ceph_cap_string(cap->issued),
				3118	ceph_cap_string(newcaps),
				3119	ceph_cap_string(revoking));
				3120	if (revoking & used & CEPH_CAP_FILE_BUFFER)
				3121	writeback = true; /* initiate writeback; will delay ack */
				3122	else if (revoking == CEPH_CAP_FILE_CACHE &&
				3123	(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
				3124	queue_invalidate)
				3125	; /* do nothing yet, invalidation will be queued */
				3126	else if (cap == ci->i_auth_cap)
				3127	check_caps = 1; /* check auth cap only */
				3128	else
				3129	check_caps = 2; /* check all caps */
				3130	cap->issued = newcaps;
				3131	cap->implemented \|= newcaps;
				3132	} else if (cap->issued == newcaps) {
				3133	dout("caps unchanged: %s -> %s\n",
				3134	ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
				3135	} else {
				3136	dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
				3137	ceph_cap_string(newcaps));
				3138	/* non-auth MDS is revoking the newly grant caps ? */
				3139	if (cap == ci->i_auth_cap &&
				3140	__ceph_caps_revoking_other(ci, cap, newcaps))
				3141	check_caps = 2;
				3142
				3143	cap->issued = newcaps;
				3144	cap->implemented \|= newcaps; /* add bits only, to
				3145	* avoid stepping on a
				3146	* pending revocation */
				3147	wake = true;
				3148	}
				3149	BUG_ON(cap->issued & ~cap->implemented);
				3150
				3151	if (inline_version > 0 && inline_version >= ci->i_inline_version) {
				3152	ci->i_inline_version = inline_version;
				3153	if (ci->i_inline_version != CEPH_INLINE_NONE &&
				3154	(newcaps & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)))
				3155	fill_inline = true;
				3156	}
				3157
				3158	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
				3159	if (newcaps & ~issued)
				3160	wake = true;
				3161	kick_flushing_inode_caps(mdsc, session, inode);
				3162	up_read(&mdsc->snap_rwsem);
				3163	} else {
				3164	spin_unlock(&ci->i_ceph_lock);
				3165	}
				3166
				3167	if (fill_inline)
				3168	ceph_fill_inline_data(inode, NULL, inline_data, inline_len);
				3169
				3170	if (queue_trunc)
				3171	ceph_queue_vmtruncate(inode);
				3172
				3173	if (writeback)
				3174	/*
				3175	* queue inode for writeback: we can't actually call
				3176	* filemap_write_and_wait, etc. from message handler
				3177	* context.
				3178	*/
				3179	ceph_queue_writeback(inode);
				3180	if (queue_invalidate)
				3181	ceph_queue_invalidate(inode);
				3182	if (deleted_inode)
				3183	invalidate_aliases(inode);
				3184	if (wake)
				3185	wake_up_all(&ci->i_cap_wq);
				3186
				3187	if (check_caps == 1)
				3188	ceph_check_caps(ci, CHECK_CAPS_NODELAY\|CHECK_CAPS_AUTHONLY,
				3189	session);
				3190	else if (check_caps == 2)
				3191	ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
				3192	else
				3193	mutex_unlock(&session->s_mutex);
				3194	}
				3195
				3196	/*
				3197	* Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
				3198	* MDS has been safely committed.
				3199	*/
				3200	static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
				3201	struct ceph_mds_caps *m,
				3202	struct ceph_mds_session *session,
				3203	struct ceph_cap *cap)
				3204	__releases(ci->i_ceph_lock)
				3205	{
				3206	struct ceph_inode_info *ci = ceph_inode(inode);
				3207	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
				3208	struct ceph_cap_flush cf, tmp_cf;
				3209	LIST_HEAD(to_remove);
				3210	unsigned seq = le32_to_cpu(m->seq);
				3211	int dirty = le32_to_cpu(m->dirty);
				3212	int cleaned = 0;
				3213	bool drop = false;
				3214	bool wake_ci = 0;
				3215	bool wake_mdsc = 0;
				3216
				3217	list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
				3218	if (cf->tid == flush_tid)
				3219	cleaned = cf->caps;
				3220	if (cf->caps == 0) /* capsnap */
				3221	continue;
				3222	if (cf->tid <= flush_tid) {
				3223	if (__finish_cap_flush(NULL, ci, cf))
				3224	wake_ci = true;
				3225	list_add_tail(&cf->i_list, &to_remove);
				3226	} else {
				3227	cleaned &= ~cf->caps;
				3228	if (!cleaned)
				3229	break;
				3230	}
				3231	}
				3232
				3233	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
				3234	" flushing %s -> %s\n",
				3235	inode, session->s_mds, seq, ceph_cap_string(dirty),
				3236	ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
				3237	ceph_cap_string(ci->i_flushing_caps & ~cleaned));
				3238
				3239	if (list_empty(&to_remove) && !cleaned)
				3240	goto out;
				3241
				3242	ci->i_flushing_caps &= ~cleaned;
				3243
				3244	spin_lock(&mdsc->cap_dirty_lock);
				3245
				3246	list_for_each_entry(cf, &to_remove, i_list) {
				3247	if (__finish_cap_flush(mdsc, NULL, cf))
				3248	wake_mdsc = true;
				3249	}
				3250
				3251	if (ci->i_flushing_caps == 0) {
				3252	if (list_empty(&ci->i_cap_flush_list)) {
				3253	list_del_init(&ci->i_flushing_item);
				3254	if (!list_empty(&session->s_cap_flushing)) {
				3255	dout(" mds%d still flushing cap on %p\n",
				3256	session->s_mds,
				3257	&list_first_entry(&session->s_cap_flushing,
				3258	struct ceph_inode_info,
				3259	i_flushing_item)->vfs_inode);
				3260	}
				3261	}
				3262	mdsc->num_cap_flushing--;
				3263	dout(" inode %p now !flushing\n", inode);
				3264
				3265	if (ci->i_dirty_caps == 0) {
				3266	dout(" inode %p now clean\n", inode);
				3267	BUG_ON(!list_empty(&ci->i_dirty_item));
				3268	drop = true;
				3269	if (ci->i_wr_ref == 0 &&
				3270	ci->i_wrbuffer_ref_head == 0) {
				3271	BUG_ON(!ci->i_head_snapc);
				3272	ceph_put_snap_context(ci->i_head_snapc);
				3273	ci->i_head_snapc = NULL;
				3274	}
				3275	} else {
				3276	BUG_ON(list_empty(&ci->i_dirty_item));
				3277	}
				3278	}
				3279	spin_unlock(&mdsc->cap_dirty_lock);
				3280
				3281	out:
				3282	spin_unlock(&ci->i_ceph_lock);
				3283
				3284	while (!list_empty(&to_remove)) {
				3285	cf = list_first_entry(&to_remove,
				3286	struct ceph_cap_flush, i_list);
				3287	list_del(&cf->i_list);
				3288	ceph_free_cap_flush(cf);
				3289	}
				3290
				3291	if (wake_ci)
				3292	wake_up_all(&ci->i_cap_wq);
				3293	if (wake_mdsc)
				3294	wake_up_all(&mdsc->cap_flushing_wq);
				3295	if (drop)
				3296	iput(inode);
				3297	}
				3298
				3299	/*
				3300	* Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
				3301	* throw away our cap_snap.
				3302	*
				3303	* Caller hold s_mutex.
				3304	*/
				3305	static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
				3306	struct ceph_mds_caps *m,
				3307	struct ceph_mds_session *session)
				3308	{
				3309	struct ceph_inode_info *ci = ceph_inode(inode);
				3310	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
				3311	u64 follows = le64_to_cpu(m->snap_follows);
				3312	struct ceph_cap_snap *capsnap;
				3313	bool flushed = false;
				3314	bool wake_ci = false;
				3315	bool wake_mdsc = false;
				3316
				3317	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
				3318	inode, ci, session->s_mds, follows);
				3319
				3320	spin_lock(&ci->i_ceph_lock);
				3321	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				3322	if (capsnap->follows == follows) {
				3323	if (capsnap->cap_flush.tid != flush_tid) {
				3324	dout(" cap_snap %p follows %lld tid %lld !="
				3325	" %lld\n", capsnap, follows,
				3326	flush_tid, capsnap->cap_flush.tid);
				3327	break;
				3328	}
				3329	flushed = true;
				3330	break;
				3331	} else {
				3332	dout(" skipping cap_snap %p follows %lld\n",
				3333	capsnap, capsnap->follows);
				3334	}
				3335	}
				3336	if (flushed) {
				3337	WARN_ON(capsnap->dirty_pages \|\| capsnap->writing);
				3338	dout(" removing %p cap_snap %p follows %lld\n",
				3339	inode, capsnap, follows);
				3340	list_del(&capsnap->ci_item);
				3341	if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
				3342	wake_ci = true;
				3343
				3344	spin_lock(&mdsc->cap_dirty_lock);
				3345
				3346	if (list_empty(&ci->i_cap_flush_list))
				3347	list_del_init(&ci->i_flushing_item);
				3348
				3349	if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
				3350	wake_mdsc = true;
				3351
				3352	spin_unlock(&mdsc->cap_dirty_lock);
				3353	}
				3354	spin_unlock(&ci->i_ceph_lock);
				3355	if (flushed) {
				3356	ceph_put_snap_context(capsnap->context);
				3357	ceph_put_cap_snap(capsnap);
				3358	if (wake_ci)
				3359	wake_up_all(&ci->i_cap_wq);
				3360	if (wake_mdsc)
				3361	wake_up_all(&mdsc->cap_flushing_wq);
				3362	iput(inode);
				3363	}
				3364	}
				3365
				3366	/*
				3367	* Handle TRUNC from MDS, indicating file truncation.
				3368	*
				3369	* caller hold s_mutex.
				3370	*/
				3371	static void handle_cap_trunc(struct inode *inode,
				3372	struct ceph_mds_caps *trunc,
				3373	struct ceph_mds_session *session)
				3374	__releases(ci->i_ceph_lock)
				3375	{
				3376	struct ceph_inode_info *ci = ceph_inode(inode);
				3377	int mds = session->s_mds;
				3378	int seq = le32_to_cpu(trunc->seq);
				3379	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
				3380	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
				3381	u64 size = le64_to_cpu(trunc->size);
				3382	int implemented = 0;
				3383	int dirty = __ceph_caps_dirty(ci);
				3384	int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
				3385	int queue_trunc = 0;
				3386
				3387	issued \|= implemented \| dirty;
				3388
				3389	dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
				3390	inode, mds, seq, truncate_size, truncate_seq);
				3391	queue_trunc = ceph_fill_file_size(inode, issued,
				3392	truncate_seq, truncate_size, size);
				3393	spin_unlock(&ci->i_ceph_lock);
				3394
				3395	if (queue_trunc)
				3396	ceph_queue_vmtruncate(inode);
				3397	}
				3398
				3399	/*
				3400	* Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
				3401	* different one. If we are the most recent migration we've seen (as
				3402	* indicated by mseq), make note of the migrating cap bits for the
				3403	* duration (until we see the corresponding IMPORT).
				3404	*
				3405	* caller holds s_mutex
				3406	*/
				3407	static void handle_cap_export(struct inode inode, struct ceph_mds_caps ex,
				3408	struct ceph_mds_cap_peer *ph,
				3409	struct ceph_mds_session *session)
				3410	{
				3411	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
				3412	struct ceph_mds_session *tsession = NULL;
				3413	struct ceph_cap cap, tcap, *new_cap = NULL;
				3414	struct ceph_inode_info *ci = ceph_inode(inode);
				3415	u64 t_cap_id;
				3416	unsigned mseq = le32_to_cpu(ex->migrate_seq);
				3417	unsigned t_seq, t_mseq;
				3418	int target, issued;
				3419	int mds = session->s_mds;
				3420
				3421	if (ph) {
				3422	t_cap_id = le64_to_cpu(ph->cap_id);
				3423	t_seq = le32_to_cpu(ph->seq);
				3424	t_mseq = le32_to_cpu(ph->mseq);
				3425	target = le32_to_cpu(ph->mds);
				3426	} else {
				3427	t_cap_id = t_seq = t_mseq = 0;
				3428	target = -1;
				3429	}
				3430
				3431	dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
				3432	inode, ci, mds, mseq, target);
				3433	retry:
				3434	spin_lock(&ci->i_ceph_lock);
				3435	cap = __get_cap_for_mds(ci, mds);
				3436	if (!cap \|\| cap->cap_id != le64_to_cpu(ex->cap_id))
				3437	goto out_unlock;
				3438
				3439	if (target < 0) {
				3440	__ceph_remove_cap(cap, false);
				3441	if (!ci->i_auth_cap)
				3442	ci->i_ceph_flags \|= CEPH_I_CAP_DROPPED;
				3443	goto out_unlock;
				3444	}
				3445
				3446	/*
				3447	* now we know we haven't received the cap import message yet
				3448	* because the exported cap still exist.
				3449	*/
				3450
				3451	issued = cap->issued;
				3452	WARN_ON(issued != cap->implemented);
				3453
				3454	tcap = __get_cap_for_mds(ci, target);
				3455	if (tcap) {
				3456	/* already have caps from the target */
				3457	if (tcap->cap_id == t_cap_id &&
				3458	ceph_seq_cmp(tcap->seq, t_seq) < 0) {
				3459	dout(" updating import cap %p mds%d\n", tcap, target);
				3460	tcap->cap_id = t_cap_id;
				3461	tcap->seq = t_seq - 1;
				3462	tcap->issue_seq = t_seq - 1;
				3463	tcap->issued \|= issued;
				3464	tcap->implemented \|= issued;
				3465	if (cap == ci->i_auth_cap)
				3466	ci->i_auth_cap = tcap;
				3467
				3468	if (!list_empty(&ci->i_cap_flush_list) &&
				3469	ci->i_auth_cap == tcap) {
				3470	spin_lock(&mdsc->cap_dirty_lock);
				3471	list_move_tail(&ci->i_flushing_item,
				3472	&tcap->session->s_cap_flushing);
				3473	spin_unlock(&mdsc->cap_dirty_lock);
				3474	}
				3475	}
				3476	__ceph_remove_cap(cap, false);
				3477	goto out_unlock;
				3478	} else if (tsession) {
				3479	/* add placeholder for the export tagert */
				3480	int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
				3481	tcap = new_cap;
				3482	ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
				3483	t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
				3484
				3485	if (!list_empty(&ci->i_cap_flush_list) &&
				3486	ci->i_auth_cap == tcap) {
				3487	spin_lock(&mdsc->cap_dirty_lock);
				3488	list_move_tail(&ci->i_flushing_item,
				3489	&tcap->session->s_cap_flushing);
				3490	spin_unlock(&mdsc->cap_dirty_lock);
				3491	}
				3492
				3493	__ceph_remove_cap(cap, false);
				3494	goto out_unlock;
				3495	}
				3496
				3497	spin_unlock(&ci->i_ceph_lock);
				3498	mutex_unlock(&session->s_mutex);
				3499
				3500	/* open target session */
				3501	tsession = ceph_mdsc_open_export_target_session(mdsc, target);
				3502	if (!IS_ERR(tsession)) {
				3503	if (mds > target) {
				3504	mutex_lock(&session->s_mutex);
				3505	mutex_lock_nested(&tsession->s_mutex,
				3506	SINGLE_DEPTH_NESTING);
				3507	} else {
				3508	mutex_lock(&tsession->s_mutex);
				3509	mutex_lock_nested(&session->s_mutex,
				3510	SINGLE_DEPTH_NESTING);
				3511	}
				3512	new_cap = ceph_get_cap(mdsc, NULL);
				3513	} else {
				3514	WARN_ON(1);
				3515	tsession = NULL;
				3516	target = -1;
				3517	mutex_lock(&session->s_mutex);
				3518	}
				3519	goto retry;
				3520
				3521	out_unlock:
				3522	spin_unlock(&ci->i_ceph_lock);
				3523	mutex_unlock(&session->s_mutex);
				3524	if (tsession) {
				3525	mutex_unlock(&tsession->s_mutex);
				3526	ceph_put_mds_session(tsession);
				3527	}
				3528	if (new_cap)
				3529	ceph_put_cap(mdsc, new_cap);
				3530	}
				3531
				3532	/*
				3533	* Handle cap IMPORT.
				3534	*
				3535	* caller holds s_mutex. acquires i_ceph_lock
				3536	*/
				3537	static void handle_cap_import(struct ceph_mds_client *mdsc,
				3538	struct inode inode, struct ceph_mds_caps im,
				3539	struct ceph_mds_cap_peer *ph,
				3540	struct ceph_mds_session *session,
				3541	struct ceph_cap *target_cap, int old_issued)
				3542	__acquires(ci->i_ceph_lock)
				3543	{
				3544	struct ceph_inode_info *ci = ceph_inode(inode);
				3545	struct ceph_cap cap, ocap, *new_cap = NULL;
				3546	int mds = session->s_mds;
				3547	int issued;
				3548	unsigned caps = le32_to_cpu(im->caps);
				3549	unsigned wanted = le32_to_cpu(im->wanted);
				3550	unsigned seq = le32_to_cpu(im->seq);
				3551	unsigned mseq = le32_to_cpu(im->migrate_seq);
				3552	u64 realmino = le64_to_cpu(im->realm);
				3553	u64 cap_id = le64_to_cpu(im->cap_id);
				3554	u64 p_cap_id;
				3555	int peer;
				3556
				3557	if (ph) {
				3558	p_cap_id = le64_to_cpu(ph->cap_id);
				3559	peer = le32_to_cpu(ph->mds);
				3560	} else {
				3561	p_cap_id = 0;
				3562	peer = -1;
				3563	}
				3564
				3565	dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
				3566	inode, ci, mds, mseq, peer);
				3567
				3568	retry:
				3569	spin_lock(&ci->i_ceph_lock);
				3570	cap = __get_cap_for_mds(ci, mds);
				3571	if (!cap) {
				3572	if (!new_cap) {
				3573	spin_unlock(&ci->i_ceph_lock);
				3574	new_cap = ceph_get_cap(mdsc, NULL);
				3575	goto retry;
				3576	}
				3577	cap = new_cap;
				3578	} else {
				3579	if (new_cap) {
				3580	ceph_put_cap(mdsc, new_cap);
				3581	new_cap = NULL;
				3582	}
				3583	}
				3584
				3585	__ceph_caps_issued(ci, &issued);
				3586	issued \|= __ceph_caps_dirty(ci);
				3587
				3588	ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
				3589	realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
				3590
				3591	ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
				3592	if (ocap && ocap->cap_id == p_cap_id) {
				3593	dout(" remove export cap %p mds%d flags %d\n",
				3594	ocap, peer, ph->flags);
				3595	if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
				3596	(ocap->seq != le32_to_cpu(ph->seq) \|\|
				3597	ocap->mseq != le32_to_cpu(ph->mseq))) {
				3598	pr_err("handle_cap_import: mismatched seq/mseq: "
				3599	"ino (%llx.%llx) mds%d seq %d mseq %d "
				3600	"importer mds%d has peer seq %d mseq %d\n",
				3601	ceph_vinop(inode), peer, ocap->seq,
				3602	ocap->mseq, mds, le32_to_cpu(ph->seq),
				3603	le32_to_cpu(ph->mseq));
				3604	}
				3605	__ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
				3606	}
				3607
				3608	/* make sure we re-request max_size, if necessary */
				3609	ci->i_requested_max_size = 0;
				3610
				3611	*old_issued = issued;
				3612	*target_cap = cap;
				3613	}
				3614
				3615	/*
				3616	* Handle a caps message from the MDS.
				3617	*
				3618	* Identify the appropriate session, inode, and call the right handler
				3619	* based on the cap op.
				3620	*/
				3621	void ceph_handle_caps(struct ceph_mds_session *session,
				3622	struct ceph_msg *msg)
				3623	{
				3624	struct ceph_mds_client *mdsc = session->s_mdsc;
				3625	struct super_block *sb = mdsc->fsc->sb;
				3626	struct inode *inode;
				3627	struct ceph_inode_info *ci;
				3628	struct ceph_cap *cap;
				3629	struct ceph_mds_caps *h;
				3630	struct ceph_mds_cap_peer *peer = NULL;
				3631	struct ceph_snap_realm *realm = NULL;
				3632	struct ceph_string *pool_ns = NULL;
				3633	int mds = session->s_mds;
				3634	int op, issued;
				3635	u32 seq, mseq;
				3636	struct ceph_vino vino;
				3637	u64 tid;
				3638	u64 inline_version = 0;
				3639	void *inline_data = NULL;
				3640	u32 inline_len = 0;
				3641	void *snaptrace;
				3642	size_t snaptrace_len;
				3643	void p, end;
				3644
				3645	dout("handle_caps from mds%d\n", mds);
				3646
				3647	/* decode */
				3648	end = msg->front.iov_base + msg->front.iov_len;
				3649	tid = le64_to_cpu(msg->hdr.tid);
				3650	if (msg->front.iov_len < sizeof(*h))
				3651	goto bad;
				3652	h = msg->front.iov_base;
				3653	op = le32_to_cpu(h->op);
				3654	vino.ino = le64_to_cpu(h->ino);
				3655	vino.snap = CEPH_NOSNAP;
				3656	seq = le32_to_cpu(h->seq);
				3657	mseq = le32_to_cpu(h->migrate_seq);
				3658
				3659	snaptrace = h + 1;
				3660	snaptrace_len = le32_to_cpu(h->snap_trace_len);
				3661	p = snaptrace + snaptrace_len;
				3662
				3663	if (le16_to_cpu(msg->hdr.version) >= 2) {
				3664	u32 flock_len;
				3665	ceph_decode_32_safe(&p, end, flock_len, bad);
				3666	if (p + flock_len > end)
				3667	goto bad;
				3668	p += flock_len;
				3669	}
				3670
				3671	if (le16_to_cpu(msg->hdr.version) >= 3) {
				3672	if (op == CEPH_CAP_OP_IMPORT) {
				3673	if (p + sizeof(*peer) > end)
				3674	goto bad;
				3675	peer = p;
				3676	p += sizeof(*peer);
				3677	} else if (op == CEPH_CAP_OP_EXPORT) {
				3678	/* recorded in unused fields */
				3679	peer = (void *)&h->size;
				3680	}
				3681	}
				3682
				3683	if (le16_to_cpu(msg->hdr.version) >= 4) {
				3684	ceph_decode_64_safe(&p, end, inline_version, bad);
				3685	ceph_decode_32_safe(&p, end, inline_len, bad);
				3686	if (p + inline_len > end)
				3687	goto bad;
				3688	inline_data = p;
				3689	p += inline_len;
				3690	}
				3691
				3692	if (le16_to_cpu(msg->hdr.version) >= 5) {
				3693	struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
				3694	u32 epoch_barrier;
				3695
				3696	ceph_decode_32_safe(&p, end, epoch_barrier, bad);
				3697	ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
				3698	}
				3699
				3700	if (le16_to_cpu(msg->hdr.version) >= 8) {
				3701	u64 flush_tid;
				3702	u32 caller_uid, caller_gid;
				3703	u32 pool_ns_len;
				3704
				3705	/* version >= 6 */
				3706	ceph_decode_64_safe(&p, end, flush_tid, bad);
				3707	/* version >= 7 */
				3708	ceph_decode_32_safe(&p, end, caller_uid, bad);
				3709	ceph_decode_32_safe(&p, end, caller_gid, bad);
				3710	/* version >= 8 */
				3711	ceph_decode_32_safe(&p, end, pool_ns_len, bad);
				3712	if (pool_ns_len > 0) {
				3713	ceph_decode_need(&p, end, pool_ns_len, bad);
				3714	pool_ns = ceph_find_or_create_string(p, pool_ns_len);
				3715	p += pool_ns_len;
				3716	}
				3717	}
				3718
				3719	/* lookup ino */
				3720	inode = ceph_find_inode(sb, vino);
				3721	ci = ceph_inode(inode);
				3722	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
				3723	vino.snap, inode);
				3724
				3725	mutex_lock(&session->s_mutex);
				3726	session->s_seq++;
				3727	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
				3728	(unsigned)seq);
				3729
				3730	if (!inode) {
				3731	dout(" i don't have ino %llx\n", vino.ino);
				3732
				3733	if (op == CEPH_CAP_OP_IMPORT) {
				3734	cap = ceph_get_cap(mdsc, NULL);
				3735	cap->cap_ino = vino.ino;
				3736	cap->queue_release = 1;
				3737	cap->cap_id = le64_to_cpu(h->cap_id);
				3738	cap->mseq = mseq;
				3739	cap->seq = seq;
				3740	cap->issue_seq = seq;
				3741	spin_lock(&session->s_cap_lock);
				3742	list_add_tail(&cap->session_caps,
				3743	&session->s_cap_releases);
				3744	session->s_num_cap_releases++;
				3745	spin_unlock(&session->s_cap_lock);
				3746	}
				3747	goto flush_cap_releases;
				3748	}
				3749
				3750	/* these will work even if we don't have a cap yet */
				3751	switch (op) {
				3752	case CEPH_CAP_OP_FLUSHSNAP_ACK:
				3753	handle_cap_flushsnap_ack(inode, tid, h, session);
				3754	goto done;
				3755
				3756	case CEPH_CAP_OP_EXPORT:
				3757	handle_cap_export(inode, h, peer, session);
				3758	goto done_unlocked;
				3759
				3760	case CEPH_CAP_OP_IMPORT:
				3761	realm = NULL;
				3762	if (snaptrace_len) {
				3763	down_write(&mdsc->snap_rwsem);
				3764	ceph_update_snap_trace(mdsc, snaptrace,
				3765	snaptrace + snaptrace_len,
				3766	false, &realm);
				3767	downgrade_write(&mdsc->snap_rwsem);
				3768	} else {
				3769	down_read(&mdsc->snap_rwsem);
				3770	}
				3771	handle_cap_import(mdsc, inode, h, peer, session,
				3772	&cap, &issued);
				3773	handle_cap_grant(mdsc, inode, h, &pool_ns,
				3774	inline_version, inline_data, inline_len,
				3775	msg->middle, session, cap, issued);
				3776	if (realm)
				3777	ceph_put_snap_realm(mdsc, realm);
				3778	goto done_unlocked;
				3779	}
				3780
				3781	/* the rest require a cap */
				3782	spin_lock(&ci->i_ceph_lock);
				3783	cap = __get_cap_for_mds(ceph_inode(inode), mds);
				3784	if (!cap) {
				3785	dout(" no cap on %p ino %llx.%llx from mds%d\n",
				3786	inode, ceph_ino(inode), ceph_snap(inode), mds);
				3787	spin_unlock(&ci->i_ceph_lock);
				3788	goto flush_cap_releases;
				3789	}
				3790
				3791	/* note that each of these drops i_ceph_lock for us */
				3792	switch (op) {
				3793	case CEPH_CAP_OP_REVOKE:
				3794	case CEPH_CAP_OP_GRANT:
				3795	__ceph_caps_issued(ci, &issued);
				3796	issued \|= __ceph_caps_dirty(ci);
				3797	handle_cap_grant(mdsc, inode, h, &pool_ns,
				3798	inline_version, inline_data, inline_len,
				3799	msg->middle, session, cap, issued);
				3800	goto done_unlocked;
				3801
				3802	case CEPH_CAP_OP_FLUSH_ACK:
				3803	handle_cap_flush_ack(inode, tid, h, session, cap);
				3804	break;
				3805
				3806	case CEPH_CAP_OP_TRUNC:
				3807	handle_cap_trunc(inode, h, session);
				3808	break;
				3809
				3810	default:
				3811	spin_unlock(&ci->i_ceph_lock);
				3812	pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
				3813	ceph_cap_op_name(op));
				3814	}
				3815
				3816	goto done;
				3817
				3818	flush_cap_releases:
				3819	/*
				3820	* send any cap release message to try to move things
				3821	* along for the mds (who clearly thinks we still have this
				3822	* cap).
				3823	*/
				3824	ceph_send_cap_releases(mdsc, session);
				3825
				3826	done:
				3827	mutex_unlock(&session->s_mutex);
				3828	done_unlocked:
				3829	iput(inode);
				3830	ceph_put_string(pool_ns);
				3831	return;
				3832
				3833	bad:
				3834	pr_err("ceph_handle_caps: corrupt message\n");
				3835	ceph_msg_dump(msg);
				3836	return;
				3837	}
				3838
				3839	/*
				3840	* Delayed work handler to process end of delayed cap release LRU list.
				3841	*/
				3842	void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
				3843	{
				3844	struct inode *inode;
				3845	struct ceph_inode_info *ci;
				3846	int flags = CHECK_CAPS_NODELAY;
				3847
				3848	dout("check_delayed_caps\n");
				3849	while (1) {
				3850	spin_lock(&mdsc->cap_delay_lock);
				3851	if (list_empty(&mdsc->cap_delay_list))
				3852	break;
				3853	ci = list_first_entry(&mdsc->cap_delay_list,
				3854	struct ceph_inode_info,
				3855	i_cap_delay_list);
				3856	if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
				3857	time_before(jiffies, ci->i_hold_caps_max))
				3858	break;
				3859	list_del_init(&ci->i_cap_delay_list);
				3860
				3861	inode = igrab(&ci->vfs_inode);
				3862	spin_unlock(&mdsc->cap_delay_lock);
				3863
				3864	if (inode) {
				3865	dout("check_delayed_caps on %p\n", inode);
				3866	ceph_check_caps(ci, flags, NULL);
				3867	iput(inode);
				3868	}
				3869	}
				3870	spin_unlock(&mdsc->cap_delay_lock);
				3871	}
				3872
				3873	/*
				3874	* Flush all dirty caps to the mds
				3875	*/
				3876	void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
				3877	{
				3878	struct ceph_inode_info *ci;
				3879	struct inode *inode;
				3880
				3881	dout("flush_dirty_caps\n");
				3882	spin_lock(&mdsc->cap_dirty_lock);
				3883	while (!list_empty(&mdsc->cap_dirty)) {
				3884	ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
				3885	i_dirty_item);
				3886	inode = &ci->vfs_inode;
				3887	ihold(inode);
				3888	dout("flush_dirty_caps %p\n", inode);
				3889	spin_unlock(&mdsc->cap_dirty_lock);
				3890	ceph_check_caps(ci, CHECK_CAPS_NODELAY\|CHECK_CAPS_FLUSH, NULL);
				3891	iput(inode);
				3892	spin_lock(&mdsc->cap_dirty_lock);
				3893	}
				3894	spin_unlock(&mdsc->cap_dirty_lock);
				3895	dout("flush_dirty_caps done\n");
				3896	}
				3897
				3898	void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
				3899	{
				3900	int i;
				3901	int bits = (fmode << 1) \| 1;
				3902	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
				3903	if (bits & (1 << i))
				3904	ci->i_nr_by_mode[i]++;
				3905	}
				3906	}
				3907
				3908	/*
				3909	* Drop open file reference. If we were the last open file,
				3910	* we may need to release capabilities to the MDS (or schedule
				3911	* their delayed release).
				3912	*/
				3913	void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
				3914	{
				3915	int i, last = 0;
				3916	int bits = (fmode << 1) \| 1;
				3917	spin_lock(&ci->i_ceph_lock);
				3918	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
				3919	if (bits & (1 << i)) {
				3920	BUG_ON(ci->i_nr_by_mode[i] == 0);
				3921	if (--ci->i_nr_by_mode[i] == 0)
				3922	last++;
				3923	}
				3924	}
				3925	dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
				3926	&ci->vfs_inode, fmode,
				3927	ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
				3928	ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
				3929	spin_unlock(&ci->i_ceph_lock);
				3930
				3931	if (last && ci->i_vino.snap == CEPH_NOSNAP)
				3932	ceph_check_caps(ci, 0, NULL);
				3933	}
				3934
				3935	/*
				3936	* Helpers for embedding cap and dentry lease releases into mds
				3937	* requests.
				3938	*
				3939	* @force is used by dentry_release (below) to force inclusion of a
				3940	* record for the directory inode, even when there aren't any caps to
				3941	* drop.
				3942	*/
				3943	int ceph_encode_inode_release(void *p, struct inode inode,
				3944	int mds, int drop, int unless, int force)
				3945	{
				3946	struct ceph_inode_info *ci = ceph_inode(inode);
				3947	struct ceph_cap *cap;
				3948	struct ceph_mds_request_release rel = p;
				3949	int used, dirty;
				3950	int ret = 0;
				3951
				3952	spin_lock(&ci->i_ceph_lock);
				3953	used = __ceph_caps_used(ci);
				3954	dirty = __ceph_caps_dirty(ci);
				3955
				3956	dout("encode_inode_release %p mds%d used\|dirty %s drop %s unless %s\n",
				3957	inode, mds, ceph_cap_string(used\|dirty), ceph_cap_string(drop),
				3958	ceph_cap_string(unless));
				3959
				3960	/* only drop unused, clean caps */
				3961	drop &= ~(used \| dirty);
				3962
				3963	cap = __get_cap_for_mds(ci, mds);
				3964	if (cap && __cap_is_valid(cap)) {
				3965	if (force \|\|
				3966	((cap->issued & drop) &&
				3967	(cap->issued & unless) == 0)) {
				3968	if ((cap->issued & drop) &&
				3969	(cap->issued & unless) == 0) {
				3970	int wanted = __ceph_caps_wanted(ci);
				3971	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
				3972	wanted \|= cap->mds_wanted;
				3973	dout("encode_inode_release %p cap %p "
				3974	"%s -> %s, wanted %s -> %s\n", inode, cap,
				3975	ceph_cap_string(cap->issued),
				3976	ceph_cap_string(cap->issued & ~drop),
				3977	ceph_cap_string(cap->mds_wanted),
				3978	ceph_cap_string(wanted));
				3979
				3980	cap->issued &= ~drop;
				3981	cap->implemented &= ~drop;
				3982	cap->mds_wanted = wanted;
				3983	} else {
				3984	dout("encode_inode_release %p cap %p %s"
				3985	" (force)\n", inode, cap,
				3986	ceph_cap_string(cap->issued));
				3987	}
				3988
				3989	rel->ino = cpu_to_le64(ceph_ino(inode));
				3990	rel->cap_id = cpu_to_le64(cap->cap_id);
				3991	rel->seq = cpu_to_le32(cap->seq);
				3992	rel->issue_seq = cpu_to_le32(cap->issue_seq);
				3993	rel->mseq = cpu_to_le32(cap->mseq);
				3994	rel->caps = cpu_to_le32(cap->implemented);
				3995	rel->wanted = cpu_to_le32(cap->mds_wanted);
				3996	rel->dname_len = 0;
				3997	rel->dname_seq = 0;
				3998	p += sizeof(rel);
				3999	ret = 1;
				4000	} else {
				4001	dout("encode_inode_release %p cap %p %s\n",
				4002	inode, cap, ceph_cap_string(cap->issued));
				4003	}
				4004	}
				4005	spin_unlock(&ci->i_ceph_lock);
				4006	return ret;
				4007	}
				4008
				4009	int ceph_encode_dentry_release(void *p, struct dentry dentry,
				4010	struct inode *dir,
				4011	int mds, int drop, int unless)
				4012	{
				4013	struct dentry *parent = NULL;
				4014	struct ceph_mds_request_release rel = p;
				4015	struct ceph_dentry_info *di = ceph_dentry(dentry);
				4016	int force = 0;
				4017	int ret;
				4018
				4019	/*
				4020	* force an record for the directory caps if we have a dentry lease.
				4021	* this is racy (can't take i_ceph_lock and d_lock together), but it
				4022	* doesn't have to be perfect; the mds will revoke anything we don't
				4023	* release.
				4024	*/
				4025	spin_lock(&dentry->d_lock);
				4026	if (di->lease_session && di->lease_session->s_mds == mds)
				4027	force = 1;
				4028	if (!dir) {
				4029	parent = dget(dentry->d_parent);
				4030	dir = d_inode(parent);
				4031	}
				4032	spin_unlock(&dentry->d_lock);
				4033
				4034	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
				4035	dput(parent);
				4036
				4037	spin_lock(&dentry->d_lock);
				4038	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
				4039	dout("encode_dentry_release %p mds%d seq %d\n",
				4040	dentry, mds, (int)di->lease_seq);
				4041	rel->dname_len = cpu_to_le32(dentry->d_name.len);
				4042	memcpy(*p, dentry->d_name.name, dentry->d_name.len);
				4043	*p += dentry->d_name.len;
				4044	rel->dname_seq = cpu_to_le32(di->lease_seq);
				4045	__ceph_mdsc_drop_dentry_lease(dentry);
				4046	}
				4047	spin_unlock(&dentry->d_lock);
				4048	return ret;
				4049	}