Blame - src/kernel/linux/v4.19/fs/ceph/caps.c - T800

blob: 4c0b220e20bab10acc9922122733dc7fa88f2d17 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	#include <linux/ceph/ceph_debug.h>
				3
				4	#include <linux/fs.h>
				5	#include <linux/kernel.h>
				6	#include <linux/sched/signal.h>
				7	#include <linux/slab.h>
				8	#include <linux/vmalloc.h>
				9	#include <linux/wait.h>
				10	#include <linux/writeback.h>
				11
				12	#include "super.h"
				13	#include "mds_client.h"
				14	#include "cache.h"
				15	#include <linux/ceph/decode.h>
				16	#include <linux/ceph/messenger.h>
				17
				18	/*
				19	* Capability management
				20	*
				21	* The Ceph metadata servers control client access to inode metadata
				22	* and file data by issuing capabilities, granting clients permission
				23	* to read and/or write both inode field and file data to OSDs
				24	* (storage nodes). Each capability consists of a set of bits
				25	* indicating which operations are allowed.
				26	*
				27	* If the client holds a *_SHARED cap, the client has a coherent value
				28	* that can be safely read from the cached inode.
				29	*
				30	* In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
				31	* client is allowed to change inode attributes (e.g., file size,
				32	* mtime), note its dirty state in the ceph_cap, and asynchronously
				33	* flush that metadata change to the MDS.
				34	*
				35	* In the event of a conflicting operation (perhaps by another
				36	* client), the MDS will revoke the conflicting client capabilities.
				37	*
				38	* In order for a client to cache an inode, it must hold a capability
				39	* with at least one MDS server. When inodes are released, release
				40	* notifications are batched and periodically sent en masse to the MDS
				41	* cluster to release server state.
				42	*/
				43
				44	static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
				45	static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
				46	struct ceph_mds_session *session,
				47	struct ceph_inode_info *ci,
				48	u64 oldest_flush_tid);
				49
				50	/*
				51	* Generate readable cap strings for debugging output.
				52	*/
				53	#define MAX_CAP_STR 20
				54	static char cap_str[MAX_CAP_STR][40];
				55	static DEFINE_SPINLOCK(cap_str_lock);
				56	static int last_cap_str;
				57
				58	static char gcap_string(char s, int c)
				59	{
				60	if (c & CEPH_CAP_GSHARED)
				61	*s++ = 's';
				62	if (c & CEPH_CAP_GEXCL)
				63	*s++ = 'x';
				64	if (c & CEPH_CAP_GCACHE)
				65	*s++ = 'c';
				66	if (c & CEPH_CAP_GRD)
				67	*s++ = 'r';
				68	if (c & CEPH_CAP_GWR)
				69	*s++ = 'w';
				70	if (c & CEPH_CAP_GBUFFER)
				71	*s++ = 'b';
				72	if (c & CEPH_CAP_GWREXTEND)
				73	*s++ = 'a';
				74	if (c & CEPH_CAP_GLAZYIO)
				75	*s++ = 'l';
				76	return s;
				77	}
				78
				79	const char *ceph_cap_string(int caps)
				80	{
				81	int i;
				82	char *s;
				83	int c;
				84
				85	spin_lock(&cap_str_lock);
				86	i = last_cap_str++;
				87	if (last_cap_str == MAX_CAP_STR)
				88	last_cap_str = 0;
				89	spin_unlock(&cap_str_lock);
				90
				91	s = cap_str[i];
				92
				93	if (caps & CEPH_CAP_PIN)
				94	*s++ = 'p';
				95
				96	c = (caps >> CEPH_CAP_SAUTH) & 3;
				97	if (c) {
				98	*s++ = 'A';
				99	s = gcap_string(s, c);
				100	}
				101
				102	c = (caps >> CEPH_CAP_SLINK) & 3;
				103	if (c) {
				104	*s++ = 'L';
				105	s = gcap_string(s, c);
				106	}
				107
				108	c = (caps >> CEPH_CAP_SXATTR) & 3;
				109	if (c) {
				110	*s++ = 'X';
				111	s = gcap_string(s, c);
				112	}
				113
				114	c = caps >> CEPH_CAP_SFILE;
				115	if (c) {
				116	*s++ = 'F';
				117	s = gcap_string(s, c);
				118	}
				119
				120	if (s == cap_str[i])
				121	*s++ = '-';
				122	*s = 0;
				123	return cap_str[i];
				124	}
				125
				126	void ceph_caps_init(struct ceph_mds_client *mdsc)
				127	{
				128	INIT_LIST_HEAD(&mdsc->caps_list);
				129	spin_lock_init(&mdsc->caps_list_lock);
				130	}
				131
				132	void ceph_caps_finalize(struct ceph_mds_client *mdsc)
				133	{
				134	struct ceph_cap *cap;
				135
				136	spin_lock(&mdsc->caps_list_lock);
				137	while (!list_empty(&mdsc->caps_list)) {
				138	cap = list_first_entry(&mdsc->caps_list,
				139	struct ceph_cap, caps_item);
				140	list_del(&cap->caps_item);
				141	kmem_cache_free(ceph_cap_cachep, cap);
				142	}
				143	mdsc->caps_total_count = 0;
				144	mdsc->caps_avail_count = 0;
				145	mdsc->caps_use_count = 0;
				146	mdsc->caps_reserve_count = 0;
				147	mdsc->caps_min_count = 0;
				148	spin_unlock(&mdsc->caps_list_lock);
				149	}
				150
				151	void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
				152	{
				153	spin_lock(&mdsc->caps_list_lock);
				154	mdsc->caps_min_count += delta;
				155	BUG_ON(mdsc->caps_min_count < 0);
				156	spin_unlock(&mdsc->caps_list_lock);
				157	}
				158
				159	static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
				160	{
				161	struct ceph_cap *cap;
				162	int i;
				163
				164	if (nr_caps) {
				165	BUG_ON(mdsc->caps_reserve_count < nr_caps);
				166	mdsc->caps_reserve_count -= nr_caps;
				167	if (mdsc->caps_avail_count >=
				168	mdsc->caps_reserve_count + mdsc->caps_min_count) {
				169	mdsc->caps_total_count -= nr_caps;
				170	for (i = 0; i < nr_caps; i++) {
				171	cap = list_first_entry(&mdsc->caps_list,
				172	struct ceph_cap, caps_item);
				173	list_del(&cap->caps_item);
				174	kmem_cache_free(ceph_cap_cachep, cap);
				175	}
				176	} else {
				177	mdsc->caps_avail_count += nr_caps;
				178	}
				179
				180	dout("%s: caps %d = %d used + %d resv + %d avail\n",
				181	__func__,
				182	mdsc->caps_total_count, mdsc->caps_use_count,
				183	mdsc->caps_reserve_count, mdsc->caps_avail_count);
				184	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
				185	mdsc->caps_reserve_count +
				186	mdsc->caps_avail_count);
				187	}
				188	}
				189
				190	/*
				191	* Called under mdsc->mutex.
				192	*/
				193	int ceph_reserve_caps(struct ceph_mds_client *mdsc,
				194	struct ceph_cap_reservation *ctx, int need)
				195	{
				196	int i, j;
				197	struct ceph_cap *cap;
				198	int have;
				199	int alloc = 0;
				200	int max_caps;
				201	int err = 0;
				202	bool trimmed = false;
				203	struct ceph_mds_session *s;
				204	LIST_HEAD(newcaps);
				205
				206	dout("reserve caps ctx=%p need=%d\n", ctx, need);
				207
				208	/* first reserve any caps that are already allocated */
				209	spin_lock(&mdsc->caps_list_lock);
				210	if (mdsc->caps_avail_count >= need)
				211	have = need;
				212	else
				213	have = mdsc->caps_avail_count;
				214	mdsc->caps_avail_count -= have;
				215	mdsc->caps_reserve_count += have;
				216	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
				217	mdsc->caps_reserve_count +
				218	mdsc->caps_avail_count);
				219	spin_unlock(&mdsc->caps_list_lock);
				220
				221	for (i = have; i < need; ) {
				222	cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
				223	if (cap) {
				224	list_add(&cap->caps_item, &newcaps);
				225	alloc++;
				226	i++;
				227	continue;
				228	}
				229
				230	if (!trimmed) {
				231	for (j = 0; j < mdsc->max_sessions; j++) {
				232	s = __ceph_lookup_mds_session(mdsc, j);
				233	if (!s)
				234	continue;
				235	mutex_unlock(&mdsc->mutex);
				236
				237	mutex_lock(&s->s_mutex);
				238	max_caps = s->s_nr_caps - (need - i);
				239	ceph_trim_caps(mdsc, s, max_caps);
				240	mutex_unlock(&s->s_mutex);
				241
				242	ceph_put_mds_session(s);
				243	mutex_lock(&mdsc->mutex);
				244	}
				245	trimmed = true;
				246
				247	spin_lock(&mdsc->caps_list_lock);
				248	if (mdsc->caps_avail_count) {
				249	int more_have;
				250	if (mdsc->caps_avail_count >= need - i)
				251	more_have = need - i;
				252	else
				253	more_have = mdsc->caps_avail_count;
				254
				255	i += more_have;
				256	have += more_have;
				257	mdsc->caps_avail_count -= more_have;
				258	mdsc->caps_reserve_count += more_have;
				259
				260	}
				261	spin_unlock(&mdsc->caps_list_lock);
				262
				263	continue;
				264	}
				265
				266	pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
				267	ctx, need, have + alloc);
				268	err = -ENOMEM;
				269	break;
				270	}
				271
				272	if (!err) {
				273	BUG_ON(have + alloc != need);
				274	ctx->count = need;
				275	}
				276
				277	spin_lock(&mdsc->caps_list_lock);
				278	mdsc->caps_total_count += alloc;
				279	mdsc->caps_reserve_count += alloc;
				280	list_splice(&newcaps, &mdsc->caps_list);
				281
				282	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
				283	mdsc->caps_reserve_count +
				284	mdsc->caps_avail_count);
				285
				286	if (err)
				287	__ceph_unreserve_caps(mdsc, have + alloc);
				288
				289	spin_unlock(&mdsc->caps_list_lock);
				290
				291	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
				292	ctx, mdsc->caps_total_count, mdsc->caps_use_count,
				293	mdsc->caps_reserve_count, mdsc->caps_avail_count);
				294	return err;
				295	}
				296
				297	void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
				298	struct ceph_cap_reservation *ctx)
				299	{
				300	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
				301	spin_lock(&mdsc->caps_list_lock);
				302	__ceph_unreserve_caps(mdsc, ctx->count);
				303	ctx->count = 0;
				304	spin_unlock(&mdsc->caps_list_lock);
				305	}
				306
				307	struct ceph_cap ceph_get_cap(struct ceph_mds_client mdsc,
				308	struct ceph_cap_reservation *ctx)
				309	{
				310	struct ceph_cap *cap = NULL;
				311
				312	/* temporary, until we do something about cap import/export */
				313	if (!ctx) {
				314	cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
				315	if (cap) {
				316	spin_lock(&mdsc->caps_list_lock);
				317	mdsc->caps_use_count++;
				318	mdsc->caps_total_count++;
				319	spin_unlock(&mdsc->caps_list_lock);
				320	} else {
				321	spin_lock(&mdsc->caps_list_lock);
				322	if (mdsc->caps_avail_count) {
				323	BUG_ON(list_empty(&mdsc->caps_list));
				324
				325	mdsc->caps_avail_count--;
				326	mdsc->caps_use_count++;
				327	cap = list_first_entry(&mdsc->caps_list,
				328	struct ceph_cap, caps_item);
				329	list_del(&cap->caps_item);
				330
				331	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
				332	mdsc->caps_reserve_count + mdsc->caps_avail_count);
				333	}
				334	spin_unlock(&mdsc->caps_list_lock);
				335	}
				336
				337	return cap;
				338	}
				339
				340	spin_lock(&mdsc->caps_list_lock);
				341	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
				342	ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
				343	mdsc->caps_reserve_count, mdsc->caps_avail_count);
				344	BUG_ON(!ctx->count);
				345	BUG_ON(ctx->count > mdsc->caps_reserve_count);
				346	BUG_ON(list_empty(&mdsc->caps_list));
				347
				348	ctx->count--;
				349	mdsc->caps_reserve_count--;
				350	mdsc->caps_use_count++;
				351
				352	cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
				353	list_del(&cap->caps_item);
				354
				355	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
				356	mdsc->caps_reserve_count + mdsc->caps_avail_count);
				357	spin_unlock(&mdsc->caps_list_lock);
				358	return cap;
				359	}
				360
				361	void ceph_put_cap(struct ceph_mds_client mdsc, struct ceph_cap cap)
				362	{
				363	spin_lock(&mdsc->caps_list_lock);
				364	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
				365	cap, mdsc->caps_total_count, mdsc->caps_use_count,
				366	mdsc->caps_reserve_count, mdsc->caps_avail_count);
				367	mdsc->caps_use_count--;
				368	/*
				369	* Keep some preallocated caps around (ceph_min_count), to
				370	* avoid lots of free/alloc churn.
				371	*/
				372	if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
				373	mdsc->caps_min_count) {
				374	mdsc->caps_total_count--;
				375	kmem_cache_free(ceph_cap_cachep, cap);
				376	} else {
				377	mdsc->caps_avail_count++;
				378	list_add(&cap->caps_item, &mdsc->caps_list);
				379	}
				380
				381	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
				382	mdsc->caps_reserve_count + mdsc->caps_avail_count);
				383	spin_unlock(&mdsc->caps_list_lock);
				384	}
				385
				386	void ceph_reservation_status(struct ceph_fs_client *fsc,
				387	int total, int avail, int used, int reserved,
				388	int *min)
				389	{
				390	struct ceph_mds_client *mdsc = fsc->mdsc;
				391
				392	spin_lock(&mdsc->caps_list_lock);
				393
				394	if (total)
				395	*total = mdsc->caps_total_count;
				396	if (avail)
				397	*avail = mdsc->caps_avail_count;
				398	if (used)
				399	*used = mdsc->caps_use_count;
				400	if (reserved)
				401	*reserved = mdsc->caps_reserve_count;
				402	if (min)
				403	*min = mdsc->caps_min_count;
				404
				405	spin_unlock(&mdsc->caps_list_lock);
				406	}
				407
				408	/*
				409	* Find ceph_cap for given mds, if any.
				410	*
				411	* Called with i_ceph_lock held.
				412	*/
				413	static struct ceph_cap __get_cap_for_mds(struct ceph_inode_info ci, int mds)
				414	{
				415	struct ceph_cap *cap;
				416	struct rb_node *n = ci->i_caps.rb_node;
				417
				418	while (n) {
				419	cap = rb_entry(n, struct ceph_cap, ci_node);
				420	if (mds < cap->mds)
				421	n = n->rb_left;
				422	else if (mds > cap->mds)
				423	n = n->rb_right;
				424	else
				425	return cap;
				426	}
				427	return NULL;
				428	}
				429
				430	struct ceph_cap ceph_get_cap_for_mds(struct ceph_inode_info ci, int mds)
				431	{
				432	struct ceph_cap *cap;
				433
				434	spin_lock(&ci->i_ceph_lock);
				435	cap = __get_cap_for_mds(ci, mds);
				436	spin_unlock(&ci->i_ceph_lock);
				437	return cap;
				438	}
				439
				440	/*
				441	* Return id of any MDS with a cap, preferably FILE_WR\|BUFFER\|EXCL, else -1.
				442	*/
				443	static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
				444	{
				445	struct ceph_cap *cap;
				446	int mds = -1;
				447	struct rb_node *p;
				448
				449	/* prefer mds with WR\|BUFFER\|EXCL caps */
				450	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				451	cap = rb_entry(p, struct ceph_cap, ci_node);
				452	mds = cap->mds;
				453	if (cap->issued & (CEPH_CAP_FILE_WR \|
				454	CEPH_CAP_FILE_BUFFER \|
				455	CEPH_CAP_FILE_EXCL))
				456	break;
				457	}
				458	return mds;
				459	}
				460
				461	int ceph_get_cap_mds(struct inode *inode)
				462	{
				463	struct ceph_inode_info *ci = ceph_inode(inode);
				464	int mds;
				465	spin_lock(&ci->i_ceph_lock);
				466	mds = __ceph_get_cap_mds(ceph_inode(inode));
				467	spin_unlock(&ci->i_ceph_lock);
				468	return mds;
				469	}
				470
				471	/*
				472	* Called under i_ceph_lock.
				473	*/
				474	static void __insert_cap_node(struct ceph_inode_info *ci,
				475	struct ceph_cap *new)
				476	{
				477	struct rb_node **p = &ci->i_caps.rb_node;
				478	struct rb_node *parent = NULL;
				479	struct ceph_cap *cap = NULL;
				480
				481	while (*p) {
				482	parent = *p;
				483	cap = rb_entry(parent, struct ceph_cap, ci_node);
				484	if (new->mds < cap->mds)
				485	p = &(*p)->rb_left;
				486	else if (new->mds > cap->mds)
				487	p = &(*p)->rb_right;
				488	else
				489	BUG();
				490	}
				491
				492	rb_link_node(&new->ci_node, parent, p);
				493	rb_insert_color(&new->ci_node, &ci->i_caps);
				494	}
				495
				496	/*
				497	* (re)set cap hold timeouts, which control the delayed release
				498	* of unused caps back to the MDS. Should be called on cap use.
				499	*/
				500	static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
				501	struct ceph_inode_info *ci)
				502	{
				503	struct ceph_mount_options *ma = mdsc->fsc->mount_options;
				504
				505	ci->i_hold_caps_min = round_jiffies(jiffies +
				506	ma->caps_wanted_delay_min * HZ);
				507	ci->i_hold_caps_max = round_jiffies(jiffies +
				508	ma->caps_wanted_delay_max * HZ);
				509	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
				510	ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
				511	}
				512
				513	/*
				514	* (Re)queue cap at the end of the delayed cap release list.
				515	*
				516	* If I_FLUSH is set, leave the inode at the front of the list.
				517	*
				518	* Caller holds i_ceph_lock
				519	* -> we take mdsc->cap_delay_lock
				520	*/
				521	static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
				522	struct ceph_inode_info *ci)
				523	{
				524	__cap_set_timeouts(mdsc, ci);
				525	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
				526	ci->i_ceph_flags, ci->i_hold_caps_max);
				527	if (!mdsc->stopping) {
				528	spin_lock(&mdsc->cap_delay_lock);
				529	if (!list_empty(&ci->i_cap_delay_list)) {
				530	if (ci->i_ceph_flags & CEPH_I_FLUSH)
				531	goto no_change;
				532	list_del_init(&ci->i_cap_delay_list);
				533	}
				534	list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
				535	no_change:
				536	spin_unlock(&mdsc->cap_delay_lock);
				537	}
				538	}
				539
				540	/*
				541	* Queue an inode for immediate writeback. Mark inode with I_FLUSH,
				542	* indicating we should send a cap message to flush dirty metadata
				543	* asap, and move to the front of the delayed cap list.
				544	*/
				545	static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
				546	struct ceph_inode_info *ci)
				547	{
				548	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
				549	spin_lock(&mdsc->cap_delay_lock);
				550	ci->i_ceph_flags \|= CEPH_I_FLUSH;
				551	if (!list_empty(&ci->i_cap_delay_list))
				552	list_del_init(&ci->i_cap_delay_list);
				553	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
				554	spin_unlock(&mdsc->cap_delay_lock);
				555	}
				556
				557	/*
				558	* Cancel delayed work on cap.
				559	*
				560	* Caller must hold i_ceph_lock.
				561	*/
				562	static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
				563	struct ceph_inode_info *ci)
				564	{
				565	dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
				566	if (list_empty(&ci->i_cap_delay_list))
				567	return;
				568	spin_lock(&mdsc->cap_delay_lock);
				569	list_del_init(&ci->i_cap_delay_list);
				570	spin_unlock(&mdsc->cap_delay_lock);
				571	}
				572
				573	/*
				574	* Common issue checks for add_cap, handle_cap_grant.
				575	*/
				576	static void __check_cap_issue(struct ceph_inode_info ci, struct ceph_cap cap,
				577	unsigned issued)
				578	{
				579	unsigned had = __ceph_caps_issued(ci, NULL);
				580
				581	/*
				582	* Each time we receive FILE_CACHE anew, we increment
				583	* i_rdcache_gen.
				584	*/
				585	if ((issued & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) &&
				586	(had & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) == 0) {
				587	ci->i_rdcache_gen++;
				588	}
				589
				590	/*
				591	* If FILE_SHARED is newly issued, mark dir not complete. We don't
				592	* know what happened to this directory while we didn't have the cap.
				593	* If FILE_SHARED is being revoked, also mark dir not complete. It
				594	* stops on-going cached readdir.
				595	*/
				596	if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
				597	if (issued & CEPH_CAP_FILE_SHARED)
				598	atomic_inc(&ci->i_shared_gen);
				599	if (S_ISDIR(ci->vfs_inode.i_mode)) {
				600	dout(" marking %p NOT complete\n", &ci->vfs_inode);
				601	__ceph_dir_clear_complete(ci);
				602	}
				603	}
				604	}
				605
				606	/*
				607	* Add a capability under the given MDS session.
				608	*
				609	* Caller should hold session snap_rwsem (read) and s_mutex.
				610	*
				611	* @fmode is the open file mode, if we are opening a file, otherwise
				612	* it is < 0. (This is so we can atomically add the cap and add an
				613	* open file reference to it.)
				614	*/
				615	void ceph_add_cap(struct inode *inode,
				616	struct ceph_mds_session *session, u64 cap_id,
				617	int fmode, unsigned issued, unsigned wanted,
				618	unsigned seq, unsigned mseq, u64 realmino, int flags,
				619	struct ceph_cap **new_cap)
				620	{
				621	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
				622	struct ceph_inode_info *ci = ceph_inode(inode);
				623	struct ceph_cap *cap;
				624	int mds = session->s_mds;
				625	int actual_wanted;
				626
				627	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
				628	session->s_mds, cap_id, ceph_cap_string(issued), seq);
				629
				630	/*
				631	* If we are opening the file, include file mode wanted bits
				632	* in wanted.
				633	*/
				634	if (fmode >= 0)
				635	wanted \|= ceph_caps_for_mode(fmode);
				636
				637	cap = __get_cap_for_mds(ci, mds);
				638	if (!cap) {
				639	cap = *new_cap;
				640	*new_cap = NULL;
				641
				642	cap->issued = 0;
				643	cap->implemented = 0;
				644	cap->mds = mds;
				645	cap->mds_wanted = 0;
				646	cap->mseq = 0;
				647
				648	cap->ci = ci;
				649	__insert_cap_node(ci, cap);
				650
				651	/* add to session cap list */
				652	cap->session = session;
				653	spin_lock(&session->s_cap_lock);
				654	list_add_tail(&cap->session_caps, &session->s_caps);
				655	session->s_nr_caps++;
				656	spin_unlock(&session->s_cap_lock);
				657	} else {
				658	/*
				659	* auth mds of the inode changed. we received the cap export
				660	* message, but still haven't received the cap import message.
				661	* handle_cap_export() updated the new auth MDS' cap.
				662	*
				663	* "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
				664	* a message that was send before the cap import message. So
				665	* don't remove caps.
				666	*/
				667	if (ceph_seq_cmp(seq, cap->seq) <= 0) {
				668	WARN_ON(cap != ci->i_auth_cap);
				669	WARN_ON(cap->cap_id != cap_id);
				670	seq = cap->seq;
				671	mseq = cap->mseq;
				672	issued \|= cap->issued;
				673	flags \|= CEPH_CAP_FLAG_AUTH;
				674	}
				675	}
				676
				677	if (!ci->i_snap_realm \|\|
				678	((flags & CEPH_CAP_FLAG_AUTH) &&
				679	realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) {
				680	/*
				681	* add this inode to the appropriate snap realm
				682	*/
				683	struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
				684	realmino);
				685	if (realm) {
				686	struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
				687	if (oldrealm) {
				688	spin_lock(&oldrealm->inodes_with_caps_lock);
				689	list_del_init(&ci->i_snap_realm_item);
				690	spin_unlock(&oldrealm->inodes_with_caps_lock);
				691	}
				692
				693	spin_lock(&realm->inodes_with_caps_lock);
				694	list_add(&ci->i_snap_realm_item,
				695	&realm->inodes_with_caps);
				696	ci->i_snap_realm = realm;
				697	if (realm->ino == ci->i_vino.ino)
				698	realm->inode = inode;
				699	spin_unlock(&realm->inodes_with_caps_lock);
				700
				701	if (oldrealm)
				702	ceph_put_snap_realm(mdsc, oldrealm);
				703	} else {
				704	pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
				705	realmino);
				706	WARN_ON(!realm);
				707	}
				708	}
				709
				710	__check_cap_issue(ci, cap, issued);
				711
				712	/*
				713	* If we are issued caps we don't want, or the mds' wanted
				714	* value appears to be off, queue a check so we'll release
				715	* later and/or update the mds wanted value.
				716	*/
				717	actual_wanted = __ceph_caps_wanted(ci);
				718	if ((wanted & ~actual_wanted) \|\|
				719	(issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
				720	dout(" issued %s, mds wanted %s, actual %s, queueing\n",
				721	ceph_cap_string(issued), ceph_cap_string(wanted),
				722	ceph_cap_string(actual_wanted));
				723	__cap_delay_requeue(mdsc, ci);
				724	}
				725
				726	if (flags & CEPH_CAP_FLAG_AUTH) {
				727	if (!ci->i_auth_cap \|\|
				728	ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
				729	ci->i_auth_cap = cap;
				730	cap->mds_wanted = wanted;
				731	}
				732	} else {
				733	WARN_ON(ci->i_auth_cap == cap);
				734	}
				735
				736	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
				737	inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
				738	ceph_cap_string(issued\|cap->issued), seq, mds);
				739	cap->cap_id = cap_id;
				740	cap->issued = issued;
				741	cap->implemented \|= issued;
				742	if (ceph_seq_cmp(mseq, cap->mseq) > 0)
				743	cap->mds_wanted = wanted;
				744	else
				745	cap->mds_wanted \|= wanted;
				746	cap->seq = seq;
				747	cap->issue_seq = seq;
				748	cap->mseq = mseq;
				749	cap->cap_gen = session->s_cap_gen;
				750
				751	if (fmode >= 0)
				752	__ceph_get_fmode(ci, fmode);
				753	}
				754
				755	/*
				756	* Return true if cap has not timed out and belongs to the current
				757	* generation of the MDS session (i.e. has not gone 'stale' due to
				758	* us losing touch with the mds).
				759	*/
				760	static int __cap_is_valid(struct ceph_cap *cap)
				761	{
				762	unsigned long ttl;
				763	u32 gen;
				764
				765	spin_lock(&cap->session->s_gen_ttl_lock);
				766	gen = cap->session->s_cap_gen;
				767	ttl = cap->session->s_cap_ttl;
				768	spin_unlock(&cap->session->s_gen_ttl_lock);
				769
				770	if (cap->cap_gen < gen \|\| time_after_eq(jiffies, ttl)) {
				771	dout("__cap_is_valid %p cap %p issued %s "
				772	"but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
				773	cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
				774	return 0;
				775	}
				776
				777	return 1;
				778	}
				779
				780	/*
				781	* Return set of valid cap bits issued to us. Note that caps time
				782	* out, and may be invalidated in bulk if the client session times out
				783	* and session->s_cap_gen is bumped.
				784	*/
				785	int __ceph_caps_issued(struct ceph_inode_info ci, int implemented)
				786	{
				787	int have = ci->i_snap_caps;
				788	struct ceph_cap *cap;
				789	struct rb_node *p;
				790
				791	if (implemented)
				792	*implemented = 0;
				793	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				794	cap = rb_entry(p, struct ceph_cap, ci_node);
				795	if (!__cap_is_valid(cap))
				796	continue;
				797	dout("__ceph_caps_issued %p cap %p issued %s\n",
				798	&ci->vfs_inode, cap, ceph_cap_string(cap->issued));
				799	have \|= cap->issued;
				800	if (implemented)
				801	*implemented \|= cap->implemented;
				802	}
				803	/*
				804	* exclude caps issued by non-auth MDS, but are been revoking
				805	* by the auth MDS. The non-auth MDS should be revoking/exporting
				806	* these caps, but the message is delayed.
				807	*/
				808	if (ci->i_auth_cap) {
				809	cap = ci->i_auth_cap;
				810	have &= ~cap->implemented \| cap->issued;
				811	}
				812	return have;
				813	}
				814
				815	/*
				816	* Get cap bits issued by caps other than @ocap
				817	*/
				818	int __ceph_caps_issued_other(struct ceph_inode_info ci, struct ceph_cap ocap)
				819	{
				820	int have = ci->i_snap_caps;
				821	struct ceph_cap *cap;
				822	struct rb_node *p;
				823
				824	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				825	cap = rb_entry(p, struct ceph_cap, ci_node);
				826	if (cap == ocap)
				827	continue;
				828	if (!__cap_is_valid(cap))
				829	continue;
				830	have \|= cap->issued;
				831	}
				832	return have;
				833	}
				834
				835	/*
				836	* Move a cap to the end of the LRU (oldest caps at list head, newest
				837	* at list tail).
				838	*/
				839	static void __touch_cap(struct ceph_cap *cap)
				840	{
				841	struct ceph_mds_session *s = cap->session;
				842
				843	spin_lock(&s->s_cap_lock);
				844	if (!s->s_cap_iterator) {
				845	dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
				846	s->s_mds);
				847	list_move_tail(&cap->session_caps, &s->s_caps);
				848	} else {
				849	dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
				850	&cap->ci->vfs_inode, cap, s->s_mds);
				851	}
				852	spin_unlock(&s->s_cap_lock);
				853	}
				854
				855	/*
				856	* Check if we hold the given mask. If so, move the cap(s) to the
				857	* front of their respective LRUs. (This is the preferred way for
				858	* callers to check for caps they want.)
				859	*/
				860	int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
				861	{
				862	struct ceph_cap *cap;
				863	struct rb_node *p;
				864	int have = ci->i_snap_caps;
				865
				866	if ((have & mask) == mask) {
				867	dout("__ceph_caps_issued_mask %p snap issued %s"
				868	" (mask %s)\n", &ci->vfs_inode,
				869	ceph_cap_string(have),
				870	ceph_cap_string(mask));
				871	return 1;
				872	}
				873
				874	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				875	cap = rb_entry(p, struct ceph_cap, ci_node);
				876	if (!__cap_is_valid(cap))
				877	continue;
				878	if ((cap->issued & mask) == mask) {
				879	dout("__ceph_caps_issued_mask %p cap %p issued %s"
				880	" (mask %s)\n", &ci->vfs_inode, cap,
				881	ceph_cap_string(cap->issued),
				882	ceph_cap_string(mask));
				883	if (touch)
				884	__touch_cap(cap);
				885	return 1;
				886	}
				887
				888	/* does a combination of caps satisfy mask? */
				889	have \|= cap->issued;
				890	if ((have & mask) == mask) {
				891	dout("__ceph_caps_issued_mask %p combo issued %s"
				892	" (mask %s)\n", &ci->vfs_inode,
				893	ceph_cap_string(cap->issued),
				894	ceph_cap_string(mask));
				895	if (touch) {
				896	struct rb_node *q;
				897
				898	/* touch this + preceding caps */
				899	__touch_cap(cap);
				900	for (q = rb_first(&ci->i_caps); q != p;
				901	q = rb_next(q)) {
				902	cap = rb_entry(q, struct ceph_cap,
				903	ci_node);
				904	if (!__cap_is_valid(cap))
				905	continue;
				906	__touch_cap(cap);
				907	}
				908	}
				909	return 1;
				910	}
				911	}
				912
				913	return 0;
				914	}
				915
				916	/*
				917	* Return true if mask caps are currently being revoked by an MDS.
				918	*/
				919	int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
				920	struct ceph_cap *ocap, int mask)
				921	{
				922	struct ceph_cap *cap;
				923	struct rb_node *p;
				924
				925	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				926	cap = rb_entry(p, struct ceph_cap, ci_node);
				927	if (cap != ocap &&
				928	(cap->implemented & ~cap->issued & mask))
				929	return 1;
				930	}
				931	return 0;
				932	}
				933
				934	int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
				935	{
				936	struct inode *inode = &ci->vfs_inode;
				937	int ret;
				938
				939	spin_lock(&ci->i_ceph_lock);
				940	ret = __ceph_caps_revoking_other(ci, NULL, mask);
				941	spin_unlock(&ci->i_ceph_lock);
				942	dout("ceph_caps_revoking %p %s = %d\n", inode,
				943	ceph_cap_string(mask), ret);
				944	return ret;
				945	}
				946
				947	int __ceph_caps_used(struct ceph_inode_info *ci)
				948	{
				949	int used = 0;
				950	if (ci->i_pin_ref)
				951	used \|= CEPH_CAP_PIN;
				952	if (ci->i_rd_ref)
				953	used \|= CEPH_CAP_FILE_RD;
				954	if (ci->i_rdcache_ref \|\|
				955	(!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
				956	ci->vfs_inode.i_data.nrpages))
				957	used \|= CEPH_CAP_FILE_CACHE;
				958	if (ci->i_wr_ref)
				959	used \|= CEPH_CAP_FILE_WR;
				960	if (ci->i_wb_ref \|\| ci->i_wrbuffer_ref)
				961	used \|= CEPH_CAP_FILE_BUFFER;
				962	return used;
				963	}
				964
				965	/*
				966	* wanted, by virtue of open file modes
				967	*/
				968	int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
				969	{
				970	int i, bits = 0;
				971	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
				972	if (ci->i_nr_by_mode[i])
				973	bits \|= 1 << i;
				974	}
				975	if (bits == 0)
				976	return 0;
				977	return ceph_caps_for_mode(bits >> 1);
				978	}
				979
				980	/*
				981	* Return caps we have registered with the MDS(s) as 'wanted'.
				982	*/
				983	int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
				984	{
				985	struct ceph_cap *cap;
				986	struct rb_node *p;
				987	int mds_wanted = 0;
				988
				989	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				990	cap = rb_entry(p, struct ceph_cap, ci_node);
				991	if (check && !__cap_is_valid(cap))
				992	continue;
				993	if (cap == ci->i_auth_cap)
				994	mds_wanted \|= cap->mds_wanted;
				995	else
				996	mds_wanted \|= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
				997	}
				998	return mds_wanted;
				999	}
				1000
				1001	/*
				1002	* called under i_ceph_lock
				1003	*/
				1004	static int __ceph_is_single_caps(struct ceph_inode_info *ci)
				1005	{
				1006	return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
				1007	}
				1008
				1009	static int __ceph_is_any_caps(struct ceph_inode_info *ci)
				1010	{
				1011	return !RB_EMPTY_ROOT(&ci->i_caps);
				1012	}
				1013
				1014	int ceph_is_any_caps(struct inode *inode)
				1015	{
				1016	struct ceph_inode_info *ci = ceph_inode(inode);
				1017	int ret;
				1018
				1019	spin_lock(&ci->i_ceph_lock);
				1020	ret = __ceph_is_any_caps(ci);
				1021	spin_unlock(&ci->i_ceph_lock);
				1022
				1023	return ret;
				1024	}
				1025
				1026	static void drop_inode_snap_realm(struct ceph_inode_info *ci)
				1027	{
				1028	struct ceph_snap_realm *realm = ci->i_snap_realm;
				1029	spin_lock(&realm->inodes_with_caps_lock);
				1030	list_del_init(&ci->i_snap_realm_item);
				1031	ci->i_snap_realm_counter++;
				1032	ci->i_snap_realm = NULL;
				1033	if (realm->ino == ci->i_vino.ino)
				1034	realm->inode = NULL;
				1035	spin_unlock(&realm->inodes_with_caps_lock);
				1036	ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
				1037	realm);
				1038	}
				1039
				1040	/*
				1041	* Remove a cap. Take steps to deal with a racing iterate_session_caps.
				1042	*
				1043	* caller should hold i_ceph_lock.
				1044	* caller will not hold session s_mutex if called from destroy_inode.
				1045	*/
				1046	void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
				1047	{
				1048	struct ceph_mds_session *session = cap->session;
				1049	struct ceph_inode_info *ci = cap->ci;
				1050	struct ceph_mds_client *mdsc =
				1051	ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
				1052	int removed = 0;
				1053
				1054	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
				1055
				1056	/* remove from inode's cap rbtree, and clear auth cap */
				1057	rb_erase(&cap->ci_node, &ci->i_caps);
				1058	if (ci->i_auth_cap == cap)
				1059	ci->i_auth_cap = NULL;
				1060
				1061	/* remove from session list */
				1062	spin_lock(&session->s_cap_lock);
				1063	if (session->s_cap_iterator == cap) {
				1064	/* not yet, we are iterating over this very cap */
				1065	dout("__ceph_remove_cap delaying %p removal from session %p\n",
				1066	cap, cap->session);
				1067	} else {
				1068	list_del_init(&cap->session_caps);
				1069	session->s_nr_caps--;
				1070	cap->session = NULL;
				1071	removed = 1;
				1072	}
				1073	/* protect backpointer with s_cap_lock: see iterate_session_caps */
				1074	cap->ci = NULL;
				1075
				1076	/*
				1077	* s_cap_reconnect is protected by s_cap_lock. no one changes
				1078	* s_cap_gen while session is in the reconnect state.
				1079	*/
				1080	if (queue_release &&
				1081	(!session->s_cap_reconnect \|\| cap->cap_gen == session->s_cap_gen)) {
				1082	cap->queue_release = 1;
				1083	if (removed) {
				1084	list_add_tail(&cap->session_caps,
				1085	&session->s_cap_releases);
				1086	session->s_num_cap_releases++;
				1087	removed = 0;
				1088	}
				1089	} else {
				1090	cap->queue_release = 0;
				1091	}
				1092	cap->cap_ino = ci->i_vino.ino;
				1093
				1094	spin_unlock(&session->s_cap_lock);
				1095
				1096	if (removed)
				1097	ceph_put_cap(mdsc, cap);
				1098
				1099	/* when reconnect denied, we remove session caps forcibly,
				1100	* i_wr_ref can be non-zero. If there are ongoing write,
				1101	* keep i_snap_realm.
				1102	*/
				1103	if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
				1104	drop_inode_snap_realm(ci);
				1105
				1106	if (!__ceph_is_any_real_caps(ci))
				1107	__cap_delay_cancel(mdsc, ci);
				1108	}
				1109
				1110	struct cap_msg_args {
				1111	struct ceph_mds_session *session;
				1112	u64 ino, cid, follows;
				1113	u64 flush_tid, oldest_flush_tid, size, max_size;
				1114	u64 xattr_version;
				1115	struct ceph_buffer *xattr_buf;
				1116	struct timespec64 atime, mtime, ctime;
				1117	int op, caps, wanted, dirty;
				1118	u32 seq, issue_seq, mseq, time_warp_seq;
				1119	u32 flags;
				1120	kuid_t uid;
				1121	kgid_t gid;
				1122	umode_t mode;
				1123	bool inline_data;
				1124	};
				1125
				1126	/*
				1127	* Build and send a cap message to the given MDS.
				1128	*
				1129	* Caller should be holding s_mutex.
				1130	*/
				1131	static int send_cap_msg(struct cap_msg_args *arg)
				1132	{
				1133	struct ceph_mds_caps *fc;
				1134	struct ceph_msg *msg;
				1135	void *p;
				1136	size_t extra_len;
				1137	struct timespec64 zerotime = {0};
				1138	struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
				1139
				1140	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
				1141	" seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
				1142	" xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
				1143	arg->cid, arg->ino, ceph_cap_string(arg->caps),
				1144	ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
				1145	arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
				1146	arg->mseq, arg->follows, arg->size, arg->max_size,
				1147	arg->xattr_version,
				1148	arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
				1149
				1150	/* flock buffer size + inline version + inline data size +
				1151	* osd_epoch_barrier + oldest_flush_tid */
				1152	extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
				1153	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
				1154	GFP_NOFS, false);
				1155	if (!msg)
				1156	return -ENOMEM;
				1157
				1158	msg->hdr.version = cpu_to_le16(10);
				1159	msg->hdr.tid = cpu_to_le64(arg->flush_tid);
				1160
				1161	fc = msg->front.iov_base;
				1162	memset(fc, 0, sizeof(*fc));
				1163
				1164	fc->cap_id = cpu_to_le64(arg->cid);
				1165	fc->op = cpu_to_le32(arg->op);
				1166	fc->seq = cpu_to_le32(arg->seq);
				1167	fc->issue_seq = cpu_to_le32(arg->issue_seq);
				1168	fc->migrate_seq = cpu_to_le32(arg->mseq);
				1169	fc->caps = cpu_to_le32(arg->caps);
				1170	fc->wanted = cpu_to_le32(arg->wanted);
				1171	fc->dirty = cpu_to_le32(arg->dirty);
				1172	fc->ino = cpu_to_le64(arg->ino);
				1173	fc->snap_follows = cpu_to_le64(arg->follows);
				1174
				1175	fc->size = cpu_to_le64(arg->size);
				1176	fc->max_size = cpu_to_le64(arg->max_size);
				1177	ceph_encode_timespec64(&fc->mtime, &arg->mtime);
				1178	ceph_encode_timespec64(&fc->atime, &arg->atime);
				1179	ceph_encode_timespec64(&fc->ctime, &arg->ctime);
				1180	fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);
				1181
				1182	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
				1183	fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
				1184	fc->mode = cpu_to_le32(arg->mode);
				1185
				1186	fc->xattr_version = cpu_to_le64(arg->xattr_version);
				1187	if (arg->xattr_buf) {
				1188	msg->middle = ceph_buffer_get(arg->xattr_buf);
				1189	fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
				1190	msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
				1191	}
				1192
				1193	p = fc + 1;
				1194	/* flock buffer size (version 2) */
				1195	ceph_encode_32(&p, 0);
				1196	/* inline version (version 4) */
				1197	ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
				1198	/* inline data size */
				1199	ceph_encode_32(&p, 0);
				1200	/*
				1201	* osd_epoch_barrier (version 5)
				1202	* The epoch_barrier is protected osdc->lock, so READ_ONCE here in
				1203	* case it was recently changed
				1204	*/
				1205	ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
				1206	/* oldest_flush_tid (version 6) */
				1207	ceph_encode_64(&p, arg->oldest_flush_tid);
				1208
				1209	/*
				1210	* caller_uid/caller_gid (version 7)
				1211	*
				1212	* Currently, we don't properly track which caller dirtied the caps
				1213	* last, and force a flush of them when there is a conflict. For now,
				1214	* just set this to 0:0, to emulate how the MDS has worked up to now.
				1215	*/
				1216	ceph_encode_32(&p, 0);
				1217	ceph_encode_32(&p, 0);
				1218
				1219	/* pool namespace (version 8) (mds always ignores this) */
				1220	ceph_encode_32(&p, 0);
				1221
				1222	/*
				1223	* btime and change_attr (version 9)
				1224	*
				1225	* We just zero these out for now, as the MDS ignores them unless
				1226	* the requisite feature flags are set (which we don't do yet).
				1227	*/
				1228	ceph_encode_timespec64(p, &zerotime);
				1229	p += sizeof(struct ceph_timespec);
				1230	ceph_encode_64(&p, 0);
				1231
				1232	/* Advisory flags (version 10) */
				1233	ceph_encode_32(&p, arg->flags);
				1234
				1235	ceph_con_send(&arg->session->s_con, msg);
				1236	return 0;
				1237	}
				1238
				1239	/*
				1240	* Queue cap releases when an inode is dropped from our cache.
				1241	*/
				1242	void ceph_queue_caps_release(struct inode *inode)
				1243	{
				1244	struct ceph_inode_info *ci = ceph_inode(inode);
				1245	struct rb_node *p;
				1246
				1247	/* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
				1248	* may call __ceph_caps_issued_mask() on a freeing inode. */
				1249	spin_lock(&ci->i_ceph_lock);
				1250	p = rb_first(&ci->i_caps);
				1251	while (p) {
				1252	struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
				1253	p = rb_next(p);
				1254	__ceph_remove_cap(cap, true);
				1255	}
				1256	spin_unlock(&ci->i_ceph_lock);
				1257	}
				1258
				1259	/*
				1260	* Send a cap msg on the given inode. Update our caps state, then
				1261	* drop i_ceph_lock and send the message.
				1262	*
				1263	* Make note of max_size reported/requested from mds, revoked caps
				1264	* that have now been implemented.
				1265	*
				1266	* Make half-hearted attempt ot to invalidate page cache if we are
				1267	* dropping RDCACHE. Note that this will leave behind locked pages
				1268	* that we'll then need to deal with elsewhere.
				1269	*
				1270	* Return non-zero if delayed release, or we experienced an error
				1271	* such that the caller should requeue + retry later.
				1272	*
				1273	* called with i_ceph_lock, then drops it.
				1274	* caller should hold snap_rwsem (read), s_mutex.
				1275	*/
				1276	static int __send_cap(struct ceph_mds_client mdsc, struct ceph_cap cap,
				1277	int op, bool sync, int used, int want, int retain,
				1278	int flushing, u64 flush_tid, u64 oldest_flush_tid)
				1279	__releases(cap->ci->i_ceph_lock)
				1280	{
				1281	struct ceph_inode_info *ci = cap->ci;
				1282	struct inode *inode = &ci->vfs_inode;
				1283	struct ceph_buffer *old_blob = NULL;
				1284	struct cap_msg_args arg;
				1285	int held, revoking;
				1286	int wake = 0;
				1287	int delayed = 0;
				1288	int ret;
				1289
				1290	held = cap->issued \| cap->implemented;
				1291	revoking = cap->implemented & ~cap->issued;
				1292	retain &= ~revoking;
				1293
				1294	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
				1295	inode, cap, cap->session,
				1296	ceph_cap_string(held), ceph_cap_string(held & retain),
				1297	ceph_cap_string(revoking));
				1298	BUG_ON((retain & CEPH_CAP_PIN) == 0);
				1299
				1300	arg.session = cap->session;
				1301
				1302	/* don't release wanted unless we've waited a bit. */
				1303	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
				1304	time_before(jiffies, ci->i_hold_caps_min)) {
				1305	dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
				1306	ceph_cap_string(cap->issued),
				1307	ceph_cap_string(cap->issued & retain),
				1308	ceph_cap_string(cap->mds_wanted),
				1309	ceph_cap_string(want));
				1310	want \|= cap->mds_wanted;
				1311	retain \|= cap->issued;
				1312	delayed = 1;
				1313	}
				1314	ci->i_ceph_flags &= ~(CEPH_I_NODELAY \| CEPH_I_FLUSH);
				1315	if (want & ~cap->mds_wanted) {
				1316	/* user space may open/close single file frequently.
				1317	* This avoids droping mds_wanted immediately after
				1318	* requesting new mds_wanted.
				1319	*/
				1320	__cap_set_timeouts(mdsc, ci);
				1321	}
				1322
				1323	cap->issued &= retain; /* drop bits we don't want */
				1324	if (cap->implemented & ~cap->issued) {
				1325	/*
				1326	* Wake up any waiters on wanted -> needed transition.
				1327	* This is due to the weird transition from buffered
				1328	* to sync IO... we need to flush dirty pages _before_
				1329	* allowing sync writes to avoid reordering.
				1330	*/
				1331	wake = 1;
				1332	}
				1333	cap->implemented &= cap->issued \| used;
				1334	cap->mds_wanted = want;
				1335
				1336	arg.ino = ceph_vino(inode).ino;
				1337	arg.cid = cap->cap_id;
				1338	arg.follows = flushing ? ci->i_head_snapc->seq : 0;
				1339	arg.flush_tid = flush_tid;
				1340	arg.oldest_flush_tid = oldest_flush_tid;
				1341
				1342	arg.size = inode->i_size;
				1343	ci->i_reported_size = arg.size;
				1344	arg.max_size = ci->i_wanted_max_size;
				1345	ci->i_requested_max_size = arg.max_size;
				1346
				1347	if (flushing & CEPH_CAP_XATTR_EXCL) {
				1348	old_blob = __ceph_build_xattrs_blob(ci);
				1349	arg.xattr_version = ci->i_xattrs.version;
				1350	arg.xattr_buf = ci->i_xattrs.blob;
				1351	} else {
				1352	arg.xattr_buf = NULL;
				1353	}
				1354
				1355	arg.mtime = inode->i_mtime;
				1356	arg.atime = inode->i_atime;
				1357	arg.ctime = inode->i_ctime;
				1358
				1359	arg.op = op;
				1360	arg.caps = cap->implemented;
				1361	arg.wanted = want;
				1362	arg.dirty = flushing;
				1363
				1364	arg.seq = cap->seq;
				1365	arg.issue_seq = cap->issue_seq;
				1366	arg.mseq = cap->mseq;
				1367	arg.time_warp_seq = ci->i_time_warp_seq;
				1368
				1369	arg.uid = inode->i_uid;
				1370	arg.gid = inode->i_gid;
				1371	arg.mode = inode->i_mode;
				1372
				1373	arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
				1374	if (list_empty(&ci->i_cap_snaps))
				1375	arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP;
				1376	else
				1377	arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
				1378	if (sync)
				1379	arg.flags \|= CEPH_CLIENT_CAPS_SYNC;
				1380
				1381	spin_unlock(&ci->i_ceph_lock);
				1382
				1383	ceph_buffer_put(old_blob);
				1384
				1385	ret = send_cap_msg(&arg);
				1386	if (ret < 0) {
				1387	dout("error sending cap msg, must requeue %p\n", inode);
				1388	delayed = 1;
				1389	}
				1390
				1391	if (wake)
				1392	wake_up_all(&ci->i_cap_wq);
				1393
				1394	return delayed;
				1395	}
				1396
				1397	static inline int __send_flush_snap(struct inode *inode,
				1398	struct ceph_mds_session *session,
				1399	struct ceph_cap_snap *capsnap,
				1400	u32 mseq, u64 oldest_flush_tid)
				1401	{
				1402	struct cap_msg_args arg;
				1403
				1404	arg.session = session;
				1405	arg.ino = ceph_vino(inode).ino;
				1406	arg.cid = 0;
				1407	arg.follows = capsnap->follows;
				1408	arg.flush_tid = capsnap->cap_flush.tid;
				1409	arg.oldest_flush_tid = oldest_flush_tid;
				1410
				1411	arg.size = capsnap->size;
				1412	arg.max_size = 0;
				1413	arg.xattr_version = capsnap->xattr_version;
				1414	arg.xattr_buf = capsnap->xattr_blob;
				1415
				1416	arg.atime = capsnap->atime;
				1417	arg.mtime = capsnap->mtime;
				1418	arg.ctime = capsnap->ctime;
				1419
				1420	arg.op = CEPH_CAP_OP_FLUSHSNAP;
				1421	arg.caps = capsnap->issued;
				1422	arg.wanted = 0;
				1423	arg.dirty = capsnap->dirty;
				1424
				1425	arg.seq = 0;
				1426	arg.issue_seq = 0;
				1427	arg.mseq = mseq;
				1428	arg.time_warp_seq = capsnap->time_warp_seq;
				1429
				1430	arg.uid = capsnap->uid;
				1431	arg.gid = capsnap->gid;
				1432	arg.mode = capsnap->mode;
				1433
				1434	arg.inline_data = capsnap->inline_data;
				1435	arg.flags = 0;
				1436
				1437	return send_cap_msg(&arg);
				1438	}
				1439
				1440	/*
				1441	* When a snapshot is taken, clients accumulate dirty metadata on
				1442	* inodes with capabilities in ceph_cap_snaps to describe the file
				1443	* state at the time the snapshot was taken. This must be flushed
				1444	* asynchronously back to the MDS once sync writes complete and dirty
				1445	* data is written out.
				1446	*
				1447	* Called under i_ceph_lock. Takes s_mutex as needed.
				1448	*/
				1449	static void __ceph_flush_snaps(struct ceph_inode_info *ci,
				1450	struct ceph_mds_session *session)
				1451	__releases(ci->i_ceph_lock)
				1452	__acquires(ci->i_ceph_lock)
				1453	{
				1454	struct inode *inode = &ci->vfs_inode;
				1455	struct ceph_mds_client *mdsc = session->s_mdsc;
				1456	struct ceph_cap_snap *capsnap;
				1457	u64 oldest_flush_tid = 0;
				1458	u64 first_tid = 1, last_tid = 0;
				1459
				1460	dout("__flush_snaps %p session %p\n", inode, session);
				1461
				1462	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				1463	/*
				1464	* we need to wait for sync writes to complete and for dirty
				1465	* pages to be written out.
				1466	*/
				1467	if (capsnap->dirty_pages \|\| capsnap->writing)
				1468	break;
				1469
				1470	/* should be removed by ceph_try_drop_cap_snap() */
				1471	BUG_ON(!capsnap->need_flush);
				1472
				1473	/* only flush each capsnap once */
				1474	if (capsnap->cap_flush.tid > 0) {
				1475	dout(" already flushed %p, skipping\n", capsnap);
				1476	continue;
				1477	}
				1478
				1479	spin_lock(&mdsc->cap_dirty_lock);
				1480	capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
				1481	list_add_tail(&capsnap->cap_flush.g_list,
				1482	&mdsc->cap_flush_list);
				1483	if (oldest_flush_tid == 0)
				1484	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
				1485	if (list_empty(&ci->i_flushing_item)) {
				1486	list_add_tail(&ci->i_flushing_item,
				1487	&session->s_cap_flushing);
				1488	}
				1489	spin_unlock(&mdsc->cap_dirty_lock);
				1490
				1491	list_add_tail(&capsnap->cap_flush.i_list,
				1492	&ci->i_cap_flush_list);
				1493
				1494	if (first_tid == 1)
				1495	first_tid = capsnap->cap_flush.tid;
				1496	last_tid = capsnap->cap_flush.tid;
				1497	}
				1498
				1499	ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
				1500
				1501	while (first_tid <= last_tid) {
				1502	struct ceph_cap *cap = ci->i_auth_cap;
				1503	struct ceph_cap_flush *cf;
				1504	int ret;
				1505
				1506	if (!(cap && cap->session == session)) {
				1507	dout("__flush_snaps %p auth cap %p not mds%d, "
				1508	"stop\n", inode, cap, session->s_mds);
				1509	break;
				1510	}
				1511
				1512	ret = -ENOENT;
				1513	list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
				1514	if (cf->tid >= first_tid) {
				1515	ret = 0;
				1516	break;
				1517	}
				1518	}
				1519	if (ret < 0)
				1520	break;
				1521
				1522	first_tid = cf->tid + 1;
				1523
				1524	capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
				1525	refcount_inc(&capsnap->nref);
				1526	spin_unlock(&ci->i_ceph_lock);
				1527
				1528	dout("__flush_snaps %p capsnap %p tid %llu %s\n",
				1529	inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
				1530
				1531	ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
				1532	oldest_flush_tid);
				1533	if (ret < 0) {
				1534	pr_err("__flush_snaps: error sending cap flushsnap, "
				1535	"ino (%llx.%llx) tid %llu follows %llu\n",
				1536	ceph_vinop(inode), cf->tid, capsnap->follows);
				1537	}
				1538
				1539	ceph_put_cap_snap(capsnap);
				1540	spin_lock(&ci->i_ceph_lock);
				1541	}
				1542	}
				1543
				1544	void ceph_flush_snaps(struct ceph_inode_info *ci,
				1545	struct ceph_mds_session **psession)
				1546	{
				1547	struct inode *inode = &ci->vfs_inode;
				1548	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
				1549	struct ceph_mds_session *session = NULL;
				1550	int mds;
				1551
				1552	dout("ceph_flush_snaps %p\n", inode);
				1553	if (psession)
				1554	session = *psession;
				1555	retry:
				1556	spin_lock(&ci->i_ceph_lock);
				1557	if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
				1558	dout(" no capsnap needs flush, doing nothing\n");
				1559	goto out;
				1560	}
				1561	if (!ci->i_auth_cap) {
				1562	dout(" no auth cap (migrating?), doing nothing\n");
				1563	goto out;
				1564	}
				1565
				1566	mds = ci->i_auth_cap->session->s_mds;
				1567	if (session && session->s_mds != mds) {
				1568	dout(" oops, wrong session %p mutex\n", session);
				1569	mutex_unlock(&session->s_mutex);
				1570	ceph_put_mds_session(session);
				1571	session = NULL;
				1572	}
				1573	if (!session) {
				1574	spin_unlock(&ci->i_ceph_lock);
				1575	mutex_lock(&mdsc->mutex);
				1576	session = __ceph_lookup_mds_session(mdsc, mds);
				1577	mutex_unlock(&mdsc->mutex);
				1578	if (session) {
				1579	dout(" inverting session/ino locks on %p\n", session);
				1580	mutex_lock(&session->s_mutex);
				1581	}
				1582	goto retry;
				1583	}
				1584
				1585	// make sure flushsnap messages are sent in proper order.
				1586	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
				1587	__kick_flushing_caps(mdsc, session, ci, 0);
				1588	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
				1589	}
				1590
				1591	__ceph_flush_snaps(ci, session);
				1592	out:
				1593	spin_unlock(&ci->i_ceph_lock);
				1594
				1595	if (psession) {
				1596	*psession = session;
				1597	} else if (session) {
				1598	mutex_unlock(&session->s_mutex);
				1599	ceph_put_mds_session(session);
				1600	}
				1601	/* we flushed them all; remove this inode from the queue */
				1602	spin_lock(&mdsc->snap_flush_lock);
				1603	list_del_init(&ci->i_snap_flush_item);
				1604	spin_unlock(&mdsc->snap_flush_lock);
				1605	}
				1606
				1607	/*
				1608	* Mark caps dirty. If inode is newly dirty, return the dirty flags.
				1609	* Caller is then responsible for calling __mark_inode_dirty with the
				1610	* returned flags value.
				1611	*/
				1612	int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
				1613	struct ceph_cap_flush **pcf)
				1614	{
				1615	struct ceph_mds_client *mdsc =
				1616	ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
				1617	struct inode *inode = &ci->vfs_inode;
				1618	int was = ci->i_dirty_caps;
				1619	int dirty = 0;
				1620
				1621	if (!ci->i_auth_cap) {
				1622	pr_warn("__mark_dirty_caps %p %llx mask %s, "
				1623	"but no auth cap (session was closed?)\n",
				1624	inode, ceph_ino(inode), ceph_cap_string(mask));
				1625	return 0;
				1626	}
				1627
				1628	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
				1629	ceph_cap_string(mask), ceph_cap_string(was),
				1630	ceph_cap_string(was \| mask));
				1631	ci->i_dirty_caps \|= mask;
				1632	if (was == 0) {
				1633	WARN_ON_ONCE(ci->i_prealloc_cap_flush);
				1634	swap(ci->i_prealloc_cap_flush, *pcf);
				1635
				1636	if (!ci->i_head_snapc) {
				1637	WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
				1638	ci->i_head_snapc = ceph_get_snap_context(
				1639	ci->i_snap_realm->cached_context);
				1640	}
				1641	dout(" inode %p now dirty snapc %p auth cap %p\n",
				1642	&ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
				1643	BUG_ON(!list_empty(&ci->i_dirty_item));
				1644	spin_lock(&mdsc->cap_dirty_lock);
				1645	list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
				1646	spin_unlock(&mdsc->cap_dirty_lock);
				1647	if (ci->i_flushing_caps == 0) {
				1648	ihold(inode);
				1649	dirty \|= I_DIRTY_SYNC;
				1650	}
				1651	} else {
				1652	WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
				1653	}
				1654	BUG_ON(list_empty(&ci->i_dirty_item));
				1655	if (((was \| ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
				1656	(mask & CEPH_CAP_FILE_BUFFER))
				1657	dirty \|= I_DIRTY_DATASYNC;
				1658	__cap_delay_requeue(mdsc, ci);
				1659	return dirty;
				1660	}
				1661
				1662	struct ceph_cap_flush *ceph_alloc_cap_flush(void)
				1663	{
				1664	return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
				1665	}
				1666
				1667	void ceph_free_cap_flush(struct ceph_cap_flush *cf)
				1668	{
				1669	if (cf)
				1670	kmem_cache_free(ceph_cap_flush_cachep, cf);
				1671	}
				1672
				1673	static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
				1674	{
				1675	if (!list_empty(&mdsc->cap_flush_list)) {
				1676	struct ceph_cap_flush *cf =
				1677	list_first_entry(&mdsc->cap_flush_list,
				1678	struct ceph_cap_flush, g_list);
				1679	return cf->tid;
				1680	}
				1681	return 0;
				1682	}
				1683
				1684	/*
				1685	* Remove cap_flush from the mdsc's or inode's flushing cap list.
				1686	* Return true if caller needs to wake up flush waiters.
				1687	*/
				1688	static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
				1689	struct ceph_inode_info *ci,
				1690	struct ceph_cap_flush *cf)
				1691	{
				1692	struct ceph_cap_flush *prev;
				1693	bool wake = cf->wake;
				1694	if (mdsc) {
				1695	/* are there older pending cap flushes? */
				1696	if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
				1697	prev = list_prev_entry(cf, g_list);
				1698	prev->wake = true;
				1699	wake = false;
				1700	}
				1701	list_del(&cf->g_list);
				1702	} else if (ci) {
				1703	if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
				1704	prev = list_prev_entry(cf, i_list);
				1705	prev->wake = true;
				1706	wake = false;
				1707	}
				1708	list_del(&cf->i_list);
				1709	} else {
				1710	BUG_ON(1);
				1711	}
				1712	return wake;
				1713	}
				1714
				1715	/*
				1716	* Add dirty inode to the flushing list. Assigned a seq number so we
				1717	* can wait for caps to flush without starving.
				1718	*
				1719	* Called under i_ceph_lock.
				1720	*/
				1721	static int __mark_caps_flushing(struct inode *inode,
				1722	struct ceph_mds_session *session, bool wake,
				1723	u64 flush_tid, u64 oldest_flush_tid)
				1724	{
				1725	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
				1726	struct ceph_inode_info *ci = ceph_inode(inode);
				1727	struct ceph_cap_flush *cf = NULL;
				1728	int flushing;
				1729
				1730	BUG_ON(ci->i_dirty_caps == 0);
				1731	BUG_ON(list_empty(&ci->i_dirty_item));
				1732	BUG_ON(!ci->i_prealloc_cap_flush);
				1733
				1734	flushing = ci->i_dirty_caps;
				1735	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
				1736	ceph_cap_string(flushing),
				1737	ceph_cap_string(ci->i_flushing_caps),
				1738	ceph_cap_string(ci->i_flushing_caps \| flushing));
				1739	ci->i_flushing_caps \|= flushing;
				1740	ci->i_dirty_caps = 0;
				1741	dout(" inode %p now !dirty\n", inode);
				1742
				1743	swap(cf, ci->i_prealloc_cap_flush);
				1744	cf->caps = flushing;
				1745	cf->wake = wake;
				1746
				1747	spin_lock(&mdsc->cap_dirty_lock);
				1748	list_del_init(&ci->i_dirty_item);
				1749
				1750	cf->tid = ++mdsc->last_cap_flush_tid;
				1751	list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
				1752	*oldest_flush_tid = __get_oldest_flush_tid(mdsc);
				1753
				1754	if (list_empty(&ci->i_flushing_item)) {
				1755	list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
				1756	mdsc->num_cap_flushing++;
				1757	}
				1758	spin_unlock(&mdsc->cap_dirty_lock);
				1759
				1760	list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
				1761
				1762	*flush_tid = cf->tid;
				1763	return flushing;
				1764	}
				1765
				1766	/*
				1767	* try to invalidate mapping pages without blocking.
				1768	*/
				1769	static int try_nonblocking_invalidate(struct inode *inode)
				1770	{
				1771	struct ceph_inode_info *ci = ceph_inode(inode);
				1772	u32 invalidating_gen = ci->i_rdcache_gen;
				1773
				1774	spin_unlock(&ci->i_ceph_lock);
				1775	invalidate_mapping_pages(&inode->i_data, 0, -1);
				1776	spin_lock(&ci->i_ceph_lock);
				1777
				1778	if (inode->i_data.nrpages == 0 &&
				1779	invalidating_gen == ci->i_rdcache_gen) {
				1780	/* success. */
				1781	dout("try_nonblocking_invalidate %p success\n", inode);
				1782	/* save any racing async invalidate some trouble */
				1783	ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
				1784	return 0;
				1785	}
				1786	dout("try_nonblocking_invalidate %p failed\n", inode);
				1787	return -1;
				1788	}
				1789
				1790	bool __ceph_should_report_size(struct ceph_inode_info *ci)
				1791	{
				1792	loff_t size = ci->vfs_inode.i_size;
				1793	/* mds will adjust max size according to the reported size */
				1794	if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
				1795	return false;
				1796	if (size >= ci->i_max_size)
				1797	return true;
				1798	/* half of previous max_size increment has been used */
				1799	if (ci->i_max_size > ci->i_reported_size &&
				1800	(size << 1) >= ci->i_max_size + ci->i_reported_size)
				1801	return true;
				1802	return false;
				1803	}
				1804
				1805	/*
				1806	* Swiss army knife function to examine currently used and wanted
				1807	* versus held caps. Release, flush, ack revoked caps to mds as
				1808	* appropriate.
				1809	*
				1810	* CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
				1811	* cap release further.
				1812	* CHECK_CAPS_AUTHONLY - we should only check the auth cap
				1813	* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
				1814	* further delay.
				1815	*/
				1816	void ceph_check_caps(struct ceph_inode_info *ci, int flags,
				1817	struct ceph_mds_session *session)
				1818	{
				1819	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
				1820	struct ceph_mds_client *mdsc = fsc->mdsc;
				1821	struct inode *inode = &ci->vfs_inode;
				1822	struct ceph_cap *cap;
				1823	u64 flush_tid, oldest_flush_tid;
				1824	int file_wanted, used, cap_used;
				1825	int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
				1826	int issued, implemented, want, retain, revoking, flushing = 0;
				1827	int mds = -1; /* keep track of how far we've gone through i_caps list
				1828	to avoid an infinite loop on retry */
				1829	struct rb_node *p;
				1830	int delayed = 0, sent = 0;
				1831	bool no_delay = flags & CHECK_CAPS_NODELAY;
				1832	bool queue_invalidate = false;
				1833	bool tried_invalidate = false;
				1834
				1835	/* if we are unmounting, flush any unused caps immediately. */
				1836	if (mdsc->stopping)
				1837	no_delay = true;
				1838
				1839	spin_lock(&ci->i_ceph_lock);
				1840
				1841	if (ci->i_ceph_flags & CEPH_I_FLUSH)
				1842	flags \|= CHECK_CAPS_FLUSH;
				1843
				1844	if (!(flags & CHECK_CAPS_AUTHONLY) \|\|
				1845	(ci->i_auth_cap && __ceph_is_single_caps(ci)))
				1846	__cap_delay_cancel(mdsc, ci);
				1847
				1848	goto retry_locked;
				1849	retry:
				1850	spin_lock(&ci->i_ceph_lock);
				1851	retry_locked:
				1852	file_wanted = __ceph_caps_file_wanted(ci);
				1853	used = __ceph_caps_used(ci);
				1854	issued = __ceph_caps_issued(ci, &implemented);
				1855	revoking = implemented & ~issued;
				1856
				1857	want = file_wanted;
				1858	retain = file_wanted \| used \| CEPH_CAP_PIN;
				1859	if (!mdsc->stopping && inode->i_nlink > 0) {
				1860	if (file_wanted) {
				1861	retain \|= CEPH_CAP_ANY; /* be greedy */
				1862	} else if (S_ISDIR(inode->i_mode) &&
				1863	(issued & CEPH_CAP_FILE_SHARED) &&
				1864	__ceph_dir_is_complete(ci)) {
				1865	/*
				1866	* If a directory is complete, we want to keep
				1867	* the exclusive cap. So that MDS does not end up
				1868	* revoking the shared cap on every create/unlink
				1869	* operation.
				1870	*/
				1871	want = CEPH_CAP_ANY_SHARED \| CEPH_CAP_FILE_EXCL;
				1872	retain \|= want;
				1873	} else {
				1874
				1875	retain \|= CEPH_CAP_ANY_SHARED;
				1876	/*
				1877	* keep RD only if we didn't have the file open RW,
				1878	* because then the mds would revoke it anyway to
				1879	* journal max_size=0.
				1880	*/
				1881	if (ci->i_max_size == 0)
				1882	retain \|= CEPH_CAP_ANY_RD;
				1883	}
				1884	}
				1885
				1886	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
				1887	" issued %s revoking %s retain %s %s%s%s\n", inode,
				1888	ceph_cap_string(file_wanted),
				1889	ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
				1890	ceph_cap_string(ci->i_flushing_caps),
				1891	ceph_cap_string(issued), ceph_cap_string(revoking),
				1892	ceph_cap_string(retain),
				1893	(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
				1894	(flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
				1895	(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
				1896
				1897	/*
				1898	* If we no longer need to hold onto our old caps, and we may
				1899	* have cached pages, but don't want them, then try to invalidate.
				1900	* If we fail, it's because pages are locked.... try again later.
				1901	*/
				1902	if ((!no_delay \|\| mdsc->stopping) &&
				1903	!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
				1904	!(ci->i_wb_ref \|\| ci->i_wrbuffer_ref) && /* no dirty pages... */
				1905	inode->i_data.nrpages && /* have cached pages */
				1906	(revoking & (CEPH_CAP_FILE_CACHE\|
				1907	CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
				1908	!tried_invalidate) {
				1909	dout("check_caps trying to invalidate on %p\n", inode);
				1910	if (try_nonblocking_invalidate(inode) < 0) {
				1911	dout("check_caps queuing invalidate\n");
				1912	queue_invalidate = true;
				1913	ci->i_rdcache_revoking = ci->i_rdcache_gen;
				1914	}
				1915	tried_invalidate = true;
				1916	goto retry_locked;
				1917	}
				1918
				1919	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				1920	cap = rb_entry(p, struct ceph_cap, ci_node);
				1921
				1922	/* avoid looping forever */
				1923	if (mds >= cap->mds \|\|
				1924	((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
				1925	continue;
				1926
				1927	/* NOTE: no side-effects allowed, until we take s_mutex */
				1928
				1929	cap_used = used;
				1930	if (ci->i_auth_cap && cap != ci->i_auth_cap)
				1931	cap_used &= ~ci->i_auth_cap->issued;
				1932
				1933	revoking = cap->implemented & ~cap->issued;
				1934	dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
				1935	cap->mds, cap, ceph_cap_string(cap_used),
				1936	ceph_cap_string(cap->issued),
				1937	ceph_cap_string(cap->implemented),
				1938	ceph_cap_string(revoking));
				1939
				1940	if (cap == ci->i_auth_cap &&
				1941	(cap->issued & CEPH_CAP_FILE_WR)) {
				1942	/* request larger max_size from MDS? */
				1943	if (ci->i_wanted_max_size > ci->i_max_size &&
				1944	ci->i_wanted_max_size > ci->i_requested_max_size) {
				1945	dout("requesting new max_size\n");
				1946	goto ack;
				1947	}
				1948
				1949	/* approaching file_max? */
				1950	if (__ceph_should_report_size(ci)) {
				1951	dout("i_size approaching max_size\n");
				1952	goto ack;
				1953	}
				1954	}
				1955	/* flush anything dirty? */
				1956	if (cap == ci->i_auth_cap) {
				1957	if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
				1958	dout("flushing dirty caps\n");
				1959	goto ack;
				1960	}
				1961	if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
				1962	dout("flushing snap caps\n");
				1963	goto ack;
				1964	}
				1965	}
				1966
				1967	/* completed revocation? going down and there are no caps? */
				1968	if (revoking && (revoking & cap_used) == 0) {
				1969	dout("completed revocation of %s\n",
				1970	ceph_cap_string(cap->implemented & ~cap->issued));
				1971	goto ack;
				1972	}
				1973
				1974	/* want more caps from mds? */
				1975	if (want & ~(cap->mds_wanted \| cap->issued))
				1976	goto ack;
				1977
				1978	/* things we might delay */
				1979	if ((cap->issued & ~retain) == 0 &&
				1980	cap->mds_wanted == want)
				1981	continue; /* nope, all good */
				1982
				1983	if (no_delay)
				1984	goto ack;
				1985
				1986	/* delay? */
				1987	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
				1988	time_before(jiffies, ci->i_hold_caps_max)) {
				1989	dout(" delaying issued %s -> %s, wanted %s -> %s\n",
				1990	ceph_cap_string(cap->issued),
				1991	ceph_cap_string(cap->issued & retain),
				1992	ceph_cap_string(cap->mds_wanted),
				1993	ceph_cap_string(want));
				1994	delayed++;
				1995	continue;
				1996	}
				1997
				1998	ack:
				1999	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
				2000	dout(" skipping %p I_NOFLUSH set\n", inode);
				2001	continue;
				2002	}
				2003
				2004	if (session && session != cap->session) {
				2005	dout("oops, wrong session %p mutex\n", session);
				2006	mutex_unlock(&session->s_mutex);
				2007	session = NULL;
				2008	}
				2009	if (!session) {
				2010	session = cap->session;
				2011	if (mutex_trylock(&session->s_mutex) == 0) {
				2012	dout("inverting session/ino locks on %p\n",
				2013	session);
				2014	spin_unlock(&ci->i_ceph_lock);
				2015	if (took_snap_rwsem) {
				2016	up_read(&mdsc->snap_rwsem);
				2017	took_snap_rwsem = 0;
				2018	}
				2019	mutex_lock(&session->s_mutex);
				2020	goto retry;
				2021	}
				2022	}
				2023
				2024	/* kick flushing and flush snaps before sending normal
				2025	* cap message */
				2026	if (cap == ci->i_auth_cap &&
				2027	(ci->i_ceph_flags &
				2028	(CEPH_I_KICK_FLUSH \| CEPH_I_FLUSH_SNAPS))) {
				2029	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
				2030	__kick_flushing_caps(mdsc, session, ci, 0);
				2031	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
				2032	}
				2033	if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
				2034	__ceph_flush_snaps(ci, session);
				2035
				2036	goto retry_locked;
				2037	}
				2038
				2039	/* take snap_rwsem after session mutex */
				2040	if (!took_snap_rwsem) {
				2041	if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
				2042	dout("inverting snap/in locks on %p\n",
				2043	inode);
				2044	spin_unlock(&ci->i_ceph_lock);
				2045	down_read(&mdsc->snap_rwsem);
				2046	took_snap_rwsem = 1;
				2047	goto retry;
				2048	}
				2049	took_snap_rwsem = 1;
				2050	}
				2051
				2052	if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
				2053	flushing = __mark_caps_flushing(inode, session, false,
				2054	&flush_tid,
				2055	&oldest_flush_tid);
				2056	} else {
				2057	flushing = 0;
				2058	flush_tid = 0;
				2059	spin_lock(&mdsc->cap_dirty_lock);
				2060	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
				2061	spin_unlock(&mdsc->cap_dirty_lock);
				2062	}
				2063
				2064	mds = cap->mds; /* remember mds, so we don't repeat */
				2065	sent++;
				2066
				2067	/* __send_cap drops i_ceph_lock */
				2068	delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false,
				2069	cap_used, want, retain, flushing,
				2070	flush_tid, oldest_flush_tid);
				2071	goto retry; /* retake i_ceph_lock and restart our cap scan. */
				2072	}
				2073
				2074	/* Reschedule delayed caps release if we delayed anything */
				2075	if (delayed)
				2076	__cap_delay_requeue(mdsc, ci);
				2077
				2078	spin_unlock(&ci->i_ceph_lock);
				2079
				2080	if (queue_invalidate)
				2081	ceph_queue_invalidate(inode);
				2082
				2083	if (session)
				2084	mutex_unlock(&session->s_mutex);
				2085	if (took_snap_rwsem)
				2086	up_read(&mdsc->snap_rwsem);
				2087	}
				2088
				2089	/*
				2090	* Try to flush dirty caps back to the auth mds.
				2091	*/
				2092	static int try_flush_caps(struct inode inode, u64 ptid)
				2093	{
				2094	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
				2095	struct ceph_inode_info *ci = ceph_inode(inode);
				2096	struct ceph_mds_session *session = NULL;
				2097	int flushing = 0;
				2098	u64 flush_tid = 0, oldest_flush_tid = 0;
				2099
				2100	retry:
				2101	spin_lock(&ci->i_ceph_lock);
				2102	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
				2103	spin_unlock(&ci->i_ceph_lock);
				2104	dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
				2105	goto out;
				2106	}
				2107	if (ci->i_dirty_caps && ci->i_auth_cap) {
				2108	struct ceph_cap *cap = ci->i_auth_cap;
				2109	int used = __ceph_caps_used(ci);
				2110	int want = __ceph_caps_wanted(ci);
				2111	int delayed;
				2112
				2113	if (!session \|\| session != cap->session) {
				2114	spin_unlock(&ci->i_ceph_lock);
				2115	if (session)
				2116	mutex_unlock(&session->s_mutex);
				2117	session = cap->session;
				2118	mutex_lock(&session->s_mutex);
				2119	goto retry;
				2120	}
				2121	if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) {
				2122	spin_unlock(&ci->i_ceph_lock);
				2123	goto out;
				2124	}
				2125
				2126	flushing = __mark_caps_flushing(inode, session, true,
				2127	&flush_tid, &oldest_flush_tid);
				2128
				2129	/* __send_cap drops i_ceph_lock */
				2130	delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true,
				2131	used, want, (cap->issued \| cap->implemented),
				2132	flushing, flush_tid, oldest_flush_tid);
				2133
				2134	if (delayed) {
				2135	spin_lock(&ci->i_ceph_lock);
				2136	__cap_delay_requeue(mdsc, ci);
				2137	spin_unlock(&ci->i_ceph_lock);
				2138	}
				2139	} else {
				2140	if (!list_empty(&ci->i_cap_flush_list)) {
				2141	struct ceph_cap_flush *cf =
				2142	list_last_entry(&ci->i_cap_flush_list,
				2143	struct ceph_cap_flush, i_list);
				2144	cf->wake = true;
				2145	flush_tid = cf->tid;
				2146	}
				2147	flushing = ci->i_flushing_caps;
				2148	spin_unlock(&ci->i_ceph_lock);
				2149	}
				2150	out:
				2151	if (session)
				2152	mutex_unlock(&session->s_mutex);
				2153
				2154	*ptid = flush_tid;
				2155	return flushing;
				2156	}
				2157
				2158	/*
				2159	* Return true if we've flushed caps through the given flush_tid.
				2160	*/
				2161	static int caps_are_flushed(struct inode *inode, u64 flush_tid)
				2162	{
				2163	struct ceph_inode_info *ci = ceph_inode(inode);
				2164	int ret = 1;
				2165
				2166	spin_lock(&ci->i_ceph_lock);
				2167	if (!list_empty(&ci->i_cap_flush_list)) {
				2168	struct ceph_cap_flush * cf =
				2169	list_first_entry(&ci->i_cap_flush_list,
				2170	struct ceph_cap_flush, i_list);
				2171	if (cf->tid <= flush_tid)
				2172	ret = 0;
				2173	}
				2174	spin_unlock(&ci->i_ceph_lock);
				2175	return ret;
				2176	}
				2177
				2178	/*
				2179	* wait for any unsafe requests to complete.
				2180	*/
				2181	static int unsafe_request_wait(struct inode *inode)
				2182	{
				2183	struct ceph_inode_info *ci = ceph_inode(inode);
				2184	struct ceph_mds_request req1 = NULL, req2 = NULL;
				2185	int ret, err = 0;
				2186
				2187	spin_lock(&ci->i_unsafe_lock);
				2188	if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
				2189	req1 = list_last_entry(&ci->i_unsafe_dirops,
				2190	struct ceph_mds_request,
				2191	r_unsafe_dir_item);
				2192	ceph_mdsc_get_request(req1);
				2193	}
				2194	if (!list_empty(&ci->i_unsafe_iops)) {
				2195	req2 = list_last_entry(&ci->i_unsafe_iops,
				2196	struct ceph_mds_request,
				2197	r_unsafe_target_item);
				2198	ceph_mdsc_get_request(req2);
				2199	}
				2200	spin_unlock(&ci->i_unsafe_lock);
				2201
				2202	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
				2203	inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
				2204	if (req1) {
				2205	ret = !wait_for_completion_timeout(&req1->r_safe_completion,
				2206	ceph_timeout_jiffies(req1->r_timeout));
				2207	if (ret)
				2208	err = -EIO;
				2209	ceph_mdsc_put_request(req1);
				2210	}
				2211	if (req2) {
				2212	ret = !wait_for_completion_timeout(&req2->r_safe_completion,
				2213	ceph_timeout_jiffies(req2->r_timeout));
				2214	if (ret)
				2215	err = -EIO;
				2216	ceph_mdsc_put_request(req2);
				2217	}
				2218	return err;
				2219	}
				2220
				2221	int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
				2222	{
				2223	struct inode *inode = file->f_mapping->host;
				2224	struct ceph_inode_info *ci = ceph_inode(inode);
				2225	u64 flush_tid;
				2226	int ret;
				2227	int dirty;
				2228
				2229	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
				2230
				2231	ret = file_write_and_wait_range(file, start, end);
				2232	if (ret < 0)
				2233	goto out;
				2234
				2235	if (datasync)
				2236	goto out;
				2237
				2238	inode_lock(inode);
				2239
				2240	dirty = try_flush_caps(inode, &flush_tid);
				2241	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
				2242
				2243	ret = unsafe_request_wait(inode);
				2244
				2245	/*
				2246	* only wait on non-file metadata writeback (the mds
				2247	* can recover size and mtime, so we don't need to
				2248	* wait for that)
				2249	*/
				2250	if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
				2251	ret = wait_event_interruptible(ci->i_cap_wq,
				2252	caps_are_flushed(inode, flush_tid));
				2253	}
				2254	inode_unlock(inode);
				2255	out:
				2256	dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
				2257	return ret;
				2258	}
				2259
				2260	/*
				2261	* Flush any dirty caps back to the mds. If we aren't asked to wait,
				2262	* queue inode for flush but don't do so immediately, because we can
				2263	* get by with fewer MDS messages if we wait for data writeback to
				2264	* complete first.
				2265	*/
				2266	int ceph_write_inode(struct inode inode, struct writeback_control wbc)
				2267	{
				2268	struct ceph_inode_info *ci = ceph_inode(inode);
				2269	u64 flush_tid;
				2270	int err = 0;
				2271	int dirty;
				2272	int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
				2273
				2274	dout("write_inode %p wait=%d\n", inode, wait);
				2275	if (wait) {
				2276	dirty = try_flush_caps(inode, &flush_tid);
				2277	if (dirty)
				2278	err = wait_event_interruptible(ci->i_cap_wq,
				2279	caps_are_flushed(inode, flush_tid));
				2280	} else {
				2281	struct ceph_mds_client *mdsc =
				2282	ceph_sb_to_client(inode->i_sb)->mdsc;
				2283
				2284	spin_lock(&ci->i_ceph_lock);
				2285	if (__ceph_caps_dirty(ci))
				2286	__cap_delay_requeue_front(mdsc, ci);
				2287	spin_unlock(&ci->i_ceph_lock);
				2288	}
				2289	return err;
				2290	}
				2291
				2292	static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
				2293	struct ceph_mds_session *session,
				2294	struct ceph_inode_info *ci,
				2295	u64 oldest_flush_tid)
				2296	__releases(ci->i_ceph_lock)
				2297	__acquires(ci->i_ceph_lock)
				2298	{
				2299	struct inode *inode = &ci->vfs_inode;
				2300	struct ceph_cap *cap;
				2301	struct ceph_cap_flush *cf;
				2302	int ret;
				2303	u64 first_tid = 0;
				2304
				2305	list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
				2306	if (cf->tid < first_tid)
				2307	continue;
				2308
				2309	cap = ci->i_auth_cap;
				2310	if (!(cap && cap->session == session)) {
				2311	pr_err("%p auth cap %p not mds%d ???\n",
				2312	inode, cap, session->s_mds);
				2313	break;
				2314	}
				2315
				2316	first_tid = cf->tid + 1;
				2317
				2318	if (cf->caps) {
				2319	dout("kick_flushing_caps %p cap %p tid %llu %s\n",
				2320	inode, cap, cf->tid, ceph_cap_string(cf->caps));
				2321	ci->i_ceph_flags \|= CEPH_I_NODELAY;
				2322	ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
				2323	false, __ceph_caps_used(ci),
				2324	__ceph_caps_wanted(ci),
				2325	cap->issued \| cap->implemented,
				2326	cf->caps, cf->tid, oldest_flush_tid);
				2327	if (ret) {
				2328	pr_err("kick_flushing_caps: error sending "
				2329	"cap flush, ino (%llx.%llx) "
				2330	"tid %llu flushing %s\n",
				2331	ceph_vinop(inode), cf->tid,
				2332	ceph_cap_string(cf->caps));
				2333	}
				2334	} else {
				2335	struct ceph_cap_snap *capsnap =
				2336	container_of(cf, struct ceph_cap_snap,
				2337	cap_flush);
				2338	dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
				2339	inode, capsnap, cf->tid,
				2340	ceph_cap_string(capsnap->dirty));
				2341
				2342	refcount_inc(&capsnap->nref);
				2343	spin_unlock(&ci->i_ceph_lock);
				2344
				2345	ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
				2346	oldest_flush_tid);
				2347	if (ret < 0) {
				2348	pr_err("kick_flushing_caps: error sending "
				2349	"cap flushsnap, ino (%llx.%llx) "
				2350	"tid %llu follows %llu\n",
				2351	ceph_vinop(inode), cf->tid,
				2352	capsnap->follows);
				2353	}
				2354
				2355	ceph_put_cap_snap(capsnap);
				2356	}
				2357
				2358	spin_lock(&ci->i_ceph_lock);
				2359	}
				2360	}
				2361
				2362	void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
				2363	struct ceph_mds_session *session)
				2364	{
				2365	struct ceph_inode_info *ci;
				2366	struct ceph_cap *cap;
				2367	u64 oldest_flush_tid;
				2368
				2369	dout("early_kick_flushing_caps mds%d\n", session->s_mds);
				2370
				2371	spin_lock(&mdsc->cap_dirty_lock);
				2372	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
				2373	spin_unlock(&mdsc->cap_dirty_lock);
				2374
				2375	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
				2376	spin_lock(&ci->i_ceph_lock);
				2377	cap = ci->i_auth_cap;
				2378	if (!(cap && cap->session == session)) {
				2379	pr_err("%p auth cap %p not mds%d ???\n",
				2380	&ci->vfs_inode, cap, session->s_mds);
				2381	spin_unlock(&ci->i_ceph_lock);
				2382	continue;
				2383	}
				2384
				2385
				2386	/*
				2387	* if flushing caps were revoked, we re-send the cap flush
				2388	* in client reconnect stage. This guarantees MDS * processes
				2389	* the cap flush message before issuing the flushing caps to
				2390	* other client.
				2391	*/
				2392	if ((cap->issued & ci->i_flushing_caps) !=
				2393	ci->i_flushing_caps) {
				2394	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
				2395	__kick_flushing_caps(mdsc, session, ci,
				2396	oldest_flush_tid);
				2397	} else {
				2398	ci->i_ceph_flags \|= CEPH_I_KICK_FLUSH;
				2399	}
				2400
				2401	spin_unlock(&ci->i_ceph_lock);
				2402	}
				2403	}
				2404
				2405	void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
				2406	struct ceph_mds_session *session)
				2407	{
				2408	struct ceph_inode_info *ci;
				2409	struct ceph_cap *cap;
				2410	u64 oldest_flush_tid;
				2411
				2412	dout("kick_flushing_caps mds%d\n", session->s_mds);
				2413
				2414	spin_lock(&mdsc->cap_dirty_lock);
				2415	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
				2416	spin_unlock(&mdsc->cap_dirty_lock);
				2417
				2418	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
				2419	spin_lock(&ci->i_ceph_lock);
				2420	cap = ci->i_auth_cap;
				2421	if (!(cap && cap->session == session)) {
				2422	pr_err("%p auth cap %p not mds%d ???\n",
				2423	&ci->vfs_inode, cap, session->s_mds);
				2424	spin_unlock(&ci->i_ceph_lock);
				2425	continue;
				2426	}
				2427	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
				2428	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
				2429	__kick_flushing_caps(mdsc, session, ci,
				2430	oldest_flush_tid);
				2431	}
				2432	spin_unlock(&ci->i_ceph_lock);
				2433	}
				2434	}
				2435
				2436	static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
				2437	struct ceph_mds_session *session,
				2438	struct inode *inode)
				2439	__releases(ci->i_ceph_lock)
				2440	{
				2441	struct ceph_inode_info *ci = ceph_inode(inode);
				2442	struct ceph_cap *cap;
				2443
				2444	cap = ci->i_auth_cap;
				2445	dout("kick_flushing_inode_caps %p flushing %s\n", inode,
				2446	ceph_cap_string(ci->i_flushing_caps));
				2447
				2448	if (!list_empty(&ci->i_cap_flush_list)) {
				2449	u64 oldest_flush_tid;
				2450	spin_lock(&mdsc->cap_dirty_lock);
				2451	list_move_tail(&ci->i_flushing_item,
				2452	&cap->session->s_cap_flushing);
				2453	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
				2454	spin_unlock(&mdsc->cap_dirty_lock);
				2455
				2456	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
				2457	__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
				2458	spin_unlock(&ci->i_ceph_lock);
				2459	} else {
				2460	spin_unlock(&ci->i_ceph_lock);
				2461	}
				2462	}
				2463
				2464
				2465	/*
				2466	* Take references to capabilities we hold, so that we don't release
				2467	* them to the MDS prematurely.
				2468	*
				2469	* Protected by i_ceph_lock.
				2470	*/
				2471	static void __take_cap_refs(struct ceph_inode_info *ci, int got,
				2472	bool snap_rwsem_locked)
				2473	{
				2474	if (got & CEPH_CAP_PIN)
				2475	ci->i_pin_ref++;
				2476	if (got & CEPH_CAP_FILE_RD)
				2477	ci->i_rd_ref++;
				2478	if (got & CEPH_CAP_FILE_CACHE)
				2479	ci->i_rdcache_ref++;
				2480	if (got & CEPH_CAP_FILE_WR) {
				2481	if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
				2482	BUG_ON(!snap_rwsem_locked);
				2483	ci->i_head_snapc = ceph_get_snap_context(
				2484	ci->i_snap_realm->cached_context);
				2485	}
				2486	ci->i_wr_ref++;
				2487	}
				2488	if (got & CEPH_CAP_FILE_BUFFER) {
				2489	if (ci->i_wb_ref == 0)
				2490	ihold(&ci->vfs_inode);
				2491	ci->i_wb_ref++;
				2492	dout("__take_cap_refs %p wb %d -> %d (?)\n",
				2493	&ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
				2494	}
				2495	}
				2496
				2497	/*
				2498	* Try to grab cap references. Specify those refs we @want, and the
				2499	* minimal set we @need. Also include the larger offset we are writing
				2500	* to (when applicable), and check against max_size here as well.
				2501	* Note that caller is responsible for ensuring max_size increases are
				2502	* requested from the MDS.
				2503	*/
				2504	static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
				2505	loff_t endoff, bool nonblock, int got, int err)
				2506	{
				2507	struct inode *inode = &ci->vfs_inode;
				2508	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
				2509	int ret = 0;
				2510	int have, implemented;
				2511	int file_wanted;
				2512	bool snap_rwsem_locked = false;
				2513
				2514	dout("get_cap_refs %p need %s want %s\n", inode,
				2515	ceph_cap_string(need), ceph_cap_string(want));
				2516
				2517	again:
				2518	spin_lock(&ci->i_ceph_lock);
				2519
				2520	/* make sure file is actually open */
				2521	file_wanted = __ceph_caps_file_wanted(ci);
				2522	if ((file_wanted & need) != need) {
				2523	dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
				2524	ceph_cap_string(need), ceph_cap_string(file_wanted));
				2525	*err = -EBADF;
				2526	ret = 1;
				2527	goto out_unlock;
				2528	}
				2529
				2530	/* finish pending truncate */
				2531	while (ci->i_truncate_pending) {
				2532	spin_unlock(&ci->i_ceph_lock);
				2533	if (snap_rwsem_locked) {
				2534	up_read(&mdsc->snap_rwsem);
				2535	snap_rwsem_locked = false;
				2536	}
				2537	__ceph_do_pending_vmtruncate(inode);
				2538	spin_lock(&ci->i_ceph_lock);
				2539	}
				2540
				2541	have = __ceph_caps_issued(ci, &implemented);
				2542
				2543	if (have & need & CEPH_CAP_FILE_WR) {
				2544	if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
				2545	dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
				2546	inode, endoff, ci->i_max_size);
				2547	if (endoff > ci->i_requested_max_size) {
				2548	*err = -EAGAIN;
				2549	ret = 1;
				2550	}
				2551	goto out_unlock;
				2552	}
				2553	/*
				2554	* If a sync write is in progress, we must wait, so that we
				2555	* can get a final snapshot value for size+mtime.
				2556	*/
				2557	if (__ceph_have_pending_cap_snap(ci)) {
				2558	dout("get_cap_refs %p cap_snap_pending\n", inode);
				2559	goto out_unlock;
				2560	}
				2561	}
				2562
				2563	if ((have & need) == need) {
				2564	/*
				2565	* Look at (implemented & ~have & not) so that we keep waiting
				2566	* on transition from wanted -> needed caps. This is needed
				2567	* for WRBUFFER\|WR -> WR to avoid a new WR sync write from
				2568	* going before a prior buffered writeback happens.
				2569	*/
				2570	int not = want & ~(have & need);
				2571	int revoking = implemented & ~have;
				2572	dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
				2573	inode, ceph_cap_string(have), ceph_cap_string(not),
				2574	ceph_cap_string(revoking));
				2575	if ((revoking & not) == 0) {
				2576	if (!snap_rwsem_locked &&
				2577	!ci->i_head_snapc &&
				2578	(need & CEPH_CAP_FILE_WR)) {
				2579	if (!down_read_trylock(&mdsc->snap_rwsem)) {
				2580	/*
				2581	* we can not call down_read() when
				2582	* task isn't in TASK_RUNNING state
				2583	*/
				2584	if (nonblock) {
				2585	*err = -EAGAIN;
				2586	ret = 1;
				2587	goto out_unlock;
				2588	}
				2589
				2590	spin_unlock(&ci->i_ceph_lock);
				2591	down_read(&mdsc->snap_rwsem);
				2592	snap_rwsem_locked = true;
				2593	goto again;
				2594	}
				2595	snap_rwsem_locked = true;
				2596	}
				2597	*got = need \| (have & want);
				2598	if ((need & CEPH_CAP_FILE_RD) &&
				2599	!(*got & CEPH_CAP_FILE_CACHE))
				2600	ceph_disable_fscache_readpage(ci);
				2601	__take_cap_refs(ci, *got, true);
				2602	ret = 1;
				2603	}
				2604	} else {
				2605	int session_readonly = false;
				2606	if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
				2607	struct ceph_mds_session *s = ci->i_auth_cap->session;
				2608	spin_lock(&s->s_cap_lock);
				2609	session_readonly = s->s_readonly;
				2610	spin_unlock(&s->s_cap_lock);
				2611	}
				2612	if (session_readonly) {
				2613	dout("get_cap_refs %p needed %s but mds%d readonly\n",
				2614	inode, ceph_cap_string(need), ci->i_auth_cap->mds);
				2615	*err = -EROFS;
				2616	ret = 1;
				2617	goto out_unlock;
				2618	}
				2619
				2620	if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
				2621	int mds_wanted;
				2622	if (READ_ONCE(mdsc->fsc->mount_state) ==
				2623	CEPH_MOUNT_SHUTDOWN) {
				2624	dout("get_cap_refs %p forced umount\n", inode);
				2625	*err = -EIO;
				2626	ret = 1;
				2627	goto out_unlock;
				2628	}
				2629	mds_wanted = __ceph_caps_mds_wanted(ci, false);
				2630	if (need & ~(mds_wanted & need)) {
				2631	dout("get_cap_refs %p caps were dropped"
				2632	" (session killed?)\n", inode);
				2633	*err = -ESTALE;
				2634	ret = 1;
				2635	goto out_unlock;
				2636	}
				2637	if (!(file_wanted & ~mds_wanted))
				2638	ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
				2639	}
				2640
				2641	dout("get_cap_refs %p have %s needed %s\n", inode,
				2642	ceph_cap_string(have), ceph_cap_string(need));
				2643	}
				2644	out_unlock:
				2645	spin_unlock(&ci->i_ceph_lock);
				2646	if (snap_rwsem_locked)
				2647	up_read(&mdsc->snap_rwsem);
				2648
				2649	dout("get_cap_refs %p ret %d got %s\n", inode,
				2650	ret, ceph_cap_string(*got));
				2651	return ret;
				2652	}
				2653
				2654	/*
				2655	* Check the offset we are writing up to against our current
				2656	* max_size. If necessary, tell the MDS we want to write to
				2657	* a larger offset.
				2658	*/
				2659	static void check_max_size(struct inode *inode, loff_t endoff)
				2660	{
				2661	struct ceph_inode_info *ci = ceph_inode(inode);
				2662	int check = 0;
				2663
				2664	/* do we need to explicitly request a larger max_size? */
				2665	spin_lock(&ci->i_ceph_lock);
				2666	if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
				2667	dout("write %p at large endoff %llu, req max_size\n",
				2668	inode, endoff);
				2669	ci->i_wanted_max_size = endoff;
				2670	}
				2671	/* duplicate ceph_check_caps()'s logic */
				2672	if (ci->i_auth_cap &&
				2673	(ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
				2674	ci->i_wanted_max_size > ci->i_max_size &&
				2675	ci->i_wanted_max_size > ci->i_requested_max_size)
				2676	check = 1;
				2677	spin_unlock(&ci->i_ceph_lock);
				2678	if (check)
				2679	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				2680	}
				2681
				2682	int ceph_try_get_caps(struct ceph_inode_info ci, int need, int want, int got)
				2683	{
				2684	int ret, err = 0;
				2685
				2686	BUG_ON(need & ~CEPH_CAP_FILE_RD);
				2687	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO));
				2688	ret = ceph_pool_perm_check(ci, need);
				2689	if (ret < 0)
				2690	return ret;
				2691
				2692	ret = try_get_cap_refs(ci, need, want, 0, true, got, &err);
				2693	if (ret) {
				2694	if (err == -EAGAIN) {
				2695	ret = 0;
				2696	} else if (err < 0) {
				2697	ret = err;
				2698	}
				2699	}
				2700	return ret;
				2701	}
				2702
				2703	/*
				2704	* Wait for caps, and take cap references. If we can't get a WR cap
				2705	* due to a small max_size, make sure we check_max_size (and possibly
				2706	* ask the mds) so we don't get hung up indefinitely.
				2707	*/
				2708	int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
				2709	loff_t endoff, int got, struct page *pinned_page)
				2710	{
				2711	int _got, ret, err = 0;
				2712
				2713	ret = ceph_pool_perm_check(ci, need);
				2714	if (ret < 0)
				2715	return ret;
				2716
				2717	while (true) {
				2718	if (endoff > 0)
				2719	check_max_size(&ci->vfs_inode, endoff);
				2720
				2721	err = 0;
				2722	_got = 0;
				2723	ret = try_get_cap_refs(ci, need, want, endoff,
				2724	false, &_got, &err);
				2725	if (ret) {
				2726	if (err == -EAGAIN)
				2727	continue;
				2728	if (err < 0)
				2729	ret = err;
				2730	} else {
				2731	DEFINE_WAIT_FUNC(wait, woken_wake_function);
				2732	add_wait_queue(&ci->i_cap_wq, &wait);
				2733
				2734	while (!try_get_cap_refs(ci, need, want, endoff,
				2735	true, &_got, &err)) {
				2736	if (signal_pending(current)) {
				2737	ret = -ERESTARTSYS;
				2738	break;
				2739	}
				2740	wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
				2741	}
				2742
				2743	remove_wait_queue(&ci->i_cap_wq, &wait);
				2744
				2745	if (err == -EAGAIN)
				2746	continue;
				2747	if (err < 0)
				2748	ret = err;
				2749	}
				2750	if (ret < 0) {
				2751	if (err == -ESTALE) {
				2752	/* session was killed, try renew caps */
				2753	ret = ceph_renew_caps(&ci->vfs_inode);
				2754	if (ret == 0)
				2755	continue;
				2756	}
				2757	return ret;
				2758	}
				2759
				2760	if (ci->i_inline_version != CEPH_INLINE_NONE &&
				2761	(_got & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) &&
				2762	i_size_read(&ci->vfs_inode) > 0) {
				2763	struct page *page =
				2764	find_get_page(ci->vfs_inode.i_mapping, 0);
				2765	if (page) {
				2766	if (PageUptodate(page)) {
				2767	*pinned_page = page;
				2768	break;
				2769	}
				2770	put_page(page);
				2771	}
				2772	/*
				2773	* drop cap refs first because getattr while
				2774	* holding * caps refs can cause deadlock.
				2775	*/
				2776	ceph_put_cap_refs(ci, _got);
				2777	_got = 0;
				2778
				2779	/*
				2780	* getattr request will bring inline data into
				2781	* page cache
				2782	*/
				2783	ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
				2784	CEPH_STAT_CAP_INLINE_DATA,
				2785	true);
				2786	if (ret < 0)
				2787	return ret;
				2788	continue;
				2789	}
				2790	break;
				2791	}
				2792
				2793	if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
				2794	ceph_fscache_revalidate_cookie(ci);
				2795
				2796	*got = _got;
				2797	return 0;
				2798	}
				2799
				2800	/*
				2801	* Take cap refs. Caller must already know we hold at least one ref
				2802	* on the caps in question or we don't know this is safe.
				2803	*/
				2804	void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
				2805	{
				2806	spin_lock(&ci->i_ceph_lock);
				2807	__take_cap_refs(ci, caps, false);
				2808	spin_unlock(&ci->i_ceph_lock);
				2809	}
				2810
				2811
				2812	/*
				2813	* drop cap_snap that is not associated with any snapshot.
				2814	* we don't need to send FLUSHSNAP message for it.
				2815	*/
				2816	static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
				2817	struct ceph_cap_snap *capsnap)
				2818	{
				2819	if (!capsnap->need_flush &&
				2820	!capsnap->writing && !capsnap->dirty_pages) {
				2821	dout("dropping cap_snap %p follows %llu\n",
				2822	capsnap, capsnap->follows);
				2823	BUG_ON(capsnap->cap_flush.tid > 0);
				2824	ceph_put_snap_context(capsnap->context);
				2825	if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
				2826	ci->i_ceph_flags \|= CEPH_I_FLUSH_SNAPS;
				2827
				2828	list_del(&capsnap->ci_item);
				2829	ceph_put_cap_snap(capsnap);
				2830	return 1;
				2831	}
				2832	return 0;
				2833	}
				2834
				2835	/*
				2836	* Release cap refs.
				2837	*
				2838	* If we released the last ref on any given cap, call ceph_check_caps
				2839	* to release (or schedule a release).
				2840	*
				2841	* If we are releasing a WR cap (from a sync write), finalize any affected
				2842	* cap_snap, and wake up any waiters.
				2843	*/
				2844	void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
				2845	{
				2846	struct inode *inode = &ci->vfs_inode;
				2847	int last = 0, put = 0, flushsnaps = 0, wake = 0;
				2848
				2849	spin_lock(&ci->i_ceph_lock);
				2850	if (had & CEPH_CAP_PIN)
				2851	--ci->i_pin_ref;
				2852	if (had & CEPH_CAP_FILE_RD)
				2853	if (--ci->i_rd_ref == 0)
				2854	last++;
				2855	if (had & CEPH_CAP_FILE_CACHE)
				2856	if (--ci->i_rdcache_ref == 0)
				2857	last++;
				2858	if (had & CEPH_CAP_FILE_BUFFER) {
				2859	if (--ci->i_wb_ref == 0) {
				2860	last++;
				2861	put++;
				2862	}
				2863	dout("put_cap_refs %p wb %d -> %d (?)\n",
				2864	inode, ci->i_wb_ref+1, ci->i_wb_ref);
				2865	}
				2866	if (had & CEPH_CAP_FILE_WR)
				2867	if (--ci->i_wr_ref == 0) {
				2868	last++;
				2869	if (__ceph_have_pending_cap_snap(ci)) {
				2870	struct ceph_cap_snap *capsnap =
				2871	list_last_entry(&ci->i_cap_snaps,
				2872	struct ceph_cap_snap,
				2873	ci_item);
				2874	capsnap->writing = 0;
				2875	if (ceph_try_drop_cap_snap(ci, capsnap))
				2876	put++;
				2877	else if (__ceph_finish_cap_snap(ci, capsnap))
				2878	flushsnaps = 1;
				2879	wake = 1;
				2880	}
				2881	if (ci->i_wrbuffer_ref_head == 0 &&
				2882	ci->i_dirty_caps == 0 &&
				2883	ci->i_flushing_caps == 0) {
				2884	BUG_ON(!ci->i_head_snapc);
				2885	ceph_put_snap_context(ci->i_head_snapc);
				2886	ci->i_head_snapc = NULL;
				2887	}
				2888	/* see comment in __ceph_remove_cap() */
				2889	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
				2890	drop_inode_snap_realm(ci);
				2891	}
				2892	spin_unlock(&ci->i_ceph_lock);
				2893
				2894	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
				2895	last ? " last" : "", put ? " put" : "");
				2896
				2897	if (last && !flushsnaps)
				2898	ceph_check_caps(ci, 0, NULL);
				2899	else if (flushsnaps)
				2900	ceph_flush_snaps(ci, NULL);
				2901	if (wake)
				2902	wake_up_all(&ci->i_cap_wq);
				2903	while (put-- > 0)
				2904	iput(inode);
				2905	}
				2906
				2907	/*
				2908	* Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
				2909	* context. Adjust per-snap dirty page accounting as appropriate.
				2910	* Once all dirty data for a cap_snap is flushed, flush snapped file
				2911	* metadata back to the MDS. If we dropped the last ref, call
				2912	* ceph_check_caps.
				2913	*/
				2914	void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
				2915	struct ceph_snap_context *snapc)
				2916	{
				2917	struct inode *inode = &ci->vfs_inode;
				2918	struct ceph_cap_snap *capsnap = NULL;
				2919	int put = 0;
				2920	bool last = false;
				2921	bool found = false;
				2922	bool flush_snaps = false;
				2923	bool complete_capsnap = false;
				2924
				2925	spin_lock(&ci->i_ceph_lock);
				2926	ci->i_wrbuffer_ref -= nr;
				2927	if (ci->i_wrbuffer_ref == 0) {
				2928	last = true;
				2929	put++;
				2930	}
				2931
				2932	if (ci->i_head_snapc == snapc) {
				2933	ci->i_wrbuffer_ref_head -= nr;
				2934	if (ci->i_wrbuffer_ref_head == 0 &&
				2935	ci->i_wr_ref == 0 &&
				2936	ci->i_dirty_caps == 0 &&
				2937	ci->i_flushing_caps == 0) {
				2938	BUG_ON(!ci->i_head_snapc);
				2939	ceph_put_snap_context(ci->i_head_snapc);
				2940	ci->i_head_snapc = NULL;
				2941	}
				2942	dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
				2943	inode,
				2944	ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
				2945	ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
				2946	last ? " LAST" : "");
				2947	} else {
				2948	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				2949	if (capsnap->context == snapc) {
				2950	found = true;
				2951	break;
				2952	}
				2953	}
				2954	BUG_ON(!found);
				2955	capsnap->dirty_pages -= nr;
				2956	if (capsnap->dirty_pages == 0) {
				2957	complete_capsnap = true;
				2958	if (!capsnap->writing) {
				2959	if (ceph_try_drop_cap_snap(ci, capsnap)) {
				2960	put++;
				2961	} else {
				2962	ci->i_ceph_flags \|= CEPH_I_FLUSH_SNAPS;
				2963	flush_snaps = true;
				2964	}
				2965	}
				2966	}
				2967	dout("put_wrbuffer_cap_refs on %p cap_snap %p "
				2968	" snap %lld %d/%d -> %d/%d %s%s\n",
				2969	inode, capsnap, capsnap->context->seq,
				2970	ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
				2971	ci->i_wrbuffer_ref, capsnap->dirty_pages,
				2972	last ? " (wrbuffer last)" : "",
				2973	complete_capsnap ? " (complete capsnap)" : "");
				2974	}
				2975
				2976	spin_unlock(&ci->i_ceph_lock);
				2977
				2978	if (last) {
				2979	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				2980	} else if (flush_snaps) {
				2981	ceph_flush_snaps(ci, NULL);
				2982	}
				2983	if (complete_capsnap)
				2984	wake_up_all(&ci->i_cap_wq);
				2985	while (put-- > 0)
				2986	iput(inode);
				2987	}
				2988
				2989	/*
				2990	* Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
				2991	*/
				2992	static void invalidate_aliases(struct inode *inode)
				2993	{
				2994	struct dentry dn, prev = NULL;
				2995
				2996	dout("invalidate_aliases inode %p\n", inode);
				2997	d_prune_aliases(inode);
				2998	/*
				2999	* For non-directory inode, d_find_alias() only returns
				3000	* hashed dentry. After calling d_invalidate(), the
				3001	* dentry becomes unhashed.
				3002	*
				3003	* For directory inode, d_find_alias() can return
				3004	* unhashed dentry. But directory inode should have
				3005	* one alias at most.
				3006	*/
				3007	while ((dn = d_find_alias(inode))) {
				3008	if (dn == prev) {
				3009	dput(dn);
				3010	break;
				3011	}
				3012	d_invalidate(dn);
				3013	if (prev)
				3014	dput(prev);
				3015	prev = dn;
				3016	}
				3017	if (prev)
				3018	dput(prev);
				3019	}
				3020
				3021	struct cap_extra_info {
				3022	struct ceph_string *pool_ns;
				3023	/* inline data */
				3024	u64 inline_version;
				3025	void *inline_data;
				3026	u32 inline_len;
				3027	/* dirstat */
				3028	bool dirstat_valid;
				3029	u64 nfiles;
				3030	u64 nsubdirs;
				3031	/* currently issued */
				3032	int issued;
				3033	};
				3034
				3035	/*
				3036	* Handle a cap GRANT message from the MDS. (Note that a GRANT may
				3037	* actually be a revocation if it specifies a smaller cap set.)
				3038	*
				3039	* caller holds s_mutex and i_ceph_lock, we drop both.
				3040	*/
				3041	static void handle_cap_grant(struct inode *inode,
				3042	struct ceph_mds_session *session,
				3043	struct ceph_cap *cap,
				3044	struct ceph_mds_caps *grant,
				3045	struct ceph_buffer *xattr_buf,
				3046	struct cap_extra_info *extra_info)
				3047	__releases(ci->i_ceph_lock)
				3048	__releases(session->s_mdsc->snap_rwsem)
				3049	{
				3050	struct ceph_inode_info *ci = ceph_inode(inode);
				3051	int seq = le32_to_cpu(grant->seq);
				3052	int newcaps = le32_to_cpu(grant->caps);
				3053	int used, wanted, dirty;
				3054	u64 size = le64_to_cpu(grant->size);
				3055	u64 max_size = le64_to_cpu(grant->max_size);
				3056	int check_caps = 0;
				3057	bool wake = false;
				3058	bool writeback = false;
				3059	bool queue_trunc = false;
				3060	bool queue_invalidate = false;
				3061	bool deleted_inode = false;
				3062	bool fill_inline = false;
				3063
				3064	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
				3065	inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
				3066	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
				3067	inode->i_size);
				3068
				3069
				3070	/*
				3071	* auth mds of the inode changed. we received the cap export message,
				3072	* but still haven't received the cap import message. handle_cap_export
				3073	* updated the new auth MDS' cap.
				3074	*
				3075	* "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
				3076	* that was sent before the cap import message. So don't remove caps.
				3077	*/
				3078	if (ceph_seq_cmp(seq, cap->seq) <= 0) {
				3079	WARN_ON(cap != ci->i_auth_cap);
				3080	WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
				3081	seq = cap->seq;
				3082	newcaps \|= cap->issued;
				3083	}
				3084
				3085	/*
				3086	* If CACHE is being revoked, and we have no dirty buffers,
				3087	* try to invalidate (once). (If there are dirty buffers, we
				3088	* will invalidate _after_ writeback.)
				3089	*/
				3090	if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
				3091	((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
				3092	(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
				3093	!(ci->i_wrbuffer_ref \|\| ci->i_wb_ref)) {
				3094	if (try_nonblocking_invalidate(inode)) {
				3095	/* there were locked pages.. invalidate later
				3096	in a separate thread. */
				3097	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
				3098	queue_invalidate = true;
				3099	ci->i_rdcache_revoking = ci->i_rdcache_gen;
				3100	}
				3101	}
				3102	}
				3103
				3104	/* side effects now are allowed */
				3105	cap->cap_gen = session->s_cap_gen;
				3106	cap->seq = seq;
				3107
				3108	__check_cap_issue(ci, cap, newcaps);
				3109
				3110	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
				3111	(extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
				3112	inode->i_mode = le32_to_cpu(grant->mode);
				3113	inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
				3114	inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
				3115	dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
				3116	from_kuid(&init_user_ns, inode->i_uid),
				3117	from_kgid(&init_user_ns, inode->i_gid));
				3118	}
				3119
				3120	if ((newcaps & CEPH_CAP_LINK_SHARED) &&
				3121	(extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
				3122	set_nlink(inode, le32_to_cpu(grant->nlink));
				3123	if (inode->i_nlink == 0 &&
				3124	(newcaps & (CEPH_CAP_LINK_SHARED \| CEPH_CAP_LINK_EXCL)))
				3125	deleted_inode = true;
				3126	}
				3127
				3128	if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 &&
				3129	grant->xattr_len) {
				3130	int len = le32_to_cpu(grant->xattr_len);
				3131	u64 version = le64_to_cpu(grant->xattr_version);
				3132
				3133	if (version > ci->i_xattrs.version) {
				3134	dout(" got new xattrs v%llu on %p len %d\n",
				3135	version, inode, len);
				3136	if (ci->i_xattrs.blob)
				3137	ceph_buffer_put(ci->i_xattrs.blob);
				3138	ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
				3139	ci->i_xattrs.version = version;
				3140	ceph_forget_all_cached_acls(inode);
				3141	}
				3142	}
				3143
				3144	if (newcaps & CEPH_CAP_ANY_RD) {
				3145	struct timespec64 mtime, atime, ctime;
				3146	/* ctime/mtime/atime? */
				3147	ceph_decode_timespec64(&mtime, &grant->mtime);
				3148	ceph_decode_timespec64(&atime, &grant->atime);
				3149	ceph_decode_timespec64(&ctime, &grant->ctime);
				3150	ceph_fill_file_time(inode, extra_info->issued,
				3151	le32_to_cpu(grant->time_warp_seq),
				3152	&ctime, &mtime, &atime);
				3153	}
				3154
				3155	if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) {
				3156	ci->i_files = extra_info->nfiles;
				3157	ci->i_subdirs = extra_info->nsubdirs;
				3158	}
				3159
				3160	if (newcaps & (CEPH_CAP_ANY_FILE_RD \| CEPH_CAP_ANY_FILE_WR)) {
				3161	/* file layout may have changed */
				3162	s64 old_pool = ci->i_layout.pool_id;
				3163	struct ceph_string *old_ns;
				3164
				3165	ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
				3166	old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
				3167	lockdep_is_held(&ci->i_ceph_lock));
				3168	rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns);
				3169
				3170	if (ci->i_layout.pool_id != old_pool \|\|
				3171	extra_info->pool_ns != old_ns)
				3172	ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
				3173
				3174	extra_info->pool_ns = old_ns;
				3175
				3176	/* size/truncate_seq? */
				3177	queue_trunc = ceph_fill_file_size(inode, extra_info->issued,
				3178	le32_to_cpu(grant->truncate_seq),
				3179	le64_to_cpu(grant->truncate_size),
				3180	size);
				3181	}
				3182
				3183	if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
				3184	if (max_size != ci->i_max_size) {
				3185	dout("max_size %lld -> %llu\n",
				3186	ci->i_max_size, max_size);
				3187	ci->i_max_size = max_size;
				3188	if (max_size >= ci->i_wanted_max_size) {
				3189	ci->i_wanted_max_size = 0; /* reset */
				3190	ci->i_requested_max_size = 0;
				3191	}
				3192	wake = true;
				3193	} else if (ci->i_wanted_max_size > ci->i_max_size &&
				3194	ci->i_wanted_max_size > ci->i_requested_max_size) {
				3195	/* CEPH_CAP_OP_IMPORT */
				3196	wake = true;
				3197	}
				3198	}
				3199
				3200	/* check cap bits */
				3201	wanted = __ceph_caps_wanted(ci);
				3202	used = __ceph_caps_used(ci);
				3203	dirty = __ceph_caps_dirty(ci);
				3204	dout(" my wanted = %s, used = %s, dirty %s\n",
				3205	ceph_cap_string(wanted),
				3206	ceph_cap_string(used),
				3207	ceph_cap_string(dirty));
				3208	if (wanted != le32_to_cpu(grant->wanted)) {
				3209	dout("mds wanted %s -> %s\n",
				3210	ceph_cap_string(le32_to_cpu(grant->wanted)),
				3211	ceph_cap_string(wanted));
				3212	/* imported cap may not have correct mds_wanted */
				3213	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
				3214	check_caps = 1;
				3215	}
				3216
				3217	/* revocation, grant, or no-op? */
				3218	if (cap->issued & ~newcaps) {
				3219	int revoking = cap->issued & ~newcaps;
				3220
				3221	dout("revocation: %s -> %s (revoking %s)\n",
				3222	ceph_cap_string(cap->issued),
				3223	ceph_cap_string(newcaps),
				3224	ceph_cap_string(revoking));
				3225	if (revoking & used & CEPH_CAP_FILE_BUFFER)
				3226	writeback = true; /* initiate writeback; will delay ack */
				3227	else if (revoking == CEPH_CAP_FILE_CACHE &&
				3228	(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
				3229	queue_invalidate)
				3230	; /* do nothing yet, invalidation will be queued */
				3231	else if (cap == ci->i_auth_cap)
				3232	check_caps = 1; /* check auth cap only */
				3233	else
				3234	check_caps = 2; /* check all caps */
				3235	cap->issued = newcaps;
				3236	cap->implemented \|= newcaps;
				3237	} else if (cap->issued == newcaps) {
				3238	dout("caps unchanged: %s -> %s\n",
				3239	ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
				3240	} else {
				3241	dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
				3242	ceph_cap_string(newcaps));
				3243	/* non-auth MDS is revoking the newly grant caps ? */
				3244	if (cap == ci->i_auth_cap &&
				3245	__ceph_caps_revoking_other(ci, cap, newcaps))
				3246	check_caps = 2;
				3247
				3248	cap->issued = newcaps;
				3249	cap->implemented \|= newcaps; /* add bits only, to
				3250	* avoid stepping on a
				3251	* pending revocation */
				3252	wake = true;
				3253	}
				3254	BUG_ON(cap->issued & ~cap->implemented);
				3255
				3256	if (extra_info->inline_version > 0 &&
				3257	extra_info->inline_version >= ci->i_inline_version) {
				3258	ci->i_inline_version = extra_info->inline_version;
				3259	if (ci->i_inline_version != CEPH_INLINE_NONE &&
				3260	(newcaps & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)))
				3261	fill_inline = true;
				3262	}
				3263
				3264	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
				3265	if (newcaps & ~extra_info->issued)
				3266	wake = true;
				3267	kick_flushing_inode_caps(session->s_mdsc, session, inode);
				3268	up_read(&session->s_mdsc->snap_rwsem);
				3269	} else {
				3270	spin_unlock(&ci->i_ceph_lock);
				3271	}
				3272
				3273	if (fill_inline)
				3274	ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
				3275	extra_info->inline_len);
				3276
				3277	if (queue_trunc)
				3278	ceph_queue_vmtruncate(inode);
				3279
				3280	if (writeback)
				3281	/*
				3282	* queue inode for writeback: we can't actually call
				3283	* filemap_write_and_wait, etc. from message handler
				3284	* context.
				3285	*/
				3286	ceph_queue_writeback(inode);
				3287	if (queue_invalidate)
				3288	ceph_queue_invalidate(inode);
				3289	if (deleted_inode)
				3290	invalidate_aliases(inode);
				3291	if (wake)
				3292	wake_up_all(&ci->i_cap_wq);
				3293
				3294	if (check_caps == 1)
				3295	ceph_check_caps(ci, CHECK_CAPS_NODELAY\|CHECK_CAPS_AUTHONLY,
				3296	session);
				3297	else if (check_caps == 2)
				3298	ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
				3299	else
				3300	mutex_unlock(&session->s_mutex);
				3301	}
				3302
				3303	/*
				3304	* Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
				3305	* MDS has been safely committed.
				3306	*/
				3307	static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
				3308	struct ceph_mds_caps *m,
				3309	struct ceph_mds_session *session,
				3310	struct ceph_cap *cap)
				3311	__releases(ci->i_ceph_lock)
				3312	{
				3313	struct ceph_inode_info *ci = ceph_inode(inode);
				3314	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
				3315	struct ceph_cap_flush cf, tmp_cf;
				3316	LIST_HEAD(to_remove);
				3317	unsigned seq = le32_to_cpu(m->seq);
				3318	int dirty = le32_to_cpu(m->dirty);
				3319	int cleaned = 0;
				3320	bool drop = false;
				3321	bool wake_ci = false;
				3322	bool wake_mdsc = false;
				3323
				3324	list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
				3325	if (cf->tid == flush_tid)
				3326	cleaned = cf->caps;
				3327	if (cf->caps == 0) /* capsnap */
				3328	continue;
				3329	if (cf->tid <= flush_tid) {
				3330	if (__finish_cap_flush(NULL, ci, cf))
				3331	wake_ci = true;
				3332	list_add_tail(&cf->i_list, &to_remove);
				3333	} else {
				3334	cleaned &= ~cf->caps;
				3335	if (!cleaned)
				3336	break;
				3337	}
				3338	}
				3339
				3340	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
				3341	" flushing %s -> %s\n",
				3342	inode, session->s_mds, seq, ceph_cap_string(dirty),
				3343	ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
				3344	ceph_cap_string(ci->i_flushing_caps & ~cleaned));
				3345
				3346	if (list_empty(&to_remove) && !cleaned)
				3347	goto out;
				3348
				3349	ci->i_flushing_caps &= ~cleaned;
				3350
				3351	spin_lock(&mdsc->cap_dirty_lock);
				3352
				3353	list_for_each_entry(cf, &to_remove, i_list) {
				3354	if (__finish_cap_flush(mdsc, NULL, cf))
				3355	wake_mdsc = true;
				3356	}
				3357
				3358	if (ci->i_flushing_caps == 0) {
				3359	if (list_empty(&ci->i_cap_flush_list)) {
				3360	list_del_init(&ci->i_flushing_item);
				3361	if (!list_empty(&session->s_cap_flushing)) {
				3362	dout(" mds%d still flushing cap on %p\n",
				3363	session->s_mds,
				3364	&list_first_entry(&session->s_cap_flushing,
				3365	struct ceph_inode_info,
				3366	i_flushing_item)->vfs_inode);
				3367	}
				3368	}
				3369	mdsc->num_cap_flushing--;
				3370	dout(" inode %p now !flushing\n", inode);
				3371
				3372	if (ci->i_dirty_caps == 0) {
				3373	dout(" inode %p now clean\n", inode);
				3374	BUG_ON(!list_empty(&ci->i_dirty_item));
				3375	drop = true;
				3376	if (ci->i_wr_ref == 0 &&
				3377	ci->i_wrbuffer_ref_head == 0) {
				3378	BUG_ON(!ci->i_head_snapc);
				3379	ceph_put_snap_context(ci->i_head_snapc);
				3380	ci->i_head_snapc = NULL;
				3381	}
				3382	} else {
				3383	BUG_ON(list_empty(&ci->i_dirty_item));
				3384	}
				3385	}
				3386	spin_unlock(&mdsc->cap_dirty_lock);
				3387
				3388	out:
				3389	spin_unlock(&ci->i_ceph_lock);
				3390
				3391	while (!list_empty(&to_remove)) {
				3392	cf = list_first_entry(&to_remove,
				3393	struct ceph_cap_flush, i_list);
				3394	list_del(&cf->i_list);
				3395	ceph_free_cap_flush(cf);
				3396	}
				3397
				3398	if (wake_ci)
				3399	wake_up_all(&ci->i_cap_wq);
				3400	if (wake_mdsc)
				3401	wake_up_all(&mdsc->cap_flushing_wq);
				3402	if (drop)
				3403	iput(inode);
				3404	}
				3405
				3406	/*
				3407	* Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
				3408	* throw away our cap_snap.
				3409	*
				3410	* Caller hold s_mutex.
				3411	*/
				3412	static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
				3413	struct ceph_mds_caps *m,
				3414	struct ceph_mds_session *session)
				3415	{
				3416	struct ceph_inode_info *ci = ceph_inode(inode);
				3417	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
				3418	u64 follows = le64_to_cpu(m->snap_follows);
				3419	struct ceph_cap_snap *capsnap;
				3420	bool flushed = false;
				3421	bool wake_ci = false;
				3422	bool wake_mdsc = false;
				3423
				3424	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
				3425	inode, ci, session->s_mds, follows);
				3426
				3427	spin_lock(&ci->i_ceph_lock);
				3428	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				3429	if (capsnap->follows == follows) {
				3430	if (capsnap->cap_flush.tid != flush_tid) {
				3431	dout(" cap_snap %p follows %lld tid %lld !="
				3432	" %lld\n", capsnap, follows,
				3433	flush_tid, capsnap->cap_flush.tid);
				3434	break;
				3435	}
				3436	flushed = true;
				3437	break;
				3438	} else {
				3439	dout(" skipping cap_snap %p follows %lld\n",
				3440	capsnap, capsnap->follows);
				3441	}
				3442	}
				3443	if (flushed) {
				3444	WARN_ON(capsnap->dirty_pages \|\| capsnap->writing);
				3445	dout(" removing %p cap_snap %p follows %lld\n",
				3446	inode, capsnap, follows);
				3447	list_del(&capsnap->ci_item);
				3448	if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
				3449	wake_ci = true;
				3450
				3451	spin_lock(&mdsc->cap_dirty_lock);
				3452
				3453	if (list_empty(&ci->i_cap_flush_list))
				3454	list_del_init(&ci->i_flushing_item);
				3455
				3456	if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
				3457	wake_mdsc = true;
				3458
				3459	spin_unlock(&mdsc->cap_dirty_lock);
				3460	}
				3461	spin_unlock(&ci->i_ceph_lock);
				3462	if (flushed) {
				3463	ceph_put_snap_context(capsnap->context);
				3464	ceph_put_cap_snap(capsnap);
				3465	if (wake_ci)
				3466	wake_up_all(&ci->i_cap_wq);
				3467	if (wake_mdsc)
				3468	wake_up_all(&mdsc->cap_flushing_wq);
				3469	iput(inode);
				3470	}
				3471	}
				3472
				3473	/*
				3474	* Handle TRUNC from MDS, indicating file truncation.
				3475	*
				3476	* caller hold s_mutex.
				3477	*/
				3478	static void handle_cap_trunc(struct inode *inode,
				3479	struct ceph_mds_caps *trunc,
				3480	struct ceph_mds_session *session)
				3481	__releases(ci->i_ceph_lock)
				3482	{
				3483	struct ceph_inode_info *ci = ceph_inode(inode);
				3484	int mds = session->s_mds;
				3485	int seq = le32_to_cpu(trunc->seq);
				3486	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
				3487	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
				3488	u64 size = le64_to_cpu(trunc->size);
				3489	int implemented = 0;
				3490	int dirty = __ceph_caps_dirty(ci);
				3491	int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
				3492	int queue_trunc = 0;
				3493
				3494	issued \|= implemented \| dirty;
				3495
				3496	dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
				3497	inode, mds, seq, truncate_size, truncate_seq);
				3498	queue_trunc = ceph_fill_file_size(inode, issued,
				3499	truncate_seq, truncate_size, size);
				3500	spin_unlock(&ci->i_ceph_lock);
				3501
				3502	if (queue_trunc)
				3503	ceph_queue_vmtruncate(inode);
				3504	}
				3505
				3506	/*
				3507	* Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
				3508	* different one. If we are the most recent migration we've seen (as
				3509	* indicated by mseq), make note of the migrating cap bits for the
				3510	* duration (until we see the corresponding IMPORT).
				3511	*
				3512	* caller holds s_mutex
				3513	*/
				3514	static void handle_cap_export(struct inode inode, struct ceph_mds_caps ex,
				3515	struct ceph_mds_cap_peer *ph,
				3516	struct ceph_mds_session *session)
				3517	{
				3518	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
				3519	struct ceph_mds_session *tsession = NULL;
				3520	struct ceph_cap cap, tcap, *new_cap = NULL;
				3521	struct ceph_inode_info *ci = ceph_inode(inode);
				3522	u64 t_cap_id;
				3523	unsigned mseq = le32_to_cpu(ex->migrate_seq);
				3524	unsigned t_seq, t_mseq;
				3525	int target, issued;
				3526	int mds = session->s_mds;
				3527
				3528	if (ph) {
				3529	t_cap_id = le64_to_cpu(ph->cap_id);
				3530	t_seq = le32_to_cpu(ph->seq);
				3531	t_mseq = le32_to_cpu(ph->mseq);
				3532	target = le32_to_cpu(ph->mds);
				3533	} else {
				3534	t_cap_id = t_seq = t_mseq = 0;
				3535	target = -1;
				3536	}
				3537
				3538	dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
				3539	inode, ci, mds, mseq, target);
				3540	retry:
				3541	spin_lock(&ci->i_ceph_lock);
				3542	cap = __get_cap_for_mds(ci, mds);
				3543	if (!cap \|\| cap->cap_id != le64_to_cpu(ex->cap_id))
				3544	goto out_unlock;
				3545
				3546	if (target < 0) {
				3547	__ceph_remove_cap(cap, false);
				3548	if (!ci->i_auth_cap)
				3549	ci->i_ceph_flags \|= CEPH_I_CAP_DROPPED;
				3550	goto out_unlock;
				3551	}
				3552
				3553	/*
				3554	* now we know we haven't received the cap import message yet
				3555	* because the exported cap still exist.
				3556	*/
				3557
				3558	issued = cap->issued;
				3559	if (issued != cap->implemented)
				3560	pr_err_ratelimited("handle_cap_export: issued != implemented: "
				3561	"ino (%llx.%llx) mds%d seq %d mseq %d "
				3562	"issued %s implemented %s\n",
				3563	ceph_vinop(inode), mds, cap->seq, cap->mseq,
				3564	ceph_cap_string(issued),
				3565	ceph_cap_string(cap->implemented));
				3566
				3567
				3568	tcap = __get_cap_for_mds(ci, target);
				3569	if (tcap) {
				3570	/* already have caps from the target */
				3571	if (tcap->cap_id == t_cap_id &&
				3572	ceph_seq_cmp(tcap->seq, t_seq) < 0) {
				3573	dout(" updating import cap %p mds%d\n", tcap, target);
				3574	tcap->cap_id = t_cap_id;
				3575	tcap->seq = t_seq - 1;
				3576	tcap->issue_seq = t_seq - 1;
				3577	tcap->issued \|= issued;
				3578	tcap->implemented \|= issued;
				3579	if (cap == ci->i_auth_cap)
				3580	ci->i_auth_cap = tcap;
				3581
				3582	if (!list_empty(&ci->i_cap_flush_list) &&
				3583	ci->i_auth_cap == tcap) {
				3584	spin_lock(&mdsc->cap_dirty_lock);
				3585	list_move_tail(&ci->i_flushing_item,
				3586	&tcap->session->s_cap_flushing);
				3587	spin_unlock(&mdsc->cap_dirty_lock);
				3588	}
				3589	}
				3590	__ceph_remove_cap(cap, false);
				3591	goto out_unlock;
				3592	} else if (tsession) {
				3593	/* add placeholder for the export tagert */
				3594	int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
				3595	tcap = new_cap;
				3596	ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
				3597	t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
				3598
				3599	if (!list_empty(&ci->i_cap_flush_list) &&
				3600	ci->i_auth_cap == tcap) {
				3601	spin_lock(&mdsc->cap_dirty_lock);
				3602	list_move_tail(&ci->i_flushing_item,
				3603	&tcap->session->s_cap_flushing);
				3604	spin_unlock(&mdsc->cap_dirty_lock);
				3605	}
				3606
				3607	__ceph_remove_cap(cap, false);
				3608	goto out_unlock;
				3609	}
				3610
				3611	spin_unlock(&ci->i_ceph_lock);
				3612	mutex_unlock(&session->s_mutex);
				3613
				3614	/* open target session */
				3615	tsession = ceph_mdsc_open_export_target_session(mdsc, target);
				3616	if (!IS_ERR(tsession)) {
				3617	if (mds > target) {
				3618	mutex_lock(&session->s_mutex);
				3619	mutex_lock_nested(&tsession->s_mutex,
				3620	SINGLE_DEPTH_NESTING);
				3621	} else {
				3622	mutex_lock(&tsession->s_mutex);
				3623	mutex_lock_nested(&session->s_mutex,
				3624	SINGLE_DEPTH_NESTING);
				3625	}
				3626	new_cap = ceph_get_cap(mdsc, NULL);
				3627	} else {
				3628	WARN_ON(1);
				3629	tsession = NULL;
				3630	target = -1;
				3631	}
				3632	goto retry;
				3633
				3634	out_unlock:
				3635	spin_unlock(&ci->i_ceph_lock);
				3636	mutex_unlock(&session->s_mutex);
				3637	if (tsession) {
				3638	mutex_unlock(&tsession->s_mutex);
				3639	ceph_put_mds_session(tsession);
				3640	}
				3641	if (new_cap)
				3642	ceph_put_cap(mdsc, new_cap);
				3643	}
				3644
				3645	/*
				3646	* Handle cap IMPORT.
				3647	*
				3648	* caller holds s_mutex. acquires i_ceph_lock
				3649	*/
				3650	static void handle_cap_import(struct ceph_mds_client *mdsc,
				3651	struct inode inode, struct ceph_mds_caps im,
				3652	struct ceph_mds_cap_peer *ph,
				3653	struct ceph_mds_session *session,
				3654	struct ceph_cap *target_cap, int old_issued)
				3655	__acquires(ci->i_ceph_lock)
				3656	{
				3657	struct ceph_inode_info *ci = ceph_inode(inode);
				3658	struct ceph_cap cap, ocap, *new_cap = NULL;
				3659	int mds = session->s_mds;
				3660	int issued;
				3661	unsigned caps = le32_to_cpu(im->caps);
				3662	unsigned wanted = le32_to_cpu(im->wanted);
				3663	unsigned seq = le32_to_cpu(im->seq);
				3664	unsigned mseq = le32_to_cpu(im->migrate_seq);
				3665	u64 realmino = le64_to_cpu(im->realm);
				3666	u64 cap_id = le64_to_cpu(im->cap_id);
				3667	u64 p_cap_id;
				3668	int peer;
				3669
				3670	if (ph) {
				3671	p_cap_id = le64_to_cpu(ph->cap_id);
				3672	peer = le32_to_cpu(ph->mds);
				3673	} else {
				3674	p_cap_id = 0;
				3675	peer = -1;
				3676	}
				3677
				3678	dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
				3679	inode, ci, mds, mseq, peer);
				3680
				3681	retry:
				3682	spin_lock(&ci->i_ceph_lock);
				3683	cap = __get_cap_for_mds(ci, mds);
				3684	if (!cap) {
				3685	if (!new_cap) {
				3686	spin_unlock(&ci->i_ceph_lock);
				3687	new_cap = ceph_get_cap(mdsc, NULL);
				3688	goto retry;
				3689	}
				3690	cap = new_cap;
				3691	} else {
				3692	if (new_cap) {
				3693	ceph_put_cap(mdsc, new_cap);
				3694	new_cap = NULL;
				3695	}
				3696	}
				3697
				3698	__ceph_caps_issued(ci, &issued);
				3699	issued \|= __ceph_caps_dirty(ci);
				3700
				3701	ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
				3702	realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
				3703
				3704	ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
				3705	if (ocap && ocap->cap_id == p_cap_id) {
				3706	dout(" remove export cap %p mds%d flags %d\n",
				3707	ocap, peer, ph->flags);
				3708	if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
				3709	(ocap->seq != le32_to_cpu(ph->seq) \|\|
				3710	ocap->mseq != le32_to_cpu(ph->mseq))) {
				3711	pr_err_ratelimited("handle_cap_import: "
				3712	"mismatched seq/mseq: ino (%llx.%llx) "
				3713	"mds%d seq %d mseq %d importer mds%d "
				3714	"has peer seq %d mseq %d\n",
				3715	ceph_vinop(inode), peer, ocap->seq,
				3716	ocap->mseq, mds, le32_to_cpu(ph->seq),
				3717	le32_to_cpu(ph->mseq));
				3718	}
				3719	__ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
				3720	}
				3721
				3722	/* make sure we re-request max_size, if necessary */
				3723	ci->i_requested_max_size = 0;
				3724
				3725	*old_issued = issued;
				3726	*target_cap = cap;
				3727	}
				3728
				3729	/*
				3730	* Handle a caps message from the MDS.
				3731	*
				3732	* Identify the appropriate session, inode, and call the right handler
				3733	* based on the cap op.
				3734	*/
				3735	void ceph_handle_caps(struct ceph_mds_session *session,
				3736	struct ceph_msg *msg)
				3737	{
				3738	struct ceph_mds_client *mdsc = session->s_mdsc;
				3739	struct inode *inode;
				3740	struct ceph_inode_info *ci;
				3741	struct ceph_cap *cap;
				3742	struct ceph_mds_caps *h;
				3743	struct ceph_mds_cap_peer *peer = NULL;
				3744	struct ceph_snap_realm *realm = NULL;
				3745	int op;
				3746	int msg_version = le16_to_cpu(msg->hdr.version);
				3747	u32 seq, mseq;
				3748	struct ceph_vino vino;
				3749	void *snaptrace;
				3750	size_t snaptrace_len;
				3751	void p, end;
				3752	struct cap_extra_info extra_info = {};
				3753
				3754	dout("handle_caps from mds%d\n", session->s_mds);
				3755
				3756	/* decode */
				3757	end = msg->front.iov_base + msg->front.iov_len;
				3758	if (msg->front.iov_len < sizeof(*h))
				3759	goto bad;
				3760	h = msg->front.iov_base;
				3761	op = le32_to_cpu(h->op);
				3762	vino.ino = le64_to_cpu(h->ino);
				3763	vino.snap = CEPH_NOSNAP;
				3764	seq = le32_to_cpu(h->seq);
				3765	mseq = le32_to_cpu(h->migrate_seq);
				3766
				3767	snaptrace = h + 1;
				3768	snaptrace_len = le32_to_cpu(h->snap_trace_len);
				3769	p = snaptrace + snaptrace_len;
				3770
				3771	if (msg_version >= 2) {
				3772	u32 flock_len;
				3773	ceph_decode_32_safe(&p, end, flock_len, bad);
				3774	if (p + flock_len > end)
				3775	goto bad;
				3776	p += flock_len;
				3777	}
				3778
				3779	if (msg_version >= 3) {
				3780	if (op == CEPH_CAP_OP_IMPORT) {
				3781	if (p + sizeof(*peer) > end)
				3782	goto bad;
				3783	peer = p;
				3784	p += sizeof(*peer);
				3785	} else if (op == CEPH_CAP_OP_EXPORT) {
				3786	/* recorded in unused fields */
				3787	peer = (void *)&h->size;
				3788	}
				3789	}
				3790
				3791	if (msg_version >= 4) {
				3792	ceph_decode_64_safe(&p, end, extra_info.inline_version, bad);
				3793	ceph_decode_32_safe(&p, end, extra_info.inline_len, bad);
				3794	if (p + extra_info.inline_len > end)
				3795	goto bad;
				3796	extra_info.inline_data = p;
				3797	p += extra_info.inline_len;
				3798	}
				3799
				3800	if (msg_version >= 5) {
				3801	struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
				3802	u32 epoch_barrier;
				3803
				3804	ceph_decode_32_safe(&p, end, epoch_barrier, bad);
				3805	ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
				3806	}
				3807
				3808	if (msg_version >= 8) {
				3809	u64 flush_tid;
				3810	u32 caller_uid, caller_gid;
				3811	u32 pool_ns_len;
				3812
				3813	/* version >= 6 */
				3814	ceph_decode_64_safe(&p, end, flush_tid, bad);
				3815	/* version >= 7 */
				3816	ceph_decode_32_safe(&p, end, caller_uid, bad);
				3817	ceph_decode_32_safe(&p, end, caller_gid, bad);
				3818	/* version >= 8 */
				3819	ceph_decode_32_safe(&p, end, pool_ns_len, bad);
				3820	if (pool_ns_len > 0) {
				3821	ceph_decode_need(&p, end, pool_ns_len, bad);
				3822	extra_info.pool_ns =
				3823	ceph_find_or_create_string(p, pool_ns_len);
				3824	p += pool_ns_len;
				3825	}
				3826	}
				3827
				3828	if (msg_version >= 11) {
				3829	struct ceph_timespec *btime;
				3830	u64 change_attr;
				3831	u32 flags;
				3832
				3833	/* version >= 9 */
				3834	if (p + sizeof(*btime) > end)
				3835	goto bad;
				3836	btime = p;
				3837	p += sizeof(*btime);
				3838	ceph_decode_64_safe(&p, end, change_attr, bad);
				3839	/* version >= 10 */
				3840	ceph_decode_32_safe(&p, end, flags, bad);
				3841	/* version >= 11 */
				3842	extra_info.dirstat_valid = true;
				3843	ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
				3844	ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
				3845	}
				3846
				3847	/* lookup ino */
				3848	inode = ceph_find_inode(mdsc->fsc->sb, vino);
				3849	ci = ceph_inode(inode);
				3850	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
				3851	vino.snap, inode);
				3852
				3853	mutex_lock(&session->s_mutex);
				3854	session->s_seq++;
				3855	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
				3856	(unsigned)seq);
				3857
				3858	if (!inode) {
				3859	dout(" i don't have ino %llx\n", vino.ino);
				3860
				3861	if (op == CEPH_CAP_OP_IMPORT) {
				3862	cap = ceph_get_cap(mdsc, NULL);
				3863	cap->cap_ino = vino.ino;
				3864	cap->queue_release = 1;
				3865	cap->cap_id = le64_to_cpu(h->cap_id);
				3866	cap->mseq = mseq;
				3867	cap->seq = seq;
				3868	cap->issue_seq = seq;
				3869	spin_lock(&session->s_cap_lock);
				3870	list_add_tail(&cap->session_caps,
				3871	&session->s_cap_releases);
				3872	session->s_num_cap_releases++;
				3873	spin_unlock(&session->s_cap_lock);
				3874	}
				3875	goto flush_cap_releases;
				3876	}
				3877
				3878	/* these will work even if we don't have a cap yet */
				3879	switch (op) {
				3880	case CEPH_CAP_OP_FLUSHSNAP_ACK:
				3881	handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid),
				3882	h, session);
				3883	goto done;
				3884
				3885	case CEPH_CAP_OP_EXPORT:
				3886	handle_cap_export(inode, h, peer, session);
				3887	goto done_unlocked;
				3888
				3889	case CEPH_CAP_OP_IMPORT:
				3890	realm = NULL;
				3891	if (snaptrace_len) {
				3892	down_write(&mdsc->snap_rwsem);
				3893	ceph_update_snap_trace(mdsc, snaptrace,
				3894	snaptrace + snaptrace_len,
				3895	false, &realm);
				3896	downgrade_write(&mdsc->snap_rwsem);
				3897	} else {
				3898	down_read(&mdsc->snap_rwsem);
				3899	}
				3900	handle_cap_import(mdsc, inode, h, peer, session,
				3901	&cap, &extra_info.issued);
				3902	handle_cap_grant(inode, session, cap,
				3903	h, msg->middle, &extra_info);
				3904	if (realm)
				3905	ceph_put_snap_realm(mdsc, realm);
				3906	goto done_unlocked;
				3907	}
				3908
				3909	/* the rest require a cap */
				3910	spin_lock(&ci->i_ceph_lock);
				3911	cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
				3912	if (!cap) {
				3913	dout(" no cap on %p ino %llx.%llx from mds%d\n",
				3914	inode, ceph_ino(inode), ceph_snap(inode),
				3915	session->s_mds);
				3916	spin_unlock(&ci->i_ceph_lock);
				3917	goto flush_cap_releases;
				3918	}
				3919
				3920	/* note that each of these drops i_ceph_lock for us */
				3921	switch (op) {
				3922	case CEPH_CAP_OP_REVOKE:
				3923	case CEPH_CAP_OP_GRANT:
				3924	__ceph_caps_issued(ci, &extra_info.issued);
				3925	extra_info.issued \|= __ceph_caps_dirty(ci);
				3926	handle_cap_grant(inode, session, cap,
				3927	h, msg->middle, &extra_info);
				3928	goto done_unlocked;
				3929
				3930	case CEPH_CAP_OP_FLUSH_ACK:
				3931	handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid),
				3932	h, session, cap);
				3933	break;
				3934
				3935	case CEPH_CAP_OP_TRUNC:
				3936	handle_cap_trunc(inode, h, session);
				3937	break;
				3938
				3939	default:
				3940	spin_unlock(&ci->i_ceph_lock);
				3941	pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
				3942	ceph_cap_op_name(op));
				3943	}
				3944
				3945	goto done;
				3946
				3947	flush_cap_releases:
				3948	/*
				3949	* send any cap release message to try to move things
				3950	* along for the mds (who clearly thinks we still have this
				3951	* cap).
				3952	*/
				3953	ceph_send_cap_releases(mdsc, session);
				3954
				3955	done:
				3956	mutex_unlock(&session->s_mutex);
				3957	done_unlocked:
				3958	iput(inode);
				3959	ceph_put_string(extra_info.pool_ns);
				3960	return;
				3961
				3962	bad:
				3963	pr_err("ceph_handle_caps: corrupt message\n");
				3964	ceph_msg_dump(msg);
				3965	return;
				3966	}
				3967
				3968	/*
				3969	* Delayed work handler to process end of delayed cap release LRU list.
				3970	*/
				3971	void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
				3972	{
				3973	struct inode *inode;
				3974	struct ceph_inode_info *ci;
				3975	int flags = CHECK_CAPS_NODELAY;
				3976
				3977	dout("check_delayed_caps\n");
				3978	while (1) {
				3979	spin_lock(&mdsc->cap_delay_lock);
				3980	if (list_empty(&mdsc->cap_delay_list))
				3981	break;
				3982	ci = list_first_entry(&mdsc->cap_delay_list,
				3983	struct ceph_inode_info,
				3984	i_cap_delay_list);
				3985	if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
				3986	time_before(jiffies, ci->i_hold_caps_max))
				3987	break;
				3988	list_del_init(&ci->i_cap_delay_list);
				3989
				3990	inode = igrab(&ci->vfs_inode);
				3991	spin_unlock(&mdsc->cap_delay_lock);
				3992
				3993	if (inode) {
				3994	dout("check_delayed_caps on %p\n", inode);
				3995	ceph_check_caps(ci, flags, NULL);
				3996	iput(inode);
				3997	}
				3998	}
				3999	spin_unlock(&mdsc->cap_delay_lock);
				4000	}
				4001
				4002	/*
				4003	* Flush all dirty caps to the mds
				4004	*/
				4005	void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
				4006	{
				4007	struct ceph_inode_info *ci;
				4008	struct inode *inode;
				4009
				4010	dout("flush_dirty_caps\n");
				4011	spin_lock(&mdsc->cap_dirty_lock);
				4012	while (!list_empty(&mdsc->cap_dirty)) {
				4013	ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
				4014	i_dirty_item);
				4015	inode = &ci->vfs_inode;
				4016	ihold(inode);
				4017	dout("flush_dirty_caps %p\n", inode);
				4018	spin_unlock(&mdsc->cap_dirty_lock);
				4019	ceph_check_caps(ci, CHECK_CAPS_NODELAY\|CHECK_CAPS_FLUSH, NULL);
				4020	iput(inode);
				4021	spin_lock(&mdsc->cap_dirty_lock);
				4022	}
				4023	spin_unlock(&mdsc->cap_dirty_lock);
				4024	dout("flush_dirty_caps done\n");
				4025	}
				4026
				4027	void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
				4028	{
				4029	int i;
				4030	int bits = (fmode << 1) \| 1;
				4031	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
				4032	if (bits & (1 << i))
				4033	ci->i_nr_by_mode[i]++;
				4034	}
				4035	}
				4036
				4037	/*
				4038	* Drop open file reference. If we were the last open file,
				4039	* we may need to release capabilities to the MDS (or schedule
				4040	* their delayed release).
				4041	*/
				4042	void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
				4043	{
				4044	int i, last = 0;
				4045	int bits = (fmode << 1) \| 1;
				4046	spin_lock(&ci->i_ceph_lock);
				4047	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
				4048	if (bits & (1 << i)) {
				4049	BUG_ON(ci->i_nr_by_mode[i] == 0);
				4050	if (--ci->i_nr_by_mode[i] == 0)
				4051	last++;
				4052	}
				4053	}
				4054	dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
				4055	&ci->vfs_inode, fmode,
				4056	ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
				4057	ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
				4058	spin_unlock(&ci->i_ceph_lock);
				4059
				4060	if (last && ci->i_vino.snap == CEPH_NOSNAP)
				4061	ceph_check_caps(ci, 0, NULL);
				4062	}
				4063
				4064	/*
				4065	* For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
				4066	* looks like the link count will hit 0, drop any other caps (other
				4067	* than PIN) we don't specifically want (due to the file still being
				4068	* open).
				4069	*/
				4070	int ceph_drop_caps_for_unlink(struct inode *inode)
				4071	{
				4072	struct ceph_inode_info *ci = ceph_inode(inode);
				4073	int drop = CEPH_CAP_LINK_SHARED \| CEPH_CAP_LINK_EXCL;
				4074
				4075	spin_lock(&ci->i_ceph_lock);
				4076	if (inode->i_nlink == 1) {
				4077	drop \|= ~(__ceph_caps_wanted(ci) \| CEPH_CAP_PIN);
				4078
				4079	ci->i_ceph_flags \|= CEPH_I_NODELAY;
				4080	if (__ceph_caps_dirty(ci)) {
				4081	struct ceph_mds_client *mdsc =
				4082	ceph_inode_to_client(inode)->mdsc;
				4083	__cap_delay_requeue_front(mdsc, ci);
				4084	}
				4085	}
				4086	spin_unlock(&ci->i_ceph_lock);
				4087	return drop;
				4088	}
				4089
				4090	/*
				4091	* Helpers for embedding cap and dentry lease releases into mds
				4092	* requests.
				4093	*
				4094	* @force is used by dentry_release (below) to force inclusion of a
				4095	* record for the directory inode, even when there aren't any caps to
				4096	* drop.
				4097	*/
				4098	int ceph_encode_inode_release(void *p, struct inode inode,
				4099	int mds, int drop, int unless, int force)
				4100	{
				4101	struct ceph_inode_info *ci = ceph_inode(inode);
				4102	struct ceph_cap *cap;
				4103	struct ceph_mds_request_release rel = p;
				4104	int used, dirty;
				4105	int ret = 0;
				4106
				4107	spin_lock(&ci->i_ceph_lock);
				4108	used = __ceph_caps_used(ci);
				4109	dirty = __ceph_caps_dirty(ci);
				4110
				4111	dout("encode_inode_release %p mds%d used\|dirty %s drop %s unless %s\n",
				4112	inode, mds, ceph_cap_string(used\|dirty), ceph_cap_string(drop),
				4113	ceph_cap_string(unless));
				4114
				4115	/* only drop unused, clean caps */
				4116	drop &= ~(used \| dirty);
				4117
				4118	cap = __get_cap_for_mds(ci, mds);
				4119	if (cap && __cap_is_valid(cap)) {
				4120	unless &= cap->issued;
				4121	if (unless) {
				4122	if (unless & CEPH_CAP_AUTH_EXCL)
				4123	drop &= ~CEPH_CAP_AUTH_SHARED;
				4124	if (unless & CEPH_CAP_LINK_EXCL)
				4125	drop &= ~CEPH_CAP_LINK_SHARED;
				4126	if (unless & CEPH_CAP_XATTR_EXCL)
				4127	drop &= ~CEPH_CAP_XATTR_SHARED;
				4128	if (unless & CEPH_CAP_FILE_EXCL)
				4129	drop &= ~CEPH_CAP_FILE_SHARED;
				4130	}
				4131
				4132	if (force \|\| (cap->issued & drop)) {
				4133	if (cap->issued & drop) {
				4134	int wanted = __ceph_caps_wanted(ci);
				4135	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
				4136	wanted \|= cap->mds_wanted;
				4137	dout("encode_inode_release %p cap %p "
				4138	"%s -> %s, wanted %s -> %s\n", inode, cap,
				4139	ceph_cap_string(cap->issued),
				4140	ceph_cap_string(cap->issued & ~drop),
				4141	ceph_cap_string(cap->mds_wanted),
				4142	ceph_cap_string(wanted));
				4143
				4144	cap->issued &= ~drop;
				4145	cap->implemented &= ~drop;
				4146	cap->mds_wanted = wanted;
				4147	} else {
				4148	dout("encode_inode_release %p cap %p %s"
				4149	" (force)\n", inode, cap,
				4150	ceph_cap_string(cap->issued));
				4151	}
				4152
				4153	rel->ino = cpu_to_le64(ceph_ino(inode));
				4154	rel->cap_id = cpu_to_le64(cap->cap_id);
				4155	rel->seq = cpu_to_le32(cap->seq);
				4156	rel->issue_seq = cpu_to_le32(cap->issue_seq);
				4157	rel->mseq = cpu_to_le32(cap->mseq);
				4158	rel->caps = cpu_to_le32(cap->implemented);
				4159	rel->wanted = cpu_to_le32(cap->mds_wanted);
				4160	rel->dname_len = 0;
				4161	rel->dname_seq = 0;
				4162	p += sizeof(rel);
				4163	ret = 1;
				4164	} else {
				4165	dout("encode_inode_release %p cap %p %s (noop)\n",
				4166	inode, cap, ceph_cap_string(cap->issued));
				4167	}
				4168	}
				4169	spin_unlock(&ci->i_ceph_lock);
				4170	return ret;
				4171	}
				4172
				4173	int ceph_encode_dentry_release(void *p, struct dentry dentry,
				4174	struct inode *dir,
				4175	int mds, int drop, int unless)
				4176	{
				4177	struct dentry *parent = NULL;
				4178	struct ceph_mds_request_release rel = p;
				4179	struct ceph_dentry_info *di = ceph_dentry(dentry);
				4180	int force = 0;
				4181	int ret;
				4182
				4183	/*
				4184	* force an record for the directory caps if we have a dentry lease.
				4185	* this is racy (can't take i_ceph_lock and d_lock together), but it
				4186	* doesn't have to be perfect; the mds will revoke anything we don't
				4187	* release.
				4188	*/
				4189	spin_lock(&dentry->d_lock);
				4190	if (di->lease_session && di->lease_session->s_mds == mds)
				4191	force = 1;
				4192	if (!dir) {
				4193	parent = dget(dentry->d_parent);
				4194	dir = d_inode(parent);
				4195	}
				4196	spin_unlock(&dentry->d_lock);
				4197
				4198	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
				4199	dput(parent);
				4200
				4201	spin_lock(&dentry->d_lock);
				4202	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
				4203	dout("encode_dentry_release %p mds%d seq %d\n",
				4204	dentry, mds, (int)di->lease_seq);
				4205	rel->dname_len = cpu_to_le32(dentry->d_name.len);
				4206	memcpy(*p, dentry->d_name.name, dentry->d_name.len);
				4207	*p += dentry->d_name.len;
				4208	rel->dname_seq = cpu_to_le32(di->lease_seq);
				4209	__ceph_mdsc_drop_dentry_lease(dentry);
				4210	}
				4211	spin_unlock(&dentry->d_lock);
				4212	return ret;
				4213	}