Blame - src/kernel/linux/v4.14/fs/ceph/mds_client.c - T103

blob: 06109314d93c8fff854c2753c080f5a6b87f975a [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	#include <linux/ceph/ceph_debug.h>
				3
				4	#include <linux/fs.h>
				5	#include <linux/wait.h>
				6	#include <linux/slab.h>
				7	#include <linux/gfp.h>
				8	#include <linux/sched.h>
				9	#include <linux/debugfs.h>
				10	#include <linux/seq_file.h>
				11	#include <linux/ratelimit.h>
				12
				13	#include "super.h"
				14	#include "mds_client.h"
				15
				16	#include <linux/ceph/ceph_features.h>
				17	#include <linux/ceph/messenger.h>
				18	#include <linux/ceph/decode.h>
				19	#include <linux/ceph/pagelist.h>
				20	#include <linux/ceph/auth.h>
				21	#include <linux/ceph/debugfs.h>
				22
				23	/*
				24	* A cluster of MDS (metadata server) daemons is responsible for
				25	* managing the file system namespace (the directory hierarchy and
				26	* inodes) and for coordinating shared access to storage. Metadata is
				27	* partitioned hierarchically across a number of servers, and that
				28	* partition varies over time as the cluster adjusts the distribution
				29	* in order to balance load.
				30	*
				31	* The MDS client is primarily responsible for managing synchronous
				32	* metadata requests for operations like open, unlink, and so forth.
				33	* If there is a MDS failure, we find out about it when we (possibly
				34	* request and) receive a new MDS map, and can resubmit affected
				35	* requests.
				36	*
				37	* For the most part, though, we take advantage of a lossless
				38	* communications channel to the MDS, and do not need to worry about
				39	* timing out or resubmitting requests.
				40	*
				41	* We maintain a stateful "session" with each MDS we interact with.
				42	* Within each session, we send periodic heartbeat messages to ensure
				43	* any capabilities or leases we have been issued remain valid. If
				44	* the session times out and goes stale, our leases and capabilities
				45	* are no longer valid.
				46	*/
				47
				/*
				 * Bundle of a cap counter, a pagelist pointer and a message version.
				 *
				 * NOTE(review): nothing in this part of the file uses the struct; going
				 * by the name it is presumably the state carried while a session
				 * reconnect message is being built -- confirm against the users further
				 * down the file.
				 */
				48	struct ceph_reconnect_state {
				49	int nr_caps;
				50	struct ceph_pagelist *pagelist;
				51	unsigned msg_version;
				52	};
				53
				/* forward declaration; the definition is not in this part of the file */
				54	static void __wake_requests(struct ceph_mds_client *mdsc,
				55	struct list_head *head);
				56
				/*
				 * Connection callbacks handed to ceph_con_init() in register_session();
				 * declared here, the initializer is not in this part of the file.
				 */
				57	static const struct ceph_connection_operations mds_con_ops;
				58
				59
				60	/*
				61	* mds reply parsing
				62	*/
				63
				64	/*
				65	* parse individual inode info
				66	*/
				67	static int parse_reply_info_in(void *p, void end,
				68	struct ceph_mds_reply_info_in *info,
				69	u64 features)
				70	{
				71	int err = -EIO;
				72
				73	info->in = *p;
				74	*p += sizeof(struct ceph_mds_reply_inode) +
				75	sizeof(info->in->fragtree.splits)
				76	le32_to_cpu(info->in->fragtree.nsplits);
				77
				78	ceph_decode_32_safe(p, end, info->symlink_len, bad);
				79	ceph_decode_need(p, end, info->symlink_len, bad);
				80	info->symlink = *p;
				81	*p += info->symlink_len;
				82
				83	if (features & CEPH_FEATURE_DIRLAYOUTHASH)
				84	ceph_decode_copy_safe(p, end, &info->dir_layout,
				85	sizeof(info->dir_layout), bad);
				86	else
				87	memset(&info->dir_layout, 0, sizeof(info->dir_layout));
				88
				89	ceph_decode_32_safe(p, end, info->xattr_len, bad);
				90	ceph_decode_need(p, end, info->xattr_len, bad);
				91	info->xattr_data = *p;
				92	*p += info->xattr_len;
				93
				94	if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
				95	ceph_decode_64_safe(p, end, info->inline_version, bad);
				96	ceph_decode_32_safe(p, end, info->inline_len, bad);
				97	ceph_decode_need(p, end, info->inline_len, bad);
				98	info->inline_data = *p;
				99	*p += info->inline_len;
				100	} else
				101	info->inline_version = CEPH_INLINE_NONE;
				102
				103	info->pool_ns_len = 0;
				104	info->pool_ns_data = NULL;
				105	if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
				106	ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
				107	if (info->pool_ns_len > 0) {
				108	ceph_decode_need(p, end, info->pool_ns_len, bad);
				109	info->pool_ns_data = *p;
				110	*p += info->pool_ns_len;
				111	}
				112	}
				113
				114	return 0;
				115	bad:
				116	return err;
				117	}
				118
				119	/*
				120	* parse a normal reply, which may contain a (dir+)dentry and/or a
				121	* target inode.
				122	*/
				123	static int parse_reply_info_trace(void *p, void end,
				124	struct ceph_mds_reply_info_parsed *info,
				125	u64 features)
				126	{
				127	int err;
				128
				129	if (info->head->is_dentry) {
				130	err = parse_reply_info_in(p, end, &info->diri, features);
				131	if (err < 0)
				132	goto out_bad;
				133
				134	if (unlikely(p + sizeof(info->dirfrag) > end))
				135	goto bad;
				136	info->dirfrag = *p;
				137	p += sizeof(info->dirfrag) +
				138	sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
				139	if (unlikely(*p > end))
				140	goto bad;
				141
				142	ceph_decode_32_safe(p, end, info->dname_len, bad);
				143	ceph_decode_need(p, end, info->dname_len, bad);
				144	info->dname = *p;
				145	*p += info->dname_len;
				146	info->dlease = *p;
				147	p += sizeof(info->dlease);
				148	}
				149
				150	if (info->head->is_target) {
				151	err = parse_reply_info_in(p, end, &info->targeti, features);
				152	if (err < 0)
				153	goto out_bad;
				154	}
				155
				156	if (unlikely(*p != end))
				157	goto bad;
				158	return 0;
				159
				160	bad:
				161	err = -EIO;
				162	out_bad:
				163	pr_err("problem parsing mds trace %d\n", err);
				164	return err;
				165	}
				166
				167	/*
				168	* parse readdir results
				169	*/
				170	static int parse_reply_info_dir(void *p, void end,
				171	struct ceph_mds_reply_info_parsed *info,
				172	u64 features)
				173	{
				174	u32 num, i = 0;
				175	int err;
				176
				177	info->dir_dir = *p;
				178	if (p + sizeof(info->dir_dir) > end)
				179	goto bad;
				180	p += sizeof(info->dir_dir) +
				181	sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
				182	if (*p > end)
				183	goto bad;
				184
				185	ceph_decode_need(p, end, sizeof(num) + 2, bad);
				186	num = ceph_decode_32(p);
				187	{
				188	u16 flags = ceph_decode_16(p);
				189	info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
				190	info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
				191	info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
				192	info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
				193	}
				194	if (num == 0)
				195	goto done;
				196
				197	BUG_ON(!info->dir_entries);
				198	if ((unsigned long)(info->dir_entries + num) >
				199	(unsigned long)info->dir_entries + info->dir_buf_size) {
				200	pr_err("dir contents are larger than expected\n");
				201	WARN_ON(1);
				202	goto bad;
				203	}
				204
				205	info->dir_nr = num;
				206	while (num) {
				207	struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
				208	/* dentry */
				209	ceph_decode_need(p, end, sizeof(u32)*2, bad);
				210	rde->name_len = ceph_decode_32(p);
				211	ceph_decode_need(p, end, rde->name_len, bad);
				212	rde->name = *p;
				213	*p += rde->name_len;
				214	dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
				215	rde->lease = *p;
				216	*p += sizeof(struct ceph_mds_reply_lease);
				217
				218	/* inode */
				219	err = parse_reply_info_in(p, end, &rde->inode, features);
				220	if (err < 0)
				221	goto out_bad;
				222	/* ceph_readdir_prepopulate() will update it */
				223	rde->offset = 0;
				224	i++;
				225	num--;
				226	}
				227
				228	done:
				229	if (*p != end)
				230	goto bad;
				231	return 0;
				232
				233	bad:
				234	err = -EIO;
				235	out_bad:
				236	pr_err("problem parsing dir contents %d\n", err);
				237	return err;
				238	}
				239
				240	/*
				241	* parse fcntl F_GETLK results
				242	*/
				243	static int parse_reply_info_filelock(void *p, void end,
				244	struct ceph_mds_reply_info_parsed *info,
				245	u64 features)
				246	{
				247	if (p + sizeof(info->filelock_reply) > end)
				248	goto bad;
				249
				250	info->filelock_reply = *p;
				251	p += sizeof(info->filelock_reply);
				252
				253	if (unlikely(*p != end))
				254	goto bad;
				255	return 0;
				256
				257	bad:
				258	return -EIO;
				259	}
				260
				261	/*
				262	* parse create results
				263	*/
				264	static int parse_reply_info_create(void *p, void end,
				265	struct ceph_mds_reply_info_parsed *info,
				266	u64 features)
				267	{
				268	if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
				269	if (*p == end) {
				270	info->has_create_ino = false;
				271	} else {
				272	info->has_create_ino = true;
				273	info->ino = ceph_decode_64(p);
				274	}
				275	}
				276
				277	if (unlikely(*p != end))
				278	goto bad;
				279	return 0;
				280
				281	bad:
				282	return -EIO;
				283	}
				284
				285	/*
				286	* parse extra results
				287	*/
				288	static int parse_reply_info_extra(void *p, void end,
				289	struct ceph_mds_reply_info_parsed *info,
				290	u64 features)
				291	{
				292	u32 op = le32_to_cpu(info->head->op);
				293
				294	if (op == CEPH_MDS_OP_GETFILELOCK)
				295	return parse_reply_info_filelock(p, end, info, features);
				296	else if (op == CEPH_MDS_OP_READDIR \|\| op == CEPH_MDS_OP_LSSNAP)
				297	return parse_reply_info_dir(p, end, info, features);
				298	else if (op == CEPH_MDS_OP_CREATE)
				299	return parse_reply_info_create(p, end, info, features);
				300	else
				301	return -EIO;
				302	}
				303
				304	/*
				305	* parse entire mds reply
				306	*/
				307	static int parse_reply_info(struct ceph_msg *msg,
				308	struct ceph_mds_reply_info_parsed *info,
				309	u64 features)
				310	{
				311	void p, end;
				312	u32 len;
				313	int err;
				314
				315	info->head = msg->front.iov_base;
				316	p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
				317	end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
				318
				319	/* trace */
				320	ceph_decode_32_safe(&p, end, len, bad);
				321	if (len > 0) {
				322	ceph_decode_need(&p, end, len, bad);
				323	err = parse_reply_info_trace(&p, p+len, info, features);
				324	if (err < 0)
				325	goto out_bad;
				326	}
				327
				328	/* extra */
				329	ceph_decode_32_safe(&p, end, len, bad);
				330	if (len > 0) {
				331	ceph_decode_need(&p, end, len, bad);
				332	err = parse_reply_info_extra(&p, p+len, info, features);
				333	if (err < 0)
				334	goto out_bad;
				335	}
				336
				337	/* snap blob */
				338	ceph_decode_32_safe(&p, end, len, bad);
				339	info->snapblob_len = len;
				340	info->snapblob = p;
				341	p += len;
				342
				343	if (p != end)
				344	goto bad;
				345	return 0;
				346
				347	bad:
				348	err = -EIO;
				349	out_bad:
				350	pr_err("mds parse_reply err %d\n", err);
				351	return err;
				352	}
				353
				354	static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
				355	{
				356	if (!info->dir_entries)
				357	return;
				358	free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
				359	}
				360
				361
				362	/*
				363	* sessions
				364	*/
				365	const char *ceph_session_state_name(int s)
				366	{
				367	switch (s) {
				368	case CEPH_MDS_SESSION_NEW: return "new";
				369	case CEPH_MDS_SESSION_OPENING: return "opening";
				370	case CEPH_MDS_SESSION_OPEN: return "open";
				371	case CEPH_MDS_SESSION_HUNG: return "hung";
				372	case CEPH_MDS_SESSION_CLOSING: return "closing";
				373	case CEPH_MDS_SESSION_RESTARTING: return "restarting";
				374	case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
				375	case CEPH_MDS_SESSION_REJECTED: return "rejected";
				376	default: return "???";
				377	}
				378	}
				379
				380	static struct ceph_mds_session get_session(struct ceph_mds_session s)
				381	{
				382	if (refcount_inc_not_zero(&s->s_ref)) {
				383	dout("mdsc get_session %p %d -> %d\n", s,
				384	refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
				385	return s;
				386	} else {
				387	dout("mdsc get_session %p 0 -- FAIL", s);
				388	return NULL;
				389	}
				390	}
				391
				392	void ceph_put_mds_session(struct ceph_mds_session *s)
				393	{
				394	dout("mdsc put_session %p %d -> %d\n", s,
				395	refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
				396	if (refcount_dec_and_test(&s->s_ref)) {
				397	if (s->s_auth.authorizer)
				398	ceph_auth_destroy_authorizer(s->s_auth.authorizer);
				399	kfree(s);
				400	}
				401	}
				402
				403	/*
				404	* called under mdsc->mutex
				405	*/
				406	struct ceph_mds_session __ceph_lookup_mds_session(struct ceph_mds_client mdsc,
				407	int mds)
				408	{
				409	struct ceph_mds_session *session;
				410
				411	if (mds >= mdsc->max_sessions \|\| !mdsc->sessions[mds])
				412	return NULL;
				413	session = mdsc->sessions[mds];
				414	dout("lookup_mds_session %p %d\n", session,
				415	refcount_read(&session->s_ref));
				416	get_session(session);
				417	return session;
				418	}
				419
				420	static bool __have_session(struct ceph_mds_client *mdsc, int mds)
				421	{
				422	if (mds >= mdsc->max_sessions)
				423	return false;
				424	return mdsc->sessions[mds];
				425	}
				426
				427	static int __verify_registered_session(struct ceph_mds_client *mdsc,
				428	struct ceph_mds_session *s)
				429	{
				430	if (s->s_mds >= mdsc->max_sessions \|\|
				431	mdsc->sessions[s->s_mds] != s)
				432	return -ENOENT;
				433	return 0;
				434	}
				435
				436	/*
				437	* create+register a new session for given mds.
				438	* called under mdsc->mutex.
				439	*/
				440	static struct ceph_mds_session register_session(struct ceph_mds_client mdsc,
				441	int mds)
				442	{
				443	struct ceph_mds_session *s;
				444
				445	if (mds >= mdsc->mdsmap->m_num_mds)
				446	return ERR_PTR(-EINVAL);
				447
				448	s = kzalloc(sizeof(*s), GFP_NOFS);
				449	if (!s)
				450	return ERR_PTR(-ENOMEM);
				451	s->s_mdsc = mdsc;
				452	s->s_mds = mds;
				453	s->s_state = CEPH_MDS_SESSION_NEW;
				454	s->s_ttl = 0;
				455	s->s_seq = 0;
				456	mutex_init(&s->s_mutex);
				457
				458	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
				459
				460	spin_lock_init(&s->s_gen_ttl_lock);
				461	s->s_cap_gen = 0;
				462	s->s_cap_ttl = jiffies - 1;
				463
				464	spin_lock_init(&s->s_cap_lock);
				465	s->s_renew_requested = 0;
				466	s->s_renew_seq = 0;
				467	INIT_LIST_HEAD(&s->s_caps);
				468	s->s_nr_caps = 0;
				469	s->s_trim_caps = 0;
				470	refcount_set(&s->s_ref, 1);
				471	INIT_LIST_HEAD(&s->s_waiting);
				472	INIT_LIST_HEAD(&s->s_unsafe);
				473	s->s_num_cap_releases = 0;
				474	s->s_cap_reconnect = 0;
				475	s->s_cap_iterator = NULL;
				476	INIT_LIST_HEAD(&s->s_cap_releases);
				477	INIT_LIST_HEAD(&s->s_cap_flushing);
				478
				479	dout("register_session mds%d\n", mds);
				480	if (mds >= mdsc->max_sessions) {
				481	int newmax = 1 << get_count_order(mds+1);
				482	struct ceph_mds_session **sa;
				483
				484	dout("register_session realloc to %d\n", newmax);
				485	sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
				486	if (!sa)
				487	goto fail_realloc;
				488	if (mdsc->sessions) {
				489	memcpy(sa, mdsc->sessions,
				490	mdsc->max_sessions * sizeof(void *));
				491	kfree(mdsc->sessions);
				492	}
				493	mdsc->sessions = sa;
				494	mdsc->max_sessions = newmax;
				495	}
				496	mdsc->sessions[mds] = s;
				497	atomic_inc(&mdsc->num_sessions);
				498	refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */
				499
				500	ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
				501	ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
				502
				503	return s;
				504
				505	fail_realloc:
				506	kfree(s);
				507	return ERR_PTR(-ENOMEM);
				508	}
				509
				510	/*
				511	* called under mdsc->mutex
				512	*/
				513	static void __unregister_session(struct ceph_mds_client *mdsc,
				514	struct ceph_mds_session *s)
				515	{
				516	dout("__unregister_session mds%d %p\n", s->s_mds, s);
				517	BUG_ON(mdsc->sessions[s->s_mds] != s);
				518	mdsc->sessions[s->s_mds] = NULL;
				519	ceph_con_close(&s->s_con);
				520	ceph_put_mds_session(s);
				521	atomic_dec(&mdsc->num_sessions);
				522	}
				523
				524	/*
				525	* drop session refs in request.
				526	*
				527	* should be last request ref, or hold mdsc->mutex
				528	*/
				529	static void put_request_session(struct ceph_mds_request *req)
				530	{
				531	if (req->r_session) {
				532	ceph_put_mds_session(req->r_session);
				533	req->r_session = NULL;
				534	}
				535	}
				536
				537	void ceph_mdsc_release_request(struct kref *kref)
				538	{
				539	struct ceph_mds_request *req = container_of(kref,
				540	struct ceph_mds_request,
				541	r_kref);
				542	destroy_reply_info(&req->r_reply_info);
				543	if (req->r_request)
				544	ceph_msg_put(req->r_request);
				545	if (req->r_reply)
				546	ceph_msg_put(req->r_reply);
				547	if (req->r_inode) {
				548	ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
				549	iput(req->r_inode);
				550	}
				551	if (req->r_parent)
				552	ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
				553	iput(req->r_target_inode);
				554	if (req->r_dentry)
				555	dput(req->r_dentry);
				556	if (req->r_old_dentry)
				557	dput(req->r_old_dentry);
				558	if (req->r_old_dentry_dir) {
				559	/*
				560	* track (and drop pins for) r_old_dentry_dir
				561	* separately, since r_old_dentry's d_parent may have
				562	* changed between the dir mutex being dropped and
				563	* this request being freed.
				564	*/
				565	ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
				566	CEPH_CAP_PIN);
				567	iput(req->r_old_dentry_dir);
				568	}
				569	kfree(req->r_path1);
				570	kfree(req->r_path2);
				571	if (req->r_pagelist)
				572	ceph_pagelist_release(req->r_pagelist);
				573	put_request_session(req);
				574	ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
				575	kfree(req);
				576	}
				577
				/*
				 * Instantiate rbtree helpers for struct ceph_mds_request, keyed by r_tid
				 * and linked through r_node.  NOTE(review): insert_request(),
				 * erase_request() and lookup_request() used in this file are presumably
				 * the functions this macro generates -- confirm against the macro
				 * definition.
				 */
				578	DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
				579
				580	/*
				581	* lookup session, bump ref if found.
				582	*
				583	* called under mdsc->mutex.
				584	*/
				585	static struct ceph_mds_request *
				586	lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
				587	{
				588	struct ceph_mds_request *req;
				589
				590	req = lookup_request(&mdsc->request_tree, tid);
				591	if (req)
				592	ceph_mdsc_get_request(req);
				593
				594	return req;
				595	}
				596
				597	/*
				598	* Register an in-flight request, and assign a tid. Link to directory
				599	* are modifying (if any).
				600	*
				601	* Called under mdsc->mutex.
				602	*/
				603	static void __register_request(struct ceph_mds_client *mdsc,
				604	struct ceph_mds_request *req,
				605	struct inode *dir)
				606	{
				607	req->r_tid = ++mdsc->last_tid;
				608	if (req->r_num_caps)
				609	ceph_reserve_caps(mdsc, &req->r_caps_reservation,
				610	req->r_num_caps);
				611	dout("__register_request %p tid %lld\n", req, req->r_tid);
				612	ceph_mdsc_get_request(req);
				613	insert_request(&mdsc->request_tree, req);
				614
				615	req->r_uid = current_fsuid();
				616	req->r_gid = current_fsgid();
				617
				618	if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
				619	mdsc->oldest_tid = req->r_tid;
				620
				621	if (dir) {
				622	ihold(dir);
				623	req->r_unsafe_dir = dir;
				624	}
				625	}
				626
				627	static void __unregister_request(struct ceph_mds_client *mdsc,
				628	struct ceph_mds_request *req)
				629	{
				630	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
				631
				632	/* Never leave an unregistered request on an unsafe list! */
				633	list_del_init(&req->r_unsafe_item);
				634
				635	if (req->r_tid == mdsc->oldest_tid) {
				636	struct rb_node *p = rb_next(&req->r_node);
				637	mdsc->oldest_tid = 0;
				638	while (p) {
				639	struct ceph_mds_request *next_req =
				640	rb_entry(p, struct ceph_mds_request, r_node);
				641	if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
				642	mdsc->oldest_tid = next_req->r_tid;
				643	break;
				644	}
				645	p = rb_next(p);
				646	}
				647	}
				648
				649	erase_request(&mdsc->request_tree, req);
				650
				651	if (req->r_unsafe_dir &&
				652	test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
				653	struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
				654	spin_lock(&ci->i_unsafe_lock);
				655	list_del_init(&req->r_unsafe_dir_item);
				656	spin_unlock(&ci->i_unsafe_lock);
				657	}
				658	if (req->r_target_inode &&
				659	test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
				660	struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
				661	spin_lock(&ci->i_unsafe_lock);
				662	list_del_init(&req->r_unsafe_target_item);
				663	spin_unlock(&ci->i_unsafe_lock);
				664	}
				665
				666	if (req->r_unsafe_dir) {
				667	iput(req->r_unsafe_dir);
				668	req->r_unsafe_dir = NULL;
				669	}
				670
				671	complete_all(&req->r_safe_completion);
				672
				673	ceph_mdsc_put_request(req);
				674	}
				675
				676	/*
				677	* Walk back up the dentry tree until we hit a dentry representing a
				678	* non-snapshot inode. We do this using the rcu_read_lock (which must be held
				679	* when calling this) to ensure that the objects won't disappear while we're
				680	* working with them. Once we hit a candidate dentry, we attempt to take a
				681	* reference to it, and return that as the result.
				682	*/
				683	static struct inode get_nonsnap_parent(struct dentry dentry)
				684	{
				685	struct inode *inode = NULL;
				686
				687	while (dentry && !IS_ROOT(dentry)) {
				688	inode = d_inode_rcu(dentry);
				689	if (!inode \|\| ceph_snap(inode) == CEPH_NOSNAP)
				690	break;
				691	dentry = dentry->d_parent;
				692	}
				693	if (inode)
				694	inode = igrab(inode);
				695	return inode;
				696	}
				697
				698	/*
				699	* Choose mds to send request to next. If there is a hint set in the
				700	* request (e.g., due to a prior forward hint from the mds), use that.
				701	* Otherwise, consult frag tree and/or caps to identify the
				702	* appropriate mds. If all else fails, choose randomly.
				703	*
				704	* Called under mdsc->mutex.
				705	*/
				706	static int __choose_mds(struct ceph_mds_client *mdsc,
				707	struct ceph_mds_request *req)
				708	{
				709	struct inode *inode;
				710	struct ceph_inode_info *ci;
				711	struct ceph_cap *cap;
				712	int mode = req->r_direct_mode;
				713	int mds = -1;
				714	u32 hash = req->r_direct_hash;
				715	bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
				716
				717	/*
				718	* is there a specific mds we should try? ignore hint if we have
				719	* no session and the mds is not up (active or recovering).
				720	*/
				721	if (req->r_resend_mds >= 0 &&
				722	(__have_session(mdsc, req->r_resend_mds) \|\|
				723	ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
				724	dout("choose_mds using resend_mds mds%d\n",
				725	req->r_resend_mds);
				726	return req->r_resend_mds;
				727	}
				728
				729	if (mode == USE_RANDOM_MDS)
				730	goto random;
				731
				732	inode = NULL;
				733	if (req->r_inode) {
				734	if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) {
				735	inode = req->r_inode;
				736	ihold(inode);
				737	} else {
				738	/* req->r_dentry is non-null for LSSNAP request */
				739	rcu_read_lock();
				740	inode = get_nonsnap_parent(req->r_dentry);
				741	rcu_read_unlock();
				742	dout("__choose_mds using snapdir's parent %p\n", inode);
				743	}
				744	} else if (req->r_dentry) {
				745	/* ignore race with rename; old or new d_parent is okay */
				746	struct dentry *parent;
				747	struct inode *dir;
				748
				749	rcu_read_lock();
				750	parent = req->r_dentry->d_parent;
				751	dir = req->r_parent ? : d_inode_rcu(parent);
				752
				753	if (!dir \|\| dir->i_sb != mdsc->fsc->sb) {
				754	/* not this fs or parent went negative */
				755	inode = d_inode(req->r_dentry);
				756	if (inode)
				757	ihold(inode);
				758	} else if (ceph_snap(dir) != CEPH_NOSNAP) {
				759	/* direct snapped/virtual snapdir requests
				760	* based on parent dir inode */
				761	inode = get_nonsnap_parent(parent);
				762	dout("__choose_mds using nonsnap parent %p\n", inode);
				763	} else {
				764	/* dentry target */
				765	inode = d_inode(req->r_dentry);
				766	if (!inode \|\| mode == USE_AUTH_MDS) {
				767	/* dir + name */
				768	inode = igrab(dir);
				769	hash = ceph_dentry_hash(dir, req->r_dentry);
				770	is_hash = true;
				771	} else {
				772	ihold(inode);
				773	}
				774	}
				775	rcu_read_unlock();
				776	}
				777
				778	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
				779	(int)hash, mode);
				780	if (!inode)
				781	goto random;
				782	ci = ceph_inode(inode);
				783
				784	if (is_hash && S_ISDIR(inode->i_mode)) {
				785	struct ceph_inode_frag frag;
				786	int found;
				787
				788	ceph_choose_frag(ci, hash, &frag, &found);
				789	if (found) {
				790	if (mode == USE_ANY_MDS && frag.ndist > 0) {
				791	u8 r;
				792
				793	/* choose a random replica */
				794	get_random_bytes(&r, 1);
				795	r %= frag.ndist;
				796	mds = frag.dist[r];
				797	dout("choose_mds %p %llx.%llx "
				798	"frag %u mds%d (%d/%d)\n",
				799	inode, ceph_vinop(inode),
				800	frag.frag, mds,
				801	(int)r, frag.ndist);
				802	if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
				803	CEPH_MDS_STATE_ACTIVE)
				804	goto out;
				805	}
				806
				807	/* since this file/dir wasn't known to be
				808	* replicated, then we want to look for the
				809	* authoritative mds. */
				810	mode = USE_AUTH_MDS;
				811	if (frag.mds >= 0) {
				812	/* choose auth mds */
				813	mds = frag.mds;
				814	dout("choose_mds %p %llx.%llx "
				815	"frag %u mds%d (auth)\n",
				816	inode, ceph_vinop(inode), frag.frag, mds);
				817	if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
				818	CEPH_MDS_STATE_ACTIVE)
				819	goto out;
				820	}
				821	}
				822	}
				823
				824	spin_lock(&ci->i_ceph_lock);
				825	cap = NULL;
				826	if (mode == USE_AUTH_MDS)
				827	cap = ci->i_auth_cap;
				828	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
				829	cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
				830	if (!cap) {
				831	spin_unlock(&ci->i_ceph_lock);
				832	iput(inode);
				833	goto random;
				834	}
				835	mds = cap->session->s_mds;
				836	dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
				837	inode, ceph_vinop(inode), mds,
				838	cap == ci->i_auth_cap ? "auth " : "", cap);
				839	spin_unlock(&ci->i_ceph_lock);
				840	out:
				841	iput(inode);
				842	return mds;
				843
				844	random:
				845	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
				846	dout("choose_mds chose random mds%d\n", mds);
				847	return mds;
				848	}
				849
				850
				851	/*
				852	* session messages
				853	*/
				854	static struct ceph_msg *create_session_msg(u32 op, u64 seq)
				855	{
				856	struct ceph_msg *msg;
				857	struct ceph_mds_session_head *h;
				858
				859	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
				860	false);
				861	if (!msg) {
				862	pr_err("create_session_msg ENOMEM creating msg\n");
				863	return NULL;
				864	}
				865	h = msg->front.iov_base;
				866	h->op = cpu_to_le32(op);
				867	h->seq = cpu_to_le64(seq);
				868
				869	return msg;
				870	}
				871
				872	/*
				873	* session message, specialization for CEPH_SESSION_REQUEST_OPEN
				874	* to include additional client metadata fields.
				875	*/
				876	static struct ceph_msg create_session_open_msg(struct ceph_mds_client mdsc, u64 seq)
				877	{
				878	struct ceph_msg *msg;
				879	struct ceph_mds_session_head *h;
				880	int i = -1;
				881	int metadata_bytes = 0;
				882	int metadata_key_count = 0;
				883	struct ceph_options *opt = mdsc->fsc->client->options;
				884	struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
				885	void *p;
				886
				887	const char* metadata[][2] = {
				888	{"hostname", mdsc->nodename},
				889	{"kernel_version", init_utsname()->release},
				890	{"entity_id", opt->name ? : ""},
				891	{"root", fsopt->server_path ? : "/"},
				892	{NULL, NULL}
				893	};
				894
				895	/* Calculate serialized length of metadata */
				896	metadata_bytes = 4; /* map length */
				897	for (i = 0; metadata[i][0]; ++i) {
				898	metadata_bytes += 8 + strlen(metadata[i][0]) +
				899	strlen(metadata[i][1]);
				900	metadata_key_count++;
				901	}
				902
				903	/* Allocate the message */
				904	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + metadata_bytes,
				905	GFP_NOFS, false);
				906	if (!msg) {
				907	pr_err("create_session_msg ENOMEM creating msg\n");
				908	return NULL;
				909	}
				910	h = msg->front.iov_base;
				911	h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
				912	h->seq = cpu_to_le64(seq);
				913
				914	/*
				915	* Serialize client metadata into waiting buffer space, using
				916	* the format that userspace expects for map<string, string>
				917	*
				918	* ClientSession messages with metadata are v2
				919	*/
				920	msg->hdr.version = cpu_to_le16(2);
				921	msg->hdr.compat_version = cpu_to_le16(1);
				922
				923	/* The write pointer, following the session_head structure */
				924	p = msg->front.iov_base + sizeof(*h);
				925
				926	/* Number of entries in the map */
				927	ceph_encode_32(&p, metadata_key_count);
				928
				929	/* Two length-prefixed strings for each entry in the map */
				930	for (i = 0; metadata[i][0]; ++i) {
				931	size_t const key_len = strlen(metadata[i][0]);
				932	size_t const val_len = strlen(metadata[i][1]);
				933
				934	ceph_encode_32(&p, key_len);
				935	memcpy(p, metadata[i][0], key_len);
				936	p += key_len;
				937	ceph_encode_32(&p, val_len);
				938	memcpy(p, metadata[i][1], val_len);
				939	p += val_len;
				940	}
				941
				942	return msg;
				943	}
				944
				945	/*
				946	* send session open request.
				947	*
				948	* called under mdsc->mutex
				949	*/
				950	static int __open_session(struct ceph_mds_client *mdsc,
				951	struct ceph_mds_session *session)
				952	{
				953	struct ceph_msg *msg;
				954	int mstate;
				955	int mds = session->s_mds;
				956
				957	/* wait for mds to go active? */
				958	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
				959	dout("open_session to mds%d (%s)\n", mds,
				960	ceph_mds_state_name(mstate));
				961	session->s_state = CEPH_MDS_SESSION_OPENING;
				962	session->s_renew_requested = jiffies;
				963
				964	/* send connect message */
				965	msg = create_session_open_msg(mdsc, session->s_seq);
				966	if (!msg)
				967	return -ENOMEM;
				968	ceph_con_send(&session->s_con, msg);
				969	return 0;
				970	}
				971
				972	/*
				973	* open sessions for any export targets for the given mds
				974	*
				975	* called under mdsc->mutex
				976	*/
				977	static struct ceph_mds_session *
				978	__open_export_target_session(struct ceph_mds_client *mdsc, int target)
				979	{
				980	struct ceph_mds_session *session;
				981
				982	session = __ceph_lookup_mds_session(mdsc, target);
				983	if (!session) {
				984	session = register_session(mdsc, target);
				985	if (IS_ERR(session))
				986	return session;
				987	}
				988	if (session->s_state == CEPH_MDS_SESSION_NEW \|\|
				989	session->s_state == CEPH_MDS_SESSION_CLOSING)
				990	__open_session(mdsc, session);
				991
				992	return session;
				993	}
				994
				995	struct ceph_mds_session *
				996	ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
				997	{
				998	struct ceph_mds_session *session;
				999
				1000	dout("open_export_target_session to mds%d\n", target);
				1001
				1002	mutex_lock(&mdsc->mutex);
				1003	session = __open_export_target_session(mdsc, target);
				1004	mutex_unlock(&mdsc->mutex);
				1005
				1006	return session;
				1007	}
				1008
				1009	static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
				1010	struct ceph_mds_session *session)
				1011	{
				1012	struct ceph_mds_info *mi;
				1013	struct ceph_mds_session *ts;
				1014	int i, mds = session->s_mds;
				1015
				1016	if (mds >= mdsc->mdsmap->m_num_mds)
				1017	return;
				1018
				1019	mi = &mdsc->mdsmap->m_info[mds];
				1020	dout("open_export_target_sessions for mds%d (%d targets)\n",
				1021	session->s_mds, mi->num_export_targets);
				1022
				1023	for (i = 0; i < mi->num_export_targets; i++) {
				1024	ts = __open_export_target_session(mdsc, mi->export_targets[i]);
				1025	if (!IS_ERR(ts))
				1026	ceph_put_mds_session(ts);
				1027	}
				1028	}
				1029
				1030	void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
				1031	struct ceph_mds_session *session)
				1032	{
				1033	mutex_lock(&mdsc->mutex);
				1034	__open_export_target_sessions(mdsc, session);
				1035	mutex_unlock(&mdsc->mutex);
				1036	}
				1037
				1038	/*
				1039	* session caps
				1040	*/
				1041
				1042	/* caller holds s_cap_lock, we drop it */
				1043	static void cleanup_cap_releases(struct ceph_mds_client *mdsc,
				1044	struct ceph_mds_session *session)
				1045	__releases(session->s_cap_lock)
				1046	{
				1047	LIST_HEAD(tmp_list);
				1048	list_splice_init(&session->s_cap_releases, &tmp_list);
				1049	session->s_num_cap_releases = 0;
				1050	spin_unlock(&session->s_cap_lock);
				1051
				1052	dout("cleanup_cap_releases mds%d\n", session->s_mds);
				1053	while (!list_empty(&tmp_list)) {
				1054	struct ceph_cap *cap;
				1055	/* zero out the in-progress message */
				1056	cap = list_first_entry(&tmp_list,
				1057	struct ceph_cap, session_caps);
				1058	list_del(&cap->session_caps);
				1059	ceph_put_cap(mdsc, cap);
				1060	}
				1061	}
				1062
				1063	static void cleanup_session_requests(struct ceph_mds_client *mdsc,
				1064	struct ceph_mds_session *session)
				1065	{
				1066	struct ceph_mds_request *req;
				1067	struct rb_node *p;
				1068
				1069	dout("cleanup_session_requests mds%d\n", session->s_mds);
				1070	mutex_lock(&mdsc->mutex);
				1071	while (!list_empty(&session->s_unsafe)) {
				1072	req = list_first_entry(&session->s_unsafe,
				1073	struct ceph_mds_request, r_unsafe_item);
				1074	pr_warn_ratelimited(" dropping unsafe request %llu\n",
				1075	req->r_tid);
				1076	__unregister_request(mdsc, req);
				1077	}
				1078	/* zero r_attempts, so kick_requests() will re-send requests */
				1079	p = rb_first(&mdsc->request_tree);
				1080	while (p) {
				1081	req = rb_entry(p, struct ceph_mds_request, r_node);
				1082	p = rb_next(p);
				1083	if (req->r_session &&
				1084	req->r_session->s_mds == session->s_mds)
				1085	req->r_attempts = 0;
				1086	}
				1087	mutex_unlock(&mdsc->mutex);
				1088	}
				1089
				1090	/*
				1091	* Helper to safely iterate over all caps associated with a session, with
				1092	* special care taken to handle a racing __ceph_remove_cap().
				1093	*
				1094	* Caller must hold session s_mutex.
				1095	*/
				1096	static int iterate_session_caps(struct ceph_mds_session *session,
				1097	int (cb)(struct inode , struct ceph_cap *,
				1098	void ), void arg)
				1099	{
				1100	struct list_head *p;
				1101	struct ceph_cap *cap;
				1102	struct inode inode, last_inode = NULL;
				1103	struct ceph_cap *old_cap = NULL;
				1104	int ret;
				1105
				1106	dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
				1107	spin_lock(&session->s_cap_lock);
				1108	p = session->s_caps.next;
				1109	while (p != &session->s_caps) {
				1110	cap = list_entry(p, struct ceph_cap, session_caps);
				1111	inode = igrab(&cap->ci->vfs_inode);
				1112	if (!inode) {
				1113	p = p->next;
				1114	continue;
				1115	}
				1116	session->s_cap_iterator = cap;
				1117	spin_unlock(&session->s_cap_lock);
				1118
				1119	if (last_inode) {
				1120	iput(last_inode);
				1121	last_inode = NULL;
				1122	}
				1123	if (old_cap) {
				1124	ceph_put_cap(session->s_mdsc, old_cap);
				1125	old_cap = NULL;
				1126	}
				1127
				1128	ret = cb(inode, cap, arg);
				1129	last_inode = inode;
				1130
				1131	spin_lock(&session->s_cap_lock);
				1132	p = p->next;
				1133	if (!cap->ci) {
				1134	dout("iterate_session_caps finishing cap %p removal\n",
				1135	cap);
				1136	BUG_ON(cap->session != session);
				1137	cap->session = NULL;
				1138	list_del_init(&cap->session_caps);
				1139	session->s_nr_caps--;
				1140	if (cap->queue_release) {
				1141	list_add_tail(&cap->session_caps,
				1142	&session->s_cap_releases);
				1143	session->s_num_cap_releases++;
				1144	} else {
				1145	old_cap = cap; /* put_cap it w/o locks held */
				1146	}
				1147	}
				1148	if (ret < 0)
				1149	goto out;
				1150	}
				1151	ret = 0;
				1152	out:
				1153	session->s_cap_iterator = NULL;
				1154	spin_unlock(&session->s_cap_lock);
				1155
				1156	iput(last_inode);
				1157	if (old_cap)
				1158	ceph_put_cap(session->s_mdsc, old_cap);
				1159
				1160	return ret;
				1161	}
				1162
				1163	static int remove_session_caps_cb(struct inode inode, struct ceph_cap cap,
				1164	void *arg)
				1165	{
				1166	struct ceph_fs_client fsc = (struct ceph_fs_client )arg;
				1167	struct ceph_inode_info *ci = ceph_inode(inode);
				1168	LIST_HEAD(to_remove);
				1169	bool drop = false;
				1170	bool invalidate = false;
				1171
				1172	dout("removing cap %p, ci is %p, inode is %p\n",
				1173	cap, ci, &ci->vfs_inode);
				1174	spin_lock(&ci->i_ceph_lock);
				1175	__ceph_remove_cap(cap, false);
				1176	if (!ci->i_auth_cap) {
				1177	struct ceph_cap_flush *cf;
				1178	struct ceph_mds_client *mdsc = fsc->mdsc;
				1179
				1180	ci->i_ceph_flags \|= CEPH_I_CAP_DROPPED;
				1181
				1182	if (ci->i_wrbuffer_ref > 0 &&
				1183	READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
				1184	invalidate = true;
				1185
				1186	while (!list_empty(&ci->i_cap_flush_list)) {
				1187	cf = list_first_entry(&ci->i_cap_flush_list,
				1188	struct ceph_cap_flush, i_list);
				1189	list_move(&cf->i_list, &to_remove);
				1190	}
				1191
				1192	spin_lock(&mdsc->cap_dirty_lock);
				1193
				1194	list_for_each_entry(cf, &to_remove, i_list)
				1195	list_del(&cf->g_list);
				1196
				1197	if (!list_empty(&ci->i_dirty_item)) {
				1198	pr_warn_ratelimited(
				1199	" dropping dirty %s state for %p %lld\n",
				1200	ceph_cap_string(ci->i_dirty_caps),
				1201	inode, ceph_ino(inode));
				1202	ci->i_dirty_caps = 0;
				1203	list_del_init(&ci->i_dirty_item);
				1204	drop = true;
				1205	}
				1206	if (!list_empty(&ci->i_flushing_item)) {
				1207	pr_warn_ratelimited(
				1208	" dropping dirty+flushing %s state for %p %lld\n",
				1209	ceph_cap_string(ci->i_flushing_caps),
				1210	inode, ceph_ino(inode));
				1211	ci->i_flushing_caps = 0;
				1212	list_del_init(&ci->i_flushing_item);
				1213	mdsc->num_cap_flushing--;
				1214	drop = true;
				1215	}
				1216	spin_unlock(&mdsc->cap_dirty_lock);
				1217
				1218	if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
				1219	list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
				1220	ci->i_prealloc_cap_flush = NULL;
				1221	}
				1222
				1223	if (drop &&
				1224	ci->i_wrbuffer_ref_head == 0 &&
				1225	ci->i_wr_ref == 0 &&
				1226	ci->i_dirty_caps == 0 &&
				1227	ci->i_flushing_caps == 0) {
				1228	ceph_put_snap_context(ci->i_head_snapc);
				1229	ci->i_head_snapc = NULL;
				1230	}
				1231	}
				1232	spin_unlock(&ci->i_ceph_lock);
				1233	while (!list_empty(&to_remove)) {
				1234	struct ceph_cap_flush *cf;
				1235	cf = list_first_entry(&to_remove,
				1236	struct ceph_cap_flush, i_list);
				1237	list_del(&cf->i_list);
				1238	ceph_free_cap_flush(cf);
				1239	}
				1240
				1241	wake_up_all(&ci->i_cap_wq);
				1242	if (invalidate)
				1243	ceph_queue_invalidate(inode);
				1244	if (drop)
				1245	iput(inode);
				1246	return 0;
				1247	}
				1248
				1249	/*
				1250	* caller must hold session s_mutex
				1251	*/
				1252	static void remove_session_caps(struct ceph_mds_session *session)
				1253	{
				1254	struct ceph_fs_client *fsc = session->s_mdsc->fsc;
				1255	struct super_block *sb = fsc->sb;
				1256	dout("remove_session_caps on %p\n", session);
				1257	iterate_session_caps(session, remove_session_caps_cb, fsc);
				1258
				1259	wake_up_all(&fsc->mdsc->cap_flushing_wq);
				1260
				1261	spin_lock(&session->s_cap_lock);
				1262	if (session->s_nr_caps > 0) {
				1263	struct inode *inode;
				1264	struct ceph_cap cap, prev = NULL;
				1265	struct ceph_vino vino;
				1266	/*
				1267	* iterate_session_caps() skips inodes that are being
				1268	* deleted, we need to wait until deletions are complete.
				1269	* __wait_on_freeing_inode() is designed for the job,
				1270	* but it is not exported, so use lookup inode function
				1271	* to access it.
				1272	*/
				1273	while (!list_empty(&session->s_caps)) {
				1274	cap = list_entry(session->s_caps.next,
				1275	struct ceph_cap, session_caps);
				1276	if (cap == prev)
				1277	break;
				1278	prev = cap;
				1279	vino = cap->ci->i_vino;
				1280	spin_unlock(&session->s_cap_lock);
				1281
				1282	inode = ceph_find_inode(sb, vino);
				1283	iput(inode);
				1284
				1285	spin_lock(&session->s_cap_lock);
				1286	}
				1287	}
				1288
				1289	// drop cap expires and unlock s_cap_lock
				1290	cleanup_cap_releases(session->s_mdsc, session);
				1291
				1292	BUG_ON(session->s_nr_caps > 0);
				1293	BUG_ON(!list_empty(&session->s_cap_flushing));
				1294	}
				1295
				1296	/*
				1297	* wake up any threads waiting on this session's caps. if the cap is
				1298	* old (didn't get renewed on the client reconnect), remove it now.
				1299	*
				1300	* caller must hold s_mutex.
				1301	*/
				1302	static int wake_up_session_cb(struct inode inode, struct ceph_cap cap,
				1303	void *arg)
				1304	{
				1305	struct ceph_inode_info *ci = ceph_inode(inode);
				1306
				1307	if (arg) {
				1308	spin_lock(&ci->i_ceph_lock);
				1309	ci->i_wanted_max_size = 0;
				1310	ci->i_requested_max_size = 0;
				1311	spin_unlock(&ci->i_ceph_lock);
				1312	}
				1313	wake_up_all(&ci->i_cap_wq);
				1314	return 0;
				1315	}
				1316
				1317	static void wake_up_session_caps(struct ceph_mds_session *session,
				1318	int reconnect)
				1319	{
				1320	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
				1321	iterate_session_caps(session, wake_up_session_cb,
				1322	(void *)(unsigned long)reconnect);
				1323	}
				1324
				1325	/*
				1326	* Send periodic message to MDS renewing all currently held caps. The
				1327	* ack will reset the expiration for all caps from this session.
				1328	*
				1329	* caller holds s_mutex
				1330	*/
				1331	static int send_renew_caps(struct ceph_mds_client *mdsc,
				1332	struct ceph_mds_session *session)
				1333	{
				1334	struct ceph_msg *msg;
				1335	int state;
				1336
				1337	if (time_after_eq(jiffies, session->s_cap_ttl) &&
				1338	time_after_eq(session->s_cap_ttl, session->s_renew_requested))
				1339	pr_info("mds%d caps stale\n", session->s_mds);
				1340	session->s_renew_requested = jiffies;
				1341
				1342	/* do not try to renew caps until a recovering mds has reconnected
				1343	* with its clients. */
				1344	state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
				1345	if (state < CEPH_MDS_STATE_RECONNECT) {
				1346	dout("send_renew_caps ignoring mds%d (%s)\n",
				1347	session->s_mds, ceph_mds_state_name(state));
				1348	return 0;
				1349	}
				1350
				1351	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
				1352	ceph_mds_state_name(state));
				1353	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
				1354	++session->s_renew_seq);
				1355	if (!msg)
				1356	return -ENOMEM;
				1357	ceph_con_send(&session->s_con, msg);
				1358	return 0;
				1359	}
				1360
				1361	static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
				1362	struct ceph_mds_session *session, u64 seq)
				1363	{
				1364	struct ceph_msg *msg;
				1365
				1366	dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
				1367	session->s_mds, ceph_session_state_name(session->s_state), seq);
				1368	msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
				1369	if (!msg)
				1370	return -ENOMEM;
				1371	ceph_con_send(&session->s_con, msg);
				1372	return 0;
				1373	}
				1374
				1375
				1376	/*
				1377	* Note new cap ttl, and any transition from stale -> not stale (fresh?).
				1378	*
				1379	* Called under session->s_mutex
				1380	*/
				1381	static void renewed_caps(struct ceph_mds_client *mdsc,
				1382	struct ceph_mds_session *session, int is_renew)
				1383	{
				1384	int was_stale;
				1385	int wake = 0;
				1386
				1387	spin_lock(&session->s_cap_lock);
				1388	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
				1389
				1390	session->s_cap_ttl = session->s_renew_requested +
				1391	mdsc->mdsmap->m_session_timeout*HZ;
				1392
				1393	if (was_stale) {
				1394	if (time_before(jiffies, session->s_cap_ttl)) {
				1395	pr_info("mds%d caps renewed\n", session->s_mds);
				1396	wake = 1;
				1397	} else {
				1398	pr_info("mds%d caps still stale\n", session->s_mds);
				1399	}
				1400	}
				1401	dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
				1402	session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
				1403	time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
				1404	spin_unlock(&session->s_cap_lock);
				1405
				1406	if (wake)
				1407	wake_up_session_caps(session, 0);
				1408	}
				1409
				1410	/*
				1411	* send a session close request
				1412	*/
				1413	static int request_close_session(struct ceph_mds_client *mdsc,
				1414	struct ceph_mds_session *session)
				1415	{
				1416	struct ceph_msg *msg;
				1417
				1418	dout("request_close_session mds%d state %s seq %lld\n",
				1419	session->s_mds, ceph_session_state_name(session->s_state),
				1420	session->s_seq);
				1421	msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
				1422	if (!msg)
				1423	return -ENOMEM;
				1424	ceph_con_send(&session->s_con, msg);
				1425	return 1;
				1426	}
				1427
				1428	/*
				1429	* Called with s_mutex held.
				1430	*/
				1431	static int __close_session(struct ceph_mds_client *mdsc,
				1432	struct ceph_mds_session *session)
				1433	{
				1434	if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
				1435	return 0;
				1436	session->s_state = CEPH_MDS_SESSION_CLOSING;
				1437	return request_close_session(mdsc, session);
				1438	}
				1439
				1440	static bool drop_negative_children(struct dentry *dentry)
				1441	{
				1442	struct dentry *child;
				1443	bool all_negative = true;
				1444
				1445	if (!d_is_dir(dentry))
				1446	goto out;
				1447
				1448	spin_lock(&dentry->d_lock);
				1449	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
				1450	if (d_really_is_positive(child)) {
				1451	all_negative = false;
				1452	break;
				1453	}
				1454	}
				1455	spin_unlock(&dentry->d_lock);
				1456
				1457	if (all_negative)
				1458	shrink_dcache_parent(dentry);
				1459	out:
				1460	return all_negative;
				1461	}
				1462
				1463	/*
				1464	* Trim old(er) caps.
				1465	*
				1466	* Because we can't cache an inode without one or more caps, we do
				1467	* this indirectly: if a cap is unused, we prune its aliases, at which
				1468	* point the inode will hopefully get dropped to.
				1469	*
				1470	* Yes, this is a bit sloppy. Our only real goal here is to respond to
				1471	* memory pressure from the MDS, though, so it needn't be perfect.
				1472	*/
				1473	static int trim_caps_cb(struct inode inode, struct ceph_cap cap, void *arg)
				1474	{
				1475	struct ceph_mds_session *session = arg;
				1476	struct ceph_inode_info *ci = ceph_inode(inode);
				1477	int used, wanted, oissued, mine;
				1478
				1479	if (session->s_trim_caps <= 0)
				1480	return -1;
				1481
				1482	spin_lock(&ci->i_ceph_lock);
				1483	mine = cap->issued \| cap->implemented;
				1484	used = __ceph_caps_used(ci);
				1485	wanted = __ceph_caps_file_wanted(ci);
				1486	oissued = __ceph_caps_issued_other(ci, cap);
				1487
				1488	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
				1489	inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
				1490	ceph_cap_string(used), ceph_cap_string(wanted));
				1491	if (cap == ci->i_auth_cap) {
				1492	if (ci->i_dirty_caps \|\| ci->i_flushing_caps \|\|
				1493	!list_empty(&ci->i_cap_snaps))
				1494	goto out;
				1495	if ((used \| wanted) & CEPH_CAP_ANY_WR)
				1496	goto out;
				1497	}
				1498	/* The inode has cached pages, but it's no longer used.
				1499	* we can safely drop it */
				1500	if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
				1501	!(oissued & CEPH_CAP_FILE_CACHE)) {
				1502	used = 0;
				1503	oissued = 0;
				1504	}
				1505	if ((used \| wanted) & ~oissued & mine)
				1506	goto out; /* we need these caps */
				1507
				1508	if (oissued) {
				1509	/* we aren't the only cap.. just remove us */
				1510	__ceph_remove_cap(cap, true);
				1511	session->s_trim_caps--;
				1512	} else {
				1513	struct dentry *dentry;
				1514	/* try dropping referring dentries */
				1515	spin_unlock(&ci->i_ceph_lock);
				1516	dentry = d_find_any_alias(inode);
				1517	if (dentry && drop_negative_children(dentry)) {
				1518	int count;
				1519	dput(dentry);
				1520	d_prune_aliases(inode);
				1521	count = atomic_read(&inode->i_count);
				1522	if (count == 1)
				1523	session->s_trim_caps--;
				1524	dout("trim_caps_cb %p cap %p pruned, count now %d\n",
				1525	inode, cap, count);
				1526	} else {
				1527	dput(dentry);
				1528	}
				1529	return 0;
				1530	}
				1531
				1532	out:
				1533	spin_unlock(&ci->i_ceph_lock);
				1534	return 0;
				1535	}
				1536
				1537	/*
				1538	* Trim session cap count down to some max number.
				1539	*/
				1540	static int trim_caps(struct ceph_mds_client *mdsc,
				1541	struct ceph_mds_session *session,
				1542	int max_caps)
				1543	{
				1544	int trim_caps = session->s_nr_caps - max_caps;
				1545
				1546	dout("trim_caps mds%d start: %d / %d, trim %d\n",
				1547	session->s_mds, session->s_nr_caps, max_caps, trim_caps);
				1548	if (trim_caps > 0) {
				1549	session->s_trim_caps = trim_caps;
				1550	iterate_session_caps(session, trim_caps_cb, session);
				1551	dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
				1552	session->s_mds, session->s_nr_caps, max_caps,
				1553	trim_caps - session->s_trim_caps);
				1554	session->s_trim_caps = 0;
				1555	}
				1556
				1557	ceph_send_cap_releases(mdsc, session);
				1558	return 0;
				1559	}
				1560
				1561	static int check_caps_flush(struct ceph_mds_client *mdsc,
				1562	u64 want_flush_tid)
				1563	{
				1564	int ret = 1;
				1565
				1566	spin_lock(&mdsc->cap_dirty_lock);
				1567	if (!list_empty(&mdsc->cap_flush_list)) {
				1568	struct ceph_cap_flush *cf =
				1569	list_first_entry(&mdsc->cap_flush_list,
				1570	struct ceph_cap_flush, g_list);
				1571	if (cf->tid <= want_flush_tid) {
				1572	dout("check_caps_flush still flushing tid "
				1573	"%llu <= %llu\n", cf->tid, want_flush_tid);
				1574	ret = 0;
				1575	}
				1576	}
				1577	spin_unlock(&mdsc->cap_dirty_lock);
				1578	return ret;
				1579	}
				1580
				1581	/*
				1582	* flush all dirty inode data to disk.
				1583	*
				1584	* returns true if we've flushed through want_flush_tid
				1585	*/
				1586	static void wait_caps_flush(struct ceph_mds_client *mdsc,
				1587	u64 want_flush_tid)
				1588	{
				1589	dout("check_caps_flush want %llu\n", want_flush_tid);
				1590
				1591	wait_event(mdsc->cap_flushing_wq,
				1592	check_caps_flush(mdsc, want_flush_tid));
				1593
				1594	dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
				1595	}
				1596
				1597	/*
				1598	* called under s_mutex
				1599	*/
				1600	void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
				1601	struct ceph_mds_session *session)
				1602	{
				1603	struct ceph_msg *msg = NULL;
				1604	struct ceph_mds_cap_release *head;
				1605	struct ceph_mds_cap_item *item;
				1606	struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
				1607	struct ceph_cap *cap;
				1608	LIST_HEAD(tmp_list);
				1609	int num_cap_releases;
				1610	__le32 barrier, *cap_barrier;
				1611
				1612	down_read(&osdc->lock);
				1613	barrier = cpu_to_le32(osdc->epoch_barrier);
				1614	up_read(&osdc->lock);
				1615
				1616	spin_lock(&session->s_cap_lock);
				1617	again:
				1618	list_splice_init(&session->s_cap_releases, &tmp_list);
				1619	num_cap_releases = session->s_num_cap_releases;
				1620	session->s_num_cap_releases = 0;
				1621	spin_unlock(&session->s_cap_lock);
				1622
				1623	while (!list_empty(&tmp_list)) {
				1624	if (!msg) {
				1625	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
				1626	PAGE_SIZE, GFP_NOFS, false);
				1627	if (!msg)
				1628	goto out_err;
				1629	head = msg->front.iov_base;
				1630	head->num = cpu_to_le32(0);
				1631	msg->front.iov_len = sizeof(*head);
				1632
				1633	msg->hdr.version = cpu_to_le16(2);
				1634	msg->hdr.compat_version = cpu_to_le16(1);
				1635	}
				1636
				1637	cap = list_first_entry(&tmp_list, struct ceph_cap,
				1638	session_caps);
				1639	list_del(&cap->session_caps);
				1640	num_cap_releases--;
				1641
				1642	head = msg->front.iov_base;
				1643	le32_add_cpu(&head->num, 1);
				1644	item = msg->front.iov_base + msg->front.iov_len;
				1645	item->ino = cpu_to_le64(cap->cap_ino);
				1646	item->cap_id = cpu_to_le64(cap->cap_id);
				1647	item->migrate_seq = cpu_to_le32(cap->mseq);
				1648	item->seq = cpu_to_le32(cap->issue_seq);
				1649	msg->front.iov_len += sizeof(*item);
				1650
				1651	ceph_put_cap(mdsc, cap);
				1652
				1653	if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
				1654	// Append cap_barrier field
				1655	cap_barrier = msg->front.iov_base + msg->front.iov_len;
				1656	*cap_barrier = barrier;
				1657	msg->front.iov_len += sizeof(*cap_barrier);
				1658
				1659	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				1660	dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
				1661	ceph_con_send(&session->s_con, msg);
				1662	msg = NULL;
				1663	}
				1664	}
				1665
				1666	BUG_ON(num_cap_releases != 0);
				1667
				1668	spin_lock(&session->s_cap_lock);
				1669	if (!list_empty(&session->s_cap_releases))
				1670	goto again;
				1671	spin_unlock(&session->s_cap_lock);
				1672
				1673	if (msg) {
				1674	// Append cap_barrier field
				1675	cap_barrier = msg->front.iov_base + msg->front.iov_len;
				1676	*cap_barrier = barrier;
				1677	msg->front.iov_len += sizeof(*cap_barrier);
				1678
				1679	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				1680	dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
				1681	ceph_con_send(&session->s_con, msg);
				1682	}
				1683	return;
				1684	out_err:
				1685	pr_err("send_cap_releases mds%d, failed to allocate message\n",
				1686	session->s_mds);
				1687	spin_lock(&session->s_cap_lock);
				1688	list_splice(&tmp_list, &session->s_cap_releases);
				1689	session->s_num_cap_releases += num_cap_releases;
				1690	spin_unlock(&session->s_cap_lock);
				1691	}
				1692
				1693	/*
				1694	* requests
				1695	*/
				1696
				1697	int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
				1698	struct inode *dir)
				1699	{
				1700	struct ceph_inode_info *ci = ceph_inode(dir);
				1701	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
				1702	struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
				1703	size_t size = sizeof(struct ceph_mds_reply_dir_entry);
				1704	int order, num_entries;
				1705
				1706	spin_lock(&ci->i_ceph_lock);
				1707	num_entries = ci->i_files + ci->i_subdirs;
				1708	spin_unlock(&ci->i_ceph_lock);
				1709	num_entries = max(num_entries, 1);
				1710	num_entries = min(num_entries, opt->max_readdir);
				1711
				1712	order = get_order(size * num_entries);
				1713	while (order >= 0) {
				1714	rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL \|
				1715	__GFP_NOWARN,
				1716	order);
				1717	if (rinfo->dir_entries)
				1718	break;
				1719	order--;
				1720	}
				1721	if (!rinfo->dir_entries)
				1722	return -ENOMEM;
				1723
				1724	num_entries = (PAGE_SIZE << order) / size;
				1725	num_entries = min(num_entries, opt->max_readdir);
				1726
				1727	rinfo->dir_buf_size = PAGE_SIZE << order;
				1728	req->r_num_caps = num_entries + 1;
				1729	req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
				1730	req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
				1731	return 0;
				1732	}
				1733
				1734	/*
				1735	* Create an mds request.
				1736	*/
				1737	struct ceph_mds_request *
				1738	ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
				1739	{
				1740	struct ceph_mds_request req = kzalloc(sizeof(req), GFP_NOFS);
				1741
				1742	if (!req)
				1743	return ERR_PTR(-ENOMEM);
				1744
				1745	mutex_init(&req->r_fill_mutex);
				1746	req->r_mdsc = mdsc;
				1747	req->r_started = jiffies;
				1748	req->r_resend_mds = -1;
				1749	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
				1750	INIT_LIST_HEAD(&req->r_unsafe_target_item);
				1751	req->r_fmode = -1;
				1752	kref_init(&req->r_kref);
				1753	RB_CLEAR_NODE(&req->r_node);
				1754	INIT_LIST_HEAD(&req->r_wait);
				1755	init_completion(&req->r_completion);
				1756	init_completion(&req->r_safe_completion);
				1757	INIT_LIST_HEAD(&req->r_unsafe_item);
				1758
				1759	req->r_stamp = timespec_trunc(current_kernel_time(), mdsc->fsc->sb->s_time_gran);
				1760
				1761	req->r_op = op;
				1762	req->r_direct_mode = mode;
				1763	return req;
				1764	}
				1765
				1766	/*
				1767	* return oldest (lowest) request, tid in request tree, 0 if none.
				1768	*
				1769	* called under mdsc->mutex.
				1770	*/
				1771	static struct ceph_mds_request __get_oldest_req(struct ceph_mds_client mdsc)
				1772	{
				1773	if (RB_EMPTY_ROOT(&mdsc->request_tree))
				1774	return NULL;
				1775	return rb_entry(rb_first(&mdsc->request_tree),
				1776	struct ceph_mds_request, r_node);
				1777	}
				1778
				1779	static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
				1780	{
				1781	return mdsc->oldest_tid;
				1782	}
				1783
				1784	/*
				1785	* Build a dentry's path. Allocate on heap; caller must kfree. Based
				1786	* on build_path_from_dentry in fs/cifs/dir.c.
				1787	*
				1788	* If @stop_on_nosnap, generate path relative to the first non-snapped
				1789	* inode.
				1790	*
				1791	* Encode hidden .snap dirs as a double /, i.e.
				1792	* foo/.snap/bar -> foo//bar
				1793	*/
				1794	char ceph_mdsc_build_path(struct dentry dentry, int plen, u64 base,
				1795	int stop_on_nosnap)
				1796	{
				1797	struct dentry *temp;
				1798	char *path;
				1799	int len, pos;
				1800	unsigned seq;
				1801
				1802	if (!dentry)
				1803	return ERR_PTR(-EINVAL);
				1804
				1805	retry:
				1806	len = 0;
				1807	seq = read_seqbegin(&rename_lock);
				1808	rcu_read_lock();
				1809	for (temp = dentry; !IS_ROOT(temp);) {
				1810	struct inode *inode = d_inode(temp);
				1811	if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
				1812	len++; /* slash only */
				1813	else if (stop_on_nosnap && inode &&
				1814	ceph_snap(inode) == CEPH_NOSNAP)
				1815	break;
				1816	else
				1817	len += 1 + temp->d_name.len;
				1818	temp = temp->d_parent;
				1819	}
				1820	rcu_read_unlock();
				1821	if (len)
				1822	len--; /* no leading '/' */
				1823
				1824	path = kmalloc(len+1, GFP_NOFS);
				1825	if (!path)
				1826	return ERR_PTR(-ENOMEM);
				1827	pos = len;
				1828	path[pos] = 0; /* trailing null */
				1829	rcu_read_lock();
				1830	for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
				1831	struct inode *inode;
				1832
				1833	spin_lock(&temp->d_lock);
				1834	inode = d_inode(temp);
				1835	if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
				1836	dout("build_path path+%d: %p SNAPDIR\n",
				1837	pos, temp);
				1838	} else if (stop_on_nosnap && inode &&
				1839	ceph_snap(inode) == CEPH_NOSNAP) {
				1840	spin_unlock(&temp->d_lock);
				1841	break;
				1842	} else {
				1843	pos -= temp->d_name.len;
				1844	if (pos < 0) {
				1845	spin_unlock(&temp->d_lock);
				1846	break;
				1847	}
				1848	strncpy(path + pos, temp->d_name.name,
				1849	temp->d_name.len);
				1850	}
				1851	spin_unlock(&temp->d_lock);
				1852	if (pos)
				1853	path[--pos] = '/';
				1854	temp = temp->d_parent;
				1855	}
				1856	rcu_read_unlock();
				1857	if (pos != 0 \|\| read_seqretry(&rename_lock, seq)) {
				1858	pr_err("build_path did not end path lookup where "
				1859	"expected, namelen is %d, pos is %d\n", len, pos);
				1860	/* presumably this is only possible if racing with a
				1861	rename of one of the parent directories (we can not
				1862	lock the dentries above us to prevent this, but
				1863	retrying should be harmless) */
				1864	kfree(path);
				1865	goto retry;
				1866	}
				1867
				1868	*base = ceph_ino(d_inode(temp));
				1869	*plen = len;
				1870	dout("build_path on %p %d built %llx '%.*s'\n",
				1871	dentry, d_count(dentry), *base, len, path);
				1872	return path;
				1873	}
				1874
				1875	/* Duplicate the dentry->d_name.name safely */
				1876	static int clone_dentry_name(struct dentry dentry, const char *ppath,
				1877	int *ppathlen)
				1878	{
				1879	u32 len;
				1880	char *name;
				1881
				1882	retry:
				1883	len = READ_ONCE(dentry->d_name.len);
				1884	name = kmalloc(len + 1, GFP_NOFS);
				1885	if (!name)
				1886	return -ENOMEM;
				1887
				1888	spin_lock(&dentry->d_lock);
				1889	if (dentry->d_name.len != len) {
				1890	spin_unlock(&dentry->d_lock);
				1891	kfree(name);
				1892	goto retry;
				1893	}
				1894	memcpy(name, dentry->d_name.name, len);
				1895	spin_unlock(&dentry->d_lock);
				1896
				1897	name[len] = '\0';
				1898	*ppath = name;
				1899	*ppathlen = len;
				1900	return 0;
				1901	}
				1902
				1903	static int build_dentry_path(struct dentry dentry, struct inode dir,
				1904	const char *ppath, int ppathlen, u64 *pino,
				1905	bool *pfreepath, bool parent_locked)
				1906	{
				1907	int ret;
				1908	char *path;
				1909
				1910	rcu_read_lock();
				1911	if (!dir)
				1912	dir = d_inode_rcu(dentry->d_parent);
				1913	if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
				1914	*pino = ceph_ino(dir);
				1915	rcu_read_unlock();
				1916	if (parent_locked) {
				1917	*ppath = dentry->d_name.name;
				1918	*ppathlen = dentry->d_name.len;
				1919	} else {
				1920	ret = clone_dentry_name(dentry, ppath, ppathlen);
				1921	if (ret)
				1922	return ret;
				1923	*pfreepath = true;
				1924	}
				1925	return 0;
				1926	}
				1927	rcu_read_unlock();
				1928	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
				1929	if (IS_ERR(path))
				1930	return PTR_ERR(path);
				1931	*ppath = path;
				1932	*pfreepath = true;
				1933	return 0;
				1934	}
				1935
				1936	static int build_inode_path(struct inode *inode,
				1937	const char *ppath, int ppathlen, u64 *pino,
				1938	bool *pfreepath)
				1939	{
				1940	struct dentry *dentry;
				1941	char *path;
				1942
				1943	if (ceph_snap(inode) == CEPH_NOSNAP) {
				1944	*pino = ceph_ino(inode);
				1945	*ppathlen = 0;
				1946	return 0;
				1947	}
				1948	dentry = d_find_alias(inode);
				1949	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
				1950	dput(dentry);
				1951	if (IS_ERR(path))
				1952	return PTR_ERR(path);
				1953	*ppath = path;
				1954	*pfreepath = true;
				1955	return 0;
				1956	}
				1957
				1958	/*
				1959	* request arguments may be specified via an inode , a dentry , or
				1960	* an explicit ino+path.
				1961	*/
				1962	static int set_request_path_attr(struct inode rinode, struct dentry rdentry,
				1963	struct inode rdiri, const char rpath,
				1964	u64 rino, const char *ppath, int pathlen,
				1965	u64 ino, bool freepath, bool parent_locked)
				1966	{
				1967	int r = 0;
				1968
				1969	if (rinode) {
				1970	r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
				1971	dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
				1972	ceph_snap(rinode));
				1973	} else if (rdentry) {
				1974	r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
				1975	freepath, parent_locked);
				1976	dout(" dentry %p %llx/%.s\n", rdentry, ino, *pathlen,
				1977	*ppath);
				1978	} else if (rpath \|\| rino) {
				1979	*ino = rino;
				1980	*ppath = rpath;
				1981	*pathlen = rpath ? strlen(rpath) : 0;
				1982	dout(" path %.s\n", pathlen, rpath);
				1983	}
				1984
				1985	return r;
				1986	}
				1987
				1988	/*
				1989	* called under mdsc->mutex
				1990	*/
				1991	static struct ceph_msg create_request_message(struct ceph_mds_client mdsc,
				1992	struct ceph_mds_request *req,
				1993	int mds, bool drop_cap_releases)
				1994	{
				1995	struct ceph_msg *msg;
				1996	struct ceph_mds_request_head *head;
				1997	const char *path1 = NULL;
				1998	const char *path2 = NULL;
				1999	u64 ino1 = 0, ino2 = 0;
				2000	int pathlen1 = 0, pathlen2 = 0;
				2001	bool freepath1 = false, freepath2 = false;
				2002	int len;
				2003	u16 releases;
				2004	void p, end;
				2005	int ret;
				2006
				2007	ret = set_request_path_attr(req->r_inode, req->r_dentry,
				2008	req->r_parent, req->r_path1, req->r_ino1.ino,
				2009	&path1, &pathlen1, &ino1, &freepath1,
				2010	test_bit(CEPH_MDS_R_PARENT_LOCKED,
				2011	&req->r_req_flags));
				2012	if (ret < 0) {
				2013	msg = ERR_PTR(ret);
				2014	goto out;
				2015	}
				2016
				2017	/* If r_old_dentry is set, then assume that its parent is locked */
				2018	ret = set_request_path_attr(NULL, req->r_old_dentry,
				2019	req->r_old_dentry_dir,
				2020	req->r_path2, req->r_ino2.ino,
				2021	&path2, &pathlen2, &ino2, &freepath2, true);
				2022	if (ret < 0) {
				2023	msg = ERR_PTR(ret);
				2024	goto out_free1;
				2025	}
				2026
				2027	len = sizeof(*head) +
				2028	pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
				2029	sizeof(struct ceph_timespec);
				2030
				2031	/* calculate (max) length for cap releases */
				2032	len += sizeof(struct ceph_mds_request_release) *
				2033	(!!req->r_inode_drop + !!req->r_dentry_drop +
				2034	!!req->r_old_inode_drop + !!req->r_old_dentry_drop);
				2035	if (req->r_dentry_drop)
				2036	len += req->r_dentry->d_name.len;
				2037	if (req->r_old_dentry_drop)
				2038	len += req->r_old_dentry->d_name.len;
				2039
				2040	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
				2041	if (!msg) {
				2042	msg = ERR_PTR(-ENOMEM);
				2043	goto out_free2;
				2044	}
				2045
				2046	msg->hdr.version = cpu_to_le16(2);
				2047	msg->hdr.tid = cpu_to_le64(req->r_tid);
				2048
				2049	head = msg->front.iov_base;
				2050	p = msg->front.iov_base + sizeof(*head);
				2051	end = msg->front.iov_base + msg->front.iov_len;
				2052
				2053	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
				2054	head->op = cpu_to_le32(req->r_op);
				2055	head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
				2056	head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
				2057	head->args = req->r_args;
				2058
				2059	ceph_encode_filepath(&p, end, ino1, path1);
				2060	ceph_encode_filepath(&p, end, ino2, path2);
				2061
				2062	/* make note of release offset, in case we need to replay */
				2063	req->r_request_release_offset = p - msg->front.iov_base;
				2064
				2065	/* cap releases */
				2066	releases = 0;
				2067	if (req->r_inode_drop)
				2068	releases += ceph_encode_inode_release(&p,
				2069	req->r_inode ? req->r_inode : d_inode(req->r_dentry),
				2070	mds, req->r_inode_drop, req->r_inode_unless, 0);
				2071	if (req->r_dentry_drop)
				2072	releases += ceph_encode_dentry_release(&p, req->r_dentry,
				2073	req->r_parent, mds, req->r_dentry_drop,
				2074	req->r_dentry_unless);
				2075	if (req->r_old_dentry_drop)
				2076	releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
				2077	req->r_old_dentry_dir, mds,
				2078	req->r_old_dentry_drop,
				2079	req->r_old_dentry_unless);
				2080	if (req->r_old_inode_drop)
				2081	releases += ceph_encode_inode_release(&p,
				2082	d_inode(req->r_old_dentry),
				2083	mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
				2084
				2085	if (drop_cap_releases) {
				2086	releases = 0;
				2087	p = msg->front.iov_base + req->r_request_release_offset;
				2088	}
				2089
				2090	head->num_releases = cpu_to_le16(releases);
				2091
				2092	/* time stamp */
				2093	{
				2094	struct ceph_timespec ts;
				2095	ceph_encode_timespec(&ts, &req->r_stamp);
				2096	ceph_encode_copy(&p, &ts, sizeof(ts));
				2097	}
				2098
				2099	BUG_ON(p > end);
				2100	msg->front.iov_len = p - msg->front.iov_base;
				2101	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				2102
				2103	if (req->r_pagelist) {
				2104	struct ceph_pagelist *pagelist = req->r_pagelist;
				2105	refcount_inc(&pagelist->refcnt);
				2106	ceph_msg_data_add_pagelist(msg, pagelist);
				2107	msg->hdr.data_len = cpu_to_le32(pagelist->length);
				2108	} else {
				2109	msg->hdr.data_len = 0;
				2110	}
				2111
				2112	msg->hdr.data_off = cpu_to_le16(0);
				2113
				2114	out_free2:
				2115	if (freepath2)
				2116	kfree((char *)path2);
				2117	out_free1:
				2118	if (freepath1)
				2119	kfree((char *)path1);
				2120	out:
				2121	return msg;
				2122	}
				2123
				2124	/*
				2125	* called under mdsc->mutex if error, under no mutex if
				2126	* success.
				2127	*/
				2128	static void complete_request(struct ceph_mds_client *mdsc,
				2129	struct ceph_mds_request *req)
				2130	{
				2131	if (req->r_callback)
				2132	req->r_callback(mdsc, req);
				2133	else
				2134	complete_all(&req->r_completion);
				2135	}
				2136
				2137	/*
				2138	* called under mdsc->mutex
				2139	*/
				2140	static int __prepare_send_request(struct ceph_mds_client *mdsc,
				2141	struct ceph_mds_request *req,
				2142	int mds, bool drop_cap_releases)
				2143	{
				2144	struct ceph_mds_request_head *rhead;
				2145	struct ceph_msg *msg;
				2146	int flags = 0;
				2147
				2148	req->r_attempts++;
				2149	if (req->r_inode) {
				2150	struct ceph_cap *cap =
				2151	ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
				2152
				2153	if (cap)
				2154	req->r_sent_on_mseq = cap->mseq;
				2155	else
				2156	req->r_sent_on_mseq = -1;
				2157	}
				2158	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
				2159	req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
				2160
				2161	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
				2162	void *p;
				2163	/*
				2164	* Replay. Do not regenerate message (and rebuild
				2165	* paths, etc.); just use the original message.
				2166	* Rebuilding paths will break for renames because
				2167	* d_move mangles the src name.
				2168	*/
				2169	msg = req->r_request;
				2170	rhead = msg->front.iov_base;
				2171
				2172	flags = le32_to_cpu(rhead->flags);
				2173	flags \|= CEPH_MDS_FLAG_REPLAY;
				2174	rhead->flags = cpu_to_le32(flags);
				2175
				2176	if (req->r_target_inode)
				2177	rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
				2178
				2179	rhead->num_retry = req->r_attempts - 1;
				2180
				2181	/* remove cap/dentry releases from message */
				2182	rhead->num_releases = 0;
				2183
				2184	/* time stamp */
				2185	p = msg->front.iov_base + req->r_request_release_offset;
				2186	{
				2187	struct ceph_timespec ts;
				2188	ceph_encode_timespec(&ts, &req->r_stamp);
				2189	ceph_encode_copy(&p, &ts, sizeof(ts));
				2190	}
				2191
				2192	msg->front.iov_len = p - msg->front.iov_base;
				2193	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				2194	return 0;
				2195	}
				2196
				2197	if (req->r_request) {
				2198	ceph_msg_put(req->r_request);
				2199	req->r_request = NULL;
				2200	}
				2201	msg = create_request_message(mdsc, req, mds, drop_cap_releases);
				2202	if (IS_ERR(msg)) {
				2203	req->r_err = PTR_ERR(msg);
				2204	return PTR_ERR(msg);
				2205	}
				2206	req->r_request = msg;
				2207
				2208	rhead = msg->front.iov_base;
				2209	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
				2210	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
				2211	flags \|= CEPH_MDS_FLAG_REPLAY;
				2212	if (req->r_parent)
				2213	flags \|= CEPH_MDS_FLAG_WANT_DENTRY;
				2214	rhead->flags = cpu_to_le32(flags);
				2215	rhead->num_fwd = req->r_num_fwd;
				2216	rhead->num_retry = req->r_attempts - 1;
				2217	rhead->ino = 0;
				2218
				2219	dout(" r_parent = %p\n", req->r_parent);
				2220	return 0;
				2221	}
				2222
				2223	/*
				2224	* send request, or put it on the appropriate wait list.
				2225	*/
				2226	static int __do_request(struct ceph_mds_client *mdsc,
				2227	struct ceph_mds_request *req)
				2228	{
				2229	struct ceph_mds_session *session = NULL;
				2230	int mds = -1;
				2231	int err = 0;
				2232
				2233	if (req->r_err \|\| test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
				2234	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
				2235	__unregister_request(mdsc, req);
				2236	goto out;
				2237	}
				2238
				2239	if (req->r_timeout &&
				2240	time_after_eq(jiffies, req->r_started + req->r_timeout)) {
				2241	dout("do_request timed out\n");
				2242	err = -EIO;
				2243	goto finish;
				2244	}
				2245	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
				2246	dout("do_request forced umount\n");
				2247	err = -EIO;
				2248	goto finish;
				2249	}
				2250	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
				2251	if (mdsc->mdsmap_err) {
				2252	err = mdsc->mdsmap_err;
				2253	dout("do_request mdsmap err %d\n", err);
				2254	goto finish;
				2255	}
				2256	if (mdsc->mdsmap->m_epoch == 0) {
				2257	dout("do_request no mdsmap, waiting for map\n");
				2258	list_add(&req->r_wait, &mdsc->waiting_for_map);
				2259	goto finish;
				2260	}
				2261	if (!(mdsc->fsc->mount_options->flags &
				2262	CEPH_MOUNT_OPT_MOUNTWAIT) &&
				2263	!ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
				2264	err = -EHOSTUNREACH;
				2265	goto finish;
				2266	}
				2267	}
				2268
				2269	put_request_session(req);
				2270
				2271	mds = __choose_mds(mdsc, req);
				2272	if (mds < 0 \|\|
				2273	ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
				2274	dout("do_request no mds or not active, waiting for map\n");
				2275	list_add(&req->r_wait, &mdsc->waiting_for_map);
				2276	goto out;
				2277	}
				2278
				2279	/* get, open session */
				2280	session = __ceph_lookup_mds_session(mdsc, mds);
				2281	if (!session) {
				2282	session = register_session(mdsc, mds);
				2283	if (IS_ERR(session)) {
				2284	err = PTR_ERR(session);
				2285	goto finish;
				2286	}
				2287	}
				2288	req->r_session = get_session(session);
				2289
				2290	dout("do_request mds%d session %p state %s\n", mds, session,
				2291	ceph_session_state_name(session->s_state));
				2292	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
				2293	session->s_state != CEPH_MDS_SESSION_HUNG) {
				2294	if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
				2295	err = -EACCES;
				2296	goto out_session;
				2297	}
				2298	if (session->s_state == CEPH_MDS_SESSION_NEW \|\|
				2299	session->s_state == CEPH_MDS_SESSION_CLOSING)
				2300	__open_session(mdsc, session);
				2301	list_add(&req->r_wait, &session->s_waiting);
				2302	goto out_session;
				2303	}
				2304
				2305	/* send request */
				2306	req->r_resend_mds = -1; /* forget any previous mds hint */
				2307
				2308	if (req->r_request_started == 0) /* note request start time */
				2309	req->r_request_started = jiffies;
				2310
				2311	err = __prepare_send_request(mdsc, req, mds, false);
				2312	if (!err) {
				2313	ceph_msg_get(req->r_request);
				2314	ceph_con_send(&session->s_con, req->r_request);
				2315	}
				2316
				2317	out_session:
				2318	ceph_put_mds_session(session);
				2319	finish:
				2320	if (err) {
				2321	dout("__do_request early error %d\n", err);
				2322	req->r_err = err;
				2323	complete_request(mdsc, req);
				2324	__unregister_request(mdsc, req);
				2325	}
				2326	out:
				2327	return err;
				2328	}
				2329
				2330	/*
				2331	* called under mdsc->mutex
				2332	*/
				2333	static void __wake_requests(struct ceph_mds_client *mdsc,
				2334	struct list_head *head)
				2335	{
				2336	struct ceph_mds_request *req;
				2337	LIST_HEAD(tmp_list);
				2338
				2339	list_splice_init(head, &tmp_list);
				2340
				2341	while (!list_empty(&tmp_list)) {
				2342	req = list_entry(tmp_list.next,
				2343	struct ceph_mds_request, r_wait);
				2344	list_del_init(&req->r_wait);
				2345	dout(" wake request %p tid %llu\n", req, req->r_tid);
				2346	__do_request(mdsc, req);
				2347	}
				2348	}
				2349
				2350	/*
				2351	* Wake up threads with requests pending for @mds, so that they can
				2352	* resubmit their requests to a possibly different mds.
				2353	*/
				2354	static void kick_requests(struct ceph_mds_client *mdsc, int mds)
				2355	{
				2356	struct ceph_mds_request *req;
				2357	struct rb_node *p = rb_first(&mdsc->request_tree);
				2358
				2359	dout("kick_requests mds%d\n", mds);
				2360	while (p) {
				2361	req = rb_entry(p, struct ceph_mds_request, r_node);
				2362	p = rb_next(p);
				2363	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
				2364	continue;
				2365	if (req->r_attempts > 0)
				2366	continue; /* only new requests */
				2367	if (req->r_session &&
				2368	req->r_session->s_mds == mds) {
				2369	dout(" kicking tid %llu\n", req->r_tid);
				2370	list_del_init(&req->r_wait);
				2371	__do_request(mdsc, req);
				2372	}
				2373	}
				2374	}
				2375
				2376	void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
				2377	struct ceph_mds_request *req)
				2378	{
				2379	dout("submit_request on %p\n", req);
				2380	mutex_lock(&mdsc->mutex);
				2381	__register_request(mdsc, req, NULL);
				2382	__do_request(mdsc, req);
				2383	mutex_unlock(&mdsc->mutex);
				2384	}
				2385
				2386	/*
				2387	* Synchrously perform an mds request. Take care of all of the
				2388	* session setup, forwarding, retry details.
				2389	*/
				2390	int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
				2391	struct inode *dir,
				2392	struct ceph_mds_request *req)
				2393	{
				2394	int err;
				2395
				2396	dout("do_request on %p\n", req);
				2397
				2398	/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
				2399	if (req->r_inode)
				2400	ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
				2401	if (req->r_parent)
				2402	ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
				2403	if (req->r_old_dentry_dir)
				2404	ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
				2405	CEPH_CAP_PIN);
				2406
				2407	/* issue */
				2408	mutex_lock(&mdsc->mutex);
				2409	__register_request(mdsc, req, dir);
				2410	__do_request(mdsc, req);
				2411
				2412	if (req->r_err) {
				2413	err = req->r_err;
				2414	goto out;
				2415	}
				2416
				2417	/* wait */
				2418	mutex_unlock(&mdsc->mutex);
				2419	dout("do_request waiting\n");
				2420	if (!req->r_timeout && req->r_wait_for_completion) {
				2421	err = req->r_wait_for_completion(mdsc, req);
				2422	} else {
				2423	long timeleft = wait_for_completion_killable_timeout(
				2424	&req->r_completion,
				2425	ceph_timeout_jiffies(req->r_timeout));
				2426	if (timeleft > 0)
				2427	err = 0;
				2428	else if (!timeleft)
				2429	err = -EIO; /* timed out */
				2430	else
				2431	err = timeleft; /* killed */
				2432	}
				2433	dout("do_request waited, got %d\n", err);
				2434	mutex_lock(&mdsc->mutex);
				2435
				2436	/* only abort if we didn't race with a real reply */
				2437	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
				2438	err = le32_to_cpu(req->r_reply_info.head->result);
				2439	} else if (err < 0) {
				2440	dout("aborted request %lld with %d\n", req->r_tid, err);
				2441
				2442	/*
				2443	* ensure we aren't running concurrently with
				2444	* ceph_fill_trace or ceph_readdir_prepopulate, which
				2445	* rely on locks (dir mutex) held by our caller.
				2446	*/
				2447	mutex_lock(&req->r_fill_mutex);
				2448	req->r_err = err;
				2449	set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
				2450	mutex_unlock(&req->r_fill_mutex);
				2451
				2452	if (req->r_parent &&
				2453	(req->r_op & CEPH_MDS_OP_WRITE))
				2454	ceph_invalidate_dir_request(req);
				2455	} else {
				2456	err = req->r_err;
				2457	}
				2458
				2459	out:
				2460	mutex_unlock(&mdsc->mutex);
				2461	dout("do_request %p done, result %d\n", req, err);
				2462	return err;
				2463	}
				2464
				2465	/*
				2466	* Invalidate dir's completeness, dentry lease state on an aborted MDS
				2467	* namespace request.
				2468	*/
				2469	void ceph_invalidate_dir_request(struct ceph_mds_request *req)
				2470	{
				2471	struct inode *inode = req->r_parent;
				2472
				2473	dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
				2474
				2475	ceph_dir_clear_complete(inode);
				2476	if (req->r_dentry)
				2477	ceph_invalidate_dentry_lease(req->r_dentry);
				2478	if (req->r_old_dentry)
				2479	ceph_invalidate_dentry_lease(req->r_old_dentry);
				2480	}
				2481
				2482	/*
				2483	* Handle mds reply.
				2484	*
				2485	* We take the session mutex and parse and process the reply immediately.
				2486	* This preserves the logical ordering of replies, capabilities, etc., sent
				2487	* by the MDS as they are applied to our local cache.
				2488	*/
				2489	static void handle_reply(struct ceph_mds_session session, struct ceph_msg msg)
				2490	{
				2491	struct ceph_mds_client *mdsc = session->s_mdsc;
				2492	struct ceph_mds_request *req;
				2493	struct ceph_mds_reply_head *head = msg->front.iov_base;
				2494	struct ceph_mds_reply_info_parsed rinfo; / parsed reply info */
				2495	struct ceph_snap_realm *realm;
				2496	u64 tid;
				2497	int err, result;
				2498	int mds = session->s_mds;
				2499
				2500	if (msg->front.iov_len < sizeof(*head)) {
				2501	pr_err("mdsc_handle_reply got corrupt (short) reply\n");
				2502	ceph_msg_dump(msg);
				2503	return;
				2504	}
				2505
				2506	/* get request, session */
				2507	tid = le64_to_cpu(msg->hdr.tid);
				2508	mutex_lock(&mdsc->mutex);
				2509	req = lookup_get_request(mdsc, tid);
				2510	if (!req) {
				2511	dout("handle_reply on unknown tid %llu\n", tid);
				2512	mutex_unlock(&mdsc->mutex);
				2513	return;
				2514	}
				2515	dout("handle_reply %p\n", req);
				2516
				2517	/* correct session? */
				2518	if (req->r_session != session) {
				2519	pr_err("mdsc_handle_reply got %llu on session mds%d"
				2520	" not mds%d\n", tid, session->s_mds,
				2521	req->r_session ? req->r_session->s_mds : -1);
				2522	mutex_unlock(&mdsc->mutex);
				2523	goto out;
				2524	}
				2525
				2526	/* dup? */
				2527	if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) \|\|
				2528	(test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
				2529	pr_warn("got a dup %s reply on %llu from mds%d\n",
				2530	head->safe ? "safe" : "unsafe", tid, mds);
				2531	mutex_unlock(&mdsc->mutex);
				2532	goto out;
				2533	}
				2534	if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
				2535	pr_warn("got unsafe after safe on %llu from mds%d\n",
				2536	tid, mds);
				2537	mutex_unlock(&mdsc->mutex);
				2538	goto out;
				2539	}
				2540
				2541	result = le32_to_cpu(head->result);
				2542
				2543	/*
				2544	* Handle an ESTALE
				2545	* if we're not talking to the authority, send to them
				2546	* if the authority has changed while we weren't looking,
				2547	* send to new authority
				2548	* Otherwise we just have to return an ESTALE
				2549	*/
				2550	if (result == -ESTALE) {
				2551	dout("got ESTALE on request %llu", req->r_tid);
				2552	req->r_resend_mds = -1;
				2553	if (req->r_direct_mode != USE_AUTH_MDS) {
				2554	dout("not using auth, setting for that now");
				2555	req->r_direct_mode = USE_AUTH_MDS;
				2556	__do_request(mdsc, req);
				2557	mutex_unlock(&mdsc->mutex);
				2558	goto out;
				2559	} else {
				2560	int mds = __choose_mds(mdsc, req);
				2561	if (mds >= 0 && mds != req->r_session->s_mds) {
				2562	dout("but auth changed, so resending");
				2563	__do_request(mdsc, req);
				2564	mutex_unlock(&mdsc->mutex);
				2565	goto out;
				2566	}
				2567	}
				2568	dout("have to return ESTALE on request %llu", req->r_tid);
				2569	}
				2570
				2571
				2572	if (head->safe) {
				2573	set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
				2574	__unregister_request(mdsc, req);
				2575
				2576	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
				2577	/*
				2578	* We already handled the unsafe response, now do the
				2579	* cleanup. No need to examine the response; the MDS
				2580	* doesn't include any result info in the safe
				2581	* response. And even if it did, there is nothing
				2582	* useful we could do with a revised return value.
				2583	*/
				2584	dout("got safe reply %llu, mds%d\n", tid, mds);
				2585
				2586	/* last unsafe request during umount? */
				2587	if (mdsc->stopping && !__get_oldest_req(mdsc))
				2588	complete_all(&mdsc->safe_umount_waiters);
				2589	mutex_unlock(&mdsc->mutex);
				2590	goto out;
				2591	}
				2592	} else {
				2593	set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
				2594	list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
				2595	if (req->r_unsafe_dir) {
				2596	struct ceph_inode_info *ci =
				2597	ceph_inode(req->r_unsafe_dir);
				2598	spin_lock(&ci->i_unsafe_lock);
				2599	list_add_tail(&req->r_unsafe_dir_item,
				2600	&ci->i_unsafe_dirops);
				2601	spin_unlock(&ci->i_unsafe_lock);
				2602	}
				2603	}
				2604
				2605	dout("handle_reply tid %lld result %d\n", tid, result);
				2606	rinfo = &req->r_reply_info;
				2607	err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
				2608	mutex_unlock(&mdsc->mutex);
				2609
				2610	mutex_lock(&session->s_mutex);
				2611	if (err < 0) {
				2612	pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
				2613	ceph_msg_dump(msg);
				2614	goto out_err;
				2615	}
				2616
				2617	/* snap trace */
				2618	realm = NULL;
				2619	if (rinfo->snapblob_len) {
				2620	down_write(&mdsc->snap_rwsem);
				2621	ceph_update_snap_trace(mdsc, rinfo->snapblob,
				2622	rinfo->snapblob + rinfo->snapblob_len,
				2623	le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
				2624	&realm);
				2625	downgrade_write(&mdsc->snap_rwsem);
				2626	} else {
				2627	down_read(&mdsc->snap_rwsem);
				2628	}
				2629
				2630	/* insert trace into our cache */
				2631	mutex_lock(&req->r_fill_mutex);
				2632	current->journal_info = req;
				2633	err = ceph_fill_trace(mdsc->fsc->sb, req);
				2634	if (err == 0) {
				2635	if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR \|\|
				2636	req->r_op == CEPH_MDS_OP_LSSNAP))
				2637	ceph_readdir_prepopulate(req, req->r_session);
				2638	ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
				2639	}
				2640	current->journal_info = NULL;
				2641	mutex_unlock(&req->r_fill_mutex);
				2642
				2643	up_read(&mdsc->snap_rwsem);
				2644	if (realm)
				2645	ceph_put_snap_realm(mdsc, realm);
				2646
				2647	if (err == 0 && req->r_target_inode &&
				2648	test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
				2649	struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
				2650	spin_lock(&ci->i_unsafe_lock);
				2651	list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
				2652	spin_unlock(&ci->i_unsafe_lock);
				2653	}
				2654	out_err:
				2655	mutex_lock(&mdsc->mutex);
				2656	if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
				2657	if (err) {
				2658	req->r_err = err;
				2659	} else {
				2660	req->r_reply = ceph_msg_get(msg);
				2661	set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
				2662	}
				2663	} else {
				2664	dout("reply arrived after request %lld was aborted\n", tid);
				2665	}
				2666	mutex_unlock(&mdsc->mutex);
				2667
				2668	mutex_unlock(&session->s_mutex);
				2669
				2670	/* kick calling process */
				2671	complete_request(mdsc, req);
				2672	out:
				2673	ceph_mdsc_put_request(req);
				2674	return;
				2675	}
				2676
				2677
				2678
				2679	/*
				2680	* handle mds notification that our request has been forwarded.
				2681	*/
				2682	static void handle_forward(struct ceph_mds_client *mdsc,
				2683	struct ceph_mds_session *session,
				2684	struct ceph_msg *msg)
				2685	{
				2686	struct ceph_mds_request *req;
				2687	u64 tid = le64_to_cpu(msg->hdr.tid);
				2688	u32 next_mds;
				2689	u32 fwd_seq;
				2690	int err = -EINVAL;
				2691	void *p = msg->front.iov_base;
				2692	void *end = p + msg->front.iov_len;
				2693
				2694	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
				2695	next_mds = ceph_decode_32(&p);
				2696	fwd_seq = ceph_decode_32(&p);
				2697
				2698	mutex_lock(&mdsc->mutex);
				2699	req = lookup_get_request(mdsc, tid);
				2700	if (!req) {
				2701	dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
				2702	goto out; /* dup reply? */
				2703	}
				2704
				2705	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
				2706	dout("forward tid %llu aborted, unregistering\n", tid);
				2707	__unregister_request(mdsc, req);
				2708	} else if (fwd_seq <= req->r_num_fwd) {
				2709	dout("forward tid %llu to mds%d - old seq %d <= %d\n",
				2710	tid, next_mds, req->r_num_fwd, fwd_seq);
				2711	} else {
				2712	/* resend. forward race not possible; mds would drop */
				2713	dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
				2714	BUG_ON(req->r_err);
				2715	BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
				2716	req->r_attempts = 0;
				2717	req->r_num_fwd = fwd_seq;
				2718	req->r_resend_mds = next_mds;
				2719	put_request_session(req);
				2720	__do_request(mdsc, req);
				2721	}
				2722	ceph_mdsc_put_request(req);
				2723	out:
				2724	mutex_unlock(&mdsc->mutex);
				2725	return;
				2726
				2727	bad:
				2728	pr_err("mdsc_handle_forward decode error err=%d\n", err);
				2729	}
				2730
				2731	/*
				2732	* handle a mds session control message
				2733	*/
				2734	static void handle_session(struct ceph_mds_session *session,
				2735	struct ceph_msg *msg)
				2736	{
				2737	struct ceph_mds_client *mdsc = session->s_mdsc;
				2738	u32 op;
				2739	u64 seq;
				2740	int mds = session->s_mds;
				2741	struct ceph_mds_session_head *h = msg->front.iov_base;
				2742	int wake = 0;
				2743
				2744	/* decode */
				2745	if (msg->front.iov_len != sizeof(*h))
				2746	goto bad;
				2747	op = le32_to_cpu(h->op);
				2748	seq = le64_to_cpu(h->seq);
				2749
				2750	mutex_lock(&mdsc->mutex);
				2751	if (op == CEPH_SESSION_CLOSE) {
				2752	get_session(session);
				2753	__unregister_session(mdsc, session);
				2754	}
				2755	/* FIXME: this ttl calculation is generous */
				2756	session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
				2757	mutex_unlock(&mdsc->mutex);
				2758
				2759	mutex_lock(&session->s_mutex);
				2760
				2761	dout("handle_session mds%d %s %p state %s seq %llu\n",
				2762	mds, ceph_session_op_name(op), session,
				2763	ceph_session_state_name(session->s_state), seq);
				2764
				2765	if (session->s_state == CEPH_MDS_SESSION_HUNG) {
				2766	session->s_state = CEPH_MDS_SESSION_OPEN;
				2767	pr_info("mds%d came back\n", session->s_mds);
				2768	}
				2769
				2770	switch (op) {
				2771	case CEPH_SESSION_OPEN:
				2772	if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
				2773	pr_info("mds%d reconnect success\n", session->s_mds);
				2774	session->s_state = CEPH_MDS_SESSION_OPEN;
				2775	renewed_caps(mdsc, session, 0);
				2776	wake = 1;
				2777	if (mdsc->stopping)
				2778	__close_session(mdsc, session);
				2779	break;
				2780
				2781	case CEPH_SESSION_RENEWCAPS:
				2782	if (session->s_renew_seq == seq)
				2783	renewed_caps(mdsc, session, 1);
				2784	break;
				2785
				2786	case CEPH_SESSION_CLOSE:
				2787	if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
				2788	pr_info("mds%d reconnect denied\n", session->s_mds);
				2789	cleanup_session_requests(mdsc, session);
				2790	remove_session_caps(session);
				2791	wake = 2; /* for good measure */
				2792	wake_up_all(&mdsc->session_close_wq);
				2793	break;
				2794
				2795	case CEPH_SESSION_STALE:
				2796	pr_info("mds%d caps went stale, renewing\n",
				2797	session->s_mds);
				2798	spin_lock(&session->s_gen_ttl_lock);
				2799	session->s_cap_gen++;
				2800	session->s_cap_ttl = jiffies - 1;
				2801	spin_unlock(&session->s_gen_ttl_lock);
				2802	send_renew_caps(mdsc, session);
				2803	break;
				2804
				2805	case CEPH_SESSION_RECALL_STATE:
				2806	trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
				2807	break;
				2808
				2809	case CEPH_SESSION_FLUSHMSG:
				2810	send_flushmsg_ack(mdsc, session, seq);
				2811	break;
				2812
				2813	case CEPH_SESSION_FORCE_RO:
				2814	dout("force_session_readonly %p\n", session);
				2815	spin_lock(&session->s_cap_lock);
				2816	session->s_readonly = true;
				2817	spin_unlock(&session->s_cap_lock);
				2818	wake_up_session_caps(session, 0);
				2819	break;
				2820
				2821	case CEPH_SESSION_REJECT:
				2822	WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
				2823	pr_info("mds%d rejected session\n", session->s_mds);
				2824	session->s_state = CEPH_MDS_SESSION_REJECTED;
				2825	cleanup_session_requests(mdsc, session);
				2826	remove_session_caps(session);
				2827	wake = 2; /* for good measure */
				2828	break;
				2829
				2830	default:
				2831	pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
				2832	WARN_ON(1);
				2833	}
				2834
				2835	mutex_unlock(&session->s_mutex);
				2836	if (wake) {
				2837	mutex_lock(&mdsc->mutex);
				2838	__wake_requests(mdsc, &session->s_waiting);
				2839	if (wake == 2)
				2840	kick_requests(mdsc, mds);
				2841	mutex_unlock(&mdsc->mutex);
				2842	}
				2843	if (op == CEPH_SESSION_CLOSE)
				2844	ceph_put_mds_session(session);
				2845	return;
				2846
				2847	bad:
				2848	pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
				2849	(int)msg->front.iov_len);
				2850	ceph_msg_dump(msg);
				2851	return;
				2852	}
				2853
				2854
				2855	/*
				2856	* called under session->mutex.
				2857	*/
				2858	static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
				2859	struct ceph_mds_session *session)
				2860	{
				2861	struct ceph_mds_request req, nreq;
				2862	struct rb_node *p;
				2863	int err;
				2864
				2865	dout("replay_unsafe_requests mds%d\n", session->s_mds);
				2866
				2867	mutex_lock(&mdsc->mutex);
				2868	list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
				2869	err = __prepare_send_request(mdsc, req, session->s_mds, true);
				2870	if (!err) {
				2871	ceph_msg_get(req->r_request);
				2872	ceph_con_send(&session->s_con, req->r_request);
				2873	}
				2874	}
				2875
				2876	/*
				2877	* also re-send old requests when MDS enters reconnect stage. So that MDS
				2878	* can process completed request in clientreplay stage.
				2879	*/
				2880	p = rb_first(&mdsc->request_tree);
				2881	while (p) {
				2882	req = rb_entry(p, struct ceph_mds_request, r_node);
				2883	p = rb_next(p);
				2884	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
				2885	continue;
				2886	if (req->r_attempts == 0)
				2887	continue; /* only old requests */
				2888	if (req->r_session &&
				2889	req->r_session->s_mds == session->s_mds) {
				2890	err = __prepare_send_request(mdsc, req,
				2891	session->s_mds, true);
				2892	if (!err) {
				2893	ceph_msg_get(req->r_request);
				2894	ceph_con_send(&session->s_con, req->r_request);
				2895	}
				2896	}
				2897	}
				2898	mutex_unlock(&mdsc->mutex);
				2899	}
				2900
				2901	/*
				2902	* Encode information about a cap for a reconnect with the MDS.
				2903	*/
				2904	static int encode_caps_cb(struct inode inode, struct ceph_cap cap,
				2905	void *arg)
				2906	{
				2907	union {
				2908	struct ceph_mds_cap_reconnect v2;
				2909	struct ceph_mds_cap_reconnect_v1 v1;
				2910	} rec;
				2911	struct ceph_inode_info *ci;
				2912	struct ceph_reconnect_state *recon_state = arg;
				2913	struct ceph_pagelist *pagelist = recon_state->pagelist;
				2914	char *path;
				2915	int pathlen, err;
				2916	u64 pathbase;
				2917	u64 snap_follows;
				2918	struct dentry *dentry;
				2919
				2920	ci = cap->ci;
				2921
				2922	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
				2923	inode, ceph_vinop(inode), cap, cap->cap_id,
				2924	ceph_cap_string(cap->issued));
				2925	err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
				2926	if (err)
				2927	return err;
				2928
				2929	dentry = d_find_alias(inode);
				2930	if (dentry) {
				2931	path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
				2932	if (IS_ERR(path)) {
				2933	err = PTR_ERR(path);
				2934	goto out_dput;
				2935	}
				2936	} else {
				2937	path = NULL;
				2938	pathlen = 0;
				2939	pathbase = 0;
				2940	}
				2941
				2942	spin_lock(&ci->i_ceph_lock);
				2943	cap->seq = 0; /* reset cap seq */
				2944	cap->issue_seq = 0; /* and issue_seq */
				2945	cap->mseq = 0; /* and migrate_seq */
				2946	cap->cap_gen = cap->session->s_cap_gen;
				2947
				2948	if (recon_state->msg_version >= 2) {
				2949	rec.v2.cap_id = cpu_to_le64(cap->cap_id);
				2950	rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
				2951	rec.v2.issued = cpu_to_le32(cap->issued);
				2952	rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
				2953	rec.v2.pathbase = cpu_to_le64(pathbase);
				2954	rec.v2.flock_len = 0;
				2955	} else {
				2956	rec.v1.cap_id = cpu_to_le64(cap->cap_id);
				2957	rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
				2958	rec.v1.issued = cpu_to_le32(cap->issued);
				2959	rec.v1.size = cpu_to_le64(inode->i_size);
				2960	ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
				2961	ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
				2962	rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
				2963	rec.v1.pathbase = cpu_to_le64(pathbase);
				2964	}
				2965
				2966	if (list_empty(&ci->i_cap_snaps)) {
				2967	snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0;
				2968	} else {
				2969	struct ceph_cap_snap *capsnap =
				2970	list_first_entry(&ci->i_cap_snaps,
				2971	struct ceph_cap_snap, ci_item);
				2972	snap_follows = capsnap->follows;
				2973	}
				2974	spin_unlock(&ci->i_ceph_lock);
				2975
				2976	if (recon_state->msg_version >= 2) {
				2977	int num_fcntl_locks, num_flock_locks;
				2978	struct ceph_filelock *flocks;
				2979	size_t struct_len, total_len = 0;
				2980	u8 struct_v = 0;
				2981
				2982	encode_again:
				2983	ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
				2984	flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
				2985	sizeof(struct ceph_filelock), GFP_NOFS);
				2986	if (!flocks) {
				2987	err = -ENOMEM;
				2988	goto out_free;
				2989	}
				2990	err = ceph_encode_locks_to_buffer(inode, flocks,
				2991	num_fcntl_locks,
				2992	num_flock_locks);
				2993	if (err) {
				2994	kfree(flocks);
				2995	if (err == -ENOSPC)
				2996	goto encode_again;
				2997	goto out_free;
				2998	}
				2999
				3000	if (recon_state->msg_version >= 3) {
				3001	/* version, compat_version and struct_len */
				3002	total_len = 2 * sizeof(u8) + sizeof(u32);
				3003	struct_v = 2;
				3004	}
				3005	/*
				3006	* number of encoded locks is stable, so copy to pagelist
				3007	*/
				3008	struct_len = 2 * sizeof(u32) +
				3009	(num_fcntl_locks + num_flock_locks) *
				3010	sizeof(struct ceph_filelock);
				3011	rec.v2.flock_len = cpu_to_le32(struct_len);
				3012
				3013	struct_len += sizeof(rec.v2);
				3014	struct_len += sizeof(u32) + pathlen;
				3015
				3016	if (struct_v >= 2)
				3017	struct_len += sizeof(u64); /* snap_follows */
				3018
				3019	total_len += struct_len;
				3020	err = ceph_pagelist_reserve(pagelist, total_len);
				3021
				3022	if (!err) {
				3023	if (recon_state->msg_version >= 3) {
				3024	ceph_pagelist_encode_8(pagelist, struct_v);
				3025	ceph_pagelist_encode_8(pagelist, 1);
				3026	ceph_pagelist_encode_32(pagelist, struct_len);
				3027	}
				3028	ceph_pagelist_encode_string(pagelist, path, pathlen);
				3029	ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
				3030	ceph_locks_to_pagelist(flocks, pagelist,
				3031	num_fcntl_locks,
				3032	num_flock_locks);
				3033	if (struct_v >= 2)
				3034	ceph_pagelist_encode_64(pagelist, snap_follows);
				3035	}
				3036	kfree(flocks);
				3037	} else {
				3038	size_t size = sizeof(u32) + pathlen + sizeof(rec.v1);
				3039	err = ceph_pagelist_reserve(pagelist, size);
				3040	if (!err) {
				3041	ceph_pagelist_encode_string(pagelist, path, pathlen);
				3042	ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
				3043	}
				3044	}
				3045
				3046	recon_state->nr_caps++;
				3047	out_free:
				3048	kfree(path);
				3049	out_dput:
				3050	dput(dentry);
				3051	return err;
				3052	}
				3053
				3054
				3055	/*
				3056	* If an MDS fails and recovers, clients need to reconnect in order to
				3057	* reestablish shared state. This includes all caps issued through
				3058	* this session _and_ the snap_realm hierarchy. Because it's not
				3059	* clear which snap realms the mds cares about, we send everything we
				3060	* know about.. that ensures we'll then get any new info the
				3061	* recovering MDS might have.
				3062	*
				3063	* This is a relatively heavyweight operation, but it's rare.
				3064	*
				3065	* called with mdsc->mutex held.
				3066	*/
				3067	static void send_mds_reconnect(struct ceph_mds_client *mdsc,
				3068	struct ceph_mds_session *session)
				3069	{
				3070	struct ceph_msg *reply;
				3071	struct rb_node *p;
				3072	int mds = session->s_mds;
				3073	int err = -ENOMEM;
				3074	int s_nr_caps;
				3075	struct ceph_pagelist *pagelist;
				3076	struct ceph_reconnect_state recon_state;
				3077
				3078	pr_info("mds%d reconnect start\n", mds);
				3079
				3080	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
				3081	if (!pagelist)
				3082	goto fail_nopagelist;
				3083	ceph_pagelist_init(pagelist);
				3084
				3085	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
				3086	if (!reply)
				3087	goto fail_nomsg;
				3088
				3089	mutex_lock(&session->s_mutex);
				3090	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
				3091	session->s_seq = 0;
				3092
				3093	dout("session %p state %s\n", session,
				3094	ceph_session_state_name(session->s_state));
				3095
				3096	spin_lock(&session->s_gen_ttl_lock);
				3097	session->s_cap_gen++;
				3098	spin_unlock(&session->s_gen_ttl_lock);
				3099
				3100	spin_lock(&session->s_cap_lock);
				3101	/* don't know if session is readonly */
				3102	session->s_readonly = 0;
				3103	/*
				3104	* notify __ceph_remove_cap() that we are composing cap reconnect.
				3105	* If a cap get released before being added to the cap reconnect,
				3106	* __ceph_remove_cap() should skip queuing cap release.
				3107	*/
				3108	session->s_cap_reconnect = 1;
				3109	/* drop old cap expires; we're about to reestablish that state */
				3110	cleanup_cap_releases(mdsc, session);
				3111
				3112	/* trim unused caps to reduce MDS's cache rejoin time */
				3113	if (mdsc->fsc->sb->s_root)
				3114	shrink_dcache_parent(mdsc->fsc->sb->s_root);
				3115
				3116	ceph_con_close(&session->s_con);
				3117	ceph_con_open(&session->s_con,
				3118	CEPH_ENTITY_TYPE_MDS, mds,
				3119	ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
				3120
				3121	/* replay unsafe requests */
				3122	replay_unsafe_requests(mdsc, session);
				3123
				3124	down_read(&mdsc->snap_rwsem);
				3125
				3126	/* traverse this session's caps */
				3127	s_nr_caps = session->s_nr_caps;
				3128	err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
				3129	if (err)
				3130	goto fail;
				3131
				3132	recon_state.nr_caps = 0;
				3133	recon_state.pagelist = pagelist;
				3134	if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
				3135	recon_state.msg_version = 3;
				3136	else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK)
				3137	recon_state.msg_version = 2;
				3138	else
				3139	recon_state.msg_version = 1;
				3140	err = iterate_session_caps(session, encode_caps_cb, &recon_state);
				3141	if (err < 0)
				3142	goto fail;
				3143
				3144	spin_lock(&session->s_cap_lock);
				3145	session->s_cap_reconnect = 0;
				3146	spin_unlock(&session->s_cap_lock);
				3147
				3148	/*
				3149	* snaprealms. we provide mds with the ino, seq (version), and
				3150	* parent for all of our realms. If the mds has any newer info,
				3151	* it will tell us.
				3152	*/
				3153	for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
				3154	struct ceph_snap_realm *realm =
				3155	rb_entry(p, struct ceph_snap_realm, node);
				3156	struct ceph_mds_snaprealm_reconnect sr_rec;
				3157
				3158	dout(" adding snap realm %llx seq %lld parent %llx\n",
				3159	realm->ino, realm->seq, realm->parent_ino);
				3160	sr_rec.ino = cpu_to_le64(realm->ino);
				3161	sr_rec.seq = cpu_to_le64(realm->seq);
				3162	sr_rec.parent = cpu_to_le64(realm->parent_ino);
				3163	err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
				3164	if (err)
				3165	goto fail;
				3166	}
				3167
				3168	reply->hdr.version = cpu_to_le16(recon_state.msg_version);
				3169
				3170	/* raced with cap release? */
				3171	if (s_nr_caps != recon_state.nr_caps) {
				3172	struct page *page = list_first_entry(&pagelist->head,
				3173	struct page, lru);
				3174	__le32 *addr = kmap_atomic(page);
				3175	*addr = cpu_to_le32(recon_state.nr_caps);
				3176	kunmap_atomic(addr);
				3177	}
				3178
				3179	reply->hdr.data_len = cpu_to_le32(pagelist->length);
				3180	ceph_msg_data_add_pagelist(reply, pagelist);
				3181
				3182	ceph_early_kick_flushing_caps(mdsc, session);
				3183
				3184	ceph_con_send(&session->s_con, reply);
				3185
				3186	mutex_unlock(&session->s_mutex);
				3187
				3188	mutex_lock(&mdsc->mutex);
				3189	__wake_requests(mdsc, &session->s_waiting);
				3190	mutex_unlock(&mdsc->mutex);
				3191
				3192	up_read(&mdsc->snap_rwsem);
				3193	return;
				3194
				3195	fail:
				3196	ceph_msg_put(reply);
				3197	up_read(&mdsc->snap_rwsem);
				3198	mutex_unlock(&session->s_mutex);
				3199	fail_nomsg:
				3200	ceph_pagelist_release(pagelist);
				3201	fail_nopagelist:
				3202	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
				3203	return;
				3204	}
				3205
				3206
				3207	/*
				3208	* compare old and new mdsmaps, kicking requests
				3209	* and closing out old connections as necessary
				3210	*
				3211	* called under mdsc->mutex.
				3212	*/
				3213	static void check_new_map(struct ceph_mds_client *mdsc,
				3214	struct ceph_mdsmap *newmap,
				3215	struct ceph_mdsmap *oldmap)
				3216	{
				3217	int i;
				3218	int oldstate, newstate;
				3219	struct ceph_mds_session *s;
				3220
				3221	dout("check_new_map new %u old %u\n",
				3222	newmap->m_epoch, oldmap->m_epoch);
				3223
				3224	for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) {
				3225	if (!mdsc->sessions[i])
				3226	continue;
				3227	s = mdsc->sessions[i];
				3228	oldstate = ceph_mdsmap_get_state(oldmap, i);
				3229	newstate = ceph_mdsmap_get_state(newmap, i);
				3230
				3231	dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
				3232	i, ceph_mds_state_name(oldstate),
				3233	ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
				3234	ceph_mds_state_name(newstate),
				3235	ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
				3236	ceph_session_state_name(s->s_state));
				3237
				3238	if (i >= newmap->m_num_mds \|\|
				3239	memcmp(ceph_mdsmap_get_addr(oldmap, i),
				3240	ceph_mdsmap_get_addr(newmap, i),
				3241	sizeof(struct ceph_entity_addr))) {
				3242	if (s->s_state == CEPH_MDS_SESSION_OPENING) {
				3243	/* the session never opened, just close it
				3244	* out now */
				3245	get_session(s);
				3246	__unregister_session(mdsc, s);
				3247	__wake_requests(mdsc, &s->s_waiting);
				3248	ceph_put_mds_session(s);
				3249	} else if (i >= newmap->m_num_mds) {
				3250	/* force close session for stopped mds */
				3251	get_session(s);
				3252	__unregister_session(mdsc, s);
				3253	__wake_requests(mdsc, &s->s_waiting);
				3254	kick_requests(mdsc, i);
				3255	mutex_unlock(&mdsc->mutex);
				3256
				3257	mutex_lock(&s->s_mutex);
				3258	cleanup_session_requests(mdsc, s);
				3259	remove_session_caps(s);
				3260	mutex_unlock(&s->s_mutex);
				3261
				3262	ceph_put_mds_session(s);
				3263
				3264	mutex_lock(&mdsc->mutex);
				3265	} else {
				3266	/* just close it */
				3267	mutex_unlock(&mdsc->mutex);
				3268	mutex_lock(&s->s_mutex);
				3269	mutex_lock(&mdsc->mutex);
				3270	ceph_con_close(&s->s_con);
				3271	mutex_unlock(&s->s_mutex);
				3272	s->s_state = CEPH_MDS_SESSION_RESTARTING;
				3273	}
				3274	} else if (oldstate == newstate) {
				3275	continue; /* nothing new with this mds */
				3276	}
				3277
				3278	/*
				3279	* send reconnect?
				3280	*/
				3281	if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
				3282	newstate >= CEPH_MDS_STATE_RECONNECT) {
				3283	mutex_unlock(&mdsc->mutex);
				3284	send_mds_reconnect(mdsc, s);
				3285	mutex_lock(&mdsc->mutex);
				3286	}
				3287
				3288	/*
				3289	* kick request on any mds that has gone active.
				3290	*/
				3291	if (oldstate < CEPH_MDS_STATE_ACTIVE &&
				3292	newstate >= CEPH_MDS_STATE_ACTIVE) {
				3293	if (oldstate != CEPH_MDS_STATE_CREATING &&
				3294	oldstate != CEPH_MDS_STATE_STARTING)
				3295	pr_info("mds%d recovery completed\n", s->s_mds);
				3296	kick_requests(mdsc, i);
				3297	ceph_kick_flushing_caps(mdsc, s);
				3298	wake_up_session_caps(s, 1);
				3299	}
				3300	}
				3301
				3302	for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) {
				3303	s = mdsc->sessions[i];
				3304	if (!s)
				3305	continue;
				3306	if (!ceph_mdsmap_is_laggy(newmap, i))
				3307	continue;
				3308	if (s->s_state == CEPH_MDS_SESSION_OPEN \|\|
				3309	s->s_state == CEPH_MDS_SESSION_HUNG \|\|
				3310	s->s_state == CEPH_MDS_SESSION_CLOSING) {
				3311	dout(" connecting to export targets of laggy mds%d\n",
				3312	i);
				3313	__open_export_target_sessions(mdsc, s);
				3314	}
				3315	}
				3316	}
				3317
				3318
				3319
				3320	/*
				3321	* leases
				3322	*/
				3323
				3324	/*
				3325	* caller must hold session s_mutex, dentry->d_lock
				3326	*/
				3327	void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
				3328	{
				3329	struct ceph_dentry_info *di = ceph_dentry(dentry);
				3330
				3331	ceph_put_mds_session(di->lease_session);
				3332	di->lease_session = NULL;
				3333	}
				3334
				3335	static void handle_lease(struct ceph_mds_client *mdsc,
				3336	struct ceph_mds_session *session,
				3337	struct ceph_msg *msg)
				3338	{
				3339	struct super_block *sb = mdsc->fsc->sb;
				3340	struct inode *inode;
				3341	struct dentry parent, dentry;
				3342	struct ceph_dentry_info *di;
				3343	int mds = session->s_mds;
				3344	struct ceph_mds_lease *h = msg->front.iov_base;
				3345	u32 seq;
				3346	struct ceph_vino vino;
				3347	struct qstr dname;
				3348	int release = 0;
				3349
				3350	dout("handle_lease from mds%d\n", mds);
				3351
				3352	/* decode */
				3353	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
				3354	goto bad;
				3355	vino.ino = le64_to_cpu(h->ino);
				3356	vino.snap = CEPH_NOSNAP;
				3357	seq = le32_to_cpu(h->seq);
				3358	dname.name = (void )h + sizeof(h) + sizeof(u32);
				3359	dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
				3360	if (dname.len != get_unaligned_le32(h+1))
				3361	goto bad;
				3362
				3363	/* lookup inode */
				3364	inode = ceph_find_inode(sb, vino);
				3365	dout("handle_lease %s, ino %llx %p %.*s\n",
				3366	ceph_lease_op_name(h->action), vino.ino, inode,
				3367	dname.len, dname.name);
				3368
				3369	mutex_lock(&session->s_mutex);
				3370	session->s_seq++;
				3371
				3372	if (!inode) {
				3373	dout("handle_lease no inode %llx\n", vino.ino);
				3374	goto release;
				3375	}
				3376
				3377	/* dentry */
				3378	parent = d_find_alias(inode);
				3379	if (!parent) {
				3380	dout("no parent dentry on inode %p\n", inode);
				3381	WARN_ON(1);
				3382	goto release; /* hrm... */
				3383	}
				3384	dname.hash = full_name_hash(parent, dname.name, dname.len);
				3385	dentry = d_lookup(parent, &dname);
				3386	dput(parent);
				3387	if (!dentry)
				3388	goto release;
				3389
				3390	spin_lock(&dentry->d_lock);
				3391	di = ceph_dentry(dentry);
				3392	switch (h->action) {
				3393	case CEPH_MDS_LEASE_REVOKE:
				3394	if (di->lease_session == session) {
				3395	if (ceph_seq_cmp(di->lease_seq, seq) > 0)
				3396	h->seq = cpu_to_le32(di->lease_seq);
				3397	__ceph_mdsc_drop_dentry_lease(dentry);
				3398	}
				3399	release = 1;
				3400	break;
				3401
				3402	case CEPH_MDS_LEASE_RENEW:
				3403	if (di->lease_session == session &&
				3404	di->lease_gen == session->s_cap_gen &&
				3405	di->lease_renew_from &&
				3406	di->lease_renew_after == 0) {
				3407	unsigned long duration =
				3408	msecs_to_jiffies(le32_to_cpu(h->duration_ms));
				3409
				3410	di->lease_seq = seq;
				3411	di->time = di->lease_renew_from + duration;
				3412	di->lease_renew_after = di->lease_renew_from +
				3413	(duration >> 1);
				3414	di->lease_renew_from = 0;
				3415	}
				3416	break;
				3417	}
				3418	spin_unlock(&dentry->d_lock);
				3419	dput(dentry);
				3420
				3421	if (!release)
				3422	goto out;
				3423
				3424	release:
				3425	/* let's just reuse the same message */
				3426	h->action = CEPH_MDS_LEASE_REVOKE_ACK;
				3427	ceph_msg_get(msg);
				3428	ceph_con_send(&session->s_con, msg);
				3429
				3430	out:
				3431	iput(inode);
				3432	mutex_unlock(&session->s_mutex);
				3433	return;
				3434
				3435	bad:
				3436	pr_err("corrupt lease message\n");
				3437	ceph_msg_dump(msg);
				3438	}
				3439
				3440	void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
				3441	struct inode *inode,
				3442	struct dentry *dentry, char action,
				3443	u32 seq)
				3444	{
				3445	struct ceph_msg *msg;
				3446	struct ceph_mds_lease *lease;
				3447	int len = sizeof(*lease) + sizeof(u32);
				3448	int dnamelen = 0;
				3449
				3450	dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
				3451	inode, dentry, ceph_lease_op_name(action), session->s_mds);
				3452	dnamelen = dentry->d_name.len;
				3453	len += dnamelen;
				3454
				3455	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
				3456	if (!msg)
				3457	return;
				3458	lease = msg->front.iov_base;
				3459	lease->action = action;
				3460	lease->ino = cpu_to_le64(ceph_vino(inode).ino);
				3461	lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
				3462	lease->seq = cpu_to_le32(seq);
				3463	put_unaligned_le32(dnamelen, lease + 1);
				3464	memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
				3465
				3466	/*
				3467	* if this is a preemptive lease RELEASE, no need to
				3468	* flush request stream, since the actual request will
				3469	* soon follow.
				3470	*/
				3471	msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
				3472
				3473	ceph_con_send(&session->s_con, msg);
				3474	}
				3475
				3476	/*
				3477	* drop all leases (and dentry refs) in preparation for umount
				3478	*/
				3479	static void drop_leases(struct ceph_mds_client *mdsc)
				3480	{
				3481	int i;
				3482
				3483	dout("drop_leases\n");
				3484	mutex_lock(&mdsc->mutex);
				3485	for (i = 0; i < mdsc->max_sessions; i++) {
				3486	struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
				3487	if (!s)
				3488	continue;
				3489	mutex_unlock(&mdsc->mutex);
				3490	mutex_lock(&s->s_mutex);
				3491	mutex_unlock(&s->s_mutex);
				3492	ceph_put_mds_session(s);
				3493	mutex_lock(&mdsc->mutex);
				3494	}
				3495	mutex_unlock(&mdsc->mutex);
				3496	}
				3497
				3498
				3499
				3500	/*
				3501	* delayed work -- periodically trim expired leases, renew caps with mds
				3502	*/
				3503	static void schedule_delayed(struct ceph_mds_client *mdsc)
				3504	{
				3505	int delay = 5;
				3506	unsigned hz = round_jiffies_relative(HZ * delay);
				3507	schedule_delayed_work(&mdsc->delayed_work, hz);
				3508	}
				3509
				3510	static void delayed_work(struct work_struct *work)
				3511	{
				3512	int i;
				3513	struct ceph_mds_client *mdsc =
				3514	container_of(work, struct ceph_mds_client, delayed_work.work);
				3515	int renew_interval;
				3516	int renew_caps;
				3517
				3518	dout("mdsc delayed_work\n");
				3519	ceph_check_delayed_caps(mdsc);
				3520
				3521	if (mdsc->stopping)
				3522	return;
				3523
				3524	mutex_lock(&mdsc->mutex);
				3525	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
				3526	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
				3527	mdsc->last_renew_caps);
				3528	if (renew_caps)
				3529	mdsc->last_renew_caps = jiffies;
				3530
				3531	for (i = 0; i < mdsc->max_sessions; i++) {
				3532	struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
				3533	if (!s)
				3534	continue;
				3535	if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
				3536	dout("resending session close request for mds%d\n",
				3537	s->s_mds);
				3538	request_close_session(mdsc, s);
				3539	ceph_put_mds_session(s);
				3540	continue;
				3541	}
				3542	if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
				3543	if (s->s_state == CEPH_MDS_SESSION_OPEN) {
				3544	s->s_state = CEPH_MDS_SESSION_HUNG;
				3545	pr_info("mds%d hung\n", s->s_mds);
				3546	}
				3547	}
				3548	if (s->s_state == CEPH_MDS_SESSION_NEW \|\|
				3549	s->s_state == CEPH_MDS_SESSION_RESTARTING \|\|
				3550	s->s_state == CEPH_MDS_SESSION_REJECTED) {
				3551	/* this mds is failed or recovering, just wait */
				3552	ceph_put_mds_session(s);
				3553	continue;
				3554	}
				3555	mutex_unlock(&mdsc->mutex);
				3556
				3557	mutex_lock(&s->s_mutex);
				3558	if (renew_caps)
				3559	send_renew_caps(mdsc, s);
				3560	else
				3561	ceph_con_keepalive(&s->s_con);
				3562	if (s->s_state == CEPH_MDS_SESSION_OPEN \|\|
				3563	s->s_state == CEPH_MDS_SESSION_HUNG)
				3564	ceph_send_cap_releases(mdsc, s);
				3565	mutex_unlock(&s->s_mutex);
				3566	ceph_put_mds_session(s);
				3567
				3568	mutex_lock(&mdsc->mutex);
				3569	}
				3570	mutex_unlock(&mdsc->mutex);
				3571
				3572	schedule_delayed(mdsc);
				3573	}
				3574
				3575	int ceph_mdsc_init(struct ceph_fs_client *fsc)
				3576
				3577	{
				3578	struct ceph_mds_client *mdsc;
				3579
				3580	mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
				3581	if (!mdsc)
				3582	return -ENOMEM;
				3583	mdsc->fsc = fsc;
				3584	fsc->mdsc = mdsc;
				3585	mutex_init(&mdsc->mutex);
				3586	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
				3587	if (!mdsc->mdsmap) {
				3588	kfree(mdsc);
				3589	return -ENOMEM;
				3590	}
				3591
				3592	init_completion(&mdsc->safe_umount_waiters);
				3593	init_waitqueue_head(&mdsc->session_close_wq);
				3594	INIT_LIST_HEAD(&mdsc->waiting_for_map);
				3595	mdsc->sessions = NULL;
				3596	atomic_set(&mdsc->num_sessions, 0);
				3597	mdsc->max_sessions = 0;
				3598	mdsc->stopping = 0;
				3599	mdsc->last_snap_seq = 0;
				3600	init_rwsem(&mdsc->snap_rwsem);
				3601	mdsc->snap_realms = RB_ROOT;
				3602	INIT_LIST_HEAD(&mdsc->snap_empty);
				3603	spin_lock_init(&mdsc->snap_empty_lock);
				3604	mdsc->last_tid = 0;
				3605	mdsc->oldest_tid = 0;
				3606	mdsc->request_tree = RB_ROOT;
				3607	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
				3608	mdsc->last_renew_caps = jiffies;
				3609	INIT_LIST_HEAD(&mdsc->cap_delay_list);
				3610	spin_lock_init(&mdsc->cap_delay_lock);
				3611	INIT_LIST_HEAD(&mdsc->snap_flush_list);
				3612	spin_lock_init(&mdsc->snap_flush_lock);
				3613	mdsc->last_cap_flush_tid = 1;
				3614	INIT_LIST_HEAD(&mdsc->cap_flush_list);
				3615	INIT_LIST_HEAD(&mdsc->cap_dirty);
				3616	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
				3617	mdsc->num_cap_flushing = 0;
				3618	spin_lock_init(&mdsc->cap_dirty_lock);
				3619	init_waitqueue_head(&mdsc->cap_flushing_wq);
				3620	spin_lock_init(&mdsc->dentry_lru_lock);
				3621	INIT_LIST_HEAD(&mdsc->dentry_lru);
				3622
				3623	ceph_caps_init(mdsc);
				3624	ceph_adjust_min_caps(mdsc, fsc->min_caps);
				3625
				3626	init_rwsem(&mdsc->pool_perm_rwsem);
				3627	mdsc->pool_perm_tree = RB_ROOT;
				3628
				3629	strncpy(mdsc->nodename, utsname()->nodename,
				3630	sizeof(mdsc->nodename) - 1);
				3631	return 0;
				3632	}
				3633
				3634	/*
				3635	* Wait for safe replies on open mds requests. If we time out, drop
				3636	* all requests from the tree to avoid dangling dentry refs.
				3637	*/
				3638	static void wait_requests(struct ceph_mds_client *mdsc)
				3639	{
				3640	struct ceph_options *opts = mdsc->fsc->client->options;
				3641	struct ceph_mds_request *req;
				3642
				3643	mutex_lock(&mdsc->mutex);
				3644	if (__get_oldest_req(mdsc)) {
				3645	mutex_unlock(&mdsc->mutex);
				3646
				3647	dout("wait_requests waiting for requests\n");
				3648	wait_for_completion_timeout(&mdsc->safe_umount_waiters,
				3649	ceph_timeout_jiffies(opts->mount_timeout));
				3650
				3651	/* tear down remaining requests */
				3652	mutex_lock(&mdsc->mutex);
				3653	while ((req = __get_oldest_req(mdsc))) {
				3654	dout("wait_requests timed out on tid %llu\n",
				3655	req->r_tid);
				3656	__unregister_request(mdsc, req);
				3657	}
				3658	}
				3659	mutex_unlock(&mdsc->mutex);
				3660	dout("wait_requests done\n");
				3661	}
				3662
				3663	/*
				3664	* called before mount is ro, and before dentries are torn down.
				3665	* (hmm, does this still race with new lookups?)
				3666	*/
				3667	void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
				3668	{
				3669	dout("pre_umount\n");
				3670	mdsc->stopping = 1;
				3671
				3672	drop_leases(mdsc);
				3673	ceph_flush_dirty_caps(mdsc);
				3674	wait_requests(mdsc);
				3675
				3676	/*
				3677	* wait for reply handlers to drop their request refs and
				3678	* their inode/dcache refs
				3679	*/
				3680	ceph_msgr_flush();
				3681	}
				3682
				3683	/*
				3684	* wait for all write mds requests to flush.
				3685	*/
				3686	static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
				3687	{
				3688	struct ceph_mds_request req = NULL, nextreq;
				3689	struct rb_node *n;
				3690
				3691	mutex_lock(&mdsc->mutex);
				3692	dout("wait_unsafe_requests want %lld\n", want_tid);
				3693	restart:
				3694	req = __get_oldest_req(mdsc);
				3695	while (req && req->r_tid <= want_tid) {
				3696	/* find next request */
				3697	n = rb_next(&req->r_node);
				3698	if (n)
				3699	nextreq = rb_entry(n, struct ceph_mds_request, r_node);
				3700	else
				3701	nextreq = NULL;
				3702	if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
				3703	(req->r_op & CEPH_MDS_OP_WRITE)) {
				3704	/* write op */
				3705	ceph_mdsc_get_request(req);
				3706	if (nextreq)
				3707	ceph_mdsc_get_request(nextreq);
				3708	mutex_unlock(&mdsc->mutex);
				3709	dout("wait_unsafe_requests wait on %llu (want %llu)\n",
				3710	req->r_tid, want_tid);
				3711	wait_for_completion(&req->r_safe_completion);
				3712	mutex_lock(&mdsc->mutex);
				3713	ceph_mdsc_put_request(req);
				3714	if (!nextreq)
				3715	break; /* next dne before, so we're done! */
				3716	if (RB_EMPTY_NODE(&nextreq->r_node)) {
				3717	/* next request was removed from tree */
				3718	ceph_mdsc_put_request(nextreq);
				3719	goto restart;
				3720	}
				3721	ceph_mdsc_put_request(nextreq); /* won't go away */
				3722	}
				3723	req = nextreq;
				3724	}
				3725	mutex_unlock(&mdsc->mutex);
				3726	dout("wait_unsafe_requests done\n");
				3727	}
				3728
				3729	void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
				3730	{
				3731	u64 want_tid, want_flush;
				3732
				3733	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
				3734	return;
				3735
				3736	dout("sync\n");
				3737	mutex_lock(&mdsc->mutex);
				3738	want_tid = mdsc->last_tid;
				3739	mutex_unlock(&mdsc->mutex);
				3740
				3741	ceph_flush_dirty_caps(mdsc);
				3742	spin_lock(&mdsc->cap_dirty_lock);
				3743	want_flush = mdsc->last_cap_flush_tid;
				3744	if (!list_empty(&mdsc->cap_flush_list)) {
				3745	struct ceph_cap_flush *cf =
				3746	list_last_entry(&mdsc->cap_flush_list,
				3747	struct ceph_cap_flush, g_list);
				3748	cf->wake = true;
				3749	}
				3750	spin_unlock(&mdsc->cap_dirty_lock);
				3751
				3752	dout("sync want tid %lld flush_seq %lld\n",
				3753	want_tid, want_flush);
				3754
				3755	wait_unsafe_requests(mdsc, want_tid);
				3756	wait_caps_flush(mdsc, want_flush);
				3757	}
				3758
				3759	/*
				3760	* true if all sessions are closed, or we force unmount
				3761	*/
				3762	static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
				3763	{
				3764	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
				3765	return true;
				3766	return atomic_read(&mdsc->num_sessions) <= skipped;
				3767	}
				3768
				3769	/*
				3770	* called after sb is ro.
				3771	*/
				3772	void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
				3773	{
				3774	struct ceph_options *opts = mdsc->fsc->client->options;
				3775	struct ceph_mds_session *session;
				3776	int i;
				3777	int skipped = 0;
				3778
				3779	dout("close_sessions\n");
				3780
				3781	/* close sessions */
				3782	mutex_lock(&mdsc->mutex);
				3783	for (i = 0; i < mdsc->max_sessions; i++) {
				3784	session = __ceph_lookup_mds_session(mdsc, i);
				3785	if (!session)
				3786	continue;
				3787	mutex_unlock(&mdsc->mutex);
				3788	mutex_lock(&session->s_mutex);
				3789	if (__close_session(mdsc, session) <= 0)
				3790	skipped++;
				3791	mutex_unlock(&session->s_mutex);
				3792	ceph_put_mds_session(session);
				3793	mutex_lock(&mdsc->mutex);
				3794	}
				3795	mutex_unlock(&mdsc->mutex);
				3796
				3797	dout("waiting for sessions to close\n");
				3798	wait_event_timeout(mdsc->session_close_wq,
				3799	done_closing_sessions(mdsc, skipped),
				3800	ceph_timeout_jiffies(opts->mount_timeout));
				3801
				3802	/* tear down remaining sessions */
				3803	mutex_lock(&mdsc->mutex);
				3804	for (i = 0; i < mdsc->max_sessions; i++) {
				3805	if (mdsc->sessions[i]) {
				3806	session = get_session(mdsc->sessions[i]);
				3807	__unregister_session(mdsc, session);
				3808	mutex_unlock(&mdsc->mutex);
				3809	mutex_lock(&session->s_mutex);
				3810	remove_session_caps(session);
				3811	mutex_unlock(&session->s_mutex);
				3812	ceph_put_mds_session(session);
				3813	mutex_lock(&mdsc->mutex);
				3814	}
				3815	}
				3816	WARN_ON(!list_empty(&mdsc->cap_delay_list));
				3817	mutex_unlock(&mdsc->mutex);
				3818
				3819	ceph_cleanup_empty_realms(mdsc);
				3820
				3821	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
				3822
				3823	dout("stopped\n");
				3824	}
				3825
				3826	void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
				3827	{
				3828	struct ceph_mds_session *session;
				3829	int mds;
				3830
				3831	dout("force umount\n");
				3832
				3833	mutex_lock(&mdsc->mutex);
				3834	for (mds = 0; mds < mdsc->max_sessions; mds++) {
				3835	session = __ceph_lookup_mds_session(mdsc, mds);
				3836	if (!session)
				3837	continue;
				3838	mutex_unlock(&mdsc->mutex);
				3839	mutex_lock(&session->s_mutex);
				3840	__close_session(mdsc, session);
				3841	if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
				3842	cleanup_session_requests(mdsc, session);
				3843	remove_session_caps(session);
				3844	}
				3845	mutex_unlock(&session->s_mutex);
				3846	ceph_put_mds_session(session);
				3847	mutex_lock(&mdsc->mutex);
				3848	kick_requests(mdsc, mds);
				3849	}
				3850	__wake_requests(mdsc, &mdsc->waiting_for_map);
				3851	mutex_unlock(&mdsc->mutex);
				3852	}
				3853
				3854	static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
				3855	{
				3856	dout("stop\n");
				3857	/*
				3858	* Make sure the delayed work stopped before releasing
				3859	* the resources.
				3860	*
				3861	* Because the cancel_delayed_work_sync() will only
				3862	* guarantee that the work finishes executing. But the
				3863	* delayed work will re-arm itself again after that.
				3864	*/
				3865	flush_delayed_work(&mdsc->delayed_work);
				3866
				3867	if (mdsc->mdsmap)
				3868	ceph_mdsmap_destroy(mdsc->mdsmap);
				3869	kfree(mdsc->sessions);
				3870	ceph_caps_finalize(mdsc);
				3871	ceph_pool_perm_destroy(mdsc);
				3872	}
				3873
				3874	void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
				3875	{
				3876	struct ceph_mds_client *mdsc = fsc->mdsc;
				3877	dout("mdsc_destroy %p\n", mdsc);
				3878
				3879	/* flush out any connection work with references to us */
				3880	ceph_msgr_flush();
				3881
				3882	ceph_mdsc_stop(mdsc);
				3883
				3884	fsc->mdsc = NULL;
				3885	kfree(mdsc);
				3886	dout("mdsc_destroy %p done\n", mdsc);
				3887	}
				3888
				3889	void ceph_mdsc_handle_fsmap(struct ceph_mds_client mdsc, struct ceph_msg msg)
				3890	{
				3891	struct ceph_fs_client *fsc = mdsc->fsc;
				3892	const char *mds_namespace = fsc->mount_options->mds_namespace;
				3893	void *p = msg->front.iov_base;
				3894	void *end = p + msg->front.iov_len;
				3895	u32 epoch;
				3896	u32 map_len;
				3897	u32 num_fs;
				3898	u32 mount_fscid = (u32)-1;
				3899	u8 struct_v, struct_cv;
				3900	int err = -EINVAL;
				3901
				3902	ceph_decode_need(&p, end, sizeof(u32), bad);
				3903	epoch = ceph_decode_32(&p);
				3904
				3905	dout("handle_fsmap epoch %u\n", epoch);
				3906
				3907	ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
				3908	struct_v = ceph_decode_8(&p);
				3909	struct_cv = ceph_decode_8(&p);
				3910	map_len = ceph_decode_32(&p);
				3911
				3912	ceph_decode_need(&p, end, sizeof(u32) * 3, bad);
				3913	p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */
				3914
				3915	num_fs = ceph_decode_32(&p);
				3916	while (num_fs-- > 0) {
				3917	void info_p, info_end;
				3918	u32 info_len;
				3919	u8 info_v, info_cv;
				3920	u32 fscid, namelen;
				3921
				3922	ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
				3923	info_v = ceph_decode_8(&p);
				3924	info_cv = ceph_decode_8(&p);
				3925	info_len = ceph_decode_32(&p);
				3926	ceph_decode_need(&p, end, info_len, bad);
				3927	info_p = p;
				3928	info_end = p + info_len;
				3929	p = info_end;
				3930
				3931	ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
				3932	fscid = ceph_decode_32(&info_p);
				3933	namelen = ceph_decode_32(&info_p);
				3934	ceph_decode_need(&info_p, info_end, namelen, bad);
				3935
				3936	if (mds_namespace &&
				3937	strlen(mds_namespace) == namelen &&
				3938	!strncmp(mds_namespace, (char *)info_p, namelen)) {
				3939	mount_fscid = fscid;
				3940	break;
				3941	}
				3942	}
				3943
				3944	ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
				3945	if (mount_fscid != (u32)-1) {
				3946	fsc->client->monc.fs_cluster_id = mount_fscid;
				3947	ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
				3948	0, true);
				3949	ceph_monc_renew_subs(&fsc->client->monc);
				3950	} else {
				3951	err = -ENOENT;
				3952	goto err_out;
				3953	}
				3954	return;
				3955	bad:
				3956	pr_err("error decoding fsmap\n");
				3957	err_out:
				3958	mutex_lock(&mdsc->mutex);
				3959	mdsc->mdsmap_err = -ENOENT;
				3960	__wake_requests(mdsc, &mdsc->waiting_for_map);
				3961	mutex_unlock(&mdsc->mutex);
				3962	return;
				3963	}
				3964
				3965	/*
				3966	* handle mds map update.
				3967	*/
				3968	void ceph_mdsc_handle_mdsmap(struct ceph_mds_client mdsc, struct ceph_msg msg)
				3969	{
				3970	u32 epoch;
				3971	u32 maplen;
				3972	void *p = msg->front.iov_base;
				3973	void *end = p + msg->front.iov_len;
				3974	struct ceph_mdsmap newmap, oldmap;
				3975	struct ceph_fsid fsid;
				3976	int err = -EINVAL;
				3977
				3978	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
				3979	ceph_decode_copy(&p, &fsid, sizeof(fsid));
				3980	if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
				3981	return;
				3982	epoch = ceph_decode_32(&p);
				3983	maplen = ceph_decode_32(&p);
				3984	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
				3985
				3986	/* do we need it? */
				3987	mutex_lock(&mdsc->mutex);
				3988	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
				3989	dout("handle_map epoch %u <= our %u\n",
				3990	epoch, mdsc->mdsmap->m_epoch);
				3991	mutex_unlock(&mdsc->mutex);
				3992	return;
				3993	}
				3994
				3995	newmap = ceph_mdsmap_decode(&p, end);
				3996	if (IS_ERR(newmap)) {
				3997	err = PTR_ERR(newmap);
				3998	goto bad_unlock;
				3999	}
				4000
				4001	/* swap into place */
				4002	if (mdsc->mdsmap) {
				4003	oldmap = mdsc->mdsmap;
				4004	mdsc->mdsmap = newmap;
				4005	check_new_map(mdsc, newmap, oldmap);
				4006	ceph_mdsmap_destroy(oldmap);
				4007	} else {
				4008	mdsc->mdsmap = newmap; /* first mds map */
				4009	}
				4010	mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
				4011
				4012	__wake_requests(mdsc, &mdsc->waiting_for_map);
				4013	ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
				4014	mdsc->mdsmap->m_epoch);
				4015
				4016	mutex_unlock(&mdsc->mutex);
				4017	schedule_delayed(mdsc);
				4018	return;
				4019
				4020	bad_unlock:
				4021	mutex_unlock(&mdsc->mutex);
				4022	bad:
				4023	pr_err("error decoding mdsmap %d\n", err);
				4024	return;
				4025	}
				4026
				4027	static struct ceph_connection con_get(struct ceph_connection con)
				4028	{
				4029	struct ceph_mds_session *s = con->private;
				4030
				4031	if (get_session(s)) {
				4032	dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref));
				4033	return con;
				4034	}
				4035	dout("mdsc con_get %p FAIL\n", s);
				4036	return NULL;
				4037	}
				4038
				4039	static void con_put(struct ceph_connection *con)
				4040	{
				4041	struct ceph_mds_session *s = con->private;
				4042
				4043	dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1);
				4044	ceph_put_mds_session(s);
				4045	}
				4046
				4047	/*
				4048	* if the client is unresponsive for long enough, the mds will kill
				4049	* the session entirely.
				4050	*/
				4051	static void peer_reset(struct ceph_connection *con)
				4052	{
				4053	struct ceph_mds_session *s = con->private;
				4054	struct ceph_mds_client *mdsc = s->s_mdsc;
				4055
				4056	pr_warn("mds%d closed our session\n", s->s_mds);
				4057	send_mds_reconnect(mdsc, s);
				4058	}
				4059
				4060	static void dispatch(struct ceph_connection con, struct ceph_msg msg)
				4061	{
				4062	struct ceph_mds_session *s = con->private;
				4063	struct ceph_mds_client *mdsc = s->s_mdsc;
				4064	int type = le16_to_cpu(msg->hdr.type);
				4065
				4066	mutex_lock(&mdsc->mutex);
				4067	if (__verify_registered_session(mdsc, s) < 0) {
				4068	mutex_unlock(&mdsc->mutex);
				4069	goto out;
				4070	}
				4071	mutex_unlock(&mdsc->mutex);
				4072
				4073	switch (type) {
				4074	case CEPH_MSG_MDS_MAP:
				4075	ceph_mdsc_handle_mdsmap(mdsc, msg);
				4076	break;
				4077	case CEPH_MSG_FS_MAP_USER:
				4078	ceph_mdsc_handle_fsmap(mdsc, msg);
				4079	break;
				4080	case CEPH_MSG_CLIENT_SESSION:
				4081	handle_session(s, msg);
				4082	break;
				4083	case CEPH_MSG_CLIENT_REPLY:
				4084	handle_reply(s, msg);
				4085	break;
				4086	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
				4087	handle_forward(mdsc, s, msg);
				4088	break;
				4089	case CEPH_MSG_CLIENT_CAPS:
				4090	ceph_handle_caps(s, msg);
				4091	break;
				4092	case CEPH_MSG_CLIENT_SNAP:
				4093	ceph_handle_snap(mdsc, s, msg);
				4094	break;
				4095	case CEPH_MSG_CLIENT_LEASE:
				4096	handle_lease(mdsc, s, msg);
				4097	break;
				4098
				4099	default:
				4100	pr_err("received unknown message type %d %s\n", type,
				4101	ceph_msg_type_name(type));
				4102	}
				4103	out:
				4104	ceph_msg_put(msg);
				4105	}
				4106
				4107	/*
				4108	* authentication
				4109	*/
				4110
				4111	/*
				4112	* Note: returned pointer is the address of a structure that's
				4113	* managed separately. Caller must not attempt to free it.
				4114	*/
				4115	static struct ceph_auth_handshake get_authorizer(struct ceph_connection con,
				4116	int *proto, int force_new)
				4117	{
				4118	struct ceph_mds_session *s = con->private;
				4119	struct ceph_mds_client *mdsc = s->s_mdsc;
				4120	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
				4121	struct ceph_auth_handshake *auth = &s->s_auth;
				4122
				4123	if (force_new && auth->authorizer) {
				4124	ceph_auth_destroy_authorizer(auth->authorizer);
				4125	auth->authorizer = NULL;
				4126	}
				4127	if (!auth->authorizer) {
				4128	int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
				4129	auth);
				4130	if (ret)
				4131	return ERR_PTR(ret);
				4132	} else {
				4133	int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
				4134	auth);
				4135	if (ret)
				4136	return ERR_PTR(ret);
				4137	}
				4138	*proto = ac->protocol;
				4139
				4140	return auth;
				4141	}
				4142
				4143	static int add_authorizer_challenge(struct ceph_connection *con,
				4144	void *challenge_buf, int challenge_buf_len)
				4145	{
				4146	struct ceph_mds_session *s = con->private;
				4147	struct ceph_mds_client *mdsc = s->s_mdsc;
				4148	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
				4149
				4150	return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer,
				4151	challenge_buf, challenge_buf_len);
				4152	}
				4153
				4154	static int verify_authorizer_reply(struct ceph_connection *con)
				4155	{
				4156	struct ceph_mds_session *s = con->private;
				4157	struct ceph_mds_client *mdsc = s->s_mdsc;
				4158	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
				4159
				4160	return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer);
				4161	}
				4162
				4163	static int invalidate_authorizer(struct ceph_connection *con)
				4164	{
				4165	struct ceph_mds_session *s = con->private;
				4166	struct ceph_mds_client *mdsc = s->s_mdsc;
				4167	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
				4168
				4169	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
				4170
				4171	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
				4172	}
				4173
				4174	static struct ceph_msg mds_alloc_msg(struct ceph_connection con,
				4175	struct ceph_msg_header hdr, int skip)
				4176	{
				4177	struct ceph_msg *msg;
				4178	int type = (int) le16_to_cpu(hdr->type);
				4179	int front_len = (int) le32_to_cpu(hdr->front_len);
				4180
				4181	if (con->in_msg)
				4182	return con->in_msg;
				4183
				4184	*skip = 0;
				4185	msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
				4186	if (!msg) {
				4187	pr_err("unable to allocate msg type %d len %d\n",
				4188	type, front_len);
				4189	return NULL;
				4190	}
				4191
				4192	return msg;
				4193	}
				4194
				4195	static int mds_sign_message(struct ceph_msg *msg)
				4196	{
				4197	struct ceph_mds_session *s = msg->con->private;
				4198	struct ceph_auth_handshake *auth = &s->s_auth;
				4199
				4200	return ceph_auth_sign_message(auth, msg);
				4201	}
				4202
				4203	static int mds_check_message_signature(struct ceph_msg *msg)
				4204	{
				4205	struct ceph_mds_session *s = msg->con->private;
				4206	struct ceph_auth_handshake *auth = &s->s_auth;
				4207
				4208	return ceph_auth_check_message_signature(auth, msg);
				4209	}
				4210
				4211	static const struct ceph_connection_operations mds_con_ops = {
				4212	.get = con_get,
				4213	.put = con_put,
				4214	.dispatch = dispatch,
				4215	.get_authorizer = get_authorizer,
				4216	.add_authorizer_challenge = add_authorizer_challenge,
				4217	.verify_authorizer_reply = verify_authorizer_reply,
				4218	.invalidate_authorizer = invalidate_authorizer,
				4219	.peer_reset = peer_reset,
				4220	.alloc_msg = mds_alloc_msg,
				4221	.sign_message = mds_sign_message,
				4222	.check_message_signature = mds_check_message_signature,
				4223	};
				4224
				4225	/* eof */