Blame - src/kernel/linux/v4.19/fs/ceph/mds_client.c - T800

blob: 09db6d08614d235797745951980c685e282cdf75 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	#include <linux/ceph/ceph_debug.h>
				3
				4	#include <linux/fs.h>
				5	#include <linux/wait.h>
				6	#include <linux/slab.h>
				7	#include <linux/gfp.h>
				8	#include <linux/sched.h>
				9	#include <linux/debugfs.h>
				10	#include <linux/seq_file.h>
				11	#include <linux/ratelimit.h>
				12
				13	#include "super.h"
				14	#include "mds_client.h"
				15
				16	#include <linux/ceph/ceph_features.h>
				17	#include <linux/ceph/messenger.h>
				18	#include <linux/ceph/decode.h>
				19	#include <linux/ceph/pagelist.h>
				20	#include <linux/ceph/auth.h>
				21	#include <linux/ceph/debugfs.h>
				22
				23	/*
				24	* A cluster of MDS (metadata server) daemons is responsible for
				25	* managing the file system namespace (the directory hierarchy and
				26	* inodes) and for coordinating shared access to storage. Metadata is
				27	* partitioned hierarchically across a number of servers, and that
				28	* partition varies over time as the cluster adjusts the distribution
				29	* in order to balance load.
				30	*
				31	* The MDS client is primarily responsible for managing synchronous
				32	* metadata requests for operations like open, unlink, and so forth.
				33	* If there is a MDS failure, we find out about it when we (possibly
				34	* request and) receive a new MDS map, and can resubmit affected
				35	* requests.
				36	*
				37	* For the most part, though, we take advantage of a lossless
				38	* communications channel to the MDS, and do not need to worry about
				39	* timing out or resubmitting requests.
				40	*
				41	* We maintain a stateful "session" with each MDS we interact with.
				42	* Within each session, we send periodic heartbeat messages to ensure
				43	* any capabilities or leases we have been issued remain valid. If
				44	* the session times out and goes stale, our leases and capabilities
				45	* are no longer valid.
				46	*/
				47
				48	struct ceph_reconnect_state {
				49	int nr_caps;
				50	struct ceph_pagelist *pagelist;
				51	unsigned msg_version;
				52	};
				53
				54	static void __wake_requests(struct ceph_mds_client *mdsc,
				55	struct list_head *head);
				56
				57	static const struct ceph_connection_operations mds_con_ops;
				58
				59
				60	/*
				61	* mds reply parsing
				62	*/
				63
				64	/*
				65	* parse individual inode info
				66	*/
				67	static int parse_reply_info_in(void *p, void end,
				68	struct ceph_mds_reply_info_in *info,
				69	u64 features)
				70	{
				71	int err = -EIO;
				72
				73	info->in = *p;
				74	*p += sizeof(struct ceph_mds_reply_inode) +
				75	sizeof(info->in->fragtree.splits)
				76	le32_to_cpu(info->in->fragtree.nsplits);
				77
				78	ceph_decode_32_safe(p, end, info->symlink_len, bad);
				79	ceph_decode_need(p, end, info->symlink_len, bad);
				80	info->symlink = *p;
				81	*p += info->symlink_len;
				82
				83	if (features & CEPH_FEATURE_DIRLAYOUTHASH)
				84	ceph_decode_copy_safe(p, end, &info->dir_layout,
				85	sizeof(info->dir_layout), bad);
				86	else
				87	memset(&info->dir_layout, 0, sizeof(info->dir_layout));
				88
				89	ceph_decode_32_safe(p, end, info->xattr_len, bad);
				90	ceph_decode_need(p, end, info->xattr_len, bad);
				91	info->xattr_data = *p;
				92	*p += info->xattr_len;
				93
				94	if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
				95	ceph_decode_64_safe(p, end, info->inline_version, bad);
				96	ceph_decode_32_safe(p, end, info->inline_len, bad);
				97	ceph_decode_need(p, end, info->inline_len, bad);
				98	info->inline_data = *p;
				99	*p += info->inline_len;
				100	} else
				101	info->inline_version = CEPH_INLINE_NONE;
				102
				103	if (features & CEPH_FEATURE_MDS_QUOTA) {
				104	u8 struct_v, struct_compat;
				105	u32 struct_len;
				106
				107	/*
				108	* both struct_v and struct_compat are expected to be >= 1
				109	*/
				110	ceph_decode_8_safe(p, end, struct_v, bad);
				111	ceph_decode_8_safe(p, end, struct_compat, bad);
				112	if (!struct_v \|\| !struct_compat)
				113	goto bad;
				114	ceph_decode_32_safe(p, end, struct_len, bad);
				115	ceph_decode_need(p, end, struct_len, bad);
				116	ceph_decode_64_safe(p, end, info->max_bytes, bad);
				117	ceph_decode_64_safe(p, end, info->max_files, bad);
				118	} else {
				119	info->max_bytes = 0;
				120	info->max_files = 0;
				121	}
				122
				123	info->pool_ns_len = 0;
				124	info->pool_ns_data = NULL;
				125	if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
				126	ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
				127	if (info->pool_ns_len > 0) {
				128	ceph_decode_need(p, end, info->pool_ns_len, bad);
				129	info->pool_ns_data = *p;
				130	*p += info->pool_ns_len;
				131	}
				132	}
				133
				134	return 0;
				135	bad:
				136	return err;
				137	}
				138
				139	/*
				140	* parse a normal reply, which may contain a (dir+)dentry and/or a
				141	* target inode.
				142	*/
				143	static int parse_reply_info_trace(void *p, void end,
				144	struct ceph_mds_reply_info_parsed *info,
				145	u64 features)
				146	{
				147	int err;
				148
				149	if (info->head->is_dentry) {
				150	err = parse_reply_info_in(p, end, &info->diri, features);
				151	if (err < 0)
				152	goto out_bad;
				153
				154	if (unlikely(p + sizeof(info->dirfrag) > end))
				155	goto bad;
				156	info->dirfrag = *p;
				157	p += sizeof(info->dirfrag) +
				158	sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
				159	if (unlikely(*p > end))
				160	goto bad;
				161
				162	ceph_decode_32_safe(p, end, info->dname_len, bad);
				163	ceph_decode_need(p, end, info->dname_len, bad);
				164	info->dname = *p;
				165	*p += info->dname_len;
				166	info->dlease = *p;
				167	p += sizeof(info->dlease);
				168	}
				169
				170	if (info->head->is_target) {
				171	err = parse_reply_info_in(p, end, &info->targeti, features);
				172	if (err < 0)
				173	goto out_bad;
				174	}
				175
				176	if (unlikely(*p != end))
				177	goto bad;
				178	return 0;
				179
				180	bad:
				181	err = -EIO;
				182	out_bad:
				183	pr_err("problem parsing mds trace %d\n", err);
				184	return err;
				185	}
				186
				187	/*
				188	* parse readdir results
				189	*/
				190	static int parse_reply_info_dir(void *p, void end,
				191	struct ceph_mds_reply_info_parsed *info,
				192	u64 features)
				193	{
				194	u32 num, i = 0;
				195	int err;
				196
				197	info->dir_dir = *p;
				198	if (p + sizeof(info->dir_dir) > end)
				199	goto bad;
				200	p += sizeof(info->dir_dir) +
				201	sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
				202	if (*p > end)
				203	goto bad;
				204
				205	ceph_decode_need(p, end, sizeof(num) + 2, bad);
				206	num = ceph_decode_32(p);
				207	{
				208	u16 flags = ceph_decode_16(p);
				209	info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
				210	info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
				211	info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
				212	info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
				213	}
				214	if (num == 0)
				215	goto done;
				216
				217	BUG_ON(!info->dir_entries);
				218	if ((unsigned long)(info->dir_entries + num) >
				219	(unsigned long)info->dir_entries + info->dir_buf_size) {
				220	pr_err("dir contents are larger than expected\n");
				221	WARN_ON(1);
				222	goto bad;
				223	}
				224
				225	info->dir_nr = num;
				226	while (num) {
				227	struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
				228	/* dentry */
				229	ceph_decode_need(p, end, sizeof(u32)*2, bad);
				230	rde->name_len = ceph_decode_32(p);
				231	ceph_decode_need(p, end, rde->name_len, bad);
				232	rde->name = *p;
				233	*p += rde->name_len;
				234	dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
				235	rde->lease = *p;
				236	*p += sizeof(struct ceph_mds_reply_lease);
				237
				238	/* inode */
				239	err = parse_reply_info_in(p, end, &rde->inode, features);
				240	if (err < 0)
				241	goto out_bad;
				242	/* ceph_readdir_prepopulate() will update it */
				243	rde->offset = 0;
				244	i++;
				245	num--;
				246	}
				247
				248	done:
				249	if (*p != end)
				250	goto bad;
				251	return 0;
				252
				253	bad:
				254	err = -EIO;
				255	out_bad:
				256	pr_err("problem parsing dir contents %d\n", err);
				257	return err;
				258	}
				259
				260	/*
				261	* parse fcntl F_GETLK results
				262	*/
				263	static int parse_reply_info_filelock(void *p, void end,
				264	struct ceph_mds_reply_info_parsed *info,
				265	u64 features)
				266	{
				267	if (p + sizeof(info->filelock_reply) > end)
				268	goto bad;
				269
				270	info->filelock_reply = *p;
				271	p += sizeof(info->filelock_reply);
				272
				273	if (unlikely(*p != end))
				274	goto bad;
				275	return 0;
				276
				277	bad:
				278	return -EIO;
				279	}
				280
				281	/*
				282	* parse create results
				283	*/
				284	static int parse_reply_info_create(void *p, void end,
				285	struct ceph_mds_reply_info_parsed *info,
				286	u64 features)
				287	{
				288	if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
				289	if (*p == end) {
				290	info->has_create_ino = false;
				291	} else {
				292	info->has_create_ino = true;
				293	info->ino = ceph_decode_64(p);
				294	}
				295	}
				296
				297	if (unlikely(*p != end))
				298	goto bad;
				299	return 0;
				300
				301	bad:
				302	return -EIO;
				303	}
				304
				305	/*
				306	* parse extra results
				307	*/
				308	static int parse_reply_info_extra(void *p, void end,
				309	struct ceph_mds_reply_info_parsed *info,
				310	u64 features)
				311	{
				312	u32 op = le32_to_cpu(info->head->op);
				313
				314	if (op == CEPH_MDS_OP_GETFILELOCK)
				315	return parse_reply_info_filelock(p, end, info, features);
				316	else if (op == CEPH_MDS_OP_READDIR \|\| op == CEPH_MDS_OP_LSSNAP)
				317	return parse_reply_info_dir(p, end, info, features);
				318	else if (op == CEPH_MDS_OP_CREATE)
				319	return parse_reply_info_create(p, end, info, features);
				320	else
				321	return -EIO;
				322	}
				323
				324	/*
				325	* parse entire mds reply
				326	*/
				327	static int parse_reply_info(struct ceph_msg *msg,
				328	struct ceph_mds_reply_info_parsed *info,
				329	u64 features)
				330	{
				331	void p, end;
				332	u32 len;
				333	int err;
				334
				335	info->head = msg->front.iov_base;
				336	p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
				337	end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
				338
				339	/* trace */
				340	ceph_decode_32_safe(&p, end, len, bad);
				341	if (len > 0) {
				342	ceph_decode_need(&p, end, len, bad);
				343	err = parse_reply_info_trace(&p, p+len, info, features);
				344	if (err < 0)
				345	goto out_bad;
				346	}
				347
				348	/* extra */
				349	ceph_decode_32_safe(&p, end, len, bad);
				350	if (len > 0) {
				351	ceph_decode_need(&p, end, len, bad);
				352	err = parse_reply_info_extra(&p, p+len, info, features);
				353	if (err < 0)
				354	goto out_bad;
				355	}
				356
				357	/* snap blob */
				358	ceph_decode_32_safe(&p, end, len, bad);
				359	info->snapblob_len = len;
				360	info->snapblob = p;
				361	p += len;
				362
				363	if (p != end)
				364	goto bad;
				365	return 0;
				366
				367	bad:
				368	err = -EIO;
				369	out_bad:
				370	pr_err("mds parse_reply err %d\n", err);
				371	return err;
				372	}
				373
				374	static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
				375	{
				376	if (!info->dir_entries)
				377	return;
				378	free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
				379	}
				380
				381
				382	/*
				383	* sessions
				384	*/
				385	const char *ceph_session_state_name(int s)
				386	{
				387	switch (s) {
				388	case CEPH_MDS_SESSION_NEW: return "new";
				389	case CEPH_MDS_SESSION_OPENING: return "opening";
				390	case CEPH_MDS_SESSION_OPEN: return "open";
				391	case CEPH_MDS_SESSION_HUNG: return "hung";
				392	case CEPH_MDS_SESSION_CLOSING: return "closing";
				393	case CEPH_MDS_SESSION_RESTARTING: return "restarting";
				394	case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
				395	case CEPH_MDS_SESSION_REJECTED: return "rejected";
				396	default: return "???";
				397	}
				398	}
				399
				400	static struct ceph_mds_session get_session(struct ceph_mds_session s)
				401	{
				402	if (refcount_inc_not_zero(&s->s_ref)) {
				403	dout("mdsc get_session %p %d -> %d\n", s,
				404	refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
				405	return s;
				406	} else {
				407	dout("mdsc get_session %p 0 -- FAIL\n", s);
				408	return NULL;
				409	}
				410	}
				411
				412	void ceph_put_mds_session(struct ceph_mds_session *s)
				413	{
				414	dout("mdsc put_session %p %d -> %d\n", s,
				415	refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
				416	if (refcount_dec_and_test(&s->s_ref)) {
				417	if (s->s_auth.authorizer)
				418	ceph_auth_destroy_authorizer(s->s_auth.authorizer);
				419	kfree(s);
				420	}
				421	}
				422
				423	/*
				424	* called under mdsc->mutex
				425	*/
				426	struct ceph_mds_session __ceph_lookup_mds_session(struct ceph_mds_client mdsc,
				427	int mds)
				428	{
				429	struct ceph_mds_session *session;
				430
				431	if (mds >= mdsc->max_sessions \|\| !mdsc->sessions[mds])
				432	return NULL;
				433	session = mdsc->sessions[mds];
				434	dout("lookup_mds_session %p %d\n", session,
				435	refcount_read(&session->s_ref));
				436	get_session(session);
				437	return session;
				438	}
				439
				440	static bool __have_session(struct ceph_mds_client *mdsc, int mds)
				441	{
				442	if (mds >= mdsc->max_sessions \|\| !mdsc->sessions[mds])
				443	return false;
				444	else
				445	return true;
				446	}
				447
				448	static int __verify_registered_session(struct ceph_mds_client *mdsc,
				449	struct ceph_mds_session *s)
				450	{
				451	if (s->s_mds >= mdsc->max_sessions \|\|
				452	mdsc->sessions[s->s_mds] != s)
				453	return -ENOENT;
				454	return 0;
				455	}
				456
				457	/*
				458	* create+register a new session for given mds.
				459	* called under mdsc->mutex.
				460	*/
				461	static struct ceph_mds_session register_session(struct ceph_mds_client mdsc,
				462	int mds)
				463	{
				464	struct ceph_mds_session *s;
				465
				466	if (mds >= mdsc->mdsmap->m_num_mds)
				467	return ERR_PTR(-EINVAL);
				468
				469	s = kzalloc(sizeof(*s), GFP_NOFS);
				470	if (!s)
				471	return ERR_PTR(-ENOMEM);
				472
				473	if (mds >= mdsc->max_sessions) {
				474	int newmax = 1 << get_count_order(mds + 1);
				475	struct ceph_mds_session **sa;
				476
				477	dout("%s: realloc to %d\n", __func__, newmax);
				478	sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
				479	if (!sa)
				480	goto fail_realloc;
				481	if (mdsc->sessions) {
				482	memcpy(sa, mdsc->sessions,
				483	mdsc->max_sessions * sizeof(void *));
				484	kfree(mdsc->sessions);
				485	}
				486	mdsc->sessions = sa;
				487	mdsc->max_sessions = newmax;
				488	}
				489
				490	dout("%s: mds%d\n", __func__, mds);
				491	s->s_mdsc = mdsc;
				492	s->s_mds = mds;
				493	s->s_state = CEPH_MDS_SESSION_NEW;
				494	s->s_ttl = 0;
				495	s->s_seq = 0;
				496	mutex_init(&s->s_mutex);
				497
				498	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
				499
				500	spin_lock_init(&s->s_gen_ttl_lock);
				501	s->s_cap_gen = 0;
				502	s->s_cap_ttl = jiffies - 1;
				503
				504	spin_lock_init(&s->s_cap_lock);
				505	s->s_renew_requested = 0;
				506	s->s_renew_seq = 0;
				507	INIT_LIST_HEAD(&s->s_caps);
				508	s->s_nr_caps = 0;
				509	s->s_trim_caps = 0;
				510	refcount_set(&s->s_ref, 1);
				511	INIT_LIST_HEAD(&s->s_waiting);
				512	INIT_LIST_HEAD(&s->s_unsafe);
				513	s->s_num_cap_releases = 0;
				514	s->s_cap_reconnect = 0;
				515	s->s_cap_iterator = NULL;
				516	INIT_LIST_HEAD(&s->s_cap_releases);
				517	INIT_LIST_HEAD(&s->s_cap_flushing);
				518
				519	mdsc->sessions[mds] = s;
				520	atomic_inc(&mdsc->num_sessions);
				521	refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */
				522
				523	ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
				524	ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
				525
				526	return s;
				527
				528	fail_realloc:
				529	kfree(s);
				530	return ERR_PTR(-ENOMEM);
				531	}
				532
				533	/*
				534	* called under mdsc->mutex
				535	*/
				536	static void __unregister_session(struct ceph_mds_client *mdsc,
				537	struct ceph_mds_session *s)
				538	{
				539	dout("__unregister_session mds%d %p\n", s->s_mds, s);
				540	BUG_ON(mdsc->sessions[s->s_mds] != s);
				541	mdsc->sessions[s->s_mds] = NULL;
				542	ceph_con_close(&s->s_con);
				543	ceph_put_mds_session(s);
				544	atomic_dec(&mdsc->num_sessions);
				545	}
				546
				547	/*
				548	* drop session refs in request.
				549	*
				550	* should be last request ref, or hold mdsc->mutex
				551	*/
				552	static void put_request_session(struct ceph_mds_request *req)
				553	{
				554	if (req->r_session) {
				555	ceph_put_mds_session(req->r_session);
				556	req->r_session = NULL;
				557	}
				558	}
				559
				560	void ceph_mdsc_release_request(struct kref *kref)
				561	{
				562	struct ceph_mds_request *req = container_of(kref,
				563	struct ceph_mds_request,
				564	r_kref);
				565	destroy_reply_info(&req->r_reply_info);
				566	if (req->r_request)
				567	ceph_msg_put(req->r_request);
				568	if (req->r_reply)
				569	ceph_msg_put(req->r_reply);
				570	if (req->r_inode) {
				571	ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
				572	iput(req->r_inode);
				573	}
				574	if (req->r_parent)
				575	ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
				576	iput(req->r_target_inode);
				577	if (req->r_dentry)
				578	dput(req->r_dentry);
				579	if (req->r_old_dentry)
				580	dput(req->r_old_dentry);
				581	if (req->r_old_dentry_dir) {
				582	/*
				583	* track (and drop pins for) r_old_dentry_dir
				584	* separately, since r_old_dentry's d_parent may have
				585	* changed between the dir mutex being dropped and
				586	* this request being freed.
				587	*/
				588	ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
				589	CEPH_CAP_PIN);
				590	iput(req->r_old_dentry_dir);
				591	}
				592	kfree(req->r_path1);
				593	kfree(req->r_path2);
				594	if (req->r_pagelist)
				595	ceph_pagelist_release(req->r_pagelist);
				596	put_request_session(req);
				597	ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
				598	kfree(req);
				599	}
				600
				601	DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
				602
				603	/*
				604	* lookup session, bump ref if found.
				605	*
				606	* called under mdsc->mutex.
				607	*/
				608	static struct ceph_mds_request *
				609	lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
				610	{
				611	struct ceph_mds_request *req;
				612
				613	req = lookup_request(&mdsc->request_tree, tid);
				614	if (req)
				615	ceph_mdsc_get_request(req);
				616
				617	return req;
				618	}
				619
				620	/*
				621	* Register an in-flight request, and assign a tid. Link to directory
				622	* are modifying (if any).
				623	*
				624	* Called under mdsc->mutex.
				625	*/
				626	static void __register_request(struct ceph_mds_client *mdsc,
				627	struct ceph_mds_request *req,
				628	struct inode *dir)
				629	{
				630	int ret = 0;
				631
				632	req->r_tid = ++mdsc->last_tid;
				633	if (req->r_num_caps) {
				634	ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,
				635	req->r_num_caps);
				636	if (ret < 0) {
				637	pr_err("__register_request %p "
				638	"failed to reserve caps: %d\n", req, ret);
				639	/* set req->r_err to fail early from __do_request */
				640	req->r_err = ret;
				641	return;
				642	}
				643	}
				644	dout("__register_request %p tid %lld\n", req, req->r_tid);
				645	ceph_mdsc_get_request(req);
				646	insert_request(&mdsc->request_tree, req);
				647
				648	req->r_uid = current_fsuid();
				649	req->r_gid = current_fsgid();
				650
				651	if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
				652	mdsc->oldest_tid = req->r_tid;
				653
				654	if (dir) {
				655	ihold(dir);
				656	req->r_unsafe_dir = dir;
				657	}
				658	}
				659
				660	static void __unregister_request(struct ceph_mds_client *mdsc,
				661	struct ceph_mds_request *req)
				662	{
				663	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
				664
				665	/* Never leave an unregistered request on an unsafe list! */
				666	list_del_init(&req->r_unsafe_item);
				667
				668	if (req->r_tid == mdsc->oldest_tid) {
				669	struct rb_node *p = rb_next(&req->r_node);
				670	mdsc->oldest_tid = 0;
				671	while (p) {
				672	struct ceph_mds_request *next_req =
				673	rb_entry(p, struct ceph_mds_request, r_node);
				674	if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
				675	mdsc->oldest_tid = next_req->r_tid;
				676	break;
				677	}
				678	p = rb_next(p);
				679	}
				680	}
				681
				682	erase_request(&mdsc->request_tree, req);
				683
				684	if (req->r_unsafe_dir &&
				685	test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
				686	struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
				687	spin_lock(&ci->i_unsafe_lock);
				688	list_del_init(&req->r_unsafe_dir_item);
				689	spin_unlock(&ci->i_unsafe_lock);
				690	}
				691	if (req->r_target_inode &&
				692	test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
				693	struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
				694	spin_lock(&ci->i_unsafe_lock);
				695	list_del_init(&req->r_unsafe_target_item);
				696	spin_unlock(&ci->i_unsafe_lock);
				697	}
				698
				699	if (req->r_unsafe_dir) {
				700	iput(req->r_unsafe_dir);
				701	req->r_unsafe_dir = NULL;
				702	}
				703
				704	complete_all(&req->r_safe_completion);
				705
				706	ceph_mdsc_put_request(req);
				707	}
				708
				709	/*
				710	* Walk back up the dentry tree until we hit a dentry representing a
				711	* non-snapshot inode. We do this using the rcu_read_lock (which must be held
				712	* when calling this) to ensure that the objects won't disappear while we're
				713	* working with them. Once we hit a candidate dentry, we attempt to take a
				714	* reference to it, and return that as the result.
				715	*/
				716	static struct inode get_nonsnap_parent(struct dentry dentry)
				717	{
				718	struct inode *inode = NULL;
				719
				720	while (dentry && !IS_ROOT(dentry)) {
				721	inode = d_inode_rcu(dentry);
				722	if (!inode \|\| ceph_snap(inode) == CEPH_NOSNAP)
				723	break;
				724	dentry = dentry->d_parent;
				725	}
				726	if (inode)
				727	inode = igrab(inode);
				728	return inode;
				729	}
				730
				731	/*
				732	* Choose mds to send request to next. If there is a hint set in the
				733	* request (e.g., due to a prior forward hint from the mds), use that.
				734	* Otherwise, consult frag tree and/or caps to identify the
				735	* appropriate mds. If all else fails, choose randomly.
				736	*
				737	* Called under mdsc->mutex.
				738	*/
				739	static int __choose_mds(struct ceph_mds_client *mdsc,
				740	struct ceph_mds_request *req)
				741	{
				742	struct inode *inode;
				743	struct ceph_inode_info *ci;
				744	struct ceph_cap *cap;
				745	int mode = req->r_direct_mode;
				746	int mds = -1;
				747	u32 hash = req->r_direct_hash;
				748	bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
				749
				750	/*
				751	* is there a specific mds we should try? ignore hint if we have
				752	* no session and the mds is not up (active or recovering).
				753	*/
				754	if (req->r_resend_mds >= 0 &&
				755	(__have_session(mdsc, req->r_resend_mds) \|\|
				756	ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
				757	dout("choose_mds using resend_mds mds%d\n",
				758	req->r_resend_mds);
				759	return req->r_resend_mds;
				760	}
				761
				762	if (mode == USE_RANDOM_MDS)
				763	goto random;
				764
				765	inode = NULL;
				766	if (req->r_inode) {
				767	if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) {
				768	inode = req->r_inode;
				769	ihold(inode);
				770	} else {
				771	/* req->r_dentry is non-null for LSSNAP request */
				772	rcu_read_lock();
				773	inode = get_nonsnap_parent(req->r_dentry);
				774	rcu_read_unlock();
				775	dout("__choose_mds using snapdir's parent %p\n", inode);
				776	}
				777	} else if (req->r_dentry) {
				778	/* ignore race with rename; old or new d_parent is okay */
				779	struct dentry *parent;
				780	struct inode *dir;
				781
				782	rcu_read_lock();
				783	parent = req->r_dentry->d_parent;
				784	dir = req->r_parent ? : d_inode_rcu(parent);
				785
				786	if (!dir \|\| dir->i_sb != mdsc->fsc->sb) {
				787	/* not this fs or parent went negative */
				788	inode = d_inode(req->r_dentry);
				789	if (inode)
				790	ihold(inode);
				791	} else if (ceph_snap(dir) != CEPH_NOSNAP) {
				792	/* direct snapped/virtual snapdir requests
				793	* based on parent dir inode */
				794	inode = get_nonsnap_parent(parent);
				795	dout("__choose_mds using nonsnap parent %p\n", inode);
				796	} else {
				797	/* dentry target */
				798	inode = d_inode(req->r_dentry);
				799	if (!inode \|\| mode == USE_AUTH_MDS) {
				800	/* dir + name */
				801	inode = igrab(dir);
				802	hash = ceph_dentry_hash(dir, req->r_dentry);
				803	is_hash = true;
				804	} else {
				805	ihold(inode);
				806	}
				807	}
				808	rcu_read_unlock();
				809	}
				810
				811	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
				812	(int)hash, mode);
				813	if (!inode)
				814	goto random;
				815	ci = ceph_inode(inode);
				816
				817	if (is_hash && S_ISDIR(inode->i_mode)) {
				818	struct ceph_inode_frag frag;
				819	int found;
				820
				821	ceph_choose_frag(ci, hash, &frag, &found);
				822	if (found) {
				823	if (mode == USE_ANY_MDS && frag.ndist > 0) {
				824	u8 r;
				825
				826	/* choose a random replica */
				827	get_random_bytes(&r, 1);
				828	r %= frag.ndist;
				829	mds = frag.dist[r];
				830	dout("choose_mds %p %llx.%llx "
				831	"frag %u mds%d (%d/%d)\n",
				832	inode, ceph_vinop(inode),
				833	frag.frag, mds,
				834	(int)r, frag.ndist);
				835	if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
				836	CEPH_MDS_STATE_ACTIVE)
				837	goto out;
				838	}
				839
				840	/* since this file/dir wasn't known to be
				841	* replicated, then we want to look for the
				842	* authoritative mds. */
				843	mode = USE_AUTH_MDS;
				844	if (frag.mds >= 0) {
				845	/* choose auth mds */
				846	mds = frag.mds;
				847	dout("choose_mds %p %llx.%llx "
				848	"frag %u mds%d (auth)\n",
				849	inode, ceph_vinop(inode), frag.frag, mds);
				850	if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
				851	CEPH_MDS_STATE_ACTIVE)
				852	goto out;
				853	}
				854	}
				855	}
				856
				857	spin_lock(&ci->i_ceph_lock);
				858	cap = NULL;
				859	if (mode == USE_AUTH_MDS)
				860	cap = ci->i_auth_cap;
				861	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
				862	cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
				863	if (!cap) {
				864	spin_unlock(&ci->i_ceph_lock);
				865	iput(inode);
				866	goto random;
				867	}
				868	mds = cap->session->s_mds;
				869	dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
				870	inode, ceph_vinop(inode), mds,
				871	cap == ci->i_auth_cap ? "auth " : "", cap);
				872	spin_unlock(&ci->i_ceph_lock);
				873	out:
				874	iput(inode);
				875	return mds;
				876
				877	random:
				878	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
				879	dout("choose_mds chose random mds%d\n", mds);
				880	return mds;
				881	}
				882
				883
				884	/*
				885	* session messages
				886	*/
				887	static struct ceph_msg *create_session_msg(u32 op, u64 seq)
				888	{
				889	struct ceph_msg *msg;
				890	struct ceph_mds_session_head *h;
				891
				892	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
				893	false);
				894	if (!msg) {
				895	pr_err("create_session_msg ENOMEM creating msg\n");
				896	return NULL;
				897	}
				898	h = msg->front.iov_base;
				899	h->op = cpu_to_le32(op);
				900	h->seq = cpu_to_le64(seq);
				901
				902	return msg;
				903	}
				904
				905	static void encode_supported_features(void *p, void end)
				906	{
				907	static const unsigned char bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
				908	static const size_t count = ARRAY_SIZE(bits);
				909
				910	if (count > 0) {
				911	size_t i;
				912	size_t size = ((size_t)bits[count - 1] + 64) / 64 * 8;
				913
				914	BUG_ON(*p + 4 + size > end);
				915	ceph_encode_32(p, size);
				916	memset(*p, 0, size);
				917	for (i = 0; i < count; i++)
				918	((unsigned char)(p))[i / 8] \|= 1 << (bits[i] % 8);
				919	*p += size;
				920	} else {
				921	BUG_ON(*p + 4 > end);
				922	ceph_encode_32(p, 0);
				923	}
				924	}
				925
				926	/*
				927	* session message, specialization for CEPH_SESSION_REQUEST_OPEN
				928	* to include additional client metadata fields.
				929	*/
				930	static struct ceph_msg create_session_open_msg(struct ceph_mds_client mdsc, u64 seq)
				931	{
				932	struct ceph_msg *msg;
				933	struct ceph_mds_session_head *h;
				934	int i = -1;
				935	int extra_bytes = 0;
				936	int metadata_key_count = 0;
				937	struct ceph_options *opt = mdsc->fsc->client->options;
				938	struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
				939	void p, end;
				940
				941	const char* metadata[][2] = {
				942	{"hostname", mdsc->nodename},
				943	{"kernel_version", init_utsname()->release},
				944	{"entity_id", opt->name ? : ""},
				945	{"root", fsopt->server_path ? : "/"},
				946	{NULL, NULL}
				947	};
				948
				949	/* Calculate serialized length of metadata */
				950	extra_bytes = 4; /* map length */
				951	for (i = 0; metadata[i][0]; ++i) {
				952	extra_bytes += 8 + strlen(metadata[i][0]) +
				953	strlen(metadata[i][1]);
				954	metadata_key_count++;
				955	}
				956	/* supported feature */
				957	extra_bytes += 4 + 8;
				958
				959	/* Allocate the message */
				960	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
				961	GFP_NOFS, false);
				962	if (!msg) {
				963	pr_err("create_session_msg ENOMEM creating msg\n");
				964	return NULL;
				965	}
				966	p = msg->front.iov_base;
				967	end = p + msg->front.iov_len;
				968
				969	h = p;
				970	h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
				971	h->seq = cpu_to_le64(seq);
				972
				973	/*
				974	* Serialize client metadata into waiting buffer space, using
				975	* the format that userspace expects for map<string, string>
				976	*
				977	* ClientSession messages with metadata are v2
				978	*/
				979	msg->hdr.version = cpu_to_le16(3);
				980	msg->hdr.compat_version = cpu_to_le16(1);
				981
				982	/* The write pointer, following the session_head structure */
				983	p += sizeof(*h);
				984
				985	/* Number of entries in the map */
				986	ceph_encode_32(&p, metadata_key_count);
				987
				988	/* Two length-prefixed strings for each entry in the map */
				989	for (i = 0; metadata[i][0]; ++i) {
				990	size_t const key_len = strlen(metadata[i][0]);
				991	size_t const val_len = strlen(metadata[i][1]);
				992
				993	ceph_encode_32(&p, key_len);
				994	memcpy(p, metadata[i][0], key_len);
				995	p += key_len;
				996	ceph_encode_32(&p, val_len);
				997	memcpy(p, metadata[i][1], val_len);
				998	p += val_len;
				999	}
				1000
				1001	encode_supported_features(&p, end);
				1002	msg->front.iov_len = p - msg->front.iov_base;
				1003	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				1004
				1005	return msg;
				1006	}
				1007
				1008	/*
				1009	* send session open request.
				1010	*
				1011	* called under mdsc->mutex
				1012	*/
				1013	static int __open_session(struct ceph_mds_client *mdsc,
				1014	struct ceph_mds_session *session)
				1015	{
				1016	struct ceph_msg *msg;
				1017	int mstate;
				1018	int mds = session->s_mds;
				1019
				1020	/* wait for mds to go active? */
				1021	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
				1022	dout("open_session to mds%d (%s)\n", mds,
				1023	ceph_mds_state_name(mstate));
				1024	session->s_state = CEPH_MDS_SESSION_OPENING;
				1025	session->s_renew_requested = jiffies;
				1026
				1027	/* send connect message */
				1028	msg = create_session_open_msg(mdsc, session->s_seq);
				1029	if (!msg)
				1030	return -ENOMEM;
				1031	ceph_con_send(&session->s_con, msg);
				1032	return 0;
				1033	}
				1034
				1035	/*
				1036	* open sessions for any export targets for the given mds
				1037	*
				1038	* called under mdsc->mutex
				1039	*/
				1040	static struct ceph_mds_session *
				1041	__open_export_target_session(struct ceph_mds_client *mdsc, int target)
				1042	{
				1043	struct ceph_mds_session *session;
				1044
				1045	session = __ceph_lookup_mds_session(mdsc, target);
				1046	if (!session) {
				1047	session = register_session(mdsc, target);
				1048	if (IS_ERR(session))
				1049	return session;
				1050	}
				1051	if (session->s_state == CEPH_MDS_SESSION_NEW \|\|
				1052	session->s_state == CEPH_MDS_SESSION_CLOSING)
				1053	__open_session(mdsc, session);
				1054
				1055	return session;
				1056	}
				1057
				1058	struct ceph_mds_session *
				1059	ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
				1060	{
				1061	struct ceph_mds_session *session;
				1062
				1063	dout("open_export_target_session to mds%d\n", target);
				1064
				1065	mutex_lock(&mdsc->mutex);
				1066	session = __open_export_target_session(mdsc, target);
				1067	mutex_unlock(&mdsc->mutex);
				1068
				1069	return session;
				1070	}
				1071
				1072	static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
				1073	struct ceph_mds_session *session)
				1074	{
				1075	struct ceph_mds_info *mi;
				1076	struct ceph_mds_session *ts;
				1077	int i, mds = session->s_mds;
				1078
				1079	if (mds >= mdsc->mdsmap->m_num_mds)
				1080	return;
				1081
				1082	mi = &mdsc->mdsmap->m_info[mds];
				1083	dout("open_export_target_sessions for mds%d (%d targets)\n",
				1084	session->s_mds, mi->num_export_targets);
				1085
				1086	for (i = 0; i < mi->num_export_targets; i++) {
				1087	ts = __open_export_target_session(mdsc, mi->export_targets[i]);
				1088	if (!IS_ERR(ts))
				1089	ceph_put_mds_session(ts);
				1090	}
				1091	}
				1092
				1093	void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
				1094	struct ceph_mds_session *session)
				1095	{
				1096	mutex_lock(&mdsc->mutex);
				1097	__open_export_target_sessions(mdsc, session);
				1098	mutex_unlock(&mdsc->mutex);
				1099	}
				1100
				1101	/*
				1102	* session caps
				1103	*/
				1104
				1105	static void detach_cap_releases(struct ceph_mds_session *session,
				1106	struct list_head *target)
				1107	{
				1108	lockdep_assert_held(&session->s_cap_lock);
				1109
				1110	list_splice_init(&session->s_cap_releases, target);
				1111	session->s_num_cap_releases = 0;
				1112	dout("dispose_cap_releases mds%d\n", session->s_mds);
				1113	}
				1114
				1115	static void dispose_cap_releases(struct ceph_mds_client *mdsc,
				1116	struct list_head *dispose)
				1117	{
				1118	while (!list_empty(dispose)) {
				1119	struct ceph_cap *cap;
				1120	/* zero out the in-progress message */
				1121	cap = list_first_entry(dispose, struct ceph_cap, session_caps);
				1122	list_del(&cap->session_caps);
				1123	ceph_put_cap(mdsc, cap);
				1124	}
				1125	}
				1126
				1127	static void cleanup_session_requests(struct ceph_mds_client *mdsc,
				1128	struct ceph_mds_session *session)
				1129	{
				1130	struct ceph_mds_request *req;
				1131	struct rb_node *p;
				1132
				1133	dout("cleanup_session_requests mds%d\n", session->s_mds);
				1134	mutex_lock(&mdsc->mutex);
				1135	while (!list_empty(&session->s_unsafe)) {
				1136	req = list_first_entry(&session->s_unsafe,
				1137	struct ceph_mds_request, r_unsafe_item);
				1138	pr_warn_ratelimited(" dropping unsafe request %llu\n",
				1139	req->r_tid);
				1140	__unregister_request(mdsc, req);
				1141	}
				1142	/* zero r_attempts, so kick_requests() will re-send requests */
				1143	p = rb_first(&mdsc->request_tree);
				1144	while (p) {
				1145	req = rb_entry(p, struct ceph_mds_request, r_node);
				1146	p = rb_next(p);
				1147	if (req->r_session &&
				1148	req->r_session->s_mds == session->s_mds)
				1149	req->r_attempts = 0;
				1150	}
				1151	mutex_unlock(&mdsc->mutex);
				1152	}
				1153
				1154	/*
				1155	* Helper to safely iterate over all caps associated with a session, with
				1156	* special care taken to handle a racing __ceph_remove_cap().
				1157	*
				1158	* Caller must hold session s_mutex.
				1159	*/
				1160	static int iterate_session_caps(struct ceph_mds_session *session,
				1161	int (cb)(struct inode , struct ceph_cap *,
				1162	void ), void arg)
				1163	{
				1164	struct list_head *p;
				1165	struct ceph_cap *cap;
				1166	struct inode inode, last_inode = NULL;
				1167	struct ceph_cap *old_cap = NULL;
				1168	int ret;
				1169
				1170	dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
				1171	spin_lock(&session->s_cap_lock);
				1172	p = session->s_caps.next;
				1173	while (p != &session->s_caps) {
				1174	cap = list_entry(p, struct ceph_cap, session_caps);
				1175	inode = igrab(&cap->ci->vfs_inode);
				1176	if (!inode) {
				1177	p = p->next;
				1178	continue;
				1179	}
				1180	session->s_cap_iterator = cap;
				1181	spin_unlock(&session->s_cap_lock);
				1182
				1183	if (last_inode) {
				1184	iput(last_inode);
				1185	last_inode = NULL;
				1186	}
				1187	if (old_cap) {
				1188	ceph_put_cap(session->s_mdsc, old_cap);
				1189	old_cap = NULL;
				1190	}
				1191
				1192	ret = cb(inode, cap, arg);
				1193	last_inode = inode;
				1194
				1195	spin_lock(&session->s_cap_lock);
				1196	p = p->next;
				1197	if (!cap->ci) {
				1198	dout("iterate_session_caps finishing cap %p removal\n",
				1199	cap);
				1200	BUG_ON(cap->session != session);
				1201	cap->session = NULL;
				1202	list_del_init(&cap->session_caps);
				1203	session->s_nr_caps--;
				1204	if (cap->queue_release) {
				1205	list_add_tail(&cap->session_caps,
				1206	&session->s_cap_releases);
				1207	session->s_num_cap_releases++;
				1208	} else {
				1209	old_cap = cap; /* put_cap it w/o locks held */
				1210	}
				1211	}
				1212	if (ret < 0)
				1213	goto out;
				1214	}
				1215	ret = 0;
				1216	out:
				1217	session->s_cap_iterator = NULL;
				1218	spin_unlock(&session->s_cap_lock);
				1219
				1220	iput(last_inode);
				1221	if (old_cap)
				1222	ceph_put_cap(session->s_mdsc, old_cap);
				1223
				1224	return ret;
				1225	}
				1226
				1227	static int remove_session_caps_cb(struct inode inode, struct ceph_cap cap,
				1228	void *arg)
				1229	{
				1230	struct ceph_fs_client fsc = (struct ceph_fs_client )arg;
				1231	struct ceph_inode_info *ci = ceph_inode(inode);
				1232	LIST_HEAD(to_remove);
				1233	bool drop = false;
				1234	bool invalidate = false;
				1235
				1236	dout("removing cap %p, ci is %p, inode is %p\n",
				1237	cap, ci, &ci->vfs_inode);
				1238	spin_lock(&ci->i_ceph_lock);
				1239	__ceph_remove_cap(cap, false);
				1240	if (!ci->i_auth_cap) {
				1241	struct ceph_cap_flush *cf;
				1242	struct ceph_mds_client *mdsc = fsc->mdsc;
				1243
				1244	ci->i_ceph_flags \|= CEPH_I_CAP_DROPPED;
				1245
				1246	if (ci->i_wrbuffer_ref > 0 &&
				1247	READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
				1248	invalidate = true;
				1249
				1250	while (!list_empty(&ci->i_cap_flush_list)) {
				1251	cf = list_first_entry(&ci->i_cap_flush_list,
				1252	struct ceph_cap_flush, i_list);
				1253	list_move(&cf->i_list, &to_remove);
				1254	}
				1255
				1256	spin_lock(&mdsc->cap_dirty_lock);
				1257
				1258	list_for_each_entry(cf, &to_remove, i_list)
				1259	list_del(&cf->g_list);
				1260
				1261	if (!list_empty(&ci->i_dirty_item)) {
				1262	pr_warn_ratelimited(
				1263	" dropping dirty %s state for %p %lld\n",
				1264	ceph_cap_string(ci->i_dirty_caps),
				1265	inode, ceph_ino(inode));
				1266	ci->i_dirty_caps = 0;
				1267	list_del_init(&ci->i_dirty_item);
				1268	drop = true;
				1269	}
				1270	if (!list_empty(&ci->i_flushing_item)) {
				1271	pr_warn_ratelimited(
				1272	" dropping dirty+flushing %s state for %p %lld\n",
				1273	ceph_cap_string(ci->i_flushing_caps),
				1274	inode, ceph_ino(inode));
				1275	ci->i_flushing_caps = 0;
				1276	list_del_init(&ci->i_flushing_item);
				1277	mdsc->num_cap_flushing--;
				1278	drop = true;
				1279	}
				1280	spin_unlock(&mdsc->cap_dirty_lock);
				1281
				1282	if (atomic_read(&ci->i_filelock_ref) > 0) {
				1283	/* make further file lock syscall return -EIO */
				1284	ci->i_ceph_flags \|= CEPH_I_ERROR_FILELOCK;
				1285	pr_warn_ratelimited(" dropping file locks for %p %lld\n",
				1286	inode, ceph_ino(inode));
				1287	}
				1288
				1289	if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
				1290	list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
				1291	ci->i_prealloc_cap_flush = NULL;
				1292	}
				1293
				1294	if (drop &&
				1295	ci->i_wrbuffer_ref_head == 0 &&
				1296	ci->i_wr_ref == 0 &&
				1297	ci->i_dirty_caps == 0 &&
				1298	ci->i_flushing_caps == 0) {
				1299	ceph_put_snap_context(ci->i_head_snapc);
				1300	ci->i_head_snapc = NULL;
				1301	}
				1302	}
				1303	spin_unlock(&ci->i_ceph_lock);
				1304	while (!list_empty(&to_remove)) {
				1305	struct ceph_cap_flush *cf;
				1306	cf = list_first_entry(&to_remove,
				1307	struct ceph_cap_flush, i_list);
				1308	list_del(&cf->i_list);
				1309	ceph_free_cap_flush(cf);
				1310	}
				1311
				1312	wake_up_all(&ci->i_cap_wq);
				1313	if (invalidate)
				1314	ceph_queue_invalidate(inode);
				1315	if (drop)
				1316	iput(inode);
				1317	return 0;
				1318	}
				1319
				1320	/*
				1321	* caller must hold session s_mutex
				1322	*/
				1323	static void remove_session_caps(struct ceph_mds_session *session)
				1324	{
				1325	struct ceph_fs_client *fsc = session->s_mdsc->fsc;
				1326	struct super_block *sb = fsc->sb;
				1327	LIST_HEAD(dispose);
				1328
				1329	dout("remove_session_caps on %p\n", session);
				1330	iterate_session_caps(session, remove_session_caps_cb, fsc);
				1331
				1332	wake_up_all(&fsc->mdsc->cap_flushing_wq);
				1333
				1334	spin_lock(&session->s_cap_lock);
				1335	if (session->s_nr_caps > 0) {
				1336	struct inode *inode;
				1337	struct ceph_cap cap, prev = NULL;
				1338	struct ceph_vino vino;
				1339	/*
				1340	* iterate_session_caps() skips inodes that are being
				1341	* deleted, we need to wait until deletions are complete.
				1342	* __wait_on_freeing_inode() is designed for the job,
				1343	* but it is not exported, so use lookup inode function
				1344	* to access it.
				1345	*/
				1346	while (!list_empty(&session->s_caps)) {
				1347	cap = list_entry(session->s_caps.next,
				1348	struct ceph_cap, session_caps);
				1349	if (cap == prev)
				1350	break;
				1351	prev = cap;
				1352	vino = cap->ci->i_vino;
				1353	spin_unlock(&session->s_cap_lock);
				1354
				1355	inode = ceph_find_inode(sb, vino);
				1356	iput(inode);
				1357
				1358	spin_lock(&session->s_cap_lock);
				1359	}
				1360	}
				1361
				1362	// drop cap expires and unlock s_cap_lock
				1363	detach_cap_releases(session, &dispose);
				1364
				1365	BUG_ON(session->s_nr_caps > 0);
				1366	BUG_ON(!list_empty(&session->s_cap_flushing));
				1367	spin_unlock(&session->s_cap_lock);
				1368	dispose_cap_releases(session->s_mdsc, &dispose);
				1369	}
				1370
				1371	/*
				1372	* wake up any threads waiting on this session's caps. if the cap is
				1373	* old (didn't get renewed on the client reconnect), remove it now.
				1374	*
				1375	* caller must hold s_mutex.
				1376	*/
				1377	static int wake_up_session_cb(struct inode inode, struct ceph_cap cap,
				1378	void *arg)
				1379	{
				1380	struct ceph_inode_info *ci = ceph_inode(inode);
				1381
				1382	if (arg) {
				1383	spin_lock(&ci->i_ceph_lock);
				1384	ci->i_wanted_max_size = 0;
				1385	ci->i_requested_max_size = 0;
				1386	spin_unlock(&ci->i_ceph_lock);
				1387	}
				1388	wake_up_all(&ci->i_cap_wq);
				1389	return 0;
				1390	}
				1391
				1392	static void wake_up_session_caps(struct ceph_mds_session *session,
				1393	int reconnect)
				1394	{
				1395	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
				1396	iterate_session_caps(session, wake_up_session_cb,
				1397	(void *)(unsigned long)reconnect);
				1398	}
				1399
				1400	/*
				1401	* Send periodic message to MDS renewing all currently held caps. The
				1402	* ack will reset the expiration for all caps from this session.
				1403	*
				1404	* caller holds s_mutex
				1405	*/
				1406	static int send_renew_caps(struct ceph_mds_client *mdsc,
				1407	struct ceph_mds_session *session)
				1408	{
				1409	struct ceph_msg *msg;
				1410	int state;
				1411
				1412	if (time_after_eq(jiffies, session->s_cap_ttl) &&
				1413	time_after_eq(session->s_cap_ttl, session->s_renew_requested))
				1414	pr_info("mds%d caps stale\n", session->s_mds);
				1415	session->s_renew_requested = jiffies;
				1416
				1417	/* do not try to renew caps until a recovering mds has reconnected
				1418	* with its clients. */
				1419	state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
				1420	if (state < CEPH_MDS_STATE_RECONNECT) {
				1421	dout("send_renew_caps ignoring mds%d (%s)\n",
				1422	session->s_mds, ceph_mds_state_name(state));
				1423	return 0;
				1424	}
				1425
				1426	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
				1427	ceph_mds_state_name(state));
				1428	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
				1429	++session->s_renew_seq);
				1430	if (!msg)
				1431	return -ENOMEM;
				1432	ceph_con_send(&session->s_con, msg);
				1433	return 0;
				1434	}
				1435
				1436	static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
				1437	struct ceph_mds_session *session, u64 seq)
				1438	{
				1439	struct ceph_msg *msg;
				1440
				1441	dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
				1442	session->s_mds, ceph_session_state_name(session->s_state), seq);
				1443	msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
				1444	if (!msg)
				1445	return -ENOMEM;
				1446	ceph_con_send(&session->s_con, msg);
				1447	return 0;
				1448	}
				1449
				1450
				1451	/*
				1452	* Note new cap ttl, and any transition from stale -> not stale (fresh?).
				1453	*
				1454	* Called under session->s_mutex
				1455	*/
				1456	static void renewed_caps(struct ceph_mds_client *mdsc,
				1457	struct ceph_mds_session *session, int is_renew)
				1458	{
				1459	int was_stale;
				1460	int wake = 0;
				1461
				1462	spin_lock(&session->s_cap_lock);
				1463	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
				1464
				1465	session->s_cap_ttl = session->s_renew_requested +
				1466	mdsc->mdsmap->m_session_timeout*HZ;
				1467
				1468	if (was_stale) {
				1469	if (time_before(jiffies, session->s_cap_ttl)) {
				1470	pr_info("mds%d caps renewed\n", session->s_mds);
				1471	wake = 1;
				1472	} else {
				1473	pr_info("mds%d caps still stale\n", session->s_mds);
				1474	}
				1475	}
				1476	dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
				1477	session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
				1478	time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
				1479	spin_unlock(&session->s_cap_lock);
				1480
				1481	if (wake)
				1482	wake_up_session_caps(session, 0);
				1483	}
				1484
				1485	/*
				1486	* send a session close request
				1487	*/
				1488	static int request_close_session(struct ceph_mds_client *mdsc,
				1489	struct ceph_mds_session *session)
				1490	{
				1491	struct ceph_msg *msg;
				1492
				1493	dout("request_close_session mds%d state %s seq %lld\n",
				1494	session->s_mds, ceph_session_state_name(session->s_state),
				1495	session->s_seq);
				1496	msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
				1497	if (!msg)
				1498	return -ENOMEM;
				1499	ceph_con_send(&session->s_con, msg);
				1500	return 1;
				1501	}
				1502
				1503	/*
				1504	* Called with s_mutex held.
				1505	*/
				1506	static int __close_session(struct ceph_mds_client *mdsc,
				1507	struct ceph_mds_session *session)
				1508	{
				1509	if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
				1510	return 0;
				1511	session->s_state = CEPH_MDS_SESSION_CLOSING;
				1512	return request_close_session(mdsc, session);
				1513	}
				1514
				1515	static bool drop_negative_children(struct dentry *dentry)
				1516	{
				1517	struct dentry *child;
				1518	bool all_negative = true;
				1519
				1520	if (!d_is_dir(dentry))
				1521	goto out;
				1522
				1523	spin_lock(&dentry->d_lock);
				1524	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
				1525	if (d_really_is_positive(child)) {
				1526	all_negative = false;
				1527	break;
				1528	}
				1529	}
				1530	spin_unlock(&dentry->d_lock);
				1531
				1532	if (all_negative)
				1533	shrink_dcache_parent(dentry);
				1534	out:
				1535	return all_negative;
				1536	}
				1537
				1538	/*
				1539	* Trim old(er) caps.
				1540	*
				1541	* Because we can't cache an inode without one or more caps, we do
				1542	* this indirectly: if a cap is unused, we prune its aliases, at which
				1543	* point the inode will hopefully get dropped to.
				1544	*
				1545	* Yes, this is a bit sloppy. Our only real goal here is to respond to
				1546	* memory pressure from the MDS, though, so it needn't be perfect.
				1547	*/
				1548	static int trim_caps_cb(struct inode inode, struct ceph_cap cap, void *arg)
				1549	{
				1550	struct ceph_mds_session *session = arg;
				1551	struct ceph_inode_info *ci = ceph_inode(inode);
				1552	int used, wanted, oissued, mine;
				1553
				1554	if (session->s_trim_caps <= 0)
				1555	return -1;
				1556
				1557	spin_lock(&ci->i_ceph_lock);
				1558	mine = cap->issued \| cap->implemented;
				1559	used = __ceph_caps_used(ci);
				1560	wanted = __ceph_caps_file_wanted(ci);
				1561	oissued = __ceph_caps_issued_other(ci, cap);
				1562
				1563	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
				1564	inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
				1565	ceph_cap_string(used), ceph_cap_string(wanted));
				1566	if (cap == ci->i_auth_cap) {
				1567	if (ci->i_dirty_caps \|\| ci->i_flushing_caps \|\|
				1568	!list_empty(&ci->i_cap_snaps))
				1569	goto out;
				1570	if ((used \| wanted) & CEPH_CAP_ANY_WR)
				1571	goto out;
				1572	/* Note: it's possible that i_filelock_ref becomes non-zero
				1573	* after dropping auth caps. It doesn't hurt because reply
				1574	* of lock mds request will re-add auth caps. */
				1575	if (atomic_read(&ci->i_filelock_ref) > 0)
				1576	goto out;
				1577	}
				1578	/* The inode has cached pages, but it's no longer used.
				1579	* we can safely drop it */
				1580	if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
				1581	!(oissued & CEPH_CAP_FILE_CACHE)) {
				1582	used = 0;
				1583	oissued = 0;
				1584	}
				1585	if ((used \| wanted) & ~oissued & mine)
				1586	goto out; /* we need these caps */
				1587
				1588	if (oissued) {
				1589	/* we aren't the only cap.. just remove us */
				1590	__ceph_remove_cap(cap, true);
				1591	session->s_trim_caps--;
				1592	} else {
				1593	struct dentry *dentry;
				1594	/* try dropping referring dentries */
				1595	spin_unlock(&ci->i_ceph_lock);
				1596	dentry = d_find_any_alias(inode);
				1597	if (dentry && drop_negative_children(dentry)) {
				1598	int count;
				1599	dput(dentry);
				1600	d_prune_aliases(inode);
				1601	count = atomic_read(&inode->i_count);
				1602	if (count == 1)
				1603	session->s_trim_caps--;
				1604	dout("trim_caps_cb %p cap %p pruned, count now %d\n",
				1605	inode, cap, count);
				1606	} else {
				1607	dput(dentry);
				1608	}
				1609	return 0;
				1610	}
				1611
				1612	out:
				1613	spin_unlock(&ci->i_ceph_lock);
				1614	return 0;
				1615	}
				1616
				1617	/*
				1618	* Trim session cap count down to some max number.
				1619	*/
				1620	int ceph_trim_caps(struct ceph_mds_client *mdsc,
				1621	struct ceph_mds_session *session,
				1622	int max_caps)
				1623	{
				1624	int trim_caps = session->s_nr_caps - max_caps;
				1625
				1626	dout("trim_caps mds%d start: %d / %d, trim %d\n",
				1627	session->s_mds, session->s_nr_caps, max_caps, trim_caps);
				1628	if (trim_caps > 0) {
				1629	session->s_trim_caps = trim_caps;
				1630	iterate_session_caps(session, trim_caps_cb, session);
				1631	dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
				1632	session->s_mds, session->s_nr_caps, max_caps,
				1633	trim_caps - session->s_trim_caps);
				1634	session->s_trim_caps = 0;
				1635	}
				1636
				1637	ceph_send_cap_releases(mdsc, session);
				1638	return 0;
				1639	}
				1640
				1641	static int check_caps_flush(struct ceph_mds_client *mdsc,
				1642	u64 want_flush_tid)
				1643	{
				1644	int ret = 1;
				1645
				1646	spin_lock(&mdsc->cap_dirty_lock);
				1647	if (!list_empty(&mdsc->cap_flush_list)) {
				1648	struct ceph_cap_flush *cf =
				1649	list_first_entry(&mdsc->cap_flush_list,
				1650	struct ceph_cap_flush, g_list);
				1651	if (cf->tid <= want_flush_tid) {
				1652	dout("check_caps_flush still flushing tid "
				1653	"%llu <= %llu\n", cf->tid, want_flush_tid);
				1654	ret = 0;
				1655	}
				1656	}
				1657	spin_unlock(&mdsc->cap_dirty_lock);
				1658	return ret;
				1659	}
				1660
				1661	/*
				1662	* flush all dirty inode data to disk.
				1663	*
				1664	* returns true if we've flushed through want_flush_tid
				1665	*/
				1666	static void wait_caps_flush(struct ceph_mds_client *mdsc,
				1667	u64 want_flush_tid)
				1668	{
				1669	dout("check_caps_flush want %llu\n", want_flush_tid);
				1670
				1671	wait_event(mdsc->cap_flushing_wq,
				1672	check_caps_flush(mdsc, want_flush_tid));
				1673
				1674	dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
				1675	}
				1676
				1677	/*
				1678	* called under s_mutex
				1679	*/
				1680	void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
				1681	struct ceph_mds_session *session)
				1682	{
				1683	struct ceph_msg *msg = NULL;
				1684	struct ceph_mds_cap_release *head;
				1685	struct ceph_mds_cap_item *item;
				1686	struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
				1687	struct ceph_cap *cap;
				1688	LIST_HEAD(tmp_list);
				1689	int num_cap_releases;
				1690	__le32 barrier, *cap_barrier;
				1691
				1692	down_read(&osdc->lock);
				1693	barrier = cpu_to_le32(osdc->epoch_barrier);
				1694	up_read(&osdc->lock);
				1695
				1696	spin_lock(&session->s_cap_lock);
				1697	again:
				1698	list_splice_init(&session->s_cap_releases, &tmp_list);
				1699	num_cap_releases = session->s_num_cap_releases;
				1700	session->s_num_cap_releases = 0;
				1701	spin_unlock(&session->s_cap_lock);
				1702
				1703	while (!list_empty(&tmp_list)) {
				1704	if (!msg) {
				1705	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
				1706	PAGE_SIZE, GFP_NOFS, false);
				1707	if (!msg)
				1708	goto out_err;
				1709	head = msg->front.iov_base;
				1710	head->num = cpu_to_le32(0);
				1711	msg->front.iov_len = sizeof(*head);
				1712
				1713	msg->hdr.version = cpu_to_le16(2);
				1714	msg->hdr.compat_version = cpu_to_le16(1);
				1715	}
				1716
				1717	cap = list_first_entry(&tmp_list, struct ceph_cap,
				1718	session_caps);
				1719	list_del(&cap->session_caps);
				1720	num_cap_releases--;
				1721
				1722	head = msg->front.iov_base;
				1723	le32_add_cpu(&head->num, 1);
				1724	item = msg->front.iov_base + msg->front.iov_len;
				1725	item->ino = cpu_to_le64(cap->cap_ino);
				1726	item->cap_id = cpu_to_le64(cap->cap_id);
				1727	item->migrate_seq = cpu_to_le32(cap->mseq);
				1728	item->seq = cpu_to_le32(cap->issue_seq);
				1729	msg->front.iov_len += sizeof(*item);
				1730
				1731	ceph_put_cap(mdsc, cap);
				1732
				1733	if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
				1734	// Append cap_barrier field
				1735	cap_barrier = msg->front.iov_base + msg->front.iov_len;
				1736	*cap_barrier = barrier;
				1737	msg->front.iov_len += sizeof(*cap_barrier);
				1738
				1739	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				1740	dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
				1741	ceph_con_send(&session->s_con, msg);
				1742	msg = NULL;
				1743	}
				1744	}
				1745
				1746	BUG_ON(num_cap_releases != 0);
				1747
				1748	spin_lock(&session->s_cap_lock);
				1749	if (!list_empty(&session->s_cap_releases))
				1750	goto again;
				1751	spin_unlock(&session->s_cap_lock);
				1752
				1753	if (msg) {
				1754	// Append cap_barrier field
				1755	cap_barrier = msg->front.iov_base + msg->front.iov_len;
				1756	*cap_barrier = barrier;
				1757	msg->front.iov_len += sizeof(*cap_barrier);
				1758
				1759	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				1760	dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
				1761	ceph_con_send(&session->s_con, msg);
				1762	}
				1763	return;
				1764	out_err:
				1765	pr_err("send_cap_releases mds%d, failed to allocate message\n",
				1766	session->s_mds);
				1767	spin_lock(&session->s_cap_lock);
				1768	list_splice(&tmp_list, &session->s_cap_releases);
				1769	session->s_num_cap_releases += num_cap_releases;
				1770	spin_unlock(&session->s_cap_lock);
				1771	}
				1772
				1773	/*
				1774	* requests
				1775	*/
				1776
				1777	int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
				1778	struct inode *dir)
				1779	{
				1780	struct ceph_inode_info *ci = ceph_inode(dir);
				1781	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
				1782	struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
				1783	size_t size = sizeof(struct ceph_mds_reply_dir_entry);
				1784	int order, num_entries;
				1785
				1786	spin_lock(&ci->i_ceph_lock);
				1787	num_entries = ci->i_files + ci->i_subdirs;
				1788	spin_unlock(&ci->i_ceph_lock);
				1789	num_entries = max(num_entries, 1);
				1790	num_entries = min(num_entries, opt->max_readdir);
				1791
				1792	order = get_order(size * num_entries);
				1793	while (order >= 0) {
				1794	rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL \|
				1795	__GFP_NOWARN,
				1796	order);
				1797	if (rinfo->dir_entries)
				1798	break;
				1799	order--;
				1800	}
				1801	if (!rinfo->dir_entries)
				1802	return -ENOMEM;
				1803
				1804	num_entries = (PAGE_SIZE << order) / size;
				1805	num_entries = min(num_entries, opt->max_readdir);
				1806
				1807	rinfo->dir_buf_size = PAGE_SIZE << order;
				1808	req->r_num_caps = num_entries + 1;
				1809	req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
				1810	req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
				1811	return 0;
				1812	}
				1813
				1814	/*
				1815	* Create an mds request.
				1816	*/
				1817	struct ceph_mds_request *
				1818	ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
				1819	{
				1820	struct ceph_mds_request req = kzalloc(sizeof(req), GFP_NOFS);
				1821	struct timespec64 ts;
				1822
				1823	if (!req)
				1824	return ERR_PTR(-ENOMEM);
				1825
				1826	mutex_init(&req->r_fill_mutex);
				1827	req->r_mdsc = mdsc;
				1828	req->r_started = jiffies;
				1829	req->r_resend_mds = -1;
				1830	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
				1831	INIT_LIST_HEAD(&req->r_unsafe_target_item);
				1832	req->r_fmode = -1;
				1833	kref_init(&req->r_kref);
				1834	RB_CLEAR_NODE(&req->r_node);
				1835	INIT_LIST_HEAD(&req->r_wait);
				1836	init_completion(&req->r_completion);
				1837	init_completion(&req->r_safe_completion);
				1838	INIT_LIST_HEAD(&req->r_unsafe_item);
				1839
				1840	ktime_get_coarse_real_ts64(&ts);
				1841	req->r_stamp = timespec64_trunc(ts, mdsc->fsc->sb->s_time_gran);
				1842
				1843	req->r_op = op;
				1844	req->r_direct_mode = mode;
				1845	return req;
				1846	}
				1847
				1848	/*
				1849	* return oldest (lowest) request, tid in request tree, 0 if none.
				1850	*
				1851	* called under mdsc->mutex.
				1852	*/
				1853	static struct ceph_mds_request __get_oldest_req(struct ceph_mds_client mdsc)
				1854	{
				1855	if (RB_EMPTY_ROOT(&mdsc->request_tree))
				1856	return NULL;
				1857	return rb_entry(rb_first(&mdsc->request_tree),
				1858	struct ceph_mds_request, r_node);
				1859	}
				1860
				1861	static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
				1862	{
				1863	return mdsc->oldest_tid;
				1864	}
				1865
				1866	/*
				1867	* Build a dentry's path. Allocate on heap; caller must kfree. Based
				1868	* on build_path_from_dentry in fs/cifs/dir.c.
				1869	*
				1870	* If @stop_on_nosnap, generate path relative to the first non-snapped
				1871	* inode.
				1872	*
				1873	* Encode hidden .snap dirs as a double /, i.e.
				1874	* foo/.snap/bar -> foo//bar
				1875	*/
				1876	char ceph_mdsc_build_path(struct dentry dentry, int plen, u64 base,
				1877	int stop_on_nosnap)
				1878	{
				1879	struct dentry *temp;
				1880	char *path;
				1881	int len, pos;
				1882	unsigned seq;
				1883
				1884	if (!dentry)
				1885	return ERR_PTR(-EINVAL);
				1886
				1887	retry:
				1888	len = 0;
				1889	seq = read_seqbegin(&rename_lock);
				1890	rcu_read_lock();
				1891	for (temp = dentry; !IS_ROOT(temp);) {
				1892	struct inode *inode = d_inode(temp);
				1893	if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
				1894	len++; /* slash only */
				1895	else if (stop_on_nosnap && inode &&
				1896	ceph_snap(inode) == CEPH_NOSNAP)
				1897	break;
				1898	else
				1899	len += 1 + temp->d_name.len;
				1900	temp = temp->d_parent;
				1901	}
				1902	rcu_read_unlock();
				1903	if (len)
				1904	len--; /* no leading '/' */
				1905
				1906	path = kmalloc(len+1, GFP_NOFS);
				1907	if (!path)
				1908	return ERR_PTR(-ENOMEM);
				1909	pos = len;
				1910	path[pos] = 0; /* trailing null */
				1911	rcu_read_lock();
				1912	for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
				1913	struct inode *inode;
				1914
				1915	spin_lock(&temp->d_lock);
				1916	inode = d_inode(temp);
				1917	if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
				1918	dout("build_path path+%d: %p SNAPDIR\n",
				1919	pos, temp);
				1920	} else if (stop_on_nosnap && inode &&
				1921	ceph_snap(inode) == CEPH_NOSNAP) {
				1922	spin_unlock(&temp->d_lock);
				1923	break;
				1924	} else {
				1925	pos -= temp->d_name.len;
				1926	if (pos < 0) {
				1927	spin_unlock(&temp->d_lock);
				1928	break;
				1929	}
				1930	strncpy(path + pos, temp->d_name.name,
				1931	temp->d_name.len);
				1932	}
				1933	spin_unlock(&temp->d_lock);
				1934	if (pos)
				1935	path[--pos] = '/';
				1936	temp = temp->d_parent;
				1937	}
				1938	rcu_read_unlock();
				1939	if (pos != 0 \|\| read_seqretry(&rename_lock, seq)) {
				1940	pr_err("build_path did not end path lookup where "
				1941	"expected, namelen is %d, pos is %d\n", len, pos);
				1942	/* presumably this is only possible if racing with a
				1943	rename of one of the parent directories (we can not
				1944	lock the dentries above us to prevent this, but
				1945	retrying should be harmless) */
				1946	kfree(path);
				1947	goto retry;
				1948	}
				1949
				1950	*base = ceph_ino(d_inode(temp));
				1951	*plen = len;
				1952	dout("build_path on %p %d built %llx '%.*s'\n",
				1953	dentry, d_count(dentry), *base, len, path);
				1954	return path;
				1955	}
				1956
				1957	/* Duplicate the dentry->d_name.name safely */
				1958	static int clone_dentry_name(struct dentry dentry, const char *ppath,
				1959	int *ppathlen)
				1960	{
				1961	u32 len;
				1962	char *name;
				1963
				1964	retry:
				1965	len = READ_ONCE(dentry->d_name.len);
				1966	name = kmalloc(len + 1, GFP_NOFS);
				1967	if (!name)
				1968	return -ENOMEM;
				1969
				1970	spin_lock(&dentry->d_lock);
				1971	if (dentry->d_name.len != len) {
				1972	spin_unlock(&dentry->d_lock);
				1973	kfree(name);
				1974	goto retry;
				1975	}
				1976	memcpy(name, dentry->d_name.name, len);
				1977	spin_unlock(&dentry->d_lock);
				1978
				1979	name[len] = '\0';
				1980	*ppath = name;
				1981	*ppathlen = len;
				1982	return 0;
				1983	}
				1984
				1985	static int build_dentry_path(struct dentry dentry, struct inode dir,
				1986	const char *ppath, int ppathlen, u64 *pino,
				1987	bool *pfreepath, bool parent_locked)
				1988	{
				1989	int ret;
				1990	char *path;
				1991
				1992	rcu_read_lock();
				1993	if (!dir)
				1994	dir = d_inode_rcu(dentry->d_parent);
				1995	if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
				1996	*pino = ceph_ino(dir);
				1997	rcu_read_unlock();
				1998	if (parent_locked) {
				1999	*ppath = dentry->d_name.name;
				2000	*ppathlen = dentry->d_name.len;
				2001	} else {
				2002	ret = clone_dentry_name(dentry, ppath, ppathlen);
				2003	if (ret)
				2004	return ret;
				2005	*pfreepath = true;
				2006	}
				2007	return 0;
				2008	}
				2009	rcu_read_unlock();
				2010	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
				2011	if (IS_ERR(path))
				2012	return PTR_ERR(path);
				2013	*ppath = path;
				2014	*pfreepath = true;
				2015	return 0;
				2016	}
				2017
				2018	static int build_inode_path(struct inode *inode,
				2019	const char *ppath, int ppathlen, u64 *pino,
				2020	bool *pfreepath)
				2021	{
				2022	struct dentry *dentry;
				2023	char *path;
				2024
				2025	if (ceph_snap(inode) == CEPH_NOSNAP) {
				2026	*pino = ceph_ino(inode);
				2027	*ppathlen = 0;
				2028	return 0;
				2029	}
				2030	dentry = d_find_alias(inode);
				2031	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
				2032	dput(dentry);
				2033	if (IS_ERR(path))
				2034	return PTR_ERR(path);
				2035	*ppath = path;
				2036	*pfreepath = true;
				2037	return 0;
				2038	}
				2039
				2040	/*
				2041	* request arguments may be specified via an inode , a dentry , or
				2042	* an explicit ino+path.
				2043	*/
				2044	static int set_request_path_attr(struct inode rinode, struct dentry rdentry,
				2045	struct inode rdiri, const char rpath,
				2046	u64 rino, const char *ppath, int pathlen,
				2047	u64 ino, bool freepath, bool parent_locked)
				2048	{
				2049	int r = 0;
				2050
				2051	if (rinode) {
				2052	r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
				2053	dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
				2054	ceph_snap(rinode));
				2055	} else if (rdentry) {
				2056	r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
				2057	freepath, parent_locked);
				2058	dout(" dentry %p %llx/%.s\n", rdentry, ino, *pathlen,
				2059	*ppath);
				2060	} else if (rpath \|\| rino) {
				2061	*ino = rino;
				2062	*ppath = rpath;
				2063	*pathlen = rpath ? strlen(rpath) : 0;
				2064	dout(" path %.s\n", pathlen, rpath);
				2065	}
				2066
				2067	return r;
				2068	}
				2069
				2070	/*
				2071	* called under mdsc->mutex
				2072	*/
				2073	static struct ceph_msg create_request_message(struct ceph_mds_client mdsc,
				2074	struct ceph_mds_request *req,
				2075	int mds, bool drop_cap_releases)
				2076	{
				2077	struct ceph_msg *msg;
				2078	struct ceph_mds_request_head *head;
				2079	const char *path1 = NULL;
				2080	const char *path2 = NULL;
				2081	u64 ino1 = 0, ino2 = 0;
				2082	int pathlen1 = 0, pathlen2 = 0;
				2083	bool freepath1 = false, freepath2 = false;
				2084	int len;
				2085	u16 releases;
				2086	void p, end;
				2087	int ret;
				2088
				2089	ret = set_request_path_attr(req->r_inode, req->r_dentry,
				2090	req->r_parent, req->r_path1, req->r_ino1.ino,
				2091	&path1, &pathlen1, &ino1, &freepath1,
				2092	test_bit(CEPH_MDS_R_PARENT_LOCKED,
				2093	&req->r_req_flags));
				2094	if (ret < 0) {
				2095	msg = ERR_PTR(ret);
				2096	goto out;
				2097	}
				2098
				2099	/* If r_old_dentry is set, then assume that its parent is locked */
				2100	ret = set_request_path_attr(NULL, req->r_old_dentry,
				2101	req->r_old_dentry_dir,
				2102	req->r_path2, req->r_ino2.ino,
				2103	&path2, &pathlen2, &ino2, &freepath2, true);
				2104	if (ret < 0) {
				2105	msg = ERR_PTR(ret);
				2106	goto out_free1;
				2107	}
				2108
				2109	len = sizeof(*head) +
				2110	pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
				2111	sizeof(struct ceph_timespec);
				2112
				2113	/* calculate (max) length for cap releases */
				2114	len += sizeof(struct ceph_mds_request_release) *
				2115	(!!req->r_inode_drop + !!req->r_dentry_drop +
				2116	!!req->r_old_inode_drop + !!req->r_old_dentry_drop);
				2117	if (req->r_dentry_drop)
				2118	len += req->r_dentry->d_name.len;
				2119	if (req->r_old_dentry_drop)
				2120	len += req->r_old_dentry->d_name.len;
				2121
				2122	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
				2123	if (!msg) {
				2124	msg = ERR_PTR(-ENOMEM);
				2125	goto out_free2;
				2126	}
				2127
				2128	msg->hdr.version = cpu_to_le16(2);
				2129	msg->hdr.tid = cpu_to_le64(req->r_tid);
				2130
				2131	head = msg->front.iov_base;
				2132	p = msg->front.iov_base + sizeof(*head);
				2133	end = msg->front.iov_base + msg->front.iov_len;
				2134
				2135	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
				2136	head->op = cpu_to_le32(req->r_op);
				2137	head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
				2138	head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
				2139	head->args = req->r_args;
				2140
				2141	ceph_encode_filepath(&p, end, ino1, path1);
				2142	ceph_encode_filepath(&p, end, ino2, path2);
				2143
				2144	/* make note of release offset, in case we need to replay */
				2145	req->r_request_release_offset = p - msg->front.iov_base;
				2146
				2147	/* cap releases */
				2148	releases = 0;
				2149	if (req->r_inode_drop)
				2150	releases += ceph_encode_inode_release(&p,
				2151	req->r_inode ? req->r_inode : d_inode(req->r_dentry),
				2152	mds, req->r_inode_drop, req->r_inode_unless, 0);
				2153	if (req->r_dentry_drop)
				2154	releases += ceph_encode_dentry_release(&p, req->r_dentry,
				2155	req->r_parent, mds, req->r_dentry_drop,
				2156	req->r_dentry_unless);
				2157	if (req->r_old_dentry_drop)
				2158	releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
				2159	req->r_old_dentry_dir, mds,
				2160	req->r_old_dentry_drop,
				2161	req->r_old_dentry_unless);
				2162	if (req->r_old_inode_drop)
				2163	releases += ceph_encode_inode_release(&p,
				2164	d_inode(req->r_old_dentry),
				2165	mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
				2166
				2167	if (drop_cap_releases) {
				2168	releases = 0;
				2169	p = msg->front.iov_base + req->r_request_release_offset;
				2170	}
				2171
				2172	head->num_releases = cpu_to_le16(releases);
				2173
				2174	/* time stamp */
				2175	{
				2176	struct ceph_timespec ts;
				2177	ceph_encode_timespec64(&ts, &req->r_stamp);
				2178	ceph_encode_copy(&p, &ts, sizeof(ts));
				2179	}
				2180
				2181	BUG_ON(p > end);
				2182	msg->front.iov_len = p - msg->front.iov_base;
				2183	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				2184
				2185	if (req->r_pagelist) {
				2186	struct ceph_pagelist *pagelist = req->r_pagelist;
				2187	refcount_inc(&pagelist->refcnt);
				2188	ceph_msg_data_add_pagelist(msg, pagelist);
				2189	msg->hdr.data_len = cpu_to_le32(pagelist->length);
				2190	} else {
				2191	msg->hdr.data_len = 0;
				2192	}
				2193
				2194	msg->hdr.data_off = cpu_to_le16(0);
				2195
				2196	out_free2:
				2197	if (freepath2)
				2198	kfree((char *)path2);
				2199	out_free1:
				2200	if (freepath1)
				2201	kfree((char *)path1);
				2202	out:
				2203	return msg;
				2204	}
				2205
				2206	/*
				2207	* called under mdsc->mutex if error, under no mutex if
				2208	* success.
				2209	*/
				2210	static void complete_request(struct ceph_mds_client *mdsc,
				2211	struct ceph_mds_request *req)
				2212	{
				2213	if (req->r_callback)
				2214	req->r_callback(mdsc, req);
				2215	else
				2216	complete_all(&req->r_completion);
				2217	}
				2218
				2219	/*
				2220	* called under mdsc->mutex
				2221	*/
				2222	static int __prepare_send_request(struct ceph_mds_client *mdsc,
				2223	struct ceph_mds_request *req,
				2224	int mds, bool drop_cap_releases)
				2225	{
				2226	struct ceph_mds_request_head *rhead;
				2227	struct ceph_msg *msg;
				2228	int flags = 0;
				2229
				2230	req->r_attempts++;
				2231	if (req->r_inode) {
				2232	struct ceph_cap *cap =
				2233	ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
				2234
				2235	if (cap)
				2236	req->r_sent_on_mseq = cap->mseq;
				2237	else
				2238	req->r_sent_on_mseq = -1;
				2239	}
				2240	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
				2241	req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
				2242
				2243	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
				2244	void *p;
				2245	/*
				2246	* Replay. Do not regenerate message (and rebuild
				2247	* paths, etc.); just use the original message.
				2248	* Rebuilding paths will break for renames because
				2249	* d_move mangles the src name.
				2250	*/
				2251	msg = req->r_request;
				2252	rhead = msg->front.iov_base;
				2253
				2254	flags = le32_to_cpu(rhead->flags);
				2255	flags \|= CEPH_MDS_FLAG_REPLAY;
				2256	rhead->flags = cpu_to_le32(flags);
				2257
				2258	if (req->r_target_inode)
				2259	rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
				2260
				2261	rhead->num_retry = req->r_attempts - 1;
				2262
				2263	/* remove cap/dentry releases from message */
				2264	rhead->num_releases = 0;
				2265
				2266	/* time stamp */
				2267	p = msg->front.iov_base + req->r_request_release_offset;
				2268	{
				2269	struct ceph_timespec ts;
				2270	ceph_encode_timespec64(&ts, &req->r_stamp);
				2271	ceph_encode_copy(&p, &ts, sizeof(ts));
				2272	}
				2273
				2274	msg->front.iov_len = p - msg->front.iov_base;
				2275	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				2276	return 0;
				2277	}
				2278
				2279	if (req->r_request) {
				2280	ceph_msg_put(req->r_request);
				2281	req->r_request = NULL;
				2282	}
				2283	msg = create_request_message(mdsc, req, mds, drop_cap_releases);
				2284	if (IS_ERR(msg)) {
				2285	req->r_err = PTR_ERR(msg);
				2286	return PTR_ERR(msg);
				2287	}
				2288	req->r_request = msg;
				2289
				2290	rhead = msg->front.iov_base;
				2291	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
				2292	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
				2293	flags \|= CEPH_MDS_FLAG_REPLAY;
				2294	if (req->r_parent)
				2295	flags \|= CEPH_MDS_FLAG_WANT_DENTRY;
				2296	rhead->flags = cpu_to_le32(flags);
				2297	rhead->num_fwd = req->r_num_fwd;
				2298	rhead->num_retry = req->r_attempts - 1;
				2299	rhead->ino = 0;
				2300
				2301	dout(" r_parent = %p\n", req->r_parent);
				2302	return 0;
				2303	}
				2304
				2305	/*
				2306	* send request, or put it on the appropriate wait list.
				2307	*/
				2308	static void __do_request(struct ceph_mds_client *mdsc,
				2309	struct ceph_mds_request *req)
				2310	{
				2311	struct ceph_mds_session *session = NULL;
				2312	int mds = -1;
				2313	int err = 0;
				2314
				2315	if (req->r_err \|\| test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
				2316	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
				2317	__unregister_request(mdsc, req);
				2318	return;
				2319	}
				2320
				2321	if (req->r_timeout &&
				2322	time_after_eq(jiffies, req->r_started + req->r_timeout)) {
				2323	dout("do_request timed out\n");
				2324	err = -EIO;
				2325	goto finish;
				2326	}
				2327	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
				2328	dout("do_request forced umount\n");
				2329	err = -EIO;
				2330	goto finish;
				2331	}
				2332	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
				2333	if (mdsc->mdsmap_err) {
				2334	err = mdsc->mdsmap_err;
				2335	dout("do_request mdsmap err %d\n", err);
				2336	goto finish;
				2337	}
				2338	if (mdsc->mdsmap->m_epoch == 0) {
				2339	dout("do_request no mdsmap, waiting for map\n");
				2340	list_add(&req->r_wait, &mdsc->waiting_for_map);
				2341	return;
				2342	}
				2343	if (!(mdsc->fsc->mount_options->flags &
				2344	CEPH_MOUNT_OPT_MOUNTWAIT) &&
				2345	!ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
				2346	err = -ENOENT;
				2347	pr_info("probably no mds server is up\n");
				2348	goto finish;
				2349	}
				2350	}
				2351
				2352	put_request_session(req);
				2353
				2354	mds = __choose_mds(mdsc, req);
				2355	if (mds < 0 \|\|
				2356	ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
				2357	dout("do_request no mds or not active, waiting for map\n");
				2358	list_add(&req->r_wait, &mdsc->waiting_for_map);
				2359	return;
				2360	}
				2361
				2362	/* get, open session */
				2363	session = __ceph_lookup_mds_session(mdsc, mds);
				2364	if (!session) {
				2365	session = register_session(mdsc, mds);
				2366	if (IS_ERR(session)) {
				2367	err = PTR_ERR(session);
				2368	goto finish;
				2369	}
				2370	}
				2371	req->r_session = get_session(session);
				2372
				2373	dout("do_request mds%d session %p state %s\n", mds, session,
				2374	ceph_session_state_name(session->s_state));
				2375	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
				2376	session->s_state != CEPH_MDS_SESSION_HUNG) {
				2377	if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
				2378	err = -EACCES;
				2379	goto out_session;
				2380	}
				2381	if (session->s_state == CEPH_MDS_SESSION_NEW \|\|
				2382	session->s_state == CEPH_MDS_SESSION_CLOSING)
				2383	__open_session(mdsc, session);
				2384	list_add(&req->r_wait, &session->s_waiting);
				2385	goto out_session;
				2386	}
				2387
				2388	/* send request */
				2389	req->r_resend_mds = -1; /* forget any previous mds hint */
				2390
				2391	if (req->r_request_started == 0) /* note request start time */
				2392	req->r_request_started = jiffies;
				2393
				2394	err = __prepare_send_request(mdsc, req, mds, false);
				2395	if (!err) {
				2396	ceph_msg_get(req->r_request);
				2397	ceph_con_send(&session->s_con, req->r_request);
				2398	}
				2399
				2400	out_session:
				2401	ceph_put_mds_session(session);
				2402	finish:
				2403	if (err) {
				2404	dout("__do_request early error %d\n", err);
				2405	req->r_err = err;
				2406	complete_request(mdsc, req);
				2407	__unregister_request(mdsc, req);
				2408	}
				2409	return;
				2410	}
				2411
				2412	/*
				2413	* called under mdsc->mutex
				2414	*/
				2415	static void __wake_requests(struct ceph_mds_client *mdsc,
				2416	struct list_head *head)
				2417	{
				2418	struct ceph_mds_request *req;
				2419	LIST_HEAD(tmp_list);
				2420
				2421	list_splice_init(head, &tmp_list);
				2422
				2423	while (!list_empty(&tmp_list)) {
				2424	req = list_entry(tmp_list.next,
				2425	struct ceph_mds_request, r_wait);
				2426	list_del_init(&req->r_wait);
				2427	dout(" wake request %p tid %llu\n", req, req->r_tid);
				2428	__do_request(mdsc, req);
				2429	}
				2430	}
				2431
				2432	/*
				2433	* Wake up threads with requests pending for @mds, so that they can
				2434	* resubmit their requests to a possibly different mds.
				2435	*/
				2436	static void kick_requests(struct ceph_mds_client *mdsc, int mds)
				2437	{
				2438	struct ceph_mds_request *req;
				2439	struct rb_node *p = rb_first(&mdsc->request_tree);
				2440
				2441	dout("kick_requests mds%d\n", mds);
				2442	while (p) {
				2443	req = rb_entry(p, struct ceph_mds_request, r_node);
				2444	p = rb_next(p);
				2445	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
				2446	continue;
				2447	if (req->r_attempts > 0)
				2448	continue; /* only new requests */
				2449	if (req->r_session &&
				2450	req->r_session->s_mds == mds) {
				2451	dout(" kicking tid %llu\n", req->r_tid);
				2452	list_del_init(&req->r_wait);
				2453	__do_request(mdsc, req);
				2454	}
				2455	}
				2456	}
				2457
				2458	void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
				2459	struct ceph_mds_request *req)
				2460	{
				2461	dout("submit_request on %p\n", req);
				2462	mutex_lock(&mdsc->mutex);
				2463	__register_request(mdsc, req, NULL);
				2464	__do_request(mdsc, req);
				2465	mutex_unlock(&mdsc->mutex);
				2466	}
				2467
				2468	/*
				2469	* Synchrously perform an mds request. Take care of all of the
				2470	* session setup, forwarding, retry details.
				2471	*/
				2472	int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
				2473	struct inode *dir,
				2474	struct ceph_mds_request *req)
				2475	{
				2476	int err;
				2477
				2478	dout("do_request on %p\n", req);
				2479
				2480	/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
				2481	if (req->r_inode)
				2482	ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
				2483	if (req->r_parent)
				2484	ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
				2485	if (req->r_old_dentry_dir)
				2486	ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
				2487	CEPH_CAP_PIN);
				2488
				2489	/* issue */
				2490	mutex_lock(&mdsc->mutex);
				2491	__register_request(mdsc, req, dir);
				2492	__do_request(mdsc, req);
				2493
				2494	if (req->r_err) {
				2495	err = req->r_err;
				2496	goto out;
				2497	}
				2498
				2499	/* wait */
				2500	mutex_unlock(&mdsc->mutex);
				2501	dout("do_request waiting\n");
				2502	if (!req->r_timeout && req->r_wait_for_completion) {
				2503	err = req->r_wait_for_completion(mdsc, req);
				2504	} else {
				2505	long timeleft = wait_for_completion_killable_timeout(
				2506	&req->r_completion,
				2507	ceph_timeout_jiffies(req->r_timeout));
				2508	if (timeleft > 0)
				2509	err = 0;
				2510	else if (!timeleft)
				2511	err = -EIO; /* timed out */
				2512	else
				2513	err = timeleft; /* killed */
				2514	}
				2515	dout("do_request waited, got %d\n", err);
				2516	mutex_lock(&mdsc->mutex);
				2517
				2518	/* only abort if we didn't race with a real reply */
				2519	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
				2520	err = le32_to_cpu(req->r_reply_info.head->result);
				2521	} else if (err < 0) {
				2522	dout("aborted request %lld with %d\n", req->r_tid, err);
				2523
				2524	/*
				2525	* ensure we aren't running concurrently with
				2526	* ceph_fill_trace or ceph_readdir_prepopulate, which
				2527	* rely on locks (dir mutex) held by our caller.
				2528	*/
				2529	mutex_lock(&req->r_fill_mutex);
				2530	req->r_err = err;
				2531	set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
				2532	mutex_unlock(&req->r_fill_mutex);
				2533
				2534	if (req->r_parent &&
				2535	(req->r_op & CEPH_MDS_OP_WRITE))
				2536	ceph_invalidate_dir_request(req);
				2537	} else {
				2538	err = req->r_err;
				2539	}
				2540
				2541	out:
				2542	mutex_unlock(&mdsc->mutex);
				2543	dout("do_request %p done, result %d\n", req, err);
				2544	return err;
				2545	}
				2546
				2547	/*
				2548	* Invalidate dir's completeness, dentry lease state on an aborted MDS
				2549	* namespace request.
				2550	*/
				2551	void ceph_invalidate_dir_request(struct ceph_mds_request *req)
				2552	{
				2553	struct inode *dir = req->r_parent;
				2554	struct inode *old_dir = req->r_old_dentry_dir;
				2555
				2556	dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir);
				2557
				2558	ceph_dir_clear_complete(dir);
				2559	if (old_dir)
				2560	ceph_dir_clear_complete(old_dir);
				2561	if (req->r_dentry)
				2562	ceph_invalidate_dentry_lease(req->r_dentry);
				2563	if (req->r_old_dentry)
				2564	ceph_invalidate_dentry_lease(req->r_old_dentry);
				2565	}
				2566
				2567	/*
				2568	* Handle mds reply.
				2569	*
				2570	* We take the session mutex and parse and process the reply immediately.
				2571	* This preserves the logical ordering of replies, capabilities, etc., sent
				2572	* by the MDS as they are applied to our local cache.
				2573	*/
				2574	static void handle_reply(struct ceph_mds_session session, struct ceph_msg msg)
				2575	{
				2576	struct ceph_mds_client *mdsc = session->s_mdsc;
				2577	struct ceph_mds_request *req;
				2578	struct ceph_mds_reply_head *head = msg->front.iov_base;
				2579	struct ceph_mds_reply_info_parsed rinfo; / parsed reply info */
				2580	struct ceph_snap_realm *realm;
				2581	u64 tid;
				2582	int err, result;
				2583	int mds = session->s_mds;
				2584
				2585	if (msg->front.iov_len < sizeof(*head)) {
				2586	pr_err("mdsc_handle_reply got corrupt (short) reply\n");
				2587	ceph_msg_dump(msg);
				2588	return;
				2589	}
				2590
				2591	/* get request, session */
				2592	tid = le64_to_cpu(msg->hdr.tid);
				2593	mutex_lock(&mdsc->mutex);
				2594	req = lookup_get_request(mdsc, tid);
				2595	if (!req) {
				2596	dout("handle_reply on unknown tid %llu\n", tid);
				2597	mutex_unlock(&mdsc->mutex);
				2598	return;
				2599	}
				2600	dout("handle_reply %p\n", req);
				2601
				2602	/* correct session? */
				2603	if (req->r_session != session) {
				2604	pr_err("mdsc_handle_reply got %llu on session mds%d"
				2605	" not mds%d\n", tid, session->s_mds,
				2606	req->r_session ? req->r_session->s_mds : -1);
				2607	mutex_unlock(&mdsc->mutex);
				2608	goto out;
				2609	}
				2610
				2611	/* dup? */
				2612	if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) \|\|
				2613	(test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
				2614	pr_warn("got a dup %s reply on %llu from mds%d\n",
				2615	head->safe ? "safe" : "unsafe", tid, mds);
				2616	mutex_unlock(&mdsc->mutex);
				2617	goto out;
				2618	}
				2619	if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
				2620	pr_warn("got unsafe after safe on %llu from mds%d\n",
				2621	tid, mds);
				2622	mutex_unlock(&mdsc->mutex);
				2623	goto out;
				2624	}
				2625
				2626	result = le32_to_cpu(head->result);
				2627
				2628	/*
				2629	* Handle an ESTALE
				2630	* if we're not talking to the authority, send to them
				2631	* if the authority has changed while we weren't looking,
				2632	* send to new authority
				2633	* Otherwise we just have to return an ESTALE
				2634	*/
				2635	if (result == -ESTALE) {
				2636	dout("got ESTALE on request %llu\n", req->r_tid);
				2637	req->r_resend_mds = -1;
				2638	if (req->r_direct_mode != USE_AUTH_MDS) {
				2639	dout("not using auth, setting for that now\n");
				2640	req->r_direct_mode = USE_AUTH_MDS;
				2641	__do_request(mdsc, req);
				2642	mutex_unlock(&mdsc->mutex);
				2643	goto out;
				2644	} else {
				2645	int mds = __choose_mds(mdsc, req);
				2646	if (mds >= 0 && mds != req->r_session->s_mds) {
				2647	dout("but auth changed, so resending\n");
				2648	__do_request(mdsc, req);
				2649	mutex_unlock(&mdsc->mutex);
				2650	goto out;
				2651	}
				2652	}
				2653	dout("have to return ESTALE on request %llu\n", req->r_tid);
				2654	}
				2655
				2656
				2657	if (head->safe) {
				2658	set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
				2659	__unregister_request(mdsc, req);
				2660
				2661	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
				2662	/*
				2663	* We already handled the unsafe response, now do the
				2664	* cleanup. No need to examine the response; the MDS
				2665	* doesn't include any result info in the safe
				2666	* response. And even if it did, there is nothing
				2667	* useful we could do with a revised return value.
				2668	*/
				2669	dout("got safe reply %llu, mds%d\n", tid, mds);
				2670
				2671	/* last unsafe request during umount? */
				2672	if (mdsc->stopping && !__get_oldest_req(mdsc))
				2673	complete_all(&mdsc->safe_umount_waiters);
				2674	mutex_unlock(&mdsc->mutex);
				2675	goto out;
				2676	}
				2677	} else {
				2678	set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
				2679	list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
				2680	if (req->r_unsafe_dir) {
				2681	struct ceph_inode_info *ci =
				2682	ceph_inode(req->r_unsafe_dir);
				2683	spin_lock(&ci->i_unsafe_lock);
				2684	list_add_tail(&req->r_unsafe_dir_item,
				2685	&ci->i_unsafe_dirops);
				2686	spin_unlock(&ci->i_unsafe_lock);
				2687	}
				2688	}
				2689
				2690	dout("handle_reply tid %lld result %d\n", tid, result);
				2691	rinfo = &req->r_reply_info;
				2692	err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
				2693	mutex_unlock(&mdsc->mutex);
				2694
				2695	mutex_lock(&session->s_mutex);
				2696	if (err < 0) {
				2697	pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
				2698	ceph_msg_dump(msg);
				2699	goto out_err;
				2700	}
				2701
				2702	/* snap trace */
				2703	realm = NULL;
				2704	if (rinfo->snapblob_len) {
				2705	down_write(&mdsc->snap_rwsem);
				2706	ceph_update_snap_trace(mdsc, rinfo->snapblob,
				2707	rinfo->snapblob + rinfo->snapblob_len,
				2708	le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
				2709	&realm);
				2710	downgrade_write(&mdsc->snap_rwsem);
				2711	} else {
				2712	down_read(&mdsc->snap_rwsem);
				2713	}
				2714
				2715	/* insert trace into our cache */
				2716	mutex_lock(&req->r_fill_mutex);
				2717	current->journal_info = req;
				2718	err = ceph_fill_trace(mdsc->fsc->sb, req);
				2719	if (err == 0) {
				2720	if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR \|\|
				2721	req->r_op == CEPH_MDS_OP_LSSNAP))
				2722	ceph_readdir_prepopulate(req, req->r_session);
				2723	ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
				2724	}
				2725	current->journal_info = NULL;
				2726	mutex_unlock(&req->r_fill_mutex);
				2727
				2728	up_read(&mdsc->snap_rwsem);
				2729	if (realm)
				2730	ceph_put_snap_realm(mdsc, realm);
				2731
				2732	if (err == 0 && req->r_target_inode &&
				2733	test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
				2734	struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
				2735	spin_lock(&ci->i_unsafe_lock);
				2736	list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
				2737	spin_unlock(&ci->i_unsafe_lock);
				2738	}
				2739	out_err:
				2740	mutex_lock(&mdsc->mutex);
				2741	if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
				2742	if (err) {
				2743	req->r_err = err;
				2744	} else {
				2745	req->r_reply = ceph_msg_get(msg);
				2746	set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
				2747	}
				2748	} else {
				2749	dout("reply arrived after request %lld was aborted\n", tid);
				2750	}
				2751	mutex_unlock(&mdsc->mutex);
				2752
				2753	mutex_unlock(&session->s_mutex);
				2754
				2755	/* kick calling process */
				2756	complete_request(mdsc, req);
				2757	out:
				2758	ceph_mdsc_put_request(req);
				2759	return;
				2760	}
				2761
				2762
				2763
				2764	/*
				2765	* handle mds notification that our request has been forwarded.
				2766	*/
				2767	static void handle_forward(struct ceph_mds_client *mdsc,
				2768	struct ceph_mds_session *session,
				2769	struct ceph_msg *msg)
				2770	{
				2771	struct ceph_mds_request *req;
				2772	u64 tid = le64_to_cpu(msg->hdr.tid);
				2773	u32 next_mds;
				2774	u32 fwd_seq;
				2775	int err = -EINVAL;
				2776	void *p = msg->front.iov_base;
				2777	void *end = p + msg->front.iov_len;
				2778
				2779	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
				2780	next_mds = ceph_decode_32(&p);
				2781	fwd_seq = ceph_decode_32(&p);
				2782
				2783	mutex_lock(&mdsc->mutex);
				2784	req = lookup_get_request(mdsc, tid);
				2785	if (!req) {
				2786	dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
				2787	goto out; /* dup reply? */
				2788	}
				2789
				2790	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
				2791	dout("forward tid %llu aborted, unregistering\n", tid);
				2792	__unregister_request(mdsc, req);
				2793	} else if (fwd_seq <= req->r_num_fwd) {
				2794	dout("forward tid %llu to mds%d - old seq %d <= %d\n",
				2795	tid, next_mds, req->r_num_fwd, fwd_seq);
				2796	} else {
				2797	/* resend. forward race not possible; mds would drop */
				2798	dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
				2799	BUG_ON(req->r_err);
				2800	BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
				2801	req->r_attempts = 0;
				2802	req->r_num_fwd = fwd_seq;
				2803	req->r_resend_mds = next_mds;
				2804	put_request_session(req);
				2805	__do_request(mdsc, req);
				2806	}
				2807	ceph_mdsc_put_request(req);
				2808	out:
				2809	mutex_unlock(&mdsc->mutex);
				2810	return;
				2811
				2812	bad:
				2813	pr_err("mdsc_handle_forward decode error err=%d\n", err);
				2814	}
				2815
				2816	/*
				2817	* handle a mds session control message
				2818	*/
				2819	static void handle_session(struct ceph_mds_session *session,
				2820	struct ceph_msg *msg)
				2821	{
				2822	struct ceph_mds_client *mdsc = session->s_mdsc;
				2823	u32 op;
				2824	u64 seq;
				2825	int mds = session->s_mds;
				2826	struct ceph_mds_session_head *h = msg->front.iov_base;
				2827	int wake = 0;
				2828
				2829	/* decode */
				2830	if (msg->front.iov_len < sizeof(*h))
				2831	goto bad;
				2832	op = le32_to_cpu(h->op);
				2833	seq = le64_to_cpu(h->seq);
				2834
				2835	mutex_lock(&mdsc->mutex);
				2836	if (op == CEPH_SESSION_CLOSE) {
				2837	get_session(session);
				2838	__unregister_session(mdsc, session);
				2839	}
				2840	/* FIXME: this ttl calculation is generous */
				2841	session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
				2842	mutex_unlock(&mdsc->mutex);
				2843
				2844	mutex_lock(&session->s_mutex);
				2845
				2846	dout("handle_session mds%d %s %p state %s seq %llu\n",
				2847	mds, ceph_session_op_name(op), session,
				2848	ceph_session_state_name(session->s_state), seq);
				2849
				2850	if (session->s_state == CEPH_MDS_SESSION_HUNG) {
				2851	session->s_state = CEPH_MDS_SESSION_OPEN;
				2852	pr_info("mds%d came back\n", session->s_mds);
				2853	}
				2854
				2855	switch (op) {
				2856	case CEPH_SESSION_OPEN:
				2857	if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
				2858	pr_info("mds%d reconnect success\n", session->s_mds);
				2859	session->s_state = CEPH_MDS_SESSION_OPEN;
				2860	renewed_caps(mdsc, session, 0);
				2861	wake = 1;
				2862	if (mdsc->stopping)
				2863	__close_session(mdsc, session);
				2864	break;
				2865
				2866	case CEPH_SESSION_RENEWCAPS:
				2867	if (session->s_renew_seq == seq)
				2868	renewed_caps(mdsc, session, 1);
				2869	break;
				2870
				2871	case CEPH_SESSION_CLOSE:
				2872	if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
				2873	pr_info("mds%d reconnect denied\n", session->s_mds);
				2874	cleanup_session_requests(mdsc, session);
				2875	remove_session_caps(session);
				2876	wake = 2; /* for good measure */
				2877	wake_up_all(&mdsc->session_close_wq);
				2878	break;
				2879
				2880	case CEPH_SESSION_STALE:
				2881	pr_info("mds%d caps went stale, renewing\n",
				2882	session->s_mds);
				2883	spin_lock(&session->s_gen_ttl_lock);
				2884	session->s_cap_gen++;
				2885	session->s_cap_ttl = jiffies - 1;
				2886	spin_unlock(&session->s_gen_ttl_lock);
				2887	send_renew_caps(mdsc, session);
				2888	break;
				2889
				2890	case CEPH_SESSION_RECALL_STATE:
				2891	ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
				2892	break;
				2893
				2894	case CEPH_SESSION_FLUSHMSG:
				2895	send_flushmsg_ack(mdsc, session, seq);
				2896	break;
				2897
				2898	case CEPH_SESSION_FORCE_RO:
				2899	dout("force_session_readonly %p\n", session);
				2900	spin_lock(&session->s_cap_lock);
				2901	session->s_readonly = true;
				2902	spin_unlock(&session->s_cap_lock);
				2903	wake_up_session_caps(session, 0);
				2904	break;
				2905
				2906	case CEPH_SESSION_REJECT:
				2907	WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
				2908	pr_info("mds%d rejected session\n", session->s_mds);
				2909	session->s_state = CEPH_MDS_SESSION_REJECTED;
				2910	cleanup_session_requests(mdsc, session);
				2911	remove_session_caps(session);
				2912	wake = 2; /* for good measure */
				2913	break;
				2914
				2915	default:
				2916	pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
				2917	WARN_ON(1);
				2918	}
				2919
				2920	mutex_unlock(&session->s_mutex);
				2921	if (wake) {
				2922	mutex_lock(&mdsc->mutex);
				2923	__wake_requests(mdsc, &session->s_waiting);
				2924	if (wake == 2)
				2925	kick_requests(mdsc, mds);
				2926	mutex_unlock(&mdsc->mutex);
				2927	}
				2928	if (op == CEPH_SESSION_CLOSE)
				2929	ceph_put_mds_session(session);
				2930	return;
				2931
				2932	bad:
				2933	pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
				2934	(int)msg->front.iov_len);
				2935	ceph_msg_dump(msg);
				2936	return;
				2937	}
				2938
				2939
				2940	/*
				2941	* called under session->mutex.
				2942	*/
				2943	static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
				2944	struct ceph_mds_session *session)
				2945	{
				2946	struct ceph_mds_request req, nreq;
				2947	struct rb_node *p;
				2948	int err;
				2949
				2950	dout("replay_unsafe_requests mds%d\n", session->s_mds);
				2951
				2952	mutex_lock(&mdsc->mutex);
				2953	list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
				2954	err = __prepare_send_request(mdsc, req, session->s_mds, true);
				2955	if (!err) {
				2956	ceph_msg_get(req->r_request);
				2957	ceph_con_send(&session->s_con, req->r_request);
				2958	}
				2959	}
				2960
				2961	/*
				2962	* also re-send old requests when MDS enters reconnect stage. So that MDS
				2963	* can process completed request in clientreplay stage.
				2964	*/
				2965	p = rb_first(&mdsc->request_tree);
				2966	while (p) {
				2967	req = rb_entry(p, struct ceph_mds_request, r_node);
				2968	p = rb_next(p);
				2969	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
				2970	continue;
				2971	if (req->r_attempts == 0)
				2972	continue; /* only old requests */
				2973	if (req->r_session &&
				2974	req->r_session->s_mds == session->s_mds) {
				2975	err = __prepare_send_request(mdsc, req,
				2976	session->s_mds, true);
				2977	if (!err) {
				2978	ceph_msg_get(req->r_request);
				2979	ceph_con_send(&session->s_con, req->r_request);
				2980	}
				2981	}
				2982	}
				2983	mutex_unlock(&mdsc->mutex);
				2984	}
				2985
				2986	/*
				2987	* Encode information about a cap for a reconnect with the MDS.
				2988	*/
				2989	static int encode_caps_cb(struct inode inode, struct ceph_cap cap,
				2990	void *arg)
				2991	{
				2992	union {
				2993	struct ceph_mds_cap_reconnect v2;
				2994	struct ceph_mds_cap_reconnect_v1 v1;
				2995	} rec;
				2996	struct ceph_inode_info *ci = cap->ci;
				2997	struct ceph_reconnect_state *recon_state = arg;
				2998	struct ceph_pagelist *pagelist = recon_state->pagelist;
				2999	char *path;
				3000	int pathlen, err;
				3001	u64 pathbase;
				3002	u64 snap_follows;
				3003	struct dentry *dentry;
				3004
				3005	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
				3006	inode, ceph_vinop(inode), cap, cap->cap_id,
				3007	ceph_cap_string(cap->issued));
				3008	err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
				3009	if (err)
				3010	return err;
				3011
				3012	dentry = d_find_alias(inode);
				3013	if (dentry) {
				3014	path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
				3015	if (IS_ERR(path)) {
				3016	err = PTR_ERR(path);
				3017	goto out_dput;
				3018	}
				3019	} else {
				3020	path = NULL;
				3021	pathlen = 0;
				3022	pathbase = 0;
				3023	}
				3024
				3025	spin_lock(&ci->i_ceph_lock);
				3026	cap->seq = 0; /* reset cap seq */
				3027	cap->issue_seq = 0; /* and issue_seq */
				3028	cap->mseq = 0; /* and migrate_seq */
				3029	cap->cap_gen = cap->session->s_cap_gen;
				3030
				3031	if (recon_state->msg_version >= 2) {
				3032	rec.v2.cap_id = cpu_to_le64(cap->cap_id);
				3033	rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
				3034	rec.v2.issued = cpu_to_le32(cap->issued);
				3035	rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
				3036	rec.v2.pathbase = cpu_to_le64(pathbase);
				3037	rec.v2.flock_len = (__force __le32)
				3038	((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
				3039	} else {
				3040	rec.v1.cap_id = cpu_to_le64(cap->cap_id);
				3041	rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
				3042	rec.v1.issued = cpu_to_le32(cap->issued);
				3043	rec.v1.size = cpu_to_le64(inode->i_size);
				3044	ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
				3045	ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
				3046	rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
				3047	rec.v1.pathbase = cpu_to_le64(pathbase);
				3048	}
				3049
				3050	if (list_empty(&ci->i_cap_snaps)) {
				3051	snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0;
				3052	} else {
				3053	struct ceph_cap_snap *capsnap =
				3054	list_first_entry(&ci->i_cap_snaps,
				3055	struct ceph_cap_snap, ci_item);
				3056	snap_follows = capsnap->follows;
				3057	}
				3058	spin_unlock(&ci->i_ceph_lock);
				3059
				3060	if (recon_state->msg_version >= 2) {
				3061	int num_fcntl_locks, num_flock_locks;
				3062	struct ceph_filelock *flocks = NULL;
				3063	size_t struct_len, total_len = 0;
				3064	u8 struct_v = 0;
				3065
				3066	encode_again:
				3067	if (rec.v2.flock_len) {
				3068	ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
				3069	} else {
				3070	num_fcntl_locks = 0;
				3071	num_flock_locks = 0;
				3072	}
				3073	if (num_fcntl_locks + num_flock_locks > 0) {
				3074	flocks = kmalloc_array(num_fcntl_locks + num_flock_locks,
				3075	sizeof(struct ceph_filelock),
				3076	GFP_NOFS);
				3077	if (!flocks) {
				3078	err = -ENOMEM;
				3079	goto out_free;
				3080	}
				3081	err = ceph_encode_locks_to_buffer(inode, flocks,
				3082	num_fcntl_locks,
				3083	num_flock_locks);
				3084	if (err) {
				3085	kfree(flocks);
				3086	flocks = NULL;
				3087	if (err == -ENOSPC)
				3088	goto encode_again;
				3089	goto out_free;
				3090	}
				3091	} else {
				3092	kfree(flocks);
				3093	flocks = NULL;
				3094	}
				3095
				3096	if (recon_state->msg_version >= 3) {
				3097	/* version, compat_version and struct_len */
				3098	total_len = 2 * sizeof(u8) + sizeof(u32);
				3099	struct_v = 2;
				3100	}
				3101	/*
				3102	* number of encoded locks is stable, so copy to pagelist
				3103	*/
				3104	struct_len = 2 * sizeof(u32) +
				3105	(num_fcntl_locks + num_flock_locks) *
				3106	sizeof(struct ceph_filelock);
				3107	rec.v2.flock_len = cpu_to_le32(struct_len);
				3108
				3109	struct_len += sizeof(rec.v2);
				3110	struct_len += sizeof(u32) + pathlen;
				3111
				3112	if (struct_v >= 2)
				3113	struct_len += sizeof(u64); /* snap_follows */
				3114
				3115	total_len += struct_len;
				3116	err = ceph_pagelist_reserve(pagelist, total_len);
				3117
				3118	if (!err) {
				3119	if (recon_state->msg_version >= 3) {
				3120	ceph_pagelist_encode_8(pagelist, struct_v);
				3121	ceph_pagelist_encode_8(pagelist, 1);
				3122	ceph_pagelist_encode_32(pagelist, struct_len);
				3123	}
				3124	ceph_pagelist_encode_string(pagelist, path, pathlen);
				3125	ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
				3126	ceph_locks_to_pagelist(flocks, pagelist,
				3127	num_fcntl_locks,
				3128	num_flock_locks);
				3129	if (struct_v >= 2)
				3130	ceph_pagelist_encode_64(pagelist, snap_follows);
				3131	}
				3132	kfree(flocks);
				3133	} else {
				3134	size_t size = sizeof(u32) + pathlen + sizeof(rec.v1);
				3135	err = ceph_pagelist_reserve(pagelist, size);
				3136	if (!err) {
				3137	ceph_pagelist_encode_string(pagelist, path, pathlen);
				3138	ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
				3139	}
				3140	}
				3141
				3142	recon_state->nr_caps++;
				3143	out_free:
				3144	kfree(path);
				3145	out_dput:
				3146	dput(dentry);
				3147	return err;
				3148	}
				3149
				3150
				3151	/*
				3152	* If an MDS fails and recovers, clients need to reconnect in order to
				3153	* reestablish shared state. This includes all caps issued through
				3154	* this session _and_ the snap_realm hierarchy. Because it's not
				3155	* clear which snap realms the mds cares about, we send everything we
				3156	* know about.. that ensures we'll then get any new info the
				3157	* recovering MDS might have.
				3158	*
				3159	* This is a relatively heavyweight operation, but it's rare.
				3160	*
				3161	* called with mdsc->mutex held.
				3162	*/
				3163	static void send_mds_reconnect(struct ceph_mds_client *mdsc,
				3164	struct ceph_mds_session *session)
				3165	{
				3166	struct ceph_msg *reply;
				3167	struct rb_node *p;
				3168	int mds = session->s_mds;
				3169	int err = -ENOMEM;
				3170	int s_nr_caps;
				3171	struct ceph_pagelist *pagelist;
				3172	struct ceph_reconnect_state recon_state;
				3173	LIST_HEAD(dispose);
				3174
				3175	pr_info("mds%d reconnect start\n", mds);
				3176
				3177	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
				3178	if (!pagelist)
				3179	goto fail_nopagelist;
				3180	ceph_pagelist_init(pagelist);
				3181
				3182	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
				3183	if (!reply)
				3184	goto fail_nomsg;
				3185
				3186	mutex_lock(&session->s_mutex);
				3187	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
				3188	session->s_seq = 0;
				3189
				3190	dout("session %p state %s\n", session,
				3191	ceph_session_state_name(session->s_state));
				3192
				3193	spin_lock(&session->s_gen_ttl_lock);
				3194	session->s_cap_gen++;
				3195	spin_unlock(&session->s_gen_ttl_lock);
				3196
				3197	spin_lock(&session->s_cap_lock);
				3198	/* don't know if session is readonly */
				3199	session->s_readonly = 0;
				3200	/*
				3201	* notify __ceph_remove_cap() that we are composing cap reconnect.
				3202	* If a cap get released before being added to the cap reconnect,
				3203	* __ceph_remove_cap() should skip queuing cap release.
				3204	*/
				3205	session->s_cap_reconnect = 1;
				3206	/* drop old cap expires; we're about to reestablish that state */
				3207	detach_cap_releases(session, &dispose);
				3208	spin_unlock(&session->s_cap_lock);
				3209	dispose_cap_releases(mdsc, &dispose);
				3210
				3211	/* trim unused caps to reduce MDS's cache rejoin time */
				3212	if (mdsc->fsc->sb->s_root)
				3213	shrink_dcache_parent(mdsc->fsc->sb->s_root);
				3214
				3215	ceph_con_close(&session->s_con);
				3216	ceph_con_open(&session->s_con,
				3217	CEPH_ENTITY_TYPE_MDS, mds,
				3218	ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
				3219
				3220	/* replay unsafe requests */
				3221	replay_unsafe_requests(mdsc, session);
				3222
				3223	down_read(&mdsc->snap_rwsem);
				3224
				3225	/* traverse this session's caps */
				3226	s_nr_caps = session->s_nr_caps;
				3227	err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
				3228	if (err)
				3229	goto fail;
				3230
				3231	recon_state.nr_caps = 0;
				3232	recon_state.pagelist = pagelist;
				3233	if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
				3234	recon_state.msg_version = 3;
				3235	else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK)
				3236	recon_state.msg_version = 2;
				3237	else
				3238	recon_state.msg_version = 1;
				3239	err = iterate_session_caps(session, encode_caps_cb, &recon_state);
				3240	if (err < 0)
				3241	goto fail;
				3242
				3243	spin_lock(&session->s_cap_lock);
				3244	session->s_cap_reconnect = 0;
				3245	spin_unlock(&session->s_cap_lock);
				3246
				3247	/*
				3248	* snaprealms. we provide mds with the ino, seq (version), and
				3249	* parent for all of our realms. If the mds has any newer info,
				3250	* it will tell us.
				3251	*/
				3252	for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
				3253	struct ceph_snap_realm *realm =
				3254	rb_entry(p, struct ceph_snap_realm, node);
				3255	struct ceph_mds_snaprealm_reconnect sr_rec;
				3256
				3257	dout(" adding snap realm %llx seq %lld parent %llx\n",
				3258	realm->ino, realm->seq, realm->parent_ino);
				3259	sr_rec.ino = cpu_to_le64(realm->ino);
				3260	sr_rec.seq = cpu_to_le64(realm->seq);
				3261	sr_rec.parent = cpu_to_le64(realm->parent_ino);
				3262	err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
				3263	if (err)
				3264	goto fail;
				3265	}
				3266
				3267	reply->hdr.version = cpu_to_le16(recon_state.msg_version);
				3268
				3269	/* raced with cap release? */
				3270	if (s_nr_caps != recon_state.nr_caps) {
				3271	struct page *page = list_first_entry(&pagelist->head,
				3272	struct page, lru);
				3273	__le32 *addr = kmap_atomic(page);
				3274	*addr = cpu_to_le32(recon_state.nr_caps);
				3275	kunmap_atomic(addr);
				3276	}
				3277
				3278	reply->hdr.data_len = cpu_to_le32(pagelist->length);
				3279	ceph_msg_data_add_pagelist(reply, pagelist);
				3280
				3281	ceph_early_kick_flushing_caps(mdsc, session);
				3282
				3283	ceph_con_send(&session->s_con, reply);
				3284
				3285	mutex_unlock(&session->s_mutex);
				3286
				3287	mutex_lock(&mdsc->mutex);
				3288	__wake_requests(mdsc, &session->s_waiting);
				3289	mutex_unlock(&mdsc->mutex);
				3290
				3291	up_read(&mdsc->snap_rwsem);
				3292	return;
				3293
				3294	fail:
				3295	ceph_msg_put(reply);
				3296	up_read(&mdsc->snap_rwsem);
				3297	mutex_unlock(&session->s_mutex);
				3298	fail_nomsg:
				3299	ceph_pagelist_release(pagelist);
				3300	fail_nopagelist:
				3301	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
				3302	return;
				3303	}
				3304
				3305
				3306	/*
				3307	* compare old and new mdsmaps, kicking requests
				3308	* and closing out old connections as necessary
				3309	*
				3310	* called under mdsc->mutex.
				3311	*/
				3312	static void check_new_map(struct ceph_mds_client *mdsc,
				3313	struct ceph_mdsmap *newmap,
				3314	struct ceph_mdsmap *oldmap)
				3315	{
				3316	int i;
				3317	int oldstate, newstate;
				3318	struct ceph_mds_session *s;
				3319
				3320	dout("check_new_map new %u old %u\n",
				3321	newmap->m_epoch, oldmap->m_epoch);
				3322
				3323	for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) {
				3324	if (!mdsc->sessions[i])
				3325	continue;
				3326	s = mdsc->sessions[i];
				3327	oldstate = ceph_mdsmap_get_state(oldmap, i);
				3328	newstate = ceph_mdsmap_get_state(newmap, i);
				3329
				3330	dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
				3331	i, ceph_mds_state_name(oldstate),
				3332	ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
				3333	ceph_mds_state_name(newstate),
				3334	ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
				3335	ceph_session_state_name(s->s_state));
				3336
				3337	if (i >= newmap->m_num_mds \|\|
				3338	memcmp(ceph_mdsmap_get_addr(oldmap, i),
				3339	ceph_mdsmap_get_addr(newmap, i),
				3340	sizeof(struct ceph_entity_addr))) {
				3341	if (s->s_state == CEPH_MDS_SESSION_OPENING) {
				3342	/* the session never opened, just close it
				3343	* out now */
				3344	get_session(s);
				3345	__unregister_session(mdsc, s);
				3346	__wake_requests(mdsc, &s->s_waiting);
				3347	ceph_put_mds_session(s);
				3348	} else if (i >= newmap->m_num_mds) {
				3349	/* force close session for stopped mds */
				3350	get_session(s);
				3351	__unregister_session(mdsc, s);
				3352	__wake_requests(mdsc, &s->s_waiting);
				3353	kick_requests(mdsc, i);
				3354	mutex_unlock(&mdsc->mutex);
				3355
				3356	mutex_lock(&s->s_mutex);
				3357	cleanup_session_requests(mdsc, s);
				3358	remove_session_caps(s);
				3359	mutex_unlock(&s->s_mutex);
				3360
				3361	ceph_put_mds_session(s);
				3362
				3363	mutex_lock(&mdsc->mutex);
				3364	} else {
				3365	/* just close it */
				3366	mutex_unlock(&mdsc->mutex);
				3367	mutex_lock(&s->s_mutex);
				3368	mutex_lock(&mdsc->mutex);
				3369	ceph_con_close(&s->s_con);
				3370	mutex_unlock(&s->s_mutex);
				3371	s->s_state = CEPH_MDS_SESSION_RESTARTING;
				3372	}
				3373	} else if (oldstate == newstate) {
				3374	continue; /* nothing new with this mds */
				3375	}
				3376
				3377	/*
				3378	* send reconnect?
				3379	*/
				3380	if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
				3381	newstate >= CEPH_MDS_STATE_RECONNECT) {
				3382	mutex_unlock(&mdsc->mutex);
				3383	send_mds_reconnect(mdsc, s);
				3384	mutex_lock(&mdsc->mutex);
				3385	}
				3386
				3387	/*
				3388	* kick request on any mds that has gone active.
				3389	*/
				3390	if (oldstate < CEPH_MDS_STATE_ACTIVE &&
				3391	newstate >= CEPH_MDS_STATE_ACTIVE) {
				3392	if (oldstate != CEPH_MDS_STATE_CREATING &&
				3393	oldstate != CEPH_MDS_STATE_STARTING)
				3394	pr_info("mds%d recovery completed\n", s->s_mds);
				3395	kick_requests(mdsc, i);
				3396	ceph_kick_flushing_caps(mdsc, s);
				3397	wake_up_session_caps(s, 1);
				3398	}
				3399	}
				3400
				3401	for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) {
				3402	s = mdsc->sessions[i];
				3403	if (!s)
				3404	continue;
				3405	if (!ceph_mdsmap_is_laggy(newmap, i))
				3406	continue;
				3407	if (s->s_state == CEPH_MDS_SESSION_OPEN \|\|
				3408	s->s_state == CEPH_MDS_SESSION_HUNG \|\|
				3409	s->s_state == CEPH_MDS_SESSION_CLOSING) {
				3410	dout(" connecting to export targets of laggy mds%d\n",
				3411	i);
				3412	__open_export_target_sessions(mdsc, s);
				3413	}
				3414	}
				3415	}
				3416
				3417
				3418
				3419	/*
				3420	* leases
				3421	*/
				3422
				3423	/*
				3424	* caller must hold session s_mutex, dentry->d_lock
				3425	*/
				3426	void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
				3427	{
				3428	struct ceph_dentry_info *di = ceph_dentry(dentry);
				3429
				3430	ceph_put_mds_session(di->lease_session);
				3431	di->lease_session = NULL;
				3432	}
				3433
				3434	static void handle_lease(struct ceph_mds_client *mdsc,
				3435	struct ceph_mds_session *session,
				3436	struct ceph_msg *msg)
				3437	{
				3438	struct super_block *sb = mdsc->fsc->sb;
				3439	struct inode *inode;
				3440	struct dentry parent, dentry;
				3441	struct ceph_dentry_info *di;
				3442	int mds = session->s_mds;
				3443	struct ceph_mds_lease *h = msg->front.iov_base;
				3444	u32 seq;
				3445	struct ceph_vino vino;
				3446	struct qstr dname;
				3447	int release = 0;
				3448
				3449	dout("handle_lease from mds%d\n", mds);
				3450
				3451	/* decode */
				3452	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
				3453	goto bad;
				3454	vino.ino = le64_to_cpu(h->ino);
				3455	vino.snap = CEPH_NOSNAP;
				3456	seq = le32_to_cpu(h->seq);
				3457	dname.len = get_unaligned_le32(h + 1);
				3458	if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len)
				3459	goto bad;
				3460	dname.name = (void *)(h + 1) + sizeof(u32);
				3461
				3462	/* lookup inode */
				3463	inode = ceph_find_inode(sb, vino);
				3464	dout("handle_lease %s, ino %llx %p %.*s\n",
				3465	ceph_lease_op_name(h->action), vino.ino, inode,
				3466	dname.len, dname.name);
				3467
				3468	mutex_lock(&session->s_mutex);
				3469	session->s_seq++;
				3470
				3471	if (!inode) {
				3472	dout("handle_lease no inode %llx\n", vino.ino);
				3473	goto release;
				3474	}
				3475
				3476	/* dentry */
				3477	parent = d_find_alias(inode);
				3478	if (!parent) {
				3479	dout("no parent dentry on inode %p\n", inode);
				3480	WARN_ON(1);
				3481	goto release; /* hrm... */
				3482	}
				3483	dname.hash = full_name_hash(parent, dname.name, dname.len);
				3484	dentry = d_lookup(parent, &dname);
				3485	dput(parent);
				3486	if (!dentry)
				3487	goto release;
				3488
				3489	spin_lock(&dentry->d_lock);
				3490	di = ceph_dentry(dentry);
				3491	switch (h->action) {
				3492	case CEPH_MDS_LEASE_REVOKE:
				3493	if (di->lease_session == session) {
				3494	if (ceph_seq_cmp(di->lease_seq, seq) > 0)
				3495	h->seq = cpu_to_le32(di->lease_seq);
				3496	__ceph_mdsc_drop_dentry_lease(dentry);
				3497	}
				3498	release = 1;
				3499	break;
				3500
				3501	case CEPH_MDS_LEASE_RENEW:
				3502	if (di->lease_session == session &&
				3503	di->lease_gen == session->s_cap_gen &&
				3504	di->lease_renew_from &&
				3505	di->lease_renew_after == 0) {
				3506	unsigned long duration =
				3507	msecs_to_jiffies(le32_to_cpu(h->duration_ms));
				3508
				3509	di->lease_seq = seq;
				3510	di->time = di->lease_renew_from + duration;
				3511	di->lease_renew_after = di->lease_renew_from +
				3512	(duration >> 1);
				3513	di->lease_renew_from = 0;
				3514	}
				3515	break;
				3516	}
				3517	spin_unlock(&dentry->d_lock);
				3518	dput(dentry);
				3519
				3520	if (!release)
				3521	goto out;
				3522
				3523	release:
				3524	/* let's just reuse the same message */
				3525	h->action = CEPH_MDS_LEASE_REVOKE_ACK;
				3526	ceph_msg_get(msg);
				3527	ceph_con_send(&session->s_con, msg);
				3528
				3529	out:
				3530	iput(inode);
				3531	mutex_unlock(&session->s_mutex);
				3532	return;
				3533
				3534	bad:
				3535	pr_err("corrupt lease message\n");
				3536	ceph_msg_dump(msg);
				3537	}
				3538
				3539	void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
				3540	struct inode *inode,
				3541	struct dentry *dentry, char action,
				3542	u32 seq)
				3543	{
				3544	struct ceph_msg *msg;
				3545	struct ceph_mds_lease *lease;
				3546	int len = sizeof(*lease) + sizeof(u32);
				3547	int dnamelen = 0;
				3548
				3549	dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
				3550	inode, dentry, ceph_lease_op_name(action), session->s_mds);
				3551	dnamelen = dentry->d_name.len;
				3552	len += dnamelen;
				3553
				3554	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
				3555	if (!msg)
				3556	return;
				3557	lease = msg->front.iov_base;
				3558	lease->action = action;
				3559	lease->ino = cpu_to_le64(ceph_vino(inode).ino);
				3560	lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
				3561	lease->seq = cpu_to_le32(seq);
				3562	put_unaligned_le32(dnamelen, lease + 1);
				3563	memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
				3564
				3565	/*
				3566	* if this is a preemptive lease RELEASE, no need to
				3567	* flush request stream, since the actual request will
				3568	* soon follow.
				3569	*/
				3570	msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
				3571
				3572	ceph_con_send(&session->s_con, msg);
				3573	}
				3574
				3575	/*
				3576	* lock unlock sessions, to wait ongoing session activities
				3577	*/
				3578	static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
				3579	{
				3580	int i;
				3581
				3582	mutex_lock(&mdsc->mutex);
				3583	for (i = 0; i < mdsc->max_sessions; i++) {
				3584	struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
				3585	if (!s)
				3586	continue;
				3587	mutex_unlock(&mdsc->mutex);
				3588	mutex_lock(&s->s_mutex);
				3589	mutex_unlock(&s->s_mutex);
				3590	ceph_put_mds_session(s);
				3591	mutex_lock(&mdsc->mutex);
				3592	}
				3593	mutex_unlock(&mdsc->mutex);
				3594	}
				3595
				3596
				3597
				3598	/*
				3599	* delayed work -- periodically trim expired leases, renew caps with mds
				3600	*/
				3601	static void schedule_delayed(struct ceph_mds_client *mdsc)
				3602	{
				3603	int delay = 5;
				3604	unsigned hz = round_jiffies_relative(HZ * delay);
				3605	schedule_delayed_work(&mdsc->delayed_work, hz);
				3606	}
				3607
				3608	static void delayed_work(struct work_struct *work)
				3609	{
				3610	int i;
				3611	struct ceph_mds_client *mdsc =
				3612	container_of(work, struct ceph_mds_client, delayed_work.work);
				3613	int renew_interval;
				3614	int renew_caps;
				3615
				3616	dout("mdsc delayed_work\n");
				3617	ceph_check_delayed_caps(mdsc);
				3618
				3619	mutex_lock(&mdsc->mutex);
				3620	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
				3621	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
				3622	mdsc->last_renew_caps);
				3623	if (renew_caps)
				3624	mdsc->last_renew_caps = jiffies;
				3625
				3626	for (i = 0; i < mdsc->max_sessions; i++) {
				3627	struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
				3628	if (!s)
				3629	continue;
				3630	if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
				3631	dout("resending session close request for mds%d\n",
				3632	s->s_mds);
				3633	request_close_session(mdsc, s);
				3634	ceph_put_mds_session(s);
				3635	continue;
				3636	}
				3637	if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
				3638	if (s->s_state == CEPH_MDS_SESSION_OPEN) {
				3639	s->s_state = CEPH_MDS_SESSION_HUNG;
				3640	pr_info("mds%d hung\n", s->s_mds);
				3641	}
				3642	}
				3643	if (s->s_state == CEPH_MDS_SESSION_NEW \|\|
				3644	s->s_state == CEPH_MDS_SESSION_RESTARTING \|\|
				3645	s->s_state == CEPH_MDS_SESSION_REJECTED) {
				3646	/* this mds is failed or recovering, just wait */
				3647	ceph_put_mds_session(s);
				3648	continue;
				3649	}
				3650	mutex_unlock(&mdsc->mutex);
				3651
				3652	mutex_lock(&s->s_mutex);
				3653	if (renew_caps)
				3654	send_renew_caps(mdsc, s);
				3655	else
				3656	ceph_con_keepalive(&s->s_con);
				3657	if (s->s_state == CEPH_MDS_SESSION_OPEN \|\|
				3658	s->s_state == CEPH_MDS_SESSION_HUNG)
				3659	ceph_send_cap_releases(mdsc, s);
				3660	mutex_unlock(&s->s_mutex);
				3661	ceph_put_mds_session(s);
				3662
				3663	mutex_lock(&mdsc->mutex);
				3664	}
				3665	mutex_unlock(&mdsc->mutex);
				3666
				3667	schedule_delayed(mdsc);
				3668	}
				3669
				3670	int ceph_mdsc_init(struct ceph_fs_client *fsc)
				3671
				3672	{
				3673	struct ceph_mds_client *mdsc;
				3674
				3675	mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
				3676	if (!mdsc)
				3677	return -ENOMEM;
				3678	mdsc->fsc = fsc;
				3679	mutex_init(&mdsc->mutex);
				3680	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
				3681	if (!mdsc->mdsmap) {
				3682	kfree(mdsc);
				3683	return -ENOMEM;
				3684	}
				3685
				3686	fsc->mdsc = mdsc;
				3687	init_completion(&mdsc->safe_umount_waiters);
				3688	init_waitqueue_head(&mdsc->session_close_wq);
				3689	INIT_LIST_HEAD(&mdsc->waiting_for_map);
				3690	mdsc->sessions = NULL;
				3691	atomic_set(&mdsc->num_sessions, 0);
				3692	mdsc->max_sessions = 0;
				3693	mdsc->stopping = 0;
				3694	atomic64_set(&mdsc->quotarealms_count, 0);
				3695	mdsc->last_snap_seq = 0;
				3696	init_rwsem(&mdsc->snap_rwsem);
				3697	mdsc->snap_realms = RB_ROOT;
				3698	INIT_LIST_HEAD(&mdsc->snap_empty);
				3699	spin_lock_init(&mdsc->snap_empty_lock);
				3700	mdsc->last_tid = 0;
				3701	mdsc->oldest_tid = 0;
				3702	mdsc->request_tree = RB_ROOT;
				3703	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
				3704	mdsc->last_renew_caps = jiffies;
				3705	INIT_LIST_HEAD(&mdsc->cap_delay_list);
				3706	spin_lock_init(&mdsc->cap_delay_lock);
				3707	INIT_LIST_HEAD(&mdsc->snap_flush_list);
				3708	spin_lock_init(&mdsc->snap_flush_lock);
				3709	mdsc->last_cap_flush_tid = 1;
				3710	INIT_LIST_HEAD(&mdsc->cap_flush_list);
				3711	INIT_LIST_HEAD(&mdsc->cap_dirty);
				3712	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
				3713	mdsc->num_cap_flushing = 0;
				3714	spin_lock_init(&mdsc->cap_dirty_lock);
				3715	init_waitqueue_head(&mdsc->cap_flushing_wq);
				3716	spin_lock_init(&mdsc->dentry_lru_lock);
				3717	INIT_LIST_HEAD(&mdsc->dentry_lru);
				3718
				3719	ceph_caps_init(mdsc);
				3720	ceph_adjust_min_caps(mdsc, fsc->min_caps);
				3721
				3722	init_rwsem(&mdsc->pool_perm_rwsem);
				3723	mdsc->pool_perm_tree = RB_ROOT;
				3724
				3725	strscpy(mdsc->nodename, utsname()->nodename,
				3726	sizeof(mdsc->nodename));
				3727	return 0;
				3728	}
				3729
				3730	/*
				3731	* Wait for safe replies on open mds requests. If we time out, drop
				3732	* all requests from the tree to avoid dangling dentry refs.
				3733	*/
				3734	static void wait_requests(struct ceph_mds_client *mdsc)
				3735	{
				3736	struct ceph_options *opts = mdsc->fsc->client->options;
				3737	struct ceph_mds_request *req;
				3738
				3739	mutex_lock(&mdsc->mutex);
				3740	if (__get_oldest_req(mdsc)) {
				3741	mutex_unlock(&mdsc->mutex);
				3742
				3743	dout("wait_requests waiting for requests\n");
				3744	wait_for_completion_timeout(&mdsc->safe_umount_waiters,
				3745	ceph_timeout_jiffies(opts->mount_timeout));
				3746
				3747	/* tear down remaining requests */
				3748	mutex_lock(&mdsc->mutex);
				3749	while ((req = __get_oldest_req(mdsc))) {
				3750	dout("wait_requests timed out on tid %llu\n",
				3751	req->r_tid);
				3752	__unregister_request(mdsc, req);
				3753	}
				3754	}
				3755	mutex_unlock(&mdsc->mutex);
				3756	dout("wait_requests done\n");
				3757	}
				3758
				3759	/*
				3760	* called before mount is ro, and before dentries are torn down.
				3761	* (hmm, does this still race with new lookups?)
				3762	*/
				3763	void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
				3764	{
				3765	dout("pre_umount\n");
				3766	mdsc->stopping = 1;
				3767
				3768	lock_unlock_sessions(mdsc);
				3769	ceph_flush_dirty_caps(mdsc);
				3770	wait_requests(mdsc);
				3771
				3772	/*
				3773	* wait for reply handlers to drop their request refs and
				3774	* their inode/dcache refs
				3775	*/
				3776	ceph_msgr_flush();
				3777	}
				3778
				3779	/*
				3780	* wait for all write mds requests to flush.
				3781	*/
				3782	static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
				3783	{
				3784	struct ceph_mds_request req = NULL, nextreq;
				3785	struct rb_node *n;
				3786
				3787	mutex_lock(&mdsc->mutex);
				3788	dout("wait_unsafe_requests want %lld\n", want_tid);
				3789	restart:
				3790	req = __get_oldest_req(mdsc);
				3791	while (req && req->r_tid <= want_tid) {
				3792	/* find next request */
				3793	n = rb_next(&req->r_node);
				3794	if (n)
				3795	nextreq = rb_entry(n, struct ceph_mds_request, r_node);
				3796	else
				3797	nextreq = NULL;
				3798	if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
				3799	(req->r_op & CEPH_MDS_OP_WRITE)) {
				3800	/* write op */
				3801	ceph_mdsc_get_request(req);
				3802	if (nextreq)
				3803	ceph_mdsc_get_request(nextreq);
				3804	mutex_unlock(&mdsc->mutex);
				3805	dout("wait_unsafe_requests wait on %llu (want %llu)\n",
				3806	req->r_tid, want_tid);
				3807	wait_for_completion(&req->r_safe_completion);
				3808	mutex_lock(&mdsc->mutex);
				3809	ceph_mdsc_put_request(req);
				3810	if (!nextreq)
				3811	break; /* next dne before, so we're done! */
				3812	if (RB_EMPTY_NODE(&nextreq->r_node)) {
				3813	/* next request was removed from tree */
				3814	ceph_mdsc_put_request(nextreq);
				3815	goto restart;
				3816	}
				3817	ceph_mdsc_put_request(nextreq); /* won't go away */
				3818	}
				3819	req = nextreq;
				3820	}
				3821	mutex_unlock(&mdsc->mutex);
				3822	dout("wait_unsafe_requests done\n");
				3823	}
				3824
				3825	void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
				3826	{
				3827	u64 want_tid, want_flush;
				3828
				3829	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
				3830	return;
				3831
				3832	dout("sync\n");
				3833	mutex_lock(&mdsc->mutex);
				3834	want_tid = mdsc->last_tid;
				3835	mutex_unlock(&mdsc->mutex);
				3836
				3837	ceph_flush_dirty_caps(mdsc);
				3838	spin_lock(&mdsc->cap_dirty_lock);
				3839	want_flush = mdsc->last_cap_flush_tid;
				3840	if (!list_empty(&mdsc->cap_flush_list)) {
				3841	struct ceph_cap_flush *cf =
				3842	list_last_entry(&mdsc->cap_flush_list,
				3843	struct ceph_cap_flush, g_list);
				3844	cf->wake = true;
				3845	}
				3846	spin_unlock(&mdsc->cap_dirty_lock);
				3847
				3848	dout("sync want tid %lld flush_seq %lld\n",
				3849	want_tid, want_flush);
				3850
				3851	wait_unsafe_requests(mdsc, want_tid);
				3852	wait_caps_flush(mdsc, want_flush);
				3853	}
				3854
				3855	/*
				3856	* true if all sessions are closed, or we force unmount
				3857	*/
				3858	static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
				3859	{
				3860	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
				3861	return true;
				3862	return atomic_read(&mdsc->num_sessions) <= skipped;
				3863	}
				3864
				3865	/*
				3866	* called after sb is ro.
				3867	*/
				3868	void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
				3869	{
				3870	struct ceph_options *opts = mdsc->fsc->client->options;
				3871	struct ceph_mds_session *session;
				3872	int i;
				3873	int skipped = 0;
				3874
				3875	dout("close_sessions\n");
				3876
				3877	/* close sessions */
				3878	mutex_lock(&mdsc->mutex);
				3879	for (i = 0; i < mdsc->max_sessions; i++) {
				3880	session = __ceph_lookup_mds_session(mdsc, i);
				3881	if (!session)
				3882	continue;
				3883	mutex_unlock(&mdsc->mutex);
				3884	mutex_lock(&session->s_mutex);
				3885	if (__close_session(mdsc, session) <= 0)
				3886	skipped++;
				3887	mutex_unlock(&session->s_mutex);
				3888	ceph_put_mds_session(session);
				3889	mutex_lock(&mdsc->mutex);
				3890	}
				3891	mutex_unlock(&mdsc->mutex);
				3892
				3893	dout("waiting for sessions to close\n");
				3894	wait_event_timeout(mdsc->session_close_wq,
				3895	done_closing_sessions(mdsc, skipped),
				3896	ceph_timeout_jiffies(opts->mount_timeout));
				3897
				3898	/* tear down remaining sessions */
				3899	mutex_lock(&mdsc->mutex);
				3900	for (i = 0; i < mdsc->max_sessions; i++) {
				3901	if (mdsc->sessions[i]) {
				3902	session = get_session(mdsc->sessions[i]);
				3903	__unregister_session(mdsc, session);
				3904	mutex_unlock(&mdsc->mutex);
				3905	mutex_lock(&session->s_mutex);
				3906	remove_session_caps(session);
				3907	mutex_unlock(&session->s_mutex);
				3908	ceph_put_mds_session(session);
				3909	mutex_lock(&mdsc->mutex);
				3910	}
				3911	}
				3912	WARN_ON(!list_empty(&mdsc->cap_delay_list));
				3913	mutex_unlock(&mdsc->mutex);
				3914
				3915	ceph_cleanup_empty_realms(mdsc);
				3916
				3917	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
				3918
				3919	dout("stopped\n");
				3920	}
				3921
				3922	void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
				3923	{
				3924	struct ceph_mds_session *session;
				3925	int mds;
				3926
				3927	dout("force umount\n");
				3928
				3929	mutex_lock(&mdsc->mutex);
				3930	for (mds = 0; mds < mdsc->max_sessions; mds++) {
				3931	session = __ceph_lookup_mds_session(mdsc, mds);
				3932	if (!session)
				3933	continue;
				3934	mutex_unlock(&mdsc->mutex);
				3935	mutex_lock(&session->s_mutex);
				3936	__close_session(mdsc, session);
				3937	if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
				3938	cleanup_session_requests(mdsc, session);
				3939	remove_session_caps(session);
				3940	}
				3941	mutex_unlock(&session->s_mutex);
				3942	ceph_put_mds_session(session);
				3943	mutex_lock(&mdsc->mutex);
				3944	kick_requests(mdsc, mds);
				3945	}
				3946	__wake_requests(mdsc, &mdsc->waiting_for_map);
				3947	mutex_unlock(&mdsc->mutex);
				3948	}
				3949
				3950	static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
				3951	{
				3952	dout("stop\n");
				3953	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
				3954	if (mdsc->mdsmap)
				3955	ceph_mdsmap_destroy(mdsc->mdsmap);
				3956	kfree(mdsc->sessions);
				3957	ceph_caps_finalize(mdsc);
				3958	ceph_pool_perm_destroy(mdsc);
				3959	}
				3960
				3961	void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
				3962	{
				3963	struct ceph_mds_client *mdsc = fsc->mdsc;
				3964	dout("mdsc_destroy %p\n", mdsc);
				3965
				3966	if (!mdsc)
				3967	return;
				3968
				3969	/* flush out any connection work with references to us */
				3970	ceph_msgr_flush();
				3971
				3972	ceph_mdsc_stop(mdsc);
				3973
				3974	fsc->mdsc = NULL;
				3975	kfree(mdsc);
				3976	dout("mdsc_destroy %p done\n", mdsc);
				3977	}
				3978
				3979	void ceph_mdsc_handle_fsmap(struct ceph_mds_client mdsc, struct ceph_msg msg)
				3980	{
				3981	struct ceph_fs_client *fsc = mdsc->fsc;
				3982	const char *mds_namespace = fsc->mount_options->mds_namespace;
				3983	void *p = msg->front.iov_base;
				3984	void *end = p + msg->front.iov_len;
				3985	u32 epoch;
				3986	u32 map_len;
				3987	u32 num_fs;
				3988	u32 mount_fscid = (u32)-1;
				3989	u8 struct_v, struct_cv;
				3990	int err = -EINVAL;
				3991
				3992	ceph_decode_need(&p, end, sizeof(u32), bad);
				3993	epoch = ceph_decode_32(&p);
				3994
				3995	dout("handle_fsmap epoch %u\n", epoch);
				3996
				3997	ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
				3998	struct_v = ceph_decode_8(&p);
				3999	struct_cv = ceph_decode_8(&p);
				4000	map_len = ceph_decode_32(&p);
				4001
				4002	ceph_decode_need(&p, end, sizeof(u32) * 3, bad);
				4003	p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */
				4004
				4005	num_fs = ceph_decode_32(&p);
				4006	while (num_fs-- > 0) {
				4007	void info_p, info_end;
				4008	u32 info_len;
				4009	u8 info_v, info_cv;
				4010	u32 fscid, namelen;
				4011
				4012	ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
				4013	info_v = ceph_decode_8(&p);
				4014	info_cv = ceph_decode_8(&p);
				4015	info_len = ceph_decode_32(&p);
				4016	ceph_decode_need(&p, end, info_len, bad);
				4017	info_p = p;
				4018	info_end = p + info_len;
				4019	p = info_end;
				4020
				4021	ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
				4022	fscid = ceph_decode_32(&info_p);
				4023	namelen = ceph_decode_32(&info_p);
				4024	ceph_decode_need(&info_p, info_end, namelen, bad);
				4025
				4026	if (mds_namespace &&
				4027	strlen(mds_namespace) == namelen &&
				4028	!strncmp(mds_namespace, (char *)info_p, namelen)) {
				4029	mount_fscid = fscid;
				4030	break;
				4031	}
				4032	}
				4033
				4034	ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
				4035	if (mount_fscid != (u32)-1) {
				4036	fsc->client->monc.fs_cluster_id = mount_fscid;
				4037	ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
				4038	0, true);
				4039	ceph_monc_renew_subs(&fsc->client->monc);
				4040	} else {
				4041	err = -ENOENT;
				4042	goto err_out;
				4043	}
				4044	return;
				4045
				4046	bad:
				4047	pr_err("error decoding fsmap\n");
				4048	err_out:
				4049	mutex_lock(&mdsc->mutex);
				4050	mdsc->mdsmap_err = err;
				4051	__wake_requests(mdsc, &mdsc->waiting_for_map);
				4052	mutex_unlock(&mdsc->mutex);
				4053	}
				4054
				4055	/*
				4056	* handle mds map update.
				4057	*/
				4058	void ceph_mdsc_handle_mdsmap(struct ceph_mds_client mdsc, struct ceph_msg msg)
				4059	{
				4060	u32 epoch;
				4061	u32 maplen;
				4062	void *p = msg->front.iov_base;
				4063	void *end = p + msg->front.iov_len;
				4064	struct ceph_mdsmap newmap, oldmap;
				4065	struct ceph_fsid fsid;
				4066	int err = -EINVAL;
				4067
				4068	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
				4069	ceph_decode_copy(&p, &fsid, sizeof(fsid));
				4070	if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
				4071	return;
				4072	epoch = ceph_decode_32(&p);
				4073	maplen = ceph_decode_32(&p);
				4074	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
				4075
				4076	/* do we need it? */
				4077	mutex_lock(&mdsc->mutex);
				4078	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
				4079	dout("handle_map epoch %u <= our %u\n",
				4080	epoch, mdsc->mdsmap->m_epoch);
				4081	mutex_unlock(&mdsc->mutex);
				4082	return;
				4083	}
				4084
				4085	newmap = ceph_mdsmap_decode(&p, end);
				4086	if (IS_ERR(newmap)) {
				4087	err = PTR_ERR(newmap);
				4088	goto bad_unlock;
				4089	}
				4090
				4091	/* swap into place */
				4092	if (mdsc->mdsmap) {
				4093	oldmap = mdsc->mdsmap;
				4094	mdsc->mdsmap = newmap;
				4095	check_new_map(mdsc, newmap, oldmap);
				4096	ceph_mdsmap_destroy(oldmap);
				4097	} else {
				4098	mdsc->mdsmap = newmap; /* first mds map */
				4099	}
				4100	mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size,
				4101	MAX_LFS_FILESIZE);
				4102
				4103	__wake_requests(mdsc, &mdsc->waiting_for_map);
				4104	ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
				4105	mdsc->mdsmap->m_epoch);
				4106
				4107	mutex_unlock(&mdsc->mutex);
				4108	schedule_delayed(mdsc);
				4109	return;
				4110
				4111	bad_unlock:
				4112	mutex_unlock(&mdsc->mutex);
				4113	bad:
				4114	pr_err("error decoding mdsmap %d\n", err);
				4115	return;
				4116	}
				4117
				4118	static struct ceph_connection con_get(struct ceph_connection con)
				4119	{
				4120	struct ceph_mds_session *s = con->private;
				4121
				4122	if (get_session(s)) {
				4123	dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref));
				4124	return con;
				4125	}
				4126	dout("mdsc con_get %p FAIL\n", s);
				4127	return NULL;
				4128	}
				4129
				4130	static void con_put(struct ceph_connection *con)
				4131	{
				4132	struct ceph_mds_session *s = con->private;
				4133
				4134	dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1);
				4135	ceph_put_mds_session(s);
				4136	}
				4137
				4138	/*
				4139	* if the client is unresponsive for long enough, the mds will kill
				4140	* the session entirely.
				4141	*/
				4142	static void peer_reset(struct ceph_connection *con)
				4143	{
				4144	struct ceph_mds_session *s = con->private;
				4145	struct ceph_mds_client *mdsc = s->s_mdsc;
				4146
				4147	pr_warn("mds%d closed our session\n", s->s_mds);
				4148	send_mds_reconnect(mdsc, s);
				4149	}
				4150
				4151	static void dispatch(struct ceph_connection con, struct ceph_msg msg)
				4152	{
				4153	struct ceph_mds_session *s = con->private;
				4154	struct ceph_mds_client *mdsc = s->s_mdsc;
				4155	int type = le16_to_cpu(msg->hdr.type);
				4156
				4157	mutex_lock(&mdsc->mutex);
				4158	if (__verify_registered_session(mdsc, s) < 0) {
				4159	mutex_unlock(&mdsc->mutex);
				4160	goto out;
				4161	}
				4162	mutex_unlock(&mdsc->mutex);
				4163
				4164	switch (type) {
				4165	case CEPH_MSG_MDS_MAP:
				4166	ceph_mdsc_handle_mdsmap(mdsc, msg);
				4167	break;
				4168	case CEPH_MSG_FS_MAP_USER:
				4169	ceph_mdsc_handle_fsmap(mdsc, msg);
				4170	break;
				4171	case CEPH_MSG_CLIENT_SESSION:
				4172	handle_session(s, msg);
				4173	break;
				4174	case CEPH_MSG_CLIENT_REPLY:
				4175	handle_reply(s, msg);
				4176	break;
				4177	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
				4178	handle_forward(mdsc, s, msg);
				4179	break;
				4180	case CEPH_MSG_CLIENT_CAPS:
				4181	ceph_handle_caps(s, msg);
				4182	break;
				4183	case CEPH_MSG_CLIENT_SNAP:
				4184	ceph_handle_snap(mdsc, s, msg);
				4185	break;
				4186	case CEPH_MSG_CLIENT_LEASE:
				4187	handle_lease(mdsc, s, msg);
				4188	break;
				4189	case CEPH_MSG_CLIENT_QUOTA:
				4190	ceph_handle_quota(mdsc, s, msg);
				4191	break;
				4192
				4193	default:
				4194	pr_err("received unknown message type %d %s\n", type,
				4195	ceph_msg_type_name(type));
				4196	}
				4197	out:
				4198	ceph_msg_put(msg);
				4199	}
				4200
				4201	/*
				4202	* authentication
				4203	*/
				4204
				4205	/*
				4206	* Note: returned pointer is the address of a structure that's
				4207	* managed separately. Caller must not attempt to free it.
				4208	*/
				4209	static struct ceph_auth_handshake get_authorizer(struct ceph_connection con,
				4210	int *proto, int force_new)
				4211	{
				4212	struct ceph_mds_session *s = con->private;
				4213	struct ceph_mds_client *mdsc = s->s_mdsc;
				4214	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
				4215	struct ceph_auth_handshake *auth = &s->s_auth;
				4216
				4217	if (force_new && auth->authorizer) {
				4218	ceph_auth_destroy_authorizer(auth->authorizer);
				4219	auth->authorizer = NULL;
				4220	}
				4221	if (!auth->authorizer) {
				4222	int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
				4223	auth);
				4224	if (ret)
				4225	return ERR_PTR(ret);
				4226	} else {
				4227	int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
				4228	auth);
				4229	if (ret)
				4230	return ERR_PTR(ret);
				4231	}
				4232	*proto = ac->protocol;
				4233
				4234	return auth;
				4235	}
				4236
				4237	static int add_authorizer_challenge(struct ceph_connection *con,
				4238	void *challenge_buf, int challenge_buf_len)
				4239	{
				4240	struct ceph_mds_session *s = con->private;
				4241	struct ceph_mds_client *mdsc = s->s_mdsc;
				4242	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
				4243
				4244	return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer,
				4245	challenge_buf, challenge_buf_len);
				4246	}
				4247
				4248	static int verify_authorizer_reply(struct ceph_connection *con)
				4249	{
				4250	struct ceph_mds_session *s = con->private;
				4251	struct ceph_mds_client *mdsc = s->s_mdsc;
				4252	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
				4253
				4254	return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer);
				4255	}
				4256
				4257	static int invalidate_authorizer(struct ceph_connection *con)
				4258	{
				4259	struct ceph_mds_session *s = con->private;
				4260	struct ceph_mds_client *mdsc = s->s_mdsc;
				4261	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
				4262
				4263	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
				4264
				4265	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
				4266	}
				4267
				4268	static struct ceph_msg mds_alloc_msg(struct ceph_connection con,
				4269	struct ceph_msg_header hdr, int skip)
				4270	{
				4271	struct ceph_msg *msg;
				4272	int type = (int) le16_to_cpu(hdr->type);
				4273	int front_len = (int) le32_to_cpu(hdr->front_len);
				4274
				4275	if (con->in_msg)
				4276	return con->in_msg;
				4277
				4278	*skip = 0;
				4279	msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
				4280	if (!msg) {
				4281	pr_err("unable to allocate msg type %d len %d\n",
				4282	type, front_len);
				4283	return NULL;
				4284	}
				4285
				4286	return msg;
				4287	}
				4288
				4289	static int mds_sign_message(struct ceph_msg *msg)
				4290	{
				4291	struct ceph_mds_session *s = msg->con->private;
				4292	struct ceph_auth_handshake *auth = &s->s_auth;
				4293
				4294	return ceph_auth_sign_message(auth, msg);
				4295	}
				4296
				4297	static int mds_check_message_signature(struct ceph_msg *msg)
				4298	{
				4299	struct ceph_mds_session *s = msg->con->private;
				4300	struct ceph_auth_handshake *auth = &s->s_auth;
				4301
				4302	return ceph_auth_check_message_signature(auth, msg);
				4303	}
				4304
				4305	static const struct ceph_connection_operations mds_con_ops = {
				4306	.get = con_get,
				4307	.put = con_put,
				4308	.dispatch = dispatch,
				4309	.get_authorizer = get_authorizer,
				4310	.add_authorizer_challenge = add_authorizer_challenge,
				4311	.verify_authorizer_reply = verify_authorizer_reply,
				4312	.invalidate_authorizer = invalidate_authorizer,
				4313	.peer_reset = peer_reset,
				4314	.alloc_msg = mds_alloc_msg,
				4315	.sign_message = mds_sign_message,
				4316	.check_message_signature = mds_check_message_signature,
				4317	};
				4318
				4319	/* eof */