Blame - src/kernel/linux/v4.14/net/ceph/osd_client.c - T103

blob: b026128a89d76631460a1658a670ac5b8e0e0ba1 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2
				3	#include <linux/ceph/ceph_debug.h>
				4
				5	#include <linux/module.h>
				6	#include <linux/err.h>
				7	#include <linux/highmem.h>
				8	#include <linux/mm.h>
				9	#include <linux/pagemap.h>
				10	#include <linux/slab.h>
				11	#include <linux/uaccess.h>
				12	#ifdef CONFIG_BLOCK
				13	#include <linux/bio.h>
				14	#endif
				15
				16	#include <linux/ceph/ceph_features.h>
				17	#include <linux/ceph/libceph.h>
				18	#include <linux/ceph/osd_client.h>
				19	#include <linux/ceph/messenger.h>
				20	#include <linux/ceph/decode.h>
				21	#include <linux/ceph/auth.h>
				22	#include <linux/ceph/pagelist.h>
				23
				24	#define OSD_OPREPLY_FRONT_LEN 512
				25
				26	static struct kmem_cache *ceph_osd_request_cache;
				27
				28	static const struct ceph_connection_operations osd_con_ops;
				29
				30	/*
				31	* Implement client access to distributed object storage cluster.
				32	*
				33	* All data objects are stored within a cluster/cloud of OSDs, or
				34	* "object storage devices." (Note that Ceph OSDs have _nothing_ to
				35	* do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
				36	* remote daemons serving up and coordinating consistent and safe
				37	* access to storage.
				38	*
				39	* Cluster membership and the mapping of data objects onto storage devices
				40	* are described by the osd map.
				41	*
				42	* We keep track of pending OSD requests (read, write), resubmit
				43	* requests to different OSDs when the cluster topology/data layout
				44	* change, or retry the affected requests when the communications
				45	* channel with an OSD is reset.
				46	*/
				47
				48	static void link_request(struct ceph_osd osd, struct ceph_osd_request req);
				49	static void unlink_request(struct ceph_osd osd, struct ceph_osd_request req);
				50	static void link_linger(struct ceph_osd *osd,
				51	struct ceph_osd_linger_request *lreq);
				52	static void unlink_linger(struct ceph_osd *osd,
				53	struct ceph_osd_linger_request *lreq);
				54	static void clear_backoffs(struct ceph_osd *osd);
				55
				56	#if 1
				57	static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
				58	{
				59	bool wrlocked = true;
				60
				61	if (unlikely(down_read_trylock(sem))) {
				62	wrlocked = false;
				63	up_read(sem);
				64	}
				65
				66	return wrlocked;
				67	}
				68	static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
				69	{
				70	WARN_ON(!rwsem_is_locked(&osdc->lock));
				71	}
				72	static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
				73	{
				74	WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
				75	}
				76	static inline void verify_osd_locked(struct ceph_osd *osd)
				77	{
				78	struct ceph_osd_client *osdc = osd->o_osdc;
				79
				80	WARN_ON(!(mutex_is_locked(&osd->lock) &&
				81	rwsem_is_locked(&osdc->lock)) &&
				82	!rwsem_is_wrlocked(&osdc->lock));
				83	}
				84	static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
				85	{
				86	WARN_ON(!mutex_is_locked(&lreq->lock));
				87	}
				88	#else
				89	static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
				90	static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
				91	static inline void verify_osd_locked(struct ceph_osd *osd) { }
				92	static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
				93	#endif
				94
				95	/*
				96	* calculate the mapping of a file extent onto an object, and fill out the
				97	* request accordingly. shorten extent as necessary if it crosses an
				98	* object boundary.
				99	*
				100	* fill osd op in request message.
				101	*/
				102	static int calc_layout(struct ceph_file_layout layout, u64 off, u64 plen,
				103	u64 objnum, u64 objoff, u64 *objlen)
				104	{
				105	u64 orig_len = *plen;
				106	int r;
				107
				108	/* object extent? */
				109	r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
				110	objoff, objlen);
				111	if (r < 0)
				112	return r;
				113	if (*objlen < orig_len) {
				114	plen = objlen;
				115	dout(" skipping last %llu, final file extent %llu~%llu\n",
				116	orig_len - plen, off, plen);
				117	}
				118
				119	dout("calc_layout objnum=%llx %llu~%llu\n", objnum, objoff, *objlen);
				120
				121	return 0;
				122	}
				123
				124	static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
				125	{
				126	memset(osd_data, 0, sizeof (*osd_data));
				127	osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
				128	}
				129
				130	static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
				131	struct page **pages, u64 length, u32 alignment,
				132	bool pages_from_pool, bool own_pages)
				133	{
				134	osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
				135	osd_data->pages = pages;
				136	osd_data->length = length;
				137	osd_data->alignment = alignment;
				138	osd_data->pages_from_pool = pages_from_pool;
				139	osd_data->own_pages = own_pages;
				140	}
				141
				142	static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
				143	struct ceph_pagelist *pagelist)
				144	{
				145	osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
				146	osd_data->pagelist = pagelist;
				147	}
				148
				149	#ifdef CONFIG_BLOCK
				150	static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
				151	struct bio *bio, size_t bio_length)
				152	{
				153	osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
				154	osd_data->bio = bio;
				155	osd_data->bio_length = bio_length;
				156	}
				157	#endif /* CONFIG_BLOCK */
				158
				159	#define osd_req_op_data(oreq, whch, typ, fld) \
				160	({ \
				161	struct ceph_osd_request *__oreq = (oreq); \
				162	unsigned int __whch = (whch); \
				163	BUG_ON(__whch >= __oreq->r_num_ops); \
				164	&__oreq->r_ops[__whch].typ.fld; \
				165	})
				166
				167	static struct ceph_osd_data *
				168	osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
				169	{
				170	BUG_ON(which >= osd_req->r_num_ops);
				171
				172	return &osd_req->r_ops[which].raw_data_in;
				173	}
				174
				175	struct ceph_osd_data *
				176	osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
				177	unsigned int which)
				178	{
				179	return osd_req_op_data(osd_req, which, extent, osd_data);
				180	}
				181	EXPORT_SYMBOL(osd_req_op_extent_osd_data);
				182
				183	void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
				184	unsigned int which, struct page **pages,
				185	u64 length, u32 alignment,
				186	bool pages_from_pool, bool own_pages)
				187	{
				188	struct ceph_osd_data *osd_data;
				189
				190	osd_data = osd_req_op_raw_data_in(osd_req, which);
				191	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				192	pages_from_pool, own_pages);
				193	}
				194	EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);
				195
				196	void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
				197	unsigned int which, struct page **pages,
				198	u64 length, u32 alignment,
				199	bool pages_from_pool, bool own_pages)
				200	{
				201	struct ceph_osd_data *osd_data;
				202
				203	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				204	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				205	pages_from_pool, own_pages);
				206	}
				207	EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);
				208
				209	void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
				210	unsigned int which, struct ceph_pagelist *pagelist)
				211	{
				212	struct ceph_osd_data *osd_data;
				213
				214	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				215	ceph_osd_data_pagelist_init(osd_data, pagelist);
				216	}
				217	EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);
				218
				219	#ifdef CONFIG_BLOCK
				220	void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
				221	unsigned int which, struct bio *bio, size_t bio_length)
				222	{
				223	struct ceph_osd_data *osd_data;
				224
				225	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				226	ceph_osd_data_bio_init(osd_data, bio, bio_length);
				227	}
				228	EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
				229	#endif /* CONFIG_BLOCK */
				230
				231	static void osd_req_op_cls_request_info_pagelist(
				232	struct ceph_osd_request *osd_req,
				233	unsigned int which, struct ceph_pagelist *pagelist)
				234	{
				235	struct ceph_osd_data *osd_data;
				236
				237	osd_data = osd_req_op_data(osd_req, which, cls, request_info);
				238	ceph_osd_data_pagelist_init(osd_data, pagelist);
				239	}
				240
				241	void osd_req_op_cls_request_data_pagelist(
				242	struct ceph_osd_request *osd_req,
				243	unsigned int which, struct ceph_pagelist *pagelist)
				244	{
				245	struct ceph_osd_data *osd_data;
				246
				247	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
				248	ceph_osd_data_pagelist_init(osd_data, pagelist);
				249	osd_req->r_ops[which].cls.indata_len += pagelist->length;
				250	osd_req->r_ops[which].indata_len += pagelist->length;
				251	}
				252	EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
				253
				254	void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
				255	unsigned int which, struct page **pages, u64 length,
				256	u32 alignment, bool pages_from_pool, bool own_pages)
				257	{
				258	struct ceph_osd_data *osd_data;
				259
				260	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
				261	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				262	pages_from_pool, own_pages);
				263	osd_req->r_ops[which].cls.indata_len += length;
				264	osd_req->r_ops[which].indata_len += length;
				265	}
				266	EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
				267
				268	void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
				269	unsigned int which, struct page **pages, u64 length,
				270	u32 alignment, bool pages_from_pool, bool own_pages)
				271	{
				272	struct ceph_osd_data *osd_data;
				273
				274	osd_data = osd_req_op_data(osd_req, which, cls, response_data);
				275	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				276	pages_from_pool, own_pages);
				277	}
				278	EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);
				279
				280	static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
				281	{
				282	switch (osd_data->type) {
				283	case CEPH_OSD_DATA_TYPE_NONE:
				284	return 0;
				285	case CEPH_OSD_DATA_TYPE_PAGES:
				286	return osd_data->length;
				287	case CEPH_OSD_DATA_TYPE_PAGELIST:
				288	return (u64)osd_data->pagelist->length;
				289	#ifdef CONFIG_BLOCK
				290	case CEPH_OSD_DATA_TYPE_BIO:
				291	return (u64)osd_data->bio_length;
				292	#endif /* CONFIG_BLOCK */
				293	default:
				294	WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
				295	return 0;
				296	}
				297	}
				298
				299	static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
				300	{
				301	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
				302	int num_pages;
				303
				304	num_pages = calc_pages_for((u64)osd_data->alignment,
				305	(u64)osd_data->length);
				306	ceph_release_page_vector(osd_data->pages, num_pages);
				307	}
				308	ceph_osd_data_init(osd_data);
				309	}
				310
				311	static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
				312	unsigned int which)
				313	{
				314	struct ceph_osd_req_op *op;
				315
				316	BUG_ON(which >= osd_req->r_num_ops);
				317	op = &osd_req->r_ops[which];
				318
				319	switch (op->op) {
				320	case CEPH_OSD_OP_READ:
				321	case CEPH_OSD_OP_WRITE:
				322	case CEPH_OSD_OP_WRITEFULL:
				323	ceph_osd_data_release(&op->extent.osd_data);
				324	break;
				325	case CEPH_OSD_OP_CALL:
				326	ceph_osd_data_release(&op->cls.request_info);
				327	ceph_osd_data_release(&op->cls.request_data);
				328	ceph_osd_data_release(&op->cls.response_data);
				329	break;
				330	case CEPH_OSD_OP_SETXATTR:
				331	case CEPH_OSD_OP_CMPXATTR:
				332	ceph_osd_data_release(&op->xattr.osd_data);
				333	break;
				334	case CEPH_OSD_OP_STAT:
				335	ceph_osd_data_release(&op->raw_data_in);
				336	break;
				337	case CEPH_OSD_OP_NOTIFY_ACK:
				338	ceph_osd_data_release(&op->notify_ack.request_data);
				339	break;
				340	case CEPH_OSD_OP_NOTIFY:
				341	ceph_osd_data_release(&op->notify.request_data);
				342	ceph_osd_data_release(&op->notify.response_data);
				343	break;
				344	case CEPH_OSD_OP_LIST_WATCHERS:
				345	ceph_osd_data_release(&op->list_watchers.response_data);
				346	break;
				347	default:
				348	break;
				349	}
				350	}
				351
				352	/*
				353	* Assumes @t is zero-initialized.
				354	*/
				355	static void target_init(struct ceph_osd_request_target *t)
				356	{
				357	ceph_oid_init(&t->base_oid);
				358	ceph_oloc_init(&t->base_oloc);
				359	ceph_oid_init(&t->target_oid);
				360	ceph_oloc_init(&t->target_oloc);
				361
				362	ceph_osds_init(&t->acting);
				363	ceph_osds_init(&t->up);
				364	t->size = -1;
				365	t->min_size = -1;
				366
				367	t->osd = CEPH_HOMELESS_OSD;
				368	}
				369
				370	static void target_copy(struct ceph_osd_request_target *dest,
				371	const struct ceph_osd_request_target *src)
				372	{
				373	ceph_oid_copy(&dest->base_oid, &src->base_oid);
				374	ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
				375	ceph_oid_copy(&dest->target_oid, &src->target_oid);
				376	ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
				377
				378	dest->pgid = src->pgid; /* struct */
				379	dest->spgid = src->spgid; /* struct */
				380	dest->pg_num = src->pg_num;
				381	dest->pg_num_mask = src->pg_num_mask;
				382	ceph_osds_copy(&dest->acting, &src->acting);
				383	ceph_osds_copy(&dest->up, &src->up);
				384	dest->size = src->size;
				385	dest->min_size = src->min_size;
				386	dest->sort_bitwise = src->sort_bitwise;
				387	dest->recovery_deletes = src->recovery_deletes;
				388
				389	dest->flags = src->flags;
				390	dest->paused = src->paused;
				391
				392	dest->epoch = src->epoch;
				393	dest->last_force_resend = src->last_force_resend;
				394
				395	dest->osd = src->osd;
				396	}
				397
				398	static void target_destroy(struct ceph_osd_request_target *t)
				399	{
				400	ceph_oid_destroy(&t->base_oid);
				401	ceph_oloc_destroy(&t->base_oloc);
				402	ceph_oid_destroy(&t->target_oid);
				403	ceph_oloc_destroy(&t->target_oloc);
				404	}
				405
				406	/*
				407	* requests
				408	*/
				409	static void request_release_checks(struct ceph_osd_request *req)
				410	{
				411	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
				412	WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
				413	WARN_ON(!list_empty(&req->r_unsafe_item));
				414	WARN_ON(req->r_osd);
				415	}
				416
				417	static void ceph_osdc_release_request(struct kref *kref)
				418	{
				419	struct ceph_osd_request *req = container_of(kref,
				420	struct ceph_osd_request, r_kref);
				421	unsigned int which;
				422
				423	dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
				424	req->r_request, req->r_reply);
				425	request_release_checks(req);
				426
				427	if (req->r_request)
				428	ceph_msg_put(req->r_request);
				429	if (req->r_reply)
				430	ceph_msg_put(req->r_reply);
				431
				432	for (which = 0; which < req->r_num_ops; which++)
				433	osd_req_op_data_release(req, which);
				434
				435	target_destroy(&req->r_t);
				436	ceph_put_snap_context(req->r_snapc);
				437
				438	if (req->r_mempool)
				439	mempool_free(req, req->r_osdc->req_mempool);
				440	else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
				441	kmem_cache_free(ceph_osd_request_cache, req);
				442	else
				443	kfree(req);
				444	}
				445
				446	void ceph_osdc_get_request(struct ceph_osd_request *req)
				447	{
				448	dout("%s %p (was %d)\n", __func__, req,
				449	kref_read(&req->r_kref));
				450	kref_get(&req->r_kref);
				451	}
				452	EXPORT_SYMBOL(ceph_osdc_get_request);
				453
				454	void ceph_osdc_put_request(struct ceph_osd_request *req)
				455	{
				456	if (req) {
				457	dout("%s %p (was %d)\n", __func__, req,
				458	kref_read(&req->r_kref));
				459	kref_put(&req->r_kref, ceph_osdc_release_request);
				460	}
				461	}
				462	EXPORT_SYMBOL(ceph_osdc_put_request);
				463
				464	static void request_init(struct ceph_osd_request *req)
				465	{
				466	/* req only, each op is zeroed in _osd_req_op_init() */
				467	memset(req, 0, sizeof(*req));
				468
				469	kref_init(&req->r_kref);
				470	init_completion(&req->r_completion);
				471	RB_CLEAR_NODE(&req->r_node);
				472	RB_CLEAR_NODE(&req->r_mc_node);
				473	INIT_LIST_HEAD(&req->r_unsafe_item);
				474
				475	target_init(&req->r_t);
				476	}
				477
				478	/*
				479	* This is ugly, but it allows us to reuse linger registration and ping
				480	* requests, keeping the structure of the code around send_linger{_ping}()
				481	* reasonable. Setting up a min_nr=2 mempool for each linger request
				482	* and dealing with copying ops (this blasts req only, watch op remains
				483	* intact) isn't any better.
				484	*/
				485	static void request_reinit(struct ceph_osd_request *req)
				486	{
				487	struct ceph_osd_client *osdc = req->r_osdc;
				488	bool mempool = req->r_mempool;
				489	unsigned int num_ops = req->r_num_ops;
				490	u64 snapid = req->r_snapid;
				491	struct ceph_snap_context *snapc = req->r_snapc;
				492	bool linger = req->r_linger;
				493	struct ceph_msg *request_msg = req->r_request;
				494	struct ceph_msg *reply_msg = req->r_reply;
				495
				496	dout("%s req %p\n", __func__, req);
				497	WARN_ON(kref_read(&req->r_kref) != 1);
				498	request_release_checks(req);
				499
				500	WARN_ON(kref_read(&request_msg->kref) != 1);
				501	WARN_ON(kref_read(&reply_msg->kref) != 1);
				502	target_destroy(&req->r_t);
				503
				504	request_init(req);
				505	req->r_osdc = osdc;
				506	req->r_mempool = mempool;
				507	req->r_num_ops = num_ops;
				508	req->r_snapid = snapid;
				509	req->r_snapc = snapc;
				510	req->r_linger = linger;
				511	req->r_request = request_msg;
				512	req->r_reply = reply_msg;
				513	}
				514
				515	struct ceph_osd_request ceph_osdc_alloc_request(struct ceph_osd_client osdc,
				516	struct ceph_snap_context *snapc,
				517	unsigned int num_ops,
				518	bool use_mempool,
				519	gfp_t gfp_flags)
				520	{
				521	struct ceph_osd_request *req;
				522
				523	if (use_mempool) {
				524	BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
				525	req = mempool_alloc(osdc->req_mempool, gfp_flags);
				526	} else if (num_ops <= CEPH_OSD_SLAB_OPS) {
				527	req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
				528	} else {
				529	BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
				530	req = kmalloc(sizeof(req) + num_ops sizeof(req->r_ops[0]),
				531	gfp_flags);
				532	}
				533	if (unlikely(!req))
				534	return NULL;
				535
				536	request_init(req);
				537	req->r_osdc = osdc;
				538	req->r_mempool = use_mempool;
				539	req->r_num_ops = num_ops;
				540	req->r_snapid = CEPH_NOSNAP;
				541	req->r_snapc = ceph_get_snap_context(snapc);
				542
				543	dout("%s req %p\n", __func__, req);
				544	return req;
				545	}
				546	EXPORT_SYMBOL(ceph_osdc_alloc_request);
				547
				548	static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc)
				549	{
				550	return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
				551	}
				552
				553	int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
				554	{
				555	struct ceph_osd_client *osdc = req->r_osdc;
				556	struct ceph_msg *msg;
				557	int msg_size;
				558
				559	WARN_ON(ceph_oid_empty(&req->r_base_oid));
				560	WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
				561
				562	/* create request message */
				563	msg_size = CEPH_ENCODING_START_BLK_LEN +
				564	CEPH_PGID_ENCODING_LEN + 1; /* spgid */
				565	msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */
				566	msg_size += CEPH_ENCODING_START_BLK_LEN +
				567	sizeof(struct ceph_osd_reqid); /* reqid */
				568	msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */
				569	msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */
				570	msg_size += CEPH_ENCODING_START_BLK_LEN +
				571	ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
				572	msg_size += 4 + req->r_base_oid.name_len; /* oid */
				573	msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
				574	msg_size += 8; /* snapid */
				575	msg_size += 8; /* snap_seq */
				576	msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
				577	msg_size += 4 + 8; /* retry_attempt, features */
				578
				579	if (req->r_mempool)
				580	msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
				581	else
				582	msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
				583	if (!msg)
				584	return -ENOMEM;
				585
				586	memset(msg->front.iov_base, 0, msg->front.iov_len);
				587	req->r_request = msg;
				588
				589	/* create reply message */
				590	msg_size = OSD_OPREPLY_FRONT_LEN;
				591	msg_size += req->r_base_oid.name_len;
				592	msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
				593
				594	if (req->r_mempool)
				595	msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
				596	else
				597	msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
				598	if (!msg)
				599	return -ENOMEM;
				600
				601	req->r_reply = msg;
				602
				603	return 0;
				604	}
				605	EXPORT_SYMBOL(ceph_osdc_alloc_messages);
				606
				607	static bool osd_req_opcode_valid(u16 opcode)
				608	{
				609	switch (opcode) {
				610	#define GENERATE_CASE(op, opcode, str) case CEPH_OSD_OP_##op: return true;
				611	__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
				612	#undef GENERATE_CASE
				613	default:
				614	return false;
				615	}
				616	}
				617
				618	/*
				619	* This is an osd op init function for opcodes that have no data or
				620	* other information associated with them. It also serves as a
				621	* common init routine for all the other init functions, below.
				622	*/
				623	static struct ceph_osd_req_op *
				624	_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
				625	u16 opcode, u32 flags)
				626	{
				627	struct ceph_osd_req_op *op;
				628
				629	BUG_ON(which >= osd_req->r_num_ops);
				630	BUG_ON(!osd_req_opcode_valid(opcode));
				631
				632	op = &osd_req->r_ops[which];
				633	memset(op, 0, sizeof (*op));
				634	op->op = opcode;
				635	op->flags = flags;
				636
				637	return op;
				638	}
				639
				640	void osd_req_op_init(struct ceph_osd_request *osd_req,
				641	unsigned int which, u16 opcode, u32 flags)
				642	{
				643	(void)_osd_req_op_init(osd_req, which, opcode, flags);
				644	}
				645	EXPORT_SYMBOL(osd_req_op_init);
				646
				647	void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
				648	unsigned int which, u16 opcode,
				649	u64 offset, u64 length,
				650	u64 truncate_size, u32 truncate_seq)
				651	{
				652	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				653	opcode, 0);
				654	size_t payload_len = 0;
				655
				656	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
				657	opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
				658	opcode != CEPH_OSD_OP_TRUNCATE);
				659
				660	op->extent.offset = offset;
				661	op->extent.length = length;
				662	op->extent.truncate_size = truncate_size;
				663	op->extent.truncate_seq = truncate_seq;
				664	if (opcode == CEPH_OSD_OP_WRITE \|\| opcode == CEPH_OSD_OP_WRITEFULL)
				665	payload_len += length;
				666
				667	op->indata_len = payload_len;
				668	}
				669	EXPORT_SYMBOL(osd_req_op_extent_init);
				670
				671	void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
				672	unsigned int which, u64 length)
				673	{
				674	struct ceph_osd_req_op *op;
				675	u64 previous;
				676
				677	BUG_ON(which >= osd_req->r_num_ops);
				678	op = &osd_req->r_ops[which];
				679	previous = op->extent.length;
				680
				681	if (length == previous)
				682	return; /* Nothing to do */
				683	BUG_ON(length > previous);
				684
				685	op->extent.length = length;
				686	if (op->op == CEPH_OSD_OP_WRITE \|\| op->op == CEPH_OSD_OP_WRITEFULL)
				687	op->indata_len -= previous - length;
				688	}
				689	EXPORT_SYMBOL(osd_req_op_extent_update);
				690
				691	void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
				692	unsigned int which, u64 offset_inc)
				693	{
				694	struct ceph_osd_req_op op, prev_op;
				695
				696	BUG_ON(which + 1 >= osd_req->r_num_ops);
				697
				698	prev_op = &osd_req->r_ops[which];
				699	op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags);
				700	/* dup previous one */
				701	op->indata_len = prev_op->indata_len;
				702	op->outdata_len = prev_op->outdata_len;
				703	op->extent = prev_op->extent;
				704	/* adjust offset */
				705	op->extent.offset += offset_inc;
				706	op->extent.length -= offset_inc;
				707
				708	if (op->op == CEPH_OSD_OP_WRITE \|\| op->op == CEPH_OSD_OP_WRITEFULL)
				709	op->indata_len -= offset_inc;
				710	}
				711	EXPORT_SYMBOL(osd_req_op_extent_dup_last);
				712
				713	void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
				714	u16 opcode, const char class, const char method)
				715	{
				716	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				717	opcode, 0);
				718	struct ceph_pagelist *pagelist;
				719	size_t payload_len = 0;
				720	size_t size;
				721
				722	BUG_ON(opcode != CEPH_OSD_OP_CALL);
				723
				724	pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
				725	BUG_ON(!pagelist);
				726	ceph_pagelist_init(pagelist);
				727
				728	op->cls.class_name = class;
				729	size = strlen(class);
				730	BUG_ON(size > (size_t) U8_MAX);
				731	op->cls.class_len = size;
				732	ceph_pagelist_append(pagelist, class, size);
				733	payload_len += size;
				734
				735	op->cls.method_name = method;
				736	size = strlen(method);
				737	BUG_ON(size > (size_t) U8_MAX);
				738	op->cls.method_len = size;
				739	ceph_pagelist_append(pagelist, method, size);
				740	payload_len += size;
				741
				742	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
				743
				744	op->indata_len = payload_len;
				745	}
				746	EXPORT_SYMBOL(osd_req_op_cls_init);
				747
				748	int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
				749	u16 opcode, const char name, const void value,
				750	size_t size, u8 cmp_op, u8 cmp_mode)
				751	{
				752	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				753	opcode, 0);
				754	struct ceph_pagelist *pagelist;
				755	size_t payload_len;
				756
				757	BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
				758
				759	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
				760	if (!pagelist)
				761	return -ENOMEM;
				762
				763	ceph_pagelist_init(pagelist);
				764
				765	payload_len = strlen(name);
				766	op->xattr.name_len = payload_len;
				767	ceph_pagelist_append(pagelist, name, payload_len);
				768
				769	op->xattr.value_len = size;
				770	ceph_pagelist_append(pagelist, value, size);
				771	payload_len += size;
				772
				773	op->xattr.cmp_op = cmp_op;
				774	op->xattr.cmp_mode = cmp_mode;
				775
				776	ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
				777	op->indata_len = payload_len;
				778	return 0;
				779	}
				780	EXPORT_SYMBOL(osd_req_op_xattr_init);
				781
				782	/*
				783	* @watch_opcode: CEPH_OSD_WATCH_OP_*
				784	*/
				785	static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
				786	u64 cookie, u8 watch_opcode)
				787	{
				788	struct ceph_osd_req_op *op;
				789
				790	op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
				791	op->watch.cookie = cookie;
				792	op->watch.op = watch_opcode;
				793	op->watch.gen = 0;
				794	}
				795
				796	void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
				797	unsigned int which,
				798	u64 expected_object_size,
				799	u64 expected_write_size)
				800	{
				801	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				802	CEPH_OSD_OP_SETALLOCHINT,
				803	0);
				804
				805	op->alloc_hint.expected_object_size = expected_object_size;
				806	op->alloc_hint.expected_write_size = expected_write_size;
				807
				808	/*
				809	* CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
				810	* not worth a feature bit. Set FAILOK per-op flag to make
				811	* sure older osds don't trip over an unsupported opcode.
				812	*/
				813	op->flags \|= CEPH_OSD_OP_FLAG_FAILOK;
				814	}
				815	EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
				816
				817	static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
				818	struct ceph_osd_data *osd_data)
				819	{
				820	u64 length = ceph_osd_data_length(osd_data);
				821
				822	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
				823	BUG_ON(length > (u64) SIZE_MAX);
				824	if (length)
				825	ceph_msg_data_add_pages(msg, osd_data->pages,
				826	length, osd_data->alignment);
				827	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
				828	BUG_ON(!length);
				829	ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
				830	#ifdef CONFIG_BLOCK
				831	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
				832	ceph_msg_data_add_bio(msg, osd_data->bio, length);
				833	#endif
				834	} else {
				835	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
				836	}
				837	}
				838
				839	static u32 osd_req_encode_op(struct ceph_osd_op *dst,
				840	const struct ceph_osd_req_op *src)
				841	{
				842	if (WARN_ON(!osd_req_opcode_valid(src->op))) {
				843	pr_err("unrecognized osd opcode %d\n", src->op);
				844
				845	return 0;
				846	}
				847
				848	switch (src->op) {
				849	case CEPH_OSD_OP_STAT:
				850	break;
				851	case CEPH_OSD_OP_READ:
				852	case CEPH_OSD_OP_WRITE:
				853	case CEPH_OSD_OP_WRITEFULL:
				854	case CEPH_OSD_OP_ZERO:
				855	case CEPH_OSD_OP_TRUNCATE:
				856	dst->extent.offset = cpu_to_le64(src->extent.offset);
				857	dst->extent.length = cpu_to_le64(src->extent.length);
				858	dst->extent.truncate_size =
				859	cpu_to_le64(src->extent.truncate_size);
				860	dst->extent.truncate_seq =
				861	cpu_to_le32(src->extent.truncate_seq);
				862	break;
				863	case CEPH_OSD_OP_CALL:
				864	dst->cls.class_len = src->cls.class_len;
				865	dst->cls.method_len = src->cls.method_len;
				866	dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
				867	break;
				868	case CEPH_OSD_OP_WATCH:
				869	dst->watch.cookie = cpu_to_le64(src->watch.cookie);
				870	dst->watch.ver = cpu_to_le64(0);
				871	dst->watch.op = src->watch.op;
				872	dst->watch.gen = cpu_to_le32(src->watch.gen);
				873	break;
				874	case CEPH_OSD_OP_NOTIFY_ACK:
				875	break;
				876	case CEPH_OSD_OP_NOTIFY:
				877	dst->notify.cookie = cpu_to_le64(src->notify.cookie);
				878	break;
				879	case CEPH_OSD_OP_LIST_WATCHERS:
				880	break;
				881	case CEPH_OSD_OP_SETALLOCHINT:
				882	dst->alloc_hint.expected_object_size =
				883	cpu_to_le64(src->alloc_hint.expected_object_size);
				884	dst->alloc_hint.expected_write_size =
				885	cpu_to_le64(src->alloc_hint.expected_write_size);
				886	break;
				887	case CEPH_OSD_OP_SETXATTR:
				888	case CEPH_OSD_OP_CMPXATTR:
				889	dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
				890	dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
				891	dst->xattr.cmp_op = src->xattr.cmp_op;
				892	dst->xattr.cmp_mode = src->xattr.cmp_mode;
				893	break;
				894	case CEPH_OSD_OP_CREATE:
				895	case CEPH_OSD_OP_DELETE:
				896	break;
				897	default:
				898	pr_err("unsupported osd opcode %s\n",
				899	ceph_osd_op_name(src->op));
				900	WARN_ON(1);
				901
				902	return 0;
				903	}
				904
				905	dst->op = cpu_to_le16(src->op);
				906	dst->flags = cpu_to_le32(src->flags);
				907	dst->payload_len = cpu_to_le32(src->indata_len);
				908
				909	return src->indata_len;
				910	}
				911
				912	/*
				913	* build new request AND message, calculate layout, and adjust file
				914	* extent as needed.
				915	*
				916	* if the file was recently truncated, we include information about its
				917	* old and new size so that the object can be updated appropriately. (we
				918	* avoid synchronously deleting truncated objects because it's slow.)
				919	*/
				920	struct ceph_osd_request ceph_osdc_new_request(struct ceph_osd_client osdc,
				921	struct ceph_file_layout *layout,
				922	struct ceph_vino vino,
				923	u64 off, u64 *plen,
				924	unsigned int which, int num_ops,
				925	int opcode, int flags,
				926	struct ceph_snap_context *snapc,
				927	u32 truncate_seq,
				928	u64 truncate_size,
				929	bool use_mempool)
				930	{
				931	struct ceph_osd_request *req;
				932	u64 objnum = 0;
				933	u64 objoff = 0;
				934	u64 objlen = 0;
				935	int r;
				936
				937	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
				938	opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
				939	opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE);
				940
				941	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
				942	GFP_NOFS);
				943	if (!req) {
				944	r = -ENOMEM;
				945	goto fail;
				946	}
				947
				948	/* calculate max write size */
				949	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
				950	if (r)
				951	goto fail;
				952
				953	if (opcode == CEPH_OSD_OP_CREATE \|\| opcode == CEPH_OSD_OP_DELETE) {
				954	osd_req_op_init(req, which, opcode, 0);
				955	} else {
				956	u32 object_size = layout->object_size;
				957	u32 object_base = off - objoff;
				958	if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
				959	if (truncate_size <= object_base) {
				960	truncate_size = 0;
				961	} else {
				962	truncate_size -= object_base;
				963	if (truncate_size > object_size)
				964	truncate_size = object_size;
				965	}
				966	}
				967	osd_req_op_extent_init(req, which, opcode, objoff, objlen,
				968	truncate_size, truncate_seq);
				969	}
				970
				971	req->r_abort_on_full = true;
				972	req->r_flags = flags;
				973	req->r_base_oloc.pool = layout->pool_id;
				974	req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
				975	ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
				976
				977	req->r_snapid = vino.snap;
				978	if (flags & CEPH_OSD_FLAG_WRITE)
				979	req->r_data_offset = off;
				980
				981	r = ceph_osdc_alloc_messages(req, GFP_NOFS);
				982	if (r)
				983	goto fail;
				984
				985	return req;
				986
				987	fail:
				988	ceph_osdc_put_request(req);
				989	return ERR_PTR(r);
				990	}
				991	EXPORT_SYMBOL(ceph_osdc_new_request);
				992
				993	/*
				994	* We keep osd requests in an rbtree, sorted by ->r_tid.
				995	*/
				996	DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) /* keyed on ->r_tid, linked through ->r_node */
				997	DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node) /* same ->r_tid key, but linked through ->r_mc_node */
				998
				999	static bool osd_homeless(struct ceph_osd *osd)
				1000	{
				1001	return osd->o_osd == CEPH_HOMELESS_OSD;
				1002	}
				1003
				1004	static bool osd_registered(struct ceph_osd *osd)
				1005	{
				1006	verify_osdc_locked(osd->o_osdc);
				1007
				1008	return !RB_EMPTY_NODE(&osd->o_node);
				1009	}
				1010
				1011	/*
				1012	* Assumes @osd is zero-initialized.
				1013	*/
				1014	static void osd_init(struct ceph_osd *osd)
				1015	{
				1016	refcount_set(&osd->o_ref, 1);
				1017	RB_CLEAR_NODE(&osd->o_node);
				1018	osd->o_requests = RB_ROOT;
				1019	osd->o_linger_requests = RB_ROOT;
				1020	osd->o_backoff_mappings = RB_ROOT;
				1021	osd->o_backoffs_by_id = RB_ROOT;
				1022	INIT_LIST_HEAD(&osd->o_osd_lru);
				1023	INIT_LIST_HEAD(&osd->o_keepalive_item);
				1024	osd->o_incarnation = 1;
				1025	mutex_init(&osd->lock);
				1026	}
				1027
				1028	static void osd_cleanup(struct ceph_osd *osd)
				1029	{
				1030	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
				1031	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
				1032	WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
				1033	WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings));
				1034	WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id));
				1035	WARN_ON(!list_empty(&osd->o_osd_lru));
				1036	WARN_ON(!list_empty(&osd->o_keepalive_item));
				1037
				1038	if (osd->o_auth.authorizer) {
				1039	WARN_ON(osd_homeless(osd));
				1040	ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
				1041	}
				1042	}
				1043
				1044	/*
				1045	* Track open sessions with osds.
				1046	*/
				1047	static struct ceph_osd create_osd(struct ceph_osd_client osdc, int onum)
				1048	{
				1049	struct ceph_osd *osd;
				1050
				1051	WARN_ON(onum == CEPH_HOMELESS_OSD);
				1052
				1053	osd = kzalloc(sizeof(*osd), GFP_NOIO \| __GFP_NOFAIL);
				1054	osd_init(osd);
				1055	osd->o_osdc = osdc;
				1056	osd->o_osd = onum;
				1057
				1058	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
				1059
				1060	return osd;
				1061	}
				1062
				1063	static struct ceph_osd get_osd(struct ceph_osd osd)
				1064	{
				1065	if (refcount_inc_not_zero(&osd->o_ref)) {
				1066	dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1,
				1067	refcount_read(&osd->o_ref));
				1068	return osd;
				1069	} else {
				1070	dout("get_osd %p FAIL\n", osd);
				1071	return NULL;
				1072	}
				1073	}
				1074
				1075	static void put_osd(struct ceph_osd *osd)
				1076	{
				1077	dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref),
				1078	refcount_read(&osd->o_ref) - 1);
				1079	if (refcount_dec_and_test(&osd->o_ref)) {
				1080	osd_cleanup(osd);
				1081	kfree(osd);
				1082	}
				1083	}
				1084
				1085	DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node) /* keyed on ->o_osd, linked through ->o_node; presumably the source of lookup_osd()/insert_osd()/erase_osd() used below -- confirm against the macro */
				1086
				1087	static void __move_osd_to_lru(struct ceph_osd *osd)
				1088	{
				1089	struct ceph_osd_client *osdc = osd->o_osdc;
				1090
				1091	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				1092	BUG_ON(!list_empty(&osd->o_osd_lru));
				1093
				1094	spin_lock(&osdc->osd_lru_lock);
				1095	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
				1096	spin_unlock(&osdc->osd_lru_lock);
				1097
				1098	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
				1099	}
				1100
				1101	static void maybe_move_osd_to_lru(struct ceph_osd *osd)
				1102	{
				1103	if (RB_EMPTY_ROOT(&osd->o_requests) &&
				1104	RB_EMPTY_ROOT(&osd->o_linger_requests))
				1105	__move_osd_to_lru(osd);
				1106	}
				1107
				1108	static void __remove_osd_from_lru(struct ceph_osd *osd)
				1109	{
				1110	struct ceph_osd_client *osdc = osd->o_osdc;
				1111
				1112	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				1113
				1114	spin_lock(&osdc->osd_lru_lock);
				1115	if (!list_empty(&osd->o_osd_lru))
				1116	list_del_init(&osd->o_osd_lru);
				1117	spin_unlock(&osdc->osd_lru_lock);
				1118	}
				1119
				1120	/*
				1121	* Close the connection and assign any leftover requests to the
				1122	* homeless session.
				1123	*/
				1124	static void close_osd(struct ceph_osd *osd)
				1125	{
				1126	struct ceph_osd_client *osdc = osd->o_osdc;
				1127	struct rb_node *n;
				1128
				1129	verify_osdc_wrlocked(osdc);
				1130	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				1131
				1132	ceph_con_close(&osd->o_con);
				1133
				1134	for (n = rb_first(&osd->o_requests); n; ) {
				1135	struct ceph_osd_request *req =
				1136	rb_entry(n, struct ceph_osd_request, r_node);
				1137
				1138	n = rb_next(n); /* unlink_request() */
				1139
				1140	dout(" reassigning req %p tid %llu\n", req, req->r_tid);
				1141	unlink_request(osd, req);
				1142	link_request(&osdc->homeless_osd, req);
				1143	}
				1144	for (n = rb_first(&osd->o_linger_requests); n; ) {
				1145	struct ceph_osd_linger_request *lreq =
				1146	rb_entry(n, struct ceph_osd_linger_request, node);
				1147
				1148	n = rb_next(n); /* unlink_linger() */
				1149
				1150	dout(" reassigning lreq %p linger_id %llu\n", lreq,
				1151	lreq->linger_id);
				1152	unlink_linger(osd, lreq);
				1153	link_linger(&osdc->homeless_osd, lreq);
				1154	}
				1155	clear_backoffs(osd);
				1156
				1157	__remove_osd_from_lru(osd);
				1158	erase_osd(&osdc->osds, osd);
				1159	put_osd(osd);
				1160	}
				1161
				1162	/*
				1163	* reset osd connect
				1164	*/
				1165	static int reopen_osd(struct ceph_osd *osd)
				1166	{
				1167	struct ceph_entity_addr *peer_addr;
				1168
				1169	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				1170
				1171	if (RB_EMPTY_ROOT(&osd->o_requests) &&
				1172	RB_EMPTY_ROOT(&osd->o_linger_requests)) {
				1173	close_osd(osd);
				1174	return -ENODEV;
				1175	}
				1176
				1177	peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
				1178	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
				1179	!ceph_con_opened(&osd->o_con)) {
				1180	struct rb_node *n;
				1181
				1182	dout("osd addr hasn't changed and connection never opened, "
				1183	"letting msgr retry\n");
				1184	/* touch each r_stamp for handle_timeout()'s benfit */
				1185	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
				1186	struct ceph_osd_request *req =
				1187	rb_entry(n, struct ceph_osd_request, r_node);
				1188	req->r_stamp = jiffies;
				1189	}
				1190
				1191	return -EAGAIN;
				1192	}
				1193
				1194	ceph_con_close(&osd->o_con);
				1195	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
				1196	osd->o_incarnation++;
				1197
				1198	return 0;
				1199	}
				1200
				1201	static struct ceph_osd lookup_create_osd(struct ceph_osd_client osdc, int o,
				1202	bool wrlocked)
				1203	{
				1204	struct ceph_osd *osd;
				1205
				1206	if (wrlocked)
				1207	verify_osdc_wrlocked(osdc);
				1208	else
				1209	verify_osdc_locked(osdc);
				1210
				1211	if (o != CEPH_HOMELESS_OSD)
				1212	osd = lookup_osd(&osdc->osds, o);
				1213	else
				1214	osd = &osdc->homeless_osd;
				1215	if (!osd) {
				1216	if (!wrlocked)
				1217	return ERR_PTR(-EAGAIN);
				1218
				1219	osd = create_osd(osdc, o);
				1220	insert_osd(&osdc->osds, osd);
				1221	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
				1222	&osdc->osdmap->osd_addr[osd->o_osd]);
				1223	}
				1224
				1225	dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
				1226	return osd;
				1227	}
				1228
				1229	/*
				1230	* Create request <-> OSD session relation.
				1231	*
				1232	* @req has to be assigned a tid, @osd may be homeless.
				1233	*/
				1234	static void link_request(struct ceph_osd osd, struct ceph_osd_request req)
				1235	{
				1236	verify_osd_locked(osd);
				1237	WARN_ON(!req->r_tid \|\| req->r_osd);
				1238	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
				1239	req, req->r_tid);
				1240
				1241	if (!osd_homeless(osd))
				1242	__remove_osd_from_lru(osd);
				1243	else
				1244	atomic_inc(&osd->o_osdc->num_homeless);
				1245
				1246	get_osd(osd);
				1247	insert_request(&osd->o_requests, req);
				1248	req->r_osd = osd;
				1249	}
				1250
				1251	static void unlink_request(struct ceph_osd osd, struct ceph_osd_request req)
				1252	{
				1253	verify_osd_locked(osd);
				1254	WARN_ON(req->r_osd != osd);
				1255	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
				1256	req, req->r_tid);
				1257
				1258	req->r_osd = NULL;
				1259	erase_request(&osd->o_requests, req);
				1260	put_osd(osd);
				1261
				1262	if (!osd_homeless(osd))
				1263	maybe_move_osd_to_lru(osd);
				1264	else
				1265	atomic_dec(&osd->o_osdc->num_homeless);
				1266	}
				1267
				1268	static bool __pool_full(struct ceph_pg_pool_info *pi)
				1269	{
				1270	return pi->flags & CEPH_POOL_FLAG_FULL;
				1271	}
				1272
				1273	static bool have_pool_full(struct ceph_osd_client *osdc)
				1274	{
				1275	struct rb_node *n;
				1276
				1277	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
				1278	struct ceph_pg_pool_info *pi =
				1279	rb_entry(n, struct ceph_pg_pool_info, node);
				1280
				1281	if (__pool_full(pi))
				1282	return true;
				1283	}
				1284
				1285	return false;
				1286	}
				1287
				1288	static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
				1289	{
				1290	struct ceph_pg_pool_info *pi;
				1291
				1292	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
				1293	if (!pi)
				1294	return false;
				1295
				1296	return __pool_full(pi);
				1297	}
				1298
				1299	/*
				1300	* Returns whether a request should be blocked from being sent
				1301	* based on the current osdmap and osd_client settings.
				1302	*/
				1303	static bool target_should_be_paused(struct ceph_osd_client *osdc,
				1304	const struct ceph_osd_request_target *t,
				1305	struct ceph_pg_pool_info *pi)
				1306	{
				1307	bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
				1308	bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) \|\|
				1309	ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				1310	__pool_full(pi);
				1311
				1312	WARN_ON(pi->id != t->target_oloc.pool);
				1313	return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) \|\|
				1314	((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) \|\|
				1315	(osdc->osdmap->epoch < osdc->epoch_barrier);
				1316	}
				1317
				1318	enum calc_target_result {
				1319	CALC_TARGET_NO_ACTION = 0,
				1320	CALC_TARGET_NEED_RESEND,
				1321	CALC_TARGET_POOL_DNE,
				1322	};
				1323
				1324	static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
				1325	struct ceph_osd_request_target *t,
				1326	struct ceph_connection *con,
				1327	bool any_change)
				1328	{
				1329	struct ceph_pg_pool_info *pi;
				1330	struct ceph_pg pgid, last_pgid;
				1331	struct ceph_osds up, acting;
				1332	bool force_resend = false;
				1333	bool unpaused = false;
				1334	bool legacy_change = false;
				1335	bool split = false;
				1336	bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
				1337	bool recovery_deletes = ceph_osdmap_flag(osdc,
				1338	CEPH_OSDMAP_RECOVERY_DELETES);
				1339	enum calc_target_result ct_res;
				1340	int ret;
				1341
				1342	t->epoch = osdc->osdmap->epoch;
				1343	pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
				1344	if (!pi) {
				1345	t->osd = CEPH_HOMELESS_OSD;
				1346	ct_res = CALC_TARGET_POOL_DNE;
				1347	goto out;
				1348	}
				1349
				1350	if (osdc->osdmap->epoch == pi->last_force_request_resend) {
				1351	if (t->last_force_resend < pi->last_force_request_resend) {
				1352	t->last_force_resend = pi->last_force_request_resend;
				1353	force_resend = true;
				1354	} else if (t->last_force_resend == 0) {
				1355	force_resend = true;
				1356	}
				1357	}
				1358
				1359	/* apply tiering */
				1360	ceph_oid_copy(&t->target_oid, &t->base_oid);
				1361	ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
				1362	if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
				1363	if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
				1364	t->target_oloc.pool = pi->read_tier;
				1365	if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
				1366	t->target_oloc.pool = pi->write_tier;
				1367
				1368	pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
				1369	if (!pi) {
				1370	t->osd = CEPH_HOMELESS_OSD;
				1371	ct_res = CALC_TARGET_POOL_DNE;
				1372	goto out;
				1373	}
				1374	}
				1375
				1376	ret = __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc,
				1377	&pgid);
				1378	if (ret) {
				1379	WARN_ON(ret != -ENOENT);
				1380	t->osd = CEPH_HOMELESS_OSD;
				1381	ct_res = CALC_TARGET_POOL_DNE;
				1382	goto out;
				1383	}
				1384	last_pgid.pool = pgid.pool;
				1385	last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
				1386
				1387	ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting);
				1388	if (any_change &&
				1389	ceph_is_new_interval(&t->acting,
				1390	&acting,
				1391	&t->up,
				1392	&up,
				1393	t->size,
				1394	pi->size,
				1395	t->min_size,
				1396	pi->min_size,
				1397	t->pg_num,
				1398	pi->pg_num,
				1399	t->sort_bitwise,
				1400	sort_bitwise,
				1401	t->recovery_deletes,
				1402	recovery_deletes,
				1403	&last_pgid))
				1404	force_resend = true;
				1405
				1406	if (t->paused && !target_should_be_paused(osdc, t, pi)) {
				1407	t->paused = false;
				1408	unpaused = true;
				1409	}
				1410	legacy_change = ceph_pg_compare(&t->pgid, &pgid) \|\|
				1411	ceph_osds_changed(&t->acting, &acting, any_change);
				1412	if (t->pg_num)
				1413	split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);
				1414
				1415	if (legacy_change \|\| force_resend \|\| split) {
				1416	t->pgid = pgid; /* struct */
				1417	ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid);
				1418	ceph_osds_copy(&t->acting, &acting);
				1419	ceph_osds_copy(&t->up, &up);
				1420	t->size = pi->size;
				1421	t->min_size = pi->min_size;
				1422	t->pg_num = pi->pg_num;
				1423	t->pg_num_mask = pi->pg_num_mask;
				1424	t->sort_bitwise = sort_bitwise;
				1425	t->recovery_deletes = recovery_deletes;
				1426
				1427	t->osd = acting.primary;
				1428	}
				1429
				1430	if (unpaused \|\| legacy_change \|\| force_resend \|\| split)
				1431	ct_res = CALC_TARGET_NEED_RESEND;
				1432	else
				1433	ct_res = CALC_TARGET_NO_ACTION;
				1434
				1435	out:
				1436	dout("%s t %p -> %d%d%d%d ct_res %d osd%d\n", __func__, t, unpaused,
				1437	legacy_change, force_resend, split, ct_res, t->osd);
				1438	return ct_res;
				1439	}
				1440
				1441	static struct ceph_spg_mapping *alloc_spg_mapping(void)
				1442	{
				1443	struct ceph_spg_mapping *spg;
				1444
				1445	spg = kmalloc(sizeof(*spg), GFP_NOIO);
				1446	if (!spg)
				1447	return NULL;
				1448
				1449	RB_CLEAR_NODE(&spg->node);
				1450	spg->backoffs = RB_ROOT;
				1451	return spg;
				1452	}
				1453
				1454	static void free_spg_mapping(struct ceph_spg_mapping *spg)
				1455	{
				1456	WARN_ON(!RB_EMPTY_NODE(&spg->node));
				1457	WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs));
				1458
				1459	kfree(spg);
				1460	}
				1461
				1462	/*
				1463	* rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to
				1464	* ceph_pg_mapping. Used to track OSD backoffs -- a backoff [range] is
				1465	* defined only within a specific spgid; it does not pass anything to
				1466	* children on split, or to another primary.
				1467	*/
				1468	DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare, /* key field ->spgid, ordered by ceph_spg_compare() */
				1469	RB_BYPTR, const struct ceph_spg *, node) /* lookup key type is const struct ceph_spg *; linked through ->node */
				1470
				1471	static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid)
				1472	{
				1473	return hoid->is_max ? 0x100000000ull : hoid->hash_reverse_bits;
				1474	}
				1475
				1476	static void hoid_get_effective_key(const struct ceph_hobject_id *hoid,
				1477	void *pkey, size_t pkey_len)
				1478	{
				1479	if (hoid->key_len) {
				1480	*pkey = hoid->key;
				1481	*pkey_len = hoid->key_len;
				1482	} else {
				1483	*pkey = hoid->oid;
				1484	*pkey_len = hoid->oid_len;
				1485	}
				1486	}
				1487
				1488	static int compare_names(const void *name1, size_t name1_len,
				1489	const void *name2, size_t name2_len)
				1490	{
				1491	int ret;
				1492
				1493	ret = memcmp(name1, name2, min(name1_len, name2_len));
				1494	if (!ret) {
				1495	if (name1_len < name2_len)
				1496	ret = -1;
				1497	else if (name1_len > name2_len)
				1498	ret = 1;
				1499	}
				1500	return ret;
				1501	}
				1502
				1503	static int hoid_compare(const struct ceph_hobject_id *lhs,
				1504	const struct ceph_hobject_id *rhs)
				1505	{
				1506	void effective_key1, effective_key2;
				1507	size_t effective_key1_len, effective_key2_len;
				1508	int ret;
				1509
				1510	if (lhs->is_max < rhs->is_max)
				1511	return -1;
				1512	if (lhs->is_max > rhs->is_max)
				1513	return 1;
				1514
				1515	if (lhs->pool < rhs->pool)
				1516	return -1;
				1517	if (lhs->pool > rhs->pool)
				1518	return 1;
				1519
				1520	if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs))
				1521	return -1;
				1522	if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs))
				1523	return 1;
				1524
				1525	ret = compare_names(lhs->nspace, lhs->nspace_len,
				1526	rhs->nspace, rhs->nspace_len);
				1527	if (ret)
				1528	return ret;
				1529
				1530	hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len);
				1531	hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len);
				1532	ret = compare_names(effective_key1, effective_key1_len,
				1533	effective_key2, effective_key2_len);
				1534	if (ret)
				1535	return ret;
				1536
				1537	ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len);
				1538	if (ret)
				1539	return ret;
				1540
				1541	if (lhs->snapid < rhs->snapid)
				1542	return -1;
				1543	if (lhs->snapid > rhs->snapid)
				1544	return 1;
				1545
				1546	return 0;
				1547	}
				1548
				1549	/*
				1550	* For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX
				1551	* compat stuff here.
				1552	*
				1553	* Assumes @hoid is zero-initialized.
				1554	*/
				1555	static int decode_hoid(void *p, void end, struct ceph_hobject_id *hoid)
				1556	{
				1557	u8 struct_v;
				1558	u32 struct_len;
				1559	int ret;
				1560
				1561	ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v,
				1562	&struct_len);
				1563	if (ret)
				1564	return ret;
				1565
				1566	if (struct_v < 4) {
				1567	pr_err("got struct_v %d < 4 of hobject_t\n", struct_v);
				1568	goto e_inval;
				1569	}
				1570
				1571	hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len,
				1572	GFP_NOIO);
				1573	if (IS_ERR(hoid->key)) {
				1574	ret = PTR_ERR(hoid->key);
				1575	hoid->key = NULL;
				1576	return ret;
				1577	}
				1578
				1579	hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len,
				1580	GFP_NOIO);
				1581	if (IS_ERR(hoid->oid)) {
				1582	ret = PTR_ERR(hoid->oid);
				1583	hoid->oid = NULL;
				1584	return ret;
				1585	}
				1586
				1587	ceph_decode_64_safe(p, end, hoid->snapid, e_inval);
				1588	ceph_decode_32_safe(p, end, hoid->hash, e_inval);
				1589	ceph_decode_8_safe(p, end, hoid->is_max, e_inval);
				1590
				1591	hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len,
				1592	GFP_NOIO);
				1593	if (IS_ERR(hoid->nspace)) {
				1594	ret = PTR_ERR(hoid->nspace);
				1595	hoid->nspace = NULL;
				1596	return ret;
				1597	}
				1598
				1599	ceph_decode_64_safe(p, end, hoid->pool, e_inval);
				1600
				1601	ceph_hoid_build_hash_cache(hoid);
				1602	return 0;
				1603
				1604	e_inval:
				1605	return -EINVAL;
				1606	}
				1607
				1608	static int hoid_encoding_size(const struct ceph_hobject_id *hoid)
				1609	{
				1610	return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */
				1611	4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len;
				1612	}
				1613
				1614	static void encode_hoid(void *p, void end, const struct ceph_hobject_id *hoid)
				1615	{
				1616	ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid));
				1617	ceph_encode_string(p, end, hoid->key, hoid->key_len);
				1618	ceph_encode_string(p, end, hoid->oid, hoid->oid_len);
				1619	ceph_encode_64(p, hoid->snapid);
				1620	ceph_encode_32(p, hoid->hash);
				1621	ceph_encode_8(p, hoid->is_max);
				1622	ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len);
				1623	ceph_encode_64(p, hoid->pool);
				1624	}
				1625
				1626	static void free_hoid(struct ceph_hobject_id *hoid)
				1627	{
				1628	if (hoid) {
				1629	kfree(hoid->key);
				1630	kfree(hoid->oid);
				1631	kfree(hoid->nspace);
				1632	kfree(hoid);
				1633	}
				1634	}
				1635
				1636	static struct ceph_osd_backoff *alloc_backoff(void)
				1637	{
				1638	struct ceph_osd_backoff *backoff;
				1639
				1640	backoff = kzalloc(sizeof(*backoff), GFP_NOIO);
				1641	if (!backoff)
				1642	return NULL;
				1643
				1644	RB_CLEAR_NODE(&backoff->spg_node);
				1645	RB_CLEAR_NODE(&backoff->id_node);
				1646	return backoff;
				1647	}
				1648
				1649	static void free_backoff(struct ceph_osd_backoff *backoff)
				1650	{
				1651	WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node));
				1652	WARN_ON(!RB_EMPTY_NODE(&backoff->id_node));
				1653
				1654	free_hoid(backoff->begin);
				1655	free_hoid(backoff->end);
				1656	kfree(backoff);
				1657	}
				1658
				1659	/*
				1660	* Within a specific spgid, backoffs are managed by ->begin hoid.
				1661	*/
				1662	DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare, /* key field ->begin, ordered by hoid_compare() */
				1663	RB_BYVAL, spg_node); /* linked through ->spg_node; insert/erase only -- lookup is the hand-written lookup_containing_backoff() */
				1664
				1665	static struct ceph_osd_backoff lookup_containing_backoff(struct rb_root root,
				1666	const struct ceph_hobject_id *hoid)
				1667	{
				1668	struct rb_node *n = root->rb_node;
				1669
				1670	while (n) {
				1671	struct ceph_osd_backoff *cur =
				1672	rb_entry(n, struct ceph_osd_backoff, spg_node);
				1673	int cmp;
				1674
				1675	cmp = hoid_compare(hoid, cur->begin);
				1676	if (cmp < 0) {
				1677	n = n->rb_left;
				1678	} else if (cmp > 0) {
				1679	if (hoid_compare(hoid, cur->end) < 0)
				1680	return cur;
				1681
				1682	n = n->rb_right;
				1683	} else {
				1684	return cur;
				1685	}
				1686	}
				1687
				1688	return NULL;
				1689	}
				1690
				1691	/*
				1692	* Each backoff has a unique id within its OSD session.
				1693	*/
				1694	DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node) /* keyed on ->id, linked through ->id_node */
				1695
				1696	static void clear_backoffs(struct ceph_osd *osd)
				1697	{
				1698	while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) {
				1699	struct ceph_spg_mapping *spg =
				1700	rb_entry(rb_first(&osd->o_backoff_mappings),
				1701	struct ceph_spg_mapping, node);
				1702
				1703	while (!RB_EMPTY_ROOT(&spg->backoffs)) {
				1704	struct ceph_osd_backoff *backoff =
				1705	rb_entry(rb_first(&spg->backoffs),
				1706	struct ceph_osd_backoff, spg_node);
				1707
				1708	erase_backoff(&spg->backoffs, backoff);
				1709	erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
				1710	free_backoff(backoff);
				1711	}
				1712	erase_spg_mapping(&osd->o_backoff_mappings, spg);
				1713	free_spg_mapping(spg);
				1714	}
				1715	}
				1716
				1717	/*
				1718	* Set up a temporary, non-owning view into @t.
				1719	*/
				1720	static void hoid_fill_from_target(struct ceph_hobject_id *hoid,
				1721	const struct ceph_osd_request_target *t)
				1722	{
				1723	hoid->key = NULL;
				1724	hoid->key_len = 0;
				1725	hoid->oid = t->target_oid.name;
				1726	hoid->oid_len = t->target_oid.name_len;
				1727	hoid->snapid = CEPH_NOSNAP;
				1728	hoid->hash = t->pgid.seed;
				1729	hoid->is_max = false;
				1730	if (t->target_oloc.pool_ns) {
				1731	hoid->nspace = t->target_oloc.pool_ns->str;
				1732	hoid->nspace_len = t->target_oloc.pool_ns->len;
				1733	} else {
				1734	hoid->nspace = NULL;
				1735	hoid->nspace_len = 0;
				1736	}
				1737	hoid->pool = t->target_oloc.pool;
				1738	ceph_hoid_build_hash_cache(hoid);
				1739	}
				1740
				1741	static bool should_plug_request(struct ceph_osd_request *req)
				1742	{
				1743	struct ceph_osd *osd = req->r_osd;
				1744	struct ceph_spg_mapping *spg;
				1745	struct ceph_osd_backoff *backoff;
				1746	struct ceph_hobject_id hoid;
				1747
				1748	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid);
				1749	if (!spg)
				1750	return false;
				1751
				1752	hoid_fill_from_target(&hoid, &req->r_t);
				1753	backoff = lookup_containing_backoff(&spg->backoffs, &hoid);
				1754	if (!backoff)
				1755	return false;
				1756
				1757	dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n",
				1758	__func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool,
				1759	backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id);
				1760	return true;
				1761	}
				1762
				1763	static void setup_request_data(struct ceph_osd_request *req,
				1764	struct ceph_msg *msg)
				1765	{
				1766	u32 data_len = 0;
				1767	int i;
				1768
				1769	if (!list_empty(&msg->data))
				1770	return;
				1771
				1772	WARN_ON(msg->data_length);
				1773	for (i = 0; i < req->r_num_ops; i++) {
				1774	struct ceph_osd_req_op *op = &req->r_ops[i];
				1775
				1776	switch (op->op) {
				1777	/* request */
				1778	case CEPH_OSD_OP_WRITE:
				1779	case CEPH_OSD_OP_WRITEFULL:
				1780	WARN_ON(op->indata_len != op->extent.length);
				1781	ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
				1782	break;
				1783	case CEPH_OSD_OP_SETXATTR:
				1784	case CEPH_OSD_OP_CMPXATTR:
				1785	WARN_ON(op->indata_len != op->xattr.name_len +
				1786	op->xattr.value_len);
				1787	ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
				1788	break;
				1789	case CEPH_OSD_OP_NOTIFY_ACK:
				1790	ceph_osdc_msg_data_add(msg,
				1791	&op->notify_ack.request_data);
				1792	break;
				1793
				1794	/* reply */
				1795	case CEPH_OSD_OP_STAT:
				1796	ceph_osdc_msg_data_add(req->r_reply,
				1797	&op->raw_data_in);
				1798	break;
				1799	case CEPH_OSD_OP_READ:
				1800	ceph_osdc_msg_data_add(req->r_reply,
				1801	&op->extent.osd_data);
				1802	break;
				1803	case CEPH_OSD_OP_LIST_WATCHERS:
				1804	ceph_osdc_msg_data_add(req->r_reply,
				1805	&op->list_watchers.response_data);
				1806	break;
				1807
				1808	/* both */
				1809	case CEPH_OSD_OP_CALL:
				1810	WARN_ON(op->indata_len != op->cls.class_len +
				1811	op->cls.method_len +
				1812	op->cls.indata_len);
				1813	ceph_osdc_msg_data_add(msg, &op->cls.request_info);
				1814	/* optional, can be NONE */
				1815	ceph_osdc_msg_data_add(msg, &op->cls.request_data);
				1816	/* optional, can be NONE */
				1817	ceph_osdc_msg_data_add(req->r_reply,
				1818	&op->cls.response_data);
				1819	break;
				1820	case CEPH_OSD_OP_NOTIFY:
				1821	ceph_osdc_msg_data_add(msg,
				1822	&op->notify.request_data);
				1823	ceph_osdc_msg_data_add(req->r_reply,
				1824	&op->notify.response_data);
				1825	break;
				1826	}
				1827
				1828	data_len += op->indata_len;
				1829	}
				1830
				1831	WARN_ON(data_len != msg->data_length);
				1832	}
				1833
				1834	static void encode_pgid(void *p, const struct ceph_pg pgid)
				1835	{
				1836	ceph_encode_8(p, 1);
				1837	ceph_encode_64(p, pgid->pool);
				1838	ceph_encode_32(p, pgid->seed);
				1839	ceph_encode_32(p, -1); /* preferred */
				1840	}
				1841
				1842	static void encode_spgid(void *p, const struct ceph_spg spgid)
				1843	{
				1844	ceph_start_encoding(p, 1, 1, CEPH_PGID_ENCODING_LEN + 1);
				1845	encode_pgid(p, &spgid->pgid);
				1846	ceph_encode_8(p, spgid->shard);
				1847	}
				1848
				1849	static void encode_oloc(void *p, void end,
				1850	const struct ceph_object_locator *oloc)
				1851	{
				1852	ceph_start_encoding(p, 5, 4, ceph_oloc_encoding_size(oloc));
				1853	ceph_encode_64(p, oloc->pool);
				1854	ceph_encode_32(p, -1); /* preferred */
				1855	ceph_encode_32(p, 0); /* key len */
				1856	if (oloc->pool_ns)
				1857	ceph_encode_string(p, end, oloc->pool_ns->str,
				1858	oloc->pool_ns->len);
				1859	else
				1860	ceph_encode_32(p, 0);
				1861	}
				1862
				1863	static void encode_request_partial(struct ceph_osd_request *req,
				1864	struct ceph_msg *msg)
				1865	{
				1866	void *p = msg->front.iov_base;
				1867	void *const end = p + msg->front_alloc_len;
				1868	u32 data_len = 0;
				1869	int i;
				1870
				1871	if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
				1872	/* snapshots aren't writeable */
				1873	WARN_ON(req->r_snapid != CEPH_NOSNAP);
				1874	} else {
				1875	WARN_ON(req->r_mtime.tv_sec \|\| req->r_mtime.tv_nsec \|\|
				1876	req->r_data_offset \|\| req->r_snapc);
				1877	}
				1878
				1879	setup_request_data(req, msg);
				1880
				1881	encode_spgid(&p, &req->r_t.spgid); /* actual spg */
				1882	ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */
				1883	ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
				1884	ceph_encode_32(&p, req->r_flags);
				1885
				1886	/* reqid */
				1887	ceph_start_encoding(&p, 2, 2, sizeof(struct ceph_osd_reqid));
				1888	memset(p, 0, sizeof(struct ceph_osd_reqid));
				1889	p += sizeof(struct ceph_osd_reqid);
				1890
				1891	/* trace */
				1892	memset(p, 0, sizeof(struct ceph_blkin_trace_info));
				1893	p += sizeof(struct ceph_blkin_trace_info);
				1894
				1895	ceph_encode_32(&p, 0); /* client_inc, always 0 */
				1896	ceph_encode_timespec(p, &req->r_mtime);
				1897	p += sizeof(struct ceph_timespec);
				1898
				1899	encode_oloc(&p, end, &req->r_t.target_oloc);
				1900	ceph_encode_string(&p, end, req->r_t.target_oid.name,
				1901	req->r_t.target_oid.name_len);
				1902
				1903	/* ops, can imply data */
				1904	ceph_encode_16(&p, req->r_num_ops);
				1905	for (i = 0; i < req->r_num_ops; i++) {
				1906	data_len += osd_req_encode_op(p, &req->r_ops[i]);
				1907	p += sizeof(struct ceph_osd_op);
				1908	}
				1909
				1910	ceph_encode_64(&p, req->r_snapid); /* snapid */
				1911	if (req->r_snapc) {
				1912	ceph_encode_64(&p, req->r_snapc->seq);
				1913	ceph_encode_32(&p, req->r_snapc->num_snaps);
				1914	for (i = 0; i < req->r_snapc->num_snaps; i++)
				1915	ceph_encode_64(&p, req->r_snapc->snaps[i]);
				1916	} else {
				1917	ceph_encode_64(&p, 0); /* snap_seq */
				1918	ceph_encode_32(&p, 0); /* snaps len */
				1919	}
				1920
				1921	ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
				1922	BUG_ON(p > end - 8); /* space for features */
				1923
				1924	msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */
				1925	/* front_len is finalized in encode_request_finish() */
				1926	msg->front.iov_len = p - msg->front.iov_base;
				1927	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				1928	msg->hdr.data_len = cpu_to_le32(data_len);
				1929	/*
				1930	* The header "data_off" is a hint to the receiver allowing it
				1931	* to align received data into its buffers such that there's no
				1932	* need to re-copy it before writing it to disk (direct I/O).
				1933	*/
				1934	msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
				1935
				1936	dout("%s req %p msg %p oid %s oid_len %d\n", __func__, req, msg,
				1937	req->r_t.target_oid.name, req->r_t.target_oid.name_len);
				1938	}
				1939
				1940	static void encode_request_finish(struct ceph_msg *msg)
				1941	{
				1942	void *p = msg->front.iov_base;
				1943	void *const partial_end = p + msg->front.iov_len;
				1944	void *const end = p + msg->front_alloc_len;
				1945
				1946	if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) {
				1947	/* luminous OSD -- encode features and be done */
				1948	p = partial_end;
				1949	ceph_encode_64(&p, msg->con->peer_features);
				1950	} else {
				1951	struct {
				1952	char spgid[CEPH_ENCODING_START_BLK_LEN +
				1953	CEPH_PGID_ENCODING_LEN + 1];
				1954	__le32 hash;
				1955	__le32 epoch;
				1956	__le32 flags;
				1957	char reqid[CEPH_ENCODING_START_BLK_LEN +
				1958	sizeof(struct ceph_osd_reqid)];
				1959	char trace[sizeof(struct ceph_blkin_trace_info)];
				1960	__le32 client_inc;
				1961	struct ceph_timespec mtime;
				1962	} __packed head;
				1963	struct ceph_pg pgid;
				1964	void oloc, oid, *tail;
				1965	int oloc_len, oid_len, tail_len;
				1966	int len;
				1967
				1968	/*
				1969	* Pre-luminous OSD -- reencode v8 into v4 using @head
				1970	* as a temporary buffer. Encode the raw PG; the rest
				1971	* is just a matter of moving oloc, oid and tail blobs
				1972	* around.
				1973	*/
				1974	memcpy(&head, p, sizeof(head));
				1975	p += sizeof(head);
				1976
				1977	oloc = p;
				1978	p += CEPH_ENCODING_START_BLK_LEN;
				1979	pgid.pool = ceph_decode_64(&p);
				1980	p += 4 + 4; /* preferred, key len */
				1981	len = ceph_decode_32(&p);
				1982	p += len; /* nspace */
				1983	oloc_len = p - oloc;
				1984
				1985	oid = p;
				1986	len = ceph_decode_32(&p);
				1987	p += len;
				1988	oid_len = p - oid;
				1989
				1990	tail = p;
				1991	tail_len = partial_end - p;
				1992
				1993	p = msg->front.iov_base;
				1994	ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc));
				1995	ceph_encode_copy(&p, &head.epoch, sizeof(head.epoch));
				1996	ceph_encode_copy(&p, &head.flags, sizeof(head.flags));
				1997	ceph_encode_copy(&p, &head.mtime, sizeof(head.mtime));
				1998
				1999	/* reassert_version */
				2000	memset(p, 0, sizeof(struct ceph_eversion));
				2001	p += sizeof(struct ceph_eversion);
				2002
				2003	BUG_ON(p >= oloc);
				2004	memmove(p, oloc, oloc_len);
				2005	p += oloc_len;
				2006
				2007	pgid.seed = le32_to_cpu(head.hash);
				2008	encode_pgid(&p, &pgid); /* raw pg */
				2009
				2010	BUG_ON(p >= oid);
				2011	memmove(p, oid, oid_len);
				2012	p += oid_len;
				2013
				2014	/* tail -- ops, snapid, snapc, retry_attempt */
				2015	BUG_ON(p >= tail);
				2016	memmove(p, tail, tail_len);
				2017	p += tail_len;
				2018
				2019	msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
				2020	}
				2021
				2022	BUG_ON(p > end);
				2023	msg->front.iov_len = p - msg->front.iov_base;
				2024	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				2025
				2026	dout("%s msg %p tid %llu %u+%u+%u v%d\n", __func__, msg,
				2027	le64_to_cpu(msg->hdr.tid), le32_to_cpu(msg->hdr.front_len),
				2028	le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len),
				2029	le16_to_cpu(msg->hdr.version));
				2030	}
				2031
				2032	/*
				2033	* @req has to be assigned a tid and registered.
				2034	*/
				2035	static void send_request(struct ceph_osd_request *req)
				2036	{
				2037	struct ceph_osd *osd = req->r_osd;
				2038
				2039	verify_osd_locked(osd);
				2040	WARN_ON(osd->o_osd != req->r_t.osd);
				2041
				2042	/* backoff? */
				2043	if (should_plug_request(req))
				2044	return;
				2045
				2046	/*
				2047	* We may have a previously queued request message hanging
				2048	* around. Cancel it to avoid corrupting the msgr.
				2049	*/
				2050	if (req->r_sent)
				2051	ceph_msg_revoke(req->r_request);
				2052
				2053	req->r_flags \|= CEPH_OSD_FLAG_KNOWN_REDIR;
				2054	if (req->r_attempts)
				2055	req->r_flags \|= CEPH_OSD_FLAG_RETRY;
				2056	else
				2057	WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
				2058
				2059	encode_request_partial(req, req->r_request);
				2060
				2061	dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d e%u flags 0x%x attempt %d\n",
				2062	__func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
				2063	req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed,
				2064	req->r_t.spgid.shard, osd->o_osd, req->r_t.epoch, req->r_flags,
				2065	req->r_attempts);
				2066
				2067	req->r_t.paused = false;
				2068	req->r_stamp = jiffies;
				2069	req->r_attempts++;
				2070
				2071	req->r_sent = osd->o_incarnation;
				2072	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
				2073	ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
				2074	}
				2075
				2076	static void maybe_request_map(struct ceph_osd_client *osdc)
				2077	{
				2078	bool continuous = false;
				2079
				2080	verify_osdc_locked(osdc);
				2081	WARN_ON(!osdc->osdmap->epoch);
				2082
				2083	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				2084	ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) \|\|
				2085	ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
				2086	dout("%s osdc %p continuous\n", __func__, osdc);
				2087	continuous = true;
				2088	} else {
				2089	dout("%s osdc %p onetime\n", __func__, osdc);
				2090	}
				2091
				2092	if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
				2093	osdc->osdmap->epoch + 1, continuous))
				2094	ceph_monc_renew_subs(&osdc->client->monc);
				2095	}
				2096
/* Defined further down; declared here because __submit_request() calls both. */
static void complete_request(struct ceph_osd_request *req, int err);
static void send_map_check(struct ceph_osd_request *req);
				2099
				2100	static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
				2101	{
				2102	struct ceph_osd_client *osdc = req->r_osdc;
				2103	struct ceph_osd *osd;
				2104	enum calc_target_result ct_res;
				2105	bool need_send = false;
				2106	bool promoted = false;
				2107	bool need_abort = false;
				2108
				2109	WARN_ON(req->r_tid);
				2110	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
				2111
				2112	again:
				2113	ct_res = calc_target(osdc, &req->r_t, NULL, false);
				2114	if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
				2115	goto promote;
				2116
				2117	osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
				2118	if (IS_ERR(osd)) {
				2119	WARN_ON(PTR_ERR(osd) != -EAGAIN \|\| wrlocked);
				2120	goto promote;
				2121	}
				2122
				2123	if (osdc->osdmap->epoch < osdc->epoch_barrier) {
				2124	dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch,
				2125	osdc->epoch_barrier);
				2126	req->r_t.paused = true;
				2127	maybe_request_map(osdc);
				2128	} else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
				2129	ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
				2130	dout("req %p pausewr\n", req);
				2131	req->r_t.paused = true;
				2132	maybe_request_map(osdc);
				2133	} else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
				2134	ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
				2135	dout("req %p pauserd\n", req);
				2136	req->r_t.paused = true;
				2137	maybe_request_map(osdc);
				2138	} else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
				2139	!(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY \|
				2140	CEPH_OSD_FLAG_FULL_FORCE)) &&
				2141	(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				2142	pool_full(osdc, req->r_t.base_oloc.pool))) {
				2143	dout("req %p full/pool_full\n", req);
				2144	pr_warn_ratelimited("FULL or reached pool quota\n");
				2145	req->r_t.paused = true;
				2146	maybe_request_map(osdc);
				2147	if (req->r_abort_on_full)
				2148	need_abort = true;
				2149	} else if (!osd_homeless(osd)) {
				2150	need_send = true;
				2151	} else {
				2152	maybe_request_map(osdc);
				2153	}
				2154
				2155	mutex_lock(&osd->lock);
				2156	/*
				2157	* Assign the tid atomically with send_request() to protect
				2158	* multiple writes to the same object from racing with each
				2159	* other, resulting in out of order ops on the OSDs.
				2160	*/
				2161	req->r_tid = atomic64_inc_return(&osdc->last_tid);
				2162	link_request(osd, req);
				2163	if (need_send)
				2164	send_request(req);
				2165	else if (need_abort)
				2166	complete_request(req, -ENOSPC);
				2167	mutex_unlock(&osd->lock);
				2168
				2169	if (ct_res == CALC_TARGET_POOL_DNE)
				2170	send_map_check(req);
				2171
				2172	if (promoted)
				2173	downgrade_write(&osdc->lock);
				2174	return;
				2175
				2176	promote:
				2177	up_read(&osdc->lock);
				2178	down_write(&osdc->lock);
				2179	wrlocked = true;
				2180	promoted = true;
				2181	goto again;
				2182	}
				2183
				2184	static void account_request(struct ceph_osd_request *req)
				2185	{
				2186	WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK \| CEPH_OSD_FLAG_ONDISK));
				2187	WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ \| CEPH_OSD_FLAG_WRITE)));
				2188
				2189	req->r_flags \|= CEPH_OSD_FLAG_ONDISK;
				2190	atomic_inc(&req->r_osdc->num_requests);
				2191
				2192	req->r_start_stamp = jiffies;
				2193	}
				2194
				2195	static void submit_request(struct ceph_osd_request *req, bool wrlocked)
				2196	{
				2197	ceph_osdc_get_request(req);
				2198	account_request(req);
				2199	__submit_request(req, wrlocked);
				2200	}
				2201
				2202	static void finish_request(struct ceph_osd_request *req)
				2203	{
				2204	struct ceph_osd_client *osdc = req->r_osdc;
				2205
				2206	WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
				2207	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
				2208
				2209	if (req->r_osd)
				2210	unlink_request(req->r_osd, req);
				2211	atomic_dec(&osdc->num_requests);
				2212
				2213	/*
				2214	* If an OSD has failed or returned and a request has been sent
				2215	* twice, it's possible to get a reply and end up here while the
				2216	* request message is queued for delivery. We will ignore the
				2217	* reply, so not a big deal, but better to try and catch it.
				2218	*/
				2219	ceph_msg_revoke(req->r_request);
				2220	ceph_msg_revoke_incoming(req->r_reply);
				2221	}
				2222
				2223	static void __complete_request(struct ceph_osd_request *req)
				2224	{
				2225	if (req->r_callback) {
				2226	dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
				2227	req->r_tid, req->r_callback, req->r_result);
				2228	req->r_callback(req);
				2229	}
				2230	}
				2231
				2232	/*
				2233	* This is open-coded in handle_reply().
				2234	*/
				2235	static void complete_request(struct ceph_osd_request *req, int err)
				2236	{
				2237	dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
				2238
				2239	req->r_result = err;
				2240	finish_request(req);
				2241	__complete_request(req);
				2242	complete_all(&req->r_completion);
				2243	ceph_osdc_put_request(req);
				2244	}
				2245
				2246	static void cancel_map_check(struct ceph_osd_request *req)
				2247	{
				2248	struct ceph_osd_client *osdc = req->r_osdc;
				2249	struct ceph_osd_request *lookup_req;
				2250
				2251	verify_osdc_wrlocked(osdc);
				2252
				2253	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
				2254	if (!lookup_req)
				2255	return;
				2256
				2257	WARN_ON(lookup_req != req);
				2258	erase_request_mc(&osdc->map_checks, req);
				2259	ceph_osdc_put_request(req);
				2260	}
				2261
				2262	static void cancel_request(struct ceph_osd_request *req)
				2263	{
				2264	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
				2265
				2266	cancel_map_check(req);
				2267	finish_request(req);
				2268	complete_all(&req->r_completion);
				2269	ceph_osdc_put_request(req);
				2270	}
				2271
				2272	static void abort_request(struct ceph_osd_request *req, int err)
				2273	{
				2274	dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
				2275
				2276	cancel_map_check(req);
				2277	complete_request(req, err);
				2278	}
				2279
				2280	static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
				2281	{
				2282	if (likely(eb > osdc->epoch_barrier)) {
				2283	dout("updating epoch_barrier from %u to %u\n",
				2284	osdc->epoch_barrier, eb);
				2285	osdc->epoch_barrier = eb;
				2286	/* Request map if we're not to the barrier yet */
				2287	if (eb > osdc->osdmap->epoch)
				2288	maybe_request_map(osdc);
				2289	}
				2290	}
				2291
				2292	void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
				2293	{
				2294	down_read(&osdc->lock);
				2295	if (unlikely(eb > osdc->epoch_barrier)) {
				2296	up_read(&osdc->lock);
				2297	down_write(&osdc->lock);
				2298	update_epoch_barrier(osdc, eb);
				2299	up_write(&osdc->lock);
				2300	} else {
				2301	up_read(&osdc->lock);
				2302	}
				2303	}
				2304	EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier);
				2305
				2306	/*
				2307	* Drop all pending requests that are stalled waiting on a full condition to
				2308	* clear, and complete them with ENOSPC as the return code. Set the
				2309	* osdc->epoch_barrier to the latest map epoch that we've seen if any were
				2310	* cancelled.
				2311	*/
				2312	static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
				2313	{
				2314	struct rb_node *n;
				2315	bool victims = false;
				2316
				2317	dout("enter abort_on_full\n");
				2318
				2319	if (!ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && !have_pool_full(osdc))
				2320	goto out;
				2321
				2322	/* Scan list and see if there is anything to abort */
				2323	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
				2324	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				2325	struct rb_node *m;
				2326
				2327	m = rb_first(&osd->o_requests);
				2328	while (m) {
				2329	struct ceph_osd_request *req = rb_entry(m,
				2330	struct ceph_osd_request, r_node);
				2331	m = rb_next(m);
				2332
				2333	if (req->r_abort_on_full) {
				2334	victims = true;
				2335	break;
				2336	}
				2337	}
				2338	if (victims)
				2339	break;
				2340	}
				2341
				2342	if (!victims)
				2343	goto out;
				2344
				2345	/*
				2346	* Update the barrier to current epoch if it's behind that point,
				2347	* since we know we have some calls to be aborted in the tree.
				2348	*/
				2349	update_epoch_barrier(osdc, osdc->osdmap->epoch);
				2350
				2351	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
				2352	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				2353	struct rb_node *m;
				2354
				2355	m = rb_first(&osd->o_requests);
				2356	while (m) {
				2357	struct ceph_osd_request *req = rb_entry(m,
				2358	struct ceph_osd_request, r_node);
				2359	m = rb_next(m);
				2360
				2361	if (req->r_abort_on_full &&
				2362	(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				2363	pool_full(osdc, req->r_t.target_oloc.pool)))
				2364	abort_request(req, -ENOSPC);
				2365	}
				2366	}
				2367	out:
				2368	dout("return abort_on_full barrier=%u\n", osdc->epoch_barrier);
				2369	}
				2370
				2371	static void check_pool_dne(struct ceph_osd_request *req)
				2372	{
				2373	struct ceph_osd_client *osdc = req->r_osdc;
				2374	struct ceph_osdmap *map = osdc->osdmap;
				2375
				2376	verify_osdc_wrlocked(osdc);
				2377	WARN_ON(!map->epoch);
				2378
				2379	if (req->r_attempts) {
				2380	/*
				2381	* We sent a request earlier, which means that
				2382	* previously the pool existed, and now it does not
				2383	* (i.e., it was deleted).
				2384	*/
				2385	req->r_map_dne_bound = map->epoch;
				2386	dout("%s req %p tid %llu pool disappeared\n", __func__, req,
				2387	req->r_tid);
				2388	} else {
				2389	dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
				2390	req, req->r_tid, req->r_map_dne_bound, map->epoch);
				2391	}
				2392
				2393	if (req->r_map_dne_bound) {
				2394	if (map->epoch >= req->r_map_dne_bound) {
				2395	/* we had a new enough map */
				2396	pr_info_ratelimited("tid %llu pool does not exist\n",
				2397	req->r_tid);
				2398	complete_request(req, -ENOENT);
				2399	}
				2400	} else {
				2401	send_map_check(req);
				2402	}
				2403	}
				2404
				2405	static void map_check_cb(struct ceph_mon_generic_request *greq)
				2406	{
				2407	struct ceph_osd_client *osdc = &greq->monc->client->osdc;
				2408	struct ceph_osd_request *req;
				2409	u64 tid = greq->private_data;
				2410
				2411	WARN_ON(greq->result \|\| !greq->u.newest);
				2412
				2413	down_write(&osdc->lock);
				2414	req = lookup_request_mc(&osdc->map_checks, tid);
				2415	if (!req) {
				2416	dout("%s tid %llu dne\n", __func__, tid);
				2417	goto out_unlock;
				2418	}
				2419
				2420	dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
				2421	req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
				2422	if (!req->r_map_dne_bound)
				2423	req->r_map_dne_bound = greq->u.newest;
				2424	erase_request_mc(&osdc->map_checks, req);
				2425	check_pool_dne(req);
				2426
				2427	ceph_osdc_put_request(req);
				2428	out_unlock:
				2429	up_write(&osdc->lock);
				2430	}
				2431
				2432	static void send_map_check(struct ceph_osd_request *req)
				2433	{
				2434	struct ceph_osd_client *osdc = req->r_osdc;
				2435	struct ceph_osd_request *lookup_req;
				2436	int ret;
				2437
				2438	verify_osdc_wrlocked(osdc);
				2439
				2440	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
				2441	if (lookup_req) {
				2442	WARN_ON(lookup_req != req);
				2443	return;
				2444	}
				2445
				2446	ceph_osdc_get_request(req);
				2447	insert_request_mc(&osdc->map_checks, req);
				2448	ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
				2449	map_check_cb, req->r_tid);
				2450	WARN_ON(ret);
				2451	}
				2452
				2453	/*
				2454	* lingering requests, watch/notify v2 infrastructure
				2455	*/
				2456	static void linger_release(struct kref *kref)
				2457	{
				2458	struct ceph_osd_linger_request *lreq =
				2459	container_of(kref, struct ceph_osd_linger_request, kref);
				2460
				2461	dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
				2462	lreq->reg_req, lreq->ping_req);
				2463	WARN_ON(!RB_EMPTY_NODE(&lreq->node));
				2464	WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
				2465	WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
				2466	WARN_ON(!list_empty(&lreq->scan_item));
				2467	WARN_ON(!list_empty(&lreq->pending_lworks));
				2468	WARN_ON(lreq->osd);
				2469
				2470	if (lreq->reg_req)
				2471	ceph_osdc_put_request(lreq->reg_req);
				2472	if (lreq->ping_req)
				2473	ceph_osdc_put_request(lreq->ping_req);
				2474	target_destroy(&lreq->t);
				2475	kfree(lreq);
				2476	}
				2477
				2478	static void linger_put(struct ceph_osd_linger_request *lreq)
				2479	{
				2480	if (lreq)
				2481	kref_put(&lreq->kref, linger_release);
				2482	}
				2483
				2484	static struct ceph_osd_linger_request *
				2485	linger_get(struct ceph_osd_linger_request *lreq)
				2486	{
				2487	kref_get(&lreq->kref);
				2488	return lreq;
				2489	}
				2490
				2491	static struct ceph_osd_linger_request *
				2492	linger_alloc(struct ceph_osd_client *osdc)
				2493	{
				2494	struct ceph_osd_linger_request *lreq;
				2495
				2496	lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
				2497	if (!lreq)
				2498	return NULL;
				2499
				2500	kref_init(&lreq->kref);
				2501	mutex_init(&lreq->lock);
				2502	RB_CLEAR_NODE(&lreq->node);
				2503	RB_CLEAR_NODE(&lreq->osdc_node);
				2504	RB_CLEAR_NODE(&lreq->mc_node);
				2505	INIT_LIST_HEAD(&lreq->scan_item);
				2506	INIT_LIST_HEAD(&lreq->pending_lworks);
				2507	init_completion(&lreq->reg_commit_wait);
				2508	init_completion(&lreq->notify_finish_wait);
				2509
				2510	lreq->osdc = osdc;
				2511	target_init(&lreq->t);
				2512
				2513	dout("%s lreq %p\n", __func__, lreq);
				2514	return lreq;
				2515	}
				2516
/*
 * rbtree helpers for linger requests, one set per tree node embedded
 * in struct ceph_osd_linger_request:
 *   linger      (node)      - insert_linger()/erase_linger() are used on
 *                             osd->o_linger_requests
 *   linger_osdc (osdc_node) - insert_linger_osdc()/erase_linger_osdc() are
 *                             used on osdc->linger_requests
 *   linger_mc   (mc_node)   - presumably the linger map check tree; its
 *                             users are outside this part of the file
 * NOTE(review): the macros are defined elsewhere; linger_id is presumably
 * the sort key -- confirm against the macro definition.
 */
DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
				2520
				2521	/*
				2522	* Create linger request <-> OSD session relation.
				2523	*
				2524	* @lreq has to be registered, @osd may be homeless.
				2525	*/
				2526	static void link_linger(struct ceph_osd *osd,
				2527	struct ceph_osd_linger_request *lreq)
				2528	{
				2529	verify_osd_locked(osd);
				2530	WARN_ON(!lreq->linger_id \|\| lreq->osd);
				2531	dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
				2532	osd->o_osd, lreq, lreq->linger_id);
				2533
				2534	if (!osd_homeless(osd))
				2535	__remove_osd_from_lru(osd);
				2536	else
				2537	atomic_inc(&osd->o_osdc->num_homeless);
				2538
				2539	get_osd(osd);
				2540	insert_linger(&osd->o_linger_requests, lreq);
				2541	lreq->osd = osd;
				2542	}
				2543
				2544	static void unlink_linger(struct ceph_osd *osd,
				2545	struct ceph_osd_linger_request *lreq)
				2546	{
				2547	verify_osd_locked(osd);
				2548	WARN_ON(lreq->osd != osd);
				2549	dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
				2550	osd->o_osd, lreq, lreq->linger_id);
				2551
				2552	lreq->osd = NULL;
				2553	erase_linger(&osd->o_linger_requests, lreq);
				2554	put_osd(osd);
				2555
				2556	if (!osd_homeless(osd))
				2557	maybe_move_osd_to_lru(osd);
				2558	else
				2559	atomic_dec(&osd->o_osdc->num_homeless);
				2560	}
				2561
				2562	static bool __linger_registered(struct ceph_osd_linger_request *lreq)
				2563	{
				2564	verify_osdc_locked(lreq->osdc);
				2565
				2566	return !RB_EMPTY_NODE(&lreq->osdc_node);
				2567	}
				2568
				2569	static bool linger_registered(struct ceph_osd_linger_request *lreq)
				2570	{
				2571	struct ceph_osd_client *osdc = lreq->osdc;
				2572	bool registered;
				2573
				2574	down_read(&osdc->lock);
				2575	registered = __linger_registered(lreq);
				2576	up_read(&osdc->lock);
				2577
				2578	return registered;
				2579	}
				2580
				2581	static void linger_register(struct ceph_osd_linger_request *lreq)
				2582	{
				2583	struct ceph_osd_client *osdc = lreq->osdc;
				2584
				2585	verify_osdc_wrlocked(osdc);
				2586	WARN_ON(lreq->linger_id);
				2587
				2588	linger_get(lreq);
				2589	lreq->linger_id = ++osdc->last_linger_id;
				2590	insert_linger_osdc(&osdc->linger_requests, lreq);
				2591	}
				2592
				2593	static void linger_unregister(struct ceph_osd_linger_request *lreq)
				2594	{
				2595	struct ceph_osd_client *osdc = lreq->osdc;
				2596
				2597	verify_osdc_wrlocked(osdc);
				2598
				2599	erase_linger_osdc(&osdc->linger_requests, lreq);
				2600	linger_put(lreq);
				2601	}
				2602
				2603	static void cancel_linger_request(struct ceph_osd_request *req)
				2604	{
				2605	struct ceph_osd_linger_request *lreq = req->r_priv;
				2606
				2607	WARN_ON(!req->r_linger);
				2608	cancel_request(req);
				2609	linger_put(lreq);
				2610	}
				2611
				2612	struct linger_work {
				2613	struct work_struct work;
				2614	struct ceph_osd_linger_request *lreq;
				2615	struct list_head pending_item;
				2616	unsigned long queued_stamp;
				2617
				2618	union {
				2619	struct {
				2620	u64 notify_id;
				2621	u64 notifier_id;
				2622	void payload; / points into @msg front */
				2623	size_t payload_len;
				2624
				2625	struct ceph_msg msg; / for ceph_msg_put() */
				2626	} notify;
				2627	struct {
				2628	int err;
				2629	} error;
				2630	};
				2631	};
				2632
				2633	static struct linger_work lwork_alloc(struct ceph_osd_linger_request lreq,
				2634	work_func_t workfn)
				2635	{
				2636	struct linger_work *lwork;
				2637
				2638	lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
				2639	if (!lwork)
				2640	return NULL;
				2641
				2642	INIT_WORK(&lwork->work, workfn);
				2643	INIT_LIST_HEAD(&lwork->pending_item);
				2644	lwork->lreq = linger_get(lreq);
				2645
				2646	return lwork;
				2647	}
				2648
				2649	static void lwork_free(struct linger_work *lwork)
				2650	{
				2651	struct ceph_osd_linger_request *lreq = lwork->lreq;
				2652
				2653	mutex_lock(&lreq->lock);
				2654	list_del(&lwork->pending_item);
				2655	mutex_unlock(&lreq->lock);
				2656
				2657	linger_put(lreq);
				2658	kfree(lwork);
				2659	}
				2660
				2661	static void lwork_queue(struct linger_work *lwork)
				2662	{
				2663	struct ceph_osd_linger_request *lreq = lwork->lreq;
				2664	struct ceph_osd_client *osdc = lreq->osdc;
				2665
				2666	verify_lreq_locked(lreq);
				2667	WARN_ON(!list_empty(&lwork->pending_item));
				2668
				2669	lwork->queued_stamp = jiffies;
				2670	list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
				2671	queue_work(osdc->notify_wq, &lwork->work);
				2672	}
				2673
				2674	static void do_watch_notify(struct work_struct *w)
				2675	{
				2676	struct linger_work *lwork = container_of(w, struct linger_work, work);
				2677	struct ceph_osd_linger_request *lreq = lwork->lreq;
				2678
				2679	if (!linger_registered(lreq)) {
				2680	dout("%s lreq %p not registered\n", __func__, lreq);
				2681	goto out;
				2682	}
				2683
				2684	WARN_ON(!lreq->is_watch);
				2685	dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
				2686	__func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
				2687	lwork->notify.payload_len);
				2688	lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
				2689	lwork->notify.notifier_id, lwork->notify.payload,
				2690	lwork->notify.payload_len);
				2691
				2692	out:
				2693	ceph_msg_put(lwork->notify.msg);
				2694	lwork_free(lwork);
				2695	}
				2696
				2697	static void do_watch_error(struct work_struct *w)
				2698	{
				2699	struct linger_work *lwork = container_of(w, struct linger_work, work);
				2700	struct ceph_osd_linger_request *lreq = lwork->lreq;
				2701
				2702	if (!linger_registered(lreq)) {
				2703	dout("%s lreq %p not registered\n", __func__, lreq);
				2704	goto out;
				2705	}
				2706
				2707	dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
				2708	lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
				2709
				2710	out:
				2711	lwork_free(lwork);
				2712	}
				2713
				2714	static void queue_watch_error(struct ceph_osd_linger_request *lreq)
				2715	{
				2716	struct linger_work *lwork;
				2717
				2718	lwork = lwork_alloc(lreq, do_watch_error);
				2719	if (!lwork) {
				2720	pr_err("failed to allocate error-lwork\n");
				2721	return;
				2722	}
				2723
				2724	lwork->error.err = lreq->last_error;
				2725	lwork_queue(lwork);
				2726	}
				2727
				2728	static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
				2729	int result)
				2730	{
				2731	if (!completion_done(&lreq->reg_commit_wait)) {
				2732	lreq->reg_commit_error = (result <= 0 ? result : 0);
				2733	complete_all(&lreq->reg_commit_wait);
				2734	}
				2735	}
				2736
				2737	static void linger_commit_cb(struct ceph_osd_request *req)
				2738	{
				2739	struct ceph_osd_linger_request *lreq = req->r_priv;
				2740
				2741	mutex_lock(&lreq->lock);
				2742	dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
				2743	lreq->linger_id, req->r_result);
				2744	linger_reg_commit_complete(lreq, req->r_result);
				2745	lreq->committed = true;
				2746
				2747	if (!lreq->is_watch) {
				2748	struct ceph_osd_data *osd_data =
				2749	osd_req_op_data(req, 0, notify, response_data);
				2750	void *p = page_address(osd_data->pages[0]);
				2751
				2752	WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY \|\|
				2753	osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
				2754
				2755	/* make note of the notify_id */
				2756	if (req->r_ops[0].outdata_len >= sizeof(u64)) {
				2757	lreq->notify_id = ceph_decode_64(&p);
				2758	dout("lreq %p notify_id %llu\n", lreq,
				2759	lreq->notify_id);
				2760	} else {
				2761	dout("lreq %p no notify_id\n", lreq);
				2762	}
				2763	}
				2764
				2765	mutex_unlock(&lreq->lock);
				2766	linger_put(lreq);
				2767	}
				2768
				2769	static int normalize_watch_error(int err)
				2770	{
				2771	/*
				2772	* Translate ENOENT -> ENOTCONN so that a delete->disconnection
				2773	* notification and a failure to reconnect because we raced with
				2774	* the delete appear the same to the user.
				2775	*/
				2776	if (err == -ENOENT)
				2777	err = -ENOTCONN;
				2778
				2779	return err;
				2780	}
				2781
				2782	static void linger_reconnect_cb(struct ceph_osd_request *req)
				2783	{
				2784	struct ceph_osd_linger_request *lreq = req->r_priv;
				2785
				2786	mutex_lock(&lreq->lock);
				2787	dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
				2788	lreq, lreq->linger_id, req->r_result, lreq->last_error);
				2789	if (req->r_result < 0) {
				2790	if (!lreq->last_error) {
				2791	lreq->last_error = normalize_watch_error(req->r_result);
				2792	queue_watch_error(lreq);
				2793	}
				2794	}
				2795
				2796	mutex_unlock(&lreq->lock);
				2797	linger_put(lreq);
				2798	}
				2799
				2800	static void send_linger(struct ceph_osd_linger_request *lreq)
				2801	{
				2802	struct ceph_osd_request *req = lreq->reg_req;
				2803	struct ceph_osd_req_op *op = &req->r_ops[0];
				2804
				2805	verify_osdc_wrlocked(req->r_osdc);
				2806	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
				2807
				2808	if (req->r_osd)
				2809	cancel_linger_request(req);
				2810
				2811	request_reinit(req);
				2812	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
				2813	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
				2814	req->r_flags = lreq->t.flags;
				2815	req->r_mtime = lreq->mtime;
				2816
				2817	mutex_lock(&lreq->lock);
				2818	if (lreq->is_watch && lreq->committed) {
				2819	WARN_ON(op->op != CEPH_OSD_OP_WATCH \|\|
				2820	op->watch.cookie != lreq->linger_id);
				2821	op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
				2822	op->watch.gen = ++lreq->register_gen;
				2823	dout("lreq %p reconnect register_gen %u\n", lreq,
				2824	op->watch.gen);
				2825	req->r_callback = linger_reconnect_cb;
				2826	} else {
				2827	if (!lreq->is_watch)
				2828	lreq->notify_id = 0;
				2829	else
				2830	WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
				2831	dout("lreq %p register\n", lreq);
				2832	req->r_callback = linger_commit_cb;
				2833	}
				2834	mutex_unlock(&lreq->lock);
				2835
				2836	req->r_priv = linger_get(lreq);
				2837	req->r_linger = true;
				2838
				2839	submit_request(req, true);
				2840	}
				2841
				2842	static void linger_ping_cb(struct ceph_osd_request *req)
				2843	{
				2844	struct ceph_osd_linger_request *lreq = req->r_priv;
				2845
				2846	mutex_lock(&lreq->lock);
				2847	dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
				2848	__func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
				2849	lreq->last_error);
				2850	if (lreq->register_gen == req->r_ops[0].watch.gen) {
				2851	if (!req->r_result) {
				2852	lreq->watch_valid_thru = lreq->ping_sent;
				2853	} else if (!lreq->last_error) {
				2854	lreq->last_error = normalize_watch_error(req->r_result);
				2855	queue_watch_error(lreq);
				2856	}
				2857	} else {
				2858	dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
				2859	lreq->register_gen, req->r_ops[0].watch.gen);
				2860	}
				2861
				2862	mutex_unlock(&lreq->lock);
				2863	linger_put(lreq);
				2864	}
				2865
				2866	static void send_linger_ping(struct ceph_osd_linger_request *lreq)
				2867	{
				2868	struct ceph_osd_client *osdc = lreq->osdc;
				2869	struct ceph_osd_request *req = lreq->ping_req;
				2870	struct ceph_osd_req_op *op = &req->r_ops[0];
				2871
				2872	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
				2873	dout("%s PAUSERD\n", __func__);
				2874	return;
				2875	}
				2876
				2877	lreq->ping_sent = jiffies;
				2878	dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
				2879	__func__, lreq, lreq->linger_id, lreq->ping_sent,
				2880	lreq->register_gen);
				2881
				2882	if (req->r_osd)
				2883	cancel_linger_request(req);
				2884
				2885	request_reinit(req);
				2886	target_copy(&req->r_t, &lreq->t);
				2887
				2888	WARN_ON(op->op != CEPH_OSD_OP_WATCH \|\|
				2889	op->watch.cookie != lreq->linger_id \|\|
				2890	op->watch.op != CEPH_OSD_WATCH_OP_PING);
				2891	op->watch.gen = lreq->register_gen;
				2892	req->r_callback = linger_ping_cb;
				2893	req->r_priv = linger_get(lreq);
				2894	req->r_linger = true;
				2895
				2896	ceph_osdc_get_request(req);
				2897	account_request(req);
				2898	req->r_tid = atomic64_inc_return(&osdc->last_tid);
				2899	link_request(lreq->osd, req);
				2900	send_request(req);
				2901	}
				2902
				2903	static void linger_submit(struct ceph_osd_linger_request *lreq)
				2904	{
				2905	struct ceph_osd_client *osdc = lreq->osdc;
				2906	struct ceph_osd *osd;
				2907
				2908	calc_target(osdc, &lreq->t, NULL, false);
				2909	osd = lookup_create_osd(osdc, lreq->t.osd, true);
				2910	link_linger(osd, lreq);
				2911
				2912	send_linger(lreq);
				2913	}
				2914
				2915	static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
				2916	{
				2917	struct ceph_osd_client *osdc = lreq->osdc;
				2918	struct ceph_osd_linger_request *lookup_lreq;
				2919
				2920	verify_osdc_wrlocked(osdc);
				2921
				2922	lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
				2923	lreq->linger_id);
				2924	if (!lookup_lreq)
				2925	return;
				2926
				2927	WARN_ON(lookup_lreq != lreq);
				2928	erase_linger_mc(&osdc->linger_map_checks, lreq);
				2929	linger_put(lreq);
				2930	}
				2931
				2932	/*
				2933	* @lreq has to be both registered and linked.
				2934	*/
				2935	static void __linger_cancel(struct ceph_osd_linger_request *lreq)
				2936	{
				2937	if (lreq->is_watch && lreq->ping_req->r_osd)
				2938	cancel_linger_request(lreq->ping_req);
				2939	if (lreq->reg_req->r_osd)
				2940	cancel_linger_request(lreq->reg_req);
				2941	cancel_linger_map_check(lreq);
				2942	unlink_linger(lreq->osd, lreq);
				2943	linger_unregister(lreq);
				2944	}
				2945
				2946	static void linger_cancel(struct ceph_osd_linger_request *lreq)
				2947	{
				2948	struct ceph_osd_client *osdc = lreq->osdc;
				2949
				2950	down_write(&osdc->lock);
				2951	if (__linger_registered(lreq))
				2952	__linger_cancel(lreq);
				2953	up_write(&osdc->lock);
				2954	}
				2955
				2956	static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
				2957
				2958	static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
				2959	{
				2960	struct ceph_osd_client *osdc = lreq->osdc;
				2961	struct ceph_osdmap *map = osdc->osdmap;
				2962
				2963	verify_osdc_wrlocked(osdc);
				2964	WARN_ON(!map->epoch);
				2965
				2966	if (lreq->register_gen) {
				2967	lreq->map_dne_bound = map->epoch;
				2968	dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
				2969	lreq, lreq->linger_id);
				2970	} else {
				2971	dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
				2972	__func__, lreq, lreq->linger_id, lreq->map_dne_bound,
				2973	map->epoch);
				2974	}
				2975
				2976	if (lreq->map_dne_bound) {
				2977	if (map->epoch >= lreq->map_dne_bound) {
				2978	/* we had a new enough map */
				2979	pr_info("linger_id %llu pool does not exist\n",
				2980	lreq->linger_id);
				2981	linger_reg_commit_complete(lreq, -ENOENT);
				2982	__linger_cancel(lreq);
				2983	}
				2984	} else {
				2985	send_linger_map_check(lreq);
				2986	}
				2987	}
				2988
				2989	static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
				2990	{
				2991	struct ceph_osd_client *osdc = &greq->monc->client->osdc;
				2992	struct ceph_osd_linger_request *lreq;
				2993	u64 linger_id = greq->private_data;
				2994
				2995	WARN_ON(greq->result \|\| !greq->u.newest);
				2996
				2997	down_write(&osdc->lock);
				2998	lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
				2999	if (!lreq) {
				3000	dout("%s linger_id %llu dne\n", __func__, linger_id);
				3001	goto out_unlock;
				3002	}
				3003
				3004	dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
				3005	__func__, lreq, lreq->linger_id, lreq->map_dne_bound,
				3006	greq->u.newest);
				3007	if (!lreq->map_dne_bound)
				3008	lreq->map_dne_bound = greq->u.newest;
				3009	erase_linger_mc(&osdc->linger_map_checks, lreq);
				3010	check_linger_pool_dne(lreq);
				3011
				3012	linger_put(lreq);
				3013	out_unlock:
				3014	up_write(&osdc->lock);
				3015	}
				3016
				3017	static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
				3018	{
				3019	struct ceph_osd_client *osdc = lreq->osdc;
				3020	struct ceph_osd_linger_request *lookup_lreq;
				3021	int ret;
				3022
				3023	verify_osdc_wrlocked(osdc);
				3024
				3025	lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
				3026	lreq->linger_id);
				3027	if (lookup_lreq) {
				3028	WARN_ON(lookup_lreq != lreq);
				3029	return;
				3030	}
				3031
				3032	linger_get(lreq);
				3033	insert_linger_mc(&osdc->linger_map_checks, lreq);
				3034	ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
				3035	linger_map_check_cb, lreq->linger_id);
				3036	WARN_ON(ret);
				3037	}
				3038
				3039	static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
				3040	{
				3041	int ret;
				3042
				3043	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
				3044	ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
				3045	return ret ?: lreq->reg_commit_error;
				3046	}
				3047
				3048	static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
				3049	{
				3050	int ret;
				3051
				3052	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
				3053	ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
				3054	return ret ?: lreq->notify_finish_error;
				3055	}
				3056
				3057	/*
				3058	* Timeout callback, called every N seconds. When 1 or more OSD
				3059	* requests has been active for more than N seconds, we send a keepalive
				3060	* (tag + timestamp) to its OSD to ensure any communications channel
				3061	* reset is detected.
				3062	*/
				3063	static void handle_timeout(struct work_struct *work)
				3064	{
				3065	struct ceph_osd_client *osdc =
				3066	container_of(work, struct ceph_osd_client, timeout_work.work);
				3067	struct ceph_options *opts = osdc->client->options;
				3068	unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
				3069	unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
				3070	LIST_HEAD(slow_osds);
				3071	struct rb_node n, p;
				3072
				3073	dout("%s osdc %p\n", __func__, osdc);
				3074	down_write(&osdc->lock);
				3075
				3076	/*
				3077	* ping osds that are a bit slow. this ensures that if there
				3078	* is a break in the TCP connection we will notice, and reopen
				3079	* a connection with that osd (from the fault callback).
				3080	*/
				3081	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
				3082	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				3083	bool found = false;
				3084
				3085	for (p = rb_first(&osd->o_requests); p; ) {
				3086	struct ceph_osd_request *req =
				3087	rb_entry(p, struct ceph_osd_request, r_node);
				3088
				3089	p = rb_next(p); /* abort_request() */
				3090
				3091	if (time_before(req->r_stamp, cutoff)) {
				3092	dout(" req %p tid %llu on osd%d is laggy\n",
				3093	req, req->r_tid, osd->o_osd);
				3094	found = true;
				3095	}
				3096	if (opts->osd_request_timeout &&
				3097	time_before(req->r_start_stamp, expiry_cutoff)) {
				3098	pr_err_ratelimited("tid %llu on osd%d timeout\n",
				3099	req->r_tid, osd->o_osd);
				3100	abort_request(req, -ETIMEDOUT);
				3101	}
				3102	}
				3103	for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
				3104	struct ceph_osd_linger_request *lreq =
				3105	rb_entry(p, struct ceph_osd_linger_request, node);
				3106
				3107	dout(" lreq %p linger_id %llu is served by osd%d\n",
				3108	lreq, lreq->linger_id, osd->o_osd);
				3109	found = true;
				3110
				3111	mutex_lock(&lreq->lock);
				3112	if (lreq->is_watch && lreq->committed && !lreq->last_error)
				3113	send_linger_ping(lreq);
				3114	mutex_unlock(&lreq->lock);
				3115	}
				3116
				3117	if (found)
				3118	list_move_tail(&osd->o_keepalive_item, &slow_osds);
				3119	}
				3120
				3121	if (opts->osd_request_timeout) {
				3122	for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
				3123	struct ceph_osd_request *req =
				3124	rb_entry(p, struct ceph_osd_request, r_node);
				3125
				3126	p = rb_next(p); /* abort_request() */
				3127
				3128	if (time_before(req->r_start_stamp, expiry_cutoff)) {
				3129	pr_err_ratelimited("tid %llu on osd%d timeout\n",
				3130	req->r_tid, osdc->homeless_osd.o_osd);
				3131	abort_request(req, -ETIMEDOUT);
				3132	}
				3133	}
				3134	}
				3135
				3136	if (atomic_read(&osdc->num_homeless) \|\| !list_empty(&slow_osds))
				3137	maybe_request_map(osdc);
				3138
				3139	while (!list_empty(&slow_osds)) {
				3140	struct ceph_osd *osd = list_first_entry(&slow_osds,
				3141	struct ceph_osd,
				3142	o_keepalive_item);
				3143	list_del_init(&osd->o_keepalive_item);
				3144	ceph_con_keepalive(&osd->o_con);
				3145	}
				3146
				3147	up_write(&osdc->lock);
				3148	schedule_delayed_work(&osdc->timeout_work,
				3149	osdc->client->options->osd_keepalive_timeout);
				3150	}
				3151
				3152	static void handle_osds_timeout(struct work_struct *work)
				3153	{
				3154	struct ceph_osd_client *osdc =
				3155	container_of(work, struct ceph_osd_client,
				3156	osds_timeout_work.work);
				3157	unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
				3158	struct ceph_osd osd, nosd;
				3159
				3160	dout("%s osdc %p\n", __func__, osdc);
				3161	down_write(&osdc->lock);
				3162	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
				3163	if (time_before(jiffies, osd->lru_ttl))
				3164	break;
				3165
				3166	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
				3167	WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
				3168	close_osd(osd);
				3169	}
				3170
				3171	up_write(&osdc->lock);
				3172	schedule_delayed_work(&osdc->osds_timeout_work,
				3173	round_jiffies_relative(delay));
				3174	}
				3175
				3176	static int ceph_oloc_decode(void *p, void end,
				3177	struct ceph_object_locator *oloc)
				3178	{
				3179	u8 struct_v, struct_cv;
				3180	u32 len;
				3181	void *struct_end;
				3182	int ret = 0;
				3183
				3184	ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
				3185	struct_v = ceph_decode_8(p);
				3186	struct_cv = ceph_decode_8(p);
				3187	if (struct_v < 3) {
				3188	pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
				3189	struct_v, struct_cv);
				3190	goto e_inval;
				3191	}
				3192	if (struct_cv > 6) {
				3193	pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
				3194	struct_v, struct_cv);
				3195	goto e_inval;
				3196	}
				3197	len = ceph_decode_32(p);
				3198	ceph_decode_need(p, end, len, e_inval);
				3199	struct_end = *p + len;
				3200
				3201	oloc->pool = ceph_decode_64(p);
				3202	p += 4; / skip preferred */
				3203
				3204	len = ceph_decode_32(p);
				3205	if (len > 0) {
				3206	pr_warn("ceph_object_locator::key is set\n");
				3207	goto e_inval;
				3208	}
				3209
				3210	if (struct_v >= 5) {
				3211	bool changed = false;
				3212
				3213	len = ceph_decode_32(p);
				3214	if (len > 0) {
				3215	ceph_decode_need(p, end, len, e_inval);
				3216	if (!oloc->pool_ns \|\|
				3217	ceph_compare_string(oloc->pool_ns, *p, len))
				3218	changed = true;
				3219	*p += len;
				3220	} else {
				3221	if (oloc->pool_ns)
				3222	changed = true;
				3223	}
				3224	if (changed) {
				3225	/* redirect changes namespace */
				3226	pr_warn("ceph_object_locator::nspace is changed\n");
				3227	goto e_inval;
				3228	}
				3229	}
				3230
				3231	if (struct_v >= 6) {
				3232	s64 hash = ceph_decode_64(p);
				3233	if (hash != -1) {
				3234	pr_warn("ceph_object_locator::hash is set\n");
				3235	goto e_inval;
				3236	}
				3237	}
				3238
				3239	/* skip the rest */
				3240	*p = struct_end;
				3241	out:
				3242	return ret;
				3243
				3244	e_inval:
				3245	ret = -EINVAL;
				3246	goto out;
				3247	}
				3248
				3249	static int ceph_redirect_decode(void *p, void end,
				3250	struct ceph_request_redirect *redir)
				3251	{
				3252	u8 struct_v, struct_cv;
				3253	u32 len;
				3254	void *struct_end;
				3255	int ret;
				3256
				3257	ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
				3258	struct_v = ceph_decode_8(p);
				3259	struct_cv = ceph_decode_8(p);
				3260	if (struct_cv > 1) {
				3261	pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
				3262	struct_v, struct_cv);
				3263	goto e_inval;
				3264	}
				3265	len = ceph_decode_32(p);
				3266	ceph_decode_need(p, end, len, e_inval);
				3267	struct_end = *p + len;
				3268
				3269	ret = ceph_oloc_decode(p, end, &redir->oloc);
				3270	if (ret)
				3271	goto out;
				3272
				3273	len = ceph_decode_32(p);
				3274	if (len > 0) {
				3275	pr_warn("ceph_request_redirect::object_name is set\n");
				3276	goto e_inval;
				3277	}
				3278
				3279	len = ceph_decode_32(p);
				3280	p += len; / skip osd_instructions */
				3281
				3282	/* skip the rest */
				3283	*p = struct_end;
				3284	out:
				3285	return ret;
				3286
				3287	e_inval:
				3288	ret = -EINVAL;
				3289	goto out;
				3290	}
				3291
				3292	struct MOSDOpReply {
				3293	struct ceph_pg pgid;
				3294	u64 flags;
				3295	int result;
				3296	u32 epoch;
				3297	int num_ops;
				3298	u32 outdata_len[CEPH_OSD_MAX_OPS];
				3299	s32 rval[CEPH_OSD_MAX_OPS];
				3300	int retry_attempt;
				3301	struct ceph_eversion replay_version;
				3302	u64 user_version;
				3303	struct ceph_request_redirect redirect;
				3304	};
				3305
				3306	static int decode_MOSDOpReply(const struct ceph_msg msg, struct MOSDOpReply m)
				3307	{
				3308	void *p = msg->front.iov_base;
				3309	void *const end = p + msg->front.iov_len;
				3310	u16 version = le16_to_cpu(msg->hdr.version);
				3311	struct ceph_eversion bad_replay_version;
				3312	u8 decode_redir;
				3313	u32 len;
				3314	int ret;
				3315	int i;
				3316
				3317	ceph_decode_32_safe(&p, end, len, e_inval);
				3318	ceph_decode_need(&p, end, len, e_inval);
				3319	p += len; /* skip oid */
				3320
				3321	ret = ceph_decode_pgid(&p, end, &m->pgid);
				3322	if (ret)
				3323	return ret;
				3324
				3325	ceph_decode_64_safe(&p, end, m->flags, e_inval);
				3326	ceph_decode_32_safe(&p, end, m->result, e_inval);
				3327	ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
				3328	memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
				3329	p += sizeof(bad_replay_version);
				3330	ceph_decode_32_safe(&p, end, m->epoch, e_inval);
				3331
				3332	ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
				3333	if (m->num_ops > ARRAY_SIZE(m->outdata_len))
				3334	goto e_inval;
				3335
				3336	ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
				3337	e_inval);
				3338	for (i = 0; i < m->num_ops; i++) {
				3339	struct ceph_osd_op *op = p;
				3340
				3341	m->outdata_len[i] = le32_to_cpu(op->payload_len);
				3342	p += sizeof(*op);
				3343	}
				3344
				3345	ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
				3346	for (i = 0; i < m->num_ops; i++)
				3347	ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
				3348
				3349	if (version >= 5) {
				3350	ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
				3351	memcpy(&m->replay_version, p, sizeof(m->replay_version));
				3352	p += sizeof(m->replay_version);
				3353	ceph_decode_64_safe(&p, end, m->user_version, e_inval);
				3354	} else {
				3355	m->replay_version = bad_replay_version; /* struct */
				3356	m->user_version = le64_to_cpu(m->replay_version.version);
				3357	}
				3358
				3359	if (version >= 6) {
				3360	if (version >= 7)
				3361	ceph_decode_8_safe(&p, end, decode_redir, e_inval);
				3362	else
				3363	decode_redir = 1;
				3364	} else {
				3365	decode_redir = 0;
				3366	}
				3367
				3368	if (decode_redir) {
				3369	ret = ceph_redirect_decode(&p, end, &m->redirect);
				3370	if (ret)
				3371	return ret;
				3372	} else {
				3373	ceph_oloc_init(&m->redirect.oloc);
				3374	}
				3375
				3376	return 0;
				3377
				3378	e_inval:
				3379	return -EINVAL;
				3380	}
				3381
				3382	/*
				3383	* Handle MOSDOpReply. Set ->r_result and call the callback if it is
				3384	* specified.
				3385	*/
				3386	static void handle_reply(struct ceph_osd osd, struct ceph_msg msg)
				3387	{
				3388	struct ceph_osd_client *osdc = osd->o_osdc;
				3389	struct ceph_osd_request *req;
				3390	struct MOSDOpReply m;
				3391	u64 tid = le64_to_cpu(msg->hdr.tid);
				3392	u32 data_len = 0;
				3393	int ret;
				3394	int i;
				3395
				3396	dout("%s msg %p tid %llu\n", __func__, msg, tid);
				3397
				3398	down_read(&osdc->lock);
				3399	if (!osd_registered(osd)) {
				3400	dout("%s osd%d unknown\n", __func__, osd->o_osd);
				3401	goto out_unlock_osdc;
				3402	}
				3403	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
				3404
				3405	mutex_lock(&osd->lock);
				3406	req = lookup_request(&osd->o_requests, tid);
				3407	if (!req) {
				3408	dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
				3409	goto out_unlock_session;
				3410	}
				3411
				3412	m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns;
				3413	ret = decode_MOSDOpReply(msg, &m);
				3414	m.redirect.oloc.pool_ns = NULL;
				3415	if (ret) {
				3416	pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
				3417	req->r_tid, ret);
				3418	ceph_msg_dump(msg);
				3419	goto fail_request;
				3420	}
				3421	dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
				3422	__func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
				3423	m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
				3424	le64_to_cpu(m.replay_version.version), m.user_version);
				3425
				3426	if (m.retry_attempt >= 0) {
				3427	if (m.retry_attempt != req->r_attempts - 1) {
				3428	dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
				3429	req, req->r_tid, m.retry_attempt,
				3430	req->r_attempts - 1);
				3431	goto out_unlock_session;
				3432	}
				3433	} else {
				3434	WARN_ON(1); /* MOSDOpReply v4 is assumed */
				3435	}
				3436
				3437	if (!ceph_oloc_empty(&m.redirect.oloc)) {
				3438	dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
				3439	m.redirect.oloc.pool);
				3440	unlink_request(osd, req);
				3441	mutex_unlock(&osd->lock);
				3442
				3443	/*
				3444	* Not ceph_oloc_copy() - changing pool_ns is not
				3445	* supported.
				3446	*/
				3447	req->r_t.target_oloc.pool = m.redirect.oloc.pool;
				3448	req->r_flags \|= CEPH_OSD_FLAG_REDIRECTED \|
				3449	CEPH_OSD_FLAG_IGNORE_OVERLAY \|
				3450	CEPH_OSD_FLAG_IGNORE_CACHE;
				3451	req->r_tid = 0;
				3452	__submit_request(req, false);
				3453	goto out_unlock_osdc;
				3454	}
				3455
				3456	if (m.num_ops != req->r_num_ops) {
				3457	pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
				3458	req->r_num_ops, req->r_tid);
				3459	goto fail_request;
				3460	}
				3461	for (i = 0; i < req->r_num_ops; i++) {
				3462	dout(" req %p tid %llu op %d rval %d len %u\n", req,
				3463	req->r_tid, i, m.rval[i], m.outdata_len[i]);
				3464	req->r_ops[i].rval = m.rval[i];
				3465	req->r_ops[i].outdata_len = m.outdata_len[i];
				3466	data_len += m.outdata_len[i];
				3467	}
				3468	if (data_len != le32_to_cpu(msg->hdr.data_len)) {
				3469	pr_err("sum of lens %u != %u for tid %llu\n", data_len,
				3470	le32_to_cpu(msg->hdr.data_len), req->r_tid);
				3471	goto fail_request;
				3472	}
				3473	dout("%s req %p tid %llu result %d data_len %u\n", __func__,
				3474	req, req->r_tid, m.result, data_len);
				3475
				3476	/*
				3477	* Since we only ever request ONDISK, we should only ever get
				3478	* one (type of) reply back.
				3479	*/
				3480	WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
				3481	req->r_result = m.result ?: data_len;
				3482	finish_request(req);
				3483	mutex_unlock(&osd->lock);
				3484	up_read(&osdc->lock);
				3485
				3486	__complete_request(req);
				3487	complete_all(&req->r_completion);
				3488	ceph_osdc_put_request(req);
				3489	return;
				3490
				3491	fail_request:
				3492	complete_request(req, -EIO);
				3493	out_unlock_session:
				3494	mutex_unlock(&osd->lock);
				3495	out_unlock_osdc:
				3496	up_read(&osdc->lock);
				3497	}
				3498
				3499	static void set_pool_was_full(struct ceph_osd_client *osdc)
				3500	{
				3501	struct rb_node *n;
				3502
				3503	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
				3504	struct ceph_pg_pool_info *pi =
				3505	rb_entry(n, struct ceph_pg_pool_info, node);
				3506
				3507	pi->was_full = __pool_full(pi);
				3508	}
				3509	}
				3510
				3511	static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
				3512	{
				3513	struct ceph_pg_pool_info *pi;
				3514
				3515	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
				3516	if (!pi)
				3517	return false;
				3518
				3519	return pi->was_full && !__pool_full(pi);
				3520	}
				3521
				3522	static enum calc_target_result
				3523	recalc_linger_target(struct ceph_osd_linger_request *lreq)
				3524	{
				3525	struct ceph_osd_client *osdc = lreq->osdc;
				3526	enum calc_target_result ct_res;
				3527
				3528	ct_res = calc_target(osdc, &lreq->t, NULL, true);
				3529	if (ct_res == CALC_TARGET_NEED_RESEND) {
				3530	struct ceph_osd *osd;
				3531
				3532	osd = lookup_create_osd(osdc, lreq->t.osd, true);
				3533	if (osd != lreq->osd) {
				3534	unlink_linger(lreq->osd, lreq);
				3535	link_linger(osd, lreq);
				3536	}
				3537	}
				3538
				3539	return ct_res;
				3540	}
				3541
				3542	/*
				3543	* Requeue requests whose mapping to an OSD has changed.
				3544	*/
				3545	static void scan_requests(struct ceph_osd *osd,
				3546	bool force_resend,
				3547	bool cleared_full,
				3548	bool check_pool_cleared_full,
				3549	struct rb_root *need_resend,
				3550	struct list_head *need_resend_linger)
				3551	{
				3552	struct ceph_osd_client *osdc = osd->o_osdc;
				3553	struct rb_node *n;
				3554	bool force_resend_writes;
				3555
				3556	for (n = rb_first(&osd->o_linger_requests); n; ) {
				3557	struct ceph_osd_linger_request *lreq =
				3558	rb_entry(n, struct ceph_osd_linger_request, node);
				3559	enum calc_target_result ct_res;
				3560
				3561	n = rb_next(n); /* recalc_linger_target() */
				3562
				3563	dout("%s lreq %p linger_id %llu\n", __func__, lreq,
				3564	lreq->linger_id);
				3565	ct_res = recalc_linger_target(lreq);
				3566	switch (ct_res) {
				3567	case CALC_TARGET_NO_ACTION:
				3568	force_resend_writes = cleared_full \|\|
				3569	(check_pool_cleared_full &&
				3570	pool_cleared_full(osdc, lreq->t.base_oloc.pool));
				3571	if (!force_resend && !force_resend_writes)
				3572	break;
				3573
				3574	/* fall through */
				3575	case CALC_TARGET_NEED_RESEND:
				3576	cancel_linger_map_check(lreq);
				3577	/*
				3578	* scan_requests() for the previous epoch(s)
				3579	* may have already added it to the list, since
				3580	* it's not unlinked here.
				3581	*/
				3582	if (list_empty(&lreq->scan_item))
				3583	list_add_tail(&lreq->scan_item, need_resend_linger);
				3584	break;
				3585	case CALC_TARGET_POOL_DNE:
				3586	list_del_init(&lreq->scan_item);
				3587	check_linger_pool_dne(lreq);
				3588	break;
				3589	}
				3590	}
				3591
				3592	for (n = rb_first(&osd->o_requests); n; ) {
				3593	struct ceph_osd_request *req =
				3594	rb_entry(n, struct ceph_osd_request, r_node);
				3595	enum calc_target_result ct_res;
				3596
				3597	n = rb_next(n); /* unlink_request(), check_pool_dne() */
				3598
				3599	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
				3600	ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con,
				3601	false);
				3602	switch (ct_res) {
				3603	case CALC_TARGET_NO_ACTION:
				3604	force_resend_writes = cleared_full \|\|
				3605	(check_pool_cleared_full &&
				3606	pool_cleared_full(osdc, req->r_t.base_oloc.pool));
				3607	if (!force_resend &&
				3608	(!(req->r_flags & CEPH_OSD_FLAG_WRITE) \|\|
				3609	!force_resend_writes))
				3610	break;
				3611
				3612	/* fall through */
				3613	case CALC_TARGET_NEED_RESEND:
				3614	cancel_map_check(req);
				3615	unlink_request(osd, req);
				3616	insert_request(need_resend, req);
				3617	break;
				3618	case CALC_TARGET_POOL_DNE:
				3619	check_pool_dne(req);
				3620	break;
				3621	}
				3622	}
				3623	}
				3624
				3625	static int handle_one_map(struct ceph_osd_client *osdc,
				3626	void p, void end, bool incremental,
				3627	struct rb_root *need_resend,
				3628	struct list_head *need_resend_linger)
				3629	{
				3630	struct ceph_osdmap *newmap;
				3631	struct rb_node *n;
				3632	bool skipped_map = false;
				3633	bool was_full;
				3634
				3635	was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
				3636	set_pool_was_full(osdc);
				3637
				3638	if (incremental)
				3639	newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
				3640	else
				3641	newmap = ceph_osdmap_decode(&p, end);
				3642	if (IS_ERR(newmap))
				3643	return PTR_ERR(newmap);
				3644
				3645	if (newmap != osdc->osdmap) {
				3646	/*
				3647	* Preserve ->was_full before destroying the old map.
				3648	* For pools that weren't in the old map, ->was_full
				3649	* should be false.
				3650	*/
				3651	for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
				3652	struct ceph_pg_pool_info *pi =
				3653	rb_entry(n, struct ceph_pg_pool_info, node);
				3654	struct ceph_pg_pool_info *old_pi;
				3655
				3656	old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
				3657	if (old_pi)
				3658	pi->was_full = old_pi->was_full;
				3659	else
				3660	WARN_ON(pi->was_full);
				3661	}
				3662
				3663	if (osdc->osdmap->epoch &&
				3664	osdc->osdmap->epoch + 1 < newmap->epoch) {
				3665	WARN_ON(incremental);
				3666	skipped_map = true;
				3667	}
				3668
				3669	ceph_osdmap_destroy(osdc->osdmap);
				3670	osdc->osdmap = newmap;
				3671	}
				3672
				3673	was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
				3674	scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
				3675	need_resend, need_resend_linger);
				3676
				3677	for (n = rb_first(&osdc->osds); n; ) {
				3678	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				3679
				3680	n = rb_next(n); /* close_osd() */
				3681
				3682	scan_requests(osd, skipped_map, was_full, true, need_resend,
				3683	need_resend_linger);
				3684	if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) \|\|
				3685	memcmp(&osd->o_con.peer_addr,
				3686	ceph_osd_addr(osdc->osdmap, osd->o_osd),
				3687	sizeof(struct ceph_entity_addr)))
				3688	close_osd(osd);
				3689	}
				3690
				3691	return 0;
				3692	}
				3693
				3694	static void kick_requests(struct ceph_osd_client *osdc,
				3695	struct rb_root *need_resend,
				3696	struct list_head *need_resend_linger)
				3697	{
				3698	struct ceph_osd_linger_request lreq, nlreq;
				3699	enum calc_target_result ct_res;
				3700	struct rb_node *n;
				3701
				3702	/* make sure need_resend targets reflect latest map */
				3703	for (n = rb_first(need_resend); n; ) {
				3704	struct ceph_osd_request *req =
				3705	rb_entry(n, struct ceph_osd_request, r_node);
				3706
				3707	n = rb_next(n);
				3708
				3709	if (req->r_t.epoch < osdc->osdmap->epoch) {
				3710	ct_res = calc_target(osdc, &req->r_t, NULL, false);
				3711	if (ct_res == CALC_TARGET_POOL_DNE) {
				3712	erase_request(need_resend, req);
				3713	check_pool_dne(req);
				3714	}
				3715	}
				3716	}
				3717
				3718	for (n = rb_first(need_resend); n; ) {
				3719	struct ceph_osd_request *req =
				3720	rb_entry(n, struct ceph_osd_request, r_node);
				3721	struct ceph_osd *osd;
				3722
				3723	n = rb_next(n);
				3724	erase_request(need_resend, req); /* before link_request() */
				3725
				3726	osd = lookup_create_osd(osdc, req->r_t.osd, true);
				3727	link_request(osd, req);
				3728	if (!req->r_linger) {
				3729	if (!osd_homeless(osd) && !req->r_t.paused)
				3730	send_request(req);
				3731	} else {
				3732	cancel_linger_request(req);
				3733	}
				3734	}
				3735
				3736	list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
				3737	if (!osd_homeless(lreq->osd))
				3738	send_linger(lreq);
				3739
				3740	list_del_init(&lreq->scan_item);
				3741	}
				3742	}
				3743
				3744	/*
				3745	* Process updated osd map.
				3746	*
				3747	* The message contains any number of incremental and full maps, normally
				3748	* indicating some sort of topology change in the cluster. Kick requests
				3749	* off to different OSDs as needed.
				3750	*/
				3751	void ceph_osdc_handle_map(struct ceph_osd_client osdc, struct ceph_msg msg)
				3752	{
				3753	void *p = msg->front.iov_base;
				3754	void *const end = p + msg->front.iov_len;
				3755	u32 nr_maps, maplen;
				3756	u32 epoch;
				3757	struct ceph_fsid fsid;
				3758	struct rb_root need_resend = RB_ROOT;
				3759	LIST_HEAD(need_resend_linger);
				3760	bool handled_incremental = false;
				3761	bool was_pauserd, was_pausewr;
				3762	bool pauserd, pausewr;
				3763	int err;
				3764
				3765	dout("%s have %u\n", __func__, osdc->osdmap->epoch);
				3766	down_write(&osdc->lock);
				3767
				3768	/* verify fsid */
				3769	ceph_decode_need(&p, end, sizeof(fsid), bad);
				3770	ceph_decode_copy(&p, &fsid, sizeof(fsid));
				3771	if (ceph_check_fsid(osdc->client, &fsid) < 0)
				3772	goto bad;
				3773
				3774	was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
				3775	was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) \|\|
				3776	ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				3777	have_pool_full(osdc);
				3778
				3779	/* incremental maps */
				3780	ceph_decode_32_safe(&p, end, nr_maps, bad);
				3781	dout(" %d inc maps\n", nr_maps);
				3782	while (nr_maps > 0) {
				3783	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
				3784	epoch = ceph_decode_32(&p);
				3785	maplen = ceph_decode_32(&p);
				3786	ceph_decode_need(&p, end, maplen, bad);
				3787	if (osdc->osdmap->epoch &&
				3788	osdc->osdmap->epoch + 1 == epoch) {
				3789	dout("applying incremental map %u len %d\n",
				3790	epoch, maplen);
				3791	err = handle_one_map(osdc, p, p + maplen, true,
				3792	&need_resend, &need_resend_linger);
				3793	if (err)
				3794	goto bad;
				3795	handled_incremental = true;
				3796	} else {
				3797	dout("ignoring incremental map %u len %d\n",
				3798	epoch, maplen);
				3799	}
				3800	p += maplen;
				3801	nr_maps--;
				3802	}
				3803	if (handled_incremental)
				3804	goto done;
				3805
				3806	/* full maps */
				3807	ceph_decode_32_safe(&p, end, nr_maps, bad);
				3808	dout(" %d full maps\n", nr_maps);
				3809	while (nr_maps) {
				3810	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
				3811	epoch = ceph_decode_32(&p);
				3812	maplen = ceph_decode_32(&p);
				3813	ceph_decode_need(&p, end, maplen, bad);
				3814	if (nr_maps > 1) {
				3815	dout("skipping non-latest full map %u len %d\n",
				3816	epoch, maplen);
				3817	} else if (osdc->osdmap->epoch >= epoch) {
				3818	dout("skipping full map %u len %d, "
				3819	"older than our %u\n", epoch, maplen,
				3820	osdc->osdmap->epoch);
				3821	} else {
				3822	dout("taking full map %u len %d\n", epoch, maplen);
				3823	err = handle_one_map(osdc, p, p + maplen, false,
				3824	&need_resend, &need_resend_linger);
				3825	if (err)
				3826	goto bad;
				3827	}
				3828	p += maplen;
				3829	nr_maps--;
				3830	}
				3831
				3832	done:
				3833	/*
				3834	* subscribe to subsequent osdmap updates if full to ensure
				3835	* we find out when we are no longer full and stop returning
				3836	* ENOSPC.
				3837	*/
				3838	pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
				3839	pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) \|\|
				3840	ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				3841	have_pool_full(osdc);
				3842	if (was_pauserd \|\| was_pausewr \|\| pauserd \|\| pausewr \|\|
				3843	osdc->osdmap->epoch < osdc->epoch_barrier)
				3844	maybe_request_map(osdc);
				3845
				3846	kick_requests(osdc, &need_resend, &need_resend_linger);
				3847
				3848	ceph_osdc_abort_on_full(osdc);
				3849	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
				3850	osdc->osdmap->epoch);
				3851	up_write(&osdc->lock);
				3852	wake_up_all(&osdc->client->auth_wq);
				3853	return;
				3854
				3855	bad:
				3856	pr_err("osdc handle_map corrupt msg\n");
				3857	ceph_msg_dump(msg);
				3858	up_write(&osdc->lock);
				3859	}
				3860
				3861	/*
				3862	* Resubmit requests pending on the given osd.
				3863	*/
				3864	static void kick_osd_requests(struct ceph_osd *osd)
				3865	{
				3866	struct rb_node *n;
				3867
				3868	clear_backoffs(osd);
				3869
				3870	for (n = rb_first(&osd->o_requests); n; ) {
				3871	struct ceph_osd_request *req =
				3872	rb_entry(n, struct ceph_osd_request, r_node);
				3873
				3874	n = rb_next(n); /* cancel_linger_request() */
				3875
				3876	if (!req->r_linger) {
				3877	if (!req->r_t.paused)
				3878	send_request(req);
				3879	} else {
				3880	cancel_linger_request(req);
				3881	}
				3882	}
				3883	for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
				3884	struct ceph_osd_linger_request *lreq =
				3885	rb_entry(n, struct ceph_osd_linger_request, node);
				3886
				3887	send_linger(lreq);
				3888	}
				3889	}
				3890
				3891	/*
				3892	* If the osd connection drops, we need to resubmit all requests.
				3893	*/
				3894	static void osd_fault(struct ceph_connection *con)
				3895	{
				3896	struct ceph_osd *osd = con->private;
				3897	struct ceph_osd_client *osdc = osd->o_osdc;
				3898
				3899	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				3900
				3901	down_write(&osdc->lock);
				3902	if (!osd_registered(osd)) {
				3903	dout("%s osd%d unknown\n", __func__, osd->o_osd);
				3904	goto out_unlock;
				3905	}
				3906
				3907	if (!reopen_osd(osd))
				3908	kick_osd_requests(osd);
				3909	maybe_request_map(osdc);
				3910
				3911	out_unlock:
				3912	up_write(&osdc->lock);
				3913	}
				3914
				3915	struct MOSDBackoff {
				3916	struct ceph_spg spgid;
				3917	u32 map_epoch;
				3918	u8 op;
				3919	u64 id;
				3920	struct ceph_hobject_id *begin;
				3921	struct ceph_hobject_id *end;
				3922	};
				3923
				3924	static int decode_MOSDBackoff(const struct ceph_msg msg, struct MOSDBackoff m)
				3925	{
				3926	void *p = msg->front.iov_base;
				3927	void *const end = p + msg->front.iov_len;
				3928	u8 struct_v;
				3929	u32 struct_len;
				3930	int ret;
				3931
				3932	ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len);
				3933	if (ret)
				3934	return ret;
				3935
				3936	ret = ceph_decode_pgid(&p, end, &m->spgid.pgid);
				3937	if (ret)
				3938	return ret;
				3939
				3940	ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval);
				3941	ceph_decode_32_safe(&p, end, m->map_epoch, e_inval);
				3942	ceph_decode_8_safe(&p, end, m->op, e_inval);
				3943	ceph_decode_64_safe(&p, end, m->id, e_inval);
				3944
				3945	m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO);
				3946	if (!m->begin)
				3947	return -ENOMEM;
				3948
				3949	ret = decode_hoid(&p, end, m->begin);
				3950	if (ret) {
				3951	free_hoid(m->begin);
				3952	return ret;
				3953	}
				3954
				3955	m->end = kzalloc(sizeof(*m->end), GFP_NOIO);
				3956	if (!m->end) {
				3957	free_hoid(m->begin);
				3958	return -ENOMEM;
				3959	}
				3960
				3961	ret = decode_hoid(&p, end, m->end);
				3962	if (ret) {
				3963	free_hoid(m->begin);
				3964	free_hoid(m->end);
				3965	return ret;
				3966	}
				3967
				3968	return 0;
				3969
				3970	e_inval:
				3971	return -EINVAL;
				3972	}
				3973
				3974	static struct ceph_msg *create_backoff_message(
				3975	const struct ceph_osd_backoff *backoff,
				3976	u32 map_epoch)
				3977	{
				3978	struct ceph_msg *msg;
				3979	void p, end;
				3980	int msg_size;
				3981
				3982	msg_size = CEPH_ENCODING_START_BLK_LEN +
				3983	CEPH_PGID_ENCODING_LEN + 1; /* spgid */
				3984	msg_size += 4 + 1 + 8; /* map_epoch, op, id */
				3985	msg_size += CEPH_ENCODING_START_BLK_LEN +
				3986	hoid_encoding_size(backoff->begin);
				3987	msg_size += CEPH_ENCODING_START_BLK_LEN +
				3988	hoid_encoding_size(backoff->end);
				3989
				3990	msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true);
				3991	if (!msg)
				3992	return NULL;
				3993
				3994	p = msg->front.iov_base;
				3995	end = p + msg->front_alloc_len;
				3996
				3997	encode_spgid(&p, &backoff->spgid);
				3998	ceph_encode_32(&p, map_epoch);
				3999	ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK);
				4000	ceph_encode_64(&p, backoff->id);
				4001	encode_hoid(&p, end, backoff->begin);
				4002	encode_hoid(&p, end, backoff->end);
				4003	BUG_ON(p != end);
				4004
				4005	msg->front.iov_len = p - msg->front.iov_base;
				4006	msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */
				4007	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				4008
				4009	return msg;
				4010	}
				4011
				4012	static void handle_backoff_block(struct ceph_osd osd, struct MOSDBackoff m)
				4013	{
				4014	struct ceph_spg_mapping *spg;
				4015	struct ceph_osd_backoff *backoff;
				4016	struct ceph_msg *msg;
				4017
				4018	dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
				4019	m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
				4020
				4021	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid);
				4022	if (!spg) {
				4023	spg = alloc_spg_mapping();
				4024	if (!spg) {
				4025	pr_err("%s failed to allocate spg\n", __func__);
				4026	return;
				4027	}
				4028	spg->spgid = m->spgid; /* struct */
				4029	insert_spg_mapping(&osd->o_backoff_mappings, spg);
				4030	}
				4031
				4032	backoff = alloc_backoff();
				4033	if (!backoff) {
				4034	pr_err("%s failed to allocate backoff\n", __func__);
				4035	return;
				4036	}
				4037	backoff->spgid = m->spgid; /* struct */
				4038	backoff->id = m->id;
				4039	backoff->begin = m->begin;
				4040	m->begin = NULL; /* backoff now owns this */
				4041	backoff->end = m->end;
				4042	m->end = NULL; /* ditto */
				4043
				4044	insert_backoff(&spg->backoffs, backoff);
				4045	insert_backoff_by_id(&osd->o_backoffs_by_id, backoff);
				4046
				4047	/*
				4048	* Ack with original backoff's epoch so that the OSD can
				4049	* discard this if there was a PG split.
				4050	*/
				4051	msg = create_backoff_message(backoff, m->map_epoch);
				4052	if (!msg) {
				4053	pr_err("%s failed to allocate msg\n", __func__);
				4054	return;
				4055	}
				4056	ceph_con_send(&osd->o_con, msg);
				4057	}
				4058
				4059	static bool target_contained_by(const struct ceph_osd_request_target *t,
				4060	const struct ceph_hobject_id *begin,
				4061	const struct ceph_hobject_id *end)
				4062	{
				4063	struct ceph_hobject_id hoid;
				4064	int cmp;
				4065
				4066	hoid_fill_from_target(&hoid, t);
				4067	cmp = hoid_compare(&hoid, begin);
				4068	return !cmp \|\| (cmp > 0 && hoid_compare(&hoid, end) < 0);
				4069	}
				4070
				4071	static void handle_backoff_unblock(struct ceph_osd *osd,
				4072	const struct MOSDBackoff *m)
				4073	{
				4074	struct ceph_spg_mapping *spg;
				4075	struct ceph_osd_backoff *backoff;
				4076	struct rb_node *n;
				4077
				4078	dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
				4079	m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
				4080
				4081	backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id);
				4082	if (!backoff) {
				4083	pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n",
				4084	__func__, osd->o_osd, m->spgid.pgid.pool,
				4085	m->spgid.pgid.seed, m->spgid.shard, m->id);
				4086	return;
				4087	}
				4088
				4089	if (hoid_compare(backoff->begin, m->begin) &&
				4090	hoid_compare(backoff->end, m->end)) {
				4091	pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n",
				4092	__func__, osd->o_osd, m->spgid.pgid.pool,
				4093	m->spgid.pgid.seed, m->spgid.shard, m->id);
				4094	/* unblock it anyway... */
				4095	}
				4096
				4097	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid);
				4098	BUG_ON(!spg);
				4099
				4100	erase_backoff(&spg->backoffs, backoff);
				4101	erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
				4102	free_backoff(backoff);
				4103
				4104	if (RB_EMPTY_ROOT(&spg->backoffs)) {
				4105	erase_spg_mapping(&osd->o_backoff_mappings, spg);
				4106	free_spg_mapping(spg);
				4107	}
				4108
				4109	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
				4110	struct ceph_osd_request *req =
				4111	rb_entry(n, struct ceph_osd_request, r_node);
				4112
				4113	if (!ceph_spg_compare(&req->r_t.spgid, &m->spgid)) {
				4114	/*
				4115	* Match against @m, not @backoff -- the PG may
				4116	* have split on the OSD.
				4117	*/
				4118	if (target_contained_by(&req->r_t, m->begin, m->end)) {
				4119	/*
				4120	* If no other installed backoff applies,
				4121	* resend.
				4122	*/
				4123	send_request(req);
				4124	}
				4125	}
				4126	}
				4127	}
				4128
				4129	static void handle_backoff(struct ceph_osd osd, struct ceph_msg msg)
				4130	{
				4131	struct ceph_osd_client *osdc = osd->o_osdc;
				4132	struct MOSDBackoff m;
				4133	int ret;
				4134
				4135	down_read(&osdc->lock);
				4136	if (!osd_registered(osd)) {
				4137	dout("%s osd%d unknown\n", __func__, osd->o_osd);
				4138	up_read(&osdc->lock);
				4139	return;
				4140	}
				4141	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
				4142
				4143	mutex_lock(&osd->lock);
				4144	ret = decode_MOSDBackoff(msg, &m);
				4145	if (ret) {
				4146	pr_err("failed to decode MOSDBackoff: %d\n", ret);
				4147	ceph_msg_dump(msg);
				4148	goto out_unlock;
				4149	}
				4150
				4151	switch (m.op) {
				4152	case CEPH_OSD_BACKOFF_OP_BLOCK:
				4153	handle_backoff_block(osd, &m);
				4154	break;
				4155	case CEPH_OSD_BACKOFF_OP_UNBLOCK:
				4156	handle_backoff_unblock(osd, &m);
				4157	break;
				4158	default:
				4159	pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op);
				4160	}
				4161
				4162	free_hoid(m.begin);
				4163	free_hoid(m.end);
				4164
				4165	out_unlock:
				4166	mutex_unlock(&osd->lock);
				4167	up_read(&osdc->lock);
				4168	}
				4169
				4170	/*
				4171	* Process osd watch notifications
				4172	*/
				4173	static void handle_watch_notify(struct ceph_osd_client *osdc,
				4174	struct ceph_msg *msg)
				4175	{
				4176	void *p = msg->front.iov_base;
				4177	void *const end = p + msg->front.iov_len;
				4178	struct ceph_osd_linger_request *lreq;
				4179	struct linger_work *lwork;
				4180	u8 proto_ver, opcode;
				4181	u64 cookie, notify_id;
				4182	u64 notifier_id = 0;
				4183	s32 return_code = 0;
				4184	void *payload = NULL;
				4185	u32 payload_len = 0;
				4186
				4187	ceph_decode_8_safe(&p, end, proto_ver, bad);
				4188	ceph_decode_8_safe(&p, end, opcode, bad);
				4189	ceph_decode_64_safe(&p, end, cookie, bad);
				4190	p += 8; /* skip ver */
				4191	ceph_decode_64_safe(&p, end, notify_id, bad);
				4192
				4193	if (proto_ver >= 1) {
				4194	ceph_decode_32_safe(&p, end, payload_len, bad);
				4195	ceph_decode_need(&p, end, payload_len, bad);
				4196	payload = p;
				4197	p += payload_len;
				4198	}
				4199
				4200	if (le16_to_cpu(msg->hdr.version) >= 2)
				4201	ceph_decode_32_safe(&p, end, return_code, bad);
				4202
				4203	if (le16_to_cpu(msg->hdr.version) >= 3)
				4204	ceph_decode_64_safe(&p, end, notifier_id, bad);
				4205
				4206	down_read(&osdc->lock);
				4207	lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
				4208	if (!lreq) {
				4209	dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
				4210	cookie);
				4211	goto out_unlock_osdc;
				4212	}
				4213
				4214	mutex_lock(&lreq->lock);
				4215	dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
				4216	opcode, cookie, lreq, lreq->is_watch);
				4217	if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
				4218	if (!lreq->last_error) {
				4219	lreq->last_error = -ENOTCONN;
				4220	queue_watch_error(lreq);
				4221	}
				4222	} else if (!lreq->is_watch) {
				4223	/* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
				4224	if (lreq->notify_id && lreq->notify_id != notify_id) {
				4225	dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
				4226	lreq->notify_id, notify_id);
				4227	} else if (!completion_done(&lreq->notify_finish_wait)) {
				4228	struct ceph_msg_data *data =
				4229	list_first_entry_or_null(&msg->data,
				4230	struct ceph_msg_data,
				4231	links);
				4232
				4233	if (data) {
				4234	if (lreq->preply_pages) {
				4235	WARN_ON(data->type !=
				4236	CEPH_MSG_DATA_PAGES);
				4237	*lreq->preply_pages = data->pages;
				4238	*lreq->preply_len = data->length;
				4239	} else {
				4240	ceph_release_page_vector(data->pages,
				4241	calc_pages_for(0, data->length));
				4242	}
				4243	}
				4244	lreq->notify_finish_error = return_code;
				4245	complete_all(&lreq->notify_finish_wait);
				4246	}
				4247	} else {
				4248	/* CEPH_WATCH_EVENT_NOTIFY */
				4249	lwork = lwork_alloc(lreq, do_watch_notify);
				4250	if (!lwork) {
				4251	pr_err("failed to allocate notify-lwork\n");
				4252	goto out_unlock_lreq;
				4253	}
				4254
				4255	lwork->notify.notify_id = notify_id;
				4256	lwork->notify.notifier_id = notifier_id;
				4257	lwork->notify.payload = payload;
				4258	lwork->notify.payload_len = payload_len;
				4259	lwork->notify.msg = ceph_msg_get(msg);
				4260	lwork_queue(lwork);
				4261	}
				4262
				4263	out_unlock_lreq:
				4264	mutex_unlock(&lreq->lock);
				4265	out_unlock_osdc:
				4266	up_read(&osdc->lock);
				4267	return;
				4268
				4269	bad:
				4270	pr_err("osdc handle_watch_notify corrupt msg\n");
				4271	}
				4272
				4273	/*
				4274	* Register request, send initial attempt.
				4275	*/
				4276	int ceph_osdc_start_request(struct ceph_osd_client *osdc,
				4277	struct ceph_osd_request *req,
				4278	bool nofail)
				4279	{
				4280	down_read(&osdc->lock);
				4281	submit_request(req, false);
				4282	up_read(&osdc->lock);
				4283
				4284	return 0;
				4285	}
				4286	EXPORT_SYMBOL(ceph_osdc_start_request);
				4287
				4288	/*
				4289	* Unregister a registered request. The request is not completed:
				4290	* ->r_result isn't set and __complete_request() isn't called.
				4291	*/
				4292	void ceph_osdc_cancel_request(struct ceph_osd_request *req)
				4293	{
				4294	struct ceph_osd_client *osdc = req->r_osdc;
				4295
				4296	down_write(&osdc->lock);
				4297	if (req->r_osd)
				4298	cancel_request(req);
				4299	up_write(&osdc->lock);
				4300	}
				4301	EXPORT_SYMBOL(ceph_osdc_cancel_request);
				4302
				4303	/*
				4304	* @timeout: in jiffies, 0 means "wait forever"
				4305	*/
				4306	static int wait_request_timeout(struct ceph_osd_request *req,
				4307	unsigned long timeout)
				4308	{
				4309	long left;
				4310
				4311	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
				4312	left = wait_for_completion_killable_timeout(&req->r_completion,
				4313	ceph_timeout_jiffies(timeout));
				4314	if (left <= 0) {
				4315	left = left ?: -ETIMEDOUT;
				4316	ceph_osdc_cancel_request(req);
				4317	} else {
				4318	left = req->r_result; /* completed */
				4319	}
				4320
				4321	return left;
				4322	}
				4323
				4324	/*
				4325	* wait for a request to complete
				4326	*/
				4327	int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
				4328	struct ceph_osd_request *req)
				4329	{
				4330	return wait_request_timeout(req, 0);
				4331	}
				4332	EXPORT_SYMBOL(ceph_osdc_wait_request);
				4333
				4334	/*
				4335	* sync - wait for all in-flight requests to flush. avoid starvation.
				4336	*/
				4337	void ceph_osdc_sync(struct ceph_osd_client *osdc)
				4338	{
				4339	struct rb_node n, p;
				4340	u64 last_tid = atomic64_read(&osdc->last_tid);
				4341
				4342	again:
				4343	down_read(&osdc->lock);
				4344	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
				4345	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				4346
				4347	mutex_lock(&osd->lock);
				4348	for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
				4349	struct ceph_osd_request *req =
				4350	rb_entry(p, struct ceph_osd_request, r_node);
				4351
				4352	if (req->r_tid > last_tid)
				4353	break;
				4354
				4355	if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
				4356	continue;
				4357
				4358	ceph_osdc_get_request(req);
				4359	mutex_unlock(&osd->lock);
				4360	up_read(&osdc->lock);
				4361	dout("%s waiting on req %p tid %llu last_tid %llu\n",
				4362	__func__, req, req->r_tid, last_tid);
				4363	wait_for_completion(&req->r_completion);
				4364	ceph_osdc_put_request(req);
				4365	goto again;
				4366	}
				4367
				4368	mutex_unlock(&osd->lock);
				4369	}
				4370
				4371	up_read(&osdc->lock);
				4372	dout("%s done last_tid %llu\n", __func__, last_tid);
				4373	}
				4374	EXPORT_SYMBOL(ceph_osdc_sync);
				4375
				4376	static struct ceph_osd_request *
				4377	alloc_linger_request(struct ceph_osd_linger_request *lreq)
				4378	{
				4379	struct ceph_osd_request *req;
				4380
				4381	req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
				4382	if (!req)
				4383	return NULL;
				4384
				4385	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
				4386	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
				4387
				4388	if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
				4389	ceph_osdc_put_request(req);
				4390	return NULL;
				4391	}
				4392
				4393	return req;
				4394	}
				4395
				4396	/*
				4397	* Returns a handle, caller owns a ref.
				4398	*/
				4399	struct ceph_osd_linger_request *
				4400	ceph_osdc_watch(struct ceph_osd_client *osdc,
				4401	struct ceph_object_id *oid,
				4402	struct ceph_object_locator *oloc,
				4403	rados_watchcb2_t wcb,
				4404	rados_watcherrcb_t errcb,
				4405	void *data)
				4406	{
				4407	struct ceph_osd_linger_request *lreq;
				4408	int ret;
				4409
				4410	lreq = linger_alloc(osdc);
				4411	if (!lreq)
				4412	return ERR_PTR(-ENOMEM);
				4413
				4414	lreq->is_watch = true;
				4415	lreq->wcb = wcb;
				4416	lreq->errcb = errcb;
				4417	lreq->data = data;
				4418	lreq->watch_valid_thru = jiffies;
				4419
				4420	ceph_oid_copy(&lreq->t.base_oid, oid);
				4421	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
				4422	lreq->t.flags = CEPH_OSD_FLAG_WRITE;
				4423	ktime_get_real_ts(&lreq->mtime);
				4424
				4425	lreq->reg_req = alloc_linger_request(lreq);
				4426	if (!lreq->reg_req) {
				4427	ret = -ENOMEM;
				4428	goto err_put_lreq;
				4429	}
				4430
				4431	lreq->ping_req = alloc_linger_request(lreq);
				4432	if (!lreq->ping_req) {
				4433	ret = -ENOMEM;
				4434	goto err_put_lreq;
				4435	}
				4436
				4437	down_write(&osdc->lock);
				4438	linger_register(lreq); /* before osd_req_op_* */
				4439	osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
				4440	CEPH_OSD_WATCH_OP_WATCH);
				4441	osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
				4442	CEPH_OSD_WATCH_OP_PING);
				4443	linger_submit(lreq);
				4444	up_write(&osdc->lock);
				4445
				4446	ret = linger_reg_commit_wait(lreq);
				4447	if (ret) {
				4448	linger_cancel(lreq);
				4449	goto err_put_lreq;
				4450	}
				4451
				4452	return lreq;
				4453
				4454	err_put_lreq:
				4455	linger_put(lreq);
				4456	return ERR_PTR(ret);
				4457	}
				4458	EXPORT_SYMBOL(ceph_osdc_watch);
				4459
				4460	/*
				4461	* Releases a ref.
				4462	*
				4463	* Times out after mount_timeout to preserve rbd unmap behaviour
				4464	* introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
				4465	* with mount_timeout").
				4466	*/
				4467	int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
				4468	struct ceph_osd_linger_request *lreq)
				4469	{
				4470	struct ceph_options *opts = osdc->client->options;
				4471	struct ceph_osd_request *req;
				4472	int ret;
				4473
				4474	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
				4475	if (!req)
				4476	return -ENOMEM;
				4477
				4478	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
				4479	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
				4480	req->r_flags = CEPH_OSD_FLAG_WRITE;
				4481	ktime_get_real_ts(&req->r_mtime);
				4482	osd_req_op_watch_init(req, 0, lreq->linger_id,
				4483	CEPH_OSD_WATCH_OP_UNWATCH);
				4484
				4485	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
				4486	if (ret)
				4487	goto out_put_req;
				4488
				4489	ceph_osdc_start_request(osdc, req, false);
				4490	linger_cancel(lreq);
				4491	linger_put(lreq);
				4492	ret = wait_request_timeout(req, opts->mount_timeout);
				4493
				4494	out_put_req:
				4495	ceph_osdc_put_request(req);
				4496	return ret;
				4497	}
				4498	EXPORT_SYMBOL(ceph_osdc_unwatch);
				4499
				4500	static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
				4501	u64 notify_id, u64 cookie, void *payload,
				4502	size_t payload_len)
				4503	{
				4504	struct ceph_osd_req_op *op;
				4505	struct ceph_pagelist *pl;
				4506	int ret;
				4507
				4508	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
				4509
				4510	pl = kmalloc(sizeof(*pl), GFP_NOIO);
				4511	if (!pl)
				4512	return -ENOMEM;
				4513
				4514	ceph_pagelist_init(pl);
				4515	ret = ceph_pagelist_encode_64(pl, notify_id);
				4516	ret \|= ceph_pagelist_encode_64(pl, cookie);
				4517	if (payload) {
				4518	ret \|= ceph_pagelist_encode_32(pl, payload_len);
				4519	ret \|= ceph_pagelist_append(pl, payload, payload_len);
				4520	} else {
				4521	ret \|= ceph_pagelist_encode_32(pl, 0);
				4522	}
				4523	if (ret) {
				4524	ceph_pagelist_release(pl);
				4525	return -ENOMEM;
				4526	}
				4527
				4528	ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
				4529	op->indata_len = pl->length;
				4530	return 0;
				4531	}
				4532
				4533	int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
				4534	struct ceph_object_id *oid,
				4535	struct ceph_object_locator *oloc,
				4536	u64 notify_id,
				4537	u64 cookie,
				4538	void *payload,
				4539	size_t payload_len)
				4540	{
				4541	struct ceph_osd_request *req;
				4542	int ret;
				4543
				4544	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
				4545	if (!req)
				4546	return -ENOMEM;
				4547
				4548	ceph_oid_copy(&req->r_base_oid, oid);
				4549	ceph_oloc_copy(&req->r_base_oloc, oloc);
				4550	req->r_flags = CEPH_OSD_FLAG_READ;
				4551
				4552	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
				4553	if (ret)
				4554	goto out_put_req;
				4555
				4556	ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
				4557	payload_len);
				4558	if (ret)
				4559	goto out_put_req;
				4560
				4561	ceph_osdc_start_request(osdc, req, false);
				4562	ret = ceph_osdc_wait_request(osdc, req);
				4563
				4564	out_put_req:
				4565	ceph_osdc_put_request(req);
				4566	return ret;
				4567	}
				4568	EXPORT_SYMBOL(ceph_osdc_notify_ack);
				4569
				4570	static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
				4571	u64 cookie, u32 prot_ver, u32 timeout,
				4572	void *payload, size_t payload_len)
				4573	{
				4574	struct ceph_osd_req_op *op;
				4575	struct ceph_pagelist *pl;
				4576	int ret;
				4577
				4578	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
				4579	op->notify.cookie = cookie;
				4580
				4581	pl = kmalloc(sizeof(*pl), GFP_NOIO);
				4582	if (!pl)
				4583	return -ENOMEM;
				4584
				4585	ceph_pagelist_init(pl);
				4586	ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
				4587	ret \|= ceph_pagelist_encode_32(pl, timeout);
				4588	ret \|= ceph_pagelist_encode_32(pl, payload_len);
				4589	ret \|= ceph_pagelist_append(pl, payload, payload_len);
				4590	if (ret) {
				4591	ceph_pagelist_release(pl);
				4592	return -ENOMEM;
				4593	}
				4594
				4595	ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
				4596	op->indata_len = pl->length;
				4597	return 0;
				4598	}
				4599
				4600	/*
				4601	* @timeout: in seconds
				4602	*
				4603	* @preply_{pages,len} are initialized both on success and error.
				4604	* The caller is responsible for:
				4605	*
				4606	* ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
				4607	*/
				4608	int ceph_osdc_notify(struct ceph_osd_client *osdc,
				4609	struct ceph_object_id *oid,
				4610	struct ceph_object_locator *oloc,
				4611	void *payload,
				4612	size_t payload_len,
				4613	u32 timeout,
				4614	struct page ***preply_pages,
				4615	size_t *preply_len)
				4616	{
				4617	struct ceph_osd_linger_request *lreq;
				4618	struct page **pages;
				4619	int ret;
				4620
				4621	WARN_ON(!timeout);
				4622	if (preply_pages) {
				4623	*preply_pages = NULL;
				4624	*preply_len = 0;
				4625	}
				4626
				4627	lreq = linger_alloc(osdc);
				4628	if (!lreq)
				4629	return -ENOMEM;
				4630
				4631	lreq->preply_pages = preply_pages;
				4632	lreq->preply_len = preply_len;
				4633
				4634	ceph_oid_copy(&lreq->t.base_oid, oid);
				4635	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
				4636	lreq->t.flags = CEPH_OSD_FLAG_READ;
				4637
				4638	lreq->reg_req = alloc_linger_request(lreq);
				4639	if (!lreq->reg_req) {
				4640	ret = -ENOMEM;
				4641	goto out_put_lreq;
				4642	}
				4643
				4644	/* for notify_id */
				4645	pages = ceph_alloc_page_vector(1, GFP_NOIO);
				4646	if (IS_ERR(pages)) {
				4647	ret = PTR_ERR(pages);
				4648	goto out_put_lreq;
				4649	}
				4650
				4651	down_write(&osdc->lock);
				4652	linger_register(lreq); /* before osd_req_op_* */
				4653	ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
				4654	timeout, payload, payload_len);
				4655	if (ret) {
				4656	linger_unregister(lreq);
				4657	up_write(&osdc->lock);
				4658	ceph_release_page_vector(pages, 1);
				4659	goto out_put_lreq;
				4660	}
				4661	ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
				4662	response_data),
				4663	pages, PAGE_SIZE, 0, false, true);
				4664	linger_submit(lreq);
				4665	up_write(&osdc->lock);
				4666
				4667	ret = linger_reg_commit_wait(lreq);
				4668	if (!ret)
				4669	ret = linger_notify_finish_wait(lreq);
				4670	else
				4671	dout("lreq %p failed to initiate notify %d\n", lreq, ret);
				4672
				4673	linger_cancel(lreq);
				4674	out_put_lreq:
				4675	linger_put(lreq);
				4676	return ret;
				4677	}
				4678	EXPORT_SYMBOL(ceph_osdc_notify);
				4679
				4680	/*
				4681	* Return the number of milliseconds since the watch was last
				4682	* confirmed, or an error. If there is an error, the watch is no
				4683	* longer valid, and should be destroyed with ceph_osdc_unwatch().
				4684	*/
				4685	int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
				4686	struct ceph_osd_linger_request *lreq)
				4687	{
				4688	unsigned long stamp, age;
				4689	int ret;
				4690
				4691	down_read(&osdc->lock);
				4692	mutex_lock(&lreq->lock);
				4693	stamp = lreq->watch_valid_thru;
				4694	if (!list_empty(&lreq->pending_lworks)) {
				4695	struct linger_work *lwork =
				4696	list_first_entry(&lreq->pending_lworks,
				4697	struct linger_work,
				4698	pending_item);
				4699
				4700	if (time_before(lwork->queued_stamp, stamp))
				4701	stamp = lwork->queued_stamp;
				4702	}
				4703	age = jiffies - stamp;
				4704	dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
				4705	lreq, lreq->linger_id, age, lreq->last_error);
				4706	/* we are truncating to msecs, so return a safe upper bound */
				4707	ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
				4708
				4709	mutex_unlock(&lreq->lock);
				4710	up_read(&osdc->lock);
				4711	return ret;
				4712	}
				4713
				4714	static int decode_watcher(void *p, void end, struct ceph_watch_item *item)
				4715	{
				4716	u8 struct_v;
				4717	u32 struct_len;
				4718	int ret;
				4719
				4720	ret = ceph_start_decoding(p, end, 2, "watch_item_t",
				4721	&struct_v, &struct_len);
				4722	if (ret)
				4723	return ret;
				4724
				4725	ceph_decode_copy(p, &item->name, sizeof(item->name));
				4726	item->cookie = ceph_decode_64(p);
				4727	p += 4; / skip timeout_seconds */
				4728	if (struct_v >= 2) {
				4729	ceph_decode_copy(p, &item->addr, sizeof(item->addr));
				4730	ceph_decode_addr(&item->addr);
				4731	}
				4732
				4733	dout("%s %s%llu cookie %llu addr %s\n", __func__,
				4734	ENTITY_NAME(item->name), item->cookie,
				4735	ceph_pr_addr(&item->addr.in_addr));
				4736	return 0;
				4737	}
				4738
				4739	static int decode_watchers(void *p, void end,
				4740	struct ceph_watch_item **watchers,
				4741	u32 *num_watchers)
				4742	{
				4743	u8 struct_v;
				4744	u32 struct_len;
				4745	int i;
				4746	int ret;
				4747
				4748	ret = ceph_start_decoding(p, end, 1, "obj_list_watch_response_t",
				4749	&struct_v, &struct_len);
				4750	if (ret)
				4751	return ret;
				4752
				4753	*num_watchers = ceph_decode_32(p);
				4754	watchers = kcalloc(num_watchers, sizeof(**watchers), GFP_NOIO);
				4755	if (!*watchers)
				4756	return -ENOMEM;
				4757
				4758	for (i = 0; i < *num_watchers; i++) {
				4759	ret = decode_watcher(p, end, *watchers + i);
				4760	if (ret) {
				4761	kfree(*watchers);
				4762	return ret;
				4763	}
				4764	}
				4765
				4766	return 0;
				4767	}
				4768
				4769	/*
				4770	* On success, the caller is responsible for:
				4771	*
				4772	* kfree(watchers);
				4773	*/
				4774	int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
				4775	struct ceph_object_id *oid,
				4776	struct ceph_object_locator *oloc,
				4777	struct ceph_watch_item **watchers,
				4778	u32 *num_watchers)
				4779	{
				4780	struct ceph_osd_request *req;
				4781	struct page **pages;
				4782	int ret;
				4783
				4784	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
				4785	if (!req)
				4786	return -ENOMEM;
				4787
				4788	ceph_oid_copy(&req->r_base_oid, oid);
				4789	ceph_oloc_copy(&req->r_base_oloc, oloc);
				4790	req->r_flags = CEPH_OSD_FLAG_READ;
				4791
				4792	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
				4793	if (ret)
				4794	goto out_put_req;
				4795
				4796	pages = ceph_alloc_page_vector(1, GFP_NOIO);
				4797	if (IS_ERR(pages)) {
				4798	ret = PTR_ERR(pages);
				4799	goto out_put_req;
				4800	}
				4801
				4802	osd_req_op_init(req, 0, CEPH_OSD_OP_LIST_WATCHERS, 0);
				4803	ceph_osd_data_pages_init(osd_req_op_data(req, 0, list_watchers,
				4804	response_data),
				4805	pages, PAGE_SIZE, 0, false, true);
				4806
				4807	ceph_osdc_start_request(osdc, req, false);
				4808	ret = ceph_osdc_wait_request(osdc, req);
				4809	if (ret >= 0) {
				4810	void *p = page_address(pages[0]);
				4811	void *const end = p + req->r_ops[0].outdata_len;
				4812
				4813	ret = decode_watchers(&p, end, watchers, num_watchers);
				4814	}
				4815
				4816	out_put_req:
				4817	ceph_osdc_put_request(req);
				4818	return ret;
				4819	}
				4820	EXPORT_SYMBOL(ceph_osdc_list_watchers);
				4821
				4822	/*
				4823	* Call all pending notify callbacks - for use after a watch is
				4824	* unregistered, to make sure no more callbacks for it will be invoked
				4825	*/
				4826	void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
				4827	{
				4828	dout("%s osdc %p\n", __func__, osdc);
				4829	flush_workqueue(osdc->notify_wq);
				4830	}
				4831	EXPORT_SYMBOL(ceph_osdc_flush_notifies);
				4832
				4833	void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
				4834	{
				4835	down_read(&osdc->lock);
				4836	maybe_request_map(osdc);
				4837	up_read(&osdc->lock);
				4838	}
				4839	EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
				4840
				4841	/*
				4842	* Execute an OSD class method on an object.
				4843	*
				4844	* @flags: CEPH_OSD_FLAG_*
				4845	* @resp_len: in/out param for reply length
				4846	*/
				4847	int ceph_osdc_call(struct ceph_osd_client *osdc,
				4848	struct ceph_object_id *oid,
				4849	struct ceph_object_locator *oloc,
				4850	const char class, const char method,
				4851	unsigned int flags,
				4852	struct page *req_page, size_t req_len,
				4853	struct page resp_page, size_t resp_len)
				4854	{
				4855	struct ceph_osd_request *req;
				4856	int ret;
				4857
				4858	if (req_len > PAGE_SIZE \|\| (resp_page && *resp_len > PAGE_SIZE))
				4859	return -E2BIG;
				4860
				4861	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
				4862	if (!req)
				4863	return -ENOMEM;
				4864
				4865	ceph_oid_copy(&req->r_base_oid, oid);
				4866	ceph_oloc_copy(&req->r_base_oloc, oloc);
				4867	req->r_flags = flags;
				4868
				4869	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
				4870	if (ret)
				4871	goto out_put_req;
				4872
				4873	osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method);
				4874	if (req_page)
				4875	osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
				4876	0, false, false);
				4877	if (resp_page)
				4878	osd_req_op_cls_response_data_pages(req, 0, &resp_page,
				4879	*resp_len, 0, false, false);
				4880
				4881	ceph_osdc_start_request(osdc, req, false);
				4882	ret = ceph_osdc_wait_request(osdc, req);
				4883	if (ret >= 0) {
				4884	ret = req->r_ops[0].rval;
				4885	if (resp_page)
				4886	*resp_len = req->r_ops[0].outdata_len;
				4887	}
				4888
				4889	out_put_req:
				4890	ceph_osdc_put_request(req);
				4891	return ret;
				4892	}
				4893	EXPORT_SYMBOL(ceph_osdc_call);
				4894
				4895	/*
				4896	* init, shutdown
				4897	*/
				4898	int ceph_osdc_init(struct ceph_osd_client osdc, struct ceph_client client)
				4899	{
				4900	int err;
				4901
				4902	dout("init\n");
				4903	osdc->client = client;
				4904	init_rwsem(&osdc->lock);
				4905	osdc->osds = RB_ROOT;
				4906	INIT_LIST_HEAD(&osdc->osd_lru);
				4907	spin_lock_init(&osdc->osd_lru_lock);
				4908	osd_init(&osdc->homeless_osd);
				4909	osdc->homeless_osd.o_osdc = osdc;
				4910	osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
				4911	osdc->last_linger_id = CEPH_LINGER_ID_START;
				4912	osdc->linger_requests = RB_ROOT;
				4913	osdc->map_checks = RB_ROOT;
				4914	osdc->linger_map_checks = RB_ROOT;
				4915	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
				4916	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
				4917
				4918	err = -ENOMEM;
				4919	osdc->osdmap = ceph_osdmap_alloc();
				4920	if (!osdc->osdmap)
				4921	goto out;
				4922
				4923	osdc->req_mempool = mempool_create_slab_pool(10,
				4924	ceph_osd_request_cache);
				4925	if (!osdc->req_mempool)
				4926	goto out_map;
				4927
				4928	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
				4929	PAGE_SIZE, 10, true, "osd_op");
				4930	if (err < 0)
				4931	goto out_mempool;
				4932	err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
				4933	PAGE_SIZE, 10, true, "osd_op_reply");
				4934	if (err < 0)
				4935	goto out_msgpool;
				4936
				4937	err = -ENOMEM;
				4938	osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
				4939	if (!osdc->notify_wq)
				4940	goto out_msgpool_reply;
				4941
				4942	schedule_delayed_work(&osdc->timeout_work,
				4943	osdc->client->options->osd_keepalive_timeout);
				4944	schedule_delayed_work(&osdc->osds_timeout_work,
				4945	round_jiffies_relative(osdc->client->options->osd_idle_ttl));
				4946
				4947	return 0;
				4948
				4949	out_msgpool_reply:
				4950	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
				4951	out_msgpool:
				4952	ceph_msgpool_destroy(&osdc->msgpool_op);
				4953	out_mempool:
				4954	mempool_destroy(osdc->req_mempool);
				4955	out_map:
				4956	ceph_osdmap_destroy(osdc->osdmap);
				4957	out:
				4958	return err;
				4959	}
				4960
				4961	void ceph_osdc_stop(struct ceph_osd_client *osdc)
				4962	{
				4963	flush_workqueue(osdc->notify_wq);
				4964	destroy_workqueue(osdc->notify_wq);
				4965	cancel_delayed_work_sync(&osdc->timeout_work);
				4966	cancel_delayed_work_sync(&osdc->osds_timeout_work);
				4967
				4968	down_write(&osdc->lock);
				4969	while (!RB_EMPTY_ROOT(&osdc->osds)) {
				4970	struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
				4971	struct ceph_osd, o_node);
				4972	close_osd(osd);
				4973	}
				4974	up_write(&osdc->lock);
				4975	WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1);
				4976	osd_cleanup(&osdc->homeless_osd);
				4977
				4978	WARN_ON(!list_empty(&osdc->osd_lru));
				4979	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
				4980	WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
				4981	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
				4982	WARN_ON(atomic_read(&osdc->num_requests));
				4983	WARN_ON(atomic_read(&osdc->num_homeless));
				4984
				4985	ceph_osdmap_destroy(osdc->osdmap);
				4986	mempool_destroy(osdc->req_mempool);
				4987	ceph_msgpool_destroy(&osdc->msgpool_op);
				4988	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
				4989	}
				4990
				4991	/*
				4992	* Read some contiguous pages. If we cross a stripe boundary, shorten
				4993	* *plen. Return number of bytes read, or error.
				4994	*/
				4995	int ceph_osdc_readpages(struct ceph_osd_client *osdc,
				4996	struct ceph_vino vino, struct ceph_file_layout *layout,
				4997	u64 off, u64 *plen,
				4998	u32 truncate_seq, u64 truncate_size,
				4999	struct page **pages, int num_pages, int page_align)
				5000	{
				5001	struct ceph_osd_request *req;
				5002	int rc = 0;
				5003
				5004	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
				5005	vino.snap, off, *plen);
				5006	req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
				5007	CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				5008	NULL, truncate_seq, truncate_size,
				5009	false);
				5010	if (IS_ERR(req))
				5011	return PTR_ERR(req);
				5012
				5013	/* it may be a short read due to an object boundary */
				5014	osd_req_op_extent_osd_data_pages(req, 0,
				5015	pages, *plen, page_align, false, false);
				5016
				5017	dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
				5018	off, plen, plen, page_align);
				5019
				5020	rc = ceph_osdc_start_request(osdc, req, false);
				5021	if (!rc)
				5022	rc = ceph_osdc_wait_request(osdc, req);
				5023
				5024	ceph_osdc_put_request(req);
				5025	dout("readpages result %d\n", rc);
				5026	return rc;
				5027	}
				5028	EXPORT_SYMBOL(ceph_osdc_readpages);
				5029
				5030	/*
				5031	* do a synchronous write on N pages
				5032	*/
				5033	int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
				5034	struct ceph_file_layout *layout,
				5035	struct ceph_snap_context *snapc,
				5036	u64 off, u64 len,
				5037	u32 truncate_seq, u64 truncate_size,
				5038	struct timespec *mtime,
				5039	struct page **pages, int num_pages)
				5040	{
				5041	struct ceph_osd_request *req;
				5042	int rc = 0;
				5043	int page_align = off & ~PAGE_MASK;
				5044
				5045	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
				5046	CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
				5047	snapc, truncate_seq, truncate_size,
				5048	true);
				5049	if (IS_ERR(req))
				5050	return PTR_ERR(req);
				5051
				5052	/* it may be a short write due to an object boundary */
				5053	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
				5054	false, false);
				5055	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
				5056
				5057	req->r_mtime = *mtime;
				5058	rc = ceph_osdc_start_request(osdc, req, true);
				5059	if (!rc)
				5060	rc = ceph_osdc_wait_request(osdc, req);
				5061
				5062	ceph_osdc_put_request(req);
				5063	if (rc == 0)
				5064	rc = len;
				5065	dout("writepages result %d\n", rc);
				5066	return rc;
				5067	}
				5068	EXPORT_SYMBOL(ceph_osdc_writepages);
				5069
				5070	int ceph_osdc_setup(void)
				5071	{
				5072	size_t size = sizeof(struct ceph_osd_request) +
				5073	CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
				5074
				5075	BUG_ON(ceph_osd_request_cache);
				5076	ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
				5077	0, 0, NULL);
				5078
				5079	return ceph_osd_request_cache ? 0 : -ENOMEM;
				5080	}
				5081	EXPORT_SYMBOL(ceph_osdc_setup);
				5082
				5083	void ceph_osdc_cleanup(void)
				5084	{
				5085	BUG_ON(!ceph_osd_request_cache);
				5086	kmem_cache_destroy(ceph_osd_request_cache);
				5087	ceph_osd_request_cache = NULL;
				5088	}
				5089	EXPORT_SYMBOL(ceph_osdc_cleanup);
				5090
				5091	/*
				5092	* handle incoming message
				5093	*/
				5094	static void dispatch(struct ceph_connection con, struct ceph_msg msg)
				5095	{
				5096	struct ceph_osd *osd = con->private;
				5097	struct ceph_osd_client *osdc = osd->o_osdc;
				5098	int type = le16_to_cpu(msg->hdr.type);
				5099
				5100	switch (type) {
				5101	case CEPH_MSG_OSD_MAP:
				5102	ceph_osdc_handle_map(osdc, msg);
				5103	break;
				5104	case CEPH_MSG_OSD_OPREPLY:
				5105	handle_reply(osd, msg);
				5106	break;
				5107	case CEPH_MSG_OSD_BACKOFF:
				5108	handle_backoff(osd, msg);
				5109	break;
				5110	case CEPH_MSG_WATCH_NOTIFY:
				5111	handle_watch_notify(osdc, msg);
				5112	break;
				5113
				5114	default:
				5115	pr_err("received unknown message type %d %s\n", type,
				5116	ceph_msg_type_name(type));
				5117	}
				5118
				5119	ceph_msg_put(msg);
				5120	}
				5121
				5122	/*
				5123	* Lookup and return message for incoming reply. Don't try to do
				5124	* anything about a larger than preallocated data portion of the
				5125	* message at the moment - for now, just skip the message.
				5126	*/
				5127	static struct ceph_msg get_reply(struct ceph_connection con,
				5128	struct ceph_msg_header *hdr,
				5129	int *skip)
				5130	{
				5131	struct ceph_osd *osd = con->private;
				5132	struct ceph_osd_client *osdc = osd->o_osdc;
				5133	struct ceph_msg *m = NULL;
				5134	struct ceph_osd_request *req;
				5135	int front_len = le32_to_cpu(hdr->front_len);
				5136	int data_len = le32_to_cpu(hdr->data_len);
				5137	u64 tid = le64_to_cpu(hdr->tid);
				5138
				5139	down_read(&osdc->lock);
				5140	if (!osd_registered(osd)) {
				5141	dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
				5142	*skip = 1;
				5143	goto out_unlock_osdc;
				5144	}
				5145	WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
				5146
				5147	mutex_lock(&osd->lock);
				5148	req = lookup_request(&osd->o_requests, tid);
				5149	if (!req) {
				5150	dout("%s osd%d tid %llu unknown, skipping\n", __func__,
				5151	osd->o_osd, tid);
				5152	*skip = 1;
				5153	goto out_unlock_session;
				5154	}
				5155
				5156	ceph_msg_revoke_incoming(req->r_reply);
				5157
				5158	if (front_len > req->r_reply->front_alloc_len) {
				5159	pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
				5160	__func__, osd->o_osd, req->r_tid, front_len,
				5161	req->r_reply->front_alloc_len);
				5162	m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
				5163	false);
				5164	if (!m)
				5165	goto out_unlock_session;
				5166	ceph_msg_put(req->r_reply);
				5167	req->r_reply = m;
				5168	}
				5169
				5170	if (data_len > req->r_reply->data_length) {
				5171	pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
				5172	__func__, osd->o_osd, req->r_tid, data_len,
				5173	req->r_reply->data_length);
				5174	m = NULL;
				5175	*skip = 1;
				5176	goto out_unlock_session;
				5177	}
				5178
				5179	m = ceph_msg_get(req->r_reply);
				5180	dout("get_reply tid %lld %p\n", tid, m);
				5181
				5182	out_unlock_session:
				5183	mutex_unlock(&osd->lock);
				5184	out_unlock_osdc:
				5185	up_read(&osdc->lock);
				5186	return m;
				5187	}
				5188
				5189	/*
				5190	* TODO: switch to a msg-owned pagelist
				5191	*/
				5192	static struct ceph_msg alloc_msg_with_page_vector(struct ceph_msg_header hdr)
				5193	{
				5194	struct ceph_msg *m;
				5195	int type = le16_to_cpu(hdr->type);
				5196	u32 front_len = le32_to_cpu(hdr->front_len);
				5197	u32 data_len = le32_to_cpu(hdr->data_len);
				5198
				5199	m = ceph_msg_new(type, front_len, GFP_NOIO, false);
				5200	if (!m)
				5201	return NULL;
				5202
				5203	if (data_len) {
				5204	struct page **pages;
				5205	struct ceph_osd_data osd_data;
				5206
				5207	pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
				5208	GFP_NOIO);
				5209	if (IS_ERR(pages)) {
				5210	ceph_msg_put(m);
				5211	return NULL;
				5212	}
				5213
				5214	ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
				5215	false);
				5216	ceph_osdc_msg_data_add(m, &osd_data);
				5217	}
				5218
				5219	return m;
				5220	}
				5221
				5222	static struct ceph_msg alloc_msg(struct ceph_connection con,
				5223	struct ceph_msg_header *hdr,
				5224	int *skip)
				5225	{
				5226	struct ceph_osd *osd = con->private;
				5227	int type = le16_to_cpu(hdr->type);
				5228
				5229	*skip = 0;
				5230	switch (type) {
				5231	case CEPH_MSG_OSD_MAP:
				5232	case CEPH_MSG_OSD_BACKOFF:
				5233	case CEPH_MSG_WATCH_NOTIFY:
				5234	return alloc_msg_with_page_vector(hdr);
				5235	case CEPH_MSG_OSD_OPREPLY:
				5236	return get_reply(con, hdr, skip);
				5237	default:
				5238	pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
				5239	osd->o_osd, type);
				5240	*skip = 1;
				5241	return NULL;
				5242	}
				5243	}
				5244
				5245	/*
				5246	* Wrappers to refcount containing ceph_osd struct
				5247	*/
				5248	static struct ceph_connection get_osd_con(struct ceph_connection con)
				5249	{
				5250	struct ceph_osd *osd = con->private;
				5251	if (get_osd(osd))
				5252	return con;
				5253	return NULL;
				5254	}
				5255
				5256	static void put_osd_con(struct ceph_connection *con)
				5257	{
				5258	struct ceph_osd *osd = con->private;
				5259	put_osd(osd);
				5260	}
				5261
				5262	/*
				5263	* authentication
				5264	*/
				5265	/*
				5266	* Note: returned pointer is the address of a structure that's
				5267	* managed separately. Caller must not attempt to free it.
				5268	*/
				5269	static struct ceph_auth_handshake get_authorizer(struct ceph_connection con,
				5270	int *proto, int force_new)
				5271	{
				5272	struct ceph_osd *o = con->private;
				5273	struct ceph_osd_client *osdc = o->o_osdc;
				5274	struct ceph_auth_client *ac = osdc->client->monc.auth;
				5275	struct ceph_auth_handshake *auth = &o->o_auth;
				5276
				5277	if (force_new && auth->authorizer) {
				5278	ceph_auth_destroy_authorizer(auth->authorizer);
				5279	auth->authorizer = NULL;
				5280	}
				5281	if (!auth->authorizer) {
				5282	int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
				5283	auth);
				5284	if (ret)
				5285	return ERR_PTR(ret);
				5286	} else {
				5287	int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
				5288	auth);
				5289	if (ret)
				5290	return ERR_PTR(ret);
				5291	}
				5292	*proto = ac->protocol;
				5293
				5294	return auth;
				5295	}
				5296
				5297	static int add_authorizer_challenge(struct ceph_connection *con,
				5298	void *challenge_buf, int challenge_buf_len)
				5299	{
				5300	struct ceph_osd *o = con->private;
				5301	struct ceph_osd_client *osdc = o->o_osdc;
				5302	struct ceph_auth_client *ac = osdc->client->monc.auth;
				5303
				5304	return ceph_auth_add_authorizer_challenge(ac, o->o_auth.authorizer,
				5305	challenge_buf, challenge_buf_len);
				5306	}
				5307
				5308	static int verify_authorizer_reply(struct ceph_connection *con)
				5309	{
				5310	struct ceph_osd *o = con->private;
				5311	struct ceph_osd_client *osdc = o->o_osdc;
				5312	struct ceph_auth_client *ac = osdc->client->monc.auth;
				5313
				5314	return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer);
				5315	}
				5316
				5317	static int invalidate_authorizer(struct ceph_connection *con)
				5318	{
				5319	struct ceph_osd *o = con->private;
				5320	struct ceph_osd_client *osdc = o->o_osdc;
				5321	struct ceph_auth_client *ac = osdc->client->monc.auth;
				5322
				5323	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
				5324	return ceph_monc_validate_auth(&osdc->client->monc);
				5325	}
				5326
				5327	static void osd_reencode_message(struct ceph_msg *msg)
				5328	{
				5329	int type = le16_to_cpu(msg->hdr.type);
				5330
				5331	if (type == CEPH_MSG_OSD_OP)
				5332	encode_request_finish(msg);
				5333	}
				5334
				5335	static int osd_sign_message(struct ceph_msg *msg)
				5336	{
				5337	struct ceph_osd *o = msg->con->private;
				5338	struct ceph_auth_handshake *auth = &o->o_auth;
				5339
				5340	return ceph_auth_sign_message(auth, msg);
				5341	}
				5342
				5343	static int osd_check_message_signature(struct ceph_msg *msg)
				5344	{
				5345	struct ceph_osd *o = msg->con->private;
				5346	struct ceph_auth_handshake *auth = &o->o_auth;
				5347
				5348	return ceph_auth_check_message_signature(auth, msg);
				5349	}
				5350
				5351	static const struct ceph_connection_operations osd_con_ops = {
				5352	.get = get_osd_con,
				5353	.put = put_osd_con,
				5354	.dispatch = dispatch,
				5355	.get_authorizer = get_authorizer,
				5356	.add_authorizer_challenge = add_authorizer_challenge,
				5357	.verify_authorizer_reply = verify_authorizer_reply,
				5358	.invalidate_authorizer = invalidate_authorizer,
				5359	.alloc_msg = alloc_msg,
				5360	.reencode_message = osd_reencode_message,
				5361	.sign_message = osd_sign_message,
				5362	.check_message_signature = osd_check_message_signature,
				5363	.fault = osd_fault,
				5364	};