Blame - src/kernel/linux/v4.19/net/ceph/osd_client.c - T800

blob: 76c41a84550e76459aa6dd5ec8677c8275127fd0 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2
				3	#include <linux/ceph/ceph_debug.h>
				4
				5	#include <linux/module.h>
				6	#include <linux/err.h>
				7	#include <linux/highmem.h>
				8	#include <linux/mm.h>
				9	#include <linux/pagemap.h>
				10	#include <linux/slab.h>
				11	#include <linux/uaccess.h>
				12	#ifdef CONFIG_BLOCK
				13	#include <linux/bio.h>
				14	#endif
				15
				16	#include <linux/ceph/ceph_features.h>
				17	#include <linux/ceph/libceph.h>
				18	#include <linux/ceph/osd_client.h>
				19	#include <linux/ceph/messenger.h>
				20	#include <linux/ceph/decode.h>
				21	#include <linux/ceph/auth.h>
				22	#include <linux/ceph/pagelist.h>
				23	#include <linux/ceph/striper.h>
				24
				25	#define OSD_OPREPLY_FRONT_LEN 512
				26
				27	static struct kmem_cache *ceph_osd_request_cache;
				28
				29	static const struct ceph_connection_operations osd_con_ops;
				30
				31	/*
				32	* Implement client access to distributed object storage cluster.
				33	*
				34	* All data objects are stored within a cluster/cloud of OSDs, or
				35	* "object storage devices." (Note that Ceph OSDs have _nothing_ to
				36	* do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
				37	* remote daemons serving up and coordinating consistent and safe
				38	* access to storage.
				39	*
				40	* Cluster membership and the mapping of data objects onto storage devices
				41	* are described by the osd map.
				42	*
				43	* We keep track of pending OSD requests (read, write), resubmit
				44	* requests to different OSDs when the cluster topology/data layout
				45	* change, or retry the affected requests when the communications
				46	* channel with an OSD is reset.
				47	*/
				48
				49	static void link_request(struct ceph_osd osd, struct ceph_osd_request req);
				50	static void unlink_request(struct ceph_osd osd, struct ceph_osd_request req);
				51	static void link_linger(struct ceph_osd *osd,
				52	struct ceph_osd_linger_request *lreq);
				53	static void unlink_linger(struct ceph_osd *osd,
				54	struct ceph_osd_linger_request *lreq);
				55	static void clear_backoffs(struct ceph_osd *osd);
				56
				57	#if 1
				58	static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
				59	{
				60	bool wrlocked = true;
				61
				62	if (unlikely(down_read_trylock(sem))) {
				63	wrlocked = false;
				64	up_read(sem);
				65	}
				66
				67	return wrlocked;
				68	}
				69	static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
				70	{
				71	WARN_ON(!rwsem_is_locked(&osdc->lock));
				72	}
				73	static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
				74	{
				75	WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
				76	}
				77	static inline void verify_osd_locked(struct ceph_osd *osd)
				78	{
				79	struct ceph_osd_client *osdc = osd->o_osdc;
				80
				81	WARN_ON(!(mutex_is_locked(&osd->lock) &&
				82	rwsem_is_locked(&osdc->lock)) &&
				83	!rwsem_is_wrlocked(&osdc->lock));
				84	}
				85	static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
				86	{
				87	WARN_ON(!mutex_is_locked(&lreq->lock));
				88	}
				89	#else
				90	static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
				91	static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
				92	static inline void verify_osd_locked(struct ceph_osd *osd) { }
				93	static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
				94	#endif
				95
				96	/*
				97	* calculate the mapping of a file extent onto an object, and fill out the
				98	* request accordingly. shorten extent as necessary if it crosses an
				99	* object boundary.
				100	*
				101	* fill osd op in request message.
				102	*/
				103	static int calc_layout(struct ceph_file_layout layout, u64 off, u64 plen,
				104	u64 objnum, u64 objoff, u64 *objlen)
				105	{
				106	u64 orig_len = *plen;
				107	u32 xlen;
				108
				109	/* object extent? */
				110	ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
				111	objoff, &xlen);
				112	*objlen = xlen;
				113	if (*objlen < orig_len) {
				114	plen = objlen;
				115	dout(" skipping last %llu, final file extent %llu~%llu\n",
				116	orig_len - plen, off, plen);
				117	}
				118
				119	dout("calc_layout objnum=%llx %llu~%llu\n", objnum, objoff, *objlen);
				120	return 0;
				121	}
				122
				123	static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
				124	{
				125	memset(osd_data, 0, sizeof (*osd_data));
				126	osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
				127	}
				128
				129	static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
				130	struct page **pages, u64 length, u32 alignment,
				131	bool pages_from_pool, bool own_pages)
				132	{
				133	osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
				134	osd_data->pages = pages;
				135	osd_data->length = length;
				136	osd_data->alignment = alignment;
				137	osd_data->pages_from_pool = pages_from_pool;
				138	osd_data->own_pages = own_pages;
				139	}
				140
				141	static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
				142	struct ceph_pagelist *pagelist)
				143	{
				144	osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
				145	osd_data->pagelist = pagelist;
				146	}
				147
				148	#ifdef CONFIG_BLOCK
				149	static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
				150	struct ceph_bio_iter *bio_pos,
				151	u32 bio_length)
				152	{
				153	osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
				154	osd_data->bio_pos = *bio_pos;
				155	osd_data->bio_length = bio_length;
				156	}
				157	#endif /* CONFIG_BLOCK */
				158
				159	static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
				160	struct ceph_bvec_iter *bvec_pos,
				161	u32 num_bvecs)
				162	{
				163	osd_data->type = CEPH_OSD_DATA_TYPE_BVECS;
				164	osd_data->bvec_pos = *bvec_pos;
				165	osd_data->num_bvecs = num_bvecs;
				166	}
				167
				168	#define osd_req_op_data(oreq, whch, typ, fld) \
				169	({ \
				170	struct ceph_osd_request *__oreq = (oreq); \
				171	unsigned int __whch = (whch); \
				172	BUG_ON(__whch >= __oreq->r_num_ops); \
				173	&__oreq->r_ops[__whch].typ.fld; \
				174	})
				175
				176	static struct ceph_osd_data *
				177	osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
				178	{
				179	BUG_ON(which >= osd_req->r_num_ops);
				180
				181	return &osd_req->r_ops[which].raw_data_in;
				182	}
				183
				184	struct ceph_osd_data *
				185	osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
				186	unsigned int which)
				187	{
				188	return osd_req_op_data(osd_req, which, extent, osd_data);
				189	}
				190	EXPORT_SYMBOL(osd_req_op_extent_osd_data);
				191
				192	void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
				193	unsigned int which, struct page **pages,
				194	u64 length, u32 alignment,
				195	bool pages_from_pool, bool own_pages)
				196	{
				197	struct ceph_osd_data *osd_data;
				198
				199	osd_data = osd_req_op_raw_data_in(osd_req, which);
				200	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				201	pages_from_pool, own_pages);
				202	}
				203	EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);
				204
				205	void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
				206	unsigned int which, struct page **pages,
				207	u64 length, u32 alignment,
				208	bool pages_from_pool, bool own_pages)
				209	{
				210	struct ceph_osd_data *osd_data;
				211
				212	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				213	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				214	pages_from_pool, own_pages);
				215	}
				216	EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);
				217
				218	void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
				219	unsigned int which, struct ceph_pagelist *pagelist)
				220	{
				221	struct ceph_osd_data *osd_data;
				222
				223	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				224	ceph_osd_data_pagelist_init(osd_data, pagelist);
				225	}
				226	EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);
				227
				228	#ifdef CONFIG_BLOCK
				229	void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
				230	unsigned int which,
				231	struct ceph_bio_iter *bio_pos,
				232	u32 bio_length)
				233	{
				234	struct ceph_osd_data *osd_data;
				235
				236	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				237	ceph_osd_data_bio_init(osd_data, bio_pos, bio_length);
				238	}
				239	EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
				240	#endif /* CONFIG_BLOCK */
				241
				242	void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
				243	unsigned int which,
				244	struct bio_vec *bvecs, u32 num_bvecs,
				245	u32 bytes)
				246	{
				247	struct ceph_osd_data *osd_data;
				248	struct ceph_bvec_iter it = {
				249	.bvecs = bvecs,
				250	.iter = { .bi_size = bytes },
				251	};
				252
				253	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				254	ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs);
				255	}
				256	EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvecs);
				257
				258	void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
				259	unsigned int which,
				260	struct ceph_bvec_iter *bvec_pos)
				261	{
				262	struct ceph_osd_data *osd_data;
				263
				264	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				265	ceph_osd_data_bvecs_init(osd_data, bvec_pos, 0);
				266	}
				267	EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);
				268
				269	static void osd_req_op_cls_request_info_pagelist(
				270	struct ceph_osd_request *osd_req,
				271	unsigned int which, struct ceph_pagelist *pagelist)
				272	{
				273	struct ceph_osd_data *osd_data;
				274
				275	osd_data = osd_req_op_data(osd_req, which, cls, request_info);
				276	ceph_osd_data_pagelist_init(osd_data, pagelist);
				277	}
				278
				279	void osd_req_op_cls_request_data_pagelist(
				280	struct ceph_osd_request *osd_req,
				281	unsigned int which, struct ceph_pagelist *pagelist)
				282	{
				283	struct ceph_osd_data *osd_data;
				284
				285	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
				286	ceph_osd_data_pagelist_init(osd_data, pagelist);
				287	osd_req->r_ops[which].cls.indata_len += pagelist->length;
				288	osd_req->r_ops[which].indata_len += pagelist->length;
				289	}
				290	EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
				291
				292	void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
				293	unsigned int which, struct page **pages, u64 length,
				294	u32 alignment, bool pages_from_pool, bool own_pages)
				295	{
				296	struct ceph_osd_data *osd_data;
				297
				298	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
				299	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				300	pages_from_pool, own_pages);
				301	osd_req->r_ops[which].cls.indata_len += length;
				302	osd_req->r_ops[which].indata_len += length;
				303	}
				304	EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
				305
				306	void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
				307	unsigned int which,
				308	struct bio_vec *bvecs, u32 num_bvecs,
				309	u32 bytes)
				310	{
				311	struct ceph_osd_data *osd_data;
				312	struct ceph_bvec_iter it = {
				313	.bvecs = bvecs,
				314	.iter = { .bi_size = bytes },
				315	};
				316
				317	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
				318	ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs);
				319	osd_req->r_ops[which].cls.indata_len += bytes;
				320	osd_req->r_ops[which].indata_len += bytes;
				321	}
				322	EXPORT_SYMBOL(osd_req_op_cls_request_data_bvecs);
				323
				324	void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
				325	unsigned int which, struct page **pages, u64 length,
				326	u32 alignment, bool pages_from_pool, bool own_pages)
				327	{
				328	struct ceph_osd_data *osd_data;
				329
				330	osd_data = osd_req_op_data(osd_req, which, cls, response_data);
				331	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				332	pages_from_pool, own_pages);
				333	}
				334	EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);
				335
				336	static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
				337	{
				338	switch (osd_data->type) {
				339	case CEPH_OSD_DATA_TYPE_NONE:
				340	return 0;
				341	case CEPH_OSD_DATA_TYPE_PAGES:
				342	return osd_data->length;
				343	case CEPH_OSD_DATA_TYPE_PAGELIST:
				344	return (u64)osd_data->pagelist->length;
				345	#ifdef CONFIG_BLOCK
				346	case CEPH_OSD_DATA_TYPE_BIO:
				347	return (u64)osd_data->bio_length;
				348	#endif /* CONFIG_BLOCK */
				349	case CEPH_OSD_DATA_TYPE_BVECS:
				350	return osd_data->bvec_pos.iter.bi_size;
				351	default:
				352	WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
				353	return 0;
				354	}
				355	}
				356
				357	static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
				358	{
				359	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
				360	int num_pages;
				361
				362	num_pages = calc_pages_for((u64)osd_data->alignment,
				363	(u64)osd_data->length);
				364	ceph_release_page_vector(osd_data->pages, num_pages);
				365	}
				366	ceph_osd_data_init(osd_data);
				367	}
				368
				369	static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
				370	unsigned int which)
				371	{
				372	struct ceph_osd_req_op *op;
				373
				374	BUG_ON(which >= osd_req->r_num_ops);
				375	op = &osd_req->r_ops[which];
				376
				377	switch (op->op) {
				378	case CEPH_OSD_OP_READ:
				379	case CEPH_OSD_OP_WRITE:
				380	case CEPH_OSD_OP_WRITEFULL:
				381	ceph_osd_data_release(&op->extent.osd_data);
				382	break;
				383	case CEPH_OSD_OP_CALL:
				384	ceph_osd_data_release(&op->cls.request_info);
				385	ceph_osd_data_release(&op->cls.request_data);
				386	ceph_osd_data_release(&op->cls.response_data);
				387	break;
				388	case CEPH_OSD_OP_SETXATTR:
				389	case CEPH_OSD_OP_CMPXATTR:
				390	ceph_osd_data_release(&op->xattr.osd_data);
				391	break;
				392	case CEPH_OSD_OP_STAT:
				393	ceph_osd_data_release(&op->raw_data_in);
				394	break;
				395	case CEPH_OSD_OP_NOTIFY_ACK:
				396	ceph_osd_data_release(&op->notify_ack.request_data);
				397	break;
				398	case CEPH_OSD_OP_NOTIFY:
				399	ceph_osd_data_release(&op->notify.request_data);
				400	ceph_osd_data_release(&op->notify.response_data);
				401	break;
				402	case CEPH_OSD_OP_LIST_WATCHERS:
				403	ceph_osd_data_release(&op->list_watchers.response_data);
				404	break;
				405	default:
				406	break;
				407	}
				408	}
				409
				410	/*
				411	* Assumes @t is zero-initialized.
				412	*/
				413	static void target_init(struct ceph_osd_request_target *t)
				414	{
				415	ceph_oid_init(&t->base_oid);
				416	ceph_oloc_init(&t->base_oloc);
				417	ceph_oid_init(&t->target_oid);
				418	ceph_oloc_init(&t->target_oloc);
				419
				420	ceph_osds_init(&t->acting);
				421	ceph_osds_init(&t->up);
				422	t->size = -1;
				423	t->min_size = -1;
				424
				425	t->osd = CEPH_HOMELESS_OSD;
				426	}
				427
				428	static void target_copy(struct ceph_osd_request_target *dest,
				429	const struct ceph_osd_request_target *src)
				430	{
				431	ceph_oid_copy(&dest->base_oid, &src->base_oid);
				432	ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
				433	ceph_oid_copy(&dest->target_oid, &src->target_oid);
				434	ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
				435
				436	dest->pgid = src->pgid; /* struct */
				437	dest->spgid = src->spgid; /* struct */
				438	dest->pg_num = src->pg_num;
				439	dest->pg_num_mask = src->pg_num_mask;
				440	ceph_osds_copy(&dest->acting, &src->acting);
				441	ceph_osds_copy(&dest->up, &src->up);
				442	dest->size = src->size;
				443	dest->min_size = src->min_size;
				444	dest->sort_bitwise = src->sort_bitwise;
				445
				446	dest->flags = src->flags;
				447	dest->paused = src->paused;
				448
				449	dest->epoch = src->epoch;
				450	dest->last_force_resend = src->last_force_resend;
				451
				452	dest->osd = src->osd;
				453	}
				454
				455	static void target_destroy(struct ceph_osd_request_target *t)
				456	{
				457	ceph_oid_destroy(&t->base_oid);
				458	ceph_oloc_destroy(&t->base_oloc);
				459	ceph_oid_destroy(&t->target_oid);
				460	ceph_oloc_destroy(&t->target_oloc);
				461	}
				462
				463	/*
				464	* requests
				465	*/
				466	static void request_release_checks(struct ceph_osd_request *req)
				467	{
				468	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
				469	WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
				470	WARN_ON(!list_empty(&req->r_unsafe_item));
				471	WARN_ON(req->r_osd);
				472	}
				473
				474	static void ceph_osdc_release_request(struct kref *kref)
				475	{
				476	struct ceph_osd_request *req = container_of(kref,
				477	struct ceph_osd_request, r_kref);
				478	unsigned int which;
				479
				480	dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
				481	req->r_request, req->r_reply);
				482	request_release_checks(req);
				483
				484	if (req->r_request)
				485	ceph_msg_put(req->r_request);
				486	if (req->r_reply)
				487	ceph_msg_put(req->r_reply);
				488
				489	for (which = 0; which < req->r_num_ops; which++)
				490	osd_req_op_data_release(req, which);
				491
				492	target_destroy(&req->r_t);
				493	ceph_put_snap_context(req->r_snapc);
				494
				495	if (req->r_mempool)
				496	mempool_free(req, req->r_osdc->req_mempool);
				497	else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
				498	kmem_cache_free(ceph_osd_request_cache, req);
				499	else
				500	kfree(req);
				501	}
				502
				503	void ceph_osdc_get_request(struct ceph_osd_request *req)
				504	{
				505	dout("%s %p (was %d)\n", __func__, req,
				506	kref_read(&req->r_kref));
				507	kref_get(&req->r_kref);
				508	}
				509	EXPORT_SYMBOL(ceph_osdc_get_request);
				510
				511	void ceph_osdc_put_request(struct ceph_osd_request *req)
				512	{
				513	if (req) {
				514	dout("%s %p (was %d)\n", __func__, req,
				515	kref_read(&req->r_kref));
				516	kref_put(&req->r_kref, ceph_osdc_release_request);
				517	}
				518	}
				519	EXPORT_SYMBOL(ceph_osdc_put_request);
				520
				521	static void request_init(struct ceph_osd_request *req)
				522	{
				523	/* req only, each op is zeroed in _osd_req_op_init() */
				524	memset(req, 0, sizeof(*req));
				525
				526	kref_init(&req->r_kref);
				527	init_completion(&req->r_completion);
				528	RB_CLEAR_NODE(&req->r_node);
				529	RB_CLEAR_NODE(&req->r_mc_node);
				530	INIT_LIST_HEAD(&req->r_unsafe_item);
				531
				532	target_init(&req->r_t);
				533	}
				534
				535	/*
				536	* This is ugly, but it allows us to reuse linger registration and ping
				537	* requests, keeping the structure of the code around send_linger{_ping}()
				538	* reasonable. Setting up a min_nr=2 mempool for each linger request
				539	* and dealing with copying ops (this blasts req only, watch op remains
				540	* intact) isn't any better.
				541	*/
				542	static void request_reinit(struct ceph_osd_request *req)
				543	{
				544	struct ceph_osd_client *osdc = req->r_osdc;
				545	bool mempool = req->r_mempool;
				546	unsigned int num_ops = req->r_num_ops;
				547	u64 snapid = req->r_snapid;
				548	struct ceph_snap_context *snapc = req->r_snapc;
				549	bool linger = req->r_linger;
				550	struct ceph_msg *request_msg = req->r_request;
				551	struct ceph_msg *reply_msg = req->r_reply;
				552
				553	dout("%s req %p\n", __func__, req);
				554	WARN_ON(kref_read(&req->r_kref) != 1);
				555	request_release_checks(req);
				556
				557	WARN_ON(kref_read(&request_msg->kref) != 1);
				558	WARN_ON(kref_read(&reply_msg->kref) != 1);
				559	target_destroy(&req->r_t);
				560
				561	request_init(req);
				562	req->r_osdc = osdc;
				563	req->r_mempool = mempool;
				564	req->r_num_ops = num_ops;
				565	req->r_snapid = snapid;
				566	req->r_snapc = snapc;
				567	req->r_linger = linger;
				568	req->r_request = request_msg;
				569	req->r_reply = reply_msg;
				570	}
				571
				572	struct ceph_osd_request ceph_osdc_alloc_request(struct ceph_osd_client osdc,
				573	struct ceph_snap_context *snapc,
				574	unsigned int num_ops,
				575	bool use_mempool,
				576	gfp_t gfp_flags)
				577	{
				578	struct ceph_osd_request *req;
				579
				580	if (use_mempool) {
				581	BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
				582	req = mempool_alloc(osdc->req_mempool, gfp_flags);
				583	} else if (num_ops <= CEPH_OSD_SLAB_OPS) {
				584	req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
				585	} else {
				586	BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
				587	req = kmalloc(struct_size(req, r_ops, num_ops), gfp_flags);
				588	}
				589	if (unlikely(!req))
				590	return NULL;
				591
				592	request_init(req);
				593	req->r_osdc = osdc;
				594	req->r_mempool = use_mempool;
				595	req->r_num_ops = num_ops;
				596	req->r_snapid = CEPH_NOSNAP;
				597	req->r_snapc = ceph_get_snap_context(snapc);
				598
				599	dout("%s req %p\n", __func__, req);
				600	return req;
				601	}
				602	EXPORT_SYMBOL(ceph_osdc_alloc_request);
				603
				604	static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc)
				605	{
				606	return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
				607	}
				608
				609	int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
				610	{
				611	struct ceph_osd_client *osdc = req->r_osdc;
				612	struct ceph_msg *msg;
				613	int msg_size;
				614
				615	WARN_ON(ceph_oid_empty(&req->r_base_oid));
				616	WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
				617
				618	/* create request message */
				619	msg_size = CEPH_ENCODING_START_BLK_LEN +
				620	CEPH_PGID_ENCODING_LEN + 1; /* spgid */
				621	msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */
				622	msg_size += CEPH_ENCODING_START_BLK_LEN +
				623	sizeof(struct ceph_osd_reqid); /* reqid */
				624	msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */
				625	msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */
				626	msg_size += CEPH_ENCODING_START_BLK_LEN +
				627	ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
				628	msg_size += 4 + req->r_base_oid.name_len; /* oid */
				629	msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
				630	msg_size += 8; /* snapid */
				631	msg_size += 8; /* snap_seq */
				632	msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
				633	msg_size += 4 + 8; /* retry_attempt, features */
				634
				635	if (req->r_mempool)
				636	msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
				637	else
				638	msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
				639	if (!msg)
				640	return -ENOMEM;
				641
				642	memset(msg->front.iov_base, 0, msg->front.iov_len);
				643	req->r_request = msg;
				644
				645	/* create reply message */
				646	msg_size = OSD_OPREPLY_FRONT_LEN;
				647	msg_size += req->r_base_oid.name_len;
				648	msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
				649
				650	if (req->r_mempool)
				651	msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
				652	else
				653	msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
				654	if (!msg)
				655	return -ENOMEM;
				656
				657	req->r_reply = msg;
				658
				659	return 0;
				660	}
				661	EXPORT_SYMBOL(ceph_osdc_alloc_messages);
				662
				663	static bool osd_req_opcode_valid(u16 opcode)
				664	{
				665	switch (opcode) {
				666	#define GENERATE_CASE(op, opcode, str) case CEPH_OSD_OP_##op: return true;
				667	__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
				668	#undef GENERATE_CASE
				669	default:
				670	return false;
				671	}
				672	}
				673
				674	/*
				675	* This is an osd op init function for opcodes that have no data or
				676	* other information associated with them. It also serves as a
				677	* common init routine for all the other init functions, below.
				678	*/
				679	static struct ceph_osd_req_op *
				680	_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
				681	u16 opcode, u32 flags)
				682	{
				683	struct ceph_osd_req_op *op;
				684
				685	BUG_ON(which >= osd_req->r_num_ops);
				686	BUG_ON(!osd_req_opcode_valid(opcode));
				687
				688	op = &osd_req->r_ops[which];
				689	memset(op, 0, sizeof (*op));
				690	op->op = opcode;
				691	op->flags = flags;
				692
				693	return op;
				694	}
				695
				696	void osd_req_op_init(struct ceph_osd_request *osd_req,
				697	unsigned int which, u16 opcode, u32 flags)
				698	{
				699	(void)_osd_req_op_init(osd_req, which, opcode, flags);
				700	}
				701	EXPORT_SYMBOL(osd_req_op_init);
				702
				703	void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
				704	unsigned int which, u16 opcode,
				705	u64 offset, u64 length,
				706	u64 truncate_size, u32 truncate_seq)
				707	{
				708	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				709	opcode, 0);
				710	size_t payload_len = 0;
				711
				712	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
				713	opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
				714	opcode != CEPH_OSD_OP_TRUNCATE);
				715
				716	op->extent.offset = offset;
				717	op->extent.length = length;
				718	op->extent.truncate_size = truncate_size;
				719	op->extent.truncate_seq = truncate_seq;
				720	if (opcode == CEPH_OSD_OP_WRITE \|\| opcode == CEPH_OSD_OP_WRITEFULL)
				721	payload_len += length;
				722
				723	op->indata_len = payload_len;
				724	}
				725	EXPORT_SYMBOL(osd_req_op_extent_init);
				726
				727	void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
				728	unsigned int which, u64 length)
				729	{
				730	struct ceph_osd_req_op *op;
				731	u64 previous;
				732
				733	BUG_ON(which >= osd_req->r_num_ops);
				734	op = &osd_req->r_ops[which];
				735	previous = op->extent.length;
				736
				737	if (length == previous)
				738	return; /* Nothing to do */
				739	BUG_ON(length > previous);
				740
				741	op->extent.length = length;
				742	if (op->op == CEPH_OSD_OP_WRITE \|\| op->op == CEPH_OSD_OP_WRITEFULL)
				743	op->indata_len -= previous - length;
				744	}
				745	EXPORT_SYMBOL(osd_req_op_extent_update);
				746
				747	void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
				748	unsigned int which, u64 offset_inc)
				749	{
				750	struct ceph_osd_req_op op, prev_op;
				751
				752	BUG_ON(which + 1 >= osd_req->r_num_ops);
				753
				754	prev_op = &osd_req->r_ops[which];
				755	op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags);
				756	/* dup previous one */
				757	op->indata_len = prev_op->indata_len;
				758	op->outdata_len = prev_op->outdata_len;
				759	op->extent = prev_op->extent;
				760	/* adjust offset */
				761	op->extent.offset += offset_inc;
				762	op->extent.length -= offset_inc;
				763
				764	if (op->op == CEPH_OSD_OP_WRITE \|\| op->op == CEPH_OSD_OP_WRITEFULL)
				765	op->indata_len -= offset_inc;
				766	}
				767	EXPORT_SYMBOL(osd_req_op_extent_dup_last);
				768
				769	int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
				770	u16 opcode, const char class, const char method)
				771	{
				772	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				773	opcode, 0);
				774	struct ceph_pagelist *pagelist;
				775	size_t payload_len = 0;
				776	size_t size;
				777
				778	BUG_ON(opcode != CEPH_OSD_OP_CALL);
				779
				780	pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
				781	if (!pagelist)
				782	return -ENOMEM;
				783
				784	ceph_pagelist_init(pagelist);
				785
				786	op->cls.class_name = class;
				787	size = strlen(class);
				788	BUG_ON(size > (size_t) U8_MAX);
				789	op->cls.class_len = size;
				790	ceph_pagelist_append(pagelist, class, size);
				791	payload_len += size;
				792
				793	op->cls.method_name = method;
				794	size = strlen(method);
				795	BUG_ON(size > (size_t) U8_MAX);
				796	op->cls.method_len = size;
				797	ceph_pagelist_append(pagelist, method, size);
				798	payload_len += size;
				799
				800	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
				801
				802	op->indata_len = payload_len;
				803	return 0;
				804	}
				805	EXPORT_SYMBOL(osd_req_op_cls_init);
				806
				807	int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
				808	u16 opcode, const char name, const void value,
				809	size_t size, u8 cmp_op, u8 cmp_mode)
				810	{
				811	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				812	opcode, 0);
				813	struct ceph_pagelist *pagelist;
				814	size_t payload_len;
				815
				816	BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
				817
				818	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
				819	if (!pagelist)
				820	return -ENOMEM;
				821
				822	ceph_pagelist_init(pagelist);
				823
				824	payload_len = strlen(name);
				825	op->xattr.name_len = payload_len;
				826	ceph_pagelist_append(pagelist, name, payload_len);
				827
				828	op->xattr.value_len = size;
				829	ceph_pagelist_append(pagelist, value, size);
				830	payload_len += size;
				831
				832	op->xattr.cmp_op = cmp_op;
				833	op->xattr.cmp_mode = cmp_mode;
				834
				835	ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
				836	op->indata_len = payload_len;
				837	return 0;
				838	}
				839	EXPORT_SYMBOL(osd_req_op_xattr_init);
				840
				841	/*
				842	* @watch_opcode: CEPH_OSD_WATCH_OP_*
				843	*/
				844	static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
				845	u64 cookie, u8 watch_opcode)
				846	{
				847	struct ceph_osd_req_op *op;
				848
				849	op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
				850	op->watch.cookie = cookie;
				851	op->watch.op = watch_opcode;
				852	op->watch.gen = 0;
				853	}
				854
				855	void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
				856	unsigned int which,
				857	u64 expected_object_size,
				858	u64 expected_write_size)
				859	{
				860	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				861	CEPH_OSD_OP_SETALLOCHINT,
				862	0);
				863
				864	op->alloc_hint.expected_object_size = expected_object_size;
				865	op->alloc_hint.expected_write_size = expected_write_size;
				866
				867	/*
				868	* CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
				869	* not worth a feature bit. Set FAILOK per-op flag to make
				870	* sure older osds don't trip over an unsupported opcode.
				871	*/
				872	op->flags \|= CEPH_OSD_OP_FLAG_FAILOK;
				873	}
				874	EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
				875
				876	static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
				877	struct ceph_osd_data *osd_data)
				878	{
				879	u64 length = ceph_osd_data_length(osd_data);
				880
				881	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
				882	BUG_ON(length > (u64) SIZE_MAX);
				883	if (length)
				884	ceph_msg_data_add_pages(msg, osd_data->pages,
				885	length, osd_data->alignment);
				886	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
				887	BUG_ON(!length);
				888	ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
				889	#ifdef CONFIG_BLOCK
				890	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
				891	ceph_msg_data_add_bio(msg, &osd_data->bio_pos, length);
				892	#endif
				893	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) {
				894	ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos);
				895	} else {
				896	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
				897	}
				898	}
				899
				900	static u32 osd_req_encode_op(struct ceph_osd_op *dst,
				901	const struct ceph_osd_req_op *src)
				902	{
				903	if (WARN_ON(!osd_req_opcode_valid(src->op))) {
				904	pr_err("unrecognized osd opcode %d\n", src->op);
				905
				906	return 0;
				907	}
				908
				909	switch (src->op) {
				910	case CEPH_OSD_OP_STAT:
				911	break;
				912	case CEPH_OSD_OP_READ:
				913	case CEPH_OSD_OP_WRITE:
				914	case CEPH_OSD_OP_WRITEFULL:
				915	case CEPH_OSD_OP_ZERO:
				916	case CEPH_OSD_OP_TRUNCATE:
				917	dst->extent.offset = cpu_to_le64(src->extent.offset);
				918	dst->extent.length = cpu_to_le64(src->extent.length);
				919	dst->extent.truncate_size =
				920	cpu_to_le64(src->extent.truncate_size);
				921	dst->extent.truncate_seq =
				922	cpu_to_le32(src->extent.truncate_seq);
				923	break;
				924	case CEPH_OSD_OP_CALL:
				925	dst->cls.class_len = src->cls.class_len;
				926	dst->cls.method_len = src->cls.method_len;
				927	dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
				928	break;
				929	case CEPH_OSD_OP_WATCH:
				930	dst->watch.cookie = cpu_to_le64(src->watch.cookie);
				931	dst->watch.ver = cpu_to_le64(0);
				932	dst->watch.op = src->watch.op;
				933	dst->watch.gen = cpu_to_le32(src->watch.gen);
				934	break;
				935	case CEPH_OSD_OP_NOTIFY_ACK:
				936	break;
				937	case CEPH_OSD_OP_NOTIFY:
				938	dst->notify.cookie = cpu_to_le64(src->notify.cookie);
				939	break;
				940	case CEPH_OSD_OP_LIST_WATCHERS:
				941	break;
				942	case CEPH_OSD_OP_SETALLOCHINT:
				943	dst->alloc_hint.expected_object_size =
				944	cpu_to_le64(src->alloc_hint.expected_object_size);
				945	dst->alloc_hint.expected_write_size =
				946	cpu_to_le64(src->alloc_hint.expected_write_size);
				947	break;
				948	case CEPH_OSD_OP_SETXATTR:
				949	case CEPH_OSD_OP_CMPXATTR:
				950	dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
				951	dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
				952	dst->xattr.cmp_op = src->xattr.cmp_op;
				953	dst->xattr.cmp_mode = src->xattr.cmp_mode;
				954	break;
				955	case CEPH_OSD_OP_CREATE:
				956	case CEPH_OSD_OP_DELETE:
				957	break;
				958	default:
				959	pr_err("unsupported osd opcode %s\n",
				960	ceph_osd_op_name(src->op));
				961	WARN_ON(1);
				962
				963	return 0;
				964	}
				965
				966	dst->op = cpu_to_le16(src->op);
				967	dst->flags = cpu_to_le32(src->flags);
				968	dst->payload_len = cpu_to_le32(src->indata_len);
				969
				970	return src->indata_len;
				971	}
				972
				973	/*
				974	* build new request AND message, calculate layout, and adjust file
				975	* extent as needed.
				976	*
				977	* if the file was recently truncated, we include information about its
				978	* old and new size so that the object can be updated appropriately. (we
				979	* avoid synchronously deleting truncated objects because it's slow.)
				980	*/
				981	struct ceph_osd_request ceph_osdc_new_request(struct ceph_osd_client osdc,
				982	struct ceph_file_layout *layout,
				983	struct ceph_vino vino,
				984	u64 off, u64 *plen,
				985	unsigned int which, int num_ops,
				986	int opcode, int flags,
				987	struct ceph_snap_context *snapc,
				988	u32 truncate_seq,
				989	u64 truncate_size,
				990	bool use_mempool)
				991	{
				992	struct ceph_osd_request *req;
				993	u64 objnum = 0;
				994	u64 objoff = 0;
				995	u64 objlen = 0;
				996	int r;
				997
				998	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
				999	opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
				1000	opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE);
				1001
				1002	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
				1003	GFP_NOFS);
				1004	if (!req) {
				1005	r = -ENOMEM;
				1006	goto fail;
				1007	}
				1008
				1009	/* calculate max write size */
				1010	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
				1011	if (r)
				1012	goto fail;
				1013
				1014	if (opcode == CEPH_OSD_OP_CREATE \|\| opcode == CEPH_OSD_OP_DELETE) {
				1015	osd_req_op_init(req, which, opcode, 0);
				1016	} else {
				1017	u32 object_size = layout->object_size;
				1018	u32 object_base = off - objoff;
				1019	if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
				1020	if (truncate_size <= object_base) {
				1021	truncate_size = 0;
				1022	} else {
				1023	truncate_size -= object_base;
				1024	if (truncate_size > object_size)
				1025	truncate_size = object_size;
				1026	}
				1027	}
				1028	osd_req_op_extent_init(req, which, opcode, objoff, objlen,
				1029	truncate_size, truncate_seq);
				1030	}
				1031
				1032	req->r_flags = flags;
				1033	req->r_base_oloc.pool = layout->pool_id;
				1034	req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
				1035	ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
				1036
				1037	req->r_snapid = vino.snap;
				1038	if (flags & CEPH_OSD_FLAG_WRITE)
				1039	req->r_data_offset = off;
				1040
				1041	r = ceph_osdc_alloc_messages(req, GFP_NOFS);
				1042	if (r)
				1043	goto fail;
				1044
				1045	return req;
				1046
				1047	fail:
				1048	ceph_osdc_put_request(req);
				1049	return ERR_PTR(r);
				1050	}
				1051	EXPORT_SYMBOL(ceph_osdc_new_request);
				1052
				1053	/*
				1054	* We keep osd requests in an rbtree, sorted by ->r_tid.
				1055	*/
				/*
				 * NOTE(review): DEFINE_RB_FUNCS() is defined outside this file view.
				 * Presumably this instantiation provides the insert_request() and
				 * erase_request() helpers used by link_request()/unlink_request(),
				 * keyed by r_tid and linked through r_node -- confirm.
				 */
				1056	DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
				/* Same key (r_tid), but linked through the separate r_mc_node. */
				1057	DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
				1058
				1059	/*
				1060	* Call @fn on each OSD request as long as @fn returns 0.
				1061	*/
				1062	static void for_each_request(struct ceph_osd_client *osdc,
				1063	int (fn)(struct ceph_osd_request req, void *arg),
				1064	void *arg)
				1065	{
				1066	struct rb_node n, p;
				1067
				1068	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
				1069	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				1070
				1071	for (p = rb_first(&osd->o_requests); p; ) {
				1072	struct ceph_osd_request *req =
				1073	rb_entry(p, struct ceph_osd_request, r_node);
				1074
				1075	p = rb_next(p);
				1076	if (fn(req, arg))
				1077	return;
				1078	}
				1079	}
				1080
				1081	for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
				1082	struct ceph_osd_request *req =
				1083	rb_entry(p, struct ceph_osd_request, r_node);
				1084
				1085	p = rb_next(p);
				1086	if (fn(req, arg))
				1087	return;
				1088	}
				1089	}
				1090
				1091	static bool osd_homeless(struct ceph_osd *osd)
				1092	{
				1093	return osd->o_osd == CEPH_HOMELESS_OSD;
				1094	}
				1095
				1096	static bool osd_registered(struct ceph_osd *osd)
				1097	{
				1098	verify_osdc_locked(osd->o_osdc);
				1099
				1100	return !RB_EMPTY_NODE(&osd->o_node);
				1101	}
				1102
				1103	/*
				1104	* Assumes @osd is zero-initialized.
				1105	*/
				1106	static void osd_init(struct ceph_osd *osd)
				1107	{
				1108	refcount_set(&osd->o_ref, 1);
				1109	RB_CLEAR_NODE(&osd->o_node);
				1110	osd->o_requests = RB_ROOT;
				1111	osd->o_linger_requests = RB_ROOT;
				1112	osd->o_backoff_mappings = RB_ROOT;
				1113	osd->o_backoffs_by_id = RB_ROOT;
				1114	INIT_LIST_HEAD(&osd->o_osd_lru);
				1115	INIT_LIST_HEAD(&osd->o_keepalive_item);
				1116	osd->o_incarnation = 1;
				1117	mutex_init(&osd->lock);
				1118	}
				1119
				1120	static void osd_cleanup(struct ceph_osd *osd)
				1121	{
				1122	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
				1123	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
				1124	WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
				1125	WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings));
				1126	WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id));
				1127	WARN_ON(!list_empty(&osd->o_osd_lru));
				1128	WARN_ON(!list_empty(&osd->o_keepalive_item));
				1129
				1130	if (osd->o_auth.authorizer) {
				1131	WARN_ON(osd_homeless(osd));
				1132	ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
				1133	}
				1134	}
				1135
				1136	/*
				1137	* Track open sessions with osds.
				1138	*/
				1139	static struct ceph_osd create_osd(struct ceph_osd_client osdc, int onum)
				1140	{
				1141	struct ceph_osd *osd;
				1142
				1143	WARN_ON(onum == CEPH_HOMELESS_OSD);
				1144
				1145	osd = kzalloc(sizeof(*osd), GFP_NOIO \| __GFP_NOFAIL);
				1146	osd_init(osd);
				1147	osd->o_osdc = osdc;
				1148	osd->o_osd = onum;
				1149
				1150	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
				1151
				1152	return osd;
				1153	}
				1154
				1155	static struct ceph_osd get_osd(struct ceph_osd osd)
				1156	{
				1157	if (refcount_inc_not_zero(&osd->o_ref)) {
				1158	dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1,
				1159	refcount_read(&osd->o_ref));
				1160	return osd;
				1161	} else {
				1162	dout("get_osd %p FAIL\n", osd);
				1163	return NULL;
				1164	}
				1165	}
				1166
				1167	static void put_osd(struct ceph_osd *osd)
				1168	{
				1169	dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref),
				1170	refcount_read(&osd->o_ref) - 1);
				1171	if (refcount_dec_and_test(&osd->o_ref)) {
				1172	osd_cleanup(osd);
				1173	kfree(osd);
				1174	}
				1175	}
				1176
				/*
				 * NOTE(review): DEFINE_RB_FUNCS() is defined outside this file view.
				 * Presumably this instantiation provides the lookup_osd(),
				 * insert_osd() and erase_osd() helpers used on osdc->osds in this
				 * file, keyed by o_osd and linked through o_node -- confirm.
				 */
				1177	DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
				1178
				1179	static void __move_osd_to_lru(struct ceph_osd *osd)
				1180	{
				1181	struct ceph_osd_client *osdc = osd->o_osdc;
				1182
				1183	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				1184	BUG_ON(!list_empty(&osd->o_osd_lru));
				1185
				1186	spin_lock(&osdc->osd_lru_lock);
				1187	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
				1188	spin_unlock(&osdc->osd_lru_lock);
				1189
				1190	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
				1191	}
				1192
				1193	static void maybe_move_osd_to_lru(struct ceph_osd *osd)
				1194	{
				1195	if (RB_EMPTY_ROOT(&osd->o_requests) &&
				1196	RB_EMPTY_ROOT(&osd->o_linger_requests))
				1197	__move_osd_to_lru(osd);
				1198	}
				1199
				1200	static void __remove_osd_from_lru(struct ceph_osd *osd)
				1201	{
				1202	struct ceph_osd_client *osdc = osd->o_osdc;
				1203
				1204	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				1205
				1206	spin_lock(&osdc->osd_lru_lock);
				1207	if (!list_empty(&osd->o_osd_lru))
				1208	list_del_init(&osd->o_osd_lru);
				1209	spin_unlock(&osdc->osd_lru_lock);
				1210	}
				1211
				1212	/*
				1213	* Close the connection and assign any leftover requests to the
				1214	* homeless session.
				1215	*/
				1216	static void close_osd(struct ceph_osd *osd)
				1217	{
				1218	struct ceph_osd_client *osdc = osd->o_osdc;
				1219	struct rb_node *n;
				1220
				1221	verify_osdc_wrlocked(osdc);
				1222	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				1223
				1224	ceph_con_close(&osd->o_con);
				1225
				1226	for (n = rb_first(&osd->o_requests); n; ) {
				1227	struct ceph_osd_request *req =
				1228	rb_entry(n, struct ceph_osd_request, r_node);
				1229
				1230	n = rb_next(n); /* unlink_request() */
				1231
				1232	dout(" reassigning req %p tid %llu\n", req, req->r_tid);
				1233	unlink_request(osd, req);
				1234	link_request(&osdc->homeless_osd, req);
				1235	}
				1236	for (n = rb_first(&osd->o_linger_requests); n; ) {
				1237	struct ceph_osd_linger_request *lreq =
				1238	rb_entry(n, struct ceph_osd_linger_request, node);
				1239
				1240	n = rb_next(n); /* unlink_linger() */
				1241
				1242	dout(" reassigning lreq %p linger_id %llu\n", lreq,
				1243	lreq->linger_id);
				1244	unlink_linger(osd, lreq);
				1245	link_linger(&osdc->homeless_osd, lreq);
				1246	}
				1247	clear_backoffs(osd);
				1248
				1249	__remove_osd_from_lru(osd);
				1250	erase_osd(&osdc->osds, osd);
				1251	put_osd(osd);
				1252	}
				1253
				1254	/*
				1255	* reset osd connect
				1256	*/
				1257	static int reopen_osd(struct ceph_osd *osd)
				1258	{
				1259	struct ceph_entity_addr *peer_addr;
				1260
				1261	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				1262
				1263	if (RB_EMPTY_ROOT(&osd->o_requests) &&
				1264	RB_EMPTY_ROOT(&osd->o_linger_requests)) {
				1265	close_osd(osd);
				1266	return -ENODEV;
				1267	}
				1268
				1269	peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
				1270	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
				1271	!ceph_con_opened(&osd->o_con)) {
				1272	struct rb_node *n;
				1273
				1274	dout("osd addr hasn't changed and connection never opened, "
				1275	"letting msgr retry\n");
				1276	/* touch each r_stamp for handle_timeout()'s benfit */
				1277	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
				1278	struct ceph_osd_request *req =
				1279	rb_entry(n, struct ceph_osd_request, r_node);
				1280	req->r_stamp = jiffies;
				1281	}
				1282
				1283	return -EAGAIN;
				1284	}
				1285
				1286	ceph_con_close(&osd->o_con);
				1287	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
				1288	osd->o_incarnation++;
				1289
				1290	return 0;
				1291	}
				1292
				1293	static struct ceph_osd lookup_create_osd(struct ceph_osd_client osdc, int o,
				1294	bool wrlocked)
				1295	{
				1296	struct ceph_osd *osd;
				1297
				1298	if (wrlocked)
				1299	verify_osdc_wrlocked(osdc);
				1300	else
				1301	verify_osdc_locked(osdc);
				1302
				1303	if (o != CEPH_HOMELESS_OSD)
				1304	osd = lookup_osd(&osdc->osds, o);
				1305	else
				1306	osd = &osdc->homeless_osd;
				1307	if (!osd) {
				1308	if (!wrlocked)
				1309	return ERR_PTR(-EAGAIN);
				1310
				1311	osd = create_osd(osdc, o);
				1312	insert_osd(&osdc->osds, osd);
				1313	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
				1314	&osdc->osdmap->osd_addr[osd->o_osd]);
				1315	}
				1316
				1317	dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
				1318	return osd;
				1319	}
				1320
				1321	/*
				1322	* Create request <-> OSD session relation.
				1323	*
				1324	* @req has to be assigned a tid, @osd may be homeless.
				1325	*/
				1326	static void link_request(struct ceph_osd osd, struct ceph_osd_request req)
				1327	{
				1328	verify_osd_locked(osd);
				1329	WARN_ON(!req->r_tid \|\| req->r_osd);
				1330	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
				1331	req, req->r_tid);
				1332
				1333	if (!osd_homeless(osd))
				1334	__remove_osd_from_lru(osd);
				1335	else
				1336	atomic_inc(&osd->o_osdc->num_homeless);
				1337
				1338	get_osd(osd);
				1339	insert_request(&osd->o_requests, req);
				1340	req->r_osd = osd;
				1341	}
				1342
				1343	static void unlink_request(struct ceph_osd osd, struct ceph_osd_request req)
				1344	{
				1345	verify_osd_locked(osd);
				1346	WARN_ON(req->r_osd != osd);
				1347	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
				1348	req, req->r_tid);
				1349
				1350	req->r_osd = NULL;
				1351	erase_request(&osd->o_requests, req);
				1352	put_osd(osd);
				1353
				1354	if (!osd_homeless(osd))
				1355	maybe_move_osd_to_lru(osd);
				1356	else
				1357	atomic_dec(&osd->o_osdc->num_homeless);
				1358	}
				1359
				1360	static bool __pool_full(struct ceph_pg_pool_info *pi)
				1361	{
				1362	return pi->flags & CEPH_POOL_FLAG_FULL;
				1363	}
				1364
				1365	static bool have_pool_full(struct ceph_osd_client *osdc)
				1366	{
				1367	struct rb_node *n;
				1368
				1369	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
				1370	struct ceph_pg_pool_info *pi =
				1371	rb_entry(n, struct ceph_pg_pool_info, node);
				1372
				1373	if (__pool_full(pi))
				1374	return true;
				1375	}
				1376
				1377	return false;
				1378	}
				1379
				1380	static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
				1381	{
				1382	struct ceph_pg_pool_info *pi;
				1383
				1384	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
				1385	if (!pi)
				1386	return false;
				1387
				1388	return __pool_full(pi);
				1389	}
				1390
				1391	/*
				1392	* Returns whether a request should be blocked from being sent
				1393	* based on the current osdmap and osd_client settings.
				1394	*/
				1395	static bool target_should_be_paused(struct ceph_osd_client *osdc,
				1396	const struct ceph_osd_request_target *t,
				1397	struct ceph_pg_pool_info *pi)
				1398	{
				1399	bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
				1400	bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) \|\|
				1401	ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				1402	__pool_full(pi);
				1403
				1404	WARN_ON(pi->id != t->target_oloc.pool);
				1405	return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) \|\|
				1406	((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) \|\|
				1407	(osdc->osdmap->epoch < osdc->epoch_barrier);
				1408	}
				1409
				/* Outcome of calc_target(). */
				enum calc_target_result {
					/* the target did not change in a way that requires a resend */
					CALC_TARGET_NO_ACTION = 0,
					/* unpaused, remapped, forced or split: the request must be resent */
					CALC_TARGET_NEED_RESEND = 1,
					/* the base or target pool was not found in the osdmap */
					CALC_TARGET_POOL_DNE = 2,
				};
				1415
				1416	static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
				1417	struct ceph_osd_request_target *t,
				1418	struct ceph_connection *con,
				1419	bool any_change)
				1420	{
				1421	struct ceph_pg_pool_info *pi;
				1422	struct ceph_pg pgid, last_pgid;
				1423	struct ceph_osds up, acting;
				1424	bool force_resend = false;
				1425	bool unpaused = false;
				1426	bool legacy_change = false;
				1427	bool split = false;
				1428	bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
				1429	bool recovery_deletes = ceph_osdmap_flag(osdc,
				1430	CEPH_OSDMAP_RECOVERY_DELETES);
				1431	enum calc_target_result ct_res;
				1432
				1433	t->epoch = osdc->osdmap->epoch;
				1434	pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
				1435	if (!pi) {
				1436	t->osd = CEPH_HOMELESS_OSD;
				1437	ct_res = CALC_TARGET_POOL_DNE;
				1438	goto out;
				1439	}
				1440
				1441	if (osdc->osdmap->epoch == pi->last_force_request_resend) {
				1442	if (t->last_force_resend < pi->last_force_request_resend) {
				1443	t->last_force_resend = pi->last_force_request_resend;
				1444	force_resend = true;
				1445	} else if (t->last_force_resend == 0) {
				1446	force_resend = true;
				1447	}
				1448	}
				1449
				1450	/* apply tiering */
				1451	ceph_oid_copy(&t->target_oid, &t->base_oid);
				1452	ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
				1453	if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
				1454	if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
				1455	t->target_oloc.pool = pi->read_tier;
				1456	if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
				1457	t->target_oloc.pool = pi->write_tier;
				1458
				1459	pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
				1460	if (!pi) {
				1461	t->osd = CEPH_HOMELESS_OSD;
				1462	ct_res = CALC_TARGET_POOL_DNE;
				1463	goto out;
				1464	}
				1465	}
				1466
				1467	__ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, &pgid);
				1468	last_pgid.pool = pgid.pool;
				1469	last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
				1470
				1471	ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting);
				1472	if (any_change &&
				1473	ceph_is_new_interval(&t->acting,
				1474	&acting,
				1475	&t->up,
				1476	&up,
				1477	t->size,
				1478	pi->size,
				1479	t->min_size,
				1480	pi->min_size,
				1481	t->pg_num,
				1482	pi->pg_num,
				1483	t->sort_bitwise,
				1484	sort_bitwise,
				1485	t->recovery_deletes,
				1486	recovery_deletes,
				1487	&last_pgid))
				1488	force_resend = true;
				1489
				1490	if (t->paused && !target_should_be_paused(osdc, t, pi)) {
				1491	t->paused = false;
				1492	unpaused = true;
				1493	}
				1494	legacy_change = ceph_pg_compare(&t->pgid, &pgid) \|\|
				1495	ceph_osds_changed(&t->acting, &acting, any_change);
				1496	if (t->pg_num)
				1497	split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);
				1498
				1499	if (legacy_change \|\| force_resend \|\| split) {
				1500	t->pgid = pgid; /* struct */
				1501	ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid);
				1502	ceph_osds_copy(&t->acting, &acting);
				1503	ceph_osds_copy(&t->up, &up);
				1504	t->size = pi->size;
				1505	t->min_size = pi->min_size;
				1506	t->pg_num = pi->pg_num;
				1507	t->pg_num_mask = pi->pg_num_mask;
				1508	t->sort_bitwise = sort_bitwise;
				1509	t->recovery_deletes = recovery_deletes;
				1510
				1511	t->osd = acting.primary;
				1512	}
				1513
				1514	if (unpaused \|\| legacy_change \|\| force_resend \|\| split)
				1515	ct_res = CALC_TARGET_NEED_RESEND;
				1516	else
				1517	ct_res = CALC_TARGET_NO_ACTION;
				1518
				1519	out:
				1520	dout("%s t %p -> %d%d%d%d ct_res %d osd%d\n", __func__, t, unpaused,
				1521	legacy_change, force_resend, split, ct_res, t->osd);
				1522	return ct_res;
				1523	}
				1524
				1525	static struct ceph_spg_mapping *alloc_spg_mapping(void)
				1526	{
				1527	struct ceph_spg_mapping *spg;
				1528
				1529	spg = kmalloc(sizeof(*spg), GFP_NOIO);
				1530	if (!spg)
				1531	return NULL;
				1532
				1533	RB_CLEAR_NODE(&spg->node);
				1534	spg->backoffs = RB_ROOT;
				1535	return spg;
				1536	}
				1537
				1538	static void free_spg_mapping(struct ceph_spg_mapping *spg)
				1539	{
				1540	WARN_ON(!RB_EMPTY_NODE(&spg->node));
				1541	WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs));
				1542
				1543	kfree(spg);
				1544	}
				1545
				1546	/*
				1547	* rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to
				1548	* ceph_pg_mapping. Used to track OSD backoffs -- a backoff [range] is
				1549	* defined only within a specific spgid; it does not pass anything to
				1550	* children on split, or to another primary.
				1551	*/
				/*
				 * NOTE(review): DEFINE_RB_FUNCS2() is defined outside this file view.
				 * Presumably this instantiation provides the lookup_spg_mapping() and
				 * erase_spg_mapping() helpers used in this file, comparing spgid keys
				 * with ceph_spg_compare() -- confirm.
				 */
				1552	DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare,
				1553	RB_BYPTR, const struct ceph_spg *, node)
				1554
				1555	static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid)
				1556	{
				1557	return hoid->is_max ? 0x100000000ull : hoid->hash_reverse_bits;
				1558	}
				1559
				1560	static void hoid_get_effective_key(const struct ceph_hobject_id *hoid,
				1561	void *pkey, size_t pkey_len)
				1562	{
				1563	if (hoid->key_len) {
				1564	*pkey = hoid->key;
				1565	*pkey_len = hoid->key_len;
				1566	} else {
				1567	*pkey = hoid->oid;
				1568	*pkey_len = hoid->oid_len;
				1569	}
				1570	}
				1571
				/*
				 * Compare two byte strings of explicit length.
				 *
				 * The common prefix is compared with memcmp(); if it is equal, the
				 * shorter string sorts first.  Returns the memcmp() result when the
				 * prefixes differ, otherwise -1, 0 or 1.
				 */
				static int compare_names(const void *name1, size_t name1_len,
							 const void *name2, size_t name2_len)
				{
					size_t common = name1_len < name2_len ? name1_len : name2_len;
					int diff = memcmp(name1, name2, common);

					if (diff)
						return diff;

					if (name1_len < name2_len)
						return -1;
					if (name1_len > name2_len)
						return 1;

					return 0;
				}
				1586
				1587	static int hoid_compare(const struct ceph_hobject_id *lhs,
				1588	const struct ceph_hobject_id *rhs)
				1589	{
				1590	void effective_key1, effective_key2;
				1591	size_t effective_key1_len, effective_key2_len;
				1592	int ret;
				1593
				1594	if (lhs->is_max < rhs->is_max)
				1595	return -1;
				1596	if (lhs->is_max > rhs->is_max)
				1597	return 1;
				1598
				1599	if (lhs->pool < rhs->pool)
				1600	return -1;
				1601	if (lhs->pool > rhs->pool)
				1602	return 1;
				1603
				1604	if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs))
				1605	return -1;
				1606	if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs))
				1607	return 1;
				1608
				1609	ret = compare_names(lhs->nspace, lhs->nspace_len,
				1610	rhs->nspace, rhs->nspace_len);
				1611	if (ret)
				1612	return ret;
				1613
				1614	hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len);
				1615	hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len);
				1616	ret = compare_names(effective_key1, effective_key1_len,
				1617	effective_key2, effective_key2_len);
				1618	if (ret)
				1619	return ret;
				1620
				1621	ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len);
				1622	if (ret)
				1623	return ret;
				1624
				1625	if (lhs->snapid < rhs->snapid)
				1626	return -1;
				1627	if (lhs->snapid > rhs->snapid)
				1628	return 1;
				1629
				1630	return 0;
				1631	}
				1632
				1633	/*
				1634	* For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX
				1635	* compat stuff here.
				1636	*
				1637	* Assumes @hoid is zero-initialized.
				1638	*/
				1639	static int decode_hoid(void *p, void end, struct ceph_hobject_id *hoid)
				1640	{
				1641	u8 struct_v;
				1642	u32 struct_len;
				1643	int ret;
				1644
				1645	ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v,
				1646	&struct_len);
				1647	if (ret)
				1648	return ret;
				1649
				1650	if (struct_v < 4) {
				1651	pr_err("got struct_v %d < 4 of hobject_t\n", struct_v);
				1652	goto e_inval;
				1653	}
				1654
				1655	hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len,
				1656	GFP_NOIO);
				1657	if (IS_ERR(hoid->key)) {
				1658	ret = PTR_ERR(hoid->key);
				1659	hoid->key = NULL;
				1660	return ret;
				1661	}
				1662
				1663	hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len,
				1664	GFP_NOIO);
				1665	if (IS_ERR(hoid->oid)) {
				1666	ret = PTR_ERR(hoid->oid);
				1667	hoid->oid = NULL;
				1668	return ret;
				1669	}
				1670
				1671	ceph_decode_64_safe(p, end, hoid->snapid, e_inval);
				1672	ceph_decode_32_safe(p, end, hoid->hash, e_inval);
				1673	ceph_decode_8_safe(p, end, hoid->is_max, e_inval);
				1674
				1675	hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len,
				1676	GFP_NOIO);
				1677	if (IS_ERR(hoid->nspace)) {
				1678	ret = PTR_ERR(hoid->nspace);
				1679	hoid->nspace = NULL;
				1680	return ret;
				1681	}
				1682
				1683	ceph_decode_64_safe(p, end, hoid->pool, e_inval);
				1684
				1685	ceph_hoid_build_hash_cache(hoid);
				1686	return 0;
				1687
				1688	e_inval:
				1689	return -EINVAL;
				1690	}
				1691
				1692	static int hoid_encoding_size(const struct ceph_hobject_id *hoid)
				1693	{
				1694	return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */
				1695	4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len;
				1696	}
				1697
				1698	static void encode_hoid(void *p, void end, const struct ceph_hobject_id *hoid)
				1699	{
				1700	ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid));
				1701	ceph_encode_string(p, end, hoid->key, hoid->key_len);
				1702	ceph_encode_string(p, end, hoid->oid, hoid->oid_len);
				1703	ceph_encode_64(p, hoid->snapid);
				1704	ceph_encode_32(p, hoid->hash);
				1705	ceph_encode_8(p, hoid->is_max);
				1706	ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len);
				1707	ceph_encode_64(p, hoid->pool);
				1708	}
				1709
				1710	static void free_hoid(struct ceph_hobject_id *hoid)
				1711	{
				1712	if (hoid) {
				1713	kfree(hoid->key);
				1714	kfree(hoid->oid);
				1715	kfree(hoid->nspace);
				1716	kfree(hoid);
				1717	}
				1718	}
				1719
				1720	static struct ceph_osd_backoff *alloc_backoff(void)
				1721	{
				1722	struct ceph_osd_backoff *backoff;
				1723
				1724	backoff = kzalloc(sizeof(*backoff), GFP_NOIO);
				1725	if (!backoff)
				1726	return NULL;
				1727
				1728	RB_CLEAR_NODE(&backoff->spg_node);
				1729	RB_CLEAR_NODE(&backoff->id_node);
				1730	return backoff;
				1731	}
				1732
				1733	static void free_backoff(struct ceph_osd_backoff *backoff)
				1734	{
				1735	WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node));
				1736	WARN_ON(!RB_EMPTY_NODE(&backoff->id_node));
				1737
				1738	free_hoid(backoff->begin);
				1739	free_hoid(backoff->end);
				1740	kfree(backoff);
				1741	}
				1742
				1743	/*
				1744	* Within a specific spgid, backoffs are managed by ->begin hoid.
				1745	*/
				/*
				 * NOTE(review): DEFINE_RB_INSDEL_FUNCS2() is defined outside this file
				 * view.  Presumably this instantiation provides insert/erase helpers
				 * only (erase_backoff() is used by clear_backoffs()), ordering entries
				 * by ->begin with hoid_compare(); lookups go through
				 * lookup_containing_backoff() instead -- confirm.
				 */
				1746	DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare,
				1747	RB_BYVAL, spg_node);
				1748
				1749	static struct ceph_osd_backoff lookup_containing_backoff(struct rb_root root,
				1750	const struct ceph_hobject_id *hoid)
				1751	{
				1752	struct rb_node *n = root->rb_node;
				1753
				1754	while (n) {
				1755	struct ceph_osd_backoff *cur =
				1756	rb_entry(n, struct ceph_osd_backoff, spg_node);
				1757	int cmp;
				1758
				1759	cmp = hoid_compare(hoid, cur->begin);
				1760	if (cmp < 0) {
				1761	n = n->rb_left;
				1762	} else if (cmp > 0) {
				1763	if (hoid_compare(hoid, cur->end) < 0)
				1764	return cur;
				1765
				1766	n = n->rb_right;
				1767	} else {
				1768	return cur;
				1769	}
				1770	}
				1771
				1772	return NULL;
				1773	}
				1774
				1775	/*
				1776	* Each backoff has a unique id within its OSD session.
				1777	*/
				/*
				 * NOTE(review): DEFINE_RB_FUNCS() is defined outside this file view.
				 * Presumably this instantiation provides the erase_backoff_by_id()
				 * helper used by clear_backoffs() (and matching insert/lookup
				 * helpers), keyed by id and linked through id_node -- confirm.
				 */
				1778	DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node)
				1779
				1780	static void clear_backoffs(struct ceph_osd *osd)
				1781	{
				1782	while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) {
				1783	struct ceph_spg_mapping *spg =
				1784	rb_entry(rb_first(&osd->o_backoff_mappings),
				1785	struct ceph_spg_mapping, node);
				1786
				1787	while (!RB_EMPTY_ROOT(&spg->backoffs)) {
				1788	struct ceph_osd_backoff *backoff =
				1789	rb_entry(rb_first(&spg->backoffs),
				1790	struct ceph_osd_backoff, spg_node);
				1791
				1792	erase_backoff(&spg->backoffs, backoff);
				1793	erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
				1794	free_backoff(backoff);
				1795	}
				1796	erase_spg_mapping(&osd->o_backoff_mappings, spg);
				1797	free_spg_mapping(spg);
				1798	}
				1799	}
				1800
				1801	/*
				1802	* Set up a temporary, non-owning view into @t.
				1803	*/
				1804	static void hoid_fill_from_target(struct ceph_hobject_id *hoid,
				1805	const struct ceph_osd_request_target *t)
				1806	{
				1807	hoid->key = NULL;
				1808	hoid->key_len = 0;
				1809	hoid->oid = t->target_oid.name;
				1810	hoid->oid_len = t->target_oid.name_len;
				1811	hoid->snapid = CEPH_NOSNAP;
				1812	hoid->hash = t->pgid.seed;
				1813	hoid->is_max = false;
				1814	if (t->target_oloc.pool_ns) {
				1815	hoid->nspace = t->target_oloc.pool_ns->str;
				1816	hoid->nspace_len = t->target_oloc.pool_ns->len;
				1817	} else {
				1818	hoid->nspace = NULL;
				1819	hoid->nspace_len = 0;
				1820	}
				1821	hoid->pool = t->target_oloc.pool;
				1822	ceph_hoid_build_hash_cache(hoid);
				1823	}
				1824
				1825	static bool should_plug_request(struct ceph_osd_request *req)
				1826	{
				1827	struct ceph_osd *osd = req->r_osd;
				1828	struct ceph_spg_mapping *spg;
				1829	struct ceph_osd_backoff *backoff;
				1830	struct ceph_hobject_id hoid;
				1831
				1832	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid);
				1833	if (!spg)
				1834	return false;
				1835
				1836	hoid_fill_from_target(&hoid, &req->r_t);
				1837	backoff = lookup_containing_backoff(&spg->backoffs, &hoid);
				1838	if (!backoff)
				1839	return false;
				1840
				1841	dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n",
				1842	__func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool,
				1843	backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id);
				1844	return true;
				1845	}
				1846
				1847	static void setup_request_data(struct ceph_osd_request *req,
				1848	struct ceph_msg *msg)
				1849	{
				1850	u32 data_len = 0;
				1851	int i;
				1852
				1853	if (!list_empty(&msg->data))
				1854	return;
				1855
				1856	WARN_ON(msg->data_length);
				1857	for (i = 0; i < req->r_num_ops; i++) {
				1858	struct ceph_osd_req_op *op = &req->r_ops[i];
				1859
				1860	switch (op->op) {
				1861	/* request */
				1862	case CEPH_OSD_OP_WRITE:
				1863	case CEPH_OSD_OP_WRITEFULL:
				1864	WARN_ON(op->indata_len != op->extent.length);
				1865	ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
				1866	break;
				1867	case CEPH_OSD_OP_SETXATTR:
				1868	case CEPH_OSD_OP_CMPXATTR:
				1869	WARN_ON(op->indata_len != op->xattr.name_len +
				1870	op->xattr.value_len);
				1871	ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
				1872	break;
				1873	case CEPH_OSD_OP_NOTIFY_ACK:
				1874	ceph_osdc_msg_data_add(msg,
				1875	&op->notify_ack.request_data);
				1876	break;
				1877
				1878	/* reply */
				1879	case CEPH_OSD_OP_STAT:
				1880	ceph_osdc_msg_data_add(req->r_reply,
				1881	&op->raw_data_in);
				1882	break;
				1883	case CEPH_OSD_OP_READ:
				1884	ceph_osdc_msg_data_add(req->r_reply,
				1885	&op->extent.osd_data);
				1886	break;
				1887	case CEPH_OSD_OP_LIST_WATCHERS:
				1888	ceph_osdc_msg_data_add(req->r_reply,
				1889	&op->list_watchers.response_data);
				1890	break;
				1891
				1892	/* both */
				1893	case CEPH_OSD_OP_CALL:
				1894	WARN_ON(op->indata_len != op->cls.class_len +
				1895	op->cls.method_len +
				1896	op->cls.indata_len);
				1897	ceph_osdc_msg_data_add(msg, &op->cls.request_info);
				1898	/* optional, can be NONE */
				1899	ceph_osdc_msg_data_add(msg, &op->cls.request_data);
				1900	/* optional, can be NONE */
				1901	ceph_osdc_msg_data_add(req->r_reply,
				1902	&op->cls.response_data);
				1903	break;
				1904	case CEPH_OSD_OP_NOTIFY:
				1905	ceph_osdc_msg_data_add(msg,
				1906	&op->notify.request_data);
				1907	ceph_osdc_msg_data_add(req->r_reply,
				1908	&op->notify.response_data);
				1909	break;
				1910	}
				1911
				1912	data_len += op->indata_len;
				1913	}
				1914
				1915	WARN_ON(data_len != msg->data_length);
				1916	}
				1917
				1918	static void encode_pgid(void *p, const struct ceph_pg pgid)
				1919	{
				1920	ceph_encode_8(p, 1);
				1921	ceph_encode_64(p, pgid->pool);
				1922	ceph_encode_32(p, pgid->seed);
				1923	ceph_encode_32(p, -1); /* preferred */
				1924	}
				1925
				1926	static void encode_spgid(void *p, const struct ceph_spg spgid)
				1927	{
				1928	ceph_start_encoding(p, 1, 1, CEPH_PGID_ENCODING_LEN + 1);
				1929	encode_pgid(p, &spgid->pgid);
				1930	ceph_encode_8(p, spgid->shard);
				1931	}
				1932
				1933	static void encode_oloc(void *p, void end,
				1934	const struct ceph_object_locator *oloc)
				1935	{
				1936	ceph_start_encoding(p, 5, 4, ceph_oloc_encoding_size(oloc));
				1937	ceph_encode_64(p, oloc->pool);
				1938	ceph_encode_32(p, -1); /* preferred */
				1939	ceph_encode_32(p, 0); /* key len */
				1940	if (oloc->pool_ns)
				1941	ceph_encode_string(p, end, oloc->pool_ns->str,
				1942	oloc->pool_ns->len);
				1943	else
				1944	ceph_encode_32(p, 0);
				1945	}
				1946
				1947	static void encode_request_partial(struct ceph_osd_request *req,
				1948	struct ceph_msg *msg)
				1949	{
				1950	void *p = msg->front.iov_base;
				1951	void *const end = p + msg->front_alloc_len;
				1952	u32 data_len = 0;
				1953	int i;
				1954
				1955	if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
				1956	/* snapshots aren't writeable */
				1957	WARN_ON(req->r_snapid != CEPH_NOSNAP);
				1958	} else {
				1959	WARN_ON(req->r_mtime.tv_sec \|\| req->r_mtime.tv_nsec \|\|
				1960	req->r_data_offset \|\| req->r_snapc);
				1961	}
				1962
				1963	setup_request_data(req, msg);
				1964
				1965	encode_spgid(&p, &req->r_t.spgid); /* actual spg */
				1966	ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */
				1967	ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
				1968	ceph_encode_32(&p, req->r_flags);
				1969
				1970	/* reqid */
				1971	ceph_start_encoding(&p, 2, 2, sizeof(struct ceph_osd_reqid));
				1972	memset(p, 0, sizeof(struct ceph_osd_reqid));
				1973	p += sizeof(struct ceph_osd_reqid);
				1974
				1975	/* trace */
				1976	memset(p, 0, sizeof(struct ceph_blkin_trace_info));
				1977	p += sizeof(struct ceph_blkin_trace_info);
				1978
				1979	ceph_encode_32(&p, 0); /* client_inc, always 0 */
				1980	ceph_encode_timespec64(p, &req->r_mtime);
				1981	p += sizeof(struct ceph_timespec);
				1982
				1983	encode_oloc(&p, end, &req->r_t.target_oloc);
				1984	ceph_encode_string(&p, end, req->r_t.target_oid.name,
				1985	req->r_t.target_oid.name_len);
				1986
				1987	/* ops, can imply data */
				1988	ceph_encode_16(&p, req->r_num_ops);
				1989	for (i = 0; i < req->r_num_ops; i++) {
				1990	data_len += osd_req_encode_op(p, &req->r_ops[i]);
				1991	p += sizeof(struct ceph_osd_op);
				1992	}
				1993
				1994	ceph_encode_64(&p, req->r_snapid); /* snapid */
				1995	if (req->r_snapc) {
				1996	ceph_encode_64(&p, req->r_snapc->seq);
				1997	ceph_encode_32(&p, req->r_snapc->num_snaps);
				1998	for (i = 0; i < req->r_snapc->num_snaps; i++)
				1999	ceph_encode_64(&p, req->r_snapc->snaps[i]);
				2000	} else {
				2001	ceph_encode_64(&p, 0); /* snap_seq */
				2002	ceph_encode_32(&p, 0); /* snaps len */
				2003	}
				2004
				2005	ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
				2006	BUG_ON(p > end - 8); /* space for features */
				2007
				2008	msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */
				2009	/* front_len is finalized in encode_request_finish() */
				2010	msg->front.iov_len = p - msg->front.iov_base;
				2011	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				2012	msg->hdr.data_len = cpu_to_le32(data_len);
				2013	/*
				2014	* The header "data_off" is a hint to the receiver allowing it
				2015	* to align received data into its buffers such that there's no
				2016	* need to re-copy it before writing it to disk (direct I/O).
				2017	*/
				2018	msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
				2019
				2020	dout("%s req %p msg %p oid %s oid_len %d\n", __func__, req, msg,
				2021	req->r_t.target_oid.name, req->r_t.target_oid.name_len);
				2022	}
				2023
				2024	static void encode_request_finish(struct ceph_msg *msg)
				2025	{
				2026	void *p = msg->front.iov_base;
				2027	void *const partial_end = p + msg->front.iov_len;
				2028	void *const end = p + msg->front_alloc_len;
				2029
				2030	if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) {
				2031	/* luminous OSD -- encode features and be done */
				2032	p = partial_end;
				2033	ceph_encode_64(&p, msg->con->peer_features);
				2034	} else {
				2035	struct {
				2036	char spgid[CEPH_ENCODING_START_BLK_LEN +
				2037	CEPH_PGID_ENCODING_LEN + 1];
				2038	__le32 hash;
				2039	__le32 epoch;
				2040	__le32 flags;
				2041	char reqid[CEPH_ENCODING_START_BLK_LEN +
				2042	sizeof(struct ceph_osd_reqid)];
				2043	char trace[sizeof(struct ceph_blkin_trace_info)];
				2044	__le32 client_inc;
				2045	struct ceph_timespec mtime;
				2046	} __packed head;
				2047	struct ceph_pg pgid;
				2048	void oloc, oid, *tail;
				2049	int oloc_len, oid_len, tail_len;
				2050	int len;
				2051
				2052	/*
				2053	* Pre-luminous OSD -- reencode v8 into v4 using @head
				2054	* as a temporary buffer. Encode the raw PG; the rest
				2055	* is just a matter of moving oloc, oid and tail blobs
				2056	* around.
				2057	*/
				2058	memcpy(&head, p, sizeof(head));
				2059	p += sizeof(head);
				2060
				2061	oloc = p;
				2062	p += CEPH_ENCODING_START_BLK_LEN;
				2063	pgid.pool = ceph_decode_64(&p);
				2064	p += 4 + 4; /* preferred, key len */
				2065	len = ceph_decode_32(&p);
				2066	p += len; /* nspace */
				2067	oloc_len = p - oloc;
				2068
				2069	oid = p;
				2070	len = ceph_decode_32(&p);
				2071	p += len;
				2072	oid_len = p - oid;
				2073
				2074	tail = p;
				2075	tail_len = partial_end - p;
				2076
				2077	p = msg->front.iov_base;
				2078	ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc));
				2079	ceph_encode_copy(&p, &head.epoch, sizeof(head.epoch));
				2080	ceph_encode_copy(&p, &head.flags, sizeof(head.flags));
				2081	ceph_encode_copy(&p, &head.mtime, sizeof(head.mtime));
				2082
				2083	/* reassert_version */
				2084	memset(p, 0, sizeof(struct ceph_eversion));
				2085	p += sizeof(struct ceph_eversion);
				2086
				2087	BUG_ON(p >= oloc);
				2088	memmove(p, oloc, oloc_len);
				2089	p += oloc_len;
				2090
				2091	pgid.seed = le32_to_cpu(head.hash);
				2092	encode_pgid(&p, &pgid); /* raw pg */
				2093
				2094	BUG_ON(p >= oid);
				2095	memmove(p, oid, oid_len);
				2096	p += oid_len;
				2097
				2098	/* tail -- ops, snapid, snapc, retry_attempt */
				2099	BUG_ON(p >= tail);
				2100	memmove(p, tail, tail_len);
				2101	p += tail_len;
				2102
				2103	msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
				2104	}
				2105
				2106	BUG_ON(p > end);
				2107	msg->front.iov_len = p - msg->front.iov_base;
				2108	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				2109
				2110	dout("%s msg %p tid %llu %u+%u+%u v%d\n", __func__, msg,
				2111	le64_to_cpu(msg->hdr.tid), le32_to_cpu(msg->hdr.front_len),
				2112	le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len),
				2113	le16_to_cpu(msg->hdr.version));
				2114	}
				2115
				2116	/*
				2117	* @req has to be assigned a tid and registered.
				2118	*/
				2119	static void send_request(struct ceph_osd_request *req)
				2120	{
				2121	struct ceph_osd *osd = req->r_osd;
				2122
				2123	verify_osd_locked(osd);
				2124	WARN_ON(osd->o_osd != req->r_t.osd);
				2125
				2126	/* backoff? */
				2127	if (should_plug_request(req))
				2128	return;
				2129
				2130	/*
				2131	* We may have a previously queued request message hanging
				2132	* around. Cancel it to avoid corrupting the msgr.
				2133	*/
				2134	if (req->r_sent)
				2135	ceph_msg_revoke(req->r_request);
				2136
				2137	req->r_flags \|= CEPH_OSD_FLAG_KNOWN_REDIR;
				2138	if (req->r_attempts)
				2139	req->r_flags \|= CEPH_OSD_FLAG_RETRY;
				2140	else
				2141	WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
				2142
				2143	encode_request_partial(req, req->r_request);
				2144
				2145	dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d e%u flags 0x%x attempt %d\n",
				2146	__func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
				2147	req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed,
				2148	req->r_t.spgid.shard, osd->o_osd, req->r_t.epoch, req->r_flags,
				2149	req->r_attempts);
				2150
				2151	req->r_t.paused = false;
				2152	req->r_stamp = jiffies;
				2153	req->r_attempts++;
				2154
				2155	req->r_sent = osd->o_incarnation;
				2156	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
				2157	ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
				2158	}
				2159
				2160	static void maybe_request_map(struct ceph_osd_client *osdc)
				2161	{
				2162	bool continuous = false;
				2163
				2164	verify_osdc_locked(osdc);
				2165	WARN_ON(!osdc->osdmap->epoch);
				2166
				2167	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				2168	ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) \|\|
				2169	ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
				2170	dout("%s osdc %p continuous\n", __func__, osdc);
				2171	continuous = true;
				2172	} else {
				2173	dout("%s osdc %p onetime\n", __func__, osdc);
				2174	}
				2175
				2176	if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
				2177	osdc->osdmap->epoch + 1, continuous))
				2178	ceph_monc_renew_subs(&osdc->client->monc);
				2179	}
				2180
				/* Forward declarations: both are defined further down but are called by __submit_request(). */
				2181	static void complete_request(struct ceph_osd_request *req, int err);
				2182	static void send_map_check(struct ceph_osd_request *req);
				2183
				2184	static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
				2185	{
				2186	struct ceph_osd_client *osdc = req->r_osdc;
				2187	struct ceph_osd *osd;
				2188	enum calc_target_result ct_res;
				2189	int err = 0;
				2190	bool need_send = false;
				2191	bool promoted = false;
				2192
				2193	WARN_ON(req->r_tid);
				2194	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
				2195
				2196	again:
				2197	ct_res = calc_target(osdc, &req->r_t, NULL, false);
				2198	if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
				2199	goto promote;
				2200
				2201	osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
				2202	if (IS_ERR(osd)) {
				2203	WARN_ON(PTR_ERR(osd) != -EAGAIN \|\| wrlocked);
				2204	goto promote;
				2205	}
				2206
				2207	if (osdc->abort_err) {
				2208	dout("req %p abort_err %d\n", req, osdc->abort_err);
				2209	err = osdc->abort_err;
				2210	} else if (osdc->osdmap->epoch < osdc->epoch_barrier) {
				2211	dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch,
				2212	osdc->epoch_barrier);
				2213	req->r_t.paused = true;
				2214	maybe_request_map(osdc);
				2215	} else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
				2216	ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
				2217	dout("req %p pausewr\n", req);
				2218	req->r_t.paused = true;
				2219	maybe_request_map(osdc);
				2220	} else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
				2221	ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
				2222	dout("req %p pauserd\n", req);
				2223	req->r_t.paused = true;
				2224	maybe_request_map(osdc);
				2225	} else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
				2226	!(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY \|
				2227	CEPH_OSD_FLAG_FULL_FORCE)) &&
				2228	(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				2229	pool_full(osdc, req->r_t.base_oloc.pool))) {
				2230	dout("req %p full/pool_full\n", req);
				2231	if (osdc->abort_on_full) {
				2232	err = -ENOSPC;
				2233	} else {
				2234	pr_warn_ratelimited("FULL or reached pool quota\n");
				2235	req->r_t.paused = true;
				2236	maybe_request_map(osdc);
				2237	}
				2238	} else if (!osd_homeless(osd)) {
				2239	need_send = true;
				2240	} else {
				2241	maybe_request_map(osdc);
				2242	}
				2243
				2244	mutex_lock(&osd->lock);
				2245	/*
				2246	* Assign the tid atomically with send_request() to protect
				2247	* multiple writes to the same object from racing with each
				2248	* other, resulting in out of order ops on the OSDs.
				2249	*/
				2250	req->r_tid = atomic64_inc_return(&osdc->last_tid);
				2251	link_request(osd, req);
				2252	if (need_send)
				2253	send_request(req);
				2254	else if (err)
				2255	complete_request(req, err);
				2256	mutex_unlock(&osd->lock);
				2257
				2258	if (!err && ct_res == CALC_TARGET_POOL_DNE)
				2259	send_map_check(req);
				2260
				2261	if (promoted)
				2262	downgrade_write(&osdc->lock);
				2263	return;
				2264
				2265	promote:
				2266	up_read(&osdc->lock);
				2267	down_write(&osdc->lock);
				2268	wrlocked = true;
				2269	promoted = true;
				2270	goto again;
				2271	}
				2272
				2273	static void account_request(struct ceph_osd_request *req)
				2274	{
				2275	WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK \| CEPH_OSD_FLAG_ONDISK));
				2276	WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ \| CEPH_OSD_FLAG_WRITE)));
				2277
				2278	req->r_flags \|= CEPH_OSD_FLAG_ONDISK;
				2279	atomic_inc(&req->r_osdc->num_requests);
				2280
				2281	req->r_start_stamp = jiffies;
				2282	}
				2283
				2284	static void submit_request(struct ceph_osd_request *req, bool wrlocked)
				2285	{
				2286	ceph_osdc_get_request(req);
				2287	account_request(req);
				2288	__submit_request(req, wrlocked);
				2289	}
				2290
				2291	static void finish_request(struct ceph_osd_request *req)
				2292	{
				2293	struct ceph_osd_client *osdc = req->r_osdc;
				2294
				2295	WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
				2296	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
				2297
				2298	if (req->r_osd)
				2299	unlink_request(req->r_osd, req);
				2300	atomic_dec(&osdc->num_requests);
				2301
				2302	/*
				2303	* If an OSD has failed or returned and a request has been sent
				2304	* twice, it's possible to get a reply and end up here while the
				2305	* request message is queued for delivery. We will ignore the
				2306	* reply, so not a big deal, but better to try and catch it.
				2307	*/
				2308	ceph_msg_revoke(req->r_request);
				2309	ceph_msg_revoke_incoming(req->r_reply);
				2310	}
				2311
				2312	static void __complete_request(struct ceph_osd_request *req)
				2313	{
				2314	dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
				2315	req->r_tid, req->r_callback, req->r_result);
				2316
				2317	if (req->r_callback)
				2318	req->r_callback(req);
				2319	complete_all(&req->r_completion);
				2320	ceph_osdc_put_request(req);
				2321	}
				2322
				2323	static void complete_request_workfn(struct work_struct *work)
				2324	{
				2325	struct ceph_osd_request *req =
				2326	container_of(work, struct ceph_osd_request, r_complete_work);
				2327
				2328	__complete_request(req);
				2329	}
				2330
				2331	/*
				2332	* This is open-coded in handle_reply().
				2333	*/
				2334	static void complete_request(struct ceph_osd_request *req, int err)
				2335	{
				2336	dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
				2337
				2338	req->r_result = err;
				2339	finish_request(req);
				2340
				2341	INIT_WORK(&req->r_complete_work, complete_request_workfn);
				2342	queue_work(req->r_osdc->completion_wq, &req->r_complete_work);
				2343	}
				2344
				2345	static void cancel_map_check(struct ceph_osd_request *req)
				2346	{
				2347	struct ceph_osd_client *osdc = req->r_osdc;
				2348	struct ceph_osd_request *lookup_req;
				2349
				2350	verify_osdc_wrlocked(osdc);
				2351
				2352	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
				2353	if (!lookup_req)
				2354	return;
				2355
				2356	WARN_ON(lookup_req != req);
				2357	erase_request_mc(&osdc->map_checks, req);
				2358	ceph_osdc_put_request(req);
				2359	}
				2360
				2361	static void cancel_request(struct ceph_osd_request *req)
				2362	{
				2363	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
				2364
				2365	cancel_map_check(req);
				2366	finish_request(req);
				2367	complete_all(&req->r_completion);
				2368	ceph_osdc_put_request(req);
				2369	}
				2370
				2371	static void abort_request(struct ceph_osd_request *req, int err)
				2372	{
				2373	dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
				2374
				2375	cancel_map_check(req);
				2376	complete_request(req, err);
				2377	}
				2378
				/*
				 * for_each_request() callback used by ceph_osdc_abort_requests().
				 *
				 * @req: request to abort
				 * @arg: pointer to the int error code to abort with
				 *
				 * Always returns 0 so that the iteration continues.
				 */
				static int abort_fn(struct ceph_osd_request *req, void *arg)
				{
					int err = *(int *)arg;

					abort_request(req, err);
					return 0; /* continue iteration */
				}
				2386
				2387	/*
				2388	* Abort all in-flight requests with @err and arrange for all future
				2389	* requests to be failed immediately.
				2390	*/
				2391	void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err)
				2392	{
				2393	dout("%s osdc %p err %d\n", __func__, osdc, err);
				2394	down_write(&osdc->lock);
				2395	for_each_request(osdc, abort_fn, &err);
				2396	osdc->abort_err = err;
				2397	up_write(&osdc->lock);
				2398	}
				2399	EXPORT_SYMBOL(ceph_osdc_abort_requests);
				2400
				2401	static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
				2402	{
				2403	if (likely(eb > osdc->epoch_barrier)) {
				2404	dout("updating epoch_barrier from %u to %u\n",
				2405	osdc->epoch_barrier, eb);
				2406	osdc->epoch_barrier = eb;
				2407	/* Request map if we're not to the barrier yet */
				2408	if (eb > osdc->osdmap->epoch)
				2409	maybe_request_map(osdc);
				2410	}
				2411	}
				2412
				2413	void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
				2414	{
				2415	down_read(&osdc->lock);
				2416	if (unlikely(eb > osdc->epoch_barrier)) {
				2417	up_read(&osdc->lock);
				2418	down_write(&osdc->lock);
				2419	update_epoch_barrier(osdc, eb);
				2420	up_write(&osdc->lock);
				2421	} else {
				2422	up_read(&osdc->lock);
				2423	}
				2424	}
				2425	EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier);
				2426
				2427	/*
				2428	* We can end up releasing caps as a result of abort_request().
				2429	* In that case, we probably want to ensure that the cap release message
				2430	* has an updated epoch barrier in it, so set the epoch barrier prior to
				2431	* aborting the first request.
				2432	*/
				2433	static int abort_on_full_fn(struct ceph_osd_request req, void arg)
				2434	{
				2435	struct ceph_osd_client *osdc = req->r_osdc;
				2436	bool *victims = arg;
				2437
				2438	if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
				2439	(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				2440	pool_full(osdc, req->r_t.base_oloc.pool))) {
				2441	if (!*victims) {
				2442	update_epoch_barrier(osdc, osdc->osdmap->epoch);
				2443	*victims = true;
				2444	}
				2445	abort_request(req, -ENOSPC);
				2446	}
				2447
				2448	return 0; /* continue iteration */
				2449	}
				2450
				2451	/*
				2452	* Drop all pending requests that are stalled waiting on a full condition to
				2453	* clear, and complete them with ENOSPC as the return code. Set the
				2454	* osdc->epoch_barrier to the latest map epoch that we've seen if any were
				2455	* cancelled.
				2456	*/
				2457	static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
				2458	{
				2459	bool victims = false;
				2460
				2461	if (osdc->abort_on_full &&
				2462	(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\| have_pool_full(osdc)))
				2463	for_each_request(osdc, abort_on_full_fn, &victims);
				2464	}
				2465
				2466	static void check_pool_dne(struct ceph_osd_request *req)
				2467	{
				2468	struct ceph_osd_client *osdc = req->r_osdc;
				2469	struct ceph_osdmap *map = osdc->osdmap;
				2470
				2471	verify_osdc_wrlocked(osdc);
				2472	WARN_ON(!map->epoch);
				2473
				2474	if (req->r_attempts) {
				2475	/*
				2476	* We sent a request earlier, which means that
				2477	* previously the pool existed, and now it does not
				2478	* (i.e., it was deleted).
				2479	*/
				2480	req->r_map_dne_bound = map->epoch;
				2481	dout("%s req %p tid %llu pool disappeared\n", __func__, req,
				2482	req->r_tid);
				2483	} else {
				2484	dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
				2485	req, req->r_tid, req->r_map_dne_bound, map->epoch);
				2486	}
				2487
				2488	if (req->r_map_dne_bound) {
				2489	if (map->epoch >= req->r_map_dne_bound) {
				2490	/* we had a new enough map */
				2491	pr_info_ratelimited("tid %llu pool does not exist\n",
				2492	req->r_tid);
				2493	complete_request(req, -ENOENT);
				2494	}
				2495	} else {
				2496	send_map_check(req);
				2497	}
				2498	}
				2499
				2500	static void map_check_cb(struct ceph_mon_generic_request *greq)
				2501	{
				2502	struct ceph_osd_client *osdc = &greq->monc->client->osdc;
				2503	struct ceph_osd_request *req;
				2504	u64 tid = greq->private_data;
				2505
				2506	WARN_ON(greq->result \|\| !greq->u.newest);
				2507
				2508	down_write(&osdc->lock);
				2509	req = lookup_request_mc(&osdc->map_checks, tid);
				2510	if (!req) {
				2511	dout("%s tid %llu dne\n", __func__, tid);
				2512	goto out_unlock;
				2513	}
				2514
				2515	dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
				2516	req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
				2517	if (!req->r_map_dne_bound)
				2518	req->r_map_dne_bound = greq->u.newest;
				2519	erase_request_mc(&osdc->map_checks, req);
				2520	check_pool_dne(req);
				2521
				2522	ceph_osdc_put_request(req);
				2523	out_unlock:
				2524	up_write(&osdc->lock);
				2525	}
				2526
				2527	static void send_map_check(struct ceph_osd_request *req)
				2528	{
				2529	struct ceph_osd_client *osdc = req->r_osdc;
				2530	struct ceph_osd_request *lookup_req;
				2531	int ret;
				2532
				2533	verify_osdc_wrlocked(osdc);
				2534
				2535	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
				2536	if (lookup_req) {
				2537	WARN_ON(lookup_req != req);
				2538	return;
				2539	}
				2540
				2541	ceph_osdc_get_request(req);
				2542	insert_request_mc(&osdc->map_checks, req);
				2543	ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
				2544	map_check_cb, req->r_tid);
				2545	WARN_ON(ret);
				2546	}
				2547
				2548	/*
				2549	* lingering requests, watch/notify v2 infrastructure
				2550	*/
				2551	static void linger_release(struct kref *kref)
				2552	{
				2553	struct ceph_osd_linger_request *lreq =
				2554	container_of(kref, struct ceph_osd_linger_request, kref);
				2555
				2556	dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
				2557	lreq->reg_req, lreq->ping_req);
				2558	WARN_ON(!RB_EMPTY_NODE(&lreq->node));
				2559	WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
				2560	WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
				2561	WARN_ON(!list_empty(&lreq->scan_item));
				2562	WARN_ON(!list_empty(&lreq->pending_lworks));
				2563	WARN_ON(lreq->osd);
				2564
				2565	if (lreq->reg_req)
				2566	ceph_osdc_put_request(lreq->reg_req);
				2567	if (lreq->ping_req)
				2568	ceph_osdc_put_request(lreq->ping_req);
				2569	target_destroy(&lreq->t);
				2570	kfree(lreq);
				2571	}
				2572
				2573	static void linger_put(struct ceph_osd_linger_request *lreq)
				2574	{
				2575	if (lreq)
				2576	kref_put(&lreq->kref, linger_release);
				2577	}
				2578
				2579	static struct ceph_osd_linger_request *
				2580	linger_get(struct ceph_osd_linger_request *lreq)
				2581	{
				2582	kref_get(&lreq->kref);
				2583	return lreq;
				2584	}
				2585
				2586	static struct ceph_osd_linger_request *
				2587	linger_alloc(struct ceph_osd_client *osdc)
				2588	{
				2589	struct ceph_osd_linger_request *lreq;
				2590
				2591	lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
				2592	if (!lreq)
				2593	return NULL;
				2594
				2595	kref_init(&lreq->kref);
				2596	mutex_init(&lreq->lock);
				2597	RB_CLEAR_NODE(&lreq->node);
				2598	RB_CLEAR_NODE(&lreq->osdc_node);
				2599	RB_CLEAR_NODE(&lreq->mc_node);
				2600	INIT_LIST_HEAD(&lreq->scan_item);
				2601	INIT_LIST_HEAD(&lreq->pending_lworks);
				2602	init_completion(&lreq->reg_commit_wait);
				2603	init_completion(&lreq->notify_finish_wait);
				2604
				2605	lreq->osdc = osdc;
				2606	target_init(&lreq->t);
				2607
				2608	dout("%s lreq %p\n", __func__, lreq);
				2609	return lreq;
				2610	}
				2611
				/*
				 * rbtree helpers for linger requests, all keyed by linger_id:
				 *  - "linger" uses ->node; insert_linger()/erase_linger() operate on
				 *    osd->o_linger_requests
				 *  - "linger_osdc" uses ->osdc_node; insert_linger_osdc()/
				 *    erase_linger_osdc() operate on osdc->linger_requests
				 *  - "linger_mc" uses ->mc_node (NOTE(review): presumably the linger
				 *    map-check tree; its users are outside this section)
				 */
				2612	DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
				2613	DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
				2614	DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
				2615
				2616	/*
				2617	* Create linger request <-> OSD session relation.
				2618	*
				2619	* @lreq has to be registered, @osd may be homeless.
				2620	*/
				2621	static void link_linger(struct ceph_osd *osd,
				2622	struct ceph_osd_linger_request *lreq)
				2623	{
				2624	verify_osd_locked(osd);
				2625	WARN_ON(!lreq->linger_id \|\| lreq->osd);
				2626	dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
				2627	osd->o_osd, lreq, lreq->linger_id);
				2628
				2629	if (!osd_homeless(osd))
				2630	__remove_osd_from_lru(osd);
				2631	else
				2632	atomic_inc(&osd->o_osdc->num_homeless);
				2633
				2634	get_osd(osd);
				2635	insert_linger(&osd->o_linger_requests, lreq);
				2636	lreq->osd = osd;
				2637	}
				2638
				2639	static void unlink_linger(struct ceph_osd *osd,
				2640	struct ceph_osd_linger_request *lreq)
				2641	{
				2642	verify_osd_locked(osd);
				2643	WARN_ON(lreq->osd != osd);
				2644	dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
				2645	osd->o_osd, lreq, lreq->linger_id);
				2646
				2647	lreq->osd = NULL;
				2648	erase_linger(&osd->o_linger_requests, lreq);
				2649	put_osd(osd);
				2650
				2651	if (!osd_homeless(osd))
				2652	maybe_move_osd_to_lru(osd);
				2653	else
				2654	atomic_dec(&osd->o_osdc->num_homeless);
				2655	}
				2656
				2657	static bool __linger_registered(struct ceph_osd_linger_request *lreq)
				2658	{
				2659	verify_osdc_locked(lreq->osdc);
				2660
				2661	return !RB_EMPTY_NODE(&lreq->osdc_node);
				2662	}
				2663
				2664	static bool linger_registered(struct ceph_osd_linger_request *lreq)
				2665	{
				2666	struct ceph_osd_client *osdc = lreq->osdc;
				2667	bool registered;
				2668
				2669	down_read(&osdc->lock);
				2670	registered = __linger_registered(lreq);
				2671	up_read(&osdc->lock);
				2672
				2673	return registered;
				2674	}
				2675
				2676	static void linger_register(struct ceph_osd_linger_request *lreq)
				2677	{
				2678	struct ceph_osd_client *osdc = lreq->osdc;
				2679
				2680	verify_osdc_wrlocked(osdc);
				2681	WARN_ON(lreq->linger_id);
				2682
				2683	linger_get(lreq);
				2684	lreq->linger_id = ++osdc->last_linger_id;
				2685	insert_linger_osdc(&osdc->linger_requests, lreq);
				2686	}
				2687
				2688	static void linger_unregister(struct ceph_osd_linger_request *lreq)
				2689	{
				2690	struct ceph_osd_client *osdc = lreq->osdc;
				2691
				2692	verify_osdc_wrlocked(osdc);
				2693
				2694	erase_linger_osdc(&osdc->linger_requests, lreq);
				2695	linger_put(lreq);
				2696	}
				2697
				2698	static void cancel_linger_request(struct ceph_osd_request *req)
				2699	{
				2700	struct ceph_osd_linger_request *lreq = req->r_priv;
				2701
				2702	WARN_ON(!req->r_linger);
				2703	cancel_request(req);
				2704	linger_put(lreq);
				2705	}
				2706
				2707	struct linger_work {
				2708	struct work_struct work;
				2709	struct ceph_osd_linger_request *lreq;
				2710	struct list_head pending_item;
				2711	unsigned long queued_stamp;
				2712
				2713	union {
				2714	struct {
				2715	u64 notify_id;
				2716	u64 notifier_id;
				2717	void payload; / points into @msg front */
				2718	size_t payload_len;
				2719
				2720	struct ceph_msg msg; / for ceph_msg_put() */
				2721	} notify;
				2722	struct {
				2723	int err;
				2724	} error;
				2725	};
				2726	};
				2727
				2728	static struct linger_work lwork_alloc(struct ceph_osd_linger_request lreq,
				2729	work_func_t workfn)
				2730	{
				2731	struct linger_work *lwork;
				2732
				2733	lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
				2734	if (!lwork)
				2735	return NULL;
				2736
				2737	INIT_WORK(&lwork->work, workfn);
				2738	INIT_LIST_HEAD(&lwork->pending_item);
				2739	lwork->lreq = linger_get(lreq);
				2740
				2741	return lwork;
				2742	}
				2743
				2744	static void lwork_free(struct linger_work *lwork)
				2745	{
				2746	struct ceph_osd_linger_request *lreq = lwork->lreq;
				2747
				2748	mutex_lock(&lreq->lock);
				2749	list_del(&lwork->pending_item);
				2750	mutex_unlock(&lreq->lock);
				2751
				2752	linger_put(lreq);
				2753	kfree(lwork);
				2754	}
				2755
				2756	static void lwork_queue(struct linger_work *lwork)
				2757	{
				2758	struct ceph_osd_linger_request *lreq = lwork->lreq;
				2759	struct ceph_osd_client *osdc = lreq->osdc;
				2760
				2761	verify_lreq_locked(lreq);
				2762	WARN_ON(!list_empty(&lwork->pending_item));
				2763
				2764	lwork->queued_stamp = jiffies;
				2765	list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
				2766	queue_work(osdc->notify_wq, &lwork->work);
				2767	}
				2768
				2769	static void do_watch_notify(struct work_struct *w)
				2770	{
				2771	struct linger_work *lwork = container_of(w, struct linger_work, work);
				2772	struct ceph_osd_linger_request *lreq = lwork->lreq;
				2773
				2774	if (!linger_registered(lreq)) {
				2775	dout("%s lreq %p not registered\n", __func__, lreq);
				2776	goto out;
				2777	}
				2778
				2779	WARN_ON(!lreq->is_watch);
				2780	dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
				2781	__func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
				2782	lwork->notify.payload_len);
				2783	lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
				2784	lwork->notify.notifier_id, lwork->notify.payload,
				2785	lwork->notify.payload_len);
				2786
				2787	out:
				2788	ceph_msg_put(lwork->notify.msg);
				2789	lwork_free(lwork);
				2790	}
				2791
				2792	static void do_watch_error(struct work_struct *w)
				2793	{
				2794	struct linger_work *lwork = container_of(w, struct linger_work, work);
				2795	struct ceph_osd_linger_request *lreq = lwork->lreq;
				2796
				2797	if (!linger_registered(lreq)) {
				2798	dout("%s lreq %p not registered\n", __func__, lreq);
				2799	goto out;
				2800	}
				2801
				2802	dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
				2803	lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
				2804
				2805	out:
				2806	lwork_free(lwork);
				2807	}
				2808
				2809	static void queue_watch_error(struct ceph_osd_linger_request *lreq)
				2810	{
				2811	struct linger_work *lwork;
				2812
				2813	lwork = lwork_alloc(lreq, do_watch_error);
				2814	if (!lwork) {
				2815	pr_err("failed to allocate error-lwork\n");
				2816	return;
				2817	}
				2818
				2819	lwork->error.err = lreq->last_error;
				2820	lwork_queue(lwork);
				2821	}
				2822
				2823	static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
				2824	int result)
				2825	{
				2826	if (!completion_done(&lreq->reg_commit_wait)) {
				2827	lreq->reg_commit_error = (result <= 0 ? result : 0);
				2828	complete_all(&lreq->reg_commit_wait);
				2829	}
				2830	}
				2831
				2832	static void linger_commit_cb(struct ceph_osd_request *req)
				2833	{
				2834	struct ceph_osd_linger_request *lreq = req->r_priv;
				2835
				2836	mutex_lock(&lreq->lock);
				2837	dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
				2838	lreq->linger_id, req->r_result);
				2839	linger_reg_commit_complete(lreq, req->r_result);
				2840	lreq->committed = true;
				2841
				2842	if (!lreq->is_watch) {
				2843	struct ceph_osd_data *osd_data =
				2844	osd_req_op_data(req, 0, notify, response_data);
				2845	void *p = page_address(osd_data->pages[0]);
				2846
				2847	WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY \|\|
				2848	osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
				2849
				2850	/* make note of the notify_id */
				2851	if (req->r_ops[0].outdata_len >= sizeof(u64)) {
				2852	lreq->notify_id = ceph_decode_64(&p);
				2853	dout("lreq %p notify_id %llu\n", lreq,
				2854	lreq->notify_id);
				2855	} else {
				2856	dout("lreq %p no notify_id\n", lreq);
				2857	}
				2858	}
				2859
				2860	mutex_unlock(&lreq->lock);
				2861	linger_put(lreq);
				2862	}
				2863
				/*
				 * Map -ENOENT to -ENOTCONN and pass every other value through.
				 *
				 * This makes a delete->disconnection notification and a failure to
				 * reconnect because we raced with the delete look the same to the
				 * user.
				 */
				static int normalize_watch_error(int err)
				{
					return err == -ENOENT ? -ENOTCONN : err;
				}
				2876
				2877	static void linger_reconnect_cb(struct ceph_osd_request *req)
				2878	{
				2879	struct ceph_osd_linger_request *lreq = req->r_priv;
				2880
				2881	mutex_lock(&lreq->lock);
				2882	dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
				2883	lreq, lreq->linger_id, req->r_result, lreq->last_error);
				2884	if (req->r_result < 0) {
				2885	if (!lreq->last_error) {
				2886	lreq->last_error = normalize_watch_error(req->r_result);
				2887	queue_watch_error(lreq);
				2888	}
				2889	}
				2890
				2891	mutex_unlock(&lreq->lock);
				2892	linger_put(lreq);
				2893	}
				2894
				2895	static void send_linger(struct ceph_osd_linger_request *lreq)
				2896	{
				2897	struct ceph_osd_request *req = lreq->reg_req;
				2898	struct ceph_osd_req_op *op = &req->r_ops[0];
				2899
				2900	verify_osdc_wrlocked(req->r_osdc);
				2901	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
				2902
				2903	if (req->r_osd)
				2904	cancel_linger_request(req);
				2905
				2906	request_reinit(req);
				2907	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
				2908	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
				2909	req->r_flags = lreq->t.flags;
				2910	req->r_mtime = lreq->mtime;
				2911
				2912	mutex_lock(&lreq->lock);
				2913	if (lreq->is_watch && lreq->committed) {
				2914	WARN_ON(op->op != CEPH_OSD_OP_WATCH \|\|
				2915	op->watch.cookie != lreq->linger_id);
				2916	op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
				2917	op->watch.gen = ++lreq->register_gen;
				2918	dout("lreq %p reconnect register_gen %u\n", lreq,
				2919	op->watch.gen);
				2920	req->r_callback = linger_reconnect_cb;
				2921	} else {
				2922	if (!lreq->is_watch)
				2923	lreq->notify_id = 0;
				2924	else
				2925	WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
				2926	dout("lreq %p register\n", lreq);
				2927	req->r_callback = linger_commit_cb;
				2928	}
				2929	mutex_unlock(&lreq->lock);
				2930
				2931	req->r_priv = linger_get(lreq);
				2932	req->r_linger = true;
				2933
				2934	submit_request(req, true);
				2935	}
				2936
				2937	static void linger_ping_cb(struct ceph_osd_request *req)
				2938	{
				2939	struct ceph_osd_linger_request *lreq = req->r_priv;
				2940
				2941	mutex_lock(&lreq->lock);
				2942	dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
				2943	__func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
				2944	lreq->last_error);
				2945	if (lreq->register_gen == req->r_ops[0].watch.gen) {
				2946	if (!req->r_result) {
				2947	lreq->watch_valid_thru = lreq->ping_sent;
				2948	} else if (!lreq->last_error) {
				2949	lreq->last_error = normalize_watch_error(req->r_result);
				2950	queue_watch_error(lreq);
				2951	}
				2952	} else {
				2953	dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
				2954	lreq->register_gen, req->r_ops[0].watch.gen);
				2955	}
				2956
				2957	mutex_unlock(&lreq->lock);
				2958	linger_put(lreq);
				2959	}
				2960
				2961	static void send_linger_ping(struct ceph_osd_linger_request *lreq)
				2962	{
				2963	struct ceph_osd_client *osdc = lreq->osdc;
				2964	struct ceph_osd_request *req = lreq->ping_req;
				2965	struct ceph_osd_req_op *op = &req->r_ops[0];
				2966
				2967	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
				2968	dout("%s PAUSERD\n", __func__);
				2969	return;
				2970	}
				2971
				2972	lreq->ping_sent = jiffies;
				2973	dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
				2974	__func__, lreq, lreq->linger_id, lreq->ping_sent,
				2975	lreq->register_gen);
				2976
				2977	if (req->r_osd)
				2978	cancel_linger_request(req);
				2979
				2980	request_reinit(req);
				2981	target_copy(&req->r_t, &lreq->t);
				2982
				2983	WARN_ON(op->op != CEPH_OSD_OP_WATCH \|\|
				2984	op->watch.cookie != lreq->linger_id \|\|
				2985	op->watch.op != CEPH_OSD_WATCH_OP_PING);
				2986	op->watch.gen = lreq->register_gen;
				2987	req->r_callback = linger_ping_cb;
				2988	req->r_priv = linger_get(lreq);
				2989	req->r_linger = true;
				2990
				2991	ceph_osdc_get_request(req);
				2992	account_request(req);
				2993	req->r_tid = atomic64_inc_return(&osdc->last_tid);
				2994	link_request(lreq->osd, req);
				2995	send_request(req);
				2996	}
				2997
				2998	static void linger_submit(struct ceph_osd_linger_request *lreq)
				2999	{
				3000	struct ceph_osd_client *osdc = lreq->osdc;
				3001	struct ceph_osd *osd;
				3002
				3003	calc_target(osdc, &lreq->t, NULL, false);
				3004	osd = lookup_create_osd(osdc, lreq->t.osd, true);
				3005	link_linger(osd, lreq);
				3006
				3007	send_linger(lreq);
				3008	}
				3009
				3010	static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
				3011	{
				3012	struct ceph_osd_client *osdc = lreq->osdc;
				3013	struct ceph_osd_linger_request *lookup_lreq;
				3014
				3015	verify_osdc_wrlocked(osdc);
				3016
				3017	lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
				3018	lreq->linger_id);
				3019	if (!lookup_lreq)
				3020	return;
				3021
				3022	WARN_ON(lookup_lreq != lreq);
				3023	erase_linger_mc(&osdc->linger_map_checks, lreq);
				3024	linger_put(lreq);
				3025	}
				3026
				3027	/*
				3028	* @lreq has to be both registered and linked.
				3029	*/
				3030	static void __linger_cancel(struct ceph_osd_linger_request *lreq)
				3031	{
				3032	if (lreq->is_watch && lreq->ping_req->r_osd)
				3033	cancel_linger_request(lreq->ping_req);
				3034	if (lreq->reg_req->r_osd)
				3035	cancel_linger_request(lreq->reg_req);
				3036	cancel_linger_map_check(lreq);
				3037	unlink_linger(lreq->osd, lreq);
				3038	linger_unregister(lreq);
				3039	}
				3040
				3041	static void linger_cancel(struct ceph_osd_linger_request *lreq)
				3042	{
				3043	struct ceph_osd_client *osdc = lreq->osdc;
				3044
				3045	down_write(&osdc->lock);
				3046	if (__linger_registered(lreq))
				3047	__linger_cancel(lreq);
				3048	up_write(&osdc->lock);
				3049	}
				3050
				3051	static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
				3052
				3053	static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
				3054	{
				3055	struct ceph_osd_client *osdc = lreq->osdc;
				3056	struct ceph_osdmap *map = osdc->osdmap;
				3057
				3058	verify_osdc_wrlocked(osdc);
				3059	WARN_ON(!map->epoch);
				3060
				3061	if (lreq->register_gen) {
				3062	lreq->map_dne_bound = map->epoch;
				3063	dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
				3064	lreq, lreq->linger_id);
				3065	} else {
				3066	dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
				3067	__func__, lreq, lreq->linger_id, lreq->map_dne_bound,
				3068	map->epoch);
				3069	}
				3070
				3071	if (lreq->map_dne_bound) {
				3072	if (map->epoch >= lreq->map_dne_bound) {
				3073	/* we had a new enough map */
				3074	pr_info("linger_id %llu pool does not exist\n",
				3075	lreq->linger_id);
				3076	linger_reg_commit_complete(lreq, -ENOENT);
				3077	__linger_cancel(lreq);
				3078	}
				3079	} else {
				3080	send_linger_map_check(lreq);
				3081	}
				3082	}
				3083
				3084	static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
				3085	{
				3086	struct ceph_osd_client *osdc = &greq->monc->client->osdc;
				3087	struct ceph_osd_linger_request *lreq;
				3088	u64 linger_id = greq->private_data;
				3089
				3090	WARN_ON(greq->result \|\| !greq->u.newest);
				3091
				3092	down_write(&osdc->lock);
				3093	lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
				3094	if (!lreq) {
				3095	dout("%s linger_id %llu dne\n", __func__, linger_id);
				3096	goto out_unlock;
				3097	}
				3098
				3099	dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
				3100	__func__, lreq, lreq->linger_id, lreq->map_dne_bound,
				3101	greq->u.newest);
				3102	if (!lreq->map_dne_bound)
				3103	lreq->map_dne_bound = greq->u.newest;
				3104	erase_linger_mc(&osdc->linger_map_checks, lreq);
				3105	check_linger_pool_dne(lreq);
				3106
				3107	linger_put(lreq);
				3108	out_unlock:
				3109	up_write(&osdc->lock);
				3110	}
				3111
				3112	static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
				3113	{
				3114	struct ceph_osd_client *osdc = lreq->osdc;
				3115	struct ceph_osd_linger_request *lookup_lreq;
				3116	int ret;
				3117
				3118	verify_osdc_wrlocked(osdc);
				3119
				3120	lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
				3121	lreq->linger_id);
				3122	if (lookup_lreq) {
				3123	WARN_ON(lookup_lreq != lreq);
				3124	return;
				3125	}
				3126
				3127	linger_get(lreq);
				3128	insert_linger_mc(&osdc->linger_map_checks, lreq);
				3129	ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
				3130	linger_map_check_cb, lreq->linger_id);
				3131	WARN_ON(ret);
				3132	}
				3133
				3134	static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
				3135	{
				3136	int ret;
				3137
				3138	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
				3139	ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
				3140	return ret ?: lreq->reg_commit_error;
				3141	}
				3142
				3143	static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
				3144	{
				3145	int ret;
				3146
				3147	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
				3148	ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
				3149	return ret ?: lreq->notify_finish_error;
				3150	}
				3151
				3152	/*
				3153	* Timeout callback, called every N seconds. When 1 or more OSD
				3154	* requests has been active for more than N seconds, we send a keepalive
				3155	* (tag + timestamp) to its OSD to ensure any communications channel
				3156	* reset is detected.
				3157	*/
				3158	static void handle_timeout(struct work_struct *work)
				3159	{
				3160	struct ceph_osd_client *osdc =
				3161	container_of(work, struct ceph_osd_client, timeout_work.work);
				3162	struct ceph_options *opts = osdc->client->options;
				3163	unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
				3164	unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
				3165	LIST_HEAD(slow_osds);
				3166	struct rb_node n, p;
				3167
				3168	dout("%s osdc %p\n", __func__, osdc);
				3169	down_write(&osdc->lock);
				3170
				3171	/*
				3172	* ping osds that are a bit slow. this ensures that if there
				3173	* is a break in the TCP connection we will notice, and reopen
				3174	* a connection with that osd (from the fault callback).
				3175	*/
				3176	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
				3177	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				3178	bool found = false;
				3179
				3180	for (p = rb_first(&osd->o_requests); p; ) {
				3181	struct ceph_osd_request *req =
				3182	rb_entry(p, struct ceph_osd_request, r_node);
				3183
				3184	p = rb_next(p); /* abort_request() */
				3185
				3186	if (time_before(req->r_stamp, cutoff)) {
				3187	dout(" req %p tid %llu on osd%d is laggy\n",
				3188	req, req->r_tid, osd->o_osd);
				3189	found = true;
				3190	}
				3191	if (opts->osd_request_timeout &&
				3192	time_before(req->r_start_stamp, expiry_cutoff)) {
				3193	pr_err_ratelimited("tid %llu on osd%d timeout\n",
				3194	req->r_tid, osd->o_osd);
				3195	abort_request(req, -ETIMEDOUT);
				3196	}
				3197	}
				3198	for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
				3199	struct ceph_osd_linger_request *lreq =
				3200	rb_entry(p, struct ceph_osd_linger_request, node);
				3201
				3202	dout(" lreq %p linger_id %llu is served by osd%d\n",
				3203	lreq, lreq->linger_id, osd->o_osd);
				3204	found = true;
				3205
				3206	mutex_lock(&lreq->lock);
				3207	if (lreq->is_watch && lreq->committed && !lreq->last_error)
				3208	send_linger_ping(lreq);
				3209	mutex_unlock(&lreq->lock);
				3210	}
				3211
				3212	if (found)
				3213	list_move_tail(&osd->o_keepalive_item, &slow_osds);
				3214	}
				3215
				3216	if (opts->osd_request_timeout) {
				3217	for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
				3218	struct ceph_osd_request *req =
				3219	rb_entry(p, struct ceph_osd_request, r_node);
				3220
				3221	p = rb_next(p); /* abort_request() */
				3222
				3223	if (time_before(req->r_start_stamp, expiry_cutoff)) {
				3224	pr_err_ratelimited("tid %llu on osd%d timeout\n",
				3225	req->r_tid, osdc->homeless_osd.o_osd);
				3226	abort_request(req, -ETIMEDOUT);
				3227	}
				3228	}
				3229	}
				3230
				3231	if (atomic_read(&osdc->num_homeless) \|\| !list_empty(&slow_osds))
				3232	maybe_request_map(osdc);
				3233
				3234	while (!list_empty(&slow_osds)) {
				3235	struct ceph_osd *osd = list_first_entry(&slow_osds,
				3236	struct ceph_osd,
				3237	o_keepalive_item);
				3238	list_del_init(&osd->o_keepalive_item);
				3239	ceph_con_keepalive(&osd->o_con);
				3240	}
				3241
				3242	up_write(&osdc->lock);
				3243	schedule_delayed_work(&osdc->timeout_work,
				3244	osdc->client->options->osd_keepalive_timeout);
				3245	}
				3246
				3247	static void handle_osds_timeout(struct work_struct *work)
				3248	{
				3249	struct ceph_osd_client *osdc =
				3250	container_of(work, struct ceph_osd_client,
				3251	osds_timeout_work.work);
				3252	unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
				3253	struct ceph_osd osd, nosd;
				3254
				3255	dout("%s osdc %p\n", __func__, osdc);
				3256	down_write(&osdc->lock);
				3257	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
				3258	if (time_before(jiffies, osd->lru_ttl))
				3259	break;
				3260
				3261	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
				3262	WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
				3263	close_osd(osd);
				3264	}
				3265
				3266	up_write(&osdc->lock);
				3267	schedule_delayed_work(&osdc->osds_timeout_work,
				3268	round_jiffies_relative(delay));
				3269	}
				3270
				3271	static int ceph_oloc_decode(void *p, void end,
				3272	struct ceph_object_locator *oloc)
				3273	{
				3274	u8 struct_v, struct_cv;
				3275	u32 len;
				3276	void *struct_end;
				3277	int ret = 0;
				3278
				3279	ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
				3280	struct_v = ceph_decode_8(p);
				3281	struct_cv = ceph_decode_8(p);
				3282	if (struct_v < 3) {
				3283	pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
				3284	struct_v, struct_cv);
				3285	goto e_inval;
				3286	}
				3287	if (struct_cv > 6) {
				3288	pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
				3289	struct_v, struct_cv);
				3290	goto e_inval;
				3291	}
				3292	len = ceph_decode_32(p);
				3293	ceph_decode_need(p, end, len, e_inval);
				3294	struct_end = *p + len;
				3295
				3296	oloc->pool = ceph_decode_64(p);
				3297	p += 4; / skip preferred */
				3298
				3299	len = ceph_decode_32(p);
				3300	if (len > 0) {
				3301	pr_warn("ceph_object_locator::key is set\n");
				3302	goto e_inval;
				3303	}
				3304
				3305	if (struct_v >= 5) {
				3306	bool changed = false;
				3307
				3308	len = ceph_decode_32(p);
				3309	if (len > 0) {
				3310	ceph_decode_need(p, end, len, e_inval);
				3311	if (!oloc->pool_ns \|\|
				3312	ceph_compare_string(oloc->pool_ns, *p, len))
				3313	changed = true;
				3314	*p += len;
				3315	} else {
				3316	if (oloc->pool_ns)
				3317	changed = true;
				3318	}
				3319	if (changed) {
				3320	/* redirect changes namespace */
				3321	pr_warn("ceph_object_locator::nspace is changed\n");
				3322	goto e_inval;
				3323	}
				3324	}
				3325
				3326	if (struct_v >= 6) {
				3327	s64 hash = ceph_decode_64(p);
				3328	if (hash != -1) {
				3329	pr_warn("ceph_object_locator::hash is set\n");
				3330	goto e_inval;
				3331	}
				3332	}
				3333
				3334	/* skip the rest */
				3335	*p = struct_end;
				3336	out:
				3337	return ret;
				3338
				3339	e_inval:
				3340	ret = -EINVAL;
				3341	goto out;
				3342	}
				3343
				3344	static int ceph_redirect_decode(void *p, void end,
				3345	struct ceph_request_redirect *redir)
				3346	{
				3347	u8 struct_v, struct_cv;
				3348	u32 len;
				3349	void *struct_end;
				3350	int ret;
				3351
				3352	ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
				3353	struct_v = ceph_decode_8(p);
				3354	struct_cv = ceph_decode_8(p);
				3355	if (struct_cv > 1) {
				3356	pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
				3357	struct_v, struct_cv);
				3358	goto e_inval;
				3359	}
				3360	len = ceph_decode_32(p);
				3361	ceph_decode_need(p, end, len, e_inval);
				3362	struct_end = *p + len;
				3363
				3364	ret = ceph_oloc_decode(p, end, &redir->oloc);
				3365	if (ret)
				3366	goto out;
				3367
				3368	len = ceph_decode_32(p);
				3369	if (len > 0) {
				3370	pr_warn("ceph_request_redirect::object_name is set\n");
				3371	goto e_inval;
				3372	}
				3373
				3374	len = ceph_decode_32(p);
				3375	p += len; / skip osd_instructions */
				3376
				3377	/* skip the rest */
				3378	*p = struct_end;
				3379	out:
				3380	return ret;
				3381
				3382	e_inval:
				3383	ret = -EINVAL;
				3384	goto out;
				3385	}
				3386
				/* Decoded form of an MOSDOpReply front, filled by decode_MOSDOpReply(). */
				3387	struct MOSDOpReply {
				3388	struct ceph_pg pgid;	/* decoded with ceph_decode_pgid() */
				3389	u64 flags;	/* handle_reply() expects CEPH_OSD_FLAG_ONDISK to be set */
				3390	int result;	/* becomes req->r_result when non-zero */
				3391	u32 epoch;
				3392	int num_ops;	/* bounded by ARRAY_SIZE(outdata_len) while decoding */
				3393	u32 outdata_len[CEPH_OSD_MAX_OPS];	/* per-op payload_len */
				3394	s32 rval[CEPH_OSD_MAX_OPS];	/* per-op return value */
				3395	int retry_attempt;	/* compared against req->r_attempts - 1 */
				3396	struct ceph_eversion replay_version;	/* kept in wire (little-endian) form */
				3397	u64 user_version;
				3398	struct ceph_request_redirect redirect;	/* oloc stays empty when no redirect was decoded */
				3399	};
				3400
				3401	static int decode_MOSDOpReply(const struct ceph_msg msg, struct MOSDOpReply m)
				3402	{
				3403	void *p = msg->front.iov_base;
				3404	void *const end = p + msg->front.iov_len;
				3405	u16 version = le16_to_cpu(msg->hdr.version);
				3406	struct ceph_eversion bad_replay_version;
				3407	u8 decode_redir;
				3408	u32 len;
				3409	int ret;
				3410	int i;
				3411
				3412	ceph_decode_32_safe(&p, end, len, e_inval);
				3413	ceph_decode_need(&p, end, len, e_inval);
				3414	p += len; /* skip oid */
				3415
				3416	ret = ceph_decode_pgid(&p, end, &m->pgid);
				3417	if (ret)
				3418	return ret;
				3419
				3420	ceph_decode_64_safe(&p, end, m->flags, e_inval);
				3421	ceph_decode_32_safe(&p, end, m->result, e_inval);
				3422	ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
				3423	memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
				3424	p += sizeof(bad_replay_version);
				3425	ceph_decode_32_safe(&p, end, m->epoch, e_inval);
				3426
				3427	ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
				3428	if (m->num_ops > ARRAY_SIZE(m->outdata_len))
				3429	goto e_inval;
				3430
				3431	ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
				3432	e_inval);
				3433	for (i = 0; i < m->num_ops; i++) {
				3434	struct ceph_osd_op *op = p;
				3435
				3436	m->outdata_len[i] = le32_to_cpu(op->payload_len);
				3437	p += sizeof(*op);
				3438	}
				3439
				3440	ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
				3441	for (i = 0; i < m->num_ops; i++)
				3442	ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
				3443
				3444	if (version >= 5) {
				3445	ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
				3446	memcpy(&m->replay_version, p, sizeof(m->replay_version));
				3447	p += sizeof(m->replay_version);
				3448	ceph_decode_64_safe(&p, end, m->user_version, e_inval);
				3449	} else {
				3450	m->replay_version = bad_replay_version; /* struct */
				3451	m->user_version = le64_to_cpu(m->replay_version.version);
				3452	}
				3453
				3454	if (version >= 6) {
				3455	if (version >= 7)
				3456	ceph_decode_8_safe(&p, end, decode_redir, e_inval);
				3457	else
				3458	decode_redir = 1;
				3459	} else {
				3460	decode_redir = 0;
				3461	}
				3462
				3463	if (decode_redir) {
				3464	ret = ceph_redirect_decode(&p, end, &m->redirect);
				3465	if (ret)
				3466	return ret;
				3467	} else {
				3468	ceph_oloc_init(&m->redirect.oloc);
				3469	}
				3470
				3471	return 0;
				3472
				3473	e_inval:
				3474	return -EINVAL;
				3475	}
				3476
				3477	/*
				3478	* Handle MOSDOpReply. Set ->r_result and call the callback if it is
				3479	* specified.
				3480	*/
				3481	static void handle_reply(struct ceph_osd osd, struct ceph_msg msg)
				3482	{
				3483	struct ceph_osd_client *osdc = osd->o_osdc;
				3484	struct ceph_osd_request *req;
				3485	struct MOSDOpReply m;
				3486	u64 tid = le64_to_cpu(msg->hdr.tid);
				3487	u32 data_len = 0;
				3488	int ret;
				3489	int i;
				3490
				3491	dout("%s msg %p tid %llu\n", __func__, msg, tid);
				3492
				3493	down_read(&osdc->lock);
				3494	if (!osd_registered(osd)) {
				3495	dout("%s osd%d unknown\n", __func__, osd->o_osd);
				3496	goto out_unlock_osdc;
				3497	}
				3498	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
				3499
				3500	mutex_lock(&osd->lock);
				3501	req = lookup_request(&osd->o_requests, tid);
				3502	if (!req) {
				3503	dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
				3504	goto out_unlock_session;
				3505	}
				3506
				3507	m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns;
				3508	ret = decode_MOSDOpReply(msg, &m);
				3509	m.redirect.oloc.pool_ns = NULL;
				3510	if (ret) {
				3511	pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
				3512	req->r_tid, ret);
				3513	ceph_msg_dump(msg);
				3514	goto fail_request;
				3515	}
				3516	dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
				3517	__func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
				3518	m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
				3519	le64_to_cpu(m.replay_version.version), m.user_version);
				3520
				3521	if (m.retry_attempt >= 0) {
				3522	if (m.retry_attempt != req->r_attempts - 1) {
				3523	dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
				3524	req, req->r_tid, m.retry_attempt,
				3525	req->r_attempts - 1);
				3526	goto out_unlock_session;
				3527	}
				3528	} else {
				3529	WARN_ON(1); /* MOSDOpReply v4 is assumed */
				3530	}
				3531
				3532	if (!ceph_oloc_empty(&m.redirect.oloc)) {
				3533	dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
				3534	m.redirect.oloc.pool);
				3535	unlink_request(osd, req);
				3536	mutex_unlock(&osd->lock);
				3537
				3538	/*
				3539	* Not ceph_oloc_copy() - changing pool_ns is not
				3540	* supported.
				3541	*/
				3542	req->r_t.target_oloc.pool = m.redirect.oloc.pool;
				3543	req->r_flags \|= CEPH_OSD_FLAG_REDIRECTED;
				3544	req->r_tid = 0;
				3545	__submit_request(req, false);
				3546	goto out_unlock_osdc;
				3547	}
				3548
				3549	if (m.num_ops != req->r_num_ops) {
				3550	pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
				3551	req->r_num_ops, req->r_tid);
				3552	goto fail_request;
				3553	}
				3554	for (i = 0; i < req->r_num_ops; i++) {
				3555	dout(" req %p tid %llu op %d rval %d len %u\n", req,
				3556	req->r_tid, i, m.rval[i], m.outdata_len[i]);
				3557	req->r_ops[i].rval = m.rval[i];
				3558	req->r_ops[i].outdata_len = m.outdata_len[i];
				3559	data_len += m.outdata_len[i];
				3560	}
				3561	if (data_len != le32_to_cpu(msg->hdr.data_len)) {
				3562	pr_err("sum of lens %u != %u for tid %llu\n", data_len,
				3563	le32_to_cpu(msg->hdr.data_len), req->r_tid);
				3564	goto fail_request;
				3565	}
				3566	dout("%s req %p tid %llu result %d data_len %u\n", __func__,
				3567	req, req->r_tid, m.result, data_len);
				3568
				3569	/*
				3570	* Since we only ever request ONDISK, we should only ever get
				3571	* one (type of) reply back.
				3572	*/
				3573	WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
				3574	req->r_result = m.result ?: data_len;
				3575	finish_request(req);
				3576	mutex_unlock(&osd->lock);
				3577	up_read(&osdc->lock);
				3578
				3579	__complete_request(req);
				3580	return;
				3581
				3582	fail_request:
				3583	complete_request(req, -EIO);
				3584	out_unlock_session:
				3585	mutex_unlock(&osd->lock);
				3586	out_unlock_osdc:
				3587	up_read(&osdc->lock);
				3588	}
				3589
				3590	static void set_pool_was_full(struct ceph_osd_client *osdc)
				3591	{
				3592	struct rb_node *n;
				3593
				3594	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
				3595	struct ceph_pg_pool_info *pi =
				3596	rb_entry(n, struct ceph_pg_pool_info, node);
				3597
				3598	pi->was_full = __pool_full(pi);
				3599	}
				3600	}
				3601
				3602	static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
				3603	{
				3604	struct ceph_pg_pool_info *pi;
				3605
				3606	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
				3607	if (!pi)
				3608	return false;
				3609
				3610	return pi->was_full && !__pool_full(pi);
				3611	}
				3612
				3613	static enum calc_target_result
				3614	recalc_linger_target(struct ceph_osd_linger_request *lreq)
				3615	{
				3616	struct ceph_osd_client *osdc = lreq->osdc;
				3617	enum calc_target_result ct_res;
				3618
				3619	ct_res = calc_target(osdc, &lreq->t, NULL, true);
				3620	if (ct_res == CALC_TARGET_NEED_RESEND) {
				3621	struct ceph_osd *osd;
				3622
				3623	osd = lookup_create_osd(osdc, lreq->t.osd, true);
				3624	if (osd != lreq->osd) {
				3625	unlink_linger(lreq->osd, lreq);
				3626	link_linger(osd, lreq);
				3627	}
				3628	}
				3629
				3630	return ct_res;
				3631	}
				3632
				3633	/*
				3634	* Requeue requests whose mapping to an OSD has changed.
				3635	*/
				3636	static void scan_requests(struct ceph_osd *osd,
				3637	bool force_resend,
				3638	bool cleared_full,
				3639	bool check_pool_cleared_full,
				3640	struct rb_root *need_resend,
				3641	struct list_head *need_resend_linger)
				3642	{
				3643	struct ceph_osd_client *osdc = osd->o_osdc;
				3644	struct rb_node *n;
				3645	bool force_resend_writes;
				3646
				3647	for (n = rb_first(&osd->o_linger_requests); n; ) {
				3648	struct ceph_osd_linger_request *lreq =
				3649	rb_entry(n, struct ceph_osd_linger_request, node);
				3650	enum calc_target_result ct_res;
				3651
				3652	n = rb_next(n); /* recalc_linger_target() */
				3653
				3654	dout("%s lreq %p linger_id %llu\n", __func__, lreq,
				3655	lreq->linger_id);
				3656	ct_res = recalc_linger_target(lreq);
				3657	switch (ct_res) {
				3658	case CALC_TARGET_NO_ACTION:
				3659	force_resend_writes = cleared_full \|\|
				3660	(check_pool_cleared_full &&
				3661	pool_cleared_full(osdc, lreq->t.base_oloc.pool));
				3662	if (!force_resend && !force_resend_writes)
				3663	break;
				3664
				3665	/* fall through */
				3666	case CALC_TARGET_NEED_RESEND:
				3667	cancel_linger_map_check(lreq);
				3668	/*
				3669	* scan_requests() for the previous epoch(s)
				3670	* may have already added it to the list, since
				3671	* it's not unlinked here.
				3672	*/
				3673	if (list_empty(&lreq->scan_item))
				3674	list_add_tail(&lreq->scan_item, need_resend_linger);
				3675	break;
				3676	case CALC_TARGET_POOL_DNE:
				3677	list_del_init(&lreq->scan_item);
				3678	check_linger_pool_dne(lreq);
				3679	break;
				3680	}
				3681	}
				3682
				3683	for (n = rb_first(&osd->o_requests); n; ) {
				3684	struct ceph_osd_request *req =
				3685	rb_entry(n, struct ceph_osd_request, r_node);
				3686	enum calc_target_result ct_res;
				3687
				3688	n = rb_next(n); /* unlink_request(), check_pool_dne() */
				3689
				3690	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
				3691	ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con,
				3692	false);
				3693	switch (ct_res) {
				3694	case CALC_TARGET_NO_ACTION:
				3695	force_resend_writes = cleared_full \|\|
				3696	(check_pool_cleared_full &&
				3697	pool_cleared_full(osdc, req->r_t.base_oloc.pool));
				3698	if (!force_resend &&
				3699	(!(req->r_flags & CEPH_OSD_FLAG_WRITE) \|\|
				3700	!force_resend_writes))
				3701	break;
				3702
				3703	/* fall through */
				3704	case CALC_TARGET_NEED_RESEND:
				3705	cancel_map_check(req);
				3706	unlink_request(osd, req);
				3707	insert_request(need_resend, req);
				3708	break;
				3709	case CALC_TARGET_POOL_DNE:
				3710	check_pool_dne(req);
				3711	break;
				3712	}
				3713	}
				3714	}
				3715
				3716	static int handle_one_map(struct ceph_osd_client *osdc,
				3717	void p, void end, bool incremental,
				3718	struct rb_root *need_resend,
				3719	struct list_head *need_resend_linger)
				3720	{
				3721	struct ceph_osdmap *newmap;
				3722	struct rb_node *n;
				3723	bool skipped_map = false;
				3724	bool was_full;
				3725
				3726	was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
				3727	set_pool_was_full(osdc);
				3728
				3729	if (incremental)
				3730	newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
				3731	else
				3732	newmap = ceph_osdmap_decode(&p, end);
				3733	if (IS_ERR(newmap))
				3734	return PTR_ERR(newmap);
				3735
				3736	if (newmap != osdc->osdmap) {
				3737	/*
				3738	* Preserve ->was_full before destroying the old map.
				3739	* For pools that weren't in the old map, ->was_full
				3740	* should be false.
				3741	*/
				3742	for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
				3743	struct ceph_pg_pool_info *pi =
				3744	rb_entry(n, struct ceph_pg_pool_info, node);
				3745	struct ceph_pg_pool_info *old_pi;
				3746
				3747	old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
				3748	if (old_pi)
				3749	pi->was_full = old_pi->was_full;
				3750	else
				3751	WARN_ON(pi->was_full);
				3752	}
				3753
				3754	if (osdc->osdmap->epoch &&
				3755	osdc->osdmap->epoch + 1 < newmap->epoch) {
				3756	WARN_ON(incremental);
				3757	skipped_map = true;
				3758	}
				3759
				3760	ceph_osdmap_destroy(osdc->osdmap);
				3761	osdc->osdmap = newmap;
				3762	}
				3763
				3764	was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
				3765	scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
				3766	need_resend, need_resend_linger);
				3767
				3768	for (n = rb_first(&osdc->osds); n; ) {
				3769	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				3770
				3771	n = rb_next(n); /* close_osd() */
				3772
				3773	scan_requests(osd, skipped_map, was_full, true, need_resend,
				3774	need_resend_linger);
				3775	if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) \|\|
				3776	memcmp(&osd->o_con.peer_addr,
				3777	ceph_osd_addr(osdc->osdmap, osd->o_osd),
				3778	sizeof(struct ceph_entity_addr)))
				3779	close_osd(osd);
				3780	}
				3781
				3782	return 0;
				3783	}
				3784
				3785	static void kick_requests(struct ceph_osd_client *osdc,
				3786	struct rb_root *need_resend,
				3787	struct list_head *need_resend_linger)
				3788	{
				3789	struct ceph_osd_linger_request lreq, nlreq;
				3790	enum calc_target_result ct_res;
				3791	struct rb_node *n;
				3792
				3793	/* make sure need_resend targets reflect latest map */
				3794	for (n = rb_first(need_resend); n; ) {
				3795	struct ceph_osd_request *req =
				3796	rb_entry(n, struct ceph_osd_request, r_node);
				3797
				3798	n = rb_next(n);
				3799
				3800	if (req->r_t.epoch < osdc->osdmap->epoch) {
				3801	ct_res = calc_target(osdc, &req->r_t, NULL, false);
				3802	if (ct_res == CALC_TARGET_POOL_DNE) {
				3803	erase_request(need_resend, req);
				3804	check_pool_dne(req);
				3805	}
				3806	}
				3807	}
				3808
				3809	for (n = rb_first(need_resend); n; ) {
				3810	struct ceph_osd_request *req =
				3811	rb_entry(n, struct ceph_osd_request, r_node);
				3812	struct ceph_osd *osd;
				3813
				3814	n = rb_next(n);
				3815	erase_request(need_resend, req); /* before link_request() */
				3816
				3817	osd = lookup_create_osd(osdc, req->r_t.osd, true);
				3818	link_request(osd, req);
				3819	if (!req->r_linger) {
				3820	if (!osd_homeless(osd) && !req->r_t.paused)
				3821	send_request(req);
				3822	} else {
				3823	cancel_linger_request(req);
				3824	}
				3825	}
				3826
				3827	list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
				3828	if (!osd_homeless(lreq->osd))
				3829	send_linger(lreq);
				3830
				3831	list_del_init(&lreq->scan_item);
				3832	}
				3833	}
				3834
				3835	/*
				3836	* Process updated osd map.
				3837	*
				3838	* The message contains any number of incremental and full maps, normally
				3839	* indicating some sort of topology change in the cluster. Kick requests
				3840	* off to different OSDs as needed.
				3841	*/
				3842	void ceph_osdc_handle_map(struct ceph_osd_client osdc, struct ceph_msg msg)
				3843	{
				3844	void *p = msg->front.iov_base;
				3845	void *const end = p + msg->front.iov_len;
				3846	u32 nr_maps, maplen;
				3847	u32 epoch;
				3848	struct ceph_fsid fsid;
				3849	struct rb_root need_resend = RB_ROOT;
				3850	LIST_HEAD(need_resend_linger);
				3851	bool handled_incremental = false;
				3852	bool was_pauserd, was_pausewr;
				3853	bool pauserd, pausewr;
				3854	int err;
				3855
				3856	dout("%s have %u\n", __func__, osdc->osdmap->epoch);
				3857	down_write(&osdc->lock);
				3858
				3859	/* verify fsid */
				3860	ceph_decode_need(&p, end, sizeof(fsid), bad);
				3861	ceph_decode_copy(&p, &fsid, sizeof(fsid));
				3862	if (ceph_check_fsid(osdc->client, &fsid) < 0)
				3863	goto bad;
				3864
				3865	was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
				3866	was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) \|\|
				3867	ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				3868	have_pool_full(osdc);
				3869
				3870	/* incremental maps */
				3871	ceph_decode_32_safe(&p, end, nr_maps, bad);
				3872	dout(" %d inc maps\n", nr_maps);
				3873	while (nr_maps > 0) {
				3874	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
				3875	epoch = ceph_decode_32(&p);
				3876	maplen = ceph_decode_32(&p);
				3877	ceph_decode_need(&p, end, maplen, bad);
				3878	if (osdc->osdmap->epoch &&
				3879	osdc->osdmap->epoch + 1 == epoch) {
				3880	dout("applying incremental map %u len %d\n",
				3881	epoch, maplen);
				3882	err = handle_one_map(osdc, p, p + maplen, true,
				3883	&need_resend, &need_resend_linger);
				3884	if (err)
				3885	goto bad;
				3886	handled_incremental = true;
				3887	} else {
				3888	dout("ignoring incremental map %u len %d\n",
				3889	epoch, maplen);
				3890	}
				3891	p += maplen;
				3892	nr_maps--;
				3893	}
				3894	if (handled_incremental)
				3895	goto done;
				3896
				3897	/* full maps */
				3898	ceph_decode_32_safe(&p, end, nr_maps, bad);
				3899	dout(" %d full maps\n", nr_maps);
				3900	while (nr_maps) {
				3901	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
				3902	epoch = ceph_decode_32(&p);
				3903	maplen = ceph_decode_32(&p);
				3904	ceph_decode_need(&p, end, maplen, bad);
				3905	if (nr_maps > 1) {
				3906	dout("skipping non-latest full map %u len %d\n",
				3907	epoch, maplen);
				3908	} else if (osdc->osdmap->epoch >= epoch) {
				3909	dout("skipping full map %u len %d, "
				3910	"older than our %u\n", epoch, maplen,
				3911	osdc->osdmap->epoch);
				3912	} else {
				3913	dout("taking full map %u len %d\n", epoch, maplen);
				3914	err = handle_one_map(osdc, p, p + maplen, false,
				3915	&need_resend, &need_resend_linger);
				3916	if (err)
				3917	goto bad;
				3918	}
				3919	p += maplen;
				3920	nr_maps--;
				3921	}
				3922
				3923	done:
				3924	/*
				3925	* subscribe to subsequent osdmap updates if full to ensure
				3926	* we find out when we are no longer full and stop returning
				3927	* ENOSPC.
				3928	*/
				3929	pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
				3930	pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) \|\|
				3931	ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				3932	have_pool_full(osdc);
				3933	if (was_pauserd \|\| was_pausewr \|\| pauserd \|\| pausewr \|\|
				3934	osdc->osdmap->epoch < osdc->epoch_barrier)
				3935	maybe_request_map(osdc);
				3936
				3937	kick_requests(osdc, &need_resend, &need_resend_linger);
				3938
				3939	ceph_osdc_abort_on_full(osdc);
				3940	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
				3941	osdc->osdmap->epoch);
				3942	up_write(&osdc->lock);
				3943	wake_up_all(&osdc->client->auth_wq);
				3944	return;
				3945
				3946	bad:
				3947	pr_err("osdc handle_map corrupt msg\n");
				3948	ceph_msg_dump(msg);
				3949	up_write(&osdc->lock);
				3950	}
				3951
				3952	/*
				3953	* Resubmit requests pending on the given osd.
				3954	*/
				3955	static void kick_osd_requests(struct ceph_osd *osd)
				3956	{
				3957	struct rb_node *n;
				3958
				3959	clear_backoffs(osd);
				3960
				3961	for (n = rb_first(&osd->o_requests); n; ) {
				3962	struct ceph_osd_request *req =
				3963	rb_entry(n, struct ceph_osd_request, r_node);
				3964
				3965	n = rb_next(n); /* cancel_linger_request() */
				3966
				3967	if (!req->r_linger) {
				3968	if (!req->r_t.paused)
				3969	send_request(req);
				3970	} else {
				3971	cancel_linger_request(req);
				3972	}
				3973	}
				3974	for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
				3975	struct ceph_osd_linger_request *lreq =
				3976	rb_entry(n, struct ceph_osd_linger_request, node);
				3977
				3978	send_linger(lreq);
				3979	}
				3980	}
				3981
				3982	/*
				3983	* If the osd connection drops, we need to resubmit all requests.
				3984	*/
				3985	static void osd_fault(struct ceph_connection *con)
				3986	{
				3987	struct ceph_osd *osd = con->private;
				3988	struct ceph_osd_client *osdc = osd->o_osdc;
				3989
				3990	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				3991
				3992	down_write(&osdc->lock);
				3993	if (!osd_registered(osd)) {
				3994	dout("%s osd%d unknown\n", __func__, osd->o_osd);
				3995	goto out_unlock;
				3996	}
				3997
				3998	if (!reopen_osd(osd))
				3999	kick_osd_requests(osd);
				4000	maybe_request_map(osdc);
				4001
				4002	out_unlock:
				4003	up_write(&osdc->lock);
				4004	}
				4005
				/*
				 * Decoded form of an incoming backoff message, filled in by
				 * decode_MOSDBackoff().  ->begin and ->end are separately allocated;
				 * handle_backoff_block() takes them over (and NULLs them here),
				 * otherwise handle_backoff() releases them with free_hoid().
				 */
				4006	struct MOSDBackoff {
				4007	struct ceph_spg spgid;	/* pgid + shard the backoff applies to */
				4008	u32 map_epoch;	/* echoed back in the ack sent for a BLOCK */
				4009	u8 op;	/* CEPH_OSD_BACKOFF_OP_*, dispatched in handle_backoff() */
				4010	u64 id;	/* key for the per-OSD by-id backoff lookup */
				4011	struct ceph_hobject_id *begin;	/* start of the object range */
				4012	struct ceph_hobject_id *end;	/* end of the object range */
				4013	};
				4014
				4015	static int decode_MOSDBackoff(const struct ceph_msg msg, struct MOSDBackoff m)
				4016	{
				4017	void *p = msg->front.iov_base;
				4018	void *const end = p + msg->front.iov_len;
				4019	u8 struct_v;
				4020	u32 struct_len;
				4021	int ret;
				4022
				4023	ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len);
				4024	if (ret)
				4025	return ret;
				4026
				4027	ret = ceph_decode_pgid(&p, end, &m->spgid.pgid);
				4028	if (ret)
				4029	return ret;
				4030
				4031	ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval);
				4032	ceph_decode_32_safe(&p, end, m->map_epoch, e_inval);
				4033	ceph_decode_8_safe(&p, end, m->op, e_inval);
				4034	ceph_decode_64_safe(&p, end, m->id, e_inval);
				4035
				4036	m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO);
				4037	if (!m->begin)
				4038	return -ENOMEM;
				4039
				4040	ret = decode_hoid(&p, end, m->begin);
				4041	if (ret) {
				4042	free_hoid(m->begin);
				4043	return ret;
				4044	}
				4045
				4046	m->end = kzalloc(sizeof(*m->end), GFP_NOIO);
				4047	if (!m->end) {
				4048	free_hoid(m->begin);
				4049	return -ENOMEM;
				4050	}
				4051
				4052	ret = decode_hoid(&p, end, m->end);
				4053	if (ret) {
				4054	free_hoid(m->begin);
				4055	free_hoid(m->end);
				4056	return ret;
				4057	}
				4058
				4059	return 0;
				4060
				4061	e_inval:
				4062	return -EINVAL;
				4063	}
				4064
				4065	static struct ceph_msg *create_backoff_message(
				4066	const struct ceph_osd_backoff *backoff,
				4067	u32 map_epoch)
				4068	{
				4069	struct ceph_msg *msg;
				4070	void p, end;
				4071	int msg_size;
				4072
				4073	msg_size = CEPH_ENCODING_START_BLK_LEN +
				4074	CEPH_PGID_ENCODING_LEN + 1; /* spgid */
				4075	msg_size += 4 + 1 + 8; /* map_epoch, op, id */
				4076	msg_size += CEPH_ENCODING_START_BLK_LEN +
				4077	hoid_encoding_size(backoff->begin);
				4078	msg_size += CEPH_ENCODING_START_BLK_LEN +
				4079	hoid_encoding_size(backoff->end);
				4080
				4081	msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true);
				4082	if (!msg)
				4083	return NULL;
				4084
				4085	p = msg->front.iov_base;
				4086	end = p + msg->front_alloc_len;
				4087
				4088	encode_spgid(&p, &backoff->spgid);
				4089	ceph_encode_32(&p, map_epoch);
				4090	ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK);
				4091	ceph_encode_64(&p, backoff->id);
				4092	encode_hoid(&p, end, backoff->begin);
				4093	encode_hoid(&p, end, backoff->end);
				4094	BUG_ON(p != end);
				4095
				4096	msg->front.iov_len = p - msg->front.iov_base;
				4097	msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */
				4098	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				4099
				4100	return msg;
				4101	}
				4102
				4103	static void handle_backoff_block(struct ceph_osd osd, struct MOSDBackoff m)
				4104	{
				4105	struct ceph_spg_mapping *spg;
				4106	struct ceph_osd_backoff *backoff;
				4107	struct ceph_msg *msg;
				4108
				4109	dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
				4110	m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
				4111
				4112	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid);
				4113	if (!spg) {
				4114	spg = alloc_spg_mapping();
				4115	if (!spg) {
				4116	pr_err("%s failed to allocate spg\n", __func__);
				4117	return;
				4118	}
				4119	spg->spgid = m->spgid; /* struct */
				4120	insert_spg_mapping(&osd->o_backoff_mappings, spg);
				4121	}
				4122
				4123	backoff = alloc_backoff();
				4124	if (!backoff) {
				4125	pr_err("%s failed to allocate backoff\n", __func__);
				4126	return;
				4127	}
				4128	backoff->spgid = m->spgid; /* struct */
				4129	backoff->id = m->id;
				4130	backoff->begin = m->begin;
				4131	m->begin = NULL; /* backoff now owns this */
				4132	backoff->end = m->end;
				4133	m->end = NULL; /* ditto */
				4134
				4135	insert_backoff(&spg->backoffs, backoff);
				4136	insert_backoff_by_id(&osd->o_backoffs_by_id, backoff);
				4137
				4138	/*
				4139	* Ack with original backoff's epoch so that the OSD can
				4140	* discard this if there was a PG split.
				4141	*/
				4142	msg = create_backoff_message(backoff, m->map_epoch);
				4143	if (!msg) {
				4144	pr_err("%s failed to allocate msg\n", __func__);
				4145	return;
				4146	}
				4147	ceph_con_send(&osd->o_con, msg);
				4148	}
				4149
				4150	static bool target_contained_by(const struct ceph_osd_request_target *t,
				4151	const struct ceph_hobject_id *begin,
				4152	const struct ceph_hobject_id *end)
				4153	{
				4154	struct ceph_hobject_id hoid;
				4155	int cmp;
				4156
				4157	hoid_fill_from_target(&hoid, t);
				4158	cmp = hoid_compare(&hoid, begin);
				4159	return !cmp \|\| (cmp > 0 && hoid_compare(&hoid, end) < 0);
				4160	}
				4161
				4162	static void handle_backoff_unblock(struct ceph_osd *osd,
				4163	const struct MOSDBackoff *m)
				4164	{
				4165	struct ceph_spg_mapping *spg;
				4166	struct ceph_osd_backoff *backoff;
				4167	struct rb_node *n;
				4168
				4169	dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
				4170	m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
				4171
				4172	backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id);
				4173	if (!backoff) {
				4174	pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n",
				4175	__func__, osd->o_osd, m->spgid.pgid.pool,
				4176	m->spgid.pgid.seed, m->spgid.shard, m->id);
				4177	return;
				4178	}
				4179
				4180	if (hoid_compare(backoff->begin, m->begin) &&
				4181	hoid_compare(backoff->end, m->end)) {
				4182	pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n",
				4183	__func__, osd->o_osd, m->spgid.pgid.pool,
				4184	m->spgid.pgid.seed, m->spgid.shard, m->id);
				4185	/* unblock it anyway... */
				4186	}
				4187
				4188	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid);
				4189	BUG_ON(!spg);
				4190
				4191	erase_backoff(&spg->backoffs, backoff);
				4192	erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
				4193	free_backoff(backoff);
				4194
				4195	if (RB_EMPTY_ROOT(&spg->backoffs)) {
				4196	erase_spg_mapping(&osd->o_backoff_mappings, spg);
				4197	free_spg_mapping(spg);
				4198	}
				4199
				4200	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
				4201	struct ceph_osd_request *req =
				4202	rb_entry(n, struct ceph_osd_request, r_node);
				4203
				4204	if (!ceph_spg_compare(&req->r_t.spgid, &m->spgid)) {
				4205	/*
				4206	* Match against @m, not @backoff -- the PG may
				4207	* have split on the OSD.
				4208	*/
				4209	if (target_contained_by(&req->r_t, m->begin, m->end)) {
				4210	/*
				4211	* If no other installed backoff applies,
				4212	* resend.
				4213	*/
				4214	send_request(req);
				4215	}
				4216	}
				4217	}
				4218	}
				4219
				4220	static void handle_backoff(struct ceph_osd osd, struct ceph_msg msg)
				4221	{
				4222	struct ceph_osd_client *osdc = osd->o_osdc;
				4223	struct MOSDBackoff m;
				4224	int ret;
				4225
				4226	down_read(&osdc->lock);
				4227	if (!osd_registered(osd)) {
				4228	dout("%s osd%d unknown\n", __func__, osd->o_osd);
				4229	up_read(&osdc->lock);
				4230	return;
				4231	}
				4232	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
				4233
				4234	mutex_lock(&osd->lock);
				4235	ret = decode_MOSDBackoff(msg, &m);
				4236	if (ret) {
				4237	pr_err("failed to decode MOSDBackoff: %d\n", ret);
				4238	ceph_msg_dump(msg);
				4239	goto out_unlock;
				4240	}
				4241
				4242	switch (m.op) {
				4243	case CEPH_OSD_BACKOFF_OP_BLOCK:
				4244	handle_backoff_block(osd, &m);
				4245	break;
				4246	case CEPH_OSD_BACKOFF_OP_UNBLOCK:
				4247	handle_backoff_unblock(osd, &m);
				4248	break;
				4249	default:
				4250	pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op);
				4251	}
				4252
				4253	free_hoid(m.begin);
				4254	free_hoid(m.end);
				4255
				4256	out_unlock:
				4257	mutex_unlock(&osd->lock);
				4258	up_read(&osdc->lock);
				4259	}
				4260
				4261	/*
				4262	* Process osd watch notifications
				4263	*/
				4264	static void handle_watch_notify(struct ceph_osd_client *osdc,
				4265	struct ceph_msg *msg)
				4266	{
				4267	void *p = msg->front.iov_base;
				4268	void *const end = p + msg->front.iov_len;
				4269	struct ceph_osd_linger_request *lreq;
				4270	struct linger_work *lwork;
				4271	u8 proto_ver, opcode;
				4272	u64 cookie, notify_id;
				4273	u64 notifier_id = 0;
				4274	s32 return_code = 0;
				4275	void *payload = NULL;
				4276	u32 payload_len = 0;
				4277
				4278	ceph_decode_8_safe(&p, end, proto_ver, bad);
				4279	ceph_decode_8_safe(&p, end, opcode, bad);
				4280	ceph_decode_64_safe(&p, end, cookie, bad);
				4281	p += 8; /* skip ver */
				4282	ceph_decode_64_safe(&p, end, notify_id, bad);
				4283
				4284	if (proto_ver >= 1) {
				4285	ceph_decode_32_safe(&p, end, payload_len, bad);
				4286	ceph_decode_need(&p, end, payload_len, bad);
				4287	payload = p;
				4288	p += payload_len;
				4289	}
				4290
				4291	if (le16_to_cpu(msg->hdr.version) >= 2)
				4292	ceph_decode_32_safe(&p, end, return_code, bad);
				4293
				4294	if (le16_to_cpu(msg->hdr.version) >= 3)
				4295	ceph_decode_64_safe(&p, end, notifier_id, bad);
				4296
				4297	down_read(&osdc->lock);
				4298	lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
				4299	if (!lreq) {
				4300	dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
				4301	cookie);
				4302	goto out_unlock_osdc;
				4303	}
				4304
				4305	mutex_lock(&lreq->lock);
				4306	dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
				4307	opcode, cookie, lreq, lreq->is_watch);
				4308	if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
				4309	if (!lreq->last_error) {
				4310	lreq->last_error = -ENOTCONN;
				4311	queue_watch_error(lreq);
				4312	}
				4313	} else if (!lreq->is_watch) {
				4314	/* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
				4315	if (lreq->notify_id && lreq->notify_id != notify_id) {
				4316	dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
				4317	lreq->notify_id, notify_id);
				4318	} else if (!completion_done(&lreq->notify_finish_wait)) {
				4319	struct ceph_msg_data *data =
				4320	list_first_entry_or_null(&msg->data,
				4321	struct ceph_msg_data,
				4322	links);
				4323
				4324	if (data) {
				4325	if (lreq->preply_pages) {
				4326	WARN_ON(data->type !=
				4327	CEPH_MSG_DATA_PAGES);
				4328	*lreq->preply_pages = data->pages;
				4329	*lreq->preply_len = data->length;
				4330	} else {
				4331	ceph_release_page_vector(data->pages,
				4332	calc_pages_for(0, data->length));
				4333	}
				4334	}
				4335	lreq->notify_finish_error = return_code;
				4336	complete_all(&lreq->notify_finish_wait);
				4337	}
				4338	} else {
				4339	/* CEPH_WATCH_EVENT_NOTIFY */
				4340	lwork = lwork_alloc(lreq, do_watch_notify);
				4341	if (!lwork) {
				4342	pr_err("failed to allocate notify-lwork\n");
				4343	goto out_unlock_lreq;
				4344	}
				4345
				4346	lwork->notify.notify_id = notify_id;
				4347	lwork->notify.notifier_id = notifier_id;
				4348	lwork->notify.payload = payload;
				4349	lwork->notify.payload_len = payload_len;
				4350	lwork->notify.msg = ceph_msg_get(msg);
				4351	lwork_queue(lwork);
				4352	}
				4353
				4354	out_unlock_lreq:
				4355	mutex_unlock(&lreq->lock);
				4356	out_unlock_osdc:
				4357	up_read(&osdc->lock);
				4358	return;
				4359
				4360	bad:
				4361	pr_err("osdc handle_watch_notify corrupt msg\n");
				4362	}
				4363
				4364	/*
				4365	* Register request, send initial attempt.
				4366	*/
				4367	int ceph_osdc_start_request(struct ceph_osd_client *osdc,
				4368	struct ceph_osd_request *req,
				4369	bool nofail)
				4370	{
				4371	down_read(&osdc->lock);
				4372	submit_request(req, false);
				4373	up_read(&osdc->lock);
				4374
				4375	return 0;
				4376	}
				4377	EXPORT_SYMBOL(ceph_osdc_start_request);
				4378
				4379	/*
				4380	* Unregister a registered request. The request is not completed:
				4381	* ->r_result isn't set and __complete_request() isn't called.
				4382	*/
				4383	void ceph_osdc_cancel_request(struct ceph_osd_request *req)
				4384	{
				4385	struct ceph_osd_client *osdc = req->r_osdc;
				4386
				4387	down_write(&osdc->lock);
				4388	if (req->r_osd)
				4389	cancel_request(req);
				4390	up_write(&osdc->lock);
				4391	}
				4392	EXPORT_SYMBOL(ceph_osdc_cancel_request);
				4393
				4394	/*
				4395	* @timeout: in jiffies, 0 means "wait forever"
				4396	*/
				4397	static int wait_request_timeout(struct ceph_osd_request *req,
				4398	unsigned long timeout)
				4399	{
				4400	long left;
				4401
				4402	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
				4403	left = wait_for_completion_killable_timeout(&req->r_completion,
				4404	ceph_timeout_jiffies(timeout));
				4405	if (left <= 0) {
				4406	left = left ?: -ETIMEDOUT;
				4407	ceph_osdc_cancel_request(req);
				4408	} else {
				4409	left = req->r_result; /* completed */
				4410	}
				4411
				4412	return left;
				4413	}
				4414
				/*
				 * Block until @req completes, with no time limit (timeout of 0).
				 * Returns whatever wait_request_timeout() returns.  @osdc is not used.
				 */
				int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
							   struct ceph_osd_request *req)
				{
					const unsigned long no_timeout = 0;

					return wait_request_timeout(req, no_timeout);
				}
				EXPORT_SYMBOL(ceph_osdc_wait_request);
				4424
				4425	/*
				4426	* sync - wait for all in-flight requests to flush. avoid starvation.
				4427	*/
				4428	void ceph_osdc_sync(struct ceph_osd_client *osdc)
				4429	{
				4430	struct rb_node n, p;
				4431	u64 last_tid = atomic64_read(&osdc->last_tid);
				4432
				4433	again:
				4434	down_read(&osdc->lock);
				4435	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
				4436	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				4437
				4438	mutex_lock(&osd->lock);
				4439	for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
				4440	struct ceph_osd_request *req =
				4441	rb_entry(p, struct ceph_osd_request, r_node);
				4442
				4443	if (req->r_tid > last_tid)
				4444	break;
				4445
				4446	if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
				4447	continue;
				4448
				4449	ceph_osdc_get_request(req);
				4450	mutex_unlock(&osd->lock);
				4451	up_read(&osdc->lock);
				4452	dout("%s waiting on req %p tid %llu last_tid %llu\n",
				4453	__func__, req, req->r_tid, last_tid);
				4454	wait_for_completion(&req->r_completion);
				4455	ceph_osdc_put_request(req);
				4456	goto again;
				4457	}
				4458
				4459	mutex_unlock(&osd->lock);
				4460	}
				4461
				4462	up_read(&osdc->lock);
				4463	dout("%s done last_tid %llu\n", __func__, last_tid);
				4464	}
				4465	EXPORT_SYMBOL(ceph_osdc_sync);
				4466
				4467	static struct ceph_osd_request *
				4468	alloc_linger_request(struct ceph_osd_linger_request *lreq)
				4469	{
				4470	struct ceph_osd_request *req;
				4471
				4472	req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
				4473	if (!req)
				4474	return NULL;
				4475
				4476	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
				4477	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
				4478
				4479	if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
				4480	ceph_osdc_put_request(req);
				4481	return NULL;
				4482	}
				4483
				4484	return req;
				4485	}
				4486
				4487	/*
				4488	* Returns a handle, caller owns a ref.
				4489	*/
				4490	struct ceph_osd_linger_request *
				4491	ceph_osdc_watch(struct ceph_osd_client *osdc,
				4492	struct ceph_object_id *oid,
				4493	struct ceph_object_locator *oloc,
				4494	rados_watchcb2_t wcb,
				4495	rados_watcherrcb_t errcb,
				4496	void *data)
				4497	{
				4498	struct ceph_osd_linger_request *lreq;
				4499	int ret;
				4500
				4501	lreq = linger_alloc(osdc);
				4502	if (!lreq)
				4503	return ERR_PTR(-ENOMEM);
				4504
				4505	lreq->is_watch = true;
				4506	lreq->wcb = wcb;
				4507	lreq->errcb = errcb;
				4508	lreq->data = data;
				4509	lreq->watch_valid_thru = jiffies;
				4510
				4511	ceph_oid_copy(&lreq->t.base_oid, oid);
				4512	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
				4513	lreq->t.flags = CEPH_OSD_FLAG_WRITE;
				4514	ktime_get_real_ts64(&lreq->mtime);
				4515
				4516	lreq->reg_req = alloc_linger_request(lreq);
				4517	if (!lreq->reg_req) {
				4518	ret = -ENOMEM;
				4519	goto err_put_lreq;
				4520	}
				4521
				4522	lreq->ping_req = alloc_linger_request(lreq);
				4523	if (!lreq->ping_req) {
				4524	ret = -ENOMEM;
				4525	goto err_put_lreq;
				4526	}
				4527
				4528	down_write(&osdc->lock);
				4529	linger_register(lreq); /* before osd_req_op_* */
				4530	osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
				4531	CEPH_OSD_WATCH_OP_WATCH);
				4532	osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
				4533	CEPH_OSD_WATCH_OP_PING);
				4534	linger_submit(lreq);
				4535	up_write(&osdc->lock);
				4536
				4537	ret = linger_reg_commit_wait(lreq);
				4538	if (ret) {
				4539	linger_cancel(lreq);
				4540	goto err_put_lreq;
				4541	}
				4542
				4543	return lreq;
				4544
				4545	err_put_lreq:
				4546	linger_put(lreq);
				4547	return ERR_PTR(ret);
				4548	}
				4549	EXPORT_SYMBOL(ceph_osdc_watch);
				4550
				4551	/*
				4552	* Releases a ref.
				4553	*
				4554	* Times out after mount_timeout to preserve rbd unmap behaviour
				4555	* introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
				4556	* with mount_timeout").
				4557	*/
				4558	int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
				4559	struct ceph_osd_linger_request *lreq)
				4560	{
				4561	struct ceph_options *opts = osdc->client->options;
				4562	struct ceph_osd_request *req;
				4563	int ret;
				4564
				4565	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
				4566	if (!req)
				4567	return -ENOMEM;
				4568
				4569	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
				4570	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
				4571	req->r_flags = CEPH_OSD_FLAG_WRITE;
				4572	ktime_get_real_ts64(&req->r_mtime);
				4573	osd_req_op_watch_init(req, 0, lreq->linger_id,
				4574	CEPH_OSD_WATCH_OP_UNWATCH);
				4575
				4576	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
				4577	if (ret)
				4578	goto out_put_req;
				4579
				4580	ceph_osdc_start_request(osdc, req, false);
				4581	linger_cancel(lreq);
				4582	linger_put(lreq);
				4583	ret = wait_request_timeout(req, opts->mount_timeout);
				4584
				4585	out_put_req:
				4586	ceph_osdc_put_request(req);
				4587	return ret;
				4588	}
				4589	EXPORT_SYMBOL(ceph_osdc_unwatch);
				4590
				4591	static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
				4592	u64 notify_id, u64 cookie, void *payload,
				4593	u32 payload_len)
				4594	{
				4595	struct ceph_osd_req_op *op;
				4596	struct ceph_pagelist *pl;
				4597	int ret;
				4598
				4599	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
				4600
				4601	pl = kmalloc(sizeof(*pl), GFP_NOIO);
				4602	if (!pl)
				4603	return -ENOMEM;
				4604
				4605	ceph_pagelist_init(pl);
				4606	ret = ceph_pagelist_encode_64(pl, notify_id);
				4607	ret \|= ceph_pagelist_encode_64(pl, cookie);
				4608	if (payload) {
				4609	ret \|= ceph_pagelist_encode_32(pl, payload_len);
				4610	ret \|= ceph_pagelist_append(pl, payload, payload_len);
				4611	} else {
				4612	ret \|= ceph_pagelist_encode_32(pl, 0);
				4613	}
				4614	if (ret) {
				4615	ceph_pagelist_release(pl);
				4616	return -ENOMEM;
				4617	}
				4618
				4619	ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
				4620	op->indata_len = pl->length;
				4621	return 0;
				4622	}
				4623
				4624	int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
				4625	struct ceph_object_id *oid,
				4626	struct ceph_object_locator *oloc,
				4627	u64 notify_id,
				4628	u64 cookie,
				4629	void *payload,
				4630	u32 payload_len)
				4631	{
				4632	struct ceph_osd_request *req;
				4633	int ret;
				4634
				4635	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
				4636	if (!req)
				4637	return -ENOMEM;
				4638
				4639	ceph_oid_copy(&req->r_base_oid, oid);
				4640	ceph_oloc_copy(&req->r_base_oloc, oloc);
				4641	req->r_flags = CEPH_OSD_FLAG_READ;
				4642
				4643	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
				4644	if (ret)
				4645	goto out_put_req;
				4646
				4647	ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
				4648	payload_len);
				4649	if (ret)
				4650	goto out_put_req;
				4651
				4652	ceph_osdc_start_request(osdc, req, false);
				4653	ret = ceph_osdc_wait_request(osdc, req);
				4654
				4655	out_put_req:
				4656	ceph_osdc_put_request(req);
				4657	return ret;
				4658	}
				4659	EXPORT_SYMBOL(ceph_osdc_notify_ack);
				4660
				4661	static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
				4662	u64 cookie, u32 prot_ver, u32 timeout,
				4663	void *payload, u32 payload_len)
				4664	{
				4665	struct ceph_osd_req_op *op;
				4666	struct ceph_pagelist *pl;
				4667	int ret;
				4668
				4669	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
				4670	op->notify.cookie = cookie;
				4671
				4672	pl = kmalloc(sizeof(*pl), GFP_NOIO);
				4673	if (!pl)
				4674	return -ENOMEM;
				4675
				4676	ceph_pagelist_init(pl);
				4677	ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
				4678	ret \|= ceph_pagelist_encode_32(pl, timeout);
				4679	ret \|= ceph_pagelist_encode_32(pl, payload_len);
				4680	ret \|= ceph_pagelist_append(pl, payload, payload_len);
				4681	if (ret) {
				4682	ceph_pagelist_release(pl);
				4683	return -ENOMEM;
				4684	}
				4685
				4686	ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
				4687	op->indata_len = pl->length;
				4688	return 0;
				4689	}
				4690
				4691	/*
				4692	* @timeout: in seconds
				4693	*
				4694	* @preply_{pages,len} are initialized both on success and error.
				4695	* The caller is responsible for:
				4696	*
				4697	* ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
				4698	*/
				4699	int ceph_osdc_notify(struct ceph_osd_client *osdc,
				4700	struct ceph_object_id *oid,
				4701	struct ceph_object_locator *oloc,
				4702	void *payload,
				4703	u32 payload_len,
				4704	u32 timeout,
				4705	struct page ***preply_pages,
				4706	size_t *preply_len)
				4707	{
				4708	struct ceph_osd_linger_request *lreq;
				4709	struct page **pages;
				4710	int ret;
				4711
				4712	WARN_ON(!timeout);
				4713	if (preply_pages) {
				4714	*preply_pages = NULL;
				4715	*preply_len = 0;
				4716	}
				4717
				4718	lreq = linger_alloc(osdc);
				4719	if (!lreq)
				4720	return -ENOMEM;
				4721
				4722	lreq->preply_pages = preply_pages;
				4723	lreq->preply_len = preply_len;
				4724
				4725	ceph_oid_copy(&lreq->t.base_oid, oid);
				4726	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
				4727	lreq->t.flags = CEPH_OSD_FLAG_READ;
				4728
				4729	lreq->reg_req = alloc_linger_request(lreq);
				4730	if (!lreq->reg_req) {
				4731	ret = -ENOMEM;
				4732	goto out_put_lreq;
				4733	}
				4734
				4735	/* for notify_id */
				4736	pages = ceph_alloc_page_vector(1, GFP_NOIO);
				4737	if (IS_ERR(pages)) {
				4738	ret = PTR_ERR(pages);
				4739	goto out_put_lreq;
				4740	}
				4741
				4742	down_write(&osdc->lock);
				4743	linger_register(lreq); /* before osd_req_op_* */
				4744	ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
				4745	timeout, payload, payload_len);
				4746	if (ret) {
				4747	linger_unregister(lreq);
				4748	up_write(&osdc->lock);
				4749	ceph_release_page_vector(pages, 1);
				4750	goto out_put_lreq;
				4751	}
				4752	ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
				4753	response_data),
				4754	pages, PAGE_SIZE, 0, false, true);
				4755	linger_submit(lreq);
				4756	up_write(&osdc->lock);
				4757
				4758	ret = linger_reg_commit_wait(lreq);
				4759	if (!ret)
				4760	ret = linger_notify_finish_wait(lreq);
				4761	else
				4762	dout("lreq %p failed to initiate notify %d\n", lreq, ret);
				4763
				4764	linger_cancel(lreq);
				4765	out_put_lreq:
				4766	linger_put(lreq);
				4767	return ret;
				4768	}
				4769	EXPORT_SYMBOL(ceph_osdc_notify);
				4770
				4771	/*
				4772	* Return the number of milliseconds since the watch was last
				4773	* confirmed, or an error. If there is an error, the watch is no
				4774	* longer valid, and should be destroyed with ceph_osdc_unwatch().
				4775	*/
				4776	int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
				4777	struct ceph_osd_linger_request *lreq)
				4778	{
				4779	unsigned long stamp, age;
				4780	int ret;
				4781
				4782	down_read(&osdc->lock);
				4783	mutex_lock(&lreq->lock);
				4784	stamp = lreq->watch_valid_thru;
				4785	if (!list_empty(&lreq->pending_lworks)) {
				4786	struct linger_work *lwork =
				4787	list_first_entry(&lreq->pending_lworks,
				4788	struct linger_work,
				4789	pending_item);
				4790
				4791	if (time_before(lwork->queued_stamp, stamp))
				4792	stamp = lwork->queued_stamp;
				4793	}
				4794	age = jiffies - stamp;
				4795	dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
				4796	lreq, lreq->linger_id, age, lreq->last_error);
				4797	/* we are truncating to msecs, so return a safe upper bound */
				4798	ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
				4799
				4800	mutex_unlock(&lreq->lock);
				4801	up_read(&osdc->lock);
				4802	return ret;
				4803	}
				4804
				4805	static int decode_watcher(void *p, void end, struct ceph_watch_item *item)
				4806	{
				4807	u8 struct_v;
				4808	u32 struct_len;
				4809	int ret;
				4810
				4811	ret = ceph_start_decoding(p, end, 2, "watch_item_t",
				4812	&struct_v, &struct_len);
				4813	if (ret)
				4814	return ret;
				4815
				4816	ceph_decode_copy(p, &item->name, sizeof(item->name));
				4817	item->cookie = ceph_decode_64(p);
				4818	p += 4; / skip timeout_seconds */
				4819	if (struct_v >= 2) {
				4820	ceph_decode_copy(p, &item->addr, sizeof(item->addr));
				4821	ceph_decode_addr(&item->addr);
				4822	}
				4823
				4824	dout("%s %s%llu cookie %llu addr %s\n", __func__,
				4825	ENTITY_NAME(item->name), item->cookie,
				4826	ceph_pr_addr(&item->addr.in_addr));
				4827	return 0;
				4828	}
				4829
				4830	static int decode_watchers(void *p, void end,
				4831	struct ceph_watch_item **watchers,
				4832	u32 *num_watchers)
				4833	{
				4834	u8 struct_v;
				4835	u32 struct_len;
				4836	int i;
				4837	int ret;
				4838
				4839	ret = ceph_start_decoding(p, end, 1, "obj_list_watch_response_t",
				4840	&struct_v, &struct_len);
				4841	if (ret)
				4842	return ret;
				4843
				4844	*num_watchers = ceph_decode_32(p);
				4845	watchers = kcalloc(num_watchers, sizeof(**watchers), GFP_NOIO);
				4846	if (!*watchers)
				4847	return -ENOMEM;
				4848
				4849	for (i = 0; i < *num_watchers; i++) {
				4850	ret = decode_watcher(p, end, *watchers + i);
				4851	if (ret) {
				4852	kfree(*watchers);
				4853	return ret;
				4854	}
				4855	}
				4856
				4857	return 0;
				4858	}
				4859
				4860	/*
				4861	* On success, the caller is responsible for:
				4862	*
				4863	* kfree(watchers);
				4864	*/
				4865	int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
				4866	struct ceph_object_id *oid,
				4867	struct ceph_object_locator *oloc,
				4868	struct ceph_watch_item **watchers,
				4869	u32 *num_watchers)
				4870	{
				4871	struct ceph_osd_request *req;
				4872	struct page **pages;
				4873	int ret;
				4874
				4875	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
				4876	if (!req)
				4877	return -ENOMEM;
				4878
				4879	ceph_oid_copy(&req->r_base_oid, oid);
				4880	ceph_oloc_copy(&req->r_base_oloc, oloc);
				4881	req->r_flags = CEPH_OSD_FLAG_READ;
				4882
				4883	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
				4884	if (ret)
				4885	goto out_put_req;
				4886
				4887	pages = ceph_alloc_page_vector(1, GFP_NOIO);
				4888	if (IS_ERR(pages)) {
				4889	ret = PTR_ERR(pages);
				4890	goto out_put_req;
				4891	}
				4892
				4893	osd_req_op_init(req, 0, CEPH_OSD_OP_LIST_WATCHERS, 0);
				4894	ceph_osd_data_pages_init(osd_req_op_data(req, 0, list_watchers,
				4895	response_data),
				4896	pages, PAGE_SIZE, 0, false, true);
				4897
				4898	ceph_osdc_start_request(osdc, req, false);
				4899	ret = ceph_osdc_wait_request(osdc, req);
				4900	if (ret >= 0) {
				4901	void *p = page_address(pages[0]);
				4902	void *const end = p + req->r_ops[0].outdata_len;
				4903
				4904	ret = decode_watchers(&p, end, watchers, num_watchers);
				4905	}
				4906
				4907	out_put_req:
				4908	ceph_osdc_put_request(req);
				4909	return ret;
				4910	}
				4911	EXPORT_SYMBOL(ceph_osdc_list_watchers);
				4912
				4913	/*
				4914	* Call all pending notify callbacks - for use after a watch is
				4915	* unregistered, to make sure no more callbacks for it will be invoked
				4916	*/
				4917	void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
				4918	{
				4919	dout("%s osdc %p\n", __func__, osdc);
				4920	flush_workqueue(osdc->notify_wq);
				4921	}
				4922	EXPORT_SYMBOL(ceph_osdc_flush_notifies);
				4923
				4924	void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
				4925	{
				4926	down_read(&osdc->lock);
				4927	maybe_request_map(osdc);
				4928	up_read(&osdc->lock);
				4929	}
				4930	EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
				4931
				4932	/*
				4933	* Execute an OSD class method on an object.
				4934	*
				4935	* @flags: CEPH_OSD_FLAG_*
				4936	* @resp_len: in/out param for reply length
				4937	*/
				4938	int ceph_osdc_call(struct ceph_osd_client *osdc,
				4939	struct ceph_object_id *oid,
				4940	struct ceph_object_locator *oloc,
				4941	const char class, const char method,
				4942	unsigned int flags,
				4943	struct page *req_page, size_t req_len,
				4944	struct page resp_page, size_t resp_len)
				4945	{
				4946	struct ceph_osd_request *req;
				4947	int ret;
				4948
				4949	if (req_len > PAGE_SIZE \|\| (resp_page && *resp_len > PAGE_SIZE))
				4950	return -E2BIG;
				4951
				4952	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
				4953	if (!req)
				4954	return -ENOMEM;
				4955
				4956	ceph_oid_copy(&req->r_base_oid, oid);
				4957	ceph_oloc_copy(&req->r_base_oloc, oloc);
				4958	req->r_flags = flags;
				4959
				4960	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
				4961	if (ret)
				4962	goto out_put_req;
				4963
				4964	ret = osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method);
				4965	if (ret)
				4966	goto out_put_req;
				4967
				4968	if (req_page)
				4969	osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
				4970	0, false, false);
				4971	if (resp_page)
				4972	osd_req_op_cls_response_data_pages(req, 0, &resp_page,
				4973	*resp_len, 0, false, false);
				4974
				4975	ceph_osdc_start_request(osdc, req, false);
				4976	ret = ceph_osdc_wait_request(osdc, req);
				4977	if (ret >= 0) {
				4978	ret = req->r_ops[0].rval;
				4979	if (resp_page)
				4980	*resp_len = req->r_ops[0].outdata_len;
				4981	}
				4982
				4983	out_put_req:
				4984	ceph_osdc_put_request(req);
				4985	return ret;
				4986	}
				4987	EXPORT_SYMBOL(ceph_osdc_call);
				4988
				4989	/*
				4990	* init, shutdown
				4991	*/
				4992	int ceph_osdc_init(struct ceph_osd_client osdc, struct ceph_client client)
				4993	{
				4994	int err;
				4995
				4996	dout("init\n");
				4997	osdc->client = client;
				4998	init_rwsem(&osdc->lock);
				4999	osdc->osds = RB_ROOT;
				5000	INIT_LIST_HEAD(&osdc->osd_lru);
				5001	spin_lock_init(&osdc->osd_lru_lock);
				5002	osd_init(&osdc->homeless_osd);
				5003	osdc->homeless_osd.o_osdc = osdc;
				5004	osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
				5005	osdc->last_linger_id = CEPH_LINGER_ID_START;
				5006	osdc->linger_requests = RB_ROOT;
				5007	osdc->map_checks = RB_ROOT;
				5008	osdc->linger_map_checks = RB_ROOT;
				5009	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
				5010	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
				5011
				5012	err = -ENOMEM;
				5013	osdc->osdmap = ceph_osdmap_alloc();
				5014	if (!osdc->osdmap)
				5015	goto out;
				5016
				5017	osdc->req_mempool = mempool_create_slab_pool(10,
				5018	ceph_osd_request_cache);
				5019	if (!osdc->req_mempool)
				5020	goto out_map;
				5021
				5022	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
				5023	PAGE_SIZE, 10, true, "osd_op");
				5024	if (err < 0)
				5025	goto out_mempool;
				5026	err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
				5027	PAGE_SIZE, 10, true, "osd_op_reply");
				5028	if (err < 0)
				5029	goto out_msgpool;
				5030
				5031	err = -ENOMEM;
				5032	osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
				5033	if (!osdc->notify_wq)
				5034	goto out_msgpool_reply;
				5035
				5036	osdc->completion_wq = create_singlethread_workqueue("ceph-completion");
				5037	if (!osdc->completion_wq)
				5038	goto out_notify_wq;
				5039
				5040	schedule_delayed_work(&osdc->timeout_work,
				5041	osdc->client->options->osd_keepalive_timeout);
				5042	schedule_delayed_work(&osdc->osds_timeout_work,
				5043	round_jiffies_relative(osdc->client->options->osd_idle_ttl));
				5044
				5045	return 0;
				5046
				5047	out_notify_wq:
				5048	destroy_workqueue(osdc->notify_wq);
				5049	out_msgpool_reply:
				5050	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
				5051	out_msgpool:
				5052	ceph_msgpool_destroy(&osdc->msgpool_op);
				5053	out_mempool:
				5054	mempool_destroy(osdc->req_mempool);
				5055	out_map:
				5056	ceph_osdmap_destroy(osdc->osdmap);
				5057	out:
				5058	return err;
				5059	}
				5060
				5061	void ceph_osdc_stop(struct ceph_osd_client *osdc)
				5062	{
				5063	destroy_workqueue(osdc->completion_wq);
				5064	destroy_workqueue(osdc->notify_wq);
				5065	cancel_delayed_work_sync(&osdc->timeout_work);
				5066	cancel_delayed_work_sync(&osdc->osds_timeout_work);
				5067
				5068	down_write(&osdc->lock);
				5069	while (!RB_EMPTY_ROOT(&osdc->osds)) {
				5070	struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
				5071	struct ceph_osd, o_node);
				5072	close_osd(osd);
				5073	}
				5074	up_write(&osdc->lock);
				5075	WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1);
				5076	osd_cleanup(&osdc->homeless_osd);
				5077
				5078	WARN_ON(!list_empty(&osdc->osd_lru));
				5079	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
				5080	WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
				5081	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
				5082	WARN_ON(atomic_read(&osdc->num_requests));
				5083	WARN_ON(atomic_read(&osdc->num_homeless));
				5084
				5085	ceph_osdmap_destroy(osdc->osdmap);
				5086	mempool_destroy(osdc->req_mempool);
				5087	ceph_msgpool_destroy(&osdc->msgpool_op);
				5088	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
				5089	}
				5090
				5091	/*
				5092	* Read some contiguous pages. If we cross a stripe boundary, shorten
				5093	* *plen. Return number of bytes read, or error.
				5094	*/
				5095	int ceph_osdc_readpages(struct ceph_osd_client *osdc,
				5096	struct ceph_vino vino, struct ceph_file_layout *layout,
				5097	u64 off, u64 *plen,
				5098	u32 truncate_seq, u64 truncate_size,
				5099	struct page **pages, int num_pages, int page_align)
				5100	{
				5101	struct ceph_osd_request *req;
				5102	int rc = 0;
				5103
				5104	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
				5105	vino.snap, off, *plen);
				5106	req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
				5107	CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				5108	NULL, truncate_seq, truncate_size,
				5109	false);
				5110	if (IS_ERR(req))
				5111	return PTR_ERR(req);
				5112
				5113	/* it may be a short read due to an object boundary */
				5114	osd_req_op_extent_osd_data_pages(req, 0,
				5115	pages, *plen, page_align, false, false);
				5116
				5117	dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
				5118	off, plen, plen, page_align);
				5119
				5120	rc = ceph_osdc_start_request(osdc, req, false);
				5121	if (!rc)
				5122	rc = ceph_osdc_wait_request(osdc, req);
				5123
				5124	ceph_osdc_put_request(req);
				5125	dout("readpages result %d\n", rc);
				5126	return rc;
				5127	}
				5128	EXPORT_SYMBOL(ceph_osdc_readpages);
				5129
				5130	/*
				5131	* do a synchronous write on N pages
				5132	*/
				5133	int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
				5134	struct ceph_file_layout *layout,
				5135	struct ceph_snap_context *snapc,
				5136	u64 off, u64 len,
				5137	u32 truncate_seq, u64 truncate_size,
				5138	struct timespec64 *mtime,
				5139	struct page **pages, int num_pages)
				5140	{
				5141	struct ceph_osd_request *req;
				5142	int rc = 0;
				5143	int page_align = off & ~PAGE_MASK;
				5144
				5145	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
				5146	CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
				5147	snapc, truncate_seq, truncate_size,
				5148	true);
				5149	if (IS_ERR(req))
				5150	return PTR_ERR(req);
				5151
				5152	/* it may be a short write due to an object boundary */
				5153	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
				5154	false, false);
				5155	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
				5156
				5157	req->r_mtime = *mtime;
				5158	rc = ceph_osdc_start_request(osdc, req, true);
				5159	if (!rc)
				5160	rc = ceph_osdc_wait_request(osdc, req);
				5161
				5162	ceph_osdc_put_request(req);
				5163	if (rc == 0)
				5164	rc = len;
				5165	dout("writepages result %d\n", rc);
				5166	return rc;
				5167	}
				5168	EXPORT_SYMBOL(ceph_osdc_writepages);
				5169
				5170	int __init ceph_osdc_setup(void)
				5171	{
				5172	size_t size = sizeof(struct ceph_osd_request) +
				5173	CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
				5174
				5175	BUG_ON(ceph_osd_request_cache);
				5176	ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
				5177	0, 0, NULL);
				5178
				5179	return ceph_osd_request_cache ? 0 : -ENOMEM;
				5180	}
				5181
				5182	void ceph_osdc_cleanup(void)
				5183	{
				5184	BUG_ON(!ceph_osd_request_cache);
				5185	kmem_cache_destroy(ceph_osd_request_cache);
				5186	ceph_osd_request_cache = NULL;
				5187	}
				5188
				5189	/*
				5190	* handle incoming message
				5191	*/
				5192	static void dispatch(struct ceph_connection con, struct ceph_msg msg)
				5193	{
				5194	struct ceph_osd *osd = con->private;
				5195	struct ceph_osd_client *osdc = osd->o_osdc;
				5196	int type = le16_to_cpu(msg->hdr.type);
				5197
				5198	switch (type) {
				5199	case CEPH_MSG_OSD_MAP:
				5200	ceph_osdc_handle_map(osdc, msg);
				5201	break;
				5202	case CEPH_MSG_OSD_OPREPLY:
				5203	handle_reply(osd, msg);
				5204	break;
				5205	case CEPH_MSG_OSD_BACKOFF:
				5206	handle_backoff(osd, msg);
				5207	break;
				5208	case CEPH_MSG_WATCH_NOTIFY:
				5209	handle_watch_notify(osdc, msg);
				5210	break;
				5211
				5212	default:
				5213	pr_err("received unknown message type %d %s\n", type,
				5214	ceph_msg_type_name(type));
				5215	}
				5216
				5217	ceph_msg_put(msg);
				5218	}
				5219
				5220	/*
				5221	* Lookup and return message for incoming reply. Don't try to do
				5222	* anything about a larger than preallocated data portion of the
				5223	* message at the moment - for now, just skip the message.
				5224	*/
				5225	static struct ceph_msg get_reply(struct ceph_connection con,
				5226	struct ceph_msg_header *hdr,
				5227	int *skip)
				5228	{
				5229	struct ceph_osd *osd = con->private;
				5230	struct ceph_osd_client *osdc = osd->o_osdc;
				5231	struct ceph_msg *m = NULL;
				5232	struct ceph_osd_request *req;
				5233	int front_len = le32_to_cpu(hdr->front_len);
				5234	int data_len = le32_to_cpu(hdr->data_len);
				5235	u64 tid = le64_to_cpu(hdr->tid);
				5236
				5237	down_read(&osdc->lock);
				5238	if (!osd_registered(osd)) {
				5239	dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
				5240	*skip = 1;
				5241	goto out_unlock_osdc;
				5242	}
				5243	WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
				5244
				5245	mutex_lock(&osd->lock);
				5246	req = lookup_request(&osd->o_requests, tid);
				5247	if (!req) {
				5248	dout("%s osd%d tid %llu unknown, skipping\n", __func__,
				5249	osd->o_osd, tid);
				5250	*skip = 1;
				5251	goto out_unlock_session;
				5252	}
				5253
				5254	ceph_msg_revoke_incoming(req->r_reply);
				5255
				5256	if (front_len > req->r_reply->front_alloc_len) {
				5257	pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
				5258	__func__, osd->o_osd, req->r_tid, front_len,
				5259	req->r_reply->front_alloc_len);
				5260	m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
				5261	false);
				5262	if (!m)
				5263	goto out_unlock_session;
				5264	ceph_msg_put(req->r_reply);
				5265	req->r_reply = m;
				5266	}
				5267
				5268	if (data_len > req->r_reply->data_length) {
				5269	pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
				5270	__func__, osd->o_osd, req->r_tid, data_len,
				5271	req->r_reply->data_length);
				5272	m = NULL;
				5273	*skip = 1;
				5274	goto out_unlock_session;
				5275	}
				5276
				5277	m = ceph_msg_get(req->r_reply);
				5278	dout("get_reply tid %lld %p\n", tid, m);
				5279
				5280	out_unlock_session:
				5281	mutex_unlock(&osd->lock);
				5282	out_unlock_osdc:
				5283	up_read(&osdc->lock);
				5284	return m;
				5285	}
				5286
				5287	/*
				5288	* TODO: switch to a msg-owned pagelist
				5289	*/
				5290	static struct ceph_msg alloc_msg_with_page_vector(struct ceph_msg_header hdr)
				5291	{
				5292	struct ceph_msg *m;
				5293	int type = le16_to_cpu(hdr->type);
				5294	u32 front_len = le32_to_cpu(hdr->front_len);
				5295	u32 data_len = le32_to_cpu(hdr->data_len);
				5296
				5297	m = ceph_msg_new(type, front_len, GFP_NOIO, false);
				5298	if (!m)
				5299	return NULL;
				5300
				5301	if (data_len) {
				5302	struct page **pages;
				5303	struct ceph_osd_data osd_data;
				5304
				5305	pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
				5306	GFP_NOIO);
				5307	if (IS_ERR(pages)) {
				5308	ceph_msg_put(m);
				5309	return NULL;
				5310	}
				5311
				5312	ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
				5313	false);
				5314	ceph_osdc_msg_data_add(m, &osd_data);
				5315	}
				5316
				5317	return m;
				5318	}
				5319
				5320	static struct ceph_msg alloc_msg(struct ceph_connection con,
				5321	struct ceph_msg_header *hdr,
				5322	int *skip)
				5323	{
				5324	struct ceph_osd *osd = con->private;
				5325	int type = le16_to_cpu(hdr->type);
				5326
				5327	*skip = 0;
				5328	switch (type) {
				5329	case CEPH_MSG_OSD_MAP:
				5330	case CEPH_MSG_OSD_BACKOFF:
				5331	case CEPH_MSG_WATCH_NOTIFY:
				5332	return alloc_msg_with_page_vector(hdr);
				5333	case CEPH_MSG_OSD_OPREPLY:
				5334	return get_reply(con, hdr, skip);
				5335	default:
				5336	pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
				5337	osd->o_osd, type);
				5338	*skip = 1;
				5339	return NULL;
				5340	}
				5341	}
				5342
				5343	/*
				5344	* Wrappers to refcount containing ceph_osd struct
				5345	*/
				5346	static struct ceph_connection get_osd_con(struct ceph_connection con)
				5347	{
				5348	struct ceph_osd *osd = con->private;
				5349	if (get_osd(osd))
				5350	return con;
				5351	return NULL;
				5352	}
				5353
				5354	static void put_osd_con(struct ceph_connection *con)
				5355	{
				5356	struct ceph_osd *osd = con->private;
				5357	put_osd(osd);
				5358	}
				5359
				5360	/*
				5361	* authentication
				5362	*/
				5363	/*
				5364	* Note: returned pointer is the address of a structure that's
				5365	* managed separately. Caller must not attempt to free it.
				5366	*/
				5367	static struct ceph_auth_handshake get_authorizer(struct ceph_connection con,
				5368	int *proto, int force_new)
				5369	{
				5370	struct ceph_osd *o = con->private;
				5371	struct ceph_osd_client *osdc = o->o_osdc;
				5372	struct ceph_auth_client *ac = osdc->client->monc.auth;
				5373	struct ceph_auth_handshake *auth = &o->o_auth;
				5374
				5375	if (force_new && auth->authorizer) {
				5376	ceph_auth_destroy_authorizer(auth->authorizer);
				5377	auth->authorizer = NULL;
				5378	}
				5379	if (!auth->authorizer) {
				5380	int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
				5381	auth);
				5382	if (ret)
				5383	return ERR_PTR(ret);
				5384	} else {
				5385	int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
				5386	auth);
				5387	if (ret)
				5388	return ERR_PTR(ret);
				5389	}
				5390	*proto = ac->protocol;
				5391
				5392	return auth;
				5393	}
				5394
				5395	static int add_authorizer_challenge(struct ceph_connection *con,
				5396	void *challenge_buf, int challenge_buf_len)
				5397	{
				5398	struct ceph_osd *o = con->private;
				5399	struct ceph_osd_client *osdc = o->o_osdc;
				5400	struct ceph_auth_client *ac = osdc->client->monc.auth;
				5401
				5402	return ceph_auth_add_authorizer_challenge(ac, o->o_auth.authorizer,
				5403	challenge_buf, challenge_buf_len);
				5404	}
				5405
				5406	static int verify_authorizer_reply(struct ceph_connection *con)
				5407	{
				5408	struct ceph_osd *o = con->private;
				5409	struct ceph_osd_client *osdc = o->o_osdc;
				5410	struct ceph_auth_client *ac = osdc->client->monc.auth;
				5411
				5412	return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer);
				5413	}
				5414
				5415	static int invalidate_authorizer(struct ceph_connection *con)
				5416	{
				5417	struct ceph_osd *o = con->private;
				5418	struct ceph_osd_client *osdc = o->o_osdc;
				5419	struct ceph_auth_client *ac = osdc->client->monc.auth;
				5420
				5421	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
				5422	return ceph_monc_validate_auth(&osdc->client->monc);
				5423	}
				5424
				5425	static void osd_reencode_message(struct ceph_msg *msg)
				5426	{
				5427	int type = le16_to_cpu(msg->hdr.type);
				5428
				5429	if (type == CEPH_MSG_OSD_OP)
				5430	encode_request_finish(msg);
				5431	}
				5432
				5433	static int osd_sign_message(struct ceph_msg *msg)
				5434	{
				5435	struct ceph_osd *o = msg->con->private;
				5436	struct ceph_auth_handshake *auth = &o->o_auth;
				5437
				5438	return ceph_auth_sign_message(auth, msg);
				5439	}
				5440
				5441	static int osd_check_message_signature(struct ceph_msg *msg)
				5442	{
				5443	struct ceph_osd *o = msg->con->private;
				5444	struct ceph_auth_handshake *auth = &o->o_auth;
				5445
				5446	return ceph_auth_check_message_signature(auth, msg);
				5447	}
				5448
				5449	static const struct ceph_connection_operations osd_con_ops = {
				5450	.get = get_osd_con,
				5451	.put = put_osd_con,
				5452	.dispatch = dispatch,
				5453	.get_authorizer = get_authorizer,
				5454	.add_authorizer_challenge = add_authorizer_challenge,
				5455	.verify_authorizer_reply = verify_authorizer_reply,
				5456	.invalidate_authorizer = invalidate_authorizer,
				5457	.alloc_msg = alloc_msg,
				5458	.reencode_message = osd_reencode_message,
				5459	.sign_message = osd_sign_message,
				5460	.check_message_signature = osd_check_message_signature,
				5461	.fault = osd_fault,
				5462	};