Blame - src/kernel/linux/v4.14/drivers/nvme/target/rdma.c - T103

blob: da56cc277b71c86dc43d9ba0046b46914c0dced4 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* NVMe over Fabrics RDMA target.
				3	* Copyright (c) 2015-2016 HGST, a Western Digital Company.
				4	*
				5	* This program is free software; you can redistribute it and/or modify it
				6	* under the terms and conditions of the GNU General Public License,
				7	* version 2, as published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope it will be useful, but WITHOUT
				10	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				11	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
				12	* more details.
				13	*/
				14	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				15	#include <linux/atomic.h>
				16	#include <linux/ctype.h>
				17	#include <linux/delay.h>
				18	#include <linux/err.h>
				19	#include <linux/init.h>
				20	#include <linux/module.h>
				21	#include <linux/nvme.h>
				22	#include <linux/slab.h>
				23	#include <linux/string.h>
				24	#include <linux/wait.h>
				25	#include <linux/inet.h>
				26	#include <asm/unaligned.h>
				27
				28	#include <rdma/ib_verbs.h>
				29	#include <rdma/rdma_cm.h>
				30	#include <rdma/rw.h>
				31
				32	#include <linux/nvme-rdma.h>
				33	#include "nvmet.h"
				34
				35	/*
				36	* We allow up to a page of inline data to go with the SQE
				37	*/
				38	#define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE
				39
				40	struct nvmet_rdma_cmd {
				41	struct ib_sge sge[2];
				42	struct ib_cqe cqe;
				43	struct ib_recv_wr wr;
				44	struct scatterlist inline_sg;
				45	struct page *inline_page;
				46	struct nvme_command *nvme_cmd;
				47	struct nvmet_rdma_queue *queue;
				48	};
				49
				50	enum {
				51	NVMET_RDMA_REQ_INLINE_DATA = (1 << 0),
				52	NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1),
				53	};
				54
				55	struct nvmet_rdma_rsp {
				56	struct ib_sge send_sge;
				57	struct ib_cqe send_cqe;
				58	struct ib_send_wr send_wr;
				59
				60	struct nvmet_rdma_cmd *cmd;
				61	struct nvmet_rdma_queue *queue;
				62
				63	struct ib_cqe read_cqe;
				64	struct rdma_rw_ctx rw;
				65
				66	struct nvmet_req req;
				67
				68	bool allocated;
				69	u8 n_rdma;
				70	u32 flags;
				71	u32 invalidate_rkey;
				72
				73	struct list_head wait_list;
				74	struct list_head free_list;
				75	};
				76
				77	enum nvmet_rdma_queue_state {
				78	NVMET_RDMA_Q_CONNECTING,
				79	NVMET_RDMA_Q_LIVE,
				80	NVMET_RDMA_Q_DISCONNECTING,
				81	NVMET_RDMA_IN_DEVICE_REMOVAL,
				82	};
				83
				84	struct nvmet_rdma_queue {
				85	struct rdma_cm_id *cm_id;
				86	struct nvmet_port *port;
				87	struct ib_cq *cq;
				88	atomic_t sq_wr_avail;
				89	struct nvmet_rdma_device *dev;
				90	spinlock_t state_lock;
				91	enum nvmet_rdma_queue_state state;
				92	struct nvmet_cq nvme_cq;
				93	struct nvmet_sq nvme_sq;
				94
				95	struct nvmet_rdma_rsp *rsps;
				96	struct list_head free_rsps;
				97	spinlock_t rsps_lock;
				98	struct nvmet_rdma_cmd *cmds;
				99
				100	struct work_struct release_work;
				101	struct list_head rsp_wait_list;
				102	struct list_head rsp_wr_wait_list;
				103	spinlock_t rsp_wr_wait_lock;
				104
				105	int idx;
				106	int host_qid;
				107	int recv_queue_size;
				108	int send_queue_size;
				109
				110	struct list_head queue_list;
				111	};
				112
				113	struct nvmet_rdma_device {
				114	struct ib_device *device;
				115	struct ib_pd *pd;
				116	struct ib_srq *srq;
				117	struct nvmet_rdma_cmd *srq_cmds;
				118	size_t srq_size;
				119	struct kref ref;
				120	struct list_head entry;
				121	};
				122
				123	static bool nvmet_rdma_use_srq;
				124	module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
				125	MODULE_PARM_DESC(use_srq, "Use shared receive queue.");
				126
				127	static DEFINE_IDA(nvmet_rdma_queue_ida);
				128	static LIST_HEAD(nvmet_rdma_queue_list);
				129	static DEFINE_MUTEX(nvmet_rdma_queue_mutex);
				130
				131	static LIST_HEAD(device_list);
				132	static DEFINE_MUTEX(device_list_mutex);
				133
				134	static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
				135	static void nvmet_rdma_send_done(struct ib_cq cq, struct ib_wc wc);
				136	static void nvmet_rdma_recv_done(struct ib_cq cq, struct ib_wc wc);
				137	static void nvmet_rdma_read_data_done(struct ib_cq cq, struct ib_wc wc);
				138	static void nvmet_rdma_qp_event(struct ib_event event, void priv);
				139	static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
				140	static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
				141	struct nvmet_rdma_rsp *r);
				142	static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
				143	struct nvmet_rdma_rsp *r);
				144
				145	static struct nvmet_fabrics_ops nvmet_rdma_ops;
				146
				147	/* XXX: really should move to a generic header sooner or later.. */
				148	static inline u32 get_unaligned_le24(const u8 *p)
				149	{
				150	return (u32)p[0] \| (u32)p[1] << 8 \| (u32)p[2] << 16;
				151	}
				152
				153	static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
				154	{
				155	return nvme_is_write(rsp->req.cmd) &&
				156	rsp->req.data_len &&
				157	!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
				158	}
				159
				160	static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
				161	{
				162	return !nvme_is_write(rsp->req.cmd) &&
				163	rsp->req.data_len &&
				164	!rsp->req.rsp->status &&
				165	!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
				166	}
				167
				168	static inline struct nvmet_rdma_rsp *
				169	nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
				170	{
				171	struct nvmet_rdma_rsp *rsp;
				172	unsigned long flags;
				173
				174	spin_lock_irqsave(&queue->rsps_lock, flags);
				175	rsp = list_first_entry_or_null(&queue->free_rsps,
				176	struct nvmet_rdma_rsp, free_list);
				177	if (likely(rsp))
				178	list_del(&rsp->free_list);
				179	spin_unlock_irqrestore(&queue->rsps_lock, flags);
				180
				181	if (unlikely(!rsp)) {
				182	int ret;
				183
				184	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
				185	if (unlikely(!rsp))
				186	return NULL;
				187	ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
				188	if (unlikely(ret)) {
				189	kfree(rsp);
				190	return NULL;
				191	}
				192
				193	rsp->allocated = true;
				194	}
				195
				196	return rsp;
				197	}
				198
				199	static inline void
				200	nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
				201	{
				202	unsigned long flags;
				203
				204	if (unlikely(rsp->allocated)) {
				205	nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
				206	kfree(rsp);
				207	return;
				208	}
				209
				210	spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
				211	list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
				212	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
				213	}
				214
				215	static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
				216	{
				217	struct scatterlist *sg;
				218	int count;
				219
				220	if (!sgl \|\| !nents)
				221	return;
				222
				223	for_each_sg(sgl, sg, nents, count)
				224	__free_page(sg_page(sg));
				225	kfree(sgl);
				226	}
				227
				228	static int nvmet_rdma_alloc_sgl(struct scatterlist *sgl, unsigned int nents,
				229	u32 length)
				230	{
				231	struct scatterlist *sg;
				232	struct page *page;
				233	unsigned int nent;
				234	int i = 0;
				235
				236	nent = DIV_ROUND_UP(length, PAGE_SIZE);
				237	sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
				238	if (!sg)
				239	goto out;
				240
				241	sg_init_table(sg, nent);
				242
				243	while (length) {
				244	u32 page_len = min_t(u32, length, PAGE_SIZE);
				245
				246	page = alloc_page(GFP_KERNEL);
				247	if (!page)
				248	goto out_free_pages;
				249
				250	sg_set_page(&sg[i], page, page_len, 0);
				251	length -= page_len;
				252	i++;
				253	}
				254	*sgl = sg;
				255	*nents = nent;
				256	return 0;
				257
				258	out_free_pages:
				259	while (i > 0) {
				260	i--;
				261	__free_page(sg_page(&sg[i]));
				262	}
				263	kfree(sg);
				264	out:
				265	return NVME_SC_INTERNAL;
				266	}
				267
				268	static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
				269	struct nvmet_rdma_cmd *c, bool admin)
				270	{
				271	/* NVMe command / RDMA RECV */
				272	c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
				273	if (!c->nvme_cmd)
				274	goto out;
				275
				276	c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
				277	sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
				278	if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
				279	goto out_free_cmd;
				280
				281	c->sge[0].length = sizeof(*c->nvme_cmd);
				282	c->sge[0].lkey = ndev->pd->local_dma_lkey;
				283
				284	if (!admin) {
				285	c->inline_page = alloc_pages(GFP_KERNEL,
				286	get_order(NVMET_RDMA_INLINE_DATA_SIZE));
				287	if (!c->inline_page)
				288	goto out_unmap_cmd;
				289	c->sge[1].addr = ib_dma_map_page(ndev->device,
				290	c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE,
				291	DMA_FROM_DEVICE);
				292	if (ib_dma_mapping_error(ndev->device, c->sge[1].addr))
				293	goto out_free_inline_page;
				294	c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE;
				295	c->sge[1].lkey = ndev->pd->local_dma_lkey;
				296	}
				297
				298	c->cqe.done = nvmet_rdma_recv_done;
				299
				300	c->wr.wr_cqe = &c->cqe;
				301	c->wr.sg_list = c->sge;
				302	c->wr.num_sge = admin ? 1 : 2;
				303
				304	return 0;
				305
				306	out_free_inline_page:
				307	if (!admin) {
				308	__free_pages(c->inline_page,
				309	get_order(NVMET_RDMA_INLINE_DATA_SIZE));
				310	}
				311	out_unmap_cmd:
				312	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
				313	sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
				314	out_free_cmd:
				315	kfree(c->nvme_cmd);
				316
				317	out:
				318	return -ENOMEM;
				319	}
				320
				321	static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
				322	struct nvmet_rdma_cmd *c, bool admin)
				323	{
				324	if (!admin) {
				325	ib_dma_unmap_page(ndev->device, c->sge[1].addr,
				326	NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE);
				327	__free_pages(c->inline_page,
				328	get_order(NVMET_RDMA_INLINE_DATA_SIZE));
				329	}
				330	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
				331	sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
				332	kfree(c->nvme_cmd);
				333	}
				334
				335	static struct nvmet_rdma_cmd *
				336	nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
				337	int nr_cmds, bool admin)
				338	{
				339	struct nvmet_rdma_cmd *cmds;
				340	int ret = -EINVAL, i;
				341
				342	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
				343	if (!cmds)
				344	goto out;
				345
				346	for (i = 0; i < nr_cmds; i++) {
				347	ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
				348	if (ret)
				349	goto out_free;
				350	}
				351
				352	return cmds;
				353
				354	out_free:
				355	while (--i >= 0)
				356	nvmet_rdma_free_cmd(ndev, cmds + i, admin);
				357	kfree(cmds);
				358	out:
				359	return ERR_PTR(ret);
				360	}
				361
				362	static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
				363	struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
				364	{
				365	int i;
				366
				367	for (i = 0; i < nr_cmds; i++)
				368	nvmet_rdma_free_cmd(ndev, cmds + i, admin);
				369	kfree(cmds);
				370	}
				371
				372	static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
				373	struct nvmet_rdma_rsp *r)
				374	{
				375	/* NVMe CQE / RDMA SEND */
				376	r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
				377	if (!r->req.rsp)
				378	goto out;
				379
				380	r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
				381	sizeof(*r->req.rsp), DMA_TO_DEVICE);
				382	if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
				383	goto out_free_rsp;
				384
				385	r->send_sge.length = sizeof(*r->req.rsp);
				386	r->send_sge.lkey = ndev->pd->local_dma_lkey;
				387
				388	r->send_cqe.done = nvmet_rdma_send_done;
				389
				390	r->send_wr.wr_cqe = &r->send_cqe;
				391	r->send_wr.sg_list = &r->send_sge;
				392	r->send_wr.num_sge = 1;
				393	r->send_wr.send_flags = IB_SEND_SIGNALED;
				394
				395	/* Data In / RDMA READ */
				396	r->read_cqe.done = nvmet_rdma_read_data_done;
				397	return 0;
				398
				399	out_free_rsp:
				400	kfree(r->req.rsp);
				401	out:
				402	return -ENOMEM;
				403	}
				404
				405	static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
				406	struct nvmet_rdma_rsp *r)
				407	{
				408	ib_dma_unmap_single(ndev->device, r->send_sge.addr,
				409	sizeof(*r->req.rsp), DMA_TO_DEVICE);
				410	kfree(r->req.rsp);
				411	}
				412
				413	static int
				414	nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
				415	{
				416	struct nvmet_rdma_device *ndev = queue->dev;
				417	int nr_rsps = queue->recv_queue_size * 2;
				418	int ret = -EINVAL, i;
				419
				420	queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
				421	GFP_KERNEL);
				422	if (!queue->rsps)
				423	goto out;
				424
				425	for (i = 0; i < nr_rsps; i++) {
				426	struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
				427
				428	ret = nvmet_rdma_alloc_rsp(ndev, rsp);
				429	if (ret)
				430	goto out_free;
				431
				432	list_add_tail(&rsp->free_list, &queue->free_rsps);
				433	}
				434
				435	return 0;
				436
				437	out_free:
				438	while (--i >= 0) {
				439	struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
				440
				441	list_del(&rsp->free_list);
				442	nvmet_rdma_free_rsp(ndev, rsp);
				443	}
				444	kfree(queue->rsps);
				445	out:
				446	return ret;
				447	}
				448
				449	static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
				450	{
				451	struct nvmet_rdma_device *ndev = queue->dev;
				452	int i, nr_rsps = queue->recv_queue_size * 2;
				453
				454	for (i = 0; i < nr_rsps; i++) {
				455	struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
				456
				457	list_del(&rsp->free_list);
				458	nvmet_rdma_free_rsp(ndev, rsp);
				459	}
				460	kfree(queue->rsps);
				461	}
				462
				463	static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
				464	struct nvmet_rdma_cmd *cmd)
				465	{
				466	struct ib_recv_wr *bad_wr;
				467
				468	ib_dma_sync_single_for_device(ndev->device,
				469	cmd->sge[0].addr, cmd->sge[0].length,
				470	DMA_FROM_DEVICE);
				471
				472	if (ndev->srq)
				473	return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr);
				474	return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr);
				475	}
				476
				477	static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
				478	{
				479	spin_lock(&queue->rsp_wr_wait_lock);
				480	while (!list_empty(&queue->rsp_wr_wait_list)) {
				481	struct nvmet_rdma_rsp *rsp;
				482	bool ret;
				483
				484	rsp = list_entry(queue->rsp_wr_wait_list.next,
				485	struct nvmet_rdma_rsp, wait_list);
				486	list_del(&rsp->wait_list);
				487
				488	spin_unlock(&queue->rsp_wr_wait_lock);
				489	ret = nvmet_rdma_execute_command(rsp);
				490	spin_lock(&queue->rsp_wr_wait_lock);
				491
				492	if (!ret) {
				493	list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
				494	break;
				495	}
				496	}
				497	spin_unlock(&queue->rsp_wr_wait_lock);
				498	}
				499
				500
				501	static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
				502	{
				503	struct nvmet_rdma_queue *queue = rsp->queue;
				504
				505	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
				506
				507	if (rsp->n_rdma) {
				508	rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
				509	queue->cm_id->port_num, rsp->req.sg,
				510	rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
				511	}
				512
				513	if (rsp->req.sg != &rsp->cmd->inline_sg)
				514	nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt);
				515
				516	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
				517	nvmet_rdma_process_wr_wait_list(queue);
				518
				519	nvmet_rdma_put_rsp(rsp);
				520	}
				521
				522	static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
				523	{
				524	if (queue->nvme_sq.ctrl) {
				525	nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
				526	} else {
				527	/*
				528	* we didn't setup the controller yet in case
				529	* of admin connect error, just disconnect and
				530	* cleanup the queue
				531	*/
				532	nvmet_rdma_queue_disconnect(queue);
				533	}
				534	}
				535
				536	static void nvmet_rdma_send_done(struct ib_cq cq, struct ib_wc wc)
				537	{
				538	struct nvmet_rdma_rsp *rsp =
				539	container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
				540	struct nvmet_rdma_queue *queue = cq->cq_context;
				541
				542	nvmet_rdma_release_rsp(rsp);
				543
				544	if (unlikely(wc->status != IB_WC_SUCCESS &&
				545	wc->status != IB_WC_WR_FLUSH_ERR)) {
				546	pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
				547	wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
				548	nvmet_rdma_error_comp(queue);
				549	}
				550	}
				551
				552	static void nvmet_rdma_queue_response(struct nvmet_req *req)
				553	{
				554	struct nvmet_rdma_rsp *rsp =
				555	container_of(req, struct nvmet_rdma_rsp, req);
				556	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
				557	struct ib_send_wr first_wr, bad_wr;
				558
				559	if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
				560	rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
				561	rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
				562	} else {
				563	rsp->send_wr.opcode = IB_WR_SEND;
				564	}
				565
				566	if (nvmet_rdma_need_data_out(rsp))
				567	first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
				568	cm_id->port_num, NULL, &rsp->send_wr);
				569	else
				570	first_wr = &rsp->send_wr;
				571
				572	nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);
				573
				574	ib_dma_sync_single_for_device(rsp->queue->dev->device,
				575	rsp->send_sge.addr, rsp->send_sge.length,
				576	DMA_TO_DEVICE);
				577
				578	if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) {
				579	pr_err("sending cmd response failed\n");
				580	nvmet_rdma_release_rsp(rsp);
				581	}
				582	}
				583
				584	static void nvmet_rdma_read_data_done(struct ib_cq cq, struct ib_wc wc)
				585	{
				586	struct nvmet_rdma_rsp *rsp =
				587	container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
				588	struct nvmet_rdma_queue *queue = cq->cq_context;
				589
				590	WARN_ON(rsp->n_rdma <= 0);
				591	atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
				592	rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
				593	queue->cm_id->port_num, rsp->req.sg,
				594	rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
				595	rsp->n_rdma = 0;
				596
				597	if (unlikely(wc->status != IB_WC_SUCCESS)) {
				598	nvmet_req_uninit(&rsp->req);
				599	nvmet_rdma_release_rsp(rsp);
				600	if (wc->status != IB_WC_WR_FLUSH_ERR) {
				601	pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
				602	wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
				603	nvmet_rdma_error_comp(queue);
				604	}
				605	return;
				606	}
				607
				608	rsp->req.execute(&rsp->req);
				609	}
				610
				611	static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
				612	u64 off)
				613	{
				614	sg_init_table(&rsp->cmd->inline_sg, 1);
				615	sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off);
				616	rsp->req.sg = &rsp->cmd->inline_sg;
				617	rsp->req.sg_cnt = 1;
				618	}
				619
				620	static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
				621	{
				622	struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
				623	u64 off = le64_to_cpu(sgl->addr);
				624	u32 len = le32_to_cpu(sgl->length);
				625
				626	if (!nvme_is_write(rsp->req.cmd))
				627	return NVME_SC_INVALID_FIELD \| NVME_SC_DNR;
				628
				629	if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) {
				630	pr_err("invalid inline data offset!\n");
				631	return NVME_SC_SGL_INVALID_OFFSET \| NVME_SC_DNR;
				632	}
				633
				634	/* no data command? */
				635	if (!len)
				636	return 0;
				637
				638	nvmet_rdma_use_inline_sg(rsp, len, off);
				639	rsp->flags \|= NVMET_RDMA_REQ_INLINE_DATA;
				640	return 0;
				641	}
				642
				643	static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
				644	struct nvme_keyed_sgl_desc *sgl, bool invalidate)
				645	{
				646	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
				647	u64 addr = le64_to_cpu(sgl->addr);
				648	u32 len = get_unaligned_le24(sgl->length);
				649	u32 key = get_unaligned_le32(sgl->key);
				650	int ret;
				651	u16 status;
				652
				653	/* no data command? */
				654	if (!len)
				655	return 0;
				656
				657	status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
				658	len);
				659	if (status)
				660	return status;
				661
				662	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
				663	rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
				664	nvmet_data_dir(&rsp->req));
				665	if (ret < 0)
				666	return NVME_SC_INTERNAL;
				667	rsp->n_rdma += ret;
				668
				669	if (invalidate) {
				670	rsp->invalidate_rkey = key;
				671	rsp->flags \|= NVMET_RDMA_REQ_INVALIDATE_RKEY;
				672	}
				673
				674	return 0;
				675	}
				676
				677	static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
				678	{
				679	struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;
				680
				681	switch (sgl->type >> 4) {
				682	case NVME_SGL_FMT_DATA_DESC:
				683	switch (sgl->type & 0xf) {
				684	case NVME_SGL_FMT_OFFSET:
				685	return nvmet_rdma_map_sgl_inline(rsp);
				686	default:
				687	pr_err("invalid SGL subtype: %#x\n", sgl->type);
				688	return NVME_SC_INVALID_FIELD \| NVME_SC_DNR;
				689	}
				690	case NVME_KEY_SGL_FMT_DATA_DESC:
				691	switch (sgl->type & 0xf) {
				692	case NVME_SGL_FMT_ADDRESS \| NVME_SGL_FMT_INVALIDATE:
				693	return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
				694	case NVME_SGL_FMT_ADDRESS:
				695	return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
				696	default:
				697	pr_err("invalid SGL subtype: %#x\n", sgl->type);
				698	return NVME_SC_INVALID_FIELD \| NVME_SC_DNR;
				699	}
				700	default:
				701	pr_err("invalid SGL type: %#x\n", sgl->type);
				702	return NVME_SC_SGL_INVALID_TYPE \| NVME_SC_DNR;
				703	}
				704	}
				705
				706	static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
				707	{
				708	struct nvmet_rdma_queue *queue = rsp->queue;
				709
				710	if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
				711	&queue->sq_wr_avail) < 0)) {
				712	pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
				713	1 + rsp->n_rdma, queue->idx,
				714	queue->nvme_sq.ctrl->cntlid);
				715	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
				716	return false;
				717	}
				718
				719	if (nvmet_rdma_need_data_in(rsp)) {
				720	if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
				721	queue->cm_id->port_num, &rsp->read_cqe, NULL))
				722	nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
				723	} else {
				724	rsp->req.execute(&rsp->req);
				725	}
				726
				727	return true;
				728	}
				729
				730	static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
				731	struct nvmet_rdma_rsp *cmd)
				732	{
				733	u16 status;
				734
				735	ib_dma_sync_single_for_cpu(queue->dev->device,
				736	cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
				737	DMA_FROM_DEVICE);
				738	ib_dma_sync_single_for_cpu(queue->dev->device,
				739	cmd->send_sge.addr, cmd->send_sge.length,
				740	DMA_TO_DEVICE);
				741
				742	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
				743	&queue->nvme_sq, &nvmet_rdma_ops))
				744	return;
				745
				746	status = nvmet_rdma_map_sgl(cmd);
				747	if (status)
				748	goto out_err;
				749
				750	if (unlikely(!nvmet_rdma_execute_command(cmd))) {
				751	spin_lock(&queue->rsp_wr_wait_lock);
				752	list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
				753	spin_unlock(&queue->rsp_wr_wait_lock);
				754	}
				755
				756	return;
				757
				758	out_err:
				759	nvmet_req_complete(&cmd->req, status);
				760	}
				761
				762	static void nvmet_rdma_recv_done(struct ib_cq cq, struct ib_wc wc)
				763	{
				764	struct nvmet_rdma_cmd *cmd =
				765	container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
				766	struct nvmet_rdma_queue *queue = cq->cq_context;
				767	struct nvmet_rdma_rsp *rsp;
				768
				769	if (unlikely(wc->status != IB_WC_SUCCESS)) {
				770	if (wc->status != IB_WC_WR_FLUSH_ERR) {
				771	pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
				772	wc->wr_cqe, ib_wc_status_msg(wc->status),
				773	wc->status);
				774	nvmet_rdma_error_comp(queue);
				775	}
				776	return;
				777	}
				778
				779	if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
				780	pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
				781	nvmet_rdma_error_comp(queue);
				782	return;
				783	}
				784
				785	cmd->queue = queue;
				786	rsp = nvmet_rdma_get_rsp(queue);
				787	if (unlikely(!rsp)) {
				788	/*
				789	* we get here only under memory pressure,
				790	* silently drop and have the host retry
				791	* as we can't even fail it.
				792	*/
				793	nvmet_rdma_post_recv(queue->dev, cmd);
				794	return;
				795	}
				796	rsp->queue = queue;
				797	rsp->cmd = cmd;
				798	rsp->flags = 0;
				799	rsp->req.cmd = cmd->nvme_cmd;
				800	rsp->req.port = queue->port;
				801	rsp->n_rdma = 0;
				802
				803	if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
				804	unsigned long flags;
				805
				806	spin_lock_irqsave(&queue->state_lock, flags);
				807	if (queue->state == NVMET_RDMA_Q_CONNECTING)
				808	list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
				809	else
				810	nvmet_rdma_put_rsp(rsp);
				811	spin_unlock_irqrestore(&queue->state_lock, flags);
				812	return;
				813	}
				814
				815	nvmet_rdma_handle_command(queue, rsp);
				816	}
				817
				818	static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
				819	{
				820	if (!ndev->srq)
				821	return;
				822
				823	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
				824	ib_destroy_srq(ndev->srq);
				825	}
				826
				827	static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
				828	{
				829	struct ib_srq_init_attr srq_attr = { NULL, };
				830	struct ib_srq *srq;
				831	size_t srq_size;
				832	int ret, i;
				833
				834	srq_size = 4095; /* XXX: tune */
				835
				836	srq_attr.attr.max_wr = srq_size;
				837	srq_attr.attr.max_sge = 2;
				838	srq_attr.attr.srq_limit = 0;
				839	srq_attr.srq_type = IB_SRQT_BASIC;
				840	srq = ib_create_srq(ndev->pd, &srq_attr);
				841	if (IS_ERR(srq)) {
				842	/*
				843	* If SRQs aren't supported we just go ahead and use normal
				844	* non-shared receive queues.
				845	*/
				846	pr_info("SRQ requested but not supported.\n");
				847	return 0;
				848	}
				849
				850	ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
				851	if (IS_ERR(ndev->srq_cmds)) {
				852	ret = PTR_ERR(ndev->srq_cmds);
				853	goto out_destroy_srq;
				854	}
				855
				856	ndev->srq = srq;
				857	ndev->srq_size = srq_size;
				858
				859	for (i = 0; i < srq_size; i++)
				860	nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
				861
				862	return 0;
				863
				864	out_destroy_srq:
				865	ib_destroy_srq(srq);
				866	return ret;
				867	}
				868
				869	static void nvmet_rdma_free_dev(struct kref *ref)
				870	{
				871	struct nvmet_rdma_device *ndev =
				872	container_of(ref, struct nvmet_rdma_device, ref);
				873
				874	mutex_lock(&device_list_mutex);
				875	list_del(&ndev->entry);
				876	mutex_unlock(&device_list_mutex);
				877
				878	nvmet_rdma_destroy_srq(ndev);
				879	ib_dealloc_pd(ndev->pd);
				880
				881	kfree(ndev);
				882	}
				883
				884	static struct nvmet_rdma_device *
				885	nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
				886	{
				887	struct nvmet_rdma_device *ndev;
				888	int ret;
				889
				890	mutex_lock(&device_list_mutex);
				891	list_for_each_entry(ndev, &device_list, entry) {
				892	if (ndev->device->node_guid == cm_id->device->node_guid &&
				893	kref_get_unless_zero(&ndev->ref))
				894	goto out_unlock;
				895	}
				896
				897	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
				898	if (!ndev)
				899	goto out_err;
				900
				901	ndev->device = cm_id->device;
				902	kref_init(&ndev->ref);
				903
				904	ndev->pd = ib_alloc_pd(ndev->device, 0);
				905	if (IS_ERR(ndev->pd))
				906	goto out_free_dev;
				907
				908	if (nvmet_rdma_use_srq) {
				909	ret = nvmet_rdma_init_srq(ndev);
				910	if (ret)
				911	goto out_free_pd;
				912	}
				913
				914	list_add(&ndev->entry, &device_list);
				915	out_unlock:
				916	mutex_unlock(&device_list_mutex);
				917	pr_debug("added %s.\n", ndev->device->name);
				918	return ndev;
				919
				920	out_free_pd:
				921	ib_dealloc_pd(ndev->pd);
				922	out_free_dev:
				923	kfree(ndev);
				924	out_err:
				925	mutex_unlock(&device_list_mutex);
				926	return NULL;
				927	}
				928
				929	static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
				930	{
				931	struct ib_qp_init_attr qp_attr;
				932	struct nvmet_rdma_device *ndev = queue->dev;
				933	int comp_vector, nr_cqe, ret, i;
				934
				935	/*
				936	* Spread the io queues across completion vectors,
				937	* but still keep all admin queues on vector 0.
				938	*/
				939	comp_vector = !queue->host_qid ? 0 :
				940	queue->idx % ndev->device->num_comp_vectors;
				941
				942	/*
				943	* Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
				944	*/
				945	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
				946
				947	queue->cq = ib_alloc_cq(ndev->device, queue,
				948	nr_cqe + 1, comp_vector,
				949	IB_POLL_WORKQUEUE);
				950	if (IS_ERR(queue->cq)) {
				951	ret = PTR_ERR(queue->cq);
				952	pr_err("failed to create CQ cqe= %d ret= %d\n",
				953	nr_cqe + 1, ret);
				954	goto out;
				955	}
				956
				957	memset(&qp_attr, 0, sizeof(qp_attr));
				958	qp_attr.qp_context = queue;
				959	qp_attr.event_handler = nvmet_rdma_qp_event;
				960	qp_attr.send_cq = queue->cq;
				961	qp_attr.recv_cq = queue->cq;
				962	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
				963	qp_attr.qp_type = IB_QPT_RC;
				964	/* +1 for drain */
				965	qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
				966	qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
				967	qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
				968	ndev->device->attrs.max_sge);
				969
				970	if (ndev->srq) {
				971	qp_attr.srq = ndev->srq;
				972	} else {
				973	/* +1 for drain */
				974	qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
				975	qp_attr.cap.max_recv_sge = 2;
				976	}
				977
				978	ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
				979	if (ret) {
				980	pr_err("failed to create_qp ret= %d\n", ret);
				981	goto err_destroy_cq;
				982	}
				983
				984	atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);
				985
				986	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
				987	__func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
				988	qp_attr.cap.max_send_wr, queue->cm_id);
				989
				990	if (!ndev->srq) {
				991	for (i = 0; i < queue->recv_queue_size; i++) {
				992	queue->cmds[i].queue = queue;
				993	nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
				994	}
				995	}
				996
				997	out:
				998	return ret;
				999
				1000	err_destroy_cq:
				1001	ib_free_cq(queue->cq);
				1002	goto out;
				1003	}
				1004
				1005	static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
				1006	{
				1007	ib_drain_qp(queue->cm_id->qp);
				1008	rdma_destroy_qp(queue->cm_id);
				1009	ib_free_cq(queue->cq);
				1010	}
				1011
				1012	static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
				1013	{
				1014	pr_info("freeing queue %d\n", queue->idx);
				1015
				1016	nvmet_sq_destroy(&queue->nvme_sq);
				1017
				1018	nvmet_rdma_destroy_queue_ib(queue);
				1019	if (!queue->dev->srq) {
				1020	nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				1021	queue->recv_queue_size,
				1022	!queue->host_qid);
				1023	}
				1024	nvmet_rdma_free_rsps(queue);
				1025	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
				1026	kfree(queue);
				1027	}
				1028
				1029	static void nvmet_rdma_release_queue_work(struct work_struct *w)
				1030	{
				1031	struct nvmet_rdma_queue *queue =
				1032	container_of(w, struct nvmet_rdma_queue, release_work);
				1033	struct rdma_cm_id *cm_id = queue->cm_id;
				1034	struct nvmet_rdma_device *dev = queue->dev;
				1035	enum nvmet_rdma_queue_state state = queue->state;
				1036
				1037	nvmet_rdma_free_queue(queue);
				1038
				1039	if (state != NVMET_RDMA_IN_DEVICE_REMOVAL)
				1040	rdma_destroy_id(cm_id);
				1041
				1042	kref_put(&dev->ref, nvmet_rdma_free_dev);
				1043	}
				1044
				1045	static int
				1046	nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
				1047	struct nvmet_rdma_queue *queue)
				1048	{
				1049	struct nvme_rdma_cm_req *req;
				1050
				1051	req = (struct nvme_rdma_cm_req *)conn->private_data;
				1052	if (!req \|\| conn->private_data_len == 0)
				1053	return NVME_RDMA_CM_INVALID_LEN;
				1054
				1055	if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
				1056	return NVME_RDMA_CM_INVALID_RECFMT;
				1057
				1058	queue->host_qid = le16_to_cpu(req->qid);
				1059
				1060	/*
				1061	* req->hsqsize corresponds to our recv queue size plus 1
				1062	* req->hrqsize corresponds to our send queue size
				1063	*/
				1064	queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
				1065	queue->send_queue_size = le16_to_cpu(req->hrqsize);
				1066
				1067	if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
				1068	return NVME_RDMA_CM_INVALID_HSQSIZE;
				1069
				1070	/* XXX: Should we enforce some kind of max for IO queues? */
				1071
				1072	return 0;
				1073	}
				1074
				1075	static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
				1076	enum nvme_rdma_cm_status status)
				1077	{
				1078	struct nvme_rdma_cm_rej rej;
				1079
				1080	pr_debug("rejecting connect request: status %d (%s)\n",
				1081	status, nvme_rdma_cm_msg(status));
				1082
				1083	rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
				1084	rej.sts = cpu_to_le16(status);
				1085
				1086	return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
				1087	}
				1088
				1089	static struct nvmet_rdma_queue *
				1090	nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
				1091	struct rdma_cm_id *cm_id,
				1092	struct rdma_cm_event *event)
				1093	{
				1094	struct nvmet_rdma_queue *queue;
				1095	int ret;
				1096
				1097	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
				1098	if (!queue) {
				1099	ret = NVME_RDMA_CM_NO_RSC;
				1100	goto out_reject;
				1101	}
				1102
				1103	ret = nvmet_sq_init(&queue->nvme_sq);
				1104	if (ret) {
				1105	ret = NVME_RDMA_CM_NO_RSC;
				1106	goto out_free_queue;
				1107	}
				1108
				1109	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
				1110	if (ret)
				1111	goto out_destroy_sq;
				1112
				1113	/*
				1114	* Schedules the actual release because calling rdma_destroy_id from
				1115	* inside a CM callback would trigger a deadlock. (great API design..)
				1116	*/
				1117	INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
				1118	queue->dev = ndev;
				1119	queue->cm_id = cm_id;
				1120
				1121	spin_lock_init(&queue->state_lock);
				1122	queue->state = NVMET_RDMA_Q_CONNECTING;
				1123	INIT_LIST_HEAD(&queue->rsp_wait_list);
				1124	INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
				1125	spin_lock_init(&queue->rsp_wr_wait_lock);
				1126	INIT_LIST_HEAD(&queue->free_rsps);
				1127	spin_lock_init(&queue->rsps_lock);
				1128	INIT_LIST_HEAD(&queue->queue_list);
				1129
				1130	queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
				1131	if (queue->idx < 0) {
				1132	ret = NVME_RDMA_CM_NO_RSC;
				1133	goto out_destroy_sq;
				1134	}
				1135
				1136	ret = nvmet_rdma_alloc_rsps(queue);
				1137	if (ret) {
				1138	ret = NVME_RDMA_CM_NO_RSC;
				1139	goto out_ida_remove;
				1140	}
				1141
				1142	if (!ndev->srq) {
				1143	queue->cmds = nvmet_rdma_alloc_cmds(ndev,
				1144	queue->recv_queue_size,
				1145	!queue->host_qid);
				1146	if (IS_ERR(queue->cmds)) {
				1147	ret = NVME_RDMA_CM_NO_RSC;
				1148	goto out_free_responses;
				1149	}
				1150	}
				1151
				1152	ret = nvmet_rdma_create_queue_ib(queue);
				1153	if (ret) {
				1154	pr_err("%s: creating RDMA queue failed (%d).\n",
				1155	__func__, ret);
				1156	ret = NVME_RDMA_CM_NO_RSC;
				1157	goto out_free_cmds;
				1158	}
				1159
				1160	return queue;
				1161
				1162	out_free_cmds:
				1163	if (!ndev->srq) {
				1164	nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				1165	queue->recv_queue_size,
				1166	!queue->host_qid);
				1167	}
				1168	out_free_responses:
				1169	nvmet_rdma_free_rsps(queue);
				1170	out_ida_remove:
				1171	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
				1172	out_destroy_sq:
				1173	nvmet_sq_destroy(&queue->nvme_sq);
				1174	out_free_queue:
				1175	kfree(queue);
				1176	out_reject:
				1177	nvmet_rdma_cm_reject(cm_id, ret);
				1178	return NULL;
				1179	}
				1180
				1181	static void nvmet_rdma_qp_event(struct ib_event event, void priv)
				1182	{
				1183	struct nvmet_rdma_queue *queue = priv;
				1184
				1185	switch (event->event) {
				1186	case IB_EVENT_COMM_EST:
				1187	rdma_notify(queue->cm_id, event->event);
				1188	break;
				1189	default:
				1190	pr_err("received IB QP event: %s (%d)\n",
				1191	ib_event_msg(event->event), event->event);
				1192	break;
				1193	}
				1194	}
				1195
				1196	static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
				1197	struct nvmet_rdma_queue *queue,
				1198	struct rdma_conn_param *p)
				1199	{
				1200	struct rdma_conn_param param = { };
				1201	struct nvme_rdma_cm_rep priv = { };
				1202	int ret = -ENOMEM;
				1203
				1204	param.rnr_retry_count = 7;
				1205	param.flow_control = 1;
				1206	param.initiator_depth = min_t(u8, p->initiator_depth,
				1207	queue->dev->device->attrs.max_qp_init_rd_atom);
				1208	param.private_data = &priv;
				1209	param.private_data_len = sizeof(priv);
				1210	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
				1211	priv.crqsize = cpu_to_le16(queue->recv_queue_size);
				1212
				1213	ret = rdma_accept(cm_id, &param);
				1214	if (ret)
				1215	pr_err("rdma_accept failed (error code = %d)\n", ret);
				1216
				1217	return ret;
				1218	}
				1219
				1220	static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
				1221	struct rdma_cm_event *event)
				1222	{
				1223	struct nvmet_rdma_device *ndev;
				1224	struct nvmet_rdma_queue *queue;
				1225	int ret = -EINVAL;
				1226
				1227	ndev = nvmet_rdma_find_get_device(cm_id);
				1228	if (!ndev) {
				1229	nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
				1230	return -ECONNREFUSED;
				1231	}
				1232
				1233	queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
				1234	if (!queue) {
				1235	ret = -ENOMEM;
				1236	goto put_device;
				1237	}
				1238	queue->port = cm_id->context;
				1239
				1240	if (queue->host_qid == 0) {
				1241	/* Let inflight controller teardown complete */
				1242	flush_scheduled_work();
				1243	}
				1244
				1245	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
				1246	if (ret)
				1247	goto release_queue;
				1248
				1249	mutex_lock(&nvmet_rdma_queue_mutex);
				1250	list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
				1251	mutex_unlock(&nvmet_rdma_queue_mutex);
				1252
				1253	return 0;
				1254
				1255	release_queue:
				1256	nvmet_rdma_free_queue(queue);
				1257	put_device:
				1258	kref_put(&ndev->ref, nvmet_rdma_free_dev);
				1259
				1260	return ret;
				1261	}
				1262
				1263	static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
				1264	{
				1265	unsigned long flags;
				1266
				1267	spin_lock_irqsave(&queue->state_lock, flags);
				1268	if (queue->state != NVMET_RDMA_Q_CONNECTING) {
				1269	pr_warn("trying to establish a connected queue\n");
				1270	goto out_unlock;
				1271	}
				1272	queue->state = NVMET_RDMA_Q_LIVE;
				1273
				1274	while (!list_empty(&queue->rsp_wait_list)) {
				1275	struct nvmet_rdma_rsp *cmd;
				1276
				1277	cmd = list_first_entry(&queue->rsp_wait_list,
				1278	struct nvmet_rdma_rsp, wait_list);
				1279	list_del(&cmd->wait_list);
				1280
				1281	spin_unlock_irqrestore(&queue->state_lock, flags);
				1282	nvmet_rdma_handle_command(queue, cmd);
				1283	spin_lock_irqsave(&queue->state_lock, flags);
				1284	}
				1285
				1286	out_unlock:
				1287	spin_unlock_irqrestore(&queue->state_lock, flags);
				1288	}
				1289
				1290	static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
				1291	{
				1292	bool disconnect = false;
				1293	unsigned long flags;
				1294
				1295	pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);
				1296
				1297	spin_lock_irqsave(&queue->state_lock, flags);
				1298	switch (queue->state) {
				1299	case NVMET_RDMA_Q_CONNECTING:
				1300	case NVMET_RDMA_Q_LIVE:
				1301	queue->state = NVMET_RDMA_Q_DISCONNECTING;
				1302	case NVMET_RDMA_IN_DEVICE_REMOVAL:
				1303	disconnect = true;
				1304	break;
				1305	case NVMET_RDMA_Q_DISCONNECTING:
				1306	break;
				1307	}
				1308	spin_unlock_irqrestore(&queue->state_lock, flags);
				1309
				1310	if (disconnect) {
				1311	rdma_disconnect(queue->cm_id);
				1312	schedule_work(&queue->release_work);
				1313	}
				1314	}
				1315
				1316	static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
				1317	{
				1318	bool disconnect = false;
				1319
				1320	mutex_lock(&nvmet_rdma_queue_mutex);
				1321	if (!list_empty(&queue->queue_list)) {
				1322	list_del_init(&queue->queue_list);
				1323	disconnect = true;
				1324	}
				1325	mutex_unlock(&nvmet_rdma_queue_mutex);
				1326
				1327	if (disconnect)
				1328	__nvmet_rdma_queue_disconnect(queue);
				1329	}
				1330
				1331	static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
				1332	struct nvmet_rdma_queue *queue)
				1333	{
				1334	WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);
				1335
				1336	mutex_lock(&nvmet_rdma_queue_mutex);
				1337	if (!list_empty(&queue->queue_list))
				1338	list_del_init(&queue->queue_list);
				1339	mutex_unlock(&nvmet_rdma_queue_mutex);
				1340
				1341	pr_err("failed to connect queue %d\n", queue->idx);
				1342	schedule_work(&queue->release_work);
				1343	}
				1344
				1345	/**
				1346	* nvme_rdma_device_removal() - Handle RDMA device removal
				1347	* @cm_id: rdma_cm id, used for nvmet port
				1348	* @queue: nvmet rdma queue (cm id qp_context)
				1349	*
				1350	* DEVICE_REMOVAL event notifies us that the RDMA device is about
				1351	* to unplug. Note that this event can be generated on a normal
				1352	* queue cm_id and/or a device bound listener cm_id (where in this
				1353	* case queue will be null).
				1354	*
				1355	* We registered an ib_client to handle device removal for queues,
				1356	* so we only need to handle the listening port cm_ids. In this case
				1357	* we nullify the priv to prevent double cm_id destruction and destroying
				1358	* the cm_id implicitely by returning a non-zero rc to the callout.
				1359	*/
				1360	static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
				1361	struct nvmet_rdma_queue *queue)
				1362	{
				1363	struct nvmet_port *port;
				1364
				1365	if (queue) {
				1366	/*
				1367	* This is a queue cm_id. we have registered
				1368	* an ib_client to handle queues removal
				1369	* so don't interfear and just return.
				1370	*/
				1371	return 0;
				1372	}
				1373
				1374	port = cm_id->context;
				1375
				1376	/*
				1377	* This is a listener cm_id. Make sure that
				1378	* future remove_port won't invoke a double
				1379	* cm_id destroy. use atomic xchg to make sure
				1380	* we don't compete with remove_port.
				1381	*/
				1382	if (xchg(&port->priv, NULL) != cm_id)
				1383	return 0;
				1384
				1385	/*
				1386	* We need to return 1 so that the core will destroy
				1387	* it's own ID. What a great API design..
				1388	*/
				1389	return 1;
				1390	}
				1391
				1392	static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
				1393	struct rdma_cm_event *event)
				1394	{
				1395	struct nvmet_rdma_queue *queue = NULL;
				1396	int ret = 0;
				1397
				1398	if (cm_id->qp)
				1399	queue = cm_id->qp->qp_context;
				1400
				1401	pr_debug("%s (%d): status %d id %p\n",
				1402	rdma_event_msg(event->event), event->event,
				1403	event->status, cm_id);
				1404
				1405	switch (event->event) {
				1406	case RDMA_CM_EVENT_CONNECT_REQUEST:
				1407	ret = nvmet_rdma_queue_connect(cm_id, event);
				1408	break;
				1409	case RDMA_CM_EVENT_ESTABLISHED:
				1410	nvmet_rdma_queue_established(queue);
				1411	break;
				1412	case RDMA_CM_EVENT_ADDR_CHANGE:
				1413	case RDMA_CM_EVENT_DISCONNECTED:
				1414	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
				1415	/*
				1416	* We might end up here when we already freed the qp
				1417	* which means queue release sequence is in progress,
				1418	* so don't get in the way...
				1419	*/
				1420	if (queue)
				1421	nvmet_rdma_queue_disconnect(queue);
				1422	break;
				1423	case RDMA_CM_EVENT_DEVICE_REMOVAL:
				1424	ret = nvmet_rdma_device_removal(cm_id, queue);
				1425	break;
				1426	case RDMA_CM_EVENT_REJECTED:
				1427	pr_debug("Connection rejected: %s\n",
				1428	rdma_reject_msg(cm_id, event->status));
				1429	/* FALLTHROUGH */
				1430	case RDMA_CM_EVENT_UNREACHABLE:
				1431	case RDMA_CM_EVENT_CONNECT_ERROR:
				1432	nvmet_rdma_queue_connect_fail(cm_id, queue);
				1433	break;
				1434	default:
				1435	pr_err("received unrecognized RDMA CM event %d\n",
				1436	event->event);
				1437	break;
				1438	}
				1439
				1440	return ret;
				1441	}
				1442
				1443	static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
				1444	{
				1445	struct nvmet_rdma_queue *queue;
				1446
				1447	restart:
				1448	mutex_lock(&nvmet_rdma_queue_mutex);
				1449	list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
				1450	if (queue->nvme_sq.ctrl == ctrl) {
				1451	list_del_init(&queue->queue_list);
				1452	mutex_unlock(&nvmet_rdma_queue_mutex);
				1453
				1454	__nvmet_rdma_queue_disconnect(queue);
				1455	goto restart;
				1456	}
				1457	}
				1458	mutex_unlock(&nvmet_rdma_queue_mutex);
				1459	}
				1460
				1461	static int nvmet_rdma_add_port(struct nvmet_port *port)
				1462	{
				1463	struct rdma_cm_id *cm_id;
				1464	struct sockaddr_storage addr = { };
				1465	__kernel_sa_family_t af;
				1466	int ret;
				1467
				1468	switch (port->disc_addr.adrfam) {
				1469	case NVMF_ADDR_FAMILY_IP4:
				1470	af = AF_INET;
				1471	break;
				1472	case NVMF_ADDR_FAMILY_IP6:
				1473	af = AF_INET6;
				1474	break;
				1475	default:
				1476	pr_err("address family %d not supported\n",
				1477	port->disc_addr.adrfam);
				1478	return -EINVAL;
				1479	}
				1480
				1481	ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
				1482	port->disc_addr.trsvcid, &addr);
				1483	if (ret) {
				1484	pr_err("malformed ip/port passed: %s:%s\n",
				1485	port->disc_addr.traddr, port->disc_addr.trsvcid);
				1486	return ret;
				1487	}
				1488
				1489	cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
				1490	RDMA_PS_TCP, IB_QPT_RC);
				1491	if (IS_ERR(cm_id)) {
				1492	pr_err("CM ID creation failed\n");
				1493	return PTR_ERR(cm_id);
				1494	}
				1495
				1496	/*
				1497	* Allow both IPv4 and IPv6 sockets to bind a single port
				1498	* at the same time.
				1499	*/
				1500	ret = rdma_set_afonly(cm_id, 1);
				1501	if (ret) {
				1502	pr_err("rdma_set_afonly failed (%d)\n", ret);
				1503	goto out_destroy_id;
				1504	}
				1505
				1506	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
				1507	if (ret) {
				1508	pr_err("binding CM ID to %pISpcs failed (%d)\n",
				1509	(struct sockaddr *)&addr, ret);
				1510	goto out_destroy_id;
				1511	}
				1512
				1513	ret = rdma_listen(cm_id, 128);
				1514	if (ret) {
				1515	pr_err("listening to %pISpcs failed (%d)\n",
				1516	(struct sockaddr *)&addr, ret);
				1517	goto out_destroy_id;
				1518	}
				1519
				1520	pr_info("enabling port %d (%pISpcs)\n",
				1521	le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
				1522	port->priv = cm_id;
				1523	return 0;
				1524
				1525	out_destroy_id:
				1526	rdma_destroy_id(cm_id);
				1527	return ret;
				1528	}
				1529
				1530	static void nvmet_rdma_remove_port(struct nvmet_port *port)
				1531	{
				1532	struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);
				1533
				1534	if (cm_id)
				1535	rdma_destroy_id(cm_id);
				1536	}
				1537
				1538	static struct nvmet_fabrics_ops nvmet_rdma_ops = {
				1539	.owner = THIS_MODULE,
				1540	.type = NVMF_TRTYPE_RDMA,
				1541	.sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE,
				1542	.msdbd = 1,
				1543	.has_keyed_sgls = 1,
				1544	.add_port = nvmet_rdma_add_port,
				1545	.remove_port = nvmet_rdma_remove_port,
				1546	.queue_response = nvmet_rdma_queue_response,
				1547	.delete_ctrl = nvmet_rdma_delete_ctrl,
				1548	};
				1549
				1550	static void nvmet_rdma_remove_one(struct ib_device ib_device, void client_data)
				1551	{
				1552	struct nvmet_rdma_queue queue, tmp;
				1553
				1554	/* Device is being removed, delete all queues using this device */
				1555	mutex_lock(&nvmet_rdma_queue_mutex);
				1556	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
				1557	queue_list) {
				1558	if (queue->dev->device != ib_device)
				1559	continue;
				1560
				1561	pr_info("Removing queue %d\n", queue->idx);
				1562	list_del_init(&queue->queue_list);
				1563	__nvmet_rdma_queue_disconnect(queue);
				1564	}
				1565	mutex_unlock(&nvmet_rdma_queue_mutex);
				1566
				1567	flush_scheduled_work();
				1568	}
				1569
				1570	static struct ib_client nvmet_rdma_ib_client = {
				1571	.name = "nvmet_rdma",
				1572	.remove = nvmet_rdma_remove_one
				1573	};
				1574
				1575	static int __init nvmet_rdma_init(void)
				1576	{
				1577	int ret;
				1578
				1579	ret = ib_register_client(&nvmet_rdma_ib_client);
				1580	if (ret)
				1581	return ret;
				1582
				1583	ret = nvmet_register_transport(&nvmet_rdma_ops);
				1584	if (ret)
				1585	goto err_ib_client;
				1586
				1587	return 0;
				1588
				1589	err_ib_client:
				1590	ib_unregister_client(&nvmet_rdma_ib_client);
				1591	return ret;
				1592	}
				1593
				1594	static void __exit nvmet_rdma_exit(void)
				1595	{
				1596	struct nvmet_rdma_queue *queue;
				1597
				1598	nvmet_unregister_transport(&nvmet_rdma_ops);
				1599
				1600	flush_scheduled_work();
				1601
				1602	mutex_lock(&nvmet_rdma_queue_mutex);
				1603	while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list,
				1604	struct nvmet_rdma_queue, queue_list))) {
				1605	list_del_init(&queue->queue_list);
				1606
				1607	mutex_unlock(&nvmet_rdma_queue_mutex);
				1608	__nvmet_rdma_queue_disconnect(queue);
				1609	mutex_lock(&nvmet_rdma_queue_mutex);
				1610	}
				1611	mutex_unlock(&nvmet_rdma_queue_mutex);
				1612
				1613	flush_scheduled_work();
				1614	ib_unregister_client(&nvmet_rdma_ib_client);
				1615	ida_destroy(&nvmet_rdma_queue_ida);
				1616	}
				1617
				1618	module_init(nvmet_rdma_init);
				1619	module_exit(nvmet_rdma_exit);
				1620
				1621	MODULE_LICENSE("GPL v2");
				1622	MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */