/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/interrupt.h>
#include <linux/pci.h>	/* for Tavor hack below */
#include <linux/slab.h>

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#ifdef RPC_DEBUG
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * internal functions
 */

/*
 * handle replies in tasklet context, using a single, global list
 * rdma tasklet function -- just turn around and call the func
 * for all replies on the list
 */

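/* Note: one tasklet, list, and lock are shared by every RPC/RDMA transport
 * in the system. Completion handlers queue replies under rpcrdma_tk_lock_g,
 * and the single tasklet drains the list, so reply callbacks run serialized.
 */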
static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);

static void
rpcrdma_run_tasklet(unsigned long data)
{
	struct rpcrdma_rep *rep;
	void (*func)(struct rpcrdma_rep *);
	unsigned long flags;

	data = data;
	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	while (!list_empty(&rpcrdma_tasklets_g)) {
		rep = list_entry(rpcrdma_tasklets_g.next,
				 struct rpcrdma_rep, rr_list);
		list_del(&rep->rr_list);
		func = rep->rr_func;
		rep->rr_func = NULL;
		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

		if (func)
			func(rep);
		else
			rpcrdma_recv_buffer_put(rep);

		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	}
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}

static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

static inline void
rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}

static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: QP error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static inline
void rpcrdma_event_process(struct ib_wc *wc)
{
	struct rpcrdma_mw *frmr;
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;

	dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
		__func__, rep, wc->status, wc->opcode, wc->byte_len);

	if (!rep) /* send or bind completion that we don't care about */
		return;

	if (IB_WC_SUCCESS != wc->status) {
		dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
			__func__, wc->opcode, wc->status);
		rep->rr_len = ~0U;
		if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
			rpcrdma_schedule_tasklet(rep);
		return;
	}

	switch (wc->opcode) {
	case IB_WC_FAST_REG_MR:
		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		frmr->r.frmr.state = FRMR_IS_VALID;
		break;
	case IB_WC_LOCAL_INV:
		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		frmr->r.frmr.state = FRMR_IS_INVALID;
		break;
	case IB_WC_RECV:
		rep->rr_len = wc->byte_len;
		ib_dma_sync_single_for_cpu(
			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
		/* Keep (only) the most recent credits, after checking validity */
		if (rep->rr_len >= 16) {
			struct rpcrdma_msg *p =
					(struct rpcrdma_msg *) rep->rr_base;
			unsigned int credits = ntohl(p->rm_credit);
			if (credits == 0) {
				dprintk("RPC: %s: server"
					" dropped credits to 0!\n", __func__);
				/* don't deadlock */
				credits = 1;
			} else if (credits > rep->rr_buffer->rb_max_requests) {
				dprintk("RPC: %s: server"
					" over-crediting: %d (%d)\n",
					__func__, credits,
					rep->rr_buffer->rb_max_requests);
				credits = rep->rr_buffer->rb_max_requests;
			}
			atomic_set(&rep->rr_buffer->rb_credits, credits);
		}
		/* fall through */
	case IB_WC_BIND_MW:
		rpcrdma_schedule_tasklet(rep);
		break;
	default:
		dprintk("RPC: %s: unexpected WC event %X\n",
			__func__, wc->opcode);
		break;
	}
}

static inline int
rpcrdma_cq_poll(struct ib_cq *cq)
{
	struct ib_wc wc;
	int rc;

	for (;;) {
		rc = ib_poll_cq(cq, 1, &wc);
		if (rc < 0) {
			dprintk("RPC: %s: ib_poll_cq failed %i\n",
				__func__, rc);
			return rc;
		}
		if (rc == 0)
			break;

		rpcrdma_event_process(&wc);
	}

	return 0;
}

/*
 * rpcrdma_cq_event_upcall
 *
 * This upcall handles recv, send, bind and unbind events.
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 *
 * Note that send events are suppressed and do not result in an upcall.
 */
static void
rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
{
	int rc;

	rc = rpcrdma_cq_poll(cq);
	if (rc)
		return;

	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
			__func__, rc);
		return;
	}

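	/* Poll once more: a completion that arrived after the first poll
	 * but before the CQ was re-armed would otherwise be missed, since
	 * ib_req_notify_cq() only signals completions that occur later.
	 */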
	rpcrdma_cq_poll(cq);
}

#ifdef RPC_DEBUG
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif

static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, &attr,
			    IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			    &iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
				"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}

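/*
 * Create an rdma_cm_id and synchronously resolve the server's address
 * and route.  Each rdma_resolve_* step completes asynchronously via
 * rpcrdma_conn_upcall(), which records the result in ri_async_rc and
 * signals ri_done; ri_async_rc is pre-set to -ETIMEDOUT so that a CM
 * request which never completes within RDMA_RESOLVE_TIMEOUT is reported
 * as a timeout rather than blocking forever.
 */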
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc, mem_priv;
	struct ib_device_attr devattr;
	struct rpcrdma_ia *ia = &xprt->rx_ia;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	/*
	 * Query the device to determine if the requested memory
	 * registration strategy is supported. If it isn't, set the
	 * strategy to a globally supported model.
	 */
	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out3;
	}

	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
	}

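	/* ri_have_dma_lkey is consulted again below (MTHCAFMR case) and by
	 * rpcrdma_register_internal(): with a device-wide DMA lkey there is
	 * no need to create a persistent DMA MR for internally allocated
	 * (kmalloc'ed) buffers.
	 */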
	switch (memreg) {
	case RPCRDMA_MEMWINDOWS:
	case RPCRDMA_MEMWINDOWS_ASYNC:
		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
			dprintk("RPC: %s: MEMWINDOWS registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		if (!ia->ri_id->device->alloc_fmr) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	case RPCRDMA_FRMR:
		/* Requires both frmr reg and local dma lkey */
		if ((devattr.device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	switch (memreg) {
	case RPCRDMA_BOUNCEBUFFERS:
	case RPCRDMA_REGISTER:
	case RPCRDMA_FRMR:
		break;
#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
#endif
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_MW_BIND;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		if (ia->ri_have_dma_lkey)
			break;
		mem_priv = IB_ACCESS_LOCAL_WRITE;
register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n\t"
				"Will continue with degraded performance\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			memreg = RPCRDMA_REGISTER;
			ia->ri_bind_mem = NULL;
		}
		break;
	default:
		printk(KERN_ERR "%s: invalid memory registration mode %d\n",
			__func__, memreg);
		rc = -EINVAL;
		goto out3;
	}
	dprintk("RPC: %s: memory registration strategy is %d\n",
		__func__, memreg);

	/* Else will do memory reg/dereg for each chunk */
	ia->ri_memreg_strategy = memreg;

	return 0;

out3:
	ib_dealloc_pd(ia->ri_pd);
	ia->ri_pd = NULL;
out2:
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}

/*
 * Clean up/close an IA.
 *   o if event handles and PD have been initialized, free them.
 *   o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr devattr;
	int rc, err;

	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		return rc;
	}

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr.max_qp_wr)
		cdata->max_requests = devattr.max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	/* send_cq and recv_cq initialized below */
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		/* Add room for frmr register and invalidate WRs.
		 * 1. FRMR reg WR for head
		 * 2. FRMR invalidate WR for head
		 * 3. FRMR reg WR for pagelist
		 * 4. FRMR invalidate WR for pagelist
		 * 5. FRMR reg WR for tail
		 * 6. FRMR invalidate WR for tail
		 * 7. The RDMA_SEND WR
		 */
		ep->rep_attr.cap.max_send_wr *= 7;
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
			cdata->max_requests = devattr.max_qp_wr / 7;
			if (!cdata->max_requests)
				return -EINVAL;
			ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Add room for mw_binds+unbinds - overkill! */
		ep->rep_attr.cap.max_send_wr++;
		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
			return -EINVAL;
		break;
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
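	/* Sends are normally posted unsignaled.  rep_cqinit seeds a countdown
	 * (see INIT_CQCOUNT/DECR_CQCOUNT); when it reaches zero,
	 * rpcrdma_ep_post() marks one send IB_SEND_SIGNALED so the provider
	 * can retire completed send WRs and free up send-queue slots.
	 */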
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
		break;
	default:
		break;
	}
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	ep->rep_ia = ia;
	init_waitqueue_head(&ep->rep_connect_wait);

	/*
	 * Create a single cq for receive dto and mw_bind (only ever
	 * care about unbind, really). Send completions are suppressed.
	 * Use single threaded tasklet upcalls to maintain ordering.
	 */
	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
				  rpcrdma_cq_async_error_upcall, NULL,
				  ep->rep_attr.cap.max_recv_wr +
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(ep->rep_cq)) {
		rc = PTR_ERR(ep->rep_cq);
		dprintk("RPC: %s: ib_create_cq failed: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = ep->rep_cq;
	ep->rep_attr.recv_cq = ep->rep_cq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
		ep->rep_remote_cma.responder_resources = 0;
	else if (devattr.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(ep->rep_cq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 *
 * The caller's error handling must be sure to not leak the endpoint
 * if this function fails.
 */
int
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	if (ia->ri_id->qp) {
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" returned %i\n", __func__, rc);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	/* padding - could be done in rpcrdma_buffer_destroy... */
	if (ep->rep_pad_mr) {
		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
		ep->rep_pad_mr = NULL;
	}

	rpcrdma_clean_cq(ep->rep_cq);
	rc = ib_destroy_cq(ep->rep_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	return rc;
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc && rc != -ENOTCONN)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" status %i\n", __func__, rc);
		rpcrdma_clean_cq(ep->rep_cq);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = PTR_ERR(id);
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_id->device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETDOWN;
			goto out;
		}
		/* END TEMP */
		rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = id;
	}

	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (rc) {
		dprintk("RPC: %s: rdma_create_qp failed %i\n",
			__func__, rc);
		goto out;
	}

/* XXX Tavor device performs badly with 2K MTU! */
if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
	struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
	if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
	    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
	     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
		struct ib_qp_attr attr = {
			.path_mtu = IB_MTU_1024
		};
		rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
	}
}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
			__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort attempt.
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
int
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_clean_cq(ep->rep_cq);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
					 ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
	return rc;
}

/*
 * Initialize buffer memory
 */
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
	char *p;
	size_t len;
	int i, rc;
	struct rpcrdma_mw *r;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);
	atomic_set(&buf->rb_credits, 1);

	/* Need to allocate:
	 *   1.  arrays for send and recv pointers
	 *   2.  arrays of struct rpcrdma_req to fill in pointers
	 *   3.  array of struct rpcrdma_rep for replies
	 *   4.  padding, if any
	 *   5.  mw's, fmr's or frmr's, if any
	 * Send/recv buffers in req/rep need to be registered
	 */

	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
	len += cdata->padding;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	default:
		break;
	}

	/* allocate 1, 4 and 5 in one shot */
	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	/*
	 * Register the zeroed pad buffer, if any.
	 */
	if (cdata->padding) {
		rc = rpcrdma_register_internal(ia, p, cdata->padding,
					       &ep->rep_pad_mr, &ep->rep_pad);
		if (rc)
			goto out;
	}
	p += cdata->padding;

	/*
	 * Allocate the fmr's, or mw's for mw_bind chunk registration.
	 * We "cycle" the mw's in order to minimize rkey reuse,
	 * and also reduce unbind-to-bind collision.
	 */
	INIT_LIST_HEAD(&buf->rb_mws);
	r = (struct rpcrdma_mw *)p;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
							       RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_mr)) {
				rc = PTR_ERR(r->r.frmr.fr_mr);
				dprintk("RPC: %s: ib_alloc_fast_reg_mr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			r->r.frmr.fr_pgl =
				ib_alloc_fast_reg_page_list(ia->ri_id->device,
							    RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_pgl)) {
				rc = PTR_ERR(r->r.frmr.fr_pgl);
				dprintk("RPC: %s: "
					"ib_alloc_fast_reg_page_list "
					"failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			static struct ib_fmr_attr fa =
				{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
				&fa);
			if (IS_ERR(r->r.fmr)) {
				rc = PTR_ERR(r->r.fmr);
				dprintk("RPC: %s: ib_alloc_fmr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Allocate one extra request's worth, for full cycling */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.mw = ib_alloc_mw(ia->ri_pd);
			if (IS_ERR(r->r.mw)) {
				rc = PTR_ERR(r->r.mw);
				dprintk("RPC: %s: ib_alloc_mw"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	default:
		break;
	}

	/*
	 * Allocate/init the request/reply buffers. Doing this
	 * using kmalloc for now -- one for each buf.
	 */
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
		/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
		/* Typical ~2400b, so rounding up saves work later */
		if (len < 4096)
			len = 4096;
		req = kmalloc(len, GFP_KERNEL);
		if (req == NULL) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(req, 0, sizeof(struct rpcrdma_req));
		buf->rb_send_bufs[i] = req;
		buf->rb_send_bufs[i]->rl_buffer = buf;

		rc = rpcrdma_register_internal(ia, req->rl_base,
				len - offsetof(struct rpcrdma_req, rl_base),
				&buf->rb_send_bufs[i]->rl_handle,
				&buf->rb_send_bufs[i]->rl_iov);
		if (rc)
			goto out;

		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);

		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
		rep = kmalloc(len, GFP_KERNEL);
		if (rep == NULL) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(rep, 0, sizeof(struct rpcrdma_rep));
		buf->rb_recv_bufs[i] = rep;
		buf->rb_recv_bufs[i]->rr_buffer = buf;
		init_waitqueue_head(&rep->rr_unbind);

		rc = rpcrdma_register_internal(ia, rep->rr_base,
				len - offsetof(struct rpcrdma_rep, rr_base),
				&buf->rb_recv_bufs[i]->rr_handle,
				&buf->rb_recv_bufs[i]->rr_iov);
		if (rc)
			goto out;

	}
	dprintk("RPC: %s: max_requests %d\n",
		__func__, buf->rb_max_requests);
	/* done */
	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}

/*
 * Unregister and destroy buffer memory. Need to deal with
 * partial initialization, so it's callable from failed create.
 * Must be called before destroying endpoint, as registrations
 * reference it.
 */
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	int rc, i;
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct rpcrdma_mw *r;

	/* clean up in reverse order from create
	 *   1.  recv mr memory (mr free, then kfree)
	 *   1a. bind mw memory
	 *   2.  send mr memory (mr free, then kfree)
	 *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
	 *   4.  arrays
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_recv_bufs[i]->rr_handle,
					&buf->rb_recv_bufs[i]->rr_iov);
			kfree(buf->rb_recv_bufs[i]);
		}
		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
			while (!list_empty(&buf->rb_mws)) {
				r = list_entry(buf->rb_mws.next,
					struct rpcrdma_mw, mw_list);
				list_del(&r->mw_list);
				switch (ia->ri_memreg_strategy) {
				case RPCRDMA_FRMR:
					rc = ib_dereg_mr(r->r.frmr.fr_mr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dereg_mr"
							" failed %i\n",
							__func__, rc);
					ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
					break;
				case RPCRDMA_MTHCAFMR:
					rc = ib_dealloc_fmr(r->r.fmr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_fmr"
							" failed %i\n",
							__func__, rc);
					break;
				case RPCRDMA_MEMWINDOWS_ASYNC:
				case RPCRDMA_MEMWINDOWS:
					rc = ib_dealloc_mw(r->r.mw);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_mw"
							" failed %i\n",
							__func__, rc);
					break;
				default:
					break;
				}
			}
			rpcrdma_deregister_internal(ia,
					buf->rb_send_bufs[i]->rl_handle,
					&buf->rb_send_bufs[i]->rl_iov);
			kfree(buf->rb_send_bufs[i]);
		}
	}

	kfree(buf->rb_pool);
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if needed) is attached to send buffer upon return.
 * Rule:
 *    rb_send_index and rb_recv_index MUST always be pointing to the
 *    *next* available buffer (non-NULL). They are incremented after
 *    removing buffers, and decremented *before* returning them.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;
	unsigned long flags;
	int i;
	struct rpcrdma_mw *r;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
	if (!list_empty(&buffers->rb_mws)) {
		i = RPCRDMA_MAX_SEGS - 1;
		do {
			r = list_entry(buffers->rb_mws.next,
					struct rpcrdma_mw, mw_list);
			list_del(&r->mw_list);
			req->rl_segments[i].mr_chunk.rl_mw = r;
		} while (--i >= 0);
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	return req;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	int i;
	unsigned long flags;

	BUG_ON(req->rl_nchunks != 0);
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
		init_waitqueue_head(&req->rl_reply->rr_unbind);
		req->rl_reply->rr_func = NULL;
		req->rl_reply = NULL;
	}
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
	case RPCRDMA_MTHCAFMR:
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/*
		 * Cycle mw's back in reverse order, and "spin" them.
		 * This delays and scrambles reuse as much as possible.
		 */
		i = 1;
		do {
			struct rpcrdma_mw **mw;
			mw = &req->rl_segments[i].mr_chunk.rl_mw;
			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
			*mw = NULL;
		} while (++i < RPCRDMA_MAX_SEGS);
		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
			      &buffers->rb_mws);
		req->rl_segments[0].mr_chunk.rl_mw = NULL;
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 * Post-increment counter/array index.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions, and when
 * aborting unbinds. Pre-decrement counter/array index.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = rep->rr_buffer;
	unsigned long flags;

	rep->rr_func = NULL;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */

int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
				struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/*
	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
	 */
	iov->addr = ib_dma_map_single(ia->ri_id->device,
			va, len, DMA_BIDIRECTIONAL);
	iov->length = len;

	if (ia->ri_have_dma_lkey) {
		*mrp = NULL;
		iov->lkey = ia->ri_dma_lkey;
		return 0;
	} else if (ia->ri_bind_mem != NULL) {
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
			IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC: %s: phys convert: 0x%llx "
		"registered 0x%llx length %d\n",
		__func__, (unsigned long long)ipb.addr,
		(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC: %s: failed with %i\n", __func__, rc);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}

int
rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
				struct ib_mr *mr, struct ib_sge *iov)
{
	int rc;

	ib_dma_unmap_single(ia->ri_id->device,
			iov->addr, iov->length, DMA_BIDIRECTIONAL);

	if (NULL == mr)
		return 0;

	rc = ib_dereg_mr(mr);
	if (rc)
		dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
	return rc;
}

/*
 * Wrappers for chunk registration, shared by read/write chunk code.
 */

static void
rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
{
	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	seg->mr_dmalen = seg->mr_len;
	if (seg->mr_page)
		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
				seg->mr_page, offset_in_page(seg->mr_offset),
				seg->mr_dmalen, seg->mr_dir);
	else
		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
				seg->mr_offset,
				seg->mr_dmalen, seg->mr_dir);
	if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
		dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
			__func__,
			(unsigned long long)seg->mr_dma,
			seg->mr_offset, seg->mr_dmalen);
	}
}

static void
rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
{
	if (seg->mr_page)
		ib_dma_unmap_page(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
	else
		ib_dma_unmap_single(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}

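/*
 * FRMR registration: DMA-map each segment, load the resulting addresses
 * into the MR's fast_reg page list, then post a FAST_REG_MR work request
 * (chained behind a LOCAL_INV if the previous registration was left
 * valid).  Completion state is tracked in frmr.state by
 * rpcrdma_event_process().
 */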
static int
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;

	u8 key;
	int len, pageoff;
	int i, rc;
	int seg_len;
	u64 pa;
	int page_no;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (page_no = i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		pa = seg->mr_dma;
		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
			seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
				page_list[page_no++] = pa;
			pa += PAGE_SIZE;
		}
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	dprintk("RPC: %s: Using frmr %p to map %d segments\n",
		__func__, seg1->mr_chunk.rl_mw, i);

	if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
		dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
			__func__,
			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
		/* Invalidate before using. */
		memset(&invalidate_wr, 0, sizeof invalidate_wr);
		invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
		invalidate_wr.next = &frmr_wr;
		invalidate_wr.opcode = IB_WR_LOCAL_INV;
		invalidate_wr.send_flags = IB_SEND_SIGNALED;
		invalidate_wr.ex.invalidate_rkey =
			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		DECR_CQCOUNT(&r_xprt->rx_ep);
		post_wr = &invalidate_wr;
	} else
		post_wr = &frmr_wr;

	/* Bump the key */
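	/* The low-order byte of the rkey is owned by the consumer; changing
	 * it gives this registration an rkey distinct from the one used by
	 * the MR's previous incarnation, so a stale rkey no longer matches.
	 */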
	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);

	/* Prepare FRMR WR */
	memset(&frmr_wr, 0, sizeof frmr_wr);
	frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
	frmr_wr.opcode = IB_WR_FAST_REG_MR;
	frmr_wr.send_flags = IB_SEND_SIGNALED;
	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
	frmr_wr.wr.fast_reg.page_list_len = page_no;
	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
	BUG_ON(frmr_wr.wr.fast_reg.length < len);
	frmr_wr.wr.fast_reg.access_flags = (writing ?
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
				IB_ACCESS_REMOTE_READ);
	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);

	if (rc) {
		dprintk("RPC: %s: failed ib_post_send for register,"
			" status %i\n", __func__, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

static int
rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, *bad_wr;
	int rc;

	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);

	memset(&invalidate_wr, 0, sizeof invalidate_wr);
	invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
	invalidate_wr.opcode = IB_WR_LOCAL_INV;
	invalidate_wr.send_flags = IB_SEND_SIGNALED;
	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
	if (rc)
		dprintk("RPC: %s: failed ib_post_send for invalidate,"
			" status %i\n", __func__, rc);
	return rc;
}

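/*
 * FMR registration: gather the DMA addresses of up to
 * RPCRDMA_MAX_DATA_SEGS segments and map them through a pre-allocated
 * FMR with ib_map_phys_fmr().  No work request is posted, so this path
 * does not consume send-queue slots.
 */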
static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
	int len, pageoff, i, rc;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		physaddrs[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
			     physaddrs, i, seg1->mr_dma);
	if (rc) {
		dprintk("RPC: %s: failed ib_map_phys_fmr "
			"%u@0x%llx+%i (%d)... status %i\n", __func__,
			len, (unsigned long long)seg1->mr_dma,
			pageoff, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

static int
rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	LIST_HEAD(l);
	int rc;

	list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
	rc = ib_unmap_fmr(&l);
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC: %s: failed ib_unmap_fmr,"
			" status %i\n", __func__, rc);
	return rc;
}

static int
rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct ib_mw_bind param;
	int rc;

	*nsegs = 1;
	rpcrdma_map_one(ia, seg, writing);
	param.mr = ia->ri_bind_mem;
	param.wr_id = 0ULL;	/* no send cookie */
	param.addr = seg->mr_dma;
	param.length = seg->mr_len;
	param.send_flags = 0;
	param.mw_access_flags = mem_priv;

	DECR_CQCOUNT(&r_xprt->rx_ep);
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	if (rc) {
		dprintk("RPC: %s: failed ib_bind_mw "
			"%u@0x%llx status %i\n",
			__func__, seg->mr_len,
			(unsigned long long)seg->mr_dma, rc);
		rpcrdma_unmap_one(ia, seg);
	} else {
		seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
		seg->mr_base = param.addr;
		seg->mr_nsegs = 1;
	}
	return rc;
}

static int
rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt, void **r)
{
	struct ib_mw_bind param;
	LIST_HEAD(l);
	int rc;

	BUG_ON(seg->mr_nsegs != 1);
	param.mr = ia->ri_bind_mem;
	param.addr = 0ULL;	/* unbind */
	param.length = 0;
	param.mw_access_flags = 0;
	if (*r) {
		param.wr_id = (u64) (unsigned long) *r;
		param.send_flags = IB_SEND_SIGNALED;
		INIT_CQCOUNT(&r_xprt->rx_ep);
	} else {
		param.wr_id = 0ULL;
		param.send_flags = 0;
		DECR_CQCOUNT(&r_xprt->rx_ep);
	}
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	rpcrdma_unmap_one(ia, seg);
	if (rc)
		dprintk("RPC: %s: failed ib_(un)bind_mw,"
			" status %i\n", __func__, rc);
	else
		*r = NULL;	/* will upcall on completion */
	return rc;
}

static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
	int len, i, rc = 0;

	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (len = 0, i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		ipb[i].addr = seg->mr_dma;
		ipb[i].size = seg->mr_len;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
			break;
	}
	seg1->mr_base = seg1->mr_dma;
	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
				ipb, i, mem_priv, &seg1->mr_base);
	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
		dprintk("RPC: %s: failed ib_reg_phys_mr "
			"%u@0x%llx (%d)... status %i\n",
			__func__, len,
			(unsigned long long)seg1->mr_dma, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

static int
rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	int rc;

	rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
	seg1->mr_chunk.rl_mr = NULL;
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC: %s: failed ib_dereg_mr,"
			" status %i\n", __func__, rc);
	return rc;
}

int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int rc = 0;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		rpcrdma_map_one(ia, seg, writing);
		seg->mr_rkey = ia->ri_bind_mem->rkey;
		seg->mr_base = seg->mr_dma;
		seg->mr_nsegs = 1;
		nsegs = 1;
		break;
#endif

	/* Registration using frmr registration */
	case RPCRDMA_FRMR:
		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Registration using fmr memory registration */
	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
		break;

	/* Registration using memory windows */
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Default registration each time */
	default:
		rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
		break;
	}
	if (rc)
		return -1;

	return nsegs;
}

int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt, void *r)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		BUG_ON(nsegs != 1);
		rpcrdma_unmap_one(ia, seg);
		rc = 0;
		break;
#endif

	case RPCRDMA_FRMR:
		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
		break;

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
		break;

	default:
		rc = rpcrdma_deregister_default_external(seg, ia);
		break;
	}
	if (r) {
		struct rpcrdma_rep *rep = r;
		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
		rep->rr_func = NULL;
		func(rep);	/* dereg done, callback now */
	}
	return nsegs;
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;	/* no send cookie */
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_id->device,
			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
			DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_id->device,
		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);

	DECR_CQCOUNT(ep);
	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}