// SPDX-License-Identifier: GPL-2.0
2/*
3 * NVMe over Fabrics TCP target.
4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5 */
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7#include <linux/module.h>
8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/err.h>
11#include <linux/nvme-tcp.h>
12#include <net/sock.h>
13#include <net/tcp.h>
14#include <linux/inet.h>
15#include <linux/llist.h>
16#include <crypto/hash.h>
17
18#include "nvmet.h"
19
20#define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE)
#define NVMET_TCP_MAXH2CDATA		0x400000 /* 4M arbitrary limit */
22
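/*
 * Receive/send budgets per nvmet_tcp_try_recv()/nvmet_tcp_try_send() pass,
 * and the overall per-invocation budget for nvmet_tcp_io_work().
 */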
23#define NVMET_TCP_RECV_BUDGET 8
24#define NVMET_TCP_SEND_BUDGET 8
25#define NVMET_TCP_IO_WORK_BUDGET 64
26
27enum nvmet_tcp_send_state {
28 NVMET_TCP_SEND_DATA_PDU,
29 NVMET_TCP_SEND_DATA,
30 NVMET_TCP_SEND_R2T,
31 NVMET_TCP_SEND_DDGST,
32 NVMET_TCP_SEND_RESPONSE
33};
34
35enum nvmet_tcp_recv_state {
36 NVMET_TCP_RECV_PDU,
37 NVMET_TCP_RECV_DATA,
38 NVMET_TCP_RECV_DDGST,
39 NVMET_TCP_RECV_ERR,
40};
41
42enum {
43 NVMET_TCP_F_INIT_FAILED = (1 << 0),
44};
45
46struct nvmet_tcp_cmd {
47 struct nvmet_tcp_queue *queue;
48 struct nvmet_req req;
49
50 struct nvme_tcp_cmd_pdu *cmd_pdu;
51 struct nvme_tcp_rsp_pdu *rsp_pdu;
52 struct nvme_tcp_data_pdu *data_pdu;
53 struct nvme_tcp_r2t_pdu *r2t_pdu;
54
55 u32 rbytes_done;
56 u32 wbytes_done;
57
58 u32 pdu_len;
59 u32 pdu_recv;
60 int sg_idx;
61 int nr_mapped;
62 struct msghdr recv_msg;
63 struct kvec *iov;
64 u32 flags;
65
66 struct list_head entry;
67 struct llist_node lentry;
68
69 /* send state */
70 u32 offset;
71 struct scatterlist *cur_sg;
72 enum nvmet_tcp_send_state state;
73
74 __le32 exp_ddgst;
75 __le32 recv_ddgst;
76};
77
78enum nvmet_tcp_queue_state {
79 NVMET_TCP_Q_CONNECTING,
80 NVMET_TCP_Q_LIVE,
81 NVMET_TCP_Q_DISCONNECTING,
82};
83
84struct nvmet_tcp_queue {
85 struct socket *sock;
86 struct nvmet_tcp_port *port;
87 struct work_struct io_work;
88 int cpu;
89 struct nvmet_cq nvme_cq;
90 struct nvmet_sq nvme_sq;
91
92 /* send state */
93 struct nvmet_tcp_cmd *cmds;
94 unsigned int nr_cmds;
95 struct list_head free_list;
96 struct llist_head resp_list;
97 struct list_head resp_send_list;
98 int send_list_len;
99 struct nvmet_tcp_cmd *snd_cmd;
100
101 /* recv state */
102 int offset;
103 int left;
104 enum nvmet_tcp_recv_state rcv_state;
105 struct nvmet_tcp_cmd *cmd;
106 union nvme_tcp_pdu pdu;
107
108 /* digest state */
109 bool hdr_digest;
110 bool data_digest;
111 struct ahash_request *snd_hash;
112 struct ahash_request *rcv_hash;
113
114 spinlock_t state_lock;
115 enum nvmet_tcp_queue_state state;
116
117 struct sockaddr_storage sockaddr;
118 struct sockaddr_storage sockaddr_peer;
119 struct work_struct release_work;
120
121 int idx;
122 struct list_head queue_list;
123
124 struct nvmet_tcp_cmd connect;
125
126 struct page_frag_cache pf_cache;
127
128 void (*data_ready)(struct sock *);
129 void (*state_change)(struct sock *);
130 void (*write_space)(struct sock *);
131};
132
133struct nvmet_tcp_port {
134 struct socket *sock;
135 struct work_struct accept_work;
136 struct nvmet_port *nport;
137 struct sockaddr_storage addr;
138 int last_cpu;
139 void (*data_ready)(struct sock *);
140};
141
142static DEFINE_IDA(nvmet_tcp_queue_ida);
143static LIST_HEAD(nvmet_tcp_queue_list);
144static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
145
146static struct workqueue_struct *nvmet_tcp_wq;
147static struct nvmet_fabrics_ops nvmet_tcp_ops;
148static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
149static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
150
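/*
 * The transfer tag (ttag) advertised in R2T PDUs is simply the command's
 * index in the queue's command array.
 */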
151static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
152 struct nvmet_tcp_cmd *cmd)
153{
154 if (unlikely(!queue->nr_cmds)) {
155 /* We didn't allocate cmds yet, send 0xffff */
156 return USHRT_MAX;
157 }
158
159 return cmd - queue->cmds;
160}
161
162static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
163{
164 return nvme_is_write(cmd->req.cmd) &&
165 cmd->rbytes_done < cmd->req.transfer_len;
166}
167
168static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
169{
170 return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
171}
172
173static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
174{
175 return !nvme_is_write(cmd->req.cmd) &&
176 cmd->req.transfer_len > 0 &&
177 !cmd->req.cqe->status;
178}
179
180static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
181{
182 return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
183 !cmd->rbytes_done;
184}
185
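/* Take a command off the free list and reset its per-request state. */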
186static inline struct nvmet_tcp_cmd *
187nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
188{
189 struct nvmet_tcp_cmd *cmd;
190
191 cmd = list_first_entry_or_null(&queue->free_list,
192 struct nvmet_tcp_cmd, entry);
193 if (!cmd)
194 return NULL;
195 list_del_init(&cmd->entry);
196
197 cmd->rbytes_done = cmd->wbytes_done = 0;
198 cmd->pdu_len = 0;
199 cmd->pdu_recv = 0;
200 cmd->iov = NULL;
201 cmd->flags = 0;
202 return cmd;
203}
204
205static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
206{
207 if (unlikely(cmd == &cmd->queue->connect))
208 return;
209
210 list_add_tail(&cmd->entry, &cmd->queue->free_list);
211}
212
213static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
214{
215 return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
216}
217
218static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
219{
220 return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
221}
222
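/*
 * Compute the CRC32C header digest over @pdu and store it immediately
 * after the header.
 */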
223static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
224 void *pdu, size_t len)
225{
226 struct scatterlist sg;
227
228 sg_init_one(&sg, pdu, len);
229 ahash_request_set_crypt(hash, &sg, pdu + len, len);
230 crypto_ahash_digest(hash);
231}
232
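/*
 * Recompute the header digest of a received PDU and compare it against
 * the digest sent by the host.
 */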
233static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
234 void *pdu, size_t len)
235{
236 struct nvme_tcp_hdr *hdr = pdu;
237 __le32 recv_digest;
238 __le32 exp_digest;
239
240 if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
241 pr_err("queue %d: header digest enabled but no header digest\n",
242 queue->idx);
243 return -EPROTO;
244 }
245
246 recv_digest = *(__le32 *)(pdu + hdr->hlen);
247 nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
248 exp_digest = *(__le32 *)(pdu + hdr->hlen);
249 if (recv_digest != exp_digest) {
250 pr_err("queue %d: header digest error: recv %#x expected %#x\n",
251 queue->idx, le32_to_cpu(recv_digest),
252 le32_to_cpu(exp_digest));
253 return -EPROTO;
254 }
255
256 return 0;
257}
258
259static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
260{
261 struct nvme_tcp_hdr *hdr = pdu;
262 u8 digest_len = nvmet_tcp_hdgst_len(queue);
263 u32 len;
264
265 len = le32_to_cpu(hdr->plen) - hdr->hlen -
266 (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
267
268 if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
269 pr_err("queue %d: data digest flag is cleared\n", queue->idx);
270 return -EPROTO;
271 }
272
273 return 0;
274}
275
276static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
277{
278 struct scatterlist *sg;
279 int i;
280
281 sg = &cmd->req.sg[cmd->sg_idx];
282
283 for (i = 0; i < cmd->nr_mapped; i++)
284 kunmap(sg_page(&sg[i]));
285}
286
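/*
 * kmap the scatterlist pages backing this PDU's payload and build cmd->iov,
 * so the data can be received directly into the command's buffers.
 */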
287static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
288{
289 struct kvec *iov = cmd->iov;
290 struct scatterlist *sg;
291 u32 length, offset, sg_offset;
292
293 length = cmd->pdu_len;
294 cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
295 offset = cmd->rbytes_done;
296 cmd->sg_idx = offset / PAGE_SIZE;
297 sg_offset = offset % PAGE_SIZE;
298 sg = &cmd->req.sg[cmd->sg_idx];
299
300 while (length) {
301 u32 iov_len = min_t(u32, length, sg->length - sg_offset);
302
303 iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
304 iov->iov_len = iov_len;
305
306 length -= iov_len;
307 sg = sg_next(sg);
308 iov++;
309 sg_offset = 0;
310 }
311
312 iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
313 cmd->nr_mapped, cmd->pdu_len);
314}
315
316static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
317{
318 queue->rcv_state = NVMET_TCP_RECV_ERR;
319 if (queue->nvme_sq.ctrl)
320 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
321 else
322 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
323}
324
325static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status)
326{
327 queue->rcv_state = NVMET_TCP_RECV_ERR;
328 if (status == -EPIPE || status == -ECONNRESET)
329 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
330 else
331 nvmet_tcp_fatal_error(queue);
332}
333
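/*
 * Parse the command's SGL descriptor, allocate the data scatterlist and,
 * for host-to-controller transfers, the kvec used to receive the payload.
 */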
334static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
335{
336 struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
337 u32 len = le32_to_cpu(sgl->length);
338
339 if (!cmd->req.data_len)
340 return 0;
341
342 if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
343 NVME_SGL_FMT_OFFSET)) {
344 if (!nvme_is_write(cmd->req.cmd))
345 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
346
347 if (len > cmd->req.port->inline_data_size)
348 return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
349 cmd->pdu_len = len;
350 }
351 cmd->req.transfer_len += len;
352
353 cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
354 if (!cmd->req.sg)
355 return NVME_SC_INTERNAL;
356 cmd->cur_sg = cmd->req.sg;
357
358 if (nvmet_tcp_has_data_in(cmd)) {
359 cmd->iov = kmalloc_array(cmd->req.sg_cnt,
360 sizeof(*cmd->iov), GFP_KERNEL);
361 if (!cmd->iov)
362 goto err;
363 }
364
365 return 0;
366err:
367 sgl_free(cmd->req.sg);
368 return NVME_SC_INTERNAL;
369}
370
371static void nvmet_tcp_ddgst(struct ahash_request *hash,
372 struct nvmet_tcp_cmd *cmd)
373{
374 ahash_request_set_crypt(hash, cmd->req.sg,
375 (void *)&cmd->exp_ddgst, cmd->req.transfer_len);
376 crypto_ahash_digest(hash);
377}
378
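/*
 * Build the C2HData PDU header used to return read data to the host;
 * header and data digests are generated here when enabled.
 */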
379static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
380{
381 struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
382 struct nvmet_tcp_queue *queue = cmd->queue;
383 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
384 u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
385
386 cmd->offset = 0;
387 cmd->state = NVMET_TCP_SEND_DATA_PDU;
388
389 pdu->hdr.type = nvme_tcp_c2h_data;
390 pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
391 NVME_TCP_F_DATA_SUCCESS : 0);
392 pdu->hdr.hlen = sizeof(*pdu);
393 pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
394 pdu->hdr.plen =
395 cpu_to_le32(pdu->hdr.hlen + hdgst +
396 cmd->req.transfer_len + ddgst);
397 pdu->command_id = cmd->req.cqe->command_id;
398 pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
399 pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
400
401 if (queue->data_digest) {
402 pdu->hdr.flags |= NVME_TCP_F_DDGST;
403 nvmet_tcp_ddgst(queue->snd_hash, cmd);
404 }
405
406 if (cmd->queue->hdr_digest) {
407 pdu->hdr.flags |= NVME_TCP_F_HDGST;
408 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
409 }
410}
411
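/*
 * Build an R2T PDU asking the host to transfer the remaining write data
 * for this command.
 */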
412static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
413{
414 struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
415 struct nvmet_tcp_queue *queue = cmd->queue;
416 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
417
418 cmd->offset = 0;
419 cmd->state = NVMET_TCP_SEND_R2T;
420
421 pdu->hdr.type = nvme_tcp_r2t;
422 pdu->hdr.flags = 0;
423 pdu->hdr.hlen = sizeof(*pdu);
424 pdu->hdr.pdo = 0;
425 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
426
427 pdu->command_id = cmd->req.cmd->common.command_id;
428 pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
429 pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
430 pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
431 if (cmd->queue->hdr_digest) {
432 pdu->hdr.flags |= NVME_TCP_F_HDGST;
433 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
434 }
435}
436
437static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
438{
439 struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
440 struct nvmet_tcp_queue *queue = cmd->queue;
441 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
442
443 cmd->offset = 0;
444 cmd->state = NVMET_TCP_SEND_RESPONSE;
445
446 pdu->hdr.type = nvme_tcp_rsp;
447 pdu->hdr.flags = 0;
448 pdu->hdr.hlen = sizeof(*pdu);
449 pdu->hdr.pdo = 0;
450 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
451 if (cmd->queue->hdr_digest) {
452 pdu->hdr.flags |= NVME_TCP_F_HDGST;
453 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
454 }
455}
456
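/*
 * Drain the lock-free resp_list (filled from the completion path) into
 * the ordered resp_send_list consumed by io_work.
 */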
457static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
458{
459 struct llist_node *node;
460
461 node = llist_del_all(&queue->resp_list);
462 if (!node)
463 return;
464
465 while (node) {
466 struct nvmet_tcp_cmd *cmd = llist_entry(node,
467 struct nvmet_tcp_cmd, lentry);
468
469 list_add(&cmd->entry, &queue->resp_send_list);
470 node = node->next;
471 queue->send_list_len++;
472 }
473}
474
475static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
476{
477 queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
478 struct nvmet_tcp_cmd, entry);
479 if (!queue->snd_cmd) {
480 nvmet_tcp_process_resp_list(queue);
481 queue->snd_cmd =
482 list_first_entry_or_null(&queue->resp_send_list,
483 struct nvmet_tcp_cmd, entry);
484 if (unlikely(!queue->snd_cmd))
485 return NULL;
486 }
487
488 list_del_init(&queue->snd_cmd->entry);
489 queue->send_list_len--;
490
491 if (nvmet_tcp_need_data_out(queue->snd_cmd))
492 nvmet_setup_c2h_data_pdu(queue->snd_cmd);
493 else if (nvmet_tcp_need_data_in(queue->snd_cmd))
494 nvmet_setup_r2t_pdu(queue->snd_cmd);
495 else
496 nvmet_setup_response_pdu(queue->snd_cmd);
497
498 return queue->snd_cmd;
499}
500
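/*
 * Completion callback from the nvmet core: queue the response on the
 * lockless list and kick the queue's io_work on its CPU.
 */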
501static void nvmet_tcp_queue_response(struct nvmet_req *req)
502{
503 struct nvmet_tcp_cmd *cmd =
504 container_of(req, struct nvmet_tcp_cmd, req);
505 struct nvmet_tcp_queue *queue = cmd->queue;
506
507 llist_add(&cmd->lentry, &queue->resp_list);
508 queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
509}
510
511static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
512{
513 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
514 int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
515 int ret;
516
517 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
518 offset_in_page(cmd->data_pdu) + cmd->offset,
519 left, MSG_DONTWAIT | MSG_MORE);
520 if (ret <= 0)
521 return ret;
522
523 cmd->offset += ret;
524 left -= ret;
525
526 if (left)
527 return -EAGAIN;
528
529 cmd->state = NVMET_TCP_SEND_DATA;
530 cmd->offset = 0;
531 return 1;
532}
533
534static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
535{
536 struct nvmet_tcp_queue *queue = cmd->queue;
537 int ret;
538
539 while (cmd->cur_sg) {
540 struct page *page = sg_page(cmd->cur_sg);
541 u32 left = cmd->cur_sg->length - cmd->offset;
542 int flags = MSG_DONTWAIT;
543
544 if ((!last_in_batch && cmd->queue->send_list_len) ||
545 cmd->wbytes_done + left < cmd->req.transfer_len ||
546 queue->data_digest || !queue->nvme_sq.sqhd_disabled)
547 flags |= MSG_MORE;
548
549 ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
550 left, flags);
551 if (ret <= 0)
552 return ret;
553
554 cmd->offset += ret;
555 cmd->wbytes_done += ret;
556
		/* Done with sg? */
558 if (cmd->offset == cmd->cur_sg->length) {
559 cmd->cur_sg = sg_next(cmd->cur_sg);
560 cmd->offset = 0;
561 }
562 }
563
564 if (queue->data_digest) {
565 cmd->state = NVMET_TCP_SEND_DDGST;
566 cmd->offset = 0;
567 } else {
568 if (queue->nvme_sq.sqhd_disabled) {
569 cmd->queue->snd_cmd = NULL;
570 nvmet_tcp_put_cmd(cmd);
571 } else {
572 nvmet_setup_response_pdu(cmd);
573 }
574 }
575
576 if (queue->nvme_sq.sqhd_disabled) {
577 kfree(cmd->iov);
578 sgl_free(cmd->req.sg);
579 }
580
581 return 1;
582
583}
584
585static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
586 bool last_in_batch)
587{
588 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
589 int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
590 int flags = MSG_DONTWAIT;
591 int ret;
592
593 if (!last_in_batch && cmd->queue->send_list_len)
594 flags |= MSG_MORE;
595 else
596 flags |= MSG_EOR;
597
598 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
599 offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
600 if (ret <= 0)
601 return ret;
602 cmd->offset += ret;
603 left -= ret;
604
605 if (left)
606 return -EAGAIN;
607
608 kfree(cmd->iov);
609 sgl_free(cmd->req.sg);
610 cmd->queue->snd_cmd = NULL;
611 nvmet_tcp_put_cmd(cmd);
612 return 1;
613}
614
615static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
616{
617 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
618 int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
619 int flags = MSG_DONTWAIT;
620 int ret;
621
622 if (!last_in_batch && cmd->queue->send_list_len)
623 flags |= MSG_MORE;
624 else
625 flags |= MSG_EOR;
626
627 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
628 offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
629 if (ret <= 0)
630 return ret;
631 cmd->offset += ret;
632 left -= ret;
633
634 if (left)
635 return -EAGAIN;
636
637 cmd->queue->snd_cmd = NULL;
638 return 1;
639}
640
641static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd)
642{
643 struct nvmet_tcp_queue *queue = cmd->queue;
644 int left = NVME_TCP_DIGEST_LENGTH - cmd->offset;
645 struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
646 struct kvec iov = {
647 .iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset,
648 .iov_len = left
649 };
650 int ret;
651
652 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
653 if (unlikely(ret <= 0))
654 return ret;
655
656 cmd->offset += ret;
657 left -= ret;
658
659 if (left)
660 return -EAGAIN;
661
662 if (queue->nvme_sq.sqhd_disabled) {
663 cmd->queue->snd_cmd = NULL;
664 nvmet_tcp_put_cmd(cmd);
665 } else {
666 nvmet_setup_response_pdu(cmd);
667 }
668 return 1;
669}
670
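/*
 * Drive the current send command through its state machine. Returns 1 if
 * progress was made, 0 if there is nothing to send (or the socket would
 * block), negative on a fatal error.
 */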
671static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
672 bool last_in_batch)
673{
674 struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
675 int ret = 0;
676
677 if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
678 cmd = nvmet_tcp_fetch_cmd(queue);
679 if (unlikely(!cmd))
680 return 0;
681 }
682
683 if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
684 ret = nvmet_try_send_data_pdu(cmd);
685 if (ret <= 0)
686 goto done_send;
687 }
688
689 if (cmd->state == NVMET_TCP_SEND_DATA) {
690 ret = nvmet_try_send_data(cmd, last_in_batch);
691 if (ret <= 0)
692 goto done_send;
693 }
694
695 if (cmd->state == NVMET_TCP_SEND_DDGST) {
696 ret = nvmet_try_send_ddgst(cmd);
697 if (ret <= 0)
698 goto done_send;
699 }
700
701 if (cmd->state == NVMET_TCP_SEND_R2T) {
702 ret = nvmet_try_send_r2t(cmd, last_in_batch);
703 if (ret <= 0)
704 goto done_send;
705 }
706
707 if (cmd->state == NVMET_TCP_SEND_RESPONSE)
708 ret = nvmet_try_send_response(cmd, last_in_batch);
709
710done_send:
711 if (ret < 0) {
712 if (ret == -EAGAIN)
713 return 0;
714 return ret;
715 }
716
717 return 1;
718}
719
720static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
721 int budget, int *sends)
722{
723 int i, ret = 0;
724
725 for (i = 0; i < budget; i++) {
726 ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
727 if (unlikely(ret < 0)) {
728 nvmet_tcp_socket_error(queue, ret);
729 goto done;
730 } else if (ret == 0) {
731 break;
732 }
733 (*sends)++;
734 }
735done:
736 return ret;
737}
738
739static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
740{
741 queue->offset = 0;
742 queue->left = sizeof(struct nvme_tcp_hdr);
743 queue->cmd = NULL;
744 queue->rcv_state = NVMET_TCP_RECV_PDU;
745}
746
747static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
748{
749 struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
750
751 ahash_request_free(queue->rcv_hash);
752 ahash_request_free(queue->snd_hash);
753 crypto_free_ahash(tfm);
754}
755
756static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
757{
758 struct crypto_ahash *tfm;
759
760 tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
761 if (IS_ERR(tfm))
762 return PTR_ERR(tfm);
763
764 queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
765 if (!queue->snd_hash)
766 goto free_tfm;
767 ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
768
769 queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
770 if (!queue->rcv_hash)
771 goto free_snd_hash;
772 ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
773
774 return 0;
775free_snd_hash:
776 ahash_request_free(queue->snd_hash);
777free_tfm:
778 crypto_free_ahash(tfm);
779 return -ENOMEM;
780}
781
782
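/*
 * Validate the host's ICReq, set up digests if requested and reply with
 * an ICResp, moving the queue to LIVE.
 */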
783static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
784{
785 struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
786 struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
787 struct msghdr msg = {};
788 struct kvec iov;
789 int ret;
790
791 if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
792 pr_err("bad nvme-tcp pdu length (%d)\n",
793 le32_to_cpu(icreq->hdr.plen));
794 nvmet_tcp_fatal_error(queue);
795 return -EPROTO;
796 }
797
798 if (icreq->pfv != NVME_TCP_PFV_1_0) {
799 pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
800 return -EPROTO;
801 }
802
803 if (icreq->hpda != 0) {
804 pr_err("queue %d: unsupported hpda %d\n", queue->idx,
805 icreq->hpda);
806 return -EPROTO;
807 }
808
809 queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
810 queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
811 if (queue->hdr_digest || queue->data_digest) {
812 ret = nvmet_tcp_alloc_crypto(queue);
813 if (ret)
814 return ret;
815 }
816
817 memset(icresp, 0, sizeof(*icresp));
818 icresp->hdr.type = nvme_tcp_icresp;
819 icresp->hdr.hlen = sizeof(*icresp);
820 icresp->hdr.pdo = 0;
821 icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
822 icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
823 icresp->maxdata = cpu_to_le32(NVMET_TCP_MAXH2CDATA);
824 icresp->cpda = 0;
825 if (queue->hdr_digest)
826 icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
827 if (queue->data_digest)
828 icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
829
830 iov.iov_base = icresp;
831 iov.iov_len = sizeof(*icresp);
832 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
833 if (ret < 0)
834 return ret; /* queue removal will cleanup */
835
836 queue->state = NVMET_TCP_Q_LIVE;
837 nvmet_prepare_receive_pdu(queue);
838 return 0;
839}
840
841static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
842 struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
843{
844 int ret;
845
846 /* recover the expected data transfer length */
847 req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
848
849 if (!nvme_is_write(cmd->req.cmd) ||
850 req->data_len > cmd->req.port->inline_data_size) {
851 nvmet_prepare_receive_pdu(queue);
852 return;
853 }
854
855 ret = nvmet_tcp_map_data(cmd);
856 if (unlikely(ret)) {
857 pr_err("queue %d: failed to map data\n", queue->idx);
858 nvmet_tcp_fatal_error(queue);
859 return;
860 }
861
862 queue->rcv_state = NVMET_TCP_RECV_DATA;
863 nvmet_tcp_map_pdu_iovec(cmd);
864 cmd->flags |= NVMET_TCP_F_INIT_FAILED;
865}
866
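/*
 * Validate an incoming H2CData PDU (ttag, offset and length) and prepare
 * the queue to receive its payload.
 */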
867static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
868{
869 struct nvme_tcp_data_pdu *data = &queue->pdu.data;
870 struct nvmet_tcp_cmd *cmd;
871 unsigned int exp_data_len;
872
873 if (likely(queue->nr_cmds)) {
874 if (unlikely(data->ttag >= queue->nr_cmds)) {
875 pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n",
876 queue->idx, data->ttag, queue->nr_cmds);
877 nvmet_tcp_fatal_error(queue);
878 return -EPROTO;
879 }
880 cmd = &queue->cmds[data->ttag];
881 } else {
882 cmd = &queue->connect;
883 }
884
885 if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
886 pr_err("ttag %u unexpected data offset %u (expected %u)\n",
887 data->ttag, le32_to_cpu(data->data_offset),
888 cmd->rbytes_done);
889 /* FIXME: use path and transport errors */
890 nvmet_tcp_fatal_error(queue);
891 return -EPROTO;
892 }
893
894 exp_data_len = le32_to_cpu(data->hdr.plen) -
895 nvmet_tcp_hdgst_len(queue) -
896 nvmet_tcp_ddgst_len(queue) -
897 sizeof(*data);
898
899 cmd->pdu_len = le32_to_cpu(data->data_length);
900 if (unlikely(cmd->pdu_len != exp_data_len ||
901 cmd->pdu_len == 0 ||
902 cmd->pdu_len > NVMET_TCP_MAXH2CDATA)) {
903 pr_err("H2CData PDU len %u is invalid\n", cmd->pdu_len);
904 /* FIXME: use proper transport errors */
905 nvmet_tcp_fatal_error(queue);
906 return -EPROTO;
907 }
908 cmd->pdu_recv = 0;
909 nvmet_tcp_map_pdu_iovec(cmd);
910 queue->cmd = cmd;
911 queue->rcv_state = NVMET_TCP_RECV_DATA;
912
913 return 0;
914}
915
916static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
917{
918 struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
919 struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
920 struct nvmet_req *req;
921 int ret;
922
923 if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
924 if (hdr->type != nvme_tcp_icreq) {
925 pr_err("unexpected pdu type (%d) before icreq\n",
926 hdr->type);
927 nvmet_tcp_fatal_error(queue);
928 return -EPROTO;
929 }
930 return nvmet_tcp_handle_icreq(queue);
931 }
932
933 if (hdr->type == nvme_tcp_h2c_data) {
934 ret = nvmet_tcp_handle_h2c_data_pdu(queue);
935 if (unlikely(ret))
936 return ret;
937 return 0;
938 }
939
940 queue->cmd = nvmet_tcp_get_cmd(queue);
941 if (unlikely(!queue->cmd)) {
942 /* This should never happen */
943 pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
944 queue->idx, queue->nr_cmds, queue->send_list_len,
945 nvme_cmd->common.opcode);
946 nvmet_tcp_fatal_error(queue);
947 return -ENOMEM;
948 }
949
950 req = &queue->cmd->req;
951 memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
952
953 if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
954 &queue->nvme_sq, &nvmet_tcp_ops))) {
955 pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
956 req->cmd, req->cmd->common.command_id,
957 req->cmd->common.opcode,
958 le32_to_cpu(req->cmd->common.dptr.sgl.length));
959
960 nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
961 return -EAGAIN;
962 }
963
964 ret = nvmet_tcp_map_data(queue->cmd);
965 if (unlikely(ret)) {
966 pr_err("queue %d: failed to map data\n", queue->idx);
967 if (nvmet_tcp_has_inline_data(queue->cmd))
968 nvmet_tcp_fatal_error(queue);
969 else
970 nvmet_req_complete(req, ret);
971 ret = -EAGAIN;
972 goto out;
973 }
974
975 if (nvmet_tcp_need_data_in(queue->cmd)) {
976 if (nvmet_tcp_has_inline_data(queue->cmd)) {
977 queue->rcv_state = NVMET_TCP_RECV_DATA;
978 nvmet_tcp_map_pdu_iovec(queue->cmd);
979 return 0;
980 }
981 /* send back R2T */
982 nvmet_tcp_queue_response(&queue->cmd->req);
983 goto out;
984 }
985
986 nvmet_req_execute(&queue->cmd->req);
987out:
988 nvmet_prepare_receive_pdu(queue);
989 return ret;
990}
991
992static const u8 nvme_tcp_pdu_sizes[] = {
993 [nvme_tcp_icreq] = sizeof(struct nvme_tcp_icreq_pdu),
994 [nvme_tcp_cmd] = sizeof(struct nvme_tcp_cmd_pdu),
995 [nvme_tcp_h2c_data] = sizeof(struct nvme_tcp_data_pdu),
996};
997
998static inline u8 nvmet_tcp_pdu_size(u8 type)
999{
1000 size_t idx = type;
1001
1002 return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
1003 nvme_tcp_pdu_sizes[idx]) ?
1004 nvme_tcp_pdu_sizes[idx] : 0;
1005}
1006
1007static inline bool nvmet_tcp_pdu_valid(u8 type)
1008{
1009 switch (type) {
1010 case nvme_tcp_icreq:
1011 case nvme_tcp_cmd:
1012 case nvme_tcp_h2c_data:
1013 /* fallthru */
1014 return true;
1015 }
1016
1017 return false;
1018}
1019
1020static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
1021{
1022 struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
1023 int len;
1024 struct kvec iov;
1025 struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1026
1027recv:
1028 iov.iov_base = (void *)&queue->pdu + queue->offset;
1029 iov.iov_len = queue->left;
1030 len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1031 iov.iov_len, msg.msg_flags);
1032 if (unlikely(len < 0))
1033 return len;
1034
1035 queue->offset += len;
1036 queue->left -= len;
1037 if (queue->left)
1038 return -EAGAIN;
1039
1040 if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
1041 u8 hdgst = nvmet_tcp_hdgst_len(queue);
1042
1043 if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
1044 pr_err("unexpected pdu type %d\n", hdr->type);
1045 nvmet_tcp_fatal_error(queue);
1046 return -EIO;
1047 }
1048
1049 if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
1050 pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
1051 return -EIO;
1052 }
1053
1054 queue->left = hdr->hlen - queue->offset + hdgst;
1055 goto recv;
1056 }
1057
1058 if (queue->hdr_digest &&
1059 nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
1060 nvmet_tcp_fatal_error(queue); /* fatal */
1061 return -EPROTO;
1062 }
1063
1064 if (queue->data_digest &&
1065 nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
1066 nvmet_tcp_fatal_error(queue); /* fatal */
1067 return -EPROTO;
1068 }
1069
1070 return nvmet_tcp_done_recv_pdu(queue);
1071}
1072
1073static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
1074{
1075 struct nvmet_tcp_queue *queue = cmd->queue;
1076
1077 nvmet_tcp_ddgst(queue->rcv_hash, cmd);
1078 queue->offset = 0;
1079 queue->left = NVME_TCP_DIGEST_LENGTH;
1080 queue->rcv_state = NVMET_TCP_RECV_DDGST;
1081}
1082
1083static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
1084{
1085 struct nvmet_tcp_cmd *cmd = queue->cmd;
1086 int ret;
1087
1088 while (msg_data_left(&cmd->recv_msg)) {
1089 ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
1090 cmd->recv_msg.msg_flags);
1091 if (ret <= 0)
1092 return ret;
1093
1094 cmd->pdu_recv += ret;
1095 cmd->rbytes_done += ret;
1096 }
1097
1098 nvmet_tcp_unmap_pdu_iovec(cmd);
1099
1100 if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1101 cmd->rbytes_done == cmd->req.transfer_len) {
1102 if (queue->data_digest) {
1103 nvmet_tcp_prep_recv_ddgst(cmd);
1104 return 0;
1105 }
1106 nvmet_req_execute(&cmd->req);
1107 }
1108
1109 nvmet_prepare_receive_pdu(queue);
1110 return 0;
1111}
1112
1113static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
1114{
1115 struct nvmet_tcp_cmd *cmd = queue->cmd;
1116 int ret;
1117 struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1118 struct kvec iov = {
1119 .iov_base = (void *)&cmd->recv_ddgst + queue->offset,
1120 .iov_len = queue->left
1121 };
1122
1123 ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1124 iov.iov_len, msg.msg_flags);
1125 if (unlikely(ret < 0))
1126 return ret;
1127
1128 queue->offset += ret;
1129 queue->left -= ret;
1130 if (queue->left)
1131 return -EAGAIN;
1132
1133 if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
1134 pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
1135 queue->idx, cmd->req.cmd->common.command_id,
1136 queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
1137 le32_to_cpu(cmd->exp_ddgst));
1138 nvmet_tcp_finish_cmd(cmd);
1139 nvmet_tcp_fatal_error(queue);
1140 ret = -EPROTO;
1141 goto out;
1142 }
1143
1144 if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1145 cmd->rbytes_done == cmd->req.transfer_len)
1146 nvmet_req_execute(&cmd->req);
1147 ret = 0;
1148out:
1149 nvmet_prepare_receive_pdu(queue);
1150 return ret;
1151}
1152
1153static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
1154{
1155 int result = 0;
1156
1157 if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
1158 return 0;
1159
1160 if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
1161 result = nvmet_tcp_try_recv_pdu(queue);
1162 if (result != 0)
1163 goto done_recv;
1164 }
1165
1166 if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
1167 result = nvmet_tcp_try_recv_data(queue);
1168 if (result != 0)
1169 goto done_recv;
1170 }
1171
1172 if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
1173 result = nvmet_tcp_try_recv_ddgst(queue);
1174 if (result != 0)
1175 goto done_recv;
1176 }
1177
1178done_recv:
1179 if (result < 0) {
1180 if (result == -EAGAIN)
1181 return 0;
1182 return result;
1183 }
1184 return 1;
1185}
1186
1187static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
1188 int budget, int *recvs)
1189{
1190 int i, ret = 0;
1191
1192 for (i = 0; i < budget; i++) {
1193 ret = nvmet_tcp_try_recv_one(queue);
1194 if (unlikely(ret < 0)) {
1195 nvmet_tcp_socket_error(queue, ret);
1196 goto done;
1197 } else if (ret == 0) {
1198 break;
1199 }
1200 (*recvs)++;
1201 }
1202done:
1203 return ret;
1204}
1205
1206static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
1207{
1208 spin_lock(&queue->state_lock);
1209 if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
1210 queue->state = NVMET_TCP_Q_DISCONNECTING;
1211 schedule_work(&queue->release_work);
1212 }
1213 spin_unlock(&queue->state_lock);
1214}
1215
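/*
 * Per-queue worker: alternate bounded receive and send passes until the
 * budget is exhausted or no further progress can be made.
 */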
1216static void nvmet_tcp_io_work(struct work_struct *w)
1217{
1218 struct nvmet_tcp_queue *queue =
1219 container_of(w, struct nvmet_tcp_queue, io_work);
1220 bool pending;
1221 int ret, ops = 0;
1222
1223 do {
1224 pending = false;
1225
1226 ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
1227 if (ret > 0)
1228 pending = true;
1229 else if (ret < 0)
1230 return;
1231
1232 ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
1233 if (ret > 0)
1234 pending = true;
1235 else if (ret < 0)
1236 return;
1237
1238 } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
1239
	/*
	 * We exhausted our budget, requeue ourselves
	 */
1243 if (pending)
1244 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1245}
1246
1247static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
1248 struct nvmet_tcp_cmd *c)
1249{
1250 u8 hdgst = nvmet_tcp_hdgst_len(queue);
1251
1252 c->queue = queue;
1253 c->req.port = queue->port->nport;
1254
1255 c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
1256 sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1257 if (!c->cmd_pdu)
1258 return -ENOMEM;
1259 c->req.cmd = &c->cmd_pdu->cmd;
1260
1261 c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
1262 sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1263 if (!c->rsp_pdu)
1264 goto out_free_cmd;
1265 c->req.cqe = &c->rsp_pdu->cqe;
1266
1267 c->data_pdu = page_frag_alloc(&queue->pf_cache,
1268 sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1269 if (!c->data_pdu)
1270 goto out_free_rsp;
1271
1272 c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
1273 sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1274 if (!c->r2t_pdu)
1275 goto out_free_data;
1276
1277 c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1278
1279 list_add_tail(&c->entry, &queue->free_list);
1280
1281 return 0;
1282out_free_data:
1283 page_frag_free(c->data_pdu);
1284out_free_rsp:
1285 page_frag_free(c->rsp_pdu);
1286out_free_cmd:
1287 page_frag_free(c->cmd_pdu);
1288 return -ENOMEM;
1289}
1290
1291static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
1292{
1293 page_frag_free(c->r2t_pdu);
1294 page_frag_free(c->data_pdu);
1295 page_frag_free(c->rsp_pdu);
1296 page_frag_free(c->cmd_pdu);
1297}
1298
1299static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
1300{
1301 struct nvmet_tcp_cmd *cmds;
1302 int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
1303
1304 cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
1305 if (!cmds)
1306 goto out;
1307
1308 for (i = 0; i < nr_cmds; i++) {
1309 ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
1310 if (ret)
1311 goto out_free;
1312 }
1313
1314 queue->cmds = cmds;
1315
1316 return 0;
1317out_free:
1318 while (--i >= 0)
1319 nvmet_tcp_free_cmd(cmds + i);
1320 kfree(cmds);
1321out:
1322 return ret;
1323}
1324
1325static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
1326{
1327 struct nvmet_tcp_cmd *cmds = queue->cmds;
1328 int i;
1329
1330 for (i = 0; i < queue->nr_cmds; i++)
1331 nvmet_tcp_free_cmd(cmds + i);
1332
1333 nvmet_tcp_free_cmd(&queue->connect);
1334 kfree(cmds);
1335}
1336
1337static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
1338{
1339 struct socket *sock = queue->sock;
1340
1341 write_lock_bh(&sock->sk->sk_callback_lock);
1342 sock->sk->sk_data_ready = queue->data_ready;
1343 sock->sk->sk_state_change = queue->state_change;
1344 sock->sk->sk_write_space = queue->write_space;
1345 sock->sk->sk_user_data = NULL;
1346 write_unlock_bh(&sock->sk->sk_callback_lock);
1347}
1348
1349static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
1350{
1351 nvmet_req_uninit(&cmd->req);
1352 nvmet_tcp_unmap_pdu_iovec(cmd);
1353 kfree(cmd->iov);
1354 sgl_free(cmd->req.sg);
1355}
1356
1357static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
1358{
1359 struct nvmet_tcp_cmd *cmd = queue->cmds;
1360 int i;
1361
1362 for (i = 0; i < queue->nr_cmds; i++, cmd++) {
1363 if (nvmet_tcp_need_data_in(cmd))
1364 nvmet_tcp_finish_cmd(cmd);
1365 }
1366
1367 if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
1368 /* failed in connect */
1369 nvmet_tcp_finish_cmd(&queue->connect);
1370 }
1371}
1372
1373static void nvmet_tcp_release_queue_work(struct work_struct *w)
1374{
1375 struct page *page;
1376 struct nvmet_tcp_queue *queue =
1377 container_of(w, struct nvmet_tcp_queue, release_work);
1378
1379 mutex_lock(&nvmet_tcp_queue_mutex);
1380 list_del_init(&queue->queue_list);
1381 mutex_unlock(&nvmet_tcp_queue_mutex);
1382
1383 nvmet_tcp_restore_socket_callbacks(queue);
1384 flush_work(&queue->io_work);
1385
1386 nvmet_tcp_uninit_data_in_cmds(queue);
1387 nvmet_sq_destroy(&queue->nvme_sq);
1388 cancel_work_sync(&queue->io_work);
1389 sock_release(queue->sock);
1390 nvmet_tcp_free_cmds(queue);
1391 if (queue->hdr_digest || queue->data_digest)
1392 nvmet_tcp_free_crypto(queue);
1393 ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1394
1395 page = virt_to_head_page(queue->pf_cache.va);
1396 __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
1397 kfree(queue);
1398}
1399
1400static void nvmet_tcp_data_ready(struct sock *sk)
1401{
1402 struct nvmet_tcp_queue *queue;
1403
1404 read_lock_bh(&sk->sk_callback_lock);
1405 queue = sk->sk_user_data;
1406 if (likely(queue))
1407 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1408 read_unlock_bh(&sk->sk_callback_lock);
1409}
1410
1411static void nvmet_tcp_write_space(struct sock *sk)
1412{
1413 struct nvmet_tcp_queue *queue;
1414
1415 read_lock_bh(&sk->sk_callback_lock);
1416 queue = sk->sk_user_data;
1417 if (unlikely(!queue))
1418 goto out;
1419
1420 if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
1421 queue->write_space(sk);
1422 goto out;
1423 }
1424
1425 if (sk_stream_is_writeable(sk)) {
1426 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1427 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1428 }
1429out:
1430 read_unlock_bh(&sk->sk_callback_lock);
1431}
1432
1433static void nvmet_tcp_state_change(struct sock *sk)
1434{
1435 struct nvmet_tcp_queue *queue;
1436
1437 read_lock_bh(&sk->sk_callback_lock);
1438 queue = sk->sk_user_data;
1439 if (!queue)
1440 goto done;
1441
1442 switch (sk->sk_state) {
1443 case TCP_FIN_WAIT2:
1444 case TCP_LAST_ACK:
1445 break;
1446 case TCP_FIN_WAIT1:
1447 case TCP_CLOSE_WAIT:
1448 case TCP_CLOSE:
1449 /* FALLTHRU */
1450 nvmet_tcp_schedule_release_queue(queue);
1451 break;
1452 default:
1453 pr_warn("queue %d unhandled state %d\n",
1454 queue->idx, sk->sk_state);
1455 }
1456done:
1457 read_unlock_bh(&sk->sk_callback_lock);
1458}
1459
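/*
 * Record the local and peer addresses, set socket options (SO_LINGER,
 * IP_TOS) and install the nvmet-tcp socket callbacks.
 */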
1460static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
1461{
1462 struct socket *sock = queue->sock;
1463 struct inet_sock *inet = inet_sk(sock->sk);
1464 struct linger sol = { .l_onoff = 1, .l_linger = 0 };
1465 int ret;
1466
1467 ret = kernel_getsockname(sock,
1468 (struct sockaddr *)&queue->sockaddr);
1469 if (ret < 0)
1470 return ret;
1471
1472 ret = kernel_getpeername(sock,
1473 (struct sockaddr *)&queue->sockaddr_peer);
1474 if (ret < 0)
1475 return ret;
1476
1477 /*
1478 * Cleanup whatever is sitting in the TCP transmit queue on socket
1479 * close. This is done to prevent stale data from being sent should
1480 * the network connection be restored before TCP times out.
1481 */
1482 ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
1483 (char *)&sol, sizeof(sol));
1484 if (ret)
1485 return ret;
1486
1487 /* Set socket type of service */
1488 if (inet->rcv_tos > 0) {
1489 int tos = inet->rcv_tos;
1490
1491 ret = kernel_setsockopt(sock, SOL_IP, IP_TOS,
1492 (char *)&tos, sizeof(tos));
1493 if (ret)
1494 return ret;
1495 }
1496
1497 write_lock_bh(&sock->sk->sk_callback_lock);
1498 sock->sk->sk_user_data = queue;
1499 queue->data_ready = sock->sk->sk_data_ready;
1500 sock->sk->sk_data_ready = nvmet_tcp_data_ready;
1501 queue->state_change = sock->sk->sk_state_change;
1502 sock->sk->sk_state_change = nvmet_tcp_state_change;
1503 queue->write_space = sock->sk->sk_write_space;
1504 sock->sk->sk_write_space = nvmet_tcp_write_space;
1505 write_unlock_bh(&sock->sk->sk_callback_lock);
1506
1507 return 0;
1508}
1509
1510static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
1511 struct socket *newsock)
1512{
1513 struct nvmet_tcp_queue *queue;
1514 int ret;
1515
1516 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1517 if (!queue)
1518 return -ENOMEM;
1519
1520 INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
1521 INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
1522 queue->sock = newsock;
1523 queue->port = port;
1524 queue->nr_cmds = 0;
1525 spin_lock_init(&queue->state_lock);
1526 queue->state = NVMET_TCP_Q_CONNECTING;
1527 INIT_LIST_HEAD(&queue->free_list);
1528 init_llist_head(&queue->resp_list);
1529 INIT_LIST_HEAD(&queue->resp_send_list);
1530
1531 queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
1532 if (queue->idx < 0) {
1533 ret = queue->idx;
1534 goto out_free_queue;
1535 }
1536
1537 ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
1538 if (ret)
1539 goto out_ida_remove;
1540
1541 ret = nvmet_sq_init(&queue->nvme_sq);
1542 if (ret)
1543 goto out_free_connect;
1544
1545 port->last_cpu = cpumask_next_wrap(port->last_cpu,
1546 cpu_online_mask, -1, false);
1547 queue->cpu = port->last_cpu;
1548 nvmet_prepare_receive_pdu(queue);
1549
1550 mutex_lock(&nvmet_tcp_queue_mutex);
1551 list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
1552 mutex_unlock(&nvmet_tcp_queue_mutex);
1553
1554 ret = nvmet_tcp_set_queue_sock(queue);
1555 if (ret)
1556 goto out_destroy_sq;
1557
1558 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1559
1560 return 0;
1561out_destroy_sq:
1562 mutex_lock(&nvmet_tcp_queue_mutex);
1563 list_del_init(&queue->queue_list);
1564 mutex_unlock(&nvmet_tcp_queue_mutex);
1565 nvmet_sq_destroy(&queue->nvme_sq);
1566out_free_connect:
1567 nvmet_tcp_free_cmd(&queue->connect);
1568out_ida_remove:
1569 ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1570out_free_queue:
1571 kfree(queue);
1572 return ret;
1573}
1574
1575static void nvmet_tcp_accept_work(struct work_struct *w)
1576{
1577 struct nvmet_tcp_port *port =
1578 container_of(w, struct nvmet_tcp_port, accept_work);
1579 struct socket *newsock;
1580 int ret;
1581
1582 while (true) {
1583 ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
1584 if (ret < 0) {
1585 if (ret != -EAGAIN)
1586 pr_warn("failed to accept err=%d\n", ret);
1587 return;
1588 }
1589 ret = nvmet_tcp_alloc_queue(port, newsock);
1590 if (ret) {
1591 pr_err("failed to allocate queue\n");
1592 sock_release(newsock);
1593 }
1594 }
1595}
1596
1597static void nvmet_tcp_listen_data_ready(struct sock *sk)
1598{
1599 struct nvmet_tcp_port *port;
1600
1601 read_lock_bh(&sk->sk_callback_lock);
1602 port = sk->sk_user_data;
1603 if (!port)
1604 goto out;
1605
1606 if (sk->sk_state == TCP_LISTEN)
1607 schedule_work(&port->accept_work);
1608out:
1609 read_unlock_bh(&sk->sk_callback_lock);
1610}
1611
1612static int nvmet_tcp_add_port(struct nvmet_port *nport)
1613{
1614 struct nvmet_tcp_port *port;
1615 __kernel_sa_family_t af;
1616 int opt, ret;
1617
1618 port = kzalloc(sizeof(*port), GFP_KERNEL);
1619 if (!port)
1620 return -ENOMEM;
1621
1622 switch (nport->disc_addr.adrfam) {
1623 case NVMF_ADDR_FAMILY_IP4:
1624 af = AF_INET;
1625 break;
1626 case NVMF_ADDR_FAMILY_IP6:
1627 af = AF_INET6;
1628 break;
1629 default:
1630 pr_err("address family %d not supported\n",
1631 nport->disc_addr.adrfam);
1632 ret = -EINVAL;
1633 goto err_port;
1634 }
1635
1636 ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
1637 nport->disc_addr.trsvcid, &port->addr);
1638 if (ret) {
1639 pr_err("malformed ip/port passed: %s:%s\n",
1640 nport->disc_addr.traddr, nport->disc_addr.trsvcid);
1641 goto err_port;
1642 }
1643
1644 port->nport = nport;
1645 port->last_cpu = -1;
1646 INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
1647 if (port->nport->inline_data_size < 0)
1648 port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
1649
1650 ret = sock_create(port->addr.ss_family, SOCK_STREAM,
1651 IPPROTO_TCP, &port->sock);
1652 if (ret) {
1653 pr_err("failed to create a socket\n");
1654 goto err_port;
1655 }
1656
1657 port->sock->sk->sk_user_data = port;
1658 port->data_ready = port->sock->sk->sk_data_ready;
1659 port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
1660
1661 opt = 1;
1662 ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
1663 TCP_NODELAY, (char *)&opt, sizeof(opt));
1664 if (ret) {
1665 pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
1666 goto err_sock;
1667 }
1668
1669 ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
1670 (char *)&opt, sizeof(opt));
1671 if (ret) {
1672 pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
1673 goto err_sock;
1674 }
1675
1676 ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
1677 sizeof(port->addr));
1678 if (ret) {
1679 pr_err("failed to bind port socket %d\n", ret);
1680 goto err_sock;
1681 }
1682
1683 ret = kernel_listen(port->sock, 128);
1684 if (ret) {
1685 pr_err("failed to listen %d on port sock\n", ret);
1686 goto err_sock;
1687 }
1688
1689 nport->priv = port;
1690 pr_info("enabling port %d (%pISpc)\n",
1691 le16_to_cpu(nport->disc_addr.portid), &port->addr);
1692
1693 return 0;
1694
1695err_sock:
1696 sock_release(port->sock);
1697err_port:
1698 kfree(port);
1699 return ret;
1700}
1701
1702static void nvmet_tcp_destroy_port_queues(struct nvmet_tcp_port *port)
1703{
1704 struct nvmet_tcp_queue *queue;
1705
1706 mutex_lock(&nvmet_tcp_queue_mutex);
1707 list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1708 if (queue->port == port)
1709 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1710 mutex_unlock(&nvmet_tcp_queue_mutex);
1711}
1712
1713static void nvmet_tcp_remove_port(struct nvmet_port *nport)
1714{
1715 struct nvmet_tcp_port *port = nport->priv;
1716
1717 write_lock_bh(&port->sock->sk->sk_callback_lock);
1718 port->sock->sk->sk_data_ready = port->data_ready;
1719 port->sock->sk->sk_user_data = NULL;
1720 write_unlock_bh(&port->sock->sk->sk_callback_lock);
1721 cancel_work_sync(&port->accept_work);
	/*
	 * Destroy the remaining queues, which do not belong to any
	 * controller yet.
	 */
1726 nvmet_tcp_destroy_port_queues(port);
1727
1728 sock_release(port->sock);
1729 kfree(port);
1730}
1731
1732static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
1733{
1734 struct nvmet_tcp_queue *queue;
1735
1736 mutex_lock(&nvmet_tcp_queue_mutex);
1737 list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1738 if (queue->nvme_sq.ctrl == ctrl)
1739 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1740 mutex_unlock(&nvmet_tcp_queue_mutex);
1741}
1742
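/*
 * Called by the nvmet core once the queue size is negotiated: allocate the
 * per-queue command array (two commands per SQ entry).
 */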
1743static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
1744{
1745 struct nvmet_tcp_queue *queue =
1746 container_of(sq, struct nvmet_tcp_queue, nvme_sq);
1747
1748 if (sq->qid == 0) {
1749 /* Let inflight controller teardown complete */
1750 flush_scheduled_work();
1751 }
1752
1753 queue->nr_cmds = sq->size * 2;
1754 if (nvmet_tcp_alloc_cmds(queue)) {
1755 queue->nr_cmds = 0;
1756 return NVME_SC_INTERNAL;
1757 }
1758 return 0;
1759}
1760
1761static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
1762 struct nvmet_port *nport, char *traddr)
1763{
1764 struct nvmet_tcp_port *port = nport->priv;
1765
1766 if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
1767 struct nvmet_tcp_cmd *cmd =
1768 container_of(req, struct nvmet_tcp_cmd, req);
1769 struct nvmet_tcp_queue *queue = cmd->queue;
1770
1771 sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
1772 } else {
1773 memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
1774 }
1775}
1776
1777static struct nvmet_fabrics_ops nvmet_tcp_ops = {
1778 .owner = THIS_MODULE,
1779 .type = NVMF_TRTYPE_TCP,
1780 .msdbd = 1,
1781 .has_keyed_sgls = 0,
1782 .add_port = nvmet_tcp_add_port,
1783 .remove_port = nvmet_tcp_remove_port,
1784 .queue_response = nvmet_tcp_queue_response,
1785 .delete_ctrl = nvmet_tcp_delete_ctrl,
1786 .install_queue = nvmet_tcp_install_queue,
1787 .disc_traddr = nvmet_tcp_disc_port_addr,
1788};
1789
1790static int __init nvmet_tcp_init(void)
1791{
1792 int ret;
1793
1794 nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq",
1795 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
1796 if (!nvmet_tcp_wq)
1797 return -ENOMEM;
1798
1799 ret = nvmet_register_transport(&nvmet_tcp_ops);
1800 if (ret)
1801 goto err;
1802
1803 return 0;
1804err:
1805 destroy_workqueue(nvmet_tcp_wq);
1806 return ret;
1807}
1808
1809static void __exit nvmet_tcp_exit(void)
1810{
1811 struct nvmet_tcp_queue *queue;
1812
1813 nvmet_unregister_transport(&nvmet_tcp_ops);
1814
1815 flush_scheduled_work();
1816 mutex_lock(&nvmet_tcp_queue_mutex);
1817 list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1818 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1819 mutex_unlock(&nvmet_tcp_queue_mutex);
1820 flush_scheduled_work();
1821
1822 destroy_workqueue(nvmet_tcp_wq);
1823 ida_destroy(&nvmet_tcp_queue_ida);
1824}
1825
1826module_init(nvmet_tcp_init);
1827module_exit(nvmet_tcp_exit);
1828
1829MODULE_LICENSE("GPL v2");
1830MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */