// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics TCP target.
 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/nvme-tcp.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/inet.h>
#include <linux/llist.h>
#include <crypto/hash.h>

#include "nvmet.h"

#define NVMET_TCP_DEF_INLINE_DATA_SIZE	(4 * PAGE_SIZE)

#define NVMET_TCP_RECV_BUDGET		8
#define NVMET_TCP_SEND_BUDGET		8
#define NVMET_TCP_IO_WORK_BUDGET	64
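
/*
 * Per-command send states and per-queue receive states. A command walks
 * through the send states as its response PDUs are written to the socket;
 * the queue as a whole tracks how far it has gotten in receiving the next
 * PDU from the host.
 */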
enum nvmet_tcp_send_state {
	NVMET_TCP_SEND_DATA_PDU,
	NVMET_TCP_SEND_DATA,
	NVMET_TCP_SEND_R2T,
	NVMET_TCP_SEND_DDGST,
	NVMET_TCP_SEND_RESPONSE
};

enum nvmet_tcp_recv_state {
	NVMET_TCP_RECV_PDU,
	NVMET_TCP_RECV_DATA,
	NVMET_TCP_RECV_DDGST,
	NVMET_TCP_RECV_ERR,
};

enum {
	NVMET_TCP_F_INIT_FAILED = (1 << 0),
};
struct nvmet_tcp_cmd {
	struct nvmet_tcp_queue		*queue;
	struct nvmet_req		req;

	struct nvme_tcp_cmd_pdu		*cmd_pdu;
	struct nvme_tcp_rsp_pdu		*rsp_pdu;
	struct nvme_tcp_data_pdu	*data_pdu;
	struct nvme_tcp_r2t_pdu		*r2t_pdu;

	u32				rbytes_done;
	u32				wbytes_done;

	u32				pdu_len;
	u32				pdu_recv;
	int				sg_idx;
	int				nr_mapped;
	struct msghdr			recv_msg;
	struct kvec			*iov;
	u32				flags;

	struct list_head		entry;
	struct llist_node		lentry;
	u32				offset;
	struct scatterlist		*cur_sg;
	enum nvmet_tcp_send_state	state;

	__le32				exp_ddgst;
	__le32				recv_ddgst;
};
enum nvmet_tcp_queue_state {
	NVMET_TCP_Q_CONNECTING,
	NVMET_TCP_Q_LIVE,
	NVMET_TCP_Q_DISCONNECTING,
};
struct nvmet_tcp_queue {
	struct socket		*sock;
	struct nvmet_tcp_port	*port;
	struct work_struct	io_work;
	int			cpu;
	struct nvmet_cq		nvme_cq;
	struct nvmet_sq		nvme_sq;

	struct nvmet_tcp_cmd	*cmds;
	unsigned int		nr_cmds;
	struct list_head	free_list;
	struct llist_head	resp_list;
	struct list_head	resp_send_list;
	int			send_list_len;
	struct nvmet_tcp_cmd	*snd_cmd;

	int			offset;
	int			left;
	enum nvmet_tcp_recv_state rcv_state;
	struct nvmet_tcp_cmd	*cmd;
	union nvme_tcp_pdu	pdu;

	bool			hdr_digest;
	bool			data_digest;
	struct ahash_request	*snd_hash;
	struct ahash_request	*rcv_hash;

	spinlock_t		state_lock;
	enum nvmet_tcp_queue_state state;

	struct sockaddr_storage	sockaddr;
	struct sockaddr_storage	sockaddr_peer;
	struct work_struct	release_work;

	int			idx;
	struct list_head	queue_list;

	struct nvmet_tcp_cmd	connect;

	struct page_frag_cache	pf_cache;

	void (*data_ready)(struct sock *);
	void (*state_change)(struct sock *);
	void (*write_space)(struct sock *);
};
struct nvmet_tcp_port {
	struct socket		*sock;
	struct work_struct	accept_work;
	struct nvmet_port	*nport;
	struct sockaddr_storage addr;
	int			last_cpu;
	void (*data_ready)(struct sock *);
};
static DEFINE_IDA(nvmet_tcp_queue_ida);
static LIST_HEAD(nvmet_tcp_queue_list);
static DEFINE_MUTEX(nvmet_tcp_queue_mutex);

static struct workqueue_struct *nvmet_tcp_wq;
static struct nvmet_fabrics_ops nvmet_tcp_ops;
static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *cmd)
{
	return cmd - queue->cmds;
}
static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
{
	return nvme_is_write(cmd->req.cmd) &&
		cmd->rbytes_done < cmd->req.transfer_len;
}
static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
{
	return nvmet_tcp_has_data_in(cmd) && !cmd->req.rsp->status;
}
static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
{
	return !nvme_is_write(cmd->req.cmd) &&
		cmd->req.transfer_len > 0 &&
		!cmd->req.rsp->status;
}
static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
{
	return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
		!cmd->rbytes_done;
}
static inline struct nvmet_tcp_cmd *
nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd;

	cmd = list_first_entry_or_null(&queue->free_list,
				struct nvmet_tcp_cmd, entry);
	if (!cmd)
		return NULL;
	list_del_init(&cmd->entry);

	cmd->rbytes_done = cmd->wbytes_done = 0;

	return cmd;
}
static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
{
	if (unlikely(cmd == &cmd->queue->connect))
		return;

	list_add_tail(&cmd->entry, &cmd->queue->free_list);
}
static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
{
	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}
static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
{
	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}
static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
		void *pdu, size_t len)
{
	struct scatterlist sg;

	sg_init_one(&sg, pdu, len);
	ahash_request_set_crypt(hash, &sg, pdu + len, len);
	crypto_ahash_digest(hash);
}
static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
	void *pdu, size_t len)
{
	struct nvme_tcp_hdr *hdr = pdu;
	__le32 recv_digest;
	__le32 exp_digest;

	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
		pr_err("queue %d: header digest enabled but no header digest\n",
			queue->idx);
		return -EPROTO;
	}

	recv_digest = *(__le32 *)(pdu + hdr->hlen);
	nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
	exp_digest = *(__le32 *)(pdu + hdr->hlen);
	if (recv_digest != exp_digest) {
		pr_err("queue %d: header digest error: recv %#x expected %#x\n",
			queue->idx, le32_to_cpu(recv_digest),
			le32_to_cpu(exp_digest));
		return -EPROTO;
	}

	return 0;
}
static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
{
	struct nvme_tcp_hdr *hdr = pdu;
	u8 digest_len = nvmet_tcp_hdgst_len(queue);
	u32 len;

	len = le32_to_cpu(hdr->plen) - hdr->hlen -
		(hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);

	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
		pr_err("queue %d: data digest flag is cleared\n", queue->idx);
		return -EPROTO;
	}

	return 0;
}
static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
{
	struct scatterlist *sg;
	int i;

	sg = &cmd->req.sg[cmd->sg_idx];

	for (i = 0; i < cmd->nr_mapped; i++)
		kunmap(sg_page(&sg[i]));
}
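
/*
 * Build a kvec array over the command's scatterlist pages (starting at the
 * current rbytes_done offset) and point recv_msg.msg_iter at it, so incoming
 * H2C data is received directly into the command's data buffer.
 */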
static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
{
	struct kvec *iov = cmd->iov;
	struct scatterlist *sg;
	u32 length, offset, sg_offset;

	length = cmd->pdu_len;
	cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
	offset = cmd->rbytes_done;
	cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE);
	sg_offset = offset % PAGE_SIZE;
	sg = &cmd->req.sg[cmd->sg_idx];

	while (length) {
		u32 iov_len = min_t(u32, length, sg->length - sg_offset);

		iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
		iov->iov_len = iov_len;

		length -= iov_len;
		sg = sg_next(sg);
		iov++;
	}

	iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
		cmd->nr_mapped, cmd->pdu_len);
}
static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
{
	queue->rcv_state = NVMET_TCP_RECV_ERR;
	if (queue->nvme_sq.ctrl)
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	else
		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
}
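
/*
 * Parse the command's SGL descriptor, allocate the scatterlist for the data
 * transfer and, for host-to-controller transfers, the kvec array used to
 * receive into it. In-capsule (inline) data is bounded by the port's
 * inline_data_size.
 */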
static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
	u32 len = le32_to_cpu(sgl->length);

	if (!cmd->req.data_len)
		return 0;

	if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
			  NVME_SGL_FMT_OFFSET)) {
		if (!nvme_is_write(cmd->req.cmd))
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;

		if (len > cmd->req.port->inline_data_size)
			return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
		cmd->pdu_len = len;
	}
	cmd->req.transfer_len += len;

	cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
	if (!cmd->req.sg)
		return NVME_SC_INTERNAL;
	cmd->cur_sg = cmd->req.sg;

	if (nvmet_tcp_has_data_in(cmd)) {
		cmd->iov = kmalloc_array(cmd->req.sg_cnt,
				sizeof(*cmd->iov), GFP_KERNEL);
		if (!cmd->iov)
			goto err;
	}

	return 0;
err:
	sgl_free(cmd->req.sg);
	return NVME_SC_INTERNAL;
}
static void nvmet_tcp_ddgst(struct ahash_request *hash,
		struct nvmet_tcp_cmd *cmd)
{
	ahash_request_set_crypt(hash, cmd->req.sg,
		(void *)&cmd->exp_ddgst, cmd->req.transfer_len);
	crypto_ahash_digest(hash);
}
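
/*
 * Prepare the C2H data PDU header for a read completion: fill in the lengths,
 * offset and command id, and pre-compute the header/data digests if they were
 * negotiated on this queue.
 */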
static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_DATA_PDU;

	pdu->hdr.type = nvme_tcp_c2h_data;
	pdu->hdr.flags = NVME_TCP_F_DATA_LAST;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
	pdu->hdr.plen =
		cpu_to_le32(pdu->hdr.hlen + hdgst +
				cmd->req.transfer_len + ddgst);
	pdu->command_id = cmd->req.rsp->command_id;
	pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
	pdu->data_offset = cpu_to_le32(cmd->wbytes_done);

	if (queue->data_digest) {
		pdu->hdr.flags |= NVME_TCP_F_DDGST;
		nvmet_tcp_ddgst(queue->snd_hash, cmd);
	}

	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
	}
}
static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_R2T;

	pdu->hdr.type = nvme_tcp_r2t;
	pdu->hdr.flags = 0;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = 0;
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);

	pdu->command_id = cmd->req.cmd->common.command_id;
	pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
	pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
	pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
	}
}
static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_RESPONSE;

	pdu->hdr.type = nvme_tcp_rsp;
	pdu->hdr.flags = 0;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = 0;
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
	}
}
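
/*
 * Move responses queued from completion context (lock-free llist) onto the
 * ordered resp_send_list that the io_work send path consumes.
 */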
static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
{
	struct llist_node *node;

	node = llist_del_all(&queue->resp_list);
	if (!node)
		return;

	while (node) {
		struct nvmet_tcp_cmd *cmd = llist_entry(node,
					struct nvmet_tcp_cmd, lentry);

		list_add(&cmd->entry, &queue->resp_send_list);
		node = node->next;
		queue->send_list_len++;
	}
}
static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
{
	queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
				struct nvmet_tcp_cmd, entry);
	if (!queue->snd_cmd) {
		nvmet_tcp_process_resp_list(queue);
		queue->snd_cmd =
			list_first_entry_or_null(&queue->resp_send_list,
					struct nvmet_tcp_cmd, entry);
		if (unlikely(!queue->snd_cmd))
			return NULL;
	}

	list_del_init(&queue->snd_cmd->entry);
	queue->send_list_len--;

	if (nvmet_tcp_need_data_out(queue->snd_cmd))
		nvmet_setup_c2h_data_pdu(queue->snd_cmd);
	else if (nvmet_tcp_need_data_in(queue->snd_cmd))
		nvmet_setup_r2t_pdu(queue->snd_cmd);
	else
		nvmet_setup_response_pdu(queue->snd_cmd);

	return queue->snd_cmd;
}
static void nvmet_tcp_queue_response(struct nvmet_req *req)
{
	struct nvmet_tcp_cmd *cmd =
		container_of(req, struct nvmet_tcp_cmd, req);
	struct nvmet_tcp_queue *queue = cmd->queue;

	llist_add(&cmd->lentry, &queue->resp_list);
	queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
}
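
/*
 * The nvmet_try_send_* helpers each push one piece of a response over the
 * socket with kernel_sendpage()/kernel_sendmsg() in non-blocking mode, using
 * cmd->offset to resume after a partial send.
 */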
static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
{
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
	int ret;

	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
			offset_in_page(cmd->data_pdu) + cmd->offset,
			left, MSG_DONTWAIT | MSG_MORE);
	if (ret <= 0)
		return ret;

	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	cmd->state = NVMET_TCP_SEND_DATA;
	cmd->offset = 0;
	return 1;
}
static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
{
	struct nvmet_tcp_queue *queue = cmd->queue;
	int ret;

	while (cmd->cur_sg) {
		struct page *page = sg_page(cmd->cur_sg);
		u32 left = cmd->cur_sg->length - cmd->offset;

		ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
					left, MSG_DONTWAIT | MSG_MORE);
		if (ret <= 0)
			return ret;

		cmd->offset += ret;
		cmd->wbytes_done += ret;

		/* Done with sg? */
		if (cmd->offset == cmd->cur_sg->length) {
			cmd->cur_sg = sg_next(cmd->cur_sg);
			cmd->offset = 0;
		}
	}

	if (queue->data_digest) {
		cmd->state = NVMET_TCP_SEND_DDGST;
		cmd->offset = 0;
	} else {
		nvmet_setup_response_pdu(cmd);
	}
	return 1;
}
static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
		bool last_in_batch)
{
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
	int flags = MSG_DONTWAIT;
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		flags |= MSG_MORE;

	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
		offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
	if (ret <= 0)
		return ret;
	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	sgl_free(cmd->req.sg);
	cmd->queue->snd_cmd = NULL;
	nvmet_tcp_put_cmd(cmd);
	return 1;
}
static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
	int flags = MSG_DONTWAIT;
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		flags |= MSG_MORE;

	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
		offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
	if (ret <= 0)
		return ret;
	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	cmd->queue->snd_cmd = NULL;
	return 1;
}
static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd)
{
	struct nvmet_tcp_queue *queue = cmd->queue;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
	struct kvec iov = {
		.iov_base = &cmd->exp_ddgst + cmd->offset,
		.iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
	};
	int ret;

	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (unlikely(ret <= 0))
		return ret;

	cmd->offset += ret;
	nvmet_setup_response_pdu(cmd);
	return 1;
}
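
/*
 * Advance the current send command through its state machine
 * (DATA_PDU -> DATA -> DDGST -> RESPONSE, or R2T), fetching a new command
 * from the response list when nothing is in flight.
 */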
static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
		bool last_in_batch)
{
	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
	int ret = 0;

	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
		cmd = nvmet_tcp_fetch_cmd(queue);
		if (unlikely(!cmd))
			return 0;
	}

	if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
		ret = nvmet_try_send_data_pdu(cmd);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_DATA) {
		ret = nvmet_try_send_data(cmd);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_DDGST) {
		ret = nvmet_try_send_ddgst(cmd);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_R2T) {
		ret = nvmet_try_send_r2t(cmd, last_in_batch);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_RESPONSE)
		ret = nvmet_try_send_response(cmd, last_in_batch);

done_send:
	if (ret < 0) {
		if (ret == -EAGAIN)
			return 0;
		return ret;
	}

	return 1;
}
static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
		int budget, int *sends)
{
	int i, ret = 0;

	for (i = 0; i < budget; i++) {
		ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
		if (ret <= 0)
			break;
		(*sends)++;
	}

	return ret;
}
static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
{
	queue->offset = 0;
	queue->left = sizeof(struct nvme_tcp_hdr);
	queue->cmd = NULL;
	queue->rcv_state = NVMET_TCP_RECV_PDU;
}
static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
{
	struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);

	ahash_request_free(queue->rcv_hash);
	ahash_request_free(queue->snd_hash);
	crypto_free_ahash(tfm);
}
static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
{
	struct crypto_ahash *tfm;

	tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!queue->snd_hash)
		goto free_tfm;
	ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);

	queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!queue->rcv_hash)
		goto free_snd_hash;
	ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);

	return 0;
free_snd_hash:
	ahash_request_free(queue->snd_hash);
free_tfm:
	crypto_free_ahash(tfm);
	return -ENOMEM;
}
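
/*
 * Handle the initial ICReq PDU of a connection: validate PFV, HPDA and
 * MAXR2T, latch the negotiated digest settings, and reply with an ICResp
 * before moving the queue to the live state.
 */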
static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
	struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
	struct msghdr msg = {};
	struct kvec iov;
	int ret;

	if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
		pr_err("bad nvme-tcp pdu length (%d)\n",
			le32_to_cpu(icreq->hdr.plen));
		nvmet_tcp_fatal_error(queue);
	}

	if (icreq->pfv != NVME_TCP_PFV_1_0) {
		pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
		return -EPROTO;
	}

	if (icreq->hpda != 0) {
		pr_err("queue %d: unsupported hpda %d\n", queue->idx,
			icreq->hpda);
		return -EPROTO;
	}

	if (icreq->maxr2t != 0) {
		pr_err("queue %d: unsupported maxr2t %d\n", queue->idx,
			le32_to_cpu(icreq->maxr2t) + 1);
		return -EPROTO;
	}

	queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
	queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
	if (queue->hdr_digest || queue->data_digest) {
		ret = nvmet_tcp_alloc_crypto(queue);
		if (ret)
			return ret;
	}

	memset(icresp, 0, sizeof(*icresp));
	icresp->hdr.type = nvme_tcp_icresp;
	icresp->hdr.hlen = sizeof(*icresp);
	icresp->hdr.pdo = 0;
	icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
	icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
	icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */
	icresp->cpda = 0;
	if (queue->hdr_digest)
		icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
	if (queue->data_digest)
		icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;

	iov.iov_base = icresp;
	iov.iov_len = sizeof(*icresp);
	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (ret < 0)
		goto free_crypto;

	queue->state = NVMET_TCP_Q_LIVE;
	nvmet_prepare_receive_pdu(queue);
	return 0;
free_crypto:
	if (queue->hdr_digest || queue->data_digest)
		nvmet_tcp_free_crypto(queue);
	return ret;
}
static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
{
	int ret;

	/* recover the expected data transfer length */
	req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);

	if (!nvme_is_write(cmd->req.cmd) ||
	    req->data_len > cmd->req.port->inline_data_size) {
		nvmet_prepare_receive_pdu(queue);
		return;
	}

	ret = nvmet_tcp_map_data(cmd);
	if (unlikely(ret)) {
		pr_err("queue %d: failed to map data\n", queue->idx);
		nvmet_tcp_fatal_error(queue);
		return;
	}

	queue->rcv_state = NVMET_TCP_RECV_DATA;
	nvmet_tcp_map_pdu_iovec(cmd);
	cmd->flags |= NVMET_TCP_F_INIT_FAILED;
}
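
/*
 * An H2C data PDU carries data the host sends in response to an R2T; look up
 * the owning command by ttag, validate the offset, and switch the queue to
 * the data receive state.
 */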
static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_data_pdu *data = &queue->pdu.data;
	struct nvmet_tcp_cmd *cmd;

	cmd = &queue->cmds[data->ttag];

	if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
		pr_err("ttag %u unexpected data offset %u (expected %u)\n",
			data->ttag, le32_to_cpu(data->data_offset),
			cmd->rbytes_done);
		/* FIXME: use path and transport errors */
		nvmet_req_complete(&cmd->req,
			NVME_SC_INVALID_FIELD | NVME_SC_DNR);
		return -EPROTO;
	}

	cmd->pdu_len = le32_to_cpu(data->data_length);
	cmd->pdu_recv = 0;
	nvmet_tcp_map_pdu_iovec(cmd);
	queue->cmd = cmd;
	queue->rcv_state = NVMET_TCP_RECV_DATA;

	return 0;
}
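
/*
 * Called once a complete PDU header has been received: dispatch ICReq and
 * H2C data PDUs, otherwise pop a free command, initialize the request and
 * either start receiving inline data, queue an R2T, or execute it.
 */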
static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
	struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
	struct nvmet_req *req;
	int ret;

	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
		if (hdr->type != nvme_tcp_icreq) {
			pr_err("unexpected pdu type (%d) before icreq\n",
				hdr->type);
			nvmet_tcp_fatal_error(queue);
			return -EPROTO;
		}
		return nvmet_tcp_handle_icreq(queue);
	}

	if (hdr->type == nvme_tcp_h2c_data) {
		ret = nvmet_tcp_handle_h2c_data_pdu(queue);
		if (unlikely(ret))
			return ret;
		return 0;
	}

	queue->cmd = nvmet_tcp_get_cmd(queue);
	if (unlikely(!queue->cmd)) {
		/* This should never happen */
		pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
			queue->idx, queue->nr_cmds, queue->send_list_len,
			nvme_cmd->common.opcode);
		nvmet_tcp_fatal_error(queue);
		return -ENOMEM;
	}

	req = &queue->cmd->req;
	memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));

	if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_tcp_ops))) {
		pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
			req->cmd, req->cmd->common.command_id,
			req->cmd->common.opcode,
			le32_to_cpu(req->cmd->common.dptr.sgl.length));

		nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
		return -EAGAIN;
	}

	ret = nvmet_tcp_map_data(queue->cmd);
	if (unlikely(ret)) {
		pr_err("queue %d: failed to map data\n", queue->idx);
		if (nvmet_tcp_has_inline_data(queue->cmd))
			nvmet_tcp_fatal_error(queue);
		else
			nvmet_req_complete(req, ret);
		ret = -EAGAIN;
		goto out;
	}

	if (nvmet_tcp_need_data_in(queue->cmd)) {
		if (nvmet_tcp_has_inline_data(queue->cmd)) {
			queue->rcv_state = NVMET_TCP_RECV_DATA;
			nvmet_tcp_map_pdu_iovec(queue->cmd);
			return 0;
		}
		/* send back R2T */
		nvmet_tcp_queue_response(&queue->cmd->req);
		goto out;
	}

	nvmet_req_execute(&queue->cmd->req);
out:
	nvmet_prepare_receive_pdu(queue);
	return ret;
}
static const u8 nvme_tcp_pdu_sizes[] = {
	[nvme_tcp_icreq]	= sizeof(struct nvme_tcp_icreq_pdu),
	[nvme_tcp_cmd]		= sizeof(struct nvme_tcp_cmd_pdu),
	[nvme_tcp_h2c_data]	= sizeof(struct nvme_tcp_data_pdu),
};
static inline u8 nvmet_tcp_pdu_size(u8 type)
{
	size_t idx = type;

	return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
		nvme_tcp_pdu_sizes[idx]) ?
			nvme_tcp_pdu_sizes[idx] : 0;
}
static inline bool nvmet_tcp_pdu_valid(u8 type)
{
	switch (type) {
	case nvme_tcp_icreq:
	case nvme_tcp_cmd:
	case nvme_tcp_h2c_data:
		return true;
	}

	return false;
}
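
/*
 * Receive the common header first, then the remainder of the PDU (plus any
 * header digest) into queue->pdu, validating type, length and digests before
 * handing it to nvmet_tcp_done_recv_pdu().
 */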
static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
	int len;
	struct kvec iov;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };

recv:
	iov.iov_base = (void *)&queue->pdu + queue->offset;
	iov.iov_len = queue->left;
	len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (unlikely(len < 0))
		return len;

	queue->offset += len;
	queue->left -= len;
	if (queue->left)
		return -EAGAIN;

	if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
		u8 hdgst = nvmet_tcp_hdgst_len(queue);

		if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
			pr_err("unexpected pdu type %d\n", hdr->type);
			nvmet_tcp_fatal_error(queue);
			return -EIO;
		}

		if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
			pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
			return -EIO;
		}

		queue->left = hdr->hlen - queue->offset + hdgst;
		goto recv;
	}

	if (queue->hdr_digest &&
	    nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
		nvmet_tcp_fatal_error(queue); /* fatal */
		return -EPROTO;
	}

	if (queue->data_digest &&
	    nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
		nvmet_tcp_fatal_error(queue); /* fatal */
		return -EPROTO;
	}

	return nvmet_tcp_done_recv_pdu(queue);
}
static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
{
	struct nvmet_tcp_queue *queue = cmd->queue;

	nvmet_tcp_ddgst(queue->rcv_hash, cmd);
	queue->offset = 0;
	queue->left = NVME_TCP_DIGEST_LENGTH;
	queue->rcv_state = NVMET_TCP_RECV_DDGST;
}
static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmd;
	int ret;

	while (msg_data_left(&cmd->recv_msg)) {
		ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
			cmd->recv_msg.msg_flags);
		if (ret <= 0)
			return ret;

		cmd->pdu_recv += ret;
		cmd->rbytes_done += ret;
	}

	nvmet_tcp_unmap_pdu_iovec(cmd);

	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
	    cmd->rbytes_done == cmd->req.transfer_len) {
		if (queue->data_digest) {
			nvmet_tcp_prep_recv_ddgst(cmd);
			return 0;
		}
		nvmet_req_execute(&cmd->req);
	}

	nvmet_prepare_receive_pdu(queue);
	return 0;
}
static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmd;
	int ret;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
	struct kvec iov = {
		.iov_base = (void *)&cmd->recv_ddgst + queue->offset,
		.iov_len = queue->left
	};

	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (unlikely(ret < 0))
		return ret;

	queue->offset += ret;
	queue->left -= ret;
	if (queue->left)
		return -EAGAIN;

	if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
		pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
			queue->idx, cmd->req.cmd->common.command_id,
			queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
			le32_to_cpu(cmd->exp_ddgst));
		nvmet_tcp_finish_cmd(cmd);
		nvmet_tcp_fatal_error(queue);
		ret = -EPROTO;
		goto out;
	}

	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
	    cmd->rbytes_done == cmd->req.transfer_len)
		nvmet_req_execute(&cmd->req);
	ret = 0;
out:
	nvmet_prepare_receive_pdu(queue);
	return ret;
}
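
/*
 * Single step of the receive state machine: PDU header, then data, then
 * data digest, depending on queue->rcv_state.
 */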
static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
{
	int result;

	if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
		return 0;

	if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
		result = nvmet_tcp_try_recv_pdu(queue);
		if (result != 0)
			goto done_recv;
	}

	if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
		result = nvmet_tcp_try_recv_data(queue);
		if (result != 0)
			goto done_recv;
	}

	if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
		result = nvmet_tcp_try_recv_ddgst(queue);
		if (result != 0)
			goto done_recv;
	}

done_recv:
	if (result < 0) {
		if (result == -EAGAIN)
			return 0;
		return result;
	}
	return 1;
}
static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
		int budget, int *recvs)
{
	int i, ret = 0;

	for (i = 0; i < budget; i++) {
		ret = nvmet_tcp_try_recv_one(queue);
		if (ret <= 0)
			break;
		(*recvs)++;
	}

	return ret;
}
static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
{
	spin_lock(&queue->state_lock);
	if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
		queue->state = NVMET_TCP_Q_DISCONNECTING;
		schedule_work(&queue->release_work);
	}
	spin_unlock(&queue->state_lock);
}
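
/*
 * Main per-queue work function: alternate between bounded receive and send
 * passes until nothing is pending or the overall budget is exhausted, then
 * requeue itself if work is still left.
 */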
static void nvmet_tcp_io_work(struct work_struct *w)
{
	struct nvmet_tcp_queue *queue =
		container_of(w, struct nvmet_tcp_queue, io_work);
	bool pending;
	int ret, ops = 0;

	do {
		pending = false;

		ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
		if (ret > 0) {
			pending = true;
		} else if (ret < 0) {
			if (ret == -EPIPE || ret == -ECONNRESET)
				kernel_sock_shutdown(queue->sock, SHUT_RDWR);
			else
				nvmet_tcp_fatal_error(queue);
			return;
		}

		ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
		if (ret > 0) {
			/* transmitted message/data */
			pending = true;
		} else if (ret < 0) {
			if (ret == -EPIPE || ret == -ECONNRESET)
				kernel_sock_shutdown(queue->sock, SHUT_RDWR);
			else
				nvmet_tcp_fatal_error(queue);
			return;
		}

	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);

	/*
	 * We exhausted our budget, requeue ourselves
	 */
	if (pending)
		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
}
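
/*
 * Allocate the per-command PDU buffers from the queue's page_frag cache,
 * each with room for a trailing header digest.
 */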
static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *c)
{
	u8 hdgst = nvmet_tcp_hdgst_len(queue);

	c->queue = queue;
	c->req.port = queue->port->nport;

	c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->cmd_pdu)
		return -ENOMEM;
	c->req.cmd = &c->cmd_pdu->cmd;

	c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->rsp_pdu)
		goto out_free_cmd;
	c->req.rsp = &c->rsp_pdu->cqe;

	c->data_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->data_pdu)
		goto out_free_rsp;

	c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->r2t_pdu)
		goto out_free_data;

	c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;

	list_add_tail(&c->entry, &queue->free_list);

	return 0;
out_free_data:
	page_frag_free(c->data_pdu);
out_free_rsp:
	page_frag_free(c->rsp_pdu);
out_free_cmd:
	page_frag_free(c->cmd_pdu);
	return -ENOMEM;
}
static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
{
	page_frag_free(c->r2t_pdu);
	page_frag_free(c->data_pdu);
	page_frag_free(c->rsp_pdu);
	page_frag_free(c->cmd_pdu);
}
static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmds;
	int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
		if (ret)
			goto out_free;
	}

	queue->cmds = cmds;

	return 0;
out_free:
	while (--i >= 0)
		nvmet_tcp_free_cmd(cmds + i);
	kfree(cmds);
out:
	return ret;
}
static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmds = queue->cmds;
	int i;

	for (i = 0; i < queue->nr_cmds; i++)
		nvmet_tcp_free_cmd(cmds + i);

	nvmet_tcp_free_cmd(&queue->connect);
	kfree(cmds);
}
static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
{
	struct socket *sock = queue->sock;

	write_lock_bh(&sock->sk->sk_callback_lock);
	sock->sk->sk_data_ready = queue->data_ready;
	sock->sk->sk_state_change = queue->state_change;
	sock->sk->sk_write_space = queue->write_space;
	sock->sk->sk_user_data = NULL;
	write_unlock_bh(&sock->sk->sk_callback_lock);
}
static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
{
	nvmet_req_uninit(&cmd->req);
	nvmet_tcp_unmap_pdu_iovec(cmd);
	sgl_free(cmd->req.sg);
}
static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmds;
	int i;

	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
		if (nvmet_tcp_need_data_in(cmd))
			nvmet_tcp_finish_cmd(cmd);
	}

	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
		/* failed in connect */
		nvmet_tcp_finish_cmd(&queue->connect);
	}
}
static void nvmet_tcp_release_queue_work(struct work_struct *w)
{
	struct nvmet_tcp_queue *queue =
		container_of(w, struct nvmet_tcp_queue, release_work);

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);

	nvmet_tcp_restore_socket_callbacks(queue);
	flush_work(&queue->io_work);

	nvmet_tcp_uninit_data_in_cmds(queue);
	nvmet_sq_destroy(&queue->nvme_sq);
	cancel_work_sync(&queue->io_work);
	sock_release(queue->sock);
	nvmet_tcp_free_cmds(queue);
	if (queue->hdr_digest || queue->data_digest)
		nvmet_tcp_free_crypto(queue);
	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);

	kfree(queue);
}
static void nvmet_tcp_data_ready(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (likely(queue))
		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
	read_unlock_bh(&sk->sk_callback_lock);
}
static void nvmet_tcp_write_space(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (unlikely(!queue))
		goto out;

	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
		queue->write_space(sk);
		goto out;
	}

	if (sk_stream_is_writeable(sk)) {
		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
	}
out:
	read_unlock_bh(&sk->sk_callback_lock);
}
static void nvmet_tcp_state_change(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	write_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (!queue)
		goto done;

	switch (sk->sk_state) {
	case TCP_FIN_WAIT1:
	case TCP_CLOSE_WAIT:
	case TCP_CLOSE:
		/* FALLTHRU */
		sk->sk_user_data = NULL;
		nvmet_tcp_schedule_release_queue(queue);
		break;
	default:
		pr_warn("queue %d unhandled state %d\n",
			queue->idx, sk->sk_state);
	}
done:
	write_unlock_bh(&sk->sk_callback_lock);
}
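
/*
 * Record the local and peer addresses, enable SO_LINGER with a zero timeout
 * so a closed socket does not keep transmitting stale data, and install the
 * queue's data_ready/state_change/write_space callbacks on the socket.
 */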
static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
{
	struct socket *sock = queue->sock;
	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
	int ret;

	ret = kernel_getsockname(sock,
		(struct sockaddr *)&queue->sockaddr);
	if (ret < 0)
		return ret;

	ret = kernel_getpeername(sock,
		(struct sockaddr *)&queue->sockaddr_peer);
	if (ret < 0)
		return ret;

	/*
	 * Cleanup whatever is sitting in the TCP transmit queue on socket
	 * close. This is done to prevent stale data from being sent should
	 * the network connection be restored before TCP times out.
	 */
	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
			(char *)&sol, sizeof(sol));
	if (ret)
		return ret;

	write_lock_bh(&sock->sk->sk_callback_lock);
	sock->sk->sk_user_data = queue;
	queue->data_ready = sock->sk->sk_data_ready;
	sock->sk->sk_data_ready = nvmet_tcp_data_ready;
	queue->state_change = sock->sk->sk_state_change;
	sock->sk->sk_state_change = nvmet_tcp_state_change;
	queue->write_space = sock->sk->sk_write_space;
	sock->sk->sk_write_space = nvmet_tcp_write_space;
	write_unlock_bh(&sock->sk->sk_callback_lock);

	return 0;
}
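
/*
 * Set up a queue for a freshly accepted socket: allocate an index and the
 * connect command, initialize the nvmet SQ, pick a CPU for io_work, and take
 * over the socket callbacks before kicking the first receive.
 */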
static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
		struct socket *newsock)
{
	struct nvmet_tcp_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return -ENOMEM;

	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
	queue->sock = newsock;
	queue->port = port;
	queue->nr_cmds = 0;
	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_TCP_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->free_list);
	init_llist_head(&queue->resp_list);
	INIT_LIST_HEAD(&queue->resp_send_list);

	queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = queue->idx;
		goto out_free_queue;
	}

	ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
	if (ret)
		goto out_ida_remove;

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret)
		goto out_free_connect;

	port->last_cpu = cpumask_next_wrap(port->last_cpu,
				cpu_online_mask, -1, false);
	queue->cpu = port->last_cpu;
	nvmet_prepare_receive_pdu(queue);

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);

	ret = nvmet_tcp_set_queue_sock(queue);
	if (ret)
		goto out_destroy_sq;

	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);

	return 0;
out_destroy_sq:
	mutex_lock(&nvmet_tcp_queue_mutex);
	list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_connect:
	nvmet_tcp_free_cmd(&queue->connect);
out_ida_remove:
	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
out_free_queue:
	kfree(queue);
	return ret;
}
*w
)
1504 struct nvmet_tcp_port
*port
=
1505 container_of(w
, struct nvmet_tcp_port
, accept_work
);
1506 struct socket
*newsock
;
1510 ret
= kernel_accept(port
->sock
, &newsock
, O_NONBLOCK
);
1513 pr_warn("failed to accept err=%d\n", ret
);
1516 ret
= nvmet_tcp_alloc_queue(port
, newsock
);
1518 pr_err("failed to allocate queue\n");
1519 sock_release(newsock
);
static void nvmet_tcp_listen_data_ready(struct sock *sk)
{
	struct nvmet_tcp_port *port;

	read_lock_bh(&sk->sk_callback_lock);
	port = sk->sk_user_data;
	if (!port)
		goto out;

	if (sk->sk_state == TCP_LISTEN)
		schedule_work(&port->accept_work);
out:
	read_unlock_bh(&sk->sk_callback_lock);
}
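
/*
 * Create a listening socket for the port: parse traddr/trsvcid, set
 * TCP_NODELAY and SO_REUSEADDR, then bind and listen; incoming connections
 * are accepted from the listen socket's data_ready callback via accept_work.
 */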
static int nvmet_tcp_add_port(struct nvmet_port *nport)
{
	struct nvmet_tcp_port *port;
	__kernel_sa_family_t af;
	int opt, ret;

	port = kzalloc(sizeof(*port), GFP_KERNEL);
	if (!port)
		return -ENOMEM;

	switch (nport->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				nport->disc_addr.adrfam);
		ret = -EINVAL;
		goto err_port;
	}

	ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
			nport->disc_addr.trsvcid, &port->addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			nport->disc_addr.traddr, nport->disc_addr.trsvcid);
		goto err_port;
	}

	port->nport = nport;
	port->last_cpu = -1;
	INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
	if (port->nport->inline_data_size < 0)
		port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;

	ret = sock_create(port->addr.ss_family, SOCK_STREAM,
				IPPROTO_TCP, &port->sock);
	if (ret) {
		pr_err("failed to create a socket\n");
		goto err_port;
	}

	port->sock->sk->sk_user_data = port;
	port->data_ready = port->sock->sk->sk_data_ready;
	port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;

	opt = 1;
	ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
			TCP_NODELAY, (char *)&opt, sizeof(opt));
	if (ret) {
		pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
		goto err_sock;
	}

	ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
			(char *)&opt, sizeof(opt));
	if (ret) {
		pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
		goto err_sock;
	}

	ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
			sizeof(port->addr));
	if (ret) {
		pr_err("failed to bind port socket %d\n", ret);
		goto err_sock;
	}

	ret = kernel_listen(port->sock, 128);
	if (ret) {
		pr_err("failed to listen %d on port sock\n", ret);
		goto err_sock;
	}

	pr_info("enabling port %d (%pISpc)\n",
		le16_to_cpu(nport->disc_addr.portid), &port->addr);
	nport->priv = port;
	return 0;

err_sock:
	sock_release(port->sock);
err_port:
	kfree(port);
	return ret;
}
*nport
)
1631 struct nvmet_tcp_port
*port
= nport
->priv
;
1633 write_lock_bh(&port
->sock
->sk
->sk_callback_lock
);
1634 port
->sock
->sk
->sk_data_ready
= port
->data_ready
;
1635 port
->sock
->sk
->sk_user_data
= NULL
;
1636 write_unlock_bh(&port
->sock
->sk
->sk_callback_lock
);
1637 cancel_work_sync(&port
->accept_work
);
1639 sock_release(port
->sock
);
1643 static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl
*ctrl
)
1645 struct nvmet_tcp_queue
*queue
;
1647 mutex_lock(&nvmet_tcp_queue_mutex
);
1648 list_for_each_entry(queue
, &nvmet_tcp_queue_list
, queue_list
)
1649 if (queue
->nvme_sq
.ctrl
== ctrl
)
1650 kernel_sock_shutdown(queue
->sock
, SHUT_RDWR
);
1651 mutex_unlock(&nvmet_tcp_queue_mutex
);
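
/*
 * Once the queue depth is known (connect has completed), size the command
 * array at twice the SQ size; on the admin queue, first let any in-flight
 * controller teardown complete.
 */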
static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
{
	struct nvmet_tcp_queue *queue =
		container_of(sq, struct nvmet_tcp_queue, nvme_sq);

	if (sq->qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	queue->nr_cmds = sq->size * 2;
	if (nvmet_tcp_alloc_cmds(queue))
		return NVME_SC_INTERNAL;
	return 0;
}
static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *nport, char *traddr)
{
	struct nvmet_tcp_port *port = nport->priv;

	if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
		struct nvmet_tcp_cmd *cmd =
			container_of(req, struct nvmet_tcp_cmd, req);
		struct nvmet_tcp_queue *queue = cmd->queue;

		sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
	} else {
		memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}
static struct nvmet_fabrics_ops nvmet_tcp_ops = {
	.owner			= THIS_MODULE,
	.type			= NVMF_TRTYPE_TCP,
	.msdbd			= 1,
	.has_keyed_sgls		= 0,
	.add_port		= nvmet_tcp_add_port,
	.remove_port		= nvmet_tcp_remove_port,
	.queue_response		= nvmet_tcp_queue_response,
	.delete_ctrl		= nvmet_tcp_delete_ctrl,
	.install_queue		= nvmet_tcp_install_queue,
	.disc_traddr		= nvmet_tcp_disc_port_addr,
};
static int __init nvmet_tcp_init(void)
{
	int ret;

	nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
	if (!nvmet_tcp_wq)
		return -ENOMEM;

	ret = nvmet_register_transport(&nvmet_tcp_ops);
	if (ret)
		goto err;

	return 0;
err:
	destroy_workqueue(nvmet_tcp_wq);
	return ret;
}
static void __exit nvmet_tcp_exit(void)
{
	struct nvmet_tcp_queue *queue;

	nvmet_unregister_transport(&nvmet_tcp_ops);

	flush_scheduled_work();
	mutex_lock(&nvmet_tcp_queue_mutex);
	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	mutex_unlock(&nvmet_tcp_queue_mutex);
	flush_scheduled_work();

	destroy_workqueue(nvmet_tcp_wq);
}
module_init(nvmet_tcp_init);
module_exit(nvmet_tcp_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */