// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics TCP target.
 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/nvme-tcp.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/inet.h>
#include <linux/llist.h>
#include <crypto/hash.h>

#include "nvmet.h"

#define NVMET_TCP_DEF_INLINE_DATA_SIZE	(4 * PAGE_SIZE)

#define NVMET_TCP_RECV_BUDGET		8
#define NVMET_TCP_SEND_BUDGET		8
#define NVMET_TCP_IO_WORK_BUDGET	64
enum nvmet_tcp_send_state {
	NVMET_TCP_SEND_DATA_PDU,
	NVMET_TCP_SEND_DATA,
	NVMET_TCP_SEND_R2T,
	NVMET_TCP_SEND_DDGST,
	NVMET_TCP_SEND_RESPONSE
};

enum nvmet_tcp_recv_state {
	NVMET_TCP_RECV_PDU,
	NVMET_TCP_RECV_DATA,
	NVMET_TCP_RECV_DDGST,
	NVMET_TCP_RECV_ERR,
};

enum {
	NVMET_TCP_F_INIT_FAILED = (1 << 0),
};
struct nvmet_tcp_cmd {
	struct nvmet_tcp_queue		*queue;
	struct nvmet_req		req;

	struct nvme_tcp_cmd_pdu		*cmd_pdu;
	struct nvme_tcp_rsp_pdu		*rsp_pdu;
	struct nvme_tcp_data_pdu	*data_pdu;
	struct nvme_tcp_r2t_pdu		*r2t_pdu;

	u32				rbytes_done;
	u32				wbytes_done;

	u32				pdu_len;
	u32				pdu_recv;
	int				sg_idx;
	int				nr_mapped;
	struct msghdr			recv_msg;
	struct kvec			*iov;
	u32				flags;

	struct list_head		entry;
	struct llist_node		lentry;
	u32				offset;
	struct scatterlist		*cur_sg;
	enum nvmet_tcp_send_state	state;

	__le32				exp_ddgst;
	__le32				recv_ddgst;
};
enum nvmet_tcp_queue_state {
	NVMET_TCP_Q_CONNECTING,
	NVMET_TCP_Q_LIVE,
	NVMET_TCP_Q_DISCONNECTING,
};
struct nvmet_tcp_queue {
	struct socket		*sock;
	struct nvmet_tcp_port	*port;
	struct work_struct	io_work;
	int			cpu;
	struct nvmet_cq		nvme_cq;
	struct nvmet_sq		nvme_sq;

	/* send state */
	struct nvmet_tcp_cmd	*cmds;
	int			nr_cmds;
	struct list_head	free_list;
	struct llist_head	resp_list;
	struct list_head	resp_send_list;
	int			send_list_len;
	struct nvmet_tcp_cmd	*snd_cmd;

	/* recv state */
	int			offset;
	int			left;
	enum nvmet_tcp_recv_state rcv_state;
	struct nvmet_tcp_cmd	*cmd;
	union nvme_tcp_pdu	pdu;

	/* digest state */
	bool			hdr_digest;
	bool			data_digest;
	struct ahash_request	*snd_hash;
	struct ahash_request	*rcv_hash;

	spinlock_t		state_lock;
	enum nvmet_tcp_queue_state state;

	struct sockaddr_storage	sockaddr;
	struct sockaddr_storage	sockaddr_peer;
	struct work_struct	release_work;

	int			idx;
	struct list_head	queue_list;

	struct nvmet_tcp_cmd	connect;

	struct page_frag_cache	pf_cache;

	void (*data_ready)(struct sock *);
	void (*state_change)(struct sock *);
	void (*write_space)(struct sock *);
};
struct nvmet_tcp_port {
	struct socket		*sock;
	struct work_struct	accept_work;
	struct nvmet_port	*nport;
	struct sockaddr_storage	addr;
	int			last_cpu;
	void (*data_ready)(struct sock *);
};
static DEFINE_IDA(nvmet_tcp_queue_ida);
static LIST_HEAD(nvmet_tcp_queue_list);
static DEFINE_MUTEX(nvmet_tcp_queue_mutex);

static struct workqueue_struct *nvmet_tcp_wq;
static struct nvmet_fabrics_ops nvmet_tcp_ops;
static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *cmd)
{
	return cmd - queue->cmds;
}
static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
{
	return nvme_is_write(cmd->req.cmd) &&
		cmd->rbytes_done < cmd->req.transfer_len;
}

static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
{
	return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
}

static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
{
	return !nvme_is_write(cmd->req.cmd) &&
		cmd->req.transfer_len > 0 &&
		!cmd->req.cqe->status;
}
static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
{
	return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
		!(cmd->flags & NVMET_TCP_F_INIT_FAILED);
}
static inline struct nvmet_tcp_cmd *
nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd;

	cmd = list_first_entry_or_null(&queue->free_list,
				struct nvmet_tcp_cmd, entry);
	if (!cmd)
		return NULL;
	list_del_init(&cmd->entry);

	cmd->rbytes_done = cmd->wbytes_done = 0;
	return cmd;
}
static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
{
	if (unlikely(cmd == &cmd->queue->connect))
		return;

	list_add_tail(&cmd->entry, &cmd->queue->free_list);
}
static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
{
	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}

static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
{
	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}
static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
		void *pdu, size_t len)
{
	struct scatterlist sg;

	sg_init_one(&sg, pdu, len);
	ahash_request_set_crypt(hash, &sg, pdu + len, len);
	crypto_ahash_digest(hash);
}
static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
		void *pdu, size_t len)
{
	struct nvme_tcp_hdr *hdr = pdu;
	__le32 recv_digest;
	__le32 exp_digest;

	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
		pr_err("queue %d: header digest enabled but no header digest\n",
			queue->idx);
		return -EPROTO;
	}

	recv_digest = *(__le32 *)(pdu + hdr->hlen);
	nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
	exp_digest = *(__le32 *)(pdu + hdr->hlen);
	if (recv_digest != exp_digest) {
		pr_err("queue %d: header digest error: recv %#x expected %#x\n",
			queue->idx, le32_to_cpu(recv_digest),
			le32_to_cpu(exp_digest));
		return -EPROTO;
	}

	return 0;
}
static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
{
	struct nvme_tcp_hdr *hdr = pdu;
	u8 digest_len = nvmet_tcp_hdgst_len(queue);
	u32 len;

	len = le32_to_cpu(hdr->plen) - hdr->hlen -
		(hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);

	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
		pr_err("queue %d: data digest flag is cleared\n", queue->idx);
		return -EPROTO;
	}

	return 0;
}
static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
{
	struct scatterlist *sg;
	int i;

	sg = &cmd->req.sg[cmd->sg_idx];

	for (i = 0; i < cmd->nr_mapped; i++)
		kunmap(sg_page(&sg[i]));
}
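
/*
 * The mapping helper below builds a kvec array over the command's
 * scatterlist (kmap'ing each page) so in-capsule and H2C data can be
 * received directly into the destination buffers via cmd->recv_msg.
 */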
static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
{
	struct kvec *iov = cmd->iov;
	struct scatterlist *sg;
	u32 length, offset, sg_offset;

	length = cmd->pdu_len;
	cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
	offset = cmd->rbytes_done;
	cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE);
	sg_offset = offset % PAGE_SIZE;
	sg = &cmd->req.sg[cmd->sg_idx];

	while (length) {
		u32 iov_len = min_t(u32, length, sg->length - sg_offset);

		iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
		iov->iov_len = iov_len;

		length -= iov_len;
		sg = sg_next(sg);
		iov++;
		sg_offset = 0;
	}

	iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
		cmd->nr_mapped, cmd->pdu_len);
}
static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
{
	queue->rcv_state = NVMET_TCP_RECV_ERR;
	if (queue->nvme_sq.ctrl)
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	else
		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
}
static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
	u32 len = le32_to_cpu(sgl->length);

	if (!len)
		return 0;

	if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
			  NVME_SGL_FMT_OFFSET)) {
		if (!nvme_is_write(cmd->req.cmd))
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;

		if (len > cmd->req.port->inline_data_size)
			return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
		cmd->pdu_len = len;
	}
	cmd->req.transfer_len += len;

	cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
	if (!cmd->req.sg)
		return NVME_SC_INTERNAL;
	cmd->cur_sg = cmd->req.sg;

	if (nvmet_tcp_has_data_in(cmd)) {
		cmd->iov = kmalloc_array(cmd->req.sg_cnt,
				sizeof(*cmd->iov), GFP_KERNEL);
		if (!cmd->iov)
			goto err;
	}

	return 0;
err:
	sgl_free(cmd->req.sg);
	return NVME_SC_INTERNAL;
}
static void nvmet_tcp_ddgst(struct ahash_request *hash,
		struct nvmet_tcp_cmd *cmd)
{
	ahash_request_set_crypt(hash, cmd->req.sg,
		(void *)&cmd->exp_ddgst, cmd->req.transfer_len);
	crypto_ahash_digest(hash);
}
static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_DATA_PDU;

	pdu->hdr.type = nvme_tcp_c2h_data;
	pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
						NVME_TCP_F_DATA_SUCCESS : 0);
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
	pdu->hdr.plen =
		cpu_to_le32(pdu->hdr.hlen + hdgst +
				cmd->req.transfer_len + ddgst);
	pdu->command_id = cmd->req.cqe->command_id;
	pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
	pdu->data_offset = cpu_to_le32(cmd->wbytes_done);

	if (queue->data_digest) {
		pdu->hdr.flags |= NVME_TCP_F_DDGST;
		nvmet_tcp_ddgst(queue->snd_hash, cmd);
	}

	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
	}
}
static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_R2T;

	pdu->hdr.type = nvme_tcp_r2t;
	pdu->hdr.flags = 0;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = 0;
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);

	pdu->command_id = cmd->req.cmd->common.command_id;
	pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
	pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
	pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
	}
}
static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_RESPONSE;

	pdu->hdr.type = nvme_tcp_rsp;
	pdu->hdr.flags = 0;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = 0;
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
	}
}
static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
{
	struct llist_node *node;

	node = llist_del_all(&queue->resp_list);
	if (!node)
		return;

	while (node) {
		struct nvmet_tcp_cmd *cmd = llist_entry(node,
					struct nvmet_tcp_cmd, lentry);

		list_add(&cmd->entry, &queue->resp_send_list);
		node = node->next;
		queue->send_list_len++;
	}
}
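
/*
 * Pick the next command to transmit: drain completions queued by
 * nvmet_tcp_queue_response() (via the lockless resp_list) onto
 * resp_send_list, then prepare the matching C2H data, R2T or response PDU.
 */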
static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
{
	queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
				struct nvmet_tcp_cmd, entry);
	if (!queue->snd_cmd) {
		nvmet_tcp_process_resp_list(queue);
		queue->snd_cmd =
			list_first_entry_or_null(&queue->resp_send_list,
					struct nvmet_tcp_cmd, entry);
		if (unlikely(!queue->snd_cmd))
			return NULL;
	}

	list_del_init(&queue->snd_cmd->entry);
	queue->send_list_len--;

	if (nvmet_tcp_need_data_out(queue->snd_cmd))
		nvmet_setup_c2h_data_pdu(queue->snd_cmd);
	else if (nvmet_tcp_need_data_in(queue->snd_cmd))
		nvmet_setup_r2t_pdu(queue->snd_cmd);
	else
		nvmet_setup_response_pdu(queue->snd_cmd);

	return queue->snd_cmd;
}
static void nvmet_tcp_queue_response(struct nvmet_req *req)
{
	struct nvmet_tcp_cmd *cmd =
		container_of(req, struct nvmet_tcp_cmd, req);
	struct nvmet_tcp_queue	*queue = cmd->queue;

	llist_add(&cmd->lentry, &queue->resp_list);
	queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
}
static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
{
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
	int ret;

	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
			offset_in_page(cmd->data_pdu) + cmd->offset,
			left, MSG_DONTWAIT | MSG_MORE);
	if (ret <= 0)
		return ret;

	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	cmd->state = NVMET_TCP_SEND_DATA;
	cmd->offset = 0;
	return 1;
}
static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
{
	struct nvmet_tcp_queue *queue = cmd->queue;
	int ret;

	while (cmd->cur_sg) {
		struct page *page = sg_page(cmd->cur_sg);
		u32 left = cmd->cur_sg->length - cmd->offset;

		ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
					left, MSG_DONTWAIT | MSG_MORE);
		if (ret <= 0)
			return ret;

		cmd->offset += ret;
		cmd->wbytes_done += ret;

		/* Done with sg? */
		if (cmd->offset == cmd->cur_sg->length) {
			cmd->cur_sg = sg_next(cmd->cur_sg);
			cmd->offset = 0;
		}
	}

	if (queue->data_digest) {
		cmd->state = NVMET_TCP_SEND_DDGST;
		cmd->offset = 0;
	} else {
		if (queue->nvme_sq.sqhd_disabled) {
			cmd->queue->snd_cmd = NULL;
			nvmet_tcp_put_cmd(cmd);
		} else {
			nvmet_setup_response_pdu(cmd);
		}
	}

	if (queue->nvme_sq.sqhd_disabled)
		sgl_free(cmd->req.sg);

	return 1;
}
static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
		bool last_in_batch)
{
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
	int flags = MSG_DONTWAIT;
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		flags |= MSG_MORE;

	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
		offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
	if (ret <= 0)
		return ret;
	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	sgl_free(cmd->req.sg);
	cmd->queue->snd_cmd = NULL;
	nvmet_tcp_put_cmd(cmd);
	return 1;
}
static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
	int flags = MSG_DONTWAIT;
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		flags |= MSG_MORE;

	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
		offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
	if (ret <= 0)
		return ret;
	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	cmd->queue->snd_cmd = NULL;
	return 1;
}
static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd)
{
	struct nvmet_tcp_queue *queue = cmd->queue;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
	struct kvec iov = {
		.iov_base = &cmd->exp_ddgst + cmd->offset,
		.iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
	};
	int ret;

	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (unlikely(ret <= 0))
		return ret;

	cmd->offset += ret;

	if (queue->nvme_sq.sqhd_disabled) {
		cmd->queue->snd_cmd = NULL;
		nvmet_tcp_put_cmd(cmd);
	} else {
		nvmet_setup_response_pdu(cmd);
	}
	return 1;
}
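
/*
 * Send-side dispatcher: walk the current command through the
 * NVMET_TCP_SEND_* states (data PDU, data, data digest, R2T, response)
 * until the socket would block or the command completes.
 */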
static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
		bool last_in_batch)
{
	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
	int ret = 0;

	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
		cmd = nvmet_tcp_fetch_cmd(queue);
		if (unlikely(!cmd))
			return 0;
	}

	if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
		ret = nvmet_try_send_data_pdu(cmd);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_DATA) {
		ret = nvmet_try_send_data(cmd);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_DDGST) {
		ret = nvmet_try_send_ddgst(cmd);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_R2T) {
		ret = nvmet_try_send_r2t(cmd, last_in_batch);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_RESPONSE)
		ret = nvmet_try_send_response(cmd, last_in_batch);

done_send:
	if (ret < 0) {
		if (ret == -EAGAIN)
			return 0;
		return ret;
	}

	return 1;
}
static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
		int budget, int *sends)
{
	int i, ret = 0;

	for (i = 0; i < budget; i++) {
		ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
		if (ret <= 0)
			break;
		(*sends)++;
	}

	return ret;
}
static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
{
	queue->offset = 0;
	queue->left = sizeof(struct nvme_tcp_hdr);
	queue->cmd = NULL;
	queue->rcv_state = NVMET_TCP_RECV_PDU;
}
static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
{
	struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);

	ahash_request_free(queue->rcv_hash);
	ahash_request_free(queue->snd_hash);
	crypto_free_ahash(tfm);
}
static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
{
	struct crypto_ahash *tfm;

	tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!queue->snd_hash)
		goto free_tfm;
	ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);

	queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!queue->rcv_hash)
		goto free_snd_hash;
	ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);

	return 0;
free_snd_hash:
	ahash_request_free(queue->snd_hash);
free_tfm:
	crypto_free_ahash(tfm);
	return -ENOMEM;
}
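
/*
 * Connection establishment: validate the host's ICReq, negotiate header
 * and data digests, and answer with an ICResp before moving the queue to
 * NVMET_TCP_Q_LIVE.
 */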
static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
	struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
	struct msghdr msg = {};
	struct kvec iov;
	int ret;

	if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
		pr_err("bad nvme-tcp pdu length (%d)\n",
			le32_to_cpu(icreq->hdr.plen));
		nvmet_tcp_fatal_error(queue);
	}

	if (icreq->pfv != NVME_TCP_PFV_1_0) {
		pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
		return -EPROTO;
	}

	if (icreq->hpda != 0) {
		pr_err("queue %d: unsupported hpda %d\n", queue->idx,
			icreq->hpda);
		return -EPROTO;
	}

	queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
	queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
	if (queue->hdr_digest || queue->data_digest) {
		ret = nvmet_tcp_alloc_crypto(queue);
		if (ret)
			return ret;
	}

	memset(icresp, 0, sizeof(*icresp));
	icresp->hdr.type = nvme_tcp_icresp;
	icresp->hdr.hlen = sizeof(*icresp);
	icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
	icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
	icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */
	if (queue->hdr_digest)
		icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
	if (queue->data_digest)
		icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;

	iov.iov_base = icresp;
	iov.iov_len = sizeof(*icresp);
	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (ret < 0)
		goto free_crypto;

	queue->state = NVMET_TCP_Q_LIVE;
	nvmet_prepare_receive_pdu(queue);
	return 0;
free_crypto:
	if (queue->hdr_digest || queue->data_digest)
		nvmet_tcp_free_crypto(queue);
	return ret;
}
static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
{
	size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
	int ret;

	if (!nvme_is_write(cmd->req.cmd) ||
	    data_len > cmd->req.port->inline_data_size) {
		nvmet_prepare_receive_pdu(queue);
		return;
	}

	ret = nvmet_tcp_map_data(cmd);
	if (unlikely(ret)) {
		pr_err("queue %d: failed to map data\n", queue->idx);
		nvmet_tcp_fatal_error(queue);
		return;
	}

	queue->rcv_state = NVMET_TCP_RECV_DATA;
	nvmet_tcp_map_pdu_iovec(cmd);
	cmd->flags |= NVMET_TCP_F_INIT_FAILED;
}
static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_data_pdu *data = &queue->pdu.data;
	struct nvmet_tcp_cmd *cmd;

	cmd = &queue->cmds[data->ttag];

	if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
		pr_err("ttag %u unexpected data offset %u (expected %u)\n",
			data->ttag, le32_to_cpu(data->data_offset),
			cmd->rbytes_done);
		/* FIXME: use path and transport errors */
		nvmet_req_complete(&cmd->req,
			NVME_SC_INVALID_FIELD | NVME_SC_DNR);
		return -EPROTO;
	}

	cmd->pdu_len = le32_to_cpu(data->data_length);
	cmd->pdu_recv = 0;
	nvmet_tcp_map_pdu_iovec(cmd);
	queue->cmd = cmd;
	queue->rcv_state = NVMET_TCP_RECV_DATA;

	return 0;
}
static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
	struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
	struct nvmet_req *req;
	int ret;

	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
		if (hdr->type != nvme_tcp_icreq) {
			pr_err("unexpected pdu type (%d) before icreq\n",
				hdr->type);
			nvmet_tcp_fatal_error(queue);
			return -EPROTO;
		}
		return nvmet_tcp_handle_icreq(queue);
	}

	if (hdr->type == nvme_tcp_h2c_data) {
		ret = nvmet_tcp_handle_h2c_data_pdu(queue);
		if (unlikely(ret))
			return ret;
		return 0;
	}

	queue->cmd = nvmet_tcp_get_cmd(queue);
	if (unlikely(!queue->cmd)) {
		/* This should never happen */
		pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
			queue->idx, queue->nr_cmds, queue->send_list_len,
			nvme_cmd->common.opcode);
		nvmet_tcp_fatal_error(queue);
		return -ENOMEM;
	}

	req = &queue->cmd->req;
	memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));

	if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_tcp_ops))) {
		pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
			req->cmd, req->cmd->common.command_id,
			req->cmd->common.opcode,
			le32_to_cpu(req->cmd->common.dptr.sgl.length));

		nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
		return -EAGAIN;
	}

	ret = nvmet_tcp_map_data(queue->cmd);
	if (unlikely(ret)) {
		pr_err("queue %d: failed to map data\n", queue->idx);
		if (nvmet_tcp_has_inline_data(queue->cmd))
			nvmet_tcp_fatal_error(queue);
		else
			nvmet_req_complete(req, ret);
		ret = -EAGAIN;
		goto out;
	}

	if (nvmet_tcp_need_data_in(queue->cmd)) {
		if (nvmet_tcp_has_inline_data(queue->cmd)) {
			queue->rcv_state = NVMET_TCP_RECV_DATA;
			nvmet_tcp_map_pdu_iovec(queue->cmd);
			return 0;
		}
		/* send back R2T */
		nvmet_tcp_queue_response(&queue->cmd->req);
		goto out;
	}

	queue->cmd->req.execute(&queue->cmd->req);
out:
	nvmet_prepare_receive_pdu(queue);
	return ret;
}
static const u8 nvme_tcp_pdu_sizes[] = {
	[nvme_tcp_icreq]	= sizeof(struct nvme_tcp_icreq_pdu),
	[nvme_tcp_cmd]		= sizeof(struct nvme_tcp_cmd_pdu),
	[nvme_tcp_h2c_data]	= sizeof(struct nvme_tcp_data_pdu),
};

static inline u8 nvmet_tcp_pdu_size(u8 type)
{
	size_t idx = type;

	return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
		nvme_tcp_pdu_sizes[idx]) ?
			nvme_tcp_pdu_sizes[idx] : 0;
}

static inline bool nvmet_tcp_pdu_valid(u8 type)
{
	switch (type) {
	case nvme_tcp_icreq:
	case nvme_tcp_cmd:
	case nvme_tcp_h2c_data:
		return true;
	}

	return false;
}
static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
	int len;
	struct kvec iov;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };

recv:
	iov.iov_base = (void *)&queue->pdu + queue->offset;
	iov.iov_len = queue->left;
	len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (unlikely(len < 0))
		return len;

	queue->offset += len;
	queue->left -= len;
	if (queue->left)
		return -EAGAIN;

	if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
		u8 hdgst = nvmet_tcp_hdgst_len(queue);

		if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
			pr_err("unexpected pdu type %d\n", hdr->type);
			nvmet_tcp_fatal_error(queue);
			return -EIO;
		}

		if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
			pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
			return -EIO;
		}

		queue->left = hdr->hlen - queue->offset + hdgst;
		goto recv;
	}

	if (queue->hdr_digest &&
	    nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
		nvmet_tcp_fatal_error(queue); /* fatal */
		return -EPROTO;
	}

	if (queue->data_digest &&
	    nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
		nvmet_tcp_fatal_error(queue); /* fatal */
		return -EPROTO;
	}

	return nvmet_tcp_done_recv_pdu(queue);
}
static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
{
	struct nvmet_tcp_queue *queue = cmd->queue;

	nvmet_tcp_ddgst(queue->rcv_hash, cmd);
	queue->offset = 0;
	queue->left = NVME_TCP_DIGEST_LENGTH;
	queue->rcv_state = NVMET_TCP_RECV_DDGST;
}
static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmd;
	int ret;

	while (msg_data_left(&cmd->recv_msg)) {
		ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
			cmd->recv_msg.msg_flags);
		if (ret <= 0)
			return ret;

		cmd->pdu_recv += ret;
		cmd->rbytes_done += ret;
	}

	nvmet_tcp_unmap_pdu_iovec(cmd);

	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
	    cmd->rbytes_done == cmd->req.transfer_len) {
		if (queue->data_digest) {
			nvmet_tcp_prep_recv_ddgst(cmd);
			return 0;
		}
		cmd->req.execute(&cmd->req);
	}

	nvmet_prepare_receive_pdu(queue);
	return 0;
}
static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmd;
	int ret;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
	struct kvec iov = {
		.iov_base = (void *)&cmd->recv_ddgst + queue->offset,
		.iov_len = queue->left
	};

	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (unlikely(ret < 0))
		return ret;

	queue->offset += ret;
	queue->left -= ret;
	if (queue->left)
		return -EAGAIN;

	if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
		pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
			queue->idx, cmd->req.cmd->common.command_id,
			queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
			le32_to_cpu(cmd->exp_ddgst));
		nvmet_tcp_finish_cmd(cmd);
		nvmet_tcp_fatal_error(queue);
		ret = -EPROTO;
		goto out;
	}

	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
	    cmd->rbytes_done == cmd->req.transfer_len)
		cmd->req.execute(&cmd->req);
	ret = 0;
out:
	nvmet_prepare_receive_pdu(queue);
	return ret;
}
static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
{
	int result = 0;

	if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
		return 0;

	if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
		result = nvmet_tcp_try_recv_pdu(queue);
		if (result != 0)
			goto done_recv;
	}

	if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
		result = nvmet_tcp_try_recv_data(queue);
		if (result != 0)
			goto done_recv;
	}

	if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
		result = nvmet_tcp_try_recv_ddgst(queue);
		if (result != 0)
			goto done_recv;
	}

done_recv:
	if (result < 0) {
		if (result == -EAGAIN)
			return 0;
		return result;
	}
	return 1;
}

static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
		int budget, int *recvs)
{
	int i, ret = 0;

	for (i = 0; i < budget; i++) {
		ret = nvmet_tcp_try_recv_one(queue);
		if (ret <= 0)
			break;
		(*recvs)++;
	}

	return ret;
}
static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
{
	spin_lock(&queue->state_lock);
	if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
		queue->state = NVMET_TCP_Q_DISCONNECTING;
		schedule_work(&queue->release_work);
	}
	spin_unlock(&queue->state_lock);
}
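
/*
 * Per-queue I/O worker: alternate between the receive and send budgets and
 * keep going while either direction makes progress, up to
 * NVMET_TCP_IO_WORK_BUDGET operations, then requeue if work remains.
 */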
static void nvmet_tcp_io_work(struct work_struct *w)
{
	struct nvmet_tcp_queue *queue =
		container_of(w, struct nvmet_tcp_queue, io_work);
	bool pending;
	int ret, ops = 0;

	do {
		pending = false;

		ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
		if (ret > 0) {
			pending = true;
		} else if (ret < 0) {
			if (ret == -EPIPE || ret == -ECONNRESET)
				kernel_sock_shutdown(queue->sock, SHUT_RDWR);
			else
				nvmet_tcp_fatal_error(queue);
			return;
		}

		ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
		if (ret > 0) {
			/* transmitted message/data */
			pending = true;
		} else if (ret < 0) {
			if (ret == -EPIPE || ret == -ECONNRESET)
				kernel_sock_shutdown(queue->sock, SHUT_RDWR);
			else
				nvmet_tcp_fatal_error(queue);
			return;
		}

	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);

	/*
	 * We exhausted our budget, requeue ourselves
	 */
	if (pending)
		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
}
static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *c)
{
	u8 hdgst = nvmet_tcp_hdgst_len(queue);

	c->queue = queue;
	c->req.port = queue->port->nport;

	c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->cmd_pdu)
		return -ENOMEM;
	c->req.cmd = &c->cmd_pdu->cmd;

	c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->rsp_pdu)
		goto out_free_cmd;
	c->req.cqe = &c->rsp_pdu->cqe;

	c->data_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->data_pdu)
		goto out_free_rsp;

	c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->r2t_pdu)
		goto out_free_data;

	c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;

	list_add_tail(&c->entry, &queue->free_list);

	return 0;
out_free_data:
	page_frag_free(c->data_pdu);
out_free_rsp:
	page_frag_free(c->rsp_pdu);
out_free_cmd:
	page_frag_free(c->cmd_pdu);
	return -ENOMEM;
}
static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
{
	page_frag_free(c->r2t_pdu);
	page_frag_free(c->data_pdu);
	page_frag_free(c->rsp_pdu);
	page_frag_free(c->cmd_pdu);
}
static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmds;
	int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
		if (ret)
			goto out_free;
	}

	queue->cmds = cmds;

	return 0;
out_free:
	while (--i >= 0)
		nvmet_tcp_free_cmd(cmds + i);
	kfree(cmds);
out:
	return ret;
}

static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmds = queue->cmds;
	int i;

	for (i = 0; i < queue->nr_cmds; i++)
		nvmet_tcp_free_cmd(cmds + i);

	nvmet_tcp_free_cmd(&queue->connect);
	kfree(cmds);
}
static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
{
	struct socket *sock = queue->sock;

	write_lock_bh(&sock->sk->sk_callback_lock);
	sock->sk->sk_data_ready = queue->data_ready;
	sock->sk->sk_state_change = queue->state_change;
	sock->sk->sk_write_space = queue->write_space;
	sock->sk->sk_user_data = NULL;
	write_unlock_bh(&sock->sk->sk_callback_lock);
}
static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
{
	nvmet_req_uninit(&cmd->req);
	nvmet_tcp_unmap_pdu_iovec(cmd);
	sgl_free(cmd->req.sg);
}

static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmds;
	int i;

	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
		if (nvmet_tcp_need_data_in(cmd))
			nvmet_tcp_finish_cmd(cmd);
	}

	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
		/* failed in connect */
		nvmet_tcp_finish_cmd(&queue->connect);
	}
}
static void nvmet_tcp_release_queue_work(struct work_struct *w)
{
	struct nvmet_tcp_queue *queue =
		container_of(w, struct nvmet_tcp_queue, release_work);

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);

	nvmet_tcp_restore_socket_callbacks(queue);
	flush_work(&queue->io_work);

	nvmet_tcp_uninit_data_in_cmds(queue);
	nvmet_sq_destroy(&queue->nvme_sq);
	cancel_work_sync(&queue->io_work);
	sock_release(queue->sock);
	nvmet_tcp_free_cmds(queue);
	if (queue->hdr_digest || queue->data_digest)
		nvmet_tcp_free_crypto(queue);
	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);

	kfree(queue);
}
static void nvmet_tcp_data_ready(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (likely(queue))
		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
	read_unlock_bh(&sk->sk_callback_lock);
}
static void nvmet_tcp_write_space(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (unlikely(!queue))
		goto out;

	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
		queue->write_space(sk);
		goto out;
	}

	if (sk_stream_is_writeable(sk)) {
		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
	}
out:
	read_unlock_bh(&sk->sk_callback_lock);
}
static void nvmet_tcp_state_change(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	write_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (!queue)
		goto done;

	switch (sk->sk_state) {
	case TCP_FIN_WAIT1:
	case TCP_CLOSE_WAIT:
	case TCP_CLOSE:
		/* FALLTHRU */
		sk->sk_user_data = NULL;
		nvmet_tcp_schedule_release_queue(queue);
		break;
	default:
		pr_warn("queue %d unhandled state %d\n",
			queue->idx, sk->sk_state);
	}
done:
	write_unlock_bh(&sk->sk_callback_lock);
}
static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
{
	struct socket *sock = queue->sock;
	struct inet_sock *inet = inet_sk(sock->sk);
	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
	int ret;

	ret = kernel_getsockname(sock,
		(struct sockaddr *)&queue->sockaddr);
	if (ret < 0)
		return ret;

	ret = kernel_getpeername(sock,
		(struct sockaddr *)&queue->sockaddr_peer);
	if (ret < 0)
		return ret;

	/*
	 * Cleanup whatever is sitting in the TCP transmit queue on socket
	 * close. This is done to prevent stale data from being sent should
	 * the network connection be restored before TCP times out.
	 */
	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
			(char *)&sol, sizeof(sol));
	if (ret)
		return ret;

	/* Set socket type of service */
	if (inet->rcv_tos > 0) {
		int tos = inet->rcv_tos;

		ret = kernel_setsockopt(sock, SOL_IP, IP_TOS,
				(char *)&tos, sizeof(tos));
		if (ret)
			return ret;
	}

	write_lock_bh(&sock->sk->sk_callback_lock);
	sock->sk->sk_user_data = queue;
	queue->data_ready = sock->sk->sk_data_ready;
	sock->sk->sk_data_ready = nvmet_tcp_data_ready;
	queue->state_change = sock->sk->sk_state_change;
	sock->sk->sk_state_change = nvmet_tcp_state_change;
	queue->write_space = sock->sk->sk_write_space;
	sock->sk->sk_write_space = nvmet_tcp_write_space;
	write_unlock_bh(&sock->sk->sk_callback_lock);

	return 0;
}
static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
		struct socket *newsock)
{
	struct nvmet_tcp_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return -ENOMEM;

	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
	queue->sock = newsock;
	queue->port = port;
	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_TCP_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->free_list);
	init_llist_head(&queue->resp_list);
	INIT_LIST_HEAD(&queue->resp_send_list);

	queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = queue->idx;
		goto out_free_queue;
	}

	ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
	if (ret)
		goto out_ida_remove;

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret)
		goto out_free_connect;

	port->last_cpu = cpumask_next_wrap(port->last_cpu,
				cpu_online_mask, -1, false);
	queue->cpu = port->last_cpu;
	nvmet_prepare_receive_pdu(queue);

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);

	ret = nvmet_tcp_set_queue_sock(queue);
	if (ret)
		goto out_destroy_sq;

	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);

	return 0;
out_destroy_sq:
	mutex_lock(&nvmet_tcp_queue_mutex);
	list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_connect:
	nvmet_tcp_free_cmd(&queue->connect);
out_ida_remove:
	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
out_free_queue:
	kfree(queue);
	return ret;
}
static void nvmet_tcp_accept_work(struct work_struct *w)
{
	struct nvmet_tcp_port *port =
		container_of(w, struct nvmet_tcp_port, accept_work);
	struct socket *newsock;
	int ret;

	while (true) {
		ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
		if (ret < 0) {
			if (ret != -EAGAIN)
				pr_warn("failed to accept err=%d\n", ret);
			return;
		}
		ret = nvmet_tcp_alloc_queue(port, newsock);
		if (ret) {
			pr_err("failed to allocate queue\n");
			sock_release(newsock);
		}
	}
}
static void nvmet_tcp_listen_data_ready(struct sock *sk)
{
	struct nvmet_tcp_port *port;

	read_lock_bh(&sk->sk_callback_lock);
	port = sk->sk_user_data;
	if (!port)
		goto out;

	if (sk->sk_state == TCP_LISTEN)
		schedule_work(&port->accept_work);
out:
	read_unlock_bh(&sk->sk_callback_lock);
}
static int nvmet_tcp_add_port(struct nvmet_port *nport)
{
	struct nvmet_tcp_port *port;
	__kernel_sa_family_t af;
	int opt, ret;

	port = kzalloc(sizeof(*port), GFP_KERNEL);
	if (!port)
		return -ENOMEM;

	switch (nport->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				nport->disc_addr.adrfam);
		ret = -EINVAL;
		goto err_port;
	}

	ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
			nport->disc_addr.trsvcid, &port->addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			nport->disc_addr.traddr, nport->disc_addr.trsvcid);
		goto err_port;
	}

	port->nport = nport;
	port->last_cpu = -1;
	INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
	if (port->nport->inline_data_size < 0)
		port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;

	ret = sock_create(port->addr.ss_family, SOCK_STREAM,
				IPPROTO_TCP, &port->sock);
	if (ret) {
		pr_err("failed to create a socket\n");
		goto err_port;
	}

	port->sock->sk->sk_user_data = port;
	port->data_ready = port->sock->sk->sk_data_ready;
	port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;

	opt = 1;
	ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
			TCP_NODELAY, (char *)&opt, sizeof(opt));
	if (ret) {
		pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
		goto err_sock;
	}

	ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
			(char *)&opt, sizeof(opt));
	if (ret) {
		pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
		goto err_sock;
	}

	ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
			sizeof(port->addr));
	if (ret) {
		pr_err("failed to bind port socket %d\n", ret);
		goto err_sock;
	}

	ret = kernel_listen(port->sock, 128);
	if (ret) {
		pr_err("failed to listen %d on port sock\n", ret);
		goto err_sock;
	}

	nport->priv = port;
	pr_info("enabling port %d (%pISpc)\n",
		le16_to_cpu(nport->disc_addr.portid), &port->addr);

	return 0;

err_sock:
	sock_release(port->sock);
err_port:
	kfree(port);
	return ret;
}
static void nvmet_tcp_remove_port(struct nvmet_port *nport)
{
	struct nvmet_tcp_port *port = nport->priv;

	write_lock_bh(&port->sock->sk->sk_callback_lock);
	port->sock->sk->sk_data_ready = port->data_ready;
	port->sock->sk->sk_user_data = NULL;
	write_unlock_bh(&port->sock->sk->sk_callback_lock);
	cancel_work_sync(&port->accept_work);

	sock_release(port->sock);
	kfree(port);
}
static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_tcp_queue *queue;

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
		if (queue->nvme_sq.ctrl == ctrl)
			kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	mutex_unlock(&nvmet_tcp_queue_mutex);
}
static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
{
	struct nvmet_tcp_queue *queue =
		container_of(sq, struct nvmet_tcp_queue, nvme_sq);

	if (sq->qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	queue->nr_cmds = sq->size * 2;
	if (nvmet_tcp_alloc_cmds(queue))
		return NVME_SC_INTERNAL;
	return 0;
}
static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *nport, char *traddr)
{
	struct nvmet_tcp_port *port = nport->priv;

	if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
		struct nvmet_tcp_cmd *cmd =
			container_of(req, struct nvmet_tcp_cmd, req);
		struct nvmet_tcp_queue *queue = cmd->queue;

		sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
	} else {
		memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}
static struct nvmet_fabrics_ops nvmet_tcp_ops = {
	.owner			= THIS_MODULE,
	.type			= NVMF_TRTYPE_TCP,
	.msdbd			= 1,
	.has_keyed_sgls		= 0,
	.add_port		= nvmet_tcp_add_port,
	.remove_port		= nvmet_tcp_remove_port,
	.queue_response		= nvmet_tcp_queue_response,
	.delete_ctrl		= nvmet_tcp_delete_ctrl,
	.install_queue		= nvmet_tcp_install_queue,
	.disc_traddr		= nvmet_tcp_disc_port_addr,
};
static int __init nvmet_tcp_init(void)
{
	int ret;

	nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
	if (!nvmet_tcp_wq)
		return -ENOMEM;

	ret = nvmet_register_transport(&nvmet_tcp_ops);
	if (ret)
		goto err;

	return 0;
err:
	destroy_workqueue(nvmet_tcp_wq);
	return ret;
}
static void __exit nvmet_tcp_exit(void)
{
	struct nvmet_tcp_queue *queue;

	nvmet_unregister_transport(&nvmet_tcp_ops);

	flush_scheduled_work();
	mutex_lock(&nvmet_tcp_queue_mutex);
	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	mutex_unlock(&nvmet_tcp_queue_mutex);
	flush_scheduled_work();

	destroy_workqueue(nvmet_tcp_wq);
}
module_init(nvmet_tcp_init);
module_exit(nvmet_tcp_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */