// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics TCP target.
 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/nvme-tcp.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/inet.h>
#include <linux/llist.h>
#include <crypto/hash.h>

#include "nvmet.h"

#define NVMET_TCP_DEF_INLINE_DATA_SIZE	(4 * PAGE_SIZE)
/* Define the socket priority to use for connections where it is desirable
 * that the NIC consider performing optimized packet processing or filtering.
 * A non-zero value is sufficient to indicate general consideration of any
 * possible optimization. Making it a module param allows for alternative
 * values that may be unique for some NIC implementations.
 */
static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority");

#define NVMET_TCP_RECV_BUDGET		8
#define NVMET_TCP_SEND_BUDGET		8
#define NVMET_TCP_IO_WORK_BUDGET	64

enum nvmet_tcp_send_state {
	NVMET_TCP_SEND_DATA_PDU,
	NVMET_TCP_SEND_DATA,
	NVMET_TCP_SEND_R2T,
	NVMET_TCP_SEND_DDGST,
	NVMET_TCP_SEND_RESPONSE
};

enum nvmet_tcp_recv_state {
	NVMET_TCP_RECV_PDU,
	NVMET_TCP_RECV_DATA,
	NVMET_TCP_RECV_DDGST,
	NVMET_TCP_RECV_ERR,
};

enum {
	NVMET_TCP_F_INIT_FAILED = (1 << 0),
};

struct nvmet_tcp_cmd {
	struct nvmet_tcp_queue		*queue;
	struct nvmet_req		req;

	struct nvme_tcp_cmd_pdu		*cmd_pdu;
	struct nvme_tcp_rsp_pdu		*rsp_pdu;
	struct nvme_tcp_data_pdu	*data_pdu;
	struct nvme_tcp_r2t_pdu		*r2t_pdu;

	u32				rbytes_done;
	u32				wbytes_done;

	u32				pdu_len;
	u32				pdu_recv;
	int				sg_idx;
	int				nr_mapped;
	struct msghdr			recv_msg;
	struct kvec			*iov;
	u32				flags;

	struct list_head		entry;
	struct llist_node		lentry;
	u32				offset;
	struct scatterlist		*cur_sg;
	enum nvmet_tcp_send_state	state;

	__le32				exp_ddgst;
	__le32				recv_ddgst;
};
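
/*
 * Queue lifecycle: a queue starts out CONNECTING while the ICReq/ICResp
 * exchange is in flight, moves to LIVE once nvmet_tcp_handle_icreq() accepts
 * the connection, and is marked DISCONNECTING exactly once in
 * nvmet_tcp_schedule_release_queue() before release_work tears it down.
 */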

enum nvmet_tcp_queue_state {
	NVMET_TCP_Q_CONNECTING,
	NVMET_TCP_Q_LIVE,
	NVMET_TCP_Q_DISCONNECTING,
};

struct nvmet_tcp_queue {
	struct socket		*sock;
	struct nvmet_tcp_port	*port;
	struct work_struct	io_work;
	int			cpu;
	struct nvmet_cq		nvme_cq;
	struct nvmet_sq		nvme_sq;

	/* send state */
	struct nvmet_tcp_cmd	*cmds;
	unsigned int		nr_cmds;
	struct list_head	free_list;
	struct llist_head	resp_list;
	struct list_head	resp_send_list;
	int			send_list_len;
	struct nvmet_tcp_cmd	*snd_cmd;

	/* recv state */
	int			offset;
	int			left;
	enum nvmet_tcp_recv_state rcv_state;
	struct nvmet_tcp_cmd	*cmd;
	union nvme_tcp_pdu	pdu;

	/* digest state */
	bool			hdr_digest;
	bool			data_digest;
	struct ahash_request	*snd_hash;
	struct ahash_request	*rcv_hash;

	spinlock_t		state_lock;
	enum nvmet_tcp_queue_state state;

	struct sockaddr_storage	sockaddr;
	struct sockaddr_storage	sockaddr_peer;
	struct work_struct	release_work;

	int			idx;
	struct list_head	queue_list;

	struct nvmet_tcp_cmd	connect;

	struct page_frag_cache	pf_cache;

	void (*data_ready)(struct sock *);
	void (*state_change)(struct sock *);
	void (*write_space)(struct sock *);
};

struct nvmet_tcp_port {
	struct socket		*sock;
	struct work_struct	accept_work;
	struct nvmet_port	*nport;
	struct sockaddr_storage addr;
	int			last_cpu;
	void (*data_ready)(struct sock *);
};

static DEFINE_IDA(nvmet_tcp_queue_ida);
static LIST_HEAD(nvmet_tcp_queue_list);
static DEFINE_MUTEX(nvmet_tcp_queue_mutex);

static struct workqueue_struct *nvmet_tcp_wq;
static struct nvmet_fabrics_ops nvmet_tcp_ops;
static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);

static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *cmd)
{
	return cmd - queue->cmds;
}

static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
{
	return nvme_is_write(cmd->req.cmd) &&
		cmd->rbytes_done < cmd->req.transfer_len;
}

static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
{
	return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
}

static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
{
	return !nvme_is_write(cmd->req.cmd) &&
		cmd->req.transfer_len > 0 &&
		!cmd->req.cqe->status;
}

static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
{
	return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
		!cmd->rbytes_done;
}

static struct nvmet_tcp_cmd *
nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd;

	cmd = list_first_entry_or_null(&queue->free_list,
				struct nvmet_tcp_cmd, entry);
	if (!cmd)
		return NULL;
	list_del_init(&cmd->entry);

	cmd->rbytes_done = cmd->wbytes_done = 0;
	return cmd;
}

static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
{
	if (unlikely(cmd == &cmd->queue->connect))
		return;

	list_add_tail(&cmd->entry, &cmd->queue->free_list);
}

static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
{
	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}

static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
{
	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}

static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
		void *pdu, size_t len)
{
	struct scatterlist sg;

	sg_init_one(&sg, pdu, len);
	ahash_request_set_crypt(hash, &sg, pdu + len, len);
	crypto_ahash_digest(hash);
}
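
/*
 * Header digest verification: the CRC32C sent by the host sits immediately
 * after the PDU header (at hdr->hlen). nvmet_tcp_hdgst() recomputes the
 * digest over the received header into that same location, so comparing the
 * value read before and after the recomputation detects a corrupted header.
 */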

static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
	void *pdu, size_t len)
{
	struct nvme_tcp_hdr *hdr = pdu;
	__le32 recv_digest;
	__le32 exp_digest;

	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
		pr_err("queue %d: header digest enabled but no header digest\n",
			queue->idx);
		return -EPROTO;
	}

	recv_digest = *(__le32 *)(pdu + hdr->hlen);
	nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
	exp_digest = *(__le32 *)(pdu + hdr->hlen);
	if (recv_digest != exp_digest) {
		pr_err("queue %d: header digest error: recv %#x expected %#x\n",
			queue->idx, le32_to_cpu(recv_digest),
			le32_to_cpu(exp_digest));
		return -EPROTO;
	}

	return 0;
}

static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
{
	struct nvme_tcp_hdr *hdr = pdu;
	u8 digest_len = nvmet_tcp_hdgst_len(queue);
	u32 len;

	len = le32_to_cpu(hdr->plen) - hdr->hlen -
		(hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);

	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
		pr_err("queue %d: data digest flag is cleared\n", queue->idx);
		return -EPROTO;
	}

	return 0;
}

static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
{
	struct scatterlist *sg;
	int i;

	sg = &cmd->req.sg[cmd->sg_idx];

	for (i = 0; i < cmd->nr_mapped; i++)
		kunmap(sg_page(&sg[i]));
}
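
/*
 * nvmet_tcp_map_pdu_iovec() below kmap()s the scatterlist pages backing the
 * command's data buffer and exposes them as a kvec based msg_iter on
 * cmd->recv_msg, so inline or H2C data can be received straight from the
 * socket into the final buffers. nvmet_tcp_unmap_pdu_iovec() above undoes
 * the mapping once the data has been received.
 */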

static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
{
	struct kvec *iov = cmd->iov;
	struct scatterlist *sg;
	u32 length, offset, sg_offset;

	length = cmd->pdu_len;
	cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
	offset = cmd->rbytes_done;
	cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE);
	sg_offset = offset % PAGE_SIZE;
	sg = &cmd->req.sg[cmd->sg_idx];

	while (length) {
		u32 iov_len = min_t(u32, length, sg->length - sg_offset);

		iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
		iov->iov_len = iov_len;

		length -= iov_len;
		sg = sg_next(sg);
		iov++;
		sg_offset = 0;
	}

	iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
		cmd->nr_mapped, cmd->pdu_len);
}

static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
{
	queue->rcv_state = NVMET_TCP_RECV_ERR;
	if (queue->nvme_sq.ctrl)
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	else
		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
}
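
/*
 * nvmet_tcp_map_data() parses the command's SGL descriptor: an offset based
 * data block descriptor means in-capsule (inline) data, which is only
 * accepted for writes and only up to the port's inline_data_size; everything
 * else is transferred separately. An sg list sized for the transfer is
 * allocated, plus a kvec array when data has to be received from the host.
 */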

static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
	u32 len = le32_to_cpu(sgl->length);

	if (!len)
		return 0;

	if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
			  NVME_SGL_FMT_OFFSET)) {
		if (!nvme_is_write(cmd->req.cmd))
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;

		if (len > cmd->req.port->inline_data_size)
			return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
		cmd->pdu_len = len;
	}
	cmd->req.transfer_len += len;

	cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
	if (!cmd->req.sg)
		return NVME_SC_INTERNAL;
	cmd->cur_sg = cmd->req.sg;

	if (nvmet_tcp_has_data_in(cmd)) {
		cmd->iov = kmalloc_array(cmd->req.sg_cnt,
				sizeof(*cmd->iov), GFP_KERNEL);
		if (!cmd->iov)
			goto err;
	}

	return 0;
err:
	sgl_free(cmd->req.sg);
	return NVME_SC_INTERNAL;
}

static void nvmet_tcp_ddgst(struct ahash_request *hash,
		struct nvmet_tcp_cmd *cmd)
{
	ahash_request_set_crypt(hash, cmd->req.sg,
		(void *)&cmd->exp_ddgst, cmd->req.transfer_len);
	crypto_ahash_digest(hash);
}
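
/*
 * nvmet_setup_c2h_data_pdu() prepares a controller-to-host data transfer: it
 * fills in the C2H_DATA header (lengths, offset, command id), pre-computes
 * the data digest into cmd->exp_ddgst when data digests were negotiated, and
 * sets DATA_SUCCESS when the host disabled SQ head pointer updates, in which
 * case no separate response capsule follows the data.
 */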

static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_DATA_PDU;

	pdu->hdr.type = nvme_tcp_c2h_data;
	pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
						NVME_TCP_F_DATA_SUCCESS : 0);
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
	pdu->hdr.plen =
		cpu_to_le32(pdu->hdr.hlen + hdgst +
				cmd->req.transfer_len + ddgst);
	pdu->command_id = cmd->req.cqe->command_id;
	pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
	pdu->data_offset = cpu_to_le32(cmd->wbytes_done);

	if (queue->data_digest) {
		pdu->hdr.flags |= NVME_TCP_F_DDGST;
		nvmet_tcp_ddgst(queue->snd_hash, cmd);
	}

	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
	}
}

static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_R2T;

	pdu->hdr.type = nvme_tcp_r2t;
	pdu->hdr.flags = 0;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = 0;
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);

	pdu->command_id = cmd->req.cmd->common.command_id;
	pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
	pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
	pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
	}
}

static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_RESPONSE;

	pdu->hdr.type = nvme_tcp_rsp;
	pdu->hdr.flags = 0;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = 0;
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
	}
}
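
/*
 * Response queueing: nvmet_tcp_queue_response() may run from any context, so
 * completions are pushed onto the lock-less queue->resp_list. The io_work
 * thread later drains that llist into queue->resp_send_list; because
 * llist_del_all() hands the nodes back in reverse insertion order, using
 * list_add() while walking them restores the original submission order.
 */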

static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
{
	struct llist_node *node;

	node = llist_del_all(&queue->resp_list);

	while (node) {
		struct nvmet_tcp_cmd *cmd = llist_entry(node,
					struct nvmet_tcp_cmd, lentry);

		list_add(&cmd->entry, &queue->resp_send_list);
		node = node->next;
		queue->send_list_len++;
	}
}

static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
{
	queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
				struct nvmet_tcp_cmd, entry);
	if (!queue->snd_cmd) {
		nvmet_tcp_process_resp_list(queue);
		queue->snd_cmd =
			list_first_entry_or_null(&queue->resp_send_list,
					struct nvmet_tcp_cmd, entry);
		if (unlikely(!queue->snd_cmd))
			return NULL;
	}

	list_del_init(&queue->snd_cmd->entry);
	queue->send_list_len--;

	if (nvmet_tcp_need_data_out(queue->snd_cmd))
		nvmet_setup_c2h_data_pdu(queue->snd_cmd);
	else if (nvmet_tcp_need_data_in(queue->snd_cmd))
		nvmet_setup_r2t_pdu(queue->snd_cmd);
	else
		nvmet_setup_response_pdu(queue->snd_cmd);

	return queue->snd_cmd;
}

static void nvmet_tcp_queue_response(struct nvmet_req *req)
{
	struct nvmet_tcp_cmd *cmd =
		container_of(req, struct nvmet_tcp_cmd, req);
	struct nvmet_tcp_queue *queue = cmd->queue;

	llist_add(&cmd->lentry, &queue->resp_list);
	queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
}
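
/*
 * Send path: each command walks a small state machine (SEND_DATA_PDU,
 * SEND_DATA, optionally SEND_DDGST, then SEND_RESPONSE, or SEND_R2T for
 * host-to-controller transfers). The helpers below each push one piece with
 * kernel_sendpage()/kernel_sendmsg(), tracking partial sends in cmd->offset
 * and returning -EAGAIN until the socket accepts the rest.
 */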

static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
{
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
	int ret;

	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
			offset_in_page(cmd->data_pdu) + cmd->offset,
			left, MSG_DONTWAIT | MSG_MORE);
	if (ret <= 0)
		return ret;

	cmd->offset += ret;
	left -= ret;
	if (left)
		return -EAGAIN;

	cmd->state = NVMET_TCP_SEND_DATA;
	cmd->offset = 0;
	return 1;
}

static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
	struct nvmet_tcp_queue *queue = cmd->queue;
	int ret;

	while (cmd->cur_sg) {
		struct page *page = sg_page(cmd->cur_sg);
		u32 left = cmd->cur_sg->length - cmd->offset;
		int flags = MSG_DONTWAIT;

		if ((!last_in_batch && cmd->queue->send_list_len) ||
		    cmd->wbytes_done + left < cmd->req.transfer_len ||
		    queue->data_digest || !queue->nvme_sq.sqhd_disabled)
			flags |= MSG_MORE;

		ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
					left, flags);
		if (ret <= 0)
			return ret;

		cmd->offset += ret;
		cmd->wbytes_done += ret;

		if (cmd->offset == cmd->cur_sg->length) {
			cmd->cur_sg = sg_next(cmd->cur_sg);
			cmd->offset = 0;
		}
	}

	if (queue->data_digest) {
		cmd->state = NVMET_TCP_SEND_DDGST;
		cmd->offset = 0;
	} else {
		if (queue->nvme_sq.sqhd_disabled) {
			cmd->queue->snd_cmd = NULL;
			nvmet_tcp_put_cmd(cmd);
		} else {
			nvmet_setup_response_pdu(cmd);
		}
	}

	if (queue->nvme_sq.sqhd_disabled) {
		kfree(cmd->iov);
		sgl_free(cmd->req.sg);
	}

	return 1;
}

static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
		bool last_in_batch)
{
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
	int flags = MSG_DONTWAIT;
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		flags |= MSG_MORE;

	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
		offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
	if (ret <= 0)
		return ret;
	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	kfree(cmd->iov);
	sgl_free(cmd->req.sg);
	cmd->queue->snd_cmd = NULL;
	nvmet_tcp_put_cmd(cmd);
	return 1;
}

static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
	int flags = MSG_DONTWAIT;
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		flags |= MSG_MORE;

	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
		offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
	if (ret <= 0)
		return ret;
	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	cmd->queue->snd_cmd = NULL;
	return 1;
}

static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
	struct nvmet_tcp_queue *queue = cmd->queue;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
	struct kvec iov = {
		.iov_base = &cmd->exp_ddgst + cmd->offset,
		.iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
	};
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		msg.msg_flags |= MSG_MORE;

	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (unlikely(ret <= 0))
		return ret;

	cmd->offset += ret;

	if (queue->nvme_sq.sqhd_disabled) {
		cmd->queue->snd_cmd = NULL;
		nvmet_tcp_put_cmd(cmd);
	} else {
		nvmet_setup_response_pdu(cmd);
	}
	return 1;
}
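
/*
 * nvmet_tcp_try_send_one() advances queue->snd_cmd through the send states
 * above and returns 1 when a full step was transmitted, 0 when there is
 * nothing to send or the socket would block, or a negative error. The
 * last_in_batch hint lets the PDU and digest senders drop MSG_MORE on the
 * final send of a batch so the stack can push the frame out immediately.
 */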

static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
		bool last_in_batch)
{
	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
	int ret = 0;

	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
		cmd = nvmet_tcp_fetch_cmd(queue);
		if (unlikely(!cmd))
			return 0;
	}

	if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
		ret = nvmet_try_send_data_pdu(cmd);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_DATA) {
		ret = nvmet_try_send_data(cmd, last_in_batch);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_DDGST) {
		ret = nvmet_try_send_ddgst(cmd, last_in_batch);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_R2T) {
		ret = nvmet_try_send_r2t(cmd, last_in_batch);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_RESPONSE)
		ret = nvmet_try_send_response(cmd, last_in_batch);

done_send:
	if (ret < 0) {
		if (ret == -EAGAIN)
			return 0;
		return ret;
	}

	return 1;
}

static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
		int budget, int *sends)
{
	int i, ret = 0;

	for (i = 0; i < budget; i++) {
		ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
		if (ret <= 0)
			break;
		(*sends)++;
	}

	return ret;
}

static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
{
	queue->offset = 0;
	queue->left = sizeof(struct nvme_tcp_hdr);
	queue->cmd = NULL;
	queue->rcv_state = NVMET_TCP_RECV_PDU;
}

static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
{
	struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);

	ahash_request_free(queue->rcv_hash);
	ahash_request_free(queue->snd_hash);
	crypto_free_ahash(tfm);
}

static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
{
	struct crypto_ahash *tfm;

	tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!queue->snd_hash)
		goto free_tfm;
	ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);

	queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!queue->rcv_hash)
		goto free_snd_hash;
	ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);

	return 0;
free_snd_hash:
	ahash_request_free(queue->snd_hash);
free_tfm:
	crypto_free_ahash(tfm);
	return -ENOMEM;
}
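
/*
 * Connection establishment: the first PDU on a queue must be an ICReq. The
 * handler below validates its length, PFV and HPDA, records which digests
 * the host asked for (allocating the crc32c transforms if needed), replies
 * with an ICResp and only then marks the queue LIVE.
 */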

static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
	struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
	struct msghdr msg = {};
	struct kvec iov;
	int ret;

	if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
		pr_err("bad nvme-tcp pdu length (%d)\n",
			le32_to_cpu(icreq->hdr.plen));
		nvmet_tcp_fatal_error(queue);
	}

	if (icreq->pfv != NVME_TCP_PFV_1_0) {
		pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
		return -EPROTO;
	}

	if (icreq->hpda != 0) {
		pr_err("queue %d: unsupported hpda %d\n", queue->idx,
			icreq->hpda);
		return -EPROTO;
	}

	queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
	queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
	if (queue->hdr_digest || queue->data_digest) {
		ret = nvmet_tcp_alloc_crypto(queue);
		if (ret)
			return ret;
	}

	memset(icresp, 0, sizeof(*icresp));
	icresp->hdr.type = nvme_tcp_icresp;
	icresp->hdr.hlen = sizeof(*icresp);
	icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
	icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
	icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */
	if (queue->hdr_digest)
		icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
	if (queue->data_digest)
		icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;

	iov.iov_base = icresp;
	iov.iov_len = sizeof(*icresp);
	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (ret < 0)
		goto free_crypto;

	queue->state = NVMET_TCP_Q_LIVE;
	nvmet_prepare_receive_pdu(queue);
	return 0;
free_crypto:
	if (queue->hdr_digest || queue->data_digest)
		nvmet_tcp_free_crypto(queue);
	return ret;
}

static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
{
	size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
	int ret;

	if (!nvme_is_write(cmd->req.cmd) ||
	    data_len > cmd->req.port->inline_data_size) {
		nvmet_prepare_receive_pdu(queue);
		return;
	}

	ret = nvmet_tcp_map_data(cmd);
	if (unlikely(ret)) {
		pr_err("queue %d: failed to map data\n", queue->idx);
		nvmet_tcp_fatal_error(queue);
		return;
	}

	queue->rcv_state = NVMET_TCP_RECV_DATA;
	nvmet_tcp_map_pdu_iovec(cmd);
	cmd->flags |= NVMET_TCP_F_INIT_FAILED;
}

static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_data_pdu *data = &queue->pdu.data;
	struct nvmet_tcp_cmd *cmd;

	cmd = &queue->cmds[data->ttag];

	if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
		pr_err("ttag %u unexpected data offset %u (expected %u)\n",
			data->ttag, le32_to_cpu(data->data_offset),
			cmd->rbytes_done);
		/* FIXME: use path and transport errors */
		nvmet_req_complete(&cmd->req,
			NVME_SC_INVALID_FIELD | NVME_SC_DNR);
		return 0;
	}

	cmd->pdu_len = le32_to_cpu(data->data_length);
	cmd->pdu_recv = 0;
	nvmet_tcp_map_pdu_iovec(cmd);
	queue->rcv_state = NVMET_TCP_RECV_DATA;

	return 0;
}
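
/*
 * PDU dispatch: nvmet_tcp_done_recv_pdu() routes a fully received header.
 * H2C data PDUs are matched to their command via the ttag; command capsules
 * grab a free command, run nvmet_req_init(), and then either receive inline
 * data, queue an R2T response for writes without inline data, or execute
 * immediately.
 */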

static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
	struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
	struct nvmet_req *req;
	int ret;

	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
		if (hdr->type != nvme_tcp_icreq) {
			pr_err("unexpected pdu type (%d) before icreq\n",
				hdr->type);
			nvmet_tcp_fatal_error(queue);
			return -EPROTO;
		}
		return nvmet_tcp_handle_icreq(queue);
	}

	if (hdr->type == nvme_tcp_h2c_data) {
		ret = nvmet_tcp_handle_h2c_data_pdu(queue);
		if (unlikely(ret))
			return ret;
		return 0;
	}

	queue->cmd = nvmet_tcp_get_cmd(queue);
	if (unlikely(!queue->cmd)) {
		/* This should never happen */
		pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
			queue->idx, queue->nr_cmds, queue->send_list_len,
			nvme_cmd->common.opcode);
		nvmet_tcp_fatal_error(queue);
		return -ENOMEM;
	}

	req = &queue->cmd->req;
	memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));

	if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_tcp_ops))) {
		pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
			req->cmd, req->cmd->common.command_id,
			req->cmd->common.opcode,
			le32_to_cpu(req->cmd->common.dptr.sgl.length));

		nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
		return -EAGAIN;
	}

	ret = nvmet_tcp_map_data(queue->cmd);
	if (unlikely(ret)) {
		pr_err("queue %d: failed to map data\n", queue->idx);
		if (nvmet_tcp_has_inline_data(queue->cmd))
			nvmet_tcp_fatal_error(queue);
		else
			nvmet_req_complete(req, ret);
		ret = -EAGAIN;
		goto out;
	}

	if (nvmet_tcp_need_data_in(queue->cmd)) {
		if (nvmet_tcp_has_inline_data(queue->cmd)) {
			queue->rcv_state = NVMET_TCP_RECV_DATA;
			nvmet_tcp_map_pdu_iovec(queue->cmd);
			return 0;
		}
		/* send back R2T */
		nvmet_tcp_queue_response(&queue->cmd->req);
		goto out;
	}

	queue->cmd->req.execute(&queue->cmd->req);
out:
	nvmet_prepare_receive_pdu(queue);
	return ret;
}

static const u8 nvme_tcp_pdu_sizes[] = {
	[nvme_tcp_icreq]	= sizeof(struct nvme_tcp_icreq_pdu),
	[nvme_tcp_cmd]		= sizeof(struct nvme_tcp_cmd_pdu),
	[nvme_tcp_h2c_data]	= sizeof(struct nvme_tcp_data_pdu),
};

static inline u8 nvmet_tcp_pdu_size(u8 type)
{
	size_t idx = type;

	return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
		nvme_tcp_pdu_sizes[idx]) ?
			nvme_tcp_pdu_sizes[idx] : 0;
}

static inline bool nvmet_tcp_pdu_valid(u8 type)
{
	switch (type) {
	case nvme_tcp_icreq:
	case nvme_tcp_cmd:
	case nvme_tcp_h2c_data:
		return true;
	}

	return false;
}

static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
	int len;
	struct kvec iov;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };

recv:
	iov.iov_base = (void *)&queue->pdu + queue->offset;
	iov.iov_len = queue->left;
	len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (unlikely(len < 0))
		return len;

	queue->offset += len;
	queue->left -= len;
	if (queue->left)
		return -EAGAIN;

	if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
		u8 hdgst = nvmet_tcp_hdgst_len(queue);

		if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
			pr_err("unexpected pdu type %d\n", hdr->type);
			nvmet_tcp_fatal_error(queue);
			return -EIO;
		}

		if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
			pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
			return -EIO;
		}

		queue->left = hdr->hlen - queue->offset + hdgst;
		goto recv;
	}

	if (queue->hdr_digest &&
	    nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
		nvmet_tcp_fatal_error(queue); /* fatal */
		return -EPROTO;
	}

	if (queue->data_digest &&
	    nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
		nvmet_tcp_fatal_error(queue); /* fatal */
		return -EPROTO;
	}

	return nvmet_tcp_done_recv_pdu(queue);
}

static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
{
	struct nvmet_tcp_queue *queue = cmd->queue;

	nvmet_tcp_ddgst(queue->rcv_hash, cmd);
	queue->offset = 0;
	queue->left = NVME_TCP_DIGEST_LENGTH;
	queue->rcv_state = NVMET_TCP_RECV_DDGST;
}

static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmd;
	int ret;

	while (msg_data_left(&cmd->recv_msg)) {
		ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
			cmd->recv_msg.msg_flags);
		if (ret <= 0)
			return ret;

		cmd->pdu_recv += ret;
		cmd->rbytes_done += ret;
	}

	nvmet_tcp_unmap_pdu_iovec(cmd);

	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
	    cmd->rbytes_done == cmd->req.transfer_len) {
		if (queue->data_digest) {
			nvmet_tcp_prep_recv_ddgst(cmd);
			return 0;
		}
		cmd->req.execute(&cmd->req);
	}

	nvmet_prepare_receive_pdu(queue);
	return 0;
}

static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmd;
	int ret;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
	struct kvec iov = {
		.iov_base = (void *)&cmd->recv_ddgst + queue->offset,
		.iov_len = queue->left
	};

	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (unlikely(ret < 0))
		return ret;

	queue->offset += ret;
	queue->left -= ret;
	if (queue->left)
		return -EAGAIN;

	if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
		pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
			queue->idx, cmd->req.cmd->common.command_id,
			queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
			le32_to_cpu(cmd->exp_ddgst));
		nvmet_tcp_finish_cmd(cmd);
		nvmet_tcp_fatal_error(queue);
		ret = -EPROTO;
		goto out;
	}

	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
	    cmd->rbytes_done == cmd->req.transfer_len)
		cmd->req.execute(&cmd->req);
out:
	nvmet_prepare_receive_pdu(queue);
	return ret;
}

static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
{
	int result = 0;

	if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
		return 0;

	if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
		result = nvmet_tcp_try_recv_pdu(queue);
		if (result != 0)
			goto done_recv;
	}

	if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
		result = nvmet_tcp_try_recv_data(queue);
		if (result != 0)
			goto done_recv;
	}

	if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
		result = nvmet_tcp_try_recv_ddgst(queue);
		if (result != 0)
			goto done_recv;
	}

done_recv:
	if (result < 0) {
		if (result == -EAGAIN)
			return 0;
		return result;
	}
	return 1;
}

static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
		int budget, int *recvs)
{
	int i, ret = 0;

	for (i = 0; i < budget; i++) {
		ret = nvmet_tcp_try_recv_one(queue);
		if (ret <= 0)
			break;
		(*recvs)++;
	}

	return ret;
}

static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
{
	spin_lock(&queue->state_lock);
	if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
		queue->state = NVMET_TCP_Q_DISCONNECTING;
		schedule_work(&queue->release_work);
	}
	spin_unlock(&queue->state_lock);
}
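
/*
 * nvmet_tcp_io_work() is the per-queue engine: it alternates bounded receive
 * and send passes (NVMET_TCP_RECV_BUDGET / NVMET_TCP_SEND_BUDGET) and loops
 * while either side made progress, up to NVMET_TCP_IO_WORK_BUDGET operations.
 * If work is still pending when the budget runs out it requeues itself so a
 * single busy queue cannot monopolize the workqueue.
 */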

static void nvmet_tcp_io_work(struct work_struct *w)
{
	struct nvmet_tcp_queue *queue =
		container_of(w, struct nvmet_tcp_queue, io_work);
	bool pending;
	int ret, ops = 0;

	do {
		pending = false;

		ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
		if (ret > 0) {
			pending = true;
		} else if (ret < 0) {
			if (ret == -EPIPE || ret == -ECONNRESET)
				kernel_sock_shutdown(queue->sock, SHUT_RDWR);
			else
				nvmet_tcp_fatal_error(queue);
			return;
		}

		ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
		if (ret > 0) {
			/* transmitted message/data */
			pending = true;
		} else if (ret < 0) {
			if (ret == -EPIPE || ret == -ECONNRESET)
				kernel_sock_shutdown(queue->sock, SHUT_RDWR);
			else
				nvmet_tcp_fatal_error(queue);
			return;
		}

	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);

	/*
	 * We exhausted our budget, requeue ourselves
	 */
	if (pending)
		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
}

static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *c)
{
	u8 hdgst = nvmet_tcp_hdgst_len(queue);

	c->queue = queue;
	c->req.port = queue->port->nport;

	c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->cmd_pdu)
		return -ENOMEM;
	c->req.cmd = &c->cmd_pdu->cmd;

	c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->rsp_pdu)
		goto out_free_cmd;
	c->req.cqe = &c->rsp_pdu->cqe;

	c->data_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->data_pdu)
		goto out_free_rsp;

	c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->r2t_pdu)
		goto out_free_data;

	c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;

	list_add_tail(&c->entry, &queue->free_list);

	return 0;
out_free_data:
	page_frag_free(c->data_pdu);
out_free_rsp:
	page_frag_free(c->rsp_pdu);
out_free_cmd:
	page_frag_free(c->cmd_pdu);
	return -ENOMEM;
}

static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
{
	page_frag_free(c->r2t_pdu);
	page_frag_free(c->data_pdu);
	page_frag_free(c->rsp_pdu);
	page_frag_free(c->cmd_pdu);
}

static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmds;
	int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
		if (ret)
			goto out_free;
	}

	queue->cmds = cmds;

	return 0;
out_free:
	while (--i >= 0)
		nvmet_tcp_free_cmd(cmds + i);
	kfree(cmds);
out:
	return ret;
}

static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmds = queue->cmds;
	int i;

	for (i = 0; i < queue->nr_cmds; i++)
		nvmet_tcp_free_cmd(cmds + i);

	nvmet_tcp_free_cmd(&queue->connect);
	kfree(cmds);
}

static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
{
	struct socket *sock = queue->sock;

	write_lock_bh(&sock->sk->sk_callback_lock);
	sock->sk->sk_data_ready = queue->data_ready;
	sock->sk->sk_state_change = queue->state_change;
	sock->sk->sk_write_space = queue->write_space;
	sock->sk->sk_user_data = NULL;
	write_unlock_bh(&sock->sk->sk_callback_lock);
}

static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
{
	nvmet_req_uninit(&cmd->req);
	nvmet_tcp_unmap_pdu_iovec(cmd);
	kfree(cmd->iov);
	sgl_free(cmd->req.sg);
}

static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmds;
	int i;

	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
		if (nvmet_tcp_need_data_in(cmd))
			nvmet_tcp_finish_cmd(cmd);
	}

	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
		/* failed in connect */
		nvmet_tcp_finish_cmd(&queue->connect);
	}
}

static void nvmet_tcp_release_queue_work(struct work_struct *w)
{
	struct nvmet_tcp_queue *queue =
		container_of(w, struct nvmet_tcp_queue, release_work);

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);

	nvmet_tcp_restore_socket_callbacks(queue);
	flush_work(&queue->io_work);

	nvmet_tcp_uninit_data_in_cmds(queue);
	nvmet_sq_destroy(&queue->nvme_sq);
	cancel_work_sync(&queue->io_work);
	sock_release(queue->sock);
	nvmet_tcp_free_cmds(queue);
	if (queue->hdr_digest || queue->data_digest)
		nvmet_tcp_free_crypto(queue);
	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);

	kfree(queue);
}

static void nvmet_tcp_data_ready(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (likely(queue))
		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
	read_unlock_bh(&sk->sk_callback_lock);
}

static void nvmet_tcp_write_space(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (unlikely(!queue))
		goto out;

	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
		queue->write_space(sk);
		goto out;
	}

	if (sk_stream_is_writeable(sk)) {
		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
	}
out:
	read_unlock_bh(&sk->sk_callback_lock);
}

static void nvmet_tcp_state_change(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	write_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (!queue)
		goto done;

	switch (sk->sk_state) {
	case TCP_FIN_WAIT1:
	case TCP_CLOSE_WAIT:
	case TCP_CLOSE:
		/* FALLTHRU */
		sk->sk_user_data = NULL;
		nvmet_tcp_schedule_release_queue(queue);
		break;
	default:
		pr_warn("queue %d unhandled state %d\n",
			queue->idx, sk->sk_state);
	}
done:
	write_unlock_bh(&sk->sk_callback_lock);
}

static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
{
	struct socket *sock = queue->sock;
	struct inet_sock *inet = inet_sk(sock->sk);
	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
	int ret;

	ret = kernel_getsockname(sock,
		(struct sockaddr *)&queue->sockaddr);
	if (ret < 0)
		return ret;

	ret = kernel_getpeername(sock,
		(struct sockaddr *)&queue->sockaddr_peer);
	if (ret < 0)
		return ret;

	/*
	 * Cleanup whatever is sitting in the TCP transmit queue on socket
	 * close. This is done to prevent stale data from being sent should
	 * the network connection be restored before TCP times out.
	 */
	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
			(char *)&sol, sizeof(sol));
	if (ret)
		return ret;

	if (so_priority > 0) {
		ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY,
				(char *)&so_priority, sizeof(so_priority));
		if (ret)
			return ret;
	}

	/* Set socket type of service */
	if (inet->rcv_tos > 0) {
		int tos = inet->rcv_tos;

		ret = kernel_setsockopt(sock, SOL_IP, IP_TOS,
				(char *)&tos, sizeof(tos));
		if (ret)
			return ret;
	}

	write_lock_bh(&sock->sk->sk_callback_lock);
	sock->sk->sk_user_data = queue;
	queue->data_ready = sock->sk->sk_data_ready;
	sock->sk->sk_data_ready = nvmet_tcp_data_ready;
	queue->state_change = sock->sk->sk_state_change;
	sock->sk->sk_state_change = nvmet_tcp_state_change;
	queue->write_space = sock->sk->sk_write_space;
	sock->sk->sk_write_space = nvmet_tcp_write_space;
	write_unlock_bh(&sock->sk->sk_callback_lock);

	return 0;
}
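
/*
 * Queue bring-up order below: allocate the queue, reserve an index, set up
 * the special "connect" command and the nvmet SQ, pick a CPU round-robin for
 * io_work, publish the queue on the global list and only then install the
 * socket callbacks and kick io_work, so the callbacks never see a half
 * initialized queue.
 */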

static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
		struct socket *newsock)
{
	struct nvmet_tcp_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return -ENOMEM;

	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
	queue->sock = newsock;
	queue->port = port;
	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_TCP_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->free_list);
	init_llist_head(&queue->resp_list);
	INIT_LIST_HEAD(&queue->resp_send_list);

	queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = queue->idx;
		goto out_free_queue;
	}

	ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
	if (ret)
		goto out_ida_remove;

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret)
		goto out_free_connect;

	port->last_cpu = cpumask_next_wrap(port->last_cpu,
				cpu_online_mask, -1, false);
	queue->cpu = port->last_cpu;
	nvmet_prepare_receive_pdu(queue);

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);

	ret = nvmet_tcp_set_queue_sock(queue);
	if (ret)
		goto out_destroy_sq;

	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);

	return 0;
out_destroy_sq:
	mutex_lock(&nvmet_tcp_queue_mutex);
	list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_connect:
	nvmet_tcp_free_cmd(&queue->connect);
out_ida_remove:
	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
out_free_queue:
	kfree(queue);
	return ret;
}

static void nvmet_tcp_accept_work(struct work_struct *w)
{
	struct nvmet_tcp_port *port =
		container_of(w, struct nvmet_tcp_port, accept_work);
	struct socket *newsock;
	int ret;

	while (true) {
		ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
		if (ret < 0) {
			if (ret != -EAGAIN)
				pr_warn("failed to accept err=%d\n", ret);
			return;
		}
		ret = nvmet_tcp_alloc_queue(port, newsock);
		if (ret) {
			pr_err("failed to allocate queue\n");
			sock_release(newsock);
		}
	}
}

static void nvmet_tcp_listen_data_ready(struct sock *sk)
{
	struct nvmet_tcp_port *port;

	read_lock_bh(&sk->sk_callback_lock);
	port = sk->sk_user_data;
	if (!port)
		goto out;

	if (sk->sk_state == TCP_LISTEN)
		schedule_work(&port->accept_work);
out:
	read_unlock_bh(&sk->sk_callback_lock);
}
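
/*
 * A TCP port is normally created from user space through the nvmet configfs
 * tree; the values below are illustrative:
 *
 *   mkdir /sys/kernel/config/nvmet/ports/1
 *   echo tcp      > /sys/kernel/config/nvmet/ports/1/addr_trtype
 *   echo ipv4     > /sys/kernel/config/nvmet/ports/1/addr_adrfam
 *   echo 10.0.0.1 > /sys/kernel/config/nvmet/ports/1/addr_traddr
 *   echo 4420     > /sys/kernel/config/nvmet/ports/1/addr_trsvcid
 *
 * Linking a subsystem under ports/1/subsystems/ enables the port, which ends
 * up calling nvmet_tcp_add_port() below.
 */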

static int nvmet_tcp_add_port(struct nvmet_port *nport)
{
	struct nvmet_tcp_port *port;
	__kernel_sa_family_t af;
	int opt, ret;

	port = kzalloc(sizeof(*port), GFP_KERNEL);
	if (!port)
		return -ENOMEM;

	switch (nport->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				nport->disc_addr.adrfam);
		ret = -EINVAL;
		goto err_port;
	}

	ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
			nport->disc_addr.trsvcid, &port->addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			nport->disc_addr.traddr, nport->disc_addr.trsvcid);
		goto err_port;
	}

	port->nport = nport;
	port->last_cpu = -1;
	INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
	if (port->nport->inline_data_size < 0)
		port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;

	ret = sock_create(port->addr.ss_family, SOCK_STREAM,
				IPPROTO_TCP, &port->sock);
	if (ret) {
		pr_err("failed to create a socket\n");
		goto err_port;
	}

	port->sock->sk->sk_user_data = port;
	port->data_ready = port->sock->sk->sk_data_ready;
	port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;

	opt = 1;
	ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
			TCP_NODELAY, (char *)&opt, sizeof(opt));
	if (ret) {
		pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
		goto err_sock;
	}

	ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
			(char *)&opt, sizeof(opt));
	if (ret) {
		pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
		goto err_sock;
	}

	if (so_priority > 0) {
		ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY,
				(char *)&so_priority, sizeof(so_priority));
		if (ret) {
			pr_err("failed to set SO_PRIORITY sock opt %d\n", ret);
			goto err_sock;
		}
	}

	ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
			sizeof(port->addr));
	if (ret) {
		pr_err("failed to bind port socket %d\n", ret);
		goto err_sock;
	}

	ret = kernel_listen(port->sock, 128);
	if (ret) {
		pr_err("failed to listen %d on port sock\n", ret);
		goto err_sock;
	}

	pr_info("enabling port %d (%pISpc)\n",
		le16_to_cpu(nport->disc_addr.portid), &port->addr);
	nport->priv = port;
	return 0;

err_sock:
	sock_release(port->sock);
err_port:
	kfree(port);
	return ret;
}

static void nvmet_tcp_remove_port(struct nvmet_port *nport)
{
	struct nvmet_tcp_port *port = nport->priv;

	write_lock_bh(&port->sock->sk->sk_callback_lock);
	port->sock->sk->sk_data_ready = port->data_ready;
	port->sock->sk->sk_user_data = NULL;
	write_unlock_bh(&port->sock->sk->sk_callback_lock);
	cancel_work_sync(&port->accept_work);

	sock_release(port->sock);
	kfree(port);
}

static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_tcp_queue *queue;

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
		if (queue->nvme_sq.ctrl == ctrl)
			kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	mutex_unlock(&nvmet_tcp_queue_mutex);
}

static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
{
	struct nvmet_tcp_queue *queue =
		container_of(sq, struct nvmet_tcp_queue, nvme_sq);

	if (sq->qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	queue->nr_cmds = sq->size * 2;
	if (nvmet_tcp_alloc_cmds(queue))
		return NVME_SC_INTERNAL;
	return 0;
}
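
/*
 * For a wildcard listener (0.0.0.0 or ::) the discovery log page cannot
 * simply echo the configured traddr, so nvmet_tcp_disc_port_addr() below
 * reports the local address of the connection the discovery command arrived
 * on instead.
 */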

static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *nport, char *traddr)
{
	struct nvmet_tcp_port *port = nport->priv;

	if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
		struct nvmet_tcp_cmd *cmd =
			container_of(req, struct nvmet_tcp_cmd, req);
		struct nvmet_tcp_queue *queue = cmd->queue;

		sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
	} else {
		memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}

static struct nvmet_fabrics_ops nvmet_tcp_ops = {
	.owner			= THIS_MODULE,
	.type			= NVMF_TRTYPE_TCP,
	.has_keyed_sgls		= 0,
	.add_port		= nvmet_tcp_add_port,
	.remove_port		= nvmet_tcp_remove_port,
	.queue_response		= nvmet_tcp_queue_response,
	.delete_ctrl		= nvmet_tcp_delete_ctrl,
	.install_queue		= nvmet_tcp_install_queue,
	.disc_traddr		= nvmet_tcp_disc_port_addr,
};

static int __init nvmet_tcp_init(void)
{
	int ret;

	nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
	if (!nvmet_tcp_wq)
		return -ENOMEM;

	ret = nvmet_register_transport(&nvmet_tcp_ops);
	if (ret)
		goto err;

	return 0;
err:
	destroy_workqueue(nvmet_tcp_wq);
	return ret;
}

static void __exit nvmet_tcp_exit(void)
{
	struct nvmet_tcp_queue *queue;

	nvmet_unregister_transport(&nvmet_tcp_ops);

	flush_scheduled_work();
	mutex_lock(&nvmet_tcp_queue_mutex);
	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	mutex_unlock(&nvmet_tcp_queue_mutex);
	flush_scheduled_work();

	destroy_workqueue(nvmet_tcp_wq);
}

module_init(nvmet_tcp_init);
module_exit(nvmet_tcp_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */