// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
/*          Kai Shen <kaishen@linux.alibaba.com> */
/* Copyright (c) 2020-2022, Alibaba Group. */

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/*          Greg Joyce <greg@opengridcomputing.com> */
/* Copyright (c) 2008-2019, IBM Corporation */
/* Copyright (c) 2017, Open Grid Computing, Inc. */
#include <linux/workqueue.h>
#include <trace/events/sock.h>

#include "erdma.h"
#include "erdma_cm.h"
#include "erdma_verbs.h"
static struct workqueue_struct *erdma_cm_wq;

static void erdma_cm_llp_state_change(struct sock *sk);
static void erdma_cm_llp_data_ready(struct sock *sk);
static void erdma_cm_llp_error_report(struct sock *sk);
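
/*
 * Socket upcall management: while a connection endpoint (CEP) owns a
 * TCP socket, the CM redirects the socket's state_change/data_ready/
 * error_report callbacks to its own handlers and keeps the originals
 * saved in the CEP so they can be restored when the socket is
 * disassociated.
 */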
static void erdma_sk_assign_cm_upcalls(struct sock *sk)
{
	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_state_change = erdma_cm_llp_state_change;
	sk->sk_data_ready = erdma_cm_llp_data_ready;
	sk->sk_error_report = erdma_cm_llp_error_report;
	write_unlock_bh(&sk->sk_callback_lock);
}
static void erdma_sk_save_upcalls(struct sock *sk)
{
	struct erdma_cep *cep = sk_to_cep(sk);

	write_lock_bh(&sk->sk_callback_lock);
	cep->sk_state_change = sk->sk_state_change;
	cep->sk_data_ready = sk->sk_data_ready;
	cep->sk_error_report = sk->sk_error_report;
	write_unlock_bh(&sk->sk_callback_lock);
}
static void erdma_sk_restore_upcalls(struct sock *sk, struct erdma_cep *cep)
{
	sk->sk_state_change = cep->sk_state_change;
	sk->sk_data_ready = cep->sk_data_ready;
	sk->sk_error_report = cep->sk_error_report;
	sk->sk_user_data = NULL;
}
static void erdma_socket_disassoc(struct socket *s)
{
	struct sock *sk = s->sk;
	struct erdma_cep *cep;

	if (sk) {
		write_lock_bh(&sk->sk_callback_lock);
		cep = sk_to_cep(sk);
		if (cep) {
			erdma_sk_restore_upcalls(sk, cep);
			erdma_cep_put(cep);
		} else {
			WARN_ON_ONCE(1);
		}
		write_unlock_bh(&sk->sk_callback_lock);
	} else {
		WARN_ON_ONCE(1);
	}
}
static void erdma_cep_socket_assoc(struct erdma_cep *cep, struct socket *s)
{
	cep->sock = s;
	erdma_cep_get(cep);
	s->sk->sk_user_data = cep;

	erdma_sk_save_upcalls(s->sk);
	erdma_sk_assign_cm_upcalls(s->sk);
}
static void erdma_disassoc_listen_cep(struct erdma_cep *cep)
{
	if (cep->listen_cep) {
		erdma_cep_put(cep->listen_cep);
		cep->listen_cep = NULL;
	}
}
static struct erdma_cep *erdma_cep_alloc(struct erdma_dev *dev)
{
	struct erdma_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL);
	unsigned long flags;

	if (!cep)
		return NULL;

	INIT_LIST_HEAD(&cep->listenq);
	INIT_LIST_HEAD(&cep->devq);
	INIT_LIST_HEAD(&cep->work_freelist);

	kref_init(&cep->ref);
	cep->state = ERDMA_EPSTATE_IDLE;
	init_waitqueue_head(&cep->waitq);
	spin_lock_init(&cep->lock);
	cep->dev = dev;

	spin_lock_irqsave(&dev->lock, flags);
	list_add_tail(&cep->devq, &dev->cep_list);
	spin_unlock_irqrestore(&dev->lock, flags);

	return cep;
}
static void erdma_cm_free_work(struct erdma_cep *cep)
{
	struct list_head *w, *tmp;
	struct erdma_cm_work *work;

	list_for_each_safe(w, tmp, &cep->work_freelist) {
		work = list_entry(w, struct erdma_cm_work, list);
		list_del(&work->list);
		kfree(work);
	}
}
static void erdma_cancel_mpatimer(struct erdma_cep *cep)
{
	spin_lock_bh(&cep->lock);
	if (cep->mpa_timer) {
		if (cancel_delayed_work(&cep->mpa_timer->work)) {
			erdma_cep_put(cep);
			kfree(cep->mpa_timer);
		}
		cep->mpa_timer = NULL;
	}
	spin_unlock_bh(&cep->lock);
}
static void erdma_put_work(struct erdma_cm_work *work)
{
	INIT_LIST_HEAD(&work->list);
	spin_lock_bh(&work->cep->lock);
	list_add(&work->list, &work->cep->work_freelist);
	spin_unlock_bh(&work->cep->lock);
}
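
/*
 * cep->in_use acts as a sleepable exclusion flag built from the CEP
 * spinlock and waitqueue: erdma_cep_set_inuse() sleeps until the flag
 * clears and then claims it, while erdma_cep_set_free() releases it
 * and wakes any waiter.
 */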
static void erdma_cep_set_inuse(struct erdma_cep *cep)
{
	unsigned long flags;

	spin_lock_irqsave(&cep->lock, flags);
	while (cep->in_use) {
		spin_unlock_irqrestore(&cep->lock, flags);
		wait_event_interruptible(cep->waitq, !cep->in_use);
		if (signal_pending(current))
			flush_signals(current);

		spin_lock_irqsave(&cep->lock, flags);
	}

	cep->in_use = 1;
	spin_unlock_irqrestore(&cep->lock, flags);
}
static void erdma_cep_set_free(struct erdma_cep *cep)
{
	unsigned long flags;

	spin_lock_irqsave(&cep->lock, flags);
	cep->in_use = 0;
	spin_unlock_irqrestore(&cep->lock, flags);

	wake_up(&cep->waitq);
}
static void __erdma_cep_dealloc(struct kref *ref)
{
	struct erdma_cep *cep = container_of(ref, struct erdma_cep, ref);
	struct erdma_dev *dev = cep->dev;
	unsigned long flags;

	WARN_ON(cep->listen_cep);

	kfree(cep->private_data);
	kfree(cep->mpa.pdata);
	spin_lock_bh(&cep->lock);
	if (!list_empty(&cep->work_freelist))
		erdma_cm_free_work(cep);
	spin_unlock_bh(&cep->lock);

	spin_lock_irqsave(&dev->lock, flags);
	list_del(&cep->devq);
	spin_unlock_irqrestore(&dev->lock, flags);
	kfree(cep);
}
static struct erdma_cm_work *erdma_get_work(struct erdma_cep *cep)
{
	struct erdma_cm_work *work = NULL;

	spin_lock_bh(&cep->lock);
	if (!list_empty(&cep->work_freelist)) {
		work = list_entry(cep->work_freelist.next, struct erdma_cm_work,
				  list);
		list_del_init(&work->list);
	}
	spin_unlock_bh(&cep->lock);
	return work;
}
static int erdma_cm_alloc_work(struct erdma_cep *cep, int num)
{
	struct erdma_cm_work *work;

	while (num--) {
		work = kmalloc(sizeof(*work), GFP_KERNEL);
		if (!work) {
			if (!(list_empty(&cep->work_freelist)))
				erdma_cm_free_work(cep);
			return -ENOMEM;
		}
		work->cep = cep;
		INIT_LIST_HEAD(&work->list);
		list_add(&work->list, &cep->work_freelist);
	}

	return 0;
}
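
/*
 * Deliver a CM event to the iWARP CM (IWCM). For CONNECT_REQUEST the
 * upcall goes through the listening CEP's cm_id and advertises the
 * device ird/ord limits; for CONNECT_REQUEST/REPLY any received MPA
 * private data and the socket's local/peer addresses are attached to
 * the event.
 */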
static int erdma_cm_upcall(struct erdma_cep *cep, enum iw_cm_event_type reason,
			   int status)
{
	struct iw_cm_event event;
	struct iw_cm_id *cm_id;

	memset(&event, 0, sizeof(event));
	event.status = status;
	event.event = reason;

	if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
		event.provider_data = cep;
		cm_id = cep->listen_cep->cm_id;

		event.ird = cep->dev->attrs.max_ird;
		event.ord = cep->dev->attrs.max_ord;
	} else {
		cm_id = cep->cm_id;
	}

	if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
	    reason == IW_CM_EVENT_CONNECT_REPLY) {
		u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);

		if (pd_len && cep->mpa.pdata) {
			event.private_data_len = pd_len;
			event.private_data = cep->mpa.pdata;
		}

		getname_local(cep->sock, &event.local_addr);
		getname_peer(cep->sock, &event.remote_addr);
	}

	return cm_id->event_handler(cm_id, &event);
}
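
/*
 * Detach connection state from a QP that is being torn down: issue
 * the upcall matching the current endpoint state, then move the CEP
 * to CLOSED and release its socket and references.
 */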
void erdma_qp_cm_drop(struct erdma_qp *qp)
{
	struct erdma_cep *cep = qp->cep;

	if (!qp->cep)
		return;

	erdma_cep_set_inuse(cep);

	/* already closed. */
	if (cep->state == ERDMA_EPSTATE_CLOSED)
		goto out;

	if (cep->cm_id) {
		switch (cep->state) {
		case ERDMA_EPSTATE_AWAIT_MPAREP:
			erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
					-EINVAL);
			break;
		case ERDMA_EPSTATE_RDMA_MODE:
			erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
			break;
		case ERDMA_EPSTATE_IDLE:
		case ERDMA_EPSTATE_LISTENING:
		case ERDMA_EPSTATE_CONNECTING:
		case ERDMA_EPSTATE_AWAIT_MPAREQ:
		case ERDMA_EPSTATE_RECVD_MPAREQ:
		case ERDMA_EPSTATE_CLOSED:
		default:
			break;
		}
		cep->cm_id->rem_ref(cep->cm_id);
		cep->cm_id = NULL;
	}
	cep->state = ERDMA_EPSTATE_CLOSED;

	if (cep->sock) {
		erdma_socket_disassoc(cep->sock);
		sock_release(cep->sock);
		cep->sock = NULL;
	}

	if (cep->qp) {
		cep->qp = NULL;
		erdma_qp_put(qp);
	}
out:
	erdma_cep_set_free(cep);
}
void erdma_cep_put(struct erdma_cep *cep)
{
	WARN_ON(kref_read(&cep->ref) < 1);
	kref_put(&cep->ref, __erdma_cep_dealloc);
}
void erdma_cep_get(struct erdma_cep *cep)
{
	kref_get(&cep->ref);
}
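
/*
 * Transmit an MPA request or reply: the wire frame is gathered from
 * up to three kvec parts (the MPA header, the erdma extension data,
 * and optional private data of pd_len bytes) and pushed out with a
 * single kernel_sendmsg().
 */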
static int erdma_send_mpareqrep(struct erdma_cep *cep, const void *pdata,
				u8 pd_len)
{
	struct socket *s = cep->sock;
	struct mpa_rr *rr = &cep->mpa.hdr;
	struct kvec iov[3];
	struct msghdr msg;
	int iovec_num = 0;
	int ret;
	int mpa_len;

	memset(&msg, 0, sizeof(msg));

	rr->params.pd_len = cpu_to_be16(pd_len);

	iov[iovec_num].iov_base = rr;
	iov[iovec_num].iov_len = sizeof(*rr);
	iovec_num++;
	mpa_len = sizeof(*rr);

	iov[iovec_num].iov_base = &cep->mpa.ext_data;
	iov[iovec_num].iov_len = sizeof(cep->mpa.ext_data);
	iovec_num++;
	mpa_len += sizeof(cep->mpa.ext_data);

	if (pd_len) {
		iov[iovec_num].iov_base = (char *)pdata;
		iov[iovec_num].iov_len = pd_len;
		mpa_len += pd_len;
		iovec_num++;
	}

	ret = kernel_sendmsg(s, &msg, iov, iovec_num, mpa_len);

	return ret < 0 ? ret : 0;
}
static inline int ksock_recv(struct socket *sock, char *buf, size_t size,
			     int flags)
{
	struct kvec iov = { buf, size };
	struct msghdr msg = { .msg_name = NULL, .msg_flags = flags };

	return kernel_recvmsg(sock, &msg, &iov, 1, size, flags);
}
static int __recv_mpa_hdr(struct erdma_cep *cep, int hdr_rcvd, char *hdr,
			  int hdr_size, int *rcvd_out)
{
	struct socket *s = cep->sock;
	int rcvd;

	*rcvd_out = 0;
	if (hdr_rcvd < hdr_size) {
		rcvd = ksock_recv(s, hdr + hdr_rcvd, hdr_size - hdr_rcvd,
				  MSG_DONTWAIT);
		if (rcvd == -EAGAIN)
			return -EAGAIN;

		if (rcvd <= 0)
			return -ECONNABORTED;

		hdr_rcvd += rcvd;
		*rcvd_out = rcvd;
	}

	if (hdr_rcvd < hdr_size)
		return -EAGAIN;

	return 0;
}
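
/*
 * The helpers below get/set the revision field in the MPA
 * request/reply header and the erdma-specific congestion-control (CC)
 * bits in the MPA extension word; both fields are big-endian on the
 * wire, hence the cpu_to_be*()/be*_to_cpu() conversions.
 */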
static void __mpa_rr_set_revision(__be16 *bits, u8 rev)
{
	*bits = (*bits & ~MPA_RR_MASK_REVISION) |
		(cpu_to_be16(rev) & MPA_RR_MASK_REVISION);
}

static u8 __mpa_rr_revision(__be16 mpa_rr_bits)
{
	__be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION;

	return (u8)be16_to_cpu(rev);
}

static void __mpa_ext_set_cc(__be32 *bits, u32 cc)
{
	*bits = (*bits & ~MPA_EXT_FLAG_CC) |
		(cpu_to_be32(cc) & MPA_EXT_FLAG_CC);
}

static u8 __mpa_ext_cc(__be32 mpa_ext_bits)
{
	__be32 cc = mpa_ext_bits & MPA_EXT_FLAG_CC;

	return (u8)be32_to_cpu(cc);
}
/*
 * Receive MPA Request/Reply header.
 *
 * Returns 0 if complete MPA Request/Reply header including
 * eventual private data was received. Returns -EAGAIN if
 * header was partially received or negative error code otherwise.
 *
 * Context: May be called in process context only
 */
static int erdma_recv_mpa_rr(struct erdma_cep *cep)
{
	struct mpa_rr *hdr = &cep->mpa.hdr;
	struct socket *s = cep->sock;
	u16 pd_len;
	int rcvd, to_rcv, ret, pd_rcvd;

	if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
		ret = __recv_mpa_hdr(cep, cep->mpa.bytes_rcvd,
				     (char *)&cep->mpa.hdr,
				     sizeof(struct mpa_rr), &rcvd);
		cep->mpa.bytes_rcvd += rcvd;
		if (ret)
			return ret;
	}

	if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA ||
	    __mpa_rr_revision(hdr->params.bits) != MPA_REVISION_EXT_1)
		return -EPROTO;

	if (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) <
	    sizeof(struct erdma_mpa_ext)) {
		ret = __recv_mpa_hdr(
			cep, cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
			(char *)&cep->mpa.ext_data,
			sizeof(struct erdma_mpa_ext), &rcvd);
		cep->mpa.bytes_rcvd += rcvd;
		if (ret)
			return ret;
	}

	pd_len = be16_to_cpu(hdr->params.pd_len);
	pd_rcvd = cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) -
		  sizeof(struct erdma_mpa_ext);
	to_rcv = pd_len - pd_rcvd;

	if (!to_rcv) {
		/*
		 * We have received the whole MPA Request/Reply message.
		 * Check against peer protocol violation.
		 */
		u32 word;

		ret = __recv_mpa_hdr(cep, 0, (char *)&word, sizeof(word),
				     &rcvd);
		if (ret == -EAGAIN && rcvd == 0)
			return 0;

		if (ret)
			return ret;

		return -EPROTO;
	}

	/*
	 * At this point, MPA header has been fully received, and pd_len != 0.
	 * So, begin to receive private data.
	 */
	if (!cep->mpa.pdata) {
		cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
		if (!cep->mpa.pdata)
			return -ENOMEM;
	}

	rcvd = ksock_recv(s, cep->mpa.pdata + pd_rcvd, to_rcv + 4,
			  MSG_DONTWAIT);
	if (rcvd < 0)
		return rcvd;

	if (rcvd > to_rcv)
		return -EPROTO;

	cep->mpa.bytes_rcvd += rcvd;

	if (to_rcv == rcvd)
		return 0;

	return -EAGAIN;
}
/*
 * erdma_proc_mpareq()
 *
 * Read MPA Request from socket and signal new connection to IWCM
 * if success. Caller must hold lock on corresponding listening CEP.
 */
static int erdma_proc_mpareq(struct erdma_cep *cep)
{
	struct mpa_rr *req;
	int ret;

	ret = erdma_recv_mpa_rr(cep);
	if (ret)
		return ret;

	req = &cep->mpa.hdr;

	if (memcmp(req->key, MPA_KEY_REQ, MPA_KEY_SIZE))
		return -EPROTO;

	memcpy(req->key, MPA_KEY_REP, MPA_KEY_SIZE);

	/* Currently does not support marker and crc. */
	if (req->params.bits & MPA_RR_FLAG_MARKERS ||
	    req->params.bits & MPA_RR_FLAG_CRC)
		goto reject_conn;

	cep->state = ERDMA_EPSTATE_RECVD_MPAREQ;

	/* Keep reference until IWCM accepts/rejects */
	erdma_cep_get(cep);
	ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
	if (ret)
		erdma_cep_put(cep);

	return ret;

reject_conn:
	req->params.bits &= ~MPA_RR_FLAG_MARKERS;
	req->params.bits |= MPA_RR_FLAG_REJECT;
	req->params.bits &= ~MPA_RR_FLAG_CRC;

	kfree(cep->mpa.pdata);
	cep->mpa.pdata = NULL;
	erdma_send_mpareqrep(cep, NULL, 0);

	return -EOPNOTSUPP;
}
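
/*
 * Process an MPA reply on the active side: validate the reply key and
 * flags, move the QP to RTS via erdma_modify_qp_internal(), and
 * signal CONNECT_REPLY to the IWCM. Any failure other than -EAGAIN is
 * reported upward as a failed connect.
 */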
static int erdma_proc_mpareply(struct erdma_cep *cep)
{
	struct erdma_qp_attrs qp_attrs;
	struct erdma_qp *qp = cep->qp;
	struct mpa_rr *rep;
	int ret;

	ret = erdma_recv_mpa_rr(cep);
	if (ret)
		goto out_err;

	erdma_cancel_mpatimer(cep);

	rep = &cep->mpa.hdr;

	if (memcmp(rep->key, MPA_KEY_REP, MPA_KEY_SIZE)) {
		ret = -EPROTO;
		goto out_err;
	}

	if (rep->params.bits & MPA_RR_FLAG_REJECT) {
		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
		return -ECONNRESET;
	}

	/* Currently does not support marker and crc. */
	if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
	    (rep->params.bits & MPA_RR_FLAG_CRC)) {
		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
		return -EINVAL;
	}

	memset(&qp_attrs, 0, sizeof(qp_attrs));
	qp_attrs.irq_size = cep->ird;
	qp_attrs.orq_size = cep->ord;
	qp_attrs.state = ERDMA_QP_STATE_RTS;

	down_write(&qp->state_lock);
	if (qp->attrs.state > ERDMA_QP_STATE_RTR) {
		ret = -EINVAL;
		up_write(&qp->state_lock);
		goto out_err;
	}

	qp->attrs.qp_type = ERDMA_QP_ACTIVE;
	if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc)
		qp->attrs.cc = COMPROMISE_CC;

	ret = erdma_modify_qp_internal(qp, &qp_attrs,
				       ERDMA_QP_ATTR_STATE |
				       ERDMA_QP_ATTR_LLP_HANDLE |
				       ERDMA_QP_ATTR_MPA);

	up_write(&qp->state_lock);

	if (!ret) {
		ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
		if (!ret)
			cep->state = ERDMA_EPSTATE_RDMA_MODE;

		return 0;
	}

out_err:
	if (ret != -EAGAIN)
		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);

	return ret;
}
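
/*
 * Accept a new TCP connection on a listening CEP: allocate a child
 * CEP with its work elements, inherit the saved socket upcalls,
 * kernel_accept() the socket, arm the MPA timeout and, if an MPA
 * request already sits in the socket receive buffer, process it
 * immediately.
 */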
static void erdma_accept_newconn(struct erdma_cep *cep)
{
	struct socket *s = cep->sock;
	struct socket *new_s = NULL;
	struct erdma_cep *new_cep = NULL;
	int ret = 0;

	if (cep->state != ERDMA_EPSTATE_LISTENING)
		goto error;

	new_cep = erdma_cep_alloc(cep->dev);
	if (!new_cep)
		goto error;

	/*
	 * 4: Allocate a sufficient number of work elements
	 * to allow concurrent handling of local + peer close
	 * events, MPA header processing + MPA timeout.
	 */
	if (erdma_cm_alloc_work(new_cep, 4) != 0)
		goto error;

	/*
	 * Copy saved socket callbacks from listening CEP
	 * and assign new socket with new CEP
	 */
	new_cep->sk_state_change = cep->sk_state_change;
	new_cep->sk_data_ready = cep->sk_data_ready;
	new_cep->sk_error_report = cep->sk_error_report;

	ret = kernel_accept(s, &new_s, O_NONBLOCK);
	if (ret != 0)
		goto error;

	new_cep->sock = new_s;
	erdma_cep_get(new_cep);
	new_s->sk->sk_user_data = new_cep;

	tcp_sock_set_nodelay(new_s->sk);
	new_cep->state = ERDMA_EPSTATE_AWAIT_MPAREQ;

	ret = erdma_cm_queue_work(new_cep, ERDMA_CM_WORK_MPATIMEOUT);
	if (ret)
		goto error;

	new_cep->listen_cep = cep;
	erdma_cep_get(cep);

	if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
		/* MPA REQ already queued */
		erdma_cep_set_inuse(new_cep);
		ret = erdma_proc_mpareq(new_cep);
		if (ret != -EAGAIN) {
			erdma_cep_put(cep);
			new_cep->listen_cep = NULL;
			if (ret) {
				erdma_cep_set_free(new_cep);
				goto error;
			}
		}
		erdma_cep_set_free(new_cep);
	}
	return;

error:
	if (new_cep) {
		new_cep->state = ERDMA_EPSTATE_CLOSED;
		erdma_cancel_mpatimer(new_cep);

		erdma_cep_put(new_cep);
		new_cep->sock = NULL;
	}

	if (new_s) {
		erdma_socket_disassoc(new_s);
		sock_release(new_s);
	}
}
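
/*
 * TCP connect completed on the active side: build and send the MPA
 * request (revision, key, CC preference and the QP cookie carried in
 * the extension data), then await the MPA reply under the MPA
 * timeout.
 */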
static int erdma_newconn_connected(struct erdma_cep *cep)
{
	int ret = 0;

	cep->mpa.hdr.params.bits = 0;
	__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, MPA_REVISION_EXT_1);

	memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, MPA_KEY_SIZE);
	cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie);
	__mpa_ext_set_cc(&cep->mpa.ext_data.bits, cep->qp->attrs.cc);

	ret = erdma_send_mpareqrep(cep, cep->private_data, cep->pd_len);
	cep->state = ERDMA_EPSTATE_AWAIT_MPAREP;
	cep->mpa.hdr.params.pd_len = 0;

	if (ret >= 0)
		ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_MPATIMEOUT);

	return ret;
}
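
/*
 * Central CM work handler: all deferred CM processing (connect
 * completion, MPA header reads, close and timeout events) funnels
 * through here on erdma_cm_wq, with the CEP claimed via
 * erdma_cep_set_inuse() for the duration of the work item.
 */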
static void erdma_cm_work_handler(struct work_struct *w)
{
	struct erdma_cm_work *work;
	struct erdma_cep *cep;
	int release_cep = 0, ret = 0;

	work = container_of(w, struct erdma_cm_work, work.work);
	cep = work->cep;

	erdma_cep_set_inuse(cep);

	switch (work->type) {
	case ERDMA_CM_WORK_CONNECTED:
		erdma_cancel_mpatimer(cep);
		if (cep->state == ERDMA_EPSTATE_CONNECTING) {
			ret = erdma_newconn_connected(cep);
			if (ret) {
				erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
						-EIO);
				release_cep = 1;
			}
		}
		break;
	case ERDMA_CM_WORK_CONNECTTIMEOUT:
		if (cep->state == ERDMA_EPSTATE_CONNECTING) {
			cep->mpa_timer = NULL;
			erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
					-ETIMEDOUT);
			release_cep = 1;
		}
		break;
	case ERDMA_CM_WORK_ACCEPT:
		erdma_accept_newconn(cep);
		break;
	case ERDMA_CM_WORK_READ_MPAHDR:
		if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
			if (cep->listen_cep) {
				erdma_cep_set_inuse(cep->listen_cep);

				if (cep->listen_cep->state ==
				    ERDMA_EPSTATE_LISTENING)
					ret = erdma_proc_mpareq(cep);
				else
					ret = -EFAULT;

				erdma_cep_set_free(cep->listen_cep);

				if (ret != -EAGAIN) {
					erdma_cep_put(cep->listen_cep);
					cep->listen_cep = NULL;
					if (ret)
						erdma_cep_put(cep);
				}
			}
		} else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
			ret = erdma_proc_mpareply(cep);
		}

		if (ret && ret != -EAGAIN)
			release_cep = 1;
		break;
	case ERDMA_CM_WORK_CLOSE_LLP:
		if (cep->cm_id)
			erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
		release_cep = 1;
		break;
	case ERDMA_CM_WORK_PEER_CLOSE:
		if (cep->cm_id) {
			if (cep->state == ERDMA_EPSTATE_CONNECTING ||
			    cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
				/*
				 * MPA reply not received, but connection drop
				 */
				erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
						-ECONNRESET);
			} else if (cep->state == ERDMA_EPSTATE_RDMA_MODE) {
				/*
				 * NOTE: IW_CM_EVENT_DISCONNECT is given just
				 *       to transition IWCM into CLOSING.
				 */
				erdma_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
				erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
			}
		} else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
			/* Socket close before MPA request received. */
			erdma_disassoc_listen_cep(cep);
			erdma_cep_put(cep);
		}
		release_cep = 1;
		break;
	case ERDMA_CM_WORK_MPATIMEOUT:
		cep->mpa_timer = NULL;
		if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
			/*
			 * MPA request timed out:
			 * Hide any partially received private data and signal
			 * timeout
			 */
			cep->mpa.hdr.params.pd_len = 0;

			if (cep->cm_id)
				erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
						-ETIMEDOUT);
			release_cep = 1;
		} else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
			/* No MPA req received after peer TCP stream setup. */
			erdma_disassoc_listen_cep(cep);

			erdma_cep_put(cep);
			release_cep = 1;
		}
		break;
	default:
		WARN(1, "Undefined CM work type: %d\n", work->type);
	}

	if (release_cep) {
		erdma_cancel_mpatimer(cep);
		cep->state = ERDMA_EPSTATE_CLOSED;
		if (cep->qp) {
			struct erdma_qp *qp = cep->qp;
			/*
			 * Serialize a potential race with application
			 * closing the QP and calling erdma_qp_cm_drop()
			 */
			erdma_qp_get(qp);
			erdma_cep_set_free(cep);

			erdma_qp_llp_close(qp);
			erdma_qp_put(qp);

			erdma_cep_set_inuse(cep);
			cep->qp = NULL;
			erdma_qp_put(qp);
		}

		if (cep->sock) {
			erdma_socket_disassoc(cep->sock);
			sock_release(cep->sock);
			cep->sock = NULL;
		}

		if (cep->cm_id) {
			cep->cm_id->rem_ref(cep->cm_id);
			cep->cm_id = NULL;
			if (cep->state != ERDMA_EPSTATE_LISTENING)
				erdma_cep_put(cep);
		}
	}
	erdma_cep_set_free(cep);
	erdma_put_work(work);
	erdma_cep_put(cep);
}
int erdma_cm_queue_work(struct erdma_cep *cep, enum erdma_work_type type)
{
	struct erdma_cm_work *work = erdma_get_work(cep);
	unsigned long delay = 0;

	if (!work)
		return -ENOMEM;

	work->type = type;
	work->cep = cep;

	erdma_cep_get(cep);

	INIT_DELAYED_WORK(&work->work, erdma_cm_work_handler);

	if (type == ERDMA_CM_WORK_MPATIMEOUT) {
		cep->mpa_timer = work;

		if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP)
			delay = MPAREP_TIMEOUT;
		else
			delay = MPAREQ_TIMEOUT;
	} else if (type == ERDMA_CM_WORK_CONNECTTIMEOUT) {
		cep->mpa_timer = work;

		delay = CONNECT_TIMEOUT;
	}

	queue_delayed_work(erdma_cm_wq, &work->work, delay);

	return 0;
}
static void erdma_cm_llp_data_ready(struct sock *sk)
{
	struct erdma_cep *cep;

	trace_sk_data_ready(sk);

	read_lock(&sk->sk_callback_lock);

	cep = sk_to_cep(sk);
	if (!cep)
		goto out;

	if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ ||
	    cep->state == ERDMA_EPSTATE_AWAIT_MPAREP)
		erdma_cm_queue_work(cep, ERDMA_CM_WORK_READ_MPAHDR);

out:
	read_unlock(&sk->sk_callback_lock);
}
static void erdma_cm_llp_error_report(struct sock *sk)
{
	struct erdma_cep *cep = sk_to_cep(sk);

	if (cep)
		cep->sk_error_report(sk);
}
static void erdma_cm_llp_state_change(struct sock *sk)
{
	struct erdma_cep *cep;
	void (*orig_state_change)(struct sock *sk);

	read_lock(&sk->sk_callback_lock);

	cep = sk_to_cep(sk);
	if (!cep) {
		read_unlock(&sk->sk_callback_lock);
		return;
	}
	orig_state_change = cep->sk_state_change;

	switch (sk->sk_state) {
	case TCP_ESTABLISHED:
		if (cep->state == ERDMA_EPSTATE_CONNECTING)
			erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED);
		else
			erdma_cm_queue_work(cep, ERDMA_CM_WORK_ACCEPT);
		break;
	case TCP_CLOSE:
	case TCP_CLOSE_WAIT:
		if (cep->state != ERDMA_EPSTATE_LISTENING)
			erdma_cm_queue_work(cep, ERDMA_CM_WORK_PEER_CLOSE);
		break;
	default:
		break;
	}
	read_unlock(&sk->sk_callback_lock);
	orig_state_change(sk);
}
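
/*
 * Bind to the local address and start a non-blocking TCP connect; the
 * caller treats both 0 and -EINPROGRESS as a successfully started
 * connect.
 */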
static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
			      int laddrlen, struct sockaddr *raddr,
			      int raddrlen, int flags)
{
	int ret;

	sock_set_reuseaddr(s->sk);
	ret = s->ops->bind(s, laddr, laddrlen);
	if (ret)
		return ret;
	ret = s->ops->connect(s, raddr, raddrlen, flags);

	return ret < 0 ? ret : 0;
}
int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
{
	struct erdma_dev *dev = to_edev(id->device);
	struct erdma_qp *qp;
	struct erdma_cep *cep = NULL;
	struct socket *s = NULL;
	struct sockaddr *laddr = (struct sockaddr *)&id->m_local_addr;
	struct sockaddr *raddr = (struct sockaddr *)&id->m_remote_addr;
	u16 pd_len = params->private_data_len;
	int ret;

	if (pd_len > MPA_MAX_PRIVDATA)
		return -EINVAL;

	if (params->ird > dev->attrs.max_ird ||
	    params->ord > dev->attrs.max_ord)
		return -EINVAL;

	if (laddr->sa_family != AF_INET || raddr->sa_family != AF_INET)
		return -EAFNOSUPPORT;

	qp = find_qp_by_qpn(dev, params->qpn);
	if (!qp)
		return -ENOENT;
	erdma_qp_get(qp);

	ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, &s);
	if (ret < 0)
		goto error_put_qp;

	cep = erdma_cep_alloc(dev);
	if (!cep) {
		ret = -ENOMEM;
		goto error_release_sock;
	}

	erdma_cep_set_inuse(cep);

	/* Associate QP with CEP */
	erdma_cep_get(cep);
	qp->cep = cep;
	cep->qp = qp;

	/* Associate cm_id with CEP */
	id->add_ref(id);
	cep->cm_id = id;

	/*
	 * 6: Allocate a sufficient number of work elements
	 * to allow concurrent handling of local + peer close
	 * events, MPA header processing + MPA timeout, connected event
	 * and connect timeout.
	 */
	ret = erdma_cm_alloc_work(cep, 6);
	if (ret != 0) {
		ret = -ENOMEM;
		goto error_release_cep;
	}

	cep->ird = params->ird;
	cep->ord = params->ord;
	cep->state = ERDMA_EPSTATE_CONNECTING;

	erdma_cep_socket_assoc(cep, s);

	if (pd_len) {
		cep->pd_len = pd_len;
		cep->private_data = kmalloc(pd_len, GFP_KERNEL);
		if (!cep->private_data) {
			ret = -ENOMEM;
			goto error_disassoc;
		}

		memcpy(cep->private_data, params->private_data,
		       params->private_data_len);
	}

	ret = kernel_bindconnect(s, laddr, sizeof(*laddr), raddr,
				 sizeof(*raddr), O_NONBLOCK);
	if (ret != -EINPROGRESS && ret != 0) {
		goto error_disassoc;
	} else if (ret == 0) {
		ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED);
		if (ret)
			goto error_disassoc;
	} else {
		ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTTIMEOUT);
		if (ret)
			goto error_disassoc;
	}

	erdma_cep_set_free(cep);
	return 0;

error_disassoc:
	kfree(cep->private_data);
	cep->private_data = NULL;
	cep->pd_len = 0;

	erdma_socket_disassoc(s);

error_release_cep:
	/* disassoc with cm_id */
	cep->cm_id = NULL;
	id->rem_ref(id);

	/* disassoc with qp */
	qp->cep = NULL;
	erdma_cep_put(cep);
	cep->qp = NULL;

	cep->state = ERDMA_EPSTATE_CLOSED;

	erdma_cep_set_free(cep);

	/* release the cep. */
	erdma_cep_put(cep);

error_release_sock:
	if (s)
		sock_release(s);
error_put_qp:
	erdma_qp_put(qp);

	return ret;
}
int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
{
	struct erdma_dev *dev = to_edev(id->device);
	struct erdma_cep *cep = (struct erdma_cep *)id->provider_data;
	struct erdma_qp *qp;
	struct erdma_qp_attrs qp_attrs;
	int ret;

	erdma_cep_set_inuse(cep);
	erdma_cep_put(cep);

	/* Free lingering inbound private data */
	if (cep->mpa.hdr.params.pd_len) {
		cep->mpa.hdr.params.pd_len = 0;
		kfree(cep->mpa.pdata);
		cep->mpa.pdata = NULL;
	}
	erdma_cancel_mpatimer(cep);

	if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) {
		erdma_cep_set_free(cep);
		erdma_cep_put(cep);

		return -ECONNRESET;
	}

	qp = find_qp_by_qpn(dev, params->qpn);
	if (!qp)
		return -ENOENT;
	erdma_qp_get(qp);

	down_write(&qp->state_lock);
	if (qp->attrs.state > ERDMA_QP_STATE_RTR) {
		ret = -EINVAL;
		up_write(&qp->state_lock);
		goto error;
	}

	if (params->ord > dev->attrs.max_ord ||
	    params->ird > dev->attrs.max_ord) {
		ret = -EINVAL;
		up_write(&qp->state_lock);
		goto error;
	}

	if (params->private_data_len > MPA_MAX_PRIVDATA) {
		ret = -EINVAL;
		up_write(&qp->state_lock);
		goto error;
	}

	cep->ird = params->ird;
	cep->ord = params->ord;

	cep->cm_id = id;
	id->add_ref(id);

	memset(&qp_attrs, 0, sizeof(qp_attrs));
	qp_attrs.orq_size = params->ord;
	qp_attrs.irq_size = params->ird;

	qp_attrs.state = ERDMA_QP_STATE_RTS;

	/* Associate QP with CEP */
	erdma_cep_get(cep);
	qp->cep = cep;
	cep->qp = qp;

	cep->state = ERDMA_EPSTATE_RDMA_MODE;

	qp->attrs.qp_type = ERDMA_QP_PASSIVE;
	qp->attrs.pd_len = params->private_data_len;

	if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits))
		qp->attrs.cc = COMPROMISE_CC;

	ret = erdma_modify_qp_internal(qp, &qp_attrs,
				       ERDMA_QP_ATTR_STATE |
				       ERDMA_QP_ATTR_ORD |
				       ERDMA_QP_ATTR_LLP_HANDLE |
				       ERDMA_QP_ATTR_IRD |
				       ERDMA_QP_ATTR_MPA);
	up_write(&qp->state_lock);

	if (ret)
		goto error;

	cep->mpa.ext_data.bits = 0;
	__mpa_ext_set_cc(&cep->mpa.ext_data.bits, qp->attrs.cc);
	cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie);

	ret = erdma_send_mpareqrep(cep, params->private_data,
				   params->private_data_len);
	if (!ret) {
		ret = erdma_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
		if (ret)
			goto error;

		erdma_cep_set_free(cep);

		return 0;
	}

error:
	erdma_socket_disassoc(cep->sock);
	sock_release(cep->sock);
	cep->sock = NULL;

	cep->state = ERDMA_EPSTATE_CLOSED;

	if (cep->cm_id) {
		cep->cm_id->rem_ref(id);
		cep->cm_id = NULL;
	}

	if (qp->cep) {
		erdma_cep_put(cep);
		qp->cep = NULL;
	}

	cep->qp = NULL;
	erdma_qp_put(qp);

	erdma_cep_set_free(cep);
	erdma_cep_put(cep);

	return ret;
}
int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen)
{
	struct erdma_cep *cep = (struct erdma_cep *)id->provider_data;

	erdma_cep_set_inuse(cep);
	erdma_cep_put(cep);

	erdma_cancel_mpatimer(cep);

	if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) {
		erdma_cep_set_free(cep);
		erdma_cep_put(cep);

		return -ECONNRESET;
	}

	if (__mpa_rr_revision(cep->mpa.hdr.params.bits) == MPA_REVISION_EXT_1) {
		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
		erdma_send_mpareqrep(cep, pdata, plen);
	}

	erdma_socket_disassoc(cep->sock);
	sock_release(cep->sock);
	cep->sock = NULL;

	cep->state = ERDMA_EPSTATE_CLOSED;

	erdma_cep_set_free(cep);
	erdma_cep_put(cep);

	return 0;
}
int erdma_create_listen(struct iw_cm_id *id, int backlog)
{
	struct socket *s;
	struct erdma_cep *cep = NULL;
	int ret = 0;
	struct erdma_dev *dev = to_edev(id->device);
	int addr_family = id->local_addr.ss_family;
	struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);

	if (addr_family != AF_INET)
		return -EAFNOSUPPORT;

	ret = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
	if (ret < 0)
		return ret;

	sock_set_reuseaddr(s->sk);

	/* For wildcard addr, limit binding to current device only */
	if (ipv4_is_zeronet(laddr->sin_addr.s_addr))
		s->sk->sk_bound_dev_if = dev->netdev->ifindex;

	ret = s->ops->bind(s, (struct sockaddr *)laddr,
			   sizeof(struct sockaddr_in));
	if (ret)
		goto error;

	cep = erdma_cep_alloc(dev);
	if (!cep) {
		ret = -ENOMEM;
		goto error;
	}
	erdma_cep_socket_assoc(cep, s);

	ret = erdma_cm_alloc_work(cep, backlog);
	if (ret)
		goto error;

	ret = s->ops->listen(s, backlog);
	if (ret)
		goto error;

	cep->cm_id = id;
	id->add_ref(id);

	if (!id->provider_data) {
		id->provider_data =
			kmalloc(sizeof(struct list_head), GFP_KERNEL);
		if (!id->provider_data) {
			ret = -ENOMEM;
			goto error;
		}
		INIT_LIST_HEAD((struct list_head *)id->provider_data);
	}

	list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
	cep->state = ERDMA_EPSTATE_LISTENING;

	return 0;

error:
	if (cep) {
		erdma_cep_set_inuse(cep);

		if (cep->cm_id) {
			cep->cm_id->rem_ref(cep->cm_id);
			cep->cm_id = NULL;
		}
		cep->sock = NULL;
		erdma_socket_disassoc(s);
		cep->state = ERDMA_EPSTATE_CLOSED;

		erdma_cep_set_free(cep);
		erdma_cep_put(cep);
	}
	sock_release(s);

	return ret;
}
static void erdma_drop_listeners(struct iw_cm_id *id)
{
	struct list_head *p, *tmp;

	/*
	 * In case of a wildcard rdma_listen on a multi-homed device,
	 * a listener's IWCM id is associated with more than one listening CEP.
	 */
	list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
		struct erdma_cep *cep =
			list_entry(p, struct erdma_cep, listenq);

		list_del(p);

		erdma_cep_set_inuse(cep);

		if (cep->cm_id) {
			cep->cm_id->rem_ref(cep->cm_id);
			cep->cm_id = NULL;
		}
		if (cep->sock) {
			erdma_socket_disassoc(cep->sock);
			sock_release(cep->sock);
			cep->sock = NULL;
		}
		cep->state = ERDMA_EPSTATE_CLOSED;
		erdma_cep_set_free(cep);
		erdma_cep_put(cep);
	}
}
int erdma_destroy_listen(struct iw_cm_id *id)
{
	if (!id->provider_data)
		return 0;

	erdma_drop_listeners(id);
	kfree(id->provider_data);
	id->provider_data = NULL;

	return 0;
}
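
/*
 * Module-level setup/teardown for the CM: all deferred CM work items
 * run on a dedicated single-threaded workqueue.
 */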
int erdma_cm_init(void)
{
	erdma_cm_wq = create_singlethread_workqueue("erdma_cm_wq");
	if (!erdma_cm_wq)
		return -ENOMEM;

	return 0;
}

void erdma_cm_exit(void)
{
	if (erdma_cm_wq)
		destroy_workqueue(erdma_cm_wq);
}