Merge tag 'trace-printf-v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/trace...
[drm/drm-misc.git] / drivers / infiniband / hw / erdma / erdma_cm.c
blob771059a8eb7d7f76eb38652edbce9bfdb5ee89b3
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
3 /* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
4 /* Kai Shen <kaishen@linux.alibaba.com> */
5 /* Copyright (c) 2020-2022, Alibaba Group. */
7 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
8 /* Fredy Neeser */
9 /* Greg Joyce <greg@opengridcomputing.com> */
10 /* Copyright (c) 2008-2019, IBM Corporation */
11 /* Copyright (c) 2017, Open Grid Computing, Inc. */
13 #include <linux/workqueue.h>
14 #include <trace/events/sock.h>
16 #include "erdma.h"
17 #include "erdma_cm.h"
18 #include "erdma_verbs.h"
/* Workqueue on which erdma_cm_queue_work() schedules CM work items. */
static struct workqueue_struct *erdma_cm_wq;

/* Socket upcalls installed by erdma_sk_assign_cm_upcalls(); defined below. */
static void erdma_cm_llp_data_ready(struct sock *sk);
static void erdma_cm_llp_state_change(struct sock *sk);
static void erdma_cm_llp_error_report(struct sock *sk);
26 static void erdma_sk_assign_cm_upcalls(struct sock *sk)
28 write_lock_bh(&sk->sk_callback_lock);
29 sk->sk_state_change = erdma_cm_llp_state_change;
30 sk->sk_data_ready = erdma_cm_llp_data_ready;
31 sk->sk_error_report = erdma_cm_llp_error_report;
32 write_unlock_bh(&sk->sk_callback_lock);
35 static void erdma_sk_save_upcalls(struct sock *sk)
37 struct erdma_cep *cep = sk_to_cep(sk);
39 write_lock_bh(&sk->sk_callback_lock);
40 cep->sk_state_change = sk->sk_state_change;
41 cep->sk_data_ready = sk->sk_data_ready;
42 cep->sk_error_report = sk->sk_error_report;
43 write_unlock_bh(&sk->sk_callback_lock);
46 static void erdma_sk_restore_upcalls(struct sock *sk, struct erdma_cep *cep)
48 sk->sk_state_change = cep->sk_state_change;
49 sk->sk_data_ready = cep->sk_data_ready;
50 sk->sk_error_report = cep->sk_error_report;
51 sk->sk_user_data = NULL;
54 static void erdma_socket_disassoc(struct socket *s)
56 struct sock *sk = s->sk;
57 struct erdma_cep *cep;
59 if (sk) {
60 write_lock_bh(&sk->sk_callback_lock);
61 cep = sk_to_cep(sk);
62 if (cep) {
63 erdma_sk_restore_upcalls(sk, cep);
64 erdma_cep_put(cep);
65 } else {
66 WARN_ON_ONCE(1);
68 write_unlock_bh(&sk->sk_callback_lock);
69 } else {
70 WARN_ON_ONCE(1);
74 static void erdma_cep_socket_assoc(struct erdma_cep *cep, struct socket *s)
76 cep->sock = s;
77 erdma_cep_get(cep);
78 s->sk->sk_user_data = cep;
80 erdma_sk_save_upcalls(s->sk);
81 erdma_sk_assign_cm_upcalls(s->sk);
84 static void erdma_disassoc_listen_cep(struct erdma_cep *cep)
86 if (cep->listen_cep) {
87 erdma_cep_put(cep->listen_cep);
88 cep->listen_cep = NULL;
92 static struct erdma_cep *erdma_cep_alloc(struct erdma_dev *dev)
94 struct erdma_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL);
95 unsigned long flags;
97 if (!cep)
98 return NULL;
100 INIT_LIST_HEAD(&cep->listenq);
101 INIT_LIST_HEAD(&cep->devq);
102 INIT_LIST_HEAD(&cep->work_freelist);
104 kref_init(&cep->ref);
105 cep->state = ERDMA_EPSTATE_IDLE;
106 init_waitqueue_head(&cep->waitq);
107 spin_lock_init(&cep->lock);
108 cep->dev = dev;
110 spin_lock_irqsave(&dev->lock, flags);
111 list_add_tail(&cep->devq, &dev->cep_list);
112 spin_unlock_irqrestore(&dev->lock, flags);
114 return cep;
117 static void erdma_cm_free_work(struct erdma_cep *cep)
119 struct list_head *w, *tmp;
120 struct erdma_cm_work *work;
122 list_for_each_safe(w, tmp, &cep->work_freelist) {
123 work = list_entry(w, struct erdma_cm_work, list);
124 list_del(&work->list);
125 kfree(work);
129 static void erdma_cancel_mpatimer(struct erdma_cep *cep)
131 spin_lock_bh(&cep->lock);
132 if (cep->mpa_timer) {
133 if (cancel_delayed_work(&cep->mpa_timer->work)) {
134 erdma_cep_put(cep);
135 kfree(cep->mpa_timer);
137 cep->mpa_timer = NULL;
139 spin_unlock_bh(&cep->lock);
142 static void erdma_put_work(struct erdma_cm_work *work)
144 INIT_LIST_HEAD(&work->list);
145 spin_lock_bh(&work->cep->lock);
146 list_add(&work->list, &work->cep->work_freelist);
147 spin_unlock_bh(&work->cep->lock);
150 static void erdma_cep_set_inuse(struct erdma_cep *cep)
152 unsigned long flags;
154 spin_lock_irqsave(&cep->lock, flags);
155 while (cep->in_use) {
156 spin_unlock_irqrestore(&cep->lock, flags);
157 wait_event_interruptible(cep->waitq, !cep->in_use);
158 if (signal_pending(current))
159 flush_signals(current);
161 spin_lock_irqsave(&cep->lock, flags);
164 cep->in_use = 1;
165 spin_unlock_irqrestore(&cep->lock, flags);
168 static void erdma_cep_set_free(struct erdma_cep *cep)
170 unsigned long flags;
172 spin_lock_irqsave(&cep->lock, flags);
173 cep->in_use = 0;
174 spin_unlock_irqrestore(&cep->lock, flags);
176 wake_up(&cep->waitq);
179 static void __erdma_cep_dealloc(struct kref *ref)
181 struct erdma_cep *cep = container_of(ref, struct erdma_cep, ref);
182 struct erdma_dev *dev = cep->dev;
183 unsigned long flags;
185 WARN_ON(cep->listen_cep);
187 kfree(cep->private_data);
188 kfree(cep->mpa.pdata);
189 spin_lock_bh(&cep->lock);
190 if (!list_empty(&cep->work_freelist))
191 erdma_cm_free_work(cep);
192 spin_unlock_bh(&cep->lock);
194 spin_lock_irqsave(&dev->lock, flags);
195 list_del(&cep->devq);
196 spin_unlock_irqrestore(&dev->lock, flags);
197 kfree(cep);
200 static struct erdma_cm_work *erdma_get_work(struct erdma_cep *cep)
202 struct erdma_cm_work *work = NULL;
204 spin_lock_bh(&cep->lock);
205 if (!list_empty(&cep->work_freelist)) {
206 work = list_entry(cep->work_freelist.next, struct erdma_cm_work,
207 list);
208 list_del_init(&work->list);
211 spin_unlock_bh(&cep->lock);
212 return work;
215 static int erdma_cm_alloc_work(struct erdma_cep *cep, int num)
217 struct erdma_cm_work *work;
219 while (num--) {
220 work = kmalloc(sizeof(*work), GFP_KERNEL);
221 if (!work) {
222 if (!(list_empty(&cep->work_freelist)))
223 erdma_cm_free_work(cep);
224 return -ENOMEM;
226 work->cep = cep;
227 INIT_LIST_HEAD(&work->list);
228 list_add(&work->list, &cep->work_freelist);
231 return 0;
234 static int erdma_cm_upcall(struct erdma_cep *cep, enum iw_cm_event_type reason,
235 int status)
237 struct iw_cm_event event;
238 struct iw_cm_id *cm_id;
240 memset(&event, 0, sizeof(event));
241 event.status = status;
242 event.event = reason;
244 if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
245 event.provider_data = cep;
246 cm_id = cep->listen_cep->cm_id;
248 event.ird = cep->dev->attrs.max_ird;
249 event.ord = cep->dev->attrs.max_ord;
250 } else {
251 cm_id = cep->cm_id;
254 if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
255 reason == IW_CM_EVENT_CONNECT_REPLY) {
256 u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
258 if (pd_len && cep->mpa.pdata) {
259 event.private_data_len = pd_len;
260 event.private_data = cep->mpa.pdata;
263 getname_local(cep->sock, &event.local_addr);
264 getname_peer(cep->sock, &event.remote_addr);
267 return cm_id->event_handler(cm_id, &event);
270 void erdma_qp_cm_drop(struct erdma_qp *qp)
272 struct erdma_cep *cep = qp->cep;
274 if (!qp->cep)
275 return;
277 erdma_cep_set_inuse(cep);
279 /* already closed. */
280 if (cep->state == ERDMA_EPSTATE_CLOSED)
281 goto out;
283 if (cep->cm_id) {
284 switch (cep->state) {
285 case ERDMA_EPSTATE_AWAIT_MPAREP:
286 erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
287 -EINVAL);
288 break;
289 case ERDMA_EPSTATE_RDMA_MODE:
290 erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
291 break;
292 case ERDMA_EPSTATE_IDLE:
293 case ERDMA_EPSTATE_LISTENING:
294 case ERDMA_EPSTATE_CONNECTING:
295 case ERDMA_EPSTATE_AWAIT_MPAREQ:
296 case ERDMA_EPSTATE_RECVD_MPAREQ:
297 case ERDMA_EPSTATE_CLOSED:
298 default:
299 break;
301 cep->cm_id->rem_ref(cep->cm_id);
302 cep->cm_id = NULL;
303 erdma_cep_put(cep);
305 cep->state = ERDMA_EPSTATE_CLOSED;
307 if (cep->sock) {
308 erdma_socket_disassoc(cep->sock);
309 sock_release(cep->sock);
310 cep->sock = NULL;
313 if (cep->qp) {
314 cep->qp = NULL;
315 erdma_qp_put(qp);
317 out:
318 erdma_cep_set_free(cep);
321 void erdma_cep_put(struct erdma_cep *cep)
323 WARN_ON(kref_read(&cep->ref) < 1);
324 kref_put(&cep->ref, __erdma_cep_dealloc);
327 void erdma_cep_get(struct erdma_cep *cep)
329 kref_get(&cep->ref);
332 static int erdma_send_mpareqrep(struct erdma_cep *cep, const void *pdata,
333 u8 pd_len)
335 struct socket *s = cep->sock;
336 struct mpa_rr *rr = &cep->mpa.hdr;
337 struct kvec iov[3];
338 struct msghdr msg;
339 int iovec_num = 0;
340 int ret;
341 int mpa_len;
343 memset(&msg, 0, sizeof(msg));
345 rr->params.pd_len = cpu_to_be16(pd_len);
347 iov[iovec_num].iov_base = rr;
348 iov[iovec_num].iov_len = sizeof(*rr);
349 iovec_num++;
350 mpa_len = sizeof(*rr);
352 iov[iovec_num].iov_base = &cep->mpa.ext_data;
353 iov[iovec_num].iov_len = sizeof(cep->mpa.ext_data);
354 iovec_num++;
355 mpa_len += sizeof(cep->mpa.ext_data);
357 if (pd_len) {
358 iov[iovec_num].iov_base = (char *)pdata;
359 iov[iovec_num].iov_len = pd_len;
360 mpa_len += pd_len;
361 iovec_num++;
364 ret = kernel_sendmsg(s, &msg, iov, iovec_num, mpa_len);
366 return ret < 0 ? ret : 0;
369 static inline int ksock_recv(struct socket *sock, char *buf, size_t size,
370 int flags)
372 struct kvec iov = { buf, size };
373 struct msghdr msg = { .msg_name = NULL, .msg_flags = flags };
375 return kernel_recvmsg(sock, &msg, &iov, 1, size, flags);
378 static int __recv_mpa_hdr(struct erdma_cep *cep, int hdr_rcvd, char *hdr,
379 int hdr_size, int *rcvd_out)
381 struct socket *s = cep->sock;
382 int rcvd;
384 *rcvd_out = 0;
385 if (hdr_rcvd < hdr_size) {
386 rcvd = ksock_recv(s, hdr + hdr_rcvd, hdr_size - hdr_rcvd,
387 MSG_DONTWAIT);
388 if (rcvd == -EAGAIN)
389 return -EAGAIN;
391 if (rcvd <= 0)
392 return -ECONNABORTED;
394 hdr_rcvd += rcvd;
395 *rcvd_out = rcvd;
397 if (hdr_rcvd < hdr_size)
398 return -EAGAIN;
401 return 0;
404 static void __mpa_rr_set_revision(__be16 *bits, u8 rev)
406 *bits = (*bits & ~MPA_RR_MASK_REVISION) |
407 (cpu_to_be16(rev) & MPA_RR_MASK_REVISION);
410 static u8 __mpa_rr_revision(__be16 mpa_rr_bits)
412 __be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION;
414 return (u8)be16_to_cpu(rev);
417 static void __mpa_ext_set_cc(__be32 *bits, u32 cc)
419 *bits = (*bits & ~MPA_EXT_FLAG_CC) |
420 (cpu_to_be32(cc) & MPA_EXT_FLAG_CC);
423 static u8 __mpa_ext_cc(__be32 mpa_ext_bits)
425 __be32 cc = mpa_ext_bits & MPA_EXT_FLAG_CC;
427 return (u8)be32_to_cpu(cc);
431 * Receive MPA Request/Reply header.
433 * Returns 0 if complete MPA Request/Reply haeder including
434 * eventual private data was received. Returns -EAGAIN if
435 * header was partially received or negative error code otherwise.
437 * Context: May be called in process context only
439 static int erdma_recv_mpa_rr(struct erdma_cep *cep)
441 struct mpa_rr *hdr = &cep->mpa.hdr;
442 struct socket *s = cep->sock;
443 u16 pd_len;
444 int rcvd, to_rcv, ret, pd_rcvd;
446 if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
447 ret = __recv_mpa_hdr(cep, cep->mpa.bytes_rcvd,
448 (char *)&cep->mpa.hdr,
449 sizeof(struct mpa_rr), &rcvd);
450 cep->mpa.bytes_rcvd += rcvd;
451 if (ret)
452 return ret;
455 if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA ||
456 __mpa_rr_revision(hdr->params.bits) != MPA_REVISION_EXT_1)
457 return -EPROTO;
459 if (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) <
460 sizeof(struct erdma_mpa_ext)) {
461 ret = __recv_mpa_hdr(
462 cep, cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
463 (char *)&cep->mpa.ext_data,
464 sizeof(struct erdma_mpa_ext), &rcvd);
465 cep->mpa.bytes_rcvd += rcvd;
466 if (ret)
467 return ret;
470 pd_len = be16_to_cpu(hdr->params.pd_len);
471 pd_rcvd = cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) -
472 sizeof(struct erdma_mpa_ext);
473 to_rcv = pd_len - pd_rcvd;
475 if (!to_rcv) {
477 * We have received the whole MPA Request/Reply message.
478 * Check against peer protocol violation.
480 u32 word;
482 ret = __recv_mpa_hdr(cep, 0, (char *)&word, sizeof(word),
483 &rcvd);
484 if (ret == -EAGAIN && rcvd == 0)
485 return 0;
487 if (ret)
488 return ret;
490 return -EPROTO;
494 * At this point, MPA header has been fully received, and pd_len != 0.
495 * So, begin to receive private data.
497 if (!cep->mpa.pdata) {
498 cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
499 if (!cep->mpa.pdata)
500 return -ENOMEM;
503 rcvd = ksock_recv(s, cep->mpa.pdata + pd_rcvd, to_rcv + 4,
504 MSG_DONTWAIT);
505 if (rcvd < 0)
506 return rcvd;
508 if (rcvd > to_rcv)
509 return -EPROTO;
511 cep->mpa.bytes_rcvd += rcvd;
513 if (to_rcv == rcvd)
514 return 0;
516 return -EAGAIN;
520 * erdma_proc_mpareq()
522 * Read MPA Request from socket and signal new connection to IWCM
523 * if success. Caller must hold lock on corresponding listening CEP.
525 static int erdma_proc_mpareq(struct erdma_cep *cep)
527 struct mpa_rr *req;
528 int ret;
530 ret = erdma_recv_mpa_rr(cep);
531 if (ret)
532 return ret;
534 req = &cep->mpa.hdr;
536 if (memcmp(req->key, MPA_KEY_REQ, MPA_KEY_SIZE))
537 return -EPROTO;
539 memcpy(req->key, MPA_KEY_REP, MPA_KEY_SIZE);
541 /* Currently does not support marker and crc. */
542 if (req->params.bits & MPA_RR_FLAG_MARKERS ||
543 req->params.bits & MPA_RR_FLAG_CRC)
544 goto reject_conn;
546 cep->state = ERDMA_EPSTATE_RECVD_MPAREQ;
548 /* Keep reference until IWCM accepts/rejects */
549 erdma_cep_get(cep);
550 ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
551 if (ret)
552 erdma_cep_put(cep);
554 return ret;
556 reject_conn:
557 req->params.bits &= ~MPA_RR_FLAG_MARKERS;
558 req->params.bits |= MPA_RR_FLAG_REJECT;
559 req->params.bits &= ~MPA_RR_FLAG_CRC;
561 kfree(cep->mpa.pdata);
562 cep->mpa.pdata = NULL;
563 erdma_send_mpareqrep(cep, NULL, 0);
565 return -EOPNOTSUPP;
/*
 * Read and process the MPA reply on the connecting side.
 *
 * On a complete, acceptable reply the QP is moved to RTS and a successful
 * CONNECT_REPLY is signalled to the IWCM; the CEP enters RDMA mode if that
 * upcall succeeds. On failure a negative CONNECT_REPLY is signalled,
 * except for -EAGAIN (reply not complete yet).
 *
 * Returns 0 on success, -EAGAIN if more data is needed, or a negative
 * error code.
 */
static int erdma_proc_mpareply(struct erdma_cep *cep)
{
	struct erdma_qp *qp = cep->qp;
	struct mpa_rr *rep = &cep->mpa.hdr;
	struct erdma_qp_attrs qp_attrs;
	int ret;

	ret = erdma_recv_mpa_rr(cep);
	if (ret)
		goto out_err;

	erdma_cancel_mpatimer(cep);

	if (memcmp(rep->key, MPA_KEY_REP, MPA_KEY_SIZE)) {
		ret = -EPROTO;
		goto out_err;
	}

	if (rep->params.bits & MPA_RR_FLAG_REJECT) {
		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
		return -ECONNRESET;
	}

	/* Currently does not support marker and crc. */
	if (rep->params.bits & (MPA_RR_FLAG_MARKERS | MPA_RR_FLAG_CRC)) {
		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
		return -EINVAL;
	}

	memset(&qp_attrs, 0, sizeof(qp_attrs));
	qp_attrs.state = ERDMA_QP_STATE_RTS;
	qp_attrs.irq_size = cep->ird;
	qp_attrs.orq_size = cep->ord;

	down_write(&qp->state_lock);
	if (qp->attrs.state > ERDMA_QP_STATE_RTR) {
		up_write(&qp->state_lock);
		ret = -EINVAL;
		goto out_err;
	}

	qp->attrs.qp_type = ERDMA_QP_ACTIVE;
	/* Fall back to the compromise CC when both sides disagree. */
	if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc)
		qp->attrs.cc = COMPROMISE_CC;

	ret = erdma_modify_qp_internal(qp, &qp_attrs,
				       ERDMA_QP_ATTR_STATE |
				       ERDMA_QP_ATTR_LLP_HANDLE |
				       ERDMA_QP_ATTR_MPA);

	up_write(&qp->state_lock);

	if (!ret) {
		ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
		if (!ret)
			cep->state = ERDMA_EPSTATE_RDMA_MODE;

		return 0;
	}

out_err:
	if (ret != -EAGAIN)
		erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);

	return ret;
}
638 static void erdma_accept_newconn(struct erdma_cep *cep)
640 struct socket *s = cep->sock;
641 struct socket *new_s = NULL;
642 struct erdma_cep *new_cep = NULL;
643 int ret = 0;
645 if (cep->state != ERDMA_EPSTATE_LISTENING)
646 goto error;
648 new_cep = erdma_cep_alloc(cep->dev);
649 if (!new_cep)
650 goto error;
653 * 4: Allocate a sufficient number of work elements
654 * to allow concurrent handling of local + peer close
655 * events, MPA header processing + MPA timeout.
657 if (erdma_cm_alloc_work(new_cep, 4) != 0)
658 goto error;
661 * Copy saved socket callbacks from listening CEP
662 * and assign new socket with new CEP
664 new_cep->sk_state_change = cep->sk_state_change;
665 new_cep->sk_data_ready = cep->sk_data_ready;
666 new_cep->sk_error_report = cep->sk_error_report;
668 ret = kernel_accept(s, &new_s, O_NONBLOCK);
669 if (ret != 0)
670 goto error;
672 new_cep->sock = new_s;
673 erdma_cep_get(new_cep);
674 new_s->sk->sk_user_data = new_cep;
676 tcp_sock_set_nodelay(new_s->sk);
677 new_cep->state = ERDMA_EPSTATE_AWAIT_MPAREQ;
679 ret = erdma_cm_queue_work(new_cep, ERDMA_CM_WORK_MPATIMEOUT);
680 if (ret)
681 goto error;
683 new_cep->listen_cep = cep;
684 erdma_cep_get(cep);
686 if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
687 /* MPA REQ already queued */
688 erdma_cep_set_inuse(new_cep);
689 ret = erdma_proc_mpareq(new_cep);
690 if (ret != -EAGAIN) {
691 erdma_cep_put(cep);
692 new_cep->listen_cep = NULL;
693 if (ret) {
694 erdma_cep_set_free(new_cep);
695 goto error;
698 erdma_cep_set_free(new_cep);
700 return;
702 error:
703 if (new_cep) {
704 new_cep->state = ERDMA_EPSTATE_CLOSED;
705 erdma_cancel_mpatimer(new_cep);
707 erdma_cep_put(new_cep);
708 new_cep->sock = NULL;
711 if (new_s) {
712 erdma_socket_disassoc(new_s);
713 sock_release(new_s);
717 static int erdma_newconn_connected(struct erdma_cep *cep)
719 int ret = 0;
721 cep->mpa.hdr.params.bits = 0;
722 __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, MPA_REVISION_EXT_1);
724 memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, MPA_KEY_SIZE);
725 cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie);
726 __mpa_ext_set_cc(&cep->mpa.ext_data.bits, cep->qp->attrs.cc);
728 ret = erdma_send_mpareqrep(cep, cep->private_data, cep->pd_len);
729 cep->state = ERDMA_EPSTATE_AWAIT_MPAREP;
730 cep->mpa.hdr.params.pd_len = 0;
732 if (ret >= 0)
733 ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_MPATIMEOUT);
735 return ret;
738 static void erdma_cm_work_handler(struct work_struct *w)
740 struct erdma_cm_work *work;
741 struct erdma_cep *cep;
742 int release_cep = 0, ret = 0;
744 work = container_of(w, struct erdma_cm_work, work.work);
745 cep = work->cep;
747 erdma_cep_set_inuse(cep);
749 switch (work->type) {
750 case ERDMA_CM_WORK_CONNECTED:
751 erdma_cancel_mpatimer(cep);
752 if (cep->state == ERDMA_EPSTATE_CONNECTING) {
753 ret = erdma_newconn_connected(cep);
754 if (ret) {
755 erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
756 -EIO);
757 release_cep = 1;
760 break;
761 case ERDMA_CM_WORK_CONNECTTIMEOUT:
762 if (cep->state == ERDMA_EPSTATE_CONNECTING) {
763 cep->mpa_timer = NULL;
764 erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
765 -ETIMEDOUT);
766 release_cep = 1;
768 break;
769 case ERDMA_CM_WORK_ACCEPT:
770 erdma_accept_newconn(cep);
771 break;
772 case ERDMA_CM_WORK_READ_MPAHDR:
773 if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
774 if (cep->listen_cep) {
775 erdma_cep_set_inuse(cep->listen_cep);
777 if (cep->listen_cep->state ==
778 ERDMA_EPSTATE_LISTENING)
779 ret = erdma_proc_mpareq(cep);
780 else
781 ret = -EFAULT;
783 erdma_cep_set_free(cep->listen_cep);
785 if (ret != -EAGAIN) {
786 erdma_cep_put(cep->listen_cep);
787 cep->listen_cep = NULL;
788 if (ret)
789 erdma_cep_put(cep);
792 } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
793 ret = erdma_proc_mpareply(cep);
796 if (ret && ret != -EAGAIN)
797 release_cep = 1;
798 break;
799 case ERDMA_CM_WORK_CLOSE_LLP:
800 if (cep->cm_id)
801 erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
802 release_cep = 1;
803 break;
804 case ERDMA_CM_WORK_PEER_CLOSE:
805 if (cep->cm_id) {
806 if (cep->state == ERDMA_EPSTATE_CONNECTING ||
807 cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
809 * MPA reply not received, but connection drop
811 erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
812 -ECONNRESET);
813 } else if (cep->state == ERDMA_EPSTATE_RDMA_MODE) {
815 * NOTE: IW_CM_EVENT_DISCONNECT is given just
816 * to transition IWCM into CLOSING.
818 erdma_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
819 erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
821 } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
822 /* Socket close before MPA request received. */
823 erdma_disassoc_listen_cep(cep);
824 erdma_cep_put(cep);
826 release_cep = 1;
827 break;
828 case ERDMA_CM_WORK_MPATIMEOUT:
829 cep->mpa_timer = NULL;
830 if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) {
832 * MPA request timed out:
833 * Hide any partially received private data and signal
834 * timeout
836 cep->mpa.hdr.params.pd_len = 0;
838 if (cep->cm_id)
839 erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
840 -ETIMEDOUT);
841 release_cep = 1;
842 } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) {
843 /* No MPA req received after peer TCP stream setup. */
844 erdma_disassoc_listen_cep(cep);
846 erdma_cep_put(cep);
847 release_cep = 1;
849 break;
850 default:
851 WARN(1, "Undefined CM work type: %d\n", work->type);
854 if (release_cep) {
855 erdma_cancel_mpatimer(cep);
856 cep->state = ERDMA_EPSTATE_CLOSED;
857 if (cep->qp) {
858 struct erdma_qp *qp = cep->qp;
860 * Serialize a potential race with application
861 * closing the QP and calling erdma_qp_cm_drop()
863 erdma_qp_get(qp);
864 erdma_cep_set_free(cep);
866 erdma_qp_llp_close(qp);
867 erdma_qp_put(qp);
869 erdma_cep_set_inuse(cep);
870 cep->qp = NULL;
871 erdma_qp_put(qp);
874 if (cep->sock) {
875 erdma_socket_disassoc(cep->sock);
876 sock_release(cep->sock);
877 cep->sock = NULL;
880 if (cep->cm_id) {
881 cep->cm_id->rem_ref(cep->cm_id);
882 cep->cm_id = NULL;
883 if (cep->state != ERDMA_EPSTATE_LISTENING)
884 erdma_cep_put(cep);
887 erdma_cep_set_free(cep);
888 erdma_put_work(work);
889 erdma_cep_put(cep);
892 int erdma_cm_queue_work(struct erdma_cep *cep, enum erdma_work_type type)
894 struct erdma_cm_work *work = erdma_get_work(cep);
895 unsigned long delay = 0;
897 if (!work)
898 return -ENOMEM;
900 work->type = type;
901 work->cep = cep;
903 erdma_cep_get(cep);
905 INIT_DELAYED_WORK(&work->work, erdma_cm_work_handler);
907 if (type == ERDMA_CM_WORK_MPATIMEOUT) {
908 cep->mpa_timer = work;
910 if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP)
911 delay = MPAREP_TIMEOUT;
912 else
913 delay = MPAREQ_TIMEOUT;
914 } else if (type == ERDMA_CM_WORK_CONNECTTIMEOUT) {
915 cep->mpa_timer = work;
917 delay = CONNECT_TIMEOUT;
920 queue_delayed_work(erdma_cm_wq, &work->work, delay);
922 return 0;
925 static void erdma_cm_llp_data_ready(struct sock *sk)
927 struct erdma_cep *cep;
929 trace_sk_data_ready(sk);
931 read_lock(&sk->sk_callback_lock);
933 cep = sk_to_cep(sk);
934 if (!cep)
935 goto out;
937 if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ ||
938 cep->state == ERDMA_EPSTATE_AWAIT_MPAREP)
939 erdma_cm_queue_work(cep, ERDMA_CM_WORK_READ_MPAHDR);
941 out:
942 read_unlock(&sk->sk_callback_lock);
945 static void erdma_cm_llp_error_report(struct sock *sk)
947 struct erdma_cep *cep = sk_to_cep(sk);
949 if (cep)
950 cep->sk_error_report(sk);
953 static void erdma_cm_llp_state_change(struct sock *sk)
955 struct erdma_cep *cep;
956 void (*orig_state_change)(struct sock *sk);
958 read_lock(&sk->sk_callback_lock);
960 cep = sk_to_cep(sk);
961 if (!cep) {
962 read_unlock(&sk->sk_callback_lock);
963 return;
965 orig_state_change = cep->sk_state_change;
967 switch (sk->sk_state) {
968 case TCP_ESTABLISHED:
969 if (cep->state == ERDMA_EPSTATE_CONNECTING)
970 erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED);
971 else
972 erdma_cm_queue_work(cep, ERDMA_CM_WORK_ACCEPT);
973 break;
974 case TCP_CLOSE:
975 case TCP_CLOSE_WAIT:
976 if (cep->state != ERDMA_EPSTATE_LISTENING)
977 erdma_cm_queue_work(cep, ERDMA_CM_WORK_PEER_CLOSE);
978 break;
979 default:
980 break;
982 read_unlock(&sk->sk_callback_lock);
983 orig_state_change(sk);
986 static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
987 int laddrlen, struct sockaddr *raddr,
988 int raddrlen, int flags)
990 int ret;
992 sock_set_reuseaddr(s->sk);
993 ret = s->ops->bind(s, laddr, laddrlen);
994 if (ret)
995 return ret;
996 ret = s->ops->connect(s, raddr, raddrlen, flags);
997 return ret < 0 ? ret : 0;
1000 int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
1002 struct erdma_dev *dev = to_edev(id->device);
1003 struct erdma_qp *qp;
1004 struct erdma_cep *cep = NULL;
1005 struct socket *s = NULL;
1006 struct sockaddr *laddr = (struct sockaddr *)&id->m_local_addr;
1007 struct sockaddr *raddr = (struct sockaddr *)&id->m_remote_addr;
1008 u16 pd_len = params->private_data_len;
1009 int ret;
1011 if (pd_len > MPA_MAX_PRIVDATA)
1012 return -EINVAL;
1014 if (params->ird > dev->attrs.max_ird ||
1015 params->ord > dev->attrs.max_ord)
1016 return -EINVAL;
1018 if (laddr->sa_family != AF_INET || raddr->sa_family != AF_INET)
1019 return -EAFNOSUPPORT;
1021 qp = find_qp_by_qpn(dev, params->qpn);
1022 if (!qp)
1023 return -ENOENT;
1024 erdma_qp_get(qp);
1026 ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, &s);
1027 if (ret < 0)
1028 goto error_put_qp;
1030 cep = erdma_cep_alloc(dev);
1031 if (!cep) {
1032 ret = -ENOMEM;
1033 goto error_release_sock;
1036 erdma_cep_set_inuse(cep);
1038 /* Associate QP with CEP */
1039 erdma_cep_get(cep);
1040 qp->cep = cep;
1041 cep->qp = qp;
1043 /* Associate cm_id with CEP */
1044 id->add_ref(id);
1045 cep->cm_id = id;
1048 * 6: Allocate a sufficient number of work elements
1049 * to allow concurrent handling of local + peer close
1050 * events, MPA header processing + MPA timeout, connected event
1051 * and connect timeout.
1053 ret = erdma_cm_alloc_work(cep, 6);
1054 if (ret != 0) {
1055 ret = -ENOMEM;
1056 goto error_release_cep;
1059 cep->ird = params->ird;
1060 cep->ord = params->ord;
1061 cep->state = ERDMA_EPSTATE_CONNECTING;
1063 erdma_cep_socket_assoc(cep, s);
1065 if (pd_len) {
1066 cep->pd_len = pd_len;
1067 cep->private_data = kmalloc(pd_len, GFP_KERNEL);
1068 if (!cep->private_data) {
1069 ret = -ENOMEM;
1070 goto error_disassoc;
1073 memcpy(cep->private_data, params->private_data,
1074 params->private_data_len);
1077 ret = kernel_bindconnect(s, laddr, sizeof(*laddr), raddr,
1078 sizeof(*raddr), O_NONBLOCK);
1079 if (ret != -EINPROGRESS && ret != 0) {
1080 goto error_disassoc;
1081 } else if (ret == 0) {
1082 ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED);
1083 if (ret)
1084 goto error_disassoc;
1085 } else {
1086 ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTTIMEOUT);
1087 if (ret)
1088 goto error_disassoc;
1091 erdma_cep_set_free(cep);
1092 return 0;
1094 error_disassoc:
1095 kfree(cep->private_data);
1096 cep->private_data = NULL;
1097 cep->pd_len = 0;
1099 erdma_socket_disassoc(s);
1101 error_release_cep:
1102 /* disassoc with cm_id */
1103 cep->cm_id = NULL;
1104 id->rem_ref(id);
1106 /* disassoc with qp */
1107 qp->cep = NULL;
1108 erdma_cep_put(cep);
1109 cep->qp = NULL;
1111 cep->state = ERDMA_EPSTATE_CLOSED;
1113 erdma_cep_set_free(cep);
1115 /* release the cep. */
1116 erdma_cep_put(cep);
1118 error_release_sock:
1119 if (s)
1120 sock_release(s);
1121 error_put_qp:
1122 erdma_qp_put(qp);
1124 return ret;
1127 int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
1129 struct erdma_dev *dev = to_edev(id->device);
1130 struct erdma_cep *cep = (struct erdma_cep *)id->provider_data;
1131 struct erdma_qp *qp;
1132 struct erdma_qp_attrs qp_attrs;
1133 int ret;
1135 erdma_cep_set_inuse(cep);
1136 erdma_cep_put(cep);
1138 /* Free lingering inbound private data */
1139 if (cep->mpa.hdr.params.pd_len) {
1140 cep->mpa.hdr.params.pd_len = 0;
1141 kfree(cep->mpa.pdata);
1142 cep->mpa.pdata = NULL;
1144 erdma_cancel_mpatimer(cep);
1146 if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) {
1147 erdma_cep_set_free(cep);
1148 erdma_cep_put(cep);
1150 return -ECONNRESET;
1153 qp = find_qp_by_qpn(dev, params->qpn);
1154 if (!qp)
1155 return -ENOENT;
1156 erdma_qp_get(qp);
1158 down_write(&qp->state_lock);
1159 if (qp->attrs.state > ERDMA_QP_STATE_RTR) {
1160 ret = -EINVAL;
1161 up_write(&qp->state_lock);
1162 goto error;
1165 if (params->ord > dev->attrs.max_ord ||
1166 params->ird > dev->attrs.max_ord) {
1167 ret = -EINVAL;
1168 up_write(&qp->state_lock);
1169 goto error;
1172 if (params->private_data_len > MPA_MAX_PRIVDATA) {
1173 ret = -EINVAL;
1174 up_write(&qp->state_lock);
1175 goto error;
1178 cep->ird = params->ird;
1179 cep->ord = params->ord;
1181 cep->cm_id = id;
1182 id->add_ref(id);
1184 memset(&qp_attrs, 0, sizeof(qp_attrs));
1185 qp_attrs.orq_size = params->ord;
1186 qp_attrs.irq_size = params->ird;
1188 qp_attrs.state = ERDMA_QP_STATE_RTS;
1190 /* Associate QP with CEP */
1191 erdma_cep_get(cep);
1192 qp->cep = cep;
1193 cep->qp = qp;
1195 cep->state = ERDMA_EPSTATE_RDMA_MODE;
1197 qp->attrs.qp_type = ERDMA_QP_PASSIVE;
1198 qp->attrs.pd_len = params->private_data_len;
1200 if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits))
1201 qp->attrs.cc = COMPROMISE_CC;
1203 /* move to rts */
1204 ret = erdma_modify_qp_internal(qp, &qp_attrs,
1205 ERDMA_QP_ATTR_STATE |
1206 ERDMA_QP_ATTR_ORD |
1207 ERDMA_QP_ATTR_LLP_HANDLE |
1208 ERDMA_QP_ATTR_IRD |
1209 ERDMA_QP_ATTR_MPA);
1210 up_write(&qp->state_lock);
1212 if (ret)
1213 goto error;
1215 cep->mpa.ext_data.bits = 0;
1216 __mpa_ext_set_cc(&cep->mpa.ext_data.bits, qp->attrs.cc);
1217 cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie);
1219 ret = erdma_send_mpareqrep(cep, params->private_data,
1220 params->private_data_len);
1221 if (!ret) {
1222 ret = erdma_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
1223 if (ret)
1224 goto error;
1226 erdma_cep_set_free(cep);
1228 return 0;
1231 error:
1232 erdma_socket_disassoc(cep->sock);
1233 sock_release(cep->sock);
1234 cep->sock = NULL;
1236 cep->state = ERDMA_EPSTATE_CLOSED;
1238 if (cep->cm_id) {
1239 cep->cm_id->rem_ref(id);
1240 cep->cm_id = NULL;
1243 if (qp->cep) {
1244 erdma_cep_put(cep);
1245 qp->cep = NULL;
1248 cep->qp = NULL;
1249 erdma_qp_put(qp);
1251 erdma_cep_set_free(cep);
1252 erdma_cep_put(cep);
1254 return ret;
1257 int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen)
1259 struct erdma_cep *cep = (struct erdma_cep *)id->provider_data;
1261 erdma_cep_set_inuse(cep);
1262 erdma_cep_put(cep);
1264 erdma_cancel_mpatimer(cep);
1266 if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) {
1267 erdma_cep_set_free(cep);
1268 erdma_cep_put(cep);
1270 return -ECONNRESET;
1273 if (__mpa_rr_revision(cep->mpa.hdr.params.bits) == MPA_REVISION_EXT_1) {
1274 cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
1275 erdma_send_mpareqrep(cep, pdata, plen);
1278 erdma_socket_disassoc(cep->sock);
1279 sock_release(cep->sock);
1280 cep->sock = NULL;
1282 cep->state = ERDMA_EPSTATE_CLOSED;
1284 erdma_cep_set_free(cep);
1285 erdma_cep_put(cep);
1287 return 0;
1290 int erdma_create_listen(struct iw_cm_id *id, int backlog)
1292 struct socket *s;
1293 struct erdma_cep *cep = NULL;
1294 int ret = 0;
1295 struct erdma_dev *dev = to_edev(id->device);
1296 int addr_family = id->local_addr.ss_family;
1297 struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);
1299 if (addr_family != AF_INET)
1300 return -EAFNOSUPPORT;
1302 ret = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
1303 if (ret < 0)
1304 return ret;
1306 sock_set_reuseaddr(s->sk);
1308 /* For wildcard addr, limit binding to current device only */
1309 if (ipv4_is_zeronet(laddr->sin_addr.s_addr))
1310 s->sk->sk_bound_dev_if = dev->netdev->ifindex;
1312 ret = s->ops->bind(s, (struct sockaddr *)laddr,
1313 sizeof(struct sockaddr_in));
1314 if (ret)
1315 goto error;
1317 cep = erdma_cep_alloc(dev);
1318 if (!cep) {
1319 ret = -ENOMEM;
1320 goto error;
1322 erdma_cep_socket_assoc(cep, s);
1324 ret = erdma_cm_alloc_work(cep, backlog);
1325 if (ret)
1326 goto error;
1328 ret = s->ops->listen(s, backlog);
1329 if (ret)
1330 goto error;
1332 cep->cm_id = id;
1333 id->add_ref(id);
1335 if (!id->provider_data) {
1336 id->provider_data =
1337 kmalloc(sizeof(struct list_head), GFP_KERNEL);
1338 if (!id->provider_data) {
1339 ret = -ENOMEM;
1340 goto error;
1342 INIT_LIST_HEAD((struct list_head *)id->provider_data);
1345 list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
1346 cep->state = ERDMA_EPSTATE_LISTENING;
1348 return 0;
1350 error:
1351 if (cep) {
1352 erdma_cep_set_inuse(cep);
1354 if (cep->cm_id) {
1355 cep->cm_id->rem_ref(cep->cm_id);
1356 cep->cm_id = NULL;
1358 cep->sock = NULL;
1359 erdma_socket_disassoc(s);
1360 cep->state = ERDMA_EPSTATE_CLOSED;
1362 erdma_cep_set_free(cep);
1363 erdma_cep_put(cep);
1365 sock_release(s);
1367 return ret;
1370 static void erdma_drop_listeners(struct iw_cm_id *id)
1372 struct list_head *p, *tmp;
1374 * In case of a wildcard rdma_listen on a multi-homed device,
1375 * a listener's IWCM id is associated with more than one listening CEP.
1377 list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
1378 struct erdma_cep *cep =
1379 list_entry(p, struct erdma_cep, listenq);
1381 list_del(p);
1383 erdma_cep_set_inuse(cep);
1385 if (cep->cm_id) {
1386 cep->cm_id->rem_ref(cep->cm_id);
1387 cep->cm_id = NULL;
1389 if (cep->sock) {
1390 erdma_socket_disassoc(cep->sock);
1391 sock_release(cep->sock);
1392 cep->sock = NULL;
1394 cep->state = ERDMA_EPSTATE_CLOSED;
1395 erdma_cep_set_free(cep);
1396 erdma_cep_put(cep);
1400 int erdma_destroy_listen(struct iw_cm_id *id)
1402 if (!id->provider_data)
1403 return 0;
1405 erdma_drop_listeners(id);
1406 kfree(id->provider_data);
1407 id->provider_data = NULL;
1409 return 0;
1412 int erdma_cm_init(void)
1414 erdma_cm_wq = create_singlethread_workqueue("erdma_cm_wq");
1415 if (!erdma_cm_wq)
1416 return -ENOMEM;
1418 return 0;
1421 void erdma_cm_exit(void)
1423 if (erdma_cm_wq)
1424 destroy_workqueue(erdma_cm_wq);