4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27 * This software is available to you under a choice of one of two
28 * licenses. You may choose to be licensed under the terms of the GNU
29 * General Public License (GPL) Version 2, available from the file
30 * COPYING in the main directory of this source tree, or the
31 * OpenIB.org BSD license below:
33 * Redistribution and use in source and binary forms, with or
34 * without modification, are permitted provided that the following
37 * - Redistributions of source code must retain the above
38 * copyright notice, this list of conditions and the following
41 * - Redistributions in binary form must reproduce the above
42 * copyright notice, this list of conditions and the following
43 * disclaimer in the documentation and/or other materials
44 * provided with the distribution.
46 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
47 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
48 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
49 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
50 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
51 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
52 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
57 * Sun elects to include this software in Sun product
58 * under the OpenIB BSD license.
61 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
62 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
64 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
65 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
66 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
67 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
68 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
69 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
70 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
71 * POSSIBILITY OF SUCH DAMAGE.
74 #include <sys/types.h>
76 #include <sys/sunddi.h>
77 #include <sys/ib/clients/rds/rdsib_cm.h>
78 #include <sys/ib/clients/rds/rdsib_ib.h>
79 #include <sys/ib/clients/rds/rdsib_buf.h>
80 #include <sys/ib/clients/rds/rdsib_ep.h>
81 #include <sys/ib/clients/rds/rds_kstat.h>
83 static void rds_async_handler(void *clntp
, ibt_hca_hdl_t hdl
,
84 ibt_async_code_t code
, ibt_async_event_t
*event
);
86 static struct ibt_clnt_modinfo_s rds_ib_modinfo
= {
94 /* performance tunables */
95 uint_t rds_no_interrupts
= 0;
96 uint_t rds_poll_percent_full
= 25;
97 uint_t rds_wc_signal
= IBT_NEXT_SOLICITED
;
98 uint_t rds_waittime_ms
= 100; /* ms */
100 extern dev_info_t
*rdsib_dev_info
;
101 extern void rds_close_sessions();
104 rdsib_validate_chan_sizes(ibt_hca_attr_t
*hattrp
)
106 /* The SQ size should not be more than that supported by the HCA */
107 if (((MaxDataSendBuffers
+ RDS_NUM_ACKS
) > hattrp
->hca_max_chan_sz
) ||
108 ((MaxDataSendBuffers
+ RDS_NUM_ACKS
) > hattrp
->hca_max_cq_sz
)) {
109 RDS_DPRINTF2("RDSIB", "MaxDataSendBuffers + %d is greater "
110 "than that supported by the HCA driver "
111 "(%d + %d > %d or %d), lowering it to a supported value.",
112 RDS_NUM_ACKS
, MaxDataSendBuffers
, RDS_NUM_ACKS
,
113 hattrp
->hca_max_chan_sz
, hattrp
->hca_max_cq_sz
);
115 MaxDataSendBuffers
= (hattrp
->hca_max_chan_sz
>
116 hattrp
->hca_max_cq_sz
) ?
117 hattrp
->hca_max_cq_sz
- RDS_NUM_ACKS
:
118 hattrp
->hca_max_chan_sz
- RDS_NUM_ACKS
;
121 /* The RQ size should not be more than that supported by the HCA */
122 if ((MaxDataRecvBuffers
> hattrp
->hca_max_chan_sz
) ||
123 (MaxDataRecvBuffers
> hattrp
->hca_max_cq_sz
)) {
124 RDS_DPRINTF2("RDSIB", "MaxDataRecvBuffers is greater than that "
125 "supported by the HCA driver (%d > %d or %d), lowering it "
126 "to a supported value.", MaxDataRecvBuffers
,
127 hattrp
->hca_max_chan_sz
, hattrp
->hca_max_cq_sz
);
129 MaxDataRecvBuffers
= (hattrp
->hca_max_chan_sz
>
130 hattrp
->hca_max_cq_sz
) ? hattrp
->hca_max_cq_sz
:
131 hattrp
->hca_max_chan_sz
;
134 /* The SQ size should not be more than that supported by the HCA */
135 if ((MaxCtrlSendBuffers
> hattrp
->hca_max_chan_sz
) ||
136 (MaxCtrlSendBuffers
> hattrp
->hca_max_cq_sz
)) {
137 RDS_DPRINTF2("RDSIB", "MaxCtrlSendBuffers is greater than that "
138 "supported by the HCA driver (%d > %d or %d), lowering it "
139 "to a supported value.", MaxCtrlSendBuffers
,
140 hattrp
->hca_max_chan_sz
, hattrp
->hca_max_cq_sz
);
142 MaxCtrlSendBuffers
= (hattrp
->hca_max_chan_sz
>
143 hattrp
->hca_max_cq_sz
) ? hattrp
->hca_max_cq_sz
:
144 hattrp
->hca_max_chan_sz
;
147 /* The RQ size should not be more than that supported by the HCA */
148 if ((MaxCtrlRecvBuffers
> hattrp
->hca_max_chan_sz
) ||
149 (MaxCtrlRecvBuffers
> hattrp
->hca_max_cq_sz
)) {
150 RDS_DPRINTF2("RDSIB", "MaxCtrlRecvBuffers is greater than that "
151 "supported by the HCA driver (%d > %d or %d), lowering it "
152 "to a supported value.", MaxCtrlRecvBuffers
,
153 hattrp
->hca_max_chan_sz
, hattrp
->hca_max_cq_sz
);
155 MaxCtrlRecvBuffers
= (hattrp
->hca_max_chan_sz
>
156 hattrp
->hca_max_cq_sz
) ? hattrp
->hca_max_cq_sz
:
157 hattrp
->hca_max_chan_sz
;
160 /* The MaxRecvMemory should be less than that supported by the HCA */
161 if ((NDataRX
* RdsPktSize
) > hattrp
->hca_max_memr_len
) {
162 RDS_DPRINTF2("RDSIB", "MaxRecvMemory is greater than that "
163 "supported by the HCA driver (%d > %d), lowering it to %d",
164 NDataRX
* RdsPktSize
, hattrp
->hca_max_memr_len
,
165 hattrp
->hca_max_memr_len
);
167 NDataRX
= hattrp
->hca_max_memr_len
/RdsPktSize
;
171 /* Return hcap, given the hca guid */
173 rds_lkup_hca(ib_guid_t hca_guid
)
177 RDS_DPRINTF4("rds_lkup_hca", "Enter: statep: 0x%p "
178 "guid: %llx", rdsib_statep
, hca_guid
);
180 rw_enter(&rdsib_statep
->rds_hca_lock
, RW_READER
);
182 hcap
= rdsib_statep
->rds_hcalistp
;
183 while ((hcap
!= NULL
) && (hcap
->hca_guid
!= hca_guid
)) {
184 hcap
= hcap
->hca_nextp
;
187 rw_exit(&rdsib_statep
->rds_hca_lock
);
189 RDS_DPRINTF4("rds_lkup_hca", "return");
194 void rds_randomize_qps(rds_hca_t
*hcap
);
197 rdsib_init_hca(ib_guid_t hca_guid
)
200 boolean_t alloc
= B_FALSE
;
203 RDS_DPRINTF2("rdsib_init_hca", "enter: HCA 0x%llx", hca_guid
);
205 /* Do a HCA lookup */
206 hcap
= rds_lkup_hca(hca_guid
);
208 if (hcap
!= NULL
&& hcap
->hca_hdl
!= NULL
) {
210 * This can happen if we get IBT_HCA_ATTACH_EVENT on an HCA
211 * that we have already opened. Just return NULL so that
212 * we'll not end up reinitializing the HCA again.
214 RDS_DPRINTF2("rdsib_init_hca", "HCA already initialized");
219 RDS_DPRINTF2("rdsib_init_hca", "New HCA is added");
220 hcap
= (rds_hca_t
*)kmem_zalloc(sizeof (rds_hca_t
), KM_SLEEP
);
224 hcap
->hca_guid
= hca_guid
;
225 ret
= ibt_open_hca(rdsib_statep
->rds_ibhdl
, hca_guid
,
227 if (ret
!= IBT_SUCCESS
) {
228 if (ret
== IBT_HCA_IN_USE
) {
229 RDS_DPRINTF2("rdsib_init_hca",
230 "ibt_open_hca: 0x%llx returned IBT_HCA_IN_USE",
233 RDS_DPRINTF2("rdsib_init_hca",
234 "ibt_open_hca: 0x%llx failed: %d", hca_guid
, ret
);
236 if (alloc
== B_TRUE
) {
237 kmem_free(hcap
, sizeof (rds_hca_t
));
242 ret
= ibt_query_hca(hcap
->hca_hdl
, &hcap
->hca_attr
);
243 if (ret
!= IBT_SUCCESS
) {
244 RDS_DPRINTF2("rdsib_init_hca",
245 "Query HCA: 0x%llx failed: %d", hca_guid
, ret
);
246 ret
= ibt_close_hca(hcap
->hca_hdl
);
247 ASSERT(ret
== IBT_SUCCESS
);
248 if (alloc
== B_TRUE
) {
249 kmem_free(hcap
, sizeof (rds_hca_t
));
251 hcap
->hca_hdl
= NULL
;
256 ret
= ibt_query_hca_ports(hcap
->hca_hdl
, 0,
257 &hcap
->hca_pinfop
, &hcap
->hca_nports
, &hcap
->hca_pinfo_sz
);
258 if (ret
!= IBT_SUCCESS
) {
259 RDS_DPRINTF2("rdsib_init_hca",
260 "Query HCA 0x%llx ports failed: %d", hca_guid
,
262 ret
= ibt_close_hca(hcap
->hca_hdl
);
263 hcap
->hca_hdl
= NULL
;
264 ASSERT(ret
== IBT_SUCCESS
);
265 if (alloc
== B_TRUE
) {
266 kmem_free(hcap
, sizeof (rds_hca_t
));
268 hcap
->hca_hdl
= NULL
;
273 /* Only one PD per HCA is allocated, so do it here */
274 ret
= ibt_alloc_pd(hcap
->hca_hdl
, IBT_PD_NO_FLAGS
,
276 if (ret
!= IBT_SUCCESS
) {
277 RDS_DPRINTF2("rdsib_init_hca",
278 "ibt_alloc_pd 0x%llx failed: %d", hca_guid
, ret
);
279 (void) ibt_free_portinfo(hcap
->hca_pinfop
,
281 ret
= ibt_close_hca(hcap
->hca_hdl
);
282 ASSERT(ret
== IBT_SUCCESS
);
283 hcap
->hca_hdl
= NULL
;
284 if (alloc
== B_TRUE
) {
285 kmem_free(hcap
, sizeof (rds_hca_t
));
287 hcap
->hca_hdl
= NULL
;
292 rdsib_validate_chan_sizes(&hcap
->hca_attr
);
294 /* To minimize stale connections after ungraceful reboots */
295 rds_randomize_qps(hcap
);
297 rw_enter(&rdsib_statep
->rds_hca_lock
, RW_WRITER
);
298 hcap
->hca_state
= RDS_HCA_STATE_OPEN
;
299 if (alloc
== B_TRUE
) {
300 /* this is a new HCA, add it to the list */
301 rdsib_statep
->rds_nhcas
++;
302 hcap
->hca_nextp
= rdsib_statep
->rds_hcalistp
;
303 rdsib_statep
->rds_hcalistp
= hcap
;
305 rw_exit(&rdsib_statep
->rds_hca_lock
);
307 RDS_DPRINTF2("rdsib_init_hca", "return: HCA 0x%llx", hca_guid
);
316 rdsib_initialize_ib()
320 uint_t ix
, hcaix
, nhcas
;
323 RDS_DPRINTF2("rdsib_initialize_ib", "enter: statep %p", rdsib_statep
);
325 ASSERT(rdsib_statep
!= NULL
);
326 if (rdsib_statep
== NULL
) {
327 RDS_DPRINTF1("rdsib_initialize_ib",
328 "RDS Statep not initialized");
332 /* How many hcas are there? */
333 nhcas
= ibt_get_hca_list(&guidp
);
335 RDS_DPRINTF2("rdsib_initialize_ib", "No IB HCAs Available");
339 RDS_DPRINTF3("rdsib_initialize_ib", "Number of HCAs: %d", nhcas
);
341 /* Register with IBTF */
342 ret
= ibt_attach(&rds_ib_modinfo
, rdsib_dev_info
, rdsib_statep
,
343 &rdsib_statep
->rds_ibhdl
);
344 if (ret
!= IBT_SUCCESS
) {
345 RDS_DPRINTF2("rdsib_initialize_ib", "ibt_attach failed: %d",
347 (void) ibt_free_hca_list(guidp
, nhcas
);
352 * Open each HCA and gather its information. Don't care about HCAs
353 * that cannot be opened. It is OK as long as atleast one HCA can be
355 * Initialize a HCA only if all the information is available.
357 for (ix
= 0, hcaix
= 0; ix
< nhcas
; ix
++) {
358 RDS_DPRINTF3(LABEL
, "Open HCA: 0x%llx", guidp
[ix
]);
360 hcap
= rdsib_init_hca(guidp
[ix
]);
361 if (hcap
!= NULL
) hcaix
++;
364 /* free the HCA list, we are done with it */
365 (void) ibt_free_hca_list(guidp
, nhcas
);
368 /* Failed to Initialize even one HCA */
369 RDS_DPRINTF2("rdsib_initialize_ib", "No HCAs are initialized");
370 (void) ibt_detach(rdsib_statep
->rds_ibhdl
);
371 rdsib_statep
->rds_ibhdl
= NULL
;
376 RDS_DPRINTF2("rdsib_open_ib", "HCAs %d/%d failed to initialize",
377 (nhcas
- hcaix
), nhcas
);
380 RDS_DPRINTF2("rdsib_initialize_ib", "return: statep %p", rdsib_statep
);
389 rdsib_deinitialize_ib()
391 rds_hca_t
*hcap
, *nextp
;
394 RDS_DPRINTF2("rdsib_deinitialize_ib", "enter: statep %p", rdsib_statep
);
396 /* close and destroy all the sessions */
397 rds_close_sessions(NULL
);
399 /* Release all HCA resources */
400 rw_enter(&rdsib_statep
->rds_hca_lock
, RW_WRITER
);
401 RDS_DPRINTF2("rdsib_deinitialize_ib", "HCA List: %p, NHCA: %d",
402 rdsib_statep
->rds_hcalistp
, rdsib_statep
->rds_nhcas
);
403 hcap
= rdsib_statep
->rds_hcalistp
;
404 rdsib_statep
->rds_hcalistp
= NULL
;
405 rdsib_statep
->rds_nhcas
= 0;
406 rw_exit(&rdsib_statep
->rds_hca_lock
);
408 while (hcap
!= NULL
) {
409 nextp
= hcap
->hca_nextp
;
411 if (hcap
->hca_hdl
!= NULL
) {
412 ret
= ibt_free_pd(hcap
->hca_hdl
, hcap
->hca_pdhdl
);
413 ASSERT(ret
== IBT_SUCCESS
);
415 (void) ibt_free_portinfo(hcap
->hca_pinfop
,
418 ret
= ibt_close_hca(hcap
->hca_hdl
);
419 ASSERT(ret
== IBT_SUCCESS
);
422 kmem_free(hcap
, sizeof (rds_hca_t
));
426 /* Deregister with IBTF */
427 if (rdsib_statep
->rds_ibhdl
!= NULL
) {
428 (void) ibt_detach(rdsib_statep
->rds_ibhdl
);
429 rdsib_statep
->rds_ibhdl
= NULL
;
432 RDS_DPRINTF2("rdsib_deinitialize_ib", "return: statep %p",
437 * Called on open of first RDS socket
444 RDS_DPRINTF2("rdsib_open_ib", "enter: statep %p", rdsib_statep
);
446 /* Enable incoming connection requests */
447 if (rdsib_statep
->rds_srvhdl
== NULL
) {
448 rdsib_statep
->rds_srvhdl
=
449 rds_register_service(rdsib_statep
->rds_ibhdl
);
450 if (rdsib_statep
->rds_srvhdl
== NULL
) {
451 RDS_DPRINTF2("rdsib_open_ib",
452 "Service registration failed");
455 /* bind the service on all available ports */
456 ret
= rds_bind_service(rdsib_statep
);
458 RDS_DPRINTF2("rdsib_open_ib",
459 "Bind service failed: %d", ret
);
464 RDS_DPRINTF2("rdsib_open_ib", "return: statep %p", rdsib_statep
);
470 * Called when all ports are closed.
477 RDS_DPRINTF2("rdsib_close_ib", "enter: statep %p", rdsib_statep
);
479 /* Disable incoming connection requests */
480 if (rdsib_statep
->rds_srvhdl
!= NULL
) {
481 ret
= ibt_unbind_all_services(rdsib_statep
->rds_srvhdl
);
483 RDS_DPRINTF2("rdsib_close_ib",
484 "ibt_unbind_all_services failed: %d\n", ret
);
486 ret
= ibt_deregister_service(rdsib_statep
->rds_ibhdl
,
487 rdsib_statep
->rds_srvhdl
);
489 RDS_DPRINTF2("rdsib_close_ib",
490 "ibt_deregister_service failed: %d\n", ret
);
492 rdsib_statep
->rds_srvhdl
= NULL
;
496 RDS_DPRINTF2("rdsib_close_ib", "return: statep %p", rdsib_statep
);
499 /* Return hcap, given the hca guid */
501 rds_get_hcap(rds_state_t
*statep
, ib_guid_t hca_guid
)
505 RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: Enter: statep: 0x%p "
506 "guid: %llx", statep
, hca_guid
);
508 rw_enter(&statep
->rds_hca_lock
, RW_READER
);
510 hcap
= statep
->rds_hcalistp
;
511 while ((hcap
!= NULL
) && (hcap
->hca_guid
!= hca_guid
)) {
512 hcap
= hcap
->hca_nextp
;
516 * don't let anyone use this HCA until the RECV memory
517 * is registered with this HCA
519 if ((hcap
!= NULL
) &&
520 (hcap
->hca_state
== RDS_HCA_STATE_MEM_REGISTERED
)) {
521 ASSERT(hcap
->hca_mrhdl
!= NULL
);
522 rw_exit(&statep
->rds_hca_lock
);
526 RDS_DPRINTF2("rds_get_hcap",
527 "HCA (0x%p, 0x%llx) is not initialized", hcap
, hca_guid
);
528 rw_exit(&statep
->rds_hca_lock
);
530 RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: return");
535 /* Return hcap, given a gid */
537 rds_gid_to_hcap(rds_state_t
*statep
, ib_gid_t gid
)
542 RDS_DPRINTF4("rds_gid_to_hcap", "Enter: statep: 0x%p gid: %llx:%llx",
543 statep
, gid
.gid_prefix
, gid
.gid_guid
);
545 rw_enter(&statep
->rds_hca_lock
, RW_READER
);
547 hcap
= statep
->rds_hcalistp
;
548 while (hcap
!= NULL
) {
551 * don't let anyone use this HCA until the RECV memory
552 * is registered with this HCA
554 if (hcap
->hca_state
!= RDS_HCA_STATE_MEM_REGISTERED
) {
555 RDS_DPRINTF3("rds_gid_to_hcap",
556 "HCA (0x%p, 0x%llx) is not initialized",
558 hcap
= hcap
->hca_nextp
;
562 for (ix
= 0; ix
< hcap
->hca_nports
; ix
++) {
563 if ((hcap
->hca_pinfop
[ix
].p_sgid_tbl
[0].gid_prefix
==
565 (hcap
->hca_pinfop
[ix
].p_sgid_tbl
[0].gid_guid
==
567 RDS_DPRINTF4("rds_gid_to_hcap",
568 "gid found in hcap: 0x%p", hcap
);
569 rw_exit(&statep
->rds_hca_lock
);
573 hcap
= hcap
->hca_nextp
;
576 rw_exit(&statep
->rds_hca_lock
);
581 /* This is called from the send CQ handler */
583 rds_send_acknowledgement(rds_ep_t
*ep
)
588 RDS_DPRINTF4("rds_send_acknowledgement", "Enter EP(%p)", ep
);
590 mutex_enter(&ep
->ep_lock
);
592 ASSERT(ep
->ep_rdmacnt
!= 0);
595 * The previous ACK completed successfully, send the next one
596 * if more messages were received after sending the last ACK
598 if (ep
->ep_rbufid
!= *(uintptr_t *)(uintptr_t)ep
->ep_ackds
.ds_va
) {
599 *(uintptr_t *)(uintptr_t)ep
->ep_ackds
.ds_va
= ep
->ep_rbufid
;
600 mutex_exit(&ep
->ep_lock
);
602 /* send acknowledgement */
604 ret
= ibt_post_send(ep
->ep_chanhdl
, &ep
->ep_ackwr
, 1, &ix
);
605 if (ret
!= IBT_SUCCESS
) {
606 RDS_DPRINTF2("rds_send_acknowledgement",
607 "EP(%p): ibt_post_send for acknowledgement "
608 "failed: %d, SQ depth: %d",
609 ep
, ret
, ep
->ep_sndpool
.pool_nbusy
);
610 mutex_enter(&ep
->ep_lock
);
612 mutex_exit(&ep
->ep_lock
);
615 /* ACKed all messages, no more to ACK */
617 mutex_exit(&ep
->ep_lock
);
621 RDS_DPRINTF4("rds_send_acknowledgement", "Return EP(%p)", ep
);
625 rds_poll_ctrl_completions(ibt_cq_hdl_t cq
, rds_ep_t
*ep
)
630 rds_ctrl_pkt_t
*cpkt
;
632 int ret
= IBT_SUCCESS
;
634 RDS_DPRINTF4("rds_poll_ctrl_completions", "Enter: EP(%p)", ep
);
636 bzero(&wc
, sizeof (ibt_wc_t
));
637 ret
= ibt_poll_cq(cq
, &wc
, 1, &npolled
);
638 if (ret
!= IBT_SUCCESS
) {
639 if (ret
!= IBT_CQ_EMPTY
) {
640 RDS_DPRINTF2(LABEL
, "EP(%p) CQ(%p): ibt_poll_cq "
641 "returned: %d", ep
, cq
, ret
);
643 RDS_DPRINTF5(LABEL
, "EP(%p) CQ(%p): ibt_poll_cq "
644 "returned: IBT_CQ_EMPTY", ep
, cq
);
649 bp
= (rds_buf_t
*)(uintptr_t)wc
.wc_id
;
651 if (wc
.wc_status
!= IBT_WC_SUCCESS
) {
652 mutex_enter(&ep
->ep_recvqp
.qp_lock
);
653 ep
->ep_recvqp
.qp_level
--;
654 mutex_exit(&ep
->ep_recvqp
.qp_lock
);
656 /* Free the buffer */
657 bp
->buf_state
= RDS_RCVBUF_FREE
;
658 rds_free_recv_buf(bp
, 1);
660 /* Receive completion failure */
661 if (wc
.wc_status
!= IBT_WC_WR_FLUSHED_ERR
) {
662 RDS_DPRINTF2("rds_poll_ctrl_completions",
663 "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
664 ep
, cq
, wc
.wc_id
, wc
.wc_status
);
669 /* there is one less in the RQ */
670 recvqp
= &ep
->ep_recvqp
;
671 mutex_enter(&recvqp
->qp_lock
);
673 if ((recvqp
->qp_taskqpending
== B_FALSE
) &&
674 (recvqp
->qp_level
<= recvqp
->qp_lwm
)) {
675 /* Time to post more buffers into the RQ */
676 recvqp
->qp_taskqpending
= B_TRUE
;
677 mutex_exit(&recvqp
->qp_lock
);
679 ret
= ddi_taskq_dispatch(rds_taskq
,
680 rds_post_recv_buf
, (void *)ep
->ep_chanhdl
, DDI_NOSLEEP
);
681 if (ret
!= DDI_SUCCESS
) {
682 RDS_DPRINTF2(LABEL
, "ddi_taskq_dispatch failed: %d",
684 mutex_enter(&recvqp
->qp_lock
);
685 recvqp
->qp_taskqpending
= B_FALSE
;
686 mutex_exit(&recvqp
->qp_lock
);
689 mutex_exit(&recvqp
->qp_lock
);
692 cpkt
= (rds_ctrl_pkt_t
*)(uintptr_t)bp
->buf_ds
.ds_va
;
693 rds_handle_control_message(ep
->ep_sp
, cpkt
);
695 bp
->buf_state
= RDS_RCVBUF_FREE
;
696 rds_free_recv_buf(bp
, 1);
698 RDS_DPRINTF4("rds_poll_ctrl_completions", "Return: EP(%p)", ep
);
703 #define RDS_POST_FEW_ATATIME 100
704 /* Post recv WRs into the RQ. Assumes the ep->refcnt is already incremented */
706 rds_post_recv_buf(void *arg
)
708 ibt_channel_hdl_t chanhdl
;
714 ibt_recv_wr_t
*wrp
, wr
[RDS_POST_FEW_ATATIME
];
716 uint_t npost
, nspace
, rcv_len
;
720 chanhdl
= (ibt_channel_hdl_t
)arg
;
721 RDS_DPRINTF4("rds_post_recv_buf", "Enter: CHAN(%p)", chanhdl
);
722 RDS_INCR_POST_RCV_BUF_CALLS();
724 ep
= (rds_ep_t
*)ibt_get_chan_private(chanhdl
);
727 recvqp
= &ep
->ep_recvqp
;
729 RDS_DPRINTF5("rds_post_recv_buf", "EP(%p)", ep
);
731 /* get the hcap for the HCA hosting this channel */
732 hcap
= rds_lkup_hca(ep
->ep_hca_guid
);
734 RDS_DPRINTF2("rds_post_recv_buf", "HCA (0x%llx) not found",
739 /* Make sure the session is still connected */
740 rw_enter(&sp
->session_lock
, RW_READER
);
741 if ((sp
->session_state
!= RDS_SESSION_STATE_INIT
) &&
742 (sp
->session_state
!= RDS_SESSION_STATE_CONNECTED
) &&
743 (sp
->session_state
!= RDS_SESSION_STATE_HCA_CLOSING
)) {
744 RDS_DPRINTF2("rds_post_recv_buf", "EP(%p): Session is not "
745 "in active state (%d)", ep
, sp
->session_state
);
746 rw_exit(&sp
->session_lock
);
749 rw_exit(&sp
->session_lock
);
751 /* how many can be posted */
752 mutex_enter(&recvqp
->qp_lock
);
753 nspace
= recvqp
->qp_depth
- recvqp
->qp_level
;
755 RDS_DPRINTF2("rds_post_recv_buf", "RQ is FULL");
756 recvqp
->qp_taskqpending
= B_FALSE
;
757 mutex_exit(&recvqp
->qp_lock
);
760 mutex_exit(&recvqp
->qp_lock
);
762 if (ep
->ep_type
== RDS_EP_TYPE_DATA
) {
764 rcv_len
= RdsPktSize
;
767 rcv_len
= RDS_CTRLPKT_SIZE
;
770 bp
= rds_get_buf(gp
, nspace
, &jx
);
772 RDS_DPRINTF2(LABEL
, "EP(%p): No Recv buffers available", ep
);
773 /* try again later */
774 ret
= ddi_taskq_dispatch(rds_taskq
, rds_post_recv_buf
,
775 (void *)chanhdl
, DDI_NOSLEEP
);
776 if (ret
!= DDI_SUCCESS
) {
777 RDS_DPRINTF2(LABEL
, "ddi_taskq_dispatch failed: %d",
779 mutex_enter(&recvqp
->qp_lock
);
780 recvqp
->qp_taskqpending
= B_FALSE
;
781 mutex_exit(&recvqp
->qp_lock
);
787 RDS_DPRINTF2(LABEL
, "EP(%p): Recv buffers "
788 "needed: %d available: %d", ep
, nspace
, jx
);
793 for (ix
= 0; ix
< nspace
; ix
++) {
795 ASSERT(bp1
->buf_state
== RDS_RCVBUF_FREE
);
796 bp1
->buf_state
= RDS_RCVBUF_POSTED
;
797 bp1
->buf_ds
.ds_key
= hcap
->hca_lkey
;
798 bp1
->buf_ds
.ds_len
= rcv_len
;
799 bp1
= bp1
->buf_nextp
;
803 wrp
= kmem_zalloc(RDS_POST_FEW_ATATIME
* sizeof (ibt_recv_wr_t
),
811 jx
= (npost
> RDS_POST_FEW_ATATIME
) ?
812 RDS_POST_FEW_ATATIME
: npost
;
813 for (ix
= 0; ix
< jx
; ix
++) {
814 wrp
[ix
].wr_id
= (uintptr_t)bp
;
816 wrp
[ix
].wr_sgl
= &bp
->buf_ds
;
820 ret
= ibt_post_recv(chanhdl
, wrp
, jx
, &kx
);
821 if ((ret
!= IBT_SUCCESS
) || (kx
!= jx
)) {
822 RDS_DPRINTF2(LABEL
, "ibt_post_recv for %d WRs failed: "
831 mutex_enter(&recvqp
->qp_lock
);
833 RDS_DPRINTF2("rds_post_recv_buf",
834 "EP(%p) Failed to post %d WRs", ep
, npost
);
835 recvqp
->qp_level
+= (nspace
- npost
);
837 recvqp
->qp_level
+= nspace
;
841 * sometimes, the recv WRs can get consumed as soon as they are
842 * posted. In that case, taskq thread to post more WRs to the RQ will
843 * not be scheduled as the taskqpending flag is still set.
845 if (recvqp
->qp_level
== 0) {
846 mutex_exit(&recvqp
->qp_lock
);
847 ret
= ddi_taskq_dispatch(rds_taskq
,
848 rds_post_recv_buf
, (void *)chanhdl
, DDI_NOSLEEP
);
849 if (ret
!= DDI_SUCCESS
) {
850 RDS_DPRINTF2("rds_post_recv_buf",
851 "ddi_taskq_dispatch failed: %d", ret
);
852 mutex_enter(&recvqp
->qp_lock
);
853 recvqp
->qp_taskqpending
= B_FALSE
;
854 mutex_exit(&recvqp
->qp_lock
);
857 recvqp
->qp_taskqpending
= B_FALSE
;
858 mutex_exit(&recvqp
->qp_lock
);
862 kmem_free(wrp
, RDS_POST_FEW_ATATIME
* sizeof (ibt_recv_wr_t
));
865 RDS_DPRINTF4("rds_post_recv_buf", "Return: EP(%p)", ep
);
869 rds_poll_data_completions(ibt_cq_hdl_t cq
, rds_ep_t
*ep
)
873 rds_data_hdr_t
*pktp
;
876 int ret
= IBT_SUCCESS
;
879 RDS_DPRINTF4("rds_poll_data_completions", "Enter: EP(%p)", ep
);
881 bzero(&wc
, sizeof (ibt_wc_t
));
882 ret
= ibt_poll_cq(cq
, &wc
, 1, &npolled
);
883 if (ret
!= IBT_SUCCESS
) {
884 if (ret
!= IBT_CQ_EMPTY
) {
885 RDS_DPRINTF2(LABEL
, "EP(%p) CQ(%p): ibt_poll_cq "
886 "returned: %d", ep
, cq
, ret
);
888 RDS_DPRINTF5(LABEL
, "EP(%p) CQ(%p): ibt_poll_cq "
889 "returned: IBT_CQ_EMPTY", ep
, cq
);
894 bp
= (rds_buf_t
*)(uintptr_t)wc
.wc_id
;
895 ASSERT(bp
->buf_state
== RDS_RCVBUF_POSTED
);
896 bp
->buf_state
= RDS_RCVBUF_ONSOCKQ
;
897 bp
->buf_nextp
= NULL
;
899 if (wc
.wc_status
!= IBT_WC_SUCCESS
) {
900 mutex_enter(&ep
->ep_recvqp
.qp_lock
);
901 ep
->ep_recvqp
.qp_level
--;
902 mutex_exit(&ep
->ep_recvqp
.qp_lock
);
904 /* free the buffer */
905 bp
->buf_state
= RDS_RCVBUF_FREE
;
906 rds_free_recv_buf(bp
, 1);
908 /* Receive completion failure */
909 if (wc
.wc_status
!= IBT_WC_WR_FLUSHED_ERR
) {
910 RDS_DPRINTF2("rds_poll_data_completions",
911 "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
912 ep
, cq
, wc
.wc_id
, wc
.wc_status
);
918 /* there is one less in the RQ */
919 recvqp
= &ep
->ep_recvqp
;
920 mutex_enter(&recvqp
->qp_lock
);
922 if ((recvqp
->qp_taskqpending
== B_FALSE
) &&
923 (recvqp
->qp_level
<= recvqp
->qp_lwm
)) {
924 /* Time to post more buffers into the RQ */
925 recvqp
->qp_taskqpending
= B_TRUE
;
926 mutex_exit(&recvqp
->qp_lock
);
928 ret
= ddi_taskq_dispatch(rds_taskq
,
929 rds_post_recv_buf
, (void *)ep
->ep_chanhdl
, DDI_NOSLEEP
);
930 if (ret
!= DDI_SUCCESS
) {
931 RDS_DPRINTF2(LABEL
, "ddi_taskq_dispatch failed: %d",
933 mutex_enter(&recvqp
->qp_lock
);
934 recvqp
->qp_taskqpending
= B_FALSE
;
935 mutex_exit(&recvqp
->qp_lock
);
938 mutex_exit(&recvqp
->qp_lock
);
941 pktp
= (rds_data_hdr_t
*)(uintptr_t)bp
->buf_ds
.ds_va
;
942 ASSERT(pktp
->dh_datalen
!= 0);
944 RDS_DPRINTF5(LABEL
, "Message Received: sendIP: 0x%x recvIP: 0x%x "
945 "sendport: %d recvport: %d npkts: %d pktno: %d", ep
->ep_remip
,
946 ep
->ep_myip
, pktp
->dh_sendport
, pktp
->dh_recvport
,
947 pktp
->dh_npkts
, pktp
->dh_psn
);
949 RDS_DPRINTF3(LABEL
, "BP(%p): npkts: %d psn: %d", bp
,
950 pktp
->dh_npkts
, pktp
->dh_psn
);
952 if (pktp
->dh_npkts
== 1) {
953 /* single pkt or last packet */
954 if (pktp
->dh_psn
!= 0) {
955 /* last packet of a segmented message */
956 ASSERT(ep
->ep_seglbp
!= NULL
);
957 ep
->ep_seglbp
->buf_nextp
= bp
;
959 rds_received_msg(ep
, ep
->ep_segfbp
);
960 ep
->ep_segfbp
= NULL
;
961 ep
->ep_seglbp
= NULL
;
964 rds_received_msg(ep
, bp
);
968 if (pktp
->dh_psn
== 0) {
970 ASSERT(ep
->ep_segfbp
== NULL
);
974 /* intermediate packet */
975 ASSERT(ep
->ep_segfbp
!= NULL
);
976 ep
->ep_seglbp
->buf_nextp
= bp
;
981 RDS_DPRINTF4("rds_poll_data_completions", "Return: EP(%p)", ep
);
987 rds_recvcq_handler(ibt_cq_hdl_t cq
, void *arg
)
990 int ret
= IBT_SUCCESS
;
991 int (*func
)(ibt_cq_hdl_t
, rds_ep_t
*);
993 ep
= (rds_ep_t
*)arg
;
995 RDS_DPRINTF4("rds_recvcq_handler", "enter: EP(%p)", ep
);
997 if (ep
->ep_type
== RDS_EP_TYPE_DATA
) {
998 func
= rds_poll_data_completions
;
1000 func
= rds_poll_ctrl_completions
;
1005 } while (ret
!= IBT_CQ_EMPTY
);
1008 ret
= ibt_enable_cq_notify(cq
, rds_wc_signal
);
1009 if (ret
!= IBT_SUCCESS
) {
1010 RDS_DPRINTF2(LABEL
, "EP(%p) CQ(%p): ibt_enable_cq_notify "
1011 "failed: %d", ep
, cq
, ret
);
1017 } while (ret
!= IBT_CQ_EMPTY
);
1019 RDS_DPRINTF4("rds_recvcq_handler", "Return: EP(%p)", ep
);
1023 rds_poll_send_completions(ibt_cq_hdl_t cq
, rds_ep_t
*ep
, boolean_t lock
)
1025 ibt_wc_t wc
[RDS_NUM_DATA_SEND_WCS
];
1026 uint_t npolled
, nret
, send_error
= 0;
1027 rds_buf_t
*headp
, *tailp
, *bp
;
1030 RDS_DPRINTF4("rds_poll_send_completions", "Enter EP(%p)", ep
);
1036 ret
= ibt_poll_cq(cq
, wc
, RDS_NUM_DATA_SEND_WCS
, &nret
);
1037 if (ret
!= IBT_SUCCESS
) {
1038 if (ret
!= IBT_CQ_EMPTY
) {
1039 RDS_DPRINTF2(LABEL
, "EP(%p) CQ(%p): "
1040 "ibt_poll_cq returned: %d", ep
, cq
, ret
);
1042 RDS_DPRINTF5(LABEL
, "EP(%p) CQ(%p): "
1043 "ibt_poll_cq returned: IBT_CQ_EMPTY",
1050 for (ix
= 0; ix
< nret
; ix
++) {
1051 if (wc
[ix
].wc_status
== IBT_WC_SUCCESS
) {
1052 if (wc
[ix
].wc_type
== IBT_WRC_RDMAW
) {
1053 rds_send_acknowledgement(ep
);
1057 bp
= (rds_buf_t
*)(uintptr_t)wc
[ix
].wc_id
;
1058 ASSERT(bp
->buf_state
== RDS_SNDBUF_PENDING
);
1059 bp
->buf_state
= RDS_SNDBUF_FREE
;
1060 } else if (wc
[ix
].wc_status
== IBT_WC_WR_FLUSHED_ERR
) {
1062 RDS_DPRINTF5("rds_poll_send_completions",
1063 "EP(%p): WC ID: %p ERROR: %d", ep
,
1064 wc
[ix
].wc_id
, wc
[ix
].wc_status
);
1068 if (wc
[ix
].wc_id
== RDS_RDMAW_WRID
) {
1069 mutex_enter(&ep
->ep_lock
);
1071 mutex_exit(&ep
->ep_lock
);
1075 bp
= (rds_buf_t
*)(uintptr_t)wc
[ix
].wc_id
;
1076 ASSERT(bp
->buf_state
== RDS_SNDBUF_PENDING
);
1077 bp
->buf_state
= RDS_SNDBUF_FREE
;
1080 RDS_DPRINTF2("rds_poll_send_completions",
1081 "EP(%p): WC ID: %p ERROR: %d", ep
,
1082 wc
[ix
].wc_id
, wc
[ix
].wc_status
);
1083 if (send_error
== 0) {
1084 rds_session_t
*sp
= ep
->ep_sp
;
1086 /* don't let anyone send anymore */
1087 rw_enter(&sp
->session_lock
, RW_WRITER
);
1088 if (sp
->session_state
!=
1089 RDS_SESSION_STATE_ERROR
) {
1091 RDS_SESSION_STATE_ERROR
;
1092 /* Make this the active end */
1096 rw_exit(&sp
->session_lock
);
1101 if (wc
[ix
].wc_id
== RDS_RDMAW_WRID
) {
1102 mutex_enter(&ep
->ep_lock
);
1104 mutex_exit(&ep
->ep_lock
);
1108 bp
= (rds_buf_t
*)(uintptr_t)wc
[ix
].wc_id
;
1109 ASSERT(bp
->buf_state
== RDS_SNDBUF_PENDING
);
1110 bp
->buf_state
= RDS_SNDBUF_FREE
;
1113 bp
->buf_nextp
= NULL
;
1115 tailp
->buf_nextp
= bp
;
1125 if (rds_no_interrupts
&& (npolled
> 100)) {
1129 if (rds_no_interrupts
== 1) {
1132 } while (ret
!= IBT_CQ_EMPTY
);
1134 RDS_DPRINTF5("rds_poll_send_completions", "Npolled: %d send_error: %d",
1135 npolled
, send_error
);
1137 /* put the buffers to the pool */
1139 rds_free_send_buf(ep
, headp
, tailp
, npolled
, lock
);
1142 if (send_error
!= 0) {
1143 rds_handle_send_error(ep
);
1146 RDS_DPRINTF4("rds_poll_send_completions", "Return EP(%p)", ep
);
1150 rds_sendcq_handler(ibt_cq_hdl_t cq
, void *arg
)
1155 ep
= (rds_ep_t
*)arg
;
1157 RDS_DPRINTF4("rds_sendcq_handler", "Enter: EP(%p)", ep
);
1160 ret
= ibt_enable_cq_notify(cq
, IBT_NEXT_COMPLETION
);
1161 if (ret
!= IBT_SUCCESS
) {
1162 RDS_DPRINTF2(LABEL
, "EP(%p) CQ(%p): ibt_enable_cq_notify "
1163 "failed: %d", ep
, cq
, ret
);
1167 rds_poll_send_completions(cq
, ep
, B_FALSE
);
1169 RDS_DPRINTF4("rds_sendcq_handler", "Return: EP(%p)", ep
);
1173 rds_ep_free_rc_channel(rds_ep_t
*ep
)
1177 RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Enter", ep
);
1179 ASSERT(mutex_owned(&ep
->ep_lock
));
1182 if (ep
->ep_chanhdl
!= NULL
) {
1183 /* wait until the RQ is empty */
1184 (void) ibt_flush_channel(ep
->ep_chanhdl
);
1185 (void) rds_is_recvq_empty(ep
, B_TRUE
);
1186 ret
= ibt_free_channel(ep
->ep_chanhdl
);
1187 if (ret
!= IBT_SUCCESS
) {
1188 RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) "
1189 "ibt_free_channel returned: %d", ep
, ret
);
1191 ep
->ep_chanhdl
= NULL
;
1193 RDS_DPRINTF2("rds_ep_free_rc_channel",
1194 "EP(%p) Channel is ALREADY FREE", ep
);
1197 /* free the Send CQ */
1198 if (ep
->ep_sendcq
!= NULL
) {
1199 ret
= ibt_free_cq(ep
->ep_sendcq
);
1200 if (ret
!= IBT_SUCCESS
) {
1201 RDS_DPRINTF2("rds_ep_free_rc_channel",
1202 "EP(%p) - for sendcq, ibt_free_cq returned %d",
1205 ep
->ep_sendcq
= NULL
;
1207 RDS_DPRINTF2("rds_ep_free_rc_channel",
1208 "EP(%p) SendCQ is ALREADY FREE", ep
);
1211 /* free the Recv CQ */
1212 if (ep
->ep_recvcq
!= NULL
) {
1213 ret
= ibt_free_cq(ep
->ep_recvcq
);
1214 if (ret
!= IBT_SUCCESS
) {
1215 RDS_DPRINTF2("rds_ep_free_rc_channel",
1216 "EP(%p) - for recvcq, ibt_free_cq returned %d",
1219 ep
->ep_recvcq
= NULL
;
1221 RDS_DPRINTF2("rds_ep_free_rc_channel",
1222 "EP(%p) RecvCQ is ALREADY FREE", ep
);
1225 RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Return", ep
);
1228 /* Allocate resources for RC channel */
1230 rds_ep_alloc_rc_channel(rds_ep_t
*ep
, uint8_t hca_port
)
1232 int ret
= IBT_SUCCESS
;
1233 ibt_cq_attr_t scqattr
, rcqattr
;
1234 ibt_rc_chan_alloc_args_t chanargs
;
1235 ibt_channel_hdl_t chanhdl
;
1239 RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Enter: 0x%p port: %d",
1242 /* Update the EP with the right IP address and HCA guid */
1245 rw_enter(&sp
->session_lock
, RW_READER
);
1246 mutex_enter(&ep
->ep_lock
);
1247 ep
->ep_myip
= sp
->session_myip
;
1248 ep
->ep_remip
= sp
->session_remip
;
1249 hcap
= rds_gid_to_hcap(rdsib_statep
, sp
->session_lgid
);
1250 ep
->ep_hca_guid
= hcap
->hca_guid
;
1251 mutex_exit(&ep
->ep_lock
);
1252 rw_exit(&sp
->session_lock
);
1254 /* reset taskqpending flag here */
1255 ep
->ep_recvqp
.qp_taskqpending
= B_FALSE
;
1257 if (ep
->ep_type
== RDS_EP_TYPE_CTRL
) {
1258 scqattr
.cq_size
= MaxCtrlSendBuffers
;
1259 scqattr
.cq_sched
= NULL
;
1260 scqattr
.cq_flags
= IBT_CQ_NO_FLAGS
;
1262 rcqattr
.cq_size
= MaxCtrlRecvBuffers
;
1263 rcqattr
.cq_sched
= NULL
;
1264 rcqattr
.cq_flags
= IBT_CQ_NO_FLAGS
;
1266 chanargs
.rc_sizes
.cs_sq
= MaxCtrlSendBuffers
;
1267 chanargs
.rc_sizes
.cs_rq
= MaxCtrlRecvBuffers
;
1268 chanargs
.rc_sizes
.cs_sq_sgl
= 1;
1269 chanargs
.rc_sizes
.cs_rq_sgl
= 1;
1271 scqattr
.cq_size
= MaxDataSendBuffers
+ RDS_NUM_ACKS
;
1272 scqattr
.cq_sched
= NULL
;
1273 scqattr
.cq_flags
= IBT_CQ_NO_FLAGS
;
1275 rcqattr
.cq_size
= MaxDataRecvBuffers
;
1276 rcqattr
.cq_sched
= NULL
;
1277 rcqattr
.cq_flags
= IBT_CQ_NO_FLAGS
;
1279 chanargs
.rc_sizes
.cs_sq
= MaxDataSendBuffers
+ RDS_NUM_ACKS
;
1280 chanargs
.rc_sizes
.cs_rq
= MaxDataRecvBuffers
;
1281 chanargs
.rc_sizes
.cs_sq_sgl
= 1;
1282 chanargs
.rc_sizes
.cs_rq_sgl
= 1;
1285 mutex_enter(&ep
->ep_lock
);
1286 if (ep
->ep_sendcq
== NULL
) {
1287 /* returned size is always greater than the requested size */
1288 ret
= ibt_alloc_cq(hcap
->hca_hdl
, &scqattr
,
1289 &ep
->ep_sendcq
, NULL
);
1290 if (ret
!= IBT_SUCCESS
) {
1291 RDS_DPRINTF2(LABEL
, "ibt_alloc_cq for sendCQ "
1292 "failed, size = %d: %d", scqattr
.cq_size
, ret
);
1293 mutex_exit(&ep
->ep_lock
);
1297 (void) ibt_set_cq_handler(ep
->ep_sendcq
, rds_sendcq_handler
,
1300 if (rds_no_interrupts
== 0) {
1301 ret
= ibt_enable_cq_notify(ep
->ep_sendcq
,
1302 IBT_NEXT_COMPLETION
);
1303 if (ret
!= IBT_SUCCESS
) {
1305 "ibt_enable_cq_notify failed: %d", ret
);
1306 (void) ibt_free_cq(ep
->ep_sendcq
);
1307 ep
->ep_sendcq
= NULL
;
1308 mutex_exit(&ep
->ep_lock
);
1314 if (ep
->ep_recvcq
== NULL
) {
1315 /* returned size is always greater than the requested size */
1316 ret
= ibt_alloc_cq(hcap
->hca_hdl
, &rcqattr
,
1317 &ep
->ep_recvcq
, NULL
);
1318 if (ret
!= IBT_SUCCESS
) {
1319 RDS_DPRINTF2(LABEL
, "ibt_alloc_cq for recvCQ "
1320 "failed, size = %d: %d", rcqattr
.cq_size
, ret
);
1321 (void) ibt_free_cq(ep
->ep_sendcq
);
1322 ep
->ep_sendcq
= NULL
;
1323 mutex_exit(&ep
->ep_lock
);
1327 (void) ibt_set_cq_handler(ep
->ep_recvcq
, rds_recvcq_handler
,
1330 ret
= ibt_enable_cq_notify(ep
->ep_recvcq
, rds_wc_signal
);
1331 if (ret
!= IBT_SUCCESS
) {
1333 "ibt_enable_cq_notify failed: %d", ret
);
1334 (void) ibt_free_cq(ep
->ep_recvcq
);
1335 ep
->ep_recvcq
= NULL
;
1336 (void) ibt_free_cq(ep
->ep_sendcq
);
1337 ep
->ep_sendcq
= NULL
;
1338 mutex_exit(&ep
->ep_lock
);
1343 chanargs
.rc_flags
= IBT_ALL_SIGNALED
;
1344 chanargs
.rc_control
= IBT_CEP_RDMA_RD
| IBT_CEP_RDMA_WR
|
1346 chanargs
.rc_hca_port_num
= hca_port
;
1347 chanargs
.rc_scq
= ep
->ep_sendcq
;
1348 chanargs
.rc_rcq
= ep
->ep_recvcq
;
1349 chanargs
.rc_pd
= hcap
->hca_pdhdl
;
1350 chanargs
.rc_srq
= NULL
;
1352 ret
= ibt_alloc_rc_channel(hcap
->hca_hdl
,
1353 IBT_ACHAN_NO_FLAGS
, &chanargs
, &chanhdl
, NULL
);
1354 if (ret
!= IBT_SUCCESS
) {
1355 RDS_DPRINTF2(LABEL
, "ibt_alloc_rc_channel fail: %d",
1357 (void) ibt_free_cq(ep
->ep_recvcq
);
1358 ep
->ep_recvcq
= NULL
;
1359 (void) ibt_free_cq(ep
->ep_sendcq
);
1360 ep
->ep_sendcq
= NULL
;
1361 mutex_exit(&ep
->ep_lock
);
1364 mutex_exit(&ep
->ep_lock
);
1366 /* Chan private should contain the ep */
1367 (void) ibt_set_chan_private(chanhdl
, ep
);
1369 RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Return: 0x%p", chanhdl
);
1377 /* Return node guid given a port gid */
1379 rds_gid_to_node_guid(ib_gid_t gid
)
1381 ibt_node_info_t nodeinfo
;
1384 RDS_DPRINTF4("rds_gid_to_node_guid", "Enter: gid: %llx:%llx",
1385 gid
.gid_prefix
, gid
.gid_guid
);
1387 ret
= ibt_gid_to_node_info(gid
, &nodeinfo
);
1388 if (ret
!= IBT_SUCCESS
) {
1389 RDS_DPRINTF2(LABEL
, "ibt_gid_node_info for gid: %llx:%llx "
1390 "failed", gid
.gid_prefix
, gid
.gid_guid
);
1394 RDS_DPRINTF4("rds_gid_to_node_guid", "Return: Node guid: %llx",
1395 nodeinfo
.n_node_guid
);
1397 return (nodeinfo
.n_node_guid
);
1403 rds_handle_portup_event(rds_state_t
*statep
, ibt_hca_hdl_t hdl
,
1404 ibt_async_event_t
*event
)
1407 ibt_hca_portinfo_t
*newpinfop
, *oldpinfop
;
1408 uint_t newsize
, oldsize
, nport
;
1412 RDS_DPRINTF2("rds_handle_portup_event",
1413 "Enter: GUID: 0x%llx Statep: %p", event
->ev_hca_guid
, statep
);
1415 rw_enter(&statep
->rds_hca_lock
, RW_WRITER
);
1417 hcap
= statep
->rds_hcalistp
;
1418 while ((hcap
!= NULL
) && (hcap
->hca_guid
!= event
->ev_hca_guid
)) {
1419 hcap
= hcap
->hca_nextp
;
1423 RDS_DPRINTF2("rds_handle_portup_event", "HCA: 0x%llx is "
1424 "not in our list", event
->ev_hca_guid
);
1425 rw_exit(&statep
->rds_hca_lock
);
1429 ret
= ibt_query_hca_ports(hdl
, 0, &newpinfop
, &nport
, &newsize
);
1430 if (ret
!= IBT_SUCCESS
) {
1431 RDS_DPRINTF2(LABEL
, "ibt_query_hca_ports failed: %d", ret
);
1432 rw_exit(&statep
->rds_hca_lock
);
1436 oldpinfop
= hcap
->hca_pinfop
;
1437 oldsize
= hcap
->hca_pinfo_sz
;
1438 hcap
->hca_pinfop
= newpinfop
;
1439 hcap
->hca_pinfo_sz
= newsize
;
1441 (void) ibt_free_portinfo(oldpinfop
, oldsize
);
1443 /* If RDS service is not registered then no bind is needed */
1444 if (statep
->rds_srvhdl
== NULL
) {
1445 RDS_DPRINTF2("rds_handle_portup_event",
1446 "RDS Service is not registered, so no action needed");
1447 rw_exit(&statep
->rds_hca_lock
);
1452 * If the service was previously bound on this port and
1453 * if this port has changed state down and now up, we do not
1454 * need to bind the service again. The bind is expected to
1455 * persist across state changes. If the service was never bound
1456 * before then we bind it this time.
1458 if (hcap
->hca_bindhdl
[event
->ev_port
- 1] == NULL
) {
1460 /* structure copy */
1461 gid
= newpinfop
[event
->ev_port
- 1].p_sgid_tbl
[0];
1463 /* bind RDS service on the port, pass statep as cm_private */
1464 ret
= ibt_bind_service(statep
->rds_srvhdl
, gid
, NULL
, statep
,
1465 &hcap
->hca_bindhdl
[event
->ev_port
- 1]);
1466 if (ret
!= IBT_SUCCESS
) {
1467 RDS_DPRINTF2("rds_handle_portup_event",
1468 "Bind service for HCA: 0x%llx Port: %d "
1469 "gid %llx:%llx returned: %d", event
->ev_hca_guid
,
1470 event
->ev_port
, gid
.gid_prefix
, gid
.gid_guid
, ret
);
1474 rw_exit(&statep
->rds_hca_lock
);
1476 RDS_DPRINTF2("rds_handle_portup_event", "Return: GUID: 0x%llx",
1477 event
->ev_hca_guid
);
1481 rdsib_add_hca(ib_guid_t hca_guid
)
1484 ibt_mr_attr_t mem_attr
;
1485 ibt_mr_desc_t mem_desc
;
1488 RDS_DPRINTF2("rdsib_add_hca", "Enter: GUID: 0x%llx", hca_guid
);
1490 hcap
= rdsib_init_hca(hca_guid
);
1494 /* register the recv memory with this hca */
1495 mutex_enter(&rds_dpool
.pool_lock
);
1496 if (rds_dpool
.pool_memp
== NULL
) {
1497 /* no memory to register */
1498 RDS_DPRINTF2("rdsib_add_hca", "No memory to register");
1499 mutex_exit(&rds_dpool
.pool_lock
);
1503 mem_attr
.mr_vaddr
= (ib_vaddr_t
)(uintptr_t)rds_dpool
.pool_memp
;
1504 mem_attr
.mr_len
= rds_dpool
.pool_memsize
;
1505 mem_attr
.mr_as
= NULL
;
1506 mem_attr
.mr_flags
= IBT_MR_ENABLE_LOCAL_WRITE
;
1508 ret
= ibt_register_mr(hcap
->hca_hdl
, hcap
->hca_pdhdl
, &mem_attr
,
1509 &hcap
->hca_mrhdl
, &mem_desc
);
1511 mutex_exit(&rds_dpool
.pool_lock
);
1513 if (ret
!= IBT_SUCCESS
) {
1514 RDS_DPRINTF2("rdsib_add_hca", "ibt_register_mr failed: %d",
1517 rw_enter(&rdsib_statep
->rds_hca_lock
, RW_WRITER
);
1518 hcap
->hca_state
= RDS_HCA_STATE_MEM_REGISTERED
;
1519 hcap
->hca_lkey
= mem_desc
.md_lkey
;
1520 hcap
->hca_rkey
= mem_desc
.md_rkey
;
1521 rw_exit(&rdsib_statep
->rds_hca_lock
);
1524 RDS_DPRINTF2("rdsib_add_hca", "Retrun: GUID: 0x%llx", hca_guid
);
1527 void rds_close_this_session(rds_session_t
*sp
, uint8_t wait
);
1528 int rds_post_control_message(rds_session_t
*sp
, uint8_t code
, in_port_t port
);
1531 rdsib_del_hca(rds_state_t
*statep
, ib_guid_t hca_guid
)
1535 rds_hca_state_t saved_state
;
1538 RDS_DPRINTF2("rdsib_del_hca", "Enter: GUID: 0x%llx", hca_guid
);
1541 * This should be a write lock as we don't want anyone to get access
1542 * to the hcap while we are modifing its contents
1544 rw_enter(&statep
->rds_hca_lock
, RW_WRITER
);
1546 hcap
= statep
->rds_hcalistp
;
1547 while ((hcap
!= NULL
) && (hcap
->hca_guid
!= hca_guid
)) {
1548 hcap
= hcap
->hca_nextp
;
1551 /* Prevent initiating any new activity on this HCA */
1552 ASSERT(hcap
!= NULL
);
1553 saved_state
= hcap
->hca_state
;
1554 hcap
->hca_state
= RDS_HCA_STATE_STOPPING
;
1556 rw_exit(&statep
->rds_hca_lock
);
1559 * stop the outgoing traffic and close any active sessions on this hca.
1560 * Any pending messages in the SQ will be allowed to complete.
1562 rw_enter(&statep
->rds_sessionlock
, RW_READER
);
1563 sp
= statep
->rds_sessionlistp
;
1565 if (sp
->session_hca_guid
!= hca_guid
) {
1566 sp
= sp
->session_nextp
;
1570 rw_enter(&sp
->session_lock
, RW_WRITER
);
1571 RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp
,
1574 * We are changing the session state in advance. This prevents
1575 * further messages to be posted to the SQ. We then
1576 * send a control message to the remote and tell it close
1579 sp
->session_state
= RDS_SESSION_STATE_HCA_CLOSING
;
1580 RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
1581 "RDS_SESSION_STATE_PASSIVE_CLOSING", sp
);
1582 rw_exit(&sp
->session_lock
);
1585 * wait until the sendq is empty then tell the remote to
1586 * close this session. This enables for graceful shutdown of
1589 (void) rds_is_sendq_empty(&sp
->session_dataep
, 2);
1590 (void) rds_post_control_message(sp
,
1591 RDS_CTRL_CODE_CLOSE_SESSION
, 0);
1593 sp
= sp
->session_nextp
;
1596 /* wait until all the sessions are off this HCA */
1597 sp
= statep
->rds_sessionlistp
;
1599 if (sp
->session_hca_guid
!= hca_guid
) {
1600 sp
= sp
->session_nextp
;
1604 rw_enter(&sp
->session_lock
, RW_READER
);
1605 RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp
,
1608 while ((sp
->session_state
== RDS_SESSION_STATE_HCA_CLOSING
) ||
1609 (sp
->session_state
== RDS_SESSION_STATE_ERROR
) ||
1610 (sp
->session_state
== RDS_SESSION_STATE_PASSIVE_CLOSING
) ||
1611 (sp
->session_state
== RDS_SESSION_STATE_CLOSED
)) {
1612 rw_exit(&sp
->session_lock
);
1614 rw_enter(&sp
->session_lock
, RW_READER
);
1615 RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp
,
1619 rw_exit(&sp
->session_lock
);
1621 sp
= sp
->session_nextp
;
1623 rw_exit(&statep
->rds_sessionlock
);
1626 * if rdsib_close_ib was called before this, then that would have
1627 * unbound the service on all ports. In that case, the HCA structs
1628 * will contain stale bindhdls. Hence, we do not call unbind unless
1629 * the service is still registered.
1631 if (statep
->rds_srvhdl
!= NULL
) {
1632 /* unbind RDS service on all ports on this HCA */
1633 for (ix
= 0; ix
< hcap
->hca_nports
; ix
++) {
1634 if (hcap
->hca_bindhdl
[ix
] == NULL
) {
1638 RDS_DPRINTF2("rdsib_del_hca",
1639 "Unbinding Service: port: %d, bindhdl: %p",
1640 ix
+ 1, hcap
->hca_bindhdl
[ix
]);
1641 (void) ibt_unbind_service(rdsib_statep
->rds_srvhdl
,
1642 hcap
->hca_bindhdl
[ix
]);
1643 hcap
->hca_bindhdl
[ix
] = NULL
;
1647 RDS_DPRINTF2("rdsib_del_hca", "HCA(%p) State: %d", hcap
,
1650 switch (saved_state
) {
1651 case RDS_HCA_STATE_MEM_REGISTERED
:
1652 ASSERT(hcap
->hca_mrhdl
!= NULL
);
1653 ret
= ibt_deregister_mr(hcap
->hca_hdl
, hcap
->hca_mrhdl
);
1654 if (ret
!= IBT_SUCCESS
) {
1655 RDS_DPRINTF2("rdsib_del_hca",
1656 "ibt_deregister_mr failed: %d", ret
);
1659 hcap
->hca_mrhdl
= NULL
;
1661 case RDS_HCA_STATE_OPEN
:
1662 ASSERT(hcap
->hca_hdl
!= NULL
);
1663 ASSERT(hcap
->hca_pdhdl
!= NULL
);
1666 ret
= ibt_free_pd(hcap
->hca_hdl
, hcap
->hca_pdhdl
);
1667 if (ret
!= IBT_SUCCESS
) {
1668 RDS_DPRINTF2("rdsib_del_hca",
1669 "ibt_free_pd failed: %d", ret
);
1672 (void) ibt_free_portinfo(hcap
->hca_pinfop
, hcap
->hca_pinfo_sz
);
1674 ret
= ibt_close_hca(hcap
->hca_hdl
);
1675 if (ret
!= IBT_SUCCESS
) {
1676 RDS_DPRINTF2("rdsib_del_hca",
1677 "ibt_close_hca failed: %d", ret
);
1680 hcap
->hca_hdl
= NULL
;
1681 hcap
->hca_pdhdl
= NULL
;
1687 * This should be a write lock as we don't want anyone to get access
1688 * to the hcap while we are modifing its contents
1690 rw_enter(&statep
->rds_hca_lock
, RW_WRITER
);
1691 hcap
->hca_state
= RDS_HCA_STATE_REMOVED
;
1692 rw_exit(&statep
->rds_hca_lock
);
1694 RDS_DPRINTF2("rdsib_del_hca", "Return: GUID: 0x%llx", hca_guid
);
1698 rds_async_handler(void *clntp
, ibt_hca_hdl_t hdl
, ibt_async_code_t code
,
1699 ibt_async_event_t
*event
)
1701 rds_state_t
*statep
= (rds_state_t
*)clntp
;
1703 RDS_DPRINTF2("rds_async_handler", "Async code: %d", code
);
1706 case IBT_EVENT_PORT_UP
:
1707 rds_handle_portup_event(statep
, hdl
, event
);
1709 case IBT_HCA_ATTACH_EVENT
:
1711 * NOTE: In some error recovery paths, it is possible to
1712 * receive IBT_HCA_ATTACH_EVENTs on already known HCAs.
1714 (void) rdsib_add_hca(event
->ev_hca_guid
);
1716 case IBT_HCA_DETACH_EVENT
:
1717 (void) rdsib_del_hca(statep
, event
->ev_hca_guid
);
1721 RDS_DPRINTF2(LABEL
, "Async event: %d not handled", code
);
1724 RDS_DPRINTF2("rds_async_handler", "Return: code: %d", code
);
1728 * This routine exists to minimize stale connections across ungraceful
1729 * reboots of nodes in a cluster.
1732 rds_randomize_qps(rds_hca_t
*hcap
)
1734 ibt_cq_attr_t cqattr
;
1735 ibt_rc_chan_alloc_args_t chanargs
;
1736 ibt_channel_hdl_t qp1
, qp2
;
1737 ibt_cq_hdl_t cq_hdl
;
1739 uint8_t i
, j
, rand1
, rand2
;
1742 bzero(&cqattr
, sizeof (ibt_cq_attr_t
));
1744 cqattr
.cq_sched
= NULL
;
1745 cqattr
.cq_flags
= IBT_CQ_NO_FLAGS
;
1746 ret
= ibt_alloc_cq(hcap
->hca_hdl
, &cqattr
, &cq_hdl
, NULL
);
1747 if (ret
!= IBT_SUCCESS
) {
1748 RDS_DPRINTF2("rds_randomize_qps",
1749 "ibt_alloc_cq failed: %d", ret
);
1753 bzero(&chanargs
, sizeof (ibt_rc_chan_alloc_args_t
));
1754 chanargs
.rc_flags
= IBT_ALL_SIGNALED
;
1755 chanargs
.rc_control
= IBT_CEP_RDMA_RD
| IBT_CEP_RDMA_WR
|
1757 chanargs
.rc_hca_port_num
= 1;
1758 chanargs
.rc_scq
= cq_hdl
;
1759 chanargs
.rc_rcq
= cq_hdl
;
1760 chanargs
.rc_pd
= hcap
->hca_pdhdl
;
1761 chanargs
.rc_srq
= NULL
;
1764 rand1
= (nsec
& 0xF);
1765 rand2
= (nsec
>> 4) & 0xF;
1766 RDS_DPRINTF2("rds_randomize_qps", "rand1: %d rand2: %d",
1769 for (i
= 0; i
< rand1
+ 3; i
++) {
1770 if (ibt_alloc_rc_channel(hcap
->hca_hdl
,
1771 IBT_ACHAN_NO_FLAGS
, &chanargs
, &qp1
, NULL
) !=
1773 RDS_DPRINTF2("rds_randomize_qps",
1774 "Bailing at i: %d", i
);
1775 (void) ibt_free_cq(cq_hdl
);
1778 for (j
= 0; j
< rand2
+ 3; j
++) {
1779 if (ibt_alloc_rc_channel(hcap
->hca_hdl
,
1780 IBT_ACHAN_NO_FLAGS
, &chanargs
, &qp2
,
1781 NULL
) != IBT_SUCCESS
) {
1782 RDS_DPRINTF2("rds_randomize_qps",
1783 "Bailing at i: %d j: %d", i
, j
);
1784 (void) ibt_free_channel(qp1
);
1785 (void) ibt_free_cq(cq_hdl
);
1788 (void) ibt_free_channel(qp2
);
1790 (void) ibt_free_channel(qp1
);
1793 (void) ibt_free_cq(cq_hdl
);