/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code are developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements to contributions from developers:
 *	Ranjit Noronha: noronha@cse.ohio-state.edu
 *	Lei Chai      : chail@cse.ohio-state.edu
 *	Weikuan Yu    : yuw@cse.ohio-state.edu
 */

/*
 * The rpcib plugin. Implements the interface for RDMATF's
 * interaction with IBTF.
 */
#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/sdt.h>
#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>
#include <sys/modctl.h>
#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
#include <net/if_types.h>
#include <sys/cred.h>
#include <rpc/rpc_rdma.h>
#include <nfs/nfs.h>
#include <sys/atomic.h>

#define	NFS_RDMA_PORT	20049
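
/*
 * 20049 is the IANA-assigned well-known port for NFS over RDMA
 * (see RFC 5667); it is used below both when connecting to a
 * server and when binding the local service.
 */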
/*
 * Convenience structures for connection management
 */
typedef struct rpcib_ipaddrs {
	void	*ri_list;	/* pointer to list of addresses */
	uint_t	ri_count;	/* number of addresses in list */
	uint_t	ri_size;	/* size of ri_list in bytes */
} rpcib_ipaddrs_t;


typedef struct rpcib_ping {
	rib_hca_t	*hca;
	ibt_path_info_t	path;
	ibt_ip_addr_t	srcip;
	ibt_ip_addr_t	dstip;
} rpcib_ping_t;
/*
 * Prototype declarations for driver ops
 */
static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
		    void *, void **);
static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
static int	rpcib_do_ip_ioctl(int, int, void *);
static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
static int	rpcib_cache_kstat_update(kstat_t *, int);
static void	rib_force_cleanup(void *);
static void	rib_stop_hca_services(rib_hca_t *);
static void	rib_attach_hca(void);
static int	rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
		    struct netbuf *d_svcaddr, CONN **conn);
struct {
	kstat_named_t	cache_limit;
	kstat_named_t	cache_allocation;
	kstat_named_t	cache_hits;
	kstat_named_t	cache_misses;
	kstat_named_t	cache_misses_above_the_limit;
} rpcib_kstat = {
	{"cache_limit",			KSTAT_DATA_UINT64},
	{"cache_allocation",		KSTAT_DATA_UINT64},
	{"cache_hits",			KSTAT_DATA_UINT64},
	{"cache_misses",		KSTAT_DATA_UINT64},
	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64},
};
static struct cb_ops rpcib_cbops = {
	nulldev,		/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	nodev,			/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* stream */
	D_MP,			/* cb_flag */
	CB_REV,			/* rev */
	nodev,			/* int (*cb_aread)() */
	nodev			/* int (*cb_awrite)() */
};

/*
 * Device options
 */
static struct dev_ops rpcib_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	rpcib_getinfo,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	rpcib_attach,		/* attach */
	rpcib_detach,		/* detach */
	nodev,			/* reset */
	&rpcib_cbops,		/* driver ops - devctl interfaces */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};
/*
 * Module linkage information.
 */

static struct modldrv rib_modldrv = {
	&mod_driverops,		/* Driver module */
	"RPCIB plugin driver",	/* Driver name and version */
	&rpcib_ops,		/* Driver ops */
};

static struct modlinkage rib_modlinkage = {
	MODREV_1,
	(void *)&rib_modldrv,
	NULL
};
typedef struct rib_lrc_entry {
	struct rib_lrc_entry	*forw;
	struct rib_lrc_entry	*back;
	char			*lrc_buf;

	uint32_t		lrc_len;
	void			*avl_node;
	bool_t			registered;

	struct mrc		lrc_mhandle;
	bool_t			lrc_on_freed_list;
} rib_lrc_entry_t;

typedef	struct cache_struct {
	rib_lrc_entry_t		r;
	uint32_t		len;
	uint32_t		elements;
	kmutex_t		node_lock;
	avl_node_t		avl_link;
} cache_avl_struct_t;

uint64_t	cache_limit = 100 * 1024 * 1024;
static uint64_t	cache_watermark = 80 * 1024 * 1024;
static bool_t	stats_enabled = FALSE;

static uint64_t max_unsignaled_rws = 5;
int nfs_rdma_port = NFS_RDMA_PORT;
#define	RIBNETID_TCP	"tcp"
#define	RIBNETID_TCP6	"tcp6"

/*
 * rib_stat: private data pointer used when registering
 *	with the IBTF.  It is returned to the consumer
 *	in all callbacks.
 */
static rpcib_state_t *rib_stat = NULL;

#define	RNR_RETRIES	IBT_RNR_RETRY_1
#define	RDMA_DUMMY_WRID	0x4D3A1D4D3A1D
#define	RDMA_CONN_REAP_RETRY	10	/* 10 secs */
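
/*
 * RDMA_DUMMY_WRID tags posted work requests for which no completion
 * bookkeeping is kept; the completion queue handlers below ignore any
 * work completion whose wc_id carries this value.
 */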
int preposted_rbufs = RDMA_BUFS_GRANT;
int send_threshold = 1;

/*
 * Old cards with Tavor driver have limited memory footprint
 * when booted in 32bit. The rib_max_rbufs tunable can be
 * tuned for more buffers if needed.
 */
#if !defined(_ELF64) && !defined(__sparc)
int rib_max_rbufs = MAX_BUFS;
#else
int rib_max_rbufs = 10 * MAX_BUFS;
#endif	/* !(_ELF64) && !(__sparc) */

int rib_conn_timeout = 60 * 12;	/* 12 minutes */

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connection and requests.
 * This should eventually move to rpcib_state_t structure, since this
 * will tell in which state the plugin is for a particular type of service
 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
 * state for one and in no_accept state for the other.
 */
int plugin_state;
kmutex_t plugin_state_lock;

ldi_ident_t rpcib_li;
/*
 * RPCIB RDMATF operations
 */
static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
static rdma_stat rib_disconnect(CONN *conn);
static void rib_listen(struct rdma_svc_data *rd);
static void rib_listen_stop(struct rdma_svc_data *rd);
static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
	struct mrc buf_handle);
static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
	caddr_t buf, uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
	struct mrc buf_handle);
static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
	void *lrc);
static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
	caddr_t buf, int len, int cpu);

static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);

static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);

static void rib_rbuf_free(CONN *conn, int ptype, void *buf);

static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
	int addr_type, void *, CONN **);
static rdma_stat rib_conn_release(CONN *conn);
static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
	rpcib_ping_t *, CONN **);
static rdma_stat rib_getinfo(rdma_info_t *info);

static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
static void rib_destroy_cache(rib_hca_t *hca);
static void rib_server_side_cache_reclaim(void *argp);
static int avl_compare(const void *t1, const void *t2);

static void rib_stop_services(rib_hca_t *);
static void rib_close_channels(rib_conn_list_t *);
static void rib_conn_close(void *);
static void rib_recv_rele(rib_qp_t *);
static rdma_stat rib_conn_release_locked(CONN *conn);
/*
 * RPCIB addressing operations
 */

/*
 * RDMA operations the RPCIB module exports
 */
static rdmaops_t rib_ops = {
	rib_reachable,
	rib_conn_get,
	rib_conn_release,
	rib_listen,
	rib_listen_stop,
	rib_registermem,
	rib_deregistermem,
	rib_registermemsync,
	rib_deregistermemsync,
	rib_syncmem,
	rib_reg_buf_alloc,
	rib_reg_buf_free,
	rib_send,
	rib_send_resp,
	rib_post_resp,
	rib_post_resp_remove,
	rib_post_recv,
	rib_recv,
	rib_read,
	rib_write,
	rib_getinfo,
};
/*
 * RDMATF RPCIB plugin details
 */
static rdma_mod_t rib_mod = {
	"ibtf",		/* api name */
	RDMATF_VERS_1,
	0,
	&rib_ops,	/* rdma op vector for ibtf */
};
static rdma_stat rpcib_open_hcas(rpcib_state_t *);
static rdma_stat rib_qp_init(rib_qp_t *, int);
static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
	rib_qp_t **);
static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
	rib_qp_t **);
static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
static int rib_free_sendwait(struct send_wid *);
static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
static void rdma_done_rem_list(rib_qp_t *);
static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);

static void rib_async_handler(void *,
	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
static int rib_free_svc_recv(struct svc_recv *);
static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
static void rib_free_wid(struct recv_wid *);
static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
static void rib_detach_hca(ibt_hca_hdl_t);
static void rib_close_a_channel(CONN *);
static void rib_send_hold(rib_qp_t *);
static void rib_send_rele(rib_qp_t *);
/*
 * Registration with IBTF as a consumer
 */
static struct ibt_clnt_modinfo_s rib_modinfo = {
	IBTI_V_CURR,
	IBT_GENERIC,
	rib_async_handler,	/* async event handler */
	NULL,			/* Memory Region Handler */
	"nfs/ib"
};

typedef struct rpcib_s {
	dev_info_t	*rpcib_dip;
	kmutex_t	rpcib_mutex;
} rpcib_t;

rpcib_t rpcib;
/*
 * /etc/system controlled variable to control
 * debugging in rpcib kernel module.
 * Set it to values greater than 1 to control
 * the amount of debugging messages required.
 */
int rib_debug = 0;

int
_init(void)
{
	int error;

	error = mod_install((struct modlinkage *)&rib_modlinkage);
	if (error != 0) {
		/*
		 * Could not load module
		 */
		return (error);
	}
	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);

	return (0);
}

int
_fini()
{
	int status;

	/* Remove module */
	if ((status = mod_remove(&rib_modlinkage)) != 0) {
		return (status);
	}
	mutex_destroy(&plugin_state_lock);

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&rib_modlinkage, modinfop));
}
/*
 * Given the device number, return the devinfo pointer or the
 * instance number.
 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
 */

/*ARGSUSED*/
static int
rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int ret = DDI_SUCCESS;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if (rpcib.rpcib_dip != NULL)
			*result = rpcib.rpcib_dip;
		else {
			*result = NULL;
			ret = DDI_FAILURE;
		}
		break;

	case DDI_INFO_DEVT2INSTANCE:
		*result = NULL;
		break;

	default:
		ret = DDI_FAILURE;
	}
	return (ret);
}
static void
rpcib_free_hca_list()
{
	rib_hca_t *hca, *hcap;

	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
	hca = rib_stat->hcas_list;
	rib_stat->hcas_list = NULL;
	rw_exit(&rib_stat->hcas_list_lock);
	while (hca != NULL) {
		rw_enter(&hca->state_lock, RW_WRITER);
		hcap = hca;
		hca = hca->next;
		rib_stat->nhca_inited--;
		rib_mod.rdma_count--;
		hcap->state = HCA_DETACHED;
		rw_exit(&hcap->state_lock);
		rib_stop_hca_services(hcap);

		kmem_free(hcap, sizeof (*hcap));
	}
}
static rdma_stat
rpcib_free_service_list()
{
	rib_service_t *service;
	ibt_status_t ret;

	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
	while (rib_stat->service_list != NULL) {
		service = rib_stat->service_list;
		ret = ibt_unbind_all_services(service->srv_hdl);
		if (ret != IBT_SUCCESS) {
			rw_exit(&rib_stat->service_list_lock);
#ifdef DEBUG
			cmn_err(CE_NOTE, "rpcib_free_service_list: "
			    "ibt_unbind_all_services failed (%d)\n", (int)ret);
#endif
			return (RDMA_FAILED);
		}
		ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
		    service->srv_hdl);
		if (ret != IBT_SUCCESS) {
			rw_exit(&rib_stat->service_list_lock);
#ifdef DEBUG
			cmn_err(CE_NOTE, "rpcib_free_service_list: "
			    "ibt_deregister_service failed (%d)\n", (int)ret);
#endif
			return (RDMA_FAILED);
		}
		rib_stat->service_list = service->next;
		kmem_free(service, sizeof (rib_service_t));
	}
	rw_exit(&rib_stat->service_list_lock);

	return (RDMA_SUCCESS);
}
static int
rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	ibt_status_t	ibt_status;
	rdma_stat	r_status;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);

	mutex_enter(&rpcib.rpcib_mutex);
	if (rpcib.rpcib_dip != NULL) {
		mutex_exit(&rpcib.rpcib_mutex);
		return (DDI_FAILURE);
	}
	rpcib.rpcib_dip = dip;
	mutex_exit(&rpcib.rpcib_mutex);
	/*
	 * Create the "rpcib" minor-node.
	 */
	if (ddi_create_minor_node(dip,
	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
		/* Error message, no cmn_err as they print on console */
		return (DDI_FAILURE);
	}

	if (rib_stat == NULL) {
		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
		rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
		mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
	}

	rib_stat->hca_count = ibt_get_hca_list(NULL);
	if (rib_stat->hca_count < 1) {
		mutex_destroy(&rib_stat->listen_lock);
		rw_destroy(&rib_stat->hcas_list_lock);
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}

	ibt_status = ibt_attach(&rib_modinfo, dip,
	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);

	if (ibt_status != IBT_SUCCESS) {
		mutex_destroy(&rib_stat->listen_lock);
		rw_destroy(&rib_stat->hcas_list_lock);
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}

	rib_stat->service_list = NULL;
	rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
	mutex_enter(&rib_stat->open_hca_lock);
	if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
		mutex_exit(&rib_stat->open_hca_lock);
		goto open_fail;
	}
	mutex_exit(&rib_stat->open_hca_lock);

	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
	    DDI_PROP_SUCCESS) {
		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
		    "failed.");
		goto register_fail;
	}

	/*
	 * Register with rdmatf
	 */
	r_status = rdma_register_mod(&rib_mod);
	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
		    "status = %d", r_status);
		goto register_fail;
	}

	return (DDI_SUCCESS);

register_fail:

open_fail:
	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
	rpcib_free_hca_list();
	(void) rpcib_free_service_list();
	mutex_destroy(&rib_stat->listen_lock);
	rw_destroy(&rib_stat->hcas_list_lock);
	mutex_destroy(&rib_stat->open_hca_lock);
	rw_destroy(&rib_stat->service_list_lock);
	kmem_free(rib_stat, sizeof (*rib_stat));
	rib_stat = NULL;
	return (DDI_FAILURE);
}
/*ARGSUSED*/
static int
rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	switch (cmd) {

	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
	default:
		return (DDI_FAILURE);
	}

	/*
	 * Detach the hca and free resources
	 */
	mutex_enter(&plugin_state_lock);
	plugin_state = NO_ACCEPT;
	mutex_exit(&plugin_state_lock);

	if (rpcib_free_service_list() != RDMA_SUCCESS)
		return (DDI_FAILURE);
	rpcib_free_hca_list();

	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
	mutex_destroy(&rib_stat->listen_lock);
	rw_destroy(&rib_stat->hcas_list_lock);
	mutex_destroy(&rib_stat->open_hca_lock);
	rw_destroy(&rib_stat->service_list_lock);

	kmem_free(rib_stat, sizeof (*rib_stat));
	rib_stat = NULL;

	mutex_enter(&rpcib.rpcib_mutex);
	rpcib.rpcib_dip = NULL;
	mutex_exit(&rpcib.rpcib_mutex);
	mutex_destroy(&rpcib.rpcib_mutex);
	return (DDI_SUCCESS);
}
static void rib_rbufpool_free(rib_hca_t *, int);
static void rib_rbufpool_deregister(rib_hca_t *, int);
static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
static rdma_stat rib_rem_replylist(rib_qp_t *);
static int rib_remreply(rib_qp_t *, struct reply *);
static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
/*
 * One CQ pair per HCA
 */
static rdma_stat
rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
	rib_cq_t **cqp)
{
	rib_cq_t	*cq;
	ibt_cq_attr_t	cq_attr;
	uint32_t	real_size;
	ibt_status_t	status;
	rdma_stat	error = RDMA_SUCCESS;

	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
	cq->rib_hca = hca;
	bzero(&cq_attr, sizeof (cq_attr));
	cq_attr.cq_size = cq_size;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
	    &real_size);
	if (status != IBT_SUCCESS) {
		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
		    " status=%d", status);
		error = RDMA_FAILED;
		goto fail;
	}
	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);

	/*
	 * Enable CQ callbacks. CQ Callbacks are single shot
	 * (e.g. you have to call ibt_enable_cq_notify()
	 * after each callback to get another one).
	 */
	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
	if (status != IBT_SUCCESS) {
		cmn_err(CE_WARN, "rib_create_cq: "
		    "enable_cq_notify failed, status %d", status);
		error = RDMA_FAILED;
		goto fail;
	}
	*cqp = cq;

	return (error);
fail:
	if (cq->rib_cq_hdl)
		(void) ibt_free_cq(cq->rib_cq_hdl);
	if (cq)
		kmem_free(cq, sizeof (rib_cq_t));
	return (error);
}
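
/*
 * Because CQ notifications are single shot, each completion handler
 * below re-arms the CQ with ibt_enable_cq_notify() first and only then
 * drains it until IBT_CQ_EMPTY; draining after re-arming guarantees a
 * completion arriving in between is never missed (at worst the handler
 * makes one extra empty pass).
 */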
/*
 * Caller should have already locked the hcas_lock before calling
 * this function.
 */
static rib_hca_t *
rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
{
	rib_hca_t *hca = ribstat->hcas_list;

	while (hca && hca->hca_guid != guid)
		hca = hca->next;

	return (hca);
}
static rdma_stat
rpcib_open_hcas(rpcib_state_t *ribstat)
{
	rib_hca_t		*hca;
	ibt_status_t		ibt_status;
	rdma_stat		status;
	ibt_hca_portinfo_t	*pinfop;
	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
	uint_t			size, cq_size;
	int			i;
	kstat_t			*ksp;
	cache_avl_struct_t	example_avl_node;
	char			rssc_name[32];
	int			old_nhca_inited = ribstat->nhca_inited;
	ib_guid_t		*hca_guids;

	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));

	ribstat->hca_count = ibt_get_hca_list(&hca_guids);
	if (ribstat->hca_count == 0)
		return (RDMA_FAILED);

	rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
	/*
	 * Open a hca and setup for RDMA
	 */
	for (i = 0; i < ribstat->hca_count; i++) {
		if (rpcib_find_hca(ribstat, hca_guids[i]))
			continue;
		hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);

		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
		    hca_guids[i], &hca->hca_hdl);
		if (ibt_status != IBT_SUCCESS) {
			kmem_free(hca, sizeof (rib_hca_t));
			continue;
		}
		hca->hca_guid = hca_guids[i];
		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
		hca->state = HCA_INITED;

		/*
		 * query HCA info
		 */
		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
		if (ibt_status != IBT_SUCCESS) {
			goto fail1;
		}

		/*
		 * One PD (Protection Domain) per HCA.
		 * A qp is allowed to access a memory region
		 * only when it's in the same PD as that of
		 * the qp.
		 */
		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
		if (ibt_status != IBT_SUCCESS) {
			goto fail1;
		}

		/*
		 * query HCA ports
		 */
		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
		    0, &pinfop, &hca->hca_nports, &size);
		if (ibt_status != IBT_SUCCESS) {
			goto fail2;
		}
		hca->hca_ports = pinfop;
		hca->hca_pinfosz = size;

		cq_size = DEF_CQ_SIZE;	/* default cq size */
		/*
		 * Create 2 pairs of cq's (1 pair for client
		 * and the other pair for server) on this hca.
		 * If number of qp's gets too large, then several
		 * cq's will be needed.
		 */
		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
		    &hca->svc_rcq);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
		    &hca->svc_scq);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
		    &hca->clnt_rcq);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
		    &hca->clnt_scq);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		/*
		 * Create buffer pools.
		 * Note rib_rbuf_create also allocates memory windows.
		 */
		hca->recv_pool = rib_rbufpool_create(hca,
		    RECV_BUFFER, rib_max_rbufs);
		if (hca->recv_pool == NULL) {
			goto fail3;
		}

		hca->send_pool = rib_rbufpool_create(hca,
		    SEND_BUFFER, rib_max_rbufs);
		if (hca->send_pool == NULL) {
			rib_rbufpool_destroy(hca, RECV_BUFFER);
			goto fail3;
		}

		if (hca->server_side_cache == NULL) {
			(void) sprintf(rssc_name,
			    "rib_srvr_cache_%llx",
			    (long long unsigned int) hca->hca_guid);
			hca->server_side_cache = kmem_cache_create(
			    rssc_name,
			    sizeof (cache_avl_struct_t), 0,
			    NULL,
			    NULL,
			    rib_server_side_cache_reclaim,
			    hca, NULL, 0);
		}

		avl_create(&hca->avl_tree,
		    avl_compare,
		    sizeof (cache_avl_struct_t),
		    (uint_t)(uintptr_t)&example_avl_node.avl_link -
		    (uint_t)(uintptr_t)&example_avl_node);
->bound_services_lock
, NULL
, RW_DRIVER
,
927 rw_init(&hca
->state_lock
, NULL
, RW_DRIVER
, hca
->iblock
);
928 rw_init(&hca
->avl_rw_lock
,
929 NULL
, RW_DRIVER
, hca
->iblock
);
930 mutex_init(&hca
->cache_allocation_lock
,
931 NULL
, MUTEX_DRIVER
, NULL
);
932 hca
->avl_init
= TRUE
;
934 /* Create kstats for the cache */
935 ASSERT(INGLOBALZONE(curproc
));
937 if (!stats_enabled
) {
938 ksp
= kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
940 sizeof (rpcib_kstat
) / sizeof (kstat_named_t
),
941 KSTAT_FLAG_VIRTUAL
| KSTAT_FLAG_WRITABLE
,
944 ksp
->ks_data
= (void *) &rpcib_kstat
;
945 ksp
->ks_update
= rpcib_cache_kstat_update
;
947 stats_enabled
= TRUE
;
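
		/*
		 * Note: KSTAT_FLAG_VIRTUAL means the kstat framework does
		 * not allocate ks_data; it points at the static rpcib_kstat
		 * struct above, and rpcib_cache_kstat_update() refreshes
		 * those counters whenever the kstat is read.
		 */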
		if (hca->cleanup_helper == NULL) {
			char tq_name[sizeof (hca->hca_guid) * 2 + 1];

			(void) snprintf(tq_name, sizeof (tq_name), "%llX",
			    (unsigned long long int) hca->hca_guid);
			hca->cleanup_helper = ddi_taskq_create(NULL,
			    tq_name, 1, TASKQ_DEFAULTPRI, 0);
		}

		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
		    hca->iblock);
		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
		    hca->iblock);
		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);

		hca->next = ribstat->hcas_list;
		ribstat->hcas_list = hca;
		ribstat->nhca_inited++;
		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
		continue;

fail3:
		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
fail2:
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
fail1:
		(void) ibt_close_hca(hca->hca_hdl);
		kmem_free(hca, sizeof (rib_hca_t));
	}
	rw_exit(&ribstat->hcas_list_lock);
	ibt_free_hca_list(hca_guids, ribstat->hca_count);
	rib_mod.rdma_count = rib_stat->nhca_inited;

	/*
	 * return success if at least one new hca has been configured.
	 */
	if (ribstat->nhca_inited != old_nhca_inited)
		return (RDMA_SUCCESS);
	else
		return (RDMA_FAILED);
}
/* ARGSUSED */
static void
rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct send_wid	*wd;
	CONN		*conn;
	rib_qp_t	*qp;
	int		i;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/*
		 * Got a send completion
		 */
		if (wc.wc_id != RDMA_DUMMY_WRID) {
			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
			qp = wd->qp;
			conn = qptoc(qp);

			mutex_enter(&wd->sendwait_lock);
			switch (wc.wc_status) {
			case IBT_WC_SUCCESS:
				wd->status = RDMA_SUCCESS;
				break;
			default:
/*
 *    RC Send Q Error Code		Local state	Remote State
 *    ====================		===========	============
 *    IBT_WC_BAD_RESPONSE_ERR		ERROR		None
 *    IBT_WC_LOCAL_LEN_ERR		ERROR		None
 *    IBT_WC_LOCAL_CHAN_OP_ERR		ERROR		None
 *    IBT_WC_LOCAL_PROTECT_ERR		ERROR		None
 *    IBT_WC_MEM_WIN_BIND_ERR		ERROR		None
 *    IBT_WC_REMOTE_INVALID_REQ_ERR	ERROR		ERROR
 *    IBT_WC_REMOTE_ACCESS_ERR		ERROR		ERROR
 *    IBT_WC_REMOTE_OP_ERR		ERROR		ERROR
 *    IBT_WC_RNR_NAK_TIMEOUT_ERR	ERROR		None
 *    IBT_WC_TRANS_TIMEOUT_ERR		ERROR		None
 *    IBT_WC_WR_FLUSHED_ERR		ERROR		None
 */
				/*
				 * Channel in error state. Set connection to
				 * ERROR and cleanup will happen either from
				 * conn_release or from rib_conn_get
				 */
				wd->status = RDMA_FAILED;
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR_CONN;
				mutex_exit(&conn->c_lock);
				break;
			}

			if (wd->cv_sig == 1) {
				/*
				 * Notify poster
				 */
				cv_signal(&wd->wait_cv);
				mutex_exit(&wd->sendwait_lock);
			} else {
				/*
				 * Poster not waiting for notification.
				 * Free the send buffers and send_wid
				 */
				for (i = 0; i < wd->nsbufs; i++) {
					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
					    (void *)(uintptr_t)wd->sbufaddr[i]);
				}

				/* decrement the send ref count */
				rib_send_rele(qp);

				mutex_exit(&wd->sendwait_lock);
				(void) rib_free_sendwait(wd);
			}
		}
	}
}
/* ARGSUSED */
static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct send_wid	*wd;
	rib_qp_t	*qp;
	CONN		*conn;
	int		i;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/*
		 * Got a send completion
		 */
		if (wc.wc_id != RDMA_DUMMY_WRID) {
			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
			qp = wd->qp;
			conn = qptoc(qp);

			mutex_enter(&wd->sendwait_lock);

			switch (wc.wc_status) {
			case IBT_WC_SUCCESS:
				wd->status = RDMA_SUCCESS;
				break;
			default:
				/*
				 * Channel in error state. Set connection to
				 * ERROR and cleanup will happen either from
				 * conn_release or conn timeout.
				 */
				wd->status = RDMA_FAILED;
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR_CONN;
				mutex_exit(&conn->c_lock);
				break;
			}

			if (wd->cv_sig == 1) {
				/*
				 * Update completion status and notify poster
				 */
				cv_signal(&wd->wait_cv);
				mutex_exit(&wd->sendwait_lock);
			} else {
				/*
				 * Poster not waiting for notification.
				 * Free the send buffers and send_wid
				 */
				for (i = 0; i < wd->nsbufs; i++) {
					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
					    (void *)(uintptr_t)wd->sbufaddr[i]);
				}

				/* decrement the send ref count */
				rib_send_rele(qp);

				mutex_exit(&wd->sendwait_lock);
				(void) rib_free_sendwait(wd);
			}
		}
	}
}
/* ARGSUSED */
static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct recv_wid	*rwid;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
		qp = rwid->qp;

		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op, find_xid = 0;
			struct reply	*r;
			CONN *conn = qptoc(qp);
			uint32_t rdma_credit = 0;

			xdrs = &inxdrs;
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
			    wc.wc_bytes_xfer, XDR_DECODE);
			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)rwid->addr;

			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			(void) xdr_u_int(xdrs, &vers);
			(void) xdr_u_int(xdrs, &rdma_credit);
			(void) xdr_u_int(xdrs, &op);
1226 * Invalid RPC/RDMA version. Cannot
1227 * interoperate. Set connection to
1228 * ERROR state and bail out.
1230 mutex_enter(&conn
->c_lock
);
1231 if (conn
->c_state
!= C_DISCONN_PEND
)
1232 conn
->c_state
= C_ERROR_CONN
;
1233 mutex_exit(&conn
->c_lock
);
1234 rib_rbuf_free(conn
, RECV_BUFFER
,
1235 (void *)(uintptr_t)rwid
->addr
);
1241 mutex_enter(&qp
->replylist_lock
);
1242 for (r
= qp
->replylist
; r
!= NULL
; r
= r
->next
) {
1243 if (r
->xid
== xid
) {
1249 r
->status
= RDMA_SUCCESS
;
1250 r
->vaddr_cq
= rwid
->addr
;
1253 cv_signal(&r
->wait_cv
);
1256 rib_rbuf_free(qptoc(qp
),
1265 mutex_exit(&qp
->replylist_lock
);
1266 if (find_xid
== 0) {
1267 /* RPC caller not waiting for reply */
1269 DTRACE_PROBE1(rpcib__i__nomatchxid1
,
1272 rib_rbuf_free(qptoc(qp
), RECV_BUFFER
,
1273 (void *)(uintptr_t)rwid
->addr
);
1275 } else if (wc
.wc_status
== IBT_WC_WR_FLUSHED_ERR
) {
1276 CONN
*conn
= qptoc(qp
);
1279 * Connection being flushed. Just free
1282 rib_rbuf_free(conn
, RECV_BUFFER
,
1283 (void *)(uintptr_t)rwid
->addr
);
1285 CONN
*conn
= qptoc(qp
);
1287 * RC Recv Q Error Code Local state Remote State
1288 * ==================== =========== ============
1289 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd
1290 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd
1291 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd
1292 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd
1293 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd
1294 * IBT_WC_WR_FLUSHED_ERR None None
1297 * Channel in error state. Set connection
1300 mutex_enter(&conn
->c_lock
);
1301 if (conn
->c_state
!= C_DISCONN_PEND
)
1302 conn
->c_state
= C_ERROR_CONN
;
1303 mutex_exit(&conn
->c_lock
);
1304 rib_rbuf_free(conn
, RECV_BUFFER
,
1305 (void *)(uintptr_t)rwid
->addr
);
/* ARGSUSED */
static void
rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	rdma_recv_data_t *rdp;
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct svc_recv	*s_recvp;
	CONN		*conn;
	mblk_t		*mp;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
		qp = s_recvp->qp;
		conn = qptoc(qp);

		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op;
			uint32_t rdma_credit;

			xdrs = &inxdrs;
			/* s_recvp->vaddr stores data */
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
			    wc.wc_bytes_xfer, XDR_DECODE);

			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			if (!xdr_u_int(xdrs, &vers) ||
			    !xdr_u_int(xdrs, &rdma_credit) ||
			    !xdr_u_int(xdrs, &op)) {
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				XDR_DESTROY(xdrs);
				rib_recv_rele(qp);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
			XDR_DESTROY(xdrs);

			if (vers != RPCRDMA_VERS) {
				/*
				 * Invalid RPC/RDMA version.
				 * Drop rpc rdma message.
				 */
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				rib_recv_rele(qp);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
1385 if (op
== RDMA_DONE
) {
1386 rib_rbuf_free(conn
, RECV_BUFFER
,
1387 (void *)(uintptr_t)s_recvp
->vaddr
);
1389 * Wake up the thread waiting on
1390 * a RDMA_DONE for xid
1392 mutex_enter(&qp
->rdlist_lock
);
1393 rdma_done_notify(qp
, xid
);
1394 mutex_exit(&qp
->rdlist_lock
);
1396 (void) rib_free_svc_recv(s_recvp
);
1400 mutex_enter(&plugin_state_lock
);
1401 mutex_enter(&conn
->c_lock
);
1402 if ((plugin_state
== ACCEPT
) &&
1403 (conn
->c_state
== C_CONNECTED
)) {
1405 mutex_exit(&conn
->c_lock
);
1406 while ((mp
= allocb(sizeof (*rdp
), BPRI_LO
))
1409 sizeof (*rdp
), BPRI_LO
);
1411 * Plugin is in accept state, hence the master
1412 * transport queue for this is still accepting
1413 * requests. Hence we can call svc_queuereq to
1414 * queue this recieved msg.
1416 rdp
= (rdma_recv_data_t
*)mp
->b_rptr
;
1419 (caddr_t
)(uintptr_t)s_recvp
->vaddr
;
1420 rdp
->rpcmsg
.type
= RECV_BUFFER
;
1421 rdp
->rpcmsg
.len
= wc
.wc_bytes_xfer
;
1422 rdp
->status
= wc
.wc_status
;
1423 mp
->b_wptr
+= sizeof (*rdp
);
1424 (void) svc_queuereq((queue_t
*)rib_stat
->q
, mp
,
1426 mutex_exit(&plugin_state_lock
);
1429 * The master transport for this is going
1430 * away and the queue is not accepting anymore
1431 * requests for krpc, so don't do anything, just
1434 mutex_exit(&conn
->c_lock
);
1435 mutex_exit(&plugin_state_lock
);
1436 rib_rbuf_free(conn
, RECV_BUFFER
,
1437 (void *)(uintptr_t)s_recvp
->vaddr
);
1440 rib_rbuf_free(conn
, RECV_BUFFER
,
1441 (void *)(uintptr_t)s_recvp
->vaddr
);
1444 (void) rib_free_svc_recv(s_recvp
);
static void
rib_attach_hca()
{
	mutex_enter(&rib_stat->open_hca_lock);
	(void) rpcib_open_hcas(rib_stat);
	rib_listen(NULL);
	mutex_exit(&rib_stat->open_hca_lock);
}
/*
 * Handles DR event of IBT_HCA_DETACH_EVENT.
 */
/* ARGSUSED */
static void
rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
	ibt_async_code_t code, ibt_async_event_t *event)
{
	switch (code) {
	case IBT_HCA_ATTACH_EVENT:
		rib_attach_hca();
		break;
	case IBT_HCA_DETACH_EVENT:
		rib_detach_hca(hca_hdl);
#ifdef DEBUG
		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
#endif
		break;
	case IBT_EVENT_PORT_UP:
		/*
		 * A port is up. We should call rib_listen() since there is
		 * a chance that rib_listen() may have failed during
		 * rib_attach_hca() because the port had not been up yet.
		 */
		rib_listen(NULL);
#ifdef DEBUG
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
#endif
		break;
#ifdef DEBUG
	case IBT_EVENT_PATH_MIGRATED:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_EVENT_PATH_MIGRATED\n");
		break;
	case IBT_EVENT_SQD:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
		break;
	case IBT_EVENT_COM_EST:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
		break;
	case IBT_ERROR_CATASTROPHIC_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
		break;
	case IBT_ERROR_INVALID_REQUEST_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
		break;
	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
		break;
	case IBT_ERROR_PATH_MIGRATE_REQ:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
		break;
	case IBT_ERROR_CQ:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
		break;
	case IBT_ERROR_PORT_DOWN:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
		break;
	case IBT_ASYNC_OPAQUE1:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
		break;
	case IBT_ASYNC_OPAQUE2:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
		break;
	case IBT_ASYNC_OPAQUE3:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
		break;
	case IBT_ASYNC_OPAQUE4:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
		break;
#endif
	default:
		break;
	}
}
/*
 * Client's reachable function.
 */
static rdma_stat
rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
{
	rdma_stat	status;
	rpcib_ping_t	rpt;
	struct netbuf	saddr;
	CONN		*conn;

	bzero(&saddr, sizeof (struct netbuf));
	status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);

	if (status == RDMA_SUCCESS) {
		*handle = (void *)rpt.hca;
		/* release the reference */
		(void) rib_conn_release(conn);
		return (RDMA_SUCCESS);
	} else {
		*handle = NULL;
		DTRACE_PROBE(rpcib__i__pingfailed);
		return (RDMA_FAILED);
	}
}
/* Client side qp creation */
static rdma_stat
rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
{
	rib_qp_t	*kqp = NULL;
	CONN		*conn;
	rdma_clnt_cred_ctrl_t *cc_info;

	ASSERT(qp != NULL);
	*qp = NULL;

	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
	kqp->hca = hca;
	conn = qptoc(kqp);
	kqp->rdmaconn.c_rdmamod = &rib_mod;
	kqp->rdmaconn.c_private = (caddr_t)kqp;

	kqp->mode = RIB_CLIENT;
	kqp->chan_flags = IBT_BLOCKING;
	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;

	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
	/*
	 * Initialize the client credit control
	 * portion of the rdmaconn struct.
	 */
	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
	cc_info->clnt_cc_granted_ops = 0;
	cc_info->clnt_cc_in_flight_ops = 0;
	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);

	*qp = kqp;
	return (RDMA_SUCCESS);
}
/* Server side qp creation */
static rdma_stat
rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
{
	rib_qp_t	*kqp = NULL;
	ibt_chan_sizes_t	chan_sizes;
	ibt_rc_chan_alloc_args_t	qp_attr;
	ibt_status_t	ibt_status;
	rdma_srv_cred_ctrl_t *cc_info;

	*qp = NULL;

	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
	kqp->hca = hca;
	kqp->port_num = port;
	kqp->rdmaconn.c_rdmamod = &rib_mod;
	kqp->rdmaconn.c_private = (caddr_t)kqp;

	/*
	 * Create the qp handle
	 */
	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
	qp_attr.rc_pd = hca->pd_hdl;
	qp_attr.rc_hca_port_num = port;
	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
	qp_attr.rc_clone_chan = NULL;
	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
	qp_attr.rc_flags = IBT_WR_SIGNALED;

	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
		    &chan_sizes);
	} else {
		rw_exit(&hca->state_lock);
		goto fail;
	}
	rw_exit(&hca->state_lock);

	if (ibt_status != IBT_SUCCESS) {
		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
		    int, ibt_status);
		goto fail;
	}

	kqp->mode = RIB_SERVER;
	kqp->chan_flags = IBT_BLOCKING;
	kqp->q = q;	/* server ONLY */

	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
	/*
	 * Set the private data area to qp to be used in callbacks
	 */
	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
	kqp->rdmaconn.c_state = C_CONNECTED;

	/*
	 * Initialize the server credit control
	 * portion of the rdmaconn struct.
	 */
	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
	cc_info->srv_cc_buffers_granted = preposted_rbufs;
	cc_info->srv_cc_cur_buffers_used = 0;
	cc_info->srv_cc_posted = preposted_rbufs;

	*qp = kqp;

	return (RDMA_SUCCESS);
fail:
	if (kqp)
		kmem_free(kqp, sizeof (rib_qp_t));

	return (RDMA_FAILED);
}
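
/*
 * Seeding srv_cc_posted and srv_cc_buffers_granted with preposted_rbufs
 * in rib_svc_create_chan() above is, in effect, the initial credit grant:
 * the client may keep roughly that many RPC calls in flight before the
 * server advertises additional credits in its replies.
 */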
/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
	rib_hca_t	*hca;

	hca = (rib_hca_t *)clnt_hdl;

	switch (event->cm_type) {

	/* got a connection close event */
	case IBT_CM_EVENT_CONN_CLOSED:
	{
		CONN	*conn;
		rib_qp_t *qp;

		/* check reason why connection was closed */
		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_DISCONN_PEND) {
				mutex_exit(&conn->c_lock);
				break;
			}

			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the conn if c_ref is down to 0 already
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				rw_enter(&hca->state_lock, RW_READER);
				if (hca->state != HCA_DETACHED)
					(void) rib_disconnect_channel(conn,
					    &hca->cl_conn_list);
				rw_exit(&hca->state_lock);
			} else {
				/*
				 * conn will be freed when c_ref goes to 0.
				 * Indicate to cleaning thread not to close
				 * the connection, but just free the channel.
				 */
				conn->c_flags |= C_CLOSE_NOTNEEDED;
				mutex_exit(&conn->c_lock);
			}
#ifdef DEBUG
			if (rib_debug)
				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
				    "(CONN_CLOSED) channel disconnected");
#endif
			break;
		}
		break;
	}
	default:
		break;
	}
	return (IBT_CM_ACCEPT);
}
/*
 * Connect to the server.
 */
rdma_stat
rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
{
	ibt_chan_open_args_t	chan_args;	/* channel args */
	ibt_chan_sizes_t	chan_sizes;
	ibt_rc_chan_alloc_args_t	qp_attr;
	ibt_status_t		ibt_status;
	ibt_rc_returns_t	ret_args;	/* conn reject info */
	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
	ibt_ip_cm_info_t	ipcm_info;
	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];

	(void) bzero(&chan_args, sizeof (chan_args));
	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));

	ipcm_info.src_addr.family = rptp->srcip.family;
	switch (ipcm_info.src_addr.family) {
	case AF_INET:
		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
		break;
	case AF_INET6:
		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
		break;
	}

	ipcm_info.dst_addr.family = rptp->srcip.family;
	switch (ipcm_info.dst_addr.family) {
	case AF_INET:
		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
		break;
	case AF_INET6:
		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
		break;
	}

	ipcm_info.src_port = (in_port_t)nfs_rdma_port;
= ibt_format_ip_private_data(&ipcm_info
,
1833 IBT_IP_HDR_PRIV_DATA_SZ
, cmp_ip_pvt
);
1835 if (ibt_status
!= IBT_SUCCESS
) {
1836 cmn_err(CE_WARN
, "ibt_format_ip_private_data failed\n");
1840 qp_attr
.rc_hca_port_num
= rptp
->path
.pi_prim_cep_path
.cep_hca_port_num
;
1841 /* Alloc a RC channel */
1842 qp_attr
.rc_scq
= hca
->clnt_scq
->rib_cq_hdl
;
1843 qp_attr
.rc_rcq
= hca
->clnt_rcq
->rib_cq_hdl
;
1844 qp_attr
.rc_pd
= hca
->pd_hdl
;
1845 qp_attr
.rc_sizes
.cs_sq_sgl
= DSEG_MAX
;
1846 qp_attr
.rc_sizes
.cs_rq_sgl
= RQ_DSEG_MAX
;
1847 qp_attr
.rc_sizes
.cs_sq
= DEF_SQ_SIZE
;
1848 qp_attr
.rc_sizes
.cs_rq
= DEF_RQ_SIZE
;
1849 qp_attr
.rc_clone_chan
= NULL
;
1850 qp_attr
.rc_control
= IBT_CEP_RDMA_RD
| IBT_CEP_RDMA_WR
;
1851 qp_attr
.rc_flags
= IBT_WR_SIGNALED
;
1853 rptp
->path
.pi_sid
= ibt_get_ip_sid(IPPROTO_TCP
, nfs_rdma_port
);
1854 chan_args
.oc_path
= &rptp
->path
;
1856 chan_args
.oc_cm_handler
= rib_clnt_cm_handler
;
1857 chan_args
.oc_cm_clnt_private
= (void *)hca
;
1858 chan_args
.oc_rdma_ra_out
= 4;
1859 chan_args
.oc_rdma_ra_in
= 4;
1860 chan_args
.oc_path_retry_cnt
= 2;
1861 chan_args
.oc_path_rnr_retry_cnt
= RNR_RETRIES
;
1862 chan_args
.oc_priv_data
= cmp_ip_pvt
;
1863 chan_args
.oc_priv_data_len
= IBT_IP_HDR_PRIV_DATA_SZ
;
1866 rw_enter(&hca
->state_lock
, RW_READER
);
1867 if (hca
->state
!= HCA_DETACHED
) {
1868 ibt_status
= ibt_alloc_rc_channel(hca
->hca_hdl
,
1870 &qp_attr
, &qp
->qp_hdl
,
1873 rw_exit(&hca
->state_lock
);
1874 return (RDMA_FAILED
);
1876 rw_exit(&hca
->state_lock
);
1878 if (ibt_status
!= IBT_SUCCESS
) {
1879 DTRACE_PROBE1(rpcib__i_conntosrv
,
1881 return (RDMA_FAILED
);
1884 /* Connect to the Server */
1885 (void) bzero(&ret_args
, sizeof (ret_args
));
1886 mutex_enter(&qp
->cb_lock
);
1887 ibt_status
= ibt_open_rc_channel(qp
->qp_hdl
, IBT_OCHAN_NO_FLAGS
,
1888 IBT_BLOCKING
, &chan_args
, &ret_args
);
1889 if (ibt_status
!= IBT_SUCCESS
) {
1890 DTRACE_PROBE2(rpcib__i_openrctosrv
,
1891 int, ibt_status
, int, ret_args
.rc_status
);
1893 (void) ibt_free_channel(qp
->qp_hdl
);
1895 mutex_exit(&qp
->cb_lock
);
1896 if (refresh
-- && ibt_status
== IBT_CM_FAILURE
&&
1897 ret_args
.rc_status
== IBT_CM_CONN_STALE
) {
1899 * Got IBT_CM_CONN_STALE probably because of stale
1900 * data on the passive end of a channel that existed
1901 * prior to reboot. Retry establishing a channel
1902 * REFRESH_ATTEMPTS times, during which time the
1903 * stale conditions on the server might clear up.
1907 return (RDMA_FAILED
);
1909 mutex_exit(&qp
->cb_lock
);
1911 * Set the private data area to qp to be used in callbacks
1913 ibt_set_chan_private(qp
->qp_hdl
, (void *)qp
);
1914 return (RDMA_SUCCESS
);
rdma_stat
rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
{
	uint_t			i, addr_count;
	ibt_status_t		ibt_status;
	uint8_t			num_paths_p;
	ibt_ip_path_attr_t	ipattr;
	ibt_path_ip_src_t	srcip;
	rpcib_ipaddrs_t		addrs4;
	rpcib_ipaddrs_t		addrs6;
	struct sockaddr_in	*sinp;
	struct sockaddr_in6	*sin6p;
	rdma_stat		retval = RDMA_FAILED;
	rib_hca_t		*hca;

	if ((addr_type != AF_INET) && (addr_type != AF_INET6))
		return (RDMA_INVAL);
	ASSERT(raddr->buf != NULL);

	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));

	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
		retval = RDMA_FAILED;
		goto done2;
	}

	if (addr_type == AF_INET) {
		addr_count = addrs4.ri_count;
		sinp = (struct sockaddr_in *)raddr->buf;
		rptp->dstip.family = AF_INET;
		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
		sinp = addrs4.ri_list;
	} else {
		addr_count = addrs6.ri_count;
		sin6p = (struct sockaddr_in6 *)raddr->buf;
		rptp->dstip.family = AF_INET6;
		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
		sin6p = addrs6.ri_list;
	}

	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_exit(&hca->state_lock);
			continue;
		}

		ipattr.ipa_dst_ip	= &rptp->dstip;
		ipattr.ipa_hca_guid	= hca->hca_guid;
		ipattr.ipa_ndst		= 1;
		ipattr.ipa_max_paths	= 1;
		ipattr.ipa_src_ip.family = rptp->dstip.family;
		for (i = 0; i < addr_count; i++) {
			num_paths_p = 0;
			if (addr_type == AF_INET) {
				ipattr.ipa_src_ip.un.ip4addr =
				    sinp[i].sin_addr.s_addr;
			} else {
				ipattr.ipa_src_ip.un.ip6addr =
				    sin6p[i].sin6_addr;
			}
			bzero(&srcip, sizeof (ibt_path_ip_src_t));

			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
			    &num_paths_p, &srcip);
			if (ibt_status == IBT_SUCCESS &&
			    num_paths_p != 0 &&
			    rptp->path.pi_hca_guid == hca->hca_guid) {
				rptp->hca = hca;
				rw_exit(&hca->state_lock);
				if (addr_type == AF_INET) {
					rptp->srcip.family = AF_INET;
					rptp->srcip.un.ip4addr =
					    srcip.ip_primary.un.ip4addr;
				} else {
					rptp->srcip.family = AF_INET6;
					rptp->srcip.un.ip6addr =
					    srcip.ip_primary.un.ip6addr;
				}
				retval = RDMA_SUCCESS;
				goto done1;
			}
		}
		rw_exit(&hca->state_lock);
	}
done1:
	rw_exit(&rib_stat->hcas_list_lock);
done2:
	if (addrs4.ri_size > 0)
		kmem_free(addrs4.ri_list, addrs4.ri_size);
	if (addrs6.ri_size > 0)
		kmem_free(addrs6.ri_list, addrs6.ri_size);
	return (retval);
}
/*
 * Close channel, remove from connection list and
 * free up resources allocated for that channel.
 */
rdma_stat
rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
{
	rib_qp_t	*qp = ctoqp(conn);
	rib_hca_t	*hca;

	mutex_enter(&conn->c_lock);
	if (conn->c_timeout != NULL) {
		mutex_exit(&conn->c_lock);
		(void) untimeout(conn->c_timeout);
		mutex_enter(&conn->c_lock);
	}

	while (conn->c_flags & C_CLOSE_PENDING) {
		cv_wait(&conn->c_cv, &conn->c_lock);
	}
	mutex_exit(&conn->c_lock);

	/*
	 * c_ref == 0 and connection is in C_DISCONN_PEND
	 */
	hca = qp->hca;
	if (conn_list != NULL)
		(void) rib_rm_conn(conn, conn_list);

	/*
	 * There is only one case where we get here with
	 * qp_hdl = NULL, which is during connection setup on
	 * the client. In such a case there are no posted
	 * send/recv buffers.
	 */
	if (qp->qp_hdl != NULL) {
		mutex_enter(&qp->posted_rbufs_lock);
		while (qp->n_posted_rbufs)
			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
		mutex_exit(&qp->posted_rbufs_lock);

		mutex_enter(&qp->send_rbufs_lock);
		while (qp->n_send_rbufs)
			cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
		mutex_exit(&qp->send_rbufs_lock);

		(void) ibt_free_channel(qp->qp_hdl);
		qp->qp_hdl = NULL;
	}

	ASSERT(qp->rdlist == NULL);

	if (qp->replylist != NULL) {
		(void) rib_rem_replylist(qp);
	}

	cv_destroy(&qp->cb_conn_cv);
	cv_destroy(&qp->posted_rbufs_cv);
	cv_destroy(&qp->send_rbufs_cv);
	mutex_destroy(&qp->cb_lock);
	mutex_destroy(&qp->replylist_lock);
	mutex_destroy(&qp->posted_rbufs_lock);
	mutex_destroy(&qp->send_rbufs_lock);
	mutex_destroy(&qp->rdlist_lock);

	cv_destroy(&conn->c_cv);
	mutex_destroy(&conn->c_lock);

	if (conn->c_raddr.buf != NULL) {
		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
	}
	if (conn->c_laddr.buf != NULL) {
		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
	}
	if (conn->c_netid != NULL) {
		kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
	}
	if (conn->c_addrmask.buf != NULL) {
		kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len);
	}

	/*
	 * Credit control cleanup.
	 */
	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
		rdma_clnt_cred_ctrl_t *cc_info;
		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
		cv_destroy(&cc_info->clnt_cc_cv);
	}

	kmem_free(qp, sizeof (rib_qp_t));

	/*
	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
	 * then the hca is no longer being used.
	 */
	if (conn_list != NULL) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
			if (hca->srv_conn_list.conn_hd == NULL) {
				rw_enter(&hca->cl_conn_list.conn_lock,
				    RW_READER);

				if (hca->cl_conn_list.conn_hd == NULL) {
					mutex_enter(&hca->inuse_lock);
					hca->inuse = FALSE;
					cv_signal(&hca->cb_cv);
					mutex_exit(&hca->inuse_lock);
				}
				rw_exit(&hca->cl_conn_list.conn_lock);
			}
			rw_exit(&hca->srv_conn_list.conn_lock);
		}
		rw_exit(&hca->state_lock);
	}

	return (RDMA_SUCCESS);
}
/*
 * All sends are done under the protection of
 * the wdesc->sendwait_lock. n_send_rbufs count
 * is protected using the send_rbufs_lock.
 * lock ordering is:
 * sendwait_lock -> send_rbufs_lock
 */

void
rib_send_hold(rib_qp_t *qp)
{
	mutex_enter(&qp->send_rbufs_lock);
	qp->n_send_rbufs++;
	mutex_exit(&qp->send_rbufs_lock);
}

void
rib_send_rele(rib_qp_t *qp)
{
	mutex_enter(&qp->send_rbufs_lock);
	qp->n_send_rbufs--;
	if (qp->n_send_rbufs == 0)
		cv_signal(&qp->send_rbufs_cv);
	mutex_exit(&qp->send_rbufs_lock);
}

void
rib_recv_rele(rib_qp_t *qp)
{
	mutex_enter(&qp->posted_rbufs_lock);
	qp->n_posted_rbufs--;
	if (qp->n_posted_rbufs == 0)
		cv_signal(&qp->posted_rbufs_cv);
	mutex_exit(&qp->posted_rbufs_lock);
}
/*
 * Wait for send completion notification. Only on receiving a
 * notification be it a successful or error completion, free the
 * send_wid.
 */
static rdma_stat
rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
{
	clock_t timout, cv_wait_ret;
	rdma_stat error = RDMA_SUCCESS;
	int	i;

	ASSERT(wd != NULL);
	mutex_enter(&wd->sendwait_lock);
	if (wd->status == (uint_t)SEND_WAIT) {
		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
		    ddi_get_lbolt();
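
		/*
		 * The server side waits with cv_timedwait() (not
		 * interruptible by signals, since the waiter is a kernel
		 * service thread), while the client side below uses
		 * cv_timedwait_sig() so that a signal can interrupt a
		 * user thread stuck waiting on a dead connection.
		 */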
		if (qp->mode == RIB_SERVER) {
			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
			    &wd->sendwait_lock, timout)) > 0 &&
			    wd->status == (uint_t)SEND_WAIT)
				;
			switch (cv_wait_ret) {
			case -1:	/* timeout */
				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);

				wd->cv_sig = 0;		/* no signal needed */
				error = RDMA_TIMEDOUT;
				break;
			default:	/* got send completion */
				break;
			}
		} else {
			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
			    &wd->sendwait_lock, timout)) > 0 &&
			    wd->status == (uint_t)SEND_WAIT)
				;
			switch (cv_wait_ret) {
			case -1:	/* timeout */
				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);

				wd->cv_sig = 0;		/* no signal needed */
				error = RDMA_TIMEDOUT;
				break;
			case 0:		/* interrupted */
				DTRACE_PROBE(rpcib__i__clntsendwait__intr);

				wd->cv_sig = 0;		/* no signal needed */
				error = RDMA_INTR;
				break;
			default:	/* got send completion */
				break;
			}
		}
	}

	if (wd->status != (uint_t)SEND_WAIT) {
		/* got send completion */
		if (wd->status != RDMA_SUCCESS) {
			switch (wd->status) {
			case RDMA_CONNLOST:
				error = RDMA_CONNLOST;
				break;
			default:
				error = RDMA_FAILED;
				break;
			}
		}
		for (i = 0; i < wd->nsbufs; i++) {
			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
			    (void *)(uintptr_t)wd->sbufaddr[i]);
		}

		rib_send_rele(qp);

		mutex_exit(&wd->sendwait_lock);
		(void) rib_free_sendwait(wd);
	} else {
		mutex_exit(&wd->sendwait_lock);
	}
	return (error);
}
static struct send_wid *
rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
{
	struct send_wid	*wd;

	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
	wd->xid = xid;
	wd->cv_sig = cv_sig;
	wd->qp = qp;
	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
	wd->status = (uint_t)SEND_WAIT;

	return (wd);
}

static int
rib_free_sendwait(struct send_wid *wdesc)
{
	cv_destroy(&wdesc->wait_cv);
	mutex_destroy(&wdesc->sendwait_lock);
	kmem_free(wdesc, sizeof (*wdesc));

	return (0);
}
static rdma_stat
rib_rem_rep(rib_qp_t *qp, struct reply *rep)
{
	mutex_enter(&qp->replylist_lock);
	if (rep != NULL) {
		(void) rib_remreply(qp, rep);
		mutex_exit(&qp->replylist_lock);
		return (RDMA_SUCCESS);
	}
	mutex_exit(&qp->replylist_lock);
	return (RDMA_FAILED);
}
/*
 * Send buffers are freed here only in case of error in posting
 * on QP. If the post succeeded, the send buffers are freed upon
 * send completion in rib_sendwait() or in the scq_handler.
 */
rdma_stat
rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
	int send_sig, int cv_sig, caddr_t *swid)
{
	struct send_wid	*wdesc;
	struct clist	*clp;
	ibt_status_t	ibt_status = IBT_SUCCESS;
	rdma_stat	ret = RDMA_SUCCESS;
	ibt_send_wr_t	tx_wr;
	int		i, nds;
	ibt_wr_ds_t	sgl[DSEG_MAX];
	uint_t		total_msg_size;
	rib_qp_t	*qp;

	qp = ctoqp(conn);

	ASSERT(cl != NULL);

	bzero(&tx_wr, sizeof (ibt_send_wr_t));

	nds = 0;
	total_msg_size = 0;
	clp = cl;
	while (clp != NULL) {
		if (nds >= DSEG_MAX) {
			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
			return (RDMA_FAILED);
		}
		sgl[nds].ds_va = clp->w.c_saddr;
		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
		sgl[nds].ds_len = clp->c_len;
		total_msg_size += clp->c_len;
		clp = clp->c_next;
		nds++;
	}

	if (send_sig) {
		/* Set SEND_SIGNAL flag. */
		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
		*swid = (caddr_t)wdesc;
		tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
		mutex_enter(&wdesc->sendwait_lock);
		wdesc->nsbufs = nds;
		for (i = 0; i < nds; i++) {
			wdesc->sbufaddr[i] = sgl[i].ds_va;
		}
	} else {
		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
		*swid = NULL;
		tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
	}

	tx_wr.wr_opcode = IBT_WRC_SEND;
	tx_wr.wr_trans = IBT_RC_SRV;
	tx_wr.wr_nds = nds;
	tx_wr.wr_sgl = sgl;

	mutex_enter(&conn->c_lock);
	if (conn->c_state == C_CONNECTED) {
		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
	}
	if (conn->c_state != C_CONNECTED ||
	    ibt_status != IBT_SUCCESS) {
		if (conn->c_state != C_DISCONN_PEND)
			conn->c_state = C_ERROR_CONN;
		mutex_exit(&conn->c_lock);
		if (send_sig) {
			for (i = 0; i < nds; i++) {
				rib_rbuf_free(conn, SEND_BUFFER,
				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
			}
			mutex_exit(&wdesc->sendwait_lock);
			(void) rib_free_sendwait(wdesc);
		}
		return (RDMA_CONNLOST);
	}

	mutex_exit(&conn->c_lock);

	if (send_sig) {
		rib_send_hold(qp);
		mutex_exit(&wdesc->sendwait_lock);
		if (cv_sig) {
			/*
			 * cv_wait for send to complete.
			 * We can fail due to a timeout or signal or
			 * unsuccessful send.
			 */
			ret = rib_sendwait(qp, wdesc);

			return (ret);
		}
	}

	return (RDMA_SUCCESS);
}
rdma_stat
rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
{
	rdma_stat	ret;
	caddr_t		wd;

	/* send-wait & cv_signal */
	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);

	return (ret);
}
/*
 * Deprecated/obsolete interface not used currently
 * but earlier used for READ-READ protocol.
 * Send RPC reply and wait for RDMA_DONE.
 */
rdma_stat
rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
{
	rdma_stat ret = RDMA_SUCCESS;
	struct rdma_done_list *rd;
	clock_t cv_wait_ret;
	caddr_t *wid = NULL;
	rib_qp_t *qp = ctoqp(conn);

	mutex_enter(&qp->rdlist_lock);
	rd = rdma_done_add(qp, msgid);

	/* No cv_signal (whether send-wait or no-send-wait) */
	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);

	if (ret != RDMA_SUCCESS) {
		rdma_done_rm(qp, rd);
	} else {
		/*
		 * Wait for RDMA_DONE from remote end
		 */
		cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
		    &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
		    TR_CLOCK_TICK);

		rdma_done_rm(qp, rd);

		if (cv_wait_ret < 0) {
			ret = RDMA_TIMEDOUT;
		}
	}

	mutex_exit(&qp->rdlist_lock);
	return (ret);
}
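/*
 * Illustrative note, not driver code: the RDMA_DONE handshake used by
 * the obsolete READ-READ protocol pairs rib_send_resp() above with
 * rdma_done_notify() (defined later in this file), keyed by xid:
 *
 *	rd = rdma_done_add(qp, xid);	 - enqueue a waiter for xid
 *	rib_send_and_wait(...);		 - send the reply
 *	cv_reltimedwait(&rd->rdma_done_cv, ...);
 *					 - sleep until the peer's
 *					   RDMA_DONE arrives and
 *					   rdma_done_notify() signals
 *					   the waiter with matching xid
 */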
static struct recv_wid *
rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
{
	struct recv_wid	*rwid;

	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
	rwid->xid = msgid;
	rwid->addr = sgl->ds_va;
	rwid->qp = qp;

	return (rwid);
}

static void
rib_free_wid(struct recv_wid *rwid)
{
	kmem_free(rwid, sizeof (struct recv_wid));
}
static rdma_stat
rib_clnt_post(CONN *conn, struct clist *cl, uint32_t msgid)
{
	rib_qp_t	*qp = ctoqp(conn);
	struct clist	*clp = cl;
	struct reply	*rep;
	struct recv_wid	*rwid;
	int		nds;
	ibt_wr_ds_t	sgl[DSEG_MAX];
	ibt_recv_wr_t	recv_wr;
	rdma_stat	ret;
	ibt_status_t	ibt_status;

	/*
	 * rdma_clnt_postrecv uses RECV_BUFFER.
	 */
	nds = 0;
	while (cl != NULL) {
		if (nds >= DSEG_MAX) {
			ret = RDMA_FAILED;
			goto done;
		}
		sgl[nds].ds_va = cl->w.c_saddr;
		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
		sgl[nds].ds_len = cl->c_len;
		cl = cl->c_next;
		nds++;
	}

	if (nds != 1) {
		ret = RDMA_FAILED;
		goto done;
	}

	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
	recv_wr.wr_nds = nds;
	recv_wr.wr_sgl = sgl;

	rwid = rib_create_wid(qp, &sgl[0], msgid);
	if (rwid) {
		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
	} else {
		ret = RDMA_NORESOURCE;
		goto done;
	}
	rep = rib_addreplylist(qp, msgid);
	if (!rep) {
		rib_free_wid(rwid);
		ret = RDMA_NORESOURCE;
		goto done;
	}

	mutex_enter(&conn->c_lock);

	if (conn->c_state == C_CONNECTED) {
		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
	}

	if (conn->c_state != C_CONNECTED ||
	    ibt_status != IBT_SUCCESS) {
		if (conn->c_state != C_DISCONN_PEND)
			conn->c_state = C_ERROR_CONN;
		mutex_exit(&conn->c_lock);
		rib_free_wid(rwid);
		(void) rib_rem_rep(qp, rep);
		ret = RDMA_CONNLOST;
		goto done;
	}

	mutex_enter(&qp->posted_rbufs_lock);
	qp->n_posted_rbufs++;
	mutex_exit(&qp->posted_rbufs_lock);

	mutex_exit(&conn->c_lock);
	return (RDMA_SUCCESS);

done:
	while (clp != NULL) {
		rib_rbuf_free(conn, RECV_BUFFER,
		    (void *)(uintptr_t)clp->w.c_saddr3);
		clp = clp->c_next;
	}
	return (ret);
}
static rdma_stat
rib_svc_post(CONN *conn, struct clist *cl)
{
	rib_qp_t	*qp = ctoqp(conn);
	struct svc_recv	*s_recvp;
	int		nds;
	ibt_wr_ds_t	sgl[DSEG_MAX];
	ibt_recv_wr_t	recv_wr;
	ibt_status_t	ibt_status;

	nds = 0;
	while (cl != NULL) {
		if (nds >= DSEG_MAX) {
			return (RDMA_FAILED);
		}
		sgl[nds].ds_va = cl->w.c_saddr;
		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
		sgl[nds].ds_len = cl->c_len;
		cl = cl->c_next;
		nds++;
	}

	if (nds != 1) {
		rib_rbuf_free(conn, RECV_BUFFER,
		    (caddr_t)(uintptr_t)sgl[0].ds_va);

		return (RDMA_FAILED);
	}

	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
	recv_wr.wr_nds = nds;
	recv_wr.wr_sgl = sgl;

	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
	/* Use s_recvp's addr as wr id */
	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
	mutex_enter(&conn->c_lock);
	if (conn->c_state == C_CONNECTED) {
		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
	}
	if (conn->c_state != C_CONNECTED ||
	    ibt_status != IBT_SUCCESS) {
		if (conn->c_state != C_DISCONN_PEND)
			conn->c_state = C_ERROR_CONN;
		mutex_exit(&conn->c_lock);
		rib_rbuf_free(conn, RECV_BUFFER,
		    (caddr_t)(uintptr_t)sgl[0].ds_va);
		(void) rib_free_svc_recv(s_recvp);

		return (RDMA_CONNLOST);
	}
	mutex_exit(&conn->c_lock);

	return (RDMA_SUCCESS);
}
/* Client */
rdma_stat
rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid)
{
	return (rib_clnt_post(conn, cl, msgid));
}
/* Client */
rdma_stat
rib_post_resp_remove(CONN *conn, uint32_t msgid)
{
	rib_qp_t	*qp = ctoqp(conn);
	struct reply	*rep;

	mutex_enter(&qp->replylist_lock);
	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
		if (rep->xid == msgid) {
			if (rep->vaddr_cq) {
				rib_rbuf_free(conn, RECV_BUFFER,
				    (caddr_t)(uintptr_t)rep->vaddr_cq);
			}
			(void) rib_remreply(qp, rep);
			break;
		}
	}
	mutex_exit(&qp->replylist_lock);

	return (RDMA_SUCCESS);
}
/* Server */
rdma_stat
rib_post_recv(CONN *conn, struct clist *cl)
{
	rib_qp_t	*qp = ctoqp(conn);

	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
		mutex_enter(&qp->posted_rbufs_lock);
		qp->n_posted_rbufs++;
		mutex_exit(&qp->posted_rbufs_lock);
		return (RDMA_SUCCESS);
	}
	return (RDMA_FAILED);
}
/*
 * Client side only interface to "recv" the rpc reply buf
 * posted earlier by rib_post_resp(conn, cl, msgid).
 */
rdma_stat
rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
{
	struct reply *rep = NULL;
	clock_t timout, cv_wait_ret;
	rdma_stat ret = RDMA_SUCCESS;
	rib_qp_t *qp = ctoqp(conn);

	/*
	 * Find the reply structure for this msgid
	 */
	mutex_enter(&qp->replylist_lock);

	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
		if (rep->xid == msgid)
			break;
	}

	if (rep != NULL) {
		/*
		 * If message not yet received, wait.
		 */
		if (rep->status == (uint_t)REPLY_WAIT) {
			timout = ddi_get_lbolt() +
			    drv_usectohz(REPLY_WAIT_TIME * 1000000);

			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
			    &qp->replylist_lock, timout)) > 0 &&
			    rep->status == (uint_t)REPLY_WAIT)
				;

			switch (cv_wait_ret) {
			case -1:	/* timeout */
				ret = RDMA_TIMEDOUT;
				break;
			case 0:		/* interrupted */
				ret = RDMA_INTR;
				break;
			}
		}

		if (rep->status == RDMA_SUCCESS) {
			struct clist *cl = NULL;

			/*
			 * Got message successfully
			 */
			clist_add(&cl, 0, rep->bytes_xfer, NULL,
			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
			*clp = cl;
		} else {
			if (rep->status != (uint_t)REPLY_WAIT) {
				/*
				 * Got error in reply message. Free
				 * recv buffer here.
				 */
				ret = rep->status;
				rib_rbuf_free(conn, RECV_BUFFER,
				    (caddr_t)(uintptr_t)rep->vaddr_cq);
			}
		}
		(void) rib_remreply(qp, rep);
	} else {
		/*
		 * No matching reply structure found for given msgid on the
		 * reply wait list.
		 */
		ret = RDMA_INVAL;
		DTRACE_PROBE(rpcib__i__nomatchxid2);
	}

	mutex_exit(&qp->replylist_lock);
	return (ret);
}
/*
 * RDMA write a buffer to the remote address.
 */
rdma_stat
rib_write(CONN *conn, struct clist *cl, int wait)
{
	ibt_send_wr_t	tx_wr;
	int		cv_sig;
	ibt_wr_ds_t	sgl[DSEG_MAX];
	struct send_wid	*wdesc;
	ibt_status_t	ibt_status;
	rdma_stat	ret = RDMA_SUCCESS;
	rib_qp_t	*qp = ctoqp(conn);
	uint64_t	n_writes = 0;

	if (cl == NULL) {
		return (RDMA_FAILED);
	}

	while ((cl != NULL)) {
		if (cl->c_len > 0) {
			bzero(&tx_wr, sizeof (ibt_send_wr_t));
			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
			    cl->c_dmemhandle.mrc_rmr; /* rkey */
			sgl[0].ds_va = cl->w.c_saddr;
			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
			sgl[0].ds_len = cl->c_len;

			if (wait) {
				cv_sig = 1;
			} else {
				if (n_writes > max_unsignaled_rws) {
					n_writes = 0;
					cv_sig = 1;
				} else {
					cv_sig = 0;
				}
			}

			if (cv_sig) {
				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
				wdesc = rib_init_sendwait(0, cv_sig, qp);
				tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
				mutex_enter(&wdesc->sendwait_lock);
			} else {
				tx_wr.wr_flags = IBT_WR_NO_FLAGS;
				tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
			}
			tx_wr.wr_opcode = IBT_WRC_RDMAW;
			tx_wr.wr_trans = IBT_RC_SRV;
			tx_wr.wr_nds = 1;
			tx_wr.wr_sgl = sgl;

			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_CONNECTED) {
				ibt_status =
				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
			}
			if (conn->c_state != C_CONNECTED ||
			    ibt_status != IBT_SUCCESS) {
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR_CONN;
				mutex_exit(&conn->c_lock);
				if (cv_sig) {
					mutex_exit(&wdesc->sendwait_lock);
					(void) rib_free_sendwait(wdesc);
				}
				return (RDMA_CONNLOST);
			}

			mutex_exit(&conn->c_lock);

			/*
			 * Wait for send to complete
			 */
			if (cv_sig) {

				mutex_exit(&wdesc->sendwait_lock);

				ret = rib_sendwait(qp, wdesc);
				if (ret != 0)
					return (ret);
			}
			n_writes++;
		}
		cl = cl->c_next;
	}
	return (RDMA_SUCCESS);
}
/*
 * RDMA Read a buffer from the remote address.
 */
rdma_stat
rib_read(CONN *conn, struct clist *cl, int wait)
{
	ibt_send_wr_t	rx_wr;
	int		cv_sig = 0;
	ibt_wr_ds_t	sgl;
	struct send_wid	*wdesc;
	ibt_status_t	ibt_status = IBT_SUCCESS;
	rdma_stat	ret = RDMA_SUCCESS;
	rib_qp_t	*qp = ctoqp(conn);

	if (cl == NULL) {
		return (RDMA_FAILED);
	}

	while (cl != NULL) {
		bzero(&rx_wr, sizeof (ibt_send_wr_t));
		/*
		 * Remote address is at the head chunk item in list.
		 */
		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;

		sgl.ds_va = cl->u.c_daddr;
		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
		sgl.ds_len = cl->c_len;

		/*
		 * If there are multiple chunks to be read, and
		 * wait is set, ask for signal only for the last chunk
		 * and wait only on the last chunk. The completion of
		 * RDMA_READ on last chunk ensures that reads on all
		 * previous chunks are also completed.
		 */
		if (wait && (cl->c_next == NULL)) {
			cv_sig = 1;
			wdesc = rib_init_sendwait(0, cv_sig, qp);
			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
			rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
			mutex_enter(&wdesc->sendwait_lock);
		} else {
			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
			rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
		}
		rx_wr.wr_opcode = IBT_WRC_RDMAR;
		rx_wr.wr_trans = IBT_RC_SRV;
		rx_wr.wr_nds = 1;
		rx_wr.wr_sgl = &sgl;

		mutex_enter(&conn->c_lock);
		if (conn->c_state == C_CONNECTED) {
			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
		}
		if (conn->c_state != C_CONNECTED ||
		    ibt_status != IBT_SUCCESS) {
			if (conn->c_state != C_DISCONN_PEND)
				conn->c_state = C_ERROR_CONN;
			mutex_exit(&conn->c_lock);
			if (wait && (cl->c_next == NULL)) {
				mutex_exit(&wdesc->sendwait_lock);
				(void) rib_free_sendwait(wdesc);
			}
			return (RDMA_CONNLOST);
		}

		mutex_exit(&conn->c_lock);

		/*
		 * Wait for send to complete if this is the
		 * last item in the list.
		 */
		if (wait && cl->c_next == NULL) {
			mutex_exit(&wdesc->sendwait_lock);

			ret = rib_sendwait(qp, wdesc);
			if (ret != 0)
				return (ret);
		}
		cl = cl->c_next;
	}
	return (RDMA_SUCCESS);
}
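/*
 * Illustrative example, not driver code: with a three-chunk clist and
 * wait != 0, rib_read() above posts the work requests as
 *
 *	chunk 1: wr_flags = IBT_WR_NO_FLAGS,    wr_id = RDMA_DUMMY_WRID
 *	chunk 2: wr_flags = IBT_WR_NO_FLAGS,    wr_id = RDMA_DUMMY_WRID
 *	chunk 3: wr_flags = IBT_WR_SEND_SIGNAL, wr_id = wdesc,
 *		 then rib_sendwait(qp, wdesc)
 *
 * RC channels complete RDMA reads in order, so waiting on the last
 * chunk is sufficient for all three.
 */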
/*
 * rib_srv_cm_handler()
 *    Connection Manager callback to handle RC connection requests.
 */
/* ARGSUSED */
static ibt_cm_status_t
rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
	ibt_cm_return_args_t *ret_args, void *priv_data,
	ibt_priv_data_len_t len)
{
	queue_t		*q;
	rib_qp_t	*qp;
	rib_hca_t	*hca;
	rdma_stat	status = RDMA_SUCCESS;
	int		i;
	struct clist	cl;
	rdma_buf_t	rdbuf = {0};
	void		*buf = NULL;
	CONN		*conn;
	ibt_ip_cm_info_t	ipinfo;
	struct sockaddr_in *s;
	struct sockaddr_in6 *s6;
	int sin_size = sizeof (struct sockaddr_in);
	int in_size = sizeof (struct in_addr);
	int sin6_size = sizeof (struct sockaddr_in6);

	ASSERT(any != NULL);
	ASSERT(event != NULL);

	hca = (rib_hca_t *)any;

	/* got a connection request */
	switch (event->cm_type) {
	case IBT_CM_EVENT_REQ_RCV:
		/*
		 * If the plugin is in the NO_ACCEPT state, bail out.
		 */
		mutex_enter(&plugin_state_lock);
		if (plugin_state == NO_ACCEPT) {
			mutex_exit(&plugin_state_lock);
			return (IBT_CM_REJECT);
		}
		mutex_exit(&plugin_state_lock);

		/*
		 * Need to send a MRA MAD to CM so that it does not
		 * timeout on us.
		 */
		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
		    event->cm_event.req.req_timeout * 8, NULL, 0);

		mutex_enter(&rib_stat->open_hca_lock);
		q = rib_stat->q;
		mutex_exit(&rib_stat->open_hca_lock);

		status = rib_svc_create_chan(hca, (caddr_t)q,
		    event->cm_event.req.req_prim_hca_port, &qp);

		if (status) {
			return (IBT_CM_REJECT);
		}

		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;

		/*
		 * Pre-posts RECV buffers
		 */
		conn = qptoc(qp);
		for (i = 0; i < preposted_rbufs; i++) {
			bzero(&rdbuf, sizeof (rdbuf));
			rdbuf.type = RECV_BUFFER;
			buf = rib_rbuf_alloc(conn, &rdbuf);
			if (buf == NULL) {
				/*
				 * A connection is not established yet.
				 * Just flush the channel. Buffers
				 * posted till now will error out with
				 * IBT_WC_WR_FLUSHED_ERR.
				 */
				(void) ibt_flush_channel(qp->qp_hdl);
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}

			bzero(&cl, sizeof (cl));
			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
			cl.c_len = rdbuf.len;
			cl.c_smemhandle.mrc_lmr =
			    rdbuf.handle.mrc_lmr; /* lkey */
			cl.c_next = NULL;
			status = rib_post_recv(conn, &cl);
			if (status != RDMA_SUCCESS) {
				/*
				 * A connection is not established yet.
				 * Just flush the channel. Buffers
				 * posted till now will error out with
				 * IBT_WC_WR_FLUSHED_ERR.
				 */
				(void) ibt_flush_channel(qp->qp_hdl);
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}
		}
		(void) rib_add_connlist(conn, &hca->srv_conn_list);

		/*
		 * Get the address translation
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_exit(&hca->state_lock);
			return (IBT_CM_REJECT);
		}
		rw_exit(&hca->state_lock);

		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));

		if (ibt_get_ip_data(event->cm_priv_data_len,
		    event->cm_priv_data,
		    &ipinfo) != IBT_SUCCESS) {

			return (IBT_CM_REJECT);
		}

		switch (ipinfo.src_addr.family) {
		case AF_INET:

			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
			    KM_SLEEP);
			(void) strcpy(conn->c_netid, RIBNETID_TCP);

			conn->c_raddr.maxlen =
			    conn->c_raddr.len = sin_size;
			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);

			s = (struct sockaddr_in *)conn->c_raddr.buf;
			s->sin_family = AF_INET;
			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
			    &s->sin_addr, in_size);

			conn->c_laddr.maxlen =
			    conn->c_laddr.len = sin_size;
			conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);

			s = (struct sockaddr_in *)conn->c_laddr.buf;
			s->sin_family = AF_INET;
			bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
			    &s->sin_addr, in_size);

			conn->c_addrmask.maxlen = conn->c_addrmask.len =
			    sizeof (struct sockaddr_in);
			conn->c_addrmask.buf =
			    kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
			((struct sockaddr_in *)
			    conn->c_addrmask.buf)->sin_addr.s_addr =
			    (uint32_t)~0;
			((struct sockaddr_in *)
			    conn->c_addrmask.buf)->sin_family =
			    (sa_family_t)~0;
			break;

		case AF_INET6:

			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
			    KM_SLEEP);
			(void) strcpy(conn->c_netid, RIBNETID_TCP6);

			conn->c_raddr.maxlen =
			    conn->c_raddr.len = sin6_size;
			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);

			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
			s6->sin6_family = AF_INET6;
			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
			    &s6->sin6_addr,
			    sizeof (struct in6_addr));

			conn->c_laddr.maxlen =
			    conn->c_laddr.len = sin6_size;
			conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);

			s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
			s6->sin6_family = AF_INET6;
			bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
			    &s6->sin6_addr,
			    sizeof (struct in6_addr));

			conn->c_addrmask.maxlen = conn->c_addrmask.len =
			    sizeof (struct sockaddr_in6);
			conn->c_addrmask.buf =
			    kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
			(void) memset(&((struct sockaddr_in6 *)
			    conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0,
			    sizeof (struct in6_addr));
			((struct sockaddr_in6 *)
			    conn->c_addrmask.buf)->sin6_family =
			    (sa_family_t)~0;
			break;

		default:
			return (IBT_CM_REJECT);
		}

		break;

	case IBT_CM_EVENT_CONN_CLOSED:
	{
		CONN		*conn;
		rib_qp_t	*qp;

		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state.
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_DISCONN_PEND) {
				mutex_exit(&conn->c_lock);
				break;
			}

			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the conn if c_ref goes down to 0
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				(void) rib_disconnect_channel(conn,
				    &hca->srv_conn_list);
			} else {
				/*
				 * conn will be freed when c_ref goes to 0.
				 * Indicate to cleaning thread not to close
				 * the connection, but just free the channel.
				 */
				conn->c_flags |= C_CLOSE_NOTNEEDED;
				mutex_exit(&conn->c_lock);
			}
			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
			break;
		}
		break;
	}
	case IBT_CM_EVENT_CONN_EST:
		/*
		 * RTU received, hence connection established.
		 */
		if (rib_debug > 1)
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "(CONN_EST) channel established");
		break;

	default:
		if (rib_debug > 2) {
			/* Let CM handle the following events. */
			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
			}
		}
		return (IBT_CM_DEFAULT);
	}

	/* accept all other CM messages (i.e. let the CM handle them) */
	return (IBT_CM_ACCEPT);
}
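/*
 * Illustrative summary, not driver code, of the accept path above for
 * an incoming IBT_CM_EVENT_REQ_RCV:
 *
 *	reject if plugin_state == NO_ACCEPT
 *	ibt_cm_delay()			- buy time with an MRA
 *	rib_svc_create_chan()		- create the RC channel
 *	rib_rbuf_alloc()/rib_post_recv() x preposted_rbufs
 *					- pre-post receive buffers
 *	rib_add_connlist()		- track in hca->srv_conn_list
 *	ibt_get_ip_data()		- recover peer/local IP addresses
 *	return IBT_CM_ACCEPT
 */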
static rdma_stat
rib_register_service(rib_hca_t *hca, int service_type,
	uint8_t protocol_num, in_port_t dst_port)
{
	ibt_srv_desc_t		sdesc;
	ibt_hca_portinfo_t	*port_infop;
	ib_svc_id_t		srv_id;
	ibt_srv_hdl_t		srv_hdl;
	uint_t			port_size;
	uint_t			pki, i, num_ports, nbinds;
	ibt_status_t		ibt_status;
	rib_service_t		*service;
	ib_pkey_t		pkey;

	/*
	 * Query all ports for the given HCA
	 */
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
		    &num_ports, &port_size);
		rw_exit(&hca->state_lock);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}
	if (ibt_status != IBT_SUCCESS) {
		return (RDMA_FAILED);
	}

	DTRACE_PROBE1(rpcib__i__regservice_numports,
	    int, num_ports);

	for (i = 0; i < num_ports; i++) {
		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
			    int, i+1);
		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
			DTRACE_PROBE1(rpcib__i__regservice__portactive,
			    int, i+1);
		}
	}

	/*
	 * Get all the IP addresses on this system to register the
	 * given "service type" on all DNS recognized IP addrs.
	 * Each service type such as NFS will have all the systems
	 * IP addresses as its different names. For now the only
	 * type of service we support in RPCIB is NFS.
	 */
	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
	/*
	 * Start registering and binding service to active
	 * on active ports on this HCA.
	 */
	nbinds = 0;
	for (service = rib_stat->service_list;
	    service && (service->srv_type != service_type);
	    service = service->next)
		;

	if (service == NULL) {
		/*
		 * We use IP addresses as the service names for
		 * service registration. Register each of them
		 * with CM to obtain a svc_id and svc_hdl. We do not
		 * register the service with machine's loopback address.
		 */
		(void) bzero(&srv_id, sizeof (ib_svc_id_t));
		(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
		(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
		sdesc.sd_handler = rib_srv_cm_handler;
		sdesc.sd_flags = 0;
		ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
		    &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
		    1, &srv_hdl, &srv_id);
		if ((ibt_status != IBT_SUCCESS) &&
		    (ibt_status != IBT_CM_SERVICE_EXISTS)) {
			rw_exit(&rib_stat->service_list_lock);
			DTRACE_PROBE1(rpcib__i__regservice__ibtres,
			    int, ibt_status);
			ibt_free_portinfo(port_infop, port_size);
			return (RDMA_FAILED);
		}

		/*
		 * Allocate and prepare a service entry
		 */
		service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);

		service->srv_type = service_type;
		service->srv_hdl = srv_hdl;
		service->srv_id = srv_id;

		service->next = rib_stat->service_list;
		rib_stat->service_list = service;
		DTRACE_PROBE1(rpcib__i__regservice__new__service,
		    int, service->srv_type);
	} else {
		srv_hdl = service->srv_hdl;
		srv_id = service->srv_id;
		DTRACE_PROBE1(rpcib__i__regservice__existing__service,
		    int, service->srv_type);
	}

	for (i = 0; i < num_ports; i++) {
		ibt_sbind_hdl_t		sbp;
		rib_hca_service_t	*hca_srv;
		ib_gid_t		gid;

		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
			continue;

		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
			pkey = port_infop[i].p_pkey_tbl[pki];

			rw_enter(&hca->bound_services_lock, RW_READER);
			gid = port_infop[i].p_sgid_tbl[0];
			for (hca_srv = hca->bound_services; hca_srv;
			    hca_srv = hca_srv->next) {
				if ((hca_srv->srv_id == service->srv_id) &&
				    (hca_srv->gid.gid_prefix ==
				    gid.gid_prefix) &&
				    (hca_srv->gid.gid_guid == gid.gid_guid))
					break;
			}
			rw_exit(&hca->bound_services_lock);
			if (hca_srv != NULL) {
				/*
				 * port is already bound to the service
				 */
				DTRACE_PROBE1(
				    rpcib__i__regservice__already__bound,
				    int, i+1);
				nbinds++;
				continue;
			}

			if ((pkey & IBSRM_HB) &&
			    (pkey != IB_PKEY_INVALID_FULL)) {

				sbp = NULL;
				ibt_status = ibt_bind_service(srv_hdl,
				    gid, NULL, hca, &sbp);

				if (ibt_status == IBT_SUCCESS) {
					hca_srv = kmem_zalloc(
					    sizeof (rib_hca_service_t),
					    KM_SLEEP);
					hca_srv->srv_id = srv_id;
					hca_srv->gid = gid;
					hca_srv->sbind_hdl = sbp;

					rw_enter(&hca->bound_services_lock,
					    RW_WRITER);
					hca_srv->next = hca->bound_services;
					hca->bound_services = hca_srv;
					rw_exit(&hca->bound_services_lock);
					nbinds++;
				}

				DTRACE_PROBE1(rpcib__i__regservice__bindres,
				    int, ibt_status);
			}
		}
	}
	rw_exit(&rib_stat->service_list_lock);

	ibt_free_portinfo(port_infop, port_size);

	if (nbinds == 0) {
		return (RDMA_FAILED);
	} else {
		/*
		 * Put this plugin into accept state, since at least
		 * one registration was successful.
		 */
		mutex_enter(&plugin_state_lock);
		plugin_state = ACCEPT;
		mutex_exit(&plugin_state_lock);
		return (RDMA_SUCCESS);
	}
}
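/*
 * Illustrative note, not driver code: the only current caller,
 * rib_listen() below, registers NFS as
 *
 *	rib_register_service(hca, NFS, IPPROTO_TCP, nfs_rdma_port);
 *
 * ibt_get_ip_sid(protocol, port) derives the IB service id from the
 * protocol/port pair, so the same (protocol, port) maps to the same
 * service id on every HCA, and the service is then bound once per
 * active port GID.
 */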
static void
rib_listen(struct rdma_svc_data *rd)
{
	rdma_stat status;
	int n_listening = 0;
	rib_hca_t *hca;

	mutex_enter(&rib_stat->listen_lock);
	/*
	 * if rd parameter is NULL then it means that rib_stat->q is
	 * already initialized by a call from RDMA and we just want to
	 * add a newly attached HCA to the same listening state as other
	 * HCAs.
	 */
	if (rd == NULL) {
		if (rib_stat->q == NULL) {
			mutex_exit(&rib_stat->listen_lock);
			return;
		}
	} else {
		rib_stat->q = &rd->q;
	}
	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		/*
		 * First check if a hca is still attached
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state != HCA_INITED) {
			rw_exit(&hca->state_lock);
			continue;
		}
		rw_exit(&hca->state_lock);

		/*
		 * Right now the only service type is NFS. Hence
		 * force feed this value. Ideally to communicate
		 * the service type it should be passed down in
		 * rdma_svc_data.
		 */
		status = rib_register_service(hca, NFS,
		    IPPROTO_TCP, nfs_rdma_port);
		if (status == RDMA_SUCCESS)
			n_listening++;
	}
	rw_exit(&rib_stat->hcas_list_lock);

	/*
	 * Service active on an HCA, check rd->err_code for a more
	 * detailed error.
	 */
	if (rd) {
		if (n_listening > 0) {
			rd->active = 1;
			rd->err_code = RDMA_SUCCESS;
		} else {
			rd->active = 0;
			rd->err_code = RDMA_FAILED;
		}
	}
	mutex_exit(&rib_stat->listen_lock);
}
/* ARGSUSED */
static void
rib_listen_stop(struct rdma_svc_data *svcdata)
{
	rib_hca_t		*hca;

	mutex_enter(&rib_stat->listen_lock);
	/*
	 * KRPC called the RDMATF to stop the listeners, this means
	 * stop sending incoming or received requests to KRPC master
	 * transport handle for RDMA-IB. This also means that the
	 * master transport handle, responsible for us, is going away.
	 */
	mutex_enter(&plugin_state_lock);
	plugin_state = NO_ACCEPT;
	if (svcdata != NULL)
		svcdata->active = 0;
	mutex_exit(&plugin_state_lock);

	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		/*
		 * First check if a hca is still attached
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_exit(&hca->state_lock);
			continue;
		}
		rib_close_channels(&hca->srv_conn_list);
		rib_stop_services(hca);
		rw_exit(&hca->state_lock);
	}
	rw_exit(&rib_stat->hcas_list_lock);

	/*
	 * Avoid rib_listen() using the stale q field.
	 * This could happen if a port goes up after all services
	 * are already unregistered.
	 */
	rib_stat->q = NULL;
	mutex_exit(&rib_stat->listen_lock);
}
/*
 * Traverse the HCA's service list to unbind and deregister services.
 * For each bound service of HCA to be removed, first find the corresponding
 * service handle (srv_hdl) and then unbind the service by calling
 * ibt_unbind_service().
 */
static void
rib_stop_services(rib_hca_t *hca)
{
	rib_hca_service_t *srv_list, *to_remove;

	/*
	 * unbind and deregister the services for this service type.
	 * Right now there is only one service type. In future it will
	 * be passed down to this function.
	 */
	rw_enter(&hca->bound_services_lock, RW_READER);
	srv_list = hca->bound_services;
	hca->bound_services = NULL;
	rw_exit(&hca->bound_services_lock);

	while (srv_list != NULL) {
		rib_service_t *sc;

		to_remove = srv_list;
		srv_list = to_remove->next;
		rw_enter(&rib_stat->service_list_lock, RW_READER);
		for (sc = rib_stat->service_list;
		    sc && (sc->srv_id != to_remove->srv_id);
		    sc = sc->next)
			;
		/*
		 * if sc is NULL then the service doesn't exist anymore,
		 * probably just removed completely through rib_stat.
		 */
		if (sc != NULL)
			(void) ibt_unbind_service(sc->srv_hdl,
			    to_remove->sbind_hdl);
		rw_exit(&rib_stat->service_list_lock);
		kmem_free(to_remove, sizeof (rib_hca_service_t));
	}
}
static struct svc_recv *
rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
{
	struct svc_recv	*recvp;

	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
	recvp->vaddr = sgl->ds_va;
	recvp->qp = qp;
	recvp->bytes_xfer = 0;
	return (recvp);
}

static int
rib_free_svc_recv(struct svc_recv *recvp)
{
	kmem_free(recvp, sizeof (*recvp));

	return (0);
}
static struct reply *
rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
{
	struct reply	*rep;

	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
	if (rep == NULL) {
		DTRACE_PROBE(rpcib__i__addrreply__nomem);
		return (NULL);
	}
	rep->xid = msgid;
	rep->vaddr_cq = (uintptr_t)NULL;
	rep->bytes_xfer = 0;
	rep->status = (uint_t)REPLY_WAIT;
	rep->prev = NULL;
	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);

	mutex_enter(&qp->replylist_lock);
	if (qp->replylist) {
		rep->next = qp->replylist;
		qp->replylist->prev = rep;
	}
	qp->rep_list_size++;

	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
	    int, qp->rep_list_size);

	qp->replylist = rep;
	mutex_exit(&qp->replylist_lock);

	return (rep);
}
static rdma_stat
rib_rem_replylist(rib_qp_t *qp)
{
	struct reply	*r, *n;

	mutex_enter(&qp->replylist_lock);
	for (r = qp->replylist; r != NULL; r = n) {
		n = r->next;
		(void) rib_remreply(qp, r);
	}
	mutex_exit(&qp->replylist_lock);

	return (RDMA_SUCCESS);
}
static int
rib_remreply(rib_qp_t *qp, struct reply *rep)
{

	ASSERT(MUTEX_HELD(&qp->replylist_lock));
	if (rep->prev) {
		rep->prev->next = rep->next;
	}
	if (rep->next) {
		rep->next->prev = rep->prev;
	}
	if (qp->replylist == rep)
		qp->replylist = rep->next;

	cv_destroy(&rep->wait_cv);
	qp->rep_list_size--;

	DTRACE_PROBE1(rpcib__i__remreply__listsize,
	    int, qp->rep_list_size);

	kmem_free(rep, sizeof (*rep));

	return (0);
}
rdma_stat
rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
	struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;
	rib_hca_t	*hca = (ctoqp(conn))->hca;

	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
	 */
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = (uintptr_t)NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}
static rdma_stat
rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
	ibt_mr_flags_t spec,
	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
{
	ibt_mr_attr_t	mem_attr;
	ibt_status_t	ibt_status;
	mem_attr.mr_vaddr = (uintptr_t)buf;
	mem_attr.mr_len = (ib_msglen_t)size;
	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
	    IBT_MR_ENABLE_WINDOW_BIND | spec;

	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
		    &mem_attr, mr_hdlp, mr_descp);
		rw_exit(&hca->state_lock);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}

	if (ibt_status != IBT_SUCCESS) {
		return (RDMA_FAILED);
	}
	return (RDMA_SUCCESS);
}
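/*
 * Illustrative note, not driver code: a registration made through
 * rib_reg_mem() yields mr_desc.md_lkey/md_rkey. The lkey is placed in
 * a clist's c_smemhandle.mrc_lmr for local SGL entries, while the rkey
 * travels to the peer in c_dmemhandle.mrc_rmr so its RDMA reads and
 * writes can target this buffer (see rib_read()/rib_write() above).
 */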
rdma_stat
rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	rib_lrc_entry_t *l;
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;
	rib_hca_t	*hca = (ctoqp(conn))->hca;

	/*
	 * Non-coherent memory registration.
	 */
	l = (rib_lrc_entry_t *)lrc;
	if (l) {
		if (l->registered) {
			buf_handle->mrc_linfo =
			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
			buf_handle->mrc_lmr =
			    (uint32_t)l->lrc_mhandle.mrc_lmr;
			buf_handle->mrc_rmr =
			    (uint32_t)l->lrc_mhandle.mrc_rmr;
			*sync_handle = (RIB_SYNCMEM_HANDLE)
			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
			return (RDMA_SUCCESS);
		} else {
			/* Always register the whole buffer */
			buf = (caddr_t)l->lrc_buf;
			buflen = l->lrc_len;
		}
	}
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);

	if (status == RDMA_SUCCESS) {
		if (l) {
			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
			l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey;
			l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey;
			l->registered = TRUE;
		}
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
	} else {
		buf_handle->mrc_linfo = (uintptr_t)NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}
/* ARGSUSED */
rdma_stat
rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
{
	rib_hca_t *hca = (ctoqp(conn))->hca;
	/*
	 * Allow memory deregistration even if HCA is
	 * getting detached. Need all outstanding
	 * memory registrations to be deregistered
	 * before HCA_DETACH_EVENT can be accepted.
	 */
	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}
/* ARGSUSED */
rdma_stat
rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
	RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
{
	rib_lrc_entry_t *l;

	l = (rib_lrc_entry_t *)lrc;
	if (l)
		if (l->registered)
			return (RDMA_SUCCESS);

	(void) rib_deregistermem(conn, buf, buf_handle);

	return (RDMA_SUCCESS);
}
/* ARGSUSED */
rdma_stat
rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
	int len, int cpu)
{
	ibt_status_t	status;
	rib_hca_t *hca = (ctoqp(conn))->hca;
	ibt_mr_sync_t	mr_segment;

	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
	mr_segment.ms_len = (ib_memlen_t)len;
	if (cpu) {
		/* make incoming data visible to memory */
		mr_segment.ms_flags = IBT_SYNC_WRITE;
	} else {
		/* make memory changes visible to IO */
		mr_segment.ms_flags = IBT_SYNC_READ;
	}
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
		rw_exit(&hca->state_lock);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}

	if (status == IBT_SUCCESS)
		return (RDMA_SUCCESS);
	else {
		return (RDMA_FAILED);
	}
}
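/*
 * Illustrative example, not driver code: callers pick the sync
 * direction with the cpu argument above. After incoming RDMA data
 * lands in a buffer, sync it for the CPU before parsing:
 *
 *	(void) rib_syncmem(conn, handle, buf, len, 1);	- IBT_SYNC_WRITE
 *
 * and before handing a CPU-filled buffer to the HCA for I/O:
 *
 *	(void) rib_syncmem(conn, handle, buf, len, 0);	- IBT_SYNC_READ
 */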
static rdma_stat
rib_getinfo(rdma_info_t *info)
{
	info->mts = 1000000;
	info->mtu = 1000000;

	return (RDMA_SUCCESS);
}
static rib_bufpool_t *
rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
{
	rib_bufpool_t	*rbp = NULL;
	bufpool_t	*bp = NULL;
	caddr_t		buf;
	ibt_mr_attr_t	mem_attr;
	ibt_status_t	ibt_status;
	int		i, j;

	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);

	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
	    num * sizeof (void *), KM_SLEEP);

	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
	bp->numelems = num;

	switch (ptype) {
	case SEND_BUFFER:
		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
		bp->rsize = RPC_MSG_SZ;
		break;
	case RECV_BUFFER:
		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
		bp->rsize = RPC_BUF_SIZE;
		break;
	default:
		goto fail;
	}

	/*
	 * Register the pool.
	 */
	bp->bufsize = num * bp->rsize;
	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
	    sizeof (ibt_mr_desc_t), KM_SLEEP);
	rw_enter(&hca->state_lock, RW_READER);

	if (hca->state == HCA_DETACHED) {
		rw_exit(&hca->state_lock);
		goto fail;
	}

	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
		mem_attr.mr_vaddr = (uintptr_t)buf;
		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
		mem_attr.mr_as = NULL;
		ibt_status = ibt_register_mr(hca->hca_hdl,
		    hca->pd_hdl, &mem_attr,
		    &rbp->mr_hdl[i],
		    &rbp->mr_desc[i]);
		if (ibt_status != IBT_SUCCESS) {
			for (j = 0; j < i; j++) {
				(void) ibt_deregister_mr(hca->hca_hdl,
				    rbp->mr_hdl[j]);
			}
			rw_exit(&hca->state_lock);
			goto fail;
		}
	}
	rw_exit(&hca->state_lock);
	buf = (caddr_t)bp->buf;
	for (i = 0; i < num; i++, buf += bp->rsize) {
		bp->buflist[i] = (void *)buf;
	}
	bp->buffree = num - 1;	/* no. of free buffers */
	rbp->bpool = bp;

	return (rbp);
fail:
	if (bp) {
		if (bp->buf)
			kmem_free(bp->buf, bp->bufsize);
		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
	}
	if (rbp) {
		if (rbp->mr_hdl)
			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
		if (rbp->mr_desc)
			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
		kmem_free(rbp, sizeof (rib_bufpool_t));
	}
	return (NULL);
}
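/*
 * Illustrative arithmetic, not driver code (counts hypothetical): with
 * num = 32 receive buffers and bp->rsize = RPC_BUF_SIZE (2K), the pool
 * is one 64K slab (bp->bufsize = 32 * 2K) carved into 32 fixed slots,
 * each registered with the HCA once at pool-create time, so per-I/O
 * buffer allocation never has to call ibt_register_mr().
 */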
static void
rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
{
	int i;
	rib_bufpool_t *rbp = NULL;
	bufpool_t *bp;

	/*
	 * Obtain pool address based on type of pool
	 */
	switch (ptype) {
	case SEND_BUFFER:
		rbp = hca->send_pool;
		break;
	case RECV_BUFFER:
		rbp = hca->recv_pool;
		break;
	default:
		return;
	}
	if (rbp == NULL)
		return;

	bp = rbp->bpool;

	/*
	 * Deregister the pool memory and free it.
	 */
	for (i = 0; i < bp->numelems; i++) {
		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
	}
}
static void
rib_rbufpool_free(rib_hca_t *hca, int ptype)
{
	rib_bufpool_t *rbp = NULL;
	bufpool_t *bp;

	/*
	 * Obtain pool address based on type of pool
	 */
	switch (ptype) {
	case SEND_BUFFER:
		rbp = hca->send_pool;
		break;
	case RECV_BUFFER:
		rbp = hca->recv_pool;
		break;
	default:
		return;
	}
	if (rbp == NULL)
		return;

	bp = rbp->bpool;

	/*
	 * Free the pool memory.
	 */
	if (rbp->mr_hdl)
		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
	if (rbp->mr_desc)
		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
	if (bp->buf)
		kmem_free(bp->buf, bp->bufsize);
	mutex_destroy(&bp->buflock);
	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
	kmem_free(rbp, sizeof (rib_bufpool_t));
}
void
rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
{
	/*
	 * Deregister the pool memory and free it.
	 */
	rib_rbufpool_deregister(hca, ptype);
	rib_rbufpool_free(hca, ptype);
}
/*
 * Fetch a buffer from the pool of type specified in rdbuf->type.
 */
static rdma_stat
rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
{
	rib_lrc_entry_t *rlep;

	if (rdbuf->type == RDMA_LONG_BUFFER) {
		rlep = rib_get_cache_buf(conn, rdbuf->len);
		rdbuf->rb_private = (caddr_t)rlep;
		rdbuf->addr = rlep->lrc_buf;
		rdbuf->handle = rlep->lrc_mhandle;
		return (RDMA_SUCCESS);
	}

	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
	if (rdbuf->addr) {
		switch (rdbuf->type) {
		case SEND_BUFFER:
			rdbuf->len = RPC_MSG_SZ;	/* 1K */
			break;
		case RECV_BUFFER:
			rdbuf->len = RPC_BUF_SIZE;	/* 2K */
			break;
		default:
			rdbuf->len = 0;
		}
		return (RDMA_SUCCESS);
	} else
		return (RDMA_FAILED);
}
/*
 * Fetch a buffer of specified type.
 * Note that rdbuf->handle is mw's rkey.
 */
static void *
rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
{
	rib_qp_t	*qp = ctoqp(conn);
	rib_hca_t	*hca = qp->hca;
	rdma_btype	ptype = rdbuf->type;
	void		*buf;
	rib_bufpool_t	*rbp = NULL;
	bufpool_t	*bp;
	int		i;

	/*
	 * Obtain pool address based on type of pool
	 */
	switch (ptype) {
	case SEND_BUFFER:
		rbp = hca->send_pool;
		break;
	case RECV_BUFFER:
		rbp = hca->recv_pool;
		break;
	default:
		return (NULL);
	}
	if (rbp == NULL)
		return (NULL);

	bp = rbp->bpool;

	mutex_enter(&bp->buflock);
	if (bp->buffree < 0) {
		mutex_exit(&bp->buflock);
		return (NULL);
	}

	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
	buf = bp->buflist[bp->buffree];
	rdbuf->addr = buf;
	rdbuf->len = bp->rsize;
	for (i = bp->numelems - 1; i >= 0; i--) {
		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
			rdbuf->handle.mrc_rmr =
			    (uint32_t)rbp->mr_desc[i].md_rkey;
			rdbuf->handle.mrc_linfo =
			    (uintptr_t)rbp->mr_hdl[i];
			rdbuf->handle.mrc_lmr =
			    (uint32_t)rbp->mr_desc[i].md_lkey;
			bp->buffree--;

			mutex_exit(&bp->buflock);

			return (buf);
		}
	}

	mutex_exit(&bp->buflock);

	return (NULL);
}
static void
rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
{

	if (rdbuf->type == RDMA_LONG_BUFFER) {
		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
		rdbuf->rb_private = NULL;
		return;
	}
	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
}
static void
rib_rbuf_free(CONN *conn, int ptype, void *buf)
{
	rib_qp_t *qp = ctoqp(conn);
	rib_hca_t *hca = qp->hca;
	rib_bufpool_t *rbp = NULL;
	bufpool_t *bp;

	/*
	 * Obtain pool address based on type of pool
	 */
	switch (ptype) {
	case SEND_BUFFER:
		rbp = hca->send_pool;
		break;
	case RECV_BUFFER:
		rbp = hca->recv_pool;
		break;
	default:
		return;
	}
	if (rbp == NULL)
		return;

	bp = rbp->bpool;

	mutex_enter(&bp->buflock);
	if (++bp->buffree >= bp->numelems) {
		/*
		 * Should never happen
		 */
		bp->buffree--;
	} else {
		bp->buflist[bp->buffree] = buf;
	}
	mutex_exit(&bp->buflock);
}
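/*
 * Illustrative note, not driver code: bp->buflist plus bp->buffree
 * form a LIFO free stack. rib_rbuf_alloc() hands out
 * buflist[buffree] and decrements buffree; rib_rbuf_free() increments
 * buffree and stores the buffer back, so the most recently freed
 * (cache-warm) buffer is reused first.
 */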
static rdma_stat
rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
{
	rw_enter(&connlist->conn_lock, RW_WRITER);
	if (connlist->conn_hd) {
		cn->c_next = connlist->conn_hd;
		connlist->conn_hd->c_prev = cn;
	}
	connlist->conn_hd = cn;
	rw_exit(&connlist->conn_lock);

	return (RDMA_SUCCESS);
}
static rdma_stat
rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
{
	rw_enter(&connlist->conn_lock, RW_WRITER);
	if (cn->c_prev) {
		cn->c_prev->c_next = cn->c_next;
	}
	if (cn->c_next) {
		cn->c_next->c_prev = cn->c_prev;
	}
	if (connlist->conn_hd == cn)
		connlist->conn_hd = cn->c_next;
	rw_exit(&connlist->conn_lock);

	return (RDMA_SUCCESS);
}
/* ARGSUSED */
rdma_stat
rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
    int addr_type, void *handle, CONN **conn)
{
	rdma_stat status;
	rpcib_ping_t rpt;

	status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
	return (status);
}
/*
 * rib_find_hca_connection
 *
 * if there is an existing connection to the specified address then
 * it will be returned in conn, otherwise conn will be set to NULL.
 * Also cleans up any connection that is in error state.
 */
static int
rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
    struct netbuf *d_svcaddr, CONN **conn)
{
	CONN *cn;
	clock_t cv_stat, timout;

	*conn = NULL;
again:
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	cn = hca->cl_conn_list.conn_hd;
	while (cn != NULL) {
		/*
		 * First, clear up any connection in the ERROR state
		 */
		mutex_enter(&cn->c_lock);
		if (cn->c_state == C_ERROR_CONN) {
			if (cn->c_ref == 0) {
				/*
				 * Remove connection from list and destroy it.
				 */
				cn->c_state = C_DISCONN_PEND;
				mutex_exit(&cn->c_lock);
				rw_exit(&hca->cl_conn_list.conn_lock);
				rib_conn_close((void *)cn);
				goto again;
			}
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}
		if (cn->c_state == C_DISCONN_PEND) {
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}

		/*
		 * source address is only checked for if there is one,
		 * this is the case for retries.
		 */
		if ((cn->c_raddr.len == d_svcaddr->len) &&
		    (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
		    d_svcaddr->len) == 0) &&
		    ((s_svcaddr->len == 0) ||
		    ((cn->c_laddr.len == s_svcaddr->len) &&
		    (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
		    s_svcaddr->len) == 0)))) {
			/*
			 * Our connection. Give up conn list lock
			 * as we are done traversing the list.
			 */
			rw_exit(&hca->cl_conn_list.conn_lock);
			if (cn->c_state == C_CONNECTED) {
				cn->c_ref++;	/* sharing a conn */
				mutex_exit(&cn->c_lock);
				*conn = cn;
				return (RDMA_SUCCESS);
			}
			if (cn->c_state == C_CONN_PEND) {
				/*
				 * Hold a reference to this conn before
				 * we give up the lock.
				 */
				cn->c_ref++;
				timout = ddi_get_lbolt() +
				    drv_usectohz(CONN_WAIT_TIME * 1000000);
				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
				    &cn->c_lock, timout)) > 0 &&
				    cn->c_state == C_CONN_PEND)
					;
				if (cv_stat == 0) {
					(void) rib_conn_release_locked(cn);
					return (RDMA_INTR);
				}
				if (cv_stat < 0) {
					(void) rib_conn_release_locked(cn);
					return (RDMA_TIMEDOUT);
				}
				if (cn->c_state == C_CONNECTED) {
					*conn = cn;
					mutex_exit(&cn->c_lock);
					return (RDMA_SUCCESS);
				} else {
					(void) rib_conn_release_locked(cn);
					return (RDMA_TIMEDOUT);
				}
			}
		}
		mutex_exit(&cn->c_lock);
		cn = cn->c_next;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	*conn = NULL;
	return (RDMA_FAILED);
}
/*
 * Connection management.
 * IBTF does not support recycling of channels. So connections are only
 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
 * C_DISCONN_PEND state. No C_IDLE state.
 * C_CONN_PEND state: Connection establishment in progress to the server.
 * C_CONNECTED state: A connection when created is in C_CONNECTED state.
 * It has an RC channel associated with it. ibt_post_send/recv are allowed
 * only in this state.
 * C_ERROR_CONN state: A connection transitions to this state when WRs on the
 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
 * c_ref drops to 0 (this indicates that RPC has no more references to this
 * connection), the connection should be destroyed. A connection transitions
 * into this state when it is being destroyed.
 */
/* ARGSUSED */
static rdma_stat
rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
    int addr_type, rpcib_ping_t *rpt, CONN **conn)
{
	CONN *cn;
	int status;
	rib_hca_t *hca;
	rib_qp_t *qp;
	int s_addr_len;
	char *s_addr_buf;

	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state != HCA_DETACHED) {
			status = rib_find_hca_connection(hca, s_svcaddr,
			    d_svcaddr, conn);
			rw_exit(&hca->state_lock);
			if ((status == RDMA_INTR) ||
			    (status == RDMA_SUCCESS)) {
				rw_exit(&rib_stat->hcas_list_lock);
				return (status);
			}
		} else
			rw_exit(&hca->state_lock);
	}
	rw_exit(&rib_stat->hcas_list_lock);

	/*
	 * No existing connection found, establish a new connection.
	 */
	bzero(rpt, sizeof (rpcib_ping_t));

	status = rib_ping_srv(addr_type, d_svcaddr, rpt);
	if (status != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}
	hca = rpt->hca;

	if (rpt->srcip.family == AF_INET) {
		s_addr_len = sizeof (rpt->srcip.un.ip4addr);
		s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
	} else if (rpt->srcip.family == AF_INET6) {
		s_addr_len = sizeof (rpt->srcip.un.ip6addr);
		s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
	} else {
		return (RDMA_FAILED);
	}

	/*
	 * Channel to server doesn't exist yet, create one.
	 */
	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}
	cn = qptoc(qp);
	cn->c_state = C_CONN_PEND;
	cn->c_ref = 1;

	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;

	if (rpt->srcip.family == AF_INET) {
		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
		(void) strcpy(cn->c_netid, RIBNETID_TCP);

		cn->c_addrmask.len = cn->c_addrmask.maxlen =
		    sizeof (struct sockaddr_in);
		cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);

		((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
		    (uint32_t)~0;
		((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
		    (sa_family_t)~0;
	} else {
		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
		(void) strcpy(cn->c_netid, RIBNETID_TCP6);

		cn->c_addrmask.len = cn->c_addrmask.maxlen =
		    sizeof (struct sockaddr_in6);
		cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);

		(void) memset(
		    &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
		    (uchar_t)~0, sizeof (struct in6_addr));
		((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
		    (sa_family_t)~0;
	}

	/*
	 * Add to conn list.
	 * We had given up the READER lock. In the time since then,
	 * another thread might have created the connection we are
	 * trying here. But for now, that is quite all right - there
	 * might be two connections between a pair of hosts instead
	 * of one. If we really want to close that window,
	 * then need to check the list after acquiring the
	 * WRITER lock.
	 */
	(void) rib_add_connlist(cn, &hca->cl_conn_list);
	status = rib_conn_to_srv(hca, qp, rpt);
	mutex_enter(&cn->c_lock);

	if (cn->c_flags & C_CLOSE_PENDING) {
		/*
		 * This handles a case where the module or
		 * HCA detached in the time a connection is
		 * established. In such a case close the
		 * connection immediately if this is the
		 * only reference.
		 */
		if (cn->c_ref == 1) {
			cn->c_ref--;
			cn->c_state = C_DISCONN_PEND;
			mutex_exit(&cn->c_lock);
			rib_conn_close((void *)cn);
			return (RDMA_FAILED);
		}

		/*
		 * Connection to be closed later when c_ref = 0
		 */
		status = RDMA_FAILED;
	}

	if (status == RDMA_SUCCESS) {
		cn->c_state = C_CONNECTED;
		*conn = cn;
	} else {
		cn->c_state = C_ERROR_CONN;
		cn->c_ref--;
	}
	cv_signal(&cn->c_cv);
	mutex_exit(&cn->c_lock);
	return (status);
}
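/*
 * Illustrative state diagram, not driver code, for the connection
 * state machine described above rib_connect():
 *
 *	C_CONN_PEND --connect ok--> C_CONNECTED
 *	     |                           |
 *	     +--connect fails--+   WR error, CONN_CLOSED,
 *	                       |   or HCA detach
 *	                       v         v
 *	                      C_ERROR_CONN
 *	                           |
 *	                    c_ref drops to 0
 *	                           v
 *	                    C_DISCONN_PEND --> destroyed
 */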
static void
rib_conn_close(void *rarg)
{
	CONN *conn = (CONN *)rarg;
	rib_qp_t *qp = ctoqp(conn);

	mutex_enter(&conn->c_lock);
	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {

		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);

		/*
		 * Live connection in CONNECTED state.
		 */
		if (conn->c_state == C_CONNECTED) {
			conn->c_state = C_ERROR_CONN;
		}
		mutex_exit(&conn->c_lock);

		rib_close_a_channel(conn);

		mutex_enter(&conn->c_lock);
		conn->c_flags &= ~C_CLOSE_PENDING;
	}

	mutex_exit(&conn->c_lock);

	if (qp->mode == RIB_SERVER)
		(void) rib_disconnect_channel(conn,
		    &qp->hca->srv_conn_list);
	else
		(void) rib_disconnect_channel(conn,
		    &qp->hca->cl_conn_list);
}
static void
rib_conn_timeout_call(void *carg)
{
	time_t idle_time;
	CONN *conn = (CONN *)carg;
	rib_hca_t *hca = ctoqp(conn)->hca;
	int error;

	mutex_enter(&conn->c_lock);
	if ((conn->c_ref > 0) ||
	    (conn->c_state == C_DISCONN_PEND)) {
		conn->c_timeout = NULL;
		mutex_exit(&conn->c_lock);
		return;
	}

	idle_time = (gethrestime_sec() - conn->c_last_used);

	if ((idle_time <= rib_conn_timeout) &&
	    (conn->c_state != C_ERROR_CONN)) {
		/*
		 * There was activity after the last timeout.
		 * Extend the conn life. Unless the conn is
		 * already in error state.
		 */
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(rib_conn_timeout - idle_time));
		mutex_exit(&conn->c_lock);
		return;
	}

	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
	    (void *)conn, DDI_NOSLEEP);

	/*
	 * If taskq dispatch fails above, then reset the timeout
	 * to try again after 10 secs.
	 */

	if (error != DDI_SUCCESS) {
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
		mutex_exit(&conn->c_lock);
		return;
	}

	conn->c_state = C_DISCONN_PEND;
	mutex_exit(&conn->c_lock);
}
rdma_stat
rib_conn_release(CONN *conn)
{
	mutex_enter(&conn->c_lock);
	return (rib_conn_release_locked(conn));
}

/*
 * Expects conn->c_lock to be held on entry.
 * c_lock released on return
 */
static rdma_stat
rib_conn_release_locked(CONN *conn)
{
	conn->c_ref--;

	conn->c_last_used = gethrestime_sec();
	if (conn->c_ref > 0) {
		mutex_exit(&conn->c_lock);
		return (RDMA_SUCCESS);
	}

	/*
	 * If a conn is C_ERROR_CONN, close the channel.
	 */
	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
		conn->c_state = C_DISCONN_PEND;
		mutex_exit(&conn->c_lock);
		rib_conn_close((void *)conn);
		return (RDMA_SUCCESS);
	}

	/*
	 * c_ref == 0, set a timeout for conn release
	 */

	if (conn->c_timeout == NULL) {
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(rib_conn_timeout));
	}

	mutex_exit(&conn->c_lock);
	return (RDMA_SUCCESS);
}
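/*
 * Illustrative caller pattern, not driver code: the RDMA transport
 * typically brackets I/O with a get/release pair,
 *
 *	if (rib_conn_get(&src, &dst, addr_type, hdl, &conn) ==
 *	    RDMA_SUCCESS) {
 *		... rib_send()/rib_recv() on conn ...
 *		(void) rib_conn_release(conn);
 *	}
 *
 * so an idle connection only starts its reap timeout once the last
 * reference is released.
 */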
/*
 * Add at front of list
 */
static struct rdma_done_list *
rdma_done_add(rib_qp_t *qp, uint32_t xid)
{
	struct rdma_done_list *rd;

	ASSERT(MUTEX_HELD(&qp->rdlist_lock));

	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
	rd->xid = xid;
	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);

	rd->prev = NULL;
	rd->next = qp->rdlist;
	if (qp->rdlist != NULL)
		qp->rdlist->prev = rd;
	qp->rdlist = rd;

	return (rd);
}
static void
rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
{
	struct rdma_done_list *r;

	ASSERT(MUTEX_HELD(&qp->rdlist_lock));

	r = rd->next;
	if (r != NULL) {
		r->prev = rd->prev;
	}

	r = rd->prev;
	if (r != NULL) {
		r->next = rd->next;
	} else {
		qp->rdlist = rd->next;
	}

	cv_destroy(&rd->rdma_done_cv);
	kmem_free(rd, sizeof (*rd));
}
static void
rdma_done_rem_list(rib_qp_t *qp)
{
	struct rdma_done_list	*r, *n;

	mutex_enter(&qp->rdlist_lock);
	for (r = qp->rdlist; r != NULL; r = n) {
		n = r->next;
		rdma_done_rm(qp, r);
	}
	mutex_exit(&qp->rdlist_lock);
}
static void
rdma_done_notify(rib_qp_t *qp, uint32_t xid)
{
	struct rdma_done_list *r = qp->rdlist;

	ASSERT(MUTEX_HELD(&qp->rdlist_lock));

	while (r) {
		if (r->xid == xid) {
			cv_signal(&r->rdma_done_cv);
			return;
		} else {
			r = r->next;
		}
	}
	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
	    int, xid);
}
/*
 * Expects conn->c_lock to be held by the caller.
 */

static void
rib_close_a_channel(CONN *conn)
{
	rib_qp_t	*qp;
	qp = ctoqp(conn);

	if (qp->qp_hdl == NULL) {
		/* channel already freed */
		return;
	}

	/*
	 * Call ibt_close_rc_channel in blocking mode
	 * with no callbacks.
	 */
	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
	    NULL, 0, NULL, NULL, 0);
}
/*
 * Goes through all connections and closes the channel
 * This will cause all the WRs on those channels to be
 * flushed.
 */
static void
rib_close_channels(rib_conn_list_t *connlist)
{
	CONN 		*conn, *tmp;

	rw_enter(&connlist->conn_lock, RW_READER);
	conn = connlist->conn_hd;
	while (conn != NULL) {
		mutex_enter(&conn->c_lock);
		tmp = conn->c_next;
		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {

			if (conn->c_state == C_CONN_PEND) {
				conn->c_flags |= C_CLOSE_PENDING;
				goto next;
			}

			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);

			/*
			 * Live connection in CONNECTED state.
			 */
			if (conn->c_state == C_CONNECTED)
				conn->c_state = C_ERROR_CONN;
			mutex_exit(&conn->c_lock);

			rib_close_a_channel(conn);

			mutex_enter(&conn->c_lock);
			conn->c_flags &= ~C_CLOSE_PENDING;
			/* Signal a pending rib_disconnect_channel() */
			cv_signal(&conn->c_cv);
		}
next:
		mutex_exit(&conn->c_lock);
		conn = tmp;
	}
	rw_exit(&connlist->conn_lock);
}
/*
 * Frees up all connections that are no longer being referenced
 */
static void
rib_purge_connlist(rib_conn_list_t *connlist)
{
	CONN 		*conn;

top:
	rw_enter(&connlist->conn_lock, RW_READER);
	conn = connlist->conn_hd;
	while (conn != NULL) {
		mutex_enter(&conn->c_lock);

		/*
		 * At this point connection is either in ERROR
		 * or DISCONN_PEND state. If in DISCONN_PEND state
		 * then some other thread is culling that connection.
		 * If not and if c_ref is 0, then destroy the connection.
		 */
		if (conn->c_ref == 0 &&
		    conn->c_state != C_DISCONN_PEND) {
			/*
			 * Cull the connection
			 */
			conn->c_state = C_DISCONN_PEND;
			mutex_exit(&conn->c_lock);
			rw_exit(&connlist->conn_lock);
			(void) rib_disconnect_channel(conn, connlist);
			goto top;
		} else {
			/*
			 * conn disconnect already scheduled or will
			 * happen from conn_release when c_ref drops to 0.
			 */
			mutex_exit(&conn->c_lock);
		}
		conn = conn->c_next;
	}
	rw_exit(&connlist->conn_lock);

	/*
	 * At this point, only connections with c_ref != 0 are on the list
	 */
}
4799 rib_free_hca(rib_hca_t
*hca
)
4801 (void) ibt_free_cq(hca
->clnt_rcq
->rib_cq_hdl
);
4802 (void) ibt_free_cq(hca
->clnt_scq
->rib_cq_hdl
);
4803 (void) ibt_free_cq(hca
->svc_rcq
->rib_cq_hdl
);
4804 (void) ibt_free_cq(hca
->svc_scq
->rib_cq_hdl
);
4806 kmem_free(hca
->clnt_rcq
, sizeof (rib_cq_t
));
4807 kmem_free(hca
->clnt_scq
, sizeof (rib_cq_t
));
4808 kmem_free(hca
->svc_rcq
, sizeof (rib_cq_t
));
4809 kmem_free(hca
->svc_scq
, sizeof (rib_cq_t
));
4811 rib_rbufpool_destroy(hca
, RECV_BUFFER
);
4812 rib_rbufpool_destroy(hca
, SEND_BUFFER
);
4813 rib_destroy_cache(hca
);
4814 if (rib_mod
.rdma_count
== 0)
4815 (void) rdma_unregister_mod(&rib_mod
);
4816 (void) ibt_free_pd(hca
->hca_hdl
, hca
->pd_hdl
);
4817 (void) ibt_close_hca(hca
->hca_hdl
);
4818 hca
->hca_hdl
= NULL
;
static void
rib_stop_hca_services(rib_hca_t *hca)
{
	rib_stop_services(hca);
	rib_close_channels(&hca->cl_conn_list);
	rib_close_channels(&hca->srv_conn_list);

	rib_purge_connlist(&hca->cl_conn_list);
	rib_purge_connlist(&hca->srv_conn_list);

	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
		    GLOBAL_ZONEID);
		stats_enabled = FALSE;
	}

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_free_hca(hca);
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);

		rib_free_hca(hca);
	}
	rw_destroy(&hca->bound_services_lock);

	if (hca->cleanup_helper != NULL) {
		ddi_taskq_destroy(hca->cleanup_helper);
		hca->cleanup_helper = NULL;
	}
}

/*
 * Cleans and closes up all uses of the HCA
 */
static void
rib_detach_hca(ibt_hca_hdl_t hca_hdl)
{
	rib_hca_t *hca = NULL;
	rib_hca_t **hcap;

	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
	for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
		hca = *hcap;
		rw_enter(&hca->state_lock, RW_WRITER);
		if (hca->hca_hdl == hca_hdl) {
			/*
			 * Mark as detached and remove from
			 * hca list.
			 */
			hca->state = HCA_DETACHED;
			*hcap = hca->next;
			rib_stat->nhca_inited--;
			rib_mod.rdma_count--;
			rw_exit(&hca->state_lock);
			break;
		}
		rw_exit(&hca->state_lock);
	}
	rw_exit(&rib_stat->hcas_list_lock);

	if (hca == NULL)
		return;
	ASSERT(hca->hca_hdl == hca_hdl);

	/*
	 * Stop all services on the HCA
	 * Go through cl_conn_list and close all rc_channels
	 * Go through srv_conn_list and close all rc_channels
	 * Free connections whose c_ref has dropped to 0
	 * Deregister and release all buffer pool memory after all
	 * connections are destroyed
	 * Free the protection domain
	 */
	rib_stop_hca_services(hca);

	kmem_free(hca, sizeof (*hca));
}
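
/*
 * Empty the server-side long reply cache for `hca': walk the AVL tree,
 * deregistering and freeing every queued buffer, then free the nodes
 * themselves.  Invoked from rib_destroy_cache(), and presumably also
 * registered as the kmem cache reclaim callback, per its name.
 */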
static void
rib_server_side_cache_reclaim(void *argp)
{
	cache_avl_struct_t	*rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t		*hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_first(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);

			hca->cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
		rcas = avl_first(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}
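
/*
 * Trim the cache back under cache_limit.  Unlike the reclaim above,
 * this frees the largest-length nodes first (avl_last()) and stops as
 * soon as cache_allocation drops below the limit.
 */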
static void
rib_server_side_cache_cleanup(void *argp)
{
	cache_avl_struct_t	*rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t		*hca = (rib_hca_t *)argp;

	mutex_enter(&hca->cache_allocation_lock);
	if (hca->cache_allocation < cache_limit) {
		mutex_exit(&hca->cache_allocation_lock);
		return;
	}
	mutex_exit(&hca->cache_allocation_lock);

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_last(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);
			hca->cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		if (hca->server_side_cache) {
			kmem_cache_free(hca->server_side_cache, rcas);
		}

		if (hca->cache_allocation < cache_limit) {
			rw_exit(&hca->avl_rw_lock);
			return;
		}

		rcas = avl_last(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}
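
/*
 * AVL comparator for the buffer cache: orders nodes by buffer length,
 * returning the -1/0/1 contract that avl_create() expects.
 */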
static int
avl_compare(const void *t1, const void *t2)
{
	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
		return (0);

	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
		return (-1);

	return (1);
}
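
/*
 * Tear down the buffer cache for `hca': drain it via
 * rib_server_side_cache_reclaim(), then destroy the kmem cache, the
 * AVL tree, and the associated locks.
 */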
static void
rib_destroy_cache(rib_hca_t *hca)
{
	if (hca->avl_init) {
		rib_server_side_cache_reclaim((void *)hca);
		if (hca->server_side_cache) {
			kmem_cache_destroy(hca->server_side_cache);
			hca->server_side_cache = NULL;
		}
		avl_destroy(&hca->avl_tree);
		mutex_destroy(&hca->cache_allocation_lock);
		rw_destroy(&hca->avl_rw_lock);
	}
	hca->avl_init = FALSE;
}
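
/*
 * Kick off an asynchronous cache trim on the cleanup taskq.  The
 * dispatch is DDI_NOSLEEP so the caller, which may hold
 * hca->avl_rw_lock, never blocks; if the dispatch fails, the trim
 * simply waits for the next miss above the limit.
 */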
static void
rib_force_cleanup(void *hca)
{
	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
		(void) ddi_taskq_dispatch(
		    ((rib_hca_t *)hca)->cleanup_helper,
		    rib_server_side_cache_cleanup,
		    (void *)hca, DDI_NOSLEEP);
}
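
/*
 * Get a buffer of `len' bytes for a long reply.  Free buffers are
 * cached in an AVL tree keyed on buffer length (see avl_compare());
 * each tree node (cache_avl_struct_t) anchors a circular queue
 * (rcas->r) of same-sized rib_lrc_entry_t entries.  A hit dequeues the
 * front entry; a miss allocates a fresh buffer; and if the total
 * cached bytes would cross cache_limit, an asynchronous trim is kicked
 * off and the buffer is allocated outside the cache (the error_alloc
 * path, marked by avl_node == NULL).
 */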
static rib_lrc_entry_t *
rib_get_cache_buf(CONN *conn, uint32_t len)
{
	cache_avl_struct_t	cas, *rcas;
	rib_hca_t		*hca = (ctoqp(conn))->hca;
	rib_lrc_entry_t		*reply_buf;
	avl_index_t		where = (uintptr_t)NULL;
	uint64_t		c_alloc = 0;

	if (!hca->avl_init)
		goto error_alloc;

	cas.len = len;

	rw_enter(&hca->avl_rw_lock, RW_READER);

	mutex_enter(&hca->cache_allocation_lock);
	c_alloc = hca->cache_allocation;
	mutex_exit(&hca->cache_allocation_lock);

	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
	    &where)) == NULL) {
		/* Am I above the cache limit */
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			mutex_enter(&hca->cache_allocation_lock);
			hca->cache_misses_above_the_limit++;
			mutex_exit(&hca->cache_allocation_lock);

			/* Allocate and register the buffer directly */
			goto error_alloc;
		}

		rw_exit(&hca->avl_rw_lock);
		rw_enter(&hca->avl_rw_lock, RW_WRITER);

		/* Recheck to make sure no other thread added the entry in */
		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
		    &cas, &where)) == NULL) {
			/* Allocate an avl tree entry */
			rcas = (cache_avl_struct_t *)
			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);

			bzero(rcas, sizeof (cache_avl_struct_t));
			rcas->elements = 0;
			rcas->r.forw = &rcas->r;
			rcas->r.back = &rcas->r;
			rcas->len = len;
			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
			avl_insert(&hca->avl_tree, rcas, where);
		}
	}

	mutex_enter(&rcas->node_lock);

	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
		reply_buf = rcas->r.forw;
		remque(reply_buf);
		rcas->elements--;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);

		mutex_enter(&hca->cache_allocation_lock);
		hca->cache_hits++;
		hca->cache_allocation -= len;
		mutex_exit(&hca->cache_allocation_lock);
	} else {
		/* Am I above the cache limit */
		mutex_exit(&rcas->node_lock);
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);

			mutex_enter(&hca->cache_allocation_lock);
			hca->cache_misses_above_the_limit++;
			mutex_exit(&hca->cache_allocation_lock);
			/* Allocate and register the buffer directly */
			goto error_alloc;
		}
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation_lock);
		hca->cache_misses++;
		mutex_exit(&hca->cache_allocation_lock);
		/* Allocate a reply_buf entry */
		reply_buf = (rib_lrc_entry_t *)
		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
		reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
		reply_buf->lrc_len = len;
		reply_buf->registered = FALSE;
		reply_buf->avl_node = (void *)rcas;
	}

	return (reply_buf);

error_alloc:
	reply_buf = (rib_lrc_entry_t *)
	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
	reply_buf->lrc_len = len;
	reply_buf->registered = FALSE;
	reply_buf->avl_node = NULL;

	return (reply_buf);
}

/*
 * Return a pre-registered buffer back to the cache (without
 * unregistering it).
 */
static void
rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
{
	cache_avl_struct_t	cas, *rcas;
	avl_index_t		where = (uintptr_t)NULL;
	rib_hca_t		*hca = (ctoqp(conn))->hca;

	if (!hca->avl_init)
		goto error_free;

	cas.len = reg_buf->lrc_len;
	rw_enter(&hca->avl_rw_lock, RW_READER);
	if ((rcas = (cache_avl_struct_t *)
	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
		rw_exit(&hca->avl_rw_lock);
		goto error_free;
	} else {
		cas.len = reg_buf->lrc_len;
		mutex_enter(&rcas->node_lock);
		insque(reg_buf, &rcas->r);
		rcas->elements++;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation_lock);
		hca->cache_allocation += cas.len;
		mutex_exit(&hca->cache_allocation_lock);
	}

	return;

error_free:

	if (reg_buf->registered)
		(void) rib_deregistermem_via_hca(hca,
		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
}
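
/*
 * Register `buf' with the HCA and return the keys in `buf_handle':
 * mrc_linfo carries the local ibt_mr_hdl_t; mrc_lmr and mrc_rmr carry
 * the lkey and rkey used in subsequent local and remote work requests.
 */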
static rdma_stat
rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;

	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
	 */
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = (uintptr_t)NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
{
	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
	return (RDMA_SUCCESS);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
{
	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}

/*
 * Check if the IP interface named by `lifrp' is RDMA-capable.
 */
static boolean_t
rpcib_rdma_capable_interface(struct lifreq *lifrp)
{
	char ifname[LIFNAMSIZ];
	char *cp;

	if (lifrp->lifr_type == IFT_IB)
		return (B_TRUE);

	/*
	 * Strip off the logical interface portion before getting
	 * intimate with the name.
	 */
	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
	if ((cp = strchr(ifname, ':')) != NULL)
		*cp = '\0';

	return (strcmp("lo0", ifname) == 0);
}
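
/*
 * Push the transparent ioctl `cmd' into IP: open /dev/udp through the
 * kernel TLI interface, issue an I_STR ioctl against its stream, and
 * clean up.  Returns 0 or an errno.
 */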
static int
rpcib_do_ip_ioctl(int cmd, int len, void *arg)
{
	vnode_t *kkvp, *vp;
	TIUSER	*tiptr;
	struct	strioctl iocb;
	k_sigset_t smask;
	int	err = 0;

	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
		if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
		    &tiptr, CRED()) == 0) {
			vp = tiptr->fp->f_vnode;
		} else {
			VN_RELE(kkvp);
			return (EPROTO);
		}
	} else {
		return (EPROTO);
	}

	iocb.ic_cmd = cmd;
	iocb.ic_timout = 0;
	iocb.ic_len = len;
	iocb.ic_dp = (caddr_t)arg;
	sigintr(&smask, 0);
	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
	sigunintr(&smask);
	(void) t_kclose(tiptr, 0);
	VN_RELE(kkvp);
	return (err);
}

/*
 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
 */
static int
rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
{
	int err;
	struct lifnum lifn;

	bzero(&lifn, sizeof (struct lifnum));
	lifn.lifn_family = AF_UNSPEC;

	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
	if (err != 0)
		return (err);

	/*
	 * Pad the interface count to account for additional interfaces that
	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;

	bzero(lifcp, sizeof (struct lifconf));
	lifcp->lifc_family = AF_UNSPEC;
	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);

	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
	if (err != 0) {
		kmem_free(lifcp->lifc_buf, *bufsizep);
		return (err);
	}
	return (0);
}
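
/*
 * Gather the IPv4 and IPv6 addresses of all RDMA-capable interfaces
 * into `addrs4' and `addrs6'.  The ri_list buffers are allocated here,
 * sized for the worst case; freeing them is presumably left to the
 * caller.
 */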
static boolean_t
rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
{
	uint_t i, nifs;
	uint_t bufsize;
	struct lifconf lifc;
	struct lifreq *lifrp;
	struct sockaddr_in *sinp;
	struct sockaddr_in6 *sin6p;

	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
	bzero(addrs6, sizeof (rpcib_ipaddrs_t));

	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
		return (B_FALSE);

	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
		kmem_free(lifc.lifc_buf, bufsize);
		return (B_FALSE);
	}

	/*
	 * Worst case is that all of the addresses are IB-capable and have
	 * the same address family, so size our buffers accordingly.
	 */
	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);

	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
		if (!rpcib_rdma_capable_interface(lifrp))
			continue;

		if (lifrp->lifr_addr.ss_family == AF_INET) {
			sinp = addrs4->ri_list;
			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
			    sizeof (struct sockaddr_in));
		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
			sin6p = addrs6->ri_list;
			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
			    sizeof (struct sockaddr_in6));
		}
	}

	kmem_free(lifc.lifc_buf, bufsize);
	return (B_TRUE);
}
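
/*
 * Update callback for the "rpcib_cache" kstat: writes are rejected
 * with EACCES; reads sum the per-HCA cache counters into rpcib_kstat.
 */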
/* ARGSUSED */
static int
rpcib_cache_kstat_update(kstat_t *ksp, int rw)
{
	rib_hca_t *hca;

	if (KSTAT_WRITE == rw) {
		return (EACCES);
	}

	rpcib_kstat.cache_limit.value.ui64 =
	    (uint64_t)cache_limit;
	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		rpcib_kstat.cache_allocation.value.ui64 +=
		    (uint64_t)hca->cache_allocation;
		rpcib_kstat.cache_hits.value.ui64 +=
		    (uint64_t)hca->cache_hits;
		rpcib_kstat.cache_misses.value.ui64 +=
		    (uint64_t)hca->cache_misses;
		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
		    (uint64_t)hca->cache_misses_above_the_limit;
	}
	rw_exit(&rib_stat->hcas_list_lock);
	return (0);
}