Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / usr / src / uts / common / rpc / rpcib.c
blobbd3fef91015f015057a7e5e15a6fe785193a4374
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
27 * Copyright (c) 2007, The Ohio State University. All rights reserved.
29 * Portions of this source code is developed by the team members of
30 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31 * headed by Professor Dhabaleswar K. (DK) Panda.
33 * Acknowledgements to contributions from developers:
34 * Ranjit Noronha: noronha@cse.ohio-state.edu
35 * Lei Chai : chail@cse.ohio-state.edu
36 * Weikuan Yu : yuw@cse.ohio-state.edu
41 * The rpcib plugin. Implements the interface for RDMATF's
42 * interaction with IBTF.
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
85 #define NFS_RDMA_PORT 20049
89 * Convenience structures for connection management
91 typedef struct rpcib_ipaddrs {
92 void *ri_list; /* pointer to list of addresses */
93 uint_t ri_count; /* number of addresses in list */
94 uint_t ri_size; /* size of ri_list in bytes */
95 } rpcib_ipaddrs_t;
98 typedef struct rpcib_ping {
99 rib_hca_t *hca;
100 ibt_path_info_t path;
101 ibt_ip_addr_t srcip;
102 ibt_ip_addr_t dstip;
103 } rpcib_ping_t;
106 * Prototype declarations for driver ops
108 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
109 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
110 void *, void **);
111 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
113 static int rpcib_do_ip_ioctl(int, int, void *);
114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
115 static int rpcib_cache_kstat_update(kstat_t *, int);
116 static void rib_force_cleanup(void *);
117 static void rib_stop_hca_services(rib_hca_t *);
118 static void rib_attach_hca(void);
119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
120 struct netbuf *d_svcaddr, CONN **conn);
122 struct {
123 kstat_named_t cache_limit;
124 kstat_named_t cache_allocation;
125 kstat_named_t cache_hits;
126 kstat_named_t cache_misses;
127 kstat_named_t cache_misses_above_the_limit;
128 } rpcib_kstat = {
129 {"cache_limit", KSTAT_DATA_UINT64 },
130 {"cache_allocation", KSTAT_DATA_UINT64 },
131 {"cache_hits", KSTAT_DATA_UINT64 },
132 {"cache_misses", KSTAT_DATA_UINT64 },
133 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
136 /* rpcib cb_ops */
137 static struct cb_ops rpcib_cbops = {
138 nulldev, /* open */
139 nulldev, /* close */
140 nodev, /* strategy */
141 nodev, /* print */
142 nodev, /* dump */
143 nodev, /* read */
144 nodev, /* write */
145 nodev, /* ioctl */
146 nodev, /* devmap */
147 nodev, /* mmap */
148 nodev, /* segmap */
149 nochpoll, /* poll */
150 ddi_prop_op, /* prop_op */
151 NULL, /* stream */
152 D_MP, /* cb_flag */
153 CB_REV, /* rev */
154 nodev, /* int (*cb_aread)() */
155 nodev /* int (*cb_awrite)() */
159 * Device options
161 static struct dev_ops rpcib_ops = {
162 DEVO_REV, /* devo_rev, */
163 0, /* refcnt */
164 rpcib_getinfo, /* info */
165 nulldev, /* identify */
166 nulldev, /* probe */
167 rpcib_attach, /* attach */
168 rpcib_detach, /* detach */
169 nodev, /* reset */
170 &rpcib_cbops, /* driver ops - devctl interfaces */
171 NULL, /* bus operations */
172 NULL, /* power */
173 ddi_quiesce_not_needed, /* quiesce */
177 * Module linkage information.
180 static struct modldrv rib_modldrv = {
181 &mod_driverops, /* Driver module */
182 "RPCIB plugin driver", /* Driver name and version */
183 &rpcib_ops, /* Driver ops */
186 static struct modlinkage rib_modlinkage = {
187 MODREV_1,
188 (void *)&rib_modldrv,
189 NULL
192 typedef struct rib_lrc_entry {
193 struct rib_lrc_entry *forw;
194 struct rib_lrc_entry *back;
195 char *lrc_buf;
197 uint32_t lrc_len;
198 void *avl_node;
199 bool_t registered;
201 struct mrc lrc_mhandle;
202 bool_t lrc_on_freed_list;
203 } rib_lrc_entry_t;
205 typedef struct cache_struct {
206 rib_lrc_entry_t r;
207 uint32_t len;
208 uint32_t elements;
209 kmutex_t node_lock;
210 avl_node_t avl_link;
211 } cache_avl_struct_t;
213 uint64_t cache_limit = 100 * 1024 * 1024;
214 static uint64_t cache_watermark = 80 * 1024 * 1024;
215 static bool_t stats_enabled = FALSE;
217 static uint64_t max_unsignaled_rws = 5;
218 int nfs_rdma_port = NFS_RDMA_PORT;
220 #define RIBNETID_TCP "tcp"
221 #define RIBNETID_TCP6 "tcp6"
224 * rib_stat: private data pointer used when registering
225 * with the IBTF. It is returned to the consumer
226 * in all callbacks.
228 static rpcib_state_t *rib_stat = NULL;
230 #define RNR_RETRIES IBT_RNR_RETRY_1
231 #define MAX_PORTS 2
232 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D
233 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */
235 int preposted_rbufs = RDMA_BUFS_GRANT;
236 int send_threshold = 1;
239 * Old cards with Tavor driver have limited memory footprint
240 * when booted in 32bit. The rib_max_rbufs tunable can be
241 * tuned for more buffers if needed.
244 #if !defined(_ELF64) && !defined(__sparc)
245 int rib_max_rbufs = MAX_BUFS;
246 #else
247 int rib_max_rbufs = 10 * MAX_BUFS;
248 #endif /* !(_ELF64) && !(__sparc) */
250 int rib_conn_timeout = 60 * 12; /* 12 minutes */
253 * State of the plugin.
254 * ACCEPT = accepting new connections and requests.
255 * NO_ACCEPT = not accepting new connection and requests.
256 * This should eventually move to rpcib_state_t structure, since this
257 * will tell in which state the plugin is for a particular type of service
258 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
259 * state for one and in no_accept state for the other.
261 int plugin_state;
262 kmutex_t plugin_state_lock;
264 ldi_ident_t rpcib_li;
267 * RPCIB RDMATF operations
269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
270 static rdma_stat rib_disconnect(CONN *conn);
271 static void rib_listen(struct rdma_svc_data *rd);
272 static void rib_listen_stop(struct rdma_svc_data *rd);
273 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
274 uint_t buflen, struct mrc *buf_handle);
275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
276 struct mrc buf_handle);
277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
278 caddr_t buf, uint_t buflen, struct mrc *buf_handle);
279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
280 struct mrc buf_handle);
281 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
282 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
283 void *lrc);
284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
285 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
287 caddr_t buf, int len, int cpu);
289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
305 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
306 int addr_type, void *, CONN **);
307 static rdma_stat rib_conn_release(CONN *conn);
308 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
309 rpcib_ping_t *, CONN **);
310 static rdma_stat rib_getinfo(rdma_info_t *info);
312 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
313 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
314 static void rib_destroy_cache(rib_hca_t *hca);
315 static void rib_server_side_cache_reclaim(void *argp);
316 static int avl_compare(const void *t1, const void *t2);
318 static void rib_stop_services(rib_hca_t *);
319 static void rib_close_channels(rib_conn_list_t *);
320 static void rib_conn_close(void *);
321 static void rib_recv_rele(rib_qp_t *);
322 static rdma_stat rib_conn_release_locked(CONN *conn);
325 * RPCIB addressing operations
329 * RDMA operations the RPCIB module exports
331 static rdmaops_t rib_ops = {
332 rib_reachable,
333 rib_conn_get,
334 rib_conn_release,
335 rib_listen,
336 rib_listen_stop,
337 rib_registermem,
338 rib_deregistermem,
339 rib_registermemsync,
340 rib_deregistermemsync,
341 rib_syncmem,
342 rib_reg_buf_alloc,
343 rib_reg_buf_free,
344 rib_send,
345 rib_send_resp,
346 rib_post_resp,
347 rib_post_resp_remove,
348 rib_post_recv,
349 rib_recv,
350 rib_read,
351 rib_write,
352 rib_getinfo,
356 * RDMATF RPCIB plugin details
358 static rdma_mod_t rib_mod = {
359 "ibtf", /* api name */
360 RDMATF_VERS_1,
362 &rib_ops, /* rdma op vector for ibtf */
365 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
366 static rdma_stat rib_qp_init(rib_qp_t *, int);
367 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
368 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
369 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
370 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
371 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
372 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
373 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
374 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
375 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
376 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
377 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
378 rib_qp_t **);
379 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
380 rib_qp_t **);
381 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
382 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
383 static int rib_free_sendwait(struct send_wid *);
384 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
385 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
386 static void rdma_done_rem_list(rib_qp_t *);
387 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
389 static void rib_async_handler(void *,
390 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
391 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
392 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
393 static int rib_free_svc_recv(struct svc_recv *);
394 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
395 static void rib_free_wid(struct recv_wid *);
396 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
397 static void rib_detach_hca(ibt_hca_hdl_t);
398 static void rib_close_a_channel(CONN *);
399 static void rib_send_hold(rib_qp_t *);
400 static void rib_send_rele(rib_qp_t *);
403 * Registration with IBTF as a consumer
405 static struct ibt_clnt_modinfo_s rib_modinfo = {
406 IBTI_V_CURR,
407 IBT_GENERIC,
408 rib_async_handler, /* async event handler */
409 NULL, /* Memory Region Handler */
410 "nfs/ib"
414 * Global strucuture
417 typedef struct rpcib_s {
418 dev_info_t *rpcib_dip;
419 kmutex_t rpcib_mutex;
420 } rpcib_t;
422 rpcib_t rpcib;
425 * /etc/system controlled variable to control
426 * debugging in rpcib kernel module.
427 * Set it to values greater than 1 to control
428 * the amount of debugging messages required.
430 int rib_debug = 0;
433 _init(void)
435 int error;
437 error = mod_install((struct modlinkage *)&rib_modlinkage);
438 if (error != 0) {
440 * Could not load module
442 return (error);
444 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
445 return (0);
449 _fini()
451 int status;
454 * Remove module
456 if ((status = mod_remove(&rib_modlinkage)) != 0) {
457 return (status);
459 mutex_destroy(&plugin_state_lock);
460 return (0);
464 _info(struct modinfo *modinfop)
466 return (mod_info(&rib_modlinkage, modinfop));
470 * rpcib_getinfo()
471 * Given the device number, return the devinfo pointer or the
472 * instance number.
473 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
476 /*ARGSUSED*/
477 static int
478 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
480 int ret = DDI_SUCCESS;
482 switch (cmd) {
483 case DDI_INFO_DEVT2DEVINFO:
484 if (rpcib.rpcib_dip != NULL)
485 *result = rpcib.rpcib_dip;
486 else {
487 *result = NULL;
488 ret = DDI_FAILURE;
490 break;
492 case DDI_INFO_DEVT2INSTANCE:
493 *result = NULL;
494 break;
496 default:
497 ret = DDI_FAILURE;
499 return (ret);
502 static void
503 rpcib_free_hca_list()
505 rib_hca_t *hca, *hcap;
507 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
508 hca = rib_stat->hcas_list;
509 rib_stat->hcas_list = NULL;
510 rw_exit(&rib_stat->hcas_list_lock);
511 while (hca != NULL) {
512 rw_enter(&hca->state_lock, RW_WRITER);
513 hcap = hca;
514 hca = hca->next;
515 rib_stat->nhca_inited--;
516 rib_mod.rdma_count--;
517 hcap->state = HCA_DETACHED;
518 rw_exit(&hcap->state_lock);
519 rib_stop_hca_services(hcap);
521 kmem_free(hcap, sizeof (*hcap));
525 static rdma_stat
526 rpcib_free_service_list()
528 rib_service_t *service;
529 ibt_status_t ret;
531 rw_enter(&rib_stat->service_list_lock, RW_WRITER);
532 while (rib_stat->service_list != NULL) {
533 service = rib_stat->service_list;
534 ret = ibt_unbind_all_services(service->srv_hdl);
535 if (ret != IBT_SUCCESS) {
536 rw_exit(&rib_stat->service_list_lock);
537 #ifdef DEBUG
538 cmn_err(CE_NOTE, "rpcib_free_service_list: "
539 "ibt_unbind_all_services failed (%d)\n", (int)ret);
540 #endif
541 return (RDMA_FAILED);
543 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
544 service->srv_hdl);
545 if (ret != IBT_SUCCESS) {
546 rw_exit(&rib_stat->service_list_lock);
547 #ifdef DEBUG
548 cmn_err(CE_NOTE, "rpcib_free_service_list: "
549 "ibt_deregister_service failed (%d)\n", (int)ret);
550 #endif
551 return (RDMA_FAILED);
553 rib_stat->service_list = service->next;
554 kmem_free(service, sizeof (rib_service_t));
556 rw_exit(&rib_stat->service_list_lock);
558 return (RDMA_SUCCESS);
561 static int
562 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
564 ibt_status_t ibt_status;
565 rdma_stat r_status;
567 switch (cmd) {
568 case DDI_ATTACH:
569 break;
570 case DDI_RESUME:
571 return (DDI_SUCCESS);
572 default:
573 return (DDI_FAILURE);
576 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
578 mutex_enter(&rpcib.rpcib_mutex);
579 if (rpcib.rpcib_dip != NULL) {
580 mutex_exit(&rpcib.rpcib_mutex);
581 return (DDI_FAILURE);
583 rpcib.rpcib_dip = dip;
584 mutex_exit(&rpcib.rpcib_mutex);
586 * Create the "rpcib" minor-node.
588 if (ddi_create_minor_node(dip,
589 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
590 /* Error message, no cmn_err as they print on console */
591 return (DDI_FAILURE);
594 if (rib_stat == NULL) {
595 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
596 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
597 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
598 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
601 rib_stat->hca_count = ibt_get_hca_list(NULL);
602 if (rib_stat->hca_count < 1) {
603 mutex_destroy(&rib_stat->listen_lock);
604 rw_destroy(&rib_stat->hcas_list_lock);
605 mutex_destroy(&rib_stat->open_hca_lock);
606 kmem_free(rib_stat, sizeof (*rib_stat));
607 rib_stat = NULL;
608 return (DDI_FAILURE);
611 ibt_status = ibt_attach(&rib_modinfo, dip,
612 (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
614 if (ibt_status != IBT_SUCCESS) {
615 mutex_destroy(&rib_stat->listen_lock);
616 rw_destroy(&rib_stat->hcas_list_lock);
617 mutex_destroy(&rib_stat->open_hca_lock);
618 kmem_free(rib_stat, sizeof (*rib_stat));
619 rib_stat = NULL;
620 return (DDI_FAILURE);
623 rib_stat->service_list = NULL;
624 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
625 mutex_enter(&rib_stat->open_hca_lock);
626 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
627 mutex_exit(&rib_stat->open_hca_lock);
628 goto open_fail;
630 mutex_exit(&rib_stat->open_hca_lock);
632 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
633 DDI_PROP_SUCCESS) {
634 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
635 "failed.");
636 goto register_fail;
640 * Register with rdmatf
642 r_status = rdma_register_mod(&rib_mod);
643 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
644 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
645 "status = %d", r_status);
646 goto register_fail;
649 return (DDI_SUCCESS);
651 register_fail:
653 open_fail:
654 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
655 rpcib_free_hca_list();
656 (void) rpcib_free_service_list();
657 mutex_destroy(&rib_stat->listen_lock);
658 rw_destroy(&rib_stat->hcas_list_lock);
659 mutex_destroy(&rib_stat->open_hca_lock);
660 rw_destroy(&rib_stat->service_list_lock);
661 kmem_free(rib_stat, sizeof (*rib_stat));
662 rib_stat = NULL;
663 return (DDI_FAILURE);
666 /*ARGSUSED*/
667 static int
668 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
670 switch (cmd) {
672 case DDI_DETACH:
673 break;
675 case DDI_SUSPEND:
676 default:
677 return (DDI_FAILURE);
681 * Detach the hca and free resources
683 mutex_enter(&plugin_state_lock);
684 plugin_state = NO_ACCEPT;
685 mutex_exit(&plugin_state_lock);
687 if (rpcib_free_service_list() != RDMA_SUCCESS)
688 return (DDI_FAILURE);
689 rpcib_free_hca_list();
691 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
692 mutex_destroy(&rib_stat->listen_lock);
693 rw_destroy(&rib_stat->hcas_list_lock);
694 mutex_destroy(&rib_stat->open_hca_lock);
695 rw_destroy(&rib_stat->service_list_lock);
697 kmem_free(rib_stat, sizeof (*rib_stat));
698 rib_stat = NULL;
700 mutex_enter(&rpcib.rpcib_mutex);
701 rpcib.rpcib_dip = NULL;
702 mutex_exit(&rpcib.rpcib_mutex);
703 mutex_destroy(&rpcib.rpcib_mutex);
704 return (DDI_SUCCESS);
708 static void rib_rbufpool_free(rib_hca_t *, int);
709 static void rib_rbufpool_deregister(rib_hca_t *, int);
710 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
711 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
712 static rdma_stat rib_rem_replylist(rib_qp_t *);
713 static int rib_remreply(rib_qp_t *, struct reply *);
714 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
715 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
719 * One CQ pair per HCA
721 static rdma_stat
722 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
723 rib_cq_t **cqp)
725 rib_cq_t *cq;
726 ibt_cq_attr_t cq_attr;
727 uint32_t real_size;
728 ibt_status_t status;
729 rdma_stat error = RDMA_SUCCESS;
731 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
732 cq->rib_hca = hca;
733 bzero(&cq_attr, sizeof (cq_attr));
734 cq_attr.cq_size = cq_size;
735 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
736 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
737 &real_size);
738 if (status != IBT_SUCCESS) {
739 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
740 " status=%d", status);
741 error = RDMA_FAILED;
742 goto fail;
744 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
747 * Enable CQ callbacks. CQ Callbacks are single shot
748 * (e.g. you have to call ibt_enable_cq_notify()
749 * after each callback to get another one).
751 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
752 if (status != IBT_SUCCESS) {
753 cmn_err(CE_WARN, "rib_create_cq: "
754 "enable_cq_notify failed, status %d", status);
755 error = RDMA_FAILED;
756 goto fail;
758 *cqp = cq;
760 return (error);
761 fail:
762 if (cq->rib_cq_hdl)
763 (void) ibt_free_cq(cq->rib_cq_hdl);
764 if (cq)
765 kmem_free(cq, sizeof (rib_cq_t));
766 return (error);
770 * rpcib_find_hca
772 * Caller should have already locked the hcas_lock before calling
773 * this function.
775 static rib_hca_t *
776 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
778 rib_hca_t *hca = ribstat->hcas_list;
780 while (hca && hca->hca_guid != guid)
781 hca = hca->next;
783 return (hca);
786 static rdma_stat
787 rpcib_open_hcas(rpcib_state_t *ribstat)
789 rib_hca_t *hca;
790 ibt_status_t ibt_status;
791 rdma_stat status;
792 ibt_hca_portinfo_t *pinfop;
793 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS;
794 uint_t size, cq_size;
795 int i;
796 kstat_t *ksp;
797 cache_avl_struct_t example_avl_node;
798 char rssc_name[32];
799 int old_nhca_inited = ribstat->nhca_inited;
800 ib_guid_t *hca_guids;
802 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
804 ribstat->hca_count = ibt_get_hca_list(&hca_guids);
805 if (ribstat->hca_count == 0)
806 return (RDMA_FAILED);
808 rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
810 * Open a hca and setup for RDMA
812 for (i = 0; i < ribstat->hca_count; i++) {
813 if (rpcib_find_hca(ribstat, hca_guids[i]))
814 continue;
815 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
817 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
818 hca_guids[i], &hca->hca_hdl);
819 if (ibt_status != IBT_SUCCESS) {
820 kmem_free(hca, sizeof (rib_hca_t));
821 continue;
823 hca->hca_guid = hca_guids[i];
824 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
825 hca->state = HCA_INITED;
828 * query HCA info
830 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
831 if (ibt_status != IBT_SUCCESS) {
832 goto fail1;
836 * One PD (Protection Domain) per HCA.
837 * A qp is allowed to access a memory region
838 * only when it's in the same PD as that of
839 * the memory region.
841 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
842 if (ibt_status != IBT_SUCCESS) {
843 goto fail1;
847 * query HCA ports
849 ibt_status = ibt_query_hca_ports(hca->hca_hdl,
850 0, &pinfop, &hca->hca_nports, &size);
851 if (ibt_status != IBT_SUCCESS) {
852 goto fail2;
854 hca->hca_ports = pinfop;
855 hca->hca_pinfosz = size;
856 pinfop = NULL;
858 cq_size = DEF_CQ_SIZE; /* default cq size */
860 * Create 2 pairs of cq's (1 pair for client
861 * and the other pair for server) on this hca.
862 * If number of qp's gets too large, then several
863 * cq's will be needed.
865 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
866 &hca->svc_rcq);
867 if (status != RDMA_SUCCESS) {
868 goto fail3;
871 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
872 &hca->svc_scq);
873 if (status != RDMA_SUCCESS) {
874 goto fail3;
877 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
878 &hca->clnt_rcq);
879 if (status != RDMA_SUCCESS) {
880 goto fail3;
883 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
884 &hca->clnt_scq);
885 if (status != RDMA_SUCCESS) {
886 goto fail3;
890 * Create buffer pools.
891 * Note rib_rbuf_create also allocates memory windows.
893 hca->recv_pool = rib_rbufpool_create(hca,
894 RECV_BUFFER, rib_max_rbufs);
895 if (hca->recv_pool == NULL) {
896 goto fail3;
899 hca->send_pool = rib_rbufpool_create(hca,
900 SEND_BUFFER, rib_max_rbufs);
901 if (hca->send_pool == NULL) {
902 rib_rbufpool_destroy(hca, RECV_BUFFER);
903 goto fail3;
906 if (hca->server_side_cache == NULL) {
907 (void) sprintf(rssc_name,
908 "rib_srvr_cache_%llx",
909 (long long unsigned int) hca->hca_guid);
910 hca->server_side_cache = kmem_cache_create(
911 rssc_name,
912 sizeof (cache_avl_struct_t), 0,
913 NULL,
914 NULL,
915 rib_server_side_cache_reclaim,
916 hca, NULL, 0);
919 avl_create(&hca->avl_tree,
920 avl_compare,
921 sizeof (cache_avl_struct_t),
922 (uint_t)(uintptr_t)&example_avl_node.avl_link-
923 (uint_t)(uintptr_t)&example_avl_node);
925 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
926 hca->iblock);
927 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
928 rw_init(&hca->avl_rw_lock,
929 NULL, RW_DRIVER, hca->iblock);
930 mutex_init(&hca->cache_allocation_lock,
931 NULL, MUTEX_DRIVER, NULL);
932 hca->avl_init = TRUE;
934 /* Create kstats for the cache */
935 ASSERT(INGLOBALZONE(curproc));
937 if (!stats_enabled) {
938 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
939 KSTAT_TYPE_NAMED,
940 sizeof (rpcib_kstat) / sizeof (kstat_named_t),
941 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
942 GLOBAL_ZONEID);
943 if (ksp) {
944 ksp->ks_data = (void *) &rpcib_kstat;
945 ksp->ks_update = rpcib_cache_kstat_update;
946 kstat_install(ksp);
947 stats_enabled = TRUE;
950 if (hca->cleanup_helper == NULL) {
951 char tq_name[sizeof (hca->hca_guid) * 2 + 1];
953 (void) snprintf(tq_name, sizeof (tq_name), "%llX",
954 (unsigned long long int) hca->hca_guid);
955 hca->cleanup_helper = ddi_taskq_create(NULL,
956 tq_name, 1, TASKQ_DEFAULTPRI, 0);
959 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
960 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
961 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
962 hca->iblock);
963 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
964 hca->iblock);
965 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
966 hca->inuse = TRUE;
968 hca->next = ribstat->hcas_list;
969 ribstat->hcas_list = hca;
970 ribstat->nhca_inited++;
971 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
972 continue;
974 fail3:
975 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
976 fail2:
977 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
978 fail1:
979 (void) ibt_close_hca(hca->hca_hdl);
980 kmem_free(hca, sizeof (rib_hca_t));
982 rw_exit(&ribstat->hcas_list_lock);
983 ibt_free_hca_list(hca_guids, ribstat->hca_count);
984 rib_mod.rdma_count = rib_stat->nhca_inited;
987 * return success if at least one new hca has been configured.
989 if (ribstat->nhca_inited != old_nhca_inited)
990 return (RDMA_SUCCESS);
991 else
992 return (RDMA_FAILED);
996 * Callback routines
1000 * SCQ handlers
1002 /* ARGSUSED */
1003 static void
1004 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1006 ibt_status_t ibt_status;
1007 ibt_wc_t wc;
1008 struct send_wid *wd;
1009 CONN *conn;
1010 rib_qp_t *qp;
1011 int i;
1014 * Re-enable cq notify here to avoid missing any
1015 * completion queue notification.
1017 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1019 ibt_status = IBT_SUCCESS;
1020 while (ibt_status != IBT_CQ_EMPTY) {
1021 bzero(&wc, sizeof (wc));
1022 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1023 if (ibt_status != IBT_SUCCESS)
1024 return;
1027 * Got a send completion
1029 if (wc.wc_id != RDMA_DUMMY_WRID) {
1030 wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1031 qp = wd->qp;
1032 conn = qptoc(qp);
1034 mutex_enter(&wd->sendwait_lock);
1035 switch (wc.wc_status) {
1036 case IBT_WC_SUCCESS:
1037 wd->status = RDMA_SUCCESS;
1038 break;
1039 default:
1041 * RC Send Q Error Code Local state Remote State
1042 * ==================== =========== ============
1043 * IBT_WC_BAD_RESPONSE_ERR ERROR None
1044 * IBT_WC_LOCAL_LEN_ERR ERROR None
1045 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None
1046 * IBT_WC_LOCAL_PROTECT_ERR ERROR None
1047 * IBT_WC_MEM_WIN_BIND_ERR ERROR None
1048 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR
1049 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR
1050 * IBT_WC_REMOTE_OP_ERR ERROR ERROR
1051 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None
1052 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None
1053 * IBT_WC_WR_FLUSHED_ERR ERROR None
1056 * Channel in error state. Set connection to
1057 * ERROR and cleanup will happen either from
1058 * conn_release or from rib_conn_get
1060 wd->status = RDMA_FAILED;
1061 mutex_enter(&conn->c_lock);
1062 if (conn->c_state != C_DISCONN_PEND)
1063 conn->c_state = C_ERROR_CONN;
1064 mutex_exit(&conn->c_lock);
1065 break;
1068 if (wd->cv_sig == 1) {
1070 * Notify poster
1072 cv_signal(&wd->wait_cv);
1073 mutex_exit(&wd->sendwait_lock);
1074 } else {
1076 * Poster not waiting for notification.
1077 * Free the send buffers and send_wid
1079 for (i = 0; i < wd->nsbufs; i++) {
1080 rib_rbuf_free(qptoc(wd->qp),
1081 SEND_BUFFER,
1082 (void *)(uintptr_t)wd->sbufaddr[i]);
1085 /* decrement the send ref count */
1086 rib_send_rele(qp);
1088 mutex_exit(&wd->sendwait_lock);
1089 (void) rib_free_sendwait(wd);
1095 /* ARGSUSED */
1096 static void
1097 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1099 ibt_status_t ibt_status;
1100 ibt_wc_t wc;
1101 struct send_wid *wd;
1102 rib_qp_t *qp;
1103 CONN *conn;
1104 int i;
1107 * Re-enable cq notify here to avoid missing any
1108 * completion queue notification.
1110 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1112 ibt_status = IBT_SUCCESS;
1113 while (ibt_status != IBT_CQ_EMPTY) {
1114 bzero(&wc, sizeof (wc));
1115 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1116 if (ibt_status != IBT_SUCCESS)
1117 return;
1120 * Got a send completion
1122 if (wc.wc_id != RDMA_DUMMY_WRID) {
1123 wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1124 qp = wd->qp;
1125 conn = qptoc(qp);
1126 mutex_enter(&wd->sendwait_lock);
1128 switch (wc.wc_status) {
1129 case IBT_WC_SUCCESS:
1130 wd->status = RDMA_SUCCESS;
1131 break;
1132 default:
1134 * Channel in error state. Set connection to
1135 * ERROR and cleanup will happen either from
1136 * conn_release or conn timeout.
1138 wd->status = RDMA_FAILED;
1139 mutex_enter(&conn->c_lock);
1140 if (conn->c_state != C_DISCONN_PEND)
1141 conn->c_state = C_ERROR_CONN;
1142 mutex_exit(&conn->c_lock);
1143 break;
1146 if (wd->cv_sig == 1) {
1148 * Update completion status and notify poster
1150 cv_signal(&wd->wait_cv);
1151 mutex_exit(&wd->sendwait_lock);
1152 } else {
1154 * Poster not waiting for notification.
1155 * Free the send buffers and send_wid
1157 for (i = 0; i < wd->nsbufs; i++) {
1158 rib_rbuf_free(qptoc(wd->qp),
1159 SEND_BUFFER,
1160 (void *)(uintptr_t)wd->sbufaddr[i]);
1163 /* decrement the send ref count */
1164 rib_send_rele(qp);
1166 mutex_exit(&wd->sendwait_lock);
1167 (void) rib_free_sendwait(wd);
1174 * RCQ handler
1176 /* ARGSUSED */
1177 static void
1178 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1180 rib_qp_t *qp;
1181 ibt_status_t ibt_status;
1182 ibt_wc_t wc;
1183 struct recv_wid *rwid;
1186 * Re-enable cq notify here to avoid missing any
1187 * completion queue notification.
1189 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1191 ibt_status = IBT_SUCCESS;
1192 while (ibt_status != IBT_CQ_EMPTY) {
1193 bzero(&wc, sizeof (wc));
1194 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1195 if (ibt_status != IBT_SUCCESS)
1196 return;
1198 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1199 qp = rwid->qp;
1201 if (wc.wc_status == IBT_WC_SUCCESS) {
1202 XDR inxdrs, *xdrs;
1203 uint_t xid, vers, op, find_xid = 0;
1204 struct reply *r;
1205 CONN *conn = qptoc(qp);
1206 uint32_t rdma_credit = 0;
1208 xdrs = &inxdrs;
1209 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1210 wc.wc_bytes_xfer, XDR_DECODE);
1212 * Treat xid as opaque (xid is the first entity
1213 * in the rpc rdma message).
1215 xid = *(uint32_t *)(uintptr_t)rwid->addr;
1217 /* Skip xid and set the xdr position accordingly. */
1218 XDR_SETPOS(xdrs, sizeof (uint32_t));
1219 (void) xdr_u_int(xdrs, &vers);
1220 (void) xdr_u_int(xdrs, &rdma_credit);
1221 (void) xdr_u_int(xdrs, &op);
1222 XDR_DESTROY(xdrs);
1224 if (vers != RPCRDMA_VERS) {
1226 * Invalid RPC/RDMA version. Cannot
1227 * interoperate. Set connection to
1228 * ERROR state and bail out.
1230 mutex_enter(&conn->c_lock);
1231 if (conn->c_state != C_DISCONN_PEND)
1232 conn->c_state = C_ERROR_CONN;
1233 mutex_exit(&conn->c_lock);
1234 rib_rbuf_free(conn, RECV_BUFFER,
1235 (void *)(uintptr_t)rwid->addr);
1236 rib_free_wid(rwid);
1237 rib_recv_rele(qp);
1238 continue;
1241 mutex_enter(&qp->replylist_lock);
1242 for (r = qp->replylist; r != NULL; r = r->next) {
1243 if (r->xid == xid) {
1244 find_xid = 1;
1245 switch (op) {
1246 case RDMA_MSG:
1247 case RDMA_NOMSG:
1248 case RDMA_MSGP:
1249 r->status = RDMA_SUCCESS;
1250 r->vaddr_cq = rwid->addr;
1251 r->bytes_xfer =
1252 wc.wc_bytes_xfer;
1253 cv_signal(&r->wait_cv);
1254 break;
1255 default:
1256 rib_rbuf_free(qptoc(qp),
1257 RECV_BUFFER,
1258 (void *)(uintptr_t)
1259 rwid->addr);
1260 break;
1262 break;
1265 mutex_exit(&qp->replylist_lock);
1266 if (find_xid == 0) {
1267 /* RPC caller not waiting for reply */
1269 DTRACE_PROBE1(rpcib__i__nomatchxid1,
1270 int, xid);
1272 rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1273 (void *)(uintptr_t)rwid->addr);
1275 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1276 CONN *conn = qptoc(qp);
1279 * Connection being flushed. Just free
1280 * the posted buffer
1282 rib_rbuf_free(conn, RECV_BUFFER,
1283 (void *)(uintptr_t)rwid->addr);
1284 } else {
1285 CONN *conn = qptoc(qp);
1287 * RC Recv Q Error Code Local state Remote State
1288 * ==================== =========== ============
1289 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd
1290 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd
1291 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd
1292 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd
1293 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd
1294 * IBT_WC_WR_FLUSHED_ERR None None
1297 * Channel in error state. Set connection
1298 * in ERROR state.
1300 mutex_enter(&conn->c_lock);
1301 if (conn->c_state != C_DISCONN_PEND)
1302 conn->c_state = C_ERROR_CONN;
1303 mutex_exit(&conn->c_lock);
1304 rib_rbuf_free(conn, RECV_BUFFER,
1305 (void *)(uintptr_t)rwid->addr);
1307 rib_free_wid(rwid);
1308 rib_recv_rele(qp);
1312 /* Server side */
1313 /* ARGSUSED */
1314 static void
1315 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1317 rdma_recv_data_t *rdp;
1318 rib_qp_t *qp;
1319 ibt_status_t ibt_status;
1320 ibt_wc_t wc;
1321 struct svc_recv *s_recvp;
1322 CONN *conn;
1323 mblk_t *mp;
1326 * Re-enable cq notify here to avoid missing any
1327 * completion queue notification.
1329 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1331 ibt_status = IBT_SUCCESS;
1332 while (ibt_status != IBT_CQ_EMPTY) {
1333 bzero(&wc, sizeof (wc));
1334 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1335 if (ibt_status != IBT_SUCCESS)
1336 return;
1338 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1339 qp = s_recvp->qp;
1340 conn = qptoc(qp);
1342 if (wc.wc_status == IBT_WC_SUCCESS) {
1343 XDR inxdrs, *xdrs;
1344 uint_t xid, vers, op;
1345 uint32_t rdma_credit;
1347 xdrs = &inxdrs;
1348 /* s_recvp->vaddr stores data */
1349 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1350 wc.wc_bytes_xfer, XDR_DECODE);
1353 * Treat xid as opaque (xid is the first entity
1354 * in the rpc rdma message).
1356 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1357 /* Skip xid and set the xdr position accordingly. */
1358 XDR_SETPOS(xdrs, sizeof (uint32_t));
1359 if (!xdr_u_int(xdrs, &vers) ||
1360 !xdr_u_int(xdrs, &rdma_credit) ||
1361 !xdr_u_int(xdrs, &op)) {
1362 rib_rbuf_free(conn, RECV_BUFFER,
1363 (void *)(uintptr_t)s_recvp->vaddr);
1364 XDR_DESTROY(xdrs);
1365 rib_recv_rele(qp);
1366 (void) rib_free_svc_recv(s_recvp);
1367 continue;
1369 XDR_DESTROY(xdrs);
1371 if (vers != RPCRDMA_VERS) {
1373 * Invalid RPC/RDMA version.
1374 * Drop rpc rdma message.
1376 rib_rbuf_free(conn, RECV_BUFFER,
1377 (void *)(uintptr_t)s_recvp->vaddr);
1378 rib_recv_rele(qp);
1379 (void) rib_free_svc_recv(s_recvp);
1380 continue;
1383 * Is this for RDMA_DONE?
1385 if (op == RDMA_DONE) {
1386 rib_rbuf_free(conn, RECV_BUFFER,
1387 (void *)(uintptr_t)s_recvp->vaddr);
1389 * Wake up the thread waiting on
1390 * a RDMA_DONE for xid
1392 mutex_enter(&qp->rdlist_lock);
1393 rdma_done_notify(qp, xid);
1394 mutex_exit(&qp->rdlist_lock);
1395 rib_recv_rele(qp);
1396 (void) rib_free_svc_recv(s_recvp);
1397 continue;
1400 mutex_enter(&plugin_state_lock);
1401 mutex_enter(&conn->c_lock);
1402 if ((plugin_state == ACCEPT) &&
1403 (conn->c_state == C_CONNECTED)) {
1404 conn->c_ref++;
1405 mutex_exit(&conn->c_lock);
1406 while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1407 == NULL)
1408 (void) strwaitbuf(
1409 sizeof (*rdp), BPRI_LO);
1411 * Plugin is in accept state, hence the master
1412 * transport queue for this is still accepting
1413 * requests. Hence we can call svc_queuereq to
1414 * queue this recieved msg.
1416 rdp = (rdma_recv_data_t *)mp->b_rptr;
1417 rdp->conn = conn;
1418 rdp->rpcmsg.addr =
1419 (caddr_t)(uintptr_t)s_recvp->vaddr;
1420 rdp->rpcmsg.type = RECV_BUFFER;
1421 rdp->rpcmsg.len = wc.wc_bytes_xfer;
1422 rdp->status = wc.wc_status;
1423 mp->b_wptr += sizeof (*rdp);
1424 (void) svc_queuereq((queue_t *)rib_stat->q, mp,
1425 FALSE);
1426 mutex_exit(&plugin_state_lock);
1427 } else {
1429 * The master transport for this is going
1430 * away and the queue is not accepting anymore
1431 * requests for krpc, so don't do anything, just
1432 * free the msg.
1434 mutex_exit(&conn->c_lock);
1435 mutex_exit(&plugin_state_lock);
1436 rib_rbuf_free(conn, RECV_BUFFER,
1437 (void *)(uintptr_t)s_recvp->vaddr);
1439 } else {
1440 rib_rbuf_free(conn, RECV_BUFFER,
1441 (void *)(uintptr_t)s_recvp->vaddr);
1443 rib_recv_rele(qp);
1444 (void) rib_free_svc_recv(s_recvp);
1448 static void
1449 rib_attach_hca()
1451 mutex_enter(&rib_stat->open_hca_lock);
1452 (void) rpcib_open_hcas(rib_stat);
1453 rib_listen(NULL);
1454 mutex_exit(&rib_stat->open_hca_lock);
1458 * Handles DR event of IBT_HCA_DETACH_EVENT.
1460 /* ARGSUSED */
1461 static void
1462 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1463 ibt_async_code_t code, ibt_async_event_t *event)
1465 switch (code) {
1466 case IBT_HCA_ATTACH_EVENT:
1467 rib_attach_hca();
1468 break;
1469 case IBT_HCA_DETACH_EVENT:
1470 rib_detach_hca(hca_hdl);
1471 #ifdef DEBUG
1472 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1473 #endif
1474 break;
1475 case IBT_EVENT_PORT_UP:
1477 * A port is up. We should call rib_listen() since there is
1478 * a chance that rib_listen() may have failed during
1479 * rib_attach_hca() because the port had not been up yet.
1481 rib_listen(NULL);
1482 #ifdef DEBUG
1483 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1484 #endif
1485 break;
1486 #ifdef DEBUG
1487 case IBT_EVENT_PATH_MIGRATED:
1488 cmn_err(CE_NOTE, "rib_async_handler(): "
1489 "IBT_EVENT_PATH_MIGRATED\n");
1490 break;
1491 case IBT_EVENT_SQD:
1492 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1493 break;
1494 case IBT_EVENT_COM_EST:
1495 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1496 break;
1497 case IBT_ERROR_CATASTROPHIC_CHAN:
1498 cmn_err(CE_NOTE, "rib_async_handler(): "
1499 "IBT_ERROR_CATASTROPHIC_CHAN\n");
1500 break;
1501 case IBT_ERROR_INVALID_REQUEST_CHAN:
1502 cmn_err(CE_NOTE, "rib_async_handler(): "
1503 "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1504 break;
1505 case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1506 cmn_err(CE_NOTE, "rib_async_handler(): "
1507 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1508 break;
1509 case IBT_ERROR_PATH_MIGRATE_REQ:
1510 cmn_err(CE_NOTE, "rib_async_handler(): "
1511 "IBT_ERROR_PATH_MIGRATE_REQ\n");
1512 break;
1513 case IBT_ERROR_CQ:
1514 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1515 break;
1516 case IBT_ERROR_PORT_DOWN:
1517 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1518 break;
1519 case IBT_ASYNC_OPAQUE1:
1520 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1521 break;
1522 case IBT_ASYNC_OPAQUE2:
1523 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1524 break;
1525 case IBT_ASYNC_OPAQUE3:
1526 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1527 break;
1528 case IBT_ASYNC_OPAQUE4:
1529 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1530 break;
1531 #endif
1532 default:
1533 break;
1538 * Client's reachable function.
1540 static rdma_stat
1541 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1543 rdma_stat status;
1544 rpcib_ping_t rpt;
1545 struct netbuf saddr;
1546 CONN *conn;
1548 bzero(&saddr, sizeof (struct netbuf));
1549 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1551 if (status == RDMA_SUCCESS) {
1552 *handle = (void *)rpt.hca;
1553 /* release the reference */
1554 (void) rib_conn_release(conn);
1555 return (RDMA_SUCCESS);
1556 } else {
1557 *handle = NULL;
1558 DTRACE_PROBE(rpcib__i__pingfailed);
1559 return (RDMA_FAILED);
1563 /* Client side qp creation */
1564 static rdma_stat
1565 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1567 rib_qp_t *kqp = NULL;
1568 CONN *conn;
1569 rdma_clnt_cred_ctrl_t *cc_info;
1571 ASSERT(qp != NULL);
1572 *qp = NULL;
1574 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1575 conn = qptoc(kqp);
1576 kqp->hca = hca;
1577 kqp->rdmaconn.c_rdmamod = &rib_mod;
1578 kqp->rdmaconn.c_private = (caddr_t)kqp;
1580 kqp->mode = RIB_CLIENT;
1581 kqp->chan_flags = IBT_BLOCKING;
1582 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1583 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1584 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1586 * Initialize
1588 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1589 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1590 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1591 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1592 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1593 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1594 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1595 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1596 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1597 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1599 * Initialize the client credit control
1600 * portion of the rdmaconn struct.
1602 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1603 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1604 cc_info->clnt_cc_granted_ops = 0;
1605 cc_info->clnt_cc_in_flight_ops = 0;
1606 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1608 *qp = kqp;
1609 return (RDMA_SUCCESS);
1612 /* Server side qp creation */
1613 static rdma_stat
1614 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1616 rib_qp_t *kqp = NULL;
1617 ibt_chan_sizes_t chan_sizes;
1618 ibt_rc_chan_alloc_args_t qp_attr;
1619 ibt_status_t ibt_status;
1620 rdma_srv_cred_ctrl_t *cc_info;
1622 *qp = NULL;
1624 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1625 kqp->hca = hca;
1626 kqp->port_num = port;
1627 kqp->rdmaconn.c_rdmamod = &rib_mod;
1628 kqp->rdmaconn.c_private = (caddr_t)kqp;
1631 * Create the qp handle
1633 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1634 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1635 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1636 qp_attr.rc_pd = hca->pd_hdl;
1637 qp_attr.rc_hca_port_num = port;
1638 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1639 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1640 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1641 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1642 qp_attr.rc_clone_chan = NULL;
1643 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1644 qp_attr.rc_flags = IBT_WR_SIGNALED;
1646 rw_enter(&hca->state_lock, RW_READER);
1647 if (hca->state != HCA_DETACHED) {
1648 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1649 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1650 &chan_sizes);
1651 } else {
1652 rw_exit(&hca->state_lock);
1653 goto fail;
1655 rw_exit(&hca->state_lock);
1657 if (ibt_status != IBT_SUCCESS) {
1658 DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1659 int, ibt_status);
1660 goto fail;
1663 kqp->mode = RIB_SERVER;
1664 kqp->chan_flags = IBT_BLOCKING;
1665 kqp->q = q; /* server ONLY */
1667 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1668 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1669 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1670 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1671 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1672 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1673 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1674 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1675 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1676 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1678 * Set the private data area to qp to be used in callbacks
1680 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1681 kqp->rdmaconn.c_state = C_CONNECTED;
1684 * Initialize the server credit control
1685 * portion of the rdmaconn struct.
1687 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1688 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1689 cc_info->srv_cc_buffers_granted = preposted_rbufs;
1690 cc_info->srv_cc_cur_buffers_used = 0;
1691 cc_info->srv_cc_posted = preposted_rbufs;
1693 *qp = kqp;
1695 return (RDMA_SUCCESS);
1696 fail:
1697 if (kqp)
1698 kmem_free(kqp, sizeof (rib_qp_t));
1700 return (RDMA_FAILED);
1703 /* ARGSUSED */
1704 ibt_cm_status_t
1705 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1706 ibt_cm_return_args_t *ret_args, void *priv_data,
1707 ibt_priv_data_len_t len)
1709 rib_hca_t *hca;
1711 hca = (rib_hca_t *)clnt_hdl;
1713 switch (event->cm_type) {
1715 /* got a connection close event */
1716 case IBT_CM_EVENT_CONN_CLOSED:
1718 CONN *conn;
1719 rib_qp_t *qp;
1721 /* check reason why connection was closed */
1722 switch (event->cm_event.closed) {
1723 case IBT_CM_CLOSED_DREP_RCVD:
1724 case IBT_CM_CLOSED_DREQ_TIMEOUT:
1725 case IBT_CM_CLOSED_DUP:
1726 case IBT_CM_CLOSED_ABORT:
1727 case IBT_CM_CLOSED_ALREADY:
1729 * These cases indicate the local end initiated
1730 * the closing of the channel. Nothing to do here.
1732 break;
1733 default:
1735 * Reason for CONN_CLOSED event must be one of
1736 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1737 * or IBT_CM_CLOSED_STALE. These indicate cases were
1738 * the remote end is closing the channel. In these
1739 * cases free the channel and transition to error
1740 * state
1742 qp = ibt_get_chan_private(event->cm_channel);
1743 conn = qptoc(qp);
1744 mutex_enter(&conn->c_lock);
1745 if (conn->c_state == C_DISCONN_PEND) {
1746 mutex_exit(&conn->c_lock);
1747 break;
1750 conn->c_state = C_ERROR_CONN;
1753 * Free the conn if c_ref is down to 0 already
1755 if (conn->c_ref == 0) {
1757 * Remove from list and free conn
1759 conn->c_state = C_DISCONN_PEND;
1760 mutex_exit(&conn->c_lock);
1761 rw_enter(&hca->state_lock, RW_READER);
1762 if (hca->state != HCA_DETACHED)
1763 (void) rib_disconnect_channel(conn,
1764 &hca->cl_conn_list);
1765 rw_exit(&hca->state_lock);
1766 } else {
1768 * conn will be freed when c_ref goes to 0.
1769 * Indicate to cleaning thread not to close
1770 * the connection, but just free the channel.
1772 conn->c_flags |= C_CLOSE_NOTNEEDED;
1773 mutex_exit(&conn->c_lock);
1775 #ifdef DEBUG
1776 if (rib_debug)
1777 cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1778 "(CONN_CLOSED) channel disconnected");
1779 #endif
1780 break;
1782 break;
1784 default:
1785 break;
1787 return (IBT_CM_ACCEPT);
1791 * Connect to the server.
1793 rdma_stat
1794 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1796 ibt_chan_open_args_t chan_args; /* channel args */
1797 ibt_chan_sizes_t chan_sizes;
1798 ibt_rc_chan_alloc_args_t qp_attr;
1799 ibt_status_t ibt_status;
1800 ibt_rc_returns_t ret_args; /* conn reject info */
1801 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */
1802 ibt_ip_cm_info_t ipcm_info;
1803 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1806 (void) bzero(&chan_args, sizeof (chan_args));
1807 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1808 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1810 ipcm_info.src_addr.family = rptp->srcip.family;
1811 switch (ipcm_info.src_addr.family) {
1812 case AF_INET:
1813 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1814 break;
1815 case AF_INET6:
1816 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1817 break;
1820 ipcm_info.dst_addr.family = rptp->srcip.family;
1821 switch (ipcm_info.dst_addr.family) {
1822 case AF_INET:
1823 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1824 break;
1825 case AF_INET6:
1826 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1827 break;
1830 ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1832 ibt_status = ibt_format_ip_private_data(&ipcm_info,
1833 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1835 if (ibt_status != IBT_SUCCESS) {
1836 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1837 return (-1);
1840 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1841 /* Alloc a RC channel */
1842 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1843 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1844 qp_attr.rc_pd = hca->pd_hdl;
1845 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1846 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1847 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1848 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1849 qp_attr.rc_clone_chan = NULL;
1850 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1851 qp_attr.rc_flags = IBT_WR_SIGNALED;
1853 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1854 chan_args.oc_path = &rptp->path;
1856 chan_args.oc_cm_handler = rib_clnt_cm_handler;
1857 chan_args.oc_cm_clnt_private = (void *)hca;
1858 chan_args.oc_rdma_ra_out = 4;
1859 chan_args.oc_rdma_ra_in = 4;
1860 chan_args.oc_path_retry_cnt = 2;
1861 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1862 chan_args.oc_priv_data = cmp_ip_pvt;
1863 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1865 refresh:
1866 rw_enter(&hca->state_lock, RW_READER);
1867 if (hca->state != HCA_DETACHED) {
1868 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1869 IBT_ACHAN_NO_FLAGS,
1870 &qp_attr, &qp->qp_hdl,
1871 &chan_sizes);
1872 } else {
1873 rw_exit(&hca->state_lock);
1874 return (RDMA_FAILED);
1876 rw_exit(&hca->state_lock);
1878 if (ibt_status != IBT_SUCCESS) {
1879 DTRACE_PROBE1(rpcib__i_conntosrv,
1880 int, ibt_status);
1881 return (RDMA_FAILED);
1884 /* Connect to the Server */
1885 (void) bzero(&ret_args, sizeof (ret_args));
1886 mutex_enter(&qp->cb_lock);
1887 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1888 IBT_BLOCKING, &chan_args, &ret_args);
1889 if (ibt_status != IBT_SUCCESS) {
1890 DTRACE_PROBE2(rpcib__i_openrctosrv,
1891 int, ibt_status, int, ret_args.rc_status);
1893 (void) ibt_free_channel(qp->qp_hdl);
1894 qp->qp_hdl = NULL;
1895 mutex_exit(&qp->cb_lock);
1896 if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1897 ret_args.rc_status == IBT_CM_CONN_STALE) {
1899 * Got IBT_CM_CONN_STALE probably because of stale
1900 * data on the passive end of a channel that existed
1901 * prior to reboot. Retry establishing a channel
1902 * REFRESH_ATTEMPTS times, during which time the
1903 * stale conditions on the server might clear up.
1905 goto refresh;
1907 return (RDMA_FAILED);
1909 mutex_exit(&qp->cb_lock);
1911 * Set the private data area to qp to be used in callbacks
1913 ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1914 return (RDMA_SUCCESS);
1917 rdma_stat
1918 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1920 uint_t i, addr_count;
1921 ibt_status_t ibt_status;
1922 uint8_t num_paths_p;
1923 ibt_ip_path_attr_t ipattr;
1924 ibt_path_ip_src_t srcip;
1925 rpcib_ipaddrs_t addrs4;
1926 rpcib_ipaddrs_t addrs6;
1927 struct sockaddr_in *sinp;
1928 struct sockaddr_in6 *sin6p;
1929 rdma_stat retval = RDMA_FAILED;
1930 rib_hca_t *hca;
1932 if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1933 return (RDMA_INVAL);
1934 ASSERT(raddr->buf != NULL);
1936 bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1938 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1939 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1940 retval = RDMA_FAILED;
1941 goto done2;
1944 if (addr_type == AF_INET) {
1945 addr_count = addrs4.ri_count;
1946 sinp = (struct sockaddr_in *)raddr->buf;
1947 rptp->dstip.family = AF_INET;
1948 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1949 sinp = addrs4.ri_list;
1950 } else {
1951 addr_count = addrs6.ri_count;
1952 sin6p = (struct sockaddr_in6 *)raddr->buf;
1953 rptp->dstip.family = AF_INET6;
1954 rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1955 sin6p = addrs6.ri_list;
1958 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1959 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1960 rw_enter(&hca->state_lock, RW_READER);
1961 if (hca->state == HCA_DETACHED) {
1962 rw_exit(&hca->state_lock);
1963 continue;
1966 ipattr.ipa_dst_ip = &rptp->dstip;
1967 ipattr.ipa_hca_guid = hca->hca_guid;
1968 ipattr.ipa_ndst = 1;
1969 ipattr.ipa_max_paths = 1;
1970 ipattr.ipa_src_ip.family = rptp->dstip.family;
1971 for (i = 0; i < addr_count; i++) {
1972 num_paths_p = 0;
1973 if (addr_type == AF_INET) {
1974 ipattr.ipa_src_ip.un.ip4addr =
1975 sinp[i].sin_addr.s_addr;
1976 } else {
1977 ipattr.ipa_src_ip.un.ip6addr =
1978 sin6p[i].sin6_addr;
1980 bzero(&srcip, sizeof (ibt_path_ip_src_t));
1982 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1983 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1984 &num_paths_p, &srcip);
1985 if (ibt_status == IBT_SUCCESS &&
1986 num_paths_p != 0 &&
1987 rptp->path.pi_hca_guid == hca->hca_guid) {
1988 rptp->hca = hca;
1989 rw_exit(&hca->state_lock);
1990 if (addr_type == AF_INET) {
1991 rptp->srcip.family = AF_INET;
1992 rptp->srcip.un.ip4addr =
1993 srcip.ip_primary.un.ip4addr;
1994 } else {
1995 rptp->srcip.family = AF_INET6;
1996 rptp->srcip.un.ip6addr =
1997 srcip.ip_primary.un.ip6addr;
2000 retval = RDMA_SUCCESS;
2001 goto done1;
2004 rw_exit(&hca->state_lock);
2006 done1:
2007 rw_exit(&rib_stat->hcas_list_lock);
2008 done2:
2009 if (addrs4.ri_size > 0)
2010 kmem_free(addrs4.ri_list, addrs4.ri_size);
2011 if (addrs6.ri_size > 0)
2012 kmem_free(addrs6.ri_list, addrs6.ri_size);
2013 return (retval);
2017 * Close channel, remove from connection list and
2018 * free up resources allocated for that channel.
2020 rdma_stat
2021 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2023 rib_qp_t *qp = ctoqp(conn);
2024 rib_hca_t *hca;
2026 mutex_enter(&conn->c_lock);
2027 if (conn->c_timeout != NULL) {
2028 mutex_exit(&conn->c_lock);
2029 (void) untimeout(conn->c_timeout);
2030 mutex_enter(&conn->c_lock);
2033 while (conn->c_flags & C_CLOSE_PENDING) {
2034 cv_wait(&conn->c_cv, &conn->c_lock);
2036 mutex_exit(&conn->c_lock);
2039 * c_ref == 0 and connection is in C_DISCONN_PEND
2041 hca = qp->hca;
2042 if (conn_list != NULL)
2043 (void) rib_rm_conn(conn, conn_list);
2046 * There is only one case where we get here with
2047 * qp_hdl = NULL, which is during connection setup on
2048 * the client. In such a case there are no posted
2049 * send/recv buffers.
2051 if (qp->qp_hdl != NULL) {
2052 mutex_enter(&qp->posted_rbufs_lock);
2053 while (qp->n_posted_rbufs)
2054 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2055 mutex_exit(&qp->posted_rbufs_lock);
2057 mutex_enter(&qp->send_rbufs_lock);
2058 while (qp->n_send_rbufs)
2059 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2060 mutex_exit(&qp->send_rbufs_lock);
2062 (void) ibt_free_channel(qp->qp_hdl);
2063 qp->qp_hdl = NULL;
2066 ASSERT(qp->rdlist == NULL);
2068 if (qp->replylist != NULL) {
2069 (void) rib_rem_replylist(qp);
2072 cv_destroy(&qp->cb_conn_cv);
2073 cv_destroy(&qp->posted_rbufs_cv);
2074 cv_destroy(&qp->send_rbufs_cv);
2075 mutex_destroy(&qp->cb_lock);
2076 mutex_destroy(&qp->replylist_lock);
2077 mutex_destroy(&qp->posted_rbufs_lock);
2078 mutex_destroy(&qp->send_rbufs_lock);
2079 mutex_destroy(&qp->rdlist_lock);
2081 cv_destroy(&conn->c_cv);
2082 mutex_destroy(&conn->c_lock);
2084 if (conn->c_raddr.buf != NULL) {
2085 kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2087 if (conn->c_laddr.buf != NULL) {
2088 kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2090 if (conn->c_netid != NULL) {
2091 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2093 if (conn->c_addrmask.buf != NULL) {
2094 kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len);
2098 * Credit control cleanup.
2100 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2101 rdma_clnt_cred_ctrl_t *cc_info;
2102 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2103 cv_destroy(&cc_info->clnt_cc_cv);
2106 kmem_free(qp, sizeof (rib_qp_t));
2109 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2110 * then the hca is no longer being used.
2112 if (conn_list != NULL) {
2113 rw_enter(&hca->state_lock, RW_READER);
2114 if (hca->state == HCA_DETACHED) {
2115 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2116 if (hca->srv_conn_list.conn_hd == NULL) {
2117 rw_enter(&hca->cl_conn_list.conn_lock,
2118 RW_READER);
2120 if (hca->cl_conn_list.conn_hd == NULL) {
2121 mutex_enter(&hca->inuse_lock);
2122 hca->inuse = FALSE;
2123 cv_signal(&hca->cb_cv);
2124 mutex_exit(&hca->inuse_lock);
2126 rw_exit(&hca->cl_conn_list.conn_lock);
2128 rw_exit(&hca->srv_conn_list.conn_lock);
2130 rw_exit(&hca->state_lock);
2133 return (RDMA_SUCCESS);
2137 * All sends are done under the protection of
2138 * the wdesc->sendwait_lock. n_send_rbufs count
2139 * is protected using the send_rbufs_lock.
2140 * lock ordering is:
2141 * sendwait_lock -> send_rbufs_lock
2144 void
2145 rib_send_hold(rib_qp_t *qp)
2147 mutex_enter(&qp->send_rbufs_lock);
2148 qp->n_send_rbufs++;
2149 mutex_exit(&qp->send_rbufs_lock);
2152 void
2153 rib_send_rele(rib_qp_t *qp)
2155 mutex_enter(&qp->send_rbufs_lock);
2156 qp->n_send_rbufs--;
2157 if (qp->n_send_rbufs == 0)
2158 cv_signal(&qp->send_rbufs_cv);
2159 mutex_exit(&qp->send_rbufs_lock);
2162 void
2163 rib_recv_rele(rib_qp_t *qp)
2165 mutex_enter(&qp->posted_rbufs_lock);
2166 qp->n_posted_rbufs--;
2167 if (qp->n_posted_rbufs == 0)
2168 cv_signal(&qp->posted_rbufs_cv);
2169 mutex_exit(&qp->posted_rbufs_lock);
2173 * Wait for send completion notification. Only on receiving a
2174 * notification be it a successful or error completion, free the
2175 * send_wid.
2177 static rdma_stat
2178 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2180 clock_t timout, cv_wait_ret;
2181 rdma_stat error = RDMA_SUCCESS;
2182 int i;
2185 * Wait for send to complete
2187 ASSERT(wd != NULL);
2188 mutex_enter(&wd->sendwait_lock);
2189 if (wd->status == (uint_t)SEND_WAIT) {
2190 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2191 ddi_get_lbolt();
2193 if (qp->mode == RIB_SERVER) {
2194 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2195 &wd->sendwait_lock, timout)) > 0 &&
2196 wd->status == (uint_t)SEND_WAIT)
2198 switch (cv_wait_ret) {
2199 case -1: /* timeout */
2200 DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2202 wd->cv_sig = 0; /* no signal needed */
2203 error = RDMA_TIMEDOUT;
2204 break;
2205 default: /* got send completion */
2206 break;
2208 } else {
2209 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2210 &wd->sendwait_lock, timout)) > 0 &&
2211 wd->status == (uint_t)SEND_WAIT)
2213 switch (cv_wait_ret) {
2214 case -1: /* timeout */
2215 DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2217 wd->cv_sig = 0; /* no signal needed */
2218 error = RDMA_TIMEDOUT;
2219 break;
2220 case 0: /* interrupted */
2221 DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2223 wd->cv_sig = 0; /* no signal needed */
2224 error = RDMA_INTR;
2225 break;
2226 default: /* got send completion */
2227 break;
2232 if (wd->status != (uint_t)SEND_WAIT) {
2233 /* got send completion */
2234 if (wd->status != RDMA_SUCCESS) {
2235 switch (wd->status) {
2236 case RDMA_CONNLOST:
2237 error = RDMA_CONNLOST;
2238 break;
2239 default:
2240 error = RDMA_FAILED;
2241 break;
2244 for (i = 0; i < wd->nsbufs; i++) {
2245 rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2246 (void *)(uintptr_t)wd->sbufaddr[i]);
2249 rib_send_rele(qp);
2251 mutex_exit(&wd->sendwait_lock);
2252 (void) rib_free_sendwait(wd);
2254 } else {
2255 mutex_exit(&wd->sendwait_lock);
2257 return (error);
2260 static struct send_wid *
2261 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2263 struct send_wid *wd;
2265 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2266 wd->xid = xid;
2267 wd->cv_sig = cv_sig;
2268 wd->qp = qp;
2269 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2270 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2271 wd->status = (uint_t)SEND_WAIT;
2273 return (wd);
2276 static int
2277 rib_free_sendwait(struct send_wid *wdesc)
2279 cv_destroy(&wdesc->wait_cv);
2280 mutex_destroy(&wdesc->sendwait_lock);
2281 kmem_free(wdesc, sizeof (*wdesc));
2283 return (0);
2286 static rdma_stat
2287 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2289 mutex_enter(&qp->replylist_lock);
2290 if (rep != NULL) {
2291 (void) rib_remreply(qp, rep);
2292 mutex_exit(&qp->replylist_lock);
2293 return (RDMA_SUCCESS);
2295 mutex_exit(&qp->replylist_lock);
2296 return (RDMA_FAILED);
2300 * Send buffers are freed here only in case of error in posting
2301 * on QP. If the post succeeded, the send buffers are freed upon
2302 * send completion in rib_sendwait() or in the scq_handler.
2304 rdma_stat
2305 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2306 int send_sig, int cv_sig, caddr_t *swid)
2308 struct send_wid *wdesc;
2309 struct clist *clp;
2310 ibt_status_t ibt_status = IBT_SUCCESS;
2311 rdma_stat ret = RDMA_SUCCESS;
2312 ibt_send_wr_t tx_wr;
2313 int i, nds;
2314 ibt_wr_ds_t sgl[DSEG_MAX];
2315 uint_t total_msg_size;
2316 rib_qp_t *qp;
2318 qp = ctoqp(conn);
2320 ASSERT(cl != NULL);
2322 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2324 nds = 0;
2325 total_msg_size = 0;
2326 clp = cl;
2327 while (clp != NULL) {
2328 if (nds >= DSEG_MAX) {
2329 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2330 return (RDMA_FAILED);
2332 sgl[nds].ds_va = clp->w.c_saddr;
2333 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2334 sgl[nds].ds_len = clp->c_len;
2335 total_msg_size += clp->c_len;
2336 clp = clp->c_next;
2337 nds++;
2340 if (send_sig) {
2341 /* Set SEND_SIGNAL flag. */
2342 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2343 wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2344 *swid = (caddr_t)wdesc;
2345 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2346 mutex_enter(&wdesc->sendwait_lock);
2347 wdesc->nsbufs = nds;
2348 for (i = 0; i < nds; i++) {
2349 wdesc->sbufaddr[i] = sgl[i].ds_va;
2351 } else {
2352 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2353 *swid = NULL;
2354 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2357 tx_wr.wr_opcode = IBT_WRC_SEND;
2358 tx_wr.wr_trans = IBT_RC_SRV;
2359 tx_wr.wr_nds = nds;
2360 tx_wr.wr_sgl = sgl;
2362 mutex_enter(&conn->c_lock);
2363 if (conn->c_state == C_CONNECTED) {
2364 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2366 if (conn->c_state != C_CONNECTED ||
2367 ibt_status != IBT_SUCCESS) {
2368 if (conn->c_state != C_DISCONN_PEND)
2369 conn->c_state = C_ERROR_CONN;
2370 mutex_exit(&conn->c_lock);
2371 if (send_sig) {
2372 for (i = 0; i < nds; i++) {
2373 rib_rbuf_free(conn, SEND_BUFFER,
2374 (void *)(uintptr_t)wdesc->sbufaddr[i]);
2376 mutex_exit(&wdesc->sendwait_lock);
2377 (void) rib_free_sendwait(wdesc);
2379 return (RDMA_CONNLOST);
2382 mutex_exit(&conn->c_lock);
2384 if (send_sig) {
2385 rib_send_hold(qp);
2386 mutex_exit(&wdesc->sendwait_lock);
2387 if (cv_sig) {
2389 * cv_wait for send to complete.
2390 * We can fail due to a timeout or signal or
2391 * unsuccessful send.
2393 ret = rib_sendwait(qp, wdesc);
2395 return (ret);
2399 return (RDMA_SUCCESS);
2403 rdma_stat
2404 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2406 rdma_stat ret;
2407 caddr_t wd;
2409 /* send-wait & cv_signal */
2410 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2411 return (ret);
2415 * Deprecated/obsolete interface not used currently
2416 * but earlier used for READ-READ protocol.
2417 * Send RPC reply and wait for RDMA_DONE.
2419 rdma_stat
2420 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2422 rdma_stat ret = RDMA_SUCCESS;
2423 struct rdma_done_list *rd;
2424 clock_t cv_wait_ret;
2425 caddr_t *wid = NULL;
2426 rib_qp_t *qp = ctoqp(conn);
2428 mutex_enter(&qp->rdlist_lock);
2429 rd = rdma_done_add(qp, msgid);
2431 /* No cv_signal (whether send-wait or no-send-wait) */
2432 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2434 if (ret != RDMA_SUCCESS) {
2435 rdma_done_rm(qp, rd);
2436 } else {
2438 * Wait for RDMA_DONE from remote end
2440 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
2441 &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
2442 TR_CLOCK_TICK);
2444 rdma_done_rm(qp, rd);
2446 if (cv_wait_ret < 0) {
2447 ret = RDMA_TIMEDOUT;
2451 mutex_exit(&qp->rdlist_lock);
2452 return (ret);
2455 static struct recv_wid *
2456 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2458 struct recv_wid *rwid;
2460 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2461 rwid->xid = msgid;
2462 rwid->addr = sgl->ds_va;
2463 rwid->qp = qp;
2465 return (rwid);
2468 static void
2469 rib_free_wid(struct recv_wid *rwid)
2471 kmem_free(rwid, sizeof (struct recv_wid));
2474 rdma_stat
2475 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2477 rib_qp_t *qp = ctoqp(conn);
2478 struct clist *clp = cl;
2479 struct reply *rep;
2480 struct recv_wid *rwid;
2481 int nds;
2482 ibt_wr_ds_t sgl[DSEG_MAX];
2483 ibt_recv_wr_t recv_wr;
2484 rdma_stat ret;
2485 ibt_status_t ibt_status;
2488 * rdma_clnt_postrecv uses RECV_BUFFER.
2491 nds = 0;
2492 while (cl != NULL) {
2493 if (nds >= DSEG_MAX) {
2494 ret = RDMA_FAILED;
2495 goto done;
2497 sgl[nds].ds_va = cl->w.c_saddr;
2498 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2499 sgl[nds].ds_len = cl->c_len;
2500 cl = cl->c_next;
2501 nds++;
2504 if (nds != 1) {
2505 ret = RDMA_FAILED;
2506 goto done;
2509 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2510 recv_wr.wr_nds = nds;
2511 recv_wr.wr_sgl = sgl;
2513 rwid = rib_create_wid(qp, &sgl[0], msgid);
2514 if (rwid) {
2515 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2516 } else {
2517 ret = RDMA_NORESOURCE;
2518 goto done;
2520 rep = rib_addreplylist(qp, msgid);
2521 if (!rep) {
2522 rib_free_wid(rwid);
2523 ret = RDMA_NORESOURCE;
2524 goto done;
2527 mutex_enter(&conn->c_lock);
2529 if (conn->c_state == C_CONNECTED) {
2530 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2533 if (conn->c_state != C_CONNECTED ||
2534 ibt_status != IBT_SUCCESS) {
2535 if (conn->c_state != C_DISCONN_PEND)
2536 conn->c_state = C_ERROR_CONN;
2537 mutex_exit(&conn->c_lock);
2538 rib_free_wid(rwid);
2539 (void) rib_rem_rep(qp, rep);
2540 ret = RDMA_CONNLOST;
2541 goto done;
2544 mutex_enter(&qp->posted_rbufs_lock);
2545 qp->n_posted_rbufs++;
2546 mutex_exit(&qp->posted_rbufs_lock);
2548 mutex_exit(&conn->c_lock);
2549 return (RDMA_SUCCESS);
2551 done:
2552 while (clp != NULL) {
2553 rib_rbuf_free(conn, RECV_BUFFER,
2554 (void *)(uintptr_t)clp->w.c_saddr3);
2555 clp = clp->c_next;
2557 return (ret);
2560 rdma_stat
2561 rib_svc_post(CONN* conn, struct clist *cl)
2563 rib_qp_t *qp = ctoqp(conn);
2564 struct svc_recv *s_recvp;
2565 int nds;
2566 ibt_wr_ds_t sgl[DSEG_MAX];
2567 ibt_recv_wr_t recv_wr;
2568 ibt_status_t ibt_status;
2570 nds = 0;
2571 while (cl != NULL) {
2572 if (nds >= DSEG_MAX) {
2573 return (RDMA_FAILED);
2575 sgl[nds].ds_va = cl->w.c_saddr;
2576 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2577 sgl[nds].ds_len = cl->c_len;
2578 cl = cl->c_next;
2579 nds++;
2582 if (nds != 1) {
2583 rib_rbuf_free(conn, RECV_BUFFER,
2584 (caddr_t)(uintptr_t)sgl[0].ds_va);
2586 return (RDMA_FAILED);
2589 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2590 recv_wr.wr_nds = nds;
2591 recv_wr.wr_sgl = sgl;
2593 s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2594 /* Use s_recvp's addr as wr id */
2595 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2596 mutex_enter(&conn->c_lock);
2597 if (conn->c_state == C_CONNECTED) {
2598 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2600 if (conn->c_state != C_CONNECTED ||
2601 ibt_status != IBT_SUCCESS) {
2602 if (conn->c_state != C_DISCONN_PEND)
2603 conn->c_state = C_ERROR_CONN;
2604 mutex_exit(&conn->c_lock);
2605 rib_rbuf_free(conn, RECV_BUFFER,
2606 (caddr_t)(uintptr_t)sgl[0].ds_va);
2607 (void) rib_free_svc_recv(s_recvp);
2609 return (RDMA_CONNLOST);
2611 mutex_exit(&conn->c_lock);
2613 return (RDMA_SUCCESS);
2616 /* Client */
2617 rdma_stat
2618 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2620 return (rib_clnt_post(conn, cl, msgid));
2623 /* Client */
2624 rdma_stat
2625 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2627 rib_qp_t *qp = ctoqp(conn);
2628 struct reply *rep;
2630 mutex_enter(&qp->replylist_lock);
2631 for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2632 if (rep->xid == msgid) {
2633 if (rep->vaddr_cq) {
2634 rib_rbuf_free(conn, RECV_BUFFER,
2635 (caddr_t)(uintptr_t)rep->vaddr_cq);
2637 (void) rib_remreply(qp, rep);
2638 break;
2641 mutex_exit(&qp->replylist_lock);
2643 return (RDMA_SUCCESS);
2646 /* Server */
2647 rdma_stat
2648 rib_post_recv(CONN *conn, struct clist *cl)
2650 rib_qp_t *qp = ctoqp(conn);
2652 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2653 mutex_enter(&qp->posted_rbufs_lock);
2654 qp->n_posted_rbufs++;
2655 mutex_exit(&qp->posted_rbufs_lock);
2656 return (RDMA_SUCCESS);
2658 return (RDMA_FAILED);
2662 * Client side only interface to "recv" the rpc reply buf
2663 * posted earlier by rib_post_resp(conn, cl, msgid).
2665 rdma_stat
2666 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2668 struct reply *rep = NULL;
2669 clock_t timout, cv_wait_ret;
2670 rdma_stat ret = RDMA_SUCCESS;
2671 rib_qp_t *qp = ctoqp(conn);
2674 * Find the reply structure for this msgid
2676 mutex_enter(&qp->replylist_lock);
2678 for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2679 if (rep->xid == msgid)
2680 break;
2683 if (rep != NULL) {
2685 * If message not yet received, wait.
2687 if (rep->status == (uint_t)REPLY_WAIT) {
2688 timout = ddi_get_lbolt() +
2689 drv_usectohz(REPLY_WAIT_TIME * 1000000);
2691 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2692 &qp->replylist_lock, timout)) > 0 &&
2693 rep->status == (uint_t)REPLY_WAIT)
2696 switch (cv_wait_ret) {
2697 case -1: /* timeout */
2698 ret = RDMA_TIMEDOUT;
2699 break;
2700 case 0:
2701 ret = RDMA_INTR;
2702 break;
2703 default:
2704 break;
2708 if (rep->status == RDMA_SUCCESS) {
2709 struct clist *cl = NULL;
2712 * Got message successfully
2714 clist_add(&cl, 0, rep->bytes_xfer, NULL,
2715 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2716 *clp = cl;
2717 } else {
2718 if (rep->status != (uint_t)REPLY_WAIT) {
2720 * Got error in reply message. Free
2721 * recv buffer here.
2723 ret = rep->status;
2724 rib_rbuf_free(conn, RECV_BUFFER,
2725 (caddr_t)(uintptr_t)rep->vaddr_cq);
2728 (void) rib_remreply(qp, rep);
2729 } else {
2731 * No matching reply structure found for given msgid on the
2732 * reply wait list.
2734 ret = RDMA_INVAL;
2735 DTRACE_PROBE(rpcib__i__nomatchxid2);
2739 * Done.
2741 mutex_exit(&qp->replylist_lock);
2742 return (ret);
2746 * RDMA write a buffer to the remote address.
2748 rdma_stat
2749 rib_write(CONN *conn, struct clist *cl, int wait)
2751 ibt_send_wr_t tx_wr;
2752 int cv_sig;
2753 ibt_wr_ds_t sgl[DSEG_MAX];
2754 struct send_wid *wdesc;
2755 ibt_status_t ibt_status;
2756 rdma_stat ret = RDMA_SUCCESS;
2757 rib_qp_t *qp = ctoqp(conn);
2758 uint64_t n_writes = 0;
2760 if (cl == NULL) {
2761 return (RDMA_FAILED);
2764 while ((cl != NULL)) {
2765 if (cl->c_len > 0) {
2766 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2767 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2768 tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2769 cl->c_dmemhandle.mrc_rmr; /* rkey */
2770 sgl[0].ds_va = cl->w.c_saddr;
2771 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2772 sgl[0].ds_len = cl->c_len;
2774 if (wait) {
2775 cv_sig = 1;
2776 } else {
2777 if (n_writes > max_unsignaled_rws) {
2778 n_writes = 0;
2779 cv_sig = 1;
2780 } else {
2781 cv_sig = 0;
2785 if (cv_sig) {
2786 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2787 wdesc = rib_init_sendwait(0, cv_sig, qp);
2788 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2789 mutex_enter(&wdesc->sendwait_lock);
2790 } else {
2791 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2792 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2794 tx_wr.wr_opcode = IBT_WRC_RDMAW;
2795 tx_wr.wr_trans = IBT_RC_SRV;
2796 tx_wr.wr_nds = 1;
2797 tx_wr.wr_sgl = sgl;
2799 mutex_enter(&conn->c_lock);
2800 if (conn->c_state == C_CONNECTED) {
2801 ibt_status =
2802 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2804 if (conn->c_state != C_CONNECTED ||
2805 ibt_status != IBT_SUCCESS) {
2806 if (conn->c_state != C_DISCONN_PEND)
2807 conn->c_state = C_ERROR_CONN;
2808 mutex_exit(&conn->c_lock);
2809 if (cv_sig) {
2810 mutex_exit(&wdesc->sendwait_lock);
2811 (void) rib_free_sendwait(wdesc);
2813 return (RDMA_CONNLOST);
2816 mutex_exit(&conn->c_lock);
2819 * Wait for send to complete
2821 if (cv_sig) {
2823 rib_send_hold(qp);
2824 mutex_exit(&wdesc->sendwait_lock);
2826 ret = rib_sendwait(qp, wdesc);
2827 if (ret != 0)
2828 return (ret);
2830 n_writes ++;
2832 cl = cl->c_next;
2834 return (RDMA_SUCCESS);
2838 * RDMA Read a buffer from the remote address.
2840 rdma_stat
2841 rib_read(CONN *conn, struct clist *cl, int wait)
2843 ibt_send_wr_t rx_wr;
2844 int cv_sig = 0;
2845 ibt_wr_ds_t sgl;
2846 struct send_wid *wdesc;
2847 ibt_status_t ibt_status = IBT_SUCCESS;
2848 rdma_stat ret = RDMA_SUCCESS;
2849 rib_qp_t *qp = ctoqp(conn);
2851 if (cl == NULL) {
2852 return (RDMA_FAILED);
2855 while (cl != NULL) {
2856 bzero(&rx_wr, sizeof (ibt_send_wr_t));
2858 * Remote address is at the head chunk item in list.
2860 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2861 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2863 sgl.ds_va = cl->u.c_daddr;
2864 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2865 sgl.ds_len = cl->c_len;
2868 * If there are multiple chunks to be read, and
2869 * wait is set, ask for signal only for the last chunk
2870 * and wait only on the last chunk. The completion of
2871 * RDMA_READ on last chunk ensures that reads on all
2872 * previous chunks are also completed.
2874 if (wait && (cl->c_next == NULL)) {
2875 cv_sig = 1;
2876 wdesc = rib_init_sendwait(0, cv_sig, qp);
2877 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2878 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2879 mutex_enter(&wdesc->sendwait_lock);
2880 } else {
2881 rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2882 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2884 rx_wr.wr_opcode = IBT_WRC_RDMAR;
2885 rx_wr.wr_trans = IBT_RC_SRV;
2886 rx_wr.wr_nds = 1;
2887 rx_wr.wr_sgl = &sgl;
2889 mutex_enter(&conn->c_lock);
2890 if (conn->c_state == C_CONNECTED) {
2891 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2893 if (conn->c_state != C_CONNECTED ||
2894 ibt_status != IBT_SUCCESS) {
2895 if (conn->c_state != C_DISCONN_PEND)
2896 conn->c_state = C_ERROR_CONN;
2897 mutex_exit(&conn->c_lock);
2898 if (wait && (cl->c_next == NULL)) {
2899 mutex_exit(&wdesc->sendwait_lock);
2900 (void) rib_free_sendwait(wdesc);
2902 return (RDMA_CONNLOST);
2905 mutex_exit(&conn->c_lock);
2908 * Wait for send to complete if this is the
2909 * last item in the list.
2911 if (wait && cl->c_next == NULL) {
2912 rib_send_hold(qp);
2913 mutex_exit(&wdesc->sendwait_lock);
2915 ret = rib_sendwait(qp, wdesc);
2917 if (ret != 0)
2918 return (ret);
2920 cl = cl->c_next;
2922 return (RDMA_SUCCESS);
2926 * rib_srv_cm_handler()
2927 * Connection Manager callback to handle RC connection requests.
2929 /* ARGSUSED */
2930 static ibt_cm_status_t
2931 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2932 ibt_cm_return_args_t *ret_args, void *priv_data,
2933 ibt_priv_data_len_t len)
2935 queue_t *q;
2936 rib_qp_t *qp;
2937 rib_hca_t *hca;
2938 rdma_stat status = RDMA_SUCCESS;
2939 int i;
2940 struct clist cl;
2941 rdma_buf_t rdbuf = {0};
2942 void *buf = NULL;
2943 CONN *conn;
2944 ibt_ip_cm_info_t ipinfo;
2945 struct sockaddr_in *s;
2946 struct sockaddr_in6 *s6;
2947 int sin_size = sizeof (struct sockaddr_in);
2948 int in_size = sizeof (struct in_addr);
2949 int sin6_size = sizeof (struct sockaddr_in6);
2951 ASSERT(any != NULL);
2952 ASSERT(event != NULL);
2954 hca = (rib_hca_t *)any;
2956 /* got a connection request */
2957 switch (event->cm_type) {
2958 case IBT_CM_EVENT_REQ_RCV:
2960 * If the plugin is in the NO_ACCEPT state, bail out.
2962 mutex_enter(&plugin_state_lock);
2963 if (plugin_state == NO_ACCEPT) {
2964 mutex_exit(&plugin_state_lock);
2965 return (IBT_CM_REJECT);
2967 mutex_exit(&plugin_state_lock);
2970 * Need to send a MRA MAD to CM so that it does not
2971 * timeout on us.
2973 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2974 event->cm_event.req.req_timeout * 8, NULL, 0);
2976 mutex_enter(&rib_stat->open_hca_lock);
2977 q = rib_stat->q;
2978 mutex_exit(&rib_stat->open_hca_lock);
2980 status = rib_svc_create_chan(hca, (caddr_t)q,
2981 event->cm_event.req.req_prim_hca_port, &qp);
2983 if (status) {
2984 return (IBT_CM_REJECT);
2987 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2988 ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2989 ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2990 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2993 * Pre-posts RECV buffers
2995 conn = qptoc(qp);
2996 for (i = 0; i < preposted_rbufs; i++) {
2997 bzero(&rdbuf, sizeof (rdbuf));
2998 rdbuf.type = RECV_BUFFER;
2999 buf = rib_rbuf_alloc(conn, &rdbuf);
3000 if (buf == NULL) {
3002 * A connection is not established yet.
3003 * Just flush the channel. Buffers
3004 * posted till now will error out with
3005 * IBT_WC_WR_FLUSHED_ERR.
3007 (void) ibt_flush_channel(qp->qp_hdl);
3008 (void) rib_disconnect_channel(conn, NULL);
3009 return (IBT_CM_REJECT);
3012 bzero(&cl, sizeof (cl));
3013 cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
3014 cl.c_len = rdbuf.len;
3015 cl.c_smemhandle.mrc_lmr =
3016 rdbuf.handle.mrc_lmr; /* lkey */
3017 cl.c_next = NULL;
3018 status = rib_post_recv(conn, &cl);
3019 if (status != RDMA_SUCCESS) {
3021 * A connection is not established yet.
3022 * Just flush the channel. Buffers
3023 * posted till now will error out with
3024 * IBT_WC_WR_FLUSHED_ERR.
3026 (void) ibt_flush_channel(qp->qp_hdl);
3027 (void) rib_disconnect_channel(conn, NULL);
3028 return (IBT_CM_REJECT);
3031 (void) rib_add_connlist(conn, &hca->srv_conn_list);
3034 * Get the address translation
3036 rw_enter(&hca->state_lock, RW_READER);
3037 if (hca->state == HCA_DETACHED) {
3038 rw_exit(&hca->state_lock);
3039 return (IBT_CM_REJECT);
3041 rw_exit(&hca->state_lock);
3043 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
3045 if (ibt_get_ip_data(event->cm_priv_data_len,
3046 event->cm_priv_data,
3047 &ipinfo) != IBT_SUCCESS) {
3049 return (IBT_CM_REJECT);
3052 switch (ipinfo.src_addr.family) {
3053 case AF_INET:
3055 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
3056 KM_SLEEP);
3057 (void) strcpy(conn->c_netid, RIBNETID_TCP);
3059 conn->c_raddr.maxlen =
3060 conn->c_raddr.len = sin_size;
3061 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3063 s = (struct sockaddr_in *)conn->c_raddr.buf;
3064 s->sin_family = AF_INET;
3065 bcopy((void *)&ipinfo.src_addr.un.ip4addr,
3066 &s->sin_addr, in_size);
3068 conn->c_laddr.maxlen =
3069 conn->c_laddr.len = sin_size;
3070 conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3072 s = (struct sockaddr_in *)conn->c_laddr.buf;
3073 s->sin_family = AF_INET;
3074 bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
3075 &s->sin_addr, in_size);
3077 conn->c_addrmask.maxlen = conn->c_addrmask.len =
3078 sizeof (struct sockaddr_in);
3079 conn->c_addrmask.buf =
3080 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3081 ((struct sockaddr_in *)
3082 conn->c_addrmask.buf)->sin_addr.s_addr =
3083 (uint32_t)~0;
3084 ((struct sockaddr_in *)
3085 conn->c_addrmask.buf)->sin_family =
3086 (sa_family_t)~0;
3087 break;
3089 case AF_INET6:
3091 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
3092 KM_SLEEP);
3093 (void) strcpy(conn->c_netid, RIBNETID_TCP6);
3095 conn->c_raddr.maxlen =
3096 conn->c_raddr.len = sin6_size;
3097 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3099 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3100 s6->sin6_family = AF_INET6;
3101 bcopy((void *)&ipinfo.src_addr.un.ip6addr,
3102 &s6->sin6_addr,
3103 sizeof (struct in6_addr));
3105 conn->c_laddr.maxlen =
3106 conn->c_laddr.len = sin6_size;
3107 conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3109 s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
3110 s6->sin6_family = AF_INET6;
3111 bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
3112 &s6->sin6_addr,
3113 sizeof (struct in6_addr));
3115 conn->c_addrmask.maxlen = conn->c_addrmask.len =
3116 sizeof (struct sockaddr_in6);
3117 conn->c_addrmask.buf =
3118 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3119 (void) memset(&((struct sockaddr_in6 *)
3120 conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0,
3121 sizeof (struct in6_addr));
3122 ((struct sockaddr_in6 *)
3123 conn->c_addrmask.buf)->sin6_family =
3124 (sa_family_t)~0;
3125 break;
3127 default:
3128 return (IBT_CM_REJECT);
3131 break;
3133 case IBT_CM_EVENT_CONN_CLOSED:
3135 CONN *conn;
3136 rib_qp_t *qp;
3138 switch (event->cm_event.closed) {
3139 case IBT_CM_CLOSED_DREP_RCVD:
3140 case IBT_CM_CLOSED_DREQ_TIMEOUT:
3141 case IBT_CM_CLOSED_DUP:
3142 case IBT_CM_CLOSED_ABORT:
3143 case IBT_CM_CLOSED_ALREADY:
3145 * These cases indicate the local end initiated
3146 * the closing of the channel. Nothing to do here.
3148 break;
3149 default:
3151 * Reason for CONN_CLOSED event must be one of
3152 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
3153 * or IBT_CM_CLOSED_STALE. These indicate cases were
3154 * the remote end is closing the channel. In these
3155 * cases free the channel and transition to error
3156 * state
3158 qp = ibt_get_chan_private(event->cm_channel);
3159 conn = qptoc(qp);
3160 mutex_enter(&conn->c_lock);
3161 if (conn->c_state == C_DISCONN_PEND) {
3162 mutex_exit(&conn->c_lock);
3163 break;
3165 conn->c_state = C_ERROR_CONN;
3168 * Free the conn if c_ref goes down to 0
3170 if (conn->c_ref == 0) {
3172 * Remove from list and free conn
3174 conn->c_state = C_DISCONN_PEND;
3175 mutex_exit(&conn->c_lock);
3176 (void) rib_disconnect_channel(conn,
3177 &hca->srv_conn_list);
3178 } else {
3180 * conn will be freed when c_ref goes to 0.
3181 * Indicate to cleaning thread not to close
3182 * the connection, but just free the channel.
3184 conn->c_flags |= C_CLOSE_NOTNEEDED;
3185 mutex_exit(&conn->c_lock);
3187 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3188 break;
3190 break;
3192 case IBT_CM_EVENT_CONN_EST:
3194 * RTU received, hence connection established.
3196 if (rib_debug > 1)
3197 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3198 "(CONN_EST) channel established");
3199 break;
3201 default:
3202 if (rib_debug > 2) {
3203 /* Let CM handle the following events. */
3204 if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3205 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3206 "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3207 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3208 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3209 "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3210 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3211 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3212 "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3213 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3214 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3215 "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3216 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3217 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3218 "server recv'ed IBT_CM_EVENT_FAILURE\n");
3221 return (IBT_CM_DEFAULT);
3224 /* accept all other CM messages (i.e. let the CM handle them) */
3225 return (IBT_CM_ACCEPT);
3228 static rdma_stat
3229 rib_register_service(rib_hca_t *hca, int service_type,
3230 uint8_t protocol_num, in_port_t dst_port)
3232 ibt_srv_desc_t sdesc;
3233 ibt_hca_portinfo_t *port_infop;
3234 ib_svc_id_t srv_id;
3235 ibt_srv_hdl_t srv_hdl;
3236 uint_t port_size;
3237 uint_t pki, i, num_ports, nbinds;
3238 ibt_status_t ibt_status;
3239 rib_service_t *service;
3240 ib_pkey_t pkey;
3243 * Query all ports for the given HCA
3245 rw_enter(&hca->state_lock, RW_READER);
3246 if (hca->state != HCA_DETACHED) {
3247 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3248 &num_ports, &port_size);
3249 rw_exit(&hca->state_lock);
3250 } else {
3251 rw_exit(&hca->state_lock);
3252 return (RDMA_FAILED);
3254 if (ibt_status != IBT_SUCCESS) {
3255 return (RDMA_FAILED);
3258 DTRACE_PROBE1(rpcib__i__regservice_numports,
3259 int, num_ports);
3261 for (i = 0; i < num_ports; i++) {
3262 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3263 DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3264 int, i+1);
3265 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3266 DTRACE_PROBE1(rpcib__i__regservice__portactive,
3267 int, i+1);
3272 * Get all the IP addresses on this system to register the
3273 * given "service type" on all DNS recognized IP addrs.
3274 * Each service type such as NFS will have all the systems
3275 * IP addresses as its different names. For now the only
3276 * type of service we support in RPCIB is NFS.
3278 rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3280 * Start registering and binding service to active
3281 * on active ports on this HCA.
3283 nbinds = 0;
3284 for (service = rib_stat->service_list;
3285 service && (service->srv_type != service_type);
3286 service = service->next)
3289 if (service == NULL) {
3291 * We use IP addresses as the service names for
3292 * service registration. Register each of them
3293 * with CM to obtain a svc_id and svc_hdl. We do not
3294 * register the service with machine's loopback address.
3296 (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3297 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3298 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3299 sdesc.sd_handler = rib_srv_cm_handler;
3300 sdesc.sd_flags = 0;
3301 ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3302 &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3303 1, &srv_hdl, &srv_id);
3304 if ((ibt_status != IBT_SUCCESS) &&
3305 (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3306 rw_exit(&rib_stat->service_list_lock);
3307 DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3308 int, ibt_status);
3309 ibt_free_portinfo(port_infop, port_size);
3310 return (RDMA_FAILED);
3314 * Allocate and prepare a service entry
3316 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3318 service->srv_type = service_type;
3319 service->srv_hdl = srv_hdl;
3320 service->srv_id = srv_id;
3322 service->next = rib_stat->service_list;
3323 rib_stat->service_list = service;
3324 DTRACE_PROBE1(rpcib__i__regservice__new__service,
3325 int, service->srv_type);
3326 } else {
3327 srv_hdl = service->srv_hdl;
3328 srv_id = service->srv_id;
3329 DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3330 int, service->srv_type);
3333 for (i = 0; i < num_ports; i++) {
3334 ibt_sbind_hdl_t sbp;
3335 rib_hca_service_t *hca_srv;
3336 ib_gid_t gid;
3338 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3339 continue;
3341 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3342 pkey = port_infop[i].p_pkey_tbl[pki];
3344 rw_enter(&hca->bound_services_lock, RW_READER);
3345 gid = port_infop[i].p_sgid_tbl[0];
3346 for (hca_srv = hca->bound_services; hca_srv;
3347 hca_srv = hca_srv->next) {
3348 if ((hca_srv->srv_id == service->srv_id) &&
3349 (hca_srv->gid.gid_prefix ==
3350 gid.gid_prefix) &&
3351 (hca_srv->gid.gid_guid == gid.gid_guid))
3352 break;
3354 rw_exit(&hca->bound_services_lock);
3355 if (hca_srv != NULL) {
3357 * port is alreay bound the the service
3359 DTRACE_PROBE1(
3360 rpcib__i__regservice__already__bound,
3361 int, i+1);
3362 nbinds++;
3363 continue;
3366 if ((pkey & IBSRM_HB) &&
3367 (pkey != IB_PKEY_INVALID_FULL)) {
3369 sbp = NULL;
3370 ibt_status = ibt_bind_service(srv_hdl,
3371 gid, NULL, hca, &sbp);
3373 if (ibt_status == IBT_SUCCESS) {
3374 hca_srv = kmem_zalloc(
3375 sizeof (rib_hca_service_t),
3376 KM_SLEEP);
3377 hca_srv->srv_id = srv_id;
3378 hca_srv->gid = gid;
3379 hca_srv->sbind_hdl = sbp;
3381 rw_enter(&hca->bound_services_lock,
3382 RW_WRITER);
3383 hca_srv->next = hca->bound_services;
3384 hca->bound_services = hca_srv;
3385 rw_exit(&hca->bound_services_lock);
3386 nbinds++;
3389 DTRACE_PROBE1(rpcib__i__regservice__bindres,
3390 int, ibt_status);
3394 rw_exit(&rib_stat->service_list_lock);
3396 ibt_free_portinfo(port_infop, port_size);
3398 if (nbinds == 0) {
3399 return (RDMA_FAILED);
3400 } else {
3402 * Put this plugin into accept state, since atleast
3403 * one registration was successful.
3405 mutex_enter(&plugin_state_lock);
3406 plugin_state = ACCEPT;
3407 mutex_exit(&plugin_state_lock);
3408 return (RDMA_SUCCESS);
3412 void
3413 rib_listen(struct rdma_svc_data *rd)
3415 rdma_stat status;
3416 int n_listening = 0;
3417 rib_hca_t *hca;
3419 mutex_enter(&rib_stat->listen_lock);
3421 * if rd parameter is NULL then it means that rib_stat->q is
3422 * already initialized by a call from RDMA and we just want to
3423 * add a newly attached HCA to the same listening state as other
3424 * HCAs.
3426 if (rd == NULL) {
3427 if (rib_stat->q == NULL) {
3428 mutex_exit(&rib_stat->listen_lock);
3429 return;
3431 } else {
3432 rib_stat->q = &rd->q;
3434 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3435 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3437 * First check if a hca is still attached
3439 rw_enter(&hca->state_lock, RW_READER);
3440 if (hca->state != HCA_INITED) {
3441 rw_exit(&hca->state_lock);
3442 continue;
3444 rw_exit(&hca->state_lock);
3447 * Right now the only service type is NFS. Hence
3448 * force feed this value. Ideally to communicate
3449 * the service type it should be passed down in
3450 * rdma_svc_data.
3452 status = rib_register_service(hca, NFS,
3453 IPPROTO_TCP, nfs_rdma_port);
3454 if (status == RDMA_SUCCESS)
3455 n_listening++;
3457 rw_exit(&rib_stat->hcas_list_lock);
3460 * Service active on an HCA, check rd->err_code for more
3461 * explainable errors.
3463 if (rd) {
3464 if (n_listening > 0) {
3465 rd->active = 1;
3466 rd->err_code = RDMA_SUCCESS;
3467 } else {
3468 rd->active = 0;
3469 rd->err_code = RDMA_FAILED;
3472 mutex_exit(&rib_stat->listen_lock);
3475 /* XXXX */
3476 /* ARGSUSED */
3477 static void
3478 rib_listen_stop(struct rdma_svc_data *svcdata)
3480 rib_hca_t *hca;
3482 mutex_enter(&rib_stat->listen_lock);
3484 * KRPC called the RDMATF to stop the listeners, this means
3485 * stop sending incomming or recieved requests to KRPC master
3486 * transport handle for RDMA-IB. This is also means that the
3487 * master transport handle, responsible for us, is going away.
3489 mutex_enter(&plugin_state_lock);
3490 plugin_state = NO_ACCEPT;
3491 if (svcdata != NULL)
3492 svcdata->active = 0;
3493 mutex_exit(&plugin_state_lock);
3495 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3496 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3498 * First check if a hca is still attached
3500 rw_enter(&hca->state_lock, RW_READER);
3501 if (hca->state == HCA_DETACHED) {
3502 rw_exit(&hca->state_lock);
3503 continue;
3505 rib_close_channels(&hca->srv_conn_list);
3506 rib_stop_services(hca);
3507 rw_exit(&hca->state_lock);
3509 rw_exit(&rib_stat->hcas_list_lock);
3512 * Avoid rib_listen() using the stale q field.
3513 * This could happen if a port goes up after all services
3514 * are already unregistered.
3516 rib_stat->q = NULL;
3517 mutex_exit(&rib_stat->listen_lock);
3521 * Traverse the HCA's service list to unbind and deregister services.
3522 * For each bound service of HCA to be removed, first find the corresponding
3523 * service handle (srv_hdl) and then unbind the service by calling
3524 * ibt_unbind_service().
3526 static void
3527 rib_stop_services(rib_hca_t *hca)
3529 rib_hca_service_t *srv_list, *to_remove;
3532 * unbind and deregister the services for this service type.
3533 * Right now there is only one service type. In future it will
3534 * be passed down to this function.
3536 rw_enter(&hca->bound_services_lock, RW_READER);
3537 srv_list = hca->bound_services;
3538 hca->bound_services = NULL;
3539 rw_exit(&hca->bound_services_lock);
3541 while (srv_list != NULL) {
3542 rib_service_t *sc;
3544 to_remove = srv_list;
3545 srv_list = to_remove->next;
3546 rw_enter(&rib_stat->service_list_lock, RW_READER);
3547 for (sc = rib_stat->service_list;
3548 sc && (sc->srv_id != to_remove->srv_id);
3549 sc = sc->next)
3552 * if sc is NULL then the service doesn't exist anymore,
3553 * probably just removed completely through rib_stat.
3555 if (sc != NULL)
3556 (void) ibt_unbind_service(sc->srv_hdl,
3557 to_remove->sbind_hdl);
3558 rw_exit(&rib_stat->service_list_lock);
3559 kmem_free(to_remove, sizeof (rib_hca_service_t));
3563 static struct svc_recv *
3564 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3566 struct svc_recv *recvp;
3568 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3569 recvp->vaddr = sgl->ds_va;
3570 recvp->qp = qp;
3571 recvp->bytes_xfer = 0;
3572 return (recvp);
3575 static int
3576 rib_free_svc_recv(struct svc_recv *recvp)
3578 kmem_free(recvp, sizeof (*recvp));
3580 return (0);
3583 static struct reply *
3584 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3586 struct reply *rep;
3589 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3590 if (rep == NULL) {
3591 DTRACE_PROBE(rpcib__i__addrreply__nomem);
3592 return (NULL);
3594 rep->xid = msgid;
3595 rep->vaddr_cq = (uintptr_t)NULL;
3596 rep->bytes_xfer = 0;
3597 rep->status = (uint_t)REPLY_WAIT;
3598 rep->prev = NULL;
3599 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3601 mutex_enter(&qp->replylist_lock);
3602 if (qp->replylist) {
3603 rep->next = qp->replylist;
3604 qp->replylist->prev = rep;
3606 qp->rep_list_size++;
3608 DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3609 int, qp->rep_list_size);
3611 qp->replylist = rep;
3612 mutex_exit(&qp->replylist_lock);
3614 return (rep);
3617 static rdma_stat
3618 rib_rem_replylist(rib_qp_t *qp)
3620 struct reply *r, *n;
3622 mutex_enter(&qp->replylist_lock);
3623 for (r = qp->replylist; r != NULL; r = n) {
3624 n = r->next;
3625 (void) rib_remreply(qp, r);
3627 mutex_exit(&qp->replylist_lock);
3629 return (RDMA_SUCCESS);
3632 static int
3633 rib_remreply(rib_qp_t *qp, struct reply *rep)
3636 ASSERT(MUTEX_HELD(&qp->replylist_lock));
3637 if (rep->prev) {
3638 rep->prev->next = rep->next;
3640 if (rep->next) {
3641 rep->next->prev = rep->prev;
3643 if (qp->replylist == rep)
3644 qp->replylist = rep->next;
3646 cv_destroy(&rep->wait_cv);
3647 qp->rep_list_size--;
3649 DTRACE_PROBE1(rpcib__i__remreply__listsize,
3650 int, qp->rep_list_size);
3652 kmem_free(rep, sizeof (*rep));
3654 return (0);
3657 rdma_stat
3658 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
3659 struct mrc *buf_handle)
3661 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3662 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3663 rdma_stat status;
3664 rib_hca_t *hca = (ctoqp(conn))->hca;
3667 * Note: ALL buffer pools use the same memory type RDMARW.
3669 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3670 if (status == RDMA_SUCCESS) {
3671 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3672 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3673 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3674 } else {
3675 buf_handle->mrc_linfo = (uintptr_t)NULL;
3676 buf_handle->mrc_lmr = 0;
3677 buf_handle->mrc_rmr = 0;
3679 return (status);
3682 static rdma_stat
3683 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3684 ibt_mr_flags_t spec,
3685 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3687 ibt_mr_attr_t mem_attr;
3688 ibt_status_t ibt_status;
3689 mem_attr.mr_vaddr = (uintptr_t)buf;
3690 mem_attr.mr_len = (ib_msglen_t)size;
3691 mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3692 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3693 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3694 IBT_MR_ENABLE_WINDOW_BIND | spec;
3696 rw_enter(&hca->state_lock, RW_READER);
3697 if (hca->state != HCA_DETACHED) {
3698 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3699 &mem_attr, mr_hdlp, mr_descp);
3700 rw_exit(&hca->state_lock);
3701 } else {
3702 rw_exit(&hca->state_lock);
3703 return (RDMA_FAILED);
3706 if (ibt_status != IBT_SUCCESS) {
3707 return (RDMA_FAILED);
3709 return (RDMA_SUCCESS);
3712 rdma_stat
3713 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
3714 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3716 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3717 rib_lrc_entry_t *l;
3718 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3719 rdma_stat status;
3720 rib_hca_t *hca = (ctoqp(conn))->hca;
3723 * Non-coherent memory registration.
3725 l = (rib_lrc_entry_t *)lrc;
3726 if (l) {
3727 if (l->registered) {
3728 buf_handle->mrc_linfo =
3729 (uintptr_t)l->lrc_mhandle.mrc_linfo;
3730 buf_handle->mrc_lmr =
3731 (uint32_t)l->lrc_mhandle.mrc_lmr;
3732 buf_handle->mrc_rmr =
3733 (uint32_t)l->lrc_mhandle.mrc_rmr;
3734 *sync_handle = (RIB_SYNCMEM_HANDLE)
3735 (uintptr_t)l->lrc_mhandle.mrc_linfo;
3736 return (RDMA_SUCCESS);
3737 } else {
3738 /* Always register the whole buffer */
3739 buf = (caddr_t)l->lrc_buf;
3740 buflen = l->lrc_len;
3743 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3745 if (status == RDMA_SUCCESS) {
3746 if (l) {
3747 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3748 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey;
3749 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey;
3750 l->registered = TRUE;
3752 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3753 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3754 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3755 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3756 } else {
3757 buf_handle->mrc_linfo = (uintptr_t)NULL;
3758 buf_handle->mrc_lmr = 0;
3759 buf_handle->mrc_rmr = 0;
3761 return (status);
3764 /* ARGSUSED */
3765 rdma_stat
3766 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3768 rib_hca_t *hca = (ctoqp(conn))->hca;
3770 * Allow memory deregistration even if HCA is
3771 * getting detached. Need all outstanding
3772 * memory registrations to be deregistered
3773 * before HCA_DETACH_EVENT can be accepted.
3775 (void) ibt_deregister_mr(hca->hca_hdl,
3776 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3777 return (RDMA_SUCCESS);
3780 /* ARGSUSED */
3781 rdma_stat
3782 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3783 RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3785 rib_lrc_entry_t *l;
3786 l = (rib_lrc_entry_t *)lrc;
3787 if (l)
3788 if (l->registered)
3789 return (RDMA_SUCCESS);
3791 (void) rib_deregistermem(conn, buf, buf_handle);
3793 return (RDMA_SUCCESS);
3796 /* ARGSUSED */
3797 rdma_stat
3798 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3799 int len, int cpu)
3801 ibt_status_t status;
3802 rib_hca_t *hca = (ctoqp(conn))->hca;
3803 ibt_mr_sync_t mr_segment;
3805 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3806 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3807 mr_segment.ms_len = (ib_memlen_t)len;
3808 if (cpu) {
3809 /* make incoming data visible to memory */
3810 mr_segment.ms_flags = IBT_SYNC_WRITE;
3811 } else {
3812 /* make memory changes visible to IO */
3813 mr_segment.ms_flags = IBT_SYNC_READ;
3815 rw_enter(&hca->state_lock, RW_READER);
3816 if (hca->state != HCA_DETACHED) {
3817 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3818 rw_exit(&hca->state_lock);
3819 } else {
3820 rw_exit(&hca->state_lock);
3821 return (RDMA_FAILED);
3824 if (status == IBT_SUCCESS)
3825 return (RDMA_SUCCESS);
3826 else {
3827 return (RDMA_FAILED);
3832 * XXXX ????
3834 static rdma_stat
3835 rib_getinfo(rdma_info_t *info)
3838 * XXXX Hack!
3840 info->addrlen = 16;
3841 info->mts = 1000000;
3842 info->mtu = 1000000;
3844 return (RDMA_SUCCESS);
3847 rib_bufpool_t *
3848 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3850 rib_bufpool_t *rbp = NULL;
3851 bufpool_t *bp = NULL;
3852 caddr_t buf;
3853 ibt_mr_attr_t mem_attr;
3854 ibt_status_t ibt_status;
3855 int i, j;
3857 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3859 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3860 num * sizeof (void *), KM_SLEEP);
3862 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3863 bp->numelems = num;
3866 switch (ptype) {
3867 case SEND_BUFFER:
3868 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3869 bp->rsize = RPC_MSG_SZ;
3870 break;
3871 case RECV_BUFFER:
3872 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3873 bp->rsize = RPC_BUF_SIZE;
3874 break;
3875 default:
3876 goto fail;
3880 * Register the pool.
3882 bp->bufsize = num * bp->rsize;
3883 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3884 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3885 sizeof (ibt_mr_hdl_t), KM_SLEEP);
3886 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3887 sizeof (ibt_mr_desc_t), KM_SLEEP);
3888 rw_enter(&hca->state_lock, RW_READER);
3890 if (hca->state == HCA_DETACHED) {
3891 rw_exit(&hca->state_lock);
3892 goto fail;
3895 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3896 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3897 mem_attr.mr_vaddr = (uintptr_t)buf;
3898 mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3899 mem_attr.mr_as = NULL;
3900 ibt_status = ibt_register_mr(hca->hca_hdl,
3901 hca->pd_hdl, &mem_attr,
3902 &rbp->mr_hdl[i],
3903 &rbp->mr_desc[i]);
3904 if (ibt_status != IBT_SUCCESS) {
3905 for (j = 0; j < i; j++) {
3906 (void) ibt_deregister_mr(hca->hca_hdl,
3907 rbp->mr_hdl[j]);
3909 rw_exit(&hca->state_lock);
3910 goto fail;
3913 rw_exit(&hca->state_lock);
3914 buf = (caddr_t)bp->buf;
3915 for (i = 0; i < num; i++, buf += bp->rsize) {
3916 bp->buflist[i] = (void *)buf;
3918 bp->buffree = num - 1; /* no. of free buffers */
3919 rbp->bpool = bp;
3921 return (rbp);
3922 fail:
3923 if (bp) {
3924 if (bp->buf)
3925 kmem_free(bp->buf, bp->bufsize);
3926 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3928 if (rbp) {
3929 if (rbp->mr_hdl)
3930 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3931 if (rbp->mr_desc)
3932 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3933 kmem_free(rbp, sizeof (rib_bufpool_t));
3935 return (NULL);
3938 static void
3939 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3941 int i;
3942 rib_bufpool_t *rbp = NULL;
3943 bufpool_t *bp;
3946 * Obtain pool address based on type of pool
3948 switch (ptype) {
3949 case SEND_BUFFER:
3950 rbp = hca->send_pool;
3951 break;
3952 case RECV_BUFFER:
3953 rbp = hca->recv_pool;
3954 break;
3955 default:
3956 return;
3958 if (rbp == NULL)
3959 return;
3961 bp = rbp->bpool;
3964 * Deregister the pool memory and free it.
3966 for (i = 0; i < bp->numelems; i++) {
3967 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3971 static void
3972 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3975 rib_bufpool_t *rbp = NULL;
3976 bufpool_t *bp;
3979 * Obtain pool address based on type of pool
3981 switch (ptype) {
3982 case SEND_BUFFER:
3983 rbp = hca->send_pool;
3984 break;
3985 case RECV_BUFFER:
3986 rbp = hca->recv_pool;
3987 break;
3988 default:
3989 return;
3991 if (rbp == NULL)
3992 return;
3994 bp = rbp->bpool;
3997 * Free the pool memory.
3999 if (rbp->mr_hdl)
4000 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4002 if (rbp->mr_desc)
4003 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4004 if (bp->buf)
4005 kmem_free(bp->buf, bp->bufsize);
4006 mutex_destroy(&bp->buflock);
4007 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4008 kmem_free(rbp, sizeof (rib_bufpool_t));
4011 void
4012 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4015 * Deregister the pool memory and free it.
4017 rib_rbufpool_deregister(hca, ptype);
4018 rib_rbufpool_free(hca, ptype);
4022 * Fetch a buffer from the pool of type specified in rdbuf->type.
4024 static rdma_stat
4025 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4027 rib_lrc_entry_t *rlep;
4029 if (rdbuf->type == RDMA_LONG_BUFFER) {
4030 rlep = rib_get_cache_buf(conn, rdbuf->len);
4031 rdbuf->rb_private = (caddr_t)rlep;
4032 rdbuf->addr = rlep->lrc_buf;
4033 rdbuf->handle = rlep->lrc_mhandle;
4034 return (RDMA_SUCCESS);
4037 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4038 if (rdbuf->addr) {
4039 switch (rdbuf->type) {
4040 case SEND_BUFFER:
4041 rdbuf->len = RPC_MSG_SZ; /* 1K */
4042 break;
4043 case RECV_BUFFER:
4044 rdbuf->len = RPC_BUF_SIZE; /* 2K */
4045 break;
4046 default:
4047 rdbuf->len = 0;
4049 return (RDMA_SUCCESS);
4050 } else
4051 return (RDMA_FAILED);
4055 * Fetch a buffer of specified type.
4056 * Note that rdbuf->handle is mw's rkey.
4058 static void *
4059 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4061 rib_qp_t *qp = ctoqp(conn);
4062 rib_hca_t *hca = qp->hca;
4063 rdma_btype ptype = rdbuf->type;
4064 void *buf;
4065 rib_bufpool_t *rbp = NULL;
4066 bufpool_t *bp;
4067 int i;
4070 * Obtain pool address based on type of pool
4072 switch (ptype) {
4073 case SEND_BUFFER:
4074 rbp = hca->send_pool;
4075 break;
4076 case RECV_BUFFER:
4077 rbp = hca->recv_pool;
4078 break;
4079 default:
4080 return (NULL);
4082 if (rbp == NULL)
4083 return (NULL);
4085 bp = rbp->bpool;
4087 mutex_enter(&bp->buflock);
4088 if (bp->buffree < 0) {
4089 mutex_exit(&bp->buflock);
4090 return (NULL);
4093 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4094 buf = bp->buflist[bp->buffree];
4095 rdbuf->addr = buf;
4096 rdbuf->len = bp->rsize;
4097 for (i = bp->numelems - 1; i >= 0; i--) {
4098 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4099 rdbuf->handle.mrc_rmr =
4100 (uint32_t)rbp->mr_desc[i].md_rkey;
4101 rdbuf->handle.mrc_linfo =
4102 (uintptr_t)rbp->mr_hdl[i];
4103 rdbuf->handle.mrc_lmr =
4104 (uint32_t)rbp->mr_desc[i].md_lkey;
4105 bp->buffree--;
4107 mutex_exit(&bp->buflock);
4109 return (buf);
4113 mutex_exit(&bp->buflock);
4115 return (NULL);
4118 static void
4119 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4122 if (rdbuf->type == RDMA_LONG_BUFFER) {
4123 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4124 rdbuf->rb_private = NULL;
4125 return;
4127 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4130 static void
4131 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4133 rib_qp_t *qp = ctoqp(conn);
4134 rib_hca_t *hca = qp->hca;
4135 rib_bufpool_t *rbp = NULL;
4136 bufpool_t *bp;
4139 * Obtain pool address based on type of pool
4141 switch (ptype) {
4142 case SEND_BUFFER:
4143 rbp = hca->send_pool;
4144 break;
4145 case RECV_BUFFER:
4146 rbp = hca->recv_pool;
4147 break;
4148 default:
4149 return;
4151 if (rbp == NULL)
4152 return;
4154 bp = rbp->bpool;
4156 mutex_enter(&bp->buflock);
4157 if (++bp->buffree >= bp->numelems) {
4159 * Should never happen
4161 bp->buffree--;
4162 } else {
4163 bp->buflist[bp->buffree] = buf;
4165 mutex_exit(&bp->buflock);
4168 static rdma_stat
4169 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4171 rw_enter(&connlist->conn_lock, RW_WRITER);
4172 if (connlist->conn_hd) {
4173 cn->c_next = connlist->conn_hd;
4174 connlist->conn_hd->c_prev = cn;
4176 connlist->conn_hd = cn;
4177 rw_exit(&connlist->conn_lock);
4179 return (RDMA_SUCCESS);
4182 static rdma_stat
4183 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4185 rw_enter(&connlist->conn_lock, RW_WRITER);
4186 if (cn->c_prev) {
4187 cn->c_prev->c_next = cn->c_next;
4189 if (cn->c_next) {
4190 cn->c_next->c_prev = cn->c_prev;
4192 if (connlist->conn_hd == cn)
4193 connlist->conn_hd = cn->c_next;
4194 rw_exit(&connlist->conn_lock);
4196 return (RDMA_SUCCESS);
4199 /* ARGSUSED */
4200 static rdma_stat
4201 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4202 int addr_type, void *handle, CONN **conn)
4204 rdma_stat status;
4205 rpcib_ping_t rpt;
4207 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4208 return (status);
4212 * rib_find_hca_connection
4214 * if there is an existing connection to the specified address then
4215 * it will be returned in conn, otherwise conn will be set to NULL.
4216 * Also cleans up any connection that is in error state.
4218 static int
4219 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
4220 struct netbuf *d_svcaddr, CONN **conn)
4222 CONN *cn;
4223 clock_t cv_stat, timout;
4225 *conn = NULL;
4226 again:
4227 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4228 cn = hca->cl_conn_list.conn_hd;
4229 while (cn != NULL) {
4231 * First, clear up any connection in the ERROR state
4233 mutex_enter(&cn->c_lock);
4234 if (cn->c_state == C_ERROR_CONN) {
4235 if (cn->c_ref == 0) {
4237 * Remove connection from list and destroy it.
4239 cn->c_state = C_DISCONN_PEND;
4240 mutex_exit(&cn->c_lock);
4241 rw_exit(&hca->cl_conn_list.conn_lock);
4242 rib_conn_close((void *)cn);
4243 goto again;
4245 mutex_exit(&cn->c_lock);
4246 cn = cn->c_next;
4247 continue;
4249 if (cn->c_state == C_DISCONN_PEND) {
4250 mutex_exit(&cn->c_lock);
4251 cn = cn->c_next;
4252 continue;
4256 * source address is only checked for if there is one,
4257 * this is the case for retries.
4259 if ((cn->c_raddr.len == d_svcaddr->len) &&
4260 (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
4261 d_svcaddr->len) == 0) &&
4262 ((s_svcaddr->len == 0) ||
4263 ((cn->c_laddr.len == s_svcaddr->len) &&
4264 (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
4265 s_svcaddr->len) == 0)))) {
4267 * Our connection. Give up conn list lock
4268 * as we are done traversing the list.
4270 rw_exit(&hca->cl_conn_list.conn_lock);
4271 if (cn->c_state == C_CONNECTED) {
4272 cn->c_ref++; /* sharing a conn */
4273 mutex_exit(&cn->c_lock);
4274 *conn = cn;
4275 return (RDMA_SUCCESS);
4277 if (cn->c_state == C_CONN_PEND) {
4279 * Hold a reference to this conn before
4280 * we give up the lock.
4282 cn->c_ref++;
4283 timout = ddi_get_lbolt() +
4284 drv_usectohz(CONN_WAIT_TIME * 1000000);
4285 while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4286 &cn->c_lock, timout)) > 0 &&
4287 cn->c_state == C_CONN_PEND)
4289 if (cv_stat == 0) {
4290 (void) rib_conn_release_locked(cn);
4291 return (RDMA_INTR);
4293 if (cv_stat < 0) {
4294 (void) rib_conn_release_locked(cn);
4295 return (RDMA_TIMEDOUT);
4297 if (cn->c_state == C_CONNECTED) {
4298 *conn = cn;
4299 mutex_exit(&cn->c_lock);
4300 return (RDMA_SUCCESS);
4301 } else {
4302 (void) rib_conn_release_locked(cn);
4303 return (RDMA_TIMEDOUT);
4307 mutex_exit(&cn->c_lock);
4308 cn = cn->c_next;
4310 rw_exit(&hca->cl_conn_list.conn_lock);
4311 *conn = NULL;
4312 return (RDMA_FAILED);
4316 * Connection management.
4317 * IBTF does not support recycling of channels. So connections are only
4318 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
4319 * C_DISCONN_PEND state. No C_IDLE state.
4320 * C_CONN_PEND state: Connection establishment in progress to the server.
4321 * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4322 * It has an RC channel associated with it. ibt_post_send/recv are allowed
4323 * only in this state.
4324 * C_ERROR_CONN state: A connection transitions to this state when WRs on the
4325 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4326 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4327 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
4328 * c_ref drops to 0 (this indicates that RPC has no more references to this
4329 * connection), the connection should be destroyed. A connection transitions
4330 * into this state when it is being destroyed.
4332 /* ARGSUSED */
4333 static rdma_stat
4334 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4335 int addr_type, rpcib_ping_t *rpt, CONN **conn)
4337 CONN *cn;
4338 int status;
4339 rib_hca_t *hca;
4340 rib_qp_t *qp;
4341 int s_addr_len;
4342 char *s_addr_buf;
4344 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4345 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4346 rw_enter(&hca->state_lock, RW_READER);
4347 if (hca->state != HCA_DETACHED) {
4348 status = rib_find_hca_connection(hca, s_svcaddr,
4349 d_svcaddr, conn);
4350 rw_exit(&hca->state_lock);
4351 if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4352 rw_exit(&rib_stat->hcas_list_lock);
4353 return (status);
4355 } else
4356 rw_exit(&hca->state_lock);
4358 rw_exit(&rib_stat->hcas_list_lock);
4361 * No existing connection found, establish a new connection.
4363 bzero(rpt, sizeof (rpcib_ping_t));
4365 status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4366 if (status != RDMA_SUCCESS) {
4367 return (RDMA_FAILED);
4369 hca = rpt->hca;
4371 if (rpt->srcip.family == AF_INET) {
4372 s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4373 s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4374 } else if (rpt->srcip.family == AF_INET6) {
4375 s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4376 s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4377 } else {
4378 return (RDMA_FAILED);
4382 * Channel to server doesn't exist yet, create one.
4384 if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4385 return (RDMA_FAILED);
4387 cn = qptoc(qp);
4388 cn->c_state = C_CONN_PEND;
4389 cn->c_ref = 1;
4391 cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4392 bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4393 cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4395 if (rpt->srcip.family == AF_INET) {
4396 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4397 (void) strcpy(cn->c_netid, RIBNETID_TCP);
4399 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4400 sizeof (struct sockaddr_in);
4401 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4403 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
4404 (uint32_t)~0;
4405 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
4406 (ushort_t)~0;
4408 } else {
4409 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4410 (void) strcpy(cn->c_netid, RIBNETID_TCP6);
4412 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4413 sizeof (struct sockaddr_in6);
4414 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4416 (void) memset(
4417 &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
4418 (uchar_t)~0, sizeof (struct in6_addr));
4419 ((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
4420 (sa_family_t)~0;
4424 * Add to conn list.
4425 * We had given up the READER lock. In the time since then,
4426 * another thread might have created the connection we are
4427 * trying here. But for now, that is quiet alright - there
4428 * might be two connections between a pair of hosts instead
4429 * of one. If we really want to close that window,
4430 * then need to check the list after acquiring the
4431 * WRITER lock.
4433 (void) rib_add_connlist(cn, &hca->cl_conn_list);
4434 status = rib_conn_to_srv(hca, qp, rpt);
4435 mutex_enter(&cn->c_lock);
4437 if (cn->c_flags & C_CLOSE_PENDING) {
4439 * This handles a case where the module or
4440 * HCA detached in the time a connection is
4441 * established. In such a case close the
4442 * connection immediately if this is the
4443 * only reference.
4445 if (cn->c_ref == 1) {
4446 cn->c_ref--;
4447 cn->c_state = C_DISCONN_PEND;
4448 mutex_exit(&cn->c_lock);
4449 rib_conn_close((void *)cn);
4450 return (RDMA_FAILED);
4454 * Connection to be closed later when c_ref = 0
4456 status = RDMA_FAILED;
4459 if (status == RDMA_SUCCESS) {
4460 cn->c_state = C_CONNECTED;
4461 *conn = cn;
4462 } else {
4463 cn->c_state = C_ERROR_CONN;
4464 cn->c_ref--;
4466 cv_signal(&cn->c_cv);
4467 mutex_exit(&cn->c_lock);
4468 return (status);
4471 static void
4472 rib_conn_close(void *rarg)
4474 CONN *conn = (CONN *)rarg;
4475 rib_qp_t *qp = ctoqp(conn);
4477 mutex_enter(&conn->c_lock);
4478 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4480 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4483 * Live connection in CONNECTED state.
4485 if (conn->c_state == C_CONNECTED) {
4486 conn->c_state = C_ERROR_CONN;
4488 mutex_exit(&conn->c_lock);
4490 rib_close_a_channel(conn);
4492 mutex_enter(&conn->c_lock);
4493 conn->c_flags &= ~C_CLOSE_PENDING;
4496 mutex_exit(&conn->c_lock);
4498 if (qp->mode == RIB_SERVER)
4499 (void) rib_disconnect_channel(conn,
4500 &qp->hca->srv_conn_list);
4501 else
4502 (void) rib_disconnect_channel(conn,
4503 &qp->hca->cl_conn_list);
4506 static void
4507 rib_conn_timeout_call(void *carg)
4509 time_t idle_time;
4510 CONN *conn = (CONN *)carg;
4511 rib_hca_t *hca = ctoqp(conn)->hca;
4512 int error;
4514 mutex_enter(&conn->c_lock);
4515 if ((conn->c_ref > 0) ||
4516 (conn->c_state == C_DISCONN_PEND)) {
4517 conn->c_timeout = NULL;
4518 mutex_exit(&conn->c_lock);
4519 return;
4522 idle_time = (gethrestime_sec() - conn->c_last_used);
4524 if ((idle_time <= rib_conn_timeout) &&
4525 (conn->c_state != C_ERROR_CONN)) {
4527 * There was activity after the last timeout.
4528 * Extend the conn life. Unless the conn is
4529 * already in error state.
4531 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4532 SEC_TO_TICK(rib_conn_timeout - idle_time));
4533 mutex_exit(&conn->c_lock);
4534 return;
4537 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4538 (void *)conn, DDI_NOSLEEP);
4541 * If taskq dispatch fails above, then reset the timeout
4542 * to try again after 10 secs.
4545 if (error != DDI_SUCCESS) {
4546 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4547 SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4548 mutex_exit(&conn->c_lock);
4549 return;
4552 conn->c_state = C_DISCONN_PEND;
4553 mutex_exit(&conn->c_lock);
4556 static rdma_stat
4557 rib_conn_release(CONN *conn)
4559 mutex_enter(&conn->c_lock);
4560 return (rib_conn_release_locked(conn));
4564 * Expects conn->c_lock to be held on entry.
4565 * c_lock released on return
4567 static rdma_stat
4568 rib_conn_release_locked(CONN *conn)
4570 conn->c_ref--;
4572 conn->c_last_used = gethrestime_sec();
4573 if (conn->c_ref > 0) {
4574 mutex_exit(&conn->c_lock);
4575 return (RDMA_SUCCESS);
4579 * If a conn is C_ERROR_CONN, close the channel.
4581 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4582 conn->c_state = C_DISCONN_PEND;
4583 mutex_exit(&conn->c_lock);
4584 rib_conn_close((void *)conn);
4585 return (RDMA_SUCCESS);
4589 * c_ref == 0, set a timeout for conn release
4592 if (conn->c_timeout == NULL) {
4593 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4594 SEC_TO_TICK(rib_conn_timeout));
4597 mutex_exit(&conn->c_lock);
4598 return (RDMA_SUCCESS);
4602 * Add at front of list
4604 static struct rdma_done_list *
4605 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4607 struct rdma_done_list *rd;
4609 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4611 rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4612 rd->xid = xid;
4613 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4615 rd->prev = NULL;
4616 rd->next = qp->rdlist;
4617 if (qp->rdlist != NULL)
4618 qp->rdlist->prev = rd;
4619 qp->rdlist = rd;
4621 return (rd);
4624 static void
4625 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4627 struct rdma_done_list *r;
4629 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4631 r = rd->next;
4632 if (r != NULL) {
4633 r->prev = rd->prev;
4636 r = rd->prev;
4637 if (r != NULL) {
4638 r->next = rd->next;
4639 } else {
4640 qp->rdlist = rd->next;
4643 cv_destroy(&rd->rdma_done_cv);
4644 kmem_free(rd, sizeof (*rd));
4647 static void
4648 rdma_done_rem_list(rib_qp_t *qp)
4650 struct rdma_done_list *r, *n;
4652 mutex_enter(&qp->rdlist_lock);
4653 for (r = qp->rdlist; r != NULL; r = n) {
4654 n = r->next;
4655 rdma_done_rm(qp, r);
4657 mutex_exit(&qp->rdlist_lock);
4660 static void
4661 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4663 struct rdma_done_list *r = qp->rdlist;
4665 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4667 while (r) {
4668 if (r->xid == xid) {
4669 cv_signal(&r->rdma_done_cv);
4670 return;
4671 } else {
4672 r = r->next;
4675 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4676 int, xid);
4680 * Expects conn->c_lock to be held by the caller.
4683 static void
4684 rib_close_a_channel(CONN *conn)
4686 rib_qp_t *qp;
4687 qp = ctoqp(conn);
4689 if (qp->qp_hdl == NULL) {
4690 /* channel already freed */
4691 return;
4695 * Call ibt_close_rc_channel in blocking mode
4696 * with no callbacks.
4698 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4699 NULL, 0, NULL, NULL, 0);
4703 * Goes through all connections and closes the channel
4704 * This will cause all the WRs on those channels to be
4705 * flushed.
4707 static void
4708 rib_close_channels(rib_conn_list_t *connlist)
4710 CONN *conn, *tmp;
4712 rw_enter(&connlist->conn_lock, RW_READER);
4713 conn = connlist->conn_hd;
4714 while (conn != NULL) {
4715 mutex_enter(&conn->c_lock);
4716 tmp = conn->c_next;
4717 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4719 if (conn->c_state == C_CONN_PEND) {
4720 conn->c_flags |= C_CLOSE_PENDING;
4721 goto next;
4724 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4727 * Live connection in CONNECTED state.
4729 if (conn->c_state == C_CONNECTED)
4730 conn->c_state = C_ERROR_CONN;
4731 mutex_exit(&conn->c_lock);
4733 rib_close_a_channel(conn);
4735 mutex_enter(&conn->c_lock);
4736 conn->c_flags &= ~C_CLOSE_PENDING;
4737 /* Signal a pending rib_disconnect_channel() */
4738 cv_signal(&conn->c_cv);
4740 next:
4741 mutex_exit(&conn->c_lock);
4742 conn = tmp;
4744 rw_exit(&connlist->conn_lock);
4748 * Frees up all connections that are no longer being referenced
4750 static void
4751 rib_purge_connlist(rib_conn_list_t *connlist)
4753 CONN *conn;
4755 top:
4756 rw_enter(&connlist->conn_lock, RW_READER);
4757 conn = connlist->conn_hd;
4758 while (conn != NULL) {
4759 mutex_enter(&conn->c_lock);
4762 * At this point connection is either in ERROR
4763 * or DISCONN_PEND state. If in DISCONN_PEND state
4764 * then some other thread is culling that connection.
4765 * If not and if c_ref is 0, then destroy the connection.
4767 if (conn->c_ref == 0 &&
4768 conn->c_state != C_DISCONN_PEND) {
4770 * Cull the connection
4772 conn->c_state = C_DISCONN_PEND;
4773 mutex_exit(&conn->c_lock);
4774 rw_exit(&connlist->conn_lock);
4775 (void) rib_disconnect_channel(conn, connlist);
4776 goto top;
4777 } else {
4779 * conn disconnect already scheduled or will
4780 * happen from conn_release when c_ref drops to 0.
4782 mutex_exit(&conn->c_lock);
4784 conn = conn->c_next;
4786 rw_exit(&connlist->conn_lock);
4789 * At this point, only connections with c_ref != 0 are on the list
4794 * Free all the HCA resources and close
4795 * the hca.
4798 static void
4799 rib_free_hca(rib_hca_t *hca)
4801 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4802 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4803 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4804 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4806 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4807 kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4808 kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4809 kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4811 rib_rbufpool_destroy(hca, RECV_BUFFER);
4812 rib_rbufpool_destroy(hca, SEND_BUFFER);
4813 rib_destroy_cache(hca);
4814 if (rib_mod.rdma_count == 0)
4815 (void) rdma_unregister_mod(&rib_mod);
4816 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4817 (void) ibt_close_hca(hca->hca_hdl);
4818 hca->hca_hdl = NULL;
4822 static void
4823 rib_stop_hca_services(rib_hca_t *hca)
4825 rib_stop_services(hca);
4826 rib_close_channels(&hca->cl_conn_list);
4827 rib_close_channels(&hca->srv_conn_list);
4829 rib_purge_connlist(&hca->cl_conn_list);
4830 rib_purge_connlist(&hca->srv_conn_list);
4832 if ((rib_stat->hcas_list == NULL) && stats_enabled) {
4833 kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4834 GLOBAL_ZONEID);
4835 stats_enabled = FALSE;
4838 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4839 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4840 if (hca->srv_conn_list.conn_hd == NULL &&
4841 hca->cl_conn_list.conn_hd == NULL) {
4843 * conn_lists are NULL, so destroy
4844 * buffers, close hca and be done.
4846 rib_free_hca(hca);
4848 rw_exit(&hca->cl_conn_list.conn_lock);
4849 rw_exit(&hca->srv_conn_list.conn_lock);
4851 if (hca->hca_hdl != NULL) {
4852 mutex_enter(&hca->inuse_lock);
4853 while (hca->inuse)
4854 cv_wait(&hca->cb_cv, &hca->inuse_lock);
4855 mutex_exit(&hca->inuse_lock);
4857 rib_free_hca(hca);
4859 rw_destroy(&hca->bound_services_lock);
4861 if (hca->cleanup_helper != NULL) {
4862 ddi_taskq_destroy(hca->cleanup_helper);
4863 hca->cleanup_helper = NULL;
4868 * Cleans and closes up all uses of the HCA
4870 static void
4871 rib_detach_hca(ibt_hca_hdl_t hca_hdl)
4873 rib_hca_t *hca = NULL;
4874 rib_hca_t **hcap;
4876 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4877 for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
4878 hca = *hcap;
4879 rw_enter(&hca->state_lock, RW_WRITER);
4880 if (hca->hca_hdl == hca_hdl) {
4882 * Mark as detached and remove from
4883 * hca list.
4885 hca->state = HCA_DETACHED;
4886 *hcap = hca->next;
4887 rib_stat->nhca_inited--;
4888 rib_mod.rdma_count--;
4889 rw_exit(&hca->state_lock);
4890 break;
4892 rw_exit(&hca->state_lock);
4894 rw_exit(&rib_stat->hcas_list_lock);
4896 if (hca == NULL)
4897 return;
4898 ASSERT(hca->hca_hdl == hca_hdl);
4901 * Stop all services on the HCA
4902 * Go through cl_conn_list and close all rc_channels
4903 * Go through svr_conn_list and close all rc_channels
4904 * Free connections whose c_ref has dropped to 0
4905 * Destroy all CQs
4906 * Deregister and released all buffer pool memory after all
4907 * connections are destroyed
4908 * Free the protection domain
4909 * ibt_close_hca()
4911 rib_stop_hca_services(hca);
4913 kmem_free(hca, sizeof (*hca));
4916 static void
4917 rib_server_side_cache_reclaim(void *argp)
4919 cache_avl_struct_t *rcas;
4920 rib_lrc_entry_t *rb;
4921 rib_hca_t *hca = (rib_hca_t *)argp;
4923 rw_enter(&hca->avl_rw_lock, RW_WRITER);
4924 rcas = avl_first(&hca->avl_tree);
4925 if (rcas != NULL)
4926 avl_remove(&hca->avl_tree, rcas);
4928 while (rcas != NULL) {
4929 while (rcas->r.forw != &rcas->r) {
4930 rcas->elements--;
4931 rb = rcas->r.forw;
4932 remque(rb);
4933 if (rb->registered)
4934 (void) rib_deregistermem_via_hca(hca,
4935 rb->lrc_buf, rb->lrc_mhandle);
4937 hca->cache_allocation -= rb->lrc_len;
4938 kmem_free(rb->lrc_buf, rb->lrc_len);
4939 kmem_free(rb, sizeof (rib_lrc_entry_t));
4941 mutex_destroy(&rcas->node_lock);
4942 kmem_cache_free(hca->server_side_cache, rcas);
4943 rcas = avl_first(&hca->avl_tree);
4944 if (rcas != NULL)
4945 avl_remove(&hca->avl_tree, rcas);
4947 rw_exit(&hca->avl_rw_lock);
4950 static void
4951 rib_server_side_cache_cleanup(void *argp)
4953 cache_avl_struct_t *rcas;
4954 rib_lrc_entry_t *rb;
4955 rib_hca_t *hca = (rib_hca_t *)argp;
4957 mutex_enter(&hca->cache_allocation_lock);
4958 if (hca->cache_allocation < cache_limit) {
4959 mutex_exit(&hca->cache_allocation_lock);
4960 return;
4962 mutex_exit(&hca->cache_allocation_lock);
4964 rw_enter(&hca->avl_rw_lock, RW_WRITER);
4965 rcas = avl_last(&hca->avl_tree);
4966 if (rcas != NULL)
4967 avl_remove(&hca->avl_tree, rcas);
4969 while (rcas != NULL) {
4970 while (rcas->r.forw != &rcas->r) {
4971 rcas->elements--;
4972 rb = rcas->r.forw;
4973 remque(rb);
4974 if (rb->registered)
4975 (void) rib_deregistermem_via_hca(hca,
4976 rb->lrc_buf, rb->lrc_mhandle);
4978 hca->cache_allocation -= rb->lrc_len;
4980 kmem_free(rb->lrc_buf, rb->lrc_len);
4981 kmem_free(rb, sizeof (rib_lrc_entry_t));
4983 mutex_destroy(&rcas->node_lock);
4984 if (hca->server_side_cache) {
4985 kmem_cache_free(hca->server_side_cache, rcas);
4988 if (hca->cache_allocation < cache_limit) {
4989 rw_exit(&hca->avl_rw_lock);
4990 return;
4993 rcas = avl_last(&hca->avl_tree);
4994 if (rcas != NULL)
4995 avl_remove(&hca->avl_tree, rcas);
4997 rw_exit(&hca->avl_rw_lock);
5000 static int
5001 avl_compare(const void *t1, const void *t2)
5003 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
5004 return (0);
5006 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
5007 return (-1);
5009 return (1);
5012 static void
5013 rib_destroy_cache(rib_hca_t *hca)
5015 if (hca->avl_init) {
5016 rib_server_side_cache_reclaim((void *)hca);
5017 if (hca->server_side_cache) {
5018 kmem_cache_destroy(hca->server_side_cache);
5019 hca->server_side_cache = NULL;
5021 avl_destroy(&hca->avl_tree);
5022 mutex_destroy(&hca->cache_allocation_lock);
5023 rw_destroy(&hca->avl_rw_lock);
5025 hca->avl_init = FALSE;
5028 static void
5029 rib_force_cleanup(void *hca)
5031 if (((rib_hca_t *)hca)->cleanup_helper != NULL)
5032 (void) ddi_taskq_dispatch(
5033 ((rib_hca_t *)hca)->cleanup_helper,
5034 rib_server_side_cache_cleanup,
5035 (void *)hca, DDI_NOSLEEP);
5038 static rib_lrc_entry_t *
5039 rib_get_cache_buf(CONN *conn, uint32_t len)
5041 cache_avl_struct_t cas, *rcas;
5042 rib_hca_t *hca = (ctoqp(conn))->hca;
5043 rib_lrc_entry_t *reply_buf;
5044 avl_index_t where = (uintptr_t)NULL;
5045 uint64_t c_alloc = 0;
5047 if (!hca->avl_init)
5048 goto error_alloc;
5050 cas.len = len;
5052 rw_enter(&hca->avl_rw_lock, RW_READER);
5054 mutex_enter(&hca->cache_allocation_lock);
5055 c_alloc = hca->cache_allocation;
5056 mutex_exit(&hca->cache_allocation_lock);
5058 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
5059 &where)) == NULL) {
5060 /* Am I above the cache limit */
5061 if ((c_alloc + len) >= cache_limit) {
5062 rib_force_cleanup((void *)hca);
5063 rw_exit(&hca->avl_rw_lock);
5064 mutex_enter(&hca->cache_allocation_lock);
5065 hca->cache_misses_above_the_limit ++;
5066 mutex_exit(&hca->cache_allocation_lock);
5068 /* Allocate and register the buffer directly */
5069 goto error_alloc;
5072 rw_exit(&hca->avl_rw_lock);
5073 rw_enter(&hca->avl_rw_lock, RW_WRITER);
5075 /* Recheck to make sure no other thread added the entry in */
5076 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
5077 &cas, &where)) == NULL) {
5078 /* Allocate an avl tree entry */
5079 rcas = (cache_avl_struct_t *)
5080 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5082 bzero(rcas, sizeof (cache_avl_struct_t));
5083 rcas->elements = 0;
5084 rcas->r.forw = &rcas->r;
5085 rcas->r.back = &rcas->r;
5086 rcas->len = len;
5087 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5088 avl_insert(&hca->avl_tree, rcas, where);
5092 mutex_enter(&rcas->node_lock);
5094 if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5095 reply_buf = rcas->r.forw;
5096 remque(reply_buf);
5097 rcas->elements--;
5098 mutex_exit(&rcas->node_lock);
5099 rw_exit(&hca->avl_rw_lock);
5101 mutex_enter(&hca->cache_allocation_lock);
5102 hca->cache_hits++;
5103 hca->cache_allocation -= len;
5104 mutex_exit(&hca->cache_allocation_lock);
5105 } else {
5106 /* Am I above the cache limit */
5107 mutex_exit(&rcas->node_lock);
5108 if ((c_alloc + len) >= cache_limit) {
5109 rib_force_cleanup((void *)hca);
5110 rw_exit(&hca->avl_rw_lock);
5112 mutex_enter(&hca->cache_allocation_lock);
5113 hca->cache_misses_above_the_limit++;
5114 mutex_exit(&hca->cache_allocation_lock);
5115 /* Allocate and register the buffer directly */
5116 goto error_alloc;
5118 rw_exit(&hca->avl_rw_lock);
5119 mutex_enter(&hca->cache_allocation_lock);
5120 hca->cache_misses++;
5121 mutex_exit(&hca->cache_allocation_lock);
5122 /* Allocate a reply_buf entry */
5123 reply_buf = (rib_lrc_entry_t *)
5124 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5125 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5126 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5127 reply_buf->lrc_len = len;
5128 reply_buf->registered = FALSE;
5129 reply_buf->avl_node = (void *)rcas;
5132 return (reply_buf);
5134 error_alloc:
5135 reply_buf = (rib_lrc_entry_t *)
5136 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5137 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5138 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5139 reply_buf->lrc_len = len;
5140 reply_buf->registered = FALSE;
5141 reply_buf->avl_node = NULL;
5143 return (reply_buf);
5147 * Return a pre-registered back to the cache (without
5148 * unregistering the buffer)..
5151 static void
5152 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5154 cache_avl_struct_t cas, *rcas;
5155 avl_index_t where = (uintptr_t)NULL;
5156 rib_hca_t *hca = (ctoqp(conn))->hca;
5158 if (!hca->avl_init)
5159 goto error_free;
5161 cas.len = reg_buf->lrc_len;
5162 rw_enter(&hca->avl_rw_lock, RW_READER);
5163 if ((rcas = (cache_avl_struct_t *)
5164 avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5165 rw_exit(&hca->avl_rw_lock);
5166 goto error_free;
5167 } else {
5168 cas.len = reg_buf->lrc_len;
5169 mutex_enter(&rcas->node_lock);
5170 insque(reg_buf, &rcas->r);
5171 rcas->elements ++;
5172 mutex_exit(&rcas->node_lock);
5173 rw_exit(&hca->avl_rw_lock);
5174 mutex_enter(&hca->cache_allocation_lock);
5175 hca->cache_allocation += cas.len;
5176 mutex_exit(&hca->cache_allocation_lock);
5179 return;
5181 error_free:
5183 if (reg_buf->registered)
5184 (void) rib_deregistermem_via_hca(hca,
5185 reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5186 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5187 kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5190 static rdma_stat
5191 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5192 uint_t buflen, struct mrc *buf_handle)
5194 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
5195 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
5196 rdma_stat status;
5200 * Note: ALL buffer pools use the same memory type RDMARW.
5202 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5203 if (status == RDMA_SUCCESS) {
5204 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5205 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5206 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5207 } else {
5208 buf_handle->mrc_linfo = (uintptr_t)NULL;
5209 buf_handle->mrc_lmr = 0;
5210 buf_handle->mrc_rmr = 0;
5212 return (status);
5215 /* ARGSUSED */
5216 static rdma_stat
5217 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5218 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5221 (void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5222 return (RDMA_SUCCESS);
5225 /* ARGSUSED */
5226 static rdma_stat
5227 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5230 (void) ibt_deregister_mr(hca->hca_hdl,
5231 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5232 return (RDMA_SUCCESS);
5236 * Check if the IP interface named by `lifrp' is RDMA-capable.
5238 static boolean_t
5239 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5241 char ifname[LIFNAMSIZ];
5242 char *cp;
5244 if (lifrp->lifr_type == IFT_IB)
5245 return (B_TRUE);
5248 * Strip off the logical interface portion before getting
5249 * intimate with the name.
5251 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5252 if ((cp = strchr(ifname, ':')) != NULL)
5253 *cp = '\0';
5255 return (strcmp("lo0", ifname) == 0);
5258 static int
5259 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5261 vnode_t *kkvp, *vp;
5262 TIUSER *tiptr;
5263 struct strioctl iocb;
5264 k_sigset_t smask;
5265 int err = 0;
5267 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
5268 if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
5269 &tiptr, CRED()) == 0) {
5270 vp = tiptr->fp->f_vnode;
5271 } else {
5272 VN_RELE(kkvp);
5273 return (EPROTO);
5275 } else {
5276 return (EPROTO);
5279 iocb.ic_cmd = cmd;
5280 iocb.ic_timout = 0;
5281 iocb.ic_len = len;
5282 iocb.ic_dp = (caddr_t)arg;
5283 sigintr(&smask, 0);
5284 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5285 sigunintr(&smask);
5286 (void) t_kclose(tiptr, 0);
5287 VN_RELE(kkvp);
5288 return (err);
5292 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5293 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5295 static int
5296 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5298 int err;
5299 struct lifnum lifn;
5301 bzero(&lifn, sizeof (struct lifnum));
5302 lifn.lifn_family = AF_UNSPEC;
5304 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5305 if (err != 0)
5306 return (err);
5309 * Pad the interface count to account for additional interfaces that
5310 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5312 lifn.lifn_count += 4;
5314 bzero(lifcp, sizeof (struct lifconf));
5315 lifcp->lifc_family = AF_UNSPEC;
5316 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5317 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5319 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5320 if (err != 0) {
5321 kmem_free(lifcp->lifc_buf, *bufsizep);
5322 return (err);
5324 return (0);
5327 static boolean_t
5328 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5330 uint_t i, nifs;
5331 uint_t bufsize;
5332 struct lifconf lifc;
5333 struct lifreq *lifrp;
5334 struct sockaddr_in *sinp;
5335 struct sockaddr_in6 *sin6p;
5337 bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5338 bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5340 if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5341 return (B_FALSE);
5343 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5344 kmem_free(lifc.lifc_buf, bufsize);
5345 return (B_FALSE);
5349 * Worst case is that all of the addresses are IB-capable and have
5350 * the same address family, so size our buffers accordingly.
5352 addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5353 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5354 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5355 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5357 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5358 if (!rpcib_rdma_capable_interface(lifrp))
5359 continue;
5361 if (lifrp->lifr_addr.ss_family == AF_INET) {
5362 sinp = addrs4->ri_list;
5363 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5364 sizeof (struct sockaddr_in));
5365 } else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5366 sin6p = addrs6->ri_list;
5367 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5368 sizeof (struct sockaddr_in6));
5372 kmem_free(lifc.lifc_buf, bufsize);
5373 return (B_TRUE);
5376 /* ARGSUSED */
5377 static int
5378 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5380 rib_hca_t *hca;
5382 if (KSTAT_WRITE == rw) {
5383 return (EACCES);
5386 rpcib_kstat.cache_limit.value.ui64 =
5387 (uint64_t)cache_limit;
5388 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5389 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5390 rpcib_kstat.cache_allocation.value.ui64 +=
5391 (uint64_t)hca->cache_allocation;
5392 rpcib_kstat.cache_hits.value.ui64 +=
5393 (uint64_t)hca->cache_hits;
5394 rpcib_kstat.cache_misses.value.ui64 +=
5395 (uint64_t)hca->cache_misses;
5396 rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5397 (uint64_t)hca->cache_misses_above_the_limit;
5399 rw_exit(&rib_stat->hcas_list_lock);
5400 return (0);