2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
6 * This file contains code imported from the OFED rds source file ib.c
7 * Oracle elects to have and use the contents of ib.c under and governed
8 * by the OpenIB.org BSD license (see below for full license text). However,
9 * the following notice accompanied the original version of this file:
13 * Copyright (c) 2006 Oracle. All rights reserved.
15 * This software is available to you under a choice of one of two
16 * licenses. You may choose to be licensed under the terms of the GNU
17 * General Public License (GPL) Version 2, available from the file
18 * COPYING in the main directory of this source tree, or the
19 * OpenIB.org BSD license below:
21 * Redistribution and use in source and binary forms, with or
22 * without modification, are permitted provided that the following
25 * - Redistributions of source code must retain the above
26 * copyright notice, this list of conditions and the following
29 * - Redistributions in binary form must reproduce the above
30 * copyright notice, this list of conditions and the following
31 * disclaimer in the documentation and/or other materials
32 * provided with the distribution.
34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
44 #include <sys/sysmacros.h>
47 #include <sys/ib/ibtl/ibti.h>
48 #include <sys/ib/clients/rdsv3/rdsv3.h>
49 #include <sys/ib/clients/rdsv3/ib.h>
50 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
/*
 * File-scope state shared by the RDSv3 InfiniBand transport routines below.
 */
/* Retry count; starts out at the compiled-in default. */
52 unsigned int rdsv3_ib_retry_count
= RDSV3_IB_DEFAULT_RETRY_COUNT
;
/* Every struct rdsv3_ib_device set up by rdsv3_ib_add_one() is appended here. */
54 struct list rdsv3_ib_devices
;
56 /* NOTE: if also grabbing ibdev lock, grab this first */
57 kmutex_t ib_nodev_conns_lock
;
/* List of struct rdsv3_ib_connection entries, linked through their ib_node member. */
58 list_t ib_nodev_conns
;
/*
 * Constructor/destructor pair that rdsv3_ib_add_one() hands to
 * kmem_cache_create() for the per-device fragment cache.  They are only
 * declared here.
 */
60 extern int rdsv3_ib_frag_constructor(void *buf
, void *arg
, int kmflags
);
61 extern void rdsv3_ib_frag_destructor(void *buf
, void *arg
);
/*
 * rdsv3_ib_add_one() - build the per-HCA RDSv3 state for a device.
 *
 * device: the ib_device_t being added; installed as the .add callback of
 *         rdsv3_ib_client.
 *
 * Going by the statements visible here, the routine queries the HCA
 * attributes, insists on IBT_HCA2_RES_LKEY, allocates and fills a
 * struct rdsv3_ib_device, allocates a protection domain, creates the MR
 * pool, the inc pool, the fragment kmem cache, an af group and two af
 * threads, and finally links the device onto rdsv3_ib_devices and stores
 * it as client data.  When one of the later steps fails, the pools, cache,
 * group and thread created before it are torn down again.
 *
 * NOTE(review): a number of original lines (return/goto statements, the
 * cleanup labels, braces, the declaration of "name") are not visible in
 * this listing, so the exact exit paths cannot be confirmed from here.
 */
64 rdsv3_ib_add_one(ib_device_t
*device
)
66 struct rdsv3_ib_device
*rds_ibdev
;
67 ibt_hca_attr_t
*dev_attr
;
70 RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device
);
72 /* Only handle IB (no iWARP) devices */
73 if (device
->node_type
!= RDMA_NODE_IB_CA
)
/* Scratch buffer for the HCA attributes; released by the kmem_free() at the bottom. */
76 dev_attr
= (ibt_hca_attr_t
*)kmem_alloc(sizeof (*dev_attr
),
/* A non-zero return from ibt_query_hca() is treated as failure. */
81 if (ibt_query_hca(ib_get_ibt_hca_hdl(device
), dev_attr
)) {
82 RDSV3_DPRINTF2("rdsv3_ib_add_one",
83 "Query device failed for %s", device
->name
);
87 /* We depend on Reserved Lkey */
88 if (!(dev_attr
->hca_flags2
& IBT_HCA2_RES_LKEY
)) {
89 RDSV3_DPRINTF2("rdsv3_ib_add_one",
90 "Reserved Lkey support is required: %s",
/*
 * Zero-filled, non-sleeping allocation of the per-device structure.
 * NOTE(review): the NULL check that should follow is not visible here.
 */
95 rds_ibdev
= kmem_zalloc(sizeof (*rds_ibdev
), KM_NOSLEEP
);
99 rds_ibdev
->ibt_hca_hdl
= ib_get_ibt_hca_hdl(device
);
/* Struct copy: dev_attr itself is freed at the end of this routine. */
100 rds_ibdev
->hca_attr
= *dev_attr
;
102 rw_init(&rds_ibdev
->rwlock
, NULL
, RW_DRIVER
, NULL
);
103 mutex_init(&rds_ibdev
->spinlock
, NULL
, MUTEX_DRIVER
, NULL
);
/* Limits come from the HCA attributes; the SGE count is capped at RDSV3_IB_MAX_SGE. */
105 rds_ibdev
->max_wrs
= dev_attr
->hca_max_chan_sz
;
106 rds_ibdev
->max_sge
= min(dev_attr
->hca_max_sgl
, RDSV3_IB_MAX_SGE
);
/* Both RDMA depth limits are taken from the same attribute, hca_max_rdma_in_qp. */
108 rds_ibdev
->max_initiator_depth
= (uint_t
)dev_attr
->hca_max_rdma_in_qp
;
109 rds_ibdev
->max_responder_resources
=
110 (uint_t
)dev_attr
->hca_max_rdma_in_qp
;
112 rds_ibdev
->dev
= device
;
113 rds_ibdev
->pd
= ib_alloc_pd(device
);
114 if (IS_ERR(rds_ibdev
->pd
))
117 if (rdsv3_ib_create_mr_pool(rds_ibdev
) != 0) {
/* From here on a failing step undoes the pools/cache/group/thread created before it. */
121 if (rdsv3_ib_create_inc_pool(rds_ibdev
) != 0) {
122 rdsv3_ib_destroy_mr_pool(rds_ibdev
);
/*
 * The cache name embeds the HCA node GUID, formatted in hex -
 * presumably so that every HCA gets a distinct cache name.
 */
126 (void) snprintf(name
, 64, "RDSV3_IB_FRAG_%llx",
127 (longlong_t
)htonll(dev_attr
->hca_node_guid
));
128 rds_ibdev
->ib_frag_slab
= kmem_cache_create(name
,
129 sizeof (struct rdsv3_page_frag
), 0, rdsv3_ib_frag_constructor
,
130 rdsv3_ib_frag_destructor
, NULL
, (void *)rds_ibdev
, NULL
, 0);
131 if (rds_ibdev
->ib_frag_slab
== NULL
) {
132 RDSV3_DPRINTF2("rdsv3_ib_add_one",
133 "kmem_cache_create for ib_frag_slab failed for device: %s",
135 rdsv3_ib_destroy_mr_pool(rds_ibdev
);
136 rdsv3_ib_destroy_inc_pool(rds_ibdev
);
/* The af group is created from the HCA handle and the saved node GUID. */
140 rds_ibdev
->aft_hcagp
= rdsv3_af_grp_create(rds_ibdev
->ibt_hca_hdl
,
141 (uint64_t)rds_ibdev
->hca_attr
.hca_node_guid
);
142 if (rds_ibdev
->aft_hcagp
== NULL
) {
143 rdsv3_ib_destroy_mr_pool(rds_ibdev
);
144 rdsv3_ib_destroy_inc_pool(rds_ibdev
);
145 kmem_cache_destroy(rds_ibdev
->ib_frag_slab
);
/* af thread created with rdsv3_ib_drain_mrlist_fn and the FMR pool as its argument. */
148 rds_ibdev
->fmr_soft_cq
= rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn
,
149 (void *)rds_ibdev
->fmr_pool
, SCQ_HCA_BIND_CPU
,
150 rds_ibdev
->aft_hcagp
);
151 if (rds_ibdev
->fmr_soft_cq
== NULL
) {
152 rdsv3_af_grp_destroy(rds_ibdev
->aft_hcagp
);
153 rdsv3_ib_destroy_mr_pool(rds_ibdev
);
154 rdsv3_ib_destroy_inc_pool(rds_ibdev
);
155 kmem_cache_destroy(rds_ibdev
->ib_frag_slab
);
/* Second af thread: rdsv3_ib_drain_inclist with the inc pool as its argument. */
159 rds_ibdev
->inc_soft_cq
= rdsv3_af_thr_create(rdsv3_ib_drain_inclist
,
160 (void *)rds_ibdev
->inc_pool
, SCQ_HCA_BIND_CPU
,
161 rds_ibdev
->aft_hcagp
);
162 if (rds_ibdev
->inc_soft_cq
== NULL
) {
163 rdsv3_af_thr_destroy(rds_ibdev
->fmr_soft_cq
);
164 rdsv3_af_grp_destroy(rds_ibdev
->aft_hcagp
);
165 rdsv3_ib_destroy_mr_pool(rds_ibdev
);
166 rdsv3_ib_destroy_inc_pool(rds_ibdev
);
167 kmem_cache_destroy(rds_ibdev
->ib_frag_slab
);
/* Per-device lists: IP addresses (linked via "list") and connections (linked via "ib_node"). */
171 list_create(&rds_ibdev
->ipaddr_list
, sizeof (struct rdsv3_ib_ipaddr
),
172 offsetof(struct rdsv3_ib_ipaddr
, list
));
173 list_create(&rds_ibdev
->conn_list
, sizeof (struct rdsv3_ib_connection
),
174 offsetof(struct rdsv3_ib_connection
, ib_node
));
/* Publish the device: global list first, then as this client's data on the ib device. */
176 list_insert_tail(&rdsv3_ib_devices
, rds_ibdev
);
178 ib_set_client_data(device
, &rdsv3_ib_client
, rds_ibdev
);
180 RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device
);
/*
 * Release statements follow.  NOTE(review): the labels/jumps that decide
 * which of them run on the success path are not visible in this listing;
 * presumably only the dev_attr free is shared with the success path.
 */
185 (void) ib_dealloc_pd(rds_ibdev
->pd
);
187 mutex_destroy(&rds_ibdev
->spinlock
);
188 rw_destroy(&rds_ibdev
->rwlock
);
189 kmem_free(rds_ibdev
, sizeof (*rds_ibdev
));
191 kmem_free(dev_attr
, sizeof (*dev_attr
));
/*
 * rdsv3_ib_remove_one() - tear down the per-HCA state attached to a device.
 *
 * device: the ib device being removed; installed as the .remove callback
 *         of rdsv3_ib_client.
 *
 * Looks up the struct rdsv3_ib_device stored as client data, frees its
 * ipaddr_list entries, destroys its connections, af threads, pools,
 * fragment cache and af group, releases the protection domain, and then
 * destroys the lists and locks before freeing the structure itself.
 *
 * NOTE(review): whatever handling follows the client-data lookup (e.g. a
 * NULL check) is not visible in this listing.
 */
195 rdsv3_ib_remove_one(struct ib_device
*device
)
197 struct rdsv3_ib_device
*rds_ibdev
;
198 struct rdsv3_ib_ipaddr
*i_ipaddr
, *i_next
;
200 RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device
);
202 rds_ibdev
= ib_get_client_data(device
, &rdsv3_ib_client
);
/* Walk ipaddr_list with the removal-safe iterator, unlinking and freeing each entry. */
206 RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr
, i_next
, &rds_ibdev
->ipaddr_list
,
208 list_remove_node(&i_ipaddr
->list
);
209 kmem_free(i_ipaddr
, sizeof (*i_ipaddr
));
212 rdsv3_ib_destroy_conns(rds_ibdev
);
/* The af threads are only destroyed when their handles are non-NULL. */
214 if (rds_ibdev
->fmr_soft_cq
)
215 rdsv3_af_thr_destroy(rds_ibdev
->fmr_soft_cq
);
216 if (rds_ibdev
->inc_soft_cq
)
217 rdsv3_af_thr_destroy(rds_ibdev
->inc_soft_cq
);
219 rdsv3_ib_destroy_mr_pool(rds_ibdev
);
220 rdsv3_ib_destroy_inc_pool(rds_ibdev
);
222 kmem_cache_destroy(rds_ibdev
->ib_frag_slab
);
224 rdsv3_af_grp_destroy(rds_ibdev
->aft_hcagp
);
/*
 * Two ways of releasing the PD appear back to back: a loop that retries
 * ib_dealloc_pd() after a 1000 microsecond delay, and a single attempt
 * that only logs a failure.
 * NOTE(review): presumably preprocessor conditionals (not visible in this
 * listing) select one of the two; confirm against the full file.
 */
227 while (ib_dealloc_pd(rds_ibdev
->pd
)) {
229 RDSV3_DPRINTF5("rdsv3_ib_remove_one",
230 "%s-%d Failed to dealloc pd %p",
231 __func__
, __LINE__
, rds_ibdev
->pd
);
233 delay(drv_usectohz(1000));
236 if (ib_dealloc_pd(rds_ibdev
->pd
)) {
238 RDSV3_DPRINTF2("rdsv3_ib_remove_one",
239 "Failed to dealloc pd %p\n", rds_ibdev
->pd
);
/* Finally unlink the device from its list and release its lists, locks and memory. */
244 list_destroy(&rds_ibdev
->ipaddr_list
);
245 list_destroy(&rds_ibdev
->conn_list
);
246 list_remove_node(&rds_ibdev
->list
);
247 mutex_destroy(&rds_ibdev
->spinlock
);
248 rw_destroy(&rds_ibdev
->rwlock
);
249 kmem_free(rds_ibdev
, sizeof (*rds_ibdev
));
251 RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device
);
/*
 * IB client descriptor for RDSv3: rdsv3_ib_add_one() and
 * rdsv3_ib_remove_one() are the add/remove callbacks and the state starts
 * out as IB_CLNT_UNINITIALIZED.
 *
 * NOTE(review): the definition appears twice (designated-initialiser form
 * first, then a positional form of which only the state value is visible).
 * Presumably a preprocessor conditional not visible in this listing
 * selects one of them; confirm against the full file.
 */
255 struct ib_client rdsv3_ib_client
= {
257 .add
= rdsv3_ib_add_one
,
258 .remove
= rdsv3_ib_remove_one
,
260 .state
= IB_CLNT_UNINITIALIZED
263 struct ib_client rdsv3_ib_client
= {
269 IB_CLNT_UNINITIALIZED
/*
 * rds_ib_conn_info_visitor() - fill one rds_info_rdma_connection record
 * for a connection.
 *
 * conn: connection being reported.
 * The record is written through iinfo, which aliases "buffer"; the
 * declaration of "buffer" is not visible in this listing (presumably the
 * second parameter).
 *
 * Source and destination addresses are always copied and both GIDs are
 * zeroed.  Only when the connection state is RDSV3_CONN_UP are the GIDs
 * and the send/receive work-request and SGE limits filled in.
 *
 * NOTE(review): the statement guarded by the transport check and the
 * return value are not visible in this listing.
 */
274 rds_ib_conn_info_visitor(struct rdsv3_connection
*conn
,
277 struct rds_info_rdma_connection
*iinfo
= buffer
;
278 struct rdsv3_ib_connection
*ic
;
280 RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
283 /* We will only ever look at IB transports */
284 if (conn
->c_trans
!= &rdsv3_ib_transport
)
287 iinfo
->src_addr
= conn
->c_laddr
;
288 iinfo
->dst_addr
= conn
->c_faddr
;
/* Clear both GIDs so they read as zero unless the connection is up. */
290 (void) memset(&iinfo
->src_gid
, 0, sizeof (iinfo
->src_gid
));
291 (void) memset(&iinfo
->dst_gid
, 0, sizeof (iinfo
->dst_gid
));
292 if (rdsv3_conn_state(conn
) == RDSV3_CONN_UP
) {
293 struct rdsv3_ib_device
*rds_ibdev
;
294 struct rdma_dev_addr
*dev_addr
;
/* The GIDs come from the device address stored in the CM id's route. */
296 ic
= conn
->c_transport_data
;
297 dev_addr
= &ic
->i_cm_id
->route
.addr
.dev_addr
;
299 ib_addr_get_sgid(dev_addr
, (union ib_gid
*)&iinfo
->src_gid
);
300 ib_addr_get_dgid(dev_addr
, (union ib_gid
*)&iinfo
->dst_gid
);
/*
 * NOTE(review): rds_ibdev is dereferenced below without a NULL check;
 * confirm ib_get_client_data() cannot return NULL for a connection that
 * is up.
 */
302 rds_ibdev
= ib_get_client_data(ic
->i_cm_id
->device
,
/* WR limits are the ring sizes; the SGE limit is the per-device max_sge. */
304 iinfo
->max_send_wr
= ic
->i_send_ring
.w_nr
;
305 iinfo
->max_recv_wr
= ic
->i_recv_ring
.w_nr
;
306 iinfo
->max_send_sge
= rds_ibdev
->max_sge
;
309 RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
/*
 * rds_ib_ic_info() - info handler registered for RDS_INFO_IB_CONNECTIONS.
 *
 * sock, len, iter, lens: passed straight through to
 * rdsv3_for_each_conn_info().
 *
 * Delegates to rdsv3_for_each_conn_info() with rds_ib_conn_info_visitor()
 * as the per-connection callback and
 * sizeof (struct rds_info_rdma_connection) as the record size.
 */
315 rds_ib_ic_info(struct rsock
*sock
, unsigned int len
,
316 struct rdsv3_info_iterator
*iter
,
317 struct rdsv3_info_lengths
*lens
)
319 RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
320 sock
, iter
, lens
, len
);
322 rdsv3_for_each_conn_info(sock
, len
, iter
, lens
,
323 rds_ib_conn_info_visitor
,
324 sizeof (struct rds_info_rdma_connection
));
328 * Early RDS/IB was built to only bind to an address if there is an IPoIB
329 * device with that address set.
331 * If it were me, I'd advocate for something more flexible. Sending and
332 * receiving should be device-agnostic. Transports would try and maintain
333 * connections between peers who have messages queued. Userspace would be
334 * allowed to influence which paths have priority. We could call userspace
335 * asserting this policy "routing".
/*
 * rds_ib_laddr_check() - test whether a local address can be bound through
 * the RDMA CM; installed as the .laddr_check operation of
 * rdsv3_ib_transport.
 *
 * addr: address to test; it is passed through ntohl() for logging and
 *       through rdsv3_scaddr_to_ibaddr() before binding.
 *
 * A temporary CM id is created, bound to the address and destroyed again.
 * ret becomes -EADDRNOTAVAIL when the bind fails or the bound device is
 * not an RDMA_NODE_IB_CA node.
 *
 * NOTE(review): the declaration of "ret", the condition guarding the
 * early "return (-EADDRNOTAVAIL)" and the final return statement are not
 * visible in this listing.
 */
338 rds_ib_laddr_check(uint32_be_t addr
)
341 struct rdma_cm_id
*cm_id
;
342 struct sockaddr_in sin
;
344 RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr
));
347 * Create a CMA ID and try to bind it. This catches both
348 * IB and iWARP capable NICs.
350 cm_id
= rdma_create_id(NULL
, NULL
, RDMA_PS_TCP
);
352 return (-EADDRNOTAVAIL
);
/* Build an AF_INET sockaddr from scratch; every other field stays zero. */
354 (void) memset(&sin
, 0, sizeof (sin
));
355 sin
.sin_family
= AF_INET
;
356 sin
.sin_addr
.s_addr
= rdsv3_scaddr_to_ibaddr(addr
);
358 /* rdma_bind_addr will only succeed for IB & iWARP devices */
359 ret
= rdma_bind_addr(cm_id
, (struct sockaddr
*)&sin
);
361 * due to this, we will claim to support iWARP devices unless we
/* cm_id->device is only examined when the bind returned 0 (short-circuit on ret). */
364 if (ret
|| cm_id
->device
->node_type
!= RDMA_NODE_IB_CA
)
365 ret
= -EADDRNOTAVAIL
;
/* The log line prints -1 as the node type when no device is attached to the id. */
367 RDSV3_DPRINTF5("rds_ib_laddr_check",
368 "addr %u.%u.%u.%u ret %d node type %d",
370 cm_id
->device
? cm_id
->device
->node_type
: -1);
372 rdma_destroy_id(cm_id
);
/*
 * Body of the transport's exit routine.  The function header is not
 * visible in this listing; presumably this is rdsv3_ib_exit(), which
 * rdsv3_ib_transport names as its .exit operation.
 *
 * Deregisters the info handler, destroys connections that have no device,
 * unregisters the IB client, shuts down the sysctl and receive parts,
 * unregisters the transport, and then frees the statistics array and
 * destroys the file-scope lock and lists.
 *
 * NOTE(review): rdsv3_ib_stats is freed here but no reset to NULL is
 * visible, while the init body asserts rdsv3_ib_stats == NULL before
 * allocating.  Confirm whether exit followed by a new init is possible.
 */
380 RDSV3_DPRINTF4("rds_ib_exit", "Enter");
382 rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS
, rds_ib_ic_info
);
383 rdsv3_ib_destroy_nodev_conns();
384 ib_unregister_client(&rdsv3_ib_client
);
385 rdsv3_ib_sysctl_exit();
386 rdsv3_ib_recv_exit();
387 rdsv3_trans_unregister(&rdsv3_ib_transport
);
/* Size must match the allocation: nr_cpus entries of struct rdsv3_ib_statistics. */
388 kmem_free(rdsv3_ib_stats
,
389 nr_cpus
* sizeof (struct rdsv3_ib_statistics
));
390 mutex_destroy(&ib_nodev_conns_lock
);
391 list_destroy(&ib_nodev_conns
);
392 list_destroy(&rdsv3_ib_devices
);
394 RDSV3_DPRINTF4("rds_ib_exit", "Return");
/*
 * Operations table describing the InfiniBand transport to the RDSv3 core:
 * name "infiniband", type RDS_TRANS_IB.  xmit_cong_map is explicitly NULL;
 * every other listed operation points at an rdsv3_ib_* routine, except
 * laddr_check, which uses rds_ib_laddr_check().
 *
 * NOTE(review): a bare declaration of the same object follows the
 * initialised definition.  Presumably a preprocessor conditional not
 * visible in this listing selects one of the two.
 */
398 struct rdsv3_transport rdsv3_ib_transport
= {
399 .laddr_check
= rds_ib_laddr_check
,
400 .xmit_complete
= rdsv3_ib_xmit_complete
,
401 .xmit
= rdsv3_ib_xmit
,
402 .xmit_cong_map
= NULL
,
403 .xmit_rdma
= rdsv3_ib_xmit_rdma
,
404 .recv
= rdsv3_ib_recv
,
405 .conn_alloc
= rdsv3_ib_conn_alloc
,
406 .conn_free
= rdsv3_ib_conn_free
,
407 .conn_connect
= rdsv3_ib_conn_connect
,
408 .conn_shutdown
= rdsv3_ib_conn_shutdown
,
409 .inc_copy_to_user
= rdsv3_ib_inc_copy_to_user
,
410 .inc_free
= rdsv3_ib_inc_free
,
411 .cm_initiate_connect
= rdsv3_ib_cm_initiate_connect
,
412 .cm_handle_connect
= rdsv3_ib_cm_handle_connect
,
413 .cm_connect_complete
= rdsv3_ib_cm_connect_complete
,
414 .stats_info_copy
= rdsv3_ib_stats_info_copy
,
415 .exit
= rdsv3_ib_exit
,
416 .get_mr
= rdsv3_ib_get_mr
,
417 .sync_mr
= rdsv3_ib_sync_mr
,
418 .free_mr
= rdsv3_ib_free_mr
,
419 .flush_mrs
= rdsv3_ib_flush_mrs
,
420 .t_name
= "infiniband",
421 .t_type
= RDS_TRANS_IB
424 struct rdsv3_transport rdsv3_ib_transport
;
/*
 * Body of the transport's init routine.  The function header, the
 * declaration of "ret" and the error checks/labels between the steps are
 * not visible in this listing; presumably this is rdsv3_ib_init().
 *
 * Order of the visible steps: create the device and no-device connection
 * lists, initialise ib_nodev_conns_lock, allocate the statistics array,
 * register the IB client, initialise the sysctl and receive parts,
 * register the transport, and register the connection info handler.
 */
432 RDSV3_DPRINTF4("rds_ib_init", "Enter");
434 list_create(&rdsv3_ib_devices
, sizeof (struct rdsv3_ib_device
),
435 offsetof(struct rdsv3_ib_device
, list
));
436 list_create(&ib_nodev_conns
, sizeof (struct rdsv3_ib_connection
),
437 offsetof(struct rdsv3_ib_connection
, ib_node
));
438 mutex_init(&ib_nodev_conns_lock
, NULL
, MUTEX_DRIVER
, NULL
);
440 /* allocate space for ib statistics */
/* One zero-filled struct rdsv3_ib_statistics per nr_cpus entry; KM_SLEEP allocation. */
441 ASSERT(rdsv3_ib_stats
== NULL
);
442 rdsv3_ib_stats
= kmem_zalloc(nr_cpus
*
443 sizeof (struct rdsv3_ib_statistics
), KM_SLEEP
);
/* The client's dip is set before the client is registered. */
445 rdsv3_ib_client
.dip
= rdsv3_dev_info
;
446 ret
= ib_register_client(&rdsv3_ib_client
);
450 ret
= rdsv3_ib_sysctl_init();
454 ret
= rdsv3_ib_recv_init();
458 ret
= rdsv3_trans_register(&rdsv3_ib_transport
);
462 rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS
, rds_ib_ic_info
);
464 RDSV3_DPRINTF4("rds_ib_init", "Return");
/*
 * The remaining statements undo the steps above in reverse order.
 * NOTE(review): presumably they sit behind error labels that are not
 * visible in this listing.  As in the exit body, rdsv3_ib_stats is freed
 * with no visible reset to NULL.
 */
469 rdsv3_ib_recv_exit();
471 rdsv3_ib_sysctl_exit();
473 ib_unregister_client(&rdsv3_ib_client
);
475 kmem_free(rdsv3_ib_stats
,
476 nr_cpus
* sizeof (struct rdsv3_ib_statistics
));
477 mutex_destroy(&ib_nodev_conns_lock
);
478 list_destroy(&ib_nodev_conns
);
479 list_destroy(&rdsv3_ib_devices
);