2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
6 * This file contains code imported from the OFED rds source file ib_rdma.c
7 * Oracle elects to have and use the contents of ib_rdma.c under and governed
8 * by the OpenIB.org BSD license (see below for full license text). However,
9 * the following notice accompanied the original version of this file:
13 * Copyright (c) 2006 Oracle. All rights reserved.
15 * This software is available to you under a choice of one of two
16 * licenses. You may choose to be licensed under the terms of the GNU
17 * General Public License (GPL) Version 2, available from the file
18 * COPYING in the main directory of this source tree, or the
19 * OpenIB.org BSD license below:
21 * Redistribution and use in source and binary forms, with or
22 * without modification, are permitted provided that the following
25 * - Redistributions of source code must retain the above
26 * copyright notice, this list of conditions and the following
29 * - Redistributions in binary form must reproduce the above
30 * copyright notice, this list of conditions and the following
31 * disclaimer in the documentation and/or other materials
32 * provided with the distribution.
34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
45 #include <netinet/in.h>
47 #include <sys/ib/clients/rdsv3/rdsv3.h>
48 #include <sys/ib/clients/rdsv3/rdma.h>
49 #include <sys/ib/clients/rdsv3/ib.h>
50 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
53 * This is stored as mr->r_trans_private.
56 list_node_t m_obj
; /* list obj of rdsv3_fmr_pool list */
57 struct rdsv3_ib_device
*m_device
;
58 struct rdsv3_fmr_pool
*m_pool
; /* hca fmr pool */
59 unsigned int m_inval
:1;
61 struct rdsv3_scatterlist
*sg
;
66 /* DDI pinned memory */
67 ddi_umem_cookie_t umem_cookie
;
68 /* IBTF type definitions */
69 ibt_hca_hdl_t rc_hca_hdl
;
70 ibt_fmr_pool_hdl_t fmr_pool_hdl
;
71 ibt_ma_hdl_t rc_ma_hdl
;
72 ibt_mr_hdl_t rc_fmr_hdl
;
73 ibt_pmr_desc_t rc_mem_desc
;
79 struct rdsv3_fmr_pool
{
80 list_t f_list
; /* list of freed mr */
81 kmutex_t f_lock
; /* lock of fmr pool */
85 static int rdsv3_ib_flush_mr_pool(struct rdsv3_ib_device
*rds_ibdev
,
86 ibt_fmr_pool_hdl_t pool_hdl
, int free_all
);
87 static void rdsv3_ib_teardown_mr(struct rdsv3_ib_mr
*ibmr
);
88 static void rdsv3_ib_mr_pool_flush_worker(struct rdsv3_work_s
*work
);
89 static struct rdsv3_ib_mr
*rdsv3_ib_alloc_fmr(struct rdsv3_ib_device
91 static int rdsv3_ib_map_fmr(struct rdsv3_ib_device
*rds_ibdev
,
92 struct rdsv3_ib_mr
*ibmr
, struct buf
*bp
, unsigned int nents
);
94 static struct rdsv3_ib_device
*
95 rdsv3_ib_get_device(uint32_be_t ipaddr
)
97 struct rdsv3_ib_device
*rds_ibdev
;
98 struct rdsv3_ib_ipaddr
*i_ipaddr
;
100 RDSV3_DPRINTF4("rdsv3_ib_get_device", "Enter: ipaddr: 0x%x", ipaddr
);
102 RDSV3_FOR_EACH_LIST_NODE(rds_ibdev
, &rdsv3_ib_devices
, list
) {
103 rw_enter(&rds_ibdev
->rwlock
, RW_READER
);
104 RDSV3_FOR_EACH_LIST_NODE(i_ipaddr
, &rds_ibdev
->ipaddr_list
,
106 if (i_ipaddr
->ipaddr
== ipaddr
) {
107 rw_exit(&rds_ibdev
->rwlock
);
111 rw_exit(&rds_ibdev
->rwlock
);
114 RDSV3_DPRINTF4("rdsv3_ib_get_device", "Return: ipaddr: 0x%x", ipaddr
);
120 rdsv3_ib_add_ipaddr(struct rdsv3_ib_device
*rds_ibdev
, uint32_be_t ipaddr
)
122 struct rdsv3_ib_ipaddr
*i_ipaddr
;
124 RDSV3_DPRINTF4("rdsv3_ib_add_ipaddr", "rds_ibdev: %p ipaddr: %x",
127 i_ipaddr
= kmem_alloc(sizeof (*i_ipaddr
), KM_NOSLEEP
);
131 i_ipaddr
->ipaddr
= ipaddr
;
133 rw_enter(&rds_ibdev
->rwlock
, RW_WRITER
);
134 list_insert_tail(&rds_ibdev
->ipaddr_list
, i_ipaddr
);
135 rw_exit(&rds_ibdev
->rwlock
);
141 rdsv3_ib_remove_ipaddr(struct rdsv3_ib_device
*rds_ibdev
, uint32_be_t ipaddr
)
143 struct rdsv3_ib_ipaddr
*i_ipaddr
, *next
;
144 struct rdsv3_ib_ipaddr
*to_free
= NULL
;
146 RDSV3_DPRINTF4("rdsv3_ib_remove_ipaddr", "rds_ibdev: %p, ipaddr: %x",
149 rw_enter(&rds_ibdev
->rwlock
, RW_WRITER
);
150 RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr
, next
, &rds_ibdev
->ipaddr_list
,
152 if (i_ipaddr
->ipaddr
== ipaddr
) {
153 list_remove_node(&i_ipaddr
->list
);
158 rw_exit(&rds_ibdev
->rwlock
);
161 kmem_free(i_ipaddr
, sizeof (*i_ipaddr
));
164 RDSV3_DPRINTF4("rdsv3_ib_remove_ipaddr",
165 "Return: rds_ibdev: %p, ipaddr: %x", rds_ibdev
, ipaddr
);
169 rdsv3_ib_update_ipaddr(struct rdsv3_ib_device
*rds_ibdev
, uint32_be_t ipaddr
)
171 struct rdsv3_ib_device
*rds_ibdev_old
;
173 RDSV3_DPRINTF4("rdsv3_ib_update_ipaddr", "rds_ibdev: %p, ipaddr: %x",
176 rds_ibdev_old
= rdsv3_ib_get_device(ipaddr
);
178 rdsv3_ib_remove_ipaddr(rds_ibdev_old
, ipaddr
);
180 return (rdsv3_ib_add_ipaddr(rds_ibdev
, ipaddr
));
184 rdsv3_ib_add_conn(struct rdsv3_ib_device
*rds_ibdev
,
185 struct rdsv3_connection
*conn
)
187 struct rdsv3_ib_connection
*ic
= conn
->c_transport_data
;
189 RDSV3_DPRINTF4("rdsv3_ib_add_conn", "rds_ibdev: %p, conn: %p",
192 /* conn was previously on the nodev_conns_list */
193 mutex_enter(&ib_nodev_conns_lock
);
194 ASSERT(!list_is_empty(&ib_nodev_conns
));
195 ASSERT(list_link_active(&ic
->ib_node
));
196 list_remove_node(&ic
->ib_node
);
198 mutex_enter(&rds_ibdev
->spinlock
);
199 list_insert_tail(&rds_ibdev
->conn_list
, ic
);
200 ic
->i_on_dev_list
= B_TRUE
;
201 mutex_exit(&rds_ibdev
->spinlock
);
202 mutex_exit(&ib_nodev_conns_lock
);
206 rdsv3_ib_remove_conn(struct rdsv3_ib_device
*rds_ibdev
,
207 struct rdsv3_connection
*conn
)
209 struct rdsv3_ib_connection
*ic
= conn
->c_transport_data
;
211 RDSV3_DPRINTF4("rdsv3_ib_remove_conn", "rds_ibdev: %p, conn: %p",
214 /* place conn on nodev_conns_list */
215 mutex_enter(&ib_nodev_conns_lock
);
217 mutex_enter(&rds_ibdev
->spinlock
);
218 ASSERT(list_link_active(&ic
->ib_node
));
219 list_remove_node(&ic
->ib_node
);
220 ic
->i_on_dev_list
= B_FALSE
;
221 mutex_exit(&rds_ibdev
->spinlock
);
223 list_insert_tail(&ib_nodev_conns
, ic
);
225 mutex_exit(&ib_nodev_conns_lock
);
227 RDSV3_DPRINTF4("rdsv3_ib_remove_conn",
228 "Return: rds_ibdev: %p, conn: %p", rds_ibdev
, conn
);
232 __rdsv3_ib_destroy_conns(struct list
*list
, kmutex_t
*list_lock
)
234 struct rdsv3_ib_connection
*ic
, *_ic
;
237 RDSV3_DPRINTF4("__rdsv3_ib_destroy_conns", "Enter: list: %p", list
);
239 /* avoid calling conn_destroy with irqs off */
240 mutex_enter(list_lock
);
241 list_splice(list
, &tmp_list
);
242 mutex_exit(list_lock
);
244 RDSV3_FOR_EACH_LIST_NODE_SAFE(ic
, _ic
, &tmp_list
, ib_node
) {
245 rdsv3_conn_destroy(ic
->conn
);
248 RDSV3_DPRINTF4("__rdsv3_ib_destroy_conns", "Return: list: %p", list
);
252 rdsv3_ib_destroy_mr_pool(struct rdsv3_ib_device
*rds_ibdev
)
254 struct rdsv3_fmr_pool
*pool
= rds_ibdev
->fmr_pool
;
256 RDSV3_DPRINTF4("rdsv3_ib_destroy_mr_pool", "Enter: ibdev: %p",
259 if (rds_ibdev
->fmr_pool_hdl
== NULL
)
263 list_destroy(&pool
->f_list
);
264 kmem_free((void *) pool
, sizeof (*pool
));
267 (void) rdsv3_ib_flush_mr_pool(rds_ibdev
, rds_ibdev
->fmr_pool_hdl
, 1);
268 (void) ibt_destroy_fmr_pool(ib_get_ibt_hca_hdl(rds_ibdev
->dev
),
269 rds_ibdev
->fmr_pool_hdl
);
272 #define IB_FMR_MAX_BUF_SIZE 0x1000000 /* 16MB max buf */
274 rdsv3_ib_create_mr_pool(struct rdsv3_ib_device
*rds_ibdev
)
277 ibt_fmr_pool_attr_t fmr_attr
;
278 ibt_status_t ibt_status
;
279 struct rdsv3_fmr_pool
*pool
;
281 RDSV3_DPRINTF4("rdsv3_ib_create_mr_pool",
282 "Enter: ibdev: %p", rds_ibdev
);
284 pool
= kmem_zalloc(sizeof (*pool
), KM_NOSLEEP
);
289 /* setup FMR pool attributes */
290 h_page_sz
= rds_ibdev
->hca_attr
.hca_page_sz
* 1024;
292 fmr_attr
.fmr_max_pages_per_fmr
= (IB_FMR_MAX_BUF_SIZE
/ h_page_sz
) + 2;
293 fmr_attr
.fmr_pool_size
= RDSV3_FMR_POOL_SIZE
;
294 fmr_attr
.fmr_dirty_watermark
= 128;
295 fmr_attr
.fmr_cache
= B_FALSE
;
296 fmr_attr
.fmr_flags
= IBT_MR_NOSLEEP
| IBT_MR_ENABLE_LOCAL_WRITE
|
297 IBT_MR_ENABLE_REMOTE_WRITE
| IBT_MR_ENABLE_REMOTE_READ
;
298 fmr_attr
.fmr_page_sz
= h_page_sz
;
299 fmr_attr
.fmr_func_hdlr
= NULL
;
300 fmr_attr
.fmr_func_arg
= NULL
;
302 /* create the FMR pool */
303 ibt_status
= ibt_create_fmr_pool(rds_ibdev
->ibt_hca_hdl
,
304 rds_ibdev
->pd
->ibt_pd
, &fmr_attr
, &rds_ibdev
->fmr_pool_hdl
);
305 if (ibt_status
!= IBT_SUCCESS
) {
306 kmem_free((void *) pool
, sizeof (*pool
));
307 rds_ibdev
->fmr_pool
= NULL
;
311 list_create(&pool
->f_list
, sizeof (struct rdsv3_ib_mr
),
312 offsetof(struct rdsv3_ib_mr
, m_obj
));
313 mutex_init(&pool
->f_lock
, NULL
, MUTEX_DRIVER
, NULL
);
314 rds_ibdev
->fmr_pool
= pool
;
315 rds_ibdev
->max_fmrs
= fmr_attr
.fmr_pool_size
;
316 rds_ibdev
->fmr_message_size
= fmr_attr
.fmr_max_pages_per_fmr
;
318 RDSV3_DPRINTF2("rdsv3_ib_create_mr_pool",
319 "Exit: ibdev: %p fmr_pool: %p", rds_ibdev
, pool
);
324 rdsv3_ib_get_mr_info(struct rdsv3_ib_device
*rds_ibdev
,
325 struct rds_info_rdma_connection
*iinfo
)
327 iinfo
->rdma_mr_max
= rds_ibdev
->max_fmrs
;
328 iinfo
->rdma_mr_size
= rds_ibdev
->fmr_message_size
;
332 rdsv3_ib_get_mr(struct rds_iovec
*args
, unsigned long nents
,
333 struct rdsv3_sock
*rs
, uint32_t *key_ret
)
335 struct rdsv3_ib_device
*rds_ibdev
;
336 struct rdsv3_ib_mr
*ibmr
= NULL
;
337 ddi_umem_cookie_t umem_cookie
;
343 RDSV3_DPRINTF4("rdsv3_ib_get_mr", "Enter: args.addr: %p", args
->addr
);
345 rds_ibdev
= rdsv3_ib_get_device(rs
->rs_bound_addr
);
347 if (rds_ibdev
== NULL
)
348 return (void *)(PTR_ERR(-EFAULT
));
350 ibmr
= rdsv3_ib_alloc_fmr(rds_ibdev
);
354 /* pin user memory pages */
355 umem_len
= ptob(btopr(args
->bytes
+
356 ((uintptr_t)args
->addr
& PAGEOFFSET
)));
357 umem_addr
= (caddr_t
)((uintptr_t)args
->addr
& ~PAGEOFFSET
);
358 ret
= umem_lockmemory(umem_addr
, umem_len
,
359 DDI_UMEMLOCK_WRITE
| DDI_UMEMLOCK_READ
,
360 &umem_cookie
, NULL
, NULL
);
362 kmem_free((void *) ibmr
, sizeof (*ibmr
));
363 ibmr
= ERR_PTR(-ret
);
367 /* transpose umem_cookie to buf structure for rdsv3_ib_map_fmr() */
368 bp
= ddi_umem_iosetup(umem_cookie
, 0, umem_len
,
369 B_WRITE
, 0, 0, NULL
, DDI_UMEM_SLEEP
);
371 ret
= rdsv3_ib_map_fmr(rds_ibdev
, ibmr
, bp
, nents
);
372 freerbuf(bp
); /* free bp */
374 ibmr
->umem_cookie
= umem_cookie
;
375 *key_ret
= (uint32_t)ibmr
->rc_mem_desc
.pmd_rkey
;
376 ibmr
->m_device
= rds_ibdev
;
377 ibmr
->m_pool
= rds_ibdev
->fmr_pool
;
378 RDSV3_DPRINTF4("rdsv3_ib_get_mr",
379 "Return: ibmr: %p umem_cookie %p", ibmr
, ibmr
->umem_cookie
);
381 } else { /* error return */
382 RDSV3_DPRINTF2("rdsv3_ib_get_mr", "map_fmr failed (errno=%d)\n",
384 ddi_umem_unlock(umem_cookie
);
385 kmem_free((void *)ibmr
, sizeof (*ibmr
));
386 return (ERR_PTR(ret
));
390 static struct rdsv3_ib_mr
*
391 rdsv3_ib_alloc_fmr(struct rdsv3_ib_device
*rds_ibdev
)
393 struct rdsv3_ib_mr
*ibmr
;
395 RDSV3_DPRINTF4("rdsv3_ib_alloc_fmr", "Enter: ibdev: %p", rds_ibdev
);
397 if (rds_ibdev
->fmr_pool_hdl
) {
398 ibmr
= kmem_zalloc(sizeof (*ibmr
), KM_SLEEP
);
399 ibmr
->rc_hca_hdl
= ib_get_ibt_hca_hdl(rds_ibdev
->dev
);
400 ibmr
->fmr_pool_hdl
= rds_ibdev
->fmr_pool_hdl
;
403 return (struct rdsv3_ib_mr
*)(PTR_ERR(-ENOMEM
));
407 rdsv3_ib_map_fmr(struct rdsv3_ib_device
*rds_ibdev
, struct rdsv3_ib_mr
*ibmr
,
408 struct buf
*bp
, unsigned int nents
)
410 ibt_va_attr_t va_attr
;
411 ibt_reg_req_t reg_req
;
412 uint_t paddr_list_len
;
414 ibt_status_t ibt_status
;
415 /* LINTED E_FUNC_SET_NOT_USED */
416 unsigned int l_nents
= nents
;
418 RDSV3_DPRINTF4("rdsv3_ib_map_fmr", "Enter: ibmr: %p", ibmr
);
419 RDSV3_DPRINTF4("rdsv3_ib_map_fmr", "buf addr: %p", bp
->b_un
.b_addr
);
421 /* setup ibt_map_mem_area attributes */
422 bzero(&va_attr
, sizeof (ibt_va_attr_t
));
424 va_attr
.va_flags
= IBT_VA_FMR
| IBT_VA_BUF
;
426 page_sz
= rds_ibdev
->hca_attr
.hca_page_sz
* 1024; /* in kbytes */
427 paddr_list_len
= (bp
->b_bcount
/ page_sz
) + 2; /* start + end pg */
429 /* map user buffer to HCA address */
430 ibt_status
= ibt_map_mem_area(ibmr
->rc_hca_hdl
,
431 &va_attr
, paddr_list_len
, ®_req
, &ibmr
->rc_ma_hdl
);
432 if (ibt_status
!= IBT_SUCCESS
) {
436 /* use a free entry from FMR pool to register the specified memory */
437 ibt_status
= ibt_register_physical_fmr(ibmr
->rc_hca_hdl
,
439 ®_req
.fn_arg
, &ibmr
->rc_fmr_hdl
, &ibmr
->rc_mem_desc
);
440 if (ibt_status
!= IBT_SUCCESS
) {
441 RDSV3_DPRINTF2("rdsv3_ib_map_fmr", "reg_phy_fmr failed %d",
443 (void) ibt_unmap_mem_area(ibmr
->rc_hca_hdl
,
445 if (ibt_status
== IBT_INSUFF_RESOURCE
) {
450 RDSV3_DPRINTF4("rdsv3_ib_map_fmr", "Return: ibmr: %p rkey: 0x%x",
451 ibmr
, (uint32_t)ibmr
->rc_mem_desc
.pmd_rkey
);
/*
 * MR sync entry point for the transport ops vector. Intentionally a no-op:
 * FMR sync is not needed in Solaris on PCI-ex systems. The local copies
 * exist only to silence lint about unused parameters.
 */
void
rdsv3_ib_sync_mr(void *trans_private, int direction)
{
	/* LINTED E_FUNC_SET_NOT_USED */
	void *l_trans_private = trans_private;
	/* LINTED E_FUNC_SET_NOT_USED */
	int l_direction = direction;

	/* FMR Sync not needed in Solaris on PCI-ex systems */

	RDSV3_DPRINTF4("rdsv3_ib_sync_mr", "Enter:");
}
469 rdsv3_ib_flush_mrs(void)
471 struct rdsv3_ib_device
*rds_ibdev
;
473 RDSV3_DPRINTF4("rdsv3_ib_flush_mrs", "Enter:");
475 RDSV3_FOR_EACH_LIST_NODE(rds_ibdev
, &rdsv3_ib_devices
, list
) {
476 if (rds_ibdev
->fmr_pool_hdl
) {
477 (void) rdsv3_ib_flush_mr_pool(rds_ibdev
,
478 rds_ibdev
->fmr_pool_hdl
, 0);
484 rdsv3_ib_drop_mr(struct rdsv3_ib_mr
*ibmr
)
486 /* return the fmr to the IBTF pool */
487 (void) ibt_deregister_fmr(ibmr
->rc_hca_hdl
, ibmr
->rc_fmr_hdl
);
488 (void) ibt_unmap_mem_area(ibmr
->rc_hca_hdl
, ibmr
->rc_ma_hdl
);
489 (void) ddi_umem_unlock(ibmr
->umem_cookie
);
490 kmem_free((void *) ibmr
, sizeof (*ibmr
));
494 rdsv3_ib_drain_mrlist_fn(void *data
)
496 struct rdsv3_fmr_pool
*pool
= (struct rdsv3_fmr_pool
*)data
;
497 ibt_hca_hdl_t hca_hdl
;
498 ibt_fmr_pool_hdl_t fmr_pool_hdl
;
500 struct rdsv3_ib_mr
*ibmr
;
501 list_t
*listp
= &pool
->f_list
;
502 kmutex_t
*lockp
= &pool
->f_lock
;
509 ibmr
= (struct rdsv3_ib_mr
*)list_remove_head(listp
);
515 if ((inval
== 0) && ibmr
->m_inval
) {
517 hca_hdl
= ibmr
->rc_hca_hdl
;
518 fmr_pool_hdl
= ibmr
->fmr_pool_hdl
;
521 rdsv3_ib_drop_mr(ibmr
);
524 (void) ibt_flush_fmr_pool(hca_hdl
, fmr_pool_hdl
);
528 rdsv3_ib_free_mr(void *trans_private
, int invalidate
)
530 struct rdsv3_ib_mr
*ibmr
= trans_private
;
531 rdsv3_af_thr_t
*af_thr
;
533 RDSV3_DPRINTF4("rdsv3_ib_free_mr", "Enter: ibmr: %p inv: %d",
536 /* save af_thr at local as ibmr might be freed at mutex_exit */
537 af_thr
= ibmr
->m_device
->fmr_soft_cq
;
538 ibmr
->m_inval
= (unsigned int) invalidate
;
539 mutex_enter(&ibmr
->m_pool
->f_lock
);
540 list_insert_tail(&ibmr
->m_pool
->f_list
, ibmr
);
541 ibmr
->m_pool
->f_listcnt
++;
542 mutex_exit(&ibmr
->m_pool
->f_lock
);
544 rdsv3_af_thr_fire(af_thr
);
548 rdsv3_ib_flush_mr_pool(struct rdsv3_ib_device
*rds_ibdev
,
549 ibt_fmr_pool_hdl_t pool_hdl
, int free_all
)
551 /* LINTED E_FUNC_SET_NOT_USED */
552 int l_free_all
= free_all
;
554 RDSV3_DPRINTF4("rdsv3_ib_flush_mr_pool", "Enter: pool: %p", pool_hdl
);
556 rdsv3_ib_stats_inc(s_ib_rdma_mr_pool_flush
);
558 (void) ibt_flush_fmr_pool(ib_get_ibt_hca_hdl(rds_ibdev
->dev
),