/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strlog.h>
#include <sys/cmn_err.h>
#include <sys/socket.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <sys/ethernet.h>
#include <inet/ip_ire.h>
#include <inet/ip_if.h>
#include <inet/ip_ftable.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sysmacros.h>
#include <inet/common.h>
#include <net/if_types.h>
#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/rdma.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_impl.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
#include <sys/mac_client.h>
#include <sys/mac_provider.h>
#include <sys/mac_client_priv.h>
ddi_taskq_t		*rdsv3_taskq = NULL;
extern kmem_cache_t	*rdsv3_alloc_cache;
extern unsigned int ip_ocsum(ushort_t *address, int halfword_count,
    unsigned int sum);
/*
 * Check if the IP interface named by `lifrp' is RDS-capable.
 */
static boolean_t
rdsv3_capable_interface(struct lifreq *lifrp)
{
    char	ifname[LIFNAMSIZ];
    char	drv[MAXLINKNAMELEN];
    uint_t	ppa;
    char	*cp;

    RDSV3_DPRINTF4("rdsv3_capable_interface", "Enter");

    if (lifrp->lifr_type == IFT_IB)
        return (B_TRUE);

    /*
     * Strip off the logical interface portion before getting
     * intimate with the name.
     */
    (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
    if ((cp = strchr(ifname, ':')) != NULL)
        *cp = '\0';

    if (strcmp("lo0", ifname) == 0) {
        /*
         * loopback is considered RDS-capable
         */
        return (B_TRUE);
    }

    return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS &&
        rdsv3_if_lookup_by_name(drv));
}
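
/*
 * rdsv3_do_ip_ioctl() (and its struct ifreq counterpart
 * rdsv3_do_ip_ioctl_old() further below) builds the list of RDS-capable
 * IP interfaces in two passes over a SIOCGLIFCONF/SIOCGIFCONF snapshot:
 * the first pass counts the interfaces that are up, are not
 * anycast/local-only/deprecated and pass the RDS-capable check above;
 * the second pass copies the matching records into a buffer returned to
 * the caller, with the address family rewritten to AF_INET_OFFLOAD.
 */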
static int
rdsv3_do_ip_ioctl(ksocket_t so4, void **ipaddrs, int *size, int *nifs)
{
    struct lifnum	lifn;
    struct lifconf	lifc;
    struct lifreq	*lp, *rlp, lifr;
    int			rval, rc;
    int			numifs, n, i, j;
    int			bufsize, rbufsize;
    char		*buf, *rbuf;

    RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Enter");

retry:
    /* snapshot the current number of interfaces */
    lifn.lifn_family = PF_UNSPEC;
    lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
    rval = ksocket_ioctl(so4, SIOCGLIFNUM, (intptr_t)&lifn, &rval,
        CRED());
    if (rval != 0) {
        RDSV3_DPRINTF2("rdsv3_do_ip_ioctl",
            "ksocket_ioctl returned: %d", rval);
        return (rval);
    }

    numifs = lifn.lifn_count;
    if (numifs <= 0) {
        RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No interfaces found");
        return (0);
    }

    /* allocate extra room in case more interfaces appear */

    /* get the interface names and ip addresses */
    bufsize = numifs * sizeof (struct lifreq);
    buf = kmem_alloc(bufsize, KM_SLEEP);

    lifc.lifc_family = AF_UNSPEC;
    lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
    lifc.lifc_len = bufsize;
    lifc.lifc_buf = buf;
    rc = ksocket_ioctl(so4, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
    if (rc != 0) {
        RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "SIOCGLIFCONF failed");
        kmem_free(buf, bufsize);
        return (rc);
    }

    /* if our extra room is used up, try again */
    if (bufsize <= lifc.lifc_len) {
        kmem_free(buf, bufsize);
        goto retry;
    }

    /* calc actual number of ifconfs */
    n = lifc.lifc_len / sizeof (struct lifreq);

    /*
     * Count the RDS interfaces
     */
    for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {

        /*
         * Copy as the SIOCGLIFFLAGS ioctl is destructive
         */
        bcopy(lp, &lifr, sizeof (struct lifreq));
        /*
         * fetch the flags using the socket of the correct family
         */
        switch (lifr.lifr_addr.ss_family) {
        case AF_INET:
            rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr,
                &rval, CRED());
            break;
        default:
            continue;
        }

        if (rc != 0) continue;

        /*
         * If we got the flags, skip uninteresting
         * interfaces based on flags
         */
        if ((lifr.lifr_flags & IFF_UP) != IFF_UP)
            continue;
        if (lifr.lifr_flags &
            (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
            continue;
        if (!rdsv3_capable_interface(&lifr))
            continue;
        j++;
    }

    if (j <= 0) {
        RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No RDS interfaces");
        kmem_free(buf, bufsize);
        return (rval);
    }

    numifs = j;

    /* This is the buffer we pass back */
    rbufsize = numifs * sizeof (struct lifreq);
    rbuf = kmem_alloc(rbufsize, KM_SLEEP);
    rlp = (struct lifreq *)rbuf;

    /*
     * Examine the array of interfaces and filter uninteresting ones
     */
    for (i = 0, lp = lifc.lifc_req; i < n; i++, lp++) {

        /*
         * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
         */
        bcopy(lp, &lifr, sizeof (struct lifreq));
        /*
         * fetch the flags using the socket of the correct family
         */
        switch (lifr.lifr_addr.ss_family) {
        case AF_INET:
            rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr,
                &rval, CRED());
            break;
        default:
            continue;
        }

        if (rc != 0) {
            RDSV3_DPRINTF2("rdsv3_do_ip_ioctl",
                "ksocket_ioctl failed" " for %s", lifr.lifr_name);
            continue;
        }

        /*
         * If we got the flags, skip uninteresting
         * interfaces based on flags
         */
        if ((lifr.lifr_flags & IFF_UP) != IFF_UP)
            continue;
        if (lifr.lifr_flags &
            (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
            continue;
        if (!rdsv3_capable_interface(&lifr))
            continue;

        /* save the record */
        bcopy(lp, rlp, sizeof (struct lifreq));
        rlp->lifr_addr.ss_family = AF_INET_OFFLOAD;
        rlp++;
    }

    kmem_free(buf, bufsize);

    *ipaddrs = rbuf;
    *size = rbufsize;
    *nifs = numifs;

    RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Return");

    return (0);
}
/*
 * Check if the IP interface named by `ifrp' is RDS-capable.
 */
static boolean_t
rdsv3_capable_interface_old(struct ifreq *ifrp)
{
    char	ifname[IFNAMSIZ];
    char	drv[MAXLINKNAMELEN];
    uint_t	ppa;
    char	*cp;

    RDSV3_DPRINTF4("rdsv3_capable_interface_old", "Enter");

    /*
     * Strip off the logical interface portion before getting
     * intimate with the name.
     */
    (void) strlcpy(ifname, ifrp->ifr_name, IFNAMSIZ);
    if ((cp = strchr(ifname, ':')) != NULL)
        *cp = '\0';

    RDSV3_DPRINTF4("rdsv3_capable_interface_old", "ifname: %s", ifname);

    if ((strcmp("lo0", ifname) == 0) ||
        (strncmp("ibd", ifname, 3) == 0)) {
        /*
         * loopback and IB are considered RDS-capable
         */
        return (B_TRUE);
    }

    return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS &&
        rdsv3_if_lookup_by_name(drv));
}
static int
rdsv3_do_ip_ioctl_old(ksocket_t so4, void **ipaddrs, int *size, int *nifs)
{
    struct ifconf	ifc;
    struct ifreq	*lp, *rlp, ifr;
    int			rval, rc;
    int			ifn, numifs, n, i, j;
    int			bufsize, rbufsize;
    char		*buf, *rbuf;

    RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Enter");

retry:
    rval = ksocket_ioctl(so4, SIOCGIFNUM, (intptr_t)&ifn, &rval,
        CRED());
    if (rval != 0) {
        RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
            "ksocket_ioctl(SIOCGIFNUM) returned: %d", rval);
        return (rval);
    }

    numifs = ifn;
    if (numifs <= 0) {
        RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No interfaces found");
        return (0);
    }

    /* allocate extra room in case more interfaces appear */

    /* get the interface names and ip addresses */
    bufsize = numifs * sizeof (struct ifreq);
    buf = kmem_alloc(bufsize, KM_SLEEP);

    ifc.ifc_len = bufsize;
    ifc.ifc_buf = buf;
    rc = ksocket_ioctl(so4, SIOCGIFCONF, (intptr_t)&ifc, &rval, CRED());
    if (rc != 0) {
        RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
            "SIOCGLIFCONF failed: %d", rc);
        kmem_free(buf, bufsize);
        return (rc);
    }

    /* if our extra room is used up, try again */
    if (bufsize <= ifc.ifc_len) {
        kmem_free(buf, bufsize);
        goto retry;
    }

    /* calc actual number of ifconfs */
    n = ifc.ifc_len / sizeof (struct ifreq);

    /*
     * Count the RDS interfaces
     */
    for (i = 0, j = 0, lp = ifc.ifc_req; i < n; i++, lp++) {

        /*
         * Copy as the SIOCGIFFLAGS ioctl is destructive
         */
        bcopy(lp, &ifr, sizeof (struct ifreq));
        /*
         * fetch the flags using the socket of the correct family
         */
        switch (ifr.ifr_addr.sa_family) {
        case AF_INET:
            rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr,
                &rval, CRED());
            break;
        default:
            continue;
        }

        if (rc != 0) continue;

        RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
            "1. ifr_name: %s, flags: %d", ifr.ifr_name,
            (ushort_t)ifr.ifr_flags);

        /*
         * If we got the flags, skip uninteresting
         * interfaces based on flags
         */
        if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP)
            continue;

        RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
            "2. ifr_name: %s, flags: %d", ifr.ifr_name,
            (ushort_t)ifr.ifr_flags);
        if (((ushort_t)ifr.ifr_flags) &
            (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
            continue;

        RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
            "3. ifr_name: %s, flags: %d", ifr.ifr_name,
            (ushort_t)ifr.ifr_flags);
        if (!rdsv3_capable_interface_old(&ifr))
            continue;

        RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
            "4. ifr_name: %s, flags: %d", ifr.ifr_name,
            (ushort_t)ifr.ifr_flags);
        j++;
    }

    if (j <= 0) {
        RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No RDS interfaces");
        kmem_free(buf, bufsize);
        return (rval);
    }

    numifs = j;

    /* This is the buffer we pass back */
    rbufsize = numifs * sizeof (struct ifreq);
    rbuf = kmem_alloc(rbufsize, KM_SLEEP);
    rlp = (struct ifreq *)rbuf;

    /*
     * Examine the array of interfaces and filter uninteresting ones
     */
    for (i = 0, lp = ifc.ifc_req; i < n; i++, lp++) {

        /*
         * Copy the address as the SIOCGIFFLAGS ioctl is destructive
         */
        bcopy(lp, &ifr, sizeof (struct ifreq));
        /*
         * fetch the flags using the socket of the correct family
         */
        switch (ifr.ifr_addr.sa_family) {
        case AF_INET:
            rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr,
                &rval, CRED());
            break;
        default:
            continue;
        }

        if (rc != 0) {
            RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
                "ksocket_ioctl failed: %d for %s",
                rc, ifr.ifr_name);
            continue;
        }

        /*
         * If we got the flags, skip uninteresting
         * interfaces based on flags
         */
        if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP)
            continue;
        if (((ushort_t)ifr.ifr_flags) &
            (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
            continue;
        if (!rdsv3_capable_interface_old(&ifr))
            continue;

        /* save the record */
        bcopy(lp, rlp, sizeof (struct ifreq));
        rlp->ifr_addr.sa_family = AF_INET_OFFLOAD;
        rlp++;
    }

    kmem_free(buf, bufsize);

    *ipaddrs = rbuf;
    *size = rbufsize;
    *nifs = numifs;

    RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Return");

    return (0);
}
boolean_t
rdsv3_isloopback(ipaddr_t addr)
{
    ip_stack_t *ipst;

    ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip;
    ASSERT(ipst != NULL);
    if (ip_type_v4(addr, ipst) != IRE_LOOPBACK) {
        netstack_rele(ipst->ips_netstack);
        return (B_FALSE);
    }
    netstack_rele(ipst->ips_netstack);
    return (B_TRUE);
}
/*
 * Work Queue Implementation
 */

#define	RDSV3_WQ_THREAD_IDLE		0
#define	RDSV3_WQ_THREAD_RUNNING		1
#define	RDSV3_WQ_THREAD_FLUSHING	2
#define	RDSV3_WQ_THREAD_EXITING		3
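
/*
 * Workqueue state transitions, as used by the routines below:
 *   IDLE    -> RUNNING   when work is queued and a taskq thread is
 *                        dispatched (rdsv3_queue_work)
 *   RUNNING -> IDLE      when the worker drains the queue
 *                        (rdsv3_worker_thread)
 *   RUNNING -> FLUSHING  while rdsv3_flush_workqueue waits for the
 *                        worker to drain
 *   any     -> EXITING   when the workqueue is being torn down
 *                        (rdsv3_destroy_task_workqueue)
 */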
static void
rdsv3_worker_thread(void *arg)
{
    rdsv3_workqueue_struct_t *wq = arg;
    rdsv3_work_t *work;

    RDSV3_DPRINTF4("rdsv3_worker_thread", "Enter(wq: 0x%p)", wq);

    mutex_enter(&wq->wq_lock);
    work = list_remove_head(&wq->wq_queue);
    while (work) {
        mutex_exit(&wq->wq_lock);

        /* process the work item */
        work->func(work);

        mutex_enter(&wq->wq_lock);
        work = list_remove_head(&wq->wq_queue);
    }

    /* No more work, go home, until called again */
    if (wq->wq_state != RDSV3_WQ_THREAD_EXITING) {
        wq->wq_state = RDSV3_WQ_THREAD_IDLE;
    }
    mutex_exit(&wq->wq_lock);

    RDSV3_DPRINTF4("rdsv3_worker_thread", "Return(wq: 0x%p)", wq);
}
void
rdsv3_flush_workqueue(rdsv3_workqueue_struct_t *wq)
{
    RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Enter(wq: %p)", wq);

    mutex_enter(&wq->wq_lock);
    switch (wq->wq_state) {
    case RDSV3_WQ_THREAD_IDLE:
        ASSERT(list_is_empty(&wq->wq_queue));
        break;

    case RDSV3_WQ_THREAD_RUNNING:
        wq->wq_state = RDSV3_WQ_THREAD_FLUSHING;
        /* FALLTHRU */
    case RDSV3_WQ_THREAD_FLUSHING:
        /* already flushing, wait until the flushing is complete */
        do {
            mutex_exit(&wq->wq_lock);
            mutex_enter(&wq->wq_lock);
        } while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING);
        break;

    case RDSV3_WQ_THREAD_EXITING:
        mutex_exit(&wq->wq_lock);
        rdsv3_worker_thread(wq);
        return;
    }
    mutex_exit(&wq->wq_lock);

    RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Return(wq: %p)", wq);
}
void
rdsv3_queue_work(rdsv3_workqueue_struct_t *wq, rdsv3_work_t *wp)
{
    RDSV3_DPRINTF4("rdsv3_queue_work", "Enter(wq: %p, wp: %p)", wq, wp);

    mutex_enter(&wq->wq_lock);

    if (list_link_active(&wp->work_item)) {
        /* This is already in the queue, ignore this call */
        mutex_exit(&wq->wq_lock);
        RDSV3_DPRINTF3("rdsv3_queue_work", "already queued: %p", wp);
        return;
    }

    switch (wq->wq_state) {
    case RDSV3_WQ_THREAD_RUNNING:
        list_insert_tail(&wq->wq_queue, wp);
        mutex_exit(&wq->wq_lock);
        break;

    case RDSV3_WQ_THREAD_FLUSHING:
        do {
            mutex_exit(&wq->wq_lock);
            mutex_enter(&wq->wq_lock);
        } while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING);

        if (wq->wq_state == RDSV3_WQ_THREAD_RUNNING) {
            list_insert_tail(&wq->wq_queue, wp);
            mutex_exit(&wq->wq_lock);
            break;
        }
        /* FALLTHRU */

    case RDSV3_WQ_THREAD_IDLE:
        list_insert_tail(&wq->wq_queue, wp);
        wq->wq_state = RDSV3_WQ_THREAD_RUNNING;
        mutex_exit(&wq->wq_lock);

        (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_worker_thread, wq,
            DDI_SLEEP);
        break;

    case RDSV3_WQ_THREAD_EXITING:
        mutex_exit(&wq->wq_lock);
        break;
    }

    RDSV3_DPRINTF4("rdsv3_queue_work", "Return(wq: %p, wp: %p)", wq, wp);
}
/* timeout handler for delayed work queuing */
static void
rdsv3_work_timeout_handler(void *arg)
{
    rdsv3_delayed_work_t *dwp = (rdsv3_delayed_work_t *)arg;

    RDSV3_DPRINTF4("rdsv3_work_timeout_handler",
        "Enter(wq: %p, wp: %p)", dwp->wq, &dwp->work);

    mutex_enter(&dwp->lock);
    dwp->timeid = 0;
    mutex_exit(&dwp->lock);

    mutex_enter(&dwp->wq->wq_lock);
    dwp->wq->wq_pending--;
    if (dwp->wq->wq_state == RDSV3_WQ_THREAD_EXITING) {
        mutex_exit(&dwp->wq->wq_lock);
        return;
    }
    mutex_exit(&dwp->wq->wq_lock);

    rdsv3_queue_work(dwp->wq, &dwp->work);

    RDSV3_DPRINTF4("rdsv3_work_timeout_handler",
        "Return(wq: %p, wp: %p)", dwp->wq, &dwp->work);
}
void
rdsv3_queue_delayed_work(rdsv3_workqueue_struct_t *wq,
    rdsv3_delayed_work_t *dwp, uint_t delay)
{
    RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
        "Enter(wq: %p, wp: %p)", wq, dwp);

    if (delay == 0) {
        rdsv3_queue_work(wq, &dwp->work);
        return;
    }

    mutex_enter(&wq->wq_lock);
    if (wq->wq_state == RDSV3_WQ_THREAD_EXITING) {
        mutex_exit(&wq->wq_lock);
        RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
            "WQ exiting - don't queue (wq: %p, wp: %p)", wq, dwp);
        return;
    }
    wq->wq_pending++;
    mutex_exit(&wq->wq_lock);

    mutex_enter(&dwp->lock);
    if (dwp->timeid == 0) {
        dwp->timeid = timeout(rdsv3_work_timeout_handler, dwp,
            jiffies + (delay * rdsv3_one_sec_in_hz));
        mutex_exit(&dwp->lock);
    } else {
        mutex_exit(&dwp->lock);
        RDSV3_DPRINTF4("rdsv3_queue_delayed_work", "Already queued: %p",
            dwp);
        mutex_enter(&wq->wq_lock);
        wq->wq_pending--;
        mutex_exit(&wq->wq_lock);
    }

    RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
        "Return(wq: %p, wp: %p)", wq, dwp);
}
void
rdsv3_cancel_delayed_work(rdsv3_delayed_work_t *dwp)
{
    RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
        "Enter(wq: %p, dwp: %p)", dwp->wq, dwp);

    mutex_enter(&dwp->lock);
    if (dwp->timeid != 0) {
        (void) untimeout(dwp->timeid);
        dwp->timeid = 0;
    } else {
        RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
            "Nothing to cancel (wq: %p, dwp: %p)", dwp->wq, dwp);
        mutex_exit(&dwp->lock);
        return;
    }
    mutex_exit(&dwp->lock);

    mutex_enter(&dwp->wq->wq_lock);
    dwp->wq->wq_pending--;
    mutex_exit(&dwp->wq->wq_lock);

    RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
        "Return(wq: %p, dwp: %p)", dwp->wq, dwp);
}
void
rdsv3_destroy_task_workqueue(rdsv3_workqueue_struct_t *wq)
{
    RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Enter");

    mutex_enter(&wq->wq_lock);
    wq->wq_state = RDSV3_WQ_THREAD_EXITING;

    while (wq->wq_pending > 0) {
        mutex_exit(&wq->wq_lock);
        mutex_enter(&wq->wq_lock);
    }
    mutex_exit(&wq->wq_lock);

    rdsv3_flush_workqueue(wq);

    list_destroy(&wq->wq_queue);
    mutex_destroy(&wq->wq_lock);
    kmem_free(wq, sizeof (rdsv3_workqueue_struct_t));

    ddi_taskq_destroy(rdsv3_taskq);
    rdsv3_taskq = NULL;

    RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Return");
}
void
rdsv3_rdma_init_worker(struct rdsv3_work_s *work)
{
    rdsv3_rdma_init();
}
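
/*
 * The workqueue is backed by a single-threaded DDI taskq
 * (RDSV3_NUM_TASKQ_THREADS is 1), so queued work items execute one at a
 * time, in queue order.
 */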
#define	RDSV3_NUM_TASKQ_THREADS	1
rdsv3_workqueue_struct_t *
rdsv3_create_task_workqueue(char *name)
{
    rdsv3_workqueue_struct_t	*wq;

    RDSV3_DPRINTF2("create_singlethread_workqueue", "Enter (dip: %p)",
        rdsv3_dev_info);

    rdsv3_taskq = ddi_taskq_create(rdsv3_dev_info, name,
        RDSV3_NUM_TASKQ_THREADS, TASKQ_DEFAULTPRI, 0);
    if (rdsv3_taskq == NULL) {
        RDSV3_DPRINTF2(__FILE__,
            "ddi_taskq_create failed for rdsv3_taskq");
        return (NULL);
    }

    wq = kmem_zalloc(sizeof (rdsv3_workqueue_struct_t), KM_NOSLEEP);
    if (wq == NULL) {
        RDSV3_DPRINTF2(__FILE__, "kmem_zalloc failed for wq");
        ddi_taskq_destroy(rdsv3_taskq);
        return (NULL);
    }

    list_create(&wq->wq_queue, sizeof (struct rdsv3_work_s),
        offsetof(struct rdsv3_work_s, work_item));
    mutex_init(&wq->wq_lock, NULL, MUTEX_DRIVER, NULL);
    wq->wq_state = RDSV3_WQ_THREAD_IDLE;

    rdsv3_one_sec_in_hz = drv_usectohz(1000000);

    RDSV3_DPRINTF2("create_singlethread_workqueue", "Return");

    return (wq);
}
/*
 * Implementation for struct sock
 */

void
rdsv3_sock_exit_data(struct rsock *sk)
{
    struct rdsv3_sock *rs = sk->sk_protinfo;

    RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk);

    ASSERT(rdsv3_sk_sock_flag(sk, SOCK_DEAD));

    list_destroy(&rs->rs_send_queue);
    list_destroy(&rs->rs_notify_queue);
    list_destroy(&rs->rs_recv_queue);

    rw_destroy(&rs->rs_recv_lock);
    mutex_destroy(&rs->rs_lock);

    mutex_destroy(&rs->rs_rdma_lock);
    avl_destroy(&rs->rs_rdma_keys);

    mutex_destroy(&rs->rs_conn_lock);
    mutex_destroy(&rs->rs_congested_lock);
    cv_destroy(&rs->rs_congested_cv);

    rdsv3_exit_waitqueue(sk->sk_sleep);
    kmem_free(sk->sk_sleep, sizeof (rdsv3_wait_queue_t));
    mutex_destroy(&sk->sk_lock);

    kmem_cache_free(rdsv3_alloc_cache, sk);
    RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk);
}
/* XXX - figure out right values */
#define	RDSV3_RECV_HIWATER	(256 * 1024)
#define	RDSV3_RECV_LOWATER	128
#define	RDSV3_XMIT_HIWATER	(256 * 1024)
#define	RDSV3_XMIT_LOWATER	1024
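
/*
 * RDSV3_XMIT_HIWATER and RDSV3_RECV_HIWATER are used below as the
 * default sk_sndbuf/sk_rcvbuf sizes for a newly initialized rsock.
 */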
    sk = kmem_cache_alloc(rdsv3_alloc_cache, KM_SLEEP);
    if (sk == NULL) {
        RDSV3_DPRINTF2("rdsv3_create", "kmem_cache_alloc failed");
        return (NULL);
    }

    bzero(sk, sizeof (struct rsock) + sizeof (struct rdsv3_sock));
static void
rdsv3_sock_init_data(struct rsock *sk)
{
    sk->sk_sleep = kmem_zalloc(sizeof (rdsv3_wait_queue_t), KM_SLEEP);
    rdsv3_init_waitqueue(sk->sk_sleep);

    mutex_init(&sk->sk_lock, NULL, MUTEX_DRIVER, NULL);

    sk->sk_protinfo = (struct rdsv3_sock *)(sk + 1);
    sk->sk_sndbuf = RDSV3_XMIT_HIWATER;
    sk->sk_rcvbuf = RDSV3_RECV_HIWATER;
}
/* ARGSUSED */
int
rdsv3_conn_constructor(void *buf, void *arg, int kmflags)
{
    struct rdsv3_connection *conn = buf;

    bzero(conn, sizeof (struct rdsv3_connection));

    conn->c_next_tx_seq = 1;
    mutex_init(&conn->c_lock, NULL, MUTEX_DRIVER, NULL);
    mutex_init(&conn->c_send_lock, NULL, MUTEX_DRIVER, NULL);
    conn->c_send_generation = 1;

    list_create(&conn->c_send_queue, sizeof (struct rdsv3_message),
        offsetof(struct rdsv3_message, m_conn_item));
    list_create(&conn->c_retrans, sizeof (struct rdsv3_message),
        offsetof(struct rdsv3_message, m_conn_item));

    return (0);
}
/* ARGSUSED */
void
rdsv3_conn_destructor(void *buf, void *arg)
{
    struct rdsv3_connection *conn = buf;

    ASSERT(list_is_empty(&conn->c_send_queue));
    ASSERT(list_is_empty(&conn->c_retrans));
    list_destroy(&conn->c_send_queue);
    list_destroy(&conn->c_retrans);
    mutex_destroy(&conn->c_send_lock);
    mutex_destroy(&conn->c_lock);
}
int
rdsv3_conn_compare(const void *conn1, const void *conn2)
{
    uint32_be_t laddr1, faddr1, laddr2, faddr2;

    laddr1 = ((rdsv3_conn_info_t *)conn1)->c_laddr;
    laddr2 = ((struct rdsv3_connection *)conn2)->c_laddr;

    if (laddr1 == laddr2) {
        faddr1 = ((rdsv3_conn_info_t *)conn1)->c_faddr;
        faddr2 = ((struct rdsv3_connection *)conn2)->c_faddr;
        if (faddr1 == faddr2)
            return (0);
        if (faddr1 < faddr2)
            return (-1);
        return (1);
    }

    if (laddr1 < laddr2)
        return (-1);

    return (1);
}
/* rdsv3_ib_incoming cache */
/* ARGSUSED */
int
rdsv3_ib_inc_constructor(void *buf, void *arg, int kmflags)
{
    list_create(&((struct rdsv3_ib_incoming *)buf)->ii_frags,
        sizeof (struct rdsv3_page_frag),
        offsetof(struct rdsv3_page_frag, f_item));

    return (0);
}
/* ARGSUSED */
void
rdsv3_ib_inc_destructor(void *buf, void *arg)
{
    list_destroy(&((struct rdsv3_ib_incoming *)buf)->ii_frags);
}
/* ib_frag_slab cache */
int
rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags)
{
    struct rdsv3_page_frag *frag = (struct rdsv3_page_frag *)buf;
    struct rdsv3_ib_device *rds_ibdev = (struct rdsv3_ib_device *)arg;
    ibt_iov_attr_t iov_attr;
    ibt_iov_t iov_arr[1];
    ibt_all_wr_t wr;

    bzero(frag, sizeof (struct rdsv3_page_frag));
    list_link_init(&frag->f_item);

    frag->f_page = kmem_alloc(PAGE_SIZE, kmflags);
    if (frag->f_page == NULL) {
        RDSV3_DPRINTF2("rdsv3_ib_frag_constructor",
            "kmem_alloc for %d failed", PAGE_SIZE);
        return (-1);
    }

    iov_attr.iov_as = NULL;
    iov_attr.iov = &iov_arr[0];
    iov_attr.iov_buf = NULL;
    iov_attr.iov_list_len = 1;
    iov_attr.iov_wr_nds = 1;
    iov_attr.iov_lso_hdr_sz = 0;
    iov_attr.iov_flags = IBT_IOV_SLEEP | IBT_IOV_RECV;

    iov_arr[0].iov_addr = frag->f_page;
    iov_arr[0].iov_len = PAGE_SIZE;

    wr.recv.wr_nds = 1;
    wr.recv.wr_sgl = &frag->f_sge;

    if (ibt_map_mem_iov(ib_get_ibt_hca_hdl(rds_ibdev->dev),
        &iov_attr, &wr, &frag->f_mapped) != IBT_SUCCESS) {
        RDSV3_DPRINTF2("rdsv3_ib_frag_constructor",
            "ibt_map_mem_iov failed");
        kmem_free(frag->f_page, PAGE_SIZE);
        return (-1);
    }

    return (0);
}
void
rdsv3_ib_frag_destructor(void *buf, void *arg)
{
    struct rdsv3_page_frag *frag = (struct rdsv3_page_frag *)buf;
    struct rdsv3_ib_device *rds_ibdev = (struct rdsv3_ib_device *)arg;

    /* unmap the page */
    if (ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(rds_ibdev->dev),
        frag->f_mapped) != IBT_SUCCESS)
        RDSV3_DPRINTF2("rdsv3_ib_frag_destructor",
            "ibt_unmap_mem_iov failed");

    kmem_free(frag->f_page, PAGE_SIZE);
}
extern kmutex_t loop_conns_lock;
extern list_t loop_conns;

struct rdsv3_loop_connection
{
    struct list_node loop_node;
    struct rdsv3_connection *conn;
};

void
rdsv3_loop_init(void)
{
    list_create(&loop_conns, sizeof (struct rdsv3_loop_connection),
        offsetof(struct rdsv3_loop_connection, loop_node));
    mutex_init(&loop_conns_lock, NULL, MUTEX_DRIVER, NULL);
}
/* IB Rkey is used here for comparison */
int
rdsv3_mr_compare(const void *mr1, const void *mr2)
{
    uint32_t key1 = *(uint32_t *)mr1;
    uint32_t key2 = ((struct rdsv3_mr *)mr2)->r_key;

    if (key1 < key2)
        return (-1);
    if (key1 > key2)
        return (1);
    return (0);
}
extern struct rdsv3_transport *transports[];
extern krwlock_t trans_sem;

void
rdsv3_trans_exit(void)
{
    struct rdsv3_transport *trans;
    int i;

    RDSV3_DPRINTF2("rdsv3_trans_exit", "Enter");

    /* currently, only IB transport */
    rw_enter(&trans_sem, RW_READER);
    trans = NULL;
    for (i = 0; i < RDS_TRANS_COUNT; i++) {
        if (transports[i]) {
            trans = transports[i];
            break;
        }
    }
    rw_exit(&trans_sem);

    /* trans->exit() will remove the trans from the list */
    if (trans)
        trans->exit();

    rw_destroy(&trans_sem);

    RDSV3_DPRINTF2("rdsv3_trans_exit", "Return");
}
void
rdsv3_trans_init(void)
{
    RDSV3_DPRINTF2("rdsv3_trans_init", "Enter");

    rw_init(&trans_sem, NULL, RW_DRIVER, NULL);

    RDSV3_DPRINTF2("rdsv3_trans_init", "Return");
}
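
/*
 * rdsv3_put_cmsg appends one control message to msg->msg_control: the
 * existing control buffer (if any) is copied into a freshly allocated
 * buffer that is CMSG_SPACE(size) bytes larger, the old buffer is
 * freed, and the new cmsghdr plus payload are written at the end.
 */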
int
rdsv3_put_cmsg(struct msghdr *msg, int level, int type, size_t size,
    void *payload)
{
    struct cmsghdr *cp;
    char *bp;
    size_t cmlen;
    size_t cmspace;
    size_t bufsz;

    RDSV3_DPRINTF4("rdsv3_put_cmsg",
        "Enter(msg: %p level: %d type: %d sz: %d)",
        msg, level, type, size);

    if (msg == NULL || msg->msg_controllen == 0) {
        return (0);
    }
    /* check for first cmsg or this is another cmsg to be appended */
    if (msg->msg_control == NULL)
        msg->msg_controllen = 0;

    cmlen = CMSG_LEN(size);
    cmspace = CMSG_SPACE(size);
    bufsz = msg->msg_controllen + cmspace;

    /* extend the existing cmsg to append the next cmsg */
    bp = kmem_alloc(bufsz, KM_SLEEP);
    if (msg->msg_control) {
        bcopy(msg->msg_control, bp, msg->msg_controllen);
        kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
    }

    /* assign payload the proper cmsg location */
    cp = (struct cmsghdr *)(bp + msg->msg_controllen);
    cp->cmsg_len = cmlen;
    cp->cmsg_level = level;
    cp->cmsg_type = type;

    bcopy(payload, CMSG_DATA(cp), cmlen -
        (unsigned int)_CMSG_DATA_ALIGN(sizeof (struct cmsghdr)));

    msg->msg_control = bp;
    msg->msg_controllen = bufsz;

    RDSV3_DPRINTF4("rdsv3_put_cmsg", "Return(cmsg_len: %d)", cp->cmsg_len);

    return (0);
}
int
rdsv3_verify_bind_address(ipaddr_t addr)
{
    return (1);
}
uint16_t
rdsv3_ip_fast_csum(void *hdr, size_t length)
{
    return ((uint16_t)(~ip_ocsum((ushort_t *)hdr, (int)length << 1, 0)));
}
/* scatterlist implementation */

/* ARGSUSED */
uint64_t
rdsv3_ib_sg_dma_address(ib_device_t *dev, struct rdsv3_scatterlist *scat,
    uint_t offset)
{
    return (0);
}
uint_t
rdsv3_ib_dma_map_sg(struct ib_device *dev, struct rdsv3_scatterlist *scat,
    uint_t num)
{
    struct rdsv3_scatterlist *s, *first;
    ibt_iov_t *iov;
    ibt_wr_ds_t *sgl;
    ibt_iov_attr_t iov_attr;
    ibt_send_wr_t swr;
    uint_t i;

    RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg", "scat %p, num: %d", scat, num);

    s = first = &scat[0];
    ASSERT(first->mihdl == NULL);

    iov = kmem_alloc(num * sizeof (ibt_iov_t), KM_SLEEP);
    sgl = kmem_zalloc((num * 2) * sizeof (ibt_wr_ds_t), KM_SLEEP);

    for (i = 0; i < num; i++, s++) {
        iov[i].iov_addr = s->vaddr;
        iov[i].iov_len = s->length;
    }

    iov_attr.iov_as = NULL;
    iov_attr.iov = iov;
    iov_attr.iov_buf = NULL;
    iov_attr.iov_list_len = num;
    iov_attr.iov_wr_nds = num * 2;
    iov_attr.iov_lso_hdr_sz = 0;
    iov_attr.iov_flags = IBT_IOV_SLEEP;

    swr.wr_sgl = sgl;
    swr.wr_nds = num * 2;

    i = ibt_map_mem_iov(ib_get_ibt_hca_hdl(dev),
        &iov_attr, (ibt_all_wr_t *)&swr, &first->mihdl);
    kmem_free(iov, num * sizeof (ibt_iov_t));
    if (i != IBT_SUCCESS) {
        RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg",
            "ibt_map_mem_iov returned: %d", i);
        kmem_free(sgl, (num * 2) * sizeof (ibt_wr_ds_t));
        return (0);
    }

    first->sgl = sgl;
    s = first;
    for (i = 0; i < num; i++, s++, sgl++) {
    }

    return (num);
}
void
rdsv3_ib_dma_unmap_sg(ib_device_t *dev, struct rdsv3_scatterlist *scat,
    uint_t num)
{
    /* Zero length messages have no scatter gather entries */
    if (num == 0)
        return;

    ASSERT(scat->mihdl != NULL);
    ASSERT(scat->sgl != NULL);

    (void) ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(dev), scat->mihdl);

    kmem_free(scat->sgl, (num * 2) * sizeof (ibt_wr_ds_t));
}
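
/*
 * rdsv3_ib_alloc_hdrs carves the connection's send headers, receive
 * headers and the single ack header out of one kmem allocation of
 * (send w_nr + recv w_nr + 1) rdsv3_header slots and registers the
 * whole region as a single memory region; rdsv3_ib_free_hdrs undoes
 * this.
 */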
int
rdsv3_ib_alloc_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic)
{
    caddr_t addr;
    size_t size;
    ibt_mr_attr_t mr_attr;
    ibt_mr_desc_t mr_desc;
    ibt_mr_hdl_t mr_hdl;
    int ret;

    RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Enter(dev: %p)", dev);

    ASSERT(ic->i_mr == NULL);

    size = (ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr + 1) *
        sizeof (struct rdsv3_header);

    addr = kmem_zalloc(size, KM_NOSLEEP);
    if (addr == NULL)
        return (-1);

    mr_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)addr;
    mr_attr.mr_len = size;
    mr_attr.mr_as = NULL;
    mr_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;
    ret = ibt_register_mr(ib_get_ibt_hca_hdl(dev), RDSV3_PD2PDHDL(ic->i_pd),
        &mr_attr, &mr_hdl, &mr_desc);
    if (ret != IBT_SUCCESS) {
        RDSV3_DPRINTF2("rdsv3_ib_alloc_hdrs",
            "ibt_register_mr returned: " "%d", ret);
        kmem_free(addr, size);
        return (-1);
    }

    ic->i_mr = kmem_alloc(sizeof (struct rdsv3_hdrs_mr), KM_SLEEP);
    ic->i_mr->addr = addr;
    ic->i_mr->size = size;
    ic->i_mr->hdl = mr_hdl;
    ic->i_mr->lkey = mr_desc.md_lkey;

    ic->i_send_hdrs = (struct rdsv3_header *)addr;
    ic->i_send_hdrs_dma = (uint64_t)(uintptr_t)addr;

    ic->i_recv_hdrs = (struct rdsv3_header *)(addr +
        (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header)));
    ic->i_recv_hdrs_dma = (uint64_t)(uintptr_t)(addr +
        (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header)));

    ic->i_ack = (struct rdsv3_header *)(addr +
        ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) *
        sizeof (struct rdsv3_header)));
    ic->i_ack_dma = (uint64_t)(uintptr_t)(addr +
        ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) *
        sizeof (struct rdsv3_header)));

    RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Return(dev: %p)", dev);

    return (0);
}
void
rdsv3_ib_free_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic)
{
    RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Enter(dev: %p)", dev);
    ASSERT(ic->i_mr != NULL);

    ic->i_send_hdrs = NULL;
    ic->i_send_hdrs_dma = (uintptr_t)NULL;

    ic->i_recv_hdrs = NULL;
    ic->i_recv_hdrs_dma = (uintptr_t)NULL;

    ic->i_ack = NULL;
    ic->i_ack_dma = (uintptr_t)NULL;

    (void) ibt_deregister_mr(ib_get_ibt_hca_hdl(dev), ic->i_mr->hdl);

    kmem_free(ic->i_mr->addr, ic->i_mr->size);
    kmem_free(ic->i_mr, sizeof (struct rdsv3_hdrs_mr));
    ic->i_mr = NULL;

    RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Return(dev: %p)", dev);
}
/*
 * atomic_add_unless - add unless the number is a given value
 * @v: pointer of type atomic_t
 * @a: the amount to add to v...
 * @u: ...unless v is equal to u.
 *
 * Atomically adds @a to @v, so long as it was not @u.
 * Returns non-zero if @v was not @u, and zero otherwise.
 */
int
atomic_add_unless(atomic_t *v, uint_t a, ulong_t u)
{
    uint_t c, old;

    c = *v;
    while (c != u && (old = atomic_cas_uint(v, c, c + a)) != c) {
        c = old;
    }
    return ((ulong_t)c != u);
}
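
/*
 * For example (illustrative only): atomic_add_unless(&refcount, 1, 0)
 * takes a reference only if the count has not already dropped to zero,
 * and returns zero when the count was already zero.
 */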