/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/pattr.h>		/* HCK_* */
#include <inet/ip.h>		/* ipha_t */
#include <inet/tcp.h>		/* tcph_t */
#include <sys/mac_provider.h>	/* mac_* */
#include <sys/strsun.h>		/* MBLKL */

#include <sys/ib/clients/eoib/eib_impl.h>
/*
 * Declarations private to this file
 */
static int eib_data_setup_cqs(eib_t *, eib_vnic_t *);
static int eib_data_setup_ud_channel(eib_t *, eib_vnic_t *);
static void eib_data_setup_lso(eib_wqe_t *, mblk_t *, uint32_t,
    eib_ether_hdr_t *);
static int eib_data_prepare_sgl(eib_vnic_t *, eib_wqe_t *, mblk_t *);
static int eib_data_is_mcast_pkt_ok(eib_vnic_t *, uint8_t *, uint64_t *,
    uint64_t *);
static void eib_data_rx_comp_intr(ibt_cq_hdl_t, void *);
static void eib_data_tx_comp_intr(ibt_cq_hdl_t, void *);
static mblk_t *eib_data_rx_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
static void eib_data_tx_comp(eib_vnic_t *, eib_wqe_t *, eib_chan_t *);
static void eib_data_err_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
static void eib_rb_data_setup_cqs(eib_t *, eib_vnic_t *);
static void eib_rb_data_setup_ud_channel(eib_t *, eib_vnic_t *);
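
/*
 * eib_data_create_qp() sets up the datapath for a vnic: it allocates
 * the vnic's data channel, initially borrows the admin channel's pkey
 * parameters (re-associated later when the gateway's login ack comes
 * in), and creates the tx/rx CQs and the UD channel.
 */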
int
eib_data_create_qp(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	eib_chan_t *chan = NULL;

	/*
	 * Allocate a eib_chan_t to store stuff about this vnic's data qp
	 * and initialize it with default admin qp pkey parameters. We'll
	 * re-associate this with the pkey we receive from the gw once we
	 * receive the login ack.
	 */
	vnic->vn_data_chan = eib_chan_init();

	chan = vnic->vn_data_chan;
	chan->ch_pkey = ss->ei_admin_chan->ch_pkey;
	chan->ch_pkey_ix = ss->ei_admin_chan->ch_pkey_ix;
	chan->ch_vnic_inst = vnic->vn_instance;

	/*
	 * Setup tx/rx CQs and completion handlers
	 */
	if (eib_data_setup_cqs(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_create_qp: "
		    "eib_data_setup_cqs(vn_inst=0x%x) failed",
		    vnic->vn_instance);
		*err = ENOMEM;
		goto data_create_qp_fail;
	}

	/*
	 * Setup UD channel
	 */
	if (eib_data_setup_ud_channel(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_create_qp: "
		    "eib_data_setup_ud_channel(vn_inst=0x%x) failed",
		    vnic->vn_instance);
		*err = ENOMEM;
		goto data_create_qp_fail;
	}

	return (EIB_E_SUCCESS);

data_create_qp_fail:
	eib_rb_data_create_qp(ss, vnic);
	return (EIB_E_FAILURE);
}
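
/*
 * Softint handler for data rx completions: re-arm the rx CQ, poll the
 * work completions in batches, chain up the good packets for mac_rx()
 * and, if we've handled EIB_MAX_RX_PKTS_ONINTR packets already,
 * retrigger ourselves and finish the rest later.
 */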
/*ARGSUSED*/
uint_t
eib_data_rx_comp_handler(caddr_t arg1, caddr_t arg2)
{
	eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_stats_t *stats = ss->ei_stats;
	eib_wqe_t *wqe;
	ibt_wc_t *wc;
	ibt_status_t ret;
	mblk_t *mp;
	mblk_t *head = NULL;
	mblk_t *tail = NULL;
	uint_t pkts_per_call = 0;
	uint_t ipkts;
	uint_t rbytes;
	uint_t num_wc;
	uint_t polled;
	int i;

	/*
	 * Re-arm the rx notification callback before we start polling
	 * the completion queue. There's nothing much we can do if the
	 * enable_cq_notify fails - we issue a warning and move on.
	 */
	ret = ibt_enable_cq_notify(chan->ch_rcv_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp_handler: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
	}

	/*
	 * We don't want to be stuck in receive processing for too long without
	 * giving others a chance.
	 */
	num_wc = (chan->ch_rcv_cq_sz < EIB_MAX_RX_PKTS_ONINTR) ?
	    chan->ch_rcv_cq_sz : EIB_MAX_RX_PKTS_ONINTR;

	/*
	 * Handle rx completions
	 */
	while ((ret = ibt_poll_cq(chan->ch_rcv_cq_hdl, chan->ch_rcv_wc,
	    num_wc, &polled)) == IBT_SUCCESS) {

		head = tail = NULL;
		ipkts = rbytes = 0;

		for (wc = chan->ch_rcv_wc, i = 0; i < polled; i++, wc++) {
			wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;

			ASSERT(EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX);

			/*
			 * Clear the posted-to-hca flag and reduce the number
			 * of posted-rwqes count
			 */
			wqe->qe_info &= (~EIB_WQE_FLG_POSTED_TO_HCA);
			eib_rsrc_decr_posted_rwqe(ss, chan);

			rbytes += wc->wc_bytes_xfer;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				EIB_INCR_COUNTER(&stats->st_ierrors);
				eib_data_err_comp(vnic, wqe, wc);
				continue;
			}

			ipkts++;
			mp = eib_data_rx_comp(vnic, wqe, wc);
			if (mp == NULL)
				continue;

			/*
			 * Add this mp to the list to
			 * send it to the nw layer. Note
			 * that the wqe could've been
			 * returned to the pool if we're
			 * running low, so don't process
			 * wqe after this point.
			 */
			if (head)
				tail->b_next = mp;
			else
				head = mp;
			tail = mp;
		}

		/*
		 * We reduce the number of atomic updates to key statistics
		 * by pooling them here, once per ibt_poll_cq(). The accuracy
		 * and consistency of the published statistics within a cq
		 * polling cycle will be compromised a little bit, but that
		 * should be ok, given that we probably gain a little bit by
		 * not having to do these atomic operations per packet.
		 */
		EIB_UPDATE_COUNTER(&stats->st_rbytes, rbytes);
		EIB_UPDATE_COUNTER(&stats->st_ipkts, ipkts);

		pkts_per_call += ipkts;

		if (head) {
			mac_rx(ss->ei_mac_hdl, NULL, head);
		}

		/*
		 * If we have processed too many packets in one attempt, we'll
		 * have to come back here later.
		 */
		if (pkts_per_call >= EIB_MAX_RX_PKTS_ONINTR) {
			(void) ddi_intr_trigger_softint(vnic->vn_data_rx_si_hdl,
			    NULL);
			break;
		}
	}

	return (DDI_INTR_CLAIMED);
}
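
/*
 * Softint handler for data tx completions: re-arm the tx CQ, poll the
 * work completions and release the resources tied to each completed swqe.
 */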
/*ARGSUSED*/
uint_t
eib_data_tx_comp_handler(caddr_t arg1, caddr_t arg2)
{
	eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_stats_t *stats = ss->ei_stats;
	eib_wqe_t *wqe;
	ibt_wc_t *wc;
	ibt_status_t ret;
	uint_t polled;
	int i;

	/*
	 * Re-arm the tx notification callback before we start polling
	 * the completion queue. There's nothing much we can do if the
	 * enable_cq_notify fails - we issue a warning and move on.
	 */
	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_tx_comp_handler: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
	}

	/*
	 * Handle tx completions
	 */
	while ((ret = ibt_poll_cq(chan->ch_cq_hdl, chan->ch_wc, chan->ch_cq_sz,
	    &polled)) == IBT_SUCCESS) {
		for (wc = chan->ch_wc, i = 0; i < polled; i++, wc++) {
			wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;

			ASSERT(EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_TX);

			if (wc->wc_status != IBT_WC_SUCCESS) {
				EIB_INCR_COUNTER(&stats->st_oerrors);
				eib_data_err_comp(vnic, wqe, wc);
			} else {
				eib_data_tx_comp(vnic, wqe, vnic->vn_data_chan);
			}
		}
	}

	return (DDI_INTR_CLAIMED);
}
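
/*
 * Free routine invoked when an rx mblk handed to us earlier is freed:
 * depending on who is returning the buffer and on the current state of
 * the nic and the channel, either repost the rwqe or return it to the
 * wqe pool.
 */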
void
eib_data_rx_recycle(caddr_t arg)
{
	eib_wqe_t *rwqe = (eib_wqe_t *)(void *)arg;
	eib_t *ss = rwqe->qe_pool->wp_ss;
	eib_chan_t *vn_chan;
	uint_t nic_state;
	int ret;

	/*
	 * We come here from three places - (a) from the nw layer if the
	 * rx mblk we handed to it has been done with and the nw layer is
	 * calling the freemsg() (b) from eib_data_rx_comp() if the rx
	 * completion processing discovers that the received EoIB packet
	 * has a problem and (c) from eib_data_err_comp() if we're tearing
	 * down this channel. We only need to repost the rwqe if we're
	 * being called back from the nw layer. For the other two cases,
	 * we'll simply return the rwqe to the pool. Also, since we would've
	 * already updated the ch_rx_posted counters in the rx completion
	 * handler, we don't pass the chan pointer to eib_rsrc_return_rwqe
	 * from within this routine.
	 */
	if ((rwqe->qe_info & EIB_WQE_FLG_WITH_NW) == 0) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	rwqe->qe_info &= (~EIB_WQE_FLG_WITH_NW);

	/*
	 * If the buffers are being returned by nw layer after a long
	 * time, this eoib instance could've even been stopped by now.
	 * If so, simply return the rwqe to the pool.
	 */
	nic_state = eib_mac_get_nic_state(ss);
	if ((nic_state & EIB_NIC_STARTED) != EIB_NIC_STARTED) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	/*
	 * Or it could've taken even longer, and the nic has even been
	 * restarted. Only thing we can do is to make sure that the
	 * original channel pointer we passed corresponds to what's in
	 * the instance of the vnic currently.
	 */
	vn_chan = eib_vnic_get_data_chan(ss, rwqe->qe_vnic_inst);
	if (vn_chan == NULL || vn_chan != rwqe->qe_chan) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	/*
	 * Try to repost the rwqe if we're not tearing down this channel
	 */
	if (vn_chan->ch_tear_down) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
	} else {
		ret = eib_chan_post_recv(ss, vn_chan, rwqe);
		if (ret != EIB_E_SUCCESS) {
			if (rwqe->qe_mp)
				freemsg(rwqe->qe_mp);
			else
				eib_rsrc_return_rwqe(ss, rwqe, NULL);
		}
	}
}
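
/*
 * Queue a prepared swqe on the channel's tx list and, unless another
 * thread is already busy posting for this channel, drain the list by
 * posting the send wrs to the hca in batches of up to
 * EIB_MAX_POST_MULTIPLE.
 */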
void
eib_data_post_tx(eib_vnic_t *vnic, eib_wqe_t *swqe)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_t *ss = vnic->vn_ss;
	eib_stats_t *stats = vnic->vn_ss->ei_stats;
	ibt_send_wr_t wrs[EIB_MAX_POST_MULTIPLE];
	eib_wqe_t *wqes[EIB_MAX_POST_MULTIPLE];
	eib_wqe_t *elem;
	ibt_status_t ret;
	uint_t n_wrs;
	uint_t n_posted;
	uint_t n_failed;
	uint_t total_failed = 0;
	uint_t i;

	/*
	 * See if we have room for this wqe and then add it to the
	 * list of tx wrs to post in this channel.
	 */
	mutex_enter(&chan->ch_tx_lock);

	if ((chan->ch_tx_posted + 1) >= (chan->ch_max_swqes - 1)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_post_tx: "
		    "too many swqes posted already, posted=0x%lx, "
		    "max=0x%lx", chan->ch_tx_posted, chan->ch_max_swqes);
		mutex_exit(&chan->ch_tx_lock);
		return;
	}

	swqe->qe_nxt_post = NULL;
	if (chan->ch_tx) {
		chan->ch_tx_tail->qe_nxt_post = swqe;
	} else {
		chan->ch_tx = swqe;
	}
	chan->ch_tx_tail = swqe;
	chan->ch_tx_posted++;		/* pre-increment */

	/*
	 * If someone's already posting tx wqes in this channel, let
	 * them post ours as well.
	 */
	if (chan->ch_tx_busy == B_TRUE) {
		mutex_exit(&chan->ch_tx_lock);
		return;
	}
	chan->ch_tx_busy = B_TRUE;

	while (chan->ch_tx) {
		/*
		 * Post EIB_MAX_POST_MULTIPLE wrs at a time
		 */
		for (n_wrs = 0, elem = chan->ch_tx;
		    (elem) && (n_wrs < EIB_MAX_POST_MULTIPLE);
		    elem = elem->qe_nxt_post, n_wrs++) {
			wqes[n_wrs] = elem;
			wrs[n_wrs] = (elem->qe_wr).send;
		}
		chan->ch_tx = elem;
		if (elem == NULL)
			chan->ch_tx_tail = NULL;

		mutex_exit(&chan->ch_tx_lock);

		/*
		 * If multiple wrs posting fails for some reason, we'll try
		 * posting the unposted ones one by one. If even that fails,
		 * we'll release any mappings/buffers/mblks associated with
		 * this wqe and return it to the pool.
		 */
		n_posted = n_failed = 0;
		ret = ibt_post_send(chan->ch_chan, wrs, n_wrs, &n_posted);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_post_tx: "
			    "ibt_post_send(n_wrs=0x%lx, n_posted=0x%lx) "
			    "failed, ret=%d", n_wrs, n_posted, ret);

			for (i = n_posted; i < n_wrs; i++) {
				ret = ibt_post_send(chan->ch_chan, &wrs[i],
				    1, NULL);
				if (ret != IBT_SUCCESS) {
					n_failed++;
					eib_data_tx_comp(vnic, wqes[i], chan);

					EIB_DPRINTF_WARN(ss->ei_instance,
					    "eib_data_post_tx: "
					    "ibt_post_send(n_wrs=1) failed, "
					    "ret=%d", ret);
				}
			}
		}
		total_failed += n_failed;

		mutex_enter(&chan->ch_tx_lock);
	}

	chan->ch_tx_busy = B_FALSE;
	mutex_exit(&chan->ch_tx_lock);

	/*
	 * If we failed to post something, update error stats
	 */
	if (total_failed) {
		EIB_UPDATE_COUNTER(&stats->st_oerrors, total_failed);
	}
}
void
eib_data_parse_ether_hdr(mblk_t *mp, eib_ether_hdr_t *evh)
{
	struct ether_vlan_header *vl_hdr;
	struct ether_header *hdr;

	/*
	 * Assume that the ether header (with or without vlan tag) is
	 * contained in one fragment
	 */
	hdr = (struct ether_header *)(void *)mp->b_rptr;
	vl_hdr = (struct ether_vlan_header *)(void *)mp->b_rptr;

	evh->eh_ether_type = ntohs(hdr->ether_type);
	if (evh->eh_ether_type != ETHERTYPE_VLAN) {
		evh->eh_tagless = 1;
		evh->eh_vlan = 0;
		ether_copy((void *)hdr->ether_dhost.ether_addr_octet,
		    (void *)evh->eh_dmac);
		ether_copy((void *)hdr->ether_shost.ether_addr_octet,
		    (void *)evh->eh_smac);
	} else {
		evh->eh_ether_type = ntohs(vl_hdr->ether_type);
		evh->eh_tagless = 0;
		evh->eh_vlan = VLAN_ID(ntohs(vl_hdr->ether_tci));
		ether_copy((void *)vl_hdr->ether_dhost.ether_addr_octet,
		    (void *)evh->eh_dmac);
		ether_copy((void *)vl_hdr->ether_shost.ether_addr_octet,
		    (void *)evh->eh_smac);
	}
}
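
/*
 * Look up the vnic serving a {mac, vlan} tuple among the currently
 * active vnics, and also report whether an earlier attempt to create
 * such a vnic has already failed.
 */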
int
eib_data_lookup_vnic(eib_t *ss, uint8_t *mac, uint16_t vlan, eib_vnic_t **vnicp,
    boolean_t *failed)
{
	eib_vnic_t *vnic;
	eib_vnic_req_t *vrq;
	uint8_t *vn_mac;
	uint16_t vn_vlan;
	uint64_t av;
	int inst = 0;

	if (mac == NULL)
		return (EIB_E_FAILURE);

	/*
	 * For now, a simple search (but only what we've allocated). Note that
	 * if we're in the process of creating a vnic, the instance might've
	 * been allocated, but the vnic entry would be NULL.
	 */
	mutex_enter(&ss->ei_vnic_lock);
	av = ss->ei_active_vnics;
	while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
		if ((vnic = ss->ei_vnic[inst]) != NULL) {
			vn_mac = vnic->vn_login_data.ld_assigned_mac;
			vn_vlan = vnic->vn_login_data.ld_assigned_vlan;

			if ((vn_vlan == vlan) &&
			    (bcmp(vn_mac, mac, ETHERADDRL) == 0)) {
				if (vnicp) {
					*vnicp = vnic;
				}
				mutex_exit(&ss->ei_vnic_lock);
				return (EIB_E_SUCCESS);
			}
		}

		av &= (~((uint64_t)1 << inst));
	}
	mutex_exit(&ss->ei_vnic_lock);

	/*
	 * If we haven't been able to locate a vnic for this {mac,vlan} tuple,
	 * see if we've already failed a creation request for this vnic, and
	 * return that information.
	 */
	mutex_enter(&ss->ei_vnic_req_lock);
	if (failed) {
		*failed = B_FALSE;
		for (vrq = ss->ei_failed_vnic_req; vrq; vrq = vrq->vr_next) {
			if ((vrq->vr_vlan == vlan) &&
			    (bcmp(vrq->vr_mac, mac, ETHERADDRL) == 0)) {
				*failed = B_TRUE;
			}
		}
	}
	mutex_exit(&ss->ei_vnic_req_lock);

	return (EIB_E_FAILURE);
}
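
/*
 * Prepare an outbound frame for transmission: apply the LSO and
 * checksum-offload hints supplied by the mac layer to the swqe and
 * build the sgl for the send work request.
 */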
int
eib_data_prepare_frame(eib_vnic_t *vnic, eib_wqe_t *swqe, mblk_t *mp,
    eib_ether_hdr_t *evh)
{
	uint32_t mss;
	uint32_t lsoflags;
	uint32_t hckflags;

	/*
	 * The swqe defaults are set to use the regular ud work request
	 * member and the IBT_WRC_SEND opcode, so we don't need to do
	 * anything here if this isn't an LSO packet.
	 */
	mac_lso_get(mp, &mss, &lsoflags);
	if ((lsoflags & HW_LSO) == HW_LSO)
		eib_data_setup_lso(swqe, mp, mss, evh);

	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) {
		swqe->qe_wr.send.wr_flags |= IBT_WR_SEND_CKSUM;
	} else {
		swqe->qe_wr.send.wr_flags &= (~IBT_WR_SEND_CKSUM);
	}

	if (eib_data_prepare_sgl(vnic, swqe, mp) != 0)
		return (EIB_E_FAILURE);

	swqe->qe_mp = mp;

	return (EIB_E_SUCCESS);
}
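
/*
 * Rollback for eib_data_create_qp()
 */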
void
eib_rb_data_create_qp(eib_t *ss, eib_vnic_t *vnic)
{
	eib_rb_data_setup_ud_channel(ss, vnic);

	eib_rb_data_setup_cqs(ss, vnic);

	eib_chan_fini(vnic->vn_data_chan);
	vnic->vn_data_chan = NULL;
}
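
/*
 * Allocate and initialize the data tx and rx completion queues, their
 * work completion arrays, the softints and the CQ handlers.
 */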
static int
eib_data_setup_cqs(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_cq_attr_t cq_attr;
	ibt_status_t ret;
	uint_t snd_sz;
	uint_t rcv_sz;
	int rv;

	/*
	 * Allocate send completion queue. Note that we've already verified
	 * that cp_max_swqe and cp_max_rwqe meet the max cq size requirements
	 * of the hca.
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	cq_attr.cq_size = ss->ei_caps->cp_max_swqe + 1;

	ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_cq_hdl, &snd_sz);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_alloc_cq(snd_cq_sz=0x%lx) failed, ret=%d",
		    cq_attr.cq_size, ret);
		goto setup_data_cqs_fail;
	}
	ret = ibt_modify_cq(chan->ch_cq_hdl, EIB_TX_COMP_COUNT,
	    EIB_TX_COMP_USEC, 0);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_modify_cq(snd_comp_count=0x%lx, snd_comp_usec=0x%lx) "
		    "failed, ret=%d",
		    EIB_TX_COMP_COUNT, EIB_TX_COMP_USEC, ret);
	}

	/*
	 * Allocate receive completion queue
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	cq_attr.cq_size = ss->ei_caps->cp_max_rwqe + 1;

	ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_rcv_cq_hdl,
	    &rcv_sz);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_alloc_cq(rcv_cq_sz=0x%lx) failed, ret=%d",
		    cq_attr.cq_size, ret);
		goto setup_data_cqs_fail;
	}
	ret = ibt_modify_cq(chan->ch_rcv_cq_hdl, EIB_RX_COMP_COUNT,
	    EIB_RX_COMP_USEC, 0);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_modify_cq(rcv_comp_count=0x%lx, rcv_comp_usec=0x%lx) "
		    "failed, ret=%d",
		    EIB_RX_COMP_COUNT, EIB_RX_COMP_USEC, ret);
	}

	/*
	 * Set up parameters for collecting tx and rx completion information
	 */
	chan->ch_cq_sz = snd_sz;
	chan->ch_wc = kmem_zalloc(sizeof (ibt_wc_t) * snd_sz, KM_SLEEP);
	chan->ch_rcv_cq_sz = rcv_sz;
	chan->ch_rcv_wc = kmem_zalloc(sizeof (ibt_wc_t) * rcv_sz, KM_SLEEP);

	/*
	 * Set up the vnic's data tx completion queue handler and allocate
	 * a softint for it as well.
	 */
	if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_data_tx_si_hdl,
	    EIB_SOFTPRI_DATA, eib_data_tx_comp_handler, vnic)) != DDI_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ddi_intr_add_softint() failed for data tx qp, ret=%d", rv);
		goto setup_data_cqs_fail;
	}
	ibt_set_cq_handler(chan->ch_cq_hdl, eib_data_tx_comp_intr, vnic);
	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_enable_cq_notify() failed for tx cq, ret=%d", ret);
		goto setup_data_cqs_fail;
	}

	/*
	 * And then the data rx completion queue handler
	 */
	if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_data_rx_si_hdl,
	    EIB_SOFTPRI_DATA, eib_data_rx_comp_handler, vnic)) != DDI_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ddi_intr_add_softint() failed for data rx qp, ret=%d", rv);
		goto setup_data_cqs_fail;
	}
	ibt_set_cq_handler(chan->ch_rcv_cq_hdl, eib_data_rx_comp_intr, vnic);
	ret = ibt_enable_cq_notify(chan->ch_rcv_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_enable_cq_notify() failed for rx cq, ret=%d", ret);
		goto setup_data_cqs_fail;
	}

	return (EIB_E_SUCCESS);

setup_data_cqs_fail:
	eib_rb_data_setup_cqs(ss, vnic);
	return (EIB_E_FAILURE);
}
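
/*
 * Allocate the UD channel for this vnic's data qp using the tx/rx CQs
 * set up earlier, and record the channel attributes we'll need later
 * (qpn, queue depths, rwqe bucket size and so on).
 */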
static int
eib_data_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_ud_chan_alloc_args_t alloc_attr;
	ibt_ud_chan_query_attr_t query_attr;
	ibt_status_t ret;

	bzero(&alloc_attr, sizeof (ibt_ud_chan_alloc_args_t));
	bzero(&query_attr, sizeof (ibt_ud_chan_query_attr_t));

	alloc_attr.ud_flags = IBT_ALL_SIGNALED;
	if (ss->ei_caps->cp_resv_lkey_capab)
		alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
	if (ss->ei_caps->cp_lso_maxlen)
		alloc_attr.ud_flags |= IBT_USES_LSO;

	alloc_attr.ud_hca_port_num = ss->ei_props->ep_port_num;
	alloc_attr.ud_pkey_ix = chan->ch_pkey_ix;
	alloc_attr.ud_sizes.cs_sq = ss->ei_caps->cp_max_swqe;
	alloc_attr.ud_sizes.cs_rq = ss->ei_caps->cp_max_rwqe;
	alloc_attr.ud_sizes.cs_sq_sgl = ss->ei_caps->cp_max_sgl;
	alloc_attr.ud_sizes.cs_rq_sgl = 1;
	alloc_attr.ud_sizes.cs_inline = 0;

	alloc_attr.ud_qkey = EIB_DATA_QKEY;
	alloc_attr.ud_scq = chan->ch_cq_hdl;
	alloc_attr.ud_rcq = chan->ch_rcv_cq_hdl;
	alloc_attr.ud_pd = ss->ei_pd_hdl;

	ret = ibt_alloc_ud_channel(ss->ei_hca_hdl, IBT_ACHAN_NO_FLAGS,
	    &alloc_attr, &chan->ch_chan, NULL);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_ud_channel: "
		    "ibt_alloc_ud_channel(port=0x%x, pkey_ix=0x%x, "
		    "cs_sq=0x%lx, cs_rq=0x%lx, sq_sgl=0x%lx) failed, ret=%d",
		    alloc_attr.ud_hca_port_num, chan->ch_pkey_ix,
		    alloc_attr.ud_sizes.cs_sq, alloc_attr.ud_sizes.cs_rq,
		    alloc_attr.ud_sizes.cs_sq_sgl, ret);

		goto setup_data_ud_channel_fail;
	}

	ret = ibt_query_ud_channel(chan->ch_chan, &query_attr);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_ud_channel: "
		    "ibt_query_ud_channel() failed, ret=%d", ret);
		goto setup_data_ud_channel_fail;
	}

	chan->ch_qpn = query_attr.ud_qpn;
	chan->ch_max_swqes = query_attr.ud_chan_sizes.cs_sq;
	chan->ch_max_rwqes = query_attr.ud_chan_sizes.cs_rq;
	chan->ch_lwm_rwqes = chan->ch_max_rwqes >> 2;
	chan->ch_rwqe_bktsz = (chan->ch_max_rwqes < EIB_DATA_RWQE_BKT) ?
	    chan->ch_max_rwqes : EIB_DATA_RWQE_BKT;
	chan->ch_ip_hdr_align = EIB_IP_HDR_ALIGN;
	chan->ch_alloc_mp = B_TRUE;
	chan->ch_tear_down = B_FALSE;

	return (EIB_E_SUCCESS);

setup_data_ud_channel_fail:
	eib_rb_data_setup_ud_channel(ss, vnic);
	return (EIB_E_FAILURE);
}
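
/*
 * Switch a default send wqe over to LSO: locate the ethernet, IP and
 * TCP headers in the mblk chain and copy them (behind the EoIB
 * encapsulation header already present) into the wqe's lso header
 * buffer.
 */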
static void
eib_data_setup_lso(eib_wqe_t *swqe, mblk_t *mp, uint32_t mss,
    eib_ether_hdr_t *evh)
{
	ibt_wr_lso_t *lso;
	mblk_t *nmp;
	uint8_t *dst;
	uintptr_t ip_start;
	uintptr_t tcp_start;
	uint_t pending;
	uint_t mblen;
	uint_t eth_hdr_len;
	uint_t ip_hdr_len;
	uint_t tcp_hdr_len;

	/*
	 * When the swqe was grabbed, it would've had its wr_opcode and
	 * wr.ud.udwr_dest set to default values. Since we're now going
	 * to use LSO, we need to change these.
	 */
	swqe->qe_wr.send.wr_opcode = IBT_WRC_SEND_LSO;
	lso = &(swqe->qe_wr.send.wr.ud_lso);
	lso->lso_ud_dest = swqe->qe_dest;
	lso->lso_mss = mss;

	/*
	 * Details on the ethernet header in the mp is already known to us
	 */
	eth_hdr_len = (evh->eh_tagless) ? (sizeof (struct ether_header)) :
	    (sizeof (struct ether_vlan_header));

	/*
	 * Calculate the LSO header size and set it in the UD LSO structure.
	 * Note that the only assumption we make is that each of the Ethernet,
	 * IP and TCP headers will be contained in a single mblk fragment;
	 * together, the headers may span multiple mblk fragments. Note also
	 * that since the EoIB encapsulation header is not part of the message
	 * block we receive, we'll need to account space for inserting it later.
	 */
	nmp = mp;
	ip_start = (uintptr_t)(nmp->b_rptr) + eth_hdr_len;
	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
		ip_start = (uintptr_t)nmp->b_cont->b_rptr
		    + (ip_start - (uintptr_t)(nmp->b_wptr));
		nmp = nmp->b_cont;
	}
	ip_hdr_len = IPH_HDR_LENGTH((ipha_t *)ip_start);

	tcp_start = ip_start + ip_hdr_len;
	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
		nmp = nmp->b_cont;
	}
	tcp_hdr_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);

	/*
	 * Since the passed mp fragment never contains the EoIB encapsulation
	 * header, we always have to copy the lso header. Sigh.
	 */
	lso->lso_hdr = swqe->qe_payload_hdr;
	lso->lso_hdr_sz = EIB_ENCAP_HDR_SZ + eth_hdr_len +
	    ip_hdr_len + tcp_hdr_len;

	/*
	 * We already have the EoIB encapsulation header written at the
	 * start of wqe->qe_payload_hdr during swqe acquisition. Only
	 * copy the remaining headers.
	 */
	dst = lso->lso_hdr + EIB_ENCAP_HDR_SZ;
	pending = lso->lso_hdr_sz - EIB_ENCAP_HDR_SZ;

	for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
		mblen = MBLKL(nmp);
		if (pending > mblen) {
			bcopy(nmp->b_rptr, dst, mblen);
			dst += mblen;
			pending -= mblen;
		} else {
			bcopy(nmp->b_rptr, dst, pending);
			break;
		}
	}
}
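
/*
 * Build the sgl for a send wqe. Depending on the packet size and the
 * hca capabilities, the payload is either iov-mapped using the
 * reserved lkey, copied into the wqe's own buffer, or copied into a
 * set of pre-mapped LSO buffers.
 */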
static int
eib_data_prepare_sgl(eib_vnic_t *vnic, eib_wqe_t *swqe, mblk_t *mp)
{
	eib_t *ss = vnic->vn_ss;
	eib_stats_t *stats = vnic->vn_ss->ei_stats;
	ibt_iov_t iov_arr[EIB_MAX_SGL];
	ibt_iov_attr_t iov_attr;
	ibt_wr_ds_t *sgl;
	ibt_status_t ret;
	mblk_t *nmp;
	mblk_t *data_mp;
	uchar_t *bufp;
	size_t blksize;
	size_t skip;
	size_t avail;
	size_t frag_len;
	uint_t lsohdr_sz;
	uint_t pktsz;
	uint_t pending_hdr;
	uint_t nblks;
	uint_t i;

	/*
	 * Let's skip ahead to the TCP data if this is LSO. Note that while
	 * the lso header size in the swqe includes the EoIB encapsulation
	 * header size, that encapsulation header itself won't be found in
	 * the mblk.
	 */
	lsohdr_sz = (swqe->qe_wr.send.wr_opcode == IBT_WRC_SEND) ? 0 :
	    swqe->qe_wr.send.wr.ud_lso.lso_hdr_sz;

	data_mp = mp;
	pending_hdr = 0;
	if (lsohdr_sz) {
		pending_hdr = lsohdr_sz - EIB_ENCAP_HDR_SZ;
		for (nmp = mp; nmp; nmp = nmp->b_cont) {
			frag_len =
			    (uintptr_t)nmp->b_wptr - (uintptr_t)nmp->b_rptr;
			if (frag_len > pending_hdr)
				break;
			pending_hdr -= frag_len;
		}
		data_mp = nmp;	/* start of data past lso header */
		ASSERT(data_mp != NULL);
	}

	/*
	 * If this is an LSO packet, we want pktsz to hold the size of the
	 * data following the eoib/ethernet/tcp/ip headers. If this is a
	 * non-LSO packet, we want pktsz to refer to the size of the entire
	 * packet with all the headers, and nblks to hold the number of
	 * mappings we'll need to iov map this (for reserved lkey request).
	 */
	if (lsohdr_sz == 0) {
		nblks = 1;
		pktsz = EIB_ENCAP_HDR_SZ;
	} else {
		nblks = 0;
		pktsz = 0;
	}
	for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
		pktsz += MBLKL(nmp);
		nblks++;
	}
	pktsz -= pending_hdr;

	EIB_UPDATE_COUNTER(&stats->st_obytes, pktsz);
	EIB_INCR_COUNTER(&stats->st_opkts);

	/*
	 * We only do ibt_map_mem_iov() if the pktsz is above the tx copy
	 * threshold and if the number of mp fragments is less than the
	 * maximum acceptable.
	 */
	if ((ss->ei_caps->cp_resv_lkey_capab) && (pktsz > EIB_TX_COPY_THRESH) &&
	    (nblks < ss->ei_caps->cp_hiwm_sgl)) {

		iov_attr.iov_as = NULL;
		iov_attr.iov = iov_arr;
		iov_attr.iov_buf = NULL;
		iov_attr.iov_list_len = nblks;
		iov_attr.iov_wr_nds = ss->ei_caps->cp_max_sgl;
		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
		iov_attr.iov_flags = IBT_IOV_SLEEP;

		i = 0;
		if (lsohdr_sz == 0) {
			iov_arr[i].iov_addr = (caddr_t)swqe->qe_payload_hdr;
			iov_arr[i].iov_len = EIB_ENCAP_HDR_SZ;
			i++;
		}
		for (nmp = data_mp; i < nblks; i++, nmp = nmp->b_cont) {
			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
			iov_arr[i].iov_len = MBLKL(nmp);
			if (nmp == data_mp) {
				iov_arr[i].iov_addr += pending_hdr;
				iov_arr[i].iov_len -= pending_hdr;
			}
		}
		swqe->qe_info |= EIB_WQE_FLG_BUFTYPE_MAPPED;
		swqe->qe_wr.send.wr_sgl = swqe->qe_big_sgl;

		ret = ibt_map_mem_iov(ss->ei_hca_hdl, &iov_attr,
		    &swqe->qe_wr, &swqe->qe_iov_hdl);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_data_prepare_sgl: "
			    "ibt_map_mem_iov(nblks=0x%lx) failed, ret=%d, "
			    "attempting to use copy path", nblks, ret);
			goto prepare_sgl_copy_path;
		}

		return (EIB_E_SUCCESS);
	}

prepare_sgl_copy_path:
	if (pktsz <= swqe->qe_bufsz) {
		swqe->qe_wr.send.wr_nds = 1;
		swqe->qe_wr.send.wr_sgl = &swqe->qe_sgl;
		swqe->qe_sgl.ds_len = pktsz;

		/*
		 * Even though this is the copy path for transfers less than
		 * qe_bufsz, it could still be an LSO packet. If so, we only
		 * have to write the data following all the headers into the
		 * work request buffer, since we'll be sending the lso header
		 * itself separately. If this is not an LSO send (but pkt size
		 * greater than mtu, say for a jumbo frame), then we need
		 * to write all the headers including EoIB encapsulation,
		 * into the work request buffer.
		 */
		bufp = (uchar_t *)(uintptr_t)swqe->qe_sgl.ds_va;
		if (lsohdr_sz == 0) {
			*(uint32_t *)((void *)bufp) = htonl(EIB_TX_ENCAP_HDR);
			bufp += EIB_ENCAP_HDR_SZ;
		}
		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
			blksize = MBLKL(nmp) - pending_hdr;
			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
			bufp += blksize;
			pending_hdr = 0;
		}

		/*
		 * If the ethernet frame we're going to send is less than
		 * ETHERMIN, pad up the buffer to ETHERMIN (with zeros)
		 */
		if ((pktsz + lsohdr_sz) < (ETHERMIN + EIB_ENCAP_HDR_SZ)) {
			bzero(bufp, (ETHERMIN + EIB_ENCAP_HDR_SZ) -
			    (pktsz + lsohdr_sz));
			swqe->qe_sgl.ds_len = ETHERMIN + EIB_ENCAP_HDR_SZ;
		}
		return (EIB_E_SUCCESS);
	}

	/*
	 * Copy path for transfers greater than swqe->qe_bufsz
	 */
	swqe->qe_wr.send.wr_sgl = swqe->qe_big_sgl;
	if (eib_rsrc_grab_lsobufs(ss, pktsz, swqe->qe_wr.send.wr_sgl,
	    &(swqe->qe_wr.send.wr_nds)) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_prepare_sgl: "
		    "eib_rsrc_grab_lsobufs() failed");
		return (EIB_E_FAILURE);
	}
	swqe->qe_info |= EIB_WQE_FLG_BUFTYPE_LSO;

	/*
	 * Copy the larger-than-qe_buf_sz packet into a set of fixed-sized,
	 * pre-mapped LSO buffers. Note that we might need to skip part of
	 * the LSO header in the first fragment as before.
	 */
	nmp = data_mp;
	skip = pending_hdr;
	for (i = 0; i < swqe->qe_wr.send.wr_nds; i++) {
		sgl = swqe->qe_wr.send.wr_sgl + i;
		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
		avail = EIB_LSO_BUFSZ;

		/*
		 * If this is a non-LSO packet (perhaps a jumbo frame?)
		 * we may still need to prefix the EoIB header in the
		 * first buffer.
		 */
		if ((i == 0) && (lsohdr_sz == 0)) {
			*(uint32_t *)((void *)bufp) = htonl(EIB_TX_ENCAP_HDR);
			bufp += EIB_ENCAP_HDR_SZ;
			avail -= EIB_ENCAP_HDR_SZ;
		}

		while (nmp && avail) {
			blksize = MBLKL(nmp) - skip;
			if (blksize > avail) {
				bcopy(nmp->b_rptr + skip, bufp, avail);
				skip += avail;
				avail = 0;
			} else {
				bcopy(nmp->b_rptr + skip, bufp, blksize);
				skip = 0;
				bufp += blksize;
				avail -= blksize;
				nmp = nmp->b_cont;
			}
		}
	}

	return (EIB_E_SUCCESS);
}
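
/*
 * Multicast filter check for received packets. Currently this only
 * updates the broadcast/multicast counters and accepts the packet.
 */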
static int
eib_data_is_mcast_pkt_ok(eib_vnic_t *vnic, uint8_t *macaddr, uint64_t *brdcst,
    uint64_t *multicst)
{
	/*
	 * If the dmac is a broadcast packet, let it through. Otherwise, either
	 * we should be in promiscuous mode or the dmac should be in our list of
	 * joined multicast addresses. Currently we only update the stat
	 * counters and always let things through.
	 */
	if (bcmp(macaddr, eib_broadcast_mac, ETHERADDRL) == 0)
		EIB_INCR_COUNTER(brdcst);
	else
		EIB_INCR_COUNTER(multicst);

	return (1);
}
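
/*
 * CQ handler for data rx completions: simply trigger the rx softint.
 */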
static void
eib_data_rx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg)
{
	eib_vnic_t *vnic = arg;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_t *ss = vnic->vn_ss;

	if (cq_hdl != chan->ch_rcv_cq_hdl) {
		EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_data_rx_comp_intr: "
		    "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), "
		    "ignoring completion", cq_hdl, chan->ch_cq_hdl);
		return;
	}

	ASSERT(vnic->vn_data_rx_si_hdl != NULL);

	(void) ddi_intr_trigger_softint(vnic->vn_data_rx_si_hdl, NULL);
}
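
/*
 * CQ handler for data tx completions: simply trigger the tx softint.
 */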
static void
eib_data_tx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg)
{
	eib_vnic_t *vnic = arg;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_t *ss = vnic->vn_ss;

	if (cq_hdl != chan->ch_cq_hdl) {
		EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_data_tx_comp_intr: "
		    "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), "
		    "ignoring completion", cq_hdl, chan->ch_cq_hdl);
		return;
	}

	ASSERT(vnic->vn_data_tx_si_hdl != NULL);

	(void) ddi_intr_trigger_softint(vnic->vn_data_tx_si_hdl, NULL);
}
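
/*
 * Process a single rx work completion: validate the EoIB encapsulation
 * header, checksum status and destination address, and return an mblk
 * ready to be handed to the mac layer (or NULL if the packet was
 * dropped).
 */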
static mblk_t *
eib_data_rx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc)
{
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_login_data_t *ld = &vnic->vn_login_data;
	eib_stats_t *stats = ss->ei_stats;
	eib_ether_hdr_t evh;
	mblk_t *mp;
	boolean_t allocd_mp = B_FALSE;
	uint32_t ec_hdr;
	uint_t ec_sign;
	uint_t ec_ver;
	uint_t ec_tu_cs;
	uint_t ec_ip_cs;

	/*
	 * Before we process this mblk and send it up to network layer, see
	 * if we're running low on rwqes in the wqe pool. If so, allocate a
	 * new mblk, copy the received data into it and send it up (and return
	 * the current rwqe back to the pool immediately by calling freemsg()
	 * on the original mblk).
	 */
	if (!eib_rsrc_rxpool_low(wqe)) {
		mp = wqe->qe_mp;
	} else {
		if ((mp = allocb(wc->wc_bytes_xfer, BPRI_HI)) != NULL) {
			bcopy(wqe->qe_mp->b_rptr, mp->b_rptr,
			    wc->wc_bytes_xfer);
			freemsg(wqe->qe_mp);
			allocd_mp = B_TRUE;
		} else {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
			    "wqe level below watermark, dropping rx pkt");
			EIB_INCR_COUNTER(&stats->st_norcvbuf);
			freemsg(wqe->qe_mp);
			return (NULL);
		}
	}

	/*
	 * Adjust write pointer depending on how much data came in. Note that
	 * since the nw layer will expect us to hand over the mp with the
	 * ethernet header starting at mp->b_rptr, update the b_rptr as well.
	 */
	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;

	/*
	 * We have a problem if this really happens!
	 */
	if (mp->b_next != NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "received packet's b_next not NULL, possible dup from cq");
		mp->b_next = NULL;
	}

	/*
	 * Drop loopback packets ?
	 */
	if ((wc->wc_slid == ss->ei_props->ep_blid) &&
	    (wc->wc_qpn == chan->ch_qpn)) {
		goto data_rx_comp_fail;
	}

	mp->b_rptr += EIB_GRH_SZ;

	/*
	 * Since the recv buffer has been aligned for IP header to start on
	 * a word boundary, it is safe to say that the EoIB and ethernet
	 * headers won't start on a word boundary.
	 */
	bcopy(mp->b_rptr, &ec_hdr, EIB_ENCAP_HDR_SZ);

	/*
	 * Check EoIB signature and version
	 */
	ec_hdr = ntohl(ec_hdr);

	ec_sign = (ec_hdr >> EIB_ENCAP_SIGN_SHIFT) & EIB_ENCAP_SIGN_MASK;
	if (ec_sign != EIB_EH_SIGNATURE) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "EoIB encapsulation header signature (0x%lx) unknown",
		    ec_sign);
		goto data_rx_comp_fail;
	}

	ec_ver = (ec_hdr >> EIB_ENCAP_VER_SHIFT) & EIB_ENCAP_VER_MASK;
	if (ec_ver != EIB_EH_VERSION) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "EoIB encapsulation header version (0x%lx) unknown",
		    ec_ver);
		goto data_rx_comp_fail;
	}

	/*
	 * Check TCP/UDP and IP checksum
	 */
	ec_tu_cs = (ec_hdr >> EIB_ENCAP_TCPCHK_SHIFT) & EIB_ENCAP_TCPCHK_MASK;
	ec_ip_cs = (ec_hdr >> EIB_ENCAP_IPCHK_SHIFT) & EIB_ENCAP_IPCHK_MASK;

	if ((ec_tu_cs == EIB_EH_UDPCSUM_OK || ec_tu_cs == EIB_EH_TCPCSUM_OK) &&
	    (ec_ip_cs == EIB_EH_IPCSUM_OK)) {
		mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
	} else if (ec_tu_cs == EIB_EH_CSUM_BAD || ec_ip_cs == EIB_EH_CSUM_BAD) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "EoIB encapsulation header tcp/udp checksum (0x%lx) or "
		    "ip checksum (0x%lx) is bad", ec_tu_cs, ec_ip_cs);
	}

	/*
	 * Update the message block's b_rptr to the start of ethernet header
	 * and parse the header information
	 */
	mp->b_rptr += EIB_ENCAP_HDR_SZ;
	eib_data_parse_ether_hdr(mp, &evh);

	/*
	 * If the incoming packet is vlan-tagged, but the tag doesn't match
	 * this vnic's vlan, drop it.
	 */
	if ((evh.eh_tagless == 0) && (evh.eh_vlan != ld->ld_assigned_vlan)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "received packet's vlan unknown, expected=0x%x, got=0x%x",
		    ld->ld_assigned_vlan, evh.eh_vlan);
		goto data_rx_comp_fail;
	}

	/*
	 * Final checks to see if the unicast destination is indeed correct
	 * and to see if the multicast address is ok for us.
	 */
	if (EIB_UNICAST_MAC(evh.eh_dmac)) {
		if (bcmp(evh.eh_dmac, ld->ld_assigned_mac, ETHERADDRL) != 0) {
			uint8_t *exp;
			uint8_t *got;

			exp = ld->ld_assigned_mac;
			got = evh.eh_dmac;

			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
			    "received packet's macaddr mismatch, "
			    "expected=%x:%x:%x:%x:%x:%x, got=%x:%x:%x:%x:%x:%x",
			    exp[0], exp[1], exp[2], exp[3], exp[4], exp[5],
			    got[0], got[1], got[2], got[3], got[4], got[5]);

			goto data_rx_comp_fail;
		}
	} else {
		if (!eib_data_is_mcast_pkt_ok(vnic, evh.eh_dmac,
		    &stats->st_brdcstrcv, &stats->st_multircv)) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
			    "multicast packet not ok");
			goto data_rx_comp_fail;
		}
	}

	/*
	 * Strip ethernet FCS if present in the packet. ConnectX-2 doesn't
	 * support ethernet FCS, so this shouldn't happen anyway.
	 */
	if ((ec_hdr >> EIB_ENCAP_FCS_B_SHIFT) & 0x1) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "ethernet FCS present (ec_hdr=0%lx), ignoring",
		    ec_hdr);

		mp->b_wptr -= ETHERFCSL;
	}

	/*
	 * If this is the same mp as was in the original rwqe (i.e. we didn't
	 * do any allocb()), then mark the rwqe flag so we know that its mblk
	 * is with the network layer.
	 */
	if (!allocd_mp) {
		wqe->qe_info |= EIB_WQE_FLG_WITH_NW;
	}

	return (mp);

data_rx_comp_fail:
	freemsg(mp);
	return (NULL);
}
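
/*
 * Release everything attached to a completed swqe (iov mapping, lso
 * buffers, mblk) and return it to the wqe pool.
 */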
static void
eib_data_tx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, eib_chan_t *chan)
{
	eib_t *ss = vnic->vn_ss;
	ibt_status_t ret;

	if (wqe->qe_mp) {
		if (wqe->qe_info & EIB_WQE_FLG_BUFTYPE_MAPPED) {
			ret = ibt_unmap_mem_iov(ss->ei_hca_hdl,
			    wqe->qe_iov_hdl);
			if (ret != IBT_SUCCESS) {
				EIB_DPRINTF_WARN(ss->ei_instance,
				    "eib_data_tx_comp: "
				    "ibt_unmap_mem_iov() failed, ret=%d", ret);
			}
			wqe->qe_iov_hdl = NULL;
		} else if (wqe->qe_info & EIB_WQE_FLG_BUFTYPE_LSO) {
			eib_rsrc_return_lsobufs(ss, wqe->qe_big_sgl,
			    wqe->qe_wr.send.wr_nds);
		}
		freemsg(wqe->qe_mp);
		wqe->qe_mp = NULL;
	}

	eib_rsrc_return_swqe(ss, wqe, chan);
}
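
/*
 * Handle a work completion that finished in error.
 */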
static void
eib_data_err_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc)
{
	eib_t *ss = vnic->vn_ss;

	/*
	 * Currently, all we do is report
	 */
	switch (wc->wc_status) {
	case IBT_WC_WR_FLUSHED_ERR:
		break;

	case IBT_WC_LOCAL_CHAN_OP_ERR:
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_err_comp: "
		    "IBT_WC_LOCAL_CHAN_OP_ERR seen, wqe_info=0x%lx ",
		    wqe->qe_info);
		break;

	case IBT_WC_LOCAL_PROTECT_ERR:
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_err_comp: "
		    "IBT_WC_LOCAL_PROTECT_ERR seen, wqe_info=0x%lx ",
		    wqe->qe_info);
		break;
	}

	/*
	 * When a wc indicates error, we do not attempt to repost the
	 * rwqe but simply return it to the wqe pool. Also for rwqes,
	 * attempting to free the mblk in the wqe invokes the
	 * eib_data_rx_recycle() callback. For tx wqes, error handling
	 * is the same as successful completion handling. We still
	 * have to unmap iov/free lsobufs/free mblk and then return the
	 * swqe to the pool.
	 */
	if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX) {
		ASSERT(wqe->qe_mp != NULL);
		freemsg(wqe->qe_mp);
	} else {
		eib_data_tx_comp(vnic, wqe, vnic->vn_data_chan);
	}
}
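
/*
 * Rollback for eib_data_setup_cqs()
 */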
static void
eib_rb_data_setup_cqs(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	/*
	 * Reset any completion handlers we may have set up
	 */
	if (chan->ch_rcv_cq_hdl) {
		ibt_set_cq_handler(chan->ch_rcv_cq_hdl, NULL, NULL);
	}
	if (chan->ch_cq_hdl) {
		ibt_set_cq_handler(chan->ch_cq_hdl, NULL, NULL);
	}

	/*
	 * Remove any softints that were added
	 */
	if (vnic->vn_data_rx_si_hdl) {
		(void) ddi_intr_remove_softint(vnic->vn_data_rx_si_hdl);
		vnic->vn_data_rx_si_hdl = NULL;
	}
	if (vnic->vn_data_tx_si_hdl) {
		(void) ddi_intr_remove_softint(vnic->vn_data_tx_si_hdl);
		vnic->vn_data_tx_si_hdl = NULL;
	}

	/*
	 * Release any work completion buffers we may have allocated
	 */
	if (chan->ch_rcv_wc && chan->ch_rcv_cq_sz) {
		kmem_free(chan->ch_rcv_wc,
		    sizeof (ibt_wc_t) * chan->ch_rcv_cq_sz);

		chan->ch_rcv_cq_sz = 0;
		chan->ch_rcv_wc = NULL;
	}
	if (chan->ch_wc && chan->ch_cq_sz) {
		kmem_free(chan->ch_wc, sizeof (ibt_wc_t) * chan->ch_cq_sz);

		chan->ch_cq_sz = 0;
		chan->ch_wc = NULL;
	}

	/*
	 * Free any completion queues we may have allocated
	 */
	if (chan->ch_rcv_cq_hdl) {
		ret = ibt_free_cq(chan->ch_rcv_cq_hdl);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_data_setup_cqs: "
			    "ibt_free_cq(rcv_cq) failed, ret=%d", ret);
		}
		chan->ch_rcv_cq_hdl = NULL;
	}
	if (chan->ch_cq_hdl) {
		ret = ibt_free_cq(chan->ch_cq_hdl);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_data_setup_cqs: "
			    "ibt_free_cq(snd_cq) failed, ret=%d", ret);
		}
		chan->ch_cq_hdl = NULL;
	}
}
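
/*
 * Rollback for eib_data_setup_ud_channel(): quiesce and free the UD
 * channel after waiting for all posted wqes to come back.
 */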
static void
eib_rb_data_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	if (chan->ch_chan) {
		/*
		 * We're trying to tear down this UD channel. Make sure that
		 * we don't attempt to refill (repost) at any point from now on.
		 */
		chan->ch_tear_down = B_TRUE;
		if ((ret = ibt_flush_channel(chan->ch_chan)) != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_data_setup_ud_channel: "
			    "ibt_flush_channel() failed, ret=%d", ret);
		}

		/*
		 * Wait until all posted tx wqes on this channel are back with
		 * the wqe pool.
		 */
		mutex_enter(&chan->ch_tx_lock);
		while (chan->ch_tx_posted > 0)
			cv_wait(&chan->ch_tx_cv, &chan->ch_tx_lock);
		mutex_exit(&chan->ch_tx_lock);

		/*
		 * Wait until all posted rx wqes on this channel are back with
		 * the wqe pool.
		 */
		mutex_enter(&chan->ch_rx_lock);
		while (chan->ch_rx_posted > 0)
			cv_wait(&chan->ch_rx_cv, &chan->ch_rx_lock);
		mutex_exit(&chan->ch_rx_lock);

		/*
		 * Now we're ready to free this channel
		 */
		if ((ret = ibt_free_channel(chan->ch_chan)) != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_data_setup_ud_channel: "
			    "ibt_free_channel() failed, ret=%d", ret);
		}

		chan->ch_alloc_mp = B_FALSE;
		chan->ch_ip_hdr_align = 0;
		chan->ch_rwqe_bktsz = 0;
		chan->ch_lwm_rwqes = 0;
		chan->ch_max_rwqes = 0;
		chan->ch_max_swqes = 0;

		chan->ch_chan = NULL;
	}
}