4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
26 #include <sys/types.h>
28 #include <sys/sunddi.h>
29 #include <sys/ksynch.h>
30 #include <sys/byteorder.h>
32 #include <sys/ib/clients/eoib/enx_impl.h>
/*
 * Vendor identifier copied into the vendor-id fields of the FIP
 * descriptors built in this file: the eight ASCII bytes "Mellanox",
 * deliberately without a terminating NUL.
 */
const char fip_vendor_mellanox[] = {
	0x4d, 0x65, 0x6c, 0x6c, 0x61, 0x6e, 0x6f, 0x78
};
/*
 * Workaround switch.
 *
 * Verification of descriptor list length in the received packets is
 * disabled, since experimentation shows that BX does not set the desc
 * list length correctly.  A non-zero value skips the length check in
 * the advertisement parser; zero enables it.
 */
int enx_wa_no_desc_list_len = 1;
48 * Static function declarations
50 static int eibnx_fip_make_solicit_pkt(eibnx_thr_info_t
*, eibnx_wqe_t
*);
51 static int eibnx_fip_send_solicit_pkt(eibnx_thr_info_t
*, eibnx_wqe_t
*,
53 static int eibnx_fip_parse_advt_pkt(uint8_t *, eibnx_gw_msg_t
*);
54 static void eibnx_rb_fip_make_solicit_pkt(eibnx_wqe_t
*);
57 * Prepare and send a solicit multicast packet to the All-EoIB-GWs-GID
60 eibnx_fip_solicit_mcast(eibnx_thr_info_t
*info
)
65 if ((swqe
= eibnx_acquire_swqe(info
, KM_SLEEP
)) == NULL
)
66 return (ENX_E_FAILURE
);
68 ret
= eibnx_fip_make_solicit_pkt(info
, swqe
);
69 if (ret
!= ENX_E_SUCCESS
) {
70 eibnx_release_swqe(swqe
);
71 return (ENX_E_FAILURE
);
74 ret
= eibnx_fip_send_solicit_pkt(info
, swqe
, NULL
);
75 if (ret
!= ENX_E_SUCCESS
) {
76 eibnx_rb_fip_make_solicit_pkt(swqe
);
77 eibnx_release_swqe(swqe
);
78 return (ENX_E_FAILURE
);
81 return (ENX_E_SUCCESS
);
85 * Go through the list of already discovered gateways and send
86 * a unicast solicitation to each gateway. This is required by
87 * the EoIB specification ostensibly to receive updated
91 eibnx_fip_solicit_ucast(eibnx_thr_info_t
*info
, clock_t *solicit_period_ticks
)
95 clock_t min_solicit_period_msec
;
99 * We want to read the gwlist and send a unicast to each
100 * destination. Now, the only places where the gw list pointers
101 * are updated are when we're adding a new gw item to the list
102 * and when the list is being torn down and freed.
104 * Since new GWs are always inserted at the head of the list,
105 * we're guaranteed that any tail subchain of the list will
106 * not change by the addition of a new gw item coming into
109 * Also, since the gw list is torn down only by the port-monitor
110 * thread (i.e. ourselves), we are also protected against the
111 * list itself going away while we're here.
113 * Given these two constraints, we can safely read the list
114 * of gateways without the gw list lock in this routine.
116 min_solicit_period_msec
= drv_hztousec(*solicit_period_ticks
) / 1000;
117 for (gw
= info
->ti_gw
; gw
; gw
= gw
->gw_next
) {
119 if (eibnx_is_gw_dead(gw
))
123 ASSERT(swqe
!= NULL
);
125 mutex_enter(&swqe
->qe_lock
);
126 if (swqe
->qe_type
!= ENX_QETYP_SWQE
) {
127 ENX_DPRINTF_DEBUG("eibnx_fip_solicit_ucast: "
128 "gw wqe type (0x%lx) indicates this is not an "
129 "swqe!, cannot send solicitation to gw",
131 mutex_exit(&swqe
->qe_lock
);
133 } else if ((swqe
->qe_flags
& ENX_QEFL_INUSE
) !=
135 ENX_DPRINTF_DEBUG("eibnx_fip_solicit_ucast: "
136 "gw swqe flags (0x%lx) indicate swqe is free!, "
137 "cannot send solicitation to gw", swqe
->qe_flags
);
138 mutex_exit(&swqe
->qe_lock
);
140 } else if ((swqe
->qe_flags
& ENX_QEFL_POSTED
) ==
142 ENX_DPRINTF_DEBUG("eibnx_fip_solicit_ucast: gw swqe "
143 "flags (0x%lx) indicate swqe is still with HCA!, "
144 "cannot send solicitation to gw", swqe
->qe_flags
);
145 mutex_exit(&swqe
->qe_lock
);
148 mutex_exit(&swqe
->qe_lock
);
151 * EoIB spec requires that each host send solicitation
152 * to discovered gateways atleast every 4 * GW_ADV_PERIOD.
153 * We make sure we send a solicitation to all gateways
154 * every 4 * GW_ADV_PERIOD of the smallest value of
155 * GW_ADV_PERIOD that we have in our gw list.
157 if ((gw
->gw_adv_period
* 4) < min_solicit_period_msec
)
158 min_solicit_period_msec
= gw
->gw_adv_period
* 4;
160 ret
= eibnx_fip_make_solicit_pkt(info
, swqe
);
161 if (ret
!= ENX_E_SUCCESS
)
164 ret
= eibnx_fip_send_solicit_pkt(info
, swqe
, &gw
->gw_addr
);
165 if (ret
!= ENX_E_SUCCESS
)
166 eibnx_rb_fip_make_solicit_pkt(swqe
);
169 *solicit_period_ticks
= drv_usectohz(min_solicit_period_msec
* 1000);
171 return (ENX_E_SUCCESS
);
175 * Given a send wqe and an eibnx_thr_info_t pointer, fill in the
176 * send buffer with a solicit packet in the network byte order.
179 eibnx_fip_make_solicit_pkt(eibnx_thr_info_t
*info
, eibnx_wqe_t
*swqe
)
181 fip_solicit_t
*solicit
;
183 fip_basic_hdr_t
*hdr
;
188 uint8_t *pkt
= (uint8_t *)(uintptr_t)(swqe
->qe_sgl
.ds_va
);
189 uint_t pktsz
= swqe
->qe_sgl
.ds_len
;
190 uint_t solicit_sz
= sizeof (fip_solicit_t
);
192 if (pktsz
< solicit_sz
) {
193 ENX_DPRINTF_ERR("swqe bufsize too small for pkt, "
194 "pktsz=%x < expsz=%x", pktsz
, solicit_sz
);
195 return (ENX_E_FAILURE
);
199 * Lint complains that there may be an alignment issue here,
200 * but we know that the "pkt" is atleast double-word aligned,
203 solicit
= (fip_solicit_t
*)pkt
;
206 * Fill in the FIP protocol version
208 proto
= &solicit
->sl_proto_version
;
209 proto
->pr_version
= FIP_PROTO_VERSION
;
212 * Fill in the basic header
214 hdr
= &solicit
->sl_fip_hdr
;
215 hdr
->hd_opcode
= htons(FIP_OPCODE_EOIB
);
216 hdr
->hd_subcode
= FIP_SUBCODE_H_SOLICIT
;
217 hdr
->hd_desc_list_len
= htons((solicit_sz
>> 2) - 2);
219 hdr
->hd_type
= FIP_DESC_TYPE_VENDOR_ID
;
220 hdr
->hd_len
= FIP_DESC_LEN_VENDOR_ID
;
221 bcopy(fip_vendor_mellanox
, hdr
->hd_vendor_id
, FIP_VENDOR_LEN
);
224 * Fill in the Infiniband Address descriptor
226 iba
= &solicit
->sl_iba
;
227 iba
->ia_type
= FIP_DESC_TYPE_IBA
;
228 iba
->ia_len
= FIP_DESC_LEN_IBA
;
229 bcopy(fip_vendor_mellanox
, iba
->ia_vendor_id
, FIP_VENDOR_LEN
);
230 iba
->ia_qpn
= htonl(info
->ti_qpn
);
231 iba
->ia_sl_portid
= 0;
232 iba
->ia_lid
= htons(info
->ti_pi
->p_base_lid
);
233 port_gid
= info
->ti_pi
->p_sgid_tbl
[0];
234 port_guid
= htonll(port_gid
.gid_guid
);
235 bcopy(&port_guid
, iba
->ia_guid
, FIP_GUID_LEN
);
238 * Adjust the ds_len in the sgl to indicate the size of the
239 * solicit pkt before returning
241 swqe
->qe_sgl
.ds_len
= solicit_sz
;
243 return (ENX_E_SUCCESS
);
247 eibnx_setup_ud_dest(eibnx_thr_info_t
*info
, eibnx_wqe_t
*swqe
,
248 eibnx_gw_addr_t
*gw_addr
)
250 eibnx_t
*ss
= enx_global_ss
;
251 ibt_path_attr_t attr
;
252 ibt_path_info_t path
;
256 * If this a multicast send, we'll have the gateway address NULL,
257 * and we'll need to modify the UD destination to send to the
260 if (gw_addr
== NULL
) {
261 ret
= ibt_modify_ud_dest(swqe
->qe_wr
.send
.wr
.ud
.udwr_dest
,
262 info
->ti_solicit_mcg
->mc_qkey
, IB_MC_QPN
,
263 &info
->ti_solicit_mcg
->mc_adds_vect
);
264 if (ret
!= IBT_SUCCESS
) {
265 ENX_DPRINTF_ERR("ibt_modify_ud_dest() failed with "
266 "ret=%d, qkey=%x, qpn=%x", ret
,
267 info
->ti_solicit_mcg
->mc_qkey
, IB_MC_QPN
);
268 return (ENX_E_FAILURE
);
271 return (ENX_E_SUCCESS
);
275 * If this is a unicast send, but we already have the gw address
276 * vector, the ud destination handle has already been set up for
277 * this gateway, so we can return.
279 if (gw_addr
->ga_vect
)
280 return (ENX_E_SUCCESS
);
283 * Get the reversible path information for this gateway
285 bzero(&attr
, sizeof (ibt_path_info_t
));
286 attr
.pa_dgids
= &gw_addr
->ga_gid
;
287 attr
.pa_num_dgids
= 1;
288 attr
.pa_sgid
= info
->ti_pi
->p_sgid_tbl
[0];
289 attr
.pa_pkey
= gw_addr
->ga_pkey
;
291 bzero(&path
, sizeof (ibt_path_info_t
));
292 ret
= ibt_get_paths(ss
->nx_ibt_hdl
, IBT_PATH_PKEY
,
293 &attr
, 1, &path
, NULL
);
294 if ((ret
!= IBT_SUCCESS
) || (path
.pi_hca_guid
== 0)) {
295 ENX_DPRINTF_ERR("ibt_get_paths() failed with "
296 "ret=%d, gid_prefix=%llx, gid_guid=%llx", ret
,
297 gw_addr
->ga_gid
.gid_prefix
, gw_addr
->ga_gid
.gid_guid
);
298 return (ENX_E_FAILURE
);
302 * And save the address vector
304 gw_addr
->ga_vect
= kmem_zalloc(sizeof (ibt_adds_vect_t
), KM_SLEEP
);
305 bcopy(&path
.pi_prim_cep_path
.cep_adds_vect
, gw_addr
->ga_vect
,
306 sizeof (ibt_adds_vect_t
));
309 * Modify the UD destination handle on this swqe entry to address
312 ret
= ibt_modify_ud_dest(swqe
->qe_wr
.send
.wr
.ud
.udwr_dest
,
313 gw_addr
->ga_qkey
, gw_addr
->ga_qpn
, gw_addr
->ga_vect
);
314 if (ret
!= IBT_SUCCESS
) {
315 ENX_DPRINTF_ERR("ibt_modify_ud_dest() failed with "
316 "ret=%d, qkey=%x, qpn=%x", ret
, gw_addr
->ga_qkey
,
318 kmem_free(gw_addr
->ga_vect
, sizeof (ibt_adds_vect_t
));
319 gw_addr
->ga_vect
= NULL
;
320 return (ENX_E_FAILURE
);
323 return (ENX_E_SUCCESS
);
327 * Send a solicit packet to the appropriate destination: if the
328 * destination gw addr is specified, send a unicast message to it;
329 * if not, send a multicast using the solicit mcg address.
332 eibnx_fip_send_solicit_pkt(eibnx_thr_info_t
*info
, eibnx_wqe_t
*swqe
,
333 eibnx_gw_addr_t
*gw_addr
)
337 if (eibnx_setup_ud_dest(info
, swqe
, gw_addr
) != ENX_E_SUCCESS
)
338 return (ENX_E_FAILURE
);
340 mutex_enter(&swqe
->qe_lock
);
343 * Note that if the post send fails, we don't really need to undo
344 * anything we did in setting up the ud destination; we can always
345 * use it for the next time.
347 ret
= ibt_post_send(info
->ti_chan
, &(swqe
->qe_wr
.send
), 1, NULL
);
348 if (ret
!= IBT_SUCCESS
) {
349 mutex_exit(&swqe
->qe_lock
);
350 ENX_DPRINTF_ERR("ibt_post_send() failed for solicit, "
352 return (ENX_E_FAILURE
);
356 * Set the 'posted' flag for the send wqe. If this is an unicast
357 * send, the wqe is attached to a specific gw entry and we should
358 * not release the wqe back to the pool on the send completion.
360 swqe
->qe_flags
|= ENX_QEFL_POSTED
;
361 if (gw_addr
== NULL
) {
362 swqe
->qe_flags
|= ENX_QEFL_RELONCOMP
;
363 info
->ti_mcast_done
= 1;
366 mutex_exit(&swqe
->qe_lock
);
368 return (ENX_E_SUCCESS
);
372 * Parse a received packet from the gateway into the
373 * eibnx_gw_msg_t argument. Note that at this point, this
374 * driver only expects to receive advertisements from the
378 eibnx_fip_parse_pkt(uint8_t *pkt
, eibnx_gw_msg_t
*msg
)
380 fip_basic_hdr_t
*hdr
;
383 int ret
= ENX_E_FAILURE
;
386 * Lint complains about potential alignment problem here,
387 * but the fip_* structures are all packed and each of them
388 * is aligned on a word boundary, so we're ok.
390 hdr
= (fip_basic_hdr_t
*)(pkt
+ sizeof (fip_proto_t
));
393 * Verify that the opcode is EoIB
395 if ((opcode
= ntohs(hdr
->hd_opcode
)) != FIP_OPCODE_EOIB
) {
396 ENX_DPRINTF_WARN("unsupported opcode (%x) found in "
397 "gw advertisement, ignoring", opcode
);
398 return (ENX_E_FAILURE
);
402 * We only handle GW advertisements in the eibnx driver code. However,
403 * the BridgeX gateway software currently sends login acknowledgements
404 * to the one who did the solicitation instead of the one who actually
405 * made the login request, so we need to do something about this as
408 subcode
= hdr
->hd_subcode
;
410 case FIP_SUBCODE_G_ADVERTISE
:
411 ret
= eibnx_fip_parse_advt_pkt(pkt
, msg
);
414 case FIP_SUBCODE_G_VNIC_LOGIN_ACK
:
415 msg
->gm_type
= FIP_VNIC_LOGIN_ACK
;
420 ENX_DPRINTF_WARN("unsupported subcode (%x) found in "
421 "gw advertisement, ignoring", subcode
);
430 * Parse and validate a packet known to be an advertisement from
434 eibnx_fip_parse_advt_pkt(uint8_t *pkt
, eibnx_gw_msg_t
*msg
)
436 fip_advertise_t
*advertise
;
437 fip_basic_hdr_t
*hdr
;
438 fip_desc_iba_t
*desc_iba
;
439 fip_desc_gwinfo_t
*desc_gwinfo
;
440 fip_desc_gwid_t
*desc_gwid
;
441 fip_desc_keepalive_t
*desc_ka
;
442 eibnx_gw_info_t
*gwi
;
444 uint16_t rss_qpn_num_net_vnics
;
449 * Lint complains about potential alignment problem here,
450 * but we know that "pkt" is always atleast double-word
451 * aligned when it's passed to us, so we're ok.
453 advertise
= (fip_advertise_t
*)pkt
;
456 * Verify if the descriptor list length in the received
457 * packet is valid. Currently disabled.
459 * Experimentation shows that BX doesn't set the desc list
460 * length correctly, so we also simply ignore it and move
461 * on. If and when BX fixes this problem, we'll need to
462 * enable the warning+failure below.
464 hdr
= &(advertise
->ad_fip_header
);
465 if (!enx_wa_no_desc_list_len
) {
468 pkt_data_sz
= (ntohs(hdr
->hd_desc_list_len
) + 2) << 2;
469 if (pkt_data_sz
< sizeof (fip_advertise_t
)) {
470 ENX_DPRINTF_WARN("advertisement from gw too small; "
471 "expected %x, got %x", sizeof (fip_advertise_t
),
473 return (ENX_E_FAILURE
);
478 * Validate all the header and descriptor types and lengths
481 if (hdr
->hd_type
!= FIP_DESC_TYPE_VENDOR_ID
||
482 hdr
->hd_len
!= FIP_DESC_LEN_VENDOR_ID
) {
483 ENX_DPRINTF_WARN("invalid type/len in fip basic header; "
484 "expected (%x,%x), got (%x,%x)", FIP_DESC_TYPE_VENDOR_ID
,
485 FIP_DESC_LEN_VENDOR_ID
, hdr
->hd_type
, hdr
->hd_len
);
486 return (ENX_E_FAILURE
);
489 desc_iba
= &(advertise
->ad_iba
);
490 if (desc_iba
->ia_type
!= FIP_DESC_TYPE_IBA
||
491 desc_iba
->ia_len
!= FIP_DESC_LEN_IBA
) {
492 ENX_DPRINTF_WARN("invalid type/len in fip iba desc; "
493 "expected (%x,%x), got (%x,%x)", FIP_DESC_TYPE_IBA
,
494 FIP_DESC_LEN_IBA
, desc_iba
->ia_type
, desc_iba
->ia_len
);
495 return (ENX_E_FAILURE
);
498 desc_gwinfo
= &(advertise
->ad_gwinfo
);
499 if (desc_gwinfo
->gi_type
!= FIP_DESC_TYPE_EOIB_GW_INFO
||
500 desc_gwinfo
->gi_len
!= FIP_DESC_LEN_EOIB_GW_INFO
) {
501 ENX_DPRINTF_WARN("invalid type/len in fip gwinfo desc; "
502 "expected (%x,%x), got (%x,%x)",
503 FIP_DESC_TYPE_EOIB_GW_INFO
, FIP_DESC_LEN_EOIB_GW_INFO
,
504 desc_gwinfo
->gi_type
, desc_gwinfo
->gi_len
);
505 return (ENX_E_FAILURE
);
508 desc_gwid
= &(advertise
->ad_gwid
);
509 if (desc_gwid
->id_type
!= FIP_DESC_TYPE_GW_ID
||
510 desc_gwid
->id_len
!= FIP_DESC_LEN_GW_ID
) {
511 ENX_DPRINTF_WARN("invalid type/len in fip gwid desc; "
512 "expected (%x,%x), got (%x,%x)",
513 FIP_DESC_TYPE_GW_ID
, FIP_DESC_LEN_GW_ID
,
514 desc_gwid
->id_type
, desc_gwid
->id_len
);
515 return (ENX_E_FAILURE
);
518 desc_ka
= &(advertise
->ad_keep_alive
);
519 if (desc_ka
->ka_type
!= FIP_DESC_TYPE_KEEP_ALIVE
||
520 desc_ka
->ka_len
!= FIP_DESC_LEN_KEEP_ALIVE
) {
521 ENX_DPRINTF_WARN("invalid type/len in fip ka desc; "
522 "expected (%x,%x), got (%x,%x)",
523 FIP_DESC_TYPE_KEEP_ALIVE
, FIP_DESC_LEN_KEEP_ALIVE
,
524 desc_ka
->ka_type
, desc_ka
->ka_len
);
525 return (ENX_E_FAILURE
);
529 * Record if the gw is available for login ('A' bit in the header)
531 flags
= ntohs(hdr
->hd_flags
);
532 gwi
= &(msg
->u
.gm_info
);
533 gwi
->gw_flag_available
= (flags
& FIP_BHFLAG_GWAVAIL
) ? 1 : 0;
536 * Record if this was in response to a solicit request (unicast
537 * advertisement) or not ('S' bit in the header)
539 gwi
->gw_flag_ucast_advt
= (flags
& FIP_BHFLAG_SLCTMSG
) ? 1 : 0;
540 msg
->gm_type
= (gwi
->gw_flag_ucast_advt
) ?
541 FIP_GW_ADVERTISE_UCAST
: FIP_GW_ADVERTISE_MCAST
;
544 * Record all info from the Infiniband Address descriptor
546 gwi
->gw_ctrl_qpn
= (ntohl(desc_iba
->ia_qpn
) & FIP_IBA_QPN_MASK
);
548 sl_portid
= ntohs(desc_iba
->ia_sl_portid
);
549 gwi
->gw_portid
= (sl_portid
& FIP_IBA_PORTID_MASK
);
550 gwi
->gw_sl
= ((sl_portid
& FIP_IBA_SL_MASK
) >> FIP_IBA_SL_SHIFT
);
552 gwi
->gw_lid
= ntohs(desc_iba
->ia_lid
);
554 bcopy(desc_iba
->ia_guid
, &guid
, sizeof (ib_guid_t
));
555 gwi
->gw_guid
= ntohll(guid
);
558 * Record all info from the EoIB GW Information descriptor
560 if (desc_gwinfo
->gi_flags
& FIP_GWI_HOST_ADMIND_VNICS_MASK
)
561 gwi
->gw_is_host_adm_vnics
= 1;
563 gwi
->gw_is_host_adm_vnics
= 0;
565 rss_qpn_num_net_vnics
= ntohs(desc_gwinfo
->gi_rss_qpn_num_net_vnics
);
566 gwi
->gw_num_net_vnics
= (rss_qpn_num_net_vnics
&
567 FIP_GWI_NUM_NET_VNICS_MASK
);
568 gwi
->gw_n_rss_qpn
= ((rss_qpn_num_net_vnics
&
569 FIP_GWI_RSS_QPN_MASK
) >> FIP_GWI_RSS_QPN_SHIFT
);
570 bcopy(desc_gwinfo
->gi_vendor_id
, gwi
->gw_vendor_id
, FIP_VENDOR_LEN
);
571 (gwi
->gw_vendor_id
)[FIP_VENDOR_LEN
] = '\0';
574 * Record all info from the Gateway Identifier descriptor
576 bcopy(desc_gwid
->id_guid
, &guid
, sizeof (ib_guid_t
));
577 gwi
->gw_system_guid
= ntohll(guid
);
578 bcopy(desc_gwid
->id_sysname
, gwi
->gw_system_name
, FIP_SYSNAME_LEN
);
579 (gwi
->gw_system_name
)[FIP_SYSNAME_LEN
] = '\0';
580 bcopy(desc_gwid
->id_portname
, gwi
->gw_port_name
, FIP_PORTNAME_LEN
);
581 (gwi
->gw_port_name
)[FIP_PORTNAME_LEN
] = '\0';
584 * Record all info from the Keep Alive descriptor
586 gwi
->gw_adv_period
= ntohl(desc_ka
->ka_gw_adv_period
);
587 gwi
->gw_ka_period
= ntohl(desc_ka
->ka_gw_ka_period
);
588 gwi
->gw_vnic_ka_period
= ntohl(desc_ka
->ka_vnic_ka_period
);
592 return (ENX_E_SUCCESS
);
596 * Rollback whatever we did for making a solicit packet
599 eibnx_rb_fip_make_solicit_pkt(eibnx_wqe_t
*swqe
)
601 uint8_t *pkt
= (uint8_t *)(uintptr_t)(swqe
->qe_sgl
.ds_va
);
603 bzero(pkt
, sizeof (fip_solicit_t
));
604 swqe
->qe_sgl
.ds_len
= swqe
->qe_bufsz
;