/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */
#ifndef _SYS_IB_CLIENTS_IBD_H
#define	_SYS_IB_CLIENTS_IBD_H

/* The following macros are used in both ibd.c and ibd_cm.c */
/*
 * Completion queue polling control
 */
#define	IBD_CQ_POLLING			0x1
#define	IBD_REDO_CQ_POLLING		0x2
/*
 * Maximum length of an mblk chain handed back to Crossbow (the GLDv3
 * framework). Also used as the maximum number of Rx work completions
 * polled at a time.
 */
#define	IBD_MAX_RX_MP_LEN		16
/*
 * When posting multiple send WRs, this value determines how many are
 * handed to the HCA in a single ibt_post_send() call.
 */
#define	IBD_MAX_TX_POST_MULTIPLE	4
/*
 * Flag bits for resources to reap
 */
#define	IBD_RSRC_SWQE			0x1
#define	IBD_RSRC_LSOBUF			0x2
#define	IBD_RSRC_RC_SWQE		0x4
#define	IBD_RSRC_RC_TX_LARGEBUF		0x8
/*
 * Async operation types
 */
#define	IBD_ASYNC_GETAH			1
#define	IBD_ASYNC_JOIN			2
#define	IBD_ASYNC_LEAVE			3
#define	IBD_ASYNC_PROMON		4
#define	IBD_ASYNC_PROMOFF		5
#define	IBD_ASYNC_REAP			6
#define	IBD_ASYNC_TRAP			7
#define	IBD_ASYNC_SCHED			8
#define	IBD_ASYNC_LINK			9
#define	IBD_ASYNC_EXIT			10
#define	IBD_ASYNC_RC_TOO_BIG		11
#define	IBD_ASYNC_RC_CLOSE_ACT_CHAN	12
#define	IBD_ASYNC_RC_RECYCLE_ACE	13
#define	IBD_ASYNC_RC_CLOSE_PAS_CHAN	14
/*
 * State of IBD driver initialization during attach/m_start
 */
#define	IBD_DRV_STATE_INITIALIZED	0x000001
#define	IBD_DRV_RXINTR_ADDED		0x000002
#define	IBD_DRV_TXINTR_ADDED		0x000004
#define	IBD_DRV_IBTL_ATTACH_DONE	0x000008
#define	IBD_DRV_HCA_OPENED		0x000010
#define	IBD_DRV_PD_ALLOCD		0x000020
#define	IBD_DRV_MAC_REGISTERED		0x000040
#define	IBD_DRV_PORT_DETAILS_OBTAINED	0x000080
#define	IBD_DRV_BCAST_GROUP_FOUND	0x000100
#define	IBD_DRV_ACACHE_INITIALIZED	0x000200
#define	IBD_DRV_CQS_ALLOCD		0x000400
#define	IBD_DRV_UD_CHANNEL_SETUP	0x000800
#define	IBD_DRV_TXLIST_ALLOCD		0x001000
#define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x002000
#define	IBD_DRV_RXLIST_ALLOCD		0x004000
#define	IBD_DRV_BCAST_GROUP_JOINED	0x008000
#define	IBD_DRV_ASYNC_THR_CREATED	0x010000
#define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x020000
#define	IBD_DRV_SM_NOTICES_REGISTERED	0x040000
#define	IBD_DRV_STARTED			0x080000
#define	IBD_DRV_RC_SRQ_ALLOCD		0x100000
#define	IBD_DRV_RC_LARGEBUF_ALLOCD	0x200000
#define	IBD_DRV_RC_LISTEN		0x400000
#define	IBD_DRV_RC_PRIVATE_STATE	0x800000
#define	IBD_DRV_IN_DELETION		0x1000000
#define	IBD_DRV_IN_LATE_HCA_INIT	0x2000000
#define	IBD_DRV_REQ_LIST_INITED		0x4000000
#define	IBD_DRV_RC_TIMEOUT		0x8000000
/*
 * Miscellaneous constants
 */

/* Tunables defaults and limits */
#define	IBD_LINK_MODE_UD		0
#define	IBD_LINK_MODE_RC		1

#define	IBD_DEF_LINK_MODE		IBD_LINK_MODE_RC
#define	IBD_DEF_LSO_POLICY		B_TRUE
#define	IBD_DEF_NUM_LSO_BUFS		1024
#define	IBD_DEF_CREATE_BCAST_GROUP	B_TRUE
#define	IBD_DEF_COALESCE_COMPLETIONS	B_TRUE
#define	IBD_DEF_UD_RX_COMP_COUNT	4
#define	IBD_DEF_UD_RX_COMP_USEC		10
#define	IBD_DEF_UD_TX_COMP_COUNT	16
#define	IBD_DEF_UD_TX_COMP_USEC		300
#define	IBD_DEF_RC_RX_COMP_COUNT	4
#define	IBD_DEF_RC_RX_COMP_USEC		10
#define	IBD_DEF_RC_TX_COMP_COUNT	10
#define	IBD_DEF_RC_TX_COMP_USEC		300
#define	IBD_DEF_UD_TX_COPY_THRESH	4096
#define	IBD_DEF_RC_RX_COPY_THRESH	4096
#define	IBD_DEF_RC_TX_COPY_THRESH	4096
#define	IBD_DEF_UD_NUM_RWQE		4000
#define	IBD_DEF_UD_NUM_SWQE		4000
#define	IBD_DEF_RC_ENABLE_SRQ		B_TRUE
#ifdef DEBUG
#define	IBD_DEF_RC_NUM_RWQE		511
#define	IBD_DEF_RC_NUM_SWQE		255
#else
#define	IBD_DEF_RC_NUM_RWQE		2047
#define	IBD_DEF_RC_NUM_SWQE		511
#endif
#define	IBD_DEF_NUM_AH			256
#define	IBD_DEF_HASH_SIZE		32
#define	IBD_DEF_RC_NUM_SRQ		(IBD_DEF_RC_NUM_RWQE - 1)
#define	IBD_DEF_RC_RX_RWQE_THRESH	(IBD_DEF_RC_NUM_RWQE >> 2)
#define	IBD_MIN_NUM_LSO_BUFS		512
#define	IBD_MAX_NUM_LSO_BUFS		4096
#define	IBD_MIN_UD_TX_COPY_THRESH	2048
#define	IBD_MAX_UD_TX_COPY_THRESH	65536
#define	IBD_MIN_UD_NUM_SWQE		512
#define	IBD_MAX_UD_NUM_SWQE		8000
#define	IBD_MIN_UD_NUM_RWQE		512
#define	IBD_MAX_UD_NUM_RWQE		8000
#define	IBD_MIN_NUM_AH			32
#define	IBD_MAX_NUM_AH			8192
#define	IBD_MIN_HASH_SIZE		32
#define	IBD_MAX_HASH_SIZE		1024
#ifdef DEBUG
#define	IBD_MIN_RC_NUM_SWQE		255
#else
#define	IBD_MIN_RC_NUM_SWQE		511
#endif
#define	IBD_MAX_RC_NUM_SWQE		8000
#define	IBD_MIN_RC_NUM_RWQE		511
#define	IBD_MAX_RC_NUM_RWQE		8000
#define	IBD_MIN_RC_RX_COPY_THRESH	1500
#define	IBD_MAX_RC_RX_COPY_THRESH	65520
#define	IBD_MIN_RC_TX_COPY_THRESH	1500
#define	IBD_MAX_RC_TX_COPY_THRESH	65520
#define	IBD_MIN_RC_NUM_SRQ		(IBD_MIN_RC_NUM_RWQE - 1)
#define	IBD_MIN_RC_RX_RWQE_THRESH	(IBD_MIN_RC_NUM_RWQE >> 2)
/*
 * When waiting for resources (swqes or lso buffers) to become available,
 * the first two thresholds below determine how many must be free again
 * before we inform the network layer that it may resume sending packets.
 * IBD_TX_POLL_THRESH determines how low the number of available swqes may
 * go before we start polling the completion queue.
 */
#define	IBD_FREE_LSOS_THRESH		8
#define	IBD_FREE_SWQES_THRESH		20
#define	IBD_TX_POLL_THRESH		80
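/*
 * Illustrative sketch of how a reaping path might consult the thresholds
 * above. This is not part of the driver: the names n_free_swqes,
 * n_free_lsobufs, ibd_poll_scq() and the id_mh field are assumptions used
 * only for this example; mac_tx_update() is the GLDv3 call that lets the
 * stack resume transmits.
 *
 *	if (n_free_swqes > IBD_FREE_SWQES_THRESH &&
 *	    n_free_lsobufs > IBD_FREE_LSOS_THRESH)
 *		mac_tx_update(state->id_mh);	// resume blocked transmits
 *	if (n_free_swqes < IBD_TX_POLL_THRESH)
 *		ibd_poll_scq(state);		// reclaim completed swqes
 */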
void debug_print(int l, char *fmt, ...);
#define	DPRINT		debug_print
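/*
 * A typical (purely illustrative) call through the DPRINT alias; the debug
 * level and message shown here are examples, not actual driver output:
 *
 *	DPRINT(10, "ibd_send: no swqe available, packet dropped");
 */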
/*
 * AH and MCE active list manipulation:
 *
 * Multicast disable requests and MCG delete traps are two cases
 * where the active AH entry for the mcg (if any unreferenced one exists)
 * will be moved to the free list (to force the next Tx to the mcg to
 * join the MCG in SendOnly mode). Port up handling will also move AHs
 * from active to free list.
 *
 * In the case when some transmits are still pending on an entry
 * for an mcg, but a multicast disable has already been issued on the
 * mcg, there are some options to consider to preserve the join state
 * to ensure the emitted packet is properly routed on the IBA fabric.
 * For the AH, we can
 * 1. take it out of the active list at multicast disable time.
 * 2. take it out of the active list only when the last pending Tx completes.
 * For the MCE, we can
 * 3. take it out of the active list at multicast disable time.
 * 4. take it out of the active list only when the last pending Tx completes.
 * 5. move it from the active list to a stale list at multicast disable time.
 * We choose to use 2,4. We use option 4 so that if a multicast enable
 * is tried before the pending Tx completes, the enable code finds the
 * mce in the active list and just has to make sure it will not be reaped
 * (i.e., the mcg leave done) when the pending Tx does complete. Alternatively,
 * a stale list (#5) that would be checked in the enable code would need
 * to be implemented. Option 2 is used because otherwise a Tx attempt
 * after the multicast disable would try to put an AH in the active list,
 * and associate the mce it finds in the active list with this new AH,
 * whereas the mce is already associated with the previous AH (taken off
 * the active list), and will be removed once the pending Tx's complete
 * (unless a reference count on mce's is implemented). One implication of
 * using 2,4 is that new Tx's posted before the pending Tx's complete will
 * grab new references on the AH, further delaying the leave.
 *
 * In the case of an mcg delete (or create) trap when the port is sendonly
 * joined, the AH and MCE handling is different: the AH and MCE have to be
 * immediately taken off the active lists (forcing a join and path lookup
 * at the next Tx is the only guaranteed means of ensuring a proper Tx
 * to an mcg as it is repeatedly created and deleted and goes through
 * its creation/deletion cycles).
 *
 * When a port is already sendonly joined, and a multicast enable is
 * attempted, the same mce structure is promoted; this ensures only a
 * single mce on the active list tracks the most powerful join state.
 *
 * In the case of port up event handling, the MCE for sendonly membership
 * is freed up, and the ACE is put into the free list as soon as possible
 * (depending on whether posted Tx's have completed). For fullmembership
 * MCE's though, the ACE is similarly handled; but the MCE is kept around
 * (a re-JOIN is attempted) only if the DLPI leave has not already been
 * done; else the mce is deconstructed (mc_fullreap case).
 */
/*
 * MCG creation and deletion trap handling:
 *
 * These traps are unreliable (meaning sometimes the trap might never
 * be delivered to the subscribed nodes) and may arrive out-of-order
 * since they use UD transport. An alternative to relying on these
 * unreliable traps is to poll for mcg presence every so often, but
 * instead of doing that, we try to be as conservative as possible
 * while handling the traps, and hope that the traps do arrive at
 * the subscribed nodes soon. Note that if a node is fullmember
 * joined to an mcg, it cannot possibly receive an mcg create/delete
 * trap for that mcg (by fullmember definition); if it does, it is
 * an old trap from a previous incarnation of the mcg.
 *
 * Whenever a trap is received, the driver cleans up its sendonly
 * membership to the group; we choose to do a sendonly leave even
 * on a creation trap to handle the case of a prior deletion of the mcg
 * having gone unnoticed. Consider an example scenario:
 * T1: MCG M is deleted, and fires off deletion trap D1.
 * T2: MCG M is recreated, fires off creation trap C1, which is lost.
 * T3: Node N tries to transmit to M, joining in sendonly mode.
 * T4: MCG M is deleted, and fires off deletion trap D2.
 * T5: N receives a deletion trap, but cannot distinguish D1 from D2.
 *     If the trap is D2, then a LEAVE is not required, since the mcg
 *     is already deleted; but if it is D1, a LEAVE is required. A safe
 *     approach is to always LEAVE, but the SM may be confused if it
 *     receives a LEAVE without a prior JOIN.
 *
 * Management of the non-membership to an mcg is similar to the above,
 * except that if the interface is in promiscuous mode, it is required
 * to attempt to re-join the mcg after receiving a trap. Unfortunately,
 * if the re-join attempt fails (in which case a warning message needs
 * to be printed), it is not clear whether it failed due to the mcg not
 * existing, or some fabric/hca issues, due to the delayed nature of
 * trap delivery. Querying the SA to establish presence/absence of the
 * mcg is also racy at best. Thus, the driver just prints a warning
 * message when it cannot rejoin after receiving a create trap, although
 * this might be (on rare occasions) a mis-warning if the create trap is
 * received after the mcg was deleted.
 */
/*
 * Implementation of atomic "recycle" bits and reference count
 * on address handles. This utilizes the fact that the max reference
 * count on any handle is limited by the number of send wqes, thus
 * high bits in the ac_ref field can be used as the recycle bits,
 * and only the low bits hold the number of pending Tx requests.
 * This atomic AH reference counting allows the Tx completion
 * handler not to acquire the id_ac_mutex to process every completion,
 * thus reducing lock contention problems between completion and
 * transmit code paths.
 */
#define	CYCLEVAL		0x80000
#define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
#define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
#define	GET_REF(ace)		((ace)->ac_ref)
#define	GET_REF_CYCLE(ace) (				\
	/*						\
	 * Make sure "cycle" bit is set.		\
	 */						\
	ASSERT(CYCLE_SET(ace)),				\
	((ace)->ac_ref & ~(CYCLEVAL))			\
)
#define	INC_REF(ace, num) {				\
	atomic_add_32(&(ace)->ac_ref, num);		\
}
#define	SET_CYCLE_IF_REF(ace) (				\
	CYCLE_SET(ace) ? B_TRUE :			\
	    atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==	\
		CYCLEVAL ?				\
		/*					\
		 * Clear the "cycle" bit we just set;	\
		 * ref count known to be 0 from above.	\
		 */					\
		CLEAR_REFCYCLE(ace), B_FALSE :		\
		/*					\
		 * We set "cycle" bit; let caller know.	\
		 */					\
		B_TRUE					\
)
#define	DEC_REF_DO_CYCLE(ace) (				\
	atomic_dec_32_nv(&ace->ac_ref) == CYCLEVAL ?	\
	    /*						\
	     * Ref count known to be 0 from above.	\
	     */						\
	    CLEAR_REFCYCLE(ace), B_TRUE :		\
	    B_FALSE					\
)
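/*
 * Illustrative usage of the macros above (a sketch only; the surrounding
 * logic in ibd.c is more involved than this):
 *
 *	INC_REF(ace, 1);			// Tx path, before posting a WR
 *	...
 *	if (SET_CYCLE_IF_REF(ace) == B_FALSE)	// async thread wants the ACE
 *		// no Tx pending: the ACE can be recycled right away
 *	...
 *	if (DEC_REF_DO_CYCLE(ace))		// Tx completion handler
 *		// last pending Tx finished and a recycle was requested
 */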
/*
 * Address handle entries maintained by the driver are kept in the
 * free and active lists. Each entry starts out in the free list;
 * it migrates to the active list when primed using ibt_get_paths()
 * and ibt_modify_ud_dest() for transmission to a specific destination.
 * In the active list, the entry has a reference count indicating the
 * number of ongoing/uncompleted transmits that reference it. The
 * entry is left in the active list even after the reference count
 * goes to 0, since successive transmits can find it there and do
 * not need to set up another entry (i.e., the path information is
 * cached using the active list). Entries on the active list are
 * also hashed using the destination link address as a key for faster
 * lookups during transmits.
 *
 * For any destination address (unicast or multicast, whatever the
 * join states), there will be at most one entry in the active list.
 * Entries with a 0 reference count on the active list can be reused
 * for a transmit to a new destination, if the free list is empty.
 *
 * The AH free list insertion/deletion is protected with the id_ac_mutex,
 * since the async thread and Tx callback handlers insert/delete. The
 * active list does not need a lock (all operations are done by the
 * async thread) but updates to the reference count are atomically
 * done (increments done by Tx path, decrements by the Tx callback handler).
 */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&state->id_ah_free, ce)
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&state->id_ah_free)
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
	int _ret_;						\
								\
	list_insert_head(&state->id_ah_active, ce);		\
	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
	ASSERT(_ret_ == 0);					\
	state->id_ac_hot_ace = ce;				\
}
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
	list_remove(&state->id_ah_active, ce);			\
	if (state->id_ac_hot_ace == ce)				\
		state->id_ac_hot_ace = NULL;			\
	(void) mod_hash_remove(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
}
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&state->id_ah_active)
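/*
 * Illustrative flow for the macros above (a sketch only; the real logic
 * lives in ibd.c, ibd_acache_find() is declared near the bottom of this
 * header, and the argument values and the ce/mac variable names here are
 * placeholders):
 *
 *	mutex_enter(&state->id_ac_mutex);
 *	if ((ce = ibd_acache_find(state, &mac, B_TRUE, 1)) == NULL) {
 *		ce = IBD_ACACHE_GET_FREE(state);
 *		// ... prime ce via ibt_get_paths()/ibt_modify_ud_dest() ...
 *		IBD_ACACHE_INSERT_ACTIVE(state, ce);
 *	}
 *	mutex_exit(&state->id_ac_mutex);
 */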
/*
 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
 * the front of the optional src/tgt link layer address. Right now Solaris
 * inserts the padding by default at the end; the routine that does this is
 * nce_xmit() in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t.
 * So when the packet comes down from the IP layer to the IBD driver, it is
 * in the following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
 * where OPT_ND_HDR_T is 2 bytes, followed by [22 bytes of ipoib_machdr].
 * As a result machdr is not 4 byte aligned and has 2 bytes of padding at
 * the end.
 *
 * The send routine of the IBD driver changes this packet as follows:
 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
 * followed by [22 bytes of ipoib_machdr], resulting in machdr being
 * 4 byte aligned.
 *
 * At the receiving side ibd_process_rx takes the above packet and
 * removes the two bytes of front padding and inserts them at the end,
 * since the IP layer does not understand padding at the front.
 */
#define	IBD_PAD_NSNA(ip6h, len, type) {				\
	uchar_t		*nd_lla_ptr;				\
	icmp6_t		*icmp6;					\
	nd_opt_hdr_t	*opt;					\
	int		i;					\
								\
	icmp6 = (icmp6_t *)&ip6h[1];				\
	len -= sizeof (nd_neighbor_advert_t);			\
	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||	\
	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&	\
	    (len != 0)) {					\
		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h		\
		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \
		ASSERT(opt != NULL);				\
		nd_lla_ptr = (uchar_t *)&opt[1];		\
		if (type == IBD_SEND) {				\
			for (i = IPOIB_ADDRL; i > 0; i--)	\
				*(nd_lla_ptr + i + 1) =		\
				    *(nd_lla_ptr + i - 1);	\
		} else {					\
			for (i = 0; i < IPOIB_ADDRL; i++)	\
				*(nd_lla_ptr + i) =		\
				    *(nd_lla_ptr + i + 2);	\
		}						\
		*(nd_lla_ptr + i) = 0;				\
		*(nd_lla_ptr + i + 1) = 0;			\
	}							\
}
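/*
 * Illustrative invocation of IBD_PAD_NSNA (a sketch only; the actual call
 * sites are in ibd.c, IBD_SEND is defined elsewhere in the driver, and mp
 * here is assumed to be the mblk being transmitted):
 *
 *	ip6_t	*ip6h = (ip6_t *)(mp->b_rptr + IPOIB_HDRSIZE);
 *	uint_t	len = ntohs(ip6h->ip6_plen);
 *
 *	if (ip6h->ip6_nxt == IPPROTO_ICMPV6)
 *		IBD_PAD_NSNA(ip6h, len, IBD_SEND);
 */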
/*
 * IETF defined IPoIB encapsulation header, with 2 bytes of ethertype
 * followed by 2 reserved bytes. This is at the start of the
 * datagram sent to and received over the wire by the driver.
 */
typedef struct ipoib_header {
	ushort_t	ipoib_type;
	ushort_t	ipoib_mbz;
} ipoib_hdr_t;

#define	IPOIB_HDRSIZE	sizeof (struct ipoib_header)

/*
 * IETF defined IPoIB link address; IBA QPN, followed by GID,
 * which has a prefix and suffix, as reported via ARP.
 */
typedef struct ipoib_mac {
	uint32_t	ipoib_qpn;
	uint32_t	ipoib_gidpref[2];
	uint32_t	ipoib_gidsuff[2];
} ipoib_mac_t;

#define	IPOIB_ADDRL	sizeof (struct ipoib_mac)
/*
 * Pseudo header prepended to datagram in DLIOCRAW transmit path
 * and when GLD hands the datagram to the gldm_send entry point.
 */
typedef struct ipoib_ptxhdr {
	ipoib_mac_t	ipoib_dest;
	ipoib_hdr_t	ipoib_rhdr;
} ipoib_ptxhdr_t;

#define	IPOIBDLSAP(p, offset)	((ipoib_ptxhdr_t *)((caddr_t)(p)+offset))
/*
 * The pseudo-GRH structure that sits before the data in the
 * receive buffer, and is overlaid on top of the real GRH.
 * The driver sets the ipoib_vertcflow to 0 if the pseudo-GRH
 * does not hold valid information. If it is indicated valid,
 * the driver must additionally provide the sender's qpn in
 * network byte order in ipoib_sqpn, and not touch the
 * remaining parts which were DMA'ed in by the IBA hardware.
 */
typedef struct ipoib_pgrh {
	uint32_t	ipoib_vertcflow;
	uint32_t	ipoib_sqpn;
	uint32_t	ipoib_sgid_pref[2];
	uint32_t	ipoib_sgid_suff[2];
	uint32_t	ipoib_dgid_pref[2];
	uint32_t	ipoib_dgid_suff[2];
} ipoib_pgrh_t;

/*
 * The GRH is also dma'ed into recv buffers, thus space needs
 * to be allocated for them.
 */
#define	IPOIB_GRH_SIZE	sizeof (ipoib_pgrh_t)
/* support the RC (reliable connected) mode */
#define	IBD_MAC_ADDR_RC		0x80000000
/* support the UC (unreliable connected) mode */
#define	IBD_MAC_ADDR_UC		0x40000000
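/*
 * Illustrative check (a sketch only; the byte-order handling and the use
 * of the ipoib_qpn word are assumptions for this example): whether a peer
 * advertises RC support can be tested on the QPN word of its 20-byte
 * IPoIB address, e.g.
 *
 *	if (ntohl(((ipoib_mac_t *)mac)->ipoib_qpn) & IBD_MAC_ADDR_RC)
 *		// destination supports the RC mode
 */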
#define	IBD_RC_SERVICE_ID	0x100000000000000ULL

/*
 * Legacy OFED had used a wrong service ID (one additional zero digit) for
 * many years. To interoperate with legacy OFED, we support this wrong
 * service ID as well.
 */
#define	IBD_RC_SERVICE_ID_OFED_INTEROP	0x1000000000000000ULL

#define	IBD_RC_MIN_CQ_SIZE	0x7f

/* Number of ibt_wc_t provided for each RC channel */
#define	IBD_RC_MAX_CQ_WC	0x3f
#if defined(_KERNEL) && !defined(_BOOT)

#include <sys/ib/ibtl/ibti.h>
#include <sys/ib/ib_pkt_hdrs.h>
#include <sys/list.h>
#include <sys/mac_provider.h>
#include <sys/mac_ib.h>
#include <sys/modhash.h>
/* State of a reliable connected channel (ibd_rc_chan_t->chan_state) */
typedef enum {
	IBD_RC_STATE_INIT = 0,

	IBD_RC_STATE_ACT_REP_RECV,	/* reply received */
	IBD_RC_STATE_ACT_ESTAB,		/* established, ready to send */
	IBD_RC_STATE_ACT_REJECT,	/* rejected */
	/* Someone else is closing this channel, please don't re-close it */
	IBD_RC_STATE_ACT_CLOSING,
	IBD_RC_STATE_ACT_CLOSED,
	IBD_RC_STATE_ACT_ERROR,

	IBD_RC_STATE_PAS_REQ_RECV,	/* request received */
	IBD_RC_STATE_PAS_ESTAB,		/* established, ready to receive */
	IBD_RC_STATE_PAS_REJECT,	/* rejected */

	IBD_RC_STATE_PAS_CLOSED
} ibd_rc_chan_state_t;
/*
 * Structure to encapsulate various types of async requests.
 */
typedef struct ibd_acache_rq {
	struct list_node	rq_list;	/* list of pending work */
	int			rq_op;		/* what operation */
typedef struct ibd_mcache {
	struct list_node	mc_list;	/* full/non list */
	boolean_t		mc_fullreap;
	ibt_mcg_info_t		mc_info;
	ibd_req_t		mc_req;		/* to queue LEAVE req */
typedef struct ibd_acache_s {
	struct list_node	ac_list;	/* free/active list */
	ibt_ud_dest_hdl_t	ac_dest;
	ibd_mce_t		*ac_mce;	/* for MCG AHs */

	/* For Reliable Connected mode */
	struct ibd_rc_chan_s	*ac_chan;
	/* protect tx_too_big_ongoing */
	kmutex_t		tx_too_big_mutex;
	/* Deal with too big packet */
	boolean_t		tx_too_big_ongoing;
#define	IBD_MAX_SQSEG	59
#define	IBD_MAX_RQSEG	1

	IBD_WQE_RC_COPYBUF = 4
typedef struct ibd_rc_stat_s {
	kstat_named_t		rc_rcv_trans_byte;
	kstat_named_t		rc_rcv_trans_pkt;
	kstat_named_t		rc_rcv_copy_byte;
	kstat_named_t		rc_rcv_copy_pkt;
	kstat_named_t		rc_rcv_alloc_fail;

	kstat_named_t		rc_rcq_err;	/* fail in rcq handler */

	kstat_named_t		rc_rwqe_short;	/* short rwqe */

	kstat_named_t		rc_xmt_bytes;
	/* pkt size <= state->id_rc_tx_copy_thresh */
	kstat_named_t		rc_xmt_small_pkt;
	kstat_named_t		rc_xmt_fragmented_pkt;
	/* fail in ibt_map_mem_iov() */
	kstat_named_t		rc_xmt_map_fail_pkt;
	/* succ in ibt_map_mem_iov() */
	kstat_named_t		rc_xmt_map_succ_pkt;

	kstat_named_t		rc_ace_not_found;	/* ace not found */
	/* no swqe even after recycle */
	kstat_named_t		rc_scq_no_swqe;
	/* no tx large buf even after recycle */
	kstat_named_t		rc_scq_no_largebuf;

	/* short swqe in ibd_send() */
	kstat_named_t		rc_swqe_short;
	/* call mac_tx_update() when there are enough swqes */
	kstat_named_t		rc_swqe_mac_update;
	/* short large buf in ibd_send() */
	kstat_named_t		rc_xmt_buf_short;
	/* call mac_tx_update() when there are enough Tx large buffers */
	kstat_named_t		rc_xmt_buf_mac_update;

	kstat_named_t		rc_conn_succ;	/* # of successful connects */
	kstat_named_t		rc_conn_fail;	/* # of failed connects */
	/* ace->ac_chan == NULL for unicast packet */
	kstat_named_t		rc_null_conn;
	/* not in active established state */
	kstat_named_t		rc_no_estab_conn;

	kstat_named_t		rc_act_close;	/* call ibd_rc_act_close() */
	kstat_named_t		rc_pas_close;	/* call ibd_rc_pas_close() */
	kstat_named_t		rc_delay_ace_recycle;
	kstat_named_t		rc_act_close_simultaneous;

	kstat_named_t		rc_reset_cnt;	/* # of RC channel resets */
	kstat_named_t		rc_timeout_act;
	kstat_named_t		rc_timeout_pas;
typedef struct ibd_rc_chan_list_s {
	/* This mutex protects chan_list and ibd_rc_chan_t.next */
	kmutex_t		chan_list_mutex;
	struct ibd_rc_chan_s	*chan_list;
} ibd_rc_chan_list_t;

typedef struct ibd_rc_tx_largebuf_s {
	struct ibd_rc_tx_largebuf_s	*lb_next;
} ibd_rc_tx_largebuf_t;
/*
 * Pre-registered copybuf used for send and receive
 */
typedef struct ibd_copybuf_s {

typedef struct ibd_wqe_s {
	struct ibd_wqe_s	*w_next;
	ibd_copybuf_t		w_copybuf;
typedef struct ibd_swqe_s {
	ibd_wqe_t		w_ibd_swqe;
	ibd_wqe_buftype_t	w_buftype;
	ibd_ace_t		*w_ahandle;
	ibt_mi_hdl_t		w_mi_hdl;
	ibt_wr_ds_t		w_sgl[IBD_MAX_SQSEG];
	ibd_rc_tx_largebuf_t	*w_rc_tx_largebuf;

#define	swqe_next		w_ibd_swqe.w_next
#define	swqe_copybuf		w_ibd_swqe.w_copybuf
#define	swqe_im_mblk		w_ibd_swqe.im_mblk
#define	SWQE_TO_WQE(swqe)	(ibd_wqe_t *)&((swqe)->w_ibd_swqe)
#define	WQE_TO_SWQE(wqe)	(ibd_swqe_t *)wqe
typedef struct ibd_rwqe_s {
	ibd_wqe_t		w_ibd_rwqe;
	struct ibd_state_s	*w_state;
	boolean_t		w_freeing_wqe;
	struct ibd_rc_chan_s	*w_chan;

#define	rwqe_next		w_ibd_rwqe.w_next
#define	rwqe_copybuf		w_ibd_rwqe.w_copybuf
#define	rwqe_im_mblk		w_ibd_rwqe.im_mblk
#define	RWQE_TO_WQE(rwqe)	(ibd_wqe_t *)&((rwqe)->w_ibd_rwqe)
#define	WQE_TO_RWQE(wqe)	(ibd_rwqe_t *)wqe
typedef struct ibd_list_s {
		boolean_t	pending_sends;
		uint32_t	bufs_outstanding;

#define	dl_pending_sends	ustat.pending_sends
#define	dl_bufs_outstanding	ustat.bufs_outstanding
/*
 * Under normal circumstances we should never need to use any buffer
 * that's larger than the MTU. Unfortunately, IB HCAs have limitations
 * on the length of the SGL that are much smaller than those for regular
 * ethernet NICs. Since the network layer doesn't care to limit the
 * number of mblk fragments in any send mp chain, we end up having to
 * use these larger-than-MTU sized (larger than id_tx_buf_sz actually)
 * buffers occasionally.
 */
typedef struct ibd_lsobuf_s {
	struct ibd_lsobuf_s	*lb_next;

typedef struct ibd_lsobkt_s {
	ibd_lsobuf_t		*bkt_bufl;
	ibd_lsobuf_t		*bkt_free_head;
	ibt_mr_hdl_t		bkt_mr_hdl;
	ibt_mr_desc_t		bkt_mr_desc;
#define	IBD_PORT_DRIVER		0x1
#define	IBD_PARTITION_OBJ	0x2

/*
 * Posting to a single software rx post queue is contentious,
 * so break it out into an array of (multiple) queues.
 *
 * Try to ensure that the rx_queue structs fall in different cache lines
 * by using a filler.
 * Note: RX_QUEUE_CACHE_LINE needs to change if the struct changes.
 */
#define	RX_QUEUE_CACHE_LINE \
	(64 - (sizeof (kmutex_t) + sizeof (ibd_wqe_t *) + sizeof (uint_t)))
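/*
 * A hedged way to keep the padding arithmetic honest would be a
 * compile-time check such as the following (illustrative only; it is not
 * part of this header and assumes the padded struct is meant to occupy
 * exactly one 64-byte cache line on the target platform):
 *
 *	CTASSERT(sizeof (ibd_rx_queue_t) == 64);
 */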
typedef struct ibd_rx_queue_s {
	kmutex_t		rx_post_lock;
	uint8_t			rx_pad[RX_QUEUE_CACHE_LINE];
/*
 * This structure maintains information per port per HCA
 * (per network interface).
 */
typedef struct ibd_state_s {
	ibt_clnt_hdl_t		id_ibt_hdl;
	ibt_hca_hdl_t		id_hca_hdl;
	ibt_pd_hdl_t		id_pd_hdl;
	kmem_cache_t		*id_req_kmc;

	ibd_list_t		id_tx_rel_list;

	uint32_t		id_max_sqseg;
	uint32_t		id_max_sqseg_hiwm;
	ibd_list_t		id_tx_list;
	ddi_softintr_t		id_tx;
	uint32_t		id_tx_sends;

	kmutex_t		id_txpost_lock;
	ibd_swqe_t		*id_tx_head;
	ibd_swqe_t		*id_tx_tail;

	ibd_swqe_t		*id_tx_wqes;
	ibt_mr_hdl_t		id_tx_mr_hdl;
	ibt_mr_desc_t		id_tx_mr_desc;

	kmutex_t		id_lso_lock;
	ibd_lsobkt_t		*id_lso;

	kmutex_t		id_scq_poll_lock;
	int			id_scq_poll_busy;

	ibt_cq_hdl_t		id_scq_hdl;
	uint32_t		id_txwcs_size;

	ibd_rx_queue_t		*id_rx_queues;
	int			id_rx_post_queue_index;
	uint32_t		id_rx_post_active;

	ibd_rwqe_t		*id_rx_wqes;
	ibt_mr_hdl_t		id_rx_mr_hdl;
	ibt_mr_desc_t		id_rx_mr_desc;

	/*
	 * Number of "receive WQE" elements that will be allocated and used
	 * by ibd. This parameter is limited by the maximum channel size of
	 * the HCA. Each buffer in the receive wqe will be of MTU size.
	 */
	uint32_t		id_ud_num_rwqe;
	ibd_list_t		id_rx_list;
	ddi_softintr_t		id_rx;
	uint32_t		id_rx_bufs_outstanding_limit;
	uint32_t		id_rx_allocb;
	uint32_t		id_rx_allocb_failed;
	ibd_list_t		id_rx_free_list;

	kmutex_t		id_rcq_poll_lock;
	int			id_rcq_poll_busy;
	uint32_t		id_rxwcs_size;
	ibt_cq_hdl_t		id_rcq_hdl;

	ibt_channel_hdl_t	id_chnl_hdl;

	ibt_mcg_info_t		*id_mcinfo;
	mac_resource_handle_t	id_rh;

	ipoib_mac_t		id_macaddr;
	ipoib_mac_t		id_bcaddr;

	kmutex_t		id_acache_req_lock;
	kcondvar_t		id_acache_req_cv;
	struct list		id_req_list;
	kt_did_t		id_async_thrid;

	kmutex_t		id_ac_mutex;
	ibd_ace_t		*id_ac_hot_ace;
	struct list		id_ah_active;
	struct list		id_ah_free;
	ipoib_mac_t		id_ah_addr;
	uint64_t		id_ah_error;
	ibd_ace_t		*id_ac_list;
	mod_hash_t		*id_ah_active_hash;

	kmutex_t		id_mc_mutex;
	struct list		id_mc_full;
	struct list		id_mc_non;

	kmutex_t		id_trap_lock;
	kcondvar_t		id_trap_cv;
	boolean_t		id_trap_stop;
	uint32_t		id_trap_inprog;

	kmutex_t		id_sched_lock;
	int			id_sched_lso_cnt;

	kmutex_t		id_link_mutex;
	link_state_t		id_link_state;
	uint64_t		id_link_speed;

	uint64_t		id_num_intrs;
	uint64_t		id_tx_short;
	/*
	 * Number of "send WQE" elements that will be allocated and used by
	 * ibd. When tuning this parameter, the size of the pre-allocated,
	 * pre-mapped copy buffer in each of these send wqes must be taken
	 * into account. This copy buffer size is determined by the value of
	 * IBD_TX_BUF_SZ (this is currently set to the same value as
	 * ibd_tx_copy_thresh, but may be changed independently if needed).
	 */
	uint32_t		id_ud_num_swqe;

	uint64_t		id_xmt_bytes;
	uint64_t		id_rcv_bytes;
	uint64_t		id_multi_xmt;
	uint64_t		id_multi_rcv;

	uint32_t		id_hwcksum_capab;
	boolean_t		id_lso_policy;
	boolean_t		id_lso_capable;
	uint_t			id_lso_maxlen;
	int			id_hca_res_lkey_capab;
	ibt_lkey_t		id_res_lkey;

	boolean_t		id_bgroup_created;
	kmutex_t		id_macst_lock;
	kcondvar_t		id_macst_cv;
	uint32_t		id_mac_state;
	/* For Reliable Connected Mode */
	boolean_t		id_enable_rc;
	boolean_t		rc_enable_srq;

	uint32_t		rc_tx_max_sqseg;
	/*
	 * In IPoIB over Reliable Connected mode, the mac address has an
	 * "IBD_MAC_ADDR_RC" prefix added to it. But for the loopback filter
	 * in ibd_process_rx(), the input mac address should not include the
	 * "IBD_MAC_ADDR_RC" prefix.
	 *
	 * So, we introduce rc_macaddr_loopback for the loopback filter in
	 * IPoIB over Reliable Connected mode.
	 *
	 * rc_macaddr_loopback = id_macaddr without the "IBD_MAC_ADDR_RC"
	 * prefix.
	 */
	ipoib_mac_t		rc_macaddr_loopback;

	ibt_srv_hdl_t		rc_listen_hdl;
	ibt_sbind_hdl_t		rc_listen_bind;
	ibt_srv_hdl_t		rc_listen_hdl_OFED_interop;
	ibt_sbind_hdl_t		rc_listen_bind_OFED_interop;

	ibd_rc_chan_list_t	rc_pass_chan_list;
	/* obsolete active channel list */
	ibd_rc_chan_list_t	rc_obs_act_chan_list;

	kmutex_t		rc_ace_recycle_lock;
	ibd_ace_t		*rc_ace_recycle;

	/*
	 * This mutex protects rc_tx_largebuf_free_head, rc_tx_largebuf_nfree
	 * and ibd_rc_tx_largebuf_t->lb_next
	 */
	kmutex_t		rc_tx_large_bufs_lock;
	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_free_head;
	uint_t			rc_tx_largebuf_nfree;
	/* The chunk of whole Tx large buffers */
	uint8_t			*rc_tx_mr_bufs;
	ibt_mr_hdl_t		rc_tx_mr_hdl;
	ibt_mr_desc_t		rc_tx_mr_desc;
	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_desc_base;	/* base addr */

	boolean_t		rc_enable_iov_map;
	uint_t			rc_max_sqseg_hiwm;

	uint32_t		rc_srq_size;
	ibt_srq_hdl_t		rc_srq_hdl;
	ibd_list_t		rc_srq_rwqe_list;
	ibd_list_t		rc_srq_free_list;
	ibd_rwqe_t		*rc_srq_rwqes;
	uint8_t			*rc_srq_rx_bufs;
	ibt_mr_hdl_t		rc_srq_rx_mr_hdl;
	ibt_mr_desc_t		rc_srq_rx_mr_desc;

	/* For chained receive */
	mblk_t			*rc_rx_mp_tail;
	uint32_t		rc_rx_mp_len;

	uint32_t		rc_num_tx_chan;
	uint32_t		rc_num_rx_chan;

	/* Protect rc_timeout_start and rc_timeout */
	kmutex_t		rc_timeout_lock;
	boolean_t		rc_timeout_start;
	timeout_id_t		rc_timeout;
	/* Counters for RC mode */
	/*
	 * # of received packets that are directly transferred to GLD
	 * without copying
	 */
	uint64_t		rc_rcv_trans_byte;
	uint64_t		rc_rcv_trans_pkt;
	/*
	 * # of received packets for which we allocate new buffers, copy
	 * their content into the new buffers, then transfer them to GLD
	 */
	uint64_t		rc_rcv_copy_byte;
	uint64_t		rc_rcv_copy_pkt;
	uint64_t		rc_rcv_alloc_fail;

	uint64_t		rc_rwqe_short;	/* short rwqe */

	/* wc->wc_status != IBT_WC_SUCCESS */
	uint64_t		rc_rcq_err;

	uint64_t		rc_xmt_bytes;

	/* pkt size <= ibd_rc_tx_copy_thresh */
	uint64_t		rc_xmt_small_pkt;
	uint64_t		rc_xmt_fragmented_pkt;
	/* fail in ibt_map_mem_iov() */
	uint64_t		rc_xmt_map_fail_pkt;
	/* succ in ibt_map_mem_iov() */
	uint64_t		rc_xmt_map_succ_pkt;

	uint64_t		rc_ace_not_found;

	uint64_t		rc_xmt_drop_too_long_pkt;
	uint64_t		rc_xmt_icmp_too_long_pkt;
	uint64_t		rc_xmt_reenter_too_long_pkt;

	/* short swqe in ibd_send() */
	uint64_t		rc_swqe_short;
	/* call mac_tx_update() when there are enough swqes */
	uint64_t		rc_swqe_mac_update;
	/* short tx large copy buf in ibd_send() */
	uint64_t		rc_xmt_buf_short;
	/* call mac_tx_update() when there are enough Tx copy bufs */
	uint64_t		rc_xmt_buf_mac_update;

	/* no swqe even after calling the swqe recycle function */
	uint64_t		rc_scq_no_swqe;
	/* no large Tx buf even after calling the swqe recycle function */
	uint64_t		rc_scq_no_largebuf;

	/* Connection setup and close */
	uint64_t		rc_conn_succ;	/* # of successful connects */
	uint64_t		rc_conn_fail;	/* # of failed connects */
	/* ace->ac_chan == NULL for unicast packet */
	uint64_t		rc_null_conn;
	/* not in active established state */
	uint64_t		rc_no_estab_conn;

	uint64_t		rc_act_close;	/* call ibd_rc_act_close() */
	uint64_t		rc_pas_close;	/* call ibd_rc_pas_close() */
	uint64_t		rc_delay_ace_recycle;
	uint64_t		rc_act_close_simultaneous;
	/* failed to close a channel because someone else is still using it */
	uint64_t		rc_act_close_not_clean;
	/* the RCQ is being invoked while the RC channel is being closed */
	uint64_t		rc_pas_close_rcq_invoking;

	/* counter of RC channel resets */
	uint64_t		rc_reset_cnt;

	uint64_t		rc_timeout_act;
	uint64_t		rc_timeout_pas;

	/*
	 * failed to stop this port because this port is connecting to a
	 * remote peer
	 */
	uint64_t		rc_stop_connect;

	ib_guid_t		id_hca_guid;
	ib_guid_t		id_port_guid;
	datalink_id_t		id_dlinkid;
	datalink_id_t		id_plinkid;

	struct ibd_state_s	*id_next;
	boolean_t		id_force_create;
	boolean_t		id_bgroup_present;
	uint_t			id_hca_max_chan_sz;
	/*
	 * id_ud_tx_copy_thresh
	 * This sets the threshold at which ibd will attempt to do a bcopy
	 * of the outgoing data into a pre-mapped buffer. The IPoIB driver's
	 * send behavior is restricted by various parameters, so setting of
	 * this value must be made after careful considerations only. For
	 * instance, IB HCAs currently impose a relatively small limit
	 * (when compared to ethernet NICs) on the length of the SGL for
	 * transmit. On the other hand, the ip stack could send down mp
	 * chains that are quite long when LSO is enabled.
	 *
	 * id_num_lso_bufs
	 * Number of "larger-than-MTU" copy buffers to use for cases when the
	 * outgoing mblk chain is too fragmented to be used with
	 * ibt_map_mem_iov() and too large to be used with regular MTU-sized
	 * copy buffers. It is not recommended to tune this variable without
	 * understanding the application environment and/or memory resources.
	 * The size of each of these lso buffers is determined by the value of
	 * IBD_LSO_BUFSZ.
	 *
	 * id_num_ah
	 * Number of AH cache entries to allocate
	 *
	 * id_hash_size
	 * Hash table size for the active AH list
	 */
	uint_t			id_ud_tx_copy_thresh;
	uint_t			id_num_lso_bufs;
	uint_t			id_num_ah;
	uint_t			id_hash_size;

	boolean_t		id_create_broadcast_group;

	boolean_t		id_allow_coalesce_comp_tuning;
	uint_t			id_ud_rx_comp_count;
	uint_t			id_ud_rx_comp_usec;
	uint_t			id_ud_tx_comp_count;
	uint_t			id_ud_tx_comp_usec;

	/* RC Mode Tunables */
	uint_t			id_rc_rx_comp_count;
	uint_t			id_rc_rx_comp_usec;
	uint_t			id_rc_tx_comp_count;
	uint_t			id_rc_tx_comp_usec;
	/*
	 * id_rc_tx_copy_thresh
	 * This sets the threshold at which ibd will attempt to do a bcopy
	 * of the outgoing data into a pre-mapped buffer.
	 *
	 * id_rc_rx_copy_thresh
	 * If (the size of incoming buffer <= id_rc_rx_copy_thresh), ibd
	 * will attempt to allocate a buffer and do a bcopy of the incoming
	 * data into the allocated buffer.
	 *
	 * id_rc_rx_rwqe_thresh
	 * If (the number of available rwqe < ibd_rc_rx_rwqe_thresh), ibd
	 * will attempt to allocate a buffer and do a bcopy of the incoming
	 * data into the allocated buffer.
	 *
	 * id_rc_num_swqe
	 * 1) Send CQ size = ibd_rc_num_swqe
	 * 2) The send queue size = ibd_rc_num_swqe - 1
	 * 3) Number of pre-allocated Tx buffers for ibt_post_send() =
	 * ibd_rc_num_swqe - 1.
	 *
	 * id_rc_num_rwqe
	 * 1) For non-SRQ, we pre-post id_rc_num_rwqe number of WRs
	 * via ibt_post_receive() for the receive queue of each RC channel.
	 * 2) For SRQ and non-SRQ, receive CQ size = id_rc_num_rwqe
	 *
	 * id_rc_num_srq
	 * If using SRQ, we allocate id_rc_num_srq number of buffers (the
	 * size of each buffer is equal to the RC mtu) and post them via
	 * ibt_post_srq().
	 *
	 * id_rc_num_srq should not be larger than id_rc_num_rwqe,
	 * otherwise it will cause a bug with the following warnings:
	 * NOTICE: hermon0: Device Error: EQE cq overrun or protection error
	 * NOTICE: hermon0: Device Error: EQE local work queue catastrophic
	 * error
	 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff
	 * catastrophic channel error
	 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff
	 * completion queue error
	 */
	uint_t			id_rc_tx_copy_thresh;
	uint_t			id_rc_rx_copy_thresh;
	uint_t			id_rc_rx_rwqe_thresh;
	uint_t			id_rc_num_swqe;
	uint_t			id_rc_num_rwqe;
	uint_t			id_rc_num_srq;
} ibd_state_t;
/*
 * Structures to track global IBTF data, data that is shared
 * among the IBD device instances. This includes the one ibt_hdl
 * and the list of service registrations.
 */
typedef struct ibd_service_s {
	struct ibd_service_s	*is_link;
	ibt_srv_hdl_t		is_srv_hdl;

typedef struct ibd_global_state_s {
	ibt_clnt_hdl_t		ig_ibt_hdl;
	uint_t			ig_ibt_hdl_ref_cnt;
	ibd_service_t		*ig_service_list;
} ibd_global_state_t;
typedef struct ibd_rc_msg_hello_s {
	uint32_t	reserved_qpn;
} ibd_rc_msg_hello_t;
typedef struct ibd_rc_chan_s {
	struct ibd_rc_chan_s	*next;
	/* channel hdl that we'll be using for Reliable Connected Mode */
	ibt_channel_hdl_t	chan_hdl;
	struct ibd_state_s	*state;
	ibd_rc_chan_state_t	chan_state;

	ibd_list_t		tx_wqe_list;	/* free wqe list */
	ibd_list_t		tx_rel_list;	/* for swqe recycle */

	ibd_swqe_t		*tx_wqes;

	/* start address of Tx Buffers */
	uint8_t			*tx_mr_bufs;
	ibt_mr_hdl_t		tx_mr_hdl;
	ibt_mr_desc_t		tx_mr_desc;

	ibt_cq_hdl_t		scq_hdl;	/* Tx completion queue */
	ibt_wc_t		tx_wc[IBD_RC_MAX_CQ_WC];
	ddi_softintr_t		scq_softintr;

	/* For chained send */
	kmutex_t		tx_post_lock;
	ibd_swqe_t		*tx_head;
	ibd_swqe_t		*tx_tail;

	/* For tx buffer recycle */
	kmutex_t		tx_poll_lock;

	ibd_list_t		rx_wqe_list;	/* used by ibt_post_recv */
	ibd_list_t		rx_free_list;	/* free rwqe list */

	ibt_cq_hdl_t		rcq_hdl;	/* Rx completion queue */
	ibt_wc_t		rx_wc[IBD_RC_MAX_CQ_WC];

	ibd_rwqe_t		*rx_rwqes;	/* the chunk of whole rwqes */
	uint8_t			*rx_bufs;	/* the chunk of whole Rx bufs */
	ibt_mr_hdl_t		rx_mr_hdl;	/* ibt_mr_hdl_t for rx_bufs */
	ibt_mr_desc_t		rx_mr_desc;	/* ibt_mr_desc_t for rx_bufs */

	/* For chained receive */

	/*
	 * We need two channels for each connection.
	 * One channel for Tx; another channel for Rx.
	 * If "is_tx_chan == B_TRUE", this is a Tx channel.
	 */
	boolean_t		is_tx_chan;

	/*
	 * For the connection reaper routine ibd_rc_conn_timeout_call().
	 * "is_used == B_FALSE" indicates this RC channel has not been used
	 * for a long (= ibd_rc_conn_timeout) time.
	 */

	/*
	 * When closing this channel, we need to make sure
	 * "chan->rcq_invoking == 0".
	 */
	uint32_t		rcq_invoking;
} ibd_rc_chan_t;
/*
 * The following functions are defined in "ibd.c".
 * They are also used by "ibd_cm.c"
 */
void ibd_print_warn(ibd_state_t *, char *, ...);
void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *);
ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
/*
 * The following functions are defined in "ibd_cm.c".
 * They are also used in "ibd.c".
 */
void ibd_async_rc_process_too_big(ibd_state_t *, ibd_req_t *);
void ibd_async_rc_close_act_chan(ibd_state_t *, ibd_req_t *);
void ibd_async_rc_recycle_ace(ibd_state_t *, ibd_req_t *);

/* Connection Setup/Close Functions */
ibt_status_t ibd_rc_listen(ibd_state_t *);
void ibd_rc_stop_listen(ibd_state_t *);
ibt_status_t ibd_rc_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *,
    uint64_t);
void ibd_rc_try_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *);
void ibd_rc_signal_act_close(ibd_state_t *, ibd_ace_t *);
void ibd_rc_signal_ace_recycle(ibd_state_t *, ibd_ace_t *);
int ibd_rc_pas_close(ibd_rc_chan_t *, boolean_t, boolean_t);
void ibd_rc_close_all_chan(ibd_state_t *);
void ibd_rc_conn_timeout_call(void *carg);

/* Receive Functions */
int ibd_rc_init_srq_list(ibd_state_t *);
void ibd_rc_fini_srq_list(ibd_state_t *);
int ibd_rc_repost_srq_free_list(ibd_state_t *);

/* Send Functions */
int ibd_rc_init_tx_largebuf_list(ibd_state_t *);
void ibd_rc_fini_tx_largebuf_list(ibd_state_t *);
ibd_swqe_t *ibd_rc_acquire_swqes(ibd_rc_chan_t *);
void ibd_rc_post_send(ibd_rc_chan_t *, ibd_swqe_t *);
void ibd_rc_drain_scq(ibd_rc_chan_t *, ibt_cq_hdl_t);
void ibd_rc_tx_cleanup(ibd_swqe_t *);

void ibd_rc_get_conf(ibd_state_t *);
int ibd_rc_init_stats(ibd_state_t *);
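/*
 * Illustrative RC connection life cycle using the interfaces above (a
 * sketch only; the exact ordering, arguments, error handling and the local
 * variable names state/ace/path are assumptions, with the authoritative
 * flow living in ibd.c and ibd_cm.c):
 *
 *	(void) ibd_rc_listen(state);		// passive side: start listening
 *	...
 *	ibd_rc_try_connect(state, ace, &path);	// active side: set up ace->ac_chan
 *	...
 *	ibd_rc_signal_act_close(state, ace);	// tear down one active channel
 *	ibd_rc_close_all_chan(state);		// and/or close everything
 *	ibd_rc_stop_listen(state);		// stop the passive listener
 */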
#endif /* _KERNEL && !_BOOT */

#endif /* _SYS_IB_CLIENTS_IBD_H */