/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2017, Joyent, Inc.
 */
#include <sys/types.h>
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/pool_pset.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_impl.h>
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>

#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_flow_impl.h>
#include <sys/mac_stat.h>
static void mac_srs_soft_rings_signal(mac_soft_ring_set_t *, uint_t);
static void mac_srs_update_fanout_list(mac_soft_ring_set_t *);
static void mac_srs_poll_unbind(mac_soft_ring_set_t *);
static void mac_srs_worker_unbind(mac_soft_ring_set_t *);
static void mac_srs_soft_rings_quiesce(mac_soft_ring_set_t *, uint_t);

static int mac_srs_cpu_setup(cpu_setup_t, int, void *);
static void mac_srs_worker_bind(mac_soft_ring_set_t *, processorid_t);
static void mac_srs_poll_bind(mac_soft_ring_set_t *, processorid_t);
static void mac_srs_threads_unbind(mac_soft_ring_set_t *);
static void mac_srs_add_glist(mac_soft_ring_set_t *);
static void mac_srs_remove_glist(mac_soft_ring_set_t *);
static void mac_srs_fanout_list_free(mac_soft_ring_set_t *);
static void mac_soft_ring_remove(mac_soft_ring_set_t *, mac_soft_ring_t *);

static int mac_compute_soft_ring_count(flow_entry_t *, int, int);
static void mac_walk_srs_and_bind(int);
static void mac_walk_srs_and_unbind(int);

extern boolean_t mac_latency_optimize;

static kmem_cache_t *mac_srs_cache;
kmem_cache_t *mac_soft_ring_cache;
/*
 * The duration in msec we wait before signalling the soft ring
 * worker thread in case packets get queued.
 */
uint32_t mac_soft_ring_worker_wait = 0;
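/*
 * With the default of 0 the worker is signalled as soon as packets
 * are queued; a small non-zero value would instead batch packets
 * before the worker runs, trading latency for fewer wakeups.
 */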
/*
 * A global tunable for turning polling on/off. By default, dynamic
 * polling is always on and is always very beneficial. It should be
 * turned off with absolute care and for the rare workload (very
 * low latency sensitive traffic).
 */
int mac_poll_enable = B_TRUE;
/*
 * Need to set mac_soft_ring_max_q_cnt based on bandwidth and perhaps latency.
 * Large values could end up consuming a lot of system memory and cause
 * system hang.
 */
int mac_soft_ring_max_q_cnt = 1024;
int mac_soft_ring_min_q_cnt = 256;
int mac_soft_ring_poll_thres = 16;

boolean_t mac_tx_serialize = B_FALSE;
/*
 * mac_tx_srs_hiwat is the queue depth threshold at which callers of
 * mac_tx() will be notified of flow control condition.
 *
 * TCP does not honour flow control condition sent up by mac_tx().
 * Thus provision is made for TCP to allow more packets to be queued
 * in SRS up to a maximum of mac_tx_srs_max_q_cnt.
 *
 * Note that mac_tx_srs_hiwat is always less than
 * mac_tx_srs_max_q_cnt.
 */
uint32_t mac_tx_srs_max_q_cnt = 100000;
uint32_t mac_tx_srs_hiwat = 1000;
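/*
 * E.g. with the defaults above, a mac_tx() caller sees flow control
 * once 1000 packets are queued, while TCP is allowed to keep
 * queueing up to the 100000-packet ceiling.
 */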
/*
 * mac_rx_soft_ring_count, mac_soft_ring_10gig_count:
 *
 * Global tunables that determine the number of soft rings to be used for
 * fanning out incoming traffic on a link. These counts are used only
 * when no explicit set of CPUs was assigned to the data-links.
 *
 * The mac_rx_soft_ring_count tunable will come into effect only if
 * mac_soft_ring_enable is set. mac_soft_ring_enable is turned on by
 * default only for sun4v platforms.
 *
 * mac_rx_soft_ring_10gig_count will come into effect if you are running on a
 * 10Gbps link and is not dependent upon mac_soft_ring_enable.
 *
 * The number of soft rings for fanout for a link or a flow is determined
 * by the mac_compute_soft_ring_count() routine. This routine will take into
 * account mac_soft_ring_enable, mac_rx_soft_ring_count and
 * mac_rx_soft_ring_10gig_count to determine the soft ring count for a link.
 *
 * If a bandwidth is specified, the determination of the number of soft
 * rings is based on specified bandwidth, CPU speed and number of CPUs in
 * the system.
 */
uint_t mac_rx_soft_ring_count = 8;
uint_t mac_rx_soft_ring_10gig_count = 8;
/*
 * Every Tx and Rx mac_soft_ring_set_t (mac_srs) created gets added
 * to mac_srs_g_list and mac_srs_g_lock protects mac_srs_g_list. The
 * list is used to walk the list of all MAC threads when a CPU is
 * coming online or going offline.
 */
static mac_soft_ring_set_t *mac_srs_g_list = NULL;
static krwlock_t mac_srs_g_lock;
/*
 * Whether the SRS threads should be bound, or not.
 */
boolean_t mac_srs_thread_bind = B_TRUE;
/*
 * Whether Rx/Tx interrupts should be re-targeted. Disabled by default.
 * The dladm command would override this.
 */
boolean_t mac_tx_intr_retarget = B_FALSE;
boolean_t mac_rx_intr_retarget = B_FALSE;
/*
 * If cpu bindings are specified by the user, then the Tx SRS and its soft
 * rings should also be bound to the CPUs specified by the user. The
 * CPUs for Tx bindings are at the end of the cpu list provided by
 * the user. If enough CPUs are not available (for Tx and Rx
 * SRSes), then the CPUs are shared by both Tx and Rx SRSes.
 */
#define	BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp) {			\
	processorid_t cpuid;						\
	int i;								\
	mac_soft_ring_t *softring;					\
	mac_cpus_t *srs_cpu;						\
									\
	srs_cpu = &mac_tx_srs->srs_cpu;					\
	cpuid = srs_cpu->mc_tx_fanout_cpus[0];				\
	mac_srs_worker_bind(mac_tx_srs, cpuid);				\
	if (MAC_TX_SOFT_RINGS(mac_tx_srs)) {				\
		for (i = 0; i < mac_tx_srs->srs_tx_ring_count; i++) {	\
			cpuid = srs_cpu->mc_tx_fanout_cpus[i];		\
			softring = mac_tx_srs->srs_tx_soft_rings[i];	\
			(void) mac_soft_ring_bind(softring, cpuid);	\
		}							\
	}								\
}
/*
 * Re-targeting is allowed only for an exclusive group or for the primary.
 */
#define	RETARGETABLE_CLIENT(group, mcip)				\
	((((group) != NULL) &&						\
	    ((group)->mrg_state == MAC_GROUP_STATE_RESERVED)) ||	\
	    mac_is_primary_client(mcip))
#define	MAC_RING_RETARGETABLE(ring)					\
	(((ring) != NULL) &&						\
	    ((ring)->mr_info.mri_intr.mi_ddi_handle != NULL) &&		\
	    !((ring)->mr_info.mri_intr.mi_ddi_shared))
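/*
 * Both checks are applied before any interrupt is moved: the client
 * must satisfy RETARGETABLE_CLIENT() and the ring itself must be
 * MAC_RING_RETARGETABLE() (see mac_tx_cpu_init() and the
 * mac_*_srs_retarget_intr() routines below).
 */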
/* INIT and FINI ROUTINES */

void
mac_soft_ring_init(void)
{
	mac_soft_ring_cache = kmem_cache_create("mac_soft_ring_cache",
	    sizeof (mac_soft_ring_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

	mac_srs_cache = kmem_cache_create("mac_srs_cache",
	    sizeof (mac_soft_ring_set_t),
	    64, NULL, NULL, NULL, NULL, NULL, 0);

	rw_init(&mac_srs_g_lock, NULL, RW_DEFAULT, NULL);
	mutex_enter(&cpu_lock);
	register_cpu_setup_func(mac_srs_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
}
void
mac_soft_ring_finish(void)
{
	mutex_enter(&cpu_lock);
	unregister_cpu_setup_func(mac_srs_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
	rw_destroy(&mac_srs_g_lock);
	kmem_cache_destroy(mac_soft_ring_cache);
	kmem_cache_destroy(mac_srs_cache);
}
void
mac_srs_soft_rings_free(mac_soft_ring_set_t *mac_srs)
{
	mac_soft_ring_t	*softring, *next, *head;

	/*
	 * Synchronize with mac_walk_srs_bind/unbind which are callbacks from
	 * DR. The callbacks from DR are called with cpu_lock held, and hence
	 * can't wait to grab the mac perimeter. The soft ring list is hence
	 * protected for read access by srs_lock. Changing the soft ring list
	 * needs the mac perimeter and the srs_lock.
	 */
	mutex_enter(&mac_srs->srs_lock);

	head = mac_srs->srs_soft_ring_head;
	mac_srs->srs_soft_ring_head = NULL;
	mac_srs->srs_soft_ring_tail = NULL;
	mac_srs->srs_soft_ring_count = 0;

	mutex_exit(&mac_srs->srs_lock);

	for (softring = head; softring != NULL; softring = next) {
		next = softring->s_ring_next;
		mac_soft_ring_free(softring);
	}
}
static void
mac_srs_add_glist(mac_soft_ring_set_t *mac_srs)
{
	ASSERT(mac_srs->srs_next == NULL && mac_srs->srs_prev == NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));

	rw_enter(&mac_srs_g_lock, RW_WRITER);
	mutex_enter(&mac_srs->srs_lock);

	ASSERT((mac_srs->srs_state & SRS_IN_GLIST) == 0);

	if (mac_srs_g_list == NULL) {
		mac_srs_g_list = mac_srs;
	} else {
		mac_srs->srs_next = mac_srs_g_list;
		mac_srs_g_list->srs_prev = mac_srs;
		mac_srs->srs_prev = NULL;
		mac_srs_g_list = mac_srs;
	}

	mac_srs->srs_state |= SRS_IN_GLIST;

	mutex_exit(&mac_srs->srs_lock);
	rw_exit(&mac_srs_g_lock);
}
static void
mac_srs_remove_glist(mac_soft_ring_set_t *mac_srs)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));

	rw_enter(&mac_srs_g_lock, RW_WRITER);
	mutex_enter(&mac_srs->srs_lock);

	ASSERT((mac_srs->srs_state & SRS_IN_GLIST) != 0);

	if (mac_srs == mac_srs_g_list) {
		mac_srs_g_list = mac_srs->srs_next;
		if (mac_srs_g_list != NULL)
			mac_srs_g_list->srs_prev = NULL;
	} else {
		mac_srs->srs_prev->srs_next = mac_srs->srs_next;
		if (mac_srs->srs_next != NULL)
			mac_srs->srs_next->srs_prev = mac_srs->srs_prev;
	}

	mac_srs->srs_state &= ~SRS_IN_GLIST;

	mutex_exit(&mac_srs->srs_lock);
	rw_exit(&mac_srs_g_lock);
}
/* POLLING SETUP AND TEAR DOWN ROUTINES */

/*
 * mac_srs_client_poll_quiesce and mac_srs_client_poll_restart
 *
 * These routines are used to call back into the upper layer
 * (primarily TCP squeue) to stop polling the soft rings or
 * restart polling.
 */
void
mac_srs_client_poll_quiesce(mac_client_impl_t *mcip,
    mac_soft_ring_set_t *mac_srs)
{
	mac_soft_ring_t	*softring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) {
		ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS));
		return;
	}

	for (softring = mac_srs->srs_soft_ring_head;
	    softring != NULL; softring = softring->s_ring_next) {
		if ((softring->s_ring_type & ST_RING_TCP) &&
		    (softring->s_ring_rx_arg2 != NULL)) {
			mcip->mci_resource_quiesce(mcip->mci_resource_arg,
			    softring->s_ring_rx_arg2);
		}
	}
}
void
mac_srs_client_poll_restart(mac_client_impl_t *mcip,
    mac_soft_ring_set_t *mac_srs)
{
	mac_soft_ring_t	*softring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) {
		ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS));
		return;
	}

	for (softring = mac_srs->srs_soft_ring_head;
	    softring != NULL; softring = softring->s_ring_next) {
		if ((softring->s_ring_type & ST_RING_TCP) &&
		    (softring->s_ring_rx_arg2 != NULL)) {
			mcip->mci_resource_restart(mcip->mci_resource_arg,
			    softring->s_ring_rx_arg2);
		}
	}
}
/*
 * Register the given SRS and associated soft rings with the consumer and
 * enable the polling interface used by the consumer (i.e., IP) over this
 * SRS and its associated soft rings.
 */
void
mac_srs_client_poll_enable(mac_client_impl_t *mcip,
    mac_soft_ring_set_t *mac_srs)
{
	mac_rx_fifo_t mrf;
	mac_soft_ring_t	*softring;

	ASSERT(mac_srs->srs_mcip == mcip);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	if (!(mcip->mci_state_flags & MCIS_CLIENT_POLL_CAPABLE))
		return;

	bzero(&mrf, sizeof (mac_rx_fifo_t));
	mrf.mrf_type = MAC_RX_FIFO;

	/*
	 * A SRS is capable of acting as a soft ring for cases
	 * where no fanout is needed. This is the case for userland
	 * flows.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS)
		return;

	mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll;
	mrf.mrf_intr_enable = (mac_intr_enable_t)mac_soft_ring_intr_enable;
	mrf.mrf_intr_disable = (mac_intr_disable_t)mac_soft_ring_intr_disable;
	mac_srs->srs_type |= SRST_CLIENT_POLL_ENABLED;

	softring = mac_srs->srs_soft_ring_head;
	while (softring != NULL) {
		if (softring->s_ring_type & (ST_RING_TCP | ST_RING_UDP)) {
			/*
			 * TCP and UDP support DLS bypass. Squeue polling
			 * support implies DLS bypass since the squeue poll
			 * path does not have DLS processing.
			 */
			mac_soft_ring_dls_bypass(softring,
			    mcip->mci_direct_rx_fn, mcip->mci_direct_rx_arg);
		}

		/*
		 * Non-TCP protocols don't support squeues. Hence we don't
		 * make any ring addition callbacks for non-TCP rings.
		 */
		if (!(softring->s_ring_type & ST_RING_TCP)) {
			softring->s_ring_rx_arg2 = NULL;
			softring = softring->s_ring_next;
			continue;
		}

		mrf.mrf_rx_arg = softring;
		mrf.mrf_intr_handle = (mac_intr_handle_t)softring;
		mrf.mrf_cpu_id = softring->s_ring_cpuid;
		mrf.mrf_flow_priority = mac_srs->srs_pri;

		softring->s_ring_rx_arg2 = mcip->mci_resource_add(
		    mcip->mci_resource_arg, (mac_resource_t *)&mrf);

		softring = softring->s_ring_next;
	}
}
/*
 * Unregister the given SRS and associated soft rings with the consumer and
 * disable the polling interface used by the consumer (i.e., IP) over this
 * SRS and its associated soft rings.
 */
void
mac_srs_client_poll_disable(mac_client_impl_t *mcip,
    mac_soft_ring_set_t *mac_srs)
{
	mac_soft_ring_t	*softring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	/*
	 * A SRS is capable of acting as a soft ring for cases
	 * where no protocol fanout is needed. This is the case
	 * for userland flows. Nothing to do here.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS)
		return;

	mutex_enter(&mac_srs->srs_lock);
	if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) {
		ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS));
		mutex_exit(&mac_srs->srs_lock);
		return;
	}
	mac_srs->srs_type &= ~(SRST_CLIENT_POLL_ENABLED | SRST_DLS_BYPASS);
	mutex_exit(&mac_srs->srs_lock);

	/*
	 * DLS bypass is now disabled in the case of both TCP and UDP.
	 * Reset the soft ring callbacks to the standard 'mac_rx_deliver'
	 * callback. In addition, in the case of TCP, invoke IP's callback
	 * for ring removal.
	 */
	for (softring = mac_srs->srs_soft_ring_head;
	    softring != NULL; softring = softring->s_ring_next) {
		if (!(softring->s_ring_type & (ST_RING_UDP | ST_RING_TCP)))
			continue;

		if ((softring->s_ring_type & ST_RING_TCP) &&
		    softring->s_ring_rx_arg2 != NULL) {
			mcip->mci_resource_remove(mcip->mci_resource_arg,
			    softring->s_ring_rx_arg2);
		}

		mutex_enter(&softring->s_ring_lock);
		while (softring->s_ring_state & S_RING_PROC) {
			softring->s_ring_state |= S_RING_CLIENT_WAIT;
			cv_wait(&softring->s_ring_client_cv,
			    &softring->s_ring_lock);
		}
		softring->s_ring_state &= ~S_RING_CLIENT_WAIT;
		softring->s_ring_rx_arg2 = NULL;
		softring->s_ring_rx_func = mac_rx_deliver;
		softring->s_ring_rx_arg1 = mcip;
		mutex_exit(&softring->s_ring_lock);
	}
}
/*
 * Enable or disable poll capability of the SRS on the underlying Rx ring.
 *
 * There is a need to enable or disable the poll capability of an SRS over an
 * Rx ring depending on the number of mac clients sharing the ring and also
 * whether user flows are configured on it. However the poll state is actively
 * manipulated by the SRS worker and poll threads and uncoordinated changes by
 * yet another thread to the underlying capability can surprise them, leading
 * to assert failures. Instead we quiesce the SRS, make the changes and then
 * restart the SRS.
 */
static void
mac_srs_poll_state_change(mac_soft_ring_set_t *mac_srs,
    boolean_t turn_off_poll_capab, mac_rx_func_t rx_func)
{
	boolean_t	need_restart = B_FALSE;
	mac_srs_rx_t	*srs_rx = &mac_srs->srs_rx;
	mac_ring_t	*ring;

	if (!SRS_QUIESCED(mac_srs)) {
		mac_rx_srs_quiesce(mac_srs, SRS_QUIESCE);
		need_restart = B_TRUE;
	}

	ring = mac_srs->srs_ring;
	if ((ring != NULL) &&
	    (ring->mr_classify_type == MAC_HW_CLASSIFIER)) {
		if (turn_off_poll_capab)
			mac_srs->srs_state &= ~SRS_POLLING_CAPAB;
		else if (mac_poll_enable)
			mac_srs->srs_state |= SRS_POLLING_CAPAB;
	}
	srs_rx->sr_lower_proc = rx_func;

	if (need_restart)
		mac_rx_srs_restart(mac_srs);
}
/* CPU RECONFIGURATION AND FANOUT COMPUTATION ROUTINES */

/*
 * Return the next CPU to be used to bind a MAC kernel thread.
 * If a cpupart is specified, the cpu chosen must be from that
 * partition.
 */
static processorid_t
mac_next_bind_cpu(cpupart_t *cpupart)
{
	static cpu_t	*cp = NULL;
	cpu_t		*cp_start;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cp == NULL)
		cp = cpu_list;

	cp = cp->cpu_next_onln;
	cp_start = cp;

	do {
		if ((cpupart == NULL) || (cp->cpu_part == cpupart))
			return (cp->cpu_id);
	} while ((cp = cp->cpu_next_onln) != cp_start);

	return (-1);	/* No matching CPU found online */
}
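/*
 * Callers are expected to hold cpu_lock across successive calls so
 * that the static cursor advances consistently, e.g.:
 *
 *	mutex_enter(&cpu_lock);
 *	cpuid = mac_next_bind_cpu(cpupart);
 *	mutex_exit(&cpu_lock);
 *
 * as done in mac_flow_cpu_init() and mac_srs_fanout_init() below.
 */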
/* ARGSUSED */
static int
mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
	case CPU_ON:
	case CPU_CPUPART_IN:
		mac_walk_srs_and_bind(id);
		break;

	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		mac_walk_srs_and_unbind(id);
		break;

	default:
		break;
	}
	return (0);
}
/*
 * mac_compute_soft_ring_count():
 *
 * This routine computes the number of soft rings needed to handle incoming
 * load given a flow_entry.
 *
 * The routine does the following:
 * 1) soft rings will be created if mac_soft_ring_enable is set.
 * 2) If the underlying link is a 10Gbps link, then soft rings will be
 * created even if mac_soft_ring_enable is not set. The number of soft
 * rings, so created, will equal mac_rx_soft_ring_10gig_count.
 * 3) On a sun4v platform (i.e., mac_soft_ring_enable is set), 2 times the
 * mac_rx_soft_ring_10gig_count number of soft rings will be created for a
 * 10Gbps link.
 *
 * If a bandwidth limit is specified, the number that gets computed is
 * dependent upon CPU speed, the number of Rx rings configured, and
 * the bandwidth limit.
 * If more Rx rings are available, fewer soft rings are needed.
 *
 * mac_use_bw_heuristic is another "hidden" variable that can be used to
 * override the default use of soft ring count computation. Depending upon
 * the usefulness of it, mac_use_bw_heuristic can later be made into a
 * data-link property or removed altogether.
 *
 * TODO: Cleanup and tighten some of the assumptions.
 */
boolean_t mac_use_bw_heuristic = B_TRUE;
static int
mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
{
	uint64_t cpu_speed, bw = 0;
	int srings = 0;
	boolean_t bw_enabled = B_FALSE;

	ASSERT(!(flent->fe_type & FLOW_USER));
	if (flent->fe_resource_props.mrp_mask & MRP_MAXBW &&
	    mac_use_bw_heuristic) {
		/* bandwidth enabled */
		bw_enabled = B_TRUE;
		bw = flent->fe_resource_props.mrp_maxbw;
	}

	if (!bw_enabled) {
		/* No bandwidth enabled */
		if (mac_soft_ring_enable)
			srings = mac_rx_soft_ring_count;

		/* Is this a 10Gig link? */
		flent->fe_nic_speed = mac_client_stat_get(flent->fe_mcip,
		    MAC_STAT_IFSPEED);
		/* convert to Mbps */
		if (((flent->fe_nic_speed)/1000000) > 1000 &&
		    mac_rx_soft_ring_10gig_count > 0) {
			/* This is a 10Gig link */
			srings = mac_rx_soft_ring_10gig_count;
			/*
			 * Use 2 times mac_rx_soft_ring_10gig_count for
			 * sun4v systems.
			 */
			if (mac_soft_ring_enable)
				srings = srings * 2;
		}
	} else {
		/*
		 * Soft ring computation using CPU speed and specified
		 * bandwidth limit.
		 */
		/* Assumption: all CPUs have the same frequency */
		cpu_speed = (uint64_t)CPU->cpu_type_info.pi_clock;

		/* cpu_speed is in MHz; make bw in units of Mbps. */
		bw = bw/1000000;

		if (bw >= 1000) {
			/*
			 * bw is greater than or equal to 1Gbps.
			 * The number of soft rings required is a function
			 * of bandwidth and CPU speed. To keep this simple,
			 * let's use this rule: 1GHz CPU can handle 1Gbps.
			 * If bw is less than 1 Gbps, then there is no need
			 * for soft rings. Assumption is that CPU speeds
			 * (on modern systems) are at least 1GHz.
			 */
			srings = bw/cpu_speed;
			if (srings <= 1 && mac_soft_ring_enable) {
				/*
				 * Give at least 2 soft rings
				 * for sun4v systems.
				 */
				srings = 2;
			}
		}
	}

	/*
	 * If the flent has multiple Rx SRSs, then each SRS need not
	 * have that many soft rings on top of it. The number of
	 * soft rings for each Rx SRS is found by dividing srings by
	 * rx_srs_cnt.
	 */
	if (rx_srs_cnt > 1) {
		int remainder;

		remainder = srings%rx_srs_cnt;
		srings = srings/rx_srs_cnt;
		if (remainder != 0)
			srings++;
		/*
		 * Fanning out to 1 soft ring is not very useful.
		 * Set it to 0 as well, and mac_srs_fanout_init()
		 * will take care of creating a single soft ring
		 * for proto fanout.
		 */
		if (srings == 1)
			srings = 0;
	}
	/* Do some more massaging */
	srings = min(srings, maxcpus);
	srings = min(srings, MAX_SR_FANOUT);
	return (srings);
}
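/*
 * A worked example of the bandwidth path above: a 5 Gbps limit
 * (bw = 5000 after conversion to Mbps) on 1 GHz CPUs (cpu_speed =
 * 1000) yields srings = 5. With rx_srs_cnt = 2 that becomes 5/2
 * rounded up, i.e. 3 soft rings per Rx SRS, before the min() clamps
 * against maxcpus and MAX_SR_FANOUT are applied.
 */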
/*
 * mac_tx_cpu_init:
 * set up CPUs for Tx interrupt re-targeting and Tx worker
 * thread binding.
 */
static void
mac_tx_cpu_init(flow_entry_t *flent, mac_resource_props_t *mrp,
    cpupart_t *cpupart)
{
	mac_soft_ring_set_t *tx_srs = flent->fe_tx_srs;
	mac_srs_tx_t *srs_tx = &tx_srs->srs_tx;
	mac_cpus_t *srs_cpu = &tx_srs->srs_cpu;
	mac_soft_ring_t *sringp;
	mac_ring_t *ring;
	processorid_t worker_cpuid;
	boolean_t retargetable_client = B_FALSE;
	int i, j;

	if (RETARGETABLE_CLIENT((mac_group_t *)flent->fe_tx_ring_group,
	    flent->fe_mcip)) {
		retargetable_client = B_TRUE;
	}

	if (MAC_TX_SOFT_RINGS(tx_srs)) {
		if (mrp != NULL)
			j = mrp->mrp_ncpus - 1;
		for (i = 0; i < tx_srs->srs_tx_ring_count; i++) {
			if (mrp != NULL) {
				if (j < 0)
					j = mrp->mrp_ncpus - 1;
				worker_cpuid = mrp->mrp_cpu[j];
			} else {
				/*
				 * Bind interrupt to the next CPU available
				 * and leave the worker unbound.
				 */
				worker_cpuid = -1;
			}
			sringp = tx_srs->srs_tx_soft_rings[i];
			ring = (mac_ring_t *)sringp->s_ring_tx_arg2;
			srs_cpu->mc_tx_fanout_cpus[i] = worker_cpuid;
			if (MAC_RING_RETARGETABLE(ring) &&
			    retargetable_client) {
				mutex_enter(&cpu_lock);
				srs_cpu->mc_tx_intr_cpu[i] =
				    (mrp != NULL) ? mrp->mrp_cpu[j] :
				    (mac_tx_intr_retarget ?
				    mac_next_bind_cpu(cpupart) : -1);
				mutex_exit(&cpu_lock);
			} else {
				srs_cpu->mc_tx_intr_cpu[i] = -1;
			}
			if (mrp != NULL)
				j--;
		}
	} else {
		/* Tx mac_ring_handle_t is stored in st_arg2 */
		srs_cpu->mc_tx_fanout_cpus[0] =
		    (mrp != NULL) ? mrp->mrp_cpu[mrp->mrp_ncpus - 1] : -1;
		ring = (mac_ring_t *)srs_tx->st_arg2;
		if (MAC_RING_RETARGETABLE(ring) && retargetable_client) {
			mutex_enter(&cpu_lock);
			srs_cpu->mc_tx_intr_cpu[0] = (mrp != NULL) ?
			    mrp->mrp_cpu[mrp->mrp_ncpus - 1] :
			    (mac_tx_intr_retarget ?
			    mac_next_bind_cpu(cpupart) : -1);
			mutex_exit(&cpu_lock);
		} else {
			srs_cpu->mc_tx_intr_cpu[0] = -1;
		}
	}
}
/*
 * Assignment of user specified CPUs to a link.
 *
 * Minimum CPUs required to get an optimal assignment:
 * For each Rx SRS, at least two CPUs are needed if the mac_latency_optimize
 * flag is set -- one for polling, one for fanout soft ring.
 * If mac_latency_optimize is not set, then 3 CPUs are needed -- one
 * for polling, one for SRS worker thread and one for fanout soft ring.
 *
 * The number of CPUs needed for the Tx side is equal to the number of Tx
 * rings the link is using.
 *
 * mac_flow_user_cpu_init() categorizes the CPU assignment depending
 * upon the number of CPUs in 3 different buckets.
 *
 * In the first bucket, the most optimal case is handled. The user has
 * passed enough CPUs and every thread gets its own CPU.
 *
 * The second and third are the sub-optimal cases. Enough CPUs are not
 * available.
 *
 * The second bucket handles the case where at least one distinct CPU is
 * available for each of the Rx rings (Rx SRSes) and Tx rings (Tx
 * SRS or soft rings).
 *
 * In the third case (worst case scenario), the specified CPU count is less
 * than the Rx rings configured for the link. In this case, we round
 * robin the CPUs among the Rx SRSes and Tx SRS/soft rings.
 */
static void
mac_flow_user_cpu_init(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	mac_soft_ring_set_t *rx_srs, *tx_srs;
	int i, srs_cnt;
	mac_cpus_t *srs_cpu;
	int no_of_cpus, cpu_cnt;
	int rx_srs_cnt, reqd_rx_cpu_cnt;
	int fanout_cpu_cnt, reqd_tx_cpu_cnt;
	int reqd_poll_worker_cnt, fanout_cnt_per_srs;
	mac_resource_props_t *emrp = &flent->fe_effective_props;

	ASSERT(mrp->mrp_fanout_mode == MCM_CPUS);
	/*
	 * The check for nbc_ncpus to be within limits for
	 * the user specified case was done earlier and if
	 * not within limits, an error would have been
	 * returned to the user.
	 */
	ASSERT(mrp->mrp_ncpus > 0);

	no_of_cpus = mrp->mrp_ncpus;

	if (mrp->mrp_rx_intr_cpu != -1) {
		/*
		 * The interrupt has been re-targeted. The poll
		 * thread needs to be bound to the interrupt
		 * CPU.
		 *
		 * Find where in the list the intr
		 * CPU is and swap it with the first one.
		 * We will be using the first CPU in the
		 * list for poll.
		 */
		for (i = 0; i < no_of_cpus; i++) {
			if (mrp->mrp_cpu[i] == mrp->mrp_rx_intr_cpu)
				break;
		}
		mrp->mrp_cpu[i] = mrp->mrp_cpu[0];
		mrp->mrp_cpu[0] = mrp->mrp_rx_intr_cpu;
	}

	/*
	 * Requirements:
	 * The number of CPUs that each Rx ring needs is dependent
	 * upon the mac_latency_optimize flag.
	 * 1) If set, at least 2 CPUs are needed -- one for
	 * polling, one for fanout soft ring.
	 * 2) If not set, then at least 3 CPUs are needed -- one
	 * for polling, one for srs worker thread, and one for
	 * fanout soft ring.
	 */
	rx_srs_cnt = (flent->fe_rx_srs_cnt > 1) ?
	    (flent->fe_rx_srs_cnt - 1) : flent->fe_rx_srs_cnt;
	reqd_rx_cpu_cnt = mac_latency_optimize ?
	    (rx_srs_cnt * 2) : (rx_srs_cnt * 3);

	/* How many CPUs are needed for Tx side? */
	tx_srs = flent->fe_tx_srs;
	reqd_tx_cpu_cnt = MAC_TX_SOFT_RINGS(tx_srs) ?
	    tx_srs->srs_tx_ring_count : 1;

	/* CPUs needed for Rx SRSes poll and worker threads */
	reqd_poll_worker_cnt = mac_latency_optimize ?
	    rx_srs_cnt : rx_srs_cnt * 2;

	/* Has the user provided enough CPUs? */
	if (no_of_cpus >= (reqd_rx_cpu_cnt + reqd_tx_cpu_cnt)) {
		/*
		 * Best case scenario. There are enough CPUs. All
		 * Rx rings will get their own set of CPUs plus
		 * Tx soft rings will get their own.
		 */

		/*
		 * fanout_cpu_cnt is the number of CPUs available
		 * for Rx side fanout soft rings.
		 */
		fanout_cpu_cnt = no_of_cpus -
		    reqd_poll_worker_cnt - reqd_tx_cpu_cnt;

		/*
		 * Divide fanout_cpu_cnt by rx_srs_cnt to find
		 * out how many fanout soft rings each Rx SRS
		 * can have.
		 */
		fanout_cnt_per_srs = fanout_cpu_cnt/rx_srs_cnt;

		/* fanout_cnt_per_srs should not be > MAX_SR_FANOUT */
		fanout_cnt_per_srs = min(fanout_cnt_per_srs, MAX_SR_FANOUT);

		/* Do the assignment for the default Rx ring */
		cpu_cnt = 0;
		rx_srs = flent->fe_rx_srs[0];
		ASSERT(rx_srs->srs_ring == NULL);
		if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
			rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
		srs_cpu = &rx_srs->srs_cpu;
		srs_cpu->mc_ncpus = no_of_cpus;
		bcopy(mrp->mrp_cpu,
		    srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
		srs_cpu->mc_rx_fanout_cnt = fanout_cnt_per_srs;
		srs_cpu->mc_rx_pollid = mrp->mrp_cpu[cpu_cnt++];
		/* Retarget the interrupt to the same CPU as the poll */
		srs_cpu->mc_rx_intr_cpu = srs_cpu->mc_rx_pollid;
		srs_cpu->mc_rx_workerid = (mac_latency_optimize ?
		    srs_cpu->mc_rx_pollid : mrp->mrp_cpu[cpu_cnt++]);
		for (i = 0; i < fanout_cnt_per_srs; i++)
			srs_cpu->mc_rx_fanout_cpus[i] = mrp->mrp_cpu[cpu_cnt++];

		/* Do the assignment for h/w Rx SRSes */
		if (flent->fe_rx_srs_cnt > 1) {
			cpu_cnt = 0;
			for (srs_cnt = 1;
			    srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
				rx_srs = flent->fe_rx_srs[srs_cnt];
				ASSERT(rx_srs->srs_ring != NULL);
				if (rx_srs->srs_fanout_state ==
				    SRS_FANOUT_INIT) {
					rx_srs->srs_fanout_state =
					    SRS_FANOUT_REINIT;
				}
				srs_cpu = &rx_srs->srs_cpu;
				srs_cpu->mc_ncpus = no_of_cpus;
				bcopy(mrp->mrp_cpu, srs_cpu->mc_cpus,
				    sizeof (srs_cpu->mc_cpus));
				srs_cpu->mc_rx_fanout_cnt = fanout_cnt_per_srs;
				/* The first CPU in the list is the intr CPU */
				srs_cpu->mc_rx_pollid = mrp->mrp_cpu[cpu_cnt++];
				srs_cpu->mc_rx_intr_cpu = srs_cpu->mc_rx_pollid;
				srs_cpu->mc_rx_workerid =
				    (mac_latency_optimize ?
				    srs_cpu->mc_rx_pollid :
				    mrp->mrp_cpu[cpu_cnt++]);
				for (i = 0; i < fanout_cnt_per_srs; i++) {
					srs_cpu->mc_rx_fanout_cpus[i] =
					    mrp->mrp_cpu[cpu_cnt++];
				}
				ASSERT(cpu_cnt <= no_of_cpus);
			}
		}
		goto tx_cpu_init;
	}

	/*
	 * Sub-optimal case.
	 * We have the following information:
	 * no_of_cpus - no. of cpus that user passed.
	 * rx_srs_cnt - no. of rx rings.
	 * reqd_rx_cpu_cnt = mac_latency_optimize?rx_srs_cnt*2:rx_srs_cnt*3
	 * reqd_tx_cpu_cnt - no. of cpus reqd. for Tx side.
	 * reqd_poll_worker_cnt = mac_latency_optimize?rx_srs_cnt:rx_srs_cnt*2
	 */
	/*
	 * If we bind the Rx fanout soft rings to the same CPUs
	 * as poll/worker, would that be enough?
	 */
	if (no_of_cpus >= (rx_srs_cnt + reqd_tx_cpu_cnt)) {
		boolean_t worker_assign = B_FALSE;

		/*
		 * If mac_latency_optimize is not set, are there
		 * enough CPUs to assign a CPU for worker also?
		 */
		if (no_of_cpus >= (reqd_poll_worker_cnt + reqd_tx_cpu_cnt))
			worker_assign = B_TRUE;
		/*
		 * Zero'th Rx SRS is the default Rx ring. It is not
		 * associated with a h/w Rx ring.
		 */
		rx_srs = flent->fe_rx_srs[0];
		ASSERT(rx_srs->srs_ring == NULL);
		if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
			rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
		cpu_cnt = 0;
		srs_cpu = &rx_srs->srs_cpu;
		srs_cpu->mc_ncpus = no_of_cpus;
		bcopy(mrp->mrp_cpu,
		    srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
		srs_cpu->mc_rx_fanout_cnt = 1;
		srs_cpu->mc_rx_pollid = mrp->mrp_cpu[cpu_cnt++];
		/* Retarget the interrupt to the same CPU as the poll */
		srs_cpu->mc_rx_intr_cpu = srs_cpu->mc_rx_pollid;
		srs_cpu->mc_rx_workerid =
		    ((!mac_latency_optimize && worker_assign) ?
		    mrp->mrp_cpu[cpu_cnt++] : srs_cpu->mc_rx_pollid);

		srs_cpu->mc_rx_fanout_cpus[0] = mrp->mrp_cpu[cpu_cnt];

		/* Do CPU bindings for SRSes having h/w Rx rings */
		if (flent->fe_rx_srs_cnt > 1) {
			cpu_cnt = 0;
			for (srs_cnt = 1;
			    srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
				rx_srs = flent->fe_rx_srs[srs_cnt];
				ASSERT(rx_srs->srs_ring != NULL);
				if (rx_srs->srs_fanout_state ==
				    SRS_FANOUT_INIT) {
					rx_srs->srs_fanout_state =
					    SRS_FANOUT_REINIT;
				}
				srs_cpu = &rx_srs->srs_cpu;
				srs_cpu->mc_ncpus = no_of_cpus;
				bcopy(mrp->mrp_cpu, srs_cpu->mc_cpus,
				    sizeof (srs_cpu->mc_cpus));
				srs_cpu->mc_rx_pollid =
				    mrp->mrp_cpu[cpu_cnt];
				srs_cpu->mc_rx_intr_cpu = srs_cpu->mc_rx_pollid;
				srs_cpu->mc_rx_workerid =
				    ((!mac_latency_optimize && worker_assign) ?
				    mrp->mrp_cpu[++cpu_cnt] :
				    srs_cpu->mc_rx_pollid);
				srs_cpu->mc_rx_fanout_cnt = 1;
				srs_cpu->mc_rx_fanout_cpus[0] =
				    mrp->mrp_cpu[cpu_cnt];
				cpu_cnt++;
				ASSERT(cpu_cnt <= no_of_cpus);
			}
		}
		goto tx_cpu_init;
	}

	/*
	 * Real sub-optimal case. Not enough CPUs for poll and
	 * Tx soft rings. Do a round robin assignment where
	 * each Rx SRS will get the same CPU for poll, worker
	 * and fanout soft ring.
	 */
	cpu_cnt = 0;
	for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
		rx_srs = flent->fe_rx_srs[srs_cnt];
		srs_cpu = &rx_srs->srs_cpu;
		if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
			rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
		srs_cpu->mc_ncpus = no_of_cpus;
		bcopy(mrp->mrp_cpu,
		    srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
		srs_cpu->mc_rx_fanout_cnt = 1;
		srs_cpu->mc_rx_pollid = mrp->mrp_cpu[cpu_cnt];
		/* Retarget the interrupt to the same CPU as the poll */
		srs_cpu->mc_rx_intr_cpu = srs_cpu->mc_rx_pollid;
		srs_cpu->mc_rx_workerid = mrp->mrp_cpu[cpu_cnt];
		srs_cpu->mc_rx_fanout_cpus[0] = mrp->mrp_cpu[cpu_cnt];
		if (++cpu_cnt >= no_of_cpus)
			cpu_cnt = 0;
	}

tx_cpu_init:
	mac_tx_cpu_init(flent, mrp, NULL);

	/*
	 * Copy the user specified CPUs to the effective CPUs
	 */
	for (i = 0; i < mrp->mrp_ncpus; i++) {
		emrp->mrp_cpu[i] = mrp->mrp_cpu[i];
	}
	emrp->mrp_ncpus = mrp->mrp_ncpus;
	emrp->mrp_mask = mrp->mrp_mask;
	bzero(emrp->mrp_pool, MAXPATHLEN);
}
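/*
 * To make the three buckets concrete: with two h/w Rx SRSes
 * (rx_srs_cnt = 2), one Tx ring (reqd_tx_cpu_cnt = 1) and
 * mac_latency_optimize set, reqd_rx_cpu_cnt = 4. Five or more user
 * CPUs then take the optimal path, three or four take the second
 * path (fanout shares the poll CPU), and fewer than three fall
 * through to the round-robin assignment.
 */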
/*
 * mac_flow_cpu_init():
 *
 * Each SRS has a mac_cpu_t structure, srs_cpu. This routine fills in
 * the CPU binding information in srs_cpu for all Rx SRSes associated
 * with a flent.
 */
static void
mac_flow_cpu_init(flow_entry_t *flent, cpupart_t *cpupart)
{
	mac_soft_ring_set_t *rx_srs;
	processorid_t cpuid;
	int i, j, k, srs_cnt, nscpus, maxcpus, soft_ring_cnt = 0;
	mac_cpus_t *srs_cpu;
	mac_resource_props_t *emrp = &flent->fe_effective_props;
	uint32_t cpus[MRP_NCPUS];

	/*
	 * The maximum number of CPUs available can either be
	 * the number of CPUs in the pool or the number of CPUs
	 * in the system.
	 */
	maxcpus = (cpupart != NULL) ? cpupart->cp_ncpus : ncpus;

	/*
	 * Compute the number of soft rings needed on top for each Rx
	 * SRS. "rx_srs_cnt-1" indicates the number of Rx SRS
	 * associated with h/w Rx rings. Soft ring count needed for
	 * each h/w Rx SRS is computed and the same is applied to
	 * software classified Rx SRS. The first Rx SRS in fe_rx_srs[]
	 * is the software classified Rx SRS.
	 */
	soft_ring_cnt = mac_compute_soft_ring_count(flent,
	    flent->fe_rx_srs_cnt - 1, maxcpus);
	if (soft_ring_cnt == 0) {
		/*
		 * Even when soft_ring_cnt is 0, we still need
		 * to create a soft ring for TCP, UDP and
		 * OTHER. So set it to 1.
		 */
		soft_ring_cnt = 1;
	}

	for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
		rx_srs = flent->fe_rx_srs[srs_cnt];
		srs_cpu = &rx_srs->srs_cpu;
		if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
			rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
		srs_cpu->mc_ncpus = soft_ring_cnt;
		srs_cpu->mc_rx_fanout_cnt = soft_ring_cnt;
		mutex_enter(&cpu_lock);
		for (j = 0; j < soft_ring_cnt; j++) {
			cpuid = mac_next_bind_cpu(cpupart);
			srs_cpu->mc_cpus[j] = cpuid;
			srs_cpu->mc_rx_fanout_cpus[j] = cpuid;
		}
		cpuid = mac_next_bind_cpu(cpupart);
		srs_cpu->mc_rx_pollid = cpuid;
		srs_cpu->mc_rx_intr_cpu = (mac_rx_intr_retarget ?
		    srs_cpu->mc_rx_pollid : -1);
		/* increment ncpus to account for polling cpu */
		srs_cpu->mc_ncpus++;
		srs_cpu->mc_cpus[j++] = cpuid;
		if (!mac_latency_optimize) {
			cpuid = mac_next_bind_cpu(cpupart);
			srs_cpu->mc_ncpus++;
			srs_cpu->mc_cpus[j++] = cpuid;
		}
		srs_cpu->mc_rx_workerid = cpuid;
		mutex_exit(&cpu_lock);
	}

	nscpus = 0;
	for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
		rx_srs = flent->fe_rx_srs[srs_cnt];
		srs_cpu = &rx_srs->srs_cpu;
		for (j = 0; j < srs_cpu->mc_ncpus; j++) {
			cpus[nscpus++] = srs_cpu->mc_cpus[j];
		}
	}

	/*
	 * Copy cpu list to fe_effective_props
	 * without duplicates.
	 */
	k = 0;
	for (i = 0; i < nscpus; i++) {
		for (j = 0; j < k; j++) {
			if (emrp->mrp_cpu[j] == cpus[i])
				break;
		}
		if (j == k)
			emrp->mrp_cpu[k++] = cpus[i];
	}
	emrp->mrp_ncpus = k;

	mac_tx_cpu_init(flent, NULL, cpupart);
}
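/*
 * Note the per-SRS CPU consumption here: soft_ring_cnt CPUs for the
 * fanout soft rings plus one for the poll thread, and one more for
 * the worker unless mac_latency_optimize is set, in which case the
 * worker shares the poll CPU.
 */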
/*
 * DATAPATH SETUP ROUTINES
 * (setup SRS and set/update FANOUT, B/W and PRIORITY)
 */

/*
 * mac_srs_fanout_list_alloc:
 *
 * The underlying device can expose up to MAX_RINGS_PER_GROUP worth of
 * rings to a client. In such a case, MAX_RINGS_PER_GROUP worth of
 * array space is needed to store Tx soft rings. Thus we allocate so
 * much array space for srs_tx_soft_rings.
 *
 * And when it is an aggr, again we allocate MAX_RINGS_PER_GROUP worth
 * of space to st_soft_rings. This array is used for quick access to
 * the soft ring associated with a pseudo Tx ring based on the pseudo
 * ring's index (mr_index).
 */
static void
mac_srs_fanout_list_alloc(mac_soft_ring_set_t *mac_srs)
{
	mac_client_impl_t *mcip = mac_srs->srs_mcip;

	if (mac_srs->srs_type & SRST_TX) {
		mac_srs->srs_tx_soft_rings = (mac_soft_ring_t **)
		    kmem_zalloc(sizeof (mac_soft_ring_t *) *
		    MAX_RINGS_PER_GROUP, KM_SLEEP);
		if (mcip->mci_state_flags & MCIS_IS_AGGR) {
			mac_srs_tx_t *tx = &mac_srs->srs_tx;

			tx->st_soft_rings = (mac_soft_ring_t **)
			    kmem_zalloc(sizeof (mac_soft_ring_t *) *
			    MAX_RINGS_PER_GROUP, KM_SLEEP);
		}
	} else {
		mac_srs->srs_tcp_soft_rings = (mac_soft_ring_t **)
		    kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT,
		    KM_SLEEP);
		mac_srs->srs_udp_soft_rings = (mac_soft_ring_t **)
		    kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT,
		    KM_SLEEP);
		mac_srs->srs_oth_soft_rings = (mac_soft_ring_t **)
		    kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT,
		    KM_SLEEP);
	}
}
static void
mac_srs_worker_bind(mac_soft_ring_set_t *mac_srs, processorid_t cpuid)
{
	cpu_t *cp;
	boolean_t clear = B_FALSE;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (!mac_srs_thread_bind)
		return;

	cp = cpu_get(cpuid);
	if (cp == NULL || !cpu_is_online(cp))
		return;

	mutex_enter(&mac_srs->srs_lock);
	mac_srs->srs_state |= SRS_WORKER_BOUND;
	if (mac_srs->srs_worker_cpuid != -1)
		clear = B_TRUE;
	mac_srs->srs_worker_cpuid = cpuid;
	mutex_exit(&mac_srs->srs_lock);

	if (clear)
		thread_affinity_clear(mac_srs->srs_worker);

	thread_affinity_set(mac_srs->srs_worker, cpuid);
	DTRACE_PROBE1(worker__CPU, processorid_t, cpuid);
}
static void
mac_srs_poll_bind(mac_soft_ring_set_t *mac_srs, processorid_t cpuid)
{
	cpu_t *cp;
	boolean_t clear = B_FALSE;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (!mac_srs_thread_bind || mac_srs->srs_poll_thr == NULL)
		return;

	cp = cpu_get(cpuid);
	if (cp == NULL || !cpu_is_online(cp))
		return;

	mutex_enter(&mac_srs->srs_lock);
	mac_srs->srs_state |= SRS_POLL_BOUND;
	if (mac_srs->srs_poll_cpuid != -1)
		clear = B_TRUE;
	mac_srs->srs_poll_cpuid = cpuid;
	mutex_exit(&mac_srs->srs_lock);

	if (clear)
		thread_affinity_clear(mac_srs->srs_poll_thr);

	thread_affinity_set(mac_srs->srs_poll_thr, cpuid);
	DTRACE_PROBE1(poll__CPU, processorid_t, cpuid);
}
/*
 * Re-target interrupt to the passed CPU. If re-target is successful,
 * set mc_rx_intr_cpu to the re-targeted CPU. Otherwise set it to -1.
 */
void
mac_rx_srs_retarget_intr(mac_soft_ring_set_t *mac_srs, processorid_t cpuid)
{
	cpu_t *cp;
	mac_ring_t *ring = mac_srs->srs_ring;
	mac_intr_t *mintr = &ring->mr_info.mri_intr;
	flow_entry_t *flent = mac_srs->srs_flent;
	boolean_t primary = mac_is_primary_client(mac_srs->srs_mcip);

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Don't re-target the interrupt for these cases:
	 * 1) ring is NULL
	 * 2) the interrupt is shared (mi_ddi_shared)
	 * 3) ddi_handle is NULL and !primary
	 * 4) primary, ddi_handle is NULL but fe_rx_srs_cnt > 2
	 * Case 3 & 4 are because of the mac_client_intr_cpu() routine.
	 * This routine will re-target the fixed interrupt for the primary
	 * mac client if the client has only one ring. In that
	 * case, mc_rx_intr_cpu will already have the correct value.
	 */
	if (ring == NULL || mintr->mi_ddi_shared || cpuid == -1 ||
	    (mintr->mi_ddi_handle == NULL && !primary) || (primary &&
	    mintr->mi_ddi_handle == NULL && flent->fe_rx_srs_cnt > 2)) {
		mac_srs->srs_cpu.mc_rx_intr_cpu = -1;
		return;
	}

	if (mintr->mi_ddi_handle == NULL)
		return;

	cp = cpu_get(cpuid);
	if (cp == NULL || !cpu_is_online(cp))
		return;

	/* Drop the cpu_lock as set_intr_affinity() holds it */
	mutex_exit(&cpu_lock);
	if (set_intr_affinity(mintr->mi_ddi_handle, cpuid) == DDI_SUCCESS)
		mac_srs->srs_cpu.mc_rx_intr_cpu = cpuid;
	else
		mac_srs->srs_cpu.mc_rx_intr_cpu = -1;
	mutex_enter(&cpu_lock);
}
/*
 * Re-target Tx interrupts
 */
void
mac_tx_srs_retarget_intr(mac_soft_ring_set_t *mac_srs)
{
	cpu_t *cp;
	mac_ring_t *ring;
	mac_intr_t *mintr;
	mac_soft_ring_t *sringp;
	mac_srs_tx_t *srs_tx;
	mac_cpus_t *srs_cpu;
	processorid_t cpuid;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	srs_cpu = &mac_srs->srs_cpu;
	if (MAC_TX_SOFT_RINGS(mac_srs)) {
		for (i = 0; i < mac_srs->srs_tx_ring_count; i++) {
			sringp = mac_srs->srs_tx_soft_rings[i];
			ring = (mac_ring_t *)sringp->s_ring_tx_arg2;
			cpuid = srs_cpu->mc_tx_intr_cpu[i];
			cp = cpu_get(cpuid);
			if (cp == NULL || !cpu_is_online(cp) ||
			    !MAC_RING_RETARGETABLE(ring)) {
				srs_cpu->mc_tx_retargeted_cpu[i] = -1;
				continue;
			}
			mintr = &ring->mr_info.mri_intr;
			/*
			 * Drop the cpu_lock as set_intr_affinity()
			 * holds it.
			 */
			mutex_exit(&cpu_lock);
			if (set_intr_affinity(mintr->mi_ddi_handle,
			    cpuid) == DDI_SUCCESS) {
				srs_cpu->mc_tx_retargeted_cpu[i] = cpuid;
			} else {
				srs_cpu->mc_tx_retargeted_cpu[i] = -1;
			}
			mutex_enter(&cpu_lock);
		}
	} else {
		cpuid = srs_cpu->mc_tx_intr_cpu[0];
		cp = cpu_get(cpuid);
		if (cp == NULL || !cpu_is_online(cp)) {
			srs_cpu->mc_tx_retargeted_cpu[0] = -1;
			return;
		}
		srs_tx = &mac_srs->srs_tx;
		ring = (mac_ring_t *)srs_tx->st_arg2;
		if (MAC_RING_RETARGETABLE(ring)) {
			mintr = &ring->mr_info.mri_intr;
			mutex_exit(&cpu_lock);
			if ((set_intr_affinity(mintr->mi_ddi_handle,
			    cpuid) == DDI_SUCCESS)) {
				srs_cpu->mc_tx_retargeted_cpu[0] = cpuid;
			} else {
				srs_cpu->mc_tx_retargeted_cpu[0] = -1;
			}
			mutex_enter(&cpu_lock);
		}
	}
}
/*
 * When a CPU comes back online, bind the MAC kernel threads which
 * were previously bound to that CPU, and had to be unbound because
 * the CPU was going away.
 *
 * These functions are called with cpu_lock held and hence we can't
 * cv_wait to grab the mac perimeter. Since these functions walk the soft
 * ring list of an SRS without being in the perimeter, the list itself
 * is protected by the SRS lock.
 */
static void
mac_walk_srs_and_bind(int cpuid)
{
	mac_soft_ring_set_t *mac_srs;
	mac_soft_ring_t *soft_ring;

	rw_enter(&mac_srs_g_lock, RW_READER);

	if ((mac_srs = mac_srs_g_list) == NULL)
		goto done;

	for (; mac_srs != NULL; mac_srs = mac_srs->srs_next) {
		if (mac_srs->srs_worker_cpuid == -1 &&
		    mac_srs->srs_worker_cpuid_save == cpuid) {
			mac_srs->srs_worker_cpuid_save = -1;
			mac_srs_worker_bind(mac_srs, cpuid);
		}

		if (!(mac_srs->srs_type & SRST_TX)) {
			if (mac_srs->srs_poll_cpuid == -1 &&
			    mac_srs->srs_poll_cpuid_save == cpuid) {
				mac_srs->srs_poll_cpuid_save = -1;
				mac_srs_poll_bind(mac_srs, cpuid);
			}
		}

		/* Next tackle the soft rings associated with the srs */
		mutex_enter(&mac_srs->srs_lock);
		for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL;
		    soft_ring = soft_ring->s_ring_next) {
			if (soft_ring->s_ring_cpuid == -1 &&
			    soft_ring->s_ring_cpuid_save == cpuid) {
				soft_ring->s_ring_cpuid_save = -1;
				(void) mac_soft_ring_bind(soft_ring, cpuid);
			}
		}
		mutex_exit(&mac_srs->srs_lock);
	}
done:
	rw_exit(&mac_srs_g_lock);
}
/*
 * Change the priority of the SRS's poll and worker thread. Additionally,
 * update the priority of the worker threads for the SRS's soft rings.
 * There is no need to modify any associated squeue threads.
 */
void
mac_update_srs_priority(mac_soft_ring_set_t *mac_srs, pri_t prival)
{
	mac_soft_ring_t		*ringp;

	mac_srs->srs_pri = prival;
	thread_lock(mac_srs->srs_worker);
	(void) thread_change_pri(mac_srs->srs_worker, mac_srs->srs_pri, 0);
	thread_unlock(mac_srs->srs_worker);
	if (mac_srs->srs_poll_thr != NULL) {
		thread_lock(mac_srs->srs_poll_thr);
		(void) thread_change_pri(mac_srs->srs_poll_thr,
		    mac_srs->srs_pri, 0);
		thread_unlock(mac_srs->srs_poll_thr);
	}
	if ((ringp = mac_srs->srs_soft_ring_head) == NULL)
		return;
	while (ringp != mac_srs->srs_soft_ring_tail) {
		thread_lock(ringp->s_ring_worker);
		(void) thread_change_pri(ringp->s_ring_worker,
		    mac_srs->srs_pri, 0);
		thread_unlock(ringp->s_ring_worker);
		ringp = ringp->s_ring_next;
	}
	ASSERT(ringp == mac_srs->srs_soft_ring_tail);
	thread_lock(ringp->s_ring_worker);
	(void) thread_change_pri(ringp->s_ring_worker, mac_srs->srs_pri, 0);
	thread_unlock(ringp->s_ring_worker);
}
/*
 * Change the receive bandwidth limit.
 */
static void
mac_rx_srs_update_bwlimit(mac_soft_ring_set_t *srs, mac_resource_props_t *mrp)
{
	mac_soft_ring_t		*softring;

	mutex_enter(&srs->srs_lock);
	mutex_enter(&srs->srs_bw->mac_bw_lock);

	if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
		/* Reset bandwidth limit */
		if (srs->srs_type & SRST_BW_CONTROL) {
			softring = srs->srs_soft_ring_head;
			while (softring != NULL) {
				softring->s_ring_type &= ~ST_RING_BW_CTL;
				softring = softring->s_ring_next;
			}
			srs->srs_type &= ~SRST_BW_CONTROL;
			srs->srs_drain_func = mac_rx_srs_drain;
		}
	} else {
		/* Set/Modify bandwidth limit */
		srs->srs_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);
		/*
		 * Give twice the queuing capability before
		 * dropping packets. The unit is bytes/tick.
		 */
		srs->srs_bw->mac_bw_drop_threshold =
		    srs->srs_bw->mac_bw_limit << 1;
		if (!(srs->srs_type & SRST_BW_CONTROL)) {
			softring = srs->srs_soft_ring_head;
			while (softring != NULL) {
				softring->s_ring_type |= ST_RING_BW_CTL;
				softring = softring->s_ring_next;
			}
			srs->srs_type |= SRST_BW_CONTROL;
			srs->srs_drain_func = mac_rx_srs_drain_bw;
		}
	}
	mutex_exit(&srs->srs_bw->mac_bw_lock);
	mutex_exit(&srs->srs_lock);
}
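/*
 * E.g. resetting the limit (MRP_MAXBW_RESETVAL) switches the drain
 * function back to mac_rx_srs_drain, while setting one converts
 * mrp_maxbw into a bytes/tick budget and permits up to twice that
 * amount (mac_bw_drop_threshold) to queue before drops begin.
 */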
/* Change the transmit bandwidth limit */
void
mac_tx_srs_update_bwlimit(mac_soft_ring_set_t *srs, mac_resource_props_t *mrp)
{
	uint32_t		tx_mode, ring_info = 0;
	mac_srs_tx_t		*srs_tx = &srs->srs_tx;
	mac_client_impl_t	*mcip = srs->srs_mcip;

	/*
	 * We need to quiesce/restart the client here because mac_tx() and
	 * srs->srs_tx->st_func do not hold srs->srs_lock while accessing
	 * st_mode and related fields, which are modified by the code below.
	 */
	mac_tx_client_quiesce((mac_client_handle_t)mcip);

	mutex_enter(&srs->srs_lock);
	mutex_enter(&srs->srs_bw->mac_bw_lock);

	tx_mode = srs_tx->st_mode;
	if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
		/* Reset bandwidth limit */
		if (tx_mode == SRS_TX_BW) {
			if (srs_tx->st_arg2 != NULL)
				ring_info = mac_hwring_getinfo(srs_tx->st_arg2);
			if (mac_tx_serialize ||
			    (ring_info & MAC_RING_TX_SERIALIZE)) {
				srs_tx->st_mode = SRS_TX_SERIALIZE;
			} else {
				srs_tx->st_mode = SRS_TX_DEFAULT;
			}
		} else if (tx_mode == SRS_TX_BW_FANOUT) {
			srs_tx->st_mode = SRS_TX_FANOUT;
		} else if (tx_mode == SRS_TX_BW_AGGR) {
			srs_tx->st_mode = SRS_TX_AGGR;
		}
		srs->srs_type &= ~SRST_BW_CONTROL;
	} else {
		/* Set/Modify bandwidth limit */
		srs->srs_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);
		/*
		 * Give twice the queuing capability before
		 * dropping packets. The unit is bytes/tick.
		 */
		srs->srs_bw->mac_bw_drop_threshold =
		    srs->srs_bw->mac_bw_limit << 1;
		srs->srs_type |= SRST_BW_CONTROL;
		if (tx_mode != SRS_TX_BW && tx_mode != SRS_TX_BW_FANOUT &&
		    tx_mode != SRS_TX_BW_AGGR) {
			if (tx_mode == SRS_TX_SERIALIZE ||
			    tx_mode == SRS_TX_DEFAULT) {
				srs_tx->st_mode = SRS_TX_BW;
			} else if (tx_mode == SRS_TX_FANOUT) {
				srs_tx->st_mode = SRS_TX_BW_FANOUT;
			} else if (tx_mode == SRS_TX_AGGR) {
				srs_tx->st_mode = SRS_TX_BW_AGGR;
			}
		}
	}

	srs_tx->st_func = mac_tx_get_func(srs_tx->st_mode);
	mutex_exit(&srs->srs_bw->mac_bw_lock);
	mutex_exit(&srs->srs_lock);

	mac_tx_client_restart((mac_client_handle_t)mcip);
}
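/*
 * The mode transitions above pair up as SRS_TX_DEFAULT/
 * SRS_TX_SERIALIZE <-> SRS_TX_BW, SRS_TX_FANOUT <-> SRS_TX_BW_FANOUT
 * and SRS_TX_AGGR <-> SRS_TX_BW_AGGR, with st_func re-resolved from
 * the final mode via mac_tx_get_func().
 */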
/*
 * The uber function that deals with any update to bandwidth limits.
 */
void
mac_srs_update_bwlimit(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	int			count;

	for (count = 0; count < flent->fe_rx_srs_cnt; count++)
		mac_rx_srs_update_bwlimit(flent->fe_rx_srs[count], mrp);
	mac_tx_srs_update_bwlimit(flent->fe_tx_srs, mrp);
}
/*
 * When the first sub-flow is added to a link, we disable polling on the
 * link and also modify the entry point to mac_rx_srs_subflow_process.
 * (Polling is disabled because with the subflow added, accounting
 * for polling needs additional logic. It is assumed that when a subflow is
 * added, we can take some hit as a result of disabling polling rather than
 * adding more complexity - if this becomes a perf. issue we need to
 * re-evaluate this logic.)  When the last subflow is removed, we turn back
 * on polling and also reset the entry point to mac_rx_srs_process.
 *
 * In the future if there are multiple SRS, we can simply
 * take one and give it to the flow rather than disabling polling and
 * resetting the entry point.
 */
void
mac_client_update_classifier(mac_client_impl_t *mcip, boolean_t enable)
{
	flow_entry_t		*flent = mcip->mci_flent;
	int			i;
	mac_impl_t		*mip = mcip->mci_mip;
	mac_rx_func_t		rx_func;
	uint_t			rx_srs_cnt;
	boolean_t		enable_classifier;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	enable_classifier = !FLOW_TAB_EMPTY(mcip->mci_subflow_tab) && enable;

	rx_func = enable_classifier ? mac_rx_srs_subflow_process :
	    mac_rx_srs_process;

	/* Tell mac_srs_poll_state_change to disable polling if necessary */
	if (mip->mi_state_flags & MIS_POLL_DISABLE)
		enable_classifier = B_TRUE;

	/*
	 * If the receive function has already been configured correctly for
	 * the current subflow configuration, do nothing.
	 */
	if (flent->fe_cb_fn == (flow_fn_t)rx_func)
		return;

	rx_srs_cnt = flent->fe_rx_srs_cnt;
	for (i = 0; i < rx_srs_cnt; i++) {
		ASSERT(flent->fe_rx_srs[i] != NULL);
		mac_srs_poll_state_change(flent->fe_rx_srs[i],
		    enable_classifier, rx_func);
	}

	/*
	 * Change the S/W classifier so that we can land in the
	 * correct processing function with correct argument.
	 * If all subflows have been removed we can revert to
	 * mac_rx_srs_process, else we need mac_rx_srs_subflow_process.
	 */
	mutex_enter(&flent->fe_lock);
	flent->fe_cb_fn = (flow_fn_t)rx_func;
	flent->fe_cb_arg1 = (void *)mip;
	flent->fe_cb_arg2 = flent->fe_rx_srs[0];
	mutex_exit(&flent->fe_lock);
}
static void
mac_srs_update_fanout_list(mac_soft_ring_set_t *mac_srs)
{
	int tcp_count = 0, udp_count = 0, oth_count = 0, tx_count = 0;
	mac_soft_ring_t *softring;

	softring = mac_srs->srs_soft_ring_head;
	if (softring == NULL) {
		ASSERT(mac_srs->srs_soft_ring_count == 0);
		mac_srs->srs_tcp_ring_count = 0;
		mac_srs->srs_udp_ring_count = 0;
		mac_srs->srs_oth_ring_count = 0;
		mac_srs->srs_tx_ring_count = 0;
		return;
	}

	while (softring != NULL) {
		if (softring->s_ring_type & ST_RING_TCP) {
			mac_srs->srs_tcp_soft_rings[tcp_count++] = softring;
		} else if (softring->s_ring_type & ST_RING_UDP) {
			mac_srs->srs_udp_soft_rings[udp_count++] = softring;
		} else if (softring->s_ring_type & ST_RING_OTH) {
			mac_srs->srs_oth_soft_rings[oth_count++] = softring;
		} else {
			ASSERT(softring->s_ring_type & ST_RING_TX);
			mac_srs->srs_tx_soft_rings[tx_count++] = softring;
		}
		softring = softring->s_ring_next;
	}

	ASSERT(mac_srs->srs_soft_ring_count ==
	    (tcp_count + udp_count + oth_count + tx_count));
	mac_srs->srs_tcp_ring_count = tcp_count;
	mac_srs->srs_udp_ring_count = udp_count;
	mac_srs->srs_oth_ring_count = oth_count;
	mac_srs->srs_tx_ring_count = tx_count;
}
void
mac_srs_create_proto_softrings(int id, uint16_t type, pri_t pri,
    mac_client_impl_t *mcip, mac_soft_ring_set_t *mac_srs,
    processorid_t cpuid, mac_direct_rx_t rx_func, void *x_arg1,
    mac_resource_handle_t x_arg2, boolean_t set_bypass)
{
	mac_soft_ring_t	*softring;
	mac_rx_fifo_t	mrf;

	bzero(&mrf, sizeof (mac_rx_fifo_t));
	mrf.mrf_type = MAC_RX_FIFO;
	mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll;
	mrf.mrf_intr_enable =
	    (mac_intr_enable_t)mac_soft_ring_intr_enable;
	mrf.mrf_intr_disable =
	    (mac_intr_disable_t)mac_soft_ring_intr_disable;
	mrf.mrf_flow_priority = pri;

	softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
	    (type|ST_RING_TCP), pri, mcip, mac_srs,
	    cpuid, rx_func, x_arg1, x_arg2);
	softring->s_ring_rx_arg2 = NULL;

	/*
	 * TCP and UDP support DLS bypass. In addition the TCP
	 * squeue can also poll its corresponding soft ring.
	 */
	if (set_bypass && (mcip->mci_resource_arg != NULL)) {
		mac_soft_ring_dls_bypass(softring,
		    mcip->mci_direct_rx_fn,
		    mcip->mci_direct_rx_arg);

		mrf.mrf_rx_arg = softring;
		mrf.mrf_intr_handle = (mac_intr_handle_t)softring;

		/*
		 * Make a call in IP to get a TCP squeue assigned to
		 * this softring to maintain full CPU locality through
		 * the stack and allow the squeue to be able to poll
		 * the softring so the flow control can be pushed
		 * all the way to H/W.
		 */
		softring->s_ring_rx_arg2 =
		    mcip->mci_resource_add((void *)mcip->mci_resource_arg,
		    (mac_resource_t *)&mrf);
	}

	/*
	 * Non-TCP protocols don't support squeues. Hence we
	 * don't make any ring addition callbacks for non-TCP
	 * rings. Now create the UDP softring and allow it to
	 * bypass the DLS layer.
	 */
	softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
	    (type|ST_RING_UDP), pri, mcip, mac_srs,
	    cpuid, rx_func, x_arg1, x_arg2);
	softring->s_ring_rx_arg2 = NULL;

	if (set_bypass && (mcip->mci_resource_arg != NULL)) {
		mac_soft_ring_dls_bypass(softring,
		    mcip->mci_direct_rx_fn,
		    mcip->mci_direct_rx_arg);
	}

	/* Create the Oth softrings which have to go through the DLS */
	softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
	    (type|ST_RING_OTH), pri, mcip, mac_srs,
	    cpuid, rx_func, x_arg1, x_arg2);
	softring->s_ring_rx_arg2 = NULL;
}
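/*
 * Each invocation therefore creates a TCP, a UDP and an OTH soft
 * ring for the given fanout index, so an SRS with a fanout count of
 * N carries 3 * N soft rings in total (compare the accounting in
 * mac_srs_update_fanout_list()).
 */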
/*
 * This routine associates a CPU or a set of CPUs to process incoming
 * traffic from a mac client. If multiple CPUs are specified, then
 * that many soft rings are created with each soft ring worker thread
 * bound to a CPU in the set. Each soft ring in turn will be
 * associated with a squeue and the squeue will be moved to the
 * same CPU as the soft ring's.
 */
static void
mac_srs_fanout_modify(mac_client_impl_t *mcip, mac_direct_rx_t rx_func,
    void *x_arg1, mac_resource_handle_t x_arg2,
    mac_soft_ring_set_t *mac_rx_srs, mac_soft_ring_set_t *mac_tx_srs)
{
	mac_soft_ring_t *softring;
	uint32_t soft_ring_flag = 0;
	processorid_t cpuid = -1;
	int i, srings_present, new_fanout_cnt;
	mac_cpus_t *srs_cpu;

	/* fanout state is REINIT. Set it back to INIT */
	ASSERT(mac_rx_srs->srs_fanout_state == SRS_FANOUT_REINIT);
	mac_rx_srs->srs_fanout_state = SRS_FANOUT_INIT;

	/* how many are present right now */
	srings_present = mac_rx_srs->srs_tcp_ring_count;
	/* new request */
	srs_cpu = &mac_rx_srs->srs_cpu;
	new_fanout_cnt = srs_cpu->mc_rx_fanout_cnt;

	mutex_enter(&mac_rx_srs->srs_lock);
	if (mac_rx_srs->srs_type & SRST_BW_CONTROL)
		soft_ring_flag |= ST_RING_BW_CTL;
	mutex_exit(&mac_rx_srs->srs_lock);

	if (new_fanout_cnt > srings_present) {
		/* soft rings increased */
		mutex_enter(&mac_rx_srs->srs_lock);
		mac_rx_srs->srs_type |= SRST_FANOUT_SRC_IP;
		mutex_exit(&mac_rx_srs->srs_lock);

		for (i = mac_rx_srs->srs_tcp_ring_count;
		    i < new_fanout_cnt; i++) {
			/*
			 * Create the protocol softrings and set the
			 * DLS bypass where possible.
			 */
			mac_srs_create_proto_softrings(i, soft_ring_flag,
			    mac_rx_srs->srs_pri, mcip, mac_rx_srs, cpuid,
			    rx_func, x_arg1, x_arg2, B_TRUE);
		}
		mac_srs_update_fanout_list(mac_rx_srs);
	} else if (new_fanout_cnt < srings_present) {
		/* soft rings decreased */
		if (new_fanout_cnt == 1) {
			mutex_enter(&mac_rx_srs->srs_lock);
			mac_rx_srs->srs_type &= ~SRST_FANOUT_SRC_IP;
			ASSERT(mac_rx_srs->srs_type & SRST_FANOUT_PROTO);
			mutex_exit(&mac_rx_srs->srs_lock);
		}
		/* Get rid of extra soft rings */
		for (i = new_fanout_cnt;
		    i < mac_rx_srs->srs_tcp_ring_count; i++) {
			softring = mac_rx_srs->srs_tcp_soft_rings[i];
			if (softring->s_ring_rx_arg2 != NULL) {
				mcip->mci_resource_remove(
				    (void *)mcip->mci_resource_arg,
				    softring->s_ring_rx_arg2);
			}
			mac_soft_ring_remove(mac_rx_srs,
			    mac_rx_srs->srs_tcp_soft_rings[i]);
			mac_soft_ring_remove(mac_rx_srs,
			    mac_rx_srs->srs_udp_soft_rings[i]);
			mac_soft_ring_remove(mac_rx_srs,
			    mac_rx_srs->srs_oth_soft_rings[i]);
		}
		mac_srs_update_fanout_list(mac_rx_srs);
	}

	ASSERT(new_fanout_cnt == mac_rx_srs->srs_tcp_ring_count);
	mutex_enter(&cpu_lock);
	for (i = 0; i < mac_rx_srs->srs_tcp_ring_count; i++) {
		cpuid = srs_cpu->mc_rx_fanout_cpus[i];
		(void) mac_soft_ring_bind(mac_rx_srs->srs_udp_soft_rings[i],
		    cpuid);
		(void) mac_soft_ring_bind(mac_rx_srs->srs_oth_soft_rings[i],
		    cpuid);
		(void) mac_soft_ring_bind(mac_rx_srs->srs_tcp_soft_rings[i],
		    cpuid);
		softring = mac_rx_srs->srs_tcp_soft_rings[i];
		if (softring->s_ring_rx_arg2 != NULL) {
			mcip->mci_resource_bind((void *)mcip->mci_resource_arg,
			    softring->s_ring_rx_arg2, cpuid);
		}
	}

	mac_srs_worker_bind(mac_rx_srs, srs_cpu->mc_rx_workerid);
	mac_srs_poll_bind(mac_rx_srs, srs_cpu->mc_rx_pollid);
	mac_rx_srs_retarget_intr(mac_rx_srs, srs_cpu->mc_rx_intr_cpu);
	/*
	 * Bind Tx srs and soft ring threads too. Let's bind tx
	 * srs to the last cpu in mrp list.
	 */
	if (mac_tx_srs != NULL) {
		BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp);
		mac_tx_srs_retarget_intr(mac_tx_srs);
	}
	mutex_exit(&cpu_lock);
}
/*
 * Bind SRS threads and soft rings to CPUs/create fanout list.
 */
void
mac_srs_fanout_init(mac_client_impl_t *mcip, mac_resource_props_t *mrp,
    mac_direct_rx_t rx_func, void *x_arg1, mac_resource_handle_t x_arg2,
    mac_soft_ring_set_t *mac_rx_srs, mac_soft_ring_set_t *mac_tx_srs,
    cpupart_t *cpupart)
{
	int		i;
	processorid_t	cpuid;
	uint32_t	soft_ring_flag = 0;
	int		soft_ring_cnt;
	mac_cpus_t	*srs_cpu = &mac_rx_srs->srs_cpu;

	/*
	 * Remove the no soft ring flag and we will adjust it
	 * appropriately further down.
	 */
	mutex_enter(&mac_rx_srs->srs_lock);
	mac_rx_srs->srs_type &= ~SRST_NO_SOFT_RINGS;
	mutex_exit(&mac_rx_srs->srs_lock);

	ASSERT(mac_rx_srs->srs_soft_ring_head == NULL);

	if (mac_rx_srs->srs_type & SRST_BW_CONTROL)
		soft_ring_flag |= ST_RING_BW_CTL;

	ASSERT(mac_rx_srs->srs_fanout_state == SRS_FANOUT_UNINIT);
	mac_rx_srs->srs_fanout_state = SRS_FANOUT_INIT;

	/*
	 * Ring count can be 0 if no fanout is required and no cpus
	 * were specified. Leave the SRS worker and poll thread
	 * unbound.
	 */
	ASSERT(mrp != NULL);
	soft_ring_cnt = srs_cpu->mc_rx_fanout_cnt;

	/* Step 1: bind cpu contains cpu list where threads need to bind */
	if (soft_ring_cnt > 0) {
		mutex_enter(&cpu_lock);
		for (i = 0; i < soft_ring_cnt; i++) {
			cpuid = srs_cpu->mc_rx_fanout_cpus[i];
			/* Create the protocol softrings */
			mac_srs_create_proto_softrings(i, soft_ring_flag,
			    mac_rx_srs->srs_pri, mcip, mac_rx_srs, cpuid,
			    rx_func, x_arg1, x_arg2, B_FALSE);
		}
		mac_srs_worker_bind(mac_rx_srs, srs_cpu->mc_rx_workerid);
		mac_srs_poll_bind(mac_rx_srs, srs_cpu->mc_rx_pollid);
		mac_rx_srs_retarget_intr(mac_rx_srs, srs_cpu->mc_rx_intr_cpu);
		/*
		 * Bind Tx srs and soft ring threads too.
		 * Let's bind tx srs to the last cpu in
		 * mrp list.
		 */
		if (mac_tx_srs == NULL) {
			mutex_exit(&cpu_lock);
			goto alldone;
		}
		BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp);
		mac_tx_srs_retarget_intr(mac_tx_srs);
		mutex_exit(&cpu_lock);
	} else {
		mutex_enter(&cpu_lock);
		/*
		 * For a subflow, mrp_workerid and mrp_pollid
		 * are not set.
		 */
		mac_srs_worker_bind(mac_rx_srs, mrp->mrp_rx_workerid);
		mac_srs_poll_bind(mac_rx_srs, mrp->mrp_rx_pollid);
		mutex_exit(&cpu_lock);
		goto no_softrings;
	}

alldone:
	if (soft_ring_cnt > 1)
		mac_rx_srs->srs_type |= SRST_FANOUT_SRC_IP;
	mac_srs_update_fanout_list(mac_rx_srs);
	mac_srs_client_poll_enable(mcip, mac_rx_srs);
	return;

no_softrings:
	if (mac_rx_srs->srs_type & SRST_FANOUT_PROTO) {
		mutex_enter(&cpu_lock);
		cpuid = mac_next_bind_cpu(cpupart);
		/* Create the protocol softrings */
		mac_srs_create_proto_softrings(0, soft_ring_flag,
		    mac_rx_srs->srs_pri, mcip, mac_rx_srs, cpuid,
		    rx_func, x_arg1, x_arg2, B_FALSE);
		mutex_exit(&cpu_lock);
	} else {
		/*
		 * This is the case when there is no fanout which is
		 * true for subflows.
		 */
		mac_rx_srs->srs_type |= SRST_NO_SOFT_RINGS;
	}
	mac_srs_update_fanout_list(mac_rx_srs);
	mac_srs_client_poll_enable(mcip, mac_rx_srs);
}
/*
 * Calls mac_srs_fanout_init() or modify() depending upon whether
 * the SRS is getting initialized or re-initialized.
 */
void
mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
    mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1,
    mac_resource_handle_t x_arg2, cpupart_t *cpupart)
{
	mac_soft_ring_set_t	*mac_rx_srs, *mac_tx_srs;
	int			i, rx_srs_cnt;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
	/*
	 * This is an aggregation port. Fanout will be setup
	 * over the aggregation itself.
	 */
	if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
		return;

	mac_rx_srs = flent->fe_rx_srs[0];
	/*
	 * Set up the fanout on the tx side only once, with the
	 * first rx SRS. The CPU binding, fanout, and bandwidth
	 * criteria are common to both RX and TX, so
	 * initializing them alongside avoids redundant code.
	 */
	mac_tx_srs = flent->fe_tx_srs;
	rx_srs_cnt = flent->fe_rx_srs_cnt;

	/* No fanout for subflows */
	if (flent->fe_type & FLOW_USER) {
		mac_srs_fanout_init(mcip, mrp, rx_func,
		    x_arg1, x_arg2, mac_rx_srs, mac_tx_srs,
		    cpupart);
		return;
	}

	if (mrp->mrp_mask & MRP_CPUS_USERSPEC)
		mac_flow_user_cpu_init(flent, mrp);
	else
		mac_flow_cpu_init(flent, cpupart);

	mrp->mrp_rx_fanout_cnt = mac_rx_srs->srs_cpu.mc_rx_fanout_cnt;

	/*
	 * Set up fanout for both SW (0th SRS) and HW classified
	 * SRS (the rest of Rx SRSs in flent).
	 */
	for (i = 0; i < rx_srs_cnt; i++) {
		mac_rx_srs = flent->fe_rx_srs[i];
		if (i != 0)
			mac_tx_srs = NULL;
		switch (mac_rx_srs->srs_fanout_state) {
		case SRS_FANOUT_UNINIT:
			mac_srs_fanout_init(mcip, mrp, rx_func,
			    x_arg1, x_arg2, mac_rx_srs, mac_tx_srs,
			    cpupart);
			break;
		case SRS_FANOUT_INIT:
			break;
		case SRS_FANOUT_REINIT:
			mac_rx_srs_quiesce(mac_rx_srs, SRS_QUIESCE);
			mac_srs_fanout_modify(mcip, rx_func, x_arg1,
			    x_arg2, mac_rx_srs, mac_tx_srs);
			mac_rx_srs_restart(mac_rx_srs);
			break;
		default:
			VERIFY(mac_rx_srs->srs_fanout_state <=
			    SRS_FANOUT_REINIT);
			break;
		}
	}
}
/*
 * Create a mac_soft_ring_set_t (SRS). If soft_ring_fanout_type is
 * SRST_TX, an SRS for Tx side is created. Otherwise an SRS for Rx side
 * processing is created.
 *
 * Details on Rx SRS:
 * Create a SRS and also add the necessary soft rings for TCP and
 * non-TCP based on fanout type and count specified.
 *
 * mac_soft_ring_fanout, mac_srs_fanout_modify (?),
 * mac_soft_ring_stop_workers, mac_soft_ring_set_destroy, etc need
 * to be heavily modified.
 *
 * mi_soft_ring_list_size, mi_soft_ring_size, etc need to disappear.
 */
mac_soft_ring_set_t *
mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type,
    mac_direct_rx_t rx_func, void *x_arg1, mac_resource_handle_t x_arg2,
    mac_ring_t *ring)
{
	mac_soft_ring_set_t	*mac_srs;
	mac_srs_rx_t		*srs_rx;
	mac_srs_tx_t		*srs_tx;
	mac_bw_ctl_t		*mac_bw;
	mac_resource_props_t	*mrp;
	boolean_t		is_tx_srs = ((srs_type & SRST_TX) != 0);

	mac_srs = kmem_cache_alloc(mac_srs_cache, KM_SLEEP);
	bzero(mac_srs, sizeof (mac_soft_ring_set_t));
	srs_rx = &mac_srs->srs_rx;
	srs_tx = &mac_srs->srs_tx;

	mutex_enter(&flent->fe_lock);

	/*
	 * Get the bandwidth control structure from the flent. Get
	 * rid of any residual values in the control structure for
	 * the tx bw struct and also for the rx, if the rx srs is
	 * the 1st one being brought up (the rx bw ctl struct may
	 * be shared by multiple SRSs)
	 */
	if (is_tx_srs) {
		mac_srs->srs_bw = &flent->fe_tx_bw;
		bzero(mac_srs->srs_bw, sizeof (mac_bw_ctl_t));
		flent->fe_tx_srs = mac_srs;
	} else {
		/*
		 * The bw counter (stored in the flent) is shared
		 * by SRS's within an rx group.
		 */
		mac_srs->srs_bw = &flent->fe_rx_bw;
		/* First rx SRS, clear the bw structure */
		if (flent->fe_rx_srs_cnt == 0)
			bzero(mac_srs->srs_bw, sizeof (mac_bw_ctl_t));

		/*
		 * It is better to panic here rather than just assert because
		 * on a non-debug kernel we might end up corrupting memory
		 * and making it difficult to debug.
		 */
		if (flent->fe_rx_srs_cnt >= MAX_RINGS_PER_GROUP) {
			panic("Array Overrun detected due to MAC client %p "
			    " having more rings than %d", (void *)mcip,
			    MAX_RINGS_PER_GROUP);
		}
		flent->fe_rx_srs[flent->fe_rx_srs_cnt] = mac_srs;
		flent->fe_rx_srs_cnt++;
	}
	mac_srs->srs_flent = flent;
	mutex_exit(&flent->fe_lock);

	mac_srs->srs_state = 0;
	mac_srs->srs_type = (srs_type | SRST_NO_SOFT_RINGS);
	mac_srs->srs_worker_cpuid = mac_srs->srs_worker_cpuid_save = -1;
	mac_srs->srs_poll_cpuid = mac_srs->srs_poll_cpuid_save = -1;
	mac_srs->srs_mcip = mcip;
	mac_srs_fanout_list_alloc(mac_srs);

	/*
	 * For a flow we use the underlying MAC client's priority range with
	 * the priority value to find an absolute priority value. For a MAC
	 * client we use the MAC client's maximum priority as the value.
	 */
	mrp = &flent->fe_effective_props;
	if ((mac_srs->srs_type & SRST_FLOW) != 0) {
		mac_srs->srs_pri = FLOW_PRIORITY(mcip->mci_min_pri,
		    mcip->mci_max_pri, mrp->mrp_priority);
	} else {
		mac_srs->srs_pri = mcip->mci_max_pri;
	}
	/*
	 * We need to insert the SRS in the global list before
	 * binding the SRS and SR threads. Otherwise there is a
	 * small window where the cpu reconfig callbacks
	 * may miss the SRS in the list walk and DR could fail
	 * as there are bound threads.
	 */
	mac_srs_add_glist(mac_srs);

	/* Initialize bw limit */
	if ((mrp->mrp_mask & MRP_MAXBW) != 0) {
		mac_srs->srs_drain_func = mac_rx_srs_drain_bw;

		mac_bw = mac_srs->srs_bw;
		mutex_enter(&mac_bw->mac_bw_lock);
		mac_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);

		/*
		 * Give twice the queuing capability before
		 * dropping packets. The unit is bytes/tick.
		 */
		mac_bw->mac_bw_drop_threshold = mac_bw->mac_bw_limit << 1;
		mutex_exit(&mac_bw->mac_bw_lock);
		mac_srs->srs_type |= SRST_BW_CONTROL;
	} else {
		mac_srs->srs_drain_func = mac_rx_srs_drain;
	}
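	/*
	 * Illustrative arithmetic (an assumption for clarity: treating
	 * FLOW_BYTES_PER_TICK() as converting the bits/sec mrp_maxbw into
	 * bytes per clock tick, i.e. maxbw / 8 / hz): a 100 Mbps limit with
	 * hz = 100 yields mac_bw_limit = 100000000 / 8 / 100 = 125000
	 * bytes/tick, and mac_bw_drop_threshold = 250000 bytes.
	 */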
	/*
	 * We use the following policy to control Receive
	 * Side Dynamic Polling:
	 * 1) We switch to poll mode anytime the processing thread causes
	 *    a backlog to build up in SRS and its associated Soft Rings
	 *    (sr_poll_pkt_cnt > 0).
	 * 2) As long as the backlog stays under the low water mark
	 *    (sr_lowat), we poll the H/W for more packets.
	 * 3) If the backlog (sr_poll_pkt_cnt) exceeds low water mark, we
	 *    stay in poll mode but don't poll the H/W for more packets.
	 * 4) Anytime in polling mode, if we poll the H/W for packets and
	 *    find nothing plus we have an existing backlog
	 *    (sr_poll_pkt_cnt > 0), we stay in polling mode but don't poll
	 *    the H/W for packets anymore (let the polling thread go to sleep).
	 * 5) Once the backlog is relieved (packets are processed) we reenable
	 *    polling (by signalling the poll thread) only when the backlog
	 *    dips below sr_poll_thres.
	 * 6) sr_hiwat is used exclusively when we are not polling capable
	 *    and is used to decide when to drop packets so the SRS queue
	 *    length doesn't grow infinitely.
	 */
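	/*
	 * Concrete example, assuming the tunable defaults earlier in this
	 * file are unchanged: sr_hiwat = 1024, sr_lowat = 256 and
	 * sr_poll_thres = 16.  A backlog under 256 packets keeps the poll
	 * thread pulling from the H/W; above 256 we stay in poll mode but
	 * stop polling; polling is signalled again only once the backlog
	 * drains below 16.  sr_hiwat (1024) matters only when the SRS is
	 * not polling capable, as the queue-length drop threshold.
	 */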
	srs_rx->sr_hiwat = mac_soft_ring_max_q_cnt;
	/* Low water mark needs to be less than high water mark */
	srs_rx->sr_lowat = mac_soft_ring_min_q_cnt <=
	    mac_soft_ring_max_q_cnt ? mac_soft_ring_min_q_cnt :
	    (mac_soft_ring_max_q_cnt >> 2);
	/* Poll threshold needs to be half of low water mark or less */
	srs_rx->sr_poll_thres = mac_soft_ring_poll_thres <=
	    (srs_rx->sr_lowat >> 1) ? mac_soft_ring_poll_thres :
	    (srs_rx->sr_lowat >> 1);
	if (mac_latency_optimize)
		mac_srs->srs_state |= SRS_LATENCY_OPT;
	else
		mac_srs->srs_state |= SRS_SOFTRING_QUEUE;

	mac_srs->srs_worker = thread_create(NULL, 0,
	    mac_srs_worker, mac_srs, 0, &p0, TS_RUN, mac_srs->srs_pri);

	if (is_tx_srs) {
		/* Handle everything about Tx SRS and return */
		mac_srs->srs_drain_func = mac_tx_srs_drain;
		srs_tx->st_max_q_cnt = mac_tx_srs_max_q_cnt;
		srs_tx->st_hiwat =
		    (mac_tx_srs_hiwat > mac_tx_srs_max_q_cnt) ?
		    mac_tx_srs_max_q_cnt : mac_tx_srs_hiwat;
		srs_tx->st_arg1 = x_arg1;
		srs_tx->st_arg2 = x_arg2;
		goto done;
	}

	if ((srs_type & SRST_FLOW) != 0 ||
	    FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
		srs_rx->sr_lower_proc = mac_rx_srs_process;
	else
		srs_rx->sr_lower_proc = mac_rx_srs_subflow_process;

	srs_rx->sr_func = rx_func;
	srs_rx->sr_arg1 = x_arg1;
	srs_rx->sr_arg2 = x_arg2;

	if (ring != NULL) {
		uint_t ring_info;

		/* Is the mac_srs created over the RX default group? */
		if (ring->mr_gh == (mac_group_handle_t)
		    MAC_DEFAULT_RX_GROUP(mcip->mci_mip)) {
			mac_srs->srs_type |= SRST_DEFAULT_GRP;
		}
		mac_srs->srs_ring = ring;
		ring->mr_srs = mac_srs;
		ring->mr_classify_type = MAC_HW_CLASSIFIER;
		ring->mr_flag |= MR_INCIPIENT;

		if (!(mcip->mci_mip->mi_state_flags & MIS_POLL_DISABLE) &&
		    FLOW_TAB_EMPTY(mcip->mci_subflow_tab) && mac_poll_enable)
			mac_srs->srs_state |= SRS_POLLING_CAPAB;

		mac_srs->srs_poll_thr = thread_create(NULL, 0,
		    mac_rx_srs_poll_ring, mac_srs, 0, &p0, TS_RUN,
		    mac_srs->srs_pri);

		/*
		 * Some drivers require serialization and don't send
		 * packet chains in interrupt context. For such
		 * drivers, we should always queue in the soft ring
		 * so that we get a chance to switch into a polling
		 * mode under backlog.
		 */
		ring_info = mac_hwring_getinfo((mac_ring_handle_t)ring);
		if (ring_info & MAC_RING_RX_ENQUEUE)
			mac_srs->srs_state |= SRS_SOFTRING_QUEUE;
	}
done:
	mac_srs_stat_create(mac_srs);
	return (mac_srs);
}
/*
 * Figure out the number of soft rings required. It depends on
 * whether protocol fanout is required (for LINKs), global settings
 * require us to do fanout for performance (based on mac_soft_ring_enable),
 * or the user has specifically requested fanout.
 */
static uint32_t
mac_find_fanout(flow_entry_t *flent, uint32_t link_type)
{
	uint32_t		fanout_type;
	mac_resource_props_t	*mrp = &flent->fe_effective_props;

	/* no fanout for subflows */
	switch (link_type) {
	case SRST_FLOW:
		fanout_type = SRST_NO_SOFT_RINGS;
		break;
	case SRST_LINK:
		fanout_type = SRST_FANOUT_PROTO;
		break;
	}

	/* A primary NIC/link is being plumbed */
	if (flent->fe_type & FLOW_PRIMARY_MAC) {
		if (mac_soft_ring_enable && mac_rx_soft_ring_count > 1) {
			fanout_type |= SRST_FANOUT_SRC_IP;
		}
	} else if (flent->fe_type & FLOW_VNIC) {
		/* A VNIC is being created */
		if (mrp != NULL && mrp->mrp_ncpus > 0) {
			fanout_type |= SRST_FANOUT_SRC_IP;
		}
	}

	return (fanout_type);
}
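/*
 * Example (illustrative): plumbing the primary NIC (FLOW_PRIMARY_MAC) as a
 * link yields SRST_FANOUT_PROTO, plus SRST_FANOUT_SRC_IP when
 * mac_soft_ring_enable is set and mac_rx_soft_ring_count > 1; a subflow
 * (link_type SRST_FLOW) always comes back as SRST_NO_SOFT_RINGS.
 */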
/*
 * Change a group from h/w to s/w classification.
 */
void
mac_rx_switch_grp_to_sw(mac_group_t *group)
{
	mac_ring_t		*ring;
	mac_soft_ring_set_t	*mac_srs;

	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
		if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
			/*
			 * Remove the SRS associated with the HW ring.
			 * As a result, polling will be disabled.
			 */
			mac_srs = ring->mr_srs;
			ASSERT(mac_srs != NULL);
			mac_rx_srs_remove(mac_srs);
			ring->mr_srs = NULL;
		}

		if (ring->mr_state != MR_INUSE)
			(void) mac_start_ring(ring);

		/*
		 * We need to perform SW classification
		 * for packets landing in these rings
		 */
		ring->mr_flag = 0;
		ring->mr_classify_type = MAC_SW_CLASSIFIER;
	}
}
/*
 * Create the Rx SRS for S/W classifier and for each ring in the
 * group (if exclusive group). Also create the Tx SRS.
 */
void
mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
    uint32_t link_type)
{
	cpupart_t		*cpupart;
	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
	mac_resource_props_t	*emrp = MCIP_EFFECTIVE_PROPS(mcip);
	boolean_t		use_default = B_FALSE;

	mac_rx_srs_group_setup(mcip, flent, link_type);
	mac_tx_srs_group_setup(mcip, flent, link_type);

	pool_lock();
	cpupart = mac_pset_find(mrp, &use_default);
	mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
	    mac_rx_deliver, mcip, NULL, cpupart);
	mac_set_pool_effective(use_default, cpupart, mrp, emrp);
	pool_unlock();
}
/*
 * Set up the RX SRSs. If the S/W SRS is not set, set it up, if there
 * is a group associated with this MAC client, set up SRSs for the
 * individual h/w rings.
 */
void
mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
    uint32_t link_type)
{
	mac_impl_t		*mip = mcip->mci_mip;
	mac_soft_ring_set_t	*mac_srs;
	mac_ring_t		*ring;
	uint32_t		fanout_type;
	mac_group_t		*rx_group = flent->fe_rx_ring_group;

	fanout_type = mac_find_fanout(flent, link_type);

	/* Create the SRS for S/W classification if none exists */
	if (flent->fe_rx_srs[0] == NULL) {
		ASSERT(flent->fe_rx_srs_cnt == 0);
		/* Setup the Rx SRS */
		mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type,
		    mac_rx_deliver, mcip, NULL, NULL);
		mutex_enter(&flent->fe_lock);
		flent->fe_cb_fn = (flow_fn_t)mac_srs->srs_rx.sr_lower_proc;
		flent->fe_cb_arg1 = (void *)mip;
		flent->fe_cb_arg2 = (void *)mac_srs;
		mutex_exit(&flent->fe_lock);
	}

	if (rx_group == NULL)
		return;
	/*
	 * fanout for default SRS is done when default SRS are created
	 * above. As each ring is added to the group, we setup the
	 * SRS and fanout to it.
	 */
	switch (rx_group->mrg_state) {
	case MAC_GROUP_STATE_RESERVED:
		for (ring = rx_group->mrg_rings; ring != NULL;
		    ring = ring->mr_next) {
			switch (ring->mr_state) {
			case MR_INUSE:
			case MR_FREE:
				if (ring->mr_srs != NULL)
					break;
				if (ring->mr_state != MR_INUSE)
					(void) mac_start_ring(ring);

				/*
				 * Since the group is exclusively ours create
				 * an SRS for this ring to allow the
				 * individual SRS to dynamically poll the
				 * ring. Do this only if the client is not
				 * a VLAN MAC client, since for VLAN we do
				 * s/w classification for the VID check, and
				 * if it has a unicast address.
				 */
				if ((mcip->mci_state_flags &
				    MCIS_NO_UNICAST_ADDR) ||
				    i_mac_flow_vid(mcip->mci_flent) !=
				    VLAN_ID_NONE) {
					break;
				}
				mac_srs = mac_srs_create(mcip, flent,
				    fanout_type | link_type,
				    mac_rx_deliver, mcip, NULL, ring);
				break;
			default:
				cmn_err(CE_PANIC,
				    "srs_setup: mcip = %p "
				    "trying to add UNKNOWN ring = %p\n",
				    (void *)mcip, (void *)ring);
				break;
			}
		}
		break;
	case MAC_GROUP_STATE_SHARED:
		/*
		 * Set all rings of this group to software classified.
		 *
		 * If the group is currently RESERVED, the existing mac
		 * client (the only client on this group) is using
		 * this group exclusively. In that case we need to
		 * disable polling on the rings of the group (if it
		 * was enabled), and free the SRS associated with the
		 * rings.
		 */
		mac_rx_switch_grp_to_sw(rx_group);
		break;
	default:
		ASSERT(B_FALSE);
		break;
	}
}
/*
 * Set up the TX SRS.
 */
void
mac_tx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
    uint32_t link_type)
{
	int		cnt;
	int		ringcnt;
	mac_ring_t	*ring;
	mac_group_t	*grp;

	/*
	 * If we are opened exclusively (like aggr does for aggr_ports),
	 * don't set up Tx SRS and Tx soft rings as they won't be used.
	 * The same thing has to be done for the Rx side also. See bug:
	 * 6880080
	 */
	if (mcip->mci_state_flags & MCIS_EXCLUSIVE) {
		/*
		 * If we have rings, start them here.
		 */
		if (flent->fe_tx_ring_group == NULL)
			return;
		grp = (mac_group_t *)flent->fe_tx_ring_group;
		ringcnt = grp->mrg_cur_count;
		ring = grp->mrg_rings;
		for (cnt = 0; cnt < ringcnt; cnt++) {
			if (ring->mr_state != MR_INUSE) {
				(void) mac_start_ring(ring);
			}
			ring = ring->mr_next;
		}
		return;
	}
	if (flent->fe_tx_srs == NULL) {
		(void) mac_srs_create(mcip, flent, SRST_TX | link_type,
		    NULL, mcip, NULL, NULL);
	}
	mac_tx_srs_setup(mcip, flent);
}
/*
 * Remove all the RX SRSs. If we want to remove only the SRSs associated
 * with h/w rings, leave the S/W SRS alone. This is used when we want to
 * move the MAC client from one group to another, so we need to tear down
 * the h/w SRSs.
 */
void
mac_rx_srs_group_teardown(flow_entry_t *flent, boolean_t hwonly)
{
	mac_soft_ring_set_t	*mac_srs;
	int			i;
	int			count = flent->fe_rx_srs_cnt;

	for (i = 0; i < count; i++) {
		if (i == 0 && hwonly)
			continue;
		mac_srs = flent->fe_rx_srs[i];
		mac_rx_srs_quiesce(mac_srs, SRS_CONDEMNED);
		mac_srs_free(mac_srs);
		flent->fe_rx_srs[i] = NULL;
		flent->fe_rx_srs_cnt--;
	}
	ASSERT(!hwonly || flent->fe_rx_srs_cnt == 1);
	ASSERT(hwonly || flent->fe_rx_srs_cnt == 0);
}
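/*
 * Example (illustrative): a flent with one S/W classified SRS plus two
 * H/W ring SRSes (fe_rx_srs_cnt == 3) torn down with hwonly == B_TRUE
 * keeps fe_rx_srs[0] and ends with fe_rx_srs_cnt == 1, matching the
 * first ASSERT above; with hwonly == B_FALSE every SRS goes and the
 * count drops to 0.
 */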
/*
 * Remove the TX SRS.
 */
void
mac_tx_srs_group_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
    uint32_t link_type)
{
	mac_soft_ring_set_t	*tx_srs;
	mac_srs_tx_t		*tx;

	if ((tx_srs = flent->fe_tx_srs) == NULL)
		return;

	tx = &tx_srs->srs_tx;
	switch (link_type) {
	case SRST_FLOW:
		/*
		 * For flows, we need to work with passed
		 * flent to find the Rx/Tx SRS.
		 */
		mac_tx_srs_quiesce(tx_srs, SRS_CONDEMNED);
		break;
	case SRST_LINK:
		mac_tx_client_condemn((mac_client_handle_t)mcip);
		if (tx->st_arg2 != NULL) {
			ASSERT(tx_srs->srs_type & SRST_TX);
			/*
			 * The ring itself will be stopped when
			 * we release the group or in the
			 * mac_datapath_teardown (for the default
			 * group)
			 */
			tx->st_arg2 = NULL;
		}
		break;
	default:
		ASSERT(B_FALSE);
		break;
	}
	mac_srs_free(tx_srs);
	flent->fe_tx_srs = NULL;
}
/*
 * This is the group state machine.
 *
 * The state of an Rx group is given by
 * the following table. The default group and its rings are started in
 * mac_start itself and the default group stays in SHARED state until
 * mac_stop at which time the group and rings are stopped and it
 * reverts to the Registered state.
 *
 * Typically this function is called on a group after adding or removing a
 * client from it, to find out what should be the new state of the group.
 * If the new state is RESERVED, then the client that owns this group
 * exclusively is also returned. Note that adding or removing a client from
 * a group could also impact the default group and the caller needs to
 * evaluate the effect on the default group.
 *
 * Group type		# of clients	mi_nactiveclients	Group State
 *			in the group
 *
 * Non-default		0		N.A.			REGISTERED
 * Non-default		1		N.A.			RESERVED
 *
 * Default		0		N.A.			SHARED
 * Default		1		1			RESERVED
 * Default		1		> 1			SHARED
 * Default		> 1		N.A.			SHARED
 *
 * For a TX group, the following is the state table.
 *
 * Group type		# of clients	Group State
 *			in the group
 *
 * Non-default		0		REGISTERED
 * Non-default		1		RESERVED
 *
 * Default		0		REGISTERED
 * Default		1		RESERVED
 * Default		> 1		SHARED
 */
mac_group_state_t
mac_group_next_state(mac_group_t *grp, mac_client_impl_t **group_only_mcip,
    mac_group_t *defgrp, boolean_t rx_group)
{
	mac_impl_t	*mip = (mac_impl_t *)grp->mrg_mh;

	*group_only_mcip = NULL;

	/* Non-default group */

	if (grp != defgrp) {
		if (MAC_GROUP_NO_CLIENT(grp))
			return (MAC_GROUP_STATE_REGISTERED);

		*group_only_mcip = MAC_GROUP_ONLY_CLIENT(grp);
		if (*group_only_mcip != NULL)
			return (MAC_GROUP_STATE_RESERVED);

		return (MAC_GROUP_STATE_SHARED);
	}

	/* Default group */

	if (MAC_GROUP_NO_CLIENT(grp)) {
		if (rx_group)
			return (MAC_GROUP_STATE_SHARED);
		else
			return (MAC_GROUP_STATE_REGISTERED);
	}
	*group_only_mcip = MAC_GROUP_ONLY_CLIENT(grp);
	if (*group_only_mcip == NULL)
		return (MAC_GROUP_STATE_SHARED);

	if (rx_group && mip->mi_nactiveclients != 1)
		return (MAC_GROUP_STATE_SHARED);

	ASSERT(*group_only_mcip != NULL);
	return (MAC_GROUP_STATE_RESERVED);
}
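/*
 * Usage sketch (illustrative only; mirrors mac_datapath_teardown() below
 * and is deliberately not compiled -- group, mcip and mip are assumed to
 * come from the enclosing context): re-evaluate a group after removing a
 * client and promote the sole survivor to exclusive use.
 */
#if 0
	mac_group_state_t	next_state;
	mac_client_impl_t	*grp_only_mcip;

	mac_group_remove_client(group, mcip);
	next_state = mac_group_next_state(group, &grp_only_mcip,
	    MAC_DEFAULT_RX_GROUP(mip), B_TRUE);
	if (next_state == MAC_GROUP_STATE_RESERVED) {
		/* grp_only_mcip is the one client left on the group */
		mac_set_group_state(group, MAC_GROUP_STATE_RESERVED);
	}
#endif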
/*
 * OVERVIEW NOTES FOR DATAPATH
 * ===========================
 *
 * Create an SRS and setup the corresponding flow function and args.
 * Add a classification rule for the flow specified by 'flent' and program
 * the hardware classifier when applicable.
 *
 * Rx ring assignment, SRS, polling and B/W enforcement
 * ----------------------------------------------------
 *
 * We try to use H/W classification on NIC and assign traffic for a
 * MAC address to a particular Rx ring. There is a 1-1 mapping
 * between a SRS and a Rx ring. The SRS (short for soft ring set)
 * dynamically switches the underlying Rx ring between interrupt
 * and polling mode and enforces any specified B/W control.
 *
 * There is always a SRS created and tied to each H/W and S/W rule.
 * Whenever we create a H/W rule, we always add the same rule to
 * the S/W classifier and tie a SRS to it.
 *
 * In case a B/W control is specified, it is broken into bytes
 * per tick and as soon as the quota for a tick is exhausted,
 * the underlying Rx ring is forced into poll mode for the remaining
 * tick. The SRS poll thread only polls for bytes that are
 * allowed to come in the SRS. We typically let 4x the configured
 * B/W worth of packets to come in the SRS (to prevent unnecessary
 * drops due to bursts) but only process the specified amount.
 *
 * A Link (primary NIC, VNIC, VLAN or aggr) can have 1 or more
 * Rx rings (and corresponding SRSs) assigned to it. The SRS
 * in turn can have softrings to do protocol level fanout or
 * softrings to do S/W based fanout or both. In case the NIC
 * has no Rx rings, we do S/W classification to respective SRS.
 * The S/W classification rule is always setup and ready. This
 * allows the MAC layer to reassign Rx rings whenever needed
 * but packets still continue to flow via the default path and
 * get S/W classified to the correct SRS.
 *
 * In other cases where a NIC or VNIC is plumbed, our goal is to use
 * H/W classifier and get two Rx rings assigned for the Link. One
 * for TCP and one for UDP|SCTP. The respective SRS still do the
 * polling on the Rx ring. For a Link that is plumbed for IP, there
 * is a TCP squeue which also does polling and can control the
 * Rx ring directly (where the SRS is just pass through). For
 * the following cases, the SRS does the polling underneath.
 * 1) non IP based Links (Links which are not plumbed via ifconfig)
 *    and paths which have no IP squeues (UDP & SCTP)
 * 2) If B/W control is specified on the Link
 * 3) If S/W fanout is specified
 *
 * Note1: As of current implementation, we try to assign only 1 Rx
 * ring per Link and more than 1 Rx ring for primary Link for
 * H/W based fanout. We always create the following softrings per SRS:
 * 1) TCP softring which is polled by TCP squeue where possible
 *    (and also bypasses DLS)
 * 2) UDP/SCTP based which bypasses DLS
 * 3) OTH softring which goes via DLS (currently deals with IPv6
 *    and non TCP/UDP/SCTP for IPv4 packets).
 *
 * It is necessary to create 3 softrings since the SRS has to poll
 * the single Rx ring underneath and enforce any link level B/W
 * control (we can't switch the Rx ring in poll mode just based
 * on the TCP squeue if the same Rx ring is sharing UDP and other
 * traffic as well). Once polling is done and any Link level B/W
 * control is applied, the packets are assigned to the respective
 * softring based on protocol. Since TCP has an IP based squeue
 * which benefits by polling, we separate TCP packets into
 * their own softring which can be polled by the IP squeue. We need
 * to separate out UDP/SCTP to the UDP softring since it can bypass
 * the DLS layer which has heavy performance advantages, and we
 * need a softring (OTH) for the rest.
 *
 * ToDo: The 3 softrings for protocol are needed only till we can
 * get rid of DLS from the datapath, make IPv4 and IPv6 paths
 * symmetric (deal with mac_header_info for v6 and polling for
 * IPv4 TCP - ip_accept_tcp is IPv4 specific although squeues
 * are generic), and bring SAP based classification to the MAC layer.
 *
 * H/W and S/W based fanout and multiple Rx rings per Link
 * -------------------------------------------------------
 *
 * In case fanout is requested (or determined automatically based
 * on Link speed and processor speed), we try to assign multiple
 * Rx rings per Link with their respective SRS. In this case
 * the NIC should be capable of fanning out incoming packets between
 * the assigned Rx rings (H/W based fanout). All the SRSes
 * individually switch their Rx ring between interrupt and polling
 * mode but share a common B/W control counter in case Link
 * level B/W is specified.
 *
 * If S/W based fanout is specified in lieu of H/W based fanout,
 * the Link SRS creates the specified number of softrings for
 * each protocol (TCP, UDP, OTH). Incoming packets are fanned
 * out to the correct softring based on their protocol and
 * protocol specific hash function.
 *
 * Primary and non primary MAC clients
 * -----------------------------------
 *
 * The NICs, VNICs, Vlans, and Aggrs are typically termed as Links
 * and are a Layer 2 construct.
 *
 * Primary NIC:
 *	The Link that owns the primary MAC address and typically
 *	is used as the data NIC in non virtualized cases. As such
 *	H/W resources are preferentially given to the primary NIC. As
 *	far as code is concerned, there is no difference in the
 *	primary NIC vs VNICs. They are all treated as Links.
 *	At the very first call to mac_unicast_add() we program the S/W
 *	classifier for the primary MAC address, get a soft ring set
 *	(and soft rings based on 'ip_soft_ring_cnt')
 *	and a Rx ring assigned for polling to get enabled.
 *	When IP gets plumbed and negotiates polling, we can
 *	let the squeue do the polling on the TCP softring.
 *
 * VNICs:
 *	Same as any other Link. As long as the H/W resource assignments
 *	are equal, the data path and setup for all Links is the same.
 *
 * Flows:
 *	Can be configured on Links. They have their own SRS and the
 *	S/W classifier is programmed appropriately based on the flow.
 *	The flows typically deal with layer 3 and above and
 *	create a soft ring set specific to the flow. The receive
 *	side function is switched from mac_rx_srs_process to
 *	mac_rx_srs_subflow_process which first tries to assign the
 *	packet to the appropriate flow SRS, failing which it assigns it
 *	to the link SRS. This allows us to avoid the layered approach
 *	which gets complex.
 *
 * By the time mac_datapath_setup() completes, we already have the
 * soft rings set, Rx rings, soft rings, etc figured out and both H/W
 * and S/W classifiers programmed. IP is not plumbed yet (and might
 * never be for Virtual Machines guest OS path). When IP is plumbed
 * (for both NIC and VNIC), we do a capability negotiation for polling
 * and upcall functions etc.
 *
 * Rx ring Assignment NOTES
 * -------------------------
 *
 * For NICs which have only 1 Rx ring (we treat NICs with no Rx rings
 * as NIC with a single default ring), we assign the only ring to
 * the primary Link. The primary Link SRS can do polling on it as long as
 * it is the only link in use and we compare the MAC address for unicast
 * packets before accepting an incoming packet (there is no need for S/W
 * classification in this case). We disable polling on the only ring the
 * moment a 2nd link gets created (the polling remains enabled even though
 * there are broadcast and multicast flows created).
 *
 * If the NIC has more than 1 Rx ring, we assign the default ring (the
 * 1st ring) to deal with broadcast, multicast and traffic for other
 * NICs which need S/W classification. We assign the primary mac
 * addresses to another ring by specifying a classification rule for
 * the primary unicast MAC address to the selected ring. The primary Link
 * (and its SRS) can continue to poll the assigned Rx ring at all times
 * independently.
 *
 * Note: In future, if no fanout is specified, we try to assign 2 Rx
 * rings for the primary Link with the primary MAC address + TCP going
 * to one ring and primary MAC address + UDP|SCTP going to the other ring.
 * Any remaining traffic for the primary MAC address can go to the default
 * Rx ring and get S/W classified. This way the respective SRSs don't
 * need to do proto fanout and don't need to have softrings at all and
 * can poll their respective Rx rings.
 *
 * As an optimization, when a new NIC or VNIC is created, we can get
 * only one Rx ring and make it a TCP specific Rx ring and use the
 * H/W default Rx ring for the rest (this Rx ring is never polled).
 *
 * For clients that don't have a MAC address, but want to receive and
 * transmit packets (e.g, bpf, gvrp etc.), we need to setup the datapath.
 * For such clients (identified by the MCIS_NO_UNICAST_ADDR flag) we
 * always give the default group and use software classification (i.e.
 * even if this is the only client in the default group, we will
 * leave the group as shared).
 */
int
mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
    uint32_t link_type)
{
	mac_impl_t		*mip = mcip->mci_mip;
	mac_group_t		*rgroup = NULL;
	mac_group_t		*tgroup = NULL;
	mac_group_t		*default_rgroup;
	mac_group_t		*default_tgroup;
	int			err;
	uint8_t			*mac_addr;
	mac_group_state_t	next_state;
	mac_client_impl_t	*group_only_mcip;
	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
	mac_resource_props_t	*emrp = MCIP_EFFECTIVE_PROPS(mcip);
	boolean_t		rxhw;
	boolean_t		txhw;
	boolean_t		use_default = B_FALSE;
	cpupart_t		*cpupart;
	boolean_t		no_unicast;
	boolean_t		isprimary = flent->fe_type & FLOW_PRIMARY_MAC;
	mac_client_impl_t	*reloc_pmcip = NULL;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	switch (link_type) {
	case SRST_FLOW:
		mac_srs_group_setup(mcip, flent, link_type);
		return (0);

	case SRST_LINK:
		no_unicast = mcip->mci_state_flags & MCIS_NO_UNICAST_ADDR;
		mac_addr = flent->fe_flow_desc.fd_dst_mac;

		/* Default RX group */
		default_rgroup = MAC_DEFAULT_RX_GROUP(mip);

		/* Default TX group */
		default_tgroup = MAC_DEFAULT_TX_GROUP(mip);

		if (no_unicast) {
			rgroup = default_rgroup;
			tgroup = default_tgroup;
			goto grp_found;
		}
		rxhw = (mrp->mrp_mask & MRP_RX_RINGS) &&
		    (mrp->mrp_nrxrings > 0 ||
		    (mrp->mrp_mask & MRP_RXRINGS_UNSPEC));
		txhw = (mrp->mrp_mask & MRP_TX_RINGS) &&
		    (mrp->mrp_ntxrings > 0 ||
		    (mrp->mrp_mask & MRP_TXRINGS_UNSPEC));

		/*
		 * By default we have given the primary all the rings
		 * i.e. the default group. Let's see if the primary
		 * needs to be relocated so that the addition of this
		 * client doesn't impact the primary's performance,
		 * i.e. if the primary is in the default group and
		 * we add this client, the primary will lose polling.
		 * We do this only for NICs supporting dynamic ring
		 * grouping and only when this is the first client
		 * after the primary (i.e. nactiveclients is 2)
		 */
		if (!isprimary && mip->mi_nactiveclients == 2 &&
		    (group_only_mcip = mac_primary_client_handle(mip)) !=
		    NULL && mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
			reloc_pmcip = mac_check_primary_relocation(
			    group_only_mcip, rxhw);
		}
		/*
		 * Check to see if we can get an exclusive group for
		 * this mac address or if there already exists a
		 * group that has this mac address (case of VLANs).
		 * If no groups are available, use the default group.
		 */
		rgroup = mac_reserve_rx_group(mcip, mac_addr, B_FALSE);
		if (rgroup == NULL && rxhw) {
			err = ENOSPC;
			goto setup_failed;
		} else if (rgroup == NULL) {
			rgroup = default_rgroup;
		}
		/*
		 * Check to see if we can get an exclusive group for
		 * this mac client. If no groups are available, use
		 * the default group.
		 */
		tgroup = mac_reserve_tx_group(mcip, B_FALSE);
		if (tgroup == NULL && txhw) {
			if (rgroup != NULL && rgroup != default_rgroup)
				mac_release_rx_group(mcip, rgroup);
			err = ENOSPC;
			goto setup_failed;
		} else if (tgroup == NULL) {
			tgroup = default_tgroup;
		}

		/*
		 * Some NICs don't support any Rx rings, so there may not
		 * even be a default group.
		 */
	grp_found:
		if (rgroup != NULL) {
			if (rgroup != default_rgroup &&
			    MAC_GROUP_NO_CLIENT(rgroup) &&
			    (rxhw || mcip->mci_share != (uintptr_t)NULL)) {
				MAC_RX_GRP_RESERVED(mip);
				if (mip->mi_rx_group_type ==
				    MAC_GROUP_TYPE_DYNAMIC) {
					MAC_RX_RING_RESERVED(mip,
					    rgroup->mrg_cur_count);
				}
			}
			flent->fe_rx_ring_group = rgroup;
			/*
			 * Add the client to the group. This could cause
			 * either this group to move to the shared state or
			 * cause the default group to move to the shared state.
			 * The actions on this group are done here, while the
			 * actions on the default group are postponed to
			 * the end of this function.
			 */
			mac_group_add_client(rgroup, mcip);
			next_state = mac_group_next_state(rgroup,
			    &group_only_mcip, default_rgroup, B_TRUE);
			mac_set_group_state(rgroup, next_state);
		}

		if (tgroup != NULL) {
			if (tgroup != default_tgroup &&
			    MAC_GROUP_NO_CLIENT(tgroup) &&
			    (txhw || mcip->mci_share != (uintptr_t)NULL)) {
				MAC_TX_GRP_RESERVED(mip);
				if (mip->mi_tx_group_type ==
				    MAC_GROUP_TYPE_DYNAMIC) {
					MAC_TX_RING_RESERVED(mip,
					    tgroup->mrg_cur_count);
				}
			}
			flent->fe_tx_ring_group = tgroup;
			mac_group_add_client(tgroup, mcip);
			next_state = mac_group_next_state(tgroup,
			    &group_only_mcip, default_tgroup, B_FALSE);
			tgroup->mrg_state = next_state;
		}
		/*
		 * Setup the Rx and Tx SRSes. If we got a pristine group
		 * exclusively above, mac_srs_group_setup would simply create
		 * the required SRSes. If we ended up sharing a previously
		 * reserved group, mac_srs_group_setup would also dismantle the
		 * SRSes of the previously exclusive group
		 */
		mac_srs_group_setup(mcip, flent, link_type);

		/* We are setting up minimal datapath only */
		if (no_unicast)
			break;
		/* Program the S/W Classifier */
		if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0)
			goto setup_failed;

		/* Program the H/W Classifier */
		if ((err = mac_add_macaddr(mip, rgroup, mac_addr,
		    (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0)) != 0)
			goto setup_failed;
		mcip->mci_unicast = mac_find_macaddr(mip, mac_addr);
		ASSERT(mcip->mci_unicast != NULL);
		/* (Re)init the v6 token & local addr used by link protection */
		mac_protect_update_mac_token(mcip);
		break;

	default:
		ASSERT(B_FALSE);
		break;
	}

	/*
	 * All broadcast and multicast traffic is received only on the default
	 * group. If we have setup the datapath for a non-default group above
	 * then move the default group to shared state to allow distribution of
	 * incoming broadcast traffic to the other groups and dismantle the
	 * SRSes over the default group.
	 */
	if (rgroup != NULL) {
		if (rgroup != default_rgroup) {
			if (default_rgroup->mrg_state ==
			    MAC_GROUP_STATE_RESERVED) {
				group_only_mcip = MAC_GROUP_ONLY_CLIENT(
				    default_rgroup);
				ASSERT(group_only_mcip != NULL &&
				    mip->mi_nactiveclients > 1);

				mac_set_group_state(default_rgroup,
				    MAC_GROUP_STATE_SHARED);
				mac_rx_srs_group_setup(group_only_mcip,
				    group_only_mcip->mci_flent, SRST_LINK);
				pool_lock();
				cpupart = mac_pset_find(mrp, &use_default);
				mac_fanout_setup(group_only_mcip,
				    group_only_mcip->mci_flent,
				    MCIP_RESOURCE_PROPS(group_only_mcip),
				    mac_rx_deliver, group_only_mcip, NULL,
				    cpupart);
				mac_set_pool_effective(use_default, cpupart,
				    mrp, emrp);
				pool_unlock();
			}
			ASSERT(default_rgroup->mrg_state ==
			    MAC_GROUP_STATE_SHARED);
		}
		/*
		 * If we get an exclusive group for a VLAN MAC client we
		 * need to take the s/w path to make the additional check for
		 * the vid. Disable polling and set it to s/w classification.
		 * Similarly for clients that don't have a unicast address.
		 */
		if (rgroup->mrg_state == MAC_GROUP_STATE_RESERVED &&
		    (i_mac_flow_vid(flent) != VLAN_ID_NONE || no_unicast)) {
			mac_rx_switch_grp_to_sw(rgroup);
		}
	}
	mac_set_rings_effective(mcip);
	return (0);

setup_failed:
	/* Switch the primary back to default group */
	if (reloc_pmcip != NULL) {
		(void) mac_rx_switch_group(reloc_pmcip,
		    reloc_pmcip->mci_flent->fe_rx_ring_group, default_rgroup);
	}
	mac_datapath_teardown(mcip, flent, link_type);
	return (err);
}
void
mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
    uint32_t link_type)
{
	mac_impl_t		*mip = mcip->mci_mip;
	mac_group_t		*group = NULL;
	mac_client_impl_t	*grp_only_mcip;
	flow_entry_t		*group_only_flent;
	mac_group_t		*default_group;
	boolean_t		check_default_group = B_FALSE;
	mac_group_state_t	next_state;
	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	switch (link_type) {
	case SRST_FLOW:
		mac_rx_srs_group_teardown(flent, B_FALSE);
		mac_tx_srs_group_teardown(mcip, flent, SRST_FLOW);
		return;

	case SRST_LINK:
		/* Stop sending packets */
		mac_tx_client_block(mcip);

		/* Stop the packets coming from the H/W */
		if (mcip->mci_unicast != NULL) {
			int err;
			err = mac_remove_macaddr(mcip->mci_unicast);
			if (err != 0) {
				cmn_err(CE_WARN, "%s: failed to remove a MAC"
				    " address because of error 0x%x",
				    mip->mi_name, err);
			}
			mcip->mci_unicast = NULL;
		}

		/* Stop the packets coming from the S/W classifier */
		mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE);
		mac_flow_wait(flent, FLOW_DRIVER_UPCALL);

		/* Now quiesce and destroy all SRS and soft rings */
		mac_rx_srs_group_teardown(flent, B_FALSE);
		mac_tx_srs_group_teardown(mcip, flent, SRST_LINK);

		ASSERT((mcip->mci_flent == flent) &&
		    (flent->fe_next == NULL));

		/*
		 * Release our hold on the group as well. We need
		 * to check if the shared group has only one client
		 * left who can use it exclusively. Also, if we
		 * were the last client, release the group.
		 */
		group = flent->fe_rx_ring_group;
		default_group = MAC_DEFAULT_RX_GROUP(mip);
		if (group != NULL) {
			mac_group_remove_client(group, mcip);
			next_state = mac_group_next_state(group,
			    &grp_only_mcip, default_group, B_TRUE);
			if (next_state == MAC_GROUP_STATE_RESERVED) {
				/*
				 * Only one client left on this RX group.
				 */
				ASSERT(grp_only_mcip != NULL);
				mac_set_group_state(group,
				    MAC_GROUP_STATE_RESERVED);
				group_only_flent = grp_only_mcip->mci_flent;

				/*
				 * The only remaining client has exclusive
				 * access on the group. Allow it to
				 * dynamically poll the H/W rings etc.
				 */
				mac_rx_srs_group_setup(grp_only_mcip,
				    group_only_flent, SRST_LINK);
				mac_fanout_setup(grp_only_mcip,
				    group_only_flent,
				    MCIP_RESOURCE_PROPS(grp_only_mcip),
				    mac_rx_deliver, grp_only_mcip, NULL, NULL);
				mac_rx_group_unmark(group, MR_INCIPIENT);
				mac_set_rings_effective(grp_only_mcip);
			} else if (next_state == MAC_GROUP_STATE_REGISTERED) {
				/*
				 * This is a non-default group being freed up.
				 * We need to reevaluate the default group
				 * to see if the primary client can get
				 * exclusive access to the default group.
				 */
				ASSERT(group != MAC_DEFAULT_RX_GROUP(mip));
				if (mrp->mrp_mask & MRP_RX_RINGS) {
					MAC_RX_GRP_RELEASED(mip);
					if (mip->mi_rx_group_type ==
					    MAC_GROUP_TYPE_DYNAMIC) {
						MAC_RX_RING_RELEASED(mip,
						    group->mrg_cur_count);
					}
				}
				mac_release_rx_group(mcip, group);
				mac_set_group_state(group,
				    MAC_GROUP_STATE_REGISTERED);
				check_default_group = B_TRUE;
			} else {
				ASSERT(next_state == MAC_GROUP_STATE_SHARED);
				mac_set_group_state(group,
				    MAC_GROUP_STATE_SHARED);
				mac_rx_group_unmark(group, MR_CONDEMNED);
			}
			flent->fe_rx_ring_group = NULL;
		}
		/*
		 * Remove the client from the TX group. Additionally, if
		 * this is a non-default group, then we also need to release
		 * the group.
		 */
		group = flent->fe_tx_ring_group;
		default_group = MAC_DEFAULT_TX_GROUP(mip);
		if (group != NULL) {
			mac_group_remove_client(group, mcip);
			next_state = mac_group_next_state(group,
			    &grp_only_mcip, default_group, B_FALSE);
			if (next_state == MAC_GROUP_STATE_REGISTERED) {
				if (group != default_group) {
					if (mrp->mrp_mask & MRP_TX_RINGS) {
						MAC_TX_GRP_RELEASED(mip);
						if (mip->mi_tx_group_type ==
						    MAC_GROUP_TYPE_DYNAMIC) {
							MAC_TX_RING_RELEASED(
							    mip, group->
							    mrg_cur_count);
						}
					}
					mac_release_tx_group(mcip, group);
					/*
					 * If the default group is reserved,
					 * then we need to set the effective
					 * rings as we would have given
					 * back some rings when the group
					 * was released.
					 */
					if (mip->mi_tx_group_type ==
					    MAC_GROUP_TYPE_DYNAMIC &&
					    default_group->mrg_state ==
					    MAC_GROUP_STATE_RESERVED) {
						grp_only_mcip =
						    MAC_GROUP_ONLY_CLIENT
						    (default_group);
						mac_set_rings_effective(
						    grp_only_mcip);
					}
				} else {
					mac_ring_t	*ring;
					int		cnt;
					int		ringcnt;

					/*
					 * Stop all the rings except the
					 * default ring.
					 */
					ringcnt = group->mrg_cur_count;
					ring = group->mrg_rings;
					for (cnt = 0; cnt < ringcnt; cnt++) {
						if (ring->mr_state ==
						    MR_INUSE && ring !=
						    (mac_ring_t *)
						    mip->mi_default_tx_ring) {
							mac_stop_ring(ring);
							ring->mr_flag = 0;
						}
						ring = ring->mr_next;
					}
				}
			} else if (next_state == MAC_GROUP_STATE_RESERVED) {
				mac_set_rings_effective(grp_only_mcip);
			}
			flent->fe_tx_ring_group = NULL;
			group->mrg_state = next_state;
		}
		break;
	default:
		ASSERT(B_FALSE);
		break;
	}

	/*
	 * The mac client using the default group gets exclusive access to the
	 * default group if and only if it is the sole client on the entire
	 * mip. If so set the group state to reserved, and set up the SRSes
	 * over the default group.
	 */
	if (check_default_group) {
		default_group = MAC_DEFAULT_RX_GROUP(mip);
		ASSERT(default_group->mrg_state == MAC_GROUP_STATE_SHARED);
		next_state = mac_group_next_state(default_group,
		    &grp_only_mcip, default_group, B_TRUE);
		if (next_state == MAC_GROUP_STATE_RESERVED) {
			ASSERT(grp_only_mcip != NULL &&
			    mip->mi_nactiveclients == 1);
			mac_set_group_state(default_group,
			    MAC_GROUP_STATE_RESERVED);
			mac_rx_srs_group_setup(grp_only_mcip,
			    grp_only_mcip->mci_flent, SRST_LINK);
			mac_fanout_setup(grp_only_mcip,
			    grp_only_mcip->mci_flent,
			    MCIP_RESOURCE_PROPS(grp_only_mcip), mac_rx_deliver,
			    grp_only_mcip, NULL, NULL);
			mac_rx_group_unmark(default_group, MR_INCIPIENT);
			mac_set_rings_effective(grp_only_mcip);
		}
	}

	/*
	 * If the primary is the only one left and the MAC supports
	 * dynamic grouping, we need to see if the primary needs to
	 * be moved to the default group so that it can use all the
	 * H/W rings.
	 */
	if (!(flent->fe_type & FLOW_PRIMARY_MAC) &&
	    mip->mi_nactiveclients == 1 &&
	    mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
		default_group = MAC_DEFAULT_RX_GROUP(mip);
		grp_only_mcip = mac_primary_client_handle(mip);
		if (grp_only_mcip == NULL)
			return;
		group_only_flent = grp_only_mcip->mci_flent;
		mrp = MCIP_RESOURCE_PROPS(grp_only_mcip);
		/*
		 * If the primary has an explicit property set, leave it
		 * alone.
		 */
		if (mrp->mrp_mask & MRP_RX_RINGS)
			return;
		/*
		 * Switch the primary to the default group.
		 */
		(void) mac_rx_switch_group(grp_only_mcip,
		    group_only_flent->fe_rx_ring_group, default_group);
	}
}
/* DATAPATH TEAR DOWN ROUTINES (SRS and FANOUT teardown) */

static void
mac_srs_fanout_list_free(mac_soft_ring_set_t *mac_srs)
{
	if (mac_srs->srs_type & SRST_TX) {
		mac_srs_tx_t *tx;

		ASSERT(mac_srs->srs_tcp_soft_rings == NULL);
		ASSERT(mac_srs->srs_udp_soft_rings == NULL);
		ASSERT(mac_srs->srs_oth_soft_rings == NULL);
		ASSERT(mac_srs->srs_tx_soft_rings != NULL);
		kmem_free(mac_srs->srs_tx_soft_rings,
		    sizeof (mac_soft_ring_t *) * MAX_RINGS_PER_GROUP);
		mac_srs->srs_tx_soft_rings = NULL;
		tx = &mac_srs->srs_tx;
		if (tx->st_soft_rings != NULL) {
			kmem_free(tx->st_soft_rings,
			    sizeof (mac_soft_ring_t *) * MAX_RINGS_PER_GROUP);
		}
	} else {
		ASSERT(mac_srs->srs_tx_soft_rings == NULL);
		ASSERT(mac_srs->srs_tcp_soft_rings != NULL);
		kmem_free(mac_srs->srs_tcp_soft_rings,
		    sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
		mac_srs->srs_tcp_soft_rings = NULL;
		ASSERT(mac_srs->srs_udp_soft_rings != NULL);
		kmem_free(mac_srs->srs_udp_soft_rings,
		    sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
		mac_srs->srs_udp_soft_rings = NULL;
		ASSERT(mac_srs->srs_oth_soft_rings != NULL);
		kmem_free(mac_srs->srs_oth_soft_rings,
		    sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
		mac_srs->srs_oth_soft_rings = NULL;
	}
}
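/*
 * Note on sizing: the Tx fanout array freed above is dimensioned by
 * MAX_RINGS_PER_GROUP while the Rx tcp/udp/oth arrays use MAX_SR_FANOUT;
 * the kmem_free() sizes here must stay in sync with the matching
 * allocations made by mac_srs_fanout_list_alloc().
 */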
/*
 * An RX SRS is attached to at most one mac_ring.
 * A TX SRS has no rings.
 */
static void
mac_srs_ring_free(mac_soft_ring_set_t *mac_srs)
{
	mac_client_impl_t	*mcip;
	mac_ring_t		*ring;
	flow_entry_t		*flent;

	ring = mac_srs->srs_ring;
	if (mac_srs->srs_type & SRST_TX) {
		ASSERT(ring == NULL);
		return;
	}

	if (ring == NULL)
		return;

	/*
	 * Broadcast flows don't have a client impl association, but they
	 * use only soft rings.
	 */
	flent = mac_srs->srs_flent;
	mcip = flent->fe_mcip;
	ASSERT(mcip != NULL);

	ring->mr_classify_type = MAC_NO_CLASSIFIER;
	ring->mr_srs = NULL;
}
/*
 * Physical unlink and free of the data structures happen below. This is
 * driven from mac_flow_destroy(), on the last refrele of a flow.
 *
 * Assumes Rx srs is 1-1 mapped with a ring.
 */
void
mac_srs_free(mac_soft_ring_set_t *mac_srs)
{
	ASSERT(mac_srs->srs_mcip == NULL ||
	    MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));
	ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE |
	    SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE));

	mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE);
	mac_srs_ring_free(mac_srs);
	mac_srs_soft_rings_free(mac_srs);
	mac_srs_fanout_list_free(mac_srs);

	mac_srs->srs_bw = NULL;
	mac_srs_stat_delete(mac_srs);
	kmem_cache_free(mac_srs_cache, mac_srs);
}
static void
mac_srs_soft_rings_quiesce(mac_soft_ring_set_t *mac_srs, uint_t s_ring_flag)
{
	mac_soft_ring_t	*softring;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));

	mac_srs_soft_rings_signal(mac_srs, s_ring_flag);
	if (s_ring_flag == S_RING_CONDEMNED) {
		while (mac_srs->srs_soft_ring_condemned_count !=
		    mac_srs->srs_soft_ring_count)
			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
	} else {
		while (mac_srs->srs_soft_ring_quiesced_count !=
		    mac_srs->srs_soft_ring_count)
			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
	}
	mutex_exit(&mac_srs->srs_lock);

	for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
	    softring = softring->s_ring_next) {
		(void) untimeout(softring->s_ring_tid);
		softring->s_ring_tid = NULL;
	}

	(void) untimeout(mac_srs->srs_tid);
	mac_srs->srs_tid = NULL;

	mutex_enter(&mac_srs->srs_lock);
}
/*
 * The block comment above mac_rx_classify_flow_state_change explains the
 * background. At this point upcalls from the driver (both hardware classified
 * and software classified) have been cut off. We now need to quiesce the
 * SRS worker, poll, and softring threads. The SRS worker thread serves as
 * the master controller. The steps involved are described below in the
 * function.
 */
void
mac_srs_worker_quiesce(mac_soft_ring_set_t *mac_srs)
{
	uint_t	s_ring_flag;
	uint_t	srs_poll_wait_flag;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(mac_srs->srs_state & (SRS_CONDEMNED | SRS_QUIESCE));

	if (mac_srs->srs_state & SRS_CONDEMNED) {
		s_ring_flag = S_RING_CONDEMNED;
		srs_poll_wait_flag = SRS_POLL_THR_EXITED;
	} else {
		s_ring_flag = S_RING_QUIESCE;
		srs_poll_wait_flag = SRS_POLL_THR_QUIESCED;
	}

	/*
	 * In the case of Rx SRS wait till the poll thread is done.
	 */
	if ((mac_srs->srs_type & SRST_TX) == 0 &&
	    mac_srs->srs_poll_thr != NULL) {
		while (!(mac_srs->srs_state & srs_poll_wait_flag))
			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);

		/*
		 * Turn off polling as part of the quiesce operation.
		 */
		MAC_SRS_POLLING_OFF(mac_srs);
		mac_srs->srs_state &= ~(SRS_POLLING | SRS_GET_PKTS);
	}

	/*
	 * Then signal the soft ring worker threads to quiesce or quit
	 * as needed and then wait till that happens.
	 */
	mac_srs_soft_rings_quiesce(mac_srs, s_ring_flag);

	if (mac_srs->srs_state & SRS_CONDEMNED)
		mac_srs->srs_state |= (SRS_QUIESCE_DONE | SRS_CONDEMNED_DONE);
	else
		mac_srs->srs_state |= SRS_QUIESCE_DONE;
	cv_signal(&mac_srs->srs_quiesce_done_cv);
}
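/*
 * In short, the sequence above is: choose the quiesce vs condemn flags,
 * wait for the Rx poll thread to park and turn polling off, signal and
 * wait out the soft ring workers, and finally post SRS_QUIESCE_DONE (and
 * SRS_CONDEMNED_DONE when condemned) through srs_quiesce_done_cv for any
 * waiter blocked on the quiesce.
 */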
/*
 * Signal an SRS to start a temporary quiesce, or permanent removal, or restart
 * a quiesced SRS by setting the appropriate flags and signaling the SRS worker
 * or poll thread. This function is internal to the quiescing logic and is
 * called internally from the SRS quiesce or flow quiesce or client quiesce
 * higher level functions.
 */
void
mac_srs_signal(mac_soft_ring_set_t *mac_srs, uint_t srs_flag)
{
	mac_ring_t	*ring;

	ring = mac_srs->srs_ring;
	ASSERT(ring == NULL || ring->mr_refcnt == 0);

	if (srs_flag == SRS_CONDEMNED) {
		/*
		 * The SRS is going away. We need to unbind the SRS and SR
		 * threads before removing from the global SRS list. Otherwise
		 * there is a small window where the cpu reconfig callbacks
		 * may miss the SRS in the list walk and DR could fail since
		 * there are still bound threads.
		 */
		mac_srs_threads_unbind(mac_srs);
		mac_srs_remove_glist(mac_srs);
	}
	/*
	 * Wakeup the SRS worker and poll threads.
	 */
	mutex_enter(&mac_srs->srs_lock);
	mac_srs->srs_state |= srs_flag;
	cv_signal(&mac_srs->srs_async);
	cv_signal(&mac_srs->srs_cv);
	mutex_exit(&mac_srs->srs_lock);
}
/*
 * In the Rx side, the quiescing is done bottom up. After the Rx upcalls
 * from the driver are done, then the Rx SRS is quiesced and only then can
 * we signal the soft rings. Thus this function can't be called arbitrarily
 * without satisfying the prerequisites. On the Tx side, the threads from
 * the top need to be quiesced, then the Tx SRS and only then can we signal
 * the Tx soft rings.
 */
static void
mac_srs_soft_rings_signal(mac_soft_ring_set_t *mac_srs, uint_t sr_flag)
{
	mac_soft_ring_t		*softring;

	for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
	    softring = softring->s_ring_next)
		mac_soft_ring_signal(softring, sr_flag);
}
/*
 * The block comment above mac_rx_classify_flow_state_change explains the
 * background. At this point the SRS is quiesced and we need to restart the
 * SRS worker, poll, and softring threads. The SRS worker thread serves as
 * the master controller. The steps involved are described below in the
 * function.
 */
void
mac_srs_worker_restart(mac_soft_ring_set_t *mac_srs)
{
	boolean_t	iam_rx_srs;
	mac_soft_ring_t	*softring;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	if ((mac_srs->srs_type & SRST_TX) != 0) {
		iam_rx_srs = B_FALSE;
		ASSERT((mac_srs->srs_state &
		    (SRS_POLL_THR_QUIESCED | SRS_QUIESCE_DONE | SRS_QUIESCE)) ==
		    (SRS_QUIESCE_DONE | SRS_QUIESCE));
	} else {
		iam_rx_srs = B_TRUE;
		ASSERT((mac_srs->srs_state &
		    (SRS_QUIESCE_DONE | SRS_QUIESCE)) ==
		    (SRS_QUIESCE_DONE | SRS_QUIESCE));
		if (mac_srs->srs_poll_thr != NULL) {
			ASSERT((mac_srs->srs_state & SRS_POLL_THR_QUIESCED) ==
			    SRS_POLL_THR_QUIESCED);
		}
	}

	/*
	 * Signal any quiesced soft ring workers to restart and wait for the
	 * soft ring down count to come down to zero.
	 */
	if (mac_srs->srs_soft_ring_quiesced_count != 0) {
		for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
		    softring = softring->s_ring_next) {
			if (!(softring->s_ring_state & S_RING_QUIESCE))
				continue;
			mac_soft_ring_signal(softring, S_RING_RESTART);
		}
		while (mac_srs->srs_soft_ring_quiesced_count != 0)
			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
	}

	mac_srs->srs_state &= ~(SRS_QUIESCE_DONE | SRS_QUIESCE | SRS_RESTART);
	if (iam_rx_srs && mac_srs->srs_poll_thr != NULL) {
		/*
		 * Signal the poll thread and ask it to restart. Wait till it
		 * actually restarts and the SRS_POLL_THR_QUIESCED flag gets
		 * cleared.
		 */
		mac_srs->srs_state |= SRS_POLL_THR_RESTART;
		cv_signal(&mac_srs->srs_cv);
		while (mac_srs->srs_state & SRS_POLL_THR_QUIESCED)
			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
		ASSERT(!(mac_srs->srs_state & SRS_POLL_THR_RESTART));
	}
	/* Wake up any waiter waiting for the restart to complete */
	mac_srs->srs_state |= SRS_RESTART_DONE;
	cv_signal(&mac_srs->srs_quiesce_done_cv);
}
static void
mac_srs_worker_unbind(mac_soft_ring_set_t *mac_srs)
{
	mutex_enter(&mac_srs->srs_lock);
	if (!(mac_srs->srs_state & SRS_WORKER_BOUND)) {
		ASSERT(mac_srs->srs_worker_cpuid == -1);
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	mac_srs->srs_worker_cpuid = -1;
	mac_srs->srs_state &= ~SRS_WORKER_BOUND;
	thread_affinity_clear(mac_srs->srs_worker);
	mutex_exit(&mac_srs->srs_lock);
}

static void
mac_srs_poll_unbind(mac_soft_ring_set_t *mac_srs)
{
	mutex_enter(&mac_srs->srs_lock);
	if (mac_srs->srs_poll_thr == NULL ||
	    (mac_srs->srs_state & SRS_POLL_BOUND) == 0) {
		ASSERT(mac_srs->srs_poll_cpuid == -1);
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	mac_srs->srs_poll_cpuid = -1;
	mac_srs->srs_state &= ~SRS_POLL_BOUND;
	thread_affinity_clear(mac_srs->srs_poll_thr);
	mutex_exit(&mac_srs->srs_lock);
}
static void
mac_srs_threads_unbind(mac_soft_ring_set_t *mac_srs)
{
	mac_soft_ring_t	*soft_ring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));

	mutex_enter(&cpu_lock);
	mac_srs_worker_unbind(mac_srs);
	if (!(mac_srs->srs_type & SRST_TX))
		mac_srs_poll_unbind(mac_srs);

	for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL;
	    soft_ring = soft_ring->s_ring_next) {
		mac_soft_ring_unbind(soft_ring);
	}
	mutex_exit(&cpu_lock);
}
/*
 * When a CPU is going away, unbind all MAC threads which are bound
 * to that CPU. The affinity of the thread to the CPU is saved to allow
 * the thread to be rebound to the CPU if it comes back online.
 */
static void
mac_walk_srs_and_unbind(int cpuid)
{
	mac_soft_ring_set_t	*mac_srs;
	mac_soft_ring_t		*soft_ring;

	rw_enter(&mac_srs_g_lock, RW_READER);

	if ((mac_srs = mac_srs_g_list) == NULL)
		goto done;

	for (; mac_srs != NULL; mac_srs = mac_srs->srs_next) {
		if (mac_srs->srs_worker_cpuid == cpuid) {
			mac_srs->srs_worker_cpuid_save = cpuid;
			mac_srs_worker_unbind(mac_srs);
		}

		if (!(mac_srs->srs_type & SRST_TX)) {
			if (mac_srs->srs_poll_cpuid == cpuid) {
				mac_srs->srs_poll_cpuid_save = cpuid;
				mac_srs_poll_unbind(mac_srs);
			}
		}

		/* Next tackle the soft rings associated with the srs */
		mutex_enter(&mac_srs->srs_lock);
		for (soft_ring = mac_srs->srs_soft_ring_head;
		    soft_ring != NULL; soft_ring = soft_ring->s_ring_next) {
			if (soft_ring->s_ring_cpuid == cpuid) {
				soft_ring->s_ring_cpuid_save = cpuid;
				mac_soft_ring_unbind(soft_ring);
			}
		}
		mutex_exit(&mac_srs->srs_lock);
	}
done:
	rw_exit(&mac_srs_g_lock);
}
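/*
 * Example (illustrative; the exact trigger is an assumption): when CPU 5
 * goes offline, the cpu_setup callback registered by this module
 * (mac_srs_cpu_setup) would call mac_walk_srs_and_unbind(5).  Every SRS
 * worker, poll thread and soft ring bound to CPU 5 is unbound but records
 * the CPU in its *_cpuid_save field, so a later online event can rebind
 * it via mac_walk_srs_and_bind(5).
 */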
3693 /* TX SETUP and TEARDOWN ROUTINES */
3696 * XXXHIO need to make sure the two mac_tx_srs_{add,del}_ring()
3697 * handle the case where the number of rings is one. I.e. there is
3698 * a ring pointed to by mac_srs->srs_tx_arg2.
void
mac_tx_srs_add_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring)
{
	mac_client_impl_t *mcip = mac_srs->srs_mcip;
	mac_soft_ring_t *soft_ring;
	int count = mac_srs->srs_tx_ring_count;
	uint32_t soft_ring_type = ST_RING_TX;
	uint_t ring_info;

	ASSERT(mac_srs->srs_state & SRS_QUIESCE);
	ring_info = mac_hwring_getinfo((mac_ring_handle_t)tx_ring);
	if (mac_tx_serialize || (ring_info & MAC_RING_TX_SERIALIZE))
		soft_ring_type |= ST_RING_WORKER_ONLY;
	soft_ring = mac_soft_ring_create(count, 0,
	    soft_ring_type, maxclsyspri, mcip, mac_srs, -1,
	    NULL, mcip, (mac_resource_handle_t)tx_ring);
	mac_srs->srs_tx_ring_count++;
	mac_srs_update_fanout_list(mac_srs);
	/*
	 * Put this soft ring in quiesce mode too, so that when we restart,
	 * all soft rings in the srs are in the same state.
	 */
	mac_soft_ring_signal(soft_ring, S_RING_QUIESCE);
}
static void
mac_soft_ring_remove(mac_soft_ring_set_t *mac_srs, mac_soft_ring_t *softring)
{
	int sringcnt;

	mutex_enter(&mac_srs->srs_lock);
	sringcnt = mac_srs->srs_soft_ring_count;
	ASSERT(sringcnt > 0);
	mac_soft_ring_signal(softring, S_RING_CONDEMNED);

	ASSERT(mac_srs->srs_soft_ring_condemned_count == 0);
	while (mac_srs->srs_soft_ring_condemned_count != 1)
		cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);

	if (softring == mac_srs->srs_soft_ring_head) {
		mac_srs->srs_soft_ring_head = softring->s_ring_next;
		if (mac_srs->srs_soft_ring_head != NULL) {
			mac_srs->srs_soft_ring_head->s_ring_prev = NULL;
		} else {
			mac_srs->srs_soft_ring_tail = NULL;
		}
	} else {
		softring->s_ring_prev->s_ring_next =
		    softring->s_ring_next;
		if (softring->s_ring_next != NULL) {
			softring->s_ring_next->s_ring_prev =
			    softring->s_ring_prev;
		} else {
			mac_srs->srs_soft_ring_tail =
			    softring->s_ring_prev;
		}
	}
	mac_srs->srs_soft_ring_count--;

	mac_srs->srs_soft_ring_condemned_count--;
	mutex_exit(&mac_srs->srs_lock);

	mac_soft_ring_free(softring);
}
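
/*
 * Remove the Tx-side soft ring associated with the given Tx ring from
 * the Tx SRS and refresh the SRS fanout list.
 */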
void
mac_tx_srs_del_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring)
{
	int i;
	mac_soft_ring_t *soft_ring, *remove_sring;
	mac_client_impl_t *mcip = mac_srs->srs_mcip;

	mutex_enter(&mac_srs->srs_lock);
	for (i = 0; i < mac_srs->srs_tx_ring_count; i++) {
		soft_ring = mac_srs->srs_tx_soft_rings[i];
		if (soft_ring->s_ring_tx_arg2 == tx_ring)
			break;
	}
	mutex_exit(&mac_srs->srs_lock);
	ASSERT(i < mac_srs->srs_tx_ring_count);
	remove_sring = soft_ring;
	/*
	 * In the case of aggr, the soft ring associated with a Tx ring
	 * is also stored in st_soft_rings[] array. That entry should
	 * be removed.
	 */
	if (mcip->mci_state_flags & MCIS_IS_AGGR) {
		mac_srs_tx_t *tx = &mac_srs->srs_tx;

		ASSERT(tx->st_soft_rings[tx_ring->mr_index] == remove_sring);
		tx->st_soft_rings[tx_ring->mr_index] = NULL;
	}
	mac_soft_ring_remove(mac_srs, remove_sring);
	mac_srs_update_fanout_list(mac_srs);
}

/*
 * mac_tx_srs_setup():
 * Used to set up Tx rings. If no free Tx ring is available, the default
 * Tx ring is used.
 */
void
mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent)
{
	mac_impl_t *mip = mcip->mci_mip;
	mac_soft_ring_set_t *tx_srs = flent->fe_tx_srs;
	int i;
	int tx_ring_count = 0;
	uint32_t soft_ring_type;
	mac_group_t *grp = NULL;
	mac_ring_t *ring;
	mac_srs_tx_t *tx = &tx_srs->srs_tx;
	boolean_t is_aggr;
	uint_t ring_info = 0;

	is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR) != 0;
	grp = flent->fe_tx_ring_group;
	if (grp == NULL) {
		ring = (mac_ring_t *)mip->mi_default_tx_ring;
		goto no_group;
	}
	tx_ring_count = grp->mrg_cur_count;
	ring = grp->mrg_rings;
	/*
	 * An attempt is made to reserve 'tx_ring_count' number
	 * of Tx rings. If tx_ring_count is 0, the default Tx ring
	 * is used. If it is 1, an attempt is made to reserve one
	 * Tx ring. In both cases, the ring information is
	 * stored in the Tx SRS. If multiple Tx rings are specified,
	 * then each Tx ring will have a Tx-side soft ring. All
	 * these soft rings will hang off the Tx SRS.
	 */
	switch (grp->mrg_state) {
	case MAC_GROUP_STATE_SHARED:
	case MAC_GROUP_STATE_RESERVED:
		if (tx_ring_count <= 1 && !is_aggr) {
no_group:
			if (ring != NULL &&
			    ring->mr_state != MR_INUSE) {
				(void) mac_start_ring(ring);
				ring_info = mac_hwring_getinfo(
				    (mac_ring_handle_t)ring);
			}
			tx->st_arg2 = (void *)ring;
			mac_tx_srs_stat_recreate(tx_srs, B_FALSE);
			if (tx_srs->srs_type & SRST_BW_CONTROL) {
				tx->st_mode = SRS_TX_BW;
			} else if (mac_tx_serialize ||
			    (ring_info & MAC_RING_TX_SERIALIZE)) {
				tx->st_mode = SRS_TX_SERIALIZE;
			} else {
				tx->st_mode = SRS_TX_DEFAULT;
			}
			break;
		}
		soft_ring_type = ST_RING_TX;
		if (tx_srs->srs_type & SRST_BW_CONTROL) {
			tx->st_mode = is_aggr ?
			    SRS_TX_BW_AGGR : SRS_TX_BW_FANOUT;
		} else {
			tx->st_mode = is_aggr ? SRS_TX_AGGR :
			    SRS_TX_FANOUT;
		}
		for (i = 0; i < tx_ring_count; i++) {
			ASSERT(ring != NULL);
			switch (ring->mr_state) {
			case MR_INUSE:
			case MR_FREE:
				ASSERT(ring->mr_srs == NULL);

				if (ring->mr_state != MR_INUSE)
					(void) mac_start_ring(ring);
				ring_info = mac_hwring_getinfo(
				    (mac_ring_handle_t)ring);
				if (mac_tx_serialize || (ring_info &
				    MAC_RING_TX_SERIALIZE)) {
					soft_ring_type |=
					    ST_RING_WORKER_ONLY;
				}
				(void) mac_soft_ring_create(i, 0,
				    soft_ring_type, maxclsyspri,
				    mcip, tx_srs, -1, NULL, mcip,
				    (mac_resource_handle_t)ring);
				break;
			default:
				cmn_err(CE_PANIC,
				    "srs_setup: mcip = %p "
				    "trying to add UNKNOWN ring = %p\n",
				    (void *)mcip, (void *)ring);
				break;
			}
			ring = ring->mr_next;
		}
		mac_srs_update_fanout_list(tx_srs);
		break;
	default:
		ASSERT(B_FALSE);
	}
	tx->st_func = mac_tx_get_func(tx->st_mode);
	if (is_aggr) {
		VERIFY(i_mac_capab_get((mac_handle_t)mip,
		    MAC_CAPAB_AGGR, &tx->st_capab_aggr));
	}
	DTRACE_PROBE3(tx__srs___setup__return, mac_soft_ring_set_t *, tx_srs,
	    int, tx->st_mode, int, tx_srs->srs_tx_ring_count);
}

/*
 * Update the fanout of a client if its recorded link speed doesn't match
 * its current link speed.
 */
void
mac_fanout_recompute_client(mac_client_impl_t *mcip, cpupart_t *cpupart)
{
	uint64_t link_speed;
	mac_resource_props_t *mcip_mrp;
	flow_entry_t *flent = mcip->mci_flent;
	mac_soft_ring_set_t *rx_srs;
	mac_cpus_t *srs_cpu;
	int soft_ring_count, maxcpus;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	link_speed = mac_client_stat_get(mcip->mci_flent->fe_mcip,
	    MAC_STAT_IFSPEED);

	if ((link_speed != 0) &&
	    (link_speed != mcip->mci_flent->fe_nic_speed)) {
		mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
		/*
		 * Before calling mac_fanout_setup(), check to see if
		 * the SRSes already have the right number of soft
		 * rings. mac_fanout_setup() is a heavy duty operation
		 * where new cpu bindings are done for SRS and soft
		 * ring threads and interrupts re-targeted.
		 */
		maxcpus = (cpupart != NULL) ? cpupart->cp_ncpus : ncpus;
		soft_ring_count = mac_compute_soft_ring_count(flent,
		    flent->fe_rx_srs_cnt - 1, maxcpus);
		/*
		 * If soft_ring_count returned by
		 * mac_compute_soft_ring_count() is 0, bump it
		 * up by 1 because we always have at least one
		 * TCP, UDP, and OTH soft ring associated with
		 * an SRS.
		 */
		soft_ring_count = (soft_ring_count == 0) ?
		    1 : soft_ring_count;
		rx_srs = flent->fe_rx_srs[0];
		srs_cpu = &rx_srs->srs_cpu;
		if (soft_ring_count != srs_cpu->mc_rx_fanout_cnt) {
			mac_fanout_setup(mcip, flent, mcip_mrp,
			    mac_rx_deliver, mcip, NULL, cpupart);
		}
	}
}

/*
 * Walk through the list of MAC clients for the MAC.
 * For each active MAC client, recompute the number of soft rings
 * associated with that client, but only if the current speed is different
 * from the speed that was previously used for the soft ring computation.
 * If the cable is disconnected while the NIC is started, we would get
 * notification with speed set to 0. We do not recompute in that case.
 */
void
mac_fanout_recompute(mac_impl_t *mip)
{
	mac_client_impl_t *mcip;
	cpupart_t *cpupart;
	boolean_t use_default;
	mac_resource_props_t *mrp, *emrp;

	i_mac_perim_enter(mip);
	if ((mip->mi_state_flags & MIS_IS_VNIC) != 0 ||
	    mip->mi_linkstate != LINK_STATE_UP) {
		i_mac_perim_exit(mip);
		return;
	}

	for (mcip = mip->mi_clients_list; mcip != NULL;
	    mcip = mcip->mci_client_next) {
		if ((mcip->mci_state_flags & MCIS_SHARE_BOUND) != 0 ||
		    !MCIP_DATAPATH_SETUP(mcip))
			continue;
		mrp = MCIP_RESOURCE_PROPS(mcip);
		emrp = MCIP_EFFECTIVE_PROPS(mcip);
		use_default = B_FALSE;
		pool_lock();
		cpupart = mac_pset_find(mrp, &use_default);
		mac_fanout_recompute_client(mcip, cpupart);
		mac_set_pool_effective(use_default, cpupart, mrp, emrp);
		pool_unlock();
	}

	i_mac_perim_exit(mip);
}
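
/*
 * For example, a hypothetical caller holding a mac handle could force
 * all clients of the MAC into interrupt mode and later resume dynamic
 * polling with:
 *
 *	mac_poll_state_change(mh, B_FALSE);
 *	...
 *	mac_poll_state_change(mh, B_TRUE);
 */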

/*
 * Given a MAC, change the polling state for all its MAC clients. 'enable' is
 * B_TRUE to enable polling or B_FALSE to disable. Polling is enabled by
 * default.
 */
void
mac_poll_state_change(mac_handle_t mh, boolean_t enable)
{
	mac_impl_t *mip = (mac_impl_t *)mh;
	mac_client_impl_t *mcip;

	i_mac_perim_enter(mip);
	if (enable)
		mip->mi_state_flags &= ~MIS_POLL_DISABLE;
	else
		mip->mi_state_flags |= MIS_POLL_DISABLE;
	for (mcip = mip->mi_clients_list; mcip != NULL;
	    mcip = mcip->mci_client_next)
		mac_client_update_classifier(mcip, B_TRUE);
	i_mac_perim_exit(mip);
}