Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / usr / src / uts / common / io / aggr / aggr_grp.c
blob444e52812ae2e4b36fae2e42713148ea56dce322
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2015 Joyent, Inc.
27 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
29 * An instance of the structure aggr_grp_t is allocated for each
30 * link aggregation group. When created, aggr_grp_t objects are
31 * entered into the aggr_grp_hash hash table maintained by the modhash
32 * module. The hash key is the linkid associated with the link
33 * aggregation group.
35 * A set of MAC ports is associated with each aggregation group.
37 * Aggr pseudo TX rings
38 * --------------------
39 * The underlying ports (NICs) in an aggregation can have TX rings. To
40 * enhance aggr's performance, these TX rings are made available to the
41 * aggr layer as pseudo TX rings. The concept of pseudo rings is not new:
42 * they are already present and implemented on the RX side, where they are
43 * called pseudo RX rings. The same concept is extended to the TX side where
44 * each TX ring of an underlying port is reflected in aggr as a pseudo
45 * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
46 * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
47 * TX ring is given to the aggregation layer.
49 * With this change, the outgoing stack depth looks much better:
51 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
52 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
54 * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
55 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
57 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
58 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
59 * ring belonging to a port on which the packet has to be sent.
60 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
61 * policy and then uses the fanout_hint passed to it to pick a TX ring from
62 * the selected port.
64 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
65 * bandwidth limit is applied first on the outgoing packet and the packets
66 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
67 * particular TX ring.
70 #include <sys/types.h>
71 #include <sys/sysmacros.h>
72 #include <sys/conf.h>
73 #include <sys/cmn_err.h>
74 #include <sys/disp.h>
75 #include <sys/list.h>
76 #include <sys/ksynch.h>
77 #include <sys/kmem.h>
78 #include <sys/stream.h>
79 #include <sys/modctl.h>
80 #include <sys/ddi.h>
81 #include <sys/sunddi.h>
82 #include <sys/atomic.h>
83 #include <sys/stat.h>
84 #include <sys/modhash.h>
85 #include <sys/id_space.h>
86 #include <sys/strsun.h>
87 #include <sys/cred.h>
88 #include <sys/dlpi.h>
89 #include <sys/zone.h>
90 #include <sys/mac_provider.h>
91 #include <sys/dls.h>
92 #include <sys/vlan.h>
93 #include <sys/aggr.h>
94 #include <sys/aggr_impl.h>
96 static int aggr_m_start(void *);
97 static void aggr_m_stop(void *);
98 static int aggr_m_promisc(void *, boolean_t);
99 static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
100 static int aggr_m_unicst(void *, const uint8_t *);
101 static int aggr_m_stat(void *, uint_t, uint64_t *);
102 static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
103 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
104 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
105 const void *);
106 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
107 mac_prop_info_handle_t);
109 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
110 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
111 boolean_t *);
113 static void aggr_grp_capab_set(aggr_grp_t *);
114 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
115 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
116 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
117 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
118 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
120 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
121 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
122 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
123 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
124 static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
125 static void aggr_pseudo_stop_ring(mac_ring_driver_t);
126 static int aggr_addmac(void *, const uint8_t *);
127 static int aggr_remmac(void *, const uint8_t *);
128 static mblk_t *aggr_rx_poll(void *, int);
129 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
130 const int, mac_ring_info_t *, mac_ring_handle_t);
131 static void aggr_fill_group(void *, mac_ring_type_t, const int,
132 mac_group_info_t *, mac_group_handle_t);
134 static kmem_cache_t *aggr_grp_cache;
135 static mod_hash_t *aggr_grp_hash;
136 static krwlock_t aggr_grp_lock;
137 static uint_t aggr_grp_cnt;
138 static id_space_t *key_ids;
140 #define GRP_HASHSZ 64
141 #define GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid)
142 #define AGGR_PORT_NAME_DELIMIT '-'
144 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
146 #define AGGR_M_CALLBACK_FLAGS \
147 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)
149 static mac_callbacks_t aggr_m_callbacks = {
150 AGGR_M_CALLBACK_FLAGS,
151 aggr_m_stat,
152 aggr_m_start,
153 aggr_m_stop,
154 aggr_m_promisc,
155 aggr_m_multicst,
156 NULL,
157 NULL,
158 NULL,
159 aggr_m_ioctl,
160 aggr_m_capab_get,
161 NULL,
162 NULL,
163 aggr_m_setprop,
164 NULL,
165 aggr_m_propinfo
168 /*ARGSUSED*/
169 static int
170 aggr_grp_constructor(void *buf, void *arg, int kmflag)
172 aggr_grp_t *grp = buf;
174 bzero(grp, sizeof (*grp));
175 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
176 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
177 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
178 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
179 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
180 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
181 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
182 grp->lg_link_state = LINK_STATE_UNKNOWN;
183 return (0);
186 /*ARGSUSED*/
187 static void
188 aggr_grp_destructor(void *buf, void *arg)
190 aggr_grp_t *grp = buf;
192 if (grp->lg_tx_ports != NULL) {
193 kmem_free(grp->lg_tx_ports,
194 grp->lg_tx_ports_size * sizeof (aggr_port_t *));
197 mutex_destroy(&grp->lg_lacp_lock);
198 cv_destroy(&grp->lg_lacp_cv);
199 mutex_destroy(&grp->lg_port_lock);
200 cv_destroy(&grp->lg_port_cv);
201 rw_destroy(&grp->lg_tx_lock);
202 mutex_destroy(&grp->lg_tx_flowctl_lock);
203 cv_destroy(&grp->lg_tx_flowctl_cv);
206 void
207 aggr_grp_init(void)
209 aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
210 sizeof (aggr_grp_t), 0, aggr_grp_constructor,
211 aggr_grp_destructor, NULL, NULL, NULL, 0);
213 aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
214 GRP_HASHSZ, mod_hash_null_valdtor);
215 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
216 aggr_grp_cnt = 0;
219 * Allocate an id space to manage key values (when key is not
220 * specified). The range of the id space will be from
221 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
222 * uses a 16-bit key.
224 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
225 ASSERT(key_ids != NULL);
228 void
229 aggr_grp_fini(void)
231 id_space_destroy(key_ids);
232 rw_destroy(&aggr_grp_lock);
233 mod_hash_destroy_idhash(aggr_grp_hash);
234 kmem_cache_destroy(aggr_grp_cache);
237 uint_t
238 aggr_grp_count(void)
240 uint_t count;
242 rw_enter(&aggr_grp_lock, RW_READER);
243 count = aggr_grp_cnt;
244 rw_exit(&aggr_grp_lock);
245 return (count);
249 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
250 * requires the mac perimeter, this function holds a reference of the aggr
251 * and aggr won't call mac_unregister() until this reference drops to 0.
253 void
254 aggr_grp_port_hold(aggr_port_t *port)
256 aggr_grp_t *grp = port->lp_grp;
258 AGGR_PORT_REFHOLD(port);
259 mutex_enter(&grp->lg_port_lock);
260 grp->lg_port_ref++;
261 mutex_exit(&grp->lg_port_lock);
265 * Release the reference of the grp and inform aggr_grp_delete() calling
266 * mac_unregister() is now safe.
268 void
269 aggr_grp_port_rele(aggr_port_t *port)
271 aggr_grp_t *grp = port->lp_grp;
273 mutex_enter(&grp->lg_port_lock);
274 if (--grp->lg_port_ref == 0)
275 cv_signal(&grp->lg_port_cv);
276 mutex_exit(&grp->lg_port_lock);
277 AGGR_PORT_REFRELE(port);
281 * Wait for the port's lacp timer thread and the port's notification callback
282 * to exit.
284 void
285 aggr_grp_port_wait(aggr_grp_t *grp)
287 mutex_enter(&grp->lg_port_lock);
288 if (grp->lg_port_ref != 0)
289 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
290 mutex_exit(&grp->lg_port_lock);
294 * Attach a port to a link aggregation group.
296 * A port is attached to a link aggregation group once its speed
297 * and link state have been verified.
299 * Returns B_TRUE if the group link state or speed has changed. If
300 * it's the case, the caller must notify the MAC layer via a call
301 * to mac_link().
303 boolean_t
304 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
306 boolean_t link_state_changed = B_FALSE;
308 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
309 ASSERT(MAC_PERIM_HELD(port->lp_mh));
311 if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
312 return (B_FALSE);
315 * Validate the MAC port link speed and update the group
316 * link speed if needed.
318 if (port->lp_ifspeed == 0 ||
319 port->lp_link_state != LINK_STATE_UP ||
320 port->lp_link_duplex != LINK_DUPLEX_FULL) {
322 * Can't attach a MAC port with unknown link speed,
323 * down link, or not in full duplex mode.
325 return (B_FALSE);
328 if (grp->lg_ifspeed == 0) {
330 * The group inherits the speed of the first link being
331 * attached.
333 grp->lg_ifspeed = port->lp_ifspeed;
334 link_state_changed = B_TRUE;
335 } else if (grp->lg_ifspeed != port->lp_ifspeed) {
337 * The link speed of the MAC port must be the same as
338 * the group link speed, as per 802.3ad. Since it is
339 * not, the attach is cancelled.
341 return (B_FALSE);
344 grp->lg_nattached_ports++;
347 * Update the group link state.
349 if (grp->lg_link_state != LINK_STATE_UP) {
350 grp->lg_link_state = LINK_STATE_UP;
351 grp->lg_link_duplex = LINK_DUPLEX_FULL;
352 link_state_changed = B_TRUE;
356 * Update port's state.
358 port->lp_state = AGGR_PORT_STATE_ATTACHED;
360 aggr_grp_multicst_port(port, B_TRUE);
363 * Set port's receive callback
365 mac_rx_set(port->lp_mch, aggr_recv_cb, port);
368 * If LACP is OFF, the port can be used to send data as soon
369 * as its link is up and verified to be compatible with the
370 * aggregation.
372 * If LACP is active or passive, notify the LACP subsystem, which
373 * will enable sending on the port following the LACP protocol.
375 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
376 aggr_send_port_enable(port);
377 else
378 aggr_lacp_port_attached(port);
380 return (link_state_changed);
383 boolean_t
384 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
386 boolean_t link_state_changed = B_FALSE;
388 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
389 ASSERT(MAC_PERIM_HELD(port->lp_mh));
391 /* update state */
392 if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
393 return (B_FALSE);
395 mac_rx_clear(port->lp_mch);
397 aggr_grp_multicst_port(port, B_FALSE);
399 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
400 aggr_send_port_disable(port);
401 else
402 aggr_lacp_port_detached(port);
404 port->lp_state = AGGR_PORT_STATE_STANDBY;
406 grp->lg_nattached_ports--;
407 if (grp->lg_nattached_ports == 0) {
408 /* the last attached MAC port of the group is being detached */
409 grp->lg_ifspeed = 0;
410 grp->lg_link_state = LINK_STATE_DOWN;
411 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
412 link_state_changed = B_TRUE;
415 return (link_state_changed);
419 * Update the MAC addresses of the constituent ports of the specified
420 * group. This function is invoked:
421 * - after creating a new aggregation group.
422 * - after adding new ports to an aggregation group.
423 * - after removing a port from a group when the MAC address of
424 * that port was used for the MAC address of the group.
425 * - after the MAC address of a port changed when the MAC address
426 * of that port was used for the MAC address of the group.
428 * Return true if the link state of the aggregation changed, for example
429 * as a result of a failure changing the MAC address of one of the
430 * constituent ports.
432 boolean_t
433 aggr_grp_update_ports_mac(aggr_grp_t *grp)
435 aggr_port_t *cport;
436 boolean_t link_state_changed = B_FALSE;
437 mac_perim_handle_t mph;
439 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
441 for (cport = grp->lg_ports; cport != NULL;
442 cport = cport->lp_next) {
443 mac_perim_enter_by_mh(cport->lp_mh, &mph);
444 if (aggr_port_unicst(cport) != 0) {
445 if (aggr_grp_detach_port(grp, cport))
446 link_state_changed = B_TRUE;
447 } else {
449 * If a port was detached because of a previous
450 * failure changing the MAC address, the port is
451 * reattached when it successfully changes the MAC
452 * address now, and this might cause the link state
453 * of the aggregation to change.
455 if (aggr_grp_attach_port(grp, cport))
456 link_state_changed = B_TRUE;
458 mac_perim_exit(mph);
460 return (link_state_changed);
464 * Invoked when the MAC address of a port has changed. If the port's
465 * MAC address was used for the group MAC address, set mac_addr_changedp
466 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
467 * notification. If the link state changes due to detach/attach of
468 * the constituent port, set link_state_changedp to B_TRUE to indicate
469 * to the caller that it should send a MAC_NOTE_LINK notification. In both
470 * cases, it is the responsibility of the caller to invoke notification
471 * functions after releasing the the port lock.
473 void
474 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
475 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
477 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
478 ASSERT(MAC_PERIM_HELD(port->lp_mh));
479 ASSERT(mac_addr_changedp != NULL);
480 ASSERT(link_state_changedp != NULL);
482 *mac_addr_changedp = B_FALSE;
483 *link_state_changedp = B_FALSE;
485 if (grp->lg_addr_fixed) {
487 * The group is using a fixed MAC address or an automatic
488 * MAC address has not been set.
490 return;
493 if (grp->lg_mac_addr_port == port) {
495 * The MAC address of the port was assigned to the group
496 * MAC address. Update the group MAC address.
498 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
499 *mac_addr_changedp = B_TRUE;
500 } else {
502 * Update the actual port MAC address to the MAC address
503 * of the group.
505 if (aggr_port_unicst(port) != 0) {
506 *link_state_changedp = aggr_grp_detach_port(grp, port);
507 } else {
509 * If a port was detached because of a previous
510 * failure changing the MAC address, the port is
511 * reattached when it successfully changes the MAC
512 * address now, and this might cause the link state
513 * of the aggregation to change.
515 *link_state_changedp = aggr_grp_attach_port(grp, port);
521 * Add a port to a link aggregation group.
523 static int
524 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
525 aggr_port_t **pp)
527 aggr_port_t *port, **cport;
528 mac_perim_handle_t mph;
529 zoneid_t port_zoneid = ALL_ZONES;
530 int err;
532 /* The port must be int the same zone as the aggregation. */
533 if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
534 port_zoneid = GLOBAL_ZONEID;
535 if (grp->lg_zoneid != port_zoneid)
536 return (EBUSY);
539 * lg_mh could be NULL when the function is called during the creation
540 * of the aggregation.
542 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
544 /* create new port */
545 err = aggr_port_create(grp, port_linkid, force, &port);
546 if (err != 0)
547 return (err);
549 mac_perim_enter_by_mh(port->lp_mh, &mph);
551 /* add port to list of group constituent ports */
552 cport = &grp->lg_ports;
553 while (*cport != NULL)
554 cport = &((*cport)->lp_next);
555 *cport = port;
558 * Back reference to the group it is member of. A port always
559 * holds a reference to its group to ensure that the back
560 * reference is always valid.
562 port->lp_grp = grp;
563 AGGR_GRP_REFHOLD(grp);
564 grp->lg_nports++;
566 aggr_lacp_init_port(port);
567 mac_perim_exit(mph);
569 if (pp != NULL)
570 *pp = port;
572 return (0);
576 * This is called in response to either our LACP state machine or a MAC
577 * notification that the link has gone down via aggr_send_port_disable(). At
578 * this point, we may need to update our default ring. To that end, we go
579 * through the set of ports (underlying datalinks in an aggregation) that are
580 * currently enabled to transmit data. If all our links have been disabled for
581 * transmit, then we don't do anything.
583 * Note, because we only have a single TX group, we don't have to worry about
584 * the rings moving between groups and the chance that mac will reassign it
585 * unless someone removes a port, at which point, we play it safe and call this
586 * again.
588 void
589 aggr_grp_update_default(aggr_grp_t *grp)
591 aggr_port_t *port;
592 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
594 rw_enter(&grp->lg_tx_lock, RW_WRITER);
596 if (grp->lg_ntx_ports == 0) {
597 rw_exit(&grp->lg_tx_lock);
598 return;
601 port = grp->lg_tx_ports[0];
602 ASSERT(port->lp_tx_ring_cnt > 0);
603 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
604 rw_exit(&grp->lg_tx_lock);
608 * Add a pseudo RX ring for the given HW ring handle.
610 static int
611 aggr_add_pseudo_rx_ring(aggr_port_t *port,
612 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
614 aggr_pseudo_rx_ring_t *ring;
615 int err;
616 int j;
618 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
619 ring = rx_grp->arg_rings + j;
620 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
621 break;
625 * No slot for this new RX ring.
627 if (j == MAX_RINGS_PER_GROUP)
628 return (EIO);
630 ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
631 ring->arr_hw_rh = hw_rh;
632 ring->arr_port = port;
633 rx_grp->arg_ring_cnt++;
636 * The group is already registered, dynamically add a new ring to the
637 * mac group.
639 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
640 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
641 ring->arr_hw_rh = NULL;
642 ring->arr_port = NULL;
643 rx_grp->arg_ring_cnt--;
644 } else {
645 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
646 mac_find_ring(rx_grp->arg_gh, j));
648 return (err);
652 * Remove the pseudo RX ring of the given HW ring handle.
654 static void
655 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
657 aggr_pseudo_rx_ring_t *ring;
658 int j;
660 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
661 ring = rx_grp->arg_rings + j;
662 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
663 ring->arr_hw_rh != hw_rh) {
664 continue;
667 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
669 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
670 ring->arr_hw_rh = NULL;
671 ring->arr_port = NULL;
672 rx_grp->arg_ring_cnt--;
673 mac_hwring_teardown(hw_rh);
674 break;
679 * This function is called to create pseudo rings over the hardware rings of
680 * the underlying device. Note that there is a 1:1 mapping between the pseudo
681 * RX rings of the aggr and the hardware rings of the underlying port.
683 static int
684 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
686 aggr_grp_t *grp = port->lp_grp;
687 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
688 aggr_unicst_addr_t *addr, *a;
689 mac_perim_handle_t pmph;
690 int hw_rh_cnt, i = 0, j;
691 int err = 0;
693 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
694 mac_perim_enter_by_mh(port->lp_mh, &pmph);
697 * This function must be called after the aggr registers its mac
698 * and its RX group has been initialized.
700 ASSERT(rx_grp->arg_gh != NULL);
703 * Get the list the the underlying HW rings.
705 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
706 &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
708 if (port->lp_hwgh != NULL) {
710 * Quiesce the HW ring and the mac srs on the ring. Note
711 * that the HW ring will be restarted when the pseudo ring
712 * is started. At that time all the packets will be
713 * directly passed up to the pseudo RX ring and handled
714 * by mac srs created over the pseudo RX ring.
716 mac_rx_client_quiesce(port->lp_mch);
717 mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
721 * Add all the unicast addresses to the newly added port.
723 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
724 if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
725 break;
728 for (i = 0; err == 0 && i < hw_rh_cnt; i++)
729 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
731 if (err != 0) {
732 for (j = 0; j < i; j++)
733 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
735 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
736 aggr_port_remmac(port, a->aua_addr);
738 if (port->lp_hwgh != NULL) {
739 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
740 mac_rx_client_restart(port->lp_mch);
741 port->lp_hwgh = NULL;
743 } else {
744 port->lp_rx_grp_added = B_TRUE;
746 done:
747 mac_perim_exit(pmph);
748 return (err);
752 * This function is called by aggr to remove pseudo RX rings over the
753 * HW rings of the underlying port.
755 static void
756 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
758 aggr_grp_t *grp = port->lp_grp;
759 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
760 aggr_unicst_addr_t *addr;
761 mac_group_handle_t hwgh;
762 mac_perim_handle_t pmph;
763 int hw_rh_cnt, i;
765 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
766 mac_perim_enter_by_mh(port->lp_mh, &pmph);
768 if (!port->lp_rx_grp_added)
769 goto done;
771 ASSERT(rx_grp->arg_gh != NULL);
772 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
773 &hwgh, hw_rh, MAC_RING_TYPE_RX);
776 * If hw_rh_cnt is 0, it means that the underlying port does not
777 * support RX rings. Directly return in this case.
779 for (i = 0; i < hw_rh_cnt; i++)
780 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
782 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
783 aggr_port_remmac(port, addr->aua_addr);
785 if (port->lp_hwgh != NULL) {
786 port->lp_hwgh = NULL;
789 * First clear the permanent-quiesced flag of the RX srs then
790 * restart the HW ring and the mac srs on the ring. Note that
791 * the HW ring and associated SRS will soon been removed when
792 * the port is removed from the aggr.
794 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
795 mac_rx_client_restart(port->lp_mch);
798 port->lp_rx_grp_added = B_FALSE;
799 done:
800 mac_perim_exit(pmph);
804 * Add a pseudo TX ring for the given HW ring handle.
806 static int
807 aggr_add_pseudo_tx_ring(aggr_port_t *port,
808 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
809 mac_ring_handle_t *pseudo_rh)
811 aggr_pseudo_tx_ring_t *ring;
812 int err;
813 int i;
815 ASSERT(MAC_PERIM_HELD(port->lp_mh));
816 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
817 ring = tx_grp->atg_rings + i;
818 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
819 break;
822 * No slot for this new TX ring.
824 if (i == MAX_RINGS_PER_GROUP)
825 return (EIO);
827 * The following 4 statements needs to be done before
828 * calling mac_group_add_ring(). Otherwise it will
829 * result in an assertion failure in mac_init_ring().
831 ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
832 ring->atr_hw_rh = hw_rh;
833 ring->atr_port = port;
834 tx_grp->atg_ring_cnt++;
837 * The TX side has no concept of ring groups unlike RX groups.
838 * There is just a single group which stores all the TX rings.
839 * This group will be used to store aggr's pseudo TX rings.
841 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
842 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
843 ring->atr_hw_rh = NULL;
844 ring->atr_port = NULL;
845 tx_grp->atg_ring_cnt--;
846 } else {
847 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
848 if (hw_rh != NULL) {
849 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
850 mac_find_ring(tx_grp->atg_gh, i));
854 return (err);
858 * Remove the pseudo TX ring of the given HW ring handle.
860 static void
861 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
862 mac_ring_handle_t pseudo_hw_rh)
864 aggr_pseudo_tx_ring_t *ring;
865 int i;
867 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
868 ring = tx_grp->atg_rings + i;
869 if (ring->atr_rh != pseudo_hw_rh)
870 continue;
872 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
873 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
874 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
875 mac_hwring_teardown(ring->atr_hw_rh);
876 ring->atr_hw_rh = NULL;
877 ring->atr_port = NULL;
878 tx_grp->atg_ring_cnt--;
879 break;
884 * This function is called to create pseudo rings over hardware rings of
885 * the underlying device. There is a 1:1 mapping between the pseudo TX
886 * rings of the aggr and the hardware rings of the underlying port.
888 static int
889 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
891 aggr_grp_t *grp = port->lp_grp;
892 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
893 mac_perim_handle_t pmph;
894 int hw_rh_cnt, i = 0, j;
895 int err = 0;
897 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
898 mac_perim_enter_by_mh(port->lp_mh, &pmph);
901 * Get the list the the underlying HW rings.
903 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
904 NULL, hw_rh, MAC_RING_TYPE_TX);
907 * Even if the underlying NIC does not have TX rings, we
908 * still make a psuedo TX ring for that NIC with NULL as
909 * the ring handle.
911 if (hw_rh_cnt == 0)
912 port->lp_tx_ring_cnt = 1;
913 else
914 port->lp_tx_ring_cnt = hw_rh_cnt;
916 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
917 port->lp_tx_ring_cnt), KM_SLEEP);
918 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
919 port->lp_tx_ring_cnt), KM_SLEEP);
921 if (hw_rh_cnt == 0) {
922 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
923 NULL, &pseudo_rh)) == 0) {
924 port->lp_tx_rings[0] = NULL;
925 port->lp_pseudo_tx_rings[0] = pseudo_rh;
927 } else {
928 for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
929 err = aggr_add_pseudo_tx_ring(port,
930 tx_grp, hw_rh[i], &pseudo_rh);
931 if (err != 0)
932 break;
933 port->lp_tx_rings[i] = hw_rh[i];
934 port->lp_pseudo_tx_rings[i] = pseudo_rh;
938 if (err != 0) {
939 if (hw_rh_cnt != 0) {
940 for (j = 0; j < i; j++) {
941 aggr_rem_pseudo_tx_ring(tx_grp,
942 port->lp_pseudo_tx_rings[j]);
945 kmem_free(port->lp_tx_rings,
946 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
947 kmem_free(port->lp_pseudo_tx_rings,
948 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
949 port->lp_tx_ring_cnt = 0;
950 } else {
951 port->lp_tx_grp_added = B_TRUE;
952 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
953 aggr_tx_ring_update, port);
955 mac_perim_exit(pmph);
956 aggr_grp_update_default(grp);
957 return (err);
961 * This function is called by aggr to remove pseudo TX rings over the
962 * HW rings of the underlying port.
964 static void
965 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
967 aggr_grp_t *grp = port->lp_grp;
968 mac_perim_handle_t pmph;
969 int i;
971 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
972 mac_perim_enter_by_mh(port->lp_mh, &pmph);
974 if (!port->lp_tx_grp_added)
975 goto done;
977 ASSERT(tx_grp->atg_gh != NULL);
979 for (i = 0; i < port->lp_tx_ring_cnt; i++)
980 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
982 kmem_free(port->lp_tx_rings,
983 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
984 kmem_free(port->lp_pseudo_tx_rings,
985 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
987 port->lp_tx_ring_cnt = 0;
988 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
989 port->lp_tx_grp_added = B_FALSE;
990 aggr_grp_update_default(grp);
991 done:
992 mac_perim_exit(pmph);
995 static int
996 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
998 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
999 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1002 static int
1003 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1005 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1006 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1009 static int
1010 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1012 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1013 int err;
1015 err = mac_hwring_start(rr_ring->arr_hw_rh);
1016 if (err == 0)
1017 rr_ring->arr_gen = mr_gen;
1018 return (err);
1021 static void
1022 aggr_pseudo_stop_ring(mac_ring_driver_t arg)
1024 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1025 mac_hwring_stop(rr_ring->arr_hw_rh);
1029 * Add one or more ports to an existing link aggregation group.
1032 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1033 laioc_port_t *ports)
1035 int rc, i, nadded = 0;
1036 aggr_grp_t *grp = NULL;
1037 aggr_port_t *port;
1038 boolean_t link_state_changed = B_FALSE;
1039 mac_perim_handle_t mph, pmph;
1041 /* get group corresponding to linkid */
1042 rw_enter(&aggr_grp_lock, RW_READER);
1043 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1044 (mod_hash_val_t *)&grp) != 0) {
1045 rw_exit(&aggr_grp_lock);
1046 return (ENOENT);
1048 AGGR_GRP_REFHOLD(grp);
1051 * Hold the perimeter so that the aggregation won't be destroyed.
1053 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1054 rw_exit(&aggr_grp_lock);
1056 /* add the specified ports to group */
1057 for (i = 0; i < nports; i++) {
1058 /* add port to group */
1059 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1060 force, &port)) != 0) {
1061 goto bail;
1063 ASSERT(port != NULL);
1064 nadded++;
1066 /* check capabilities */
1067 if (!aggr_grp_capab_check(grp, port) ||
1068 !aggr_grp_sdu_check(grp, port) ||
1069 !aggr_grp_margin_check(grp, port)) {
1070 rc = ENOTSUP;
1071 goto bail;
1075 * Create the pseudo ring for each HW ring of the underlying
1076 * port.
1078 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1079 if (rc != 0)
1080 goto bail;
1081 rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
1082 if (rc != 0)
1083 goto bail;
1085 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1087 /* set LACP mode */
1088 aggr_port_lacp_set_mode(grp, port);
1090 /* start port if group has already been started */
1091 if (grp->lg_started) {
1092 rc = aggr_port_start(port);
1093 if (rc != 0) {
1094 mac_perim_exit(pmph);
1095 goto bail;
1099 * Turn on the promiscuous mode over the port when it
1100 * is requested to be turned on to receive the
1101 * non-primary address over a port, or the promiscous
1102 * mode is enabled over the aggr.
1104 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1105 rc = aggr_port_promisc(port, B_TRUE);
1106 if (rc != 0) {
1107 mac_perim_exit(pmph);
1108 goto bail;
1112 mac_perim_exit(pmph);
1115 * Attach each port if necessary.
1117 if (aggr_port_notify_link(grp, port))
1118 link_state_changed = B_TRUE;
1121 * Initialize the callback functions for this port.
1123 aggr_port_init_callbacks(port);
1126 /* update the MAC address of the constituent ports */
1127 if (aggr_grp_update_ports_mac(grp))
1128 link_state_changed = B_TRUE;
1130 if (link_state_changed)
1131 mac_link_update(grp->lg_mh, grp->lg_link_state);
1133 bail:
1134 if (rc != 0) {
1135 /* stop and remove ports that have been added */
1136 for (i = 0; i < nadded; i++) {
1137 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1138 ASSERT(port != NULL);
1139 if (grp->lg_started) {
1140 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1141 (void) aggr_port_promisc(port, B_FALSE);
1142 aggr_port_stop(port);
1143 mac_perim_exit(pmph);
1145 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1146 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1147 (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1151 mac_perim_exit(mph);
1152 AGGR_GRP_REFRELE(grp);
1153 return (rc);
1156 static int
1157 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1158 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1159 aggr_lacp_timer_t lacp_timer)
1161 boolean_t mac_addr_changed = B_FALSE;
1162 boolean_t link_state_changed = B_FALSE;
1163 mac_perim_handle_t pmph;
1165 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1167 /* validate fixed address if specified */
1168 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1169 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1170 (mac_addr[0] & 0x01))) {
1171 return (EINVAL);
1174 /* update policy if requested */
1175 if (update_mask & AGGR_MODIFY_POLICY)
1176 aggr_send_update_policy(grp, policy);
1178 /* update unicast MAC address if requested */
1179 if (update_mask & AGGR_MODIFY_MAC) {
1180 if (mac_fixed) {
1181 /* user-supplied MAC address */
1182 grp->lg_mac_addr_port = NULL;
1183 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1184 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1185 mac_addr_changed = B_TRUE;
1187 } else if (grp->lg_addr_fixed) {
1188 /* switch from user-supplied to automatic */
1189 aggr_port_t *port = grp->lg_ports;
1191 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1192 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1193 grp->lg_mac_addr_port = port;
1194 mac_addr_changed = B_TRUE;
1195 mac_perim_exit(pmph);
1197 grp->lg_addr_fixed = mac_fixed;
1200 if (mac_addr_changed)
1201 link_state_changed = aggr_grp_update_ports_mac(grp);
1203 if (update_mask & AGGR_MODIFY_LACP_MODE)
1204 aggr_lacp_update_mode(grp, lacp_mode);
1206 if (update_mask & AGGR_MODIFY_LACP_TIMER)
1207 aggr_lacp_update_timer(grp, lacp_timer);
1209 if (link_state_changed)
1210 mac_link_update(grp->lg_mh, grp->lg_link_state);
1212 if (mac_addr_changed)
1213 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1215 return (0);
1219 * Update properties of an existing link aggregation group.
1222 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1223 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1224 aggr_lacp_timer_t lacp_timer)
1226 aggr_grp_t *grp = NULL;
1227 mac_perim_handle_t mph;
1228 int err;
1230 /* get group corresponding to linkid */
1231 rw_enter(&aggr_grp_lock, RW_READER);
1232 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1233 (mod_hash_val_t *)&grp) != 0) {
1234 rw_exit(&aggr_grp_lock);
1235 return (ENOENT);
1237 AGGR_GRP_REFHOLD(grp);
1240 * Hold the perimeter so that the aggregation won't be destroyed.
1242 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1243 rw_exit(&aggr_grp_lock);
1245 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1246 mac_addr, lacp_mode, lacp_timer);
1248 mac_perim_exit(mph);
1249 AGGR_GRP_REFRELE(grp);
1250 return (err);
1254 * Create a new link aggregation group upon request from administrator.
1255 * Returns 0 on success, an errno on failure.
1258 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1259 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1260 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1261 cred_t *credp)
1263 aggr_grp_t *grp = NULL;
1264 aggr_port_t *port;
1265 mac_register_t *mac;
1266 boolean_t link_state_changed;
1267 mac_perim_handle_t mph;
1268 int err;
1269 int i;
1270 kt_did_t tid = 0;
1272 /* need at least one port */
1273 if (nports == 0)
1274 return (EINVAL);
1276 rw_enter(&aggr_grp_lock, RW_WRITER);
1278 /* does a group with the same linkid already exist? */
1279 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1280 (mod_hash_val_t *)&grp);
1281 if (err == 0) {
1282 rw_exit(&aggr_grp_lock);
1283 return (EEXIST);
1286 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1288 grp->lg_refs = 1;
1289 grp->lg_closing = B_FALSE;
1290 grp->lg_force = force;
1291 grp->lg_linkid = linkid;
1292 grp->lg_zoneid = crgetzoneid(credp);
1293 grp->lg_ifspeed = 0;
1294 grp->lg_link_state = LINK_STATE_UNKNOWN;
1295 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1296 grp->lg_started = B_FALSE;
1297 grp->lg_promisc = B_FALSE;
1298 grp->lg_lacp_done = B_FALSE;
1299 grp->lg_tx_notify_done = B_FALSE;
1300 grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1301 grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1302 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1303 grp->lg_tx_notify_thread = thread_create(NULL, 0,
1304 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1305 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1306 MAX_RINGS_PER_GROUP), KM_SLEEP);
1307 grp->lg_tx_blocked_cnt = 0;
1308 bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
1309 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1310 aggr_lacp_init_grp(grp);
1312 /* add MAC ports to group */
1313 grp->lg_ports = NULL;
1314 grp->lg_nports = 0;
1315 grp->lg_nattached_ports = 0;
1316 grp->lg_ntx_ports = 0;
1319 * If key is not specified by the user, allocate the key.
1321 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1322 err = ENOMEM;
1323 goto bail;
1325 grp->lg_key = key;
1327 for (i = 0; i < nports; i++) {
1328 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL);
1329 if (err != 0)
1330 goto bail;
1334 * If no explicit MAC address was specified by the administrator,
1335 * set it to the MAC address of the first port.
1337 grp->lg_addr_fixed = mac_fixed;
1338 if (grp->lg_addr_fixed) {
1339 /* validate specified address */
1340 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1341 err = EINVAL;
1342 goto bail;
1344 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1345 } else {
1346 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1347 grp->lg_mac_addr_port = grp->lg_ports;
1350 /* set the initial group capabilities */
1351 aggr_grp_capab_set(grp);
1353 if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1354 err = ENOMEM;
1355 goto bail;
1357 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1358 mac->m_driver = grp;
1359 mac->m_dip = aggr_dip;
1360 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1361 mac->m_src_addr = grp->lg_addr;
1362 mac->m_callbacks = &aggr_m_callbacks;
1363 mac->m_min_sdu = 0;
1364 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1365 mac->m_margin = aggr_grp_max_margin(grp);
1366 mac->m_v12n = MAC_VIRT_LEVEL1;
1367 err = mac_register(mac, &grp->lg_mh);
1368 mac_free(mac);
1369 if (err != 0)
1370 goto bail;
1372 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1373 if (err != 0) {
1374 (void) mac_unregister(grp->lg_mh);
1375 grp->lg_mh = NULL;
1376 goto bail;
1379 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1382 * Update the MAC address of the constituent ports.
1383 * None of the port is attached at this time, the link state of the
1384 * aggregation will not change.
1386 link_state_changed = aggr_grp_update_ports_mac(grp);
1387 ASSERT(!link_state_changed);
1389 /* update outbound load balancing policy */
1390 aggr_send_update_policy(grp, policy);
1392 /* set LACP mode */
1393 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1396 * Attach each port if necessary.
1398 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1400 * Create the pseudo ring for each HW ring of the underlying
1401 * port. Note that this is done after the aggr registers the
1402 * mac.
1404 VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
1405 VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
1406 if (aggr_port_notify_link(grp, port))
1407 link_state_changed = B_TRUE;
1410 * Initialize the callback functions for this port.
1412 aggr_port_init_callbacks(port);
1415 if (link_state_changed)
1416 mac_link_update(grp->lg_mh, grp->lg_link_state);
1418 /* add new group to hash table */
1419 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1420 (mod_hash_val_t)grp);
1421 ASSERT(err == 0);
1422 aggr_grp_cnt++;
1424 mac_perim_exit(mph);
1425 rw_exit(&aggr_grp_lock);
1426 return (0);
1428 bail:
1430 grp->lg_closing = B_TRUE;
1432 port = grp->lg_ports;
1433 while (port != NULL) {
1434 aggr_port_t *cport;
1436 cport = port->lp_next;
1437 aggr_port_delete(port);
1438 port = cport;
1442 * Inform the lacp_rx thread to exit.
1444 mutex_enter(&grp->lg_lacp_lock);
1445 grp->lg_lacp_done = B_TRUE;
1446 cv_signal(&grp->lg_lacp_cv);
1447 while (grp->lg_lacp_rx_thread != NULL)
1448 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1449 mutex_exit(&grp->lg_lacp_lock);
1451 * Inform the tx_notify thread to exit.
1453 mutex_enter(&grp->lg_tx_flowctl_lock);
1454 if (grp->lg_tx_notify_thread != NULL) {
1455 tid = grp->lg_tx_notify_thread->t_did;
1456 grp->lg_tx_notify_done = B_TRUE;
1457 cv_signal(&grp->lg_tx_flowctl_cv);
1459 mutex_exit(&grp->lg_tx_flowctl_lock);
1460 if (tid != 0)
1461 thread_join(tid);
1463 kmem_free(grp->lg_tx_blocked_rings,
1464 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1465 rw_exit(&aggr_grp_lock);
1466 AGGR_GRP_REFRELE(grp);
1467 return (err);
1471 * Return a pointer to the member of a group with specified linkid.
1473 static aggr_port_t *
1474 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1476 aggr_port_t *port;
1478 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1480 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1481 if (port->lp_linkid == linkid)
1482 break;
1485 return (port);
1489 * Stop, detach and remove a port from a link aggregation group.
1491 static int
1492 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1493 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1495 int rc = 0;
1496 aggr_port_t **pport;
1497 boolean_t mac_addr_changed = B_FALSE;
1498 boolean_t link_state_changed = B_FALSE;
1499 mac_perim_handle_t mph;
1500 uint64_t val;
1501 uint_t i;
1502 uint_t stat;
1504 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1505 ASSERT(grp->lg_nports > 1);
1506 ASSERT(!grp->lg_closing);
1508 /* unlink port */
1509 for (pport = &grp->lg_ports; *pport != port;
1510 pport = &(*pport)->lp_next) {
1511 if (*pport == NULL) {
1512 rc = ENOENT;
1513 goto done;
1516 *pport = port->lp_next;
1518 mac_perim_enter_by_mh(port->lp_mh, &mph);
1521 * If the MAC address of the port being removed was assigned
1522 * to the group, update the group MAC address
1523 * using the MAC address of a different port.
1525 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1527 * Set the MAC address of the group to the
1528 * MAC address of its first port.
1530 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1531 grp->lg_mac_addr_port = grp->lg_ports;
1532 mac_addr_changed = B_TRUE;
1535 link_state_changed = aggr_grp_detach_port(grp, port);
1538 * Add the counter statistics of the ports while it was aggregated
1539 * to the group's residual statistics. This is done by obtaining
1540 * the current counter from the underlying MAC then subtracting the
1541 * value of the counter at the moment it was added to the
1542 * aggregation.
1544 for (i = 0; i < MAC_NSTAT; i++) {
1545 stat = i + MAC_STAT_MIN;
1546 if (!MAC_STAT_ISACOUNTER(stat))
1547 continue;
1548 val = aggr_port_stat(port, stat);
1549 val -= port->lp_stat[i];
1550 grp->lg_stat[i] += val;
1552 for (i = 0; i < ETHER_NSTAT; i++) {
1553 stat = i + MACTYPE_STAT_MIN;
1554 if (!ETHER_STAT_ISACOUNTER(stat))
1555 continue;
1556 val = aggr_port_stat(port, stat);
1557 val -= port->lp_ether_stat[i];
1558 grp->lg_ether_stat[i] += val;
1561 grp->lg_nports--;
1562 mac_perim_exit(mph);
1564 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1565 aggr_port_delete(port);
1568 * If the group MAC address has changed, update the MAC address of
1569 * the remaining constituent ports according to the new MAC
1570 * address of the group.
1572 if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1573 link_state_changed = B_TRUE;
1575 done:
1576 if (mac_addr_changedp != NULL)
1577 *mac_addr_changedp = mac_addr_changed;
1578 if (link_state_changedp != NULL)
1579 *link_state_changedp = link_state_changed;
1581 return (rc);
1585 * Remove one or more ports from an existing link aggregation group.
1588 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1590 int rc = 0, i;
1591 aggr_grp_t *grp = NULL;
1592 aggr_port_t *port;
1593 boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1594 boolean_t link_state_update = B_FALSE, link_state_changed;
1595 mac_perim_handle_t mph, pmph;
1597 /* get group corresponding to linkid */
1598 rw_enter(&aggr_grp_lock, RW_READER);
1599 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1600 (mod_hash_val_t *)&grp) != 0) {
1601 rw_exit(&aggr_grp_lock);
1602 return (ENOENT);
1604 AGGR_GRP_REFHOLD(grp);
1607 * Hold the perimeter so that the aggregation won't be destroyed.
1609 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1610 rw_exit(&aggr_grp_lock);
1612 /* we need to keep at least one port per group */
1613 if (nports >= grp->lg_nports) {
1614 rc = EINVAL;
1615 goto bail;
1618 /* first verify that all the groups are valid */
1619 for (i = 0; i < nports; i++) {
1620 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1621 /* port not found */
1622 rc = ENOENT;
1623 goto bail;
1627 /* clear the promiscous mode for the specified ports */
1628 for (i = 0; i < nports && rc == 0; i++) {
1629 /* lookup port */
1630 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1631 ASSERT(port != NULL);
1633 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1634 rc = aggr_port_promisc(port, B_FALSE);
1635 mac_perim_exit(pmph);
1637 if (rc != 0) {
1638 for (i = 0; i < nports; i++) {
1639 port = aggr_grp_port_lookup(grp,
1640 ports[i].lp_linkid);
1641 ASSERT(port != NULL);
1644 * Turn the promiscuous mode back on if it is required
1645 * to receive the non-primary address over a port, or
1646 * the promiscous mode is enabled over the aggr.
1648 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1649 if (port->lp_started && (grp->lg_promisc ||
1650 port->lp_prom_addr != NULL)) {
1651 (void) aggr_port_promisc(port, B_TRUE);
1653 mac_perim_exit(pmph);
1655 goto bail;
1658 /* remove the specified ports from group */
1659 for (i = 0; i < nports; i++) {
1660 /* lookup port */
1661 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1662 ASSERT(port != NULL);
1664 /* stop port if group has already been started */
1665 if (grp->lg_started) {
1666 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1667 aggr_port_stop(port);
1668 mac_perim_exit(pmph);
1672 * aggr_rem_pseudo_tx_group() is not called here. Instead
1673 * it is called from inside aggr_grp_rem_port() after the
1674 * port has been detached. The reason is that
1675 * aggr_rem_pseudo_tx_group() removes one ring at a time
1676 * and if there is still traffic going on, then there
1677 * is the possibility of aggr_find_tx_ring() returning a
1678 * removed ring for transmission. Once the port has been
1679 * detached, that port will not be used and
1680 * aggr_find_tx_ring() will not return any rings
1681 * belonging to it.
1683 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1685 /* remove port from group */
1686 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1687 &link_state_changed);
1688 ASSERT(rc == 0);
1689 mac_addr_update = mac_addr_update || mac_addr_changed;
1690 link_state_update = link_state_update || link_state_changed;
1693 bail:
1694 if (mac_addr_update)
1695 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1696 if (link_state_update)
1697 mac_link_update(grp->lg_mh, grp->lg_link_state);
1699 mac_perim_exit(mph);
1700 AGGR_GRP_REFRELE(grp);
1702 return (rc);
1706 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
1708 aggr_grp_t *grp = NULL;
1709 aggr_port_t *port, *cport;
1710 datalink_id_t tmpid;
1711 mod_hash_val_t val;
1712 mac_perim_handle_t mph, pmph;
1713 int err;
1714 kt_did_t tid = 0;
1716 rw_enter(&aggr_grp_lock, RW_WRITER);
1718 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1719 (mod_hash_val_t *)&grp) != 0) {
1720 rw_exit(&aggr_grp_lock);
1721 return (ENOENT);
1725 * Note that dls_devnet_destroy() must be called before lg_lock is
1726 * held. Otherwise, it will deadlock if another thread is in
1727 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
1728 * dls_devnet_destroy() needs to delete.
1730 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
1731 rw_exit(&aggr_grp_lock);
1732 return (err);
1734 ASSERT(linkid == tmpid);
1737 * Unregister from the MAC service module. Since this can
1738 * fail if a client hasn't closed the MAC port, we gracefully
1739 * fail the operation.
1741 if ((err = mac_disable(grp->lg_mh)) != 0) {
1742 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
1743 rw_exit(&aggr_grp_lock);
1744 return (err);
1746 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
1747 ASSERT(grp == (aggr_grp_t *)val);
1749 ASSERT(aggr_grp_cnt > 0);
1750 aggr_grp_cnt--;
1751 rw_exit(&aggr_grp_lock);
1754 * Inform the lacp_rx thread to exit.
1756 mutex_enter(&grp->lg_lacp_lock);
1757 grp->lg_lacp_done = B_TRUE;
1758 cv_signal(&grp->lg_lacp_cv);
1759 while (grp->lg_lacp_rx_thread != NULL)
1760 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1761 mutex_exit(&grp->lg_lacp_lock);
1763 * Inform the tx_notify_thread to exit.
1765 mutex_enter(&grp->lg_tx_flowctl_lock);
1766 if (grp->lg_tx_notify_thread != NULL) {
1767 tid = grp->lg_tx_notify_thread->t_did;
1768 grp->lg_tx_notify_done = B_TRUE;
1769 cv_signal(&grp->lg_tx_flowctl_cv);
1771 mutex_exit(&grp->lg_tx_flowctl_lock);
1772 if (tid != 0)
1773 thread_join(tid);
1775 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1777 grp->lg_closing = B_TRUE;
1778 /* detach and free MAC ports associated with group */
1779 port = grp->lg_ports;
1780 while (port != NULL) {
1781 cport = port->lp_next;
1782 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1783 if (grp->lg_started)
1784 aggr_port_stop(port);
1785 (void) aggr_grp_detach_port(grp, port);
1786 mac_perim_exit(pmph);
1787 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1788 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1789 aggr_port_delete(port);
1790 port = cport;
1793 mac_perim_exit(mph);
1795 kmem_free(grp->lg_tx_blocked_rings,
1796 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1798 * Wait for the port's lacp timer thread and its notification callback
1799 * to exit before calling mac_unregister() since both needs to access
1800 * the mac perimeter of the grp.
1802 aggr_grp_port_wait(grp);
1804 VERIFY(mac_unregister(grp->lg_mh) == 0);
1805 grp->lg_mh = NULL;
1807 AGGR_GRP_REFRELE(grp);
1808 return (0);
1811 void
1812 aggr_grp_free(aggr_grp_t *grp)
1814 ASSERT(grp->lg_refs == 0);
1815 ASSERT(grp->lg_port_ref == 0);
1816 if (grp->lg_key > AGGR_MAX_KEY) {
1817 id_free(key_ids, grp->lg_key);
1818 grp->lg_key = 0;
1820 kmem_cache_free(aggr_grp_cache, grp);
1824 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1825 aggr_grp_info_new_grp_fn_t new_grp_fn,
1826 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1828 aggr_grp_t *grp;
1829 aggr_port_t *port;
1830 mac_perim_handle_t mph, pmph;
1831 int rc = 0;
1834 * Make sure that the aggregation link is visible from the caller's
1835 * zone.
1837 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
1838 return (ENOENT);
1840 rw_enter(&aggr_grp_lock, RW_READER);
1842 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1843 (mod_hash_val_t *)&grp) != 0) {
1844 rw_exit(&aggr_grp_lock);
1845 return (ENOENT);
1847 AGGR_GRP_REFHOLD(grp);
1849 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1850 rw_exit(&aggr_grp_lock);
1852 rc = new_grp_fn(fn_arg, grp->lg_linkid,
1853 (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
1854 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
1855 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
1857 if (rc != 0)
1858 goto bail;
1860 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1861 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1862 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
1863 port->lp_state, &port->lp_lacp.ActorOperPortState);
1864 mac_perim_exit(pmph);
1866 if (rc != 0)
1867 goto bail;
1870 bail:
1871 mac_perim_exit(mph);
1872 AGGR_GRP_REFRELE(grp);
1873 return (rc);
1876 /*ARGSUSED*/
1877 static void
1878 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1880 miocnak(q, mp, 0, ENOTSUP);
1883 static int
1884 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
1886 aggr_port_t *port;
1887 uint_t stat_index;
1889 /* We only aggregate counter statistics. */
1890 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
1891 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
1892 return (ENOTSUP);
1896 * Counter statistics for a group are computed by aggregating the
1897 * counters of the members MACs while they were aggregated, plus
1898 * the residual counter of the group itself, which is updated each
1899 * time a MAC is removed from the group.
1901 *val = 0;
1902 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1903 /* actual port statistic */
1904 *val += aggr_port_stat(port, stat);
1906 * minus the port stat when it was added, plus any residual
1907 * amount for the group.
1909 if (IS_MAC_STAT(stat)) {
1910 stat_index = stat - MAC_STAT_MIN;
1911 *val -= port->lp_stat[stat_index];
1912 *val += grp->lg_stat[stat_index];
1913 } else if (IS_MACTYPE_STAT(stat)) {
1914 stat_index = stat - MACTYPE_STAT_MIN;
1915 *val -= port->lp_ether_stat[stat_index];
1916 *val += grp->lg_ether_stat[stat_index];
1919 return (0);
1923 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1925 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
1927 if (rx_ring->arr_hw_rh != NULL) {
1928 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
1929 } else {
1930 aggr_port_t *port = rx_ring->arr_port;
1932 *val = mac_stat_get(port->lp_mh, stat);
1935 return (0);
1939 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1941 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
1943 if (tx_ring->atr_hw_rh != NULL) {
1944 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
1945 } else {
1946 aggr_port_t *port = tx_ring->atr_port;
1948 *val = mac_stat_get(port->lp_mh, stat);
1950 return (0);
1953 static int
1954 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
1956 aggr_grp_t *grp = arg;
1957 mac_perim_handle_t mph;
1958 int rval = 0;
1960 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1962 switch (stat) {
1963 case MAC_STAT_IFSPEED:
1964 *val = grp->lg_ifspeed;
1965 break;
1967 case ETHER_STAT_LINK_DUPLEX:
1968 *val = grp->lg_link_duplex;
1969 break;
1971 default:
1973 * For all other statistics, we return the aggregated stat
1974 * from the underlying ports. aggr_grp_stat() will set
1975 * rval appropriately if the statistic isn't a counter.
1977 rval = aggr_grp_stat(grp, stat, val);
1980 mac_perim_exit(mph);
1981 return (rval);
1984 static int
1985 aggr_m_start(void *arg)
1987 aggr_grp_t *grp = arg;
1988 aggr_port_t *port;
1989 mac_perim_handle_t mph, pmph;
1991 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1994 * Attempts to start all configured members of the group.
1995 * Group members will be attached when their link-up notification
1996 * is received.
1998 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1999 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2000 if (aggr_port_start(port) != 0) {
2001 mac_perim_exit(pmph);
2002 continue;
2006 * Turn on the promiscuous mode if it is required to receive
2007 * the non-primary address over a port, or the promiscous
2008 * mode is enabled over the aggr.
2010 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
2011 if (aggr_port_promisc(port, B_TRUE) != 0)
2012 aggr_port_stop(port);
2014 mac_perim_exit(pmph);
2017 grp->lg_started = B_TRUE;
2019 mac_perim_exit(mph);
2020 return (0);
2023 static void
2024 aggr_m_stop(void *arg)
2026 aggr_grp_t *grp = arg;
2027 aggr_port_t *port;
2028 mac_perim_handle_t mph, pmph;
2030 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2032 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2033 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2035 /* reset port promiscuous mode */
2036 (void) aggr_port_promisc(port, B_FALSE);
2038 aggr_port_stop(port);
2039 mac_perim_exit(pmph);
2042 grp->lg_started = B_FALSE;
2043 mac_perim_exit(mph);
2046 static int
2047 aggr_m_promisc(void *arg, boolean_t on)
2049 aggr_grp_t *grp = arg;
2050 aggr_port_t *port;
2051 boolean_t link_state_changed = B_FALSE;
2052 mac_perim_handle_t mph, pmph;
2054 AGGR_GRP_REFHOLD(grp);
2055 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2057 ASSERT(!grp->lg_closing);
2059 if (on == grp->lg_promisc)
2060 goto bail;
2062 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2063 int err = 0;
2065 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2066 AGGR_PORT_REFHOLD(port);
2067 if (!on && (port->lp_prom_addr == NULL))
2068 err = aggr_port_promisc(port, B_FALSE);
2069 else if (on && port->lp_started)
2070 err = aggr_port_promisc(port, B_TRUE);
2072 if (err != 0) {
2073 if (aggr_grp_detach_port(grp, port))
2074 link_state_changed = B_TRUE;
2075 } else {
2077 * If a port was detached because of a previous
2078 * failure changing the promiscuity, the port
2079 * is reattached when it successfully changes
2080 * the promiscuity now, and this might cause
2081 * the link state of the aggregation to change.
2083 if (aggr_grp_attach_port(grp, port))
2084 link_state_changed = B_TRUE;
2086 mac_perim_exit(pmph);
2087 AGGR_PORT_REFRELE(port);
2090 grp->lg_promisc = on;
2092 if (link_state_changed)
2093 mac_link_update(grp->lg_mh, grp->lg_link_state);
2095 bail:
2096 mac_perim_exit(mph);
2097 AGGR_GRP_REFRELE(grp);
2099 return (0);
2102 static void
2103 aggr_grp_port_rename(const char *new_name, void *arg)
2106 * aggr port's mac client name is the format of "aggr link name" plus
2107 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2109 int aggr_len, link_len, clnt_name_len, i;
2110 char *str_end, *str_st, *str_del;
2111 char aggr_name[MAXNAMELEN];
2112 char link_name[MAXNAMELEN];
2113 char *clnt_name;
2114 aggr_grp_t *aggr_grp = arg;
2115 aggr_port_t *aggr_port = aggr_grp->lg_ports;
2117 for (i = 0; i < aggr_grp->lg_nports; i++) {
2118 clnt_name = mac_client_name(aggr_port->lp_mch);
2119 clnt_name_len = strlen(clnt_name);
2120 str_st = clnt_name;
2121 str_end = &(clnt_name[clnt_name_len]);
2122 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2123 ASSERT(str_del != NULL);
2124 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2125 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2126 bzero(aggr_name, MAXNAMELEN);
2127 bzero(link_name, MAXNAMELEN);
2128 bcopy(clnt_name, aggr_name, aggr_len);
2129 bcopy(str_del, link_name, link_len + 1);
2130 bzero(clnt_name, MAXNAMELEN);
2131 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2132 link_name);
2134 (void) mac_rename_primary(aggr_port->lp_mh, NULL);
2135 aggr_port = aggr_port->lp_next;
2140 * Initialize the capabilities that are advertised for the group
2141 * according to the capabilities of the constituent ports.
2143 static boolean_t
2144 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2146 aggr_grp_t *grp = arg;
2148 switch (cap) {
2149 case MAC_CAPAB_HCKSUM: {
2150 uint32_t *hcksum_txflags = cap_data;
2151 *hcksum_txflags = grp->lg_hcksum_txflags;
2152 break;
2154 case MAC_CAPAB_LSO: {
2155 mac_capab_lso_t *cap_lso = cap_data;
2157 if (grp->lg_lso) {
2158 *cap_lso = grp->lg_cap_lso;
2159 break;
2160 } else {
2161 return (B_FALSE);
2164 case MAC_CAPAB_NO_NATIVEVLAN:
2165 return (!grp->lg_vlan);
2166 case MAC_CAPAB_NO_ZCOPY:
2167 return (!grp->lg_zcopy);
2168 case MAC_CAPAB_RINGS: {
2169 mac_capab_rings_t *cap_rings = cap_data;
2171 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2172 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2173 cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
2176 * An aggregation advertises only one (pseudo) RX
2177 * group, which virtualizes the main/primary group of
2178 * the underlying devices.
2180 cap_rings->mr_gnum = 1;
2181 cap_rings->mr_gaddring = NULL;
2182 cap_rings->mr_gremring = NULL;
2183 } else {
2184 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2185 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2186 cap_rings->mr_gnum = 0;
2188 cap_rings->mr_rget = aggr_fill_ring;
2189 cap_rings->mr_gget = aggr_fill_group;
2190 break;
2192 case MAC_CAPAB_AGGR:
2194 mac_capab_aggr_t *aggr_cap;
2196 if (cap_data != NULL) {
2197 aggr_cap = cap_data;
2198 aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2199 aggr_cap->mca_unicst = aggr_m_unicst;
2200 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2201 aggr_cap->mca_arg = arg;
2203 return (B_TRUE);
2205 default:
2206 return (B_FALSE);
2208 return (B_TRUE);
2212 * Callback funtion for MAC layer to register groups.
2214 static void
2215 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2216 mac_group_info_t *infop, mac_group_handle_t gh)
2218 aggr_grp_t *grp = arg;
2219 aggr_pseudo_rx_group_t *rx_group;
2220 aggr_pseudo_tx_group_t *tx_group;
2222 ASSERT(index == 0);
2223 if (rtype == MAC_RING_TYPE_RX) {
2224 rx_group = &grp->lg_rx_group;
2225 rx_group->arg_gh = gh;
2226 rx_group->arg_grp = grp;
2228 infop->mgi_driver = (mac_group_driver_t)rx_group;
2229 infop->mgi_start = NULL;
2230 infop->mgi_stop = NULL;
2231 infop->mgi_addmac = aggr_addmac;
2232 infop->mgi_remmac = aggr_remmac;
2233 infop->mgi_count = rx_group->arg_ring_cnt;
2234 } else {
2235 tx_group = &grp->lg_tx_group;
2236 tx_group->atg_gh = gh;
2241 * Callback funtion for MAC layer to register all rings.
2243 static void
2244 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2245 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2247 aggr_grp_t *grp = arg;
2249 switch (rtype) {
2250 case MAC_RING_TYPE_RX: {
2251 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group;
2252 aggr_pseudo_rx_ring_t *rx_ring;
2253 mac_intr_t aggr_mac_intr;
2255 ASSERT(rg_index == 0);
2257 ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
2258 rx_ring = rx_group->arg_rings + index;
2259 rx_ring->arr_rh = rh;
2262 * Entrypoint to enable interrupt (disable poll) and
2263 * disable interrupt (enable poll).
2265 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2266 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2267 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2268 aggr_mac_intr.mi_ddi_handle = NULL;
2270 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2271 infop->mri_start = aggr_pseudo_start_ring;
2272 infop->mri_stop = aggr_pseudo_stop_ring;
2274 infop->mri_intr = aggr_mac_intr;
2275 infop->mri_poll = aggr_rx_poll;
2277 infop->mri_stat = aggr_rx_ring_stat;
2278 break;
2280 case MAC_RING_TYPE_TX: {
2281 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2282 aggr_pseudo_tx_ring_t *tx_ring;
2284 ASSERT(rg_index == -1);
2285 ASSERT(index < tx_group->atg_ring_cnt);
2287 tx_ring = &tx_group->atg_rings[index];
2288 tx_ring->atr_rh = rh;
2290 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2291 infop->mri_start = NULL;
2292 infop->mri_stop = NULL;
2293 infop->mri_tx = aggr_ring_tx;
2294 infop->mri_stat = aggr_tx_ring_stat;
2296 * Use the hw TX ring handle to find if the ring needs
2297 * serialization or not. For NICs that do not expose
2298 * Tx rings, atr_hw_rh will be NULL.
2300 if (tx_ring->atr_hw_rh != NULL) {
2301 infop->mri_flags =
2302 mac_hwring_getinfo(tx_ring->atr_hw_rh);
2304 break;
2306 default:
2307 break;
2311 static mblk_t *
2312 aggr_rx_poll(void *arg, int bytes_to_pickup)
2314 aggr_pseudo_rx_ring_t *rr_ring = arg;
2315 aggr_port_t *port = rr_ring->arr_port;
2316 aggr_grp_t *grp = port->lp_grp;
2317 mblk_t *mp_chain, *mp, **mpp;
2319 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2321 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2322 return (mp_chain);
2324 mpp = &mp_chain;
2325 while ((mp = *mpp) != NULL) {
2326 if (MBLKL(mp) >= sizeof (struct ether_header)) {
2327 struct ether_header *ehp;
2329 ehp = (struct ether_header *)mp->b_rptr;
2330 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2331 *mpp = mp->b_next;
2332 mp->b_next = NULL;
2333 aggr_recv_lacp(port,
2334 (mac_resource_handle_t)rr_ring, mp);
2335 continue;
2339 if (!port->lp_collector_enabled) {
2340 *mpp = mp->b_next;
2341 mp->b_next = NULL;
2342 freemsg(mp);
2343 continue;
2345 mpp = &mp->b_next;
2347 return (mp_chain);
2350 static int
2351 aggr_addmac(void *arg, const uint8_t *mac_addr)
2353 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2354 aggr_unicst_addr_t *addr, **pprev;
2355 aggr_grp_t *grp = rx_group->arg_grp;
2356 aggr_port_t *port, *p;
2357 mac_perim_handle_t mph;
2358 int err = 0;
2360 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2362 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2363 mac_perim_exit(mph);
2364 return (0);
2368 * Insert this mac address into the list of mac addresses owned by
2369 * the aggregation pseudo group.
2371 pprev = &rx_group->arg_macaddr;
2372 while ((addr = *pprev) != NULL) {
2373 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2374 mac_perim_exit(mph);
2375 return (EEXIST);
2377 pprev = &addr->aua_next;
2379 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2380 bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2381 addr->aua_next = NULL;
2382 *pprev = addr;
2384 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2385 if ((err = aggr_port_addmac(port, mac_addr)) != 0)
2386 break;
2388 if (err != 0) {
2389 for (p = grp->lg_ports; p != port; p = p->lp_next)
2390 aggr_port_remmac(p, mac_addr);
2392 *pprev = NULL;
2393 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2396 mac_perim_exit(mph);
2397 return (err);
2400 static int
2401 aggr_remmac(void *arg, const uint8_t *mac_addr)
2403 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2404 aggr_unicst_addr_t *addr, **pprev;
2405 aggr_grp_t *grp = rx_group->arg_grp;
2406 aggr_port_t *port;
2407 mac_perim_handle_t mph;
2408 int err = 0;
2410 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2412 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2413 mac_perim_exit(mph);
2414 return (0);
2418 * Insert this mac address into the list of mac addresses owned by
2419 * the aggregation pseudo group.
2421 pprev = &rx_group->arg_macaddr;
2422 while ((addr = *pprev) != NULL) {
2423 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2424 pprev = &addr->aua_next;
2425 continue;
2427 break;
2429 if (addr == NULL) {
2430 mac_perim_exit(mph);
2431 return (EINVAL);
2434 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2435 aggr_port_remmac(port, mac_addr);
2437 *pprev = addr->aua_next;
2438 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2440 mac_perim_exit(mph);
2441 return (err);
2445 * Add or remove the multicast addresses that are defined for the group
2446 * to or from the specified port.
2448 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2449 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2450 * called when the port is either stopped or detached.
2452 void
2453 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2455 aggr_grp_t *grp = port->lp_grp;
2457 ASSERT(MAC_PERIM_HELD(port->lp_mh));
2458 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2460 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2461 return;
2463 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2466 static int
2467 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2469 aggr_grp_t *grp = arg;
2470 aggr_port_t *port = NULL, *errport = NULL;
2471 mac_perim_handle_t mph;
2472 int err = 0;
2474 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2475 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2476 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2477 !port->lp_started) {
2478 continue;
2480 err = aggr_port_multicst(port, add, addrp);
2481 if (err != 0) {
2482 errport = port;
2483 break;
2488 * At least one port caused error return and this error is returned to
2489 * mac, eventually a NAK would be sent upwards.
2490 * Some ports have this multicast address listed now, and some don't.
2491 * Treat this error as a whole aggr failure not individual port failure.
2492 * Therefore remove this multicast address from other ports.
2494 if ((err != 0) && add) {
2495 for (port = grp->lg_ports; port != errport;
2496 port = port->lp_next) {
2497 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2498 !port->lp_started) {
2499 continue;
2501 (void) aggr_port_multicst(port, B_FALSE, addrp);
2504 mac_perim_exit(mph);
2505 return (err);
2508 static int
2509 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2511 aggr_grp_t *grp = arg;
2512 mac_perim_handle_t mph;
2513 int err;
2515 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2516 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2517 0, 0);
2518 mac_perim_exit(mph);
2519 return (err);
2523 * Initialize the capabilities that are advertised for the group
2524 * according to the capabilities of the constituent ports.
2526 static void
2527 aggr_grp_capab_set(aggr_grp_t *grp)
2529 uint32_t cksum;
2530 aggr_port_t *port;
2531 mac_capab_lso_t cap_lso;
2533 ASSERT(grp->lg_mh == NULL);
2534 ASSERT(grp->lg_ports != NULL);
2536 grp->lg_hcksum_txflags = (uint32_t)-1;
2537 grp->lg_zcopy = B_TRUE;
2538 grp->lg_vlan = B_TRUE;
2540 grp->lg_lso = B_TRUE;
2541 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2542 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2544 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2545 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2546 cksum = 0;
2547 grp->lg_hcksum_txflags &= cksum;
2549 grp->lg_vlan &=
2550 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2552 grp->lg_zcopy &=
2553 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2555 grp->lg_lso &=
2556 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2557 if (grp->lg_lso) {
2558 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2559 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2560 cap_lso.lso_basic_tcp_ipv4.lso_max)
2561 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2562 cap_lso.lso_basic_tcp_ipv4.lso_max;
2568 * Checks whether the capabilities of the port being added are compatible
2569 * with the current capabilities of the aggregation.
2571 static boolean_t
2572 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2574 uint32_t hcksum_txflags;
2576 ASSERT(grp->lg_ports != NULL);
2578 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2579 grp->lg_vlan) != grp->lg_vlan) {
2580 return (B_FALSE);
2583 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2584 grp->lg_zcopy) != grp->lg_zcopy) {
2585 return (B_FALSE);
2588 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2589 if (grp->lg_hcksum_txflags != 0)
2590 return (B_FALSE);
2591 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2592 grp->lg_hcksum_txflags) {
2593 return (B_FALSE);
2596 if (grp->lg_lso) {
2597 mac_capab_lso_t cap_lso;
2599 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2600 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2601 grp->lg_cap_lso.lso_flags)
2602 return (B_FALSE);
2603 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2604 cap_lso.lso_basic_tcp_ipv4.lso_max)
2605 return (B_FALSE);
2606 } else {
2607 return (B_FALSE);
2611 return (B_TRUE);
2615 * Returns the maximum SDU according to the SDU of the constituent ports.
2617 static uint_t
2618 aggr_grp_max_sdu(aggr_grp_t *grp)
2620 uint_t max_sdu = (uint_t)-1;
2621 aggr_port_t *port;
2623 ASSERT(grp->lg_ports != NULL);
2625 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2626 uint_t port_sdu_max;
2628 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2629 if (max_sdu > port_sdu_max)
2630 max_sdu = port_sdu_max;
2633 return (max_sdu);
2637 * Checks if the maximum SDU of the specified port is compatible
2638 * with the maximum SDU of the specified aggregation group, returns
2639 * B_TRUE if it is, B_FALSE otherwise.
2641 static boolean_t
2642 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
2644 uint_t port_sdu_max;
2646 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2647 return (port_sdu_max >= grp->lg_max_sdu);
2651 * Returns the maximum margin according to the margin of the constituent ports.
2653 static uint32_t
2654 aggr_grp_max_margin(aggr_grp_t *grp)
2656 uint32_t margin = UINT32_MAX;
2657 aggr_port_t *port;
2659 ASSERT(grp->lg_mh == NULL);
2660 ASSERT(grp->lg_ports != NULL);
2662 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2663 if (margin > port->lp_margin)
2664 margin = port->lp_margin;
2667 grp->lg_margin = margin;
2668 return (margin);
2672 * Checks if the maximum margin of the specified port is compatible
2673 * with the maximum margin of the specified aggregation group, returns
2674 * B_TRUE if it is, B_FALSE otherwise.
2676 static boolean_t
2677 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
2679 if (port->lp_margin >= grp->lg_margin)
2680 return (B_TRUE);
2683 * See whether the current margin value is allowed to be changed to
2684 * the new value.
2686 if (!mac_margin_update(grp->lg_mh, port->lp_margin))
2687 return (B_FALSE);
2689 grp->lg_margin = port->lp_margin;
2690 return (B_TRUE);
2694 * Set MTU on individual ports of an aggregation group
2696 static int
2697 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
2698 uint32_t *old_mtu)
2700 boolean_t removed = B_FALSE;
2701 mac_perim_handle_t mph;
2702 mac_diag_t diag;
2703 int err, rv, retry = 0;
2705 if (port->lp_mah != NULL) {
2706 (void) mac_unicast_remove(port->lp_mch, port->lp_mah);
2707 port->lp_mah = NULL;
2708 removed = B_TRUE;
2710 err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
2711 try_again:
2712 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
2713 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
2714 &port->lp_mah, 0, &diag)) != 0) {
2716 * following is a workaround for a bug in 'bge' driver.
2717 * See CR 6794654 for more information and this work around
2718 * will be removed once the CR is fixed.
2720 if (rv == EIO && retry++ < 3) {
2721 ddi_sleep(2);
2722 goto try_again;
2725 * if mac_unicast_add() failed while setting the MTU,
2726 * detach the port from the group.
2728 mac_perim_enter_by_mh(port->lp_mh, &mph);
2729 (void) aggr_grp_detach_port(grp, port);
2730 mac_perim_exit(mph);
2731 cmn_err(CE_WARN, "Unable to restart the port %s while "
2732 "setting MTU. Detaching the port from the aggregation.",
2733 mac_client_name(port->lp_mch));
2735 return (err);
2738 static int
2739 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
2741 int err = 0, i, rv;
2742 aggr_port_t *port;
2743 uint32_t *mtu;
2745 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2748 * If the MTU being set is equal to aggr group's maximum
2749 * allowable value, then there is nothing to change
2751 if (sdu == grp->lg_max_sdu)
2752 return (0);
2754 /* 0 is aggr group's min sdu */
2755 if (sdu == 0)
2756 return (EINVAL);
2758 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
2759 for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
2760 port = port->lp_next, i++) {
2761 err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
2763 if (err != 0) {
2764 /* recover from error: reset the mtus of the ports */
2765 aggr_port_t *tmp;
2767 for (tmp = grp->lg_ports, i = 0; tmp != port;
2768 tmp = tmp->lp_next, i++) {
2769 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
2771 goto bail;
2773 grp->lg_max_sdu = aggr_grp_max_sdu(grp);
2774 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
2775 ASSERT(rv == 0);
2776 bail:
2777 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
2778 return (err);
2782 * Callback functions for set/get of properties
2784 /*ARGSUSED*/
2785 static int
2786 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
2787 uint_t pr_valsize, const void *pr_val)
2789 int err = ENOTSUP;
2790 aggr_grp_t *grp = m_driver;
2792 switch (pr_num) {
2793 case MAC_PROP_MTU: {
2794 uint32_t mtu;
2796 if (pr_valsize < sizeof (mtu)) {
2797 err = EINVAL;
2798 break;
2800 bcopy(pr_val, &mtu, sizeof (mtu));
2801 err = aggr_sdu_update(grp, mtu);
2802 break;
2804 default:
2805 break;
2807 return (err);
/*
 * One boundary of an MTU range, as collected and sorted by
 * aggr_mtu_range_intersection().
 */
struct rboundary {
	uint32_t	bval;	/* value of the boundary */
	int		btype;	/* +1 for a range minimum, -1 for a maximum */
};
typedef struct rboundary rboundary_t;
2816 * This function finds the intersection of mtu ranges stored in arrays -
2817 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
2818 * Individual arrays are assumed to contain non-overlapping ranges.
2819 * Algorithm:
2820 * A range has two boundaries - min and max. We scan all arrays and store
2821 * each boundary as a separate element in a temporary array. We also store
2822 * the boundary types, min or max, as +1 or -1 respectively in the temporary
2823 * array. Then we sort the temporary array in ascending order. We scan the
2824 * sorted array from lower to higher values and keep a cumulative sum of
2825 * boundary types. Element in the temporary array for which the sum reaches
2826 * mcount is a min boundary of a range in the result and next element will be
2827 * max boundary.
2829 * Example for mcount = 3,
2831 * ----|_________|-------|_______|----|__|------ mrange[0]
2833 * -------|________|--|____________|-----|___|-- mrange[1]
2835 * --------|________________|-------|____|------ mrange[2]
2837 * 3 2 1
2838 * \|/
2839 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum
2840 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
2842 * same min and max
2844 * --------|_____|-------|__|------------|------ intersecting ranges
2846 void
2847 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
2848 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
2850 mac_propval_uint32_range_t *rval, *ur;
2851 int rmaxcnt, rcount;
2852 size_t sz_range32;
2853 rboundary_t *ta; /* temporary array */
2854 rboundary_t temp;
2855 boolean_t range_started = B_FALSE;
2856 int i, j, m, sum;
2858 sz_range32 = sizeof (mac_propval_uint32_range_t);
2860 for (i = 0, rmaxcnt = 0; i < mcount; i++)
2861 rmaxcnt += mrange[i]->mpr_count;
2863 /* Allocate enough space to store the results */
2864 rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
2866 /* Number of boundaries are twice as many as ranges */
2867 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
2869 for (i = 0, m = 0; i < mcount; i++) {
2870 ur = &(mrange[i]->mpr_range_uint32[0]);
2871 for (j = 0; j < mrange[i]->mpr_count; j++) {
2872 ta[m].bval = ur[j].mpur_min;
2873 ta[m++].btype = 1;
2874 ta[m].bval = ur[j].mpur_max;
2875 ta[m++].btype = -1;
2880 * Sort the temporary array in ascending order of bval;
2881 * if boundary values are same then sort on btype.
2883 for (i = 0; i < m-1; i++) {
2884 for (j = i+1; j < m; j++) {
2885 if ((ta[i].bval > ta[j].bval) ||
2886 ((ta[i].bval == ta[j].bval) &&
2887 (ta[i].btype < ta[j].btype))) {
2888 temp = ta[i];
2889 ta[i] = ta[j];
2890 ta[j] = temp;
2895 /* Walk through temporary array to find all ranges in the results */
2896 for (i = 0, sum = 0, rcount = 0; i < m; i++) {
2897 sum += ta[i].btype;
2898 if (sum == mcount) {
2899 rval[rcount].mpur_min = ta[i].bval;
2900 range_started = B_TRUE;
2901 } else if (sum < mcount && range_started) {
2902 rval[rcount++].mpur_max = ta[i].bval;
2903 range_started = B_FALSE;
2907 *prval = rval;
2908 *prmaxcnt = rmaxcnt;
2909 *prcount = rcount;
2911 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
2915 * Returns the mtu ranges which could be supported by aggr group.
2916 * prmaxcnt returns the size of the buffer prval, prcount returns
2917 * the number of valid entries in prval. Caller is responsible
2918 * for freeing up prval.
2921 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
2922 int *prmaxcnt, int *prcount)
2924 mac_propval_range_t **vals;
2925 aggr_port_t *port;
2926 mac_perim_handle_t mph;
2927 uint_t i, numr;
2928 int err = 0;
2929 size_t sz_propval, sz_range32;
2930 size_t size;
2932 sz_propval = sizeof (mac_propval_range_t);
2933 sz_range32 = sizeof (mac_propval_uint32_range_t);
2935 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2937 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
2938 KM_SLEEP);
2940 for (port = grp->lg_ports, i = 0; port != NULL;
2941 port = port->lp_next, i++) {
2943 size = sz_propval;
2944 vals[i] = kmem_alloc(size, KM_SLEEP);
2945 vals[i]->mpr_count = 1;
2947 mac_perim_enter_by_mh(port->lp_mh, &mph);
2949 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2950 NULL, 0, vals[i], NULL);
2951 if (err == ENOSPC) {
2953 * Not enough space to hold all ranges.
2954 * Allocate extra space as indicated and retry.
2956 numr = vals[i]->mpr_count;
2957 kmem_free(vals[i], sz_propval);
2958 size = sz_propval + (numr - 1) * sz_range32;
2959 vals[i] = kmem_alloc(size, KM_SLEEP);
2960 vals[i]->mpr_count = numr;
2961 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2962 NULL, 0, vals[i], NULL);
2963 ASSERT(err != ENOSPC);
2965 mac_perim_exit(mph);
2966 if (err != 0) {
2967 kmem_free(vals[i], size);
2968 vals[i] = NULL;
2969 break;
2974 * if any of the underlying ports does not support changing MTU then
2975 * just return ENOTSUP
2977 if (port != NULL) {
2978 ASSERT(err != 0);
2979 goto done;
2982 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
2983 prcount);
2985 done:
2986 for (i = 0; i < grp->lg_nports; i++) {
2987 if (vals[i] != NULL) {
2988 numr = vals[i]->mpr_count;
2989 size = sz_propval + (numr - 1) * sz_range32;
2990 kmem_free(vals[i], size);
2994 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
2995 return (err);
2998 static void
2999 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3000 mac_prop_info_handle_t prh)
3002 aggr_grp_t *grp = m_driver;
3003 mac_propval_uint32_range_t *rval = NULL;
3004 int i, rcount, rmaxcnt;
3005 int err = 0;
3007 _NOTE(ARGUNUSED(pr_name));
3009 switch (pr_num) {
3010 case MAC_PROP_MTU:
3012 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
3013 &rcount);
3014 if (err != 0) {
3015 ASSERT(rval == NULL);
3016 return;
3018 for (i = 0; i < rcount; i++) {
3019 mac_prop_info_set_range_uint32(prh,
3020 rval[i].mpur_min, rval[i].mpur_max);
3022 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
3023 break;