/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright 2017, OmniTI Computer Consulting, Inc. All rights reserved.
 */
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/snmpcom.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ipclassifier.h>
#include <inet/ip_listutils.h>

#include <sys/sunddi.h>
/*
 * Routines for handling destination cache entries.
 * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
 * That entry holds both the IP ident value and the dce generation number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to the DCE it will be marked as condemned. The condemned mark is
 * a designated generation number which is never otherwise used, hence
 * the single comparison with the generation number captures that as well.
 *
 * An example of code which caches is as follows:
 *
 *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *		The DCE has changed
 *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *		    &mystruct->my_dce_generation);
 *		Not needed in practice, since we have the default DCE:
 *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *			return failure;
 *	}
 *
 * Note that for IPv6 link-local addresses we record the ifindex since the
 * link-locals are not globally unique.
 *
 * DCEs can remain for an arbitrarily long time, until memory pressure or
 * too-deep hash buckets (see dce_lookup_and_add*()) enable the reclaim thread
 * to actually remove DCEs from the cache.
 */
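
/*
 * A minimal sketch of the caching pattern above, assuming a hypothetical
 * consumer structure (mystruct_t with my_dce and my_dce_generation fields;
 * illustration only, not an API defined in this file):
 *
 *	dce_t *
 *	my_get_dce(mystruct_t *mystruct, mblk_t *mp, ip_xmit_attr_t *ixa)
 *	{
 *		if (mystruct->my_dce_generation !=
 *		    mystruct->my_dce->dce_generation) {
 *			The cached entry is stale or condemned; drop the
 *			old hold and look up a fresh, refheld entry:
 *			dce_refrele(mystruct->my_dce);
 *			mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *			    &mystruct->my_dce_generation);
 *		}
 *		return (mystruct->my_dce);
 *	}
 */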
/*
 * Hash bucket structure for DCEs
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;
	uint32_t	dcb_cnt;
	dce_t		*dcb_dce;
} dcb_t;
static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

static kmem_cache_t *dce_cache;
static kthread_t *dce_reclaim_thread;
static kmutex_t dce_reclaim_lock;
static kcondvar_t dce_reclaim_cv;
static int dce_reclaim_shutdown;
/* Global so it can be tuned in /etc/system. This must be a power of two. */
uint_t	ip_dce_hash_size = 1024;

/* The time in seconds between executions of the IP DCE reclaim worker. */
uint_t	ip_dce_reclaim_interval = 60;

/* The factor of the DCE threshold at which to start hard reclaims */
uint_t	ip_dce_reclaim_threshold_hard = 2;
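
/*
 * For example, an administrator could add lines like the following to
 * /etc/system (illustrative values only; ip_dce_hash_size must remain a
 * power of two, and a reboot is needed for the new values to take effect):
 *
 *	set ip:ip_dce_hash_size = 2048
 *	set ip:ip_dce_reclaim_interval = 30
 */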
/* Operates on a uint64_t */
#define	RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))
/*
 * Reclaim a fraction of the DCEs in the given bucket (dcb).
 * For now we have a higher probability to delete DCEs without DCEF_PMTU.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction*4;
	uint_t	hash;
	dce_t	*dce, *nextdce;
	hrtime_t seed = gethrtime();
	uint_t	retained = 0;
	uint_t	max = ipst->ips_ip_dce_reclaim_threshold;

	max *= ip_dce_reclaim_threshold_hard;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}

		if (max == 0 || retained < max) {
			hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed));

			if (dce->dce_flags & DCEF_PMTU) {
				if (hash % fraction_pmtu != 0) {
					retained++;
					continue;
				}
			} else {
				if (hash % fraction != 0) {
					retained++;
					continue;
				}
			}
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}
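
/*
 * For example (illustrative values): with ips_ip_dce_reclaim_fraction set
 * to 3, a pass deletes the entries whose randomized hash satisfies
 * hash % 3 == 0, i.e. roughly 1/3 of the DCEs without DCEF_PMTU and,
 * since fraction_pmtu = fraction * 4, roughly 1/12 of those with a cached
 * path MTU. Once "max" entries have been retained, any remaining entries
 * in the bucket are deleted unconditionally (max == 0 disables that cap).
 */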
/*
 * kmem_cache callback to free up memory.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int	i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}
/*
 * Called by dce_reclaim_worker() below, and no one else. Typically this will
 * mean that the number of entries in the hash buckets has exceeded a tunable
 * threshold.
 */
static void
ip_dce_reclaim(void)
{
	netstack_handle_t nh;
	netstack_t *ns;
	ip_stack_t *ipst;

	ASSERT(curthread == dce_reclaim_thread);

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		/*
		 * netstack_next() can return a netstack_t with a NULL
		 * netstack_ip at boot time.
		 */
		if ((ipst = ns->netstack_ip) == NULL) {
			netstack_rele(ns);
			continue;
		}
		if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
			ip_dce_reclaim_stack(ipst);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}
/* ARGSUSED */
static void
dce_reclaim_worker(void *arg)
{
	callb_cpr_t	cprinfo;

	CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
	    "dce_reclaim_worker");

	mutex_enter(&dce_reclaim_lock);
	while (!dce_reclaim_shutdown) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		(void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
		    ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
		CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);

		if (dce_reclaim_shutdown)
			break;

		mutex_exit(&dce_reclaim_lock);
		ip_dce_reclaim();
		mutex_enter(&dce_reclaim_lock);
	}

	ASSERT(MUTEX_HELD(&dce_reclaim_lock));
	dce_reclaim_thread = NULL;
	dce_reclaim_shutdown = 0;
	cv_broadcast(&dce_reclaim_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops the lock */

	thread_exit();
}
void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);

	dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
	    NULL, 0, &p0, TS_RUN, minclsyspri);
}
void
dce_g_destroy(void)
{
	mutex_enter(&dce_reclaim_lock);
	dce_reclaim_shutdown = 1;
	cv_signal(&dce_reclaim_cv);
	while (dce_reclaim_thread != NULL)
		cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
	mutex_exit(&dce_reclaim_lock);

	cv_destroy(&dce_reclaim_cv);
	mutex_destroy(&dce_reclaim_lock);

	kmem_cache_destroy(dce_cache);
}
/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int	i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time =
	    TICK_TO_SEC(ddi_get_lbolt64());
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = ip_dce_hash_size;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}
/*
 * Given a DCE hash bucket, unlink DCE entries from it. Some callers need
 * ifindex-specific matching, others don't. Don't overload ifindex to indicate
 * specificity, just indicate so explicitly.
 */
static void
dce_bucket_clean(dcb_t *dcb, boolean_t specific_ifindex, uint_t ifindex)
{
	dce_t	*dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);

	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		if ((!specific_ifindex) || dce->dce_ifindex == ifindex) {
			dce_delete_locked(dcb, dce);
			dce_refrele(dce);
		}
	}

	rw_exit(&dcb->dcb_lock);
}
void
dce_stack_destroy(ip_stack_t *ipst)
{
	int i;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dce_bucket_clean(&ipst->ips_dce_hash_v4[i], B_FALSE, 0);
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		dce_bucket_clean(&ipst->ips_dce_hash_v6[i], B_FALSE, 0);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}
/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t	*dce;

	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		ifindex = 0;
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}
/*
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found; fall back to the default DCE */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
/*
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 * ifindex should only be set for link-locals
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found; fall back to the default DCE */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output() will detect there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	/*
	 * Assuming that we get fairly even distribution across all of the
	 * buckets, once one bucket is overly full, prune the whole cache.
	 */
	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v4addr = dst;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV4_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_inc_32(&dcb->dcb_cnt);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different than for the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;

	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
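
/*
 * Illustrative caller pattern (hypothetical, mirroring what
 * dce_update_uinfo_v4() below does): the returned entry is refheld and
 * must be released once the caller is done with it.
 *
 *	dce_t *dce;
 *
 *	if ((dce = dce_lookup_and_add_v4(dst, ipst)) != NULL) {
 *		... read or update fields under dce->dce_lock ...
 *		dce_refrele(dce);
 *	}
 */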
/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output() will detect there is a new DCE.
 * ifindex should only be used with link-local addresses.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* We should not create entries for link-locals w/o an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	/*
	 * Assuming that we get fairly even distribution across all of the
	 * buckets, once one bucket is overly full, prune the whole cache.
	 */
	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_inc_32(&dcb->dcb_cnt);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different than for the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
/*
 * Set/update uinfo. Creates a per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		/*
		 * If there are no old cached values, initialize them
		 * conservatively. Set them to be (1.5 * new value).
		 */
		if (dce->dce_uinfo.iulp_rtt != 0) {
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	}
	if (uinfo->iulp_ssthresh != 0) {
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}
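
/*
 * Worked example for the smoothing above (illustrative numbers): if the
 * cached iulp_rtt is 40 and an advise arrives with iulp_rtt of 20, the
 * cached value becomes (40 + 20) >> 1 = 30. If there was no cached value,
 * the first sample of 20 is stored conservatively as 20 + (20 >> 1) = 30.
 */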
int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}
int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}
/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	ipaddr_t dst4;

	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
	} else {
		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
	}
}
static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_inc_32(&ipst->ips_num_dce_condemned);
}
/*
 * Increment the generation avoiding the special condemned value
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t	generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}
/*
 * Increment the generation number on all dces that have a path MTU and
 * the default DCE. Used when ill_mtu or ill_mc_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int		i;
	dcb_t		*dcb;
	dce_t		*dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}
/*
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	dce->dce_bucket = NULL;
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_dec_32(&dcb->dcb_cnt);
	dce_make_condemned(dce);
}
static void
dce_inactive(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_dec_32(&ipst->ips_num_dce_condemned);

	kmem_cache_free(dce_cache, dce);
}
void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
		dce_inactive(dce);
}

void
dce_refhold(dce_t *dce)
{
	atomic_inc_32(&dce->dce_refcnt);
	ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet hence the same as the above functions */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
		dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
	atomic_inc_32(&dce->dce_refcnt);
	ASSERT(dce->dce_refcnt != 0);
}
/* Report both the IPv4 and IPv6 DCEs. */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(ddi_get_lbolt64());

	/*
	 * Make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for the IPv6 entries */
	mpctl = mp2ctl;
	mp_tail = NULL;
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}
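
/*
 * These EXPER_IP_DCE records are what the "netstat -d" destination cache
 * report renders, one line per entry (see also dce_cleanup() below).
 */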
/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale stuff that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	i;

	for (i = 0; i < ipst->ips_dce_hashsize; i++)
		dce_bucket_clean(&ipst->ips_dce_hash_v6[i], B_TRUE, ifindex);
}