kernel/net/ip/ip6_ire.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 1990 Mentat Inc.
27 * This file contains routines that manipulate Internet Routing Entries (IREs).
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/ddi.h>
33 #include <sys/cmn_err.h>
35 #include <sys/systm.h>
36 #include <sys/param.h>
37 #include <sys/socket.h>
38 #include <net/if.h>
39 #include <net/route.h>
40 #include <netinet/in.h>
41 #include <net/if_dl.h>
42 #include <netinet/ip6.h>
43 #include <netinet/icmp6.h>
45 #include <inet/common.h>
46 #include <inet/mi.h>
47 #include <inet/ip.h>
48 #include <inet/ip6.h>
49 #include <inet/ip_ndp.h>
50 #include <inet/ip_if.h>
51 #include <inet/ip_ire.h>
52 #include <inet/ipclassifier.h>
53 #include <inet/nd.h>
54 #include <inet/tunables.h>
55 #include <sys/kmem.h>
56 #include <sys/zone.h>
58 #define IS_DEFAULT_ROUTE_V6(ire) \
59 (((ire)->ire_type & IRE_DEFAULT) || \
60 (((ire)->ire_type & IRE_INTERFACE) && \
61 (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6))))
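/*
 * That is, a "default route" here is either an explicit IRE_DEFAULT entry,
 * or an interface route whose destination is the unspecified address (::),
 * i.e. a route that matches every destination.
 */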
63 static ire_t ire_null;
65 static ire_t *
66 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
67 const in6_addr_t *gateway, int type, const ill_t *ill,
68 zoneid_t zoneid, int flags, ip_stack_t *ipst);
71  * Initialize the parts of the ire that are specific to IPv6 and call
72 * ire_init_common to finish it.
73 * Returns zero or errno.
75 int
76 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask,
77 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill,
78 zoneid_t zoneid, uint_t flags, ip_stack_t *ipst)
80 int error;
82 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
83 if (v6addr != NULL)
84 ire->ire_addr_v6 = *v6addr;
85 if (v6gateway != NULL)
86 ire->ire_gateway_addr_v6 = *v6gateway;
88 /* Make sure we don't have stray values in some fields */
89 switch (type) {
90 case IRE_LOOPBACK:
91 case IRE_HOST:
92 case IRE_LOCAL:
93 case IRE_IF_CLONE:
94 ire->ire_mask_v6 = ipv6_all_ones;
95 ire->ire_masklen = IPV6_ABITS;
96 break;
97 case IRE_PREFIX:
98 case IRE_DEFAULT:
99 case IRE_IF_RESOLVER:
100 case IRE_IF_NORESOLVER:
101 if (v6mask != NULL) {
102 ire->ire_mask_v6 = *v6mask;
103 ire->ire_masklen =
104 ip_mask_to_plen_v6(&ire->ire_mask_v6);
106 break;
107 case IRE_MULTICAST:
108 case IRE_NOROUTE:
109 ASSERT(v6mask == NULL);
110 break;
111 default:
112 ASSERT(0);
113 return (EINVAL);
116 error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION,
117 ipst);
118 if (error != 0)
119 return (error);
121 /* Determine which function pointers to use */
122 ire->ire_postfragfn = ip_xmit; /* Common case */
124 switch (ire->ire_type) {
125 case IRE_LOCAL:
126 ire->ire_sendfn = ire_send_local_v6;
127 ire->ire_recvfn = ire_recv_local_v6;
128 ASSERT(ire->ire_ill != NULL);
129 if (ire->ire_ill->ill_flags & ILLF_NOACCEPT)
130 ire->ire_recvfn = ire_recv_noaccept_v6;
131 break;
132 case IRE_LOOPBACK:
133 ire->ire_sendfn = ire_send_local_v6;
134 ire->ire_recvfn = ire_recv_loopback_v6;
135 break;
136 case IRE_MULTICAST:
137 ire->ire_postfragfn = ip_postfrag_loopcheck;
138 ire->ire_sendfn = ire_send_multicast_v6;
139 ire->ire_recvfn = ire_recv_multicast_v6;
140 break;
141 default:
143 * For IRE_IF_ALL and IRE_OFFLINK we forward received
144 * packets by default.
146 ire->ire_sendfn = ire_send_wire_v6;
147 ire->ire_recvfn = ire_recv_forward_v6;
148 break;
150 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
151 ire->ire_sendfn = ire_send_noroute_v6;
152 ire->ire_recvfn = ire_recv_noroute_v6;
154 ire->ire_nce_capable = ire_determine_nce_capable(ire);
155 return (0);
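/*
 * Summary of the dispatch set up above: IRE_LOCAL and IRE_LOOPBACK send with
 * ire_send_local_v6 and receive with ire_recv_local_v6 and
 * ire_recv_loopback_v6 respectively (IRE_LOCAL switches to
 * ire_recv_noaccept_v6 when ILLF_NOACCEPT is set); IRE_MULTICAST uses the
 * multicast send/receive functions plus ip_postfrag_loopcheck; everything
 * else defaults to ire_send_wire_v6/ire_recv_forward_v6. RTF_REJECT or
 * RTF_BLACKHOLE overrides both pointers with the noroute variants.
 */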
159 * ire_create_v6 is called to allocate and initialize a new IRE.
161 * NOTE : This is called as writer sometimes though not required
162 * by this function.
164 /* ARGSUSED */
165 ire_t *
166 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
167 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid,
168 uint_t flags, ip_stack_t *ipst)
170 ire_t *ire;
171 int error;
173 ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
175 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
176 if (ire == NULL) {
177 DTRACE_PROBE(kmem__cache__alloc);
178 return (NULL);
180 *ire = ire_null;
182 error = ire_init_v6(ire, v6addr, v6mask, v6gateway,
183 type, ill, zoneid, flags, ipst);
185 if (error != 0) {
186 DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error);
187 kmem_cache_free(ire_cache, ire);
188 return (NULL);
190 return (ire);
194 * Find the ill matching a multicast group.
195 * Allows different routes for multicast addresses
196  * in the unicast routing table (akin to ff00::/8 but could be more specific)
197 * which point at different interfaces. This is used when IPV6_MULTICAST_IF
198 * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
199 * specify the interface to join on.
201 * Supports link-local addresses by using ire_route_recursive which follows
202 * the ill when recursing.
204 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
205 * We have a setsrcp argument for the same reason.
207 ill_t *
208 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid,
209 ip_stack_t *ipst, in6_addr_t *setsrcp)
211 ire_t *ire;
212 ill_t *ill;
214 ire = ire_route_recursive_v6(group, 0, NULL, zoneid,
215 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL);
216 ASSERT(ire != NULL);
218 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
219 ire_refrele(ire);
220 return (NULL);
223 ill = ire_nexthop_ill(ire);
224 ire_refrele(ire);
225 return (ill);
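/*
 * Illustrative caller pattern (a sketch, not taken from this file): a sender
 * with no IPV6_MULTICAST_IF set could do
 *
 *	in6_addr_t setsrc = ipv6_all_zeros;
 *	ill_t *ill = ire_lookup_multi_ill_v6(&group, zoneid, ipst, &setsrc);
 *	if (ill != NULL) {
 *		... transmit on ill, using setsrc if it is not unspecified ...
 *		ill_refrele(ill);
 *	}
 *
 * The returned ill is held, so the caller must ill_refrele() it when done.
 */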
229 * This function takes a mask and returns number of bits set in the
230 * mask (the represented prefix length). Assumes a contiguous mask.
233 ip_mask_to_plen_v6(const in6_addr_t *v6mask)
235 int bits;
236 int plen = IPV6_ABITS;
237 int i;
239 for (i = 3; i >= 0; i--) {
240 if (v6mask->s6_addr32[i] == 0) {
241 plen -= 32;
242 continue;
244 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
245 if (bits == 0)
246 break;
247 plen -= bits;
250 return (plen);
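/*
 * Worked example: for the contiguous mask ffff:ffff:ff00:: the s6_addr32
 * words [3], [2] and [1] are zero, dropping plen from 128 to 32; word [0] is
 * 0xffffff00, whose lowest set bit is bit 9 (ffs() returns 9), so 8 more
 * bits are subtracted and the function returns 24.
 */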
254 * Convert a prefix length to the mask for that prefix.
255 * Returns the argument bitmask.
257 in6_addr_t *
258 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
260 uint32_t *ptr;
262 if (plen < 0 || plen > IPV6_ABITS)
263 return (NULL);
264 *bitmask = ipv6_all_zeros;
265 if (plen == 0)
266 return (bitmask);
268 ptr = (uint32_t *)bitmask;
269 while (plen > 32) {
270 *ptr++ = 0xffffffffU;
271 plen -= 32;
273 *ptr = htonl(0xffffffffU << (32 - plen));
274 return (bitmask);
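/*
 * Worked example: plen 49 stores 0xffffffff in the first 32-bit word
 * (49 > 32, leaving plen == 17) and then htonl(0xffffffff << 15), i.e.
 * htonl(0xffff8000), in the next word, producing the mask
 * ffff:ffff:ffff:8000::.
 */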
278 * Add a fully initialized IPv6 IRE to the forwarding table.
279 * This returns NULL on failure, or a held IRE on success.
280 * Normally the returned IRE is the same as the argument. But a different
281 * IRE will be returned if the added IRE is deemed identical to an existing
282 * one. In that case ire_identical_ref will be increased.
283 * The caller always needs to do an ire_refrele() on the returned IRE.
285 ire_t *
286 ire_add_v6(ire_t *ire)
288 ire_t *ire1;
289 int mask_table_index;
290 irb_t *irb_ptr;
291 ire_t **irep;
292 int match_flags;
293 int error;
294 ip_stack_t *ipst = ire->ire_ipst;
296 ASSERT(ire->ire_ipversion == IPV6_VERSION);
298 /* Make sure the address is properly masked. */
299 V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
301 mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
302 if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) {
303 irb_t *ptr;
304 int i;
306 ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size *
307 sizeof (irb_t)));
308 if (ptr == NULL) {
309 ire_delete(ire);
310 return (NULL);
312 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
313 rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL);
314 ptr[i].irb_ipst = ipst;
316 mutex_enter(&ipst->ips_ire_ft_init_lock);
317 if (ipst->ips_ip_forwarding_table_v6[mask_table_index] ==
318 NULL) {
319 ipst->ips_ip_forwarding_table_v6[mask_table_index] =
320 ptr;
321 mutex_exit(&ipst->ips_ire_ft_init_lock);
322 } else {
324 * Some other thread won the race in
325 * initializing the forwarding table at the
326 * same index.
328 mutex_exit(&ipst->ips_ire_ft_init_lock);
329 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
330 rw_destroy(&ptr[i].irb_lock);
332 mi_free(ptr);
335 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
336 IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
337 ipst->ips_ip6_ftable_hash_size)]);
339 match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
340 if (ire->ire_ill != NULL)
341 match_flags |= MATCH_IRE_ILL;
343 * Start the atomic add of the ire. Grab the bucket lock and the
344 * ill lock. Check for condemned.
346 error = ire_atomic_start(irb_ptr, ire);
347 if (error != 0) {
348 ire_delete(ire);
349 return (NULL);
353 * If we are creating a hidden IRE, make sure we search for
354 * hidden IREs when searching for duplicates below.
355 * Otherwise, we might find an IRE on some other interface
356 * that's not marked hidden.
358 if (ire->ire_testhidden)
359 match_flags |= MATCH_IRE_TESTHIDDEN;
362 * Atomically check for duplicate and insert in the table.
364 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
365 if (IRE_IS_CONDEMNED(ire1))
366 continue;
368 * Here we need an exact match on zoneid, i.e.,
369 * ire_match_args doesn't fit.
371 if (ire1->ire_zoneid != ire->ire_zoneid)
372 continue;
374 if (ire1->ire_type != ire->ire_type)
375 continue;
378 * Note: We do not allow multiple routes that differ only
379 * in the gateway security attributes; such routes are
380 * considered duplicates.
381 * To change that we explicitly have to treat them as
382 * different here.
384 if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
385 &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
386 ire->ire_type, ire->ire_ill, ire->ire_zoneid,
387 match_flags)) {
389 * Return the old ire after doing a REFHOLD.
390 * As most of the callers continue to use the IRE
391 * after adding, we return a held ire. This will
392 * avoid a lookup in the caller again. If the callers
393 * don't want to use it, they need to do a REFRELE.
395 * We only allow exactly one IRE_IF_CLONE for any dst,
396  * so, if this is an IF_CLONE, return the ire without
397 * an identical_ref, but with an ire_ref held.
399 if (ire->ire_type != IRE_IF_CLONE) {
400 atomic_add_32(&ire1->ire_identical_ref, 1);
401 DTRACE_PROBE2(ire__add__exist, ire_t *, ire1,
402 ire_t *, ire);
404 ip1dbg(("found dup ire existing %p new %p",
405 (void *)ire1, (void *)ire));
406 ire_refhold(ire1);
407 ire_atomic_end(irb_ptr, ire);
408 ire_delete(ire);
409 return (ire1);
414 * Normally we do head insertion since most things do not care about
415 * the order of the IREs in the bucket.
416 * However, due to shared-IP zones (and restrict_interzone_loopback)
417 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
418 * address. For that reason we do tail insertion for IRE_IF_CLONE.
420 irep = (ire_t **)irb_ptr;
421 if (ire->ire_type & IRE_IF_CLONE) {
422 while ((ire1 = *irep) != NULL)
423 irep = &ire1->ire_next;
425 /* Insert at *irep */
426 ire1 = *irep;
427 if (ire1 != NULL)
428 ire1->ire_ptpn = &ire->ire_next;
429 ire->ire_next = ire1;
430 /* Link the new one in. */
431 ire->ire_ptpn = irep;
433 * ire_walk routines de-reference ire_next without holding
434 * a lock. Before we point to the new ire, we want to make
435 * sure the store that sets the ire_next of the new ire
436 * reaches global visibility, so that ire_walk routines
437 * don't see a truncated list of ires i.e if the ire_next
438 * of the new ire gets set after we do "*irep = ire" due
439 * to re-ordering, the ire_walk thread will see a NULL
440 * once it accesses the ire_next of the new ire.
441 * membar_producer() makes sure that the following store
442 * happens *after* all of the above stores.
444 membar_producer();
445 *irep = ire;
446 ire->ire_bucket = irb_ptr;
448 * We return a bumped up IRE above. Keep it symmetrical
449 * so that the callers will always have to release. This
450 * helps the callers of this function because they continue
451 * to use the IRE after adding and hence they don't have to
452 * lookup again after we return the IRE.
454 * NOTE : We don't have to use atomics as this is appearing
455 * in the list for the first time and no one else can bump
456 * up the reference count on this yet.
458 ire_refhold_locked(ire);
459 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
460 irb_ptr->irb_ire_cnt++;
462 if (ire->ire_ill != NULL) {
463 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill,
464 (char *), "ire", (void *), ire);
465 ire->ire_ill->ill_ire_cnt++;
466 ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */
468 ire_atomic_end(irb_ptr, ire);
470 /* Make any caching of the IREs be notified or updated */
471 ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
473 return (ire);
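/*
 * Typical caller pattern (a sketch, not from this file): since ire_add_v6
 * may hand back a different, pre-existing IRE, callers replace their pointer
 * with the return value and eventually release it:
 *
 *	nire = ire_add_v6(ire);
 *	if (nire == NULL) {
 *		... insertion failed; "ire" has already been deleted ...
 *	} else {
 *		ire = nire;
 *		... use ire, then ire_refrele(ire) ...
 *	}
 */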
477 * Search for all HOST REDIRECT routes that are
478 * pointing at the specified gateway and
479 * delete them. This routine is called only
480 * when a default gateway is going away.
482 static void
483 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
485 irb_t *irb_ptr;
486 irb_t *irb;
487 ire_t *ire;
488 in6_addr_t gw_addr_v6;
489 int i;
491 /* get the hash table for HOST routes */
492 irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
493 if (irb_ptr == NULL)
494 return;
495 for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
496 irb = &irb_ptr[i];
497 irb_refhold(irb);
498 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
499 if (!(ire->ire_flags & RTF_DYNAMIC))
500 continue;
501 mutex_enter(&ire->ire_lock);
502 gw_addr_v6 = ire->ire_gateway_addr_v6;
503 mutex_exit(&ire->ire_lock);
504 if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
505 ire_delete(ire);
507 irb_refrele(irb);
512 * Delete the specified IRE.
513 * All calls should use ire_delete().
514 * Sometimes called as writer though not required by this function.
516 * NOTE : This function is called only if the ire was added
517 * in the list.
519 void
520 ire_delete_v6(ire_t *ire)
522 in6_addr_t gw_addr_v6;
523 ip_stack_t *ipst = ire->ire_ipst;
526  * Make sure any ire_generation increases from ire_flush_cache happen
527 * after any lookup/reader has read ire_generation.
528 * Since the rw_enter makes us wait until any lookup/reader has
529 * completed we can exit the lock immediately.
531 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
532 rw_exit(&ipst->ips_ip6_ire_head_lock);
534 ASSERT(ire->ire_refcnt >= 1);
535 ASSERT(ire->ire_ipversion == IPV6_VERSION);
537 ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
539 if (ire->ire_type == IRE_DEFAULT) {
541 * when a default gateway is going away
542 * delete all the host redirects pointing at that
543 * gateway.
545 mutex_enter(&ire->ire_lock);
546 gw_addr_v6 = ire->ire_gateway_addr_v6;
547 mutex_exit(&ire->ire_lock);
548 ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
552 * If we are deleting an IRE_INTERFACE then we make sure we also
553 * delete any IRE_IF_CLONE that has been created from it.
554 * Those are always in ire_dep_children.
556 if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
557 ire_dep_delete_if_clone(ire);
559 /* Remove from parent dependencies and child */
560 rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
561 if (ire->ire_dep_parent != NULL) {
562 ire_dep_remove(ire);
564 while (ire->ire_dep_children != NULL)
565 ire_dep_remove(ire->ire_dep_children);
566 rw_exit(&ipst->ips_ire_dep_lock);
570 * When an IRE is added or deleted this routine is called to make sure
571 * any caching of IRE information is notified or updated.
573 * The flag argument indicates if the flush request is due to addition
574 * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
575 * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
577 void
578 ire_flush_cache_v6(ire_t *ire, int flag)
580 ip_stack_t *ipst = ire->ire_ipst;
583 * IRE_IF_CLONE ire's don't provide any new information
584 * than the parent from which they are cloned, so don't
585 * perturb the generation numbers.
587 if (ire->ire_type & IRE_IF_CLONE)
588 return;
591 * Ensure that an ire_add during a lookup serializes the updates of
592 * the generation numbers under ire_head_lock so that the lookup gets
593 * either the old ire and old generation number, or a new ire and new
594 * generation number.
596 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
599 * If a route was just added, we need to notify everybody that
600 * has cached an IRE_NOROUTE since there might now be a better
601 * route for them.
603 if (flag == IRE_FLUSH_ADD) {
604 ire_increment_generation(ipst->ips_ire_reject_v6);
605 ire_increment_generation(ipst->ips_ire_blackhole_v6);
608 /* Adding a default can't otherwise provide a better route */
609 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
610 rw_exit(&ipst->ips_ip6_ire_head_lock);
611 return;
614 switch (flag) {
615 case IRE_FLUSH_DELETE:
616 case IRE_FLUSH_GWCHANGE:
618 * Update ire_generation for all ire_dep_children chains
619 * starting with this IRE
621 ire_dep_incr_generation(ire);
622 break;
623 case IRE_FLUSH_ADD: {
624 in6_addr_t addr;
625 in6_addr_t mask;
626 ip_stack_t *ipst = ire->ire_ipst;
627 uint_t masklen;
630  * Find an IRE which is a shorter match than the ire to be added.
631  * For any such IRE (found by repeated lookups) we update the
632 * ire_generation the same way as in the delete case.
634 addr = ire->ire_addr_v6;
635 mask = ire->ire_mask_v6;
636 masklen = ip_mask_to_plen_v6(&mask);
638 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL,
639 ALL_ZONES, MATCH_IRE_SHORTERMASK, ipst);
640 while (ire != NULL) {
641 /* We need to handle all in the same bucket */
642 irb_increment_generation(ire->ire_bucket);
644 mask = ire->ire_mask_v6;
645 ASSERT(masklen > ip_mask_to_plen_v6(&mask));
646 masklen = ip_mask_to_plen_v6(&mask);
647 ire_refrele(ire);
648 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0,
649 NULL, ALL_ZONES, MATCH_IRE_SHORTERMASK, ipst);
652 break;
654 rw_exit(&ipst->ips_ip6_ire_head_lock);
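/*
 * In short: IRE_FLUSH_DELETE and IRE_FLUSH_GWCHANGE bump the generation of
 * this IRE's dependency children; IRE_FLUSH_ADD bumps the reject/blackhole
 * IREs and then every bucket holding a shorter (less specific) match for the
 * new route, so cached longest-prefix decisions get re-evaluated.
 * IRE_IF_CLONE entries are skipped entirely, and a newly added default route
 * only triggers the reject/blackhole bump.
 */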
658 * Matches the arguments passed with the values in the ire.
660 * Note: for match types that match using "ill" passed in, ill
661 * must be checked for non-NULL before calling this routine.
663 boolean_t
664 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
665 const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid,
666 int match_flags)
668 in6_addr_t masked_addr;
669 in6_addr_t gw_addr_v6;
670 ill_t *ire_ill = NULL, *dst_ill;
671 ip_stack_t *ipst = ire->ire_ipst;
673 ASSERT(ire->ire_ipversion == IPV6_VERSION);
674 ASSERT(addr != NULL);
675 ASSERT(mask != NULL);
676 ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
677 ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) ||
678 (ill != NULL && ill->ill_isv6));
681 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it
682 * is in fact hidden, to ensure the caller gets the right one.
684 if (ire->ire_testhidden) {
685 if (!(match_flags & MATCH_IRE_TESTHIDDEN))
686 return (B_FALSE);
689 if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
690 ire->ire_zoneid != ALL_ZONES) {
692 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
693 * does not match that of ire_zoneid, a failure to
694 * match is reported at this point. Otherwise, since some IREs
695 * that are available in the global zone can be used in local
696 * zones, additional checks need to be performed:
698 * IRE_LOOPBACK
699 * entries should never be matched in this situation.
700 * Each zone has its own IRE_LOOPBACK.
702 * IRE_LOCAL
703 * We allow them for any zoneid. ire_route_recursive
704 * does additional checks when
705 * ip_restrict_interzone_loopback is set.
707 * If ill_usesrc_ifindex is set
708 * Then we check if the zone has a valid source address
709 * on the usesrc ill.
711 * If ire_ill is set, then check that the zone has an ipif
712 * on that ill.
714 * Outside of this function (in ire_round_robin) we check
715 * that any IRE_OFFLINK has a gateway that reachable from the
716 * zone when we have multiple choices (ECMP).
718 if (match_flags & MATCH_IRE_ZONEONLY)
719 return (B_FALSE);
720 if (ire->ire_type & IRE_LOOPBACK)
721 return (B_FALSE);
723 if (ire->ire_type & IRE_LOCAL)
724 goto matchit;
727 * The normal case of IRE_ONLINK has a matching zoneid.
728 * Here we handle the case when shared-IP zones have been
729 * configured with IP addresses on vniN. In that case it
730 * is ok for traffic from a zone to use IRE_ONLINK routes
731 * if the ill has a usesrc pointing at vniN
732 * Applies to IRE_INTERFACE.
734 dst_ill = ire->ire_ill;
735 if (ire->ire_type & IRE_ONLINK) {
736 uint_t ifindex;
739 * Note there is no IRE_INTERFACE on vniN thus
740 * can't do an IRE lookup for a matching route.
742 ifindex = dst_ill->ill_usesrc_ifindex;
743 if (ifindex == 0)
744 return (B_FALSE);
747 * If there is a usable source address in the
748 * zone, then it's ok to return this IRE_INTERFACE
750 if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
751 zoneid, ipst)) {
752 ip3dbg(("ire_match_args: no usrsrc for zone"
753 " dst_ill %p\n", (void *)dst_ill));
754 return (B_FALSE);
758 * For example, with
759 * route add 11.0.0.0 gw1 -ifp bge0
760 * route add 11.0.0.0 gw2 -ifp bge1
761 * this code would differentiate based on
762 * where the sending zone has addresses.
763 * Only if the zone has an address on bge0 can it use the first
764 * route. It isn't clear if this behavior is documented
765 * anywhere.
767 if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
768 ipif_t *tipif;
770 mutex_enter(&dst_ill->ill_lock);
771 for (tipif = dst_ill->ill_ipif;
772 tipif != NULL; tipif = tipif->ipif_next) {
773 if (!IPIF_IS_CONDEMNED(tipif) &&
774 (tipif->ipif_flags & IPIF_UP) &&
775 (tipif->ipif_zoneid == zoneid ||
776 tipif->ipif_zoneid == ALL_ZONES))
777 break;
779 mutex_exit(&dst_ill->ill_lock);
780 if (tipif == NULL)
781 return (B_FALSE);
785 matchit:
786 ire_ill = ire->ire_ill;
787 if (match_flags & MATCH_IRE_GW) {
788 mutex_enter(&ire->ire_lock);
789 gw_addr_v6 = ire->ire_gateway_addr_v6;
790 mutex_exit(&ire->ire_lock);
792 if (match_flags & MATCH_IRE_ILL) {
795 * If asked to match an ill, we *must* match
796 * on the ire_ill for ipmp test addresses, or
797 * any of the ill in the group for data addresses.
798 * If we don't, we may as well fail.
799 * However, we need an exception for IRE_LOCALs to ensure
800 * we loopback packets even sent to test addresses on different
801 * interfaces in the group.
803 if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
804 !(ire->ire_type & IRE_LOCAL)) {
805 if (ire->ire_ill != ill)
806 return (B_FALSE);
807 } else {
808 match_flags &= ~MATCH_IRE_TESTHIDDEN;
810 * We know that ill is not NULL, but ire_ill could be
811 * NULL
813 if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
814 return (B_FALSE);
817 if (match_flags & MATCH_IRE_SRC_ILL) {
818 if (ire_ill == NULL)
819 return (B_FALSE);
820 if (!IS_ON_SAME_LAN(ill, ire_ill)) {
821 if (ire_ill->ill_usesrc_ifindex == 0 ||
822 (ire_ill->ill_usesrc_ifindex !=
823 ill->ill_phyint->phyint_ifindex))
824 return (B_FALSE);
828 /* No ire_addr_v6 bits set past the mask */
829 ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
830 ire->ire_addr_v6));
831 V6_MASK_COPY(*addr, *mask, masked_addr);
832 if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
833 ((!(match_flags & MATCH_IRE_GW)) ||
834 ((!(match_flags & MATCH_IRE_DIRECT)) ||
835 !(ire->ire_flags & RTF_INDIRECT)) &&
836 IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
837 ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
838 ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
839 ((!(match_flags & MATCH_IRE_MASK)) ||
840 (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask)))) {
841 /* We found the matched IRE */
842 return (B_TRUE);
844 return (B_FALSE);
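/*
 * The final test above requires, in order: the masked destination to equal
 * ire_addr_v6; the gateway to match when MATCH_IRE_GW is set (and, in that
 * case, MATCH_IRE_DIRECT additionally rejects RTF_INDIRECT routes); the type
 * to overlap when MATCH_IRE_TYPE is set; ire_testhidden when
 * MATCH_IRE_TESTHIDDEN is set; and an exact mask match when MATCH_IRE_MASK
 * is set.
 */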
848 * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
849 * gateway address. If ill is non-NULL we also match on it.
850 * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
852 boolean_t
853 ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill,
854 ip_stack_t *ipst, boolean_t lock_held)
856 ire_t *ire;
857 uint_t match_flags;
859 if (lock_held)
860 ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock));
861 else
862 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
864 match_flags = MATCH_IRE_TYPE;
865 if (ill != NULL)
866 match_flags |= MATCH_IRE_ILL;
868 ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros,
869 &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, match_flags,
870 ipst);
872 if (!lock_held)
873 rw_exit(&ipst->ips_ip6_ire_head_lock);
874 if (ire != NULL) {
875 ire_refrele(ire);
876 return (B_TRUE);
877 } else {
878 return (B_FALSE);
883  * Look up a route in the forwarding table.
884  * A specific lookup is indicated by passing the
885  * required parameters and indicating the
886  * match required in the flag field.
888 * Supports link-local addresses by following the ipif/ill when recursing.
890 ire_t *
891 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
892 const in6_addr_t *gateway, int type, const ill_t *ill,
893 zoneid_t zoneid, int flags, uint32_t xmit_hint, ip_stack_t *ipst,
894 uint_t *generationp)
896 ire_t *ire = NULL;
898 ASSERT(addr != NULL);
899 ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
900 ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
901 ASSERT(ill == NULL || ill->ill_isv6);
903 ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
906 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
907 * or MATCH_IRE_SRC_ILL is set.
909 if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
910 return (NULL);
912 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
913 ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid,
914 flags, ipst);
915 if (ire == NULL) {
916 rw_exit(&ipst->ips_ip6_ire_head_lock);
917 return (NULL);
921 * round-robin only if we have more than one route in the bucket.
922 * ips_ip_ecmp_behavior controls when we do ECMP
923 * 2: always
924 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
925 * 0: never
927 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
928 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
929  * and the IRE_INTERFACEs are likely to be shorter matches.
931 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
932 if (ipst->ips_ip_ecmp_behavior == 2 ||
933 (ipst->ips_ip_ecmp_behavior == 1 &&
934 IS_DEFAULT_ROUTE_V6(ire))) {
935 ire_t *next_ire;
936 ire_ftable_args_t margs;
938 bzero(&margs, sizeof (margs));
939 margs.ift_addr_v6 = *addr;
940 if (mask != NULL)
941 margs.ift_mask_v6 = *mask;
942 if (gateway != NULL)
943 margs.ift_gateway_v6 = *gateway;
944 margs.ift_type = type;
945 margs.ift_ill = ill;
946 margs.ift_zoneid = zoneid;
947 margs.ift_flags = flags;
949 next_ire = ire_round_robin(ire->ire_bucket, &margs,
950 xmit_hint, ire, ipst);
951 if (next_ire == NULL) {
952 /* keep ire if next_ire is null */
953 goto done;
955 ire_refrele(ire);
956 ire = next_ire;
960 done:
961 /* Return generation before dropping lock */
962 if (generationp != NULL)
963 *generationp = ire->ire_generation;
965 rw_exit(&ipst->ips_ip6_ire_head_lock);
968 * For shared-IP zones we need additional checks to what was
969 * done in ire_match_args to make sure IRE_LOCALs are handled.
971 * When ip_restrict_interzone_loopback is set, then
972 * we ensure that IRE_LOCAL are only used for loopback
973 * between zones when the logical "Ethernet" would
974  * have looped them back. That is, if in the absence of
975  * the IRE_LOCAL we would have sent the packet out the
976 * same ill.
978 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
979 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
980 ipst->ips_ip_restrict_interzone_loopback) {
981 ire = ire_alt_local(ire, zoneid, ill, generationp);
982 ASSERT(ire != NULL);
985 return (ire);
989 * Look up a single ire. The caller holds either the read or write lock.
991 ire_t *
992 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
993 const in6_addr_t *gateway, int type, const ill_t *ill,
994 zoneid_t zoneid, int flags, ip_stack_t *ipst)
996 irb_t *irb_ptr;
997 ire_t *ire = NULL;
998 int i;
1000 ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock));
1003 * If the mask is known, the lookup
1004  * is simple; if the mask is not known
1005 * we need to search.
1007 if (flags & MATCH_IRE_MASK) {
1008 uint_t masklen;
1010 masklen = ip_mask_to_plen_v6(mask);
1011 if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) {
1012 return (NULL);
1014 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
1015 IRE_ADDR_MASK_HASH_V6(*addr, *mask,
1016 ipst->ips_ip6_ftable_hash_size)]);
1017 rw_enter(&irb_ptr->irb_lock, RW_READER);
1018 for (ire = irb_ptr->irb_ire; ire != NULL;
1019 ire = ire->ire_next) {
1020 if (IRE_IS_CONDEMNED(ire))
1021 continue;
1022 if (ire_match_args_v6(ire, addr, mask, gateway, type,
1023 ill, zoneid, flags))
1024 goto found_ire;
1026 rw_exit(&irb_ptr->irb_lock);
1027 } else {
1028 uint_t masklen;
1031 * In this case we don't know the mask, we need to
1032 * search the table assuming different mask sizes.
1034 if (flags & MATCH_IRE_SHORTERMASK) {
1035 masklen = ip_mask_to_plen_v6(mask);
1036 if (masklen == 0) {
1037 /* Nothing shorter than zero */
1038 return (NULL);
1040 masklen--;
1041 } else {
1042 masklen = IP6_MASK_TABLE_SIZE - 1;
1045 for (i = masklen; i >= 0; i--) {
1046 in6_addr_t tmpmask;
1048 if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
1049 continue;
1050 (void) ip_plen_to_mask_v6(i, &tmpmask);
1051 irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][
1052 IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
1053 ipst->ips_ip6_ftable_hash_size)];
1054 rw_enter(&irb_ptr->irb_lock, RW_READER);
1055 for (ire = irb_ptr->irb_ire; ire != NULL;
1056 ire = ire->ire_next) {
1057 if (IRE_IS_CONDEMNED(ire))
1058 continue;
1059 if (ire_match_args_v6(ire, addr,
1060 &ire->ire_mask_v6, gateway, type, ill,
1061 zoneid, flags))
1062 goto found_ire;
1064 rw_exit(&irb_ptr->irb_lock);
1067 ASSERT(ire == NULL);
1068 ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
1069 return (NULL);
1071 found_ire:
1072 ire_refhold(ire);
1073 rw_exit(&irb_ptr->irb_lock);
1074 return (ire);
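/*
 * The unknown-mask search above walks the per-prefix-length tables from the
 * longest candidate mask down to /0, hashing the destination with each trial
 * mask, so the first hit is the longest-prefix match (or, with
 * MATCH_IRE_SHORTERMASK, the longest match strictly shorter than the mask
 * passed in).
 */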
1079 * This function is called by
1080 * ip_input/ire_route_recursive when doing a route lookup on only the
1081 * destination address.
1083 * The optimizations of this function over ire_ftable_lookup are:
1084 * o removing unnecessary flag matching
1085 * o doing longest prefix match instead of overloading it further
1086 * with the unnecessary "best_prefix_match"
1088 * If no route is found we return IRE_NOROUTE.
1090 ire_t *
1091 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint,
1092 ip_stack_t *ipst, uint_t *generationp)
1094 ire_t *ire;
1096 ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES,
1097 MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp);
1098 if (ire == NULL) {
1099 ire = ire_reject(ipst, B_TRUE);
1100 if (generationp != NULL)
1101 *generationp = IRE_GENERATION_VERIFY;
1103 /* ftable_lookup did round robin */
1104 return (ire);
1107 ire_t *
1108 ip_select_route_v6(const in6_addr_t *dst, const in6_addr_t src,
1109 ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
1110 int *errorp)
1112 ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
1114 return (ip_select_route(dst, src, ixa, generationp, setsrcp, errorp));
1118 * Recursively look for a route to the destination. Can also match on
1119 * the zoneid and ill. Used for the data paths. See also
1120 * ire_route_recursive_dstonly.
1122 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
1123 * create an IRE_IF_CLONE. This is used on the receive side when we are not
1124 * forwarding.
1125 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
1126 * resolve the gateway.
1128 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1129 * instead.
1131 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1132 * is an error.
1133 * Allow at most one RTF_INDIRECT.
1135 ire_t *
1136 ire_route_recursive_impl_v6(ire_t *ire,
1137 const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg,
1138 zoneid_t zoneid, uint_t match_args,
1139 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst,
1140 in6_addr_t *setsrcp, uint_t *generationp)
1142 int i, j;
1143 in6_addr_t v6nexthop = *nexthop;
1144 ire_t *ires[MAX_IRE_RECURSION];
1145 uint_t generation;
1146 uint_t generations[MAX_IRE_RECURSION];
1147 boolean_t need_refrele = B_FALSE;
1148 boolean_t invalidate = B_FALSE;
1149 ill_t *ill = NULL;
1150 uint_t maskoff = (IRE_LOCAL|IRE_LOOPBACK);
1152 if (setsrcp != NULL)
1153 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
1156 * We iterate up to three times to resolve a route, even though
1157 * we have four slots in the array. The extra slot is for an
1158 * IRE_IF_CLONE we might need to create.
1160 i = 0;
1161 while (i < MAX_IRE_RECURSION - 1) {
1162 /* ire_ftable_lookup handles round-robin/ECMP */
1163 if (ire == NULL) {
1164 ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
1165 (ill != NULL ? ill : ill_arg), zoneid, match_args,
1166 xmit_hint, ipst, &generation);
1167 } else {
1168 /* Caller passed it; extra hold since we will rele */
1169 ire_refhold(ire);
1170 if (generationp != NULL)
1171 generation = *generationp;
1172 else
1173 generation = IRE_GENERATION_VERIFY;
1176 if (ire == NULL) {
1177 if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
1178 ire = ires[0];
1179 ire_refhold(ire);
1180 } else {
1181 ire = ire_reject(ipst, B_TRUE);
1183 goto error;
1186 /* Need to return the ire with RTF_REJECT|BLACKHOLE */
1187 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
1188 goto error;
1190 ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
1192 * Verify that the IRE_IF_CLONE has a consistent generation
1193 * number.
1195 if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
1196 ire_refrele(ire);
1197 ire = NULL;
1198 continue;
1202 * Don't allow anything unusual past the first iteration.
1203 * After the first lookup, we should no longer look for
1204 * (IRE_LOCAL|IRE_LOOPBACK) or RTF_INDIRECT routes.
1206 * In addition, after we have found a direct IRE_OFFLINK,
1207 * we should only look for interface or clone routes.
1209 match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
1210 if ((ire->ire_type & IRE_OFFLINK) &&
1211 !(ire->ire_flags & RTF_INDIRECT)) {
1212 ire_type = IRE_IF_ALL;
1213 } else {
1214 if (!(match_args & MATCH_IRE_TYPE))
1215 ire_type = (IRE_OFFLINK|IRE_ONLINK);
1216 ire_type &= ~maskoff; /* no more LOCAL, LOOPBACK */
1218 match_args |= MATCH_IRE_TYPE;
1219 /* We have a usable IRE */
1220 ires[i] = ire;
1221 generations[i] = generation;
1222 i++;
1224 /* The first RTF_SETSRC address is passed back if setsrcp */
1225 if ((ire->ire_flags & RTF_SETSRC) &&
1226 setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) {
1227 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
1228 &ire->ire_setsrc_addr_v6));
1229 *setsrcp = ire->ire_setsrc_addr_v6;
1233 * Check if we have a short-cut pointer to an IRE for this
1234 * destination, and that the cached dependency isn't stale.
1235 * In that case we've rejoined an existing tree towards a
1236 * parent, thus we don't need to continue the loop to
1237 * discover the rest of the tree.
1239 mutex_enter(&ire->ire_lock);
1240 if (ire->ire_dep_parent != NULL &&
1241 ire->ire_dep_parent->ire_generation ==
1242 ire->ire_dep_parent_generation) {
1243 mutex_exit(&ire->ire_lock);
1244 ire = NULL;
1245 goto done;
1247 mutex_exit(&ire->ire_lock);
1250 * If this type should have an ire_nce_cache (even if it
1251 * doesn't yet have one) then we are done. Includes
1252 * IRE_INTERFACE with a full 128 bit mask.
1254 if (ire->ire_nce_capable) {
1255 ire = NULL;
1256 goto done;
1258 ASSERT(!(ire->ire_type & IRE_IF_CLONE));
1260 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
1261 * particular destination
1263 if (ire->ire_type & IRE_INTERFACE) {
1264 ire_t *clone;
1266 ASSERT(ire->ire_masklen != IPV6_ABITS);
1269 * In the case of ip_input and ILLF_FORWARDING not
1270 * being set, and in the case of RTM_GET, there is
1271 * no point in allocating an IRE_IF_CLONE. We return
1272 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
1273 * result in a ire_dep_parent which is IRE_IF_*
1274 * without an IRE_IF_CLONE.
1275 * We recover from that when we need to send packets
1276 * by ensuring that the generations become
1277 * IRE_GENERATION_VERIFY in this case.
1279 if (!(irr_flags & IRR_ALLOCATE)) {
1280 invalidate = B_TRUE;
1281 ire = NULL;
1282 goto done;
1285 clone = ire_create_if_clone(ire, &v6nexthop,
1286 &generation);
1287 if (clone == NULL) {
1289 * Temporary failure - no memory.
1290 * Don't want caller to cache IRE_NOROUTE.
1292 invalidate = B_TRUE;
1293 ire = ire_blackhole(ipst, B_TRUE);
1294 goto error;
1297 * Make clone next to last entry and the
1298 * IRE_INTERFACE the last in the dependency
1299 * chain since the clone depends on the
1300 * IRE_INTERFACE.
1302 ASSERT(i >= 1);
1303 ASSERT(i < MAX_IRE_RECURSION);
1305 ires[i] = ires[i-1];
1306 generations[i] = generations[i-1];
1307 ires[i-1] = clone;
1308 generations[i-1] = generation;
1309 i++;
1311 ire = NULL;
1312 goto done;
1316 * We only match on the type and optionally ILL when
1317 * recursing. The type match is used by some callers
1318 * to exclude certain types (such as IRE_IF_CLONE or
1319 * IRE_LOCAL|IRE_LOOPBACK).
1321 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
1322 * ire->ire_ill, and we want to find the IRE_INTERFACE for
1323 * ire_ill, so we set ill to the ire_ill
1325 match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
1326 v6nexthop = ire->ire_gateway_addr_v6;
1327 if (ill == NULL && ire->ire_ill != NULL) {
1328 ill = ire->ire_ill;
1329 need_refrele = B_TRUE;
1330 ill_refhold(ill);
1331 match_args |= MATCH_IRE_ILL;
1333 ire = NULL;
1335 ASSERT(ire == NULL);
1336 ire = ire_reject(ipst, B_TRUE);
1338 error:
1339 ASSERT(ire != NULL);
1340 if (need_refrele)
1341 ill_refrele(ill);
1343 cleanup:
1344 /* cleanup ires[i] */
1345 ire_dep_unbuild(ires, i);
1346 for (j = 0; j < i; j++)
1347 ire_refrele(ires[j]);
1349 ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1350 (irr_flags & IRR_INCOMPLETE));
1352 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
1353 * ip_select_route since the reject or lack of memory might be gone.
1355 if (generationp != NULL)
1356 *generationp = IRE_GENERATION_VERIFY;
1357 return (ire);
1359 done:
1360 ASSERT(ire == NULL);
1361 if (need_refrele)
1362 ill_refrele(ill);
1364 /* Build dependencies */
1365 if (i > 1 && !ire_dep_build(ires, generations, i)) {
1366 /* Something in chain was condemned; tear it apart */
1367 ire = ire_blackhole(ipst, B_TRUE);
1368 goto cleanup;
1372 * Release all refholds except the one for ires[0] that we
1373 * will return to the caller.
1375 for (j = 1; j < i; j++)
1376 ire_refrele(ires[j]);
1378 if (invalidate) {
1380 * Since we needed to allocate but couldn't we need to make
1381 * sure that the dependency chain is rebuilt the next time.
1383 ire_dep_invalidate_generations(ires[0]);
1384 generation = IRE_GENERATION_VERIFY;
1385 } else {
1387 * IREs can have been added or deleted while we did the
1388 * recursive lookup and we can't catch those until we've built
1389 * the dependencies. We verify the stored
1390 * ire_dep_parent_generation to catch any such changes and
1391 * return IRE_GENERATION_VERIFY (which will cause
1392 * ip_select_route to be called again so we can redo the
1393  * recursive lookup next time we send a packet).
1395 if (ires[0]->ire_dep_parent == NULL)
1396 generation = ires[0]->ire_generation;
1397 else
1398 generation = ire_dep_validate_generations(ires[0]);
1399 if (generations[0] != ires[0]->ire_generation) {
1400 /* Something changed at the top */
1401 generation = IRE_GENERATION_VERIFY;
1404 if (generationp != NULL)
1405 *generationp = generation;
1407 return (ires[0]);
1410 ire_t *
1411 ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type,
1412 const ill_t *ill, zoneid_t zoneid, uint_t match_args, uint_t irr_flags,
1413 uint32_t xmit_hint, ip_stack_t *ipst, in6_addr_t *setsrcp,
1414 uint_t *generationp)
1416 return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill,
1417 zoneid, match_args, irr_flags, xmit_hint, ipst, setsrcp,
1418 generationp));
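/*
 * Illustrative call (a sketch, not from this file): an output path resolving
 * a unicast destination might use
 *
 *	in6_addr_t setsrc = ipv6_all_zeros;
 *	uint_t gen;
 *	ire_t *ire = ire_route_recursive_v6(&dst, 0, NULL, zoneid,
 *	    MATCH_IRE_DSTONLY, IRR_ALLOCATE, 0, ipst, &setsrc, &gen);
 *
 * The result is never NULL (a reject/blackhole IRE is returned instead), it
 * is held, and it must be released with ire_refrele().
 */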
1422 * Recursively look for a route to the destination.
1423 * We only handle a destination match here, yet we have the same arguments
1424 * as the full match to allow function pointers to select between the two.
1426 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1427 * instead.
1429 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1430 * is an error.
1431 * Allow at most one RTF_INDIRECT.
1433 ire_t *
1434 ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, uint_t irr_flags,
1435 uint32_t xmit_hint, ip_stack_t *ipst)
1437 ire_t *ire;
1438 ire_t *ire1;
1439 uint_t generation;
1441 /* ire_ftable_lookup handles round-robin/ECMP */
1442 ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst,
1443 &generation);
1444 ASSERT(ire != NULL);
1447 * If the IRE has a current cached parent we know that the whole
1448 * parent chain is current, hence we don't need to discover and
1449 * build any dependencies by doing a recursive lookup.
1451 mutex_enter(&ire->ire_lock);
1452 if (ire->ire_dep_parent != NULL) {
1453 if (ire->ire_dep_parent->ire_generation ==
1454 ire->ire_dep_parent_generation) {
1455 mutex_exit(&ire->ire_lock);
1456 return (ire);
1458 mutex_exit(&ire->ire_lock);
1459 } else {
1460 mutex_exit(&ire->ire_lock);
1462 * If this type should have an ire_nce_cache (even if it
1463 * doesn't yet have one) then we are done. Includes
1464 * IRE_INTERFACE with a full 128 bit mask.
1466 if (ire->ire_nce_capable)
1467 return (ire);
1471  * Fall back to the loop in the normal code, starting with the ire
1472 * we found. Normally this would return the same ire.
1474 ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES,
1475 MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, &generation);
1476 ire_refrele(ire);
1477 return (ire1);