4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 #include "mpd_tables.h"
/* Bitmask of debug categories; tested against D_PHYINT, D_TIMER, etc. */
int debug = 0;

/*
 * Descriptor set handed to poll().  Slots whose fd is -1 are free; the
 * array is grown by poll_add() when no free slot is left.
 */
static int pollfd_num = 0;		/* Number of slots in pollfds[] */
static struct pollfd *pollfds = NULL;	/* The poll descriptor array */
32 /* All times below in ms */
33 int user_failure_detection_time
; /* user specified failure detection */
35 int user_probe_interval
; /* derived from user specified fdt */
38 * Structure to store mib2 information returned by the kernel.
39 * This is used to process routing table information.
41 typedef struct mib_item_s
{
42 struct mib_item_s
*mi_next
;
43 struct opthdr mi_opthdr
;
47 static int rtsock_v4
; /* AF_INET routing socket */
48 static int rtsock_v6
; /* AF_INET6 routing socket */
49 int ifsock_v4
= -1; /* IPv4 socket for ioctls */
50 int ifsock_v6
= -1; /* IPv6 socket for ioctls */
51 static int lsock_v4
; /* Listen socket to detect mpathd */
52 static int lsock_v6
; /* Listen socket to detect mpathd */
53 static int mibfd
= -1; /* fd to get mib info */
54 static boolean_t force_mcast
= _B_FALSE
; /* Only for test purposes */
56 static uint_t last_initifs_time
; /* Time when initifs was last run */
57 static char **argv0
; /* Saved for re-exec on SIGHUP */
58 boolean_t handle_link_notifications
= _B_TRUE
;
59 static int ipRouteEntrySize
; /* Size of IPv4 route entry */
60 static int ipv6RouteEntrySize
; /* Size of IPv6 route entry */
62 static void initlog(void);
63 static void run_timeouts(void);
64 static void initifs(void);
65 static void check_if_removed(struct phyint_instance
*pii
);
66 static void select_test_ifs(void);
67 static void update_router_list(mib_item_t
*item
);
68 static void mib_get_constants(mib_item_t
*item
);
69 static int mibwalk(void (*proc
)(mib_item_t
*));
70 static void ire_process_v4(mib2_ipRouteEntry_t
*buf
, size_t len
);
71 static void ire_process_v6(mib2_ipv6RouteEntry_t
*buf
, size_t len
);
72 static void router_add_common(int af
, char *ifname
,
73 struct in6_addr nexthop
);
74 static void init_router_targets();
75 static void cleanup(void);
76 static int setup_listener(int af
);
77 static void check_config(void);
78 static void check_testconfig(void);
79 static void check_addr_unique(struct phyint_instance
*,
80 struct sockaddr_storage
*);
81 static void init_host_targets(void);
82 static void dup_host_targets(struct phyint_instance
*desired_pii
);
83 static void loopback_cmd(int sock
, int family
);
84 static boolean_t
daemonize(void);
85 static int closefunc(void *, int);
86 static unsigned int process_cmd(int newfd
, union mi_commands
*mpi
);
87 static unsigned int process_query(int fd
, mi_query_t
*miq
);
88 static unsigned int send_addrinfo(int fd
, ipmp_addrinfo_t
*adinfop
);
89 static unsigned int send_groupinfo(int fd
, ipmp_groupinfo_t
*grinfop
);
90 static unsigned int send_grouplist(int fd
, ipmp_grouplist_t
*grlistp
);
91 static unsigned int send_ifinfo(int fd
, ipmp_ifinfo_t
*ifinfop
);
92 static unsigned int send_result(int fd
, unsigned int error
, int syserror
);
94 addrlist_t
*localaddrs
;
97 * Return the current time in milliseconds (from an arbitrary reference)
98 * truncated to fit into an int. Truncation is ok since we are interested
99 * only in differences and not the absolute values.
104 uint_t cur_time
; /* In ms */
107 * Use of a non-user-adjustable source of time is
108 * required. However millisecond precision is sufficient.
111 cur_time
= (uint_t
)(gethrtime() / 1000000LL);
118 return (gethrtime() / NANOSEC
);
122 * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
129 struct pollfd
*newfds
;
131 /* Check if already present */
132 for (i
= 0; i
< pollfd_num
; i
++) {
133 if (pollfds
[i
].fd
== fd
)
136 /* Check for empty spot already present */
137 for (i
= 0; i
< pollfd_num
; i
++) {
138 if (pollfds
[i
].fd
== -1) {
144 /* Allocate space for 32 more fds and initialize to -1 */
145 new_num
= pollfd_num
+ 32;
146 newfds
= reallocarray(pollfds
, new_num
, sizeof (struct pollfd
));
147 if (newfds
== NULL
) {
148 logperror("poll_add: realloc");
151 for (i
= pollfd_num
; i
< new_num
; i
++) {
153 newfds
[i
].events
= POLLIN
;
155 pollfd_num
= new_num
;
161 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
168 /* Check if already present */
169 for (i
= 0; i
< pollfd_num
; i
++) {
170 if (pollfds
[i
].fd
== fd
) {
179 * Extract information about the phyint instance. If the phyint instance still
180 * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
181 * will use it to detect phyint instances that don't exist any longer and
182 * remove them from our database of phyint instances.
184 * returns true if the phyint instance exists in the kernel,
185 * returns false otherwise
188 pii_process(int af
, char *name
, struct phyint_instance
**pii_p
)
191 struct phyint_instance
*pii
;
192 struct phyint_instance
*pii_other
;
194 if (debug
& D_PHYINT
)
195 logdebug("pii_process(%s %s)\n", AF_STR(af
), name
);
197 pii
= phyint_inst_lookup(af
, name
);
200 * Phyint instance does not exist in our tables,
201 * create new phyint instance
203 pii
= phyint_inst_init_from_k(af
, name
);
205 /* Phyint exists in our tables */
206 err
= phyint_inst_update_from_k(pii
);
210 /* Some ioctl error. don't change anything */
214 case PI_GROUP_CHANGED
:
215 case PI_IFINDEX_CHANGED
:
217 * Interface index or group membership has changed.
218 * Delete the old state and recreate based on the new
219 * state (it may no longer be in a group).
221 pii_other
= phyint_inst_other(pii
);
222 if (pii_other
!= NULL
)
223 phyint_inst_delete(pii_other
);
224 phyint_inst_delete(pii
);
225 pii
= phyint_inst_init_from_k(af
, name
);
229 /* Phyint instance has disappeared from kernel */
234 /* Phyint instance exists and is fine */
240 logerr("pii_process: Unknown status %d\n", err
);
247 return (pii
->pii_in_use
? _B_TRUE
: _B_FALSE
);
253 * Scan all interfaces to detect changes as well as new and deleted interfaces
266 struct lifreq lifreq
;
269 struct phyint_instance
*pii
;
270 struct phyint_instance
*next_pii
;
271 struct phyint_group
*pg
, *next_pg
;
272 char pi_name
[LIFNAMSIZ
+ 1];
274 if (debug
& D_PHYINT
)
275 logdebug("initifs: Scanning interfaces\n");
277 last_initifs_time
= getcurrenttime();
280 * Free the existing local address list; we'll build a new list below.
282 addrlist_free(&localaddrs
);
285 * Mark the interfaces so that we can find phyints and logints
286 * which have disappeared from the kernel. pii_process() and
287 * logint_init_from_k() will set {pii,li}_in_use when they find
288 * the interface in the kernel. Also, clear dupaddr bit on probe
289 * logint. check_addr_unique() will set the dupaddr bit on the
290 * probe logint, if the testaddress is not unique.
292 for (pii
= phyint_instances
; pii
!= NULL
; pii
= pii
->pii_next
) {
294 for (li
= pii
->pii_logint
; li
!= NULL
; li
= li
->li_next
) {
296 if (pii
->pii_probe_logint
== li
)
302 * As above, mark groups so that we can detect IPMP interfaces which
303 * have been removed from the kernel. Also, delete the group address
304 * list since we'll iteratively recreate it below.
306 for (pg
= phyint_groups
; pg
!= NULL
; pg
= pg
->pg_next
) {
307 pg
->pg_in_use
= _B_FALSE
;
308 addrlist_free(&pg
->pg_addrs
);
311 lifn
.lifn_family
= AF_UNSPEC
;
312 lifn
.lifn_flags
= LIFC_ALLZONES
| LIFC_UNDER_IPMP
;
314 if (ioctl(ifsock_v4
, SIOCGLIFNUM
, (char *)&lifn
) < 0) {
315 logperror("initifs: ioctl (get interface count)");
319 * Pad the interface count to detect when additional interfaces have
320 * been configured between SIOCGLIFNUM and SIOCGLIFCONF.
322 lifn
.lifn_count
+= 4;
324 if ((buf
= calloc(lifn
.lifn_count
, sizeof (struct lifreq
))) == NULL
) {
325 logperror("initifs: calloc");
329 lifc
.lifc_family
= AF_UNSPEC
;
330 lifc
.lifc_flags
= LIFC_ALLZONES
| LIFC_UNDER_IPMP
;
331 lifc
.lifc_len
= lifn
.lifn_count
* sizeof (struct lifreq
);
334 if (ioctl(ifsock_v4
, SIOCGLIFCONF
, (char *)&lifc
) < 0) {
335 logperror("initifs: ioctl (get interface configuration)");
341 * If every lifr_req slot is taken, then additional interfaces must
342 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF.
343 * Recalculate to make sure we didn't miss any interfaces.
345 nlifr
= lifc
.lifc_len
/ sizeof (struct lifreq
);
346 if (nlifr
>= lifn
.lifn_count
) {
352 * Walk through the lifreqs returned by SIOCGLIFCONF, and refresh the
353 * global list of addresses, phyint groups, phyints, and logints.
355 for (lifr
= lifc
.lifc_req
, i
= 0; i
< nlifr
; i
++, lifr
++) {
356 af
= lifr
->lifr_addr
.ss_family
;
357 sockfd
= (af
== AF_INET
) ? ifsock_v4
: ifsock_v6
;
358 (void) strlcpy(lifreq
.lifr_name
, lifr
->lifr_name
, LIFNAMSIZ
);
360 if (ioctl(sockfd
, SIOCGLIFFLAGS
, &lifreq
) == -1) {
362 logperror("initifs: ioctl (SIOCGLIFFLAGS)");
365 flags
= lifreq
.lifr_flags
;
368 * If the address is IFF_UP, add it to the local address list.
369 * (We ignore addresses that aren't IFF_UP since another node
370 * might legitimately have that address IFF_UP.)
372 if (flags
& IFF_UP
) {
373 (void) addrlist_add(&localaddrs
, lifr
->lifr_name
, flags
,
378 * If this address is on an IPMP meta-interface, update our
379 * phyint_group information (either by recording that group
380 * still exists or creating a new group), and track what
381 * group the address is part of.
383 if (flags
& IFF_IPMP
) {
384 if (ioctl(sockfd
, SIOCGLIFGROUPNAME
, &lifreq
) == -1) {
386 logperror("initifs: ioctl "
387 "(SIOCGLIFGROUPNAME)");
391 pg
= phyint_group_lookup(lifreq
.lifr_groupname
);
393 pg
= phyint_group_create(lifreq
.lifr_groupname
);
395 logerr("initifs: cannot create group "
396 "%s\n", lifreq
.lifr_groupname
);
399 phyint_group_insert(pg
);
401 pg
->pg_in_use
= _B_TRUE
;
404 * Add this to the group's list of data addresses.
406 if (!addrlist_add(&pg
->pg_addrs
, lifr
->lifr_name
, flags
,
408 logerr("initifs: insufficient memory to track "
409 "data address information for %s\n",
416 * This isn't an address on an IPMP meta-interface, so it's
417 * either on an underlying interface or not related to any
418 * group. Update our phyint and logint information (via
419 * pii_process() and logint_init_from_k()) -- but first,
420 * convert the logint name to a phyint name so we can call
423 (void) strlcpy(pi_name
, lifr
->lifr_name
, sizeof (pi_name
));
424 if ((cp
= strchr(pi_name
, IF_SEPARATOR
)) != NULL
)
427 if (pii_process(af
, pi_name
, &pii
)) {
428 /* The phyint is fine. So process the logint */
429 logint_init_from_k(pii
, lifr
->lifr_name
);
430 check_addr_unique(pii
, &lifr
->lifr_addr
);
436 * Scan for groups, phyints and logints that have disappeared from the
437 * kernel, and delete them.
439 for (pii
= phyint_instances
; pii
!= NULL
; pii
= next_pii
) {
440 next_pii
= pii
->pii_next
;
441 check_if_removed(pii
);
444 for (pg
= phyint_groups
; pg
!= NULL
; pg
= next_pg
) {
445 next_pg
= pg
->pg_next
;
446 if (!pg
->pg_in_use
) {
447 phyint_group_delete(pg
);
451 * Refresh the group's state. This is necessary since the
452 * group's state is defined by the set of usable interfaces in
453 * the group, and an interface is considered unusable if all
454 * of its addresses are down. When an address goes down/up,
455 * the RTM_DELADDR/RTM_NEWADDR brings us through here.
457 phyint_group_refresh_state(pg
);
461 * Select a test address for sending probes on each phyint instance
466 * Handle link up/down notifications.
468 process_link_state_changes();
472 * Check that a given test address is unique across all of the interfaces in a
473 * group. (e.g., IPv6 link-locals may not be inherently unique, and binding
474 * to such an (IFF_NOFAILOVER) address can produce unexpected results.)
475 * Any issues will be reported by check_testconfig().
478 check_addr_unique(struct phyint_instance
*ourpii
, struct sockaddr_storage
*ss
)
481 struct phyint_group
*pg
;
482 struct in6_addr addr
;
483 struct phyint_instance
*pii
;
484 struct sockaddr_in
*sin
;
486 if (ss
->ss_family
== AF_INET
) {
487 sin
= (struct sockaddr_in
*)ss
;
488 IN6_INADDR_TO_V4MAPPED(&sin
->sin_addr
, &addr
);
490 assert(ss
->ss_family
== AF_INET6
);
491 addr
= ((struct sockaddr_in6
*)ss
)->sin6_addr
;
495 * For anonymous groups, every interface is assumed to be on its own
496 * link, so there is no chance of overlapping addresses.
498 pg
= ourpii
->pii_phyint
->pi_group
;
499 if (pg
== phyint_anongroup
)
503 * Walk the list of phyint instances in the group and check for test
504 * addresses matching ours. Of course, we skip ourself.
506 for (pi
= pg
->pg_phyint
; pi
!= NULL
; pi
= pi
->pi_pgnext
) {
507 pii
= PHYINT_INSTANCE(pi
, ss
->ss_family
);
508 if (pii
== NULL
|| pii
== ourpii
||
509 pii
->pii_probe_logint
== NULL
)
513 * If this test address is not unique, set the dupaddr bit.
515 if (IN6_ARE_ADDR_EQUAL(&addr
, &pii
->pii_probe_logint
->li_addr
))
516 pii
->pii_probe_logint
->li_dupaddr
= 1;
521 * Stop probing an interface. Called when an interface is offlined.
522 * The probe socket is closed on each interface instance, and the
523 * interface state set to PI_OFFLINE.
526 stop_probing(struct phyint
*pi
)
528 struct phyint_instance
*pii
;
532 if (pii
->pii_probe_sock
!= -1)
533 close_probe_socket(pii
, _B_TRUE
);
534 pii
->pii_probe_logint
= NULL
;
539 if (pii
->pii_probe_sock
!= -1)
540 close_probe_socket(pii
, _B_TRUE
);
541 pii
->pii_probe_logint
= NULL
;
544 phyint_chstate(pi
, PI_OFFLINE
);
547 enum { BAD_TESTFLAGS
, OK_TESTFLAGS
, BEST_TESTFLAGS
};
550 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set.
551 * IFF_UP must also be set so that the associated address can be used as a
552 * source address. Further, we must be able to exchange packets with local
553 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical
554 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses.
557 rate_testflags(uint64_t flags
)
559 if ((flags
& (IFF_NOFAILOVER
| IFF_UP
)) != (IFF_NOFAILOVER
| IFF_UP
))
560 return (BAD_TESTFLAGS
);
562 if ((flags
& (IFF_NOXMIT
| IFF_NOLOCAL
)) != 0)
563 return (BAD_TESTFLAGS
);
565 if ((flags
& (IFF_IPV6
| IFF_DEPRECATED
)) == IFF_DEPRECATED
)
566 return (BEST_TESTFLAGS
);
568 if ((flags
& (IFF_IPV6
| IFF_DEPRECATED
)) == IFF_IPV6
)
569 return (BEST_TESTFLAGS
);
571 return (OK_TESTFLAGS
);
575 * Attempt to select a test address for each phyint instance.
576 * Call phyint_inst_sockinit() to complete the initializations.
579 select_test_ifs(void)
582 struct phyint_instance
*pii
;
583 struct phyint_instance
*next_pii
;
585 struct logint
*probe_logint
;
586 boolean_t target_scan_reqd
= _B_FALSE
;
589 if (debug
& D_PHYINT
)
590 logdebug("select_test_ifs\n");
593 * For each phyint instance, do the test address selection
595 for (pii
= phyint_instances
; pii
!= NULL
; pii
= next_pii
) {
596 next_pii
= pii
->pii_next
;
600 * An interface that is offline should not be probed.
601 * IFF_OFFLINE interfaces should always be PI_OFFLINE
602 * unless some other entity has set the offline flag.
604 if (pii
->pii_phyint
->pi_flags
& IFF_OFFLINE
) {
605 if (pii
->pii_phyint
->pi_state
!= PI_OFFLINE
) {
606 logerr("shouldn't be probing offline"
607 " interface %s (state is: %u)."
608 " Stopping probes.\n",
609 pii
->pii_phyint
->pi_name
,
610 pii
->pii_phyint
->pi_state
);
611 stop_probing(pii
->pii_phyint
);
616 * If something cleared IFF_OFFLINE (e.g., by accident
617 * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is
618 * inherently racy), the phyint may still be offline.
621 if (pii
->pii_phyint
->pi_state
== PI_OFFLINE
)
625 li
= pii
->pii_probe_logint
;
628 * We've already got a test address; only proceed
629 * if it's suboptimal.
631 if (rate_testflags(li
->li_flags
) == BEST_TESTFLAGS
)
636 * Walk the logints of this phyint instance, and select
637 * the best available test address
639 for (li
= pii
->pii_logint
; li
!= NULL
; li
= li
->li_next
) {
641 * Skip 0.0.0.0 addresses, as those are never
644 if (pii
->pii_af
== AF_INET
&&
645 IN6_IS_ADDR_V4MAPPED_ANY(&li
->li_addr
))
649 * Skip any IPv6 logints that are not link-local,
650 * since we should always have a link-local address
651 * anyway and in6_data() expects link-local replies.
653 if (pii
->pii_af
== AF_INET6
&&
654 !IN6_IS_ADDR_LINKLOCAL(&li
->li_addr
))
658 * Rate the testflags. If we've found an optimal
659 * match, then break out; otherwise, record the most
662 rating
= rate_testflags(li
->li_flags
);
663 if (rating
== BAD_TESTFLAGS
)
667 if (rating
== BEST_TESTFLAGS
)
672 * If the probe logint has changed, ditch the old one.
674 if (pii
->pii_probe_logint
!= NULL
&&
675 pii
->pii_probe_logint
!= probe_logint
) {
676 if (pii
->pii_probe_sock
!= -1)
677 close_probe_socket(pii
, _B_TRUE
);
678 pii
->pii_probe_logint
= NULL
;
681 if (probe_logint
== NULL
) {
683 * We don't have a test address; zero out the probe
684 * stats array since it is no longer relevant.
685 * Optimize by checking if it is already zeroed out.
689 pr_ndx
= PROBE_INDEX_PREV(pii
->pii_probe_next
);
690 if (pii
->pii_probes
[pr_ndx
].pr_status
!= PR_UNUSED
) {
691 clear_pii_probe_stats(pii
);
692 reset_crtt_all(pii
->pii_phyint
);
695 } else if (probe_logint
== pii
->pii_probe_logint
) {
697 * If we didn't find any new test addr, go to the
704 * The phyint is either being assigned a new testaddr
705 * or is being assigned a testaddr for the 1st time.
706 * Need to initialize the phyint socket
708 pii
->pii_probe_logint
= probe_logint
;
709 if (!phyint_inst_sockinit(pii
)) {
710 if (debug
& D_PHYINT
) {
711 logdebug("select_test_ifs: "
712 "phyint_sockinit failed\n");
714 phyint_inst_delete(pii
);
719 * This phyint instance is now enabled for probes; this
720 * impacts our state machine in two ways:
722 * 1. If we're probe *capable* as well (i.e., we have
723 * probe targets) and the interface is in PI_NOTARGETS,
724 * then transition to PI_RUNNING.
726 * 2. If we're not probe capable, and the other phyint
727 * instance is also not probe capable, and we were in
728 * PI_RUNNING, then transition to PI_NOTARGETS.
730 * Also see the state diagram in mpd_probe.c.
732 if (PROBE_CAPABLE(pii
)) {
733 if (pii
->pii_phyint
->pi_state
== PI_NOTARGETS
)
734 phyint_chstate(pii
->pii_phyint
, PI_RUNNING
);
735 } else if (!PROBE_CAPABLE(phyint_inst_other(pii
))) {
736 if (pii
->pii_phyint
->pi_state
== PI_RUNNING
)
737 phyint_chstate(pii
->pii_phyint
, PI_NOTARGETS
);
741 * If no targets are currently known for this phyint
742 * we need to call init_router_targets. Since
743 * init_router_targets() initializes the list of targets
744 * for all phyints it is done below the loop.
746 if (pii
->pii_targets
== NULL
)
747 target_scan_reqd
= _B_TRUE
;
750 * Start the probe timer for this instance.
752 if (!pii
->pii_basetime_inited
&& PROBE_ENABLED(pii
)) {
754 pii
->pii_basetime_inited
= 1;
759 * Scan the interface list for any interfaces that are PI_FAILED or
760 * PI_NOTARGETS but no longer enabled to send probes, and call
761 * phyint_check_for_repair() to see if the link state indicates that
762 * the interface should be repaired. Also see the state diagram in
765 for (pi
= phyints
; pi
!= NULL
; pi
= pi
->pi_next
) {
766 if ((!PROBE_ENABLED(pi
->pi_v4
) && !PROBE_ENABLED(pi
->pi_v6
)) &&
767 (pi
->pi_state
== PI_FAILED
||
768 pi
->pi_state
== PI_NOTARGETS
)) {
769 phyint_check_for_repair(pi
);
776 * Try to populate the target list. init_router_targets populates
777 * the target list from the routing table. If our target list is
778 * still empty, init_host_targets adds host targets based on the
779 * host target list of other phyints in the group.
781 if (target_scan_reqd
) {
782 init_router_targets();
788 * Check test address configuration, and log notices/errors if appropriate.
789 * Note that this function only logs pre-existing conditions (e.g., that
790 * probe-based failure detection is disabled).
793 check_testconfig(void)
797 char abuf
[INET6_ADDRSTRLEN
];
800 for (pi
= phyints
; pi
!= NULL
; pi
= pi
->pi_next
) {
801 if (pi
->pi_flags
& IFF_OFFLINE
)
804 if (PROBE_ENABLED(pi
->pi_v4
) || PROBE_ENABLED(pi
->pi_v6
)) {
805 if (pi
->pi_taddrmsg_printed
||
806 pi
->pi_duptaddrmsg_printed
) {
807 if (pi
->pi_duptaddrmsg_printed
)
811 logmsg(pri
, "Test address now configured on "
812 "interface %s; enabling probe-based "
813 "failure detection on it\n", pi
->pi_name
);
814 pi
->pi_taddrmsg_printed
= 0;
815 pi
->pi_duptaddrmsg_printed
= 0;
821 if (pi
->pi_v4
!= NULL
&& pi
->pi_v4
->pii_probe_logint
!= NULL
&&
822 pi
->pi_v4
->pii_probe_logint
->li_dupaddr
)
823 li
= pi
->pi_v4
->pii_probe_logint
;
825 if (pi
->pi_v6
!= NULL
&& pi
->pi_v6
->pii_probe_logint
!= NULL
&&
826 pi
->pi_v6
->pii_probe_logint
->li_dupaddr
)
827 li
= pi
->pi_v6
->pii_probe_logint
;
829 if (li
!= NULL
&& li
->li_dupaddr
) {
830 if (pi
->pi_duptaddrmsg_printed
)
832 logerr("Test address %s is not unique in group; "
833 "disabling probe-based failure detection on %s\n",
834 pr_addr(li
->li_phyint_inst
->pii_af
,
835 li
->li_addr
, abuf
, sizeof (abuf
)), pi
->pi_name
);
836 pi
->pi_duptaddrmsg_printed
= 1;
840 if (getcurrentsec() < pi
->pi_taddrthresh
)
843 if (!pi
->pi_taddrmsg_printed
) {
844 logtrace("No test address configured on interface %s; "
845 "disabling probe-based failure detection on it\n",
847 pi
->pi_taddrmsg_printed
= 1;
853 * Check phyint group configuration, to detect any inconsistencies,
854 * and log an error message. This is called from runtimeouts every
855 * 20 secs. But the error message is displayed once. If the
856 * consistency is resolved by the admin, a recovery message is displayed
862 struct phyint_group
*pg
;
864 boolean_t v4_in_group
;
865 boolean_t v6_in_group
;
868 * All phyints of a group must be homogeneous to ensure that they can
869 * take over for one another. If any phyint in a group has IPv4
870 * plumbed, check that all phyints have IPv4 plumbed. Do a similar
873 for (pg
= phyint_groups
; pg
!= NULL
; pg
= pg
->pg_next
) {
874 if (pg
== phyint_anongroup
)
877 v4_in_group
= _B_FALSE
;
878 v6_in_group
= _B_FALSE
;
880 * 1st pass. Determine if at least 1 phyint in the group
881 * has IPv4 plumbed and if so set v4_in_group to true.
882 * Repeat similarly for IPv6.
884 for (pi
= pg
->pg_phyint
; pi
!= NULL
; pi
= pi
->pi_pgnext
) {
885 if (pi
->pi_v4
!= NULL
)
886 v4_in_group
= _B_TRUE
;
887 if (pi
->pi_v6
!= NULL
)
888 v6_in_group
= _B_TRUE
;
892 * 2nd pass. If v4_in_group is true, check that phyint
893 * has IPv4 plumbed. Repeat similarly for IPv6. Print
894 * out a message the 1st time only.
896 for (pi
= pg
->pg_phyint
; pi
!= NULL
; pi
= pi
->pi_pgnext
) {
897 if (pi
->pi_flags
& IFF_OFFLINE
)
900 if (v4_in_group
== _B_TRUE
&& pi
->pi_v4
== NULL
) {
901 if (!pi
->pi_cfgmsg_printed
) {
902 logerr("IP interface %s in group %s is"
903 " not plumbed for IPv4, affecting"
904 " IPv4 connectivity\n",
906 pi
->pi_group
->pg_name
);
907 pi
->pi_cfgmsg_printed
= 1;
909 } else if (v6_in_group
== _B_TRUE
&&
911 if (!pi
->pi_cfgmsg_printed
) {
912 logerr("IP interface %s in group %s is"
913 " not plumbed for IPv6, affecting"
914 " IPv6 connectivity\n",
916 pi
->pi_group
->pg_name
);
917 pi
->pi_cfgmsg_printed
= 1;
921 * The phyint matches the group configuration,
922 * if we have reached this point. If it was
923 * improperly configured earlier, log an
924 * error recovery message
926 if (pi
->pi_cfgmsg_printed
) {
927 logerr("IP interface %s is now"
928 " consistent with group %s "
929 " and connectivity is restored\n",
930 pi
->pi_name
, pi
->pi_group
->pg_name
);
931 pi
->pi_cfgmsg_printed
= 0;
940 * Timer mechanism using relative time (in milliseconds) from the
941 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
942 * will fire after TIMER_INFINITY milliseconds.
943 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
944 * time values. Hence 2 consecutive timer events cannot be spaced farther
945 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
946 * that can be passed for the delay parameter of timer_schedule()
948 static uint_t timer_next
; /* Currently scheduled timeout */
949 static boolean_t timer_active
= _B_FALSE
; /* SIGALRM has not yet occurred */
954 timer_next
= getcurrenttime() + TIMER_INFINITY
;
956 * The call to run_timeouts() will get the timer started
957 * Since there are no phyints at this point, the timer will
958 * be set for IF_SCAN_INTERVAL ms.
964 * Make sure the next SIGALRM occurs delay milliseconds from the current
965 * time if not earlier. We are interested only in time differences.
968 timer_schedule(uint_t delay
)
971 struct itimerval itimerval
;
974 logdebug("timer_schedule(%u)\n", delay
);
976 assert(delay
<= TIMER_INFINITY
);
978 now
= getcurrenttime();
980 /* Minimum allowed delay */
983 /* Will this timer occur before the currently scheduled SIGALRM? */
984 if (timer_active
&& TIME_GE(now
+ delay
, timer_next
)) {
985 if (debug
& D_TIMER
) {
986 logdebug("timer_schedule(%u) - no action: "
987 "now %u next %u\n", delay
, now
, timer_next
);
991 timer_next
= now
+ delay
;
993 itimerval
.it_value
.tv_sec
= delay
/ 1000;
994 itimerval
.it_value
.tv_usec
= (delay
% 1000) * 1000;
995 itimerval
.it_interval
.tv_sec
= 0;
996 itimerval
.it_interval
.tv_usec
= 0;
997 if (debug
& D_TIMER
) {
998 logdebug("timer_schedule(%u): sec %ld usec %ld\n",
999 delay
, itimerval
.it_value
.tv_sec
,
1000 itimerval
.it_value
.tv_usec
);
1002 timer_active
= _B_TRUE
;
1003 if (setitimer(ITIMER_REAL
, &itimerval
, NULL
) < 0) {
1004 logperror("timer_schedule: setitimer");
1012 struct itimerval itimerval
;
1014 if (debug
& D_TIMER
)
1015 logdebug("timer_cancel()\n");
1017 bzero(&itimerval
, sizeof (itimerval
));
1018 if (setitimer(ITIMER_REAL
, &itimerval
, NULL
) < 0)
1019 logperror("timer_cancel: setitimer");
1023 * Timer has fired. Determine when the next timer event will occur by asking
1024 * all the timer routines. Should not be called from a timer routine.
1030 uint_t next_event_time
;
1031 struct phyint_instance
*pii
;
1032 struct phyint_instance
*next_pii
;
1033 static boolean_t timeout_running
;
1035 /* assert that recursive timeouts don't happen. */
1036 assert(!timeout_running
);
1038 timeout_running
= _B_TRUE
;
1040 if (debug
& D_TIMER
)
1041 logdebug("run_timeouts()\n");
1043 if ((getcurrenttime() - last_initifs_time
) > IF_SCAN_INTERVAL
) {
1048 next
= TIMER_INFINITY
;
1050 for (pii
= phyint_instances
; pii
!= NULL
; pii
= next_pii
) {
1051 next_pii
= pii
->pii_next
;
1052 next_event_time
= phyint_inst_timer(pii
);
1053 if (next_event_time
!= TIMER_INFINITY
&& next_event_time
< next
)
1054 next
= next_event_time
;
1056 if (debug
& D_TIMER
) {
1057 logdebug("run_timeouts(%s %s): next scheduled for"
1058 " this phyint inst %u, next scheduled global"
1060 AF_STR(pii
->pii_af
), pii
->pii_phyint
->pi_name
,
1061 next_event_time
, next
);
1066 * Make sure initifs() is called at least once every
1067 * IF_SCAN_INTERVAL, to make sure that we are in sync
1068 * with the kernel, in case we have missed any routing
1071 if (next
> IF_SCAN_INTERVAL
)
1072 next
= IF_SCAN_INTERVAL
;
1074 if (debug
& D_TIMER
)
1075 logdebug("run_timeouts: %u ms\n", next
);
1077 timer_schedule(next
);
1078 timeout_running
= _B_FALSE
;
1081 static int eventpipe_read
= -1; /* Used for synchronous signal delivery */
1082 static int eventpipe_write
= -1;
1083 boolean_t cleanup_started
= _B_FALSE
; /* true if we're going away */
1086 * Ensure that signals are processed synchronously with the rest of
1087 * the code by just writing a one character signal number on the pipe.
1088 * The poll loop will pick this up and process the signal event.
1091 sig_handler(int signo
)
1093 uchar_t buf
= (uchar_t
)signo
;
1096 * Don't write to pipe if cleanup has already begun. cleanup()
1097 * might have closed the pipe already
1099 if (cleanup_started
)
1102 if (eventpipe_write
== -1) {
1103 logerr("sig_handler: no pipe found\n");
1106 if (write(eventpipe_write
, &buf
, sizeof (buf
)) < 0)
1107 logperror("sig_handler: write");
1110 extern struct probes_missed probes_missed
;
1113 * Pick up a signal "byte" from the pipe and process it.
1119 uint64_t sent
, acked
, lost
, unacked
, unknown
;
1120 struct phyint_instance
*pii
;
1123 switch (read(fd
, &buf
, sizeof (buf
))) {
1125 logperror("in_signal: read");
1131 logerr("in_signal: read end of file\n");
1135 logerr("in_signal: read > 1\n");
1139 if (debug
& D_TIMER
)
1140 logdebug("in_signal() got %d\n", buf
);
1144 if (debug
& D_TIMER
) {
1145 uint_t now
= getcurrenttime();
1147 logdebug("in_signal(SIGALRM) delta %u\n",
1150 timer_active
= _B_FALSE
;
1154 logdebug("Printing configuration:\n");
1155 /* Print out the internal tables */
1156 phyint_inst_print_all();
1159 * Print out the accumulated statistics about missed
1160 * probes (happens due to scheduling delay).
1162 logerr("Missed sending total of %d probes spread over"
1163 " %d occurrences\n", probes_missed
.pm_nprobes
,
1164 probes_missed
.pm_ntimes
);
1167 * Print out the accumulated statistics about probes
1170 for (pii
= phyint_instances
; pii
!= NULL
;
1171 pii
= pii
->pii_next
) {
1173 acked
= pii
->pii_cum_stats
.acked
;
1174 lost
= pii
->pii_cum_stats
.lost
;
1175 sent
= pii
->pii_cum_stats
.sent
;
1176 unknown
= pii
->pii_cum_stats
.unknown
;
1177 for (pr_ndx
= 0; pr_ndx
< PROBE_STATS_COUNT
; pr_ndx
++) {
1178 switch (pii
->pii_probes
[pr_ndx
].pr_status
) {
1190 logerr("\nProbe stats on (%s %s)\n"
1191 "Number of probes sent %lld\n"
1192 "Number of probe acks received %lld\n"
1193 "Number of probes/acks lost %lld\n"
1194 "Number of valid unacknowledged probes %lld\n"
1195 "Number of ambiguous probe acks received %lld\n",
1196 AF_STR(pii
->pii_af
), pii
->pii_name
,
1197 sent
, acked
, lost
, unacked
, unknown
);
1201 logerr("SIGHUP: restart and reread config file\n");
1203 * Cancel the interval timer. Needed since setitimer() uses
1204 * alarm() and the time left is inherited across exec(), and
1205 * thus the SIGALRM may be delivered before a handler has been
1206 * setup, causing in.mpathd to erroneously exit.
1210 (void) execv(argv0
[0], argv0
);
1220 logerr("in_signal: unknown signal: %d\n", buf
);
1227 struct phyint_instance
*pii
;
1228 struct phyint_instance
*next_pii
;
1231 * Make sure that we don't write to eventpipe in
1232 * sig_handler() if any signal notably SIGALRM,
1233 * occurs after we close the eventpipe descriptor below
1235 cleanup_started
= _B_TRUE
;
1237 for (pii
= phyint_instances
; pii
!= NULL
; pii
= next_pii
) {
1238 next_pii
= pii
->pii_next
;
1239 phyint_inst_delete(pii
);
1242 (void) close(ifsock_v4
);
1243 (void) close(ifsock_v6
);
1244 (void) close(rtsock_v4
);
1245 (void) close(rtsock_v6
);
1246 (void) close(lsock_v4
);
1247 (void) close(lsock_v6
);
1251 (void) close(mibfd
);
1252 (void) close(eventpipe_read
);
1253 (void) close(eventpipe_write
);
1257 * Create pipe for signal delivery and set up signal handlers.
1260 setup_eventpipe(void)
1263 struct sigaction act
;
1265 if ((pipe(fds
)) < 0) {
1266 logperror("setup_eventpipe: pipe");
1269 eventpipe_read
= fds
[0];
1270 eventpipe_write
= fds
[1];
1271 if (poll_add(eventpipe_read
) == -1) {
1275 act
.sa_handler
= sig_handler
;
1276 act
.sa_flags
= SA_RESTART
;
1277 (void) sigaction(SIGALRM
, &act
, NULL
);
1279 (void) sigset(SIGHUP
, sig_handler
);
1280 (void) sigset(SIGUSR1
, sig_handler
);
1281 (void) sigset(SIGTERM
, sig_handler
);
1282 (void) sigset(SIGINT
, sig_handler
);
1283 (void) sigset(SIGQUIT
, sig_handler
);
1287 * Create a routing socket for receiving RTM_IFINFO messages.
1290 setup_rtsock(int af
)
1294 int aware
= RTAW_UNDER_IPMP
;
1296 s
= socket(PF_ROUTE
, SOCK_RAW
, af
);
1298 logperror("setup_rtsock: socket PF_ROUTE");
1302 if (setsockopt(s
, SOL_ROUTE
, RT_AWARE
, &aware
, sizeof (aware
)) == -1) {
1303 logperror("setup_rtsock: setsockopt RT_AWARE");
1308 if ((flags
= fcntl(s
, F_GETFL
, 0)) < 0) {
1309 logperror("setup_rtsock: fcntl F_GETFL");
1313 if ((fcntl(s
, F_SETFL
, flags
| O_NONBLOCK
)) < 0) {
1314 logperror("setup_rtsock: fcntl F_SETFL");
1318 if (poll_add(s
) == -1) {
1326 * Process an RTM_IFINFO message received on a routing socket.
1327 * The return value indicates whether a full interface scan is required.
1328 * Link up/down notifications are reflected in the IFF_RUNNING flag.
1329 * If just the state of the IFF_RUNNING interface flag has changed, a
1330 * full interface scan isn't required.
1333 process_rtm_ifinfo(if_msghdr_t
*ifm
, int type
)
1335 struct sockaddr_dl
*sdl
;
1338 struct phyint_instance
*pii
;
1340 assert(ifm
->ifm_type
== RTM_IFINFO
&& ifm
->ifm_addrs
== RTA_IFP
);
1343 * The sockaddr_dl structure is directly after the
1344 * if_msghdr_t structure. However, at the time of writing, the size of the
1345 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
1346 * to the presence of a timeval structure, which contains longs,
1347 * in the if_data structure. Anyway, we know where the message ends,
1348 * so we work backwards to get the start of the sockaddr_dl structure.
1351 sdl
= (struct sockaddr_dl
*)((char *)ifm
+ ifm
->ifm_msglen
-
1352 sizeof (struct sockaddr_dl
));
1354 assert(sdl
->sdl_family
== AF_LINK
);
1357 * The interface name is in sdl_data.
1358 * RTM_IFINFO messages are only generated for logical interface
1359 * zero, so there is no colon and logical interface number to
1360 * strip from the name. The name is not null terminated, but
1361 * there should be enough space in sdl_data to add the null.
1363 if (sdl
->sdl_nlen
>= sizeof (sdl
->sdl_data
)) {
1364 if (debug
& D_LINKNOTE
)
1365 logdebug("process_rtm_ifinfo: phyint name too long\n");
1368 sdl
->sdl_data
[sdl
->sdl_nlen
] = 0;
1370 pi
= phyint_lookup(sdl
->sdl_data
);
1372 if (debug
& D_LINKNOTE
)
1373 logdebug("process_rtm_ifinfo: phyint lookup failed"
1374 " for %s\n", sdl
->sdl_data
);
1379 * We want to try and avoid doing a full interface scan for
1380 * link state notifications from the datalink layer, as indicated
1381 * by the state of the IFF_RUNNING flag. If just the
1382 * IFF_RUNNING flag has changed state, the link state changes
1383 * are processed without a full scan.
1384 * If there is both an IPv4 and IPv6 instance associated with
1385 * the physical interface, we will get an RTM_IFINFO message
1386 * for each instance. If we just maintained a single copy of
1387 * the physical interface flags, it would appear that no flags
1388 * had changed when the second message is processed, leading us
1389 * to believe that the message wasn't generated by a flags change,
1390 * and that a full interface scan is required.
1391 * To get around this problem, two additional copies of the flags
1392 * are kept, one copy for each instance. These are only used in
1393 * this routine. At any one time, all three copies of the flags
1394 * should be identical except for the IFF_RUNNING flag. The
1395 * copy of the flags in the "phyint" structure is always up to
1398 pii
= (type
== AF_INET
) ? pi
->pi_v4
: pi
->pi_v6
;
1400 if (debug
& D_LINKNOTE
)
1401 logdebug("process_rtm_ifinfo: no instance of address "
1402 "family %s for %s\n", AF_STR(type
), pi
->pi_name
);
1406 old_flags
= pii
->pii_flags
;
1407 pii
->pii_flags
= PHYINT_FLAGS(ifm
->ifm_flags
);
1408 pi
->pi_flags
= pii
->pii_flags
;
1410 if (debug
& D_LINKNOTE
) {
1411 logdebug("process_rtm_ifinfo: %s address family: %s, "
1412 "old flags: %llx, new flags: %llx\n", pi
->pi_name
,
1413 AF_STR(type
), old_flags
, pi
->pi_flags
);
1417 * If IFF_STANDBY has changed, indicate that the interface has changed
1418 * types and refresh IFF_INACTIVE if need be.
1420 if ((old_flags
^ pii
->pii_flags
) & IFF_STANDBY
) {
1422 if (pii
->pii_flags
& IFF_STANDBY
)
1423 phyint_standby_refresh_inactive(pi
);
1426 /* Has just the IFF_RUNNING flag changed state ? */
1427 if ((old_flags
^ pii
->pii_flags
) != IFF_RUNNING
) {
1428 struct phyint_instance
*pii_other
;
1430 * It wasn't just a link state change. Update
1431 * the other instance's copy of the flags.
1433 pii_other
= phyint_inst_other(pii
);
1434 if (pii_other
!= NULL
)
1435 pii_other
->pii_flags
= pii
->pii_flags
;
1443 * Retrieve as many routing socket messages as possible, and try to
1444 * empty the routing sockets. Initiate full scan of targets or interfaces
1446 * We listen on separate IPv4 and IPv6 sockets so that we can accurately
1447 * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1450 process_rtsock(int rtsock_v4
, int rtsock_v6
)
1453 int64_t msg
[2048 / 8];
1454 struct rt_msghdr
*rtm
;
1455 boolean_t need_if_scan
= _B_FALSE
;
1456 boolean_t need_rt_scan
= _B_FALSE
;
1457 boolean_t rtm_ifinfo_seen
= _B_FALSE
;
1460 /* Read as many messages as possible and try to empty the sockets */
1461 for (type
= AF_INET
; ; type
= AF_INET6
) {
1463 nbytes
= read((type
== AF_INET
) ? rtsock_v4
:
1464 rtsock_v6
, msg
, sizeof (msg
));
1466 /* No more messages */
1469 rtm
= (struct rt_msghdr
*)msg
;
1470 if (rtm
->rtm_version
!= RTM_VERSION
) {
1471 logerr("process_rtsock: version %d "
1472 "not understood\n", rtm
->rtm_version
);
1476 if (debug
& D_PHYINT
) {
1477 logdebug("process_rtsock: message %d\n",
1481 switch (rtm
->rtm_type
) {
1485 * Some logical interface has changed,
1486 * have to scan everything to determine
1487 * what actually changed.
1489 need_if_scan
= _B_TRUE
;
1493 rtm_ifinfo_seen
= _B_TRUE
;
1494 need_if_scan
|= process_rtm_ifinfo(
1495 (if_msghdr_t
*)rtm
, type
);
1503 need_rt_scan
= _B_TRUE
;
1507 /* Not interesting */
1511 if (type
== AF_INET6
)
1516 if (debug
& D_LINKNOTE
&& rtm_ifinfo_seen
)
1517 logdebug("process_rtsock: synchronizing with kernel\n");
1519 } else if (rtm_ifinfo_seen
) {
1520 if (debug
& D_LINKNOTE
)
1521 logdebug("process_rtsock: "
1522 "link up/down notification(s) seen\n");
1523 process_link_state_changes();
1527 init_router_targets();
1531 * Look if the phyint instance or one of its logints have been removed from
1532 * the kernel and take appropriate action.
1533 * Uses {pii,li}_in_use.
1536 check_if_removed(struct phyint_instance
*pii
)
1539 struct logint
*next_li
;
1541 /* Detect phyints that have been removed from the kernel. */
1542 if (!pii
->pii_in_use
) {
1543 logtrace("%s %s has been removed from kernel\n",
1544 AF_STR(pii
->pii_af
), pii
->pii_phyint
->pi_name
);
1545 phyint_inst_delete(pii
);
1547 /* Detect logints that have been removed. */
1548 for (li
= pii
->pii_logint
; li
!= NULL
; li
= next_li
) {
1549 next_li
= li
->li_next
;
1550 if (!li
->li_in_use
) {
1558 * Parse the supplied mib2 information to extract the routing information
1559 * table. Process the routing table to get the list of known onlink routers
1560 * and update our database. These onlink routers will serve as probe
1564 update_router_list(mib_item_t
*item
)
1566 for (; item
!= NULL
; item
= item
->mi_next
) {
1567 if (item
->mi_opthdr
.name
== 0)
1569 if (item
->mi_opthdr
.level
== MIB2_IP
&&
1570 item
->mi_opthdr
.name
== MIB2_IP_ROUTE
) {
1571 ire_process_v4((mib2_ipRouteEntry_t
*)item
->mi_valp
,
1572 item
->mi_opthdr
.len
);
1573 } else if (item
->mi_opthdr
.level
== MIB2_IP6
&&
1574 item
->mi_opthdr
.name
== MIB2_IP6_ROUTE
) {
1575 ire_process_v6((mib2_ipv6RouteEntry_t
*)item
->mi_valp
,
1576 item
->mi_opthdr
.len
);
1583 * Convert octet `octp' to a phyint name and store in `ifname'
1586 oct2ifname(const Octet_t
*octp
, char *ifname
, size_t ifsize
)
1589 size_t len
= MIN(octp
->o_length
, ifsize
- 1);
1591 (void) strncpy(ifname
, octp
->o_bytes
, len
);
1594 if ((cp
= strchr(ifname
, IF_SEPARATOR
)) != NULL
)
1599 * Examine the IPv4 routing table `buf' for possible targets. For each
1600 * possible target, if it's on the same subnet as an interface route, pass
1601 * it to router_add_common() for further consideration.
1604 ire_process_v4(mib2_ipRouteEntry_t
*buf
, size_t len
)
1606 char ifname
[LIFNAMSIZ
];
1607 mib2_ipRouteEntry_t
*rp
, *rp1
, *endp
;
1608 struct in_addr nexthop_v4
;
1609 struct in6_addr nexthop
;
1611 if (debug
& D_TARGET
)
1612 logdebug("ire_process_v4(len %d)\n", len
);
1617 assert((len
% ipRouteEntrySize
) == 0);
1618 endp
= buf
+ (len
/ ipRouteEntrySize
);
1621 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1622 * cross-reference them with the interface routes to determine if
1623 * they're possible probe targets.
1625 for (rp
= buf
; rp
< endp
; rp
++) {
1626 if (!(rp
->ipRouteInfo
.re_ire_type
& IRE_OFFSUBNET
))
1629 /* Get the nexthop address. */
1630 nexthop_v4
.s_addr
= rp
->ipRouteNextHop
;
1633 * Rescan the routing table looking for interface routes that
1634 * are on the same subnet, and try to add them. If they're
1635 * not relevant (e.g., the interface route isn't part of an
1636 * IPMP group), router_add_common() will discard them.
1638 for (rp1
= buf
; rp1
< endp
; rp1
++) {
1639 if (!(rp1
->ipRouteInfo
.re_ire_type
& IRE_INTERFACE
) ||
1640 rp1
->ipRouteIfIndex
.o_length
== 0)
1643 if ((rp1
->ipRouteDest
& rp1
->ipRouteMask
) !=
1644 (nexthop_v4
.s_addr
& rp1
->ipRouteMask
))
1647 oct2ifname(&rp1
->ipRouteIfIndex
, ifname
, LIFNAMSIZ
);
1648 IN6_INADDR_TO_V4MAPPED(&nexthop_v4
, &nexthop
);
1649 router_add_common(AF_INET
, ifname
, nexthop
);
1655 router_add_common(int af
, char *ifname
, struct in6_addr nexthop
)
1657 struct phyint_instance
*pii
;
1660 if (debug
& D_TARGET
)
1661 logdebug("router_add_common(%s %s)\n", AF_STR(af
), ifname
);
1664 * Retrieve the phyint instance; bail if it's not known to us yet.
1666 pii
= phyint_inst_lookup(af
, ifname
);
1671 * Don't use our own addresses as targets.
1673 if (own_address(nexthop
))
1677 * If the phyint is part a named group, then add the address to all
1678 * members of the group; note that this is suboptimal in the IPv4 case
1679 * as it has already been added to all matching interfaces in
1680 * ire_process_v4(). Otherwise, add the address only to the phyint
1681 * itself, since other phyints in the anongroup may not be on the same
1684 pi
= pii
->pii_phyint
;
1685 if (pi
->pi_group
== phyint_anongroup
) {
1686 target_add(pii
, nexthop
, _B_TRUE
);
1688 pi
= pi
->pi_group
->pg_phyint
;
1689 for (; pi
!= NULL
; pi
= pi
->pi_pgnext
)
1690 target_add(PHYINT_INSTANCE(pi
, af
), nexthop
, _B_TRUE
);
1695 * Examine the IPv6 routing table `buf' for possible link-local targets, and
1696 * pass any contenders to router_add_common() for further consideration.
1699 ire_process_v6(mib2_ipv6RouteEntry_t
*buf
, size_t len
)
1702 char ifname
[LIFNAMSIZ
];
1703 char grname
[LIFGRNAMSIZ
];
1704 mib2_ipv6RouteEntry_t
*rp
, *rp1
, *endp
;
1705 struct in6_addr nexthop_v6
;
1707 if (debug
& D_TARGET
)
1708 logdebug("ire_process_v6(len %d)\n", len
);
1713 assert((len
% ipv6RouteEntrySize
) == 0);
1714 endp
= buf
+ (len
/ ipv6RouteEntrySize
);
1717 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1718 * cross-reference them with the interface routes to determine if
1719 * they're possible probe targets.
1721 for (rp
= buf
; rp
< endp
; rp
++) {
1722 if (!(rp
->ipv6RouteInfo
.re_ire_type
& IRE_OFFSUBNET
) ||
1723 !IN6_IS_ADDR_LINKLOCAL(&rp
->ipv6RouteNextHop
))
1726 /* Get the nexthop address. */
1727 nexthop_v6
= rp
->ipv6RouteNextHop
;
1730 * The interface name should always exist for link-locals;
1731 * we use it to map this entry to an IPMP group name.
1733 if (rp
->ipv6RouteIfIndex
.o_length
== 0)
1736 oct2ifname(&rp
->ipv6RouteIfIndex
, lifr
.lifr_name
, LIFNAMSIZ
);
1737 if (ioctl(ifsock_v6
, SIOCGLIFGROUPNAME
, &lifr
) == -1 ||
1738 strlcpy(grname
, lifr
.lifr_groupname
, LIFGRNAMSIZ
) == 0) {
1743 * Rescan the list of routes for interface routes, and add the
1744 * above target to any interfaces in the same IPMP group.
1746 for (rp1
= buf
; rp1
< endp
; rp1
++) {
1747 if (!(rp1
->ipv6RouteInfo
.re_ire_type
& IRE_INTERFACE
) ||
1748 rp1
->ipv6RouteIfIndex
.o_length
== 0) {
1751 oct2ifname(&rp1
->ipv6RouteIfIndex
, ifname
, LIFNAMSIZ
);
1752 (void) strlcpy(lifr
.lifr_name
, ifname
, LIFNAMSIZ
);
1754 if (ioctl(ifsock_v6
, SIOCGLIFGROUPNAME
, &lifr
) != -1 &&
1755 strcmp(lifr
.lifr_groupname
, grname
) == 0) {
1756 router_add_common(AF_INET6
, ifname
, nexthop_v6
);
1763 * Build a list of target routers, by scanning the routing tables.
1764 * It is assumed that interface routes exist, to reach the routers.
1767 init_router_targets(void)
1770 struct target
*next_tg
;
1771 struct phyint_instance
*pii
;
1777 for (pii
= phyint_instances
; pii
!= NULL
; pii
= pii
->pii_next
) {
1778 pi
= pii
->pii_phyint
;
1780 * Set tg_in_use to false only for router targets.
1782 if (!pii
->pii_targets_are_routers
)
1785 for (tg
= pii
->pii_targets
; tg
!= NULL
; tg
= tg
->tg_next
)
1789 if (mibwalk(update_router_list
) == -1)
1792 for (pii
= phyint_instances
; pii
!= NULL
; pii
= pii
->pii_next
) {
1793 pi
= pii
->pii_phyint
;
1794 if (!pii
->pii_targets_are_routers
)
1797 for (tg
= pii
->pii_targets
; tg
!= NULL
; tg
= next_tg
) {
1798 next_tg
= tg
->tg_next
;
1800 * If the group has failed, it's likely the route was
1801 * removed by an application affected by that failure.
1802 * In that case, we keep the target so that we can
1803 * reliably repair, at which point we'll refresh the
1804 * target list again.
1806 if (!tg
->tg_in_use
&& !GROUP_FAILED(pi
->pi_group
))
1813 * Attempt to assign host targets to any interfaces that do not currently
1814 * have probe targets by sharing targets with other interfaces in the group.
1817 init_host_targets(void)
1819 struct phyint_instance
*pii
;
1820 struct phyint_group
*pg
;
1822 for (pii
= phyint_instances
; pii
!= NULL
; pii
= pii
->pii_next
) {
1823 pg
= pii
->pii_phyint
->pi_group
;
1824 if (pg
!= phyint_anongroup
&& pii
->pii_targets
== NULL
)
1825 dup_host_targets(pii
);
1830 * Duplicate host targets from other phyints of the group to
1831 * the phyint instance 'desired_pii'.
1834 dup_host_targets(struct phyint_instance
*desired_pii
)
1838 struct phyint_instance
*pii
;
1841 assert(desired_pii
->pii_phyint
->pi_group
!= phyint_anongroup
);
1843 af
= desired_pii
->pii_af
;
1846 * For every phyint in the same group as desired_pii, check if
1847 * it has any host targets. If so add them to desired_pii.
1849 for (pi
= desired_pii
->pii_phyint
; pi
!= NULL
; pi
= pi
->pi_pgnext
) {
1850 pii
= PHYINT_INSTANCE(pi
, af
);
1852 * We know that we don't have targets on this phyint instance
1853 * since we have been called. But we still check for
1854 * pii_targets_are_routers because another phyint instance
1855 * could have router targets, since IFF_NOFAILOVER addresses
1856 * on different phyint instances may belong to different
1859 if ((pii
== NULL
) || (pii
== desired_pii
) ||
1860 pii
->pii_targets_are_routers
)
1862 for (tg
= pii
->pii_targets
; tg
!= NULL
; tg
= tg
->tg_next
) {
1863 target_create(desired_pii
, tg
->tg_address
, _B_FALSE
);
1871 (void) fprintf(stderr
, "usage: %s\n", cmd
);
1875 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd"
1877 /* Get an option from the /etc/default/mpathd file */
1879 getdefault(char *name
)
1881 char namebuf
[BUFSIZ
];
1884 if (defopen(MPATHD_DEFAULT_FILE
) == 0) {
1891 flags
= defcntl(DC_GETFLAGS
, 0);
1892 TURNOFF(flags
, DC_CASE
);
1893 (void) defcntl(DC_SETFLAGS
, flags
);
1895 /* Add "=" to the name */
1896 (void) strncpy(namebuf
, name
, sizeof (namebuf
) - 2);
1897 (void) strncat(namebuf
, "=", 2);
1899 if ((cp
= defread(namebuf
)) != NULL
)
1903 (void) defopen(NULL
);
1910 * Command line options below
1912 boolean_t failback_enabled
= _B_TRUE
; /* failback enabled/disabled */
1913 boolean_t track_all_phyints
= _B_FALSE
; /* track all IP interfaces */
1914 static boolean_t adopt
= _B_FALSE
;
1915 static boolean_t foreground
= _B_FALSE
;
1918 main(int argc
, char *argv
[])
1923 struct phyint_instance
*pii
;
1926 argv0
= argv
; /* Saved for re-exec on SIGHUP */
1927 srandom(gethostid()); /* Initialize the random number generator */
1930 * NOTE: The messages output by in.mpathd are not suitable for
1931 * translation, so we do not call textdomain().
1933 (void) setlocale(LC_ALL
, "");
1936 * Get the user specified value of 'failure detection time'
1937 * from /etc/default/mpathd
1939 value
= getdefault("FAILURE_DETECTION_TIME");
1940 if (value
!= NULL
) {
1941 user_failure_detection_time
=
1942 (int)strtol((char *)value
, NULL
, 0);
1944 if (user_failure_detection_time
<= 0) {
1945 user_failure_detection_time
= FAILURE_DETECTION_TIME
;
1946 logerr("Invalid failure detection time %s, assuming "
1947 "default of %d ms\n", value
,
1948 user_failure_detection_time
);
1950 } else if (user_failure_detection_time
<
1951 MIN_FAILURE_DETECTION_TIME
) {
1952 user_failure_detection_time
=
1953 MIN_FAILURE_DETECTION_TIME
;
1954 logerr("Too small failure detection time of %s, "
1955 "assuming minimum of %d ms\n", value
,
1956 user_failure_detection_time
);
1960 /* User has not specified the parameter, Use default value */
1961 user_failure_detection_time
= FAILURE_DETECTION_TIME
;
1965 * This gives the frequency at which probes will be sent.
1966 * When fdt ms elapses, we should be able to determine
1967 * whether 5 consecutive probes have failed or not.
1968 * 1 probe will be sent in every user_probe_interval ms,
1969 * randomly anytime in the (0.5 - 1.0) 2nd half of every
1970 * user_probe_interval. Thus when we send out probe 'n' we
1971 * can be sure that probe 'n - 2' is lost, if we have not
1972 * got the ack. (since the probe interval is > crtt). But
1973 * probe 'n - 1' may be a valid unacked probe, since the
1974 * time between 2 successive probes could be as small as
1975 * 0.5 * user_probe_interval. Hence the NUM_PROBE_FAILS + 2
1977 user_probe_interval
= user_failure_detection_time
/
1978 (NUM_PROBE_FAILS
+ 2);
1981 * Get the user specified value of failback_enabled from
1982 * /etc/default/mpathd
1984 value
= getdefault("FAILBACK");
1985 if (value
!= NULL
) {
1986 if (strcasecmp(value
, "yes") == 0)
1987 failback_enabled
= _B_TRUE
;
1988 else if (strcasecmp(value
, "no") == 0)
1989 failback_enabled
= _B_FALSE
;
1991 logerr("Invalid value for FAILBACK %s\n", value
);
1994 failback_enabled
= _B_TRUE
;
1998 * Get the user specified value of track_all_phyints from
1999 * /etc/default/mpathd. The sense is reversed in
2000 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2002 value
= getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2003 if (value
!= NULL
) {
2004 if (strcasecmp(value
, "yes") == 0)
2005 track_all_phyints
= _B_FALSE
;
2006 else if (strcasecmp(value
, "no") == 0)
2007 track_all_phyints
= _B_TRUE
;
2009 logerr("Invalid value for "
2010 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value
);
2013 track_all_phyints
= _B_FALSE
;
2016 while ((c
= getopt(argc
, argv
, "adD:ml")) != EOF
) {
2022 force_mcast
= _B_TRUE
;
2026 foreground
= _B_TRUE
;
2029 i
= (int)strtol(optarg
, NULL
, 0);
2031 (void) fprintf(stderr
, "Bad debug flags: %s\n",
2036 foreground
= _B_TRUE
;
2040 * Turn off link state notification handling.
2041 * Undocumented command line flag, for debugging
2044 handle_link_notifications
= _B_FALSE
;
2053 * The sockets for the loopback command interface should be listening
2054 * before we fork and exit in daemonize(). This way, whoever started us
2055 * can use the loopback interface as soon as they get a zero exit
2058 lsock_v4
= setup_listener(AF_INET
);
2059 lsock_v6
= setup_listener(AF_INET6
);
2061 if (lsock_v4
< 0 && lsock_v6
< 0) {
2062 logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2068 logerr("cannot daemonize\n");
2076 * 1. Create ifsock* sockets. These are used for performing SIOC*
2077 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2078 * 2. Initialize a pipe for handling/recording signal events.
2079 * 3. Create the routing sockets, used for listening
2080 * to routing / interface changes.
2081 * 4. phyint_init() - Initialize physical interface state
2082 * (in mpd_tables.c). Must be done before creating interfaces,
2083 * which timer_init() does indirectly.
2084 * 5. Query kernel for route entry sizes (v4 and v6).
2085 * 6. timer_init() - Initialize timer related stuff
2086 * 7. initifs() - Initialize our database of all known interfaces
2087 * 8. init_router_targets() - Initialize our database of all known
2090 ifsock_v4
= socket(AF_INET
, SOCK_DGRAM
, 0);
2091 if (ifsock_v4
< 0) {
2092 logperror("main: IPv4 socket open");
2096 ifsock_v6
= socket(AF_INET6
, SOCK_DGRAM
, 0);
2097 if (ifsock_v6
< 0) {
2098 logperror("main: IPv6 socket open");
2104 rtsock_v4
= setup_rtsock(AF_INET
);
2105 rtsock_v6
= setup_rtsock(AF_INET6
);
2107 if (phyint_init() == -1) {
2108 logerr("cannot initialize physical interface structures");
2112 if (mibwalk(mib_get_constants
) == -1)
2120 * If we're operating in "adopt" mode and no interfaces need to be
2121 * tracked, shut down (ifconfig(1M) will restart us on demand if
2122 * interfaces are subsequently put into multipathing groups).
2124 if (adopt
&& phyint_instances
== NULL
)
2128 * Main body. Keep listening for activity on any of the sockets
2129 * that we are monitoring and take appropriate action as necessary.
2130 * signals are also handled synchronously.
2133 if (poll(pollfds
, pollfd_num
, -1) < 0) {
2136 logperror("main: poll");
2139 for (i
= 0; i
< pollfd_num
; i
++) {
2140 if ((pollfds
[i
].fd
== -1) ||
2141 !(pollfds
[i
].revents
& POLLIN
))
2143 if (pollfds
[i
].fd
== eventpipe_read
) {
2144 in_signal(eventpipe_read
);
2147 if (pollfds
[i
].fd
== rtsock_v4
||
2148 pollfds
[i
].fd
== rtsock_v6
) {
2149 process_rtsock(rtsock_v4
, rtsock_v6
);
2153 for (pii
= phyint_instances
; pii
!= NULL
;
2154 pii
= pii
->pii_next
) {
2155 if (pollfds
[i
].fd
== pii
->pii_probe_sock
) {
2156 if (pii
->pii_af
== AF_INET
)
2164 for (pi
= phyints
; pi
!= NULL
; pi
= pi
->pi_next
) {
2165 if (pi
->pi_notes
!= 0 &&
2166 pollfds
[i
].fd
== dlpi_fd(pi
->pi_dh
)) {
2167 (void) dlpi_recv(pi
->pi_dh
, NULL
, NULL
,
2168 NULL
, NULL
, 0, NULL
);
2173 if (pollfds
[i
].fd
== lsock_v4
)
2174 loopback_cmd(lsock_v4
, AF_INET
);
2175 else if (pollfds
[i
].fd
== lsock_v6
)
2176 loopback_cmd(lsock_v6
, AF_INET6
);
2180 return (EXIT_SUCCESS
);
2184 setup_listener(int af
)
2190 struct sockaddr_storage laddr
;
2191 struct sockaddr_in
*sin
;
2192 struct sockaddr_in6
*sin6
;
2193 struct in6_addr loopback_addr
= IN6ADDR_LOOPBACK_INIT
;
2195 assert(af
== AF_INET
|| af
== AF_INET6
);
2197 sock
= socket(af
, SOCK_STREAM
, 0);
2199 logperror("setup_listener: socket");
2204 if (setsockopt(sock
, SOL_SOCKET
, SO_REUSEADDR
, (char *)&on
,
2206 logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2210 bzero(&laddr
, sizeof (laddr
));
2211 laddr
.ss_family
= af
;
2213 if (af
== AF_INET
) {
2214 sin
= (struct sockaddr_in
*)&laddr
;
2215 sin
->sin_port
= htons(MPATHD_PORT
);
2216 sin
->sin_addr
.s_addr
= htonl(INADDR_LOOPBACK
);
2217 len
= sizeof (struct sockaddr_in
);
2219 sin6
= (struct sockaddr_in6
*)&laddr
;
2220 sin6
->sin6_port
= htons(MPATHD_PORT
);
2221 sin6
->sin6_addr
= loopback_addr
;
2222 len
= sizeof (struct sockaddr_in6
);
2225 ret
= bind(sock
, (struct sockaddr
*)&laddr
, len
);
2227 if (errno
== EADDRINUSE
) {
2229 * Another instance of mpathd may be already active.
2231 logerr("main: is another instance of in.mpathd "
2232 "already active?\n");
2239 if (listen(sock
, 30) < 0) {
2240 logperror("main: listen");
2243 if (poll_add(sock
) == -1) {
2252 * Table of commands and their expected size; used by loopback_cmd().
2258 { "MI_PING", sizeof (uint32_t) },
2259 { "MI_OFFLINE", sizeof (mi_offline_t
) },
2260 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t
) },
2261 { "MI_QUERY", sizeof (mi_query_t
) }
2265 * Commands received over the loopback interface come here (via libipmp).
2268 loopback_cmd(int sock
, int family
)
2272 boolean_t is_priv
= _B_FALSE
;
2273 struct sockaddr_storage peer
;
2274 struct sockaddr_in
*peer_sin
;
2275 struct sockaddr_in6
*peer_sin6
;
2277 union mi_commands mpi
;
2278 char abuf
[INET6_ADDRSTRLEN
];
2282 peerlen
= sizeof (peer
);
2283 newfd
= accept(sock
, (struct sockaddr
*)&peer
, &peerlen
);
2285 logperror("loopback_cmd: accept");
2292 * Validate the address and port to make sure that
2293 * non privileged processes don't connect and start
2296 if (peerlen
!= sizeof (struct sockaddr_in
)) {
2297 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen
);
2298 (void) close(newfd
);
2301 peer_sin
= (struct sockaddr_in
*)&peer
;
2302 is_priv
= ntohs(peer_sin
->sin_port
) < IPPORT_RESERVED
;
2303 (void) inet_ntop(AF_INET
, &peer_sin
->sin_addr
.s_addr
,
2304 abuf
, sizeof (abuf
));
2306 if (ntohl(peer_sin
->sin_addr
.s_addr
) != INADDR_LOOPBACK
) {
2307 logerr("Attempt to connect from addr %s port %d\n",
2308 abuf
, ntohs(peer_sin
->sin_port
));
2309 (void) close(newfd
);
2315 if (peerlen
!= sizeof (struct sockaddr_in6
)) {
2316 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen
);
2317 (void) close(newfd
);
2321 * Validate the address and port to make sure that
2322 * non privileged processes don't connect and start
2325 peer_sin6
= (struct sockaddr_in6
*)&peer
;
2326 is_priv
= ntohs(peer_sin6
->sin6_port
) < IPPORT_RESERVED
;
2327 (void) inet_ntop(AF_INET6
, &peer_sin6
->sin6_addr
, abuf
,
2329 if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6
->sin6_addr
)) {
2330 logerr("Attempt to connect from addr %s port %d\n",
2331 abuf
, ntohs(peer_sin6
->sin6_port
));
2332 (void) close(newfd
);
2337 logdebug("loopback_cmd: family %d\n", family
);
2338 (void) close(newfd
);
2343 * The size of the 'mpi' buffer corresponds to the maximum size of
2344 * all supported commands
2346 len
= read(newfd
, &mpi
, sizeof (mpi
));
2349 * In theory, we can receive any sized message for a stream socket,
2350 * but we don't expect that to happen for a small message over a
2351 * loopback connection.
2353 if (len
< sizeof (uint32_t)) {
2354 logerr("loopback_cmd: bad command format or read returns "
2355 "partial data %d\n", len
);
2356 (void) close(newfd
);
2360 cmd
= mpi
.mi_command
;
2361 if (cmd
>= MI_NCMD
) {
2362 logerr("loopback_cmd: unknown command id `%d'\n", cmd
);
2363 (void) close(newfd
);
2368 * Only MI_PING and MI_QUERY can come from unprivileged sources.
2370 if (!is_priv
&& (cmd
!= MI_QUERY
&& cmd
!= MI_PING
)) {
2371 logerr("Unprivileged request from %s for privileged "
2372 "command %s\n", abuf
, commands
[cmd
].name
);
2373 (void) close(newfd
);
2377 if (len
< commands
[cmd
].size
) {
2378 logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2379 commands
[cmd
].name
, commands
[cmd
].size
, len
);
2380 (void) close(newfd
);
2384 retval
= process_cmd(newfd
, &mpi
);
2385 if (retval
!= IPMP_SUCCESS
) {
2386 logerr("failed processing %s: %s\n", commands
[cmd
].name
,
2387 ipmp_errmsg(retval
));
2389 (void) close(newfd
);
2393 * Process the commands received via libipmp.
2396 process_cmd(int newfd
, union mi_commands
*mpi
)
2399 struct mi_offline
*mio
;
2400 struct mi_undo_offline
*miu
;
2401 unsigned int retval
;
2403 switch (mpi
->mi_command
) {
2405 return (send_result(newfd
, IPMP_SUCCESS
, 0));
2408 mio
= &mpi
->mi_ocmd
;
2410 pi
= phyint_lookup(mio
->mio_ifname
);
2412 return (send_result(newfd
, IPMP_EUNKIF
, 0));
2414 retval
= phyint_offline(pi
, mio
->mio_min_redundancy
);
2415 if (retval
== IPMP_FAILURE
)
2416 return (send_result(newfd
, IPMP_FAILURE
, errno
));
2418 return (send_result(newfd
, retval
, 0));
2420 case MI_UNDO_OFFLINE
:
2421 miu
= &mpi
->mi_ucmd
;
2423 pi
= phyint_lookup(miu
->miu_ifname
);
2425 return (send_result(newfd
, IPMP_EUNKIF
, 0));
2427 retval
= phyint_undo_offline(pi
);
2428 if (retval
== IPMP_FAILURE
)
2429 return (send_result(newfd
, IPMP_FAILURE
, errno
));
2431 return (send_result(newfd
, retval
, 0));
2434 return (process_query(newfd
, &mpi
->mi_qcmd
));
2440 return (send_result(newfd
, IPMP_EPROTO
, 0));
2444 * Process the query request pointed to by `miq' and send a reply on file
2445 * descriptor `fd'. Returns an IPMP error code.
2448 process_query(int fd
, mi_query_t
*miq
)
2450 ipmp_addrinfo_t
*adinfop
;
2451 ipmp_addrinfolist_t
*adlp
;
2452 ipmp_groupinfo_t
*grinfop
;
2453 ipmp_groupinfolist_t
*grlp
;
2454 ipmp_grouplist_t
*grlistp
;
2455 ipmp_ifinfo_t
*ifinfop
;
2456 ipmp_ifinfolist_t
*iflp
;
2458 unsigned int retval
;
2460 switch (miq
->miq_inforeq
) {
2462 retval
= getgraddrinfo(miq
->miq_grname
, &miq
->miq_addr
,
2464 if (retval
!= IPMP_SUCCESS
)
2465 return (send_result(fd
, retval
, errno
));
2467 retval
= send_result(fd
, IPMP_SUCCESS
, 0);
2468 if (retval
== IPMP_SUCCESS
)
2469 retval
= send_addrinfo(fd
, adinfop
);
2471 ipmp_freeaddrinfo(adinfop
);
2474 case IPMP_GROUPLIST
:
2475 retval
= getgrouplist(&grlistp
);
2476 if (retval
!= IPMP_SUCCESS
)
2477 return (send_result(fd
, retval
, errno
));
2479 retval
= send_result(fd
, IPMP_SUCCESS
, 0);
2480 if (retval
== IPMP_SUCCESS
)
2481 retval
= send_grouplist(fd
, grlistp
);
2483 ipmp_freegrouplist(grlistp
);
2486 case IPMP_GROUPINFO
:
2487 miq
->miq_grname
[LIFGRNAMSIZ
- 1] = '\0';
2488 retval
= getgroupinfo(miq
->miq_grname
, &grinfop
);
2489 if (retval
!= IPMP_SUCCESS
)
2490 return (send_result(fd
, retval
, errno
));
2492 retval
= send_result(fd
, IPMP_SUCCESS
, 0);
2493 if (retval
== IPMP_SUCCESS
)
2494 retval
= send_groupinfo(fd
, grinfop
);
2496 ipmp_freegroupinfo(grinfop
);
2500 miq
->miq_ifname
[LIFNAMSIZ
- 1] = '\0';
2501 retval
= getifinfo(miq
->miq_ifname
, &ifinfop
);
2502 if (retval
!= IPMP_SUCCESS
)
2503 return (send_result(fd
, retval
, errno
));
2505 retval
= send_result(fd
, IPMP_SUCCESS
, 0);
2506 if (retval
== IPMP_SUCCESS
)
2507 retval
= send_ifinfo(fd
, ifinfop
);
2509 ipmp_freeifinfo(ifinfop
);
2514 * Before taking the snapshot, sync with the kernel.
2518 retval
= getsnap(&snap
);
2519 if (retval
!= IPMP_SUCCESS
)
2520 return (send_result(fd
, retval
, errno
));
2522 retval
= send_result(fd
, IPMP_SUCCESS
, 0);
2523 if (retval
!= IPMP_SUCCESS
)
2526 retval
= ipmp_writetlv(fd
, IPMP_SNAP
, sizeof (*snap
), snap
);
2527 if (retval
!= IPMP_SUCCESS
)
2530 retval
= send_grouplist(fd
, snap
->sn_grlistp
);
2531 if (retval
!= IPMP_SUCCESS
)
2534 iflp
= snap
->sn_ifinfolistp
;
2535 for (; iflp
!= NULL
; iflp
= iflp
->ifl_next
) {
2536 retval
= send_ifinfo(fd
, iflp
->ifl_ifinfop
);
2537 if (retval
!= IPMP_SUCCESS
)
2541 grlp
= snap
->sn_grinfolistp
;
2542 for (; grlp
!= NULL
; grlp
= grlp
->grl_next
) {
2543 retval
= send_groupinfo(fd
, grlp
->grl_grinfop
);
2544 if (retval
!= IPMP_SUCCESS
)
2548 adlp
= snap
->sn_adinfolistp
;
2549 for (; adlp
!= NULL
; adlp
= adlp
->adl_next
) {
2550 retval
= send_addrinfo(fd
, adlp
->adl_adinfop
);
2551 if (retval
!= IPMP_SUCCESS
)
2555 ipmp_snap_free(snap
);
2562 return (send_result(fd
, IPMP_EPROTO
, 0));
2566 * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2567 * Returns an IPMP error code.
2570 send_groupinfo(int fd
, ipmp_groupinfo_t
*grinfop
)
2572 ipmp_iflist_t
*iflistp
= grinfop
->gr_iflistp
;
2573 ipmp_addrlist_t
*adlistp
= grinfop
->gr_adlistp
;
2574 unsigned int retval
;
2576 retval
= ipmp_writetlv(fd
, IPMP_GROUPINFO
, sizeof (*grinfop
), grinfop
);
2577 if (retval
!= IPMP_SUCCESS
)
2580 retval
= ipmp_writetlv(fd
, IPMP_IFLIST
,
2581 IPMP_IFLIST_SIZE(iflistp
->il_nif
), iflistp
);
2582 if (retval
!= IPMP_SUCCESS
)
2585 return (ipmp_writetlv(fd
, IPMP_ADDRLIST
,
2586 IPMP_ADDRLIST_SIZE(adlistp
->al_naddr
), adlistp
));
2590 * Send the interface information pointed to by `ifinfop' on file descriptor
2591 * `fd'. Returns an IPMP error code.
2594 send_ifinfo(int fd
, ipmp_ifinfo_t
*ifinfop
)
2596 ipmp_addrlist_t
*adlist4p
= ifinfop
->if_targinfo4
.it_targlistp
;
2597 ipmp_addrlist_t
*adlist6p
= ifinfop
->if_targinfo6
.it_targlistp
;
2598 unsigned int retval
;
2600 retval
= ipmp_writetlv(fd
, IPMP_IFINFO
, sizeof (*ifinfop
), ifinfop
);
2601 if (retval
!= IPMP_SUCCESS
)
2604 retval
= ipmp_writetlv(fd
, IPMP_ADDRLIST
,
2605 IPMP_ADDRLIST_SIZE(adlist4p
->al_naddr
), adlist4p
);
2606 if (retval
!= IPMP_SUCCESS
)
2609 return (ipmp_writetlv(fd
, IPMP_ADDRLIST
,
2610 IPMP_ADDRLIST_SIZE(adlist6p
->al_naddr
), adlist6p
));
2614 * Send the address information pointed to by `adinfop' on file descriptor
2615 * `fd'. Returns an IPMP error code.
2618 send_addrinfo(int fd
, ipmp_addrinfo_t
*adinfop
)
2620 return (ipmp_writetlv(fd
, IPMP_ADDRINFO
, sizeof (*adinfop
), adinfop
));
2624 * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2625 * Returns an IPMP error code.
2628 send_grouplist(int fd
, ipmp_grouplist_t
*grlistp
)
2630 return (ipmp_writetlv(fd
, IPMP_GROUPLIST
,
2631 IPMP_GROUPLIST_SIZE(grlistp
->gl_ngroup
), grlistp
));
2635 * Initialize an mi_result_t structure using `error' and `syserror' and
2636 * send it on file descriptor `fd'. Returns an IPMP error code.
2639 send_result(int fd
, unsigned int error
, int syserror
)
2643 me
.me_mpathd_error
= error
;
2644 if (error
== IPMP_FAILURE
)
2645 me
.me_sys_error
= syserror
;
2647 me
.me_sys_error
= 0;
2649 return (ipmp_write(fd
, &me
, sizeof (me
)));
2653 * Daemonize the process.
2664 * Lose our controlling terminal, and become both a session
2665 * leader and a process group leader.
2671 * Under POSIX, a session leader can accidentally (through
2672 * open(2)) acquire a controlling terminal if it does not
2673 * have one. Just to be safe, fork() again so we are not a
2683 (void) fdwalk(closefunc
, NULL
);
2687 _exit(EXIT_SUCCESS
);
2692 _exit(EXIT_SUCCESS
);
2699 * The parent has created some fds before forking on purpose, keep them open.
2702 closefunc(void *not_used
, int fd
)
2705 if (fd
!= lsock_v4
&& fd
!= lsock_v6
)
2715 * Logging routines. All routines log to syslog, unless the daemon is
2716 * running in the foreground, in which case the logging goes to stderr.
2718 * The following routines are available:
2720 * logdebug(): A printf-like function for outputting debug messages
2721 * (messages at LOG_DEBUG) that are only of use to developers.
2723 * logtrace(): A printf-like function for outputting tracing messages
2724 * (messages at LOG_INFO) from the daemon. This is typically used
2725 * to log the receipt of interesting network-related conditions.
2727 * logerr(): A printf-like function for outputting error messages
2728 * (messages at LOG_ERR) from the daemon.
2730 * logperror*(): A set of functions used to output error messages
2731 * (messages at LOG_ERR); these automatically append strerror(errno)
2732 * and a newline to the message passed to them.
2734 * NOTE: since the logging functions write to syslog, the messages passed
2735 * to them are not eligible for localization. Thus, gettext() must
2739 static int logging
= 0;
2745 openlog("in.mpathd", LOG_PID
, LOG_DAEMON
);
2750 logmsg(int pri
, const char *fmt
, ...)
2757 vsyslog(pri
, fmt
, ap
);
2759 (void) vfprintf(stderr
, fmt
, ap
);
2765 logperror(const char *str
)
2768 syslog(LOG_ERR
, "%s: %m\n", str
);
2770 (void) fprintf(stderr
, "%s: %s\n", str
, strerror(errno
));
2774 logperror_pii(struct phyint_instance
*pii
, const char *str
)
2777 syslog(LOG_ERR
, "%s (%s %s): %m\n",
2778 str
, AF_STR(pii
->pii_af
), pii
->pii_phyint
->pi_name
);
2780 (void) fprintf(stderr
, "%s (%s %s): %s\n",
2781 str
, AF_STR(pii
->pii_af
), pii
->pii_phyint
->pi_name
,
2787 logperror_li(struct logint
*li
, const char *str
)
2789 struct phyint_instance
*pii
= li
->li_phyint_inst
;
2792 syslog(LOG_ERR
, "%s (%s %s): %m\n",
2793 str
, AF_STR(pii
->pii_af
), li
->li_name
);
2795 (void) fprintf(stderr
, "%s (%s %s): %s\n",
2796 str
, AF_STR(pii
->pii_af
), li
->li_name
,
2802 close_probe_socket(struct phyint_instance
*pii
, boolean_t polled
)
2805 (void) poll_remove(pii
->pii_probe_sock
);
2806 (void) close(pii
->pii_probe_sock
);
2807 pii
->pii_probe_sock
= -1;
2808 pii
->pii_basetime_inited
= 0;
2812 addrlist_add(addrlist_t
**addrsp
, const char *name
, uint64_t flags
,
2813 struct sockaddr_storage
*ssp
)
2817 if ((addrp
= malloc(sizeof (addrlist_t
))) == NULL
)
2820 (void) strlcpy(addrp
->al_name
, name
, LIFNAMSIZ
);
2821 addrp
->al_flags
= flags
;
2822 addrp
->al_addr
= *ssp
;
2823 addrp
->al_next
= *addrsp
;
2829 addrlist_free(addrlist_t
**addrsp
)
2831 addrlist_t
*addrp
, *next_addrp
;
2833 for (addrp
= *addrsp
; addrp
!= NULL
; addrp
= next_addrp
) {
2834 next_addrp
= addrp
->al_next
;
2841 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
2842 * tables defined by mib2.h. Pass the table information returned to the
2843 * supplied function.
2846 mibwalk(void (*proc
)(mib_item_t
*))
2848 mib_item_t
*head_item
= NULL
;
2849 mib_item_t
*last_item
= NULL
;
2851 struct strbuf ctlbuf
, databuf
;
2854 uintptr_t buf
[512 / sizeof (uintptr_t)];
2855 struct T_optmgmt_req
*tor
= (struct T_optmgmt_req
*)buf
;
2856 struct T_optmgmt_ack
*toa
= (struct T_optmgmt_ack
*)buf
;
2857 struct T_error_ack
*tea
= (struct T_error_ack
*)buf
;
2858 struct opthdr
*req
, *optp
;
2862 if ((mibfd
= open("/dev/ip", O_RDWR
)) < 0) {
2863 logperror("mibwalk(): ip open");
2868 tor
->PRIM_type
= T_SVR4_OPTMGMT_REQ
;
2869 tor
->OPT_offset
= sizeof (struct T_optmgmt_req
);
2870 tor
->OPT_length
= sizeof (struct opthdr
);
2871 tor
->MGMT_flags
= T_CURRENT
;
2874 * Note: we use the special level value below so that IP will return
2875 * us information concerning IRE_MARK_TESTHIDDEN routes.
2877 req
= (struct opthdr
*)&tor
[1];
2878 req
->level
= EXPER_IP_AND_ALL_IRES
;
2882 ctlbuf
.buf
= (char *)&buf
;
2883 ctlbuf
.len
= tor
->OPT_length
+ tor
->OPT_offset
;
2885 if (putmsg(mibfd
, &ctlbuf
, NULL
, 0) == -1) {
2886 logperror("mibwalk(): putmsg(ctl)");
2891 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
2892 * each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains
2893 * a control and data part. The control part contains a struct
2894 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
2895 * the level, name and length of the data in the data part. The
2896 * data part contains the actual table data. The last message
2897 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
2898 * single option with zero optlen.
2902 ctlbuf
.maxlen
= sizeof (buf
);
2903 rval
= getmsg(mibfd
, &ctlbuf
, NULL
, &flags
);
2904 if (rval
& MORECTL
|| rval
< 0) {
2907 logerr("mibwalk(): getmsg(ctl) ret: %d err: %d\n",
2911 if (ctlbuf
.len
< sizeof (t_scalar_t
)) {
2912 logerr("mibwalk(): ctlbuf.len %d\n", ctlbuf
.len
);
2916 switch (toa
->PRIM_type
) {
2918 if (ctlbuf
.len
< sizeof (struct T_error_ack
)) {
2919 logerr("mibwalk(): T_ERROR_ACK ctlbuf "
2920 "too short: %d\n", ctlbuf
.len
);
2923 logerr("mibwalk(): T_ERROR_ACK: TLI_err = 0x%lx: %s\n"
2924 " UNIX_err = 0x%lx\n", tea
->TLI_error
,
2925 t_strerror(tea
->TLI_error
), tea
->UNIX_error
);
2929 optp
= (struct opthdr
*)&toa
[1];
2930 if (ctlbuf
.len
< (sizeof (struct T_optmgmt_ack
) +
2931 sizeof (struct opthdr
))) {
2932 logerr("mibwalk(): T_OPTMGMT_ACK ctlbuf too "
2933 "short: %d\n", ctlbuf
.len
);
2936 if (toa
->MGMT_flags
!= T_SUCCESS
) {
2937 logerr("mibwalk(): MGMT_flags != T_SUCCESS: "
2938 "0x%lx\n", toa
->MGMT_flags
);
2946 /* The following assert also implies MGMT_flags == T_SUCCESS */
2947 assert(toa
->PRIM_type
== T_OPTMGMT_ACK
);
2950 * We have reached the end of this T_OPTMGMT_ACK
2951 * message. If this is the last message i.e EOD,
2952 * break, else process the next T_OPTMGMT_ACK msg.
2955 if (optp
->len
== 0 && optp
->name
== 0 &&
2957 /* This is the EOD message. */
2960 /* Not EOD but no data to retrieve */
2965 * We should only be here if MOREDATA was set.
2966 * Allocate an empty mib_item_t and link into the list
2969 if ((tmp
= malloc(sizeof (*tmp
))) == NULL
) {
2970 logperror("mibwalk(): malloc() failed.");
2973 if (last_item
!= NULL
)
2974 last_item
->mi_next
= tmp
;
2978 last_item
->mi_next
= NULL
;
2979 last_item
->mi_opthdr
= *optp
;
2980 last_item
->mi_valp
= malloc(optp
->len
);
2981 if (last_item
->mi_valp
== NULL
) {
2982 logperror("mibwalk(): malloc() failed.");
2986 databuf
.maxlen
= last_item
->mi_opthdr
.len
;
2987 databuf
.buf
= (char *)last_item
->mi_valp
;
2990 /* Retrieve the actual MIB data */
2993 if ((rval
= getmsg(mibfd
, NULL
, &databuf
,
2995 if (rval
< 0 && errno
== EINTR
)
2998 * We shouldn't get MOREDATA here so treat that
3001 logperror("mibwalk(): getmsg(data)");
3008 /* Pass the accumulated MIB data to the supplied function pointer */
3011 while (head_item
!= NULL
) {
3013 head_item
= tmp
->mi_next
;
3021 * Parse the supplied mib2 information to get the size of routing table
3022 * entries. This is needed when running in a branded zone where the
3023 * Solaris application environment and the Solaris kernel may not be the
3024 * same release version.
3027 mib_get_constants(mib_item_t
*item
)
3030 mib2_ipv6IfStatsEntry_t
*ipv6
;
3032 for (; item
!= NULL
; item
= item
->mi_next
) {
3033 if (item
->mi_opthdr
.name
!= 0)
3035 if (item
->mi_opthdr
.level
== MIB2_IP
) {
3036 ipv4
= (mib2_ip_t
*)item
->mi_valp
;
3037 ipRouteEntrySize
= ipv4
->ipRouteEntrySize
;
3038 } else if (item
->mi_opthdr
.level
== MIB2_IP6
) {
3039 ipv6
= (mib2_ipv6IfStatsEntry_t
*)item
->mi_valp
;
3040 ipv6RouteEntrySize
= ipv6
->ipv6RouteEntrySize
;