2 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
3 * Use is subject to license terms.
7 * Copyright (c) 1987 Regents of the University of California.
10 * Redistribution and use in source and binary forms are permitted
11 * provided that the above copyright notice and this paragraph are
12 * duplicated in all such forms and that any documentation,
13 * advertising materials, and other materials related to such
14 * distribution and use acknowledge that the software was developed
15 * by the University of California, Berkeley. The name of the
16 * University may not be used to endorse or promote products derived
17 * from this software without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
20 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
24 #include "mpd_tables.h"
/*
 * Probe types accepted by probe().  The value is carried in the
 * pr_icmp_mtype field of each probe packet.
 */
#define	PROBE_UNI	0x1234		/* Unicast probe packet */
#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
#define	PROBE_RTT	0x9abc		/* RTT only probe packet */

/* Number of milliseconds in a minute */
#define	MSEC_PERMIN	(60 * MILLISEC)
/*
 * Format of probe / probe response packets.  This is an ICMP Echo request
 * or ICMP Echo reply.  Packet format is same for both IPv4 and IPv6.
 *
 * probe() fills the multi-byte members in network byte order (htons,
 * htonll, htonl), and the receive paths convert them back before use.
 * The member order is the wire layout and must not be changed.
 */
struct pr_icmp {
	uint8_t		pr_icmp_type;		/* type field */
	uint8_t		pr_icmp_code;		/* code field */
	uint16_t	pr_icmp_cksum;		/* checksum field */
	uint16_t	pr_icmp_id;		/* Identification */
	uint16_t	pr_icmp_seq;		/* sequence number */
	uint64_t	pr_icmp_timestamp;	/* Time stamp (in ns) */
	uint32_t	pr_icmp_mtype;		/* Message type */
};
/*
 * IPv6 all-nodes link-local multicast address, ff02::1.  probe() uses it
 * as the destination of PROBE_MULTI probes on IPv6 instances.  All 16
 * bytes are spelled out so the trailing 0x1 lands in the last byte.
 */
static struct in6_addr all_nodes_mcast_v6 = { {
	0xff, 0x2, 0x0, 0x0,
	0x0, 0x0, 0x0, 0x0,
	0x0, 0x0, 0x0, 0x0,
	0x0, 0x0, 0x0, 0x1
} };
55 static struct in_addr all_nodes_mcast_v4
= { { { 0xe0, 0x0, 0x0, 0x1 } } };
57 static hrtime_t last_fdt_bumpup_time
; /* When FDT was bumped up last */
59 static void *find_ancillary(struct msghdr
*msg
, int cmsg_level
,
61 static void pi_set_crtt(struct target
*tg
, int64_t m
,
62 boolean_t is_probe_uni
);
63 static void incoming_echo_reply(struct phyint_instance
*pii
,
64 struct pr_icmp
*reply
, struct in6_addr fromaddr
, struct timeval
*recv_tvp
);
65 static void incoming_rtt_reply(struct phyint_instance
*pii
,
66 struct pr_icmp
*reply
, struct in6_addr fromaddr
);
67 static void incoming_mcast_reply(struct phyint_instance
*pii
,
68 struct pr_icmp
*reply
, struct in6_addr fromaddr
);
70 static boolean_t
check_pg_crtt_improved(struct phyint_group
*pg
);
71 static boolean_t
check_pii_crtt_improved(struct phyint_instance
*pii
);
72 static boolean_t
check_exception_target(struct phyint_instance
*pii
,
73 struct target
*target
);
74 static void probe_fail_info(struct phyint_instance
*pii
,
75 struct target
*cur_tg
, struct probe_fail_count
*pfinfo
);
76 static void probe_success_info(struct phyint_instance
*pii
,
77 struct target
*cur_tg
, struct probe_success_count
*psinfo
);
78 static boolean_t
phyint_repaired(struct phyint
*pi
);
80 static boolean_t
highest_ack_tg(uint16_t seq
, struct target
*tg
);
81 static int in_cksum(ushort_t
*addr
, int len
);
82 static void reset_snxt_basetimes(void);
83 static int ns2ms(int64_t ns
);
84 static int64_t tv2ns(struct timeval
*);
87 * CRTT - Conservative Round Trip Time Estimate
88 * Probe success - A matching probe reply received before CRTT ms has elapsed
89 * after sending the probe.
90 * Probe failure - No probe reply received and more than CRTT ms has elapsed
91 * after sending the probe.
93 * TLS - Time last success. Most recent probe ack received at this time.
94 * TFF - Time first fail. The time of the earliest probe failure in
95 * a consecutive series of probe failures.
96 * NUM_PROBE_REPAIRS - Number of consecutive successful probes required
97 * before declaring phyint repair.
98 * NUM_PROBE_FAILS - Number of consecutive probe failures required to
99 * declare a phyint failure.
101 * Phyint state diagram
103 * The state of a phyint that is capable of being probed, is completely
104 * specified by the 3-tuple <pi_state, pg_state, I>.
106 * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
107 * IFF_OFFLINE is set. If the phyint is also configured with a test address
108 * (the common case) and probe targets, then a phyint must also successfully
109 * be able to send and receive probes in order to remain in the PI_RUNNING
110 * state (otherwise, it transitions to PI_FAILED).
112 * Further, if a PI_RUNNING phyint is configured with a test address but is
113 * unable to find any probe targets, it will transition to the PI_NOTARGETS
114 * state, which indicates that the link is apparently functional but that
115 * in.mpathd is unable to send probes to verify functionality (in this case,
116 * in.mpathd makes the optimistic assumption that the interface is working
117 * correctly and thus does not mark the interface FAILED, but reports it as
118 * IPMP_IF_UNKNOWN through the async events and query interfaces).
120 * At any point, a phyint may be administratively marked offline via if_mpadm.
121 * In this case, the interface always transitions to PI_OFFLINE, regardless
122 * of its previous state. When the interface is later brought back online,
123 * in.mpathd acts as if the interface is new (and thus it transitions to
124 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
125 * its probes, if probes are sent).
127 * pi_state - PI_RUNNING or PI_FAILED
128 * PI_RUNNING: The failure detection logic says the phyint is good.
129 * PI_FAILED: The failure detection logic says the phyint has failed.
131 * pg_state - PG_OK, PG_DEGRADED, or PG_FAILED.
132 * PG_OK: All interfaces in the group are OK.
133 * PG_DEGRADED: Some interfaces in the group are unusable.
134 * PG_FAILED: All interfaces in the group are unusable.
136 * In the case of router targets, we assume that the current list of
137 * targets obtained from the routing table, is still valid, so the
138 * phyint state is PI_FAILED. In the case of host targets, we delete the
139 * list of targets, and multicast to the all hosts, to reconstruct the
140 * target list. So the phyints are in the PI_NOTARGETS state.
142 * I - value of (pi_flags & IFF_INACTIVE)
143 * IFF_INACTIVE: This phyint will not send or receive packets.
144 * Usually, inactive is tied to standby interfaces that are not yet
145 * needed (e.g., no non-standby interfaces in the group have failed).
146 * When failback has been disabled (FAILBACK=no configured), phyint can
147 * also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
148 * subsequently recovers after a failure.
150 * Not all 9 possible combinations of the above 3-tuple are possible.
152 * I is tracked by IP. pi_state is tracked by mpathd.
154 * pi_state state machine
155 * ---------------------------------------------------------------------------
156 * Event State New State
158 * ---------------------------------------------------------------------------
159 * IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
160 * detection : set IFF_FAILED on this phyint
162 * IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
163 * detection : set IFF_FAILED on this phyint
165 * IP interface repair (PI_FAILED, I == 0, FAILBACK=yes)
166 * detection -> (PI_RUNNING, I == 0)
167 * : clear IFF_FAILED on this phyint
169 * IP interface repair (PI_FAILED, I == 0, FAILBACK=no)
170 * detection -> (PI_RUNNING, I == 1)
171 * : clear IFF_FAILED on this phyint
172 * : if failback is disabled set I == 1
174 * Group failure (perform on all phyints in the group)
175 * detection PI_RUNNING PI_FAILED
176 * (Router targets) : set IFF_FAILED
178 * Group failure (perform on all phyints in the group)
179 * detection PI_RUNNING PI_NOTARGETS
180 * (Host targets) : set IFF_FAILED
181 * : delete the target list on all phyints
182 * ---------------------------------------------------------------------------
/*
 * File-scope object with external linkage (not static).
 * NOTE(review): struct probes_missed is declared outside this chunk,
 * presumably in mpd_tables.h; its users are not visible here -- confirm
 * what it tracks before relying on this comment.
 */
struct probes_missed probes_missed;
188 * Compose and transmit an ICMP ECHO REQUEST packet. The IP header
189 * will be added on by the kernel. The id field identifies this phyint.
190 * and the sequence number is an increasing (modulo 2^^16) integer. The data
191 * portion holds the time value when the packet is sent. On echo this is
192 * extracted to compute the round-trip time. Three different types of
193 * probe packets are used.
195 * PROBE_UNI: This type is used to do failure detection / failure recovery
196 * and RTT calculation. PROBE_UNI probes are spaced apart in time,
197 * not less than the current CRTT. pii_probes[] stores data
198 * about these probes. These packets consume sequence number space.
200 * PROBE_RTT: This type is used to make only rtt measurements. Normally these
201 * are not used. Under heavy network load, the rtt may go up very high,
202 * due to a spike, or may appear to go high, due to extreme scheduling
203 * delays. Once the network stress is removed, mpathd takes long time to
204 * recover, because the probe_interval is already high, and it takes
205 * a long time to send out sufficient number of probes to bring down the
206 * rtt. To avoid this problem, PROBE_RTT probes are sent out every
207 * user_probe_interval ms. and will cause only rtt updates. These packets
208 * do not consume sequence number space nor is information about these
209 * packets stored in the pii_probes[]
211 * PROBE_MULTI: This type is only used to construct a list of targets, when
212 * no targets are known. The packet is multicast to the all hosts addr.
215 probe(struct phyint_instance
*pii
, uint_t probe_type
, hrtime_t start_hrtime
)
217 hrtime_t sent_hrtime
;
218 struct timeval sent_tv
;
219 struct pr_icmp probe_pkt
; /* Probe packet */
220 struct sockaddr_storage targ
; /* target address */
221 uint_t targaddrlen
; /* targed address length */
222 int pr_ndx
; /* probe index in pii->pii_probes[] */
223 boolean_t sent
= _B_FALSE
;
226 if (debug
& D_TARGET
) {
227 logdebug("probe(%s %s %d %lld)\n", AF_STR(pii
->pii_af
),
228 pii
->pii_name
, probe_type
, start_hrtime
);
231 assert(pii
->pii_probe_sock
!= -1);
232 assert(probe_type
== PROBE_UNI
|| probe_type
== PROBE_MULTI
||
233 probe_type
== PROBE_RTT
);
235 probe_pkt
.pr_icmp_type
= (pii
->pii_af
== AF_INET
) ?
236 ICMP_ECHO_REQUEST
: ICMP6_ECHO_REQUEST
;
237 probe_pkt
.pr_icmp_code
= 0;
238 probe_pkt
.pr_icmp_cksum
= 0;
239 probe_pkt
.pr_icmp_seq
= htons(pii
->pii_snxt
);
242 * Since there is no need to do arithmetic on the icmpid,
243 * (only equality check is done) pii_icmpid is stored in
244 * network byte order at initialization itself.
246 probe_pkt
.pr_icmp_id
= pii
->pii_icmpid
;
247 probe_pkt
.pr_icmp_timestamp
= htonll(start_hrtime
);
248 probe_pkt
.pr_icmp_mtype
= htonl(probe_type
);
251 * If probe_type is PROBE_MULTI, this packet will be multicast to
252 * the all hosts address. Otherwise it is unicast to the next target.
254 assert(probe_type
== PROBE_MULTI
|| ((pii
->pii_target_next
!= NULL
) &&
255 pii
->pii_rtt_target_next
!= NULL
));
257 bzero(&targ
, sizeof (targ
));
258 targ
.ss_family
= pii
->pii_af
;
260 if (pii
->pii_af
== AF_INET6
) {
261 struct in6_addr
*addr6
;
263 addr6
= &((struct sockaddr_in6
*)&targ
)->sin6_addr
;
264 targaddrlen
= sizeof (struct sockaddr_in6
);
265 if (probe_type
== PROBE_MULTI
) {
266 *addr6
= all_nodes_mcast_v6
;
267 } else if (probe_type
== PROBE_UNI
) {
268 *addr6
= pii
->pii_target_next
->tg_address
;
269 } else { /* type is PROBE_RTT */
270 *addr6
= pii
->pii_rtt_target_next
->tg_address
;
273 struct in_addr
*addr4
;
275 addr4
= &((struct sockaddr_in
*)&targ
)->sin_addr
;
276 targaddrlen
= sizeof (struct sockaddr_in
);
277 if (probe_type
== PROBE_MULTI
) {
278 *addr4
= all_nodes_mcast_v4
;
279 } else if (probe_type
== PROBE_UNI
) {
280 IN6_V4MAPPED_TO_INADDR(
281 &pii
->pii_target_next
->tg_address
, addr4
);
282 } else { /* type is PROBE_RTT */
283 IN6_V4MAPPED_TO_INADDR(
284 &pii
->pii_rtt_target_next
->tg_address
, addr4
);
288 * Compute the IPv4 icmp checksum. Does not cover the IP header.
290 probe_pkt
.pr_icmp_cksum
=
291 in_cksum((ushort_t
*)&probe_pkt
, (int)sizeof (probe_pkt
));
295 * Use the current time as the time we sent. Not atomic, but the best
296 * we can do from here.
298 sent_hrtime
= gethrtime();
299 (void) gettimeofday(&sent_tv
, NULL
);
300 rval
= sendto(pii
->pii_probe_sock
, &probe_pkt
, sizeof (probe_pkt
), 0,
301 (struct sockaddr
*)&targ
, targaddrlen
);
303 * If the send would block, this may either be transient or a hang in a
304 * lower layer. We pretend the probe was actually sent, the daemon will
305 * not see a reply to the probe and will fail the interface if normal
306 * failure detection criteria are met.
308 if (rval
== sizeof (probe_pkt
) ||
309 (rval
== -1 && errno
== EWOULDBLOCK
)) {
312 logperror_pii(pii
, "probe: probe sendto");
316 * If this is a PROBE_UNI probe packet being unicast to a target, then
317 * update our tables. We will need this info in processing the probe
318 * response. PROBE_MULTI and PROBE_RTT packets are not used for
319 * the purpose of failure or recovery detection. PROBE_MULTI packets
320 * are only used to construct a list of targets. PROBE_RTT packets are
321 * used only for updating the rtt and not for failure detection.
323 if (probe_type
== PROBE_UNI
&& sent
) {
324 pr_ndx
= pii
->pii_probe_next
;
325 assert(pr_ndx
>= 0 && pr_ndx
< PROBE_STATS_COUNT
);
327 /* Collect statistics, before we reuse the last slot. */
328 if (pii
->pii_probes
[pr_ndx
].pr_status
== PR_LOST
)
329 pii
->pii_cum_stats
.lost
++;
330 else if (pii
->pii_probes
[pr_ndx
].pr_status
== PR_ACKED
)
331 pii
->pii_cum_stats
.acked
++;
332 pii
->pii_cum_stats
.sent
++;
334 pii
->pii_probes
[pr_ndx
].pr_id
= pii
->pii_snxt
;
335 pii
->pii_probes
[pr_ndx
].pr_tv_sent
= sent_tv
;
336 pii
->pii_probes
[pr_ndx
].pr_hrtime_sent
= sent_hrtime
;
337 pii
->pii_probes
[pr_ndx
].pr_hrtime_start
= start_hrtime
;
338 pii
->pii_probes
[pr_ndx
].pr_target
= pii
->pii_target_next
;
339 probe_chstate(&pii
->pii_probes
[pr_ndx
], pii
, PR_UNACKED
);
341 pii
->pii_probe_next
= PROBE_INDEX_NEXT(pii
->pii_probe_next
);
342 pii
->pii_target_next
= target_next(pii
->pii_target_next
);
343 assert(pii
->pii_target_next
!= NULL
);
345 * If we have a single variable to denote the next target to
346 * probe for both rtt probes and failure detection probes, we
347 * could end up with a situation where the failure detection
348 * probe targets become disjoint from the rtt probe targets.
349 * Eg. if 2 targets and the actual fdt is double the user
350 * specified fdt. So we have 2 variables. In this scheme
351 * we also reset pii_rtt_target_next for every fdt probe,
352 * though that may not be necessary.
354 pii
->pii_rtt_target_next
= pii
->pii_target_next
;
356 } else if (probe_type
== PROBE_RTT
) {
357 pii
->pii_rtt_target_next
=
358 target_next(pii
->pii_rtt_target_next
);
359 assert(pii
->pii_rtt_target_next
!= NULL
);
364 * Incoming IPv4 data from wire, is received here. Called from main.
367 in_data(struct phyint_instance
*pii
)
369 struct sockaddr_in from
;
370 struct in6_addr fromaddr
;
371 static uint64_t in_packet
[(IP_MAXPACKET
+ 1)/8];
372 static uint64_t ancillary_data
[(IP_MAXPACKET
+ 1)/8];
376 char abuf
[INET_ADDRSTRLEN
];
379 struct pr_icmp
*reply
;
380 struct timeval
*recv_tvp
;
382 if (debug
& D_PROBE
) {
383 logdebug("in_data(%s %s)\n",
384 AF_STR(pii
->pii_af
), pii
->pii_name
);
387 iov
.iov_base
= (char *)in_packet
;
388 iov
.iov_len
= sizeof (in_packet
);
391 msg
.msg_name
= (struct sockaddr
*)&from
;
392 msg
.msg_namelen
= sizeof (from
);
393 msg
.msg_control
= ancillary_data
;
394 msg
.msg_controllen
= sizeof (ancillary_data
);
397 * Poll has already told us that a message is waiting,
398 * on this socket. Read it now. We should not block.
400 if ((len
= recvmsg(pii
->pii_probe_sock
, &msg
, 0)) < 0) {
401 logperror_pii(pii
, "in_data: recvmsg");
406 * If the datalink has indicated the link is down, don't go
409 if (LINK_DOWN(pii
->pii_phyint
))
412 /* Get the printable address for error reporting */
413 (void) inet_ntop(AF_INET
, &from
.sin_addr
, abuf
, sizeof (abuf
));
415 /* Ignore packets > 64k or control buffers that don't fit */
416 if (msg
.msg_flags
& (MSG_TRUNC
|MSG_CTRUNC
)) {
417 if (debug
& D_PKTBAD
) {
418 logdebug("Truncated message: msg_flags 0x%x from %s\n",
419 msg
.msg_flags
, abuf
);
424 /* Make sure packet contains at least minimum ICMP header */
425 ip
= (struct ip
*)in_packet
;
426 iphlen
= ip
->ip_hl
<< 2;
427 if (len
< iphlen
+ ICMP_MINLEN
) {
428 if (debug
& D_PKTBAD
) {
429 logdebug("in_data: packet too short (%d bytes)"
430 " from %s\n", len
, abuf
);
436 * Subtract the IP hdr length, 'len' will be length of the probe
437 * reply, starting from the icmp hdr.
441 reply
= (struct pr_icmp
*)((char *)in_packet
+ iphlen
);
443 /* Probe replies are icmp echo replies. Ignore anything else */
444 if (reply
->pr_icmp_type
!= ICMP_ECHO_REPLY
)
448 * The icmp id should match what we sent, which is stored
449 * in pi_icmpid. The icmp code for reply must be 0.
450 * The reply content must be a struct pr_icmp
452 if (reply
->pr_icmp_id
!= pii
->pii_icmpid
) {
453 /* Not in response to our probe */
457 if (reply
->pr_icmp_code
!= 0) {
458 logtrace("probe reply code %d from %s on %s\n",
459 reply
->pr_icmp_code
, abuf
, pii
->pii_name
);
463 if (len
< sizeof (struct pr_icmp
)) {
464 logtrace("probe reply too short: %d bytes from %s on %s\n",
465 len
, abuf
, pii
->pii_name
);
469 recv_tvp
= find_ancillary(&msg
, SOL_SOCKET
, SCM_TIMESTAMP
);
470 if (recv_tvp
== NULL
) {
471 logtrace("message without timestamp from %s on %s\n",
472 abuf
, pii
->pii_name
);
476 IN6_INADDR_TO_V4MAPPED(&from
.sin_addr
, &fromaddr
);
477 if (reply
->pr_icmp_mtype
== htonl(PROBE_UNI
))
478 /* Unicast probe reply */
479 incoming_echo_reply(pii
, reply
, fromaddr
, recv_tvp
);
480 else if (reply
->pr_icmp_mtype
== htonl(PROBE_MULTI
)) {
481 /* Multicast reply */
482 incoming_mcast_reply(pii
, reply
, fromaddr
);
483 } else if (reply
->pr_icmp_mtype
== htonl(PROBE_RTT
)) {
484 incoming_rtt_reply(pii
, reply
, fromaddr
);
486 /* Probably not in response to our probe */
487 logtrace("probe reply type: %d from %s on %s\n",
488 reply
->pr_icmp_mtype
, abuf
, pii
->pii_name
);
494 * Incoming IPv6 data from wire is received here. Called from main.
497 in6_data(struct phyint_instance
*pii
)
499 struct sockaddr_in6 from
;
500 static uint64_t in_packet
[(IP_MAXPACKET
+ 1)/8];
501 static uint64_t ancillary_data
[(IP_MAXPACKET
+ 1)/8];
503 char abuf
[INET6_ADDRSTRLEN
];
507 struct pr_icmp
*reply
;
508 struct timeval
*recv_tvp
;
510 if (debug
& D_PROBE
) {
511 logdebug("in6_data(%s %s)\n",
512 AF_STR(pii
->pii_af
), pii
->pii_name
);
515 iov
.iov_base
= (char *)in_packet
;
516 iov
.iov_len
= sizeof (in_packet
);
519 msg
.msg_name
= (struct sockaddr
*)&from
;
520 msg
.msg_namelen
= sizeof (from
);
521 msg
.msg_control
= ancillary_data
;
522 msg
.msg_controllen
= sizeof (ancillary_data
);
524 if ((len
= recvmsg(pii
->pii_probe_sock
, &msg
, 0)) < 0) {
525 logperror_pii(pii
, "in6_data: recvmsg");
530 * If the datalink has indicated that the link is down, don't go
533 if (LINK_DOWN(pii
->pii_phyint
))
536 /* Get the printable address for error reporting */
537 (void) inet_ntop(AF_INET6
, &from
.sin6_addr
, abuf
, sizeof (abuf
));
538 if (len
< ICMP_MINLEN
) {
539 if (debug
& D_PKTBAD
) {
540 logdebug("Truncated message: msg_flags 0x%x from %s\n",
541 msg
.msg_flags
, abuf
);
545 /* Ignore packets > 64k or control buffers that don't fit */
546 if (msg
.msg_flags
& (MSG_TRUNC
|MSG_CTRUNC
)) {
547 if (debug
& D_PKTBAD
) {
548 logdebug("Truncated message: msg_flags 0x%x from %s\n",
549 msg
.msg_flags
, abuf
);
554 reply
= (struct pr_icmp
*)in_packet
;
555 if (reply
->pr_icmp_type
!= ICMP6_ECHO_REPLY
)
558 if (reply
->pr_icmp_id
!= pii
->pii_icmpid
) {
559 /* Not in response to our probe */
564 * The kernel has already verified the the ICMP checksum.
566 if (!IN6_IS_ADDR_LINKLOCAL(&from
.sin6_addr
)) {
567 logtrace("ICMPv6 echo reply source address not linklocal from "
568 "%s on %s\n", abuf
, pii
->pii_name
);
571 opt
= find_ancillary(&msg
, IPPROTO_IPV6
, IPV6_RTHDR
);
573 /* Can't allow routing headers in probe replies */
574 logtrace("message with routing header from %s on %s\n",
575 abuf
, pii
->pii_name
);
579 if (reply
->pr_icmp_code
!= 0) {
580 logtrace("probe reply code: %d from %s on %s\n",
581 reply
->pr_icmp_code
, abuf
, pii
->pii_name
);
584 if (len
< (sizeof (struct pr_icmp
))) {
585 logtrace("probe reply too short: %d bytes from %s on %s\n",
586 len
, abuf
, pii
->pii_name
);
590 recv_tvp
= find_ancillary(&msg
, SOL_SOCKET
, SCM_TIMESTAMP
);
591 if (recv_tvp
== NULL
) {
592 logtrace("message without timestamp from %s on %s\n",
593 abuf
, pii
->pii_name
);
597 if (reply
->pr_icmp_mtype
== htonl(PROBE_UNI
)) {
598 incoming_echo_reply(pii
, reply
, from
.sin6_addr
, recv_tvp
);
599 } else if (reply
->pr_icmp_mtype
== htonl(PROBE_MULTI
)) {
600 incoming_mcast_reply(pii
, reply
, from
.sin6_addr
);
601 } else if (reply
->pr_icmp_mtype
== htonl(PROBE_RTT
)) {
602 incoming_rtt_reply(pii
, reply
, from
.sin6_addr
);
604 /* Probably not in response to our probe */
605 logtrace("probe reply type: %d from %s on %s\n",
606 reply
->pr_icmp_mtype
, abuf
, pii
->pii_name
);
611 * Process the incoming rtt reply, in response to our rtt probe.
612 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
613 * have any stored information about the probe we sent. So we don't log
614 * any errors if we receive bad replies.
617 incoming_rtt_reply(struct phyint_instance
*pii
, struct pr_icmp
*reply
,
618 struct in6_addr fromaddr
)
620 int64_t m
; /* rtt measurement in ns */
621 char abuf
[INET6_ADDRSTRLEN
];
622 struct target
*target
;
623 struct phyint_group
*pg
;
625 /* Get the printable address for error reporting */
626 (void) pr_addr(pii
->pii_af
, fromaddr
, abuf
, sizeof (abuf
));
628 if (debug
& D_PROBE
) {
629 logdebug("incoming_rtt_reply: %s %s %s\n",
630 AF_STR(pii
->pii_af
), pii
->pii_name
, abuf
);
633 /* Do we know this target ? */
634 target
= target_lookup(pii
, fromaddr
);
638 m
= (int64_t)(gethrtime() - ntohll(reply
->pr_icmp_timestamp
));
639 /* Invalid rtt. It has wrapped around */
644 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
645 * The initial few responses after the interface is repaired may
646 * contain high rtt's because they could have been queued up waiting
647 * for ARP/NDP resolution on a failed interface.
649 pg
= pii
->pii_phyint
->pi_group
;
650 if ((pii
->pii_state
!= PI_RUNNING
) || GROUP_FAILED(pg
))
654 * Update rtt only if the new rtt is lower than the current rtt.
655 * (specified by the 3rd parameter to pi_set_crtt).
656 * If a spike has caused the current probe_interval to be >
657 * user_probe_interval, then this mechanism is used to bring down
658 * the rtt rapidly once the network stress is removed.
659 * If the new rtt is higher than the current rtt, we don't want to
660 * update the rtt. We are having more than 1 outstanding probe and
661 * the increase in rtt we are seeing is being unnecessarily weighted
662 * many times. The regular rtt update will be handled by
663 * incoming_echo_reply() and will take care of any rtt increase.
665 pi_set_crtt(target
, m
, _B_FALSE
);
666 if ((target
->tg_crtt
< (pg
->pg_probeint
/ LOWER_FDT_TRIGGER
)) &&
667 (user_failure_detection_time
< pg
->pg_fdt
) &&
668 (last_fdt_bumpup_time
+ MIN_SETTLING_TIME
< gethrtime())) {
670 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
671 * investigate if we can improve the failure detection time to
672 * meet whatever the user specified.
674 if (check_pg_crtt_improved(pg
)) {
675 pg
->pg_fdt
= MAX(pg
->pg_fdt
/ NEXT_FDT_MULTIPLE
,
676 user_failure_detection_time
);
677 pg
->pg_probeint
= pg
->pg_fdt
/ (NUM_PROBE_FAILS
+ 2);
678 if (pii
->pii_phyint
->pi_group
!= phyint_anongroup
) {
679 logerr("Improved failure detection time %d ms "
680 "on (%s %s) for group \"%s\"\n",
681 pg
->pg_fdt
, AF_STR(pii
->pii_af
),
683 pii
->pii_phyint
->pi_group
->pg_name
);
685 if (user_failure_detection_time
== pg
->pg_fdt
) {
686 /* Avoid any truncation or rounding errors */
687 pg
->pg_probeint
= user_probe_interval
;
689 * No more rtt probes will be sent. The actual
690 * fdt has dropped to the user specified value.
691 * pii_fd_snxt_basetime and pii_snxt_basetime
692 * will be in sync henceforth.
694 reset_snxt_basetimes();
701 * Process the incoming echo reply, in response to our unicast probe.
702 * Common for both IPv4 and IPv6
705 incoming_echo_reply(struct phyint_instance
*pii
, struct pr_icmp
*reply
,
706 struct in6_addr fromaddr
, struct timeval
*recv_tvp
)
708 int64_t m
; /* rtt measurement in ns */
709 hrtime_t cur_hrtime
; /* in ns from some arbitrary point */
710 char abuf
[INET6_ADDRSTRLEN
];
712 struct target
*target
;
714 uint64_t pr_icmp_timestamp
;
715 uint16_t pr_icmp_seq
;
716 struct probe_stats
*pr_statp
;
717 struct phyint_group
*pg
= pii
->pii_phyint
->pi_group
;
719 /* Get the printable address for error reporting */
720 (void) pr_addr(pii
->pii_af
, fromaddr
, abuf
, sizeof (abuf
));
722 if (debug
& D_PROBE
) {
723 logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
724 AF_STR(pii
->pii_af
), pii
->pii_name
, abuf
,
725 ntohs(reply
->pr_icmp_seq
), tv2ns(recv_tvp
));
728 pr_icmp_timestamp
= ntohll(reply
->pr_icmp_timestamp
);
729 pr_icmp_seq
= ntohs(reply
->pr_icmp_seq
);
731 /* Reject out of window probe replies */
732 if (SEQ_GE(pr_icmp_seq
, pii
->pii_snxt
) ||
733 SEQ_LT(pr_icmp_seq
, pii
->pii_snxt
- PROBE_STATS_COUNT
)) {
734 logtrace("out of window probe seq %u snxt %u on %s from %s\n",
735 pr_icmp_seq
, pii
->pii_snxt
, pii
->pii_name
, abuf
);
736 pii
->pii_cum_stats
.unknown
++;
740 cur_hrtime
= gethrtime();
741 m
= (int64_t)(cur_hrtime
- pr_icmp_timestamp
);
744 * This is a ridiculously high value of rtt. rtt has wrapped
745 * around. Log a message, and ignore the rtt.
747 logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
748 "reply timestamp %lld\n", cur_hrtime
, pr_icmp_timestamp
);
752 * Get the probe index pr_ndx corresponding to the received icmp seq.
753 * number in our pii->pii_probes[] array. The icmp sequence number
754 * pii_snxt corresponds to the probe index pii->pii_probe_next
756 pr_ndx
= MOD_SUB(pii
->pii_probe_next
,
757 (uint16_t)(pii
->pii_snxt
- pr_icmp_seq
), PROBE_STATS_COUNT
);
759 assert(PR_STATUS_VALID(pii
->pii_probes
[pr_ndx
].pr_status
));
761 target
= pii
->pii_probes
[pr_ndx
].pr_target
;
764 * Perform sanity checks, whether this probe reply that we
765 * have received is genuine
767 if (target
!= NULL
) {
769 * Compare the src. addr of the received ICMP or ICMPv6
770 * probe reply with the target address in our tables.
772 if (!IN6_ARE_ADDR_EQUAL(&target
->tg_address
, &fromaddr
)) {
774 * We don't have any record of having sent a probe to
775 * this target. This is a fake probe reply. Log an error
777 logtrace("probe status %d Fake probe reply seq %u "
778 "snxt %u on %s from %s\n",
779 pii
->pii_probes
[pr_ndx
].pr_status
,
780 pr_icmp_seq
, pii
->pii_snxt
, pii
->pii_name
, abuf
);
781 pii
->pii_cum_stats
.unknown
++;
783 } else if (pii
->pii_probes
[pr_ndx
].pr_status
== PR_ACKED
) {
785 * The address matches, but our tables indicate that
786 * this probe reply has been acked already. So this
787 * is a duplicate probe reply. Log an error
789 logtrace("probe status %d Duplicate probe reply seq %u "
790 "snxt %u on %s from %s\n",
791 pii
->pii_probes
[pr_ndx
].pr_status
,
792 pr_icmp_seq
, pii
->pii_snxt
, pii
->pii_name
, abuf
);
793 pii
->pii_cum_stats
.unknown
++;
798 * Target must not be NULL in the PR_UNACKED state
800 assert(pii
->pii_probes
[pr_ndx
].pr_status
!= PR_UNACKED
);
801 if (pii
->pii_probes
[pr_ndx
].pr_status
== PR_UNUSED
) {
803 * The probe stats slot is unused. So we didn't
804 * send out any probe to this target. This is a fake.
807 logtrace("probe status %d Fake probe reply seq %u "
808 "snxt %u on %s from %s\n",
809 pii
->pii_probes
[pr_ndx
].pr_status
,
810 pr_icmp_seq
, pii
->pii_snxt
, pii
->pii_name
, abuf
);
812 pii
->pii_cum_stats
.unknown
++;
817 * If the rtt does not appear to be right, don't update the
818 * rtt stats. This can happen if the system dropped into the
819 * debugger, or the system was hung or too busy for a
820 * substantial time that we didn't get a chance to run.
822 if ((m
< 0) || (ns2ms(m
) > PROBE_STATS_COUNT
* pg
->pg_probeint
)) {
824 * If the probe corresponding to this received response
825 * was truly sent 'm' ns. ago, then this response must
826 * have been rejected by the sequence number checks. The
827 * fact that it has passed the sequence number checks
828 * means that the measured rtt is wrong. We were probably
829 * scheduled long after the packet was received.
835 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
836 * The initial few responses after the interface is repaired may
837 * contain high rtt's because they could have been queued up waiting
838 * for ARP/NDP resolution on a failed interface.
840 if ((pii
->pii_state
!= PI_RUNNING
) || GROUP_FAILED(pg
))
844 * Don't update the Conservative Round Trip Time estimate for this
845 * (phint, target) pair if this is the not the highest ack seq seen
846 * thus far on this target.
848 if (!highest_ack_tg(pr_icmp_seq
, target
))
852 * Always update the rtt. This is a failure detection probe
853 * and we want to measure both increase / decrease in rtt.
855 pi_set_crtt(target
, m
, _B_TRUE
);
858 * If the crtt exceeds the average time between probes,
859 * investigate if this slow target is an exception. If so we
860 * can avoid this target and still meet the failure detection
861 * time. Otherwise we can't meet the failure detection time.
863 if (target
->tg_crtt
> pg
->pg_probeint
) {
864 exception
= check_exception_target(pii
, target
);
867 * This target is exceptionally slow. Don't use it
868 * for future probes. check_exception_target() has
869 * made sure that we have at least MIN_PROBE_TARGETS
870 * other active targets
872 if (pii
->pii_targets_are_routers
) {
874 * This is a slow router, mark it as slow
875 * and don't use it for further probes. We
876 * don't delete it, since it will be populated
877 * again when we do a router scan. Hence we
878 * need to maintain extra state (unlike the
879 * host case below). Mark it as TG_SLOW.
881 if (target
->tg_status
== TG_ACTIVE
)
883 target
->tg_status
= TG_SLOW
;
884 target
->tg_latime
= gethrtime();
885 target
->tg_rtt_sa
= -1;
887 target
->tg_rtt_sd
= 0;
888 if (pii
->pii_target_next
== target
) {
889 pii
->pii_target_next
=
894 * the slow target is not a router, we can
895 * just delete it. Send an icmp multicast and
896 * pick the fastest responder that is not
897 * already an active target. target_delete()
898 * adjusts pii->pii_target_next
900 target_delete(target
);
901 probe(pii
, PROBE_MULTI
, cur_hrtime
);
905 * We can't meet the failure detection time.
906 * Log a message, and update the detection time to
907 * whatever we can achieve.
909 pg
->pg_probeint
= target
->tg_crtt
* NEXT_FDT_MULTIPLE
;
910 pg
->pg_fdt
= pg
->pg_probeint
* (NUM_PROBE_FAILS
+ 2);
911 last_fdt_bumpup_time
= gethrtime();
912 if (pg
!= phyint_anongroup
) {
913 logtrace("Cannot meet requested failure"
914 " detection time of %d ms on (%s %s) new"
915 " failure detection time for group \"%s\""
916 " is %d ms\n", user_failure_detection_time
,
917 AF_STR(pii
->pii_af
), pii
->pii_name
,
918 pg
->pg_name
, pg
->pg_fdt
);
921 } else if ((target
->tg_crtt
< (pg
->pg_probeint
/ LOWER_FDT_TRIGGER
)) &&
922 (user_failure_detection_time
< pg
->pg_fdt
) &&
923 (last_fdt_bumpup_time
+ MIN_SETTLING_TIME
< gethrtime())) {
925 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
926 * investigate if we can improve the failure detection time to
927 * meet whatever the user specified.
929 if (check_pg_crtt_improved(pg
)) {
930 pg
->pg_fdt
= MAX(pg
->pg_fdt
/ NEXT_FDT_MULTIPLE
,
931 user_failure_detection_time
);
932 pg
->pg_probeint
= pg
->pg_fdt
/ (NUM_PROBE_FAILS
+ 2);
933 if (pg
!= phyint_anongroup
) {
934 logtrace("Improved failure detection time %d ms"
935 " on (%s %s) for group \"%s\"\n",
936 pg
->pg_fdt
, AF_STR(pii
->pii_af
),
937 pii
->pii_name
, pg
->pg_name
);
939 if (user_failure_detection_time
== pg
->pg_fdt
) {
940 /* Avoid any truncation or rounding errors */
941 pg
->pg_probeint
= user_probe_interval
;
943 * No more rtt probes will be sent. The actual
944 * fdt has dropped to the user specified value.
945 * pii_fd_snxt_basetime and pii_snxt_basetime
946 * will be in sync henceforth.
948 reset_snxt_basetimes();
953 pr_statp
= &pii
->pii_probes
[pr_ndx
];
954 pr_statp
->pr_hrtime_ackproc
= cur_hrtime
;
955 pr_statp
->pr_hrtime_ackrecv
= pr_statp
->pr_hrtime_sent
+
956 (tv2ns(recv_tvp
) - tv2ns(&pr_statp
->pr_tv_sent
));
958 probe_chstate(pr_statp
, pii
, PR_ACKED
);
961 * Update pii->pii_rack, i.e. the sequence number of the last received
962 * probe response, based on the echo reply we have received now, if
963 * either of the following conditions is satisfied.
964 * a. pii_rack is outside the current receive window of
965 * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
966 * This means we have not received probe responses for a
967 * long time, and the sequence number has wrapped around.
968 * b. pii_rack is within the current receive window and this echo
969 * reply corresponds to the highest sequence number we have seen
972 if (SEQ_GE(pii
->pii_rack
, pii
->pii_snxt
) ||
973 SEQ_LT(pii
->pii_rack
, pii
->pii_snxt
- PROBE_STATS_COUNT
) ||
974 SEQ_GT(pr_icmp_seq
, pii
->pii_rack
)) {
975 pii
->pii_rack
= pr_icmp_seq
;
980 * Returns true if seq is the highest unacknowledged seq for target tg
984 highest_ack_tg(uint16_t seq
, struct target
*tg
)
986 struct phyint_instance
*pii
;
990 pii
= tg
->tg_phyint_inst
;
993 * Get the seq number of the most recent probe sent so far,
994 * and also get the corresponding probe index in the probe stats
997 pr_ndx
= PROBE_INDEX_PREV(pii
->pii_probe_next
);
998 pr_seq
= pii
->pii_snxt
;
1002 * Start from the most recent probe and walk back, trying to find
1003 * an acked probe corresponding to target tg.
1005 for (; pr_ndx
!= pii
->pii_probe_next
;
1006 pr_ndx
= PROBE_INDEX_PREV(pr_ndx
), pr_seq
--) {
1007 if (pii
->pii_probes
[pr_ndx
].pr_target
== tg
&&
1008 pii
->pii_probes
[pr_ndx
].pr_status
== PR_ACKED
) {
1009 if (SEQ_GT(pr_seq
, seq
))
1017 * Check whether the crtt for the group has improved by a factor of
1018 * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure
1019 * detection time flapping in the face of small crtt changes.
1022 check_pg_crtt_improved(struct phyint_group
*pg
)
1026 if (debug
& D_PROBE
)
1027 logdebug("check_pg_crtt_improved()\n");
1030 * The crtt for the group is only improved if each phyint_instance
1031 * for both ipv4 and ipv6 is improved.
1033 for (pi
= pg
->pg_phyint
; pi
!= NULL
; pi
= pi
->pi_pgnext
) {
1034 if (!check_pii_crtt_improved(pi
->pi_v4
) ||
1035 !check_pii_crtt_improved(pi
->pi_v6
))
1043 * Check whether the crtt has improved substantially on this phyint_instance.
1044 * Returns _B_TRUE if there's no crtt information available, because pii
1045 * is NULL or the phyint_instance is not capable of probing.
1048 check_pii_crtt_improved(struct phyint_instance
*pii
) {
1054 if (!PROBE_CAPABLE(pii
) ||
1055 pii
->pii_phyint
->pi_state
== PI_FAILED
)
1058 for (tg
= pii
->pii_targets
; tg
!= NULL
; tg
= tg
->tg_next
) {
1059 if (tg
->tg_status
!= TG_ACTIVE
)
1061 if (tg
->tg_crtt
> (pii
->pii_phyint
->pi_group
->pg_probeint
/
1062 LOWER_FDT_TRIGGER
)) {
1071 * This target responds very slowly to probes. The target's crtt exceeds
1072 * the probe interval of its group. Compare against other targets
1073 * and determine if this target is an exception, if so return true, else false
1076 check_exception_target(struct phyint_instance
*pii
, struct target
*target
)
1079 char abuf
[INET6_ADDRSTRLEN
];
1081 if (debug
& D_PROBE
) {
1082 logdebug("check_exception_target(%s %s target %s)\n",
1083 AF_STR(pii
->pii_af
), pii
->pii_name
,
1084 pr_addr(pii
->pii_af
, target
->tg_address
,
1085 abuf
, sizeof (abuf
)));
1089 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
1090 * to make a good judgement. Otherwise don't drop this target.
1092 if (pii
->pii_ntargets
< MIN_PROBE_TARGETS
+ 1)
1096 * Determine whether only this particular target is slow.
1097 * We know that this target's crtt exceeds the group's probe interval.
1098 * If all other active targets have a
1099 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
1100 * then this target is considered slow.
1102 for (tg
= pii
->pii_targets
; tg
!= NULL
; tg
= tg
->tg_next
) {
1103 if (tg
!= target
&& tg
->tg_status
== TG_ACTIVE
) {
1105 pii
->pii_phyint
->pi_group
->pg_probeint
/
1116 * Update the target list. The icmp all hosts multicast has given us
1117 * some host to which we can send probes. If we already have sufficient
1118 * targets, discard it.
1121 incoming_mcast_reply(struct phyint_instance
*pii
, struct pr_icmp
*reply
,
1122 struct in6_addr fromaddr
)
1126 char abuf
[INET6_ADDRSTRLEN
];
1129 if (debug
& D_PROBE
) {
1130 logdebug("incoming_mcast_reply(%s %s %s)\n",
1131 AF_STR(pii
->pii_af
), pii
->pii_name
,
1132 pr_addr(pii
->pii_af
, fromaddr
, abuf
, sizeof (abuf
)));
1136 * Using host targets is a fallback mechanism. If we have
1137 * found a router, don't add this host target. If we already
1138 * know MAX_PROBE_TARGETS, don't add another target.
1140 assert(pii
->pii_ntargets
<= MAX_PROBE_TARGETS
);
1141 if (pii
->pii_targets
!= NULL
) {
1142 if (pii
->pii_targets_are_routers
||
1143 (pii
->pii_ntargets
== MAX_PROBE_TARGETS
)) {
1148 if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr
) ||
1149 IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr
)) {
1151 * Guard against response from 0.0.0.0
1152 * and ::. Log a trace message
1154 logtrace("probe response from %s on %s\n",
1155 pr_addr(pii
->pii_af
, fromaddr
, abuf
, sizeof (abuf
)),
1161 * This address is one of our own, so reject this address as a
1162 * valid probe target.
1165 if (own_address(fromaddr
))
1169 * If the phyint is part of a named group, then add the address to all
1170 * members of the group. Otherwise, add the address only to the
1171 * phyint itself, since other phyints in the anongroup may not be on
1174 pi
= pii
->pii_phyint
;
1175 if (pi
->pi_group
== phyint_anongroup
) {
1176 target_add(pii
, fromaddr
, _B_FALSE
);
1178 pi
= pi
->pi_group
->pg_phyint
;
1179 for (; pi
!= NULL
; pi
= pi
->pi_pgnext
)
1180 target_add(PHYINT_INSTANCE(pi
, af
), fromaddr
, _B_FALSE
);
1185 * Compute CRTT given an existing scaled average, scaled deviation estimate
1186 * and a new rtt time. The formula is from Jacobson and Karels'
1187 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
1188 * are the same as those in Appendix A.2 of that paper.
1190 * m = new measurement
1191 * sa = scaled RTT average (8 * average estimates)
1192 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
1193 * crtt = Conservative round trip time. Used to determine whether probe
1196 * New scaled average and deviation are passed back via sap and svp
1199 compute_crtt(int64_t *sap
, int64_t *svp
, int64_t m
)
1204 int64_t saved_m
= m
;
1211 * Update average estimator:
1212 * new rtt = old rtt + 1/8 Error
1213 * where Error = m - old rtt
1214 * i.e. 8 * new rtt = 8 * old rtt + Error
1215 * i.e. new sa = old sa + Error
1217 m
-= sa
>> 3; /* m is now Error in estimate. */
1218 if ((sa
+= m
) < 0) {
1219 /* Don't allow the smoothed average to be negative. */
1224 * Update deviation estimator:
1225 * new mdev = old mdev + 1/4 (abs(Error) - old mdev)
1226 * i.e. 4 * new mdev = 4 * old mdev +
1227 * (abs(Error) - old mdev)
1228 * i.e. new sv = old sv + (abs(Error) - old mdev)
1235 /* Initialization. This is the first response received. */
1240 crtt
= (sa
>> 3) + sv
;
1242 if (debug
& D_PROBE
) {
1243 logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> "
1244 "crtt = %lld\n", saved_m
, sa
, sv
, crtt
);
1251 * CRTT = average estimates + 4 * deviation estimates
1258 pi_set_crtt(struct target
*tg
, int64_t m
, boolean_t is_probe_uni
)
1260 struct phyint_instance
*pii
= tg
->tg_phyint_inst
;
1261 int probe_interval
= pii
->pii_phyint
->pi_group
->pg_probeint
;
1262 int64_t sa
= tg
->tg_rtt_sa
;
1263 int64_t sv
= tg
->tg_rtt_sd
;
1267 if (debug
& D_PROBE
)
1268 logdebug("pi_set_crtt: target - m %lld\n", m
);
1270 /* store the round trip time, in case we need to defer computation */
1271 tg
->tg_deferred
[tg
->tg_num_deferred
] = m
;
1273 new_crtt
= ns2ms(compute_crtt(&sa
, &sv
, m
));
1276 * If this probe's round trip time would singlehandedly cause an
1277 * increase in the group's probe interval consider it suspect.
1279 if ((new_crtt
> probe_interval
) && is_probe_uni
) {
1280 if (debug
& D_PROBE
) {
1281 logdebug("Received a suspect probe on %s, new_crtt ="
1282 " %d, probe_interval = %d, num_deferred = %d\n",
1283 pii
->pii_probe_logint
->li_name
, new_crtt
,
1284 probe_interval
, tg
->tg_num_deferred
);
1288 * If we've deferred as many rtts as we plan on deferring, then
1289 * assume the link really did slow down and process all queued
1292 if (tg
->tg_num_deferred
== MAXDEFERREDRTT
) {
1293 if (debug
& D_PROBE
) {
1294 logdebug("Received MAXDEFERREDRTT probes which "
1295 "would cause an increased probe_interval. "
1296 "Integrating queued rtt data points.\n");
1299 for (i
= 0; i
<= tg
->tg_num_deferred
; i
++) {
1300 tg
->tg_crtt
= ns2ms(compute_crtt(&tg
->tg_rtt_sa
,
1301 &tg
->tg_rtt_sd
, tg
->tg_deferred
[i
]));
1304 tg
->tg_num_deferred
= 0;
1306 tg
->tg_num_deferred
++;
1312 * If this is a normal probe, or an RTT probe that would lead to a
1313 * reduced CRTT, then update our CRTT data. Further, if this was
1314 * a normal probe, pitch any deferred probes since our probes are
1315 * again being answered within our CRTT estimates.
1317 if (is_probe_uni
|| new_crtt
< tg
->tg_crtt
) {
1320 tg
->tg_crtt
= new_crtt
;
1322 tg
->tg_num_deferred
= 0;
/*
 * Return a pointer to the specified option buffer.
 * If not found return NULL.
 *
 * The control-message headers attached to `msg' are visited in order using
 * CMSG_FIRSTHDR()/CMSG_NXTHDR(); the data area (CMSG_DATA()) of the first
 * header whose cmsg_level and cmsg_type both equal the requested
 * `cmsg_level' and `cmsg_type' is returned.  The returned pointer refers
 * into the caller's control buffer; nothing is copied or allocated here.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type)
{
	struct cmsghdr *cmsg;

	for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(msg, cmsg)) {
		if (cmsg->cmsg_level == cmsg_level &&
		    cmsg->cmsg_type == cmsg_type) {
			return (CMSG_DATA(cmsg));
		}
	}

	/* No control message of the requested level and type is present. */
	return (NULL);
}
1346 * Try to activate another INACTIVE interface in the same group as `pi'.
1347 * Prefer STANDBY INACTIVE to just INACTIVE.
1350 phyint_activate_another(struct phyint
*pi
)
1353 struct phyint
*inactivepi
= NULL
;
1355 if (pi
->pi_group
== phyint_anongroup
)
1358 for (pi2
= pi
->pi_group
->pg_phyint
; pi2
!= NULL
; pi2
= pi2
->pi_pgnext
) {
1359 if (pi
== pi2
|| !phyint_is_functioning(pi2
) ||
1360 !(pi2
->pi_flags
& IFF_INACTIVE
))
1364 if (pi2
->pi_flags
& IFF_STANDBY
)
1368 if (inactivepi
!= NULL
)
1369 (void) change_pif_flags(inactivepi
, 0, IFF_INACTIVE
);
1373 * Transition a phyint to PI_RUNNING. The caller must ensure that the
1374 * transition is appropriate. Clears IFF_OFFLINE or IFF_FAILED if
1375 * appropriate. Also sets IFF_INACTIVE on this or other interfaces as
1376 * appropriate (see comment below). Finally, also updates the phyint's group
1377 * state to account for the change.
1380 phyint_transition_to_running(struct phyint
*pi
)
1383 struct phyint
*actstandbypi
= NULL
;
1384 uint_t nactive
= 0, nnonstandby
= 0;
1385 boolean_t onlining
= (pi
->pi_state
== PI_OFFLINE
);
1386 boolean_t initial
= (pi
->pi_state
== PI_INIT
);
1387 uint64_t set
, clear
;
1390 * The interface is running again, but should it or another interface
1391 * in the group end up INACTIVE? There are three cases:
1393 * 1. If it's a STANDBY interface, it should end up INACTIVE if
1394 * the group is operating at capacity (i.e., there are at least as
1395 * many active interfaces as non-STANDBY interfaces in the group).
1396 * No other interfaces should be changed.
1398 * 2. If it's a non-STANDBY interface and we're onlining it or
1399 * FAILBACK is enabled, then it should *not* end up INACTIVE.
1400 * Further, if the group is above capacity as a result of this
1401 * interface, then an active STANDBY interface in the group should
1404 * 3. If it's a non-STANDBY interface, we're repairing it, and
1405 * FAILBACK is disabled, then it should end up INACTIVE *unless*
1406 * the group was failed (in which case we have no choice but to
1407 * use it). No other interfaces should be changed.
1409 if (pi
->pi_group
!= phyint_anongroup
) {
1410 pi2
= pi
->pi_group
->pg_phyint
;
1411 for (; pi2
!= NULL
; pi2
= pi2
->pi_pgnext
) {
1412 if (!(pi2
->pi_flags
& IFF_STANDBY
))
1415 if (phyint_is_functioning(pi2
) &&
1416 !(pi2
->pi_flags
& IFF_INACTIVE
)) {
1418 if (pi2
->pi_flags
& IFF_STANDBY
)
1425 clear
= (onlining
? IFF_OFFLINE
: IFF_FAILED
);
1427 if (pi
->pi_flags
& IFF_STANDBY
) { /* case 1 */
1428 if (nactive
>= nnonstandby
)
1429 set
|= IFF_INACTIVE
;
1431 clear
|= IFF_INACTIVE
;
1432 } else if (onlining
|| failback_enabled
) { /* case 2 */
1433 if (nactive
>= nnonstandby
&& actstandbypi
!= NULL
)
1434 (void) change_pif_flags(actstandbypi
, IFF_INACTIVE
, 0);
1435 } else if (!initial
&& !GROUP_FAILED(pi
->pi_group
)) { /* case 3 */
1436 set
|= IFF_INACTIVE
;
1438 (void) change_pif_flags(pi
, set
, clear
);
1440 phyint_chstate(pi
, PI_RUNNING
);
1443 * Update the group state to account for the change.
1445 phyint_group_refresh_state(pi
->pi_group
);
1449 * Adjust IFF_INACTIVE on the provided `pi' to trend the group configuration
1450 * to have at least one active interface and as many active interfaces as
1451 * non-standby interfaces.
1454 phyint_standby_refresh_inactive(struct phyint
*pi
)
1457 uint_t nactive
= 0, nnonstandby
= 0;
1460 * All phyints in the anonymous group are effectively in their own
1461 * group and thus active regardless of whether they're marked standby.
1463 if (pi
->pi_group
== phyint_anongroup
) {
1464 (void) change_pif_flags(pi
, 0, IFF_INACTIVE
);
1469 * If the phyint isn't functioning we can't consider it.
1471 if (!phyint_is_functioning(pi
))
1474 for (pi2
= pi
->pi_group
->pg_phyint
; pi2
!= NULL
; pi2
= pi2
->pi_pgnext
) {
1475 if (!(pi2
->pi_flags
& IFF_STANDBY
))
1478 if (phyint_is_functioning(pi2
) &&
1479 !(pi2
->pi_flags
& IFF_INACTIVE
))
1483 if (nactive
== 0 || nactive
< nnonstandby
)
1484 (void) change_pif_flags(pi
, 0, IFF_INACTIVE
);
1485 else if (nactive
> nnonstandby
)
1486 (void) change_pif_flags(pi
, IFF_INACTIVE
, 0);
1490 * See if a previously failed interface has started working again.
1493 phyint_check_for_repair(struct phyint
*pi
)
1495 if (!phyint_repaired(pi
))
1498 if (pi
->pi_group
== phyint_anongroup
) {
1499 logerr("IP interface repair detected on %s\n", pi
->pi_name
);
1501 logerr("IP interface repair detected on %s of group %s\n",
1502 pi
->pi_name
, pi
->pi_group
->pg_name
);
1506 * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet.
1507 * So just clear IFF_OFFLINE and defer phyint_transition_to_running()
1508 * until it is brought back online.
1510 if (pi
->pi_state
== PI_OFFLINE
) {
1511 (void) change_pif_flags(pi
, 0, IFF_FAILED
);
1515 phyint_transition_to_running(pi
); /* calls phyint_chstate() */
1519 * See if an interface has failed, or if the whole group of interfaces has
1523 phyint_inst_check_for_failure(struct phyint_instance
*pii
)
1525 struct phyint
*pi
= pii
->pii_phyint
;
1527 boolean_t was_active
;
1529 switch (failure_state(pii
)) {
1530 case PHYINT_FAILURE
:
1531 was_active
= ((pi
->pi_flags
& IFF_INACTIVE
) == 0);
1533 (void) change_pif_flags(pi
, IFF_FAILED
, IFF_INACTIVE
);
1534 if (pi
->pi_group
== phyint_anongroup
) {
1535 logerr("IP interface failure detected on %s\n",
1538 logerr("IP interface failure detected on %s of group"
1539 " %s\n", pii
->pii_name
, pi
->pi_group
->pg_name
);
1543 * If the failed interface was active, activate another
1544 * INACTIVE interface in the group if possible.
1547 phyint_activate_another(pi
);
1550 * If the interface is offline, the state change will be
1551 * noted when it comes back online.
1553 if (pi
->pi_state
!= PI_OFFLINE
) {
1554 phyint_chstate(pi
, PI_FAILED
);
1560 pi2
= pi
->pi_group
->pg_phyint
;
1561 for (; pi2
!= NULL
; pi2
= pi2
->pi_pgnext
) {
1562 (void) change_pif_flags(pi2
, IFF_FAILED
, IFF_INACTIVE
);
1563 if (pi2
->pi_state
== PI_OFFLINE
) /* see comment above */
1566 reset_crtt_all(pi2
);
1568 * In the case of host targets, we would have flushed
1569 * the targets, and gone to PI_NOTARGETS state.
1571 if (pi2
->pi_state
== PI_RUNNING
)
1572 phyint_chstate(pi2
, PI_FAILED
);
1582 * Determines if any timeout event has occurred and returns the number of
1583 * milliseconds until the next timeout event for the phyint. Returns
1584 * TIMER_INFINITY for "never".
1587 phyint_inst_timer(struct phyint_instance
*pii
)
1591 struct target
*cur_tg
;
1592 struct probe_stats
*pr_statp
;
1593 struct phyint_instance
*pii_other
;
1595 int valid_unack_count
;
1600 hrtime_t cur_hrtime
;
1601 int probe_interval
= pii
->pii_phyint
->pi_group
->pg_probeint
;
1603 cur_hrtime
= gethrtime();
1604 cur_time
= ns2ms(cur_hrtime
);
1606 if (debug
& D_TIMER
) {
1607 logdebug("phyint_inst_timer(%s %s)\n",
1608 AF_STR(pii
->pii_af
), pii
->pii_name
);
1611 pii_other
= phyint_inst_other(pii
);
1612 if (!PROBE_ENABLED(pii
) && !PROBE_ENABLED(pii_other
)) {
1614 * Check to see if we're here due to link up/down flapping; if
1615 * enough time has passed, then try to bring the interface
1616 * back up; otherwise, schedule a timer to bring it back up
1617 * when enough time *has* elapsed.
1619 pi
= pii
->pii_phyint
;
1620 if (pi
->pi_state
== PI_FAILED
&& LINK_UP(pi
)) {
1621 check_time
= pi
->pi_whenup
[pi
->pi_whendx
] + MSEC_PERMIN
;
1622 if (check_time
> cur_time
)
1623 return (check_time
- cur_time
);
1625 phyint_check_for_repair(pi
);
1630 * If probing is not enabled on this phyint instance, don't proceed.
1632 if (!PROBE_ENABLED(pii
))
1633 return (TIMER_INFINITY
);
1636 * If the timer has fired too soon, probably triggered
1637 * by some other phyint instance, return the remaining
1640 if (TIME_LT(cur_time
, pii
->pii_snxt_time
))
1641 return (pii
->pii_snxt_time
- cur_time
);
1644 * If the link is down, don't send any probes for now.
1646 if (LINK_DOWN(pii
->pii_phyint
))
1647 return (TIMER_INFINITY
);
1650 * Randomize the next probe time, between MIN_RANDOM_FACTOR
1651 * and MAX_RANDOM_FACTOR with respect to the base probe time.
1652 * Base probe time is strictly periodic.
1654 interval
= GET_RANDOM(
1655 (int)(MIN_RANDOM_FACTOR
* user_probe_interval
),
1656 (int)(MAX_RANDOM_FACTOR
* user_probe_interval
));
1657 pii
->pii_snxt_time
= pii
->pii_snxt_basetime
+ interval
;
1660 * Check if the current time > next time to probe. If so, we missed
1661 * sending 1 or more probes, probably due to heavy system load. At least
1662 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
1663 * were scheduled. Make adjustments to the times, in multiples of
1664 * user_probe_interval.
1666 if (TIME_GT(cur_time
, pii
->pii_snxt_time
)) {
1669 n
= (cur_time
- pii
->pii_snxt_time
) / user_probe_interval
;
1670 pii
->pii_snxt_time
+= (n
+ 1) * user_probe_interval
;
1671 pii
->pii_snxt_basetime
+= (n
+ 1) * user_probe_interval
;
1672 logtrace("missed sending %d probes cur_time %u snxt_time %u"
1673 " snxt_basetime %u\n", n
+ 1, cur_time
, pii
->pii_snxt_time
,
1674 pii
->pii_snxt_basetime
);
1676 /* Collect statistics about missed probes */
1677 probes_missed
.pm_nprobes
+= n
+ 1;
1678 probes_missed
.pm_ntimes
++;
1680 pii
->pii_snxt_basetime
+= user_probe_interval
;
1681 interval
= pii
->pii_snxt_time
- cur_time
;
1682 if (debug
& D_TARGET
) {
1683 logdebug("cur_time %u snxt_time %u snxt_basetime %u"
1684 " interval %u\n", cur_time
, pii
->pii_snxt_time
,
1685 pii
->pii_snxt_basetime
, interval
);
1689 * If no targets are known, we need to send an ICMP multicast. The
1690 * probe type is PROBE_MULTI. We'll check back in 'interval' msec
1691 * to see if we found a target.
1693 if (pii
->pii_target_next
== NULL
) {
1694 assert(pii
->pii_ntargets
== 0);
1695 pii
->pii_fd_snxt_basetime
= pii
->pii_snxt_basetime
;
1696 probe(pii
, PROBE_MULTI
, cur_time
);
1700 if ((user_probe_interval
!= probe_interval
) &&
1701 TIME_LT(pii
->pii_snxt_time
, pii
->pii_fd_snxt_basetime
)) {
1703 * the failure detection (fd) probe timer has not yet fired.
1704 * Need to send only an rtt probe. The probe type is PROBE_RTT.
1706 probe(pii
, PROBE_RTT
, cur_hrtime
);
1710 * the fd probe timer has fired. Need to do all failure
1711 * detection / recovery calculations, and then send an fd probe
1712 * of type PROBE_UNI.
1714 if (user_probe_interval
== probe_interval
) {
1716 * We could have missed some probes, and then adjusted
1717 * pii_snxt_basetime above. Otherwise we could have
1718 * blindly added probe_interval to pii_fd_snxt_basetime.
1720 pii
->pii_fd_snxt_basetime
= pii
->pii_snxt_basetime
;
1722 pii
->pii_fd_snxt_basetime
+= probe_interval
;
1723 if (TIME_GT(cur_time
, pii
->pii_fd_snxt_basetime
)) {
1726 n
= (cur_time
- pii
->pii_fd_snxt_basetime
) /
1728 pii
->pii_fd_snxt_basetime
+= (n
+ 1) * probe_interval
;
1733 * We can have at most, the latest 2 probes that we sent, in
1734 * the PR_UNACKED state. All previous probes sent, are either
1735 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1736 * timed out if the probe's time_start + the CRTT < currenttime.
1737 * For each of the last 2 probes, examine whether it has timed
1738 * out. If so, mark it PR_LOST. The probe stats is a circular array.
1740 pr_ndx
= PROBE_INDEX_PREV(pii
->pii_probe_next
);
1741 valid_unack_count
= 0;
1743 for (i
= 0; i
< 2; i
++) {
1744 pr_statp
= &pii
->pii_probes
[pr_ndx
];
1745 cur_tg
= pii
->pii_probes
[pr_ndx
].pr_target
;
1746 switch (pr_statp
->pr_status
) {
1749 * We received back an ACK, so the switch clearly
1750 * is not dropping our traffic, and thus we can
1751 * enable failure detection immediately.
1753 if (pii
->pii_fd_hrtime
> gethrtime()) {
1754 if (debug
& D_PROBE
) {
1755 logdebug("successful probe on %s; "
1756 "ending quiet period\n",
1757 pii
->pii_phyint
->pi_name
);
1759 pii
->pii_fd_hrtime
= gethrtime();
1764 assert(cur_tg
!= NULL
);
1766 * The crtt could be zero for some reason,
1767 * Eg. the phyint could be failed. If the crtt is
1768 * not available use group's probe interval,
1769 * which is a worst case estimate.
1771 timeout
= ns2ms(pr_statp
->pr_hrtime_start
);
1772 if (cur_tg
->tg_crtt
!= 0) {
1773 timeout
+= cur_tg
->tg_crtt
;
1775 timeout
+= probe_interval
;
1777 if (TIME_LT(timeout
, cur_time
)) {
1778 pr_statp
->pr_time_lost
= timeout
;
1779 probe_chstate(pr_statp
, pii
, PR_LOST
);
1780 } else if (i
== 1) {
1782 * We are forced to consider this probe
1783 * lost, as we can have at most 2 unack.
1784 * probes any time, and we will be sending a
1785 * probe at the end of this function.
1786 * Normally, we should not be here, but
1787 * this can happen if an incoming response
1788 * that was considered lost has increased
1789 * the crtt for this target, and also bumped
1790 * up the FDT. Note that we never cancel or
1791 * increase the current pii_time_left, so
1792 * when the timer fires, we find 2 valid
1793 * unacked probes, and they are yet to timeout
1795 pr_statp
->pr_time_lost
= cur_time
;
1796 probe_chstate(pr_statp
, pii
, PR_LOST
);
1799 * Only the most recent probe can enter
1800 * this 'else' arm. The second most recent
1801 * probe must take either of the above arms,
1804 valid_unack_count
++;
1808 pr_ndx
= PROBE_INDEX_PREV(pr_ndx
);
1812 * We send out 1 probe randomly in the interval between one half
1813 * and one probe interval for the group. Given that the CRTT is always
1814 * less than the group's probe interval, we can have at most 1
1815 * unacknowledged probe now. All previous probes are either lost or
1818 assert(valid_unack_count
== 0 || valid_unack_count
== 1);
1821 * The timer has fired. Take appropriate action depending
1822 * on the current state of the phyint.
1824 * PI_RUNNING state - Failure detection
1825 * PI_FAILED state - Repair detection
1827 switch (pii
->pii_phyint
->pi_state
) {
1830 * If the most recent probe (excluding unacked probes that
1831 * are yet to time out) has been acked, check whether the
1832 * phyint is now repaired.
1834 if (pii
->pii_rack
+ valid_unack_count
+ 1 == pii
->pii_snxt
) {
1835 phyint_check_for_repair(pii
->pii_phyint
);
1841 * It's possible our probes have been lost because of a
1842 * spanning-tree mandated quiet period on the switch. If so,
1843 * ignore the lost probes.
1845 if (pii
->pii_fd_hrtime
- cur_hrtime
> 0)
1848 if (pii
->pii_rack
+ valid_unack_count
+ 1 != pii
->pii_snxt
) {
1850 * We have 1 or more failed probes (excluding unacked
1851 * probes that are yet to time out). Determine if the
1852 * phyint has failed.
1854 phyint_inst_check_for_failure(pii
);
1859 logerr("phyint_inst_timer: invalid state %d\n",
1860 pii
->pii_phyint
->pi_state
);
1865 * Start the next probe. probe() will also set pii->pii_probe_time_left
1866 * to the group's probe interval. If phyint_failed -> target_flush_hosts
1867 * was called, the target list may be empty.
1869 if (pii
->pii_target_next
!= NULL
) {
1870 probe(pii
, PROBE_UNI
, cur_hrtime
);
1872 * If we have just the one probe target, and we're not using
1873 * router targets, try to find another as we presently have
1876 if (!pii
->pii_targets_are_routers
&& pii
->pii_ntargets
== 1)
1877 probe(pii
, PROBE_MULTI
, cur_hrtime
);
1879 probe(pii
, PROBE_MULTI
, cur_hrtime
);
1885 * Start the probe timer for an interface instance.
1888 start_timer(struct phyint_instance
*pii
)
1893 * Spread the base probe times (pi_snxt_basetime) across phyints
1894 * uniformly over the (curtime..curtime + the group's probe_interval).
1895 * pi_snxt_basetime is strictly periodic with a frequency of
1896 * the group's probe interval. The actual probe time pi_snxt_time
1897 * adds some randomness to pi_snxt_basetime and happens in probe().
1898 * For the 1st probe on each phyint after the timer is started,
1899 * pi_snxt_time and pi_snxt_basetime are the same.
1901 interval
= GET_RANDOM(0,
1902 (int)pii
->pii_phyint
->pi_group
->pg_probeint
);
1904 pii
->pii_snxt_basetime
= getcurrenttime() + interval
;
1905 pii
->pii_fd_snxt_basetime
= pii
->pii_snxt_basetime
;
1906 pii
->pii_snxt_time
= pii
->pii_snxt_basetime
;
1907 timer_schedule(interval
);
1911 * Restart the probe timer on an interface instance.
1914 restart_timer(struct phyint_instance
*pii
)
1917 * We don't need to restart the timer if it was never started in
1918 * the first place (pii->pii_basetime_inited not set), as the timer
1919 * won't have gone off yet.
1921 if (pii
->pii_basetime_inited
!= 0) {
1923 if (debug
& D_LINKNOTE
)
1924 logdebug("restart timer: restarting timer on %s, "
1925 "address family %s\n", pii
->pii_phyint
->pi_name
,
1926 AF_STR(pii
->pii_af
));
1933 process_link_state_down(struct phyint
*pi
)
1935 logerr("The link has gone down on %s\n", pi
->pi_name
);
1938 * Clear the probe statistics arrays, we don't want the repair
1939 * detection logic relying on probes that were successful prior
1940 * to the link going down.
1942 if (PROBE_CAPABLE(pi
->pi_v4
))
1943 clear_pii_probe_stats(pi
->pi_v4
);
1944 if (PROBE_CAPABLE(pi
->pi_v6
))
1945 clear_pii_probe_stats(pi
->pi_v6
);
1947 * Check for interface failure. Although we know the interface
1948 * has failed, we don't know if all the other interfaces in the
1949 * group have failed as well.
1951 if ((pi
->pi_state
== PI_RUNNING
) ||
1952 (pi
->pi_state
!= PI_FAILED
&& !GROUP_FAILED(pi
->pi_group
))) {
1953 if (debug
& D_LINKNOTE
) {
1954 logdebug("process_link_state_down:"
1955 " checking for failure on %s\n", pi
->pi_name
);
1958 if (pi
->pi_v4
!= NULL
)
1959 phyint_inst_check_for_failure(pi
->pi_v4
);
1960 else if (pi
->pi_v6
!= NULL
)
1961 phyint_inst_check_for_failure(pi
->pi_v6
);
1966 process_link_state_up(struct phyint
*pi
)
1968 logerr("The link has come up on %s\n", pi
->pi_name
);
1971 * We stopped any running timers on each instance when the link
1972 * went down, so restart them.
1975 restart_timer(pi
->pi_v4
);
1977 restart_timer(pi
->pi_v6
);
1979 phyint_check_for_repair(pi
);
1981 pi
->pi_whenup
[pi
->pi_whendx
++] = getcurrenttime();
1982 if (pi
->pi_whendx
== LINK_UP_PERMIN
)
1987 * Process any changes in link state passed up from the interfaces.
1990 process_link_state_changes(void)
1994 /* Look for interfaces where the link state has just changed */
1996 for (pi
= phyints
; pi
!= NULL
; pi
= pi
->pi_next
) {
1997 boolean_t old_link_state_up
= LINK_UP(pi
);
2000 * Except when the "phyint" structure is created, this is
2001 * the only place the link state is updated. This allows
2002 * this routine to detect changes in link state, rather
2003 * than just the current state.
2005 UPDATE_LINK_STATE(pi
);
2007 if (LINK_DOWN(pi
)) {
2009 * Has link just gone down?
2011 if (old_link_state_up
)
2012 process_link_state_down(pi
);
2015 * Has link just gone back up?
2017 if (!old_link_state_up
)
2018 process_link_state_up(pi
);
2024 reset_crtt_all(struct phyint
*pi
)
2026 struct phyint_instance
*pii
;
2031 for (tg
= pii
->pii_targets
; tg
!= NULL
; tg
= tg
->tg_next
) {
2040 for (tg
= pii
->pii_targets
; tg
!= NULL
; tg
= tg
->tg_next
) {
2049 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
2050 * probes on both instances IPv4 and IPv6.
2051 * If the interface has failed, return the time of the first probe failure
2055 phyint_inst_probe_failure_state(struct phyint_instance
*pii
, uint_t
*tff
)
2058 struct target
*cur_tg
;
2059 struct probe_fail_count pfinfo
;
2060 struct phyint_instance
*pii_other
;
2064 * Get the number of consecutive failed probes on
2065 * this phyint across all targets. Also get the number
2066 * of consecutive failed probes on this target only
2068 pr_ndx
= PROBE_INDEX_PREV(pii
->pii_probe_next
);
2069 cur_tg
= pii
->pii_probes
[pr_ndx
].pr_target
;
2070 probe_fail_info(pii
, cur_tg
, &pfinfo
);
2072 /* Get the time of first failure, for later use */
2073 pi_tff
= pfinfo
.pf_tff
;
2076 * If the current target has not responded to the
2077 * last NUM_PROBE_FAILS probes, and other targets are
2078 * responding delete this target. Dead gateway detection
2079 * will eventually remove this target (if router) from the
2080 * routing tables. If that does not occur, we may end
2081 * up adding this to our list again.
2083 if (pfinfo
.pf_nfail
< NUM_PROBE_FAILS
&&
2084 pfinfo
.pf_nfail_tg
>= NUM_PROBE_FAILS
) {
2085 if (pii
->pii_targets_are_routers
) {
2086 if (cur_tg
->tg_status
== TG_ACTIVE
)
2087 pii
->pii_ntargets
--;
2088 cur_tg
->tg_status
= TG_DEAD
;
2089 cur_tg
->tg_crtt
= 0;
2090 cur_tg
->tg_rtt_sa
= -1;
2091 cur_tg
->tg_rtt_sd
= 0;
2092 if (pii
->pii_target_next
== cur_tg
)
2093 pii
->pii_target_next
= target_next(cur_tg
);
2095 target_delete(cur_tg
);
2096 probe(pii
, PROBE_MULTI
, gethrtime());
2102 * If the phyint has lost NUM_PROBE_FAILS or more
2103 * consecutive probes, on both IPv4 and IPv6 protocol
2104 * instances of the phyint, then trigger failure
2105 * detection, else return false
2107 if (pfinfo
.pf_nfail
< NUM_PROBE_FAILS
)
2110 pii_other
= phyint_inst_other(pii
);
2111 if (PROBE_CAPABLE(pii_other
)) {
2112 probe_fail_info(pii_other
, NULL
, &pfinfo
);
2113 if (pfinfo
.pf_nfail
>= NUM_PROBE_FAILS
) {
2115 * We have NUM_PROBE_FAILS or more failures
2116 * on both IPv4 and IPv6. Get the earliest
2117 * time when failure was detected on this
2118 * phyint across IPv4 and IPv6.
2120 if (TIME_LT(pfinfo
.pf_tff
, pi_tff
))
2121 pi_tff
= pfinfo
.pf_tff
;
2124 * This instance has < NUM_PROBE_FAILS failure.
2131 return (PHYINT_FAILURE
);
2135 * Check if the link has gone down on this phyint, or it has failed the
2136 * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
2137 * Also look at other phyints of this group, for group failures.
2140 failure_state(struct phyint_instance
*pii
)
2142 struct probe_success_count psinfo
;
2143 uint_t pi2_tls
; /* time last success */
2144 uint_t pi_tff
; /* time first fail */
2147 struct phyint_instance
*pii2
;
2148 struct phyint_group
*pg
;
2151 if (debug
& D_FAILREP
)
2152 logdebug("phyint_failed(%s)\n", pii
->pii_name
);
2154 pi
= pii
->pii_phyint
;
2157 if (LINK_UP(pi
) && phyint_inst_probe_failure_state(pii
, &pi_tff
) ==
2162 * At this point, the link is down, or the phyint is suspect, as it
2163 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not
2164 * belong to any group, this is a PHYINT_FAILURE. Otherwise, continue
2165 * on to determine whether this should be considered a PHYINT_FAILURE
2168 if (pg
== phyint_anongroup
)
2169 return (PHYINT_FAILURE
);
2172 * Need to compare against other phyints of the same group
2173 * to exclude group failures. If the failure was detected via
2174 * probing, then if the time of last success (tls) of any
2175 * phyint is more recent than the time of first fail (tff) of the
2176 * phyint in question, and the link is up on the phyint,
2177 * then it is a phyint failure. Otherwise it is a group failure.
2178 * If failure was detected via a link down notification sent from
2179 * the driver to IP, we see if any phyints in the group are still
2180 * running and haven't received a link down notification. We
2181 * will usually be processing the link down notification shortly
2182 * after it was received, so there is no point looking at the tls
2185 retval
= GROUP_FAILURE
;
2186 for (pi2
= pg
->pg_phyint
; pi2
!= NULL
; pi2
= pi2
->pi_pgnext
) {
2187 /* Exclude ourself from comparison */
2191 if (LINK_DOWN(pi
)) {
2193 * We use FLAGS_TO_LINK_STATE() to test the flags
2194 * directly, rather then LINK_UP() or LINK_DOWN(), as
2195 * we may not have got round to processing the link
2196 * state for the other phyints in the group yet.
2198 * The check for PI_RUNNING and group failure handles
2199 * the case when the group begins to recover.
2200 * PI_RUNNING will be set, and group failure cleared
2201 * only after receipt of NUM_PROBE_REPAIRS, by which
2202 * time the other phyints should have received at
2203 * least 1 packet, and so will not have NUM_PROBE_FAILS.
2205 if ((pi2
->pi_state
== PI_RUNNING
) &&
2206 !GROUP_FAILED(pg
) && FLAGS_TO_LINK_STATE(pi2
)) {
2207 retval
= PHYINT_FAILURE
;
2217 * If there's no probe-based failure detection on this
2218 * interface, and its link is still up, then it's still
2219 * working and thus the group has not failed.
2221 if (!PROBE_ENABLED(pi2
->pi_v4
) && !PROBE_ENABLED(pi2
->pi_v6
)) {
2222 retval
= PHYINT_FAILURE
;
2227 * Need to compare against both IPv4 and IPv6 instances.
2231 probe_success_info(pii2
, NULL
, &psinfo
);
2232 if (psinfo
.ps_tls_valid
) {
2233 pi2_tls
= psinfo
.ps_tls
;
2235 * See comment above regarding check
2236 * for PI_RUNNING and group failure.
2238 if (TIME_GT(pi2_tls
, pi_tff
) &&
2239 (pi2
->pi_state
== PI_RUNNING
) &&
2240 !GROUP_FAILED(pg
) &&
2241 FLAGS_TO_LINK_STATE(pi2
)) {
2242 retval
= PHYINT_FAILURE
;
2250 probe_success_info(pii2
, NULL
, &psinfo
);
2251 if (psinfo
.ps_tls_valid
) {
2252 pi2_tls
= psinfo
.ps_tls
;
2254 * See comment above regarding check
2255 * for PI_RUNNING and group failure.
2257 if (TIME_GT(pi2_tls
, pi_tff
) &&
2258 (pi2
->pi_state
== PI_RUNNING
) &&
2259 !GROUP_FAILED(pg
) &&
2260 FLAGS_TO_LINK_STATE(pi2
)) {
2261 retval
= PHYINT_FAILURE
;
2269 * Update the group state to account for the changes.
2271 phyint_group_refresh_state(pg
);
2276 * Return the information associated with consecutive probe successes
2277 * starting with the most recent probe. At most the last 2 probes can be
2278 * in the unacknowledged state. All previous probes have either failed
2282 probe_success_info(struct phyint_instance
*pii
, struct target
*cur_tg
,
2283 struct probe_success_count
*psinfo
)
2286 struct probe_stats
*pr_statp
;
2288 uint_t second_most_recent
;
2289 boolean_t pi_found_failure
= _B_FALSE
;
2290 boolean_t tg_found_failure
= _B_FALSE
;
2295 if (debug
& D_FAILREP
)
2296 logdebug("probe_success_info(%s)\n", pii
->pii_name
);
2298 bzero(psinfo
, sizeof (*psinfo
));
2299 now
= getcurrenttime();
2302 * Start with the most recent probe, and count the number
2303 * of consecutive probe successes. Latch the number of successes
2304 * on hitting a failure.
2306 most_recent
= PROBE_INDEX_PREV(pii
->pii_probe_next
);
2307 second_most_recent
= PROBE_INDEX_PREV(most_recent
);
2309 for (i
= most_recent
; i
!= pii
->pii_probe_next
;
2310 i
= PROBE_INDEX_PREV(i
)) {
2311 pr_statp
= &pii
->pii_probes
[i
];
2313 switch (pr_statp
->pr_status
) {
2316 * Only the most recent 2 probes can be unacknowledged
2318 assert(i
== most_recent
|| i
== second_most_recent
);
2320 tg
= pr_statp
->pr_target
;
2323 * The crtt could be zero for some reason,
2324 * Eg. the phyint could be failed. If the crtt is
2325 * not available use the value of the group's probe
2326 * interval which is a worst case estimate.
2328 timeout
= ns2ms(pr_statp
->pr_hrtime_start
);
2329 if (tg
->tg_crtt
!= 0) {
2330 timeout
+= tg
->tg_crtt
;
2333 pii
->pii_phyint
->pi_group
->pg_probeint
;
2336 if (TIME_LT(timeout
, now
)) {
2338 * We hit a failure. Latch the total number of
2339 * recent consecutive successes.
2341 pr_statp
->pr_time_lost
= timeout
;
2342 probe_chstate(pr_statp
, pii
, PR_LOST
);
2343 pi_found_failure
= _B_TRUE
;
2344 if (cur_tg
!= NULL
&& tg
== cur_tg
) {
2346 * We hit a failure for the desired
2347 * target. Latch the number of recent
2348 * consecutive successes for this target
2350 tg_found_failure
= _B_TRUE
;
2357 * Bump up the count of probe successes, if we
2358 * have not seen any failure so far.
2360 if (!pi_found_failure
)
2363 if (cur_tg
!= NULL
&& pr_statp
->pr_target
== cur_tg
&&
2364 !tg_found_failure
) {
2365 psinfo
->ps_nsucc_tg
++;
2369 * Record the time of last success, if this is
2370 * the most recent probe success.
2372 if (!psinfo
->ps_tls_valid
) {
2374 ns2ms(pr_statp
->pr_hrtime_ackproc
);
2375 psinfo
->ps_tls_valid
= _B_TRUE
;
2381 * We hit a failure. Latch the total number of
2382 * recent consecutive successes.
2384 pi_found_failure
= _B_TRUE
;
2385 if (cur_tg
!= NULL
&& pr_statp
->pr_target
== cur_tg
) {
2387 * We hit a failure for the desired target.
2388 * Latch the number of recent consecutive
2389 * successes for this target
2391 tg_found_failure
= _B_TRUE
;
2403 * Return the information associated with consecutive probe failures
2404 * starting with the most recent probe. Only the last 2 probes can be in the
2405 * unacknowledged state. All previous probes have either failed or succeeded.
2408 probe_fail_info(struct phyint_instance
*pii
, struct target
*cur_tg
,
2409 struct probe_fail_count
*pfinfo
)
2412 struct probe_stats
*pr_statp
;
2413 boolean_t tg_found_success
= _B_FALSE
;
2414 boolean_t pi_found_success
= _B_FALSE
;
2416 int second_most_recent
;
2421 if (debug
& D_FAILREP
)
2422 logdebug("probe_fail_info(%s)\n", pii
->pii_name
);
2424 bzero(pfinfo
, sizeof (*pfinfo
));
2425 now
= getcurrenttime();
2428 * Start with the most recent probe, and count the number
2429 * of consecutive probe failures. Latch the number of failures
2430 * on hitting a probe success.
2432 most_recent
= PROBE_INDEX_PREV(pii
->pii_probe_next
);
2433 second_most_recent
= PROBE_INDEX_PREV(most_recent
);
2435 for (i
= most_recent
; i
!= pii
->pii_probe_next
;
2436 i
= PROBE_INDEX_PREV(i
)) {
2437 pr_statp
= &pii
->pii_probes
[i
];
2439 assert(PR_STATUS_VALID(pr_statp
->pr_status
));
2441 switch (pr_statp
->pr_status
) {
2444 * Only the most recent 2 probes can be unacknowledged
2446 assert(i
== most_recent
|| i
== second_most_recent
);
2448 tg
= pr_statp
->pr_target
;
2450 * Target is guaranteed to exist in the unack. state
2454 * The crtt could be zero for some reason,
2455 * Eg. the phyint could be failed. If the crtt is
2456 * not available use the group's probe interval,
2457 * which is a worst case estimate.
2459 timeout
= ns2ms(pr_statp
->pr_hrtime_start
);
2460 if (tg
->tg_crtt
!= 0) {
2461 timeout
+= tg
->tg_crtt
;
2464 pii
->pii_phyint
->pi_group
->pg_probeint
;
2467 if (TIME_GT(timeout
, now
))
2470 pr_statp
->pr_time_lost
= timeout
;
2471 probe_chstate(pr_statp
, pii
, PR_LOST
);
2475 if (!pi_found_success
) {
2477 pfinfo
->pf_tff
= pr_statp
->pr_time_lost
;
2479 if (cur_tg
!= NULL
&& pr_statp
->pr_target
== cur_tg
&&
2480 !tg_found_success
) {
2481 pfinfo
->pf_nfail_tg
++;
2487 * We hit a success or unused slot. Latch the
2488 * total number of recent consecutive failures.
2490 pi_found_success
= _B_TRUE
;
2491 if (cur_tg
!= NULL
&& pr_statp
->pr_target
== cur_tg
) {
2493 * We hit a success for the desired target.
2494 * Latch the number of recent consecutive
2495 * failures for this target
2497 tg_found_success
= _B_TRUE
;
2504 * Change the state of probe `pr' on phyint_instance `pii' to state `state'.
2507 probe_chstate(struct probe_stats
*pr
, struct phyint_instance
*pii
, int state
)
2509 if (pr
->pr_status
== state
)
2512 pr
->pr_status
= state
;
2513 (void) probe_state_event(pr
, pii
);
2517 * Check if the phyint has been repaired. If no test address has been
2518 * configured, then consider the interface repaired if the link is up (unless
2519 * the link is flapping; see below). Otherwise, look for proof of probes
2520 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
2521 * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2524 phyint_repaired(struct phyint
*pi
)
2526 struct probe_success_count psinfo
;
2527 struct phyint_instance
*pii
;
2528 struct target
*cur_tg
;
2532 if (debug
& D_FAILREP
)
2533 logdebug("phyint_repaired(%s)\n", pi
->pi_name
);
2539 * If we don't have any test addresses and the link is up, then
2540 * consider the interface repaired, unless we've received more than
2541 * LINK_UP_PERMIN link up notifications in the last minute, in
2542 * which case we keep the link down until we drop back below
2545 if (!PROBE_ENABLED(pi
->pi_v4
) && !PROBE_ENABLED(pi
->pi_v6
)) {
2546 cur_time
= getcurrenttime();
2547 if ((pi
->pi_whenup
[pi
->pi_whendx
] == 0 ||
2548 (cur_time
- pi
->pi_whenup
[pi
->pi_whendx
]) > MSEC_PERMIN
)) {
2549 pi
->pi_lfmsg_printed
= 0;
2552 if (!pi
->pi_lfmsg_printed
) {
2553 logerr("The link has come up on %s more than %d times "
2554 "in the last minute; disabling repair until it "
2555 "stabilizes\n", pi
->pi_name
, LINK_UP_PERMIN
);
2556 pi
->pi_lfmsg_printed
= 1;
2563 if (PROBE_CAPABLE(pii
)) {
2564 pr_ndx
= PROBE_INDEX_PREV(pii
->pii_probe_next
);
2565 cur_tg
= pii
->pii_probes
[pr_ndx
].pr_target
;
2566 probe_success_info(pii
, cur_tg
, &psinfo
);
2567 if (psinfo
.ps_nsucc
>= NUM_PROBE_REPAIRS
||
2568 psinfo
.ps_nsucc_tg
>= NUM_PROBE_REPAIRS
)
2573 if (PROBE_CAPABLE(pii
)) {
2574 pr_ndx
= PROBE_INDEX_PREV(pii
->pii_probe_next
);
2575 cur_tg
= pii
->pii_probes
[pr_ndx
].pr_target
;
2576 probe_success_info(pii
, cur_tg
, &psinfo
);
2577 if (psinfo
.ps_nsucc
>= NUM_PROBE_REPAIRS
||
2578 psinfo
.ps_nsucc_tg
>= NUM_PROBE_REPAIRS
)
2586 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
2589 change_pif_flags(struct phyint
*pi
, uint64_t set
, uint64_t clear
)
2595 if (debug
& D_FAILREP
) {
2596 logdebug("change_pif_flags(%s): set %llx clear %llx\n",
2597 pi
->pi_name
, set
, clear
);
2600 if (pi
->pi_v4
!= NULL
)
2606 * Get the current flags from the kernel, and set/clear the
2607 * desired phyint flags. Since we set only phyint flags, we can
2608 * do it on either IPv4 or IPv6 instance.
2610 (void) strlcpy(lifr
.lifr_name
, pi
->pi_name
, sizeof (lifr
.lifr_name
));
2612 if (ioctl(ifsock
, SIOCGLIFFLAGS
, (char *)&lifr
) < 0) {
2614 logperror("change_pif_flags: ioctl (get flags)");
2618 old_flags
= lifr
.lifr_flags
;
2619 lifr
.lifr_flags
|= set
;
2620 lifr
.lifr_flags
&= ~clear
;
2622 if (old_flags
== lifr
.lifr_flags
) {
2623 /* No change in the flags. No need to send ioctl */
2627 if (ioctl(ifsock
, SIOCSLIFFLAGS
, (char *)&lifr
) < 0) {
2629 logperror("change_pif_flags: ioctl (set flags)");
2634 * Keep pi_flags in synch. with actual flags. Assumes flags are
2637 pi
->pi_flags
|= set
;
2638 pi
->pi_flags
&= ~clear
;
2640 if (pi
->pi_v4
!= NULL
)
2641 pi
->pi_v4
->pii_flags
= pi
->pi_flags
;
2643 if (pi
->pi_v6
!= NULL
)
2644 pi
->pi_v6
->pii_flags
= pi
->pi_flags
;
2650 * icmp cksum computation for IPv4.
2653 in_cksum(ushort_t
*addr
, int len
)
2655 register int nleft
= len
;
2656 register ushort_t
*w
= addr
;
2657 register ushort_t answer
;
2658 ushort_t odd_byte
= 0;
2659 register int sum
= 0;
2662 * Our algorithm is simple, using a 32 bit accumulator (sum),
2663 * we add sequential 16 bit words to it, and at the end, fold
2664 * back all the carry bits from the top 16 bits into the lower
2672 /* mop up an odd byte, if necessary */
2674 *(uchar_t
*)(&odd_byte
) = *(uchar_t
*)w
;
2679 * add back carry outs from top 16 bits to low 16 bits
2681 sum
= (sum
>> 16) + (sum
& 0xffff); /* add hi 16 to low 16 */
2682 sum
+= (sum
>> 16); /* add carry */
2683 answer
= ~sum
; /* truncate to 16 bits */
2688 reset_snxt_basetimes(void)
2690 struct phyint_instance
*pii
;
2692 for (pii
= phyint_instances
; pii
!= NULL
; pii
= pii
->pii_next
) {
2693 pii
->pii_fd_snxt_basetime
= pii
->pii_snxt_basetime
;
2698 * Is the address one of our own addresses? Unfortunately,
2699 * we cannot check our phyint tables to determine if the address
2700 * is our own. This is because, we don't track interfaces that
2701 * are not part of any group. We have to either use a 'bind' or
2702 * get the complete list of all interfaces using SIOCGLIFCONF,
2703 * to do this check. We could also use SIOCTMYADDR.
2704 * Bind fails for the local zone address, so we might include local zone
2705 * address as target address. If local zone address is a target address
2706 * and it is up, it is not possible to detect the interface failure.
2707 * SIOCTMYADDR also doesn't consider local zone address as own address.
2708 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
2709 * are stored in `localaddrs'
2712 own_address(struct in6_addr addr
)
2715 struct sockaddr_storage ss
;
2716 int af
= IN6_IS_ADDR_V4MAPPED(&addr
) ? AF_INET
: AF_INET6
;
2718 addr2storage(af
, &addr
, &ss
);
2719 for (addrp
= localaddrs
; addrp
!= NULL
; addrp
= addrp
->al_next
) {
2720 if (sockaddrcmp(&ss
, &addrp
->al_addr
))
2729 return (NSEC2MSEC(ns
));
2733 tv2ns(struct timeval
*tvp
)
2735 return (tvp
->tv_sec
* NANOSEC
+ tvp
->tv_usec
* 1000);