dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / usr / src / cmd / cmd-inet / usr.lib / in.mpathd / mpd_probe.c
blobdf22d95ef6ad27d7e863a87caf1c8a4bee0e3099
1 /*
2 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
3 * Use is subject to license terms.
4 */
6 /*
7 * Copyright (c) 1987 Regents of the University of California.
8 * All rights reserved.
10 * Redistribution and use in source and binary forms are permitted
11 * provided that the above copyright notice and this paragraph are
12 * duplicated in all such forms and that any documentation,
13 * advertising materials, and other materials related to such
14 * distribution and use acknowledge that the software was developed
15 * by the University of California, Berkeley. The name of the
16 * University may not be used to endorse or promote products derived
17 * from this software without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
20 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
23 #include "mpd_defs.h"
24 #include "mpd_tables.h"
/*
 * Probe types for probe().  The selected value is carried in the
 * pr_icmp_mtype field of the probe packet, and the receive paths use it
 * to decide which reply handler to call.
 */
#define	PROBE_UNI	0x1234		/* Unicast probe packet */
#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
#define	PROBE_RTT	0x9abc		/* RTT-only probe packet */

/* Number of milliseconds in a minute */
#define	MSEC_PERMIN	(60 * MILLISEC)
/*
 * Layout of a probe and of a probe response.  A probe is an ICMP Echo
 * request and a response is an ICMP Echo reply; the same layout is used
 * for both IPv4 and IPv6.
 */
struct pr_icmp
{
	uint8_t  pr_icmp_type;		/* ICMP type */
	uint8_t  pr_icmp_code;		/* ICMP code */
	uint16_t pr_icmp_cksum;		/* ICMP checksum */
	uint16_t pr_icmp_id;		/* identification */
	uint16_t pr_icmp_seq;		/* sequence number */
	uint64_t pr_icmp_timestamp;	/* time stamp, in ns */
	uint32_t pr_icmp_mtype;		/* message type (PROBE_*) */
};
50 static struct in6_addr all_nodes_mcast_v6 = { { { 0xff, 0x2, 0x0, 0x0,
51 0x0, 0x0, 0x0, 0x0,
52 0x0, 0x0, 0x0, 0x0,
53 0x0, 0x0, 0x0, 0x1 } } };
55 static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
57 static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */
59 static void *find_ancillary(struct msghdr *msg, int cmsg_level,
60 int cmsg_type);
61 static void pi_set_crtt(struct target *tg, int64_t m,
62 boolean_t is_probe_uni);
63 static void incoming_echo_reply(struct phyint_instance *pii,
64 struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
65 static void incoming_rtt_reply(struct phyint_instance *pii,
66 struct pr_icmp *reply, struct in6_addr fromaddr);
67 static void incoming_mcast_reply(struct phyint_instance *pii,
68 struct pr_icmp *reply, struct in6_addr fromaddr);
70 static boolean_t check_pg_crtt_improved(struct phyint_group *pg);
71 static boolean_t check_pii_crtt_improved(struct phyint_instance *pii);
72 static boolean_t check_exception_target(struct phyint_instance *pii,
73 struct target *target);
74 static void probe_fail_info(struct phyint_instance *pii,
75 struct target *cur_tg, struct probe_fail_count *pfinfo);
76 static void probe_success_info(struct phyint_instance *pii,
77 struct target *cur_tg, struct probe_success_count *psinfo);
78 static boolean_t phyint_repaired(struct phyint *pi);
80 static boolean_t highest_ack_tg(uint16_t seq, struct target *tg);
81 static int in_cksum(ushort_t *addr, int len);
82 static void reset_snxt_basetimes(void);
83 static int ns2ms(int64_t ns);
84 static int64_t tv2ns(struct timeval *);
87 * CRTT - Conservative Round Trip Time Estimate
88 * Probe success - A matching probe reply received before CRTT ms has elapsed
89 * after sending the probe.
90 * Probe failure - No probe reply received and more than CRTT ms has elapsed
91 * after sending the probe.
93 * TLS - Time last success. Most recent probe ack received at this time.
94 * TFF - Time first fail. The time of the earliest probe failure in
95 * a consecutive series of probe failures.
96 * NUM_PROBE_REPAIRS - Number of consecutive successful probes required
97 * before declaring phyint repair.
98 * NUM_PROBE_FAILS - Number of consecutive probe failures required to
99 * declare a phyint failure.
101 * Phyint state diagram
103 * The state of a phyint that is capable of being probed, is completely
104 * specified by the 3-tuple <pi_state, pg_state, I>.
106 * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
107 * IFF_OFFLINE is set. If the phyint is also configured with a test address
108 * (the common case) and probe targets, then a phyint must also successfully
109 * be able to send and receive probes in order to remain in the PI_RUNNING
110 * state (otherwise, it transitions to PI_FAILED).
112 * Further, if a PI_RUNNING phyint is configured with a test address but is
113 * unable to find any probe targets, it will transition to the PI_NOTARGETS
114 * state, which indicates that the link is apparently functional but that
115 * in.mpathd is unable to send probes to verify functionality (in this case,
116 * in.mpathd makes the optimistic assumption that the interface is working
117 * correctly and thus does not mark the interface FAILED, but reports it as
118 * IPMP_IF_UNKNOWN through the async events and query interfaces).
120 * At any point, a phyint may be administratively marked offline via if_mpadm.
121 * In this case, the interface always transitions to PI_OFFLINE, regardless
122 * of its previous state. When the interface is later brought back online,
123 * in.mpathd acts as if the interface is new (and thus it transitions to
124 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
125 * its probes, if probes are sent).
127 * pi_state - PI_RUNNING or PI_FAILED
128 * PI_RUNNING: The failure detection logic says the phyint is good.
129 * PI_FAILED: The failure detection logic says the phyint has failed.
131 * pg_state - PG_OK, PG_DEGRADED, or PG_FAILED.
132 * PG_OK: All interfaces in the group are OK.
133 * PG_DEGRADED: Some interfaces in the group are unusable.
134 * PG_FAILED: All interfaces in the group are unusable.
136 * In the case of router targets, we assume that the current list of
137 * targets obtained from the routing table is still valid, so the
138 * phyint state is PI_FAILED. In the case of host targets, we delete the
139 * list of targets and multicast to the all-hosts address to reconstruct
140 * the target list, so the phyints are in the PI_NOTARGETS state.
142 * I - value of (pi_flags & IFF_INACTIVE)
143 * IFF_INACTIVE: This phyint will not send or receive packets.
144 * Usually, inactive is tied to standby interfaces that are not yet
145 * needed (e.g., no non-standby interfaces in the group have failed).
146 * When failback has been disabled (FAILBACK=no configured), phyint can
147 * also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
148 * subsequently recovers after a failure.
150 * Not all 9 possible combinations of the above 3-tuple are possible.
152 * I is tracked by IP. pi_state is tracked by mpathd.
154 * pi_state state machine
155 * ---------------------------------------------------------------------------
156 * Event State New State
157 * Action:
158 * ---------------------------------------------------------------------------
159 * IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
160 * detection : set IFF_FAILED on this phyint
162 * IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
163 * detection : set IFF_FAILED on this phyint
165 * IP interface repair (PI_FAILED, I == 0, FAILBACK=yes)
166 * detection -> (PI_RUNNING, I == 0)
167 * : clear IFF_FAILED on this phyint
169 * IP interface repair (PI_FAILED, I == 0, FAILBACK=no)
170 * detection -> (PI_RUNNING, I == 1)
171 * : clear IFF_FAILED on this phyint
172 * : if failback is disabled set I == 1
174 * Group failure (perform on all phyints in the group)
175 * detection PI_RUNNING PI_FAILED
176 * (Router targets) : set IFF_FAILED
178 * Group failure (perform on all phyints in the group)
179 * detection PI_RUNNING PI_NOTARGETS
180 * (Host targets) : set IFF_FAILED
181 * : delete the target list on all phyints
182 * ---------------------------------------------------------------------------
185 struct probes_missed probes_missed;
188 * Compose and transmit an ICMP ECHO REQUEST packet. The IP header
189 * will be added on by the kernel. The id field identifies this phyint.
190 * and the sequence number is an increasing (modulo 2^^16) integer. The data
191 * portion holds the time value when the packet is sent. On echo this is
192 * extracted to compute the round-trip time. Three different types of
193 * probe packets are used.
195 * PROBE_UNI: This type is used to do failure detection / failure recovery
196 * and RTT calculation. PROBE_UNI probes are spaced apart in time,
197 * not less than the current CRTT. pii_probes[] stores data
198 * about these probes. These packets consume sequence number space.
200 * PROBE_RTT: This type is used to make only rtt measurements. Normally these
201 * are not used. Under heavy network load, the rtt may go up very high,
202 * due to a spike, or may appear to go high, due to extreme scheduling
203 * delays. Once the network stress is removed, mpathd takes long time to
204 * recover, because the probe_interval is already high, and it takes
205 * a long time to send out sufficient number of probes to bring down the
206 * rtt. To avoid this problem, PROBE_RTT probes are sent out every
207 * user_probe_interval ms. and will cause only rtt updates. These packets
208 * do not consume sequence number space nor is information about these
209 * packets stored in the pii_probes[]
211 * PROBE_MULTI: This type is only used to construct a list of targets, when
212 * no targets are known. The packet is multicast to the all hosts addr.
214 static void
215 probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime)
217 hrtime_t sent_hrtime;
218 struct timeval sent_tv;
219 struct pr_icmp probe_pkt; /* Probe packet */
220 struct sockaddr_storage targ; /* target address */
221 uint_t targaddrlen; /* targed address length */
222 int pr_ndx; /* probe index in pii->pii_probes[] */
223 boolean_t sent = _B_FALSE;
224 int rval;
226 if (debug & D_TARGET) {
227 logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af),
228 pii->pii_name, probe_type, start_hrtime);
231 assert(pii->pii_probe_sock != -1);
232 assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
233 probe_type == PROBE_RTT);
235 probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
236 ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
237 probe_pkt.pr_icmp_code = 0;
238 probe_pkt.pr_icmp_cksum = 0;
239 probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
242 * Since there is no need to do arithmetic on the icmpid,
243 * (only equality check is done) pii_icmpid is stored in
244 * network byte order at initialization itself.
246 probe_pkt.pr_icmp_id = pii->pii_icmpid;
247 probe_pkt.pr_icmp_timestamp = htonll(start_hrtime);
248 probe_pkt.pr_icmp_mtype = htonl(probe_type);
251 * If probe_type is PROBE_MULTI, this packet will be multicast to
252 * the all hosts address. Otherwise it is unicast to the next target.
254 assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
255 pii->pii_rtt_target_next != NULL));
257 bzero(&targ, sizeof (targ));
258 targ.ss_family = pii->pii_af;
260 if (pii->pii_af == AF_INET6) {
261 struct in6_addr *addr6;
263 addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr;
264 targaddrlen = sizeof (struct sockaddr_in6);
265 if (probe_type == PROBE_MULTI) {
266 *addr6 = all_nodes_mcast_v6;
267 } else if (probe_type == PROBE_UNI) {
268 *addr6 = pii->pii_target_next->tg_address;
269 } else { /* type is PROBE_RTT */
270 *addr6 = pii->pii_rtt_target_next->tg_address;
272 } else {
273 struct in_addr *addr4;
275 addr4 = &((struct sockaddr_in *)&targ)->sin_addr;
276 targaddrlen = sizeof (struct sockaddr_in);
277 if (probe_type == PROBE_MULTI) {
278 *addr4 = all_nodes_mcast_v4;
279 } else if (probe_type == PROBE_UNI) {
280 IN6_V4MAPPED_TO_INADDR(
281 &pii->pii_target_next->tg_address, addr4);
282 } else { /* type is PROBE_RTT */
283 IN6_V4MAPPED_TO_INADDR(
284 &pii->pii_rtt_target_next->tg_address, addr4);
288 * Compute the IPv4 icmp checksum. Does not cover the IP header.
290 probe_pkt.pr_icmp_cksum =
291 in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
295 * Use the current time as the time we sent. Not atomic, but the best
296 * we can do from here.
298 sent_hrtime = gethrtime();
299 (void) gettimeofday(&sent_tv, NULL);
300 rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0,
301 (struct sockaddr *)&targ, targaddrlen);
303 * If the send would block, this may either be transient or a hang in a
304 * lower layer. We pretend the probe was actually sent, the daemon will
305 * not see a reply to the probe and will fail the interface if normal
306 * failure detection criteria are met.
308 if (rval == sizeof (probe_pkt) ||
309 (rval == -1 && errno == EWOULDBLOCK)) {
310 sent = _B_TRUE;
311 } else {
312 logperror_pii(pii, "probe: probe sendto");
316 * If this is a PROBE_UNI probe packet being unicast to a target, then
317 * update our tables. We will need this info in processing the probe
318 * response. PROBE_MULTI and PROBE_RTT packets are not used for
319 * the purpose of failure or recovery detection. PROBE_MULTI packets
320 * are only used to construct a list of targets. PROBE_RTT packets are
321 * used only for updating the rtt and not for failure detection.
323 if (probe_type == PROBE_UNI && sent) {
324 pr_ndx = pii->pii_probe_next;
325 assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
327 /* Collect statistics, before we reuse the last slot. */
328 if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
329 pii->pii_cum_stats.lost++;
330 else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
331 pii->pii_cum_stats.acked++;
332 pii->pii_cum_stats.sent++;
334 pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt;
335 pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv;
336 pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime;
337 pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime;
338 pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
339 probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED);
341 pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
342 pii->pii_target_next = target_next(pii->pii_target_next);
343 assert(pii->pii_target_next != NULL);
345 * If we have a single variable to denote the next target to
346 * probe for both rtt probes and failure detection probes, we
347 * could end up with a situation where the failure detection
348 * probe targets become disjoint from the rtt probe targets.
349 * Eg. if 2 targets and the actual fdt is double the user
350 * specified fdt. So we have 2 variables. In this scheme
351 * we also reset pii_rtt_target_next for every fdt probe,
352 * though that may not be necessary.
354 pii->pii_rtt_target_next = pii->pii_target_next;
355 pii->pii_snxt++;
356 } else if (probe_type == PROBE_RTT) {
357 pii->pii_rtt_target_next =
358 target_next(pii->pii_rtt_target_next);
359 assert(pii->pii_rtt_target_next != NULL);
364 * Incoming IPv4 data from wire, is received here. Called from main.
366 void
367 in_data(struct phyint_instance *pii)
369 struct sockaddr_in from;
370 struct in6_addr fromaddr;
371 static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
372 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
373 struct ip *ip;
374 int iphlen;
375 int len;
376 char abuf[INET_ADDRSTRLEN];
377 struct msghdr msg;
378 struct iovec iov;
379 struct pr_icmp *reply;
380 struct timeval *recv_tvp;
382 if (debug & D_PROBE) {
383 logdebug("in_data(%s %s)\n",
384 AF_STR(pii->pii_af), pii->pii_name);
387 iov.iov_base = (char *)in_packet;
388 iov.iov_len = sizeof (in_packet);
389 msg.msg_iov = &iov;
390 msg.msg_iovlen = 1;
391 msg.msg_name = (struct sockaddr *)&from;
392 msg.msg_namelen = sizeof (from);
393 msg.msg_control = ancillary_data;
394 msg.msg_controllen = sizeof (ancillary_data);
397 * Poll has already told us that a message is waiting,
398 * on this socket. Read it now. We should not block.
400 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
401 logperror_pii(pii, "in_data: recvmsg");
402 return;
406 * If the datalink has indicated the link is down, don't go
407 * any further.
409 if (LINK_DOWN(pii->pii_phyint))
410 return;
412 /* Get the printable address for error reporting */
413 (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
415 /* Ignore packets > 64k or control buffers that don't fit */
416 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
417 if (debug & D_PKTBAD) {
418 logdebug("Truncated message: msg_flags 0x%x from %s\n",
419 msg.msg_flags, abuf);
421 return;
424 /* Make sure packet contains at least minimum ICMP header */
425 ip = (struct ip *)in_packet;
426 iphlen = ip->ip_hl << 2;
427 if (len < iphlen + ICMP_MINLEN) {
428 if (debug & D_PKTBAD) {
429 logdebug("in_data: packet too short (%d bytes)"
430 " from %s\n", len, abuf);
432 return;
436 * Subtract the IP hdr length, 'len' will be length of the probe
437 * reply, starting from the icmp hdr.
439 len -= iphlen;
440 /* LINTED */
441 reply = (struct pr_icmp *)((char *)in_packet + iphlen);
443 /* Probe replies are icmp echo replies. Ignore anything else */
444 if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
445 return;
448 * The icmp id should match what we sent, which is stored
449 * in pi_icmpid. The icmp code for reply must be 0.
450 * The reply content must be a struct pr_icmp
452 if (reply->pr_icmp_id != pii->pii_icmpid) {
453 /* Not in response to our probe */
454 return;
457 if (reply->pr_icmp_code != 0) {
458 logtrace("probe reply code %d from %s on %s\n",
459 reply->pr_icmp_code, abuf, pii->pii_name);
460 return;
463 if (len < sizeof (struct pr_icmp)) {
464 logtrace("probe reply too short: %d bytes from %s on %s\n",
465 len, abuf, pii->pii_name);
466 return;
469 recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
470 if (recv_tvp == NULL) {
471 logtrace("message without timestamp from %s on %s\n",
472 abuf, pii->pii_name);
473 return;
476 IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
477 if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
478 /* Unicast probe reply */
479 incoming_echo_reply(pii, reply, fromaddr, recv_tvp);
480 else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
481 /* Multicast reply */
482 incoming_mcast_reply(pii, reply, fromaddr);
483 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
484 incoming_rtt_reply(pii, reply, fromaddr);
485 } else {
486 /* Probably not in response to our probe */
487 logtrace("probe reply type: %d from %s on %s\n",
488 reply->pr_icmp_mtype, abuf, pii->pii_name);
489 return;
494 * Incoming IPv6 data from wire is received here. Called from main.
496 void
497 in6_data(struct phyint_instance *pii)
499 struct sockaddr_in6 from;
500 static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
501 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
502 int len;
503 char abuf[INET6_ADDRSTRLEN];
504 struct msghdr msg;
505 struct iovec iov;
506 void *opt;
507 struct pr_icmp *reply;
508 struct timeval *recv_tvp;
510 if (debug & D_PROBE) {
511 logdebug("in6_data(%s %s)\n",
512 AF_STR(pii->pii_af), pii->pii_name);
515 iov.iov_base = (char *)in_packet;
516 iov.iov_len = sizeof (in_packet);
517 msg.msg_iov = &iov;
518 msg.msg_iovlen = 1;
519 msg.msg_name = (struct sockaddr *)&from;
520 msg.msg_namelen = sizeof (from);
521 msg.msg_control = ancillary_data;
522 msg.msg_controllen = sizeof (ancillary_data);
524 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
525 logperror_pii(pii, "in6_data: recvmsg");
526 return;
530 * If the datalink has indicated that the link is down, don't go
531 * any further.
533 if (LINK_DOWN(pii->pii_phyint))
534 return;
536 /* Get the printable address for error reporting */
537 (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
538 if (len < ICMP_MINLEN) {
539 if (debug & D_PKTBAD) {
540 logdebug("Truncated message: msg_flags 0x%x from %s\n",
541 msg.msg_flags, abuf);
543 return;
545 /* Ignore packets > 64k or control buffers that don't fit */
546 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
547 if (debug & D_PKTBAD) {
548 logdebug("Truncated message: msg_flags 0x%x from %s\n",
549 msg.msg_flags, abuf);
551 return;
554 reply = (struct pr_icmp *)in_packet;
555 if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
556 return;
558 if (reply->pr_icmp_id != pii->pii_icmpid) {
559 /* Not in response to our probe */
560 return;
564 * The kernel has already verified the the ICMP checksum.
566 if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
567 logtrace("ICMPv6 echo reply source address not linklocal from "
568 "%s on %s\n", abuf, pii->pii_name);
569 return;
571 opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR);
572 if (opt != NULL) {
573 /* Can't allow routing headers in probe replies */
574 logtrace("message with routing header from %s on %s\n",
575 abuf, pii->pii_name);
576 return;
579 if (reply->pr_icmp_code != 0) {
580 logtrace("probe reply code: %d from %s on %s\n",
581 reply->pr_icmp_code, abuf, pii->pii_name);
582 return;
584 if (len < (sizeof (struct pr_icmp))) {
585 logtrace("probe reply too short: %d bytes from %s on %s\n",
586 len, abuf, pii->pii_name);
587 return;
590 recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
591 if (recv_tvp == NULL) {
592 logtrace("message without timestamp from %s on %s\n",
593 abuf, pii->pii_name);
594 return;
597 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
598 incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp);
599 } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
600 incoming_mcast_reply(pii, reply, from.sin6_addr);
601 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
602 incoming_rtt_reply(pii, reply, from.sin6_addr);
603 } else {
604 /* Probably not in response to our probe */
605 logtrace("probe reply type: %d from %s on %s\n",
606 reply->pr_icmp_mtype, abuf, pii->pii_name);
611 * Process the incoming rtt reply, in response to our rtt probe.
612 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
613 * have any stored information about the probe we sent. So we don't log
614 * any errors if we receive bad replies.
616 static void
617 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
618 struct in6_addr fromaddr)
620 int64_t m; /* rtt measurement in ns */
621 char abuf[INET6_ADDRSTRLEN];
622 struct target *target;
623 struct phyint_group *pg;
625 /* Get the printable address for error reporting */
626 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
628 if (debug & D_PROBE) {
629 logdebug("incoming_rtt_reply: %s %s %s\n",
630 AF_STR(pii->pii_af), pii->pii_name, abuf);
633 /* Do we know this target ? */
634 target = target_lookup(pii, fromaddr);
635 if (target == NULL)
636 return;
638 m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp));
639 /* Invalid rtt. It has wrapped around */
640 if (m < 0)
641 return;
644 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
645 * The initial few responses after the interface is repaired may
646 * contain high rtt's because they could have been queued up waiting
647 * for ARP/NDP resolution on a failed interface.
649 pg = pii->pii_phyint->pi_group;
650 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
651 return;
654 * Update rtt only if the new rtt is lower than the current rtt.
655 * (specified by the 3rd parameter to pi_set_crtt).
656 * If a spike has caused the current probe_interval to be >
657 * user_probe_interval, then this mechanism is used to bring down
658 * the rtt rapidly once the network stress is removed.
659 * If the new rtt is higher than the current rtt, we don't want to
660 * update the rtt. We are having more than 1 outstanding probe and
661 * the increase in rtt we are seeing is being unnecessarily weighted
662 * many times. The regular rtt update will be handled by
663 * incoming_echo_reply() and will take care of any rtt increase.
665 pi_set_crtt(target, m, _B_FALSE);
666 if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
667 (user_failure_detection_time < pg->pg_fdt) &&
668 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
670 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
671 * investigate if we can improve the failure detection time to
672 * meet whatever the user specified.
674 if (check_pg_crtt_improved(pg)) {
675 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
676 user_failure_detection_time);
677 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
678 if (pii->pii_phyint->pi_group != phyint_anongroup) {
679 logerr("Improved failure detection time %d ms "
680 "on (%s %s) for group \"%s\"\n",
681 pg->pg_fdt, AF_STR(pii->pii_af),
682 pii->pii_name,
683 pii->pii_phyint->pi_group->pg_name);
685 if (user_failure_detection_time == pg->pg_fdt) {
686 /* Avoid any truncation or rounding errors */
687 pg->pg_probeint = user_probe_interval;
689 * No more rtt probes will be sent. The actual
690 * fdt has dropped to the user specified value.
691 * pii_fd_snxt_basetime and pii_snxt_basetime
692 * will be in sync henceforth.
694 reset_snxt_basetimes();
701 * Process the incoming echo reply, in response to our unicast probe.
702 * Common for both IPv4 and IPv6
704 static void
705 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
706 struct in6_addr fromaddr, struct timeval *recv_tvp)
708 int64_t m; /* rtt measurement in ns */
709 hrtime_t cur_hrtime; /* in ns from some arbitrary point */
710 char abuf[INET6_ADDRSTRLEN];
711 int pr_ndx;
712 struct target *target;
713 boolean_t exception;
714 uint64_t pr_icmp_timestamp;
715 uint16_t pr_icmp_seq;
716 struct probe_stats *pr_statp;
717 struct phyint_group *pg = pii->pii_phyint->pi_group;
719 /* Get the printable address for error reporting */
720 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
722 if (debug & D_PROBE) {
723 logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
724 AF_STR(pii->pii_af), pii->pii_name, abuf,
725 ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
728 pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
729 pr_icmp_seq = ntohs(reply->pr_icmp_seq);
731 /* Reject out of window probe replies */
732 if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
733 SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
734 logtrace("out of window probe seq %u snxt %u on %s from %s\n",
735 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
736 pii->pii_cum_stats.unknown++;
737 return;
740 cur_hrtime = gethrtime();
741 m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
742 if (m < 0) {
744 * This is a ridiculously high value of rtt. rtt has wrapped
745 * around. Log a message, and ignore the rtt.
747 logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
748 "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
752 * Get the probe index pr_ndx corresponding to the received icmp seq.
753 * number in our pii->pii_probes[] array. The icmp sequence number
754 * pii_snxt corresponds to the probe index pii->pii_probe_next
756 pr_ndx = MOD_SUB(pii->pii_probe_next,
757 (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
759 assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
761 target = pii->pii_probes[pr_ndx].pr_target;
764 * Perform sanity checks, whether this probe reply that we
765 * have received is genuine
767 if (target != NULL) {
769 * Compare the src. addr of the received ICMP or ICMPv6
770 * probe reply with the target address in our tables.
772 if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
774 * We don't have any record of having sent a probe to
775 * this target. This is a fake probe reply. Log an error
777 logtrace("probe status %d Fake probe reply seq %u "
778 "snxt %u on %s from %s\n",
779 pii->pii_probes[pr_ndx].pr_status,
780 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
781 pii->pii_cum_stats.unknown++;
782 return;
783 } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
785 * The address matches, but our tables indicate that
786 * this probe reply has been acked already. So this
787 * is a duplicate probe reply. Log an error
789 logtrace("probe status %d Duplicate probe reply seq %u "
790 "snxt %u on %s from %s\n",
791 pii->pii_probes[pr_ndx].pr_status,
792 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
793 pii->pii_cum_stats.unknown++;
794 return;
796 } else {
798 * Target must not be NULL in the PR_UNACKED state
800 assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
801 if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
803 * The probe stats slot is unused. So we didn't
804 * send out any probe to this target. This is a fake.
805 * Log an error.
807 logtrace("probe status %d Fake probe reply seq %u "
808 "snxt %u on %s from %s\n",
809 pii->pii_probes[pr_ndx].pr_status,
810 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
812 pii->pii_cum_stats.unknown++;
813 return;
817 * If the rtt does not appear to be right, don't update the
818 * rtt stats. This can happen if the system dropped into the
819 * debugger, or the system was hung or too busy for a
820 * substantial time that we didn't get a chance to run.
822 if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
824 * If the probe corresponding to this received response
825 * was truly sent 'm' ns. ago, then this response must
826 * have been rejected by the sequence number checks. The
827 * fact that it has passed the sequence number checks
828 * means that the measured rtt is wrong. We were probably
829 * scheduled long after the packet was received.
831 goto out;
835 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
836 * The initial few responses after the interface is repaired may
837 * contain high rtt's because they could have been queued up waiting
838 * for ARP/NDP resolution on a failed interface.
840 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
841 goto out;
844 * Don't update the Conservative Round Trip Time estimate for this
845 * (phyint, target) pair if this is not the highest ack seq seen
846 * thus far on this target.
848 if (!highest_ack_tg(pr_icmp_seq, target))
849 goto out;
852 * Always update the rtt. This is a failure detection probe
853 * and we want to measure both increase / decrease in rtt.
855 pi_set_crtt(target, m, _B_TRUE);
858 * If the crtt exceeds the average time between probes,
859 * investigate if this slow target is an exception. If so we
860 * can avoid this target and still meet the failure detection
861 * time. Otherwise we can't meet the failure detection time.
863 if (target->tg_crtt > pg->pg_probeint) {
864 exception = check_exception_target(pii, target);
865 if (exception) {
867 * This target is exceptionally slow. Don't use it
868 * for future probes. check_exception_target() has
869 * made sure that we have at least MIN_PROBE_TARGETS
870 * other active targets
872 if (pii->pii_targets_are_routers) {
874 * This is a slow router, mark it as slow
875 * and don't use it for further probes. We
876 * don't delete it, since it will be populated
877 * again when we do a router scan. Hence we
878 * need to maintain extra state (unlike the
879 * host case below). Mark it as TG_SLOW.
881 if (target->tg_status == TG_ACTIVE)
882 pii->pii_ntargets--;
883 target->tg_status = TG_SLOW;
884 target->tg_latime = gethrtime();
885 target->tg_rtt_sa = -1;
886 target->tg_crtt = 0;
887 target->tg_rtt_sd = 0;
888 if (pii->pii_target_next == target) {
889 pii->pii_target_next =
890 target_next(target);
892 } else {
894 * the slow target is not a router, we can
895 * just delete it. Send an icmp multicast and
896 * pick the fastest responder that is not
897 * already an active target. target_delete()
898 * adjusts pii->pii_target_next
900 target_delete(target);
901 probe(pii, PROBE_MULTI, cur_hrtime);
903 } else {
905 * We can't meet the failure detection time.
906 * Log a message, and update the detection time to
907 * whatever we can achieve.
909 pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
910 pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
911 last_fdt_bumpup_time = gethrtime();
912 if (pg != phyint_anongroup) {
913 logtrace("Cannot meet requested failure"
914 " detection time of %d ms on (%s %s) new"
915 " failure detection time for group \"%s\""
916 " is %d ms\n", user_failure_detection_time,
917 AF_STR(pii->pii_af), pii->pii_name,
918 pg->pg_name, pg->pg_fdt);
921 } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
922 (user_failure_detection_time < pg->pg_fdt) &&
923 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
925 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
926 * investigate if we can improve the failure detection time to
927 * meet whatever the user specified.
929 if (check_pg_crtt_improved(pg)) {
930 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
931 user_failure_detection_time);
932 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
933 if (pg != phyint_anongroup) {
934 logtrace("Improved failure detection time %d ms"
935 " on (%s %s) for group \"%s\"\n",
936 pg->pg_fdt, AF_STR(pii->pii_af),
937 pii->pii_name, pg->pg_name);
939 if (user_failure_detection_time == pg->pg_fdt) {
940 /* Avoid any truncation or rounding errors */
941 pg->pg_probeint = user_probe_interval;
943 * No more rtt probes will be sent. The actual
944 * fdt has dropped to the user specified value.
945 * pii_fd_snxt_basetime and pii_snxt_basetime
946 * will be in sync henceforth.
948 reset_snxt_basetimes();
952 out:
953 pr_statp = &pii->pii_probes[pr_ndx];
954 pr_statp->pr_hrtime_ackproc = cur_hrtime;
955 pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
956 (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));
958 probe_chstate(pr_statp, pii, PR_ACKED);
961 * Update pii->pii_rack, i.e. the sequence number of the last received
962 * probe response, based on the echo reply we have received now, if
963 * either of the following conditions are satisfied.
964 * a. pii_rack is outside the current receive window of
965 * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
966 * This means we have not received probe responses for a
967 * long time, and the sequence number has wrapped around.
968 * b. pii_rack is within the current receive window and this echo
969 * reply corresponds to the highest sequence number we have seen
970 * so far.
972 if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
973 SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
974 SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
975 pii->pii_rack = pr_icmp_seq;
980 * Returns true if seq is the highest unacknowledged seq for target tg
981 * else returns false
983 static boolean_t
984 highest_ack_tg(uint16_t seq, struct target *tg)
986 struct phyint_instance *pii;
987 int pr_ndx;
988 uint16_t pr_seq;
990 pii = tg->tg_phyint_inst;
993 * Get the seq number of the most recent probe sent so far,
994 * and also get the corresponding probe index in the probe stats
995 * array.
997 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
998 pr_seq = pii->pii_snxt;
999 pr_seq--;
1002 * Start from the most recent probe and walk back, trying to find
1003 * an acked probe corresponding to target tg.
1005 for (; pr_ndx != pii->pii_probe_next;
1006 pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
1007 if (pii->pii_probes[pr_ndx].pr_target == tg &&
1008 pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
1009 if (SEQ_GT(pr_seq, seq))
1010 return (_B_FALSE);
1013 return (_B_TRUE);
1017 * Check whether the crtt for the group has improved by a factor of
1018 * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure
1019 * detection time flapping in the face of small crtt changes.
1021 static boolean_t
1022 check_pg_crtt_improved(struct phyint_group *pg)
1024 struct phyint *pi;
1026 if (debug & D_PROBE)
1027 logdebug("check_pg_crtt_improved()\n");
1030 * The crtt for the group is only improved if each phyint_instance
1031 * for both ipv4 and ipv6 is improved.
1033 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1034 if (!check_pii_crtt_improved(pi->pi_v4) ||
1035 !check_pii_crtt_improved(pi->pi_v6))
1036 return (_B_FALSE);
1039 return (_B_TRUE);
1043 * Check whether the crtt has improved substantially on this phyint_instance.
1044 * Returns _B_TRUE if there's no crtt information available, because pii
1045 * is NULL or the phyint_instance is not capable of probing.
1047 boolean_t
1048 check_pii_crtt_improved(struct phyint_instance *pii) {
1049 struct target *tg;
1051 if (pii == NULL)
1052 return (_B_TRUE);
1054 if (!PROBE_CAPABLE(pii) ||
1055 pii->pii_phyint->pi_state == PI_FAILED)
1056 return (_B_TRUE);
1058 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1059 if (tg->tg_status != TG_ACTIVE)
1060 continue;
1061 if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
1062 LOWER_FDT_TRIGGER)) {
1063 return (_B_FALSE);
1067 return (_B_TRUE);
1071 * This target responds very slowly to probes. The target's crtt exceeds
1072 * the probe interval of its group. Compare against other targets
1073 * and determine if this target is an exception, if so return true, else false
1075 static boolean_t
1076 check_exception_target(struct phyint_instance *pii, struct target *target)
1078 struct target *tg;
1079 char abuf[INET6_ADDRSTRLEN];
1081 if (debug & D_PROBE) {
1082 logdebug("check_exception_target(%s %s target %s)\n",
1083 AF_STR(pii->pii_af), pii->pii_name,
1084 pr_addr(pii->pii_af, target->tg_address,
1085 abuf, sizeof (abuf)));
1089 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
1090 * to make a good judgement. Otherwise don't drop this target.
1092 if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1)
1093 return (_B_FALSE);
1096 * Determine whether only this particular target is slow.
1097 * We know that this target's crtt exceeds the group's probe interval.
1098 * If all other active targets have a
1099 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
1100 * then this target is considered slow.
1102 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1103 if (tg != target && tg->tg_status == TG_ACTIVE) {
1104 if (tg->tg_crtt >
1105 pii->pii_phyint->pi_group->pg_probeint /
1106 EXCEPTION_FACTOR) {
1107 return (_B_FALSE);
1112 return (_B_TRUE);
1116 * Update the target list. The icmp all hosts multicast has given us
1117 * some host to which we can send probes. If we already have sufficient
1118 * targets, discard it.
1120 static void
1121 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
1122 struct in6_addr fromaddr)
1123 /* ARGSUSED */
1125 int af;
1126 char abuf[INET6_ADDRSTRLEN];
1127 struct phyint *pi;
1129 if (debug & D_PROBE) {
1130 logdebug("incoming_mcast_reply(%s %s %s)\n",
1131 AF_STR(pii->pii_af), pii->pii_name,
1132 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
1136 * Using host targets is a fallback mechanism. If we have
1137 * found a router, don't add this host target. If we already
1138 * know MAX_PROBE_TARGETS, don't add another target.
1140 assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1141 if (pii->pii_targets != NULL) {
1142 if (pii->pii_targets_are_routers ||
1143 (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
1144 return;
1148 if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
1149 IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
1151 * Guard against response from 0.0.0.0
1152 * and ::. Log a trace message
1154 logtrace("probe response from %s on %s\n",
1155 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
1156 pii->pii_name);
1157 return;
1161 * This address is one of our own, so reject this address as a
1162 * valid probe target.
1164 af = pii->pii_af;
1165 if (own_address(fromaddr))
1166 return;
1169 * If the phyint is part a named group, then add the address to all
1170 * members of the group. Otherwise, add the address only to the
1171 * phyint itself, since other phyints in the anongroup may not be on
1172 * the same subnet.
1174 pi = pii->pii_phyint;
1175 if (pi->pi_group == phyint_anongroup) {
1176 target_add(pii, fromaddr, _B_FALSE);
1177 } else {
1178 pi = pi->pi_group->pg_phyint;
1179 for (; pi != NULL; pi = pi->pi_pgnext)
1180 target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
1185 * Compute CRTT given an existing scaled average, scaled deviation estimate
1186 * and a new rtt time. The formula is from Jacobson and Karels'
1187 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
1188 * are the same as those in Appendix A.2 of that paper.
1190 * m = new measurement
1191 * sa = scaled RTT average (8 * average estimates)
1192 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
1193 * crtt = Conservative round trip time. Used to determine whether probe
1194 * has timed out.
1196 * New scaled average and deviation are passed back via sap and svp
1198 static int64_t
1199 compute_crtt(int64_t *sap, int64_t *svp, int64_t m)
1201 int64_t sa = *sap;
1202 int64_t sv = *svp;
1203 int64_t crtt;
1204 int64_t saved_m = m;
1206 assert(*sap >= -1);
1207 assert(*svp >= 0);
1209 if (sa != -1) {
1211 * Update average estimator:
1212 * new rtt = old rtt + 1/8 Error
1213 * where Error = m - old rtt
1214 * i.e. 8 * new rtt = 8 * old rtt + Error
1215 * i.e. new sa = old sa + Error
1217 m -= sa >> 3; /* m is now Error in estimate. */
1218 if ((sa += m) < 0) {
1219 /* Don't allow the smoothed average to be negative. */
1220 sa = 0;
1224 * Update deviation estimator:
1225 * new mdev = old mdev + 1/4 (abs(Error) - old mdev)
1226 * i.e. 4 * new mdev = 4 * old mdev +
1227 * (abs(Error) - old mdev)
1228 * i.e. new sv = old sv + (abs(Error) - old mdev)
1230 if (m < 0)
1231 m = -m;
1232 m -= sv >> 2;
1233 sv += m;
1234 } else {
1235 /* Initialization. This is the first response received. */
1236 sa = (m << 3);
1237 sv = (m << 1);
1240 crtt = (sa >> 3) + sv;
1242 if (debug & D_PROBE) {
1243 logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> "
1244 "crtt = %lld\n", saved_m, sa, sv, crtt);
1247 *sap = sa;
1248 *svp = sv;
1251 * CRTT = average estimates + 4 * deviation estimates
1252 * = sa / 8 + sv
1254 return (crtt);
1257 static void
1258 pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni)
1260 struct phyint_instance *pii = tg->tg_phyint_inst;
1261 int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1262 int64_t sa = tg->tg_rtt_sa;
1263 int64_t sv = tg->tg_rtt_sd;
1264 int new_crtt;
1265 int i;
1267 if (debug & D_PROBE)
1268 logdebug("pi_set_crtt: target - m %lld\n", m);
1270 /* store the round trip time, in case we need to defer computation */
1271 tg->tg_deferred[tg->tg_num_deferred] = m;
1273 new_crtt = ns2ms(compute_crtt(&sa, &sv, m));
1276 * If this probe's round trip time would singlehandedly cause an
1277 * increase in the group's probe interval consider it suspect.
1279 if ((new_crtt > probe_interval) && is_probe_uni) {
1280 if (debug & D_PROBE) {
1281 logdebug("Received a suspect probe on %s, new_crtt ="
1282 " %d, probe_interval = %d, num_deferred = %d\n",
1283 pii->pii_probe_logint->li_name, new_crtt,
1284 probe_interval, tg->tg_num_deferred);
1288 * If we've deferred as many rtts as we plan on deferring, then
1289 * assume the link really did slow down and process all queued
1290 * rtts
1292 if (tg->tg_num_deferred == MAXDEFERREDRTT) {
1293 if (debug & D_PROBE) {
1294 logdebug("Received MAXDEFERREDRTT probes which "
1295 "would cause an increased probe_interval. "
1296 "Integrating queued rtt data points.\n");
1299 for (i = 0; i <= tg->tg_num_deferred; i++) {
1300 tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa,
1301 &tg->tg_rtt_sd, tg->tg_deferred[i]));
1304 tg->tg_num_deferred = 0;
1305 } else {
1306 tg->tg_num_deferred++;
1308 return;
1312 * If this is a normal probe, or an RTT probe that would lead to a
1313 * reduced CRTT, then update our CRTT data. Further, if this was
1314 * a normal probe, pitch any deferred probes since our probes are
1315 * again being answered within our CRTT estimates.
1317 if (is_probe_uni || new_crtt < tg->tg_crtt) {
1318 tg->tg_rtt_sa = sa;
1319 tg->tg_rtt_sd = sv;
1320 tg->tg_crtt = new_crtt;
1321 if (is_probe_uni)
1322 tg->tg_num_deferred = 0;
/*
 * Search the ancillary data of `msg' for the first control message whose
 * level and type equal `cmsg_level' and `cmsg_type'.
 *
 * Returns a pointer to that control message's data (CMSG_DATA), or NULL if
 * no control message matches.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_level == cmsg_level &&
		    hdr->cmsg_type == cmsg_type)
			return (CMSG_DATA(hdr));
		hdr = CMSG_NXTHDR(msg, hdr);
	}

	return (NULL);
}
1346 * Try to activate another INACTIVE interface in the same group as `pi'.
1347 * Prefer STANDBY INACTIVE to just INACTIVE.
1349 void
1350 phyint_activate_another(struct phyint *pi)
1352 struct phyint *pi2;
1353 struct phyint *inactivepi = NULL;
1355 if (pi->pi_group == phyint_anongroup)
1356 return;
1358 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1359 if (pi == pi2 || !phyint_is_functioning(pi2) ||
1360 !(pi2->pi_flags & IFF_INACTIVE))
1361 continue;
1363 inactivepi = pi2;
1364 if (pi2->pi_flags & IFF_STANDBY)
1365 break;
1368 if (inactivepi != NULL)
1369 (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE);
1373 * Transition a phyint to PI_RUNNING. The caller must ensure that the
1374 * transition is appropriate. Clears IFF_OFFLINE or IFF_FAILED if
1375 * appropriate. Also sets IFF_INACTIVE on this or other interfaces as
1376 * appropriate (see comment below). Finally, also updates the phyint's group
1377 * state to account for the change.
1379 void
1380 phyint_transition_to_running(struct phyint *pi)
1382 struct phyint *pi2;
1383 struct phyint *actstandbypi = NULL;
1384 uint_t nactive = 0, nnonstandby = 0;
1385 boolean_t onlining = (pi->pi_state == PI_OFFLINE);
1386 boolean_t initial = (pi->pi_state == PI_INIT);
1387 uint64_t set, clear;
1390 * The interface is running again, but should it or another interface
1391 * in the group end up INACTIVE? There are three cases:
1393 * 1. If it's a STANDBY interface, it should be end up INACTIVE if
1394 * the group is operating at capacity (i.e., there are at least as
1395 * many active interfaces as non-STANDBY interfaces in the group).
1396 * No other interfaces should be changed.
1398 * 2. If it's a non-STANDBY interface and we're onlining it or
1399 * FAILBACK is enabled, then it should *not* end up INACTIVE.
1400 * Further, if the group is above capacity as a result of this
1401 * interface, then an active STANDBY interface in the group should
1402 * end up INACTIVE.
1404 * 3. If it's a non-STANDBY interface, we're repairing it, and
1405 * FAILBACK is disabled, then it should end up INACTIVE *unless*
1406 * the group was failed (in which case we have no choice but to
1407 * use it). No other interfaces should be changed.
1409 if (pi->pi_group != phyint_anongroup) {
1410 pi2 = pi->pi_group->pg_phyint;
1411 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1412 if (!(pi2->pi_flags & IFF_STANDBY))
1413 nnonstandby++;
1415 if (phyint_is_functioning(pi2) &&
1416 !(pi2->pi_flags & IFF_INACTIVE)) {
1417 nactive++;
1418 if (pi2->pi_flags & IFF_STANDBY)
1419 actstandbypi = pi2;
1424 set = 0;
1425 clear = (onlining ? IFF_OFFLINE : IFF_FAILED);
1427 if (pi->pi_flags & IFF_STANDBY) { /* case 1 */
1428 if (nactive >= nnonstandby)
1429 set |= IFF_INACTIVE;
1430 else
1431 clear |= IFF_INACTIVE;
1432 } else if (onlining || failback_enabled) { /* case 2 */
1433 if (nactive >= nnonstandby && actstandbypi != NULL)
1434 (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0);
1435 } else if (!initial && !GROUP_FAILED(pi->pi_group)) { /* case 3 */
1436 set |= IFF_INACTIVE;
1438 (void) change_pif_flags(pi, set, clear);
1440 phyint_chstate(pi, PI_RUNNING);
1443 * Update the group state to account for the change.
1445 phyint_group_refresh_state(pi->pi_group);
1449 * Adjust IFF_INACTIVE on the provided `pi' to trend the group configuration
1450 * to have at least one active interface and as many active interfaces as
1451 * non-standby interfaces.
1453 void
1454 phyint_standby_refresh_inactive(struct phyint *pi)
1456 struct phyint *pi2;
1457 uint_t nactive = 0, nnonstandby = 0;
1460 * All phyints in the anonymous group are effectively in their own
1461 * group and thus active regardless of whether they're marked standby.
1463 if (pi->pi_group == phyint_anongroup) {
1464 (void) change_pif_flags(pi, 0, IFF_INACTIVE);
1465 return;
1469 * If the phyint isn't functioning we can't consider it.
1471 if (!phyint_is_functioning(pi))
1472 return;
1474 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1475 if (!(pi2->pi_flags & IFF_STANDBY))
1476 nnonstandby++;
1478 if (phyint_is_functioning(pi2) &&
1479 !(pi2->pi_flags & IFF_INACTIVE))
1480 nactive++;
1483 if (nactive == 0 || nactive < nnonstandby)
1484 (void) change_pif_flags(pi, 0, IFF_INACTIVE);
1485 else if (nactive > nnonstandby)
1486 (void) change_pif_flags(pi, IFF_INACTIVE, 0);
1490 * See if a previously failed interface has started working again.
1492 void
1493 phyint_check_for_repair(struct phyint *pi)
1495 if (!phyint_repaired(pi))
1496 return;
1498 if (pi->pi_group == phyint_anongroup) {
1499 logerr("IP interface repair detected on %s\n", pi->pi_name);
1500 } else {
1501 logerr("IP interface repair detected on %s of group %s\n",
1502 pi->pi_name, pi->pi_group->pg_name);
1506 * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet.
1507 * So just clear IFF_OFFLINE and defer phyint_transition_to_running()
1508 * until it is brought back online.
1510 if (pi->pi_state == PI_OFFLINE) {
1511 (void) change_pif_flags(pi, 0, IFF_FAILED);
1512 return;
1515 phyint_transition_to_running(pi); /* calls phyint_chstate() */
1519 * See if an interface has failed, or if the whole group of interfaces has
1520 * failed.
1522 static void
1523 phyint_inst_check_for_failure(struct phyint_instance *pii)
1525 struct phyint *pi = pii->pii_phyint;
1526 struct phyint *pi2;
1527 boolean_t was_active;
1529 switch (failure_state(pii)) {
1530 case PHYINT_FAILURE:
1531 was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
1533 (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
1534 if (pi->pi_group == phyint_anongroup) {
1535 logerr("IP interface failure detected on %s\n",
1536 pii->pii_name);
1537 } else {
1538 logerr("IP interface failure detected on %s of group"
1539 " %s\n", pii->pii_name, pi->pi_group->pg_name);
1543 * If the failed interface was active, activate another
1544 * INACTIVE interface in the group if possible.
1546 if (was_active)
1547 phyint_activate_another(pi);
1550 * If the interface is offline, the state change will be
1551 * noted when it comes back online.
1553 if (pi->pi_state != PI_OFFLINE) {
1554 phyint_chstate(pi, PI_FAILED);
1555 reset_crtt_all(pi);
1557 break;
1559 case GROUP_FAILURE:
1560 pi2 = pi->pi_group->pg_phyint;
1561 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1562 (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE);
1563 if (pi2->pi_state == PI_OFFLINE) /* see comment above */
1564 continue;
1566 reset_crtt_all(pi2);
1568 * In the case of host targets, we would have flushed
1569 * the targets, and gone to PI_NOTARGETS state.
1571 if (pi2->pi_state == PI_RUNNING)
1572 phyint_chstate(pi2, PI_FAILED);
1574 break;
1576 default:
1577 break;
1582 * Determines if any timeout event has occurred and returns the number of
1583 * milliseconds until the next timeout event for the phyint. Returns
1584 * TIMER_INFINITY for "never".
1586 uint_t
1587 phyint_inst_timer(struct phyint_instance *pii)
1589 int pr_ndx;
1590 uint_t timeout;
1591 struct target *cur_tg;
1592 struct probe_stats *pr_statp;
1593 struct phyint_instance *pii_other;
1594 struct phyint *pi;
1595 int valid_unack_count;
1596 int i;
1597 int interval;
1598 uint_t check_time;
1599 uint_t cur_time;
1600 hrtime_t cur_hrtime;
1601 int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1603 cur_hrtime = gethrtime();
1604 cur_time = ns2ms(cur_hrtime);
1606 if (debug & D_TIMER) {
1607 logdebug("phyint_inst_timer(%s %s)\n",
1608 AF_STR(pii->pii_af), pii->pii_name);
1611 pii_other = phyint_inst_other(pii);
1612 if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
1614 * Check to see if we're here due to link up/down flapping; If
1615 * enough time has passed, then try to bring the interface
1616 * back up; otherwise, schedule a timer to bring it back up
1617 * when enough time *has* elapsed.
1619 pi = pii->pii_phyint;
1620 if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
1621 check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
1622 if (check_time > cur_time)
1623 return (check_time - cur_time);
1625 phyint_check_for_repair(pi);
1630 * If probing is not enabled on this phyint instance, don't proceed.
1632 if (!PROBE_ENABLED(pii))
1633 return (TIMER_INFINITY);
1636 * If the timer has fired too soon, probably triggered
1637 * by some other phyint instance, return the remaining
1638 * time
1640 if (TIME_LT(cur_time, pii->pii_snxt_time))
1641 return (pii->pii_snxt_time - cur_time);
1644 * If the link is down, don't send any probes for now.
1646 if (LINK_DOWN(pii->pii_phyint))
1647 return (TIMER_INFINITY);
1650 * Randomize the next probe time, between MIN_RANDOM_FACTOR
1651 * and MAX_RANDOM_FACTOR with respect to the base probe time.
1652 * Base probe time is strictly periodic.
1654 interval = GET_RANDOM(
1655 (int)(MIN_RANDOM_FACTOR * user_probe_interval),
1656 (int)(MAX_RANDOM_FACTOR * user_probe_interval));
1657 pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
1660 * Check if the current time > next time to probe. If so, we missed
1661 * sending 1 or more probes, probably due to heavy system load. At least
1662 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
1663 * were scheduled. Make adjustments to the times, in multiples of
1664 * user_probe_interval.
1666 if (TIME_GT(cur_time, pii->pii_snxt_time)) {
1667 int n;
1669 n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
1670 pii->pii_snxt_time += (n + 1) * user_probe_interval;
1671 pii->pii_snxt_basetime += (n + 1) * user_probe_interval;
1672 logtrace("missed sending %d probes cur_time %u snxt_time %u"
1673 " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
1674 pii->pii_snxt_basetime);
1676 /* Collect statistics about missed probes */
1677 probes_missed.pm_nprobes += n + 1;
1678 probes_missed.pm_ntimes++;
1680 pii->pii_snxt_basetime += user_probe_interval;
1681 interval = pii->pii_snxt_time - cur_time;
1682 if (debug & D_TARGET) {
1683 logdebug("cur_time %u snxt_time %u snxt_basetime %u"
1684 " interval %u\n", cur_time, pii->pii_snxt_time,
1685 pii->pii_snxt_basetime, interval);
1689 * If no targets are known, we need to send an ICMP multicast. The
1690 * probe type is PROBE_MULTI. We'll check back in 'interval' msec
1691 * to see if we found a target.
1693 if (pii->pii_target_next == NULL) {
1694 assert(pii->pii_ntargets == 0);
1695 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1696 probe(pii, PROBE_MULTI, cur_time);
1697 return (interval);
1700 if ((user_probe_interval != probe_interval) &&
1701 TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
1703 * the failure detection (fd) probe timer has not yet fired.
1704 * Need to send only an rtt probe. The probe type is PROBE_RTT.
1706 probe(pii, PROBE_RTT, cur_hrtime);
1707 return (interval);
1710 * the fd probe timer has fired. Need to do all failure
1711 * detection / recovery calculations, and then send an fd probe
1712 * of type PROBE_UNI.
1714 if (user_probe_interval == probe_interval) {
1716 * We could have missed some probes, and then adjusted
1717 * pii_snxt_basetime above. Otherwise we could have
1718 * blindly added probe_interval to pii_fd_snxt_basetime.
1720 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1721 } else {
1722 pii->pii_fd_snxt_basetime += probe_interval;
1723 if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
1724 int n;
1726 n = (cur_time - pii->pii_fd_snxt_basetime) /
1727 probe_interval;
1728 pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
1733 * We can have at most, the latest 2 probes that we sent, in
1734 * the PR_UNACKED state. All previous probes sent, are either
1735 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1736 * timed out if the probe's time_start + the CRTT < currenttime.
1737 * For each of the last 2 probes, examine whether it has timed
1738 * out. If so, mark it PR_LOST. The probe stats is a circular array.
1740 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1741 valid_unack_count = 0;
1743 for (i = 0; i < 2; i++) {
1744 pr_statp = &pii->pii_probes[pr_ndx];
1745 cur_tg = pii->pii_probes[pr_ndx].pr_target;
1746 switch (pr_statp->pr_status) {
1747 case PR_ACKED:
1749 * We received back an ACK, so the switch clearly
1750 * is not dropping our traffic, and thus we can
1751 * enable failure detection immediately.
1753 if (pii->pii_fd_hrtime > gethrtime()) {
1754 if (debug & D_PROBE) {
1755 logdebug("successful probe on %s; "
1756 "ending quiet period\n",
1757 pii->pii_phyint->pi_name);
1759 pii->pii_fd_hrtime = gethrtime();
1761 break;
1763 case PR_UNACKED:
1764 assert(cur_tg != NULL);
1766 * The crtt could be zero for some reason,
1767 * Eg. the phyint could be failed. If the crtt is
1768 * not available use group's probe interval,
1769 * which is a worst case estimate.
1771 timeout = ns2ms(pr_statp->pr_hrtime_start);
1772 if (cur_tg->tg_crtt != 0) {
1773 timeout += cur_tg->tg_crtt;
1774 } else {
1775 timeout += probe_interval;
1777 if (TIME_LT(timeout, cur_time)) {
1778 pr_statp->pr_time_lost = timeout;
1779 probe_chstate(pr_statp, pii, PR_LOST);
1780 } else if (i == 1) {
1782 * We are forced to consider this probe
1783 * lost, as we can have at most 2 unack.
1784 * probes any time, and we will be sending a
1785 * probe at the end of this function.
1786 * Normally, we should not be here, but
1787 * this can happen if an incoming response
1788 * that was considered lost has increased
1789 * the crtt for this target, and also bumped
1790 * up the FDT. Note that we never cancel or
1791 * increase the current pii_time_left, so
1792 * when the timer fires, we find 2 valid
1793 * unacked probes, and they are yet to timeout
1795 pr_statp->pr_time_lost = cur_time;
1796 probe_chstate(pr_statp, pii, PR_LOST);
1797 } else {
1799 * Only the most recent probe can enter
1800 * this 'else' arm. The second most recent
1801 * probe must take either of the above arms,
1802 * if it is unacked.
1804 valid_unack_count++;
1806 break;
1808 pr_ndx = PROBE_INDEX_PREV(pr_ndx);
1812 * We send out 1 probe randomly in the interval between one half
1813 * and one probe interval for the group. Given that the CRTT is always
1814 * less than the group's probe interval, we can have at most 1
1815 * unacknowledged probe now. All previous probes are either lost or
1816 * acked.
1818 assert(valid_unack_count == 0 || valid_unack_count == 1);
1821 * The timer has fired. Take appropriate action depending
1822 * on the current state of the phyint.
1824 * PI_RUNNING state - Failure detection
1825 * PI_FAILED state - Repair detection
1827 switch (pii->pii_phyint->pi_state) {
1828 case PI_FAILED:
1830 * If the most recent probe (excluding unacked probes that
1831 * are yet to time out) has been acked, check whether the
1832 * phyint is now repaired.
1834 if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
1835 phyint_check_for_repair(pii->pii_phyint);
1837 break;
1839 case PI_RUNNING:
1841 * It's possible our probes have been lost because of a
1842 * spanning-tree mandated quiet period on the switch. If so,
1843 * ignore the lost probes.
1845 if (pii->pii_fd_hrtime - cur_hrtime > 0)
1846 break;
1848 if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
1850 * We have 1 or more failed probes (excluding unacked
1851 * probes that are yet to time out). Determine if the
1852 * phyint has failed.
1854 phyint_inst_check_for_failure(pii);
1856 break;
1858 default:
1859 logerr("phyint_inst_timer: invalid state %d\n",
1860 pii->pii_phyint->pi_state);
1861 abort();
1865 * Start the next probe. probe() will also set pii->pii_probe_time_left
1866 * to the group's probe interval. If phyint_failed -> target_flush_hosts
1867 * was called, the target list may be empty.
1869 if (pii->pii_target_next != NULL) {
1870 probe(pii, PROBE_UNI, cur_hrtime);
1872 * If we have just the one probe target, and we're not using
1873 * router targets, try to find another as we presently have
1874 * no resilience.
1876 if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
1877 probe(pii, PROBE_MULTI, cur_hrtime);
1878 } else {
1879 probe(pii, PROBE_MULTI, cur_hrtime);
1881 return (interval);
1885 * Start the probe timer for an interface instance.
1887 void
1888 start_timer(struct phyint_instance *pii)
1890 uint32_t interval;
1893 * Spread the base probe times (pi_snxt_basetime) across phyints
1894 * uniformly over the (curtime..curtime + the group's probe_interval).
1895 * pi_snxt_basetime is strictly periodic with a frequency of
1896 * the group's probe interval. The actual probe time pi_snxt_time
1897 * adds some randomness to pi_snxt_basetime and happens in probe().
1898 * For the 1st probe on each phyint after the timer is started,
1899 * pi_snxt_time and pi_snxt_basetime are the same.
1901 interval = GET_RANDOM(0,
1902 (int)pii->pii_phyint->pi_group->pg_probeint);
1904 pii->pii_snxt_basetime = getcurrenttime() + interval;
1905 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1906 pii->pii_snxt_time = pii->pii_snxt_basetime;
1907 timer_schedule(interval);
1911 * Restart the probe timer on an interface instance.
1913 static void
1914 restart_timer(struct phyint_instance *pii)
1917 * We don't need to restart the timer if it was never started in
1918 * the first place (pii->pii_basetime_inited not set), as the timer
1919 * won't have gone off yet.
1921 if (pii->pii_basetime_inited != 0) {
1923 if (debug & D_LINKNOTE)
1924 logdebug("restart timer: restarting timer on %s, "
1925 "address family %s\n", pii->pii_phyint->pi_name,
1926 AF_STR(pii->pii_af));
1928 start_timer(pii);
1932 static void
1933 process_link_state_down(struct phyint *pi)
1935 logerr("The link has gone down on %s\n", pi->pi_name);
1938 * Clear the probe statistics arrays, we don't want the repair
1939 * detection logic relying on probes that were successful prior
1940 * to the link going down.
1942 if (PROBE_CAPABLE(pi->pi_v4))
1943 clear_pii_probe_stats(pi->pi_v4);
1944 if (PROBE_CAPABLE(pi->pi_v6))
1945 clear_pii_probe_stats(pi->pi_v6);
1947 * Check for interface failure. Although we know the interface
1948 * has failed, we don't know if all the other interfaces in the
1949 * group have failed as well.
1951 if ((pi->pi_state == PI_RUNNING) ||
1952 (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
1953 if (debug & D_LINKNOTE) {
1954 logdebug("process_link_state_down:"
1955 " checking for failure on %s\n", pi->pi_name);
1958 if (pi->pi_v4 != NULL)
1959 phyint_inst_check_for_failure(pi->pi_v4);
1960 else if (pi->pi_v6 != NULL)
1961 phyint_inst_check_for_failure(pi->pi_v6);
1965 static void
1966 process_link_state_up(struct phyint *pi)
1968 logerr("The link has come up on %s\n", pi->pi_name);
1971 * We stopped any running timers on each instance when the link
1972 * went down, so restart them.
1974 if (pi->pi_v4)
1975 restart_timer(pi->pi_v4);
1976 if (pi->pi_v6)
1977 restart_timer(pi->pi_v6);
1979 phyint_check_for_repair(pi);
1981 pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
1982 if (pi->pi_whendx == LINK_UP_PERMIN)
1983 pi->pi_whendx = 0;
1987 * Process any changes in link state passed up from the interfaces.
1989 void
1990 process_link_state_changes(void)
1992 struct phyint *pi;
1994 /* Look for interfaces where the link state has just changed */
1996 for (pi = phyints; pi != NULL; pi = pi->pi_next) {
1997 boolean_t old_link_state_up = LINK_UP(pi);
2000 * Except when the "phyint" structure is created, this is
2001 * the only place the link state is updated. This allows
2002 * this routine to detect changes in link state, rather
2003 * than just the current state.
2005 UPDATE_LINK_STATE(pi);
2007 if (LINK_DOWN(pi)) {
2009 * Has link just gone down?
2011 if (old_link_state_up)
2012 process_link_state_down(pi);
2013 } else {
2015 * Has link just gone back up?
2017 if (!old_link_state_up)
2018 process_link_state_up(pi);
2023 void
2024 reset_crtt_all(struct phyint *pi)
2026 struct phyint_instance *pii;
2027 struct target *tg;
2029 pii = pi->pi_v4;
2030 if (pii != NULL) {
2031 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2032 tg->tg_crtt = 0;
2033 tg->tg_rtt_sa = -1;
2034 tg->tg_rtt_sd = 0;
2038 pii = pi->pi_v6;
2039 if (pii != NULL) {
2040 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2041 tg->tg_crtt = 0;
2042 tg->tg_rtt_sa = -1;
2043 tg->tg_rtt_sd = 0;
2049 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
2050 * probes on both instances IPv4 and IPv6.
2051 * If the interface has failed, return the time of the first probe failure
2052 * in "tff".
2054 static int
2055 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
2057 uint_t pi_tff;
2058 struct target *cur_tg;
2059 struct probe_fail_count pfinfo;
2060 struct phyint_instance *pii_other;
2061 int pr_ndx;
2064 * Get the number of consecutive failed probes on
2065 * this phyint across all targets. Also get the number
2066 * of consecutive failed probes on this target only
2068 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2069 cur_tg = pii->pii_probes[pr_ndx].pr_target;
2070 probe_fail_info(pii, cur_tg, &pfinfo);
2072 /* Get the time of first failure, for later use */
2073 pi_tff = pfinfo.pf_tff;
2076 * If the current target has not responded to the
2077 * last NUM_PROBE_FAILS probes, and other targets are
2078 * responding delete this target. Dead gateway detection
2079 * will eventually remove this target (if router) from the
2080 * routing tables. If that does not occur, we may end
2081 * up adding this to our list again.
2083 if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
2084 pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
2085 if (pii->pii_targets_are_routers) {
2086 if (cur_tg->tg_status == TG_ACTIVE)
2087 pii->pii_ntargets--;
2088 cur_tg->tg_status = TG_DEAD;
2089 cur_tg->tg_crtt = 0;
2090 cur_tg->tg_rtt_sa = -1;
2091 cur_tg->tg_rtt_sd = 0;
2092 if (pii->pii_target_next == cur_tg)
2093 pii->pii_target_next = target_next(cur_tg);
2094 } else {
2095 target_delete(cur_tg);
2096 probe(pii, PROBE_MULTI, gethrtime());
2098 return (PHYINT_OK);
2102 * If the phyint has lost NUM_PROBE_FAILS or more
2103 * consecutive probes, on both IPv4 and IPv6 protocol
2104 * instances of the phyint, then trigger failure
2105 * detection, else return false
2107 if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
2108 return (PHYINT_OK);
2110 pii_other = phyint_inst_other(pii);
2111 if (PROBE_CAPABLE(pii_other)) {
2112 probe_fail_info(pii_other, NULL, &pfinfo);
2113 if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
2115 * We have NUM_PROBE_FAILS or more failures
2116 * on both IPv4 and IPv6. Get the earliest
2117 * time when failure was detected on this
2118 * phyint across IPv4 and IPv6.
2120 if (TIME_LT(pfinfo.pf_tff, pi_tff))
2121 pi_tff = pfinfo.pf_tff;
2122 } else {
2124 * This instance has < NUM_PROBE_FAILS failure.
2125 * So return false
2127 return (PHYINT_OK);
2130 *tff = pi_tff;
2131 return (PHYINT_FAILURE);
2135 * Check if the link has gone down on this phyint, or it has failed the
2136 * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
2137 * Also look at other phyints of this group, for group failures.
2140 failure_state(struct phyint_instance *pii)
2142 struct probe_success_count psinfo;
2143 uint_t pi2_tls; /* time last success */
2144 uint_t pi_tff; /* time first fail */
2145 struct phyint *pi2;
2146 struct phyint *pi;
2147 struct phyint_instance *pii2;
2148 struct phyint_group *pg;
2149 int retval;
2151 if (debug & D_FAILREP)
2152 logdebug("phyint_failed(%s)\n", pii->pii_name);
2154 pi = pii->pii_phyint;
2155 pg = pi->pi_group;
2157 if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
2158 PHYINT_OK)
2159 return (PHYINT_OK);
2162 * At this point, the link is down, or the phyint is suspect, as it
2163 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not
2164 * belong to any group, this is a PHYINT_FAILURE. Otherwise, continue
2165 * on to determine whether this should be considered a PHYINT_FAILURE
2166 * or GROUP_FAILURE.
2168 if (pg == phyint_anongroup)
2169 return (PHYINT_FAILURE);
2172 * Need to compare against other phyints of the same group
2173 * to exclude group failures. If the failure was detected via
2174 * probing, then if the time of last success (tls) of any
2175 * phyint is more recent than the time of first fail (tff) of the
2176 * phyint in question, and the link is up on the phyint,
2177 * then it is a phyint failure. Otherwise it is a group failure.
2178 * If failure was detected via a link down notification sent from
2179 * the driver to IP, we see if any phyints in the group are still
2180 * running and haven't received a link down notification. We
2181 * will usually be processing the link down notification shortly
2182 * after it was received, so there is no point looking at the tls
2183 * of other phyints.
2185 retval = GROUP_FAILURE;
2186 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2187 /* Exclude ourself from comparison */
2188 if (pi2 == pi)
2189 continue;
2191 if (LINK_DOWN(pi)) {
2193 * We use FLAGS_TO_LINK_STATE() to test the flags
2194 * directly, rather then LINK_UP() or LINK_DOWN(), as
2195 * we may not have got round to processing the link
2196 * state for the other phyints in the group yet.
2198 * The check for PI_RUNNING and group failure handles
2199 * the case when the group begins to recover.
2200 * PI_RUNNING will be set, and group failure cleared
2201 * only after receipt of NUM_PROBE_REPAIRS, by which
2202 * time the other phyints should have received at
2203 * least 1 packet, and so will not have NUM_PROBE_FAILS.
2205 if ((pi2->pi_state == PI_RUNNING) &&
2206 !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) {
2207 retval = PHYINT_FAILURE;
2208 break;
2210 continue;
2213 if (LINK_DOWN(pi2))
2214 continue;
2217 * If there's no probe-based failure detection on this
2218 * interface, and its link is still up, then it's still
2219 * working and thus the group has not failed.
2221 if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) {
2222 retval = PHYINT_FAILURE;
2223 break;
2227 * Need to compare against both IPv4 and IPv6 instances.
2229 pii2 = pi2->pi_v4;
2230 if (pii2 != NULL) {
2231 probe_success_info(pii2, NULL, &psinfo);
2232 if (psinfo.ps_tls_valid) {
2233 pi2_tls = psinfo.ps_tls;
2235 * See comment above regarding check
2236 * for PI_RUNNING and group failure.
2238 if (TIME_GT(pi2_tls, pi_tff) &&
2239 (pi2->pi_state == PI_RUNNING) &&
2240 !GROUP_FAILED(pg) &&
2241 FLAGS_TO_LINK_STATE(pi2)) {
2242 retval = PHYINT_FAILURE;
2243 break;
2248 pii2 = pi2->pi_v6;
2249 if (pii2 != NULL) {
2250 probe_success_info(pii2, NULL, &psinfo);
2251 if (psinfo.ps_tls_valid) {
2252 pi2_tls = psinfo.ps_tls;
2254 * See comment above regarding check
2255 * for PI_RUNNING and group failure.
2257 if (TIME_GT(pi2_tls, pi_tff) &&
2258 (pi2->pi_state == PI_RUNNING) &&
2259 !GROUP_FAILED(pg) &&
2260 FLAGS_TO_LINK_STATE(pi2)) {
2261 retval = PHYINT_FAILURE;
2262 break;
2269 * Update the group state to account for the changes.
2271 phyint_group_refresh_state(pg);
2272 return (retval);
2276 * Return the information associated with consecutive probe successes
2277 * starting with the most recent probe. At most the last 2 probes can be
2278 * in the unacknowledged state. All previous probes have either failed
2279 * or succeeded.
2281 static void
2282 probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
2283 struct probe_success_count *psinfo)
2285 uint_t i;
2286 struct probe_stats *pr_statp;
2287 uint_t most_recent;
2288 uint_t second_most_recent;
2289 boolean_t pi_found_failure = _B_FALSE;
2290 boolean_t tg_found_failure = _B_FALSE;
2291 uint_t now;
2292 uint_t timeout;
2293 struct target *tg;
2295 if (debug & D_FAILREP)
2296 logdebug("probe_success_info(%s)\n", pii->pii_name);
2298 bzero(psinfo, sizeof (*psinfo));
2299 now = getcurrenttime();
2302 * Start with the most recent probe, and count the number
2303 * of consecutive probe successes. Latch the number of successes
2304 * on hitting a failure.
2306 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2307 second_most_recent = PROBE_INDEX_PREV(most_recent);
2309 for (i = most_recent; i != pii->pii_probe_next;
2310 i = PROBE_INDEX_PREV(i)) {
2311 pr_statp = &pii->pii_probes[i];
2313 switch (pr_statp->pr_status) {
2314 case PR_UNACKED:
2316 * Only the most recent 2 probes can be unacknowledged
2318 assert(i == most_recent || i == second_most_recent);
2320 tg = pr_statp->pr_target;
2321 assert(tg != NULL);
2323 * The crtt could be zero for some reason,
2324 * Eg. the phyint could be failed. If the crtt is
2325 * not available use the value of the group's probe
2326 * interval which is a worst case estimate.
2328 timeout = ns2ms(pr_statp->pr_hrtime_start);
2329 if (tg->tg_crtt != 0) {
2330 timeout += tg->tg_crtt;
2331 } else {
2332 timeout +=
2333 pii->pii_phyint->pi_group->pg_probeint;
2336 if (TIME_LT(timeout, now)) {
2338 * We hit a failure. Latch the total number of
2339 * recent consecutive successes.
2341 pr_statp->pr_time_lost = timeout;
2342 probe_chstate(pr_statp, pii, PR_LOST);
2343 pi_found_failure = _B_TRUE;
2344 if (cur_tg != NULL && tg == cur_tg) {
2346 * We hit a failure for the desired
2347 * target. Latch the number of recent
2348 * consecutive successes for this target
2350 tg_found_failure = _B_TRUE;
2353 break;
2355 case PR_ACKED:
2357 * Bump up the count of probe successes, if we
2358 * have not seen any failure so far.
2360 if (!pi_found_failure)
2361 psinfo->ps_nsucc++;
2363 if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2364 !tg_found_failure) {
2365 psinfo->ps_nsucc_tg++;
2369 * Record the time of last success, if this is
2370 * the most recent probe success.
2372 if (!psinfo->ps_tls_valid) {
2373 psinfo->ps_tls =
2374 ns2ms(pr_statp->pr_hrtime_ackproc);
2375 psinfo->ps_tls_valid = _B_TRUE;
2377 break;
2379 case PR_LOST:
2381 * We hit a failure. Latch the total number of
2382 * recent consecutive successes.
2384 pi_found_failure = _B_TRUE;
2385 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2387 * We hit a failure for the desired target.
2388 * Latch the number of recent consecutive
2389 * successes for this target
2391 tg_found_failure = _B_TRUE;
2393 break;
2395 default:
2396 return;
2403 * Return the information associated with consecutive probe failures
2404 * starting with the most recent probe. Only the last 2 probes can be in the
2405 * unacknowledged state. All previous probes have either failed or succeeded.
2407 static void
2408 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
2409 struct probe_fail_count *pfinfo)
2411 int i;
2412 struct probe_stats *pr_statp;
2413 boolean_t tg_found_success = _B_FALSE;
2414 boolean_t pi_found_success = _B_FALSE;
2415 int most_recent;
2416 int second_most_recent;
2417 uint_t now;
2418 uint_t timeout;
2419 struct target *tg;
2421 if (debug & D_FAILREP)
2422 logdebug("probe_fail_info(%s)\n", pii->pii_name);
2424 bzero(pfinfo, sizeof (*pfinfo));
2425 now = getcurrenttime();
2428 * Start with the most recent probe, and count the number
2429 * of consecutive probe failures. Latch the number of failures
2430 * on hitting a probe success.
2432 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2433 second_most_recent = PROBE_INDEX_PREV(most_recent);
2435 for (i = most_recent; i != pii->pii_probe_next;
2436 i = PROBE_INDEX_PREV(i)) {
2437 pr_statp = &pii->pii_probes[i];
2439 assert(PR_STATUS_VALID(pr_statp->pr_status));
2441 switch (pr_statp->pr_status) {
2442 case PR_UNACKED:
2444 * Only the most recent 2 probes can be unacknowledged
2446 assert(i == most_recent || i == second_most_recent);
2448 tg = pr_statp->pr_target;
2450 * Target is guaranteed to exist in the unack. state
2452 assert(tg != NULL);
2454 * The crtt could be zero for some reason,
2455 * Eg. the phyint could be failed. If the crtt is
2456 * not available use the group's probe interval,
2457 * which is a worst case estimate.
2459 timeout = ns2ms(pr_statp->pr_hrtime_start);
2460 if (tg->tg_crtt != 0) {
2461 timeout += tg->tg_crtt;
2462 } else {
2463 timeout +=
2464 pii->pii_phyint->pi_group->pg_probeint;
2467 if (TIME_GT(timeout, now))
2468 break;
2470 pr_statp->pr_time_lost = timeout;
2471 probe_chstate(pr_statp, pii, PR_LOST);
2472 /* FALLTHRU */
2474 case PR_LOST:
2475 if (!pi_found_success) {
2476 pfinfo->pf_nfail++;
2477 pfinfo->pf_tff = pr_statp->pr_time_lost;
2479 if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2480 !tg_found_success) {
2481 pfinfo->pf_nfail_tg++;
2483 break;
2485 default:
2487 * We hit a success or unused slot. Latch the
2488 * total number of recent consecutive failures.
2490 pi_found_success = _B_TRUE;
2491 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2493 * We hit a success for the desired target.
2494 * Latch the number of recent consecutive
2495 * failures for this target
2497 tg_found_success = _B_TRUE;
2504 * Change the state of probe `pr' on phyint_instance `pii' to state `state'.
2506 void
2507 probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state)
2509 if (pr->pr_status == state)
2510 return;
2512 pr->pr_status = state;
2513 (void) probe_state_event(pr, pii);
2517 * Check if the phyint has been repaired. If no test address has been
2518 * configured, then consider the interface repaired if the link is up (unless
2519 * the link is flapping; see below). Otherwise, look for proof of probes
2520 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
2521 * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2523 static boolean_t
2524 phyint_repaired(struct phyint *pi)
2526 struct probe_success_count psinfo;
2527 struct phyint_instance *pii;
2528 struct target *cur_tg;
2529 int pr_ndx;
2530 uint_t cur_time;
2532 if (debug & D_FAILREP)
2533 logdebug("phyint_repaired(%s)\n", pi->pi_name);
2535 if (LINK_DOWN(pi))
2536 return (_B_FALSE);
2539 * If we don't have any test addresses and the link is up, then
2540 * consider the interface repaired, unless we've received more than
2541 * LINK_UP_PERMIN link up notifications in the last minute, in
2542 * which case we keep the link down until we drop back below
2543 * the threshold.
2545 if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
2546 cur_time = getcurrenttime();
2547 if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
2548 (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
2549 pi->pi_lfmsg_printed = 0;
2550 return (_B_TRUE);
2552 if (!pi->pi_lfmsg_printed) {
2553 logerr("The link has come up on %s more than %d times "
2554 "in the last minute; disabling repair until it "
2555 "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
2556 pi->pi_lfmsg_printed = 1;
2559 return (_B_FALSE);
2562 pii = pi->pi_v4;
2563 if (PROBE_CAPABLE(pii)) {
2564 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2565 cur_tg = pii->pii_probes[pr_ndx].pr_target;
2566 probe_success_info(pii, cur_tg, &psinfo);
2567 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2568 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2569 return (_B_TRUE);
2572 pii = pi->pi_v6;
2573 if (PROBE_CAPABLE(pii)) {
2574 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2575 cur_tg = pii->pii_probes[pr_ndx].pr_target;
2576 probe_success_info(pii, cur_tg, &psinfo);
2577 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2578 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2579 return (_B_TRUE);
2582 return (_B_FALSE);
2586 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
2588 boolean_t
2589 change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear)
2591 int ifsock;
2592 struct lifreq lifr;
2593 uint64_t old_flags;
2595 if (debug & D_FAILREP) {
2596 logdebug("change_pif_flags(%s): set %llx clear %llx\n",
2597 pi->pi_name, set, clear);
2600 if (pi->pi_v4 != NULL)
2601 ifsock = ifsock_v4;
2602 else
2603 ifsock = ifsock_v6;
2606 * Get the current flags from the kernel, and set/clear the
2607 * desired phyint flags. Since we set only phyint flags, we can
2608 * do it on either IPv4 or IPv6 instance.
2610 (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
2612 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
2613 if (errno != ENXIO)
2614 logperror("change_pif_flags: ioctl (get flags)");
2615 return (_B_FALSE);
2618 old_flags = lifr.lifr_flags;
2619 lifr.lifr_flags |= set;
2620 lifr.lifr_flags &= ~clear;
2622 if (old_flags == lifr.lifr_flags) {
2623 /* No change in the flags. No need to send ioctl */
2624 return (_B_TRUE);
2627 if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
2628 if (errno != ENXIO)
2629 logperror("change_pif_flags: ioctl (set flags)");
2630 return (_B_FALSE);
2634 * Keep pi_flags in synch. with actual flags. Assumes flags are
2635 * phyint flags.
2637 pi->pi_flags |= set;
2638 pi->pi_flags &= ~clear;
2640 if (pi->pi_v4 != NULL)
2641 pi->pi_v4->pii_flags = pi->pi_flags;
2643 if (pi->pi_v6 != NULL)
2644 pi->pi_v6->pii_flags = pi->pi_flags;
2646 return (_B_TRUE);
/*
 * icmp cksum computation for IPv4.
 *
 * Returns the 16-bit one's complement of the one's complement sum of the
 * `len' bytes starting at `addr', taken as 16-bit words in memory order.
 * A trailing odd byte is placed in the first byte of an otherwise zero
 * 16-bit word before being added.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	int nleft = len;
	ushort_t *w = addr;
	ushort_t odd_byte = 0;
	/*
	 * The accumulator is unsigned and 64 bits wide so that it cannot
	 * overflow for any `len' an int can express, and so that the shifts
	 * below never operate on a negative value.
	 */
	uint64_t sum = 0;

	/*
	 * Our algorithm is simple: we add sequential 16 bit words to the
	 * accumulator, and at the end, fold back all the carry bits from
	 * the upper bits into the lower 16 bits.
	 */
	while (nleft > 1) {
		sum += *w++;
		nleft -= 2;
	}

	/* mop up an odd byte, if necessary */
	if (nleft == 1) {
		*(unsigned char *)&odd_byte = *(unsigned char *)w;
		sum += odd_byte;
	}

	/*
	 * add back carry outs from the upper bits to the low 16 bits,
	 * repeating until no carry is left
	 */
	while ((sum >> 16) != 0)
		sum = (sum >> 16) + (sum & 0xffff);

	/* complement and truncate to 16 bits */
	return ((ushort_t)~sum);
}
2687 static void
2688 reset_snxt_basetimes(void)
2690 struct phyint_instance *pii;
2692 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2693 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
2698 * Is the address one of our own addresses? Unfortunately,
2699 * we cannot check our phyint tables to determine if the address
2700 * is our own. This is because, we don't track interfaces that
2701 * are not part of any group. We have to either use a 'bind' or
2702 * get the complete list of all interfaces using SIOCGLIFCONF,
2703 * to do this check. We could also use SIOCTMYADDR.
2704 * Bind fails for the local zone address, so we might include local zone
2705 * address as target address. If local zone address is a target address
2706 * and it is up, it is not possible to detect the interface failure.
2707 * SIOCTMYADDR also doesn't consider local zone address as own address.
2708 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
2709 * are stored in `localaddrs'
2711 boolean_t
2712 own_address(struct in6_addr addr)
2714 addrlist_t *addrp;
2715 struct sockaddr_storage ss;
2716 int af = IN6_IS_ADDR_V4MAPPED(&addr) ? AF_INET : AF_INET6;
2718 addr2storage(af, &addr, &ss);
2719 for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) {
2720 if (sockaddrcmp(&ss, &addrp->al_addr))
2721 return (_B_TRUE);
2723 return (_B_FALSE);
/*
 * Convert a time in nanoseconds to milliseconds using NSEC2MSEC().
 * The result is narrowed to int; callers in this file store it in a
 * uint_t and compare such values with TIME_LT()/TIME_GT().
 */
static int
ns2ms(int64_t ns)
{
	return ((int)NSEC2MSEC(ns));
}
/*
 * Convert the time in `tvp' (seconds plus microseconds) to nanoseconds.
 * Both terms are widened to int64_t before multiplying, so the result
 * does not depend on the width of tv_sec, tv_usec or NANOSEC.
 */
static int64_t
tv2ns(struct timeval *tvp)
{
	return ((int64_t)tvp->tv_sec * NANOSEC +
	    (int64_t)tvp->tv_usec * 1000);
}