usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c

   1 /*
   2  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   3  * Use is subject to license terms.
   4  */
   5
   6 /*
   7  * Copyright (c) 1987 Regents of the University of California.
   8  * All rights reserved.
   9  *
  10  * Redistribution and use in source and binary forms are permitted
  11  * provided that the above copyright notice and this paragraph are
  12  * duplicated in all such forms and that any documentation,
  13  * advertising materials, and other materials related to such
  14  * distribution and use acknowledge that the software was developed
  15  * by the University of California, Berkeley. The name of the
  16  * University may not be used to endorse or promote products derived
  17  * from this software without specific prior written permission.
  18  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
  19  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
  20  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  21  */
  22
  23 #include "mpd_defs.h"
  24 #include "mpd_tables.h"
  25
  26 /*
  27  * Probe types for probe()
      *
      * probe() stores the selected value, converted with htonl(), in the
      * pr_icmp_mtype field of every outgoing probe.  in_data() and in6_data()
      * compare the same field of an incoming echo reply against these values
      * to pick the handler for the reply.
  28  */
  29 #define PROBE_UNI       0x1234          /* Unicast probe packet */
  30 #define PROBE_MULTI     0x5678          /* Multicast probe packet */
  31 #define PROBE_RTT       0x9abc          /* RTT only probe packet */
  32
  33 #define MSEC_PERMIN     (60 * MILLISEC) /* Number of milliseconds in a minute */
  34
  35 /*
  36  * Format of probe / probe response packets. This is an ICMP Echo request
  37  * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6
      *
      * probe() fills pr_icmp_seq, pr_icmp_timestamp and pr_icmp_mtype in
      * network byte order (htons/htonll/htonl) and copies pr_icmp_id from
      * pii_icmpid, which is already kept in network byte order.
      *
      * NOTE(review): where uint64_t is 8-byte aligned the structure would end
      * in 4 bytes of padding after pr_icmp_mtype; probe() transmits
      * sizeof (struct pr_icmp) bytes -- confirm the layout on the target ABI.
  38  */
  39 struct pr_icmp
  40 {
  41         uint8_t  pr_icmp_type;          /* type field */
  42         uint8_t  pr_icmp_code;          /* code field */
  43         uint16_t pr_icmp_cksum;         /* checksum field */
  44         uint16_t pr_icmp_id;            /* Identification */
  45         uint16_t pr_icmp_seq;           /* sequence number */
  46         uint64_t pr_icmp_timestamp;     /* Time stamp (in ns) */
  47         uint32_t pr_icmp_mtype;         /* Message type */
  48 };
  49
     /*
      * Destinations for PROBE_MULTI probes: probe() copies the one matching
      * the instance's address family into the target sockaddr.  The byte
      * values spell ff02::1 (IPv6) and 224.0.0.1 (IPv4).
      */
  50 static struct in6_addr all_nodes_mcast_v6 = { { { 0xff, 0x2, 0x0, 0x0,
  51                                     0x0, 0x0, 0x0, 0x0,
  52                                     0x0, 0x0, 0x0, 0x0,
  53                                     0x0, 0x0, 0x0, 0x1 } } };
  54
  55 static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
  56
     /*
      * Compared against gethrtime() in incoming_rtt_reply() so that the
      * failure detection time is not lowered until MIN_SETTLING_TIME has
      * passed since this timestamp.  It is not assigned in the part of the
      * file visible here.
      */
  57 static hrtime_t last_fdt_bumpup_time;   /* When FDT was bumped up last */
  58
     /*
      * Prototypes for the functions that are private (static) to this file.
      */
  59 static void             *find_ancillary(struct msghdr *msg, int cmsg_level,
  60     int cmsg_type);
  61 static void             pi_set_crtt(struct target *tg, int64_t m,
  62     boolean_t is_probe_uni);
  63 static void             incoming_echo_reply(struct phyint_instance *pii,
  64     struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
  65 static void             incoming_rtt_reply(struct phyint_instance *pii,
  66     struct pr_icmp *reply, struct in6_addr fromaddr);
  67 static void             incoming_mcast_reply(struct phyint_instance *pii,
  68     struct pr_icmp *reply, struct in6_addr fromaddr);
  69
  70 static boolean_t        check_pg_crtt_improved(struct phyint_group *pg);
  71 static boolean_t        check_pii_crtt_improved(struct phyint_instance *pii);
  72 static boolean_t        check_exception_target(struct phyint_instance *pii,
  73     struct target *target);
  74 static void             probe_fail_info(struct phyint_instance *pii,
  75     struct target *cur_tg, struct probe_fail_count *pfinfo);
  76 static void             probe_success_info(struct phyint_instance *pii,
  77     struct target *cur_tg, struct probe_success_count *psinfo);
  78 static boolean_t        phyint_repaired(struct phyint *pi);
  79
  80 static boolean_t        highest_ack_tg(uint16_t seq, struct target *tg);
  81 static int              in_cksum(ushort_t *addr, int len);
  82 static void             reset_snxt_basetimes(void);
  83 static int              ns2ms(int64_t ns);
  84 static int64_t          tv2ns(struct timeval *);
  85
  86 /*
  87  * CRTT - Conservative Round Trip Time Estimate
  88  * Probe success - A matching probe reply received before CRTT ms has elapsed
  89  *      after sending the probe.
  90  * Probe failure - No probe reply received and more than CRTT ms has elapsed
  91  *      after sending the probe.
  92  *
  93  * TLS - Time last success. Most recent probe ack received at this time.
  94  * TFF - Time first fail. The time of the earliest probe failure in
  95  *      a consecutive series of probe failures.
  96  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
  97  *      before declaring phyint repair.
  98  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
  99  *      declare a phyint failure.
 100  *
 101  *                      Phyint state diagram
 102  *
 103  * The state of a phyint that is capable of being probed, is completely
 104  * specified by the 3-tuple <pi_state, pg_state, I>.
 105  *
 106  * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
 107  * IFF_OFFLINE is set.  If the phyint is also configured with a test address
 108  * (the common case) and probe targets, then a phyint must also successfully
 109  * be able to send and receive probes in order to remain in the PI_RUNNING
 110  * state (otherwise, it transitions to PI_FAILED).
 111  *
 112  * Further, if a PI_RUNNING phyint is configured with a test address but is
 113  * unable to find any probe targets, it will transition to the PI_NOTARGETS
 114  * state, which indicates that the link is apparently functional but that
 115  * in.mpathd is unable to send probes to verify functionality (in this case,
 116  * in.mpathd makes the optimistic assumption that the interface is working
 117  * correctly and thus does not mark the interface FAILED, but reports it as
 118  * IPMP_IF_UNKNOWN through the async events and query interfaces).
 119  *
 120  * At any point, a phyint may be administratively marked offline via if_mpadm.
 121  * In this case, the interface always transitions to PI_OFFLINE, regardless
 122  * of its previous state.  When the interface is later brought back online,
 123  * in.mpathd acts as if the interface is new (and thus it transitions to
 124  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
 125  * its probes, if probes are sent).
 126  *
 127  * pi_state -  PI_RUNNING or PI_FAILED
 128  *      PI_RUNNING: The failure detection logic says the phyint is good.
 129  *      PI_FAILED: The failure detection logic says the phyint has failed.
 130  *
 131  * pg_state  - PG_OK, PG_DEGRADED, or PG_FAILED.
 132  *      PG_OK: All interfaces in the group are OK.
 133  *      PG_DEGRADED: Some interfaces in the group are unusable.
 134  *      PG_FAILED: All interfaces in the group are unusable.
 135  *
 136  *      In the case of router targets, we assume that the current list of
 137  *      targets obtained from the routing table, is still valid, so the
 138  *      phyint state is PI_FAILED. In the case of host targets, we delete the
 139  *      list of targets, and multicast to the all hosts, to reconstruct the
 140  *      target list. So the phyints are in the PI_NOTARGETS state.
 141  *
 142  * I -  value of (pi_flags & IFF_INACTIVE)
 143  *      IFF_INACTIVE: This phyint will not send or receive packets.
 144  *      Usually, inactive is tied to standby interfaces that are not yet
 145  *      needed (e.g., no non-standby interfaces in the group have failed).
 146  *      When failback has been disabled (FAILBACK=no configured), phyint can
 147  *      also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
 148  *      subsequently recovers after a failure.
 149  *
 150  * Not all 9 possible combinations of the above 3-tuple are possible.
 151  *
 152  * I is tracked by IP. pi_state is tracked by mpathd.
 153  *
 154  *                      pi_state state machine
 155  * ---------------------------------------------------------------------------
 156  *      Event                   State                   New State
 157  *                              Action:
 158  * ---------------------------------------------------------------------------
 159  *      IP interface failure    (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
 160  *      detection               : set IFF_FAILED on this phyint
 161  *
 162  *      IP interface failure    (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
 163  *      detection               : set IFF_FAILED on this phyint
 164  *
 165  *      IP interface repair     (PI_FAILED, I == 0, FAILBACK=yes)
 166  *      detection                                    -> (PI_RUNNING, I == 0)
 167  *                              : clear IFF_FAILED on this phyint
 168  *
 169  *      IP interface repair     (PI_FAILED, I == 0, FAILBACK=no)
 170  *      detection                                    -> (PI_RUNNING, I == 1)
 171  *                              : clear IFF_FAILED on this phyint
 172  *                              : if failback is disabled set I == 1
 173  *
 174  *      Group failure           (perform on all phyints in the group)
 175  *      detection               PI_RUNNING              PI_FAILED
 176  *      (Router targets)        : set IFF_FAILED
 177  *
 178  *      Group failure           (perform on all phyints in the group)
 179  *      detection               PI_RUNNING              PI_NOTARGETS
 180  *      (Host targets)          : set IFF_FAILED
 181  *                              : delete the target list on all phyints
 182  * ---------------------------------------------------------------------------
 183  */
 184
     /*
      * Instance of struct probes_missed with external linkage.  It is not
      * referenced in the part of the file shown here.
      * NOTE(review): presumably maintained by the probe scheduling code --
      * confirm against its users.
      */
 185 struct probes_missed probes_missed;
 186
 187 /*
 188  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
 189  * will be added on by the kernel.  The id field identifies this phyint,
 190  * and the sequence number is an increasing (modulo 2^^16) integer. The data
 191  * portion holds the time value when the packet is sent. On echo this is
 192  * extracted to compute the round-trip time. Three different types of
 193  * probe packets are used.
 194  *
 195  * PROBE_UNI: This type is used to do failure detection / failure recovery
 196  *      and RTT calculation. PROBE_UNI probes are spaced apart in time,
 197  *      not less than the current CRTT. pii_probes[] stores data
 198  *      about these probes. These packets consume sequence number space.
 199  *
 200  * PROBE_RTT: This type is used to make only rtt measurements. Normally these
 201  *      are not used. Under heavy network load, the rtt may go up very high,
 202  *      due to a spike, or may appear to go high, due to extreme scheduling
 203  *      delays. Once the network stress is removed, mpathd takes long time to
 204  *      recover, because the probe_interval is already high, and it takes
 205  *      a long time to send out sufficient number of probes to bring down the
 206  *      rtt. To avoid this problem, PROBE_RTT probes are sent out every
 207  *      user_probe_interval ms. and will cause only rtt updates. These packets
 208  *      do not consume sequence number space nor is information about these
 209  *      packets stored in the pii_probes[]
 210  *
 211  * PROBE_MULTI: This type is only used to construct a list of targets, when
 212  *      no targets are known. The packet is multicast to the all hosts addr.
      *
      * Arguments:
      *      pii - phyint instance to probe from; its probe socket must be open.
      *      probe_type - one of PROBE_UNI, PROBE_MULTI or PROBE_RTT.
      *      start_hrtime - hrtime stamped into the packet and recorded as the
      *              start time of a PROBE_UNI probe.
      *
      * A failed sendto() is logged.  For PROBE_UNI a failed send leaves the
      * probe table, the next-target pointers and the sequence number untouched;
      * EWOULDBLOCK is treated as if the probe had been sent.  For PROBE_RTT
      * the rtt target pointer advances whether or not the send succeeded.
 213  */
 214 static void
 215 probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime)
 216 {
 217         hrtime_t sent_hrtime;
 218         struct timeval sent_tv;
 219         struct pr_icmp probe_pkt;       /* Probe packet */
 220         struct sockaddr_storage targ;   /* target address */
 221         uint_t  targaddrlen;            /* target address length */
 222         int     pr_ndx;                 /* probe index in pii->pii_probes[] */
 223         boolean_t sent = _B_FALSE;
 224         int     rval;
 225
 226         if (debug & D_TARGET) {
 227                 logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af),
 228                     pii->pii_name, probe_type, start_hrtime);
 229         }
 230
 231         assert(pii->pii_probe_sock != -1);
 232         assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
 233             probe_type == PROBE_RTT);
 234
             /*
              * Clear the whole packet before filling it in.  struct pr_icmp may
              * end in padding after pr_icmp_mtype, and sizeof (probe_pkt) bytes
              * are both checksummed and transmitted below; zeroing keeps
              * uninitialized stack bytes off the wire.
              */
             bzero(&probe_pkt, sizeof (probe_pkt));
 235         probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
 236             ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
 237         probe_pkt.pr_icmp_code = 0;
 238         probe_pkt.pr_icmp_cksum = 0;
 239         probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
 240
 241         /*
 242          * Since there is no need to do arithmetic on the icmpid,
 243          * (only equality check is done) pii_icmpid is stored in
 244          * network byte order at initialization itself.
 245          */
 246         probe_pkt.pr_icmp_id = pii->pii_icmpid;
 247         probe_pkt.pr_icmp_timestamp = htonll(start_hrtime);
 248         probe_pkt.pr_icmp_mtype = htonl(probe_type);
 249
 250         /*
 251          * If probe_type is PROBE_MULTI, this packet will be multicast to
 252          * the all hosts address. Otherwise it is unicast to the next target.
 253          */
 254         assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
 255             pii->pii_rtt_target_next != NULL));
 256
 257         bzero(&targ, sizeof (targ));
 258         targ.ss_family = pii->pii_af;
 259
 260         if (pii->pii_af == AF_INET6) {
 261                 struct in6_addr *addr6;
 262
 263                 addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr;
 264                 targaddrlen = sizeof (struct sockaddr_in6);
 265                 if (probe_type == PROBE_MULTI) {
 266                         *addr6 = all_nodes_mcast_v6;
 267                 } else if (probe_type == PROBE_UNI) {
 268                         *addr6 = pii->pii_target_next->tg_address;
 269                 } else { /* type is PROBE_RTT */
 270                         *addr6 = pii->pii_rtt_target_next->tg_address;
 271                 }
 272         } else {
 273                 struct in_addr *addr4;
 274
 275                 addr4 = &((struct sockaddr_in *)&targ)->sin_addr;
 276                 targaddrlen = sizeof (struct sockaddr_in);
 277                 if (probe_type == PROBE_MULTI) {
 278                         *addr4 = all_nodes_mcast_v4;
 279                 } else if (probe_type == PROBE_UNI) {
 280                         IN6_V4MAPPED_TO_INADDR(
 281                             &pii->pii_target_next->tg_address, addr4);
 282                 } else { /* type is PROBE_RTT */
 283                         IN6_V4MAPPED_TO_INADDR(
 284                             &pii->pii_rtt_target_next->tg_address, addr4);
 285                 }
 286
 287                 /*
 288                  * Compute the IPv4 icmp checksum. Does not cover the IP header.
                      * Only done on the IPv4 path; for IPv6 the field stays zero
                      * here.
 289                  */
 290                 probe_pkt.pr_icmp_cksum =
 291                     in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
 292         }
 293
 294         /*
 295          * Use the current time as the time we sent.  Not atomic, but the best
 296          * we can do from here.
 297          */
 298         sent_hrtime = gethrtime();
 299         (void) gettimeofday(&sent_tv, NULL);
 300         rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0,
 301             (struct sockaddr *)&targ, targaddrlen);
 302         /*
 303          * If the send would block, this may either be transient or a hang in a
 304          * lower layer. We pretend the probe was actually sent, the daemon will
 305          * not see a reply to the probe and will fail the interface if normal
 306          * failure detection criteria are met.
 307          */
 308         if (rval == sizeof (probe_pkt) ||
 309             (rval == -1 && errno == EWOULDBLOCK)) {
 310                 sent = _B_TRUE;
 311         } else {
 312                 logperror_pii(pii, "probe: probe sendto");
 313         }
 314
 315         /*
 316          * If this is a PROBE_UNI probe packet being unicast to a target, then
 317          * update our tables. We will need this info in processing the probe
 318          * response. PROBE_MULTI and PROBE_RTT packets are not used for
 319          * the purpose of failure or recovery detection. PROBE_MULTI packets
 320          * are only used to construct a list of targets. PROBE_RTT packets are
 321          * used only for updating the rtt and not for failure detection.
 322          */
 323         if (probe_type == PROBE_UNI && sent) {
 324                 pr_ndx = pii->pii_probe_next;
 325                 assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
 326
 327                 /* Collect statistics, before we reuse the last slot. */
 328                 if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
 329                         pii->pii_cum_stats.lost++;
 330                 else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
 331                         pii->pii_cum_stats.acked++;
 332                 pii->pii_cum_stats.sent++;
 333
 334                 pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt;
 335                 pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv;
 336                 pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime;
 337                 pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime;
 338                 pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
 339                 probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED);
 340
 341                 pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
 342                 pii->pii_target_next = target_next(pii->pii_target_next);
 343                 assert(pii->pii_target_next != NULL);
 344                 /*
 345                  * If we have a single variable to denote the next target to
 346                  * probe for both rtt probes and failure detection probes, we
 347                  * could end up with a situation where the failure detection
 348                  * probe targets become disjoint from the rtt probe targets.
 349                  * Eg. if 2 targets and the actual fdt is double the user
 350                  * specified fdt. So we have 2 variables. In this scheme
 351                  * we also reset pii_rtt_target_next for every fdt probe,
 352                  * though that may not be necessary.
 353                  */
 354                 pii->pii_rtt_target_next = pii->pii_target_next;
 355                 pii->pii_snxt++;
 356         } else if (probe_type == PROBE_RTT) {
 357                 pii->pii_rtt_target_next =
 358                     target_next(pii->pii_rtt_target_next);
 359                 assert(pii->pii_rtt_target_next != NULL);
 360         }
 361 }
 362
 363 /*
 364  * Incoming IPv4 data from wire, is received here. Called from main.
      *
      * Reads one message from pii's probe socket and silently or verbosely
      * drops it unless it is a complete, untruncated ICMP echo reply carrying
      * our icmp id, code 0, at least sizeof (struct pr_icmp) bytes of ICMP
      * data and an SCM_TIMESTAMP ancillary item.  Accepted replies are handed
      * to the handler selected by pr_icmp_mtype, with the sender's address
      * converted to an IPv4-mapped IPv6 address.  Nothing is processed while
      * the link is reported down.
 365  */
 366 void
 367 in_data(struct phyint_instance *pii)
 368 {
 369         struct  sockaddr_in     from;
 370         struct  in6_addr        fromaddr;
 371         static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
 372         static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
 373         struct ip *ip;
 374         int     iphlen;
 375         int     len;
 376         char    abuf[INET_ADDRSTRLEN];
 377         struct msghdr msg;
 378         struct iovec iov;
 379         struct pr_icmp *reply;
 380         struct timeval *recv_tvp;
 381
 382         if (debug & D_PROBE) {
 383                 logdebug("in_data(%s %s)\n",
 384                     AF_STR(pii->pii_af), pii->pii_name);
 385         }
 386
 387         iov.iov_base = (char *)in_packet;
 388         iov.iov_len = sizeof (in_packet);
 389         msg.msg_iov = &iov;
 390         msg.msg_iovlen = 1;
 391         msg.msg_name = (struct sockaddr *)&from;
 392         msg.msg_namelen = sizeof (from);
 393         msg.msg_control = ancillary_data;
 394         msg.msg_controllen = sizeof (ancillary_data);
 395
 396         /*
 397          * Poll has already told us that a message is waiting,
 398          * on this socket. Read it now. We should not block.
 399          */
 400         if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
 401                 logperror_pii(pii, "in_data: recvmsg");
 402                 return;
 403         }
 404
 405         /*
 406          * If the datalink has indicated the link is down, don't go
 407          * any further.
 408          */
 409         if (LINK_DOWN(pii->pii_phyint))
 410                 return;
 411
 412         /* Get the printable address for error reporting */
 413         (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
 414
 415         /* Ignore packets > 64k or control buffers that don't fit */
 416         if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
 417                 if (debug & D_PKTBAD) {
 418                         logdebug("Truncated message: msg_flags 0x%x from %s\n",
 419                             msg.msg_flags, abuf);
 420                 }
 421                 return;
 422         }
 423
 424         /* Make sure packet contains at least minimum ICMP header */
 425         ip = (struct ip *)in_packet;
 426         iphlen = ip->ip_hl << 2;
 427         if (len < iphlen + ICMP_MINLEN) {
 428                 if (debug & D_PKTBAD) {
 429                         logdebug("in_data: packet too short (%d bytes)"
 430                             " from %s\n", len, abuf);
 431                 }
 432                 return;
 433         }
 434
 435         /*
 436          * Subtract the IP hdr length, 'len' will be length of the probe
 437          * reply, starting from the icmp hdr.
 438          */
 439         len -= iphlen;
 440         /* LINTED */
 441         reply = (struct pr_icmp *)((char *)in_packet + iphlen);
 442
 443         /* Probe replies are icmp echo replies. Ignore anything else */
 444         if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
 445                 return;
 446
 447         /*
 448          * The icmp id should match what we sent, which is stored
 449          * in pii_icmpid. The icmp code for reply must be 0.
 450          * The reply content must be a struct pr_icmp
 451          */
 452         if (reply->pr_icmp_id != pii->pii_icmpid) {
 453                 /* Not in response to our probe */
 454                 return;
 455         }
 456
 457         if (reply->pr_icmp_code != 0) {
 458                 logtrace("probe reply code %d from %s on %s\n",
 459                     reply->pr_icmp_code, abuf, pii->pii_name);
 460                 return;
 461         }
 462
 463         if (len < sizeof (struct pr_icmp)) {
 464                 logtrace("probe reply too short: %d bytes from %s on %s\n",
 465                     len, abuf, pii->pii_name);
 466                 return;
 467         }
 468
 469         recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
 470         if (recv_tvp == NULL) {
 471                 logtrace("message without timestamp from %s on %s\n",
 472                     abuf, pii->pii_name);
 473                 return;
 474         }
 475
 476         IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
 477         if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
 478                 /* Unicast probe reply */
 479                 incoming_echo_reply(pii, reply, fromaddr, recv_tvp);
 480         else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
 481                 /* Multicast reply */
 482                 incoming_mcast_reply(pii, reply, fromaddr);
 483         } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
 484                 incoming_rtt_reply(pii, reply, fromaddr);
 485         } else {
 486                 /* Probably not in response to our probe */
                     /* mtype is in network byte order; convert before logging */
 487                 logtrace("probe reply type: %d from %s on %s\n",
 488                     ntohl(reply->pr_icmp_mtype), abuf, pii->pii_name);
 489                 return;
 490         }
 491 }
 492
 493 /*
 494  * Incoming IPv6 data from wire is received here. Called from main.
      *
      * Reads one message from pii's probe socket and drops it unless it is a
      * complete, untruncated ICMPv6 echo reply carrying our icmp id and code 0,
      * sent from a link-local source, without a routing header, holding at
      * least sizeof (struct pr_icmp) bytes and an SCM_TIMESTAMP ancillary
      * item.  Accepted replies are handed to the handler selected by
      * pr_icmp_mtype.  Nothing is processed while the link is reported down.
 495  */
 496 void
 497 in6_data(struct phyint_instance *pii)
 498 {
 499         struct sockaddr_in6 from;
 500         static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
 501         static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
 502         int len;
 503         char abuf[INET6_ADDRSTRLEN];
 504         struct msghdr msg;
 505         struct iovec iov;
 506         void    *opt;
 507         struct  pr_icmp *reply;
 508         struct  timeval *recv_tvp;
 509
 510         if (debug & D_PROBE) {
 511                 logdebug("in6_data(%s %s)\n",
 512                     AF_STR(pii->pii_af), pii->pii_name);
 513         }
 514
 515         iov.iov_base = (char *)in_packet;
 516         iov.iov_len = sizeof (in_packet);
 517         msg.msg_iov = &iov;
 518         msg.msg_iovlen = 1;
 519         msg.msg_name = (struct sockaddr *)&from;
 520         msg.msg_namelen = sizeof (from);
 521         msg.msg_control = ancillary_data;
 522         msg.msg_controllen = sizeof (ancillary_data);
 523
 524         if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
 525                 logperror_pii(pii, "in6_data: recvmsg");
 526                 return;
 527         }
 528
 529         /*
 530          * If the datalink has indicated that the link is down, don't go
 531          * any further.
 532          */
 533         if (LINK_DOWN(pii->pii_phyint))
 534                 return;
 535
 536         /* Get the printable address for error reporting */
 537         (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
             /* Make sure packet contains at least minimum ICMP header */
 538         if (len < ICMP_MINLEN) {
 539                 if (debug & D_PKTBAD) {
 540                         logdebug("in6_data: packet too short (%d bytes)"
 541                             " from %s\n", len, abuf);
 542                 }
 543                 return;
 544         }
 545         /* Ignore packets > 64k or control buffers that don't fit */
 546         if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
 547                 if (debug & D_PKTBAD) {
 548                         logdebug("Truncated message: msg_flags 0x%x from %s\n",
 549                             msg.msg_flags, abuf);
 550                 }
 551                 return;
 552         }
 553
             /* No IP header to skip here: the buffer is read as the reply itself */
 554         reply = (struct pr_icmp *)in_packet;
 555         if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
 556                 return;
 557
 558         if (reply->pr_icmp_id != pii->pii_icmpid) {
 559                 /* Not in response to our probe */
 560                 return;
 561         }
 562
 563         /*
 564          * The kernel has already verified the ICMP checksum.
 565          */
 566         if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
 567                 logtrace("ICMPv6 echo reply source address not linklocal from "
 568                     "%s on %s\n", abuf, pii->pii_name);
 569                 return;
 570         }
 571         opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR);
 572         if (opt != NULL) {
 573                 /* Can't allow routing headers in probe replies  */
 574                 logtrace("message with routing header from %s on %s\n",
 575                     abuf, pii->pii_name);
 576                 return;
 577         }
 578
 579         if (reply->pr_icmp_code != 0) {
 580                 logtrace("probe reply code: %d from %s on %s\n",
 581                     reply->pr_icmp_code, abuf, pii->pii_name);
 582                 return;
 583         }
 584         if (len < (sizeof (struct pr_icmp))) {
 585                 logtrace("probe reply too short: %d bytes from %s on %s\n",
 586                     len, abuf, pii->pii_name);
 587                 return;
 588         }
 589
 590         recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
 591         if (recv_tvp == NULL) {
 592                 logtrace("message without timestamp from %s on %s\n",
 593                     abuf, pii->pii_name);
 594                 return;
 595         }
 596
 597         if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
 598                 incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp);
 599         } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
 600                 incoming_mcast_reply(pii, reply, from.sin6_addr);
 601         } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
 602                 incoming_rtt_reply(pii, reply, from.sin6_addr);
 603         } else  {
 604                 /* Probably not in response to our probe */
                     /* mtype is in network byte order; convert before logging */
 605                 logtrace("probe reply type: %d from %s on %s\n",
 606                     ntohl(reply->pr_icmp_mtype), abuf, pii->pii_name);
 607         }
 608 }
 609
 610 /*
 611  * Process the incoming rtt reply, in response to our rtt probe.
 612  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
 613  * have any stored information about the probe we sent. So we don't log
 614  * any errors if we receive bad replies.
      *
      * Arguments:
      *      pii - phyint instance the reply arrived on.
      *      reply - the echo reply; only pr_icmp_timestamp is read here.
      *      fromaddr - sender of the reply (IPv4 senders arrive v4-mapped from
      *              in_data()); it must match a known target or the reply is
      *              ignored.
      *
      * Besides feeding the measurement to pi_set_crtt(), this may lower the
      * group's failure detection time and probe interval toward the user
      * specified values.
 615  */
 616 static void
 617 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
 618     struct in6_addr fromaddr)
 619 {
 620         int64_t m;              /* rtt measurement in ns */
 621         char    abuf[INET6_ADDRSTRLEN];
 622         struct  target  *target;
 623         struct  phyint_group *pg;
 624
 625         /* Get the printable address for error reporting */
 626         (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
 627
 628         if (debug & D_PROBE) {
 629                 logdebug("incoming_rtt_reply: %s %s %s\n",
 630                     AF_STR(pii->pii_af), pii->pii_name, abuf);
 631         }
 632
 633         /* Do we know this target ? */
 634         target = target_lookup(pii, fromaddr);
 635         if (target == NULL)
 636                 return;
 637
             /*
              * probe() placed an hrtime value in the packet in network byte
              * order; the echoed copy subtracted from the current hrtime is the
              * measurement.
              */
 638         m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp));
 639         /* Invalid rtt. It has wrapped around */
 640         if (m < 0)
 641                 return;
 642
 643         /*
 644          * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
 645          * The initial few responses after the interface is repaired may
 646          * contain high rtt's because they could have been queued up waiting
 647          * for ARP/NDP resolution on a failed interface.
 648          */
 649         pg = pii->pii_phyint->pi_group;
 650         if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
 651                 return;
 652
 653         /*
 654          * Update rtt only if the new rtt is lower than the current rtt.
 655          * (specified by the 3rd parameter to pi_set_crtt).
 656          * If a spike has caused the current probe_interval to be >
 657          * user_probe_interval, then this mechanism is used to bring down
 658          * the rtt rapidly once the network stress is removed.
 659          * If the new rtt is higher than the current rtt, we don't want to
 660          * update the rtt. We are having more than 1 outstanding probe and
 661          * the increase in rtt we are seeing is being unnecessarily weighted
 662          * many times. The regular rtt update will be handled by
 663          * incoming_echo_reply() and will take care of any rtt increase.
 664          */
 665         pi_set_crtt(target, m, _B_FALSE);
 666         if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
 667             (user_failure_detection_time < pg->pg_fdt) &&
 668             (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
 669                 /*
 670                  * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER,
 671                  * investigate if we can improve the failure detection time to
 672                  * meet whatever the user specified.
 673                  */
 674                 if (check_pg_crtt_improved(pg)) {
                             /* Step the fdt down, but never below the user's value */
 675                         pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
 676                             user_failure_detection_time);
 677                         pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
                             /* The anonymous group is not reported */
 678                         if (pii->pii_phyint->pi_group != phyint_anongroup) {
 679                                 logerr("Improved failure detection time %d ms "
 680                                     "on (%s %s) for group \"%s\"\n",
 681                                     pg->pg_fdt, AF_STR(pii->pii_af),
 682                                     pii->pii_name,
 683                                     pii->pii_phyint->pi_group->pg_name);
 684                         }
 685                         if (user_failure_detection_time == pg->pg_fdt) {
 686                                 /* Avoid any truncation or rounding errors */
 687                                 pg->pg_probeint = user_probe_interval;
 688                                 /*
 689                                  * No more rtt probes will be sent. The actual
 690                                  * fdt has dropped to the user specified value.
 691                                  * pii_fd_snxt_basetime and pii_snxt_basetime
 692                                  * will be in sync henceforth.
 693                                  */
 694                                 reset_snxt_basetimes();
 695                         }
 696                 }
 697         }
 698 }
 699
 700 /*
 701  * Process the incoming echo reply, in response to our unicast probe.
 702  * Common for both IPv4 and IPv6
 703  */
 704 static void
 705 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
 706     struct in6_addr fromaddr, struct timeval *recv_tvp)
 707 {
 708         int64_t m;              /* rtt measurement in ns */
 709         hrtime_t cur_hrtime;    /* in ns from some arbitrary point */
 710         char    abuf[INET6_ADDRSTRLEN];
 711         int     pr_ndx;
 712         struct  target  *target;
 713         boolean_t exception;
 714         uint64_t pr_icmp_timestamp;
 715         uint16_t pr_icmp_seq;
 716         struct  probe_stats *pr_statp;
 717         struct  phyint_group *pg = pii->pii_phyint->pi_group;
 718
 719         /* Get the printable address for error reporting */
 720         (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
 721
 722         if (debug & D_PROBE) {
 723                 logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
 724                     AF_STR(pii->pii_af), pii->pii_name, abuf,
 725                     ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
 726         }
 727
 728         pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
 729         pr_icmp_seq = ntohs(reply->pr_icmp_seq);
 730
 731         /* Reject out of window probe replies */
 732         if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
 733             SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
 734                 logtrace("out of window probe seq %u snxt %u on %s from %s\n",
 735                     pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
 736                 pii->pii_cum_stats.unknown++;
 737                 return;
 738         }
 739
 740         cur_hrtime = gethrtime();
 741         m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
 742         if (m < 0) {
 743                 /*
 744                  * This is a ridiculously high value of rtt. rtt has wrapped
 745                  * around. Log a message, and ignore the rtt.
 746                  */
 747                 logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
 748                     "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
 749         }
 750
 751         /*
 752          * Get the probe index pr_ndx corresponding to the received icmp seq.
 753          * number in our pii->pii_probes[] array. The icmp sequence number
 754          * pii_snxt corresponds to the probe index pii->pii_probe_next
 755          */
 756         pr_ndx = MOD_SUB(pii->pii_probe_next,
 757             (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
 758
 759         assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
 760
 761         target = pii->pii_probes[pr_ndx].pr_target;
 762
 763         /*
 764          * Perform sanity checks, whether this probe reply that we
 765          * have received is genuine
 766          */
 767         if (target != NULL) {
 768                 /*
 769                  * Compare the src. addr of the received ICMP or ICMPv6
 770                  * probe reply with the target address in our tables.
 771                  */
 772                 if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
 773                         /*
 774                          * We don't have any record of having sent a probe to
 775                          * this target. This is a fake probe reply. Log an error
 776                          */
 777                         logtrace("probe status %d Fake probe reply seq %u "
 778                             "snxt %u on %s from %s\n",
 779                             pii->pii_probes[pr_ndx].pr_status,
 780                             pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
 781                         pii->pii_cum_stats.unknown++;
 782                         return;
 783                 } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
 784                         /*
 785                          * The address matches, but our tables indicate that
 786                          * this probe reply has been acked already. So this
 787                          * is a duplicate probe reply. Log an error
 788                          */
 789                         logtrace("probe status %d Duplicate probe reply seq %u "
 790                             "snxt %u on %s from %s\n",
 791                             pii->pii_probes[pr_ndx].pr_status,
 792                             pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
 793                         pii->pii_cum_stats.unknown++;
 794                         return;
 795                 }
 796         } else {
 797                 /*
 798                  * Target must not be NULL in the PR_UNACKED state
 799                  */
 800                 assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
 801                 if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
 802                         /*
 803                          * The probe stats slot is unused. So we didn't
 804                          * send out any probe to this target. This is a fake.
 805                          * Log an error.
 806                          */
 807                         logtrace("probe status %d Fake probe reply seq %u "
 808                             "snxt %u on %s from %s\n",
 809                             pii->pii_probes[pr_ndx].pr_status,
 810                             pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
 811                 }
 812                 pii->pii_cum_stats.unknown++;
 813                 return;
 814         }
 815
 816         /*
 817          * If the rtt does not appear to be right, don't update the
 818          * rtt stats. This can happen if the system dropped into the
 819          * debugger, or the system was hung or too busy for a
 820          * substantial time that we didn't get a chance to run.
 821          */
 822         if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
 823                 /*
 824                  * If the probe corresponding to this received response
 825                  * was truly sent 'm' ns. ago, then this response must
 826                  * have been rejected by the sequence number checks. The
 827                  * fact that it has passed the sequence number checks
 828                  * means that the measured rtt is wrong. We were probably
 829                  * scheduled long after the packet was received.
 830                  */
 831                 goto out;
 832         }
 833
 834         /*
 835          * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
 836          * The initial few responses after the interface is repaired may
 837          * contain high rtt's because they could have been queued up waiting
 838          * for ARP/NDP resolution on a failed interface.
 839          */
 840         if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
 841                 goto out;
 842
 843         /*
 844          * Don't update the Conservative Round Trip Time estimate for this
 845          * (phint, target) pair if this is the not the highest ack seq seen
 846          * thus far on this target.
 847          */
 848         if (!highest_ack_tg(pr_icmp_seq, target))
 849                 goto out;
 850
 851         /*
 852          * Always update the rtt. This is a failure detection probe
 853          * and we want to measure both increase / decrease in rtt.
 854          */
 855         pi_set_crtt(target, m, _B_TRUE);
 856
 857         /*
 858          * If the crtt exceeds the average time between probes,
 859          * investigate if this slow target is an exception. If so we
 860          * can avoid this target and still meet the failure detection
 861          * time. Otherwise we can't meet the failure detection time.
 862          */
 863         if (target->tg_crtt > pg->pg_probeint) {
 864                 exception = check_exception_target(pii, target);
 865                 if (exception) {
 866                         /*
 867                          * This target is exceptionally slow. Don't use it
 868                          * for future probes. check_exception_target() has
 869                          * made sure that we have at least MIN_PROBE_TARGETS
 870                          * other active targets
 871                          */
 872                         if (pii->pii_targets_are_routers) {
 873                                 /*
 874                                  * This is a slow router, mark it as slow
 875                                  * and don't use it for further probes. We
 876                                  * don't delete it, since it will be populated
 877                                  * again when we do a router scan. Hence we
 878                                  * need to maintain extra state (unlike the
 879                                  * host case below).  Mark it as TG_SLOW.
 880                                  */
 881                                 if (target->tg_status == TG_ACTIVE)
 882                                         pii->pii_ntargets--;
 883                                 target->tg_status = TG_SLOW;
 884                                 target->tg_latime = gethrtime();
 885                                 target->tg_rtt_sa = -1;
 886                                 target->tg_crtt = 0;
 887                                 target->tg_rtt_sd = 0;
 888                                 if (pii->pii_target_next == target) {
 889                                         pii->pii_target_next =
 890                                             target_next(target);
 891                                 }
 892                         } else {
 893                                 /*
 894                                  * the slow target is not a router, we can
 895                                  * just delete it. Send an icmp multicast and
 896                                  * pick the fastest responder that is not
 897                                  * already an active target. target_delete()
 898                                  * adjusts pii->pii_target_next
 899                                  */
 900                                 target_delete(target);
 901                                 probe(pii, PROBE_MULTI, cur_hrtime);
 902                         }
 903                 } else {
 904                         /*
 905                          * We can't meet the failure detection time.
 906                          * Log a message, and update the detection time to
 907                          * whatever we can achieve.
 908                          */
 909                         pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
 910                         pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
 911                         last_fdt_bumpup_time = gethrtime();
 912                         if (pg != phyint_anongroup) {
 913                                 logtrace("Cannot meet requested failure"
 914                                     " detection time of %d ms on (%s %s) new"
 915                                     " failure detection time for group \"%s\""
 916                                     " is %d ms\n", user_failure_detection_time,
 917                                     AF_STR(pii->pii_af), pii->pii_name,
 918                                     pg->pg_name, pg->pg_fdt);
 919                         }
 920                 }
 921         } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
 922             (user_failure_detection_time < pg->pg_fdt) &&
 923             (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
 924                 /*
 925                  * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
 926                  * investigate if we can improve the failure detection time to
 927                  * meet whatever the user specified.
 928                  */
 929                 if (check_pg_crtt_improved(pg)) {
 930                         pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
 931                             user_failure_detection_time);
 932                         pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
 933                         if (pg != phyint_anongroup) {
 934                                 logtrace("Improved failure detection time %d ms"
 935                                     " on (%s %s) for group \"%s\"\n",
 936                                     pg->pg_fdt, AF_STR(pii->pii_af),
 937                                     pii->pii_name, pg->pg_name);
 938                         }
 939                         if (user_failure_detection_time == pg->pg_fdt) {
 940                                 /* Avoid any truncation or rounding errors */
 941                                 pg->pg_probeint = user_probe_interval;
 942                                 /*
 943                                  * No more rtt probes will be sent. The actual
 944                                  * fdt has dropped to the user specified value.
 945                                  * pii_fd_snxt_basetime and pii_snxt_basetime
 946                                  * will be in sync henceforth.
 947                                  */
 948                                 reset_snxt_basetimes();
 949                         }
 950                 }
 951         }
 952 out:
 953         pr_statp = &pii->pii_probes[pr_ndx];
 954         pr_statp->pr_hrtime_ackproc = cur_hrtime;
 955         pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
 956             (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));
 957
 958         probe_chstate(pr_statp, pii, PR_ACKED);
 959
 960         /*
 961          * Update pii->pii_rack, i.e. the sequence number of the last received
 962          * probe response, based on the echo reply we have received now, if
 963          * either of the following conditions are satisfied.
 964          * a. pii_rack is outside the current receive window of
 965          *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
 966          *    This means we have not received probe responses for a
 967          *    long time, and the sequence number has wrapped around.
 968          * b. pii_rack is within the current receive window and this echo
 969          *    reply corresponds to the highest sequence number we have seen
 970          *    so far.
 971          */
 972         if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
 973             SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
 974             SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
 975                 pii->pii_rack = pr_icmp_seq;
 976         }
 977 }
 978
 979 /*
 980  * Returns true if seq is the highest unacknowledged seq for target tg
 981  * else returns false
 982  */
 983 static boolean_t
 984 highest_ack_tg(uint16_t seq, struct target *tg)
 985 {
 986         struct phyint_instance *pii;
 987         int      pr_ndx;
 988         uint16_t pr_seq;
 989
 990         pii = tg->tg_phyint_inst;
 991
 992         /*
 993          * Get the seq number of the most recent probe sent so far,
 994          * and also get the corresponding probe index in the probe stats
 995          * array.
 996          */
 997         pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
 998         pr_seq = pii->pii_snxt;
 999         pr_seq--;
1000
1001         /*
1002          * Start from the most recent probe and walk back, trying to find
1003          * an acked probe corresponding to target tg.
1004          */
1005         for (; pr_ndx != pii->pii_probe_next;
1006             pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
1007                 if (pii->pii_probes[pr_ndx].pr_target == tg &&
1008                     pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
1009                         if (SEQ_GT(pr_seq, seq))
1010                                 return (_B_FALSE);
1011                 }
1012         }
1013         return (_B_TRUE);
1014 }
1015
1016 /*
1017  * Check whether the crtt for the group has improved by a factor of
1018  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
1019  * detection time flapping in the face of small crtt changes.
1020  */
1021 static boolean_t
1022 check_pg_crtt_improved(struct phyint_group *pg)
1023 {
1024         struct  phyint *pi;
1025
1026         if (debug & D_PROBE)
1027                 logdebug("check_pg_crtt_improved()\n");
1028
1029         /*
1030          * The crtt for the group is only improved if each phyint_instance
1031          * for both ipv4 and ipv6 is improved.
1032          */
1033         for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1034                 if (!check_pii_crtt_improved(pi->pi_v4) ||
1035                     !check_pii_crtt_improved(pi->pi_v6))
1036                         return (_B_FALSE);
1037         }
1038
1039         return (_B_TRUE);
1040 }
1041
1042 /*
1043  * Check whether the crtt has improved substantially on this phyint_instance.
1044  * Returns _B_TRUE if there's no crtt information available, because pii
1045  * is NULL or the phyint_instance is not capable of probing.
1046  */
1047 boolean_t
1048 check_pii_crtt_improved(struct phyint_instance *pii) {
1049         struct  target *tg;
1050
1051         if (pii == NULL)
1052                 return (_B_TRUE);
1053
1054         if (!PROBE_CAPABLE(pii) ||
1055             pii->pii_phyint->pi_state == PI_FAILED)
1056                 return (_B_TRUE);
1057
1058         for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1059                 if (tg->tg_status != TG_ACTIVE)
1060                         continue;
1061                 if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
1062                     LOWER_FDT_TRIGGER)) {
1063                         return (_B_FALSE);
1064                 }
1065         }
1066
1067         return (_B_TRUE);
1068 }
1069
1070 /*
1071  * This target responds very slowly to probes. The target's crtt exceeds
1072  * the probe interval of its group. Compare against other targets
1073  * and determine if this target is an exception, if so return true, else false
1074  */
1075 static boolean_t
1076 check_exception_target(struct phyint_instance *pii, struct target *target)
1077 {
1078         struct  target *tg;
1079         char abuf[INET6_ADDRSTRLEN];
1080
1081         if (debug & D_PROBE) {
1082                 logdebug("check_exception_target(%s %s target %s)\n",
1083                     AF_STR(pii->pii_af), pii->pii_name,
1084                     pr_addr(pii->pii_af, target->tg_address,
1085                     abuf, sizeof (abuf)));
1086         }
1087
1088         /*
1089          * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
1090          * to make a good judgement. Otherwise don't drop this target.
1091          */
1092         if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
1093                 return (_B_FALSE);
1094
1095         /*
1096          * Determine whether only this particular target is slow.
1097          * We know that this target's crtt exceeds the group's probe interval.
1098          * If all other active targets have a
1099          * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
1100          * then this target is considered slow.
1101          */
1102         for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1103                 if (tg != target && tg->tg_status == TG_ACTIVE) {
1104                         if (tg->tg_crtt >
1105                             pii->pii_phyint->pi_group->pg_probeint /
1106                             EXCEPTION_FACTOR) {
1107                                 return (_B_FALSE);
1108                         }
1109                 }
1110         }
1111
1112         return (_B_TRUE);
1113 }
1114
1115 /*
1116  * Update the target list. The icmp all hosts multicast has given us
1117  * some host to which we can send probes. If we already have sufficient
1118  * targets, discard it.
1119  */
1120 static void
1121 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
1122     struct in6_addr fromaddr)
1123 /* ARGSUSED */
1124 {
1125         int af;
1126         char abuf[INET6_ADDRSTRLEN];
1127         struct phyint *pi;
1128
1129         if (debug & D_PROBE) {
1130                 logdebug("incoming_mcast_reply(%s %s %s)\n",
1131                     AF_STR(pii->pii_af), pii->pii_name,
1132                     pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
1133         }
1134
1135         /*
1136          * Using host targets is a fallback mechanism. If we have
1137          * found a router, don't add this host target. If we already
1138          * know MAX_PROBE_TARGETS, don't add another target.
1139          */
1140         assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1141         if (pii->pii_targets != NULL) {
1142                 if (pii->pii_targets_are_routers ||
1143                     (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
1144                         return;
1145                 }
1146         }
1147
1148         if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
1149             IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
1150                 /*
1151                  * Guard against response from 0.0.0.0
1152                  * and ::. Log a trace message
1153                  */
1154                 logtrace("probe response from %s on %s\n",
1155                     pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
1156                     pii->pii_name);
1157                 return;
1158         }
1159
1160         /*
1161          * This address is one of our own, so reject this address as a
1162          * valid probe target.
1163          */
1164         af = pii->pii_af;
1165         if (own_address(fromaddr))
1166                 return;
1167
1168         /*
1169          * If the phyint is part a named group, then add the address to all
1170          * members of the group.  Otherwise, add the address only to the
1171          * phyint itself, since other phyints in the anongroup may not be on
1172          * the same subnet.
1173          */
1174         pi = pii->pii_phyint;
1175         if (pi->pi_group == phyint_anongroup) {
1176                 target_add(pii, fromaddr, _B_FALSE);
1177         } else {
1178                 pi = pi->pi_group->pg_phyint;
1179                 for (; pi != NULL; pi = pi->pi_pgnext)
1180                         target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
1181         }
1182 }
1183
1184 /*
1185  * Compute CRTT given an existing scaled average, scaled deviation estimate
1186  * and a new rtt time.  The formula is from Jacobson and Karels'
1187  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
1188  * are the same as those in Appendix A.2 of that paper.
1189  *
1190  * m = new measurement
1191  * sa = scaled RTT average (8 * average estimates)
1192  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
1193  * crtt = Conservative round trip time. Used to determine whether probe
1194  * has timed out.
1195  *
1196  * New scaled average and deviation are passed back via sap and svp
1197  */
1198 static int64_t
1199 compute_crtt(int64_t *sap, int64_t *svp, int64_t m)
1200 {
1201         int64_t sa = *sap;
1202         int64_t sv = *svp;
1203         int64_t crtt;
1204         int64_t saved_m = m;
1205
1206         assert(*sap >= -1);
1207         assert(*svp >= 0);
1208
1209         if (sa != -1) {
1210                 /*
1211                  * Update average estimator:
1212                  *      new rtt = old rtt + 1/8 Error
1213                  *          where Error = m - old rtt
1214                  *      i.e. 8 * new rtt = 8 * old rtt + Error
1215                  *      i.e. new sa =  old sa + Error
1216                  */
1217                 m -= sa >> 3;           /* m is now Error in estimate. */
1218                 if ((sa += m) < 0) {
1219                         /* Don't allow the smoothed average to be negative. */
1220                         sa = 0;
1221                 }
1222
1223                 /*
1224                  * Update deviation estimator:
1225                  *      new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
1226                  *      i.e. 4 * new mdev = 4 * old mdev +
1227                  *              (abs(Error) - old mdev)
1228                  *      i.e. new sv = old sv + (abs(Error) - old mdev)
1229                  */
1230                 if (m < 0)
1231                         m = -m;
1232                 m -= sv >> 2;
1233                 sv += m;
1234         } else {
1235                 /* Initialization. This is the first response received. */
1236                 sa = (m << 3);
1237                 sv = (m << 1);
1238         }
1239
1240         crtt = (sa >> 3) + sv;
1241
1242         if (debug & D_PROBE) {
1243                 logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> "
1244                     "crtt = %lld\n", saved_m, sa, sv, crtt);
1245         }
1246
1247         *sap = sa;
1248         *svp = sv;
1249
1250         /*
1251          * CRTT = average estimates  + 4 * deviation estimates
1252          *      = sa / 8 + sv
1253          */
1254         return (crtt);
1255 }
1256
1257 static void
1258 pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni)
1259 {
1260         struct phyint_instance *pii = tg->tg_phyint_inst;
1261         int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1262         int64_t sa = tg->tg_rtt_sa;
1263         int64_t sv = tg->tg_rtt_sd;
1264         int new_crtt;
1265         int i;
1266
1267         if (debug & D_PROBE)
1268                 logdebug("pi_set_crtt: target -  m %lld\n", m);
1269
1270         /* store the round trip time, in case we need to defer computation */
1271         tg->tg_deferred[tg->tg_num_deferred] = m;
1272
1273         new_crtt = ns2ms(compute_crtt(&sa, &sv, m));
1274
1275         /*
1276          * If this probe's round trip time would singlehandedly cause an
1277          * increase in the group's probe interval consider it suspect.
1278          */
1279         if ((new_crtt > probe_interval) && is_probe_uni) {
1280                 if (debug & D_PROBE) {
1281                         logdebug("Received a suspect probe on %s, new_crtt ="
1282                             " %d, probe_interval = %d, num_deferred = %d\n",
1283                             pii->pii_probe_logint->li_name, new_crtt,
1284                             probe_interval, tg->tg_num_deferred);
1285                 }
1286
1287                 /*
1288                  * If we've deferred as many rtts as we plan on deferring, then
1289                  * assume the link really did slow down and process all queued
1290                  * rtts
1291                  */
1292                 if (tg->tg_num_deferred == MAXDEFERREDRTT) {
1293                         if (debug & D_PROBE) {
1294                                 logdebug("Received MAXDEFERREDRTT probes which "
1295                                     "would cause an increased probe_interval.  "
1296                                     "Integrating queued rtt data points.\n");
1297                         }
1298
1299                         for (i = 0; i <= tg->tg_num_deferred; i++) {
1300                                 tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa,
1301                                     &tg->tg_rtt_sd, tg->tg_deferred[i]));
1302                         }
1303
1304                         tg->tg_num_deferred = 0;
1305                 } else {
1306                         tg->tg_num_deferred++;
1307                 }
1308                 return;
1309         }
1310
1311         /*
1312          * If this is a normal probe, or an RTT probe that would lead to a
1313          * reduced CRTT, then update our CRTT data.  Further, if this was
1314          * a normal probe, pitch any deferred probes since our probes are
1315          * again being answered within our CRTT estimates.
1316          */
1317         if (is_probe_uni || new_crtt < tg->tg_crtt) {
1318                 tg->tg_rtt_sa = sa;
1319                 tg->tg_rtt_sd = sv;
1320                 tg->tg_crtt = new_crtt;
1321                 if (is_probe_uni)
1322                         tg->tg_num_deferred = 0;
1323         }
1324 }
1325
/*
 * Search the ancillary data of `msg' for the first control message with
 * the given cmsg_level and cmsg_type.
 *
 * Returns a pointer to that control message's data, or NULL if `msg'
 * carries no such control message.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type)
{
        struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

        while (hdr != NULL) {
                if (hdr->cmsg_level == cmsg_level &&
                    hdr->cmsg_type == cmsg_type)
                        return (CMSG_DATA(hdr));
                hdr = CMSG_NXTHDR(msg, hdr);
        }

        return (NULL);
}
1344
1345 /*
1346  * Try to activate another INACTIVE interface in the same group as `pi'.
1347  * Prefer STANDBY INACTIVE to just INACTIVE.
1348  */
1349 void
1350 phyint_activate_another(struct phyint *pi)
1351 {
1352         struct phyint *pi2;
1353         struct phyint *inactivepi = NULL;
1354
1355         if (pi->pi_group == phyint_anongroup)
1356                 return;
1357
1358         for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1359                 if (pi == pi2 || !phyint_is_functioning(pi2) ||
1360                     !(pi2->pi_flags & IFF_INACTIVE))
1361                         continue;
1362
1363                 inactivepi = pi2;
1364                 if (pi2->pi_flags & IFF_STANDBY)
1365                         break;
1366         }
1367
1368         if (inactivepi != NULL)
1369                 (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE);
1370 }
1371
1372 /*
1373  * Transition a phyint to PI_RUNNING.  The caller must ensure that the
1374  * transition is appropriate.  Clears IFF_OFFLINE or IFF_FAILED if
1375  * appropriate.  Also sets IFF_INACTIVE on this or other interfaces as
1376  * appropriate (see comment below).  Finally, also updates the phyint's group
1377  * state to account for the change.
1378  */
1379 void
1380 phyint_transition_to_running(struct phyint *pi)
1381 {
1382         struct phyint *pi2;
1383         struct phyint *actstandbypi = NULL;
1384         uint_t nactive = 0, nnonstandby = 0;
1385         boolean_t onlining = (pi->pi_state == PI_OFFLINE);
1386         boolean_t initial = (pi->pi_state == PI_INIT);
1387         uint64_t set, clear;
1388
1389         /*
1390          * The interface is running again, but should it or another interface
1391          * in the group end up INACTIVE?  There are three cases:
1392          *
1393          * 1. If it's a STANDBY interface, it should be end up INACTIVE if
1394          *    the group is operating at capacity (i.e., there are at least as
1395          *    many active interfaces as non-STANDBY interfaces in the group).
1396          *    No other interfaces should be changed.
1397          *
1398          * 2. If it's a non-STANDBY interface and we're onlining it or
1399          *    FAILBACK is enabled, then it should *not* end up INACTIVE.
1400          *    Further, if the group is above capacity as a result of this
1401          *    interface, then an active STANDBY interface in the group should
1402          *    end up INACTIVE.
1403          *
1404          * 3. If it's a non-STANDBY interface, we're repairing it, and
1405          *    FAILBACK is disabled, then it should end up INACTIVE *unless*
1406          *    the group was failed (in which case we have no choice but to
1407          *    use it).  No other interfaces should be changed.
1408          */
1409         if (pi->pi_group != phyint_anongroup) {
1410                 pi2 = pi->pi_group->pg_phyint;
1411                 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1412                         if (!(pi2->pi_flags & IFF_STANDBY))
1413                                 nnonstandby++;
1414
1415                         if (phyint_is_functioning(pi2) &&
1416                             !(pi2->pi_flags & IFF_INACTIVE)) {
1417                                 nactive++;
1418                                 if (pi2->pi_flags & IFF_STANDBY)
1419                                         actstandbypi = pi2;
1420                         }
1421                 }
1422         }
1423
1424         set = 0;
1425         clear = (onlining ? IFF_OFFLINE : IFF_FAILED);
1426
1427         if (pi->pi_flags & IFF_STANDBY) {                       /* case 1 */
1428                 if (nactive >= nnonstandby)
1429                         set |= IFF_INACTIVE;
1430                 else
1431                         clear |= IFF_INACTIVE;
1432         } else if (onlining || failback_enabled) {              /* case 2 */
1433                 if (nactive >= nnonstandby && actstandbypi != NULL)
1434                         (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0);
1435         } else if (!initial && !GROUP_FAILED(pi->pi_group)) {   /* case 3 */
1436                 set |= IFF_INACTIVE;
1437         }
1438         (void) change_pif_flags(pi, set, clear);
1439
1440         phyint_chstate(pi, PI_RUNNING);
1441
1442         /*
1443          * Update the group state to account for the change.
1444          */
1445         phyint_group_refresh_state(pi->pi_group);
1446 }
1447
1448 /*
1449  * Adjust IFF_INACTIVE on the provided `pi' to trend the group configuration
1450  * to have at least one active interface and as many active interfaces as
1451  * non-standby interfaces.
1452  */
1453 void
1454 phyint_standby_refresh_inactive(struct phyint *pi)
1455 {
1456         struct phyint *pi2;
1457         uint_t nactive = 0, nnonstandby = 0;
1458
1459         /*
1460          * All phyints in the anonymous group are effectively in their own
1461          * group and thus active regardless of whether they're marked standby.
1462          */
1463         if (pi->pi_group == phyint_anongroup) {
1464                 (void) change_pif_flags(pi, 0, IFF_INACTIVE);
1465                 return;
1466         }
1467
1468         /*
1469          * If the phyint isn't functioning we can't consider it.
1470          */
1471         if (!phyint_is_functioning(pi))
1472                 return;
1473
1474         for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1475                 if (!(pi2->pi_flags & IFF_STANDBY))
1476                         nnonstandby++;
1477
1478                 if (phyint_is_functioning(pi2) &&
1479                     !(pi2->pi_flags & IFF_INACTIVE))
1480                         nactive++;
1481         }
1482
1483         if (nactive == 0 || nactive < nnonstandby)
1484                 (void) change_pif_flags(pi, 0, IFF_INACTIVE);
1485         else if (nactive > nnonstandby)
1486                 (void) change_pif_flags(pi, IFF_INACTIVE, 0);
1487 }
1488
1489 /*
1490  * See if a previously failed interface has started working again.
1491  */
1492 void
1493 phyint_check_for_repair(struct phyint *pi)
1494 {
1495         if (!phyint_repaired(pi))
1496                 return;
1497
1498         if (pi->pi_group == phyint_anongroup) {
1499                 logerr("IP interface repair detected on %s\n", pi->pi_name);
1500         } else {
1501                 logerr("IP interface repair detected on %s of group %s\n",
1502                     pi->pi_name, pi->pi_group->pg_name);
1503         }
1504
1505         /*
1506          * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet.
1507          * So just clear IFF_OFFLINE and defer phyint_transition_to_running()
1508          * until it is brought back online.
1509          */
1510         if (pi->pi_state == PI_OFFLINE) {
1511                 (void) change_pif_flags(pi, 0, IFF_FAILED);
1512                 return;
1513         }
1514
1515         phyint_transition_to_running(pi);       /* calls phyint_chstate() */
1516 }
1517
1518 /*
1519  * See if an interface has failed, or if the whole group of interfaces has
1520  * failed.
1521  */
1522 static void
1523 phyint_inst_check_for_failure(struct phyint_instance *pii)
1524 {
1525         struct phyint   *pi = pii->pii_phyint;
1526         struct phyint   *pi2;
1527         boolean_t       was_active;
1528
1529         switch (failure_state(pii)) {
1530         case PHYINT_FAILURE:
1531                 was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
1532
1533                 (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
1534                 if (pi->pi_group == phyint_anongroup) {
1535                         logerr("IP interface failure detected on %s\n",
1536                             pii->pii_name);
1537                 } else {
1538                         logerr("IP interface failure detected on %s of group"
1539                             " %s\n", pii->pii_name, pi->pi_group->pg_name);
1540                 }
1541
1542                 /*
1543                  * If the failed interface was active, activate another
1544                  * INACTIVE interface in the group if possible.
1545                  */
1546                 if (was_active)
1547                         phyint_activate_another(pi);
1548
1549                 /*
1550                  * If the interface is offline, the state change will be
1551                  * noted when it comes back online.
1552                  */
1553                 if (pi->pi_state != PI_OFFLINE) {
1554                         phyint_chstate(pi, PI_FAILED);
1555                         reset_crtt_all(pi);
1556                 }
1557                 break;
1558
1559         case GROUP_FAILURE:
1560                 pi2 = pi->pi_group->pg_phyint;
1561                 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1562                         (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE);
1563                         if (pi2->pi_state == PI_OFFLINE) /* see comment above */
1564                                 continue;
1565
1566                         reset_crtt_all(pi2);
1567                         /*
1568                          * In the case of host targets, we would have flushed
1569                          * the targets, and gone to PI_NOTARGETS state.
1570                          */
1571                         if (pi2->pi_state == PI_RUNNING)
1572                                 phyint_chstate(pi2, PI_FAILED);
1573                 }
1574                 break;
1575
1576         default:
1577                 break;
1578         }
1579 }
1580
1581 /*
1582  * Determines if any timeout event has occurred and returns the number of
1583  * milliseconds until the next timeout event for the phyint. Returns
1584  * TIMER_INFINITY for "never".
1585  */
1586 uint_t
1587 phyint_inst_timer(struct phyint_instance *pii)
1588 {
1589         int     pr_ndx;
1590         uint_t  timeout;
1591         struct  target  *cur_tg;
1592         struct  probe_stats *pr_statp;
1593         struct  phyint_instance *pii_other;
1594         struct  phyint *pi;
1595         int     valid_unack_count;
1596         int     i;
1597         int     interval;
1598         uint_t  check_time;
1599         uint_t  cur_time;
1600         hrtime_t cur_hrtime;
1601         int     probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1602
1603         cur_hrtime = gethrtime();
1604         cur_time = ns2ms(cur_hrtime);
1605
1606         if (debug & D_TIMER) {
1607                 logdebug("phyint_inst_timer(%s %s)\n",
1608                     AF_STR(pii->pii_af), pii->pii_name);
1609         }
1610
1611         pii_other = phyint_inst_other(pii);
1612         if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
1613                 /*
1614                  * Check to see if we're here due to link up/down flapping; If
1615                  * enough time has passed, then try to bring the interface
1616                  * back up; otherwise, schedule a timer to bring it back up
1617                  * when enough time *has* elapsed.
1618                  */
1619                 pi = pii->pii_phyint;
1620                 if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
1621                         check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
1622                         if (check_time > cur_time)
1623                                 return (check_time - cur_time);
1624
1625                         phyint_check_for_repair(pi);
1626                 }
1627         }
1628
1629         /*
1630          * If probing is not enabled on this phyint instance, don't proceed.
1631          */
1632         if (!PROBE_ENABLED(pii))
1633                 return (TIMER_INFINITY);
1634
1635         /*
1636          * If the timer has fired too soon, probably triggered
1637          * by some other phyint instance, return the remaining
1638          * time
1639          */
1640         if (TIME_LT(cur_time, pii->pii_snxt_time))
1641                 return (pii->pii_snxt_time - cur_time);
1642
1643         /*
1644          * If the link is down, don't send any probes for now.
1645          */
1646         if (LINK_DOWN(pii->pii_phyint))
1647                 return (TIMER_INFINITY);
1648
1649         /*
1650          * Randomize the next probe time, between MIN_RANDOM_FACTOR
1651          * and MAX_RANDOM_FACTOR with respect to the base probe time.
1652          * Base probe time is strictly periodic.
1653          */
1654         interval = GET_RANDOM(
1655             (int)(MIN_RANDOM_FACTOR * user_probe_interval),
1656             (int)(MAX_RANDOM_FACTOR * user_probe_interval));
1657         pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
1658
1659         /*
1660          * Check if the current time > next time to probe. If so, we missed
1661          * sending 1 or more probes, probably due to heavy system load. At least
1662          * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
1663          * were scheduled. Make adjustments to the times, in multiples of
1664          * user_probe_interval.
1665          */
1666         if (TIME_GT(cur_time, pii->pii_snxt_time)) {
1667                 int n;
1668
1669                 n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
1670                 pii->pii_snxt_time      += (n + 1) * user_probe_interval;
1671                 pii->pii_snxt_basetime  += (n + 1) * user_probe_interval;
1672                 logtrace("missed sending %d probes cur_time %u snxt_time %u"
1673                     " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
1674                     pii->pii_snxt_basetime);
1675
1676                 /* Collect statistics about missed probes */
1677                 probes_missed.pm_nprobes += n + 1;
1678                 probes_missed.pm_ntimes++;
1679         }
1680         pii->pii_snxt_basetime += user_probe_interval;
1681         interval = pii->pii_snxt_time - cur_time;
1682         if (debug & D_TARGET) {
1683                 logdebug("cur_time %u snxt_time %u snxt_basetime %u"
1684                     " interval %u\n", cur_time, pii->pii_snxt_time,
1685                     pii->pii_snxt_basetime, interval);
1686         }
1687
1688         /*
1689          * If no targets are known, we need to send an ICMP multicast. The
1690          * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
1691          * to see if we found a target.
1692          */
1693         if (pii->pii_target_next == NULL) {
1694                 assert(pii->pii_ntargets == 0);
1695                 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1696                 probe(pii, PROBE_MULTI, cur_time);
1697                 return (interval);
1698         }
1699
1700         if ((user_probe_interval != probe_interval) &&
1701             TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
1702                 /*
1703                  * the failure detection (fd) probe timer has not yet fired.
1704                  * Need to send only an rtt probe. The probe type is PROBE_RTT.
1705                  */
1706                 probe(pii, PROBE_RTT, cur_hrtime);
1707                 return (interval);
1708         }
1709         /*
1710          * the fd probe timer has fired. Need to do all failure
1711          * detection / recovery calculations, and then send an fd probe
1712          * of type PROBE_UNI.
1713          */
1714         if (user_probe_interval == probe_interval) {
1715                 /*
1716                  * We could have missed some probes, and then adjusted
1717                  * pii_snxt_basetime above. Otherwise we could have
1718                  * blindly added probe_interval to pii_fd_snxt_basetime.
1719                  */
1720                 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1721         } else {
1722                 pii->pii_fd_snxt_basetime += probe_interval;
1723                 if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
1724                         int n;
1725
1726                         n = (cur_time - pii->pii_fd_snxt_basetime) /
1727                             probe_interval;
1728                         pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
1729                 }
1730         }
1731
1732         /*
1733          * We can have at most, the latest 2 probes that we sent, in
1734          * the PR_UNACKED state. All previous probes sent, are either
1735          * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1736          * timed out if the probe's time_start + the CRTT < currenttime.
1737          * For each of the last 2 probes, examine whether it has timed
1738          * out. If so, mark it PR_LOST. The probe stats is a circular array.
1739          */
1740         pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1741         valid_unack_count = 0;
1742
1743         for (i = 0; i < 2; i++) {
1744                 pr_statp = &pii->pii_probes[pr_ndx];
1745                 cur_tg = pii->pii_probes[pr_ndx].pr_target;
1746                 switch (pr_statp->pr_status) {
1747                 case PR_ACKED:
1748                         /*
1749                          * We received back an ACK, so the switch clearly
1750                          * is not dropping our traffic, and thus we can
1751                          * enable failure detection immediately.
1752                          */
1753                         if (pii->pii_fd_hrtime > gethrtime()) {
1754                                 if (debug & D_PROBE) {
1755                                         logdebug("successful probe on %s; "
1756                                             "ending quiet period\n",
1757                                             pii->pii_phyint->pi_name);
1758                                 }
1759                                 pii->pii_fd_hrtime = gethrtime();
1760                         }
1761                         break;
1762
1763                 case PR_UNACKED:
1764                         assert(cur_tg != NULL);
1765                         /*
1766                          * The crtt could be zero for some reason,
1767                          * Eg. the phyint could be failed. If the crtt is
1768                          * not available use group's probe interval,
1769                          * which is a worst case estimate.
1770                          */
1771                         timeout = ns2ms(pr_statp->pr_hrtime_start);
1772                         if (cur_tg->tg_crtt != 0) {
1773                                 timeout += cur_tg->tg_crtt;
1774                         } else {
1775                                 timeout += probe_interval;
1776                         }
1777                         if (TIME_LT(timeout, cur_time)) {
1778                                 pr_statp->pr_time_lost = timeout;
1779                                 probe_chstate(pr_statp, pii, PR_LOST);
1780                         } else if (i == 1) {
1781                                 /*
1782                                  * We are forced to consider this probe
1783                                  * lost, as we can have at most 2 unack.
1784                                  * probes any time, and we will be sending a
1785                                  * probe at the end of this function.
1786                                  * Normally, we should not be here, but
1787                                  * this can happen if an incoming response
1788                                  * that was considered lost has increased
1789                                  * the crtt for this target, and also bumped
1790                                  * up the FDT. Note that we never cancel or
1791                                  * increase the current pii_time_left, so
1792                                  * when the timer fires, we find 2 valid
1793                                  * unacked probes, and they are yet to timeout
1794                                  */
1795                                 pr_statp->pr_time_lost = cur_time;
1796                                 probe_chstate(pr_statp, pii, PR_LOST);
1797                         } else {
1798                                 /*
1799                                  * Only the most recent probe can enter
1800                                  * this 'else' arm. The second most recent
1801                                  * probe must take either of the above arms,
1802                                  * if it is unacked.
1803                                  */
1804                                 valid_unack_count++;
1805                         }
1806                         break;
1807                 }
1808                 pr_ndx = PROBE_INDEX_PREV(pr_ndx);
1809         }
1810
1811         /*
1812          * We send out 1 probe randomly in the interval between one half
1813          * and one probe interval for the group. Given that the CRTT is always
1814          * less than the group's probe interval, we can have at most 1
1815          * unacknowledged probe now.  All previous probes are either lost or
1816          * acked.
1817          */
1818         assert(valid_unack_count == 0 || valid_unack_count == 1);
1819
1820         /*
1821          * The timer has fired. Take appropriate action depending
1822          * on the current state of the phyint.
1823          *
1824          * PI_RUNNING state     - Failure detection
1825          * PI_FAILED state      - Repair detection
1826          */
1827         switch (pii->pii_phyint->pi_state) {
1828         case PI_FAILED:
1829                 /*
1830                  * If the most recent probe (excluding unacked probes that
1831                  * are yet to time out) has been acked, check whether the
1832                  * phyint is now repaired.
1833                  */
1834                 if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
1835                         phyint_check_for_repair(pii->pii_phyint);
1836                 }
1837                 break;
1838
1839         case PI_RUNNING:
1840                 /*
1841                  * It's possible our probes have been lost because of a
1842                  * spanning-tree mandated quiet period on the switch.  If so,
1843                  * ignore the lost probes.
1844                  */
1845                 if (pii->pii_fd_hrtime - cur_hrtime > 0)
1846                         break;
1847
1848                 if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
1849                         /*
1850                          * We have 1 or more failed probes (excluding unacked
1851                          * probes that are yet to time out). Determine if the
1852                          * phyint has failed.
1853                          */
1854                         phyint_inst_check_for_failure(pii);
1855                 }
1856                 break;
1857
1858         default:
1859                 logerr("phyint_inst_timer: invalid state %d\n",
1860                     pii->pii_phyint->pi_state);
1861                 abort();
1862         }
1863
1864         /*
1865          * Start the next probe. probe() will also set pii->pii_probe_time_left
1866          * to the group's probe interval. If phyint_failed -> target_flush_hosts
1867          * was called, the target list may be empty.
1868          */
1869         if (pii->pii_target_next != NULL) {
1870                 probe(pii, PROBE_UNI, cur_hrtime);
1871                 /*
1872                  * If we have just the one probe target, and we're not using
1873                  * router targets, try to find another as we presently have
1874                  * no resilience.
1875                  */
1876                 if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
1877                         probe(pii, PROBE_MULTI, cur_hrtime);
1878         } else {
1879                 probe(pii, PROBE_MULTI, cur_hrtime);
1880         }
1881         return (interval);
1882 }
1883
1884 /*
1885  * Start the probe timer for an interface instance.
1886  */
1887 void
1888 start_timer(struct phyint_instance *pii)
1889 {
1890         uint32_t interval;
1891
1892         /*
1893          * Spread the base probe times (pi_snxt_basetime) across phyints
1894          * uniformly over the (curtime..curtime + the group's probe_interval).
1895          * pi_snxt_basetime is strictly periodic with a frequency of
1896          * the group's probe interval. The actual probe time pi_snxt_time
1897          * adds some randomness to pi_snxt_basetime and happens in probe().
1898          * For the 1st probe on each phyint after the timer is started,
1899          * pi_snxt_time and pi_snxt_basetime are the same.
1900          */
1901         interval = GET_RANDOM(0,
1902             (int)pii->pii_phyint->pi_group->pg_probeint);
1903
1904         pii->pii_snxt_basetime = getcurrenttime() + interval;
1905         pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1906         pii->pii_snxt_time = pii->pii_snxt_basetime;
1907         timer_schedule(interval);
1908 }
1909
1910 /*
1911  * Restart the probe timer on an interface instance.
1912  */
1913 static void
1914 restart_timer(struct phyint_instance *pii)
1915 {
1916         /*
1917          * We don't need to restart the timer if it was never started in
1918          * the first place (pii->pii_basetime_inited not set), as the timer
1919          * won't have gone off yet.
1920          */
1921         if (pii->pii_basetime_inited != 0) {
1922
1923                 if (debug & D_LINKNOTE)
1924                         logdebug("restart timer: restarting timer on %s, "
1925                             "address family %s\n", pii->pii_phyint->pi_name,
1926                             AF_STR(pii->pii_af));
1927
1928                 start_timer(pii);
1929         }
1930 }
1931
1932 static void
1933 process_link_state_down(struct phyint *pi)
1934 {
1935         logerr("The link has gone down on %s\n", pi->pi_name);
1936
1937         /*
1938          * Clear the probe statistics arrays, we don't want the repair
1939          * detection logic relying on probes that were successful prior
1940          * to the link going down.
1941          */
1942         if (PROBE_CAPABLE(pi->pi_v4))
1943                 clear_pii_probe_stats(pi->pi_v4);
1944         if (PROBE_CAPABLE(pi->pi_v6))
1945                 clear_pii_probe_stats(pi->pi_v6);
1946         /*
1947          * Check for interface failure.  Although we know the interface
1948          * has failed, we don't know if all the other interfaces in the
1949          * group have failed as well.
1950          */
1951         if ((pi->pi_state == PI_RUNNING) ||
1952             (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
1953                 if (debug & D_LINKNOTE) {
1954                         logdebug("process_link_state_down:"
1955                             " checking for failure on %s\n", pi->pi_name);
1956                 }
1957
1958                 if (pi->pi_v4 != NULL)
1959                         phyint_inst_check_for_failure(pi->pi_v4);
1960                 else if (pi->pi_v6 != NULL)
1961                         phyint_inst_check_for_failure(pi->pi_v6);
1962         }
1963 }
1964
1965 static void
1966 process_link_state_up(struct phyint *pi)
1967 {
1968         logerr("The link has come up on %s\n", pi->pi_name);
1969
1970         /*
1971          * We stopped any running timers on each instance when the link
1972          * went down, so restart them.
1973          */
1974         if (pi->pi_v4)
1975                 restart_timer(pi->pi_v4);
1976         if (pi->pi_v6)
1977                 restart_timer(pi->pi_v6);
1978
1979         phyint_check_for_repair(pi);
1980
1981         pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
1982         if (pi->pi_whendx == LINK_UP_PERMIN)
1983                 pi->pi_whendx = 0;
1984 }
1985
1986 /*
1987  * Process any changes in link state passed up from the interfaces.
1988  */
1989 void
1990 process_link_state_changes(void)
1991 {
1992         struct phyint *pi;
1993
1994         /* Look for interfaces where the link state has just changed */
1995
1996         for (pi = phyints; pi != NULL; pi = pi->pi_next) {
1997                 boolean_t old_link_state_up = LINK_UP(pi);
1998
1999                 /*
2000                  * Except when the "phyint" structure is created, this is
2001                  * the only place the link state is updated.  This allows
2002                  * this routine to detect changes in link state, rather
2003                  * than just the current state.
2004                  */
2005                 UPDATE_LINK_STATE(pi);
2006
2007                 if (LINK_DOWN(pi)) {
2008                         /*
2009                          * Has link just gone down?
2010                          */
2011                         if (old_link_state_up)
2012                                 process_link_state_down(pi);
2013                 } else {
2014                         /*
2015                          * Has link just gone back up?
2016                          */
2017                         if (!old_link_state_up)
2018                                 process_link_state_up(pi);
2019                 }
2020         }
2021 }
2022
2023 void
2024 reset_crtt_all(struct phyint *pi)
2025 {
2026         struct phyint_instance *pii;
2027         struct target *tg;
2028
2029         pii = pi->pi_v4;
2030         if (pii != NULL) {
2031                 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2032                         tg->tg_crtt = 0;
2033                         tg->tg_rtt_sa = -1;
2034                         tg->tg_rtt_sd = 0;
2035                 }
2036         }
2037
2038         pii = pi->pi_v6;
2039         if (pii != NULL) {
2040                 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2041                         tg->tg_crtt = 0;
2042                         tg->tg_rtt_sa = -1;
2043                         tg->tg_rtt_sd = 0;
2044                 }
2045         }
2046 }
2047
2048 /*
2049  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
2050  * probes on both instances IPv4 and IPv6.
2051  * If the interface has failed, return the time of the first probe failure
2052  * in "tff".
2053  */
2054 static int
2055 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
2056 {
2057         uint_t  pi_tff;
2058         struct  target *cur_tg;
2059         struct  probe_fail_count pfinfo;
2060         struct  phyint_instance *pii_other;
2061         int     pr_ndx;
2062
2063         /*
2064          * Get the number of consecutive failed probes on
2065          * this phyint across all targets. Also get the number
2066          * of consecutive failed probes on this target only
2067          */
2068         pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2069         cur_tg = pii->pii_probes[pr_ndx].pr_target;
2070         probe_fail_info(pii, cur_tg, &pfinfo);
2071
2072         /* Get the time of first failure, for later use */
2073         pi_tff = pfinfo.pf_tff;
2074
2075         /*
2076          * If the current target has not responded to the
2077          * last NUM_PROBE_FAILS probes, and other targets are
2078          * responding delete this target. Dead gateway detection
2079          * will eventually remove this target (if router) from the
2080          * routing tables. If that does not occur, we may end
2081          * up adding this to our list again.
2082          */
2083         if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
2084             pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
2085                 if (pii->pii_targets_are_routers) {
2086                         if (cur_tg->tg_status == TG_ACTIVE)
2087                                 pii->pii_ntargets--;
2088                         cur_tg->tg_status = TG_DEAD;
2089                         cur_tg->tg_crtt = 0;
2090                         cur_tg->tg_rtt_sa = -1;
2091                         cur_tg->tg_rtt_sd = 0;
2092                         if (pii->pii_target_next == cur_tg)
2093                                 pii->pii_target_next = target_next(cur_tg);
2094                 } else {
2095                         target_delete(cur_tg);
2096                         probe(pii, PROBE_MULTI, gethrtime());
2097                 }
2098                 return (PHYINT_OK);
2099         }
2100
2101         /*
2102          * If the phyint has lost NUM_PROBE_FAILS or more
2103          * consecutive probes, on both IPv4 and IPv6 protocol
2104          * instances of the phyint, then trigger failure
2105          * detection, else return false
2106          */
2107         if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
2108                 return (PHYINT_OK);
2109
2110         pii_other = phyint_inst_other(pii);
2111         if (PROBE_CAPABLE(pii_other)) {
2112                 probe_fail_info(pii_other, NULL, &pfinfo);
2113                 if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
2114                         /*
2115                          * We have NUM_PROBE_FAILS or more failures
2116                          * on both IPv4 and IPv6. Get the earliest
2117                          * time when failure was detected on this
2118                          * phyint across IPv4 and IPv6.
2119                          */
2120                         if (TIME_LT(pfinfo.pf_tff, pi_tff))
2121                                 pi_tff = pfinfo.pf_tff;
2122                 } else {
2123                         /*
2124                          * This instance has < NUM_PROBE_FAILS failure.
2125                          * So return false
2126                          */
2127                         return (PHYINT_OK);
2128                 }
2129         }
2130         *tff = pi_tff;
2131         return (PHYINT_FAILURE);
2132 }
2133
2134 /*
2135  * Check if the link has gone down on this phyint, or it has failed the
2136  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
2137  * Also look at other phyints of this group, for group failures.
2138  */
2139 int
2140 failure_state(struct phyint_instance *pii)
2141 {
2142         struct  probe_success_count psinfo;
2143         uint_t  pi2_tls;                /* time last success */
2144         uint_t  pi_tff;                 /* time first fail */
2145         struct  phyint *pi2;
2146         struct  phyint *pi;
2147         struct  phyint_instance *pii2;
2148         struct  phyint_group *pg;
2149         int     retval;
2150
2151         if (debug & D_FAILREP)
2152                 logdebug("phyint_failed(%s)\n", pii->pii_name);
2153
2154         pi = pii->pii_phyint;
2155         pg = pi->pi_group;
2156
2157         if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
2158             PHYINT_OK)
2159                 return (PHYINT_OK);
2160
2161         /*
2162          * At this point, the link is down, or the phyint is suspect, as it
2163          * has lost NUM_PROBE_FAILS or more probes. If the phyint does not
2164          * belong to any group, this is a PHYINT_FAILURE.  Otherwise, continue
2165          * on to determine whether this should be considered a PHYINT_FAILURE
2166          * or GROUP_FAILURE.
2167          */
2168         if (pg == phyint_anongroup)
2169                 return (PHYINT_FAILURE);
2170
2171         /*
2172          * Need to compare against other phyints of the same group
2173          * to exclude group failures. If the failure was detected via
2174          * probing, then if the time of last success (tls) of any
2175          * phyint is more recent than the time of first fail (tff) of the
2176          * phyint in question, and the link is up on the phyint,
2177          * then it is a phyint failure. Otherwise it is a group failure.
2178          * If failure was detected via a link down notification sent from
2179          * the driver to IP, we see if any phyints in the group are still
2180          * running and haven't received a link down notification.  We
2181          * will usually be processing the link down notification shortly
2182          * after it was received, so there is no point looking at the tls
2183          * of other phyints.
2184          */
2185         retval = GROUP_FAILURE;
2186         for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2187                 /* Exclude ourself from comparison */
2188                 if (pi2 == pi)
2189                         continue;
2190
2191                 if (LINK_DOWN(pi)) {
2192                         /*
2193                          * We use FLAGS_TO_LINK_STATE() to test the flags
2194                          * directly, rather then LINK_UP() or LINK_DOWN(), as
2195                          * we may not have got round to processing the link
2196                          * state for the other phyints in the group yet.
2197                          *
2198                          * The check for PI_RUNNING and group failure handles
2199                          * the case when the group begins to recover.
2200                          * PI_RUNNING will be set, and group failure cleared
2201                          * only after receipt of NUM_PROBE_REPAIRS, by which
2202                          * time the other phyints should have received at
2203                          * least 1 packet, and so will not have NUM_PROBE_FAILS.
2204                          */
2205                         if ((pi2->pi_state == PI_RUNNING) &&
2206                             !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) {
2207                                 retval = PHYINT_FAILURE;
2208                                 break;
2209                         }
2210                         continue;
2211                 }
2212
2213                 if (LINK_DOWN(pi2))
2214                         continue;
2215
2216                 /*
2217                  * If there's no probe-based failure detection on this
2218                  * interface, and its link is still up, then it's still
2219                  * working and thus the group has not failed.
2220                  */
2221                 if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) {
2222                         retval = PHYINT_FAILURE;
2223                         break;
2224                 }
2225
2226                 /*
2227                  * Need to compare against both IPv4 and IPv6 instances.
2228                  */
2229                 pii2 = pi2->pi_v4;
2230                 if (pii2 != NULL) {
2231                         probe_success_info(pii2, NULL, &psinfo);
2232                         if (psinfo.ps_tls_valid) {
2233                                 pi2_tls = psinfo.ps_tls;
2234                                 /*
2235                                  * See comment above regarding check
2236                                  * for PI_RUNNING and group failure.
2237                                  */
2238                                 if (TIME_GT(pi2_tls, pi_tff) &&
2239                                     (pi2->pi_state == PI_RUNNING) &&
2240                                     !GROUP_FAILED(pg) &&
2241                                     FLAGS_TO_LINK_STATE(pi2)) {
2242                                         retval = PHYINT_FAILURE;
2243                                         break;
2244                                 }
2245                         }
2246                 }
2247
2248                 pii2 = pi2->pi_v6;
2249                 if (pii2 != NULL) {
2250                         probe_success_info(pii2, NULL, &psinfo);
2251                         if (psinfo.ps_tls_valid) {
2252                                 pi2_tls = psinfo.ps_tls;
2253                                 /*
2254                                  * See comment above regarding check
2255                                  * for PI_RUNNING and group failure.
2256                                  */
2257                                 if (TIME_GT(pi2_tls, pi_tff) &&
2258                                     (pi2->pi_state == PI_RUNNING) &&
2259                                     !GROUP_FAILED(pg) &&
2260                                     FLAGS_TO_LINK_STATE(pi2)) {
2261                                         retval = PHYINT_FAILURE;
2262                                         break;
2263                                 }
2264                         }
2265                 }
2266         }
2267
2268         /*
2269          * Update the group state to account for the changes.
2270          */
2271         phyint_group_refresh_state(pg);
2272         return (retval);
2273 }
2274
2275 /*
2276  * Return the information associated with consecutive probe successes
2277  * starting with the most recent probe. At most the last 2 probes can be
2278  * in the unacknowledged state. All previous probes have either failed
2279  * or succeeded.
2280  */
2281 static void
2282 probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
2283     struct probe_success_count *psinfo)
2284 {
2285         uint_t  i;
2286         struct probe_stats *pr_statp;
2287         uint_t most_recent;
2288         uint_t second_most_recent;
2289         boolean_t pi_found_failure = _B_FALSE;
2290         boolean_t tg_found_failure = _B_FALSE;
2291         uint_t now;
2292         uint_t timeout;
2293         struct target *tg;
2294
2295         if (debug & D_FAILREP)
2296                 logdebug("probe_success_info(%s)\n", pii->pii_name);
2297
2298         bzero(psinfo, sizeof (*psinfo));
2299         now = getcurrenttime();
2300
2301         /*
2302          * Start with the most recent probe, and count the number
2303          * of consecutive probe successes. Latch the number of successes
2304          * on hitting a failure.
2305          */
2306         most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2307         second_most_recent = PROBE_INDEX_PREV(most_recent);
2308
2309         for (i = most_recent; i != pii->pii_probe_next;
2310             i = PROBE_INDEX_PREV(i)) {
2311                 pr_statp = &pii->pii_probes[i];
2312
2313                 switch (pr_statp->pr_status) {
2314                 case PR_UNACKED:
2315                         /*
2316                          * Only the most recent 2 probes can be unacknowledged
2317                          */
2318                         assert(i == most_recent || i == second_most_recent);
2319
2320                         tg = pr_statp->pr_target;
2321                         assert(tg != NULL);
2322                         /*
2323                          * The crtt could be zero for some reason,
2324                          * Eg. the phyint could be failed. If the crtt is
2325                          * not available use the value of the group's probe
2326                          * interval which is a worst case estimate.
2327                          */
2328                         timeout = ns2ms(pr_statp->pr_hrtime_start);
2329                         if (tg->tg_crtt != 0) {
2330                                 timeout += tg->tg_crtt;
2331                         } else {
2332                                 timeout +=
2333                                     pii->pii_phyint->pi_group->pg_probeint;
2334                         }
2335
2336                         if (TIME_LT(timeout, now)) {
2337                                 /*
2338                                  * We hit a failure. Latch the total number of
2339                                  * recent consecutive successes.
2340                                  */
2341                                 pr_statp->pr_time_lost = timeout;
2342                                 probe_chstate(pr_statp, pii, PR_LOST);
2343                                 pi_found_failure = _B_TRUE;
2344                                 if (cur_tg != NULL && tg == cur_tg) {
2345                                         /*
2346                                          * We hit a failure for the desired
2347                                          * target. Latch the number of recent
2348                                          * consecutive successes for this target
2349                                          */
2350                                         tg_found_failure = _B_TRUE;
2351                                 }
2352                         }
2353                         break;
2354
2355                 case PR_ACKED:
2356                         /*
2357                          * Bump up the count of probe successes, if we
2358                          * have not seen any failure so far.
2359                          */
2360                         if (!pi_found_failure)
2361                                 psinfo->ps_nsucc++;
2362
2363                         if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2364                             !tg_found_failure) {
2365                                 psinfo->ps_nsucc_tg++;
2366                         }
2367
2368                         /*
2369                          * Record the time of last success, if this is
2370                          * the most recent probe success.
2371                          */
2372                         if (!psinfo->ps_tls_valid) {
2373                                 psinfo->ps_tls =
2374                                     ns2ms(pr_statp->pr_hrtime_ackproc);
2375                                 psinfo->ps_tls_valid = _B_TRUE;
2376                         }
2377                         break;
2378
2379                 case PR_LOST:
2380                         /*
2381                          * We hit a failure. Latch the total number of
2382                          * recent consecutive successes.
2383                          */
2384                         pi_found_failure = _B_TRUE;
2385                         if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2386                                 /*
2387                                  * We hit a failure for the desired target.
2388                                  * Latch the number of recent consecutive
2389                                  * successes for this target
2390                                  */
2391                                 tg_found_failure = _B_TRUE;
2392                         }
2393                         break;
2394
2395                 default:
2396                         return;
2397
2398                 }
2399         }
2400 }
2401
2402 /*
2403  * Return the information associated with consecutive probe failures
2404  * starting with the most recent probe. Only the last 2 probes can be in the
2405  * unacknowledged state. All previous probes have either failed or succeeded.
2406  */
2407 static void
2408 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
2409     struct probe_fail_count *pfinfo)
2410 {
2411         int     i;
2412         struct probe_stats *pr_statp;
2413         boolean_t       tg_found_success = _B_FALSE;
2414         boolean_t       pi_found_success = _B_FALSE;
2415         int     most_recent;
2416         int     second_most_recent;
2417         uint_t  now;
2418         uint_t  timeout;
2419         struct  target *tg;
2420
2421         if (debug & D_FAILREP)
2422                 logdebug("probe_fail_info(%s)\n", pii->pii_name);
2423
2424         bzero(pfinfo, sizeof (*pfinfo));
2425         now = getcurrenttime();
2426
2427         /*
2428          * Start with the most recent probe, and count the number
2429          * of consecutive probe failures. Latch the number of failures
2430          * on hitting a probe success.
2431          */
2432         most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2433         second_most_recent = PROBE_INDEX_PREV(most_recent);
2434
2435         for (i = most_recent; i != pii->pii_probe_next;
2436             i = PROBE_INDEX_PREV(i)) {
2437                 pr_statp = &pii->pii_probes[i];
2438
2439                 assert(PR_STATUS_VALID(pr_statp->pr_status));
2440
2441                 switch (pr_statp->pr_status) {
2442                 case PR_UNACKED:
2443                         /*
2444                          * Only the most recent 2 probes can be unacknowledged
2445                          */
2446                         assert(i == most_recent || i == second_most_recent);
2447
2448                         tg = pr_statp->pr_target;
2449                         /*
2450                          * Target is guaranteed to exist in the unack. state
2451                          */
2452                         assert(tg != NULL);
2453                         /*
2454                          * The crtt could be zero for some reason,
2455                          * Eg. the phyint could be failed. If the crtt is
2456                          * not available use the group's probe interval,
2457                          * which is a worst case estimate.
2458                          */
2459                         timeout = ns2ms(pr_statp->pr_hrtime_start);
2460                         if (tg->tg_crtt != 0) {
2461                                 timeout += tg->tg_crtt;
2462                         } else {
2463                                 timeout +=
2464                                     pii->pii_phyint->pi_group->pg_probeint;
2465                         }
2466
2467                         if (TIME_GT(timeout, now))
2468                                 break;
2469
2470                         pr_statp->pr_time_lost = timeout;
2471                         probe_chstate(pr_statp, pii, PR_LOST);
2472                         /* FALLTHRU */
2473
2474                 case PR_LOST:
2475                         if (!pi_found_success) {
2476                                 pfinfo->pf_nfail++;
2477                                 pfinfo->pf_tff = pr_statp->pr_time_lost;
2478                         }
2479                         if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2480                             !tg_found_success)  {
2481                                 pfinfo->pf_nfail_tg++;
2482                         }
2483                         break;
2484
2485                 default:
2486                         /*
2487                          * We hit a success or unused slot. Latch the
2488                          * total number of recent consecutive failures.
2489                          */
2490                         pi_found_success = _B_TRUE;
2491                         if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2492                                 /*
2493                                  * We hit a success for the desired target.
2494                                  * Latch the number of recent consecutive
2495                                  * failures for this target
2496                                  */
2497                                 tg_found_success = _B_TRUE;
2498                         }
2499                 }
2500         }
2501 }
2502
2503 /*
2504  * Change the state of probe `pr' on phyint_instance `pii' to state `state'.
2505  */
2506 void
2507 probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state)
2508 {
2509         if (pr->pr_status == state)
2510                 return;
2511
2512         pr->pr_status = state;
2513         (void) probe_state_event(pr, pii);
2514 }
2515
2516 /*
2517  * Check if the phyint has been repaired.  If no test address has been
2518  * configured, then consider the interface repaired if the link is up (unless
2519  * the link is flapping; see below).  Otherwise, look for proof of probes
2520  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
2521  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2522  */
2523 static boolean_t
2524 phyint_repaired(struct phyint *pi)
2525 {
2526         struct  probe_success_count psinfo;
2527         struct  phyint_instance *pii;
2528         struct  target *cur_tg;
2529         int     pr_ndx;
2530         uint_t  cur_time;
2531
2532         if (debug & D_FAILREP)
2533                 logdebug("phyint_repaired(%s)\n", pi->pi_name);
2534
2535         if (LINK_DOWN(pi))
2536                 return (_B_FALSE);
2537
2538         /*
2539          * If we don't have any test addresses and the link is up, then
2540          * consider the interface repaired, unless we've received more than
2541          * LINK_UP_PERMIN link up notifications in the last minute, in
2542          * which case we keep the link down until we drop back below
2543          * the threshold.
2544          */
2545         if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
2546                 cur_time = getcurrenttime();
2547                 if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
2548                     (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
2549                         pi->pi_lfmsg_printed = 0;
2550                         return (_B_TRUE);
2551                 }
2552                 if (!pi->pi_lfmsg_printed) {
2553                         logerr("The link has come up on %s more than %d times "
2554                             "in the last minute; disabling repair until it "
2555                             "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
2556                         pi->pi_lfmsg_printed = 1;
2557                 }
2558
2559                 return (_B_FALSE);
2560         }
2561
2562         pii = pi->pi_v4;
2563         if (PROBE_CAPABLE(pii)) {
2564                 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2565                 cur_tg = pii->pii_probes[pr_ndx].pr_target;
2566                 probe_success_info(pii, cur_tg, &psinfo);
2567                 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2568                     psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2569                         return (_B_TRUE);
2570         }
2571
2572         pii = pi->pi_v6;
2573         if (PROBE_CAPABLE(pii)) {
2574                 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2575                 cur_tg = pii->pii_probes[pr_ndx].pr_target;
2576                 probe_success_info(pii, cur_tg, &psinfo);
2577                 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2578                     psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2579                         return (_B_TRUE);
2580         }
2581
2582         return (_B_FALSE);
2583 }
2584
2585 /*
2586  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
2587  */
2588 boolean_t
2589 change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear)
2590 {
2591         int ifsock;
2592         struct lifreq lifr;
2593         uint64_t old_flags;
2594
2595         if (debug & D_FAILREP) {
2596                 logdebug("change_pif_flags(%s): set %llx clear %llx\n",
2597                     pi->pi_name, set, clear);
2598         }
2599
2600         if (pi->pi_v4 != NULL)
2601                 ifsock = ifsock_v4;
2602         else
2603                 ifsock = ifsock_v6;
2604
2605         /*
2606          * Get the current flags from the kernel, and set/clear the
2607          * desired phyint flags. Since we set only phyint flags, we can
2608          * do it on either IPv4 or IPv6 instance.
2609          */
2610         (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
2611
2612         if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
2613                 if (errno != ENXIO)
2614                         logperror("change_pif_flags: ioctl (get flags)");
2615                 return (_B_FALSE);
2616         }
2617
2618         old_flags = lifr.lifr_flags;
2619         lifr.lifr_flags |= set;
2620         lifr.lifr_flags &= ~clear;
2621
2622         if (old_flags == lifr.lifr_flags) {
2623                 /* No change in the flags. No need to send ioctl */
2624                 return (_B_TRUE);
2625         }
2626
2627         if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
2628                 if (errno != ENXIO)
2629                         logperror("change_pif_flags: ioctl (set flags)");
2630                 return (_B_FALSE);
2631         }
2632
2633         /*
2634          * Keep pi_flags in synch. with actual flags. Assumes flags are
2635          * phyint flags.
2636          */
2637         pi->pi_flags |= set;
2638         pi->pi_flags &= ~clear;
2639
2640         if (pi->pi_v4 != NULL)
2641                 pi->pi_v4->pii_flags = pi->pi_flags;
2642
2643         if (pi->pi_v6 != NULL)
2644                 pi->pi_v6->pii_flags = pi->pi_flags;
2645
2646         return (_B_TRUE);
2647 }
2648
/*
 * icmp cksum computation for IPv4.
 *
 * Computes the 16-bit ones' complement of the ones' complement sum of the
 * `len' bytes starting at `addr', taken as a sequence of 16-bit words in
 * memory order.  If `len' is odd, the final byte is treated as the first
 * byte of a 16-bit word whose second byte is zero.  A `len' of zero or
 * less yields 0xffff.
 *
 * The value is returned in the low 16 bits of the result.
 */
static int
in_cksum(ushort_t *addr, int len)
{
        ushort_t *w = addr;
        ushort_t odd_byte = 0;
        ushort_t answer;
        uint64_t sum = 0;
        int     nleft = len;

        /*
         * Add sequential 16 bit words into an unsigned 64 bit accumulator.
         * An unsigned accumulator of this width cannot overflow for any
         * `len' representable in an int, and keeps the folding below free
         * of shifts on negative values.
         */
        while (nleft > 1) {
                sum += *w++;
                nleft -= 2;
        }

        /* mop up an odd byte, if necessary */
        if (nleft == 1) {
                *(uchar_t *)(&odd_byte) = *(uchar_t *)w;
                sum += odd_byte;
        }

        /*
         * Fold the carry bits above the low 16 bits back into the low
         * 16 bits until none remain.
         */
        while ((sum >> 16) != 0)
                sum = (sum >> 16) + (sum & 0xffff);

        answer = (ushort_t)~sum;                /* truncate to 16 bits */
        return (answer);
}
2686
2687 static void
2688 reset_snxt_basetimes(void)
2689 {
2690         struct phyint_instance *pii;
2691
2692         for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2693                 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
2694         }
2695 }
2696
2697 /*
2698  * Is the address one of our own addresses? Unfortunately,
2699  * we cannot check our phyint tables to determine if the address
2700  * is our own. This is because, we don't track interfaces that
2701  * are not part of any group. We have to either use a 'bind' or
2702  * get the complete list of all interfaces using SIOCGLIFCONF,
2703  * to do this check. We could also use SIOCTMYADDR.
2704  * Bind fails for the local zone address, so we might include local zone
2705  * address as target address. If local zone address is a target address
2706  * and it is up, it is not possible to detect the interface failure.
2707  * SIOCTMYADDR also doesn't consider local zone address as own address.
2708  * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
2709  * are stored in `localaddrs'
2710  */
2711 boolean_t
2712 own_address(struct in6_addr addr)
2713 {
2714         addrlist_t *addrp;
2715         struct sockaddr_storage ss;
2716         int af = IN6_IS_ADDR_V4MAPPED(&addr) ? AF_INET : AF_INET6;
2717
2718         addr2storage(af, &addr, &ss);
2719         for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) {
2720                 if (sockaddrcmp(&ss, &addrp->al_addr))
2721                         return (_B_TRUE);
2722         }
2723         return (_B_FALSE);
2724 }
2725
/*
 * Convert `ns' nanoseconds to milliseconds using NSEC2MSEC(); the result
 * is narrowed to an int.
 */
static int
ns2ms(int64_t ns)
{
        const int ms = NSEC2MSEC(ns);

        return (ms);
}
2731
/*
 * Convert the time in `tvp' (seconds plus microseconds) to nanoseconds.
 *
 * Both fields are widened to int64_t before being scaled, so that the
 * multiplications are carried out in 64 bits even where the fields of
 * struct timeval and the NANOSEC constant are only 32 bits wide.
 */
static int64_t
tv2ns(struct timeval *tvp)
{
        int64_t sec = tvp->tv_sec;
        int64_t usec = tvp->tv_usec;

        return (sec * NANOSEC + usec * 1000);
}