4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #include <sys/types.h>
28 #include <sys/socket.h>
31 #include <netinet/in_systm.h>
32 #include <netinet/in.h>
33 #include <netinet/ip.h>
34 #include <netinet/ip6.h>
35 #include <arpa/inet.h>
36 #include <netinet/tcp.h>
37 #include <netinet/ip_icmp.h>
38 #include <netinet/icmp6.h>
39 #include <netinet/udp.h>
52 * The following values are what ilbd will set argv[0] to. This determines
53 * what type of probe to send out.
55 #define PROBE_PING "ilb_ping"
56 #define PROBE_PROTO "ilb_probe"
58 /* The transport protocol to use in the probe. Value of argv[3]. */
59 #define PROTO_TCP "TCP"
60 #define PROTO_UDP "UDP"
/*
 * Kind of probe to send to the back-end server: an ICMP(v4/v6) echo
 * request, a TCP connect(), or a zero-length UDP datagram.  Chosen from
 * argv[0] and, for PROBE_PROTO, from the transport protocol in argv[3].
 */
enum probe_type {
	ping_probe,
	tcp_probe,
	udp_probe
};
64 /* Load balance mode. Value of argv[4]. */
65 #define MODE_DSR "DSR"
66 #define MODE_NAT "NAT"
67 #define MODE_HALF_NAT "HALF_NAT"
/*
 * Parsed form of the load balance mode string passed in argv[4]
 * (compared case-insensitively against the MODE_* strings).
 */
enum lb_mode {
	dsr,
	nat,
	half_nat
};
71 /* Number of arguments to the command from ilbd. */
74 /* Size of buffer used to receive ICMP packet */
75 #define RECV_PKT_SZ 256
78 * Struct to store the probe info (most is passed in using the argv[] array to
79 * the command given by ilbd). The argv[] contains the following.
81 * argv[0] is either PROBE_PING or PROBE_PROTO
83 * argv[2] is the backend server address
84 * argv[3] is the transport protocol used in the rule
85 * argv[4] is the load balance mode, "DSR", "NAT", "HALF_NAT"
86 * argv[5] is the probe port
87 * argv[6] is the probe timeout
89 * The following three fields are used in sending ICMP ECHO probe.
91 * echo_id is the ID set in the probe
92 * echo_seq is the sequence set in the probe
93 * echo_cookie is the random number data in a probe
94 * lport is the local port (in network byte order) used to send the probe
97 enum probe_type probe
;
98 struct in6_addr vip
; /* argv[1] */
99 struct in6_addr srv_addr
; /* argv[2] */
100 int proto
; /* argv[3] */
101 enum lb_mode mode
; /* argv[4] */
102 in_port_t port
; /* argv[5] */
103 uint32_t timeout
; /* argv[6] */
107 uint32_t echo_cookie
;
111 /* Global variable to indicate whether a timeout means success. */
112 static boolean_t timeout_is_good
;
114 /* SIGALRM handler */
119 if (timeout_is_good
) {
129 * Checksum routine for Internet Protocol family headers (C Version)
130 * (copied from ping.c)
133 in_cksum(ushort_t
*addr
, int len
)
138 ushort_t odd_byte
= 0;
142 * Our algorithm is simple, using a 32 bit accumulator (sum),
143 * we add sequential 16 bit words to it, and at the end, fold
144 * back all the carry bits from the top 16 bits into the lower
152 /* mop up an odd byte, if necessary */
154 *(uchar_t
*)(&odd_byte
) = *(uchar_t
*)w
;
159 * add back carry outs from top 16 bits to low 16 bits
161 sum
= (sum
>> 16) + (sum
& 0xffff); /* add hi 16 to low 16 */
162 sum
+= (sum
>> 16); /* add carry */
163 answer
= ~sum
; /* truncate to 16 bits */
167 /* It is assumed that argv[] contains PROG_ARGC arguments. */
169 parse_probe_param(char *argv
[], probe_param_t
*param
)
173 struct in_addr v4addr
;
175 if (strcmp(basename(argv
[0]), PROBE_PING
) == 0) {
176 param
->probe
= ping_probe
;
178 if (strcmp(basename(argv
[0]), PROBE_PROTO
) != 0)
181 if (strcasecmp(argv
[3], PROTO_TCP
) == 0) {
182 param
->probe
= tcp_probe
;
183 param
->proto
= IPPROTO_TCP
;
184 } else if (strcasecmp(argv
[3], PROTO_UDP
) == 0) {
185 param
->probe
= udp_probe
;
186 param
->proto
= IPPROTO_UDP
;
192 if (strchr(argv
[1], ':') != NULL
) {
193 if (inet_pton(AF_INET6
, argv
[1], ¶m
->vip
) == 0)
195 } else if (strchr(argv
[1], '.') != NULL
) {
196 if (inet_pton(AF_INET
, argv
[1], &v4addr
) == 0)
198 IN6_INADDR_TO_V4MAPPED(&v4addr
, ¶m
->vip
);
204 * The address family of vip and srv_addr should be the same for
205 * now. But in future, we may allow them to be different... So
206 * we don't do a check here.
208 if (strchr(argv
[2], ':') != NULL
) {
209 if (inet_pton(AF_INET6
, argv
[2], ¶m
->srv_addr
) == 0)
211 } else if (strchr(argv
[2], '.') != NULL
) {
212 if (inet_pton(AF_INET
, argv
[2], &v4addr
) == 0)
214 IN6_INADDR_TO_V4MAPPED(&v4addr
, ¶m
->srv_addr
);
219 if (strcasecmp(argv
[4], MODE_DSR
) == 0)
221 else if (strcasecmp(argv
[4], MODE_NAT
) == 0)
223 else if (strcasecmp(argv
[4], MODE_HALF_NAT
) == 0)
224 param
->mode
= half_nat
;
228 if ((port
= atoi(argv
[5])) <= 0 || port
> USHRT_MAX
)
232 if ((timeout
= strtoll(argv
[6], NULL
, 10)) <= 0 || timeout
> UINT_MAX
)
234 param
->timeout
= timeout
;
240 * Set up the destination address to be used to send a probe based on
244 set_sockaddr(struct sockaddr_storage
*addr
, socklen_t
*addr_len
,
245 void **next_hop
, probe_param_t
*param
)
248 struct in6_addr
*param_addr
;
249 struct sockaddr_in
*v4_addr
;
250 struct sockaddr_in6
*v6_addr
;
251 boolean_t nh
= B_FALSE
;
253 switch (param
->mode
) {
255 param_addr
= ¶m
->vip
;
260 param_addr
= ¶m
->srv_addr
;
263 if (IN6_IS_ADDR_V4MAPPED(param_addr
)) {
265 v4_addr
= (struct sockaddr_in
*)addr
;
266 IN6_V4MAPPED_TO_INADDR(param_addr
, &v4_addr
->sin_addr
);
267 v4_addr
->sin_family
= AF_INET
;
268 v4_addr
->sin_port
= htons(param
->port
);
270 *addr_len
= sizeof (*v4_addr
);
273 v6_addr
= (struct sockaddr_in6
*)addr
;
274 v6_addr
->sin6_family
= AF_INET6
;
275 v6_addr
->sin6_addr
= *param_addr
;
276 v6_addr
->sin6_port
= htons(param
->port
);
277 v6_addr
->sin6_flowinfo
= 0;
278 v6_addr
->sin6_scope_id
= 0;
280 *addr_len
= sizeof (*v6_addr
);
291 nh_addr
= malloc(sizeof (ipaddr_t
));
292 IN6_V4MAPPED_TO_IPADDR(¶m
->srv_addr
, *nh_addr
);
295 struct sockaddr_in6
*nh_addr
;
297 nh_addr
= malloc(sizeof (*nh_addr
));
298 nh_addr
->sin6_family
= AF_INET6
;
299 nh_addr
->sin6_addr
= param
->srv_addr
;
300 nh_addr
->sin6_flowinfo
= 0;
301 nh_addr
->sin6_scope_id
= 0;
309 * Use TCP to check if the peer server is alive. Create a TCP socket and
310 * then call connect() to reach the peer server. If connect() does not
311 * return within the timeout period, the SIGALRM handler will be invoked
312 * and tell ilbd that the peer server is not alive.
315 tcp_query(probe_param_t
*param
)
319 struct sockaddr_storage dst_addr
;
320 socklen_t dst_addr_len
;
328 af
= set_sockaddr(&dst_addr
, &dst_addr_len
, &next_hop
, param
);
330 if ((sd
= socket(af
, SOCK_STREAM
, param
->proto
)) == -1)
333 /* DSR mode, need to set the next hop */
334 if (next_hop
!= NULL
) {
336 if (setsockopt(sd
, IPPROTO_IP
, IP_NEXTHOP
, next_hop
,
337 sizeof (ipaddr_t
)) < 0) {
342 if (setsockopt(sd
, IPPROTO_IPV6
, IPV6_NEXTHOP
,
343 next_hop
, sizeof (struct sockaddr_in6
)) < 0) {
350 timeout_is_good
= B_FALSE
;
351 (void) alarm(param
->timeout
);
353 if (connect(sd
, (struct sockaddr
*)&dst_addr
, dst_addr_len
) != 0) {
359 rtt
= (end
- start
) / (NANOSEC
/ MICROSEC
);
362 (void) printf("%u", rtt
);
370 * Check if the ICMP packet is a port unreachable message in response to
371 * our probe. Return -1 if no, 0 if yes.
374 check_icmp_unreach_v4(struct icmp
*icmph
, probe_param_t
*param
)
379 if (icmph
->icmp_type
!= ICMP_UNREACH
)
381 if (icmph
->icmp_code
!= ICMP_UNREACH_PORT
)
384 /* LINTED E_BAD_PTR_CAST_ALIGN */
385 iph
= (struct ip
*)((char *)icmph
+ ICMP_MINLEN
);
386 if (iph
->ip_p
!= IPPROTO_UDP
)
389 /* LINTED E_BAD_PTR_CAST_ALIGN */
390 udph
= (struct udphdr
*)((char *)iph
+ (iph
->ip_hl
<< 2));
391 if (udph
->uh_dport
!= htons(param
->port
))
393 if (udph
->uh_sport
!= param
->lport
)
396 /* All matched, it is a response to the probe we sent. */
401 * Check if the ICMP packet is a reply to our echo request. Need to match
402 * the ID and sequence.
405 check_icmp_echo_v4(struct icmp
*icmph
, probe_param_t
*param
)
410 if (icmph
->icmp_type
!= ICMP_ECHOREPLY
)
412 if (icmph
->icmp_id
!= param
->echo_id
)
414 if (icmph
->icmp_seq
!= param
->echo_seq
)
417 bcopy(icmph
->icmp_data
, &cookie
, sizeof (cookie
));
418 if (cookie
!= param
->echo_cookie
)
420 bcopy(icmph
->icmp_data
+ sizeof (cookie
), &port
, sizeof (port
));
421 if (port
!= param
->port
)
424 /* All matched, it is a response to the echo we sent. */
428 /* Verify if an ICMP packet is what we expect. */
430 check_icmp_v4(char *buf
, ssize_t rcvd
, probe_param_t
*param
)
436 * We can dereference the length field without worry since the stack
437 * should not have sent up the packet if it is smaller than a normal
440 /* LINTED E_BAD_PTR_CAST_ALIGN */
441 iph
= (struct ip
*)buf
;
442 /* LINTED E_BAD_PTR_CAST_ALIGN */
443 icmph
= (struct icmp
*)((char *)iph
+ (iph
->ip_hl
<< 2));
446 * If we sent a UDP probe, check if the packet is a port
447 * unreachable message in response to our probe.
449 * If we sent an ICMP echo request, check if the packet is a reply
450 * to our echo request.
452 if (param
->probe
== udp_probe
) {
453 /* Is the packet large enough for further checking? */
454 if (rcvd
< 2 * sizeof (struct ip
) + ICMP_MINLEN
+
455 sizeof (struct udphdr
)) {
458 return (check_icmp_unreach_v4(icmph
, param
));
460 if (rcvd
< sizeof (struct ip
) + ICMP_MINLEN
)
462 return (check_icmp_echo_v4(icmph
, param
));
467 * Check if the ICMPv6 packet is a port unreachable message in response to
468 * our probe. Return -1 if no, 0 if yes.
471 check_icmp_unreach_v6(icmp6_t
*icmp6h
, probe_param_t
*param
)
476 if (icmp6h
->icmp6_type
!= ICMP6_DST_UNREACH
)
478 if (icmp6h
->icmp6_code
!= ICMP6_DST_UNREACH_NOPORT
)
481 /* LINTED E_BAD_PTR_CAST_ALIGN */
482 ip6h
= (ip6_t
*)((char *)icmp6h
+ ICMP6_MINLEN
);
483 if (ip6h
->ip6_nxt
!= IPPROTO_UDP
)
486 udph
= (struct udphdr
*)(ip6h
+ 1);
488 if (udph
->uh_dport
!= htons(param
->port
))
490 if (udph
->uh_sport
!= param
->lport
)
497 * Check if the ICMPv6 packet is a reply to our echo request. Need to match
498 * the ID and sequence.
501 check_icmp_echo_v6(icmp6_t
*icmp6h
, probe_param_t
*param
)
507 if (icmp6h
->icmp6_type
!= ICMP6_ECHO_REPLY
)
509 if (icmp6h
->icmp6_id
!= param
->echo_id
)
511 if (icmp6h
->icmp6_seq
!= param
->echo_seq
)
513 tmp
= (char *)icmp6h
+ ICMP6_MINLEN
;
514 bcopy(tmp
, &cookie
, sizeof (cookie
));
515 if (cookie
!= param
->echo_cookie
)
517 tmp
+= sizeof (cookie
);
518 bcopy(tmp
, &port
, sizeof (port
));
519 if (port
!= param
->port
)
522 /* All matched, it is a response to the echo we sent. */
526 /* Verify if an ICMPv6 packet is what we expect. */
528 check_icmp_v6(char *buf
, ssize_t rcvd
, probe_param_t
*param
)
532 /* LINTED E_BAD_PTR_CAST_ALIGN */
533 icmp6h
= (icmp6_t
*)(buf
);
536 * If we sent a UDP probe, check if the packet is a port
537 * unreachable message.
539 * If we sent an ICMPv6 echo request, check if the packet is a reply.
541 if (param
->probe
== udp_probe
) {
542 /* Is the packet large enough for further checking? */
543 if (rcvd
< sizeof (ip6_t
) + ICMP6_MINLEN
+
544 sizeof (struct udphdr
)) {
547 return (check_icmp_unreach_v6(icmp6h
, param
));
549 if (rcvd
< ICMP6_MINLEN
)
551 return (check_icmp_echo_v6(icmp6h
, param
));
556 * Wait for an ICMP reply indefinitely. If we get what we expect, return 0.
557 * If an error happens, return -1.
560 wait_icmp_reply(int af
, int recv_sd
, struct sockaddr_storage
*exp_from
,
561 probe_param_t
*param
)
563 char buf
[RECV_PKT_SZ
];
570 struct sockaddr_in v4_from
;
572 from_len
= sizeof (v4_from
);
573 if ((rcvd
= recvfrom(recv_sd
, buf
, RECV_PKT_SZ
, 0,
574 (struct sockaddr
*)&v4_from
, &from_len
)) < 0) {
579 /* Packet not from our peer, ignore it. */
580 if ((((struct sockaddr_in
*)exp_from
)->sin_addr
.s_addr
)
581 != v4_from
.sin_addr
.s_addr
) {
584 if (check_icmp_v4(buf
, rcvd
, param
) == 0) {
589 struct sockaddr_in6 v6_from
;
591 from_len
= sizeof (struct sockaddr_in6
);
592 if ((rcvd
= recvfrom(recv_sd
, buf
, RECV_PKT_SZ
, 0,
593 (struct sockaddr
*)&v6_from
, &from_len
)) < 0) {
598 if (!IN6_ARE_ADDR_EQUAL(&(v6_from
.sin6_addr
),
599 &((struct sockaddr_in6
*)exp_from
)->sin6_addr
)) {
602 if (check_icmp_v6(buf
, rcvd
, param
) == 0) {
611 /* Return the local port used (network byte order) in a socket. */
613 get_lport(int sd
, in_port_t
*lport
)
615 struct sockaddr_storage addr
;
618 addr_sz
= sizeof (addr
);
619 if (getsockname(sd
, (struct sockaddr
*)&addr
, &addr_sz
) != 0)
621 if (addr
.ss_family
== AF_INET
)
622 *lport
= ((struct sockaddr_in
*)&addr
)->sin_port
;
624 *lport
= ((struct sockaddr_in6
*)&addr
)->sin6_port
;
629 * Use UDP to check if the peer server is alive. Send a 0 length UDP packet
630 * to the peer server. If there is no one listening, the peer IP stack
631 * should send back a port unreachable ICMP(v4/v6) packet. If the peer
632 * server is alive, there should be no response. So if we get SIGALRM,
636 udp_query(probe_param_t
*param
)
639 int send_sd
, recv_sd
, af
;
640 struct sockaddr_storage dst_addr
;
644 struct itimerval timeout
;
650 af
= set_sockaddr(&dst_addr
, &addr_len
, &next_hop
, param
);
652 if ((send_sd
= socket(af
, SOCK_DGRAM
, param
->proto
)) == -1)
654 if ((recv_sd
= socket(af
, SOCK_RAW
, (af
== AF_INET
) ? IPPROTO_ICMP
:
655 IPPROTO_ICMPV6
)) == -1) {
659 /* DSR mode, need to set the next hop */
660 if (next_hop
!= NULL
) {
662 if (setsockopt(send_sd
, IPPROTO_IP
, IP_NEXTHOP
,
663 next_hop
, sizeof (ipaddr_t
)) < 0) {
668 if (setsockopt(send_sd
, IPPROTO_IPV6
, IPV6_NEXTHOP
,
669 next_hop
, sizeof (struct sockaddr_in6
)) < 0) {
677 * If ilbd asks us to wait at most t, we will wait for at most
678 * t', which is 3/4 of t. If we wait for too long, ilbd may
679 * time out and kill us.
681 timeout
.it_interval
.tv_sec
= 0;
682 timeout
.it_interval
.tv_usec
= 0;
683 tm
= (param
->timeout
* MICROSEC
>> 2) * 3;
685 timeout
.it_value
.tv_sec
= tm
/ MICROSEC
;
686 timeout
.it_value
.tv_usec
= tm
- (timeout
.it_value
.tv_sec
*
689 timeout
.it_value
.tv_sec
= 0;
690 timeout
.it_value
.tv_usec
= tm
;
692 timeout_is_good
= B_TRUE
;
693 if (setitimer(ITIMER_REAL
, &timeout
, NULL
) != 0) {
698 if (sendto(send_sd
, buf
, 0, 0, (struct sockaddr
*)&dst_addr
,
703 if ((ret
= get_lport(send_sd
, ¶m
->lport
)) != 0)
707 * If the server app is listening, we should not get back a
708 * response. So if wait_icmp_reply() returns, either there
709 * is an error or we get back something.
711 (void) wait_icmp_reply(af
, recv_sd
, &dst_addr
, param
);
715 (void) close(send_sd
);
716 (void) close(recv_sd
);
721 * Size (in uint32_t) of the ping packet to be sent to server. It includes
722 * a cookie (random number) + the target port. The cookie and port are used
723 * for matching ping request since there can be many such ping packets sent
724 * to different servers from the same source address and using the same VIP.
725 * The last two bytes are for padding.
728 #define PING_PKT_LEN \
729 ((ICMP_MINLEN + 2 * sizeof (uint32_t)) / sizeof (uint32_t))
732 * Try to get a random number from the pseudo random number device
733 * /dev/urandom. If there is any error, return (uint32_t)gethrtime()
742 if ((fd
= open("/dev/urandom", O_RDONLY
)) == -1)
743 return ((uint32_t)gethrtime());
745 if (read(fd
, &num
, sizeof (num
)) != sizeof (num
))
746 num
= ((uint32_t)gethrtime());
753 * Use ICMP(v4/v6) echo request to check if the peer server machine is
754 * reachable. Send an echo request and expect to get back an echo reply.
757 ping_query(probe_param_t
*param
)
761 struct sockaddr_storage dst_addr
;
762 socklen_t dst_addr_len
;
766 uint32_t buf
[PING_PKT_LEN
];
772 af
= set_sockaddr(&dst_addr
, &dst_addr_len
, &next_hop
, param
);
774 if ((sd
= socket(af
, SOCK_RAW
, (af
== AF_INET
) ? IPPROTO_ICMP
:
775 IPPROTO_ICMPV6
)) == -1) {
779 /* DSR mode, need to set the next hop */
780 if (next_hop
!= NULL
) {
782 if (setsockopt(sd
, IPPROTO_IP
, IP_NEXTHOP
, next_hop
,
783 sizeof (ipaddr_t
)) < 0) {
788 if (setsockopt(sd
, IPPROTO_IPV6
, IPV6_NEXTHOP
,
789 next_hop
, sizeof (struct sockaddr_in6
)) < 0) {
796 bzero(buf
, sizeof (buf
));
797 icmph
= (struct icmp
*)buf
;
798 icmph
->icmp_type
= af
== AF_INET
? ICMP_ECHO
: ICMP6_ECHO_REQUEST
;
799 icmph
->icmp_code
= 0;
800 icmph
->icmp_cksum
= 0;
801 icmph
->icmp_id
= htons(gethrtime() % USHRT_MAX
);
802 icmph
->icmp_seq
= htons(gethrtime() % USHRT_MAX
);
804 param
->echo_cookie
= get_random();
805 bcopy(¶m
->echo_cookie
, icmph
->icmp_data
,
806 sizeof (param
->echo_cookie
));
807 bcopy(¶m
->port
, icmph
->icmp_data
+ sizeof (param
->echo_cookie
),
808 sizeof (param
->port
));
809 icmph
->icmp_cksum
= in_cksum((ushort_t
*)buf
, sizeof (buf
));
810 param
->echo_id
= icmph
->icmp_id
;
811 param
->echo_seq
= icmph
->icmp_seq
;
813 timeout_is_good
= B_FALSE
;
814 (void) alarm(param
->timeout
);
816 if (sendto(sd
, buf
, sizeof (buf
), 0, (struct sockaddr
*)&dst_addr
,
817 dst_addr_len
) != sizeof (buf
)) {
821 if (wait_icmp_reply(af
, sd
, &dst_addr
, param
) != 0) {
827 rtt
= (end
- start
) / (NANOSEC
/ MICROSEC
);
830 (void) printf("%u", rtt
);
838 main(int argc
, char *argv
[])
843 /* ilbd should pass in PROG_ARGC parameters. */
844 if (argc
!= PROG_ARGC
) {
849 if (signal(SIGALRM
, probe_exit
) == SIG_ERR
) {
854 if (!parse_probe_param(argv
, ¶m
)) {
859 switch (param
.probe
) {
861 ret
= ping_query(¶m
);
864 ret
= tcp_query(¶m
);
867 ret
= udp_query(¶m
);