4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * nfs_cast.c : broadcast to a specific group of NFS servers
25 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
29 #pragma ident "%Z%%M% %I% %E% SMI"
35 #include <sys/types.h>
37 #include <sys/resource.h>
41 #include <rpc/clnt_soc.h>
42 #include <rpc/nettype.h>
43 #include <rpc/pmap_prot.h>
44 #include <netconfig.h>
49 #include "automount.h"
/*
 * Microseconds added to a measured response time for each unit of a
 * timestamp's ts_penalty (applied in sort_responses()).
 */
51 #define PENALTY_WEIGHT 100000
/*
 * One node of a singly linked list of timestamps, chained through ts_next.
 * ts_timeval is stamped with the send time just before a request goes out
 * and is later converted in place to an elapsed time by calc_resp_time().
 * NOTE(review): other members (ts_rcvd, ts_inx, ts_penalty are referenced
 * elsewhere in this file) are not visible in this view.
 */
54 struct tstamps
*ts_next
;
58 struct timeval ts_timeval
;
61 /* A list of addresses - all belonging to the same transport */
/*
 * One node per successful netdir_getbyname() lookup on a transport.
 */
64 struct addrs
*addr_next
;
/* map entry handed to sort_responses() for this address set */
65 struct mapfs
*addr_mfs
;
/* address list returned by netdir_getbyname() */
66 struct nd_addrlist
*addr_addrs
;
/* head of the timestamp list for this address set */
67 struct tstamps
*addr_if_tstamps
;
70 /* A list of connectionless transports */
/*
 * One node per transport opened by nfs_cast().
 * NOTE(review): tr_fd and tr_device are used elsewhere in this file but
 * their declarations are not visible in this view.
 */
73 struct transp
*tr_next
;
/* T_BIND structure from t_alloc(); its addr is used as the receive address */
76 struct t_bind
*tr_taddr
;
/* addresses looked up on this transport */
77 struct addrs
*tr_addrs
;
80 /* A list of map entries and their roundtrip times, for sorting */
/*
 * Response time copied from a timestamp in sort_responses(); this is the
 * key compared by time_sm().
 */
84 struct timeval timeval
;
/*
 * Forward declarations for the file-local helpers used by nfs_cast(),
 * followed by the extern declaration of add_mfs().
 * NOTE(review): the add_mfs() declaration is cut off in this view.
 */
87 static void free_transports(struct transp
*);
88 static void calc_resp_time(struct timeval
*);
89 static struct mapfs
*sort_responses(struct transp
*);
90 static int host_sm(const void *, const void *b
);
91 static int time_sm(const void *, const void *b
);
92 extern struct mapfs
*add_mfs(struct mapfs
*, int, struct mapfs
**,
96 * This routine is designed to be able to "ping"
97 * a list of hosts and create a list of responding
98 * hosts sorted by response time.
99 * This must be done without any prior
100 * contact with the host - therefore the "ping"
101 * must be to a "well-known" address. The outstanding
102 * candidate here is the address of "rpcbind".
104 * A response to a ping is no guarantee that the host
105 * is running NFS, has a mount daemon, or exports
106 * the required filesystem. If the subsequent
107 * mount attempt fails then the host will be marked
108 * "ignore" and the host list will be re-pinged
109 * (sans the bad host). This process continues
110 * until a successful mount is achieved or until
111 * there are no hosts left to try.
/*
 * nfs_cast: send an rpcbind NULLPROC call (program RPCBPROG, version
 * PMAPVERS) to the hosts in mfs_in over the transports returned by
 * getnetconfig(), collect the replies, and store the result of
 * sort_responses() in *mfs_out.
 *
 *   mfs_in   list of map entries, walked through mfs_next
 *   mfs_out  receives the list built by sort_responses()
 *   timeout  the send/receive loop runs while this is > 0
 *            (NOTE(review): where it is decremented is not visible here)
 *
 * NOTE(review): the return statement is not visible in this view;
 * presumably the function returns `stat' - confirm against the full file.
 */
114 nfs_cast(struct mapfs
*mfs_in
, struct mapfs
**mfs_out
, int timeout
)
117 AUTH
*sys_auth
= authsys_create_default();
119 register XDR
*xdrs
= &xdr_stream
;
124 int sent
, addr_cnt
, rcvd
, if_cnt
;
125 fd_set readfds
, mask
;
126 register ulong_t xid
; /* xid - unique per addr */
129 struct timeval t
, rcv_timeout
;
130 char outbuf
[UDPMSGSIZE
], inbuf
[UDPMSGSIZE
];
131 struct t_unitdata t_udata
, t_rdata
;
132 struct nd_hostserv hs
;
133 struct nd_addrlist
*retaddrs
;
134 struct transp
*tr_head
;
135 struct transp
*trans
, *prev_trans
;
136 struct addrs
*a
, *prev_addr
;
137 struct tstamps
*ts
, *prev_ts
;
138 NCONF_HANDLE
*nc
= NULL
;
139 struct netconfig
*nconf
;
145 * For each connectionless transport get a list of
146 * host addresses. Any single host may have
147 * addresses on several transports.
149 addr_cnt
= sent
= rcvd
= 0;
154 * Set the default select size to be the maximum FD_SETSIZE, unless
155 * the current rlimit is lower.
157 dtbsize
= FD_SETSIZE
;
158 if (getrlimit(RLIMIT_NOFILE
, &rl
) == 0) {
159 if (rl
.rlim_cur
< FD_SETSIZE
)
160 dtbsize
= rl
.rlim_cur
;
166 for (mfs
= mfs_in
; mfs
; mfs
= mfs
->mfs_next
) {
169 trace_prt(1, "nfs_cast: host=%s\n", mfs
->mfs_host
);
/*
 * The test below matches transports that are not visible, are not
 * connectionless (NC_TPI_CLTS), or belong to the loopback family.
 * NOTE(review): the statement it guards is not visible here; presumably
 * such transports are skipped.
 */
176 while (nconf
= getnetconfig(nc
)) {
177 if (!(nconf
->nc_flag
& NC_VISIBLE
) ||
178 nconf
->nc_semantics
!= NC_TPI_CLTS
||
179 (strcmp(nconf
->nc_protofmly
, NC_LOOPBACK
) == 0))
181 trans
= (struct transp
*)malloc(sizeof (*trans
));
183 syslog(LOG_ERR
, "no memory");
187 (void) memset(trans
, 0, sizeof (*trans
));
191 prev_trans
->tr_next
= trans
;
194 trans
->tr_fd
= t_open(nconf
->nc_device
, O_RDWR
, NULL
);
195 if (trans
->tr_fd
< 0) {
196 syslog(LOG_ERR
, "nfscast: t_open: %s:%m",
201 if (t_bind(trans
->tr_fd
, (struct t_bind
*)NULL
,
202 (struct t_bind
*)NULL
) < 0) {
203 syslog(LOG_ERR
, "nfscast: t_bind: %m");
208 /* LINTED pointer alignment */
209 (struct t_bind
*)t_alloc(trans
->tr_fd
, T_BIND
, T_ADDR
);
210 if (trans
->tr_taddr
== (struct t_bind
*)NULL
) {
211 syslog(LOG_ERR
, "nfscast: t_alloc: %m");
212 stat
= RPC_SYSTEMERROR
;
216 trans
->tr_device
= nconf
->nc_device
;
217 FD_SET(trans
->tr_fd
, &mask
);
/* Look up the host's "rpcbind" service address on this transport. */
220 hs
.h_host
= mfs
->mfs_host
;
221 hs
.h_serv
= "rpcbind";
222 if (netdir_getbyname(nconf
, &hs
, &retaddrs
) == ND_OK
) {
225 * If mfs->ignore is previously set for
226 * this map, clear it. Because a host can
227 * have either v6 or v4 address
229 if (mfs
->mfs_ignore
== 1)
232 a
= (struct addrs
*)malloc(sizeof (*a
));
234 syslog(LOG_ERR
, "no memory");
238 (void) memset(a
, 0, sizeof (*a
));
239 if (trans
->tr_addrs
== NULL
)
242 prev_addr
->addr_next
= a
;
244 a
->addr_if_tstamps
= NULL
;
246 a
->addr_addrs
= retaddrs
;
247 if_cnt
= retaddrs
->n_cnt
;
249 ts
= (struct tstamps
*)
250 malloc(sizeof (*ts
));
252 syslog(LOG_ERR
, "no memory");
256 (void) memset(ts
, 0, sizeof (*ts
));
257 ts
->ts_penalty
= mfs
->mfs_penalty
;
258 if (a
->addr_if_tstamps
== NULL
)
259 a
->addr_if_tstamps
= ts
;
261 prev_ts
->ts_next
= ts
;
263 ts
->ts_inx
= if_inx
++;
271 "%s:%s address not known",
273 strcmp(nconf
->nc_proto
, NC_INET
)?"IPv6":"IPv4");
281 syslog(LOG_ERR
, "nfscast: couldn't find addresses");
/*
 * Base transaction id: the low 8 bits are cleared so that a timestamp's
 * ts_inx can be added at send time and recovered from the reply with
 * (rm_xid & 0xFF).
 */
286 (void) gettimeofday(&t
, (struct timezone
*)0);
287 xid
= (getpid() ^ t
.tv_sec
^ t
.tv_usec
) & ~0xFF;
290 /* serialize the RPC header */
292 msg
.rm_direction
= CALL
;
293 msg
.rm_call
.cb_rpcvers
= RPC_MSG_VERSION
;
294 msg
.rm_call
.cb_prog
= RPCBPROG
;
296 * we can not use RPCBVERS here since it doesn't exist in 4.X,
297 * the fix to bug 1139883 has made the 4.X portmapper silent to
298 * version mismatches. This causes the RPC call to the remote
299 * portmapper to simply be ignored if it's not Version 2.
301 msg
.rm_call
.cb_vers
= PMAPVERS
;
302 msg
.rm_call
.cb_proc
= NULLPROC
;
303 if (sys_auth
== (AUTH
*)NULL
) {
304 stat
= RPC_SYSTEMERROR
;
307 msg
.rm_call
.cb_cred
= sys_auth
->ah_cred
;
308 msg
.rm_call
.cb_verf
= sys_auth
->ah_verf
;
309 xdrmem_create(xdrs
, outbuf
, sizeof (outbuf
), XDR_ENCODE
);
310 if (! xdr_callmsg(xdrs
, &msg
)) {
311 stat
= RPC_CANTENCODEARGS
;
314 outlen
= (int)xdr_getpos(xdrs
);
318 t_udata
.udata
.buf
= outbuf
;
319 t_udata
.udata
.len
= outlen
;
322 * Basic loop: send packet to all hosts and wait for response(s).
323 * The response timeout grows larger per iteration.
324 * A unique xid is assigned to each address in order to
325 * correctly match the replies.
/* The receive timeout starts at 4 seconds and doubles on every pass. */
327 for (tsec
= 4; timeout
> 0; tsec
*= 2) {
333 rcv_timeout
.tv_sec
= tsec
;
334 rcv_timeout
.tv_usec
= 0;
337 for (trans
= tr_head
; trans
; trans
= trans
->tr_next
) {
338 for (a
= trans
->tr_addrs
; a
; a
= a
->addr_next
) {
339 struct netbuf
*if_netbuf
=
340 a
->addr_addrs
->n_addrs
;
341 ts
= a
->addr_if_tstamps
;
342 if_cnt
= a
->addr_addrs
->n_cnt
;
346 * xid is the first thing in
347 * preserialized buffer
349 /* LINTED pointer alignment */
350 *((ulong_t
*)outbuf
) =
351 htonl(xid
+ ts
->ts_inx
);
/* Record the send time; calc_resp_time() later turns it into elapsed time. */
352 (void) gettimeofday(&(ts
->ts_timeval
),
353 (struct timezone
*)0);
355 * Check if already received
356 * from a previous iteration.
364 t_udata
.addr
= *if_netbuf
++;
366 if (t_sndudata(trans
->tr_fd
,
375 if (sent
== 0) { /* no packets sent ? */
381 * Have sent all the packets. Now collect the responses...
385 msg
.acpted_rply
.ar_verf
= _null_auth
;
386 msg
.acpted_rply
.ar_results
.proc
= xdr_void
;
389 switch (select(dtbsize
, &readfds
,
390 (fd_set
*)NULL
, (fd_set
*)NULL
, &rcv_timeout
)) {
392 case 0: /* Timed out */
394 * If we got at least one response in the
395 * last interval, then don't wait for any
396 * more. In theory we should wait for
397 * the max weighting (penalty) value so
398 * that a very slow server has a chance to
399 * respond but this could take a long time
400 * if the admin has set a high weighting
409 case -1: /* some kind of error */
412 syslog(LOG_ERR
, "nfscast: select: %m");
417 } /* end of select results switch */
419 for (trans
= tr_head
; trans
; trans
= trans
->tr_next
) {
420 if (FD_ISSET(trans
->tr_fd
, &readfds
))
427 t_rdata
.addr
= trans
->tr_taddr
->addr
;
428 t_rdata
.udata
.buf
= inbuf
;
429 t_rdata
.udata
.maxlen
= sizeof (inbuf
);
430 t_rdata
.udata
.len
= 0;
432 if (t_rcvudata(trans
->tr_fd
, &t_rdata
, &flag
) < 0) {
435 syslog(LOG_ERR
, "nfscast: t_rcvudata: %s:%m",
440 if (t_rdata
.udata
.len
< sizeof (ulong_t
))
444 "nfscast: t_rcvudata: %s: buffer overflow",
450 * see if reply transaction id matches sent id.
451 * If so, decode the results.
452 * Note: received addr is ignored, it could be
453 * different from the send addr if the host has
454 * more than one addr.
456 xdrmem_create(xdrs
, inbuf
, (uint_t
)t_rdata
.udata
.len
,
458 if (xdr_replymsg(xdrs
, &msg
)) {
459 if (msg
.rm_reply
.rp_stat
== MSG_ACCEPTED
&&
460 (msg
.rm_xid
& ~0xFF) == xid
) {
461 struct addrs
*curr_addr
;
/* The low byte of the reply xid is the ts_inx of the timestamp it answers. */
463 i
= msg
.rm_xid
& 0xFF;
464 for (curr_addr
= trans
->tr_addrs
; curr_addr
;
465 curr_addr
= curr_addr
->addr_next
) {
466 for (ts
= curr_addr
->addr_if_tstamps
; ts
;
468 if (ts
->ts_inx
== i
&& !ts
->ts_rcvd
) {
470 calc_resp_time(&ts
->ts_timeval
);
476 } /* otherwise, we just ignore the errors ... */
/*
 * Re-run xdr_replymsg() with the stream switched to XDR_FREE
 * (presumably to release anything the decode pass allocated).
 */
478 xdrs
->x_op
= XDR_FREE
;
479 msg
.acpted_rply
.ar_results
.proc
= xdr_void
;
480 (void) xdr_replymsg(xdrs
, &msg
);
492 *mfs_out
= sort_responses(tr_head
);
497 free_transports(tr_head
);
498 AUTH_DESTROY(sys_auth
);
503 * Go through all the responses and sort fastest to slowest.
504 * Note that any penalty is added to the response time - so the
505 * fastest response isn't necessarily the one that arrived first.
/*
 * sort_responses: walk every timestamp of every address on every transport
 * in `trans', copy (map entry, time) pairs into a growable array, sort the
 * array by host name, give each entry that names the same host as its
 * predecessor the predecessor's time, sort again by time, and pass the
 * entries in that order to add_mfs().
 *
 * NOTE(review): the return statement is not visible in this view;
 * presumably the list headed by mfs_head is returned - confirm.
 */
507 static struct mapfs
*
508 sort_responses(trans
)
509 struct transp
*trans
;
514 int i
, size
= 0, allocsize
= 10;
515 struct mapfs
*p
, *mfs_head
= NULL
, *mfs_tail
= NULL
;
518 buffer
= (struct sm
*)malloc(allocsize
* sizeof (struct sm
));
520 syslog(LOG_ERR
, "sort_responses: malloc error.\n");
524 for (t
= trans
; t
; t
= t
->tr_next
) {
525 for (a
= t
->tr_addrs
; a
; a
= a
->addr_next
) {
526 for (ti
= a
->addr_if_tstamps
;
527 ti
; ti
= ti
->ts_next
) {
/*
 * Add the penalty (in microseconds) to the response time, then carry
 * any overflow of tv_usec into tv_sec so tv_usec stays below 1000000.
 */
530 ti
->ts_timeval
.tv_usec
+=
531 (ti
->ts_penalty
* PENALTY_WEIGHT
);
532 if (ti
->ts_timeval
.tv_usec
>= 1000000) {
533 ti
->ts_timeval
.tv_sec
+=
534 (ti
->ts_timeval
.tv_usec
/ 1000000);
535 ti
->ts_timeval
.tv_usec
=
536 (ti
->ts_timeval
.tv_usec
% 1000000);
/*
 * NOTE(review): realloc()'s result is assigned straight back to `buffer';
 * if realloc() fails the original allocation can no longer be freed.
 */
539 if (size
>= allocsize
) {
541 buffer
= (struct sm
*)realloc(buffer
,
542 allocsize
* sizeof (struct sm
));
545 "sort_responses: malloc error.\n");
549 buffer
[size
].timeval
= ti
->ts_timeval
;
550 buffer
[size
].mfs
= a
->addr_mfs
;
558 trace_prt(1, " sort_responses: before host sort:\n");
559 for (i
= 0; i
< size
; i
++)
560 trace_prt(1, " %s %d.%d\n", buffer
[i
].mfs
->mfs_host
,
561 buffer
[i
].timeval
.tv_sec
, buffer
[i
].timeval
.tv_usec
);
/* First pass: group entries for the same host next to each other. */
566 qsort((void *)buffer
, size
, sizeof (struct sm
), host_sm
);
569 * Cope with multiply listed hosts by choosing first time
571 for (i
= 1; i
< size
; i
++) {
574 trace_prt(1, " sort_responses: comparing %s and %s\n",
575 buffer
[i
-1].mfs
->mfs_host
,
576 buffer
[i
].mfs
->mfs_host
);
579 if (strcmp(buffer
[i
-1].mfs
->mfs_host
,
580 buffer
[i
].mfs
->mfs_host
) == 0)
581 memcpy(&buffer
[i
].timeval
, &buffer
[i
-1].timeval
,
582 sizeof (struct timeval
));
589 trace_prt(1, " sort_responses: before time sort:\n");
590 for (i
= 0; i
< size
; i
++)
591 trace_prt(1, " %s %d.%d\n", buffer
[i
].mfs
->mfs_host
,
592 buffer
[i
].timeval
.tv_sec
, buffer
[i
].timeval
.tv_usec
);
/* Second pass: order the entries by (penalised) response time. */
597 qsort((void *)buffer
, size
, sizeof (struct sm
), time_sm
);
601 trace_prt(1, " sort_responses: after sort:\n");
602 for (i
= 0; i
< size
; i
++)
603 trace_prt(1, " %s %d.%d\n", buffer
[i
].mfs
->mfs_host
,
604 buffer
[i
].timeval
.tv_sec
, buffer
[i
].timeval
.tv_usec
);
609 for (i
= 0; i
< size
; i
++) {
612 trace_prt(1, " sort_responses: adding %s\n",
613 buffer
[i
].mfs
->mfs_host
);
616 p
= add_mfs(buffer
[i
].mfs
, 0, &mfs_head
, &mfs_tail
);
627 * Comparison routines called by qsort(3).
629 static int host_sm(const void *a
, const void *b
)
631 return (strcmp(((struct sm
*)a
)->mfs
->mfs_host
,
632 ((struct sm
*)b
)->mfs
->mfs_host
));
635 static int time_sm(const void *a
, const void *b
)
637 if (timercmp(&(((struct sm
*)a
)->timeval
),
638 &(((struct sm
*)b
)->timeval
), < /* cstyle */))
640 else if (timercmp(&(((struct sm
*)a
)->timeval
),
641 &(((struct sm
*)b
)->timeval
), > /* cstyle */))
/*
 * Given send_time which is the time a request
 * was transmitted to a server, subtract it
 * from the time "now" thereby converting it
 * to an elapsed time.
 *
 * send_time - in: time the request was sent; out: time elapsed since then.
 */
static void
calc_resp_time(struct timeval *send_time)
{
	struct timeval time_now;

	(void) gettimeofday(&time_now, NULL);

	/*
	 * Borrow one second when the microsecond field would otherwise go
	 * negative, so the stored tv_usec stays within [0, 1000000).
	 */
	if (time_now.tv_usec < send_time->tv_usec) {
		time_now.tv_sec--;
		time_now.tv_usec += 1000000;
	}

	send_time->tv_sec = time_now.tv_sec - send_time->tv_sec;
	send_time->tv_usec = time_now.tv_usec - send_time->tv_usec;
}
669 free_transports(trans
)
670 struct transp
*trans
;
672 struct transp
*t
, *tmpt
= NULL
;
673 struct addrs
*a
, *tmpa
= NULL
;
674 struct tstamps
*ts
, *tmpts
= NULL
;
676 for (t
= trans
; t
; t
= tmpt
) {
678 (void) t_free((char *)t
->tr_taddr
, T_BIND
);
680 (void) t_close(t
->tr_fd
);
681 for (a
= t
->tr_addrs
; a
; a
= tmpa
) {
682 for (ts
= a
->addr_if_tstamps
; ts
; ts
= tmpts
) {
686 (void) netdir_free((char *)a
->addr_addrs
, ND_ADDRLIST
);