4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * nfs_cast.c : broadcast to a specific group of NFS servers
25 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
29 #pragma ident "%Z%%M% %I% %E% SMI"
35 #include <sys/types.h>
37 #include <sys/resource.h>
41 #include <rpc/clnt_soc.h>
42 #include <rpc/nettype.h>
43 #include <rpc/pmap_prot.h>
44 #include <netconfig.h>
49 #include "automount.h"
51 #define PENALTY_WEIGHT 100000
54 struct tstamps
*ts_next
;
58 struct timeval ts_timeval
;
61 /* A list of addresses - all belonging to the same transport */
64 struct addrs
*addr_next
;
65 struct mapfs
*addr_mfs
;
66 struct nd_addrlist
*addr_addrs
;
67 struct tstamps
*addr_if_tstamps
;
70 /* A list of connectionless transports */
73 struct transp
*tr_next
;
76 struct t_bind
*tr_taddr
;
77 struct addrs
*tr_addrs
;
80 /* A list of map entries and their roundtrip times, for sorting */
84 struct timeval timeval
;
87 static void free_transports(struct transp
*);
88 static void calc_resp_time(struct timeval
*);
89 static struct mapfs
*sort_responses(struct transp
*);
90 static int host_sm(const void *, const void *b
);
91 static int time_sm(const void *, const void *b
);
92 extern struct mapfs
*add_mfs(struct mapfs
*, int, struct mapfs
**,
96 * This routine is designed to be able to "ping"
97 * a list of hosts and create a list of responding
98 * hosts sorted by response time.
99 * This must be done without any prior
100 * contact with the host - therefore the "ping"
101 * must be to a "well-known" address. The outstanding
102 * candidate here is the address of "rpcbind".
104 * A response to a ping is no guarantee that the host
105 * is running NFS, has a mount daemon, or exports
106 * the required filesystem. If the subsequent
107 * mount attempt fails then the host will be marked
108 * "ignore" and the host list will be re-pinged
109 * (sans the bad host). This process continues
110 * until a successful mount is achieved or until
111 * there are no hosts left to try.
/*
 * nfs_cast(mfs_in, mfs_out, timeout)
 *
 * Only what is visible in this listing is described; the listing
 * omits some original lines (braces, several declarations and
 * statements), so control flow between the visible statements is not
 * always shown.
 *
 * Setup, per entry of mfs_in and per getnetconfig() entry:
 *  - the entry is tested for NC_VISIBLE, NC_TPI_CLTS semantics and the
 *    NC_LOOPBACK protocol family (presumably to skip non-matching
 *    transports - the statement following the test is not shown);
 *  - a struct transp is allocated and zeroed, its endpoint opened with
 *    t_open() and bound with t_bind(), a T_BIND address buffer is
 *    obtained from t_alloc(), and tr_fd is added to mask;
 *  - the host's "rpcbind" address list is looked up with
 *    netdir_getbyname(); on ND_OK a struct addrs is allocated for it
 *    and struct tstamps storage is allocated, given the map entry's
 *    mfs_penalty and an index taken from if_inx.
 *
 * Request: an RPC call header for RPCBPROG / PMAPVERS / NULLPROC with
 * the credentials from sys_auth is XDR-encoded once into outbuf.  The
 * base xid is (pid ^ tv_sec ^ tv_usec) with the low 8 bits cleared;
 * before a send the first word of outbuf is overwritten with
 * htonl(xid + ts_inx).
 *
 * Loop: tsec starts at 4 and doubles while timeout > 0.  Datagrams go
 * out with t_sndudata(), then select() waits using rcv_timeout (tsec
 * seconds).  Endpoints flagged in readfds are read with t_rcvudata()
 * and decoded with xdr_replymsg(); for a MSG_ACCEPTED reply whose xid
 * matches in all but the low 8 bits, the timestamp whose ts_inx equals
 * the low 8 bits and whose ts_rcvd is clear has calc_resp_time()
 * applied to it.
 *
 * Finish: *mfs_out = sort_responses(tr_head), then
 * free_transports(tr_head) and AUTH_DESTROY(sys_auth).  stat is set to
 * RPC_SYSTEMERROR or RPC_CANTENCODEARGS on the visible failure paths;
 * the return statement itself is not shown.
 *
 * NOTE(review): only 8 bits of the xid carry ts_inx - confirm that
 * if_inx cannot exceed 255.
 * NOTE(review): the xid is stored through a ulong_t pointer and the
 * short-reply check uses sizeof (ulong_t); confirm this is intended
 * where ulong_t is wider than 32 bits.
 */
114 nfs_cast(struct mapfs
*mfs_in
, struct mapfs
**mfs_out
, int timeout
)
117 AUTH
*sys_auth
= authsys_create_default();
119 register XDR
*xdrs
= &xdr_stream
;
124 int sent
, addr_cnt
, rcvd
, if_cnt
;
125 fd_set readfds
, mask
;
126 register ulong_t xid
; /* xid - unique per addr */
129 struct timeval t
, rcv_timeout
;
130 char outbuf
[UDPMSGSIZE
], inbuf
[UDPMSGSIZE
];
131 struct t_unitdata t_udata
, t_rdata
;
132 struct nd_hostserv hs
;
133 struct nd_addrlist
*retaddrs
;
134 struct transp
*tr_head
;
135 struct transp
*trans
, *prev_trans
;
136 struct addrs
*a
, *prev_addr
;
137 struct tstamps
*ts
, *prev_ts
;
138 NCONF_HANDLE
*nc
= NULL
;
139 struct netconfig
*nconf
;
145 * For each connectionless transport get a list of
146 * host addresses. Any single host may have
147 * addresses on several transports.
149 addr_cnt
= sent
= rcvd
= 0;
154 * Set the default select size to be the maximum FD_SETSIZE, unless
155 * the current rlimit is lower.
157 dtbsize
= FD_SETSIZE
;
158 if (getrlimit(RLIMIT_NOFILE
, &rl
) == 0) {
159 if (rl
.rlim_cur
< FD_SETSIZE
)
160 dtbsize
= rl
.rlim_cur
;
166 for (mfs
= mfs_in
; mfs
; mfs
= mfs
->mfs_next
) {
169 trace_prt(1, "nfs_cast: host=%s\n", mfs
->mfs_host
);
176 while (nconf
= getnetconfig(nc
)) {
177 if (!(nconf
->nc_flag
& NC_VISIBLE
) ||
178 nconf
->nc_semantics
!= NC_TPI_CLTS
||
179 (strcmp(nconf
->nc_protofmly
, NC_LOOPBACK
) == 0))
181 trans
= (struct transp
*)malloc(sizeof (*trans
));
183 syslog(LOG_ERR
, "no memory");
187 (void) memset(trans
, 0, sizeof (*trans
));
191 prev_trans
->tr_next
= trans
;
194 trans
->tr_fd
= t_open(nconf
->nc_device
, O_RDWR
, NULL
);
195 if (trans
->tr_fd
< 0) {
196 syslog(LOG_ERR
, "nfscast: t_open: %s:%m",
201 if (t_bind(trans
->tr_fd
, NULL
, NULL
) < 0) {
202 syslog(LOG_ERR
, "nfscast: t_bind: %m");
207 /* LINTED pointer alignment */
208 (struct t_bind
*)t_alloc(trans
->tr_fd
, T_BIND
, T_ADDR
);
209 if (trans
->tr_taddr
== NULL
) {
210 syslog(LOG_ERR
, "nfscast: t_alloc: %m");
211 stat
= RPC_SYSTEMERROR
;
215 trans
->tr_device
= nconf
->nc_device
;
216 FD_SET(trans
->tr_fd
, &mask
);
219 hs
.h_host
= mfs
->mfs_host
;
220 hs
.h_serv
= "rpcbind";
221 if (netdir_getbyname(nconf
, &hs
, &retaddrs
) == ND_OK
) {
224 * If mfs->ignore is previously set for
225 * this map, clear it. Because a host can
226 * have either v6 or v4 address
228 if (mfs
->mfs_ignore
== 1)
231 a
= (struct addrs
*)malloc(sizeof (*a
));
233 syslog(LOG_ERR
, "no memory");
237 (void) memset(a
, 0, sizeof (*a
));
238 if (trans
->tr_addrs
== NULL
)
241 prev_addr
->addr_next
= a
;
243 a
->addr_if_tstamps
= NULL
;
245 a
->addr_addrs
= retaddrs
;
246 if_cnt
= retaddrs
->n_cnt
;
248 ts
= (struct tstamps
*)
249 malloc(sizeof (*ts
));
251 syslog(LOG_ERR
, "no memory");
255 (void) memset(ts
, 0, sizeof (*ts
));
256 ts
->ts_penalty
= mfs
->mfs_penalty
;
257 if (a
->addr_if_tstamps
== NULL
)
258 a
->addr_if_tstamps
= ts
;
260 prev_ts
->ts_next
= ts
;
262 ts
->ts_inx
= if_inx
++;
270 "%s:%s address not known",
272 strcmp(nconf
->nc_proto
, NC_INET
)?"IPv6":"IPv4");
280 syslog(LOG_ERR
, "nfscast: couldn't find addresses");
285 (void) gettimeofday(&t
, NULL
);
286 xid
= (getpid() ^ t
.tv_sec
^ t
.tv_usec
) & ~0xFF;
289 /* serialize the RPC header */
291 msg
.rm_direction
= CALL
;
292 msg
.rm_call
.cb_rpcvers
= RPC_MSG_VERSION
;
293 msg
.rm_call
.cb_prog
= RPCBPROG
;
295 * we can not use RPCBVERS here since it doesn't exist in 4.X,
296 * the fix to bug 1139883 has made the 4.X portmapper silent to
297 * version mismatches. This causes the RPC call to the remote
298 * portmapper to simply be ignored if it's not Version 2.
300 msg
.rm_call
.cb_vers
= PMAPVERS
;
301 msg
.rm_call
.cb_proc
= NULLPROC
;
302 if (sys_auth
== (AUTH
*)NULL
) {
303 stat
= RPC_SYSTEMERROR
;
306 msg
.rm_call
.cb_cred
= sys_auth
->ah_cred
;
307 msg
.rm_call
.cb_verf
= sys_auth
->ah_verf
;
308 xdrmem_create(xdrs
, outbuf
, sizeof (outbuf
), XDR_ENCODE
);
309 if (! xdr_callmsg(xdrs
, &msg
)) {
310 stat
= RPC_CANTENCODEARGS
;
313 outlen
= (int)xdr_getpos(xdrs
);
317 t_udata
.udata
.buf
= outbuf
;
318 t_udata
.udata
.len
= outlen
;
321 * Basic loop: send packet to all hosts and wait for response(s).
322 * The response timeout grows larger per iteration.
323 * A unique xid is assigned to each address in order to
324 * correctly match the replies.
326 for (tsec
= 4; timeout
> 0; tsec
*= 2) {
332 rcv_timeout
.tv_sec
= tsec
;
333 rcv_timeout
.tv_usec
= 0;
336 for (trans
= tr_head
; trans
; trans
= trans
->tr_next
) {
337 for (a
= trans
->tr_addrs
; a
; a
= a
->addr_next
) {
338 struct netbuf
*if_netbuf
=
339 a
->addr_addrs
->n_addrs
;
340 ts
= a
->addr_if_tstamps
;
341 if_cnt
= a
->addr_addrs
->n_cnt
;
345 * xid is the first thing in
346 * preserialized buffer
348 /* LINTED pointer alignment */
349 *((ulong_t
*)outbuf
) =
350 htonl(xid
+ ts
->ts_inx
);
351 (void) gettimeofday(&(ts
->ts_timeval
),
354 * Check if already received
355 * from a previous iteration.
363 t_udata
.addr
= *if_netbuf
++;
365 if (t_sndudata(trans
->tr_fd
,
374 if (sent
== 0) { /* no packets sent ? */
380 * Have sent all the packets. Now collect the responses...
384 msg
.acpted_rply
.ar_verf
= _null_auth
;
385 msg
.acpted_rply
.ar_results
.proc
= xdr_void
;
388 switch (select(dtbsize
, &readfds
,
389 (fd_set
*)NULL
, (fd_set
*)NULL
, &rcv_timeout
)) {
391 case 0: /* Timed out */
393 * If we got at least one response in the
394 * last interval, then don't wait for any
395 * more. In theory we should wait for
396 * the max weighting (penalty) value so
397 * that a very slow server has a chance to
398 * respond but this could take a long time
399 * if the admin has set a high weighting
408 case -1: /* some kind of error */
411 syslog(LOG_ERR
, "nfscast: select: %m");
416 } /* end of select results switch */
418 for (trans
= tr_head
; trans
; trans
= trans
->tr_next
) {
419 if (FD_ISSET(trans
->tr_fd
, &readfds
))
426 t_rdata
.addr
= trans
->tr_taddr
->addr
;
427 t_rdata
.udata
.buf
= inbuf
;
428 t_rdata
.udata
.maxlen
= sizeof (inbuf
);
429 t_rdata
.udata
.len
= 0;
431 if (t_rcvudata(trans
->tr_fd
, &t_rdata
, &flag
) < 0) {
434 syslog(LOG_ERR
, "nfscast: t_rcvudata: %s:%m",
439 if (t_rdata
.udata
.len
< sizeof (ulong_t
))
443 "nfscast: t_rcvudata: %s: buffer overflow",
449 * see if reply transaction id matches sent id.
450 * If so, decode the results.
451 * Note: received addr is ignored, it could be
452 * different from the send addr if the host has
453 * more than one addr.
455 xdrmem_create(xdrs
, inbuf
, (uint_t
)t_rdata
.udata
.len
,
457 if (xdr_replymsg(xdrs
, &msg
)) {
458 if (msg
.rm_reply
.rp_stat
== MSG_ACCEPTED
&&
459 (msg
.rm_xid
& ~0xFF) == xid
) {
460 struct addrs
*curr_addr
;
462 i
= msg
.rm_xid
& 0xFF;
463 for (curr_addr
= trans
->tr_addrs
; curr_addr
;
464 curr_addr
= curr_addr
->addr_next
) {
465 for (ts
= curr_addr
->addr_if_tstamps
; ts
;
467 if (ts
->ts_inx
== i
&& !ts
->ts_rcvd
) {
469 calc_resp_time(&ts
->ts_timeval
);
475 } /* otherwise, we just ignore the errors ... */
477 xdrs
->x_op
= XDR_FREE
;
478 msg
.acpted_rply
.ar_results
.proc
= xdr_void
;
479 (void) xdr_replymsg(xdrs
, &msg
);
491 *mfs_out
= sort_responses(tr_head
);
496 free_transports(tr_head
);
497 AUTH_DESTROY(sys_auth
);
502 * Go through all the responses and sort fastest to slowest.
503 * Note that any penalty is added to the response time - so the
504 * fastest response isn't necessarily the one that arrived first.
/*
 * sort_responses(trans): build the list of map entries ordered by
 * penalised response time.
 *
 * Only the statements visible in this listing are described; the
 * listing omits some original lines (braces, a few declarations and
 * statements).
 *
 *  - A work array of struct sm starts at allocsize (10) entries and
 *    is grown with reallocarray() once size reaches allocsize.
 *  - Inside the nested transport / address / timestamp loops,
 *    ts_penalty * PENALTY_WEIGHT microseconds are added to ts_timeval,
 *    any excess over one second is carried into tv_sec, and the
 *    timeval plus the address's addr_mfs are stored in buffer[size].
 *  - The array is sorted by host name (host_sm); where two adjacent
 *    entries name the same host the later one takes the earlier one's
 *    timeval.
 *  - The array is sorted again by time (time_sm) and each entry is
 *    passed to add_mfs() together with &mfs_head and &mfs_tail.
 *
 * NOTE(review): the reallocarray() result is stored straight back into
 * buffer, so the old array would be unreachable on failure - confirm
 * the error path.
 * NOTE(review): trace_prt() prints tv_sec/tv_usec with "%d"; confirm
 * those fields are int-sized on the target.
 */
506 static struct mapfs
*
507 sort_responses(trans
)
508 struct transp
*trans
;
513 int i
, size
= 0, allocsize
= 10;
514 struct mapfs
*p
, *mfs_head
= NULL
, *mfs_tail
= NULL
;
517 buffer
= (struct sm
*)malloc(allocsize
* sizeof (struct sm
));
519 syslog(LOG_ERR
, "sort_responses: malloc error.\n");
523 for (t
= trans
; t
; t
= t
->tr_next
) {
524 for (a
= t
->tr_addrs
; a
; a
= a
->addr_next
) {
525 for (ti
= a
->addr_if_tstamps
;
526 ti
; ti
= ti
->ts_next
) {
529 ti
->ts_timeval
.tv_usec
+=
530 (ti
->ts_penalty
* PENALTY_WEIGHT
);
531 if (ti
->ts_timeval
.tv_usec
>= 1000000) {
532 ti
->ts_timeval
.tv_sec
+=
533 (ti
->ts_timeval
.tv_usec
/ 1000000);
534 ti
->ts_timeval
.tv_usec
=
535 (ti
->ts_timeval
.tv_usec
% 1000000);
538 if (size
>= allocsize
) {
540 buffer
= reallocarray(buffer
, allocsize
,
544 "sort_responses: malloc error.\n");
548 buffer
[size
].timeval
= ti
->ts_timeval
;
549 buffer
[size
].mfs
= a
->addr_mfs
;
557 trace_prt(1, " sort_responses: before host sort:\n");
558 for (i
= 0; i
< size
; i
++)
559 trace_prt(1, " %s %d.%d\n", buffer
[i
].mfs
->mfs_host
,
560 buffer
[i
].timeval
.tv_sec
, buffer
[i
].timeval
.tv_usec
);
565 qsort((void *)buffer
, size
, sizeof (struct sm
), host_sm
);
568 * Cope with multiply listed hosts by choosing first time
570 for (i
= 1; i
< size
; i
++) {
573 trace_prt(1, " sort_responses: comparing %s and %s\n",
574 buffer
[i
-1].mfs
->mfs_host
,
575 buffer
[i
].mfs
->mfs_host
);
578 if (strcmp(buffer
[i
-1].mfs
->mfs_host
,
579 buffer
[i
].mfs
->mfs_host
) == 0)
580 memcpy(&buffer
[i
].timeval
, &buffer
[i
-1].timeval
,
581 sizeof (struct timeval
));
588 trace_prt(1, " sort_responses: before time sort:\n");
589 for (i
= 0; i
< size
; i
++)
590 trace_prt(1, " %s %d.%d\n", buffer
[i
].mfs
->mfs_host
,
591 buffer
[i
].timeval
.tv_sec
, buffer
[i
].timeval
.tv_usec
);
596 qsort((void *)buffer
, size
, sizeof (struct sm
), time_sm
);
600 trace_prt(1, " sort_responses: after sort:\n");
601 for (i
= 0; i
< size
; i
++)
602 trace_prt(1, " %s %d.%d\n", buffer
[i
].mfs
->mfs_host
,
603 buffer
[i
].timeval
.tv_sec
, buffer
[i
].timeval
.tv_usec
);
608 for (i
= 0; i
< size
; i
++) {
611 trace_prt(1, " sort_responses: adding %s\n",
612 buffer
[i
].mfs
->mfs_host
);
615 p
= add_mfs(buffer
[i
].mfs
, 0, &mfs_head
, &mfs_tail
);
626 * Comparison routines called by qsort(3).
628 static int host_sm(const void *a
, const void *b
)
630 return (strcmp(((struct sm
*)a
)->mfs
->mfs_host
,
631 ((struct sm
*)b
)->mfs
->mfs_host
));
634 static int time_sm(const void *a
, const void *b
)
636 if (timercmp(&(((struct sm
*)a
)->timeval
),
637 &(((struct sm
*)b
)->timeval
), < /* cstyle */))
639 else if (timercmp(&(((struct sm
*)a
)->timeval
),
640 &(((struct sm
*)b
)->timeval
), > /* cstyle */))
647 * Given send_time which is the time a request
648 * was transmitted to a server, subtract it
649 * from the time "now" thereby converting it
650 * to an elapsed time.
/*
 * Convert *send_time, the time at which a request was transmitted,
 * into the time elapsed between that moment and now.  The result
 * replaces the value in *send_time.
 *
 * send_time - in: transmission time; out: elapsed time, with tv_usec
 *             kept in the range 0..999999 for a send time that is not
 *             in the future.
 */
static void
calc_resp_time(struct timeval *send_time)
{
	struct timeval time_now;

	(void) gettimeofday(&time_now, NULL);
	if (time_now.tv_usec < send_time->tv_usec) {
		/*
		 * Borrow one second: the microsecond part gains a full
		 * second, so the seconds part has to give one up or the
		 * elapsed time comes out a second too long.
		 */
		time_now.tv_sec--;
		time_now.tv_usec += 1000000;
	}
	send_time->tv_sec = time_now.tv_sec - send_time->tv_sec;
	send_time->tv_usec = time_now.tv_usec - send_time->tv_usec;
}
668 free_transports(trans
)
669 struct transp
*trans
;
671 struct transp
*t
, *tmpt
= NULL
;
672 struct addrs
*a
, *tmpa
= NULL
;
673 struct tstamps
*ts
, *tmpts
= NULL
;
675 for (t
= trans
; t
; t
= tmpt
) {
677 (void) t_free((char *)t
->tr_taddr
, T_BIND
);
679 (void) t_close(t
->tr_fd
);
680 for (a
= t
->tr_addrs
; a
; a
= tmpa
) {
681 for (ts
= a
->addr_if_tstamps
; ts
; ts
= tmpts
) {
685 (void) netdir_free((char *)a
->addr_addrs
, ND_ADDRLIST
);