dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / usr / src / cmd / fs.d / autofs / nfs_cast.c
blob653b91c771ad69448c8c77e0358e76cc3840a03b
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
20 * CDDL HEADER END
23 * nfs_cast.c : broadcast to a specific group of NFS servers
25 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
29 #pragma ident "%Z%%M% %I% %E% SMI"
31 #include <stdio.h>
32 #include <syslog.h>
33 #include <errno.h>
34 #include <string.h>
35 #include <sys/types.h>
36 #include <sys/time.h>
37 #include <sys/resource.h>
38 #include <unistd.h>
39 #include <stdlib.h>
40 #include <rpc/rpc.h>
41 #include <rpc/clnt_soc.h>
42 #include <rpc/nettype.h>
43 #include <rpc/pmap_prot.h>
44 #include <netconfig.h>
45 #include <netdir.h>
46 #include <nfs/nfs.h>
47 #define NFSCLIENT
48 #include <locale.h>
49 #include "automount.h"
/*
 * Microseconds added to a server's measured response time for each unit
 * of its map penalty (ts_penalty) before the responses are sorted; one
 * penalty point is therefore worth a tenth of a second.
 */
#define	PENALTY_WEIGHT	100000
/*
 * Per-interface bookkeeping for one address of one host.
 */
struct tstamps {
	struct tstamps	*ts_next;	/* next interface of the same host */
	int		ts_penalty;	/* copied from the map entry's mfs_penalty */
	int		ts_inx;		/* interface index; added to the base xid */
	int		ts_rcvd;	/* set to 1 once a matching reply arrived */
	/*
	 * Time the request was sent; calc_resp_time() turns it into the
	 * elapsed time when the reply is matched.
	 */
	struct timeval	ts_timeval;
};
/* A list of addresses - all belonging to the same transport */

struct addrs {
	struct addrs		*addr_next;	/* next entry on this transport */
	struct mapfs		*addr_mfs;	/* map entry the addresses belong to */
	struct nd_addrlist	*addr_addrs;	/* result of netdir_getbyname() */
	struct tstamps		*addr_if_tstamps; /* one tstamps per address */
};
/* A list of connectionless transports */

struct transp {
	struct transp	*tr_next;	/* next transport endpoint */
	int		tr_fd;		/* descriptor returned by t_open() */
	char		*tr_device;	/* nc_device of the netconfig entry */
	struct t_bind	*tr_taddr;	/* t_alloc()ed buffer for reply address */
	struct addrs	*tr_addrs;	/* addresses probed through tr_fd */
};
/* A list of map entries and their roundtrip times, for sorting */

struct sm {
	struct mapfs	*mfs;		/* map entry that answered */
	struct timeval	timeval;	/* its penalty-adjusted response time */
};
/* Helpers private to this file. */
static void free_transports(struct transp *trans);
static void calc_resp_time(struct timeval *send_time);
static struct mapfs *sort_responses(struct transp *trans);
static int host_sm(const void *a, const void *b);
static int time_sm(const void *a, const void *b);

/* Defined elsewhere in automountd. */
extern struct mapfs *add_mfs(struct mapfs *, int, struct mapfs **,
    struct mapfs **);
96 * This routine is designed to be able to "ping"
97 * a list of hosts and create a list of responding
98 * hosts sorted by response time.
99 * This must be done without any prior
100 * contact with the host - therefore the "ping"
101 * must be to a "well-known" address. The outstanding
102 * candidate here is the address of "rpcbind".
104 * A response to a ping is no guarantee that the host
105 * is running NFS, has a mount daemon, or exports
106 * the required filesystem. If the subsequent
107 * mount attempt fails then the host will be marked
108 * "ignore" and the host list will be re-pinged
109 * (sans the bad host). This process continues
110 * until a successful mount is achieved or until
111 * there are no hosts left to try.
113 enum clnt_stat
114 nfs_cast(struct mapfs *mfs_in, struct mapfs **mfs_out, int timeout)
116 enum clnt_stat stat;
117 AUTH *sys_auth = authsys_create_default();
118 XDR xdr_stream;
119 register XDR *xdrs = &xdr_stream;
120 int outlen;
121 int if_inx;
122 int tsec;
123 int flag;
124 int sent, addr_cnt, rcvd, if_cnt;
125 fd_set readfds, mask;
126 register ulong_t xid; /* xid - unique per addr */
127 register int i;
128 struct rpc_msg msg;
129 struct timeval t, rcv_timeout;
130 char outbuf[UDPMSGSIZE], inbuf[UDPMSGSIZE];
131 struct t_unitdata t_udata, t_rdata;
132 struct nd_hostserv hs;
133 struct nd_addrlist *retaddrs;
134 struct transp *tr_head;
135 struct transp *trans, *prev_trans;
136 struct addrs *a, *prev_addr;
137 struct tstamps *ts, *prev_ts;
138 NCONF_HANDLE *nc = NULL;
139 struct netconfig *nconf;
140 struct rlimit rl;
141 int dtbsize;
142 struct mapfs *mfs;
145 * For each connectionless transport get a list of
146 * host addresses. Any single host may have
147 * addresses on several transports.
149 addr_cnt = sent = rcvd = 0;
150 tr_head = NULL;
151 FD_ZERO(&mask);
154 * Set the default select size to be the maximum FD_SETSIZE, unless
155 * the current rlimit is lower.
157 dtbsize = FD_SETSIZE;
158 if (getrlimit(RLIMIT_NOFILE, &rl) == 0) {
159 if (rl.rlim_cur < FD_SETSIZE)
160 dtbsize = rl.rlim_cur;
163 prev_trans = NULL;
164 prev_addr = NULL;
165 prev_ts = NULL;
166 for (mfs = mfs_in; mfs; mfs = mfs->mfs_next) {
168 if (trace > 2)
169 trace_prt(1, "nfs_cast: host=%s\n", mfs->mfs_host);
171 nc = setnetconfig();
172 if (nc == NULL) {
173 stat = RPC_CANTSEND;
174 goto done_broad;
176 while (nconf = getnetconfig(nc)) {
177 if (!(nconf->nc_flag & NC_VISIBLE) ||
178 nconf->nc_semantics != NC_TPI_CLTS ||
179 (strcmp(nconf->nc_protofmly, NC_LOOPBACK) == 0))
180 continue;
181 trans = (struct transp *)malloc(sizeof (*trans));
182 if (trans == NULL) {
183 syslog(LOG_ERR, "no memory");
184 stat = RPC_CANTSEND;
185 goto done_broad;
187 (void) memset(trans, 0, sizeof (*trans));
188 if (tr_head == NULL)
189 tr_head = trans;
190 else
191 prev_trans->tr_next = trans;
192 prev_trans = trans;
194 trans->tr_fd = t_open(nconf->nc_device, O_RDWR, NULL);
195 if (trans->tr_fd < 0) {
196 syslog(LOG_ERR, "nfscast: t_open: %s:%m",
197 nconf->nc_device);
198 stat = RPC_CANTSEND;
199 goto done_broad;
201 if (t_bind(trans->tr_fd, NULL, NULL) < 0) {
202 syslog(LOG_ERR, "nfscast: t_bind: %m");
203 stat = RPC_CANTSEND;
204 goto done_broad;
206 trans->tr_taddr =
207 /* LINTED pointer alignment */
208 (struct t_bind *)t_alloc(trans->tr_fd, T_BIND, T_ADDR);
209 if (trans->tr_taddr == NULL) {
210 syslog(LOG_ERR, "nfscast: t_alloc: %m");
211 stat = RPC_SYSTEMERROR;
212 goto done_broad;
215 trans->tr_device = nconf->nc_device;
216 FD_SET(trans->tr_fd, &mask);
218 if_inx = 0;
219 hs.h_host = mfs->mfs_host;
220 hs.h_serv = "rpcbind";
221 if (netdir_getbyname(nconf, &hs, &retaddrs) == ND_OK) {
224 * If mfs->ignore is previously set for
225 * this map, clear it. Because a host can
226 * have either v6 or v4 address
228 if (mfs->mfs_ignore == 1)
229 mfs->mfs_ignore = 0;
231 a = (struct addrs *)malloc(sizeof (*a));
232 if (a == NULL) {
233 syslog(LOG_ERR, "no memory");
234 stat = RPC_CANTSEND;
235 goto done_broad;
237 (void) memset(a, 0, sizeof (*a));
238 if (trans->tr_addrs == NULL)
239 trans->tr_addrs = a;
240 else
241 prev_addr->addr_next = a;
242 prev_addr = a;
243 a->addr_if_tstamps = NULL;
244 a->addr_mfs = mfs;
245 a->addr_addrs = retaddrs;
246 if_cnt = retaddrs->n_cnt;
247 while (if_cnt--) {
248 ts = (struct tstamps *)
249 malloc(sizeof (*ts));
250 if (ts == NULL) {
251 syslog(LOG_ERR, "no memory");
252 stat = RPC_CANTSEND;
253 goto done_broad;
255 (void) memset(ts, 0, sizeof (*ts));
256 ts->ts_penalty = mfs->mfs_penalty;
257 if (a->addr_if_tstamps == NULL)
258 a->addr_if_tstamps = ts;
259 else
260 prev_ts->ts_next = ts;
261 prev_ts = ts;
262 ts->ts_inx = if_inx++;
263 addr_cnt++;
265 break;
266 } else {
267 mfs->mfs_ignore = 1;
268 if (verbose)
269 syslog(LOG_ERR,
270 "%s:%s address not known",
271 mfs->mfs_host,
272 strcmp(nconf->nc_proto, NC_INET)?"IPv6":"IPv4");
274 } /* while */
276 endnetconfig(nc);
277 nc = NULL;
278 } /* for */
279 if (addr_cnt == 0) {
280 syslog(LOG_ERR, "nfscast: couldn't find addresses");
281 stat = RPC_CANTSEND;
282 goto done_broad;
285 (void) gettimeofday(&t, NULL);
286 xid = (getpid() ^ t.tv_sec ^ t.tv_usec) & ~0xFF;
287 t.tv_usec = 0;
289 /* serialize the RPC header */
291 msg.rm_direction = CALL;
292 msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
293 msg.rm_call.cb_prog = RPCBPROG;
295 * we can not use RPCBVERS here since it doesn't exist in 4.X,
296 * the fix to bug 1139883 has made the 4.X portmapper silent to
297 * version mismatches. This causes the RPC call to the remote
298 * portmapper to simply be ignored if it's not Version 2.
300 msg.rm_call.cb_vers = PMAPVERS;
301 msg.rm_call.cb_proc = NULLPROC;
302 if (sys_auth == (AUTH *)NULL) {
303 stat = RPC_SYSTEMERROR;
304 goto done_broad;
306 msg.rm_call.cb_cred = sys_auth->ah_cred;
307 msg.rm_call.cb_verf = sys_auth->ah_verf;
308 xdrmem_create(xdrs, outbuf, sizeof (outbuf), XDR_ENCODE);
309 if (! xdr_callmsg(xdrs, &msg)) {
310 stat = RPC_CANTENCODEARGS;
311 goto done_broad;
313 outlen = (int)xdr_getpos(xdrs);
314 xdr_destroy(xdrs);
316 t_udata.opt.len = 0;
317 t_udata.udata.buf = outbuf;
318 t_udata.udata.len = outlen;
321 * Basic loop: send packet to all hosts and wait for response(s).
322 * The response timeout grows larger per iteration.
323 * A unique xid is assigned to each address in order to
324 * correctly match the replies.
326 for (tsec = 4; timeout > 0; tsec *= 2) {
328 timeout -= tsec;
329 if (timeout <= 0)
330 tsec += timeout;
332 rcv_timeout.tv_sec = tsec;
333 rcv_timeout.tv_usec = 0;
335 sent = 0;
336 for (trans = tr_head; trans; trans = trans->tr_next) {
337 for (a = trans->tr_addrs; a; a = a->addr_next) {
338 struct netbuf *if_netbuf =
339 a->addr_addrs->n_addrs;
340 ts = a->addr_if_tstamps;
341 if_cnt = a->addr_addrs->n_cnt;
342 while (if_cnt--) {
345 * xid is the first thing in
346 * preserialized buffer
348 /* LINTED pointer alignment */
349 *((ulong_t *)outbuf) =
350 htonl(xid + ts->ts_inx);
351 (void) gettimeofday(&(ts->ts_timeval),
352 NULL);
354 * Check if already received
355 * from a previous iteration.
357 if (ts->ts_rcvd) {
358 sent++;
359 ts = ts->ts_next;
360 continue;
363 t_udata.addr = *if_netbuf++;
365 if (t_sndudata(trans->tr_fd,
366 &t_udata) == 0) {
367 sent++;
370 ts = ts->ts_next;
374 if (sent == 0) { /* no packets sent ? */
375 stat = RPC_CANTSEND;
376 goto done_broad;
380 * Have sent all the packets. Now collect the responses...
382 rcvd = 0;
383 recv_again:
384 msg.acpted_rply.ar_verf = _null_auth;
385 msg.acpted_rply.ar_results.proc = xdr_void;
386 readfds = mask;
388 switch (select(dtbsize, &readfds,
389 (fd_set *)NULL, (fd_set *)NULL, &rcv_timeout)) {
391 case 0: /* Timed out */
393 * If we got at least one response in the
394 * last interval, then don't wait for any
395 * more. In theory we should wait for
396 * the max weighting (penalty) value so
397 * that a very slow server has a chance to
398 * respond but this could take a long time
399 * if the admin has set a high weighting
400 * value.
402 if (rcvd > 0)
403 goto done_broad;
405 stat = RPC_TIMEDOUT;
406 continue;
408 case -1: /* some kind of error */
409 if (errno == EINTR)
410 goto recv_again;
411 syslog(LOG_ERR, "nfscast: select: %m");
412 if (rcvd == 0)
413 stat = RPC_CANTRECV;
414 goto done_broad;
416 } /* end of select results switch */
418 for (trans = tr_head; trans; trans = trans->tr_next) {
419 if (FD_ISSET(trans->tr_fd, &readfds))
420 break;
422 if (trans == NULL)
423 goto recv_again;
425 try_again:
426 t_rdata.addr = trans->tr_taddr->addr;
427 t_rdata.udata.buf = inbuf;
428 t_rdata.udata.maxlen = sizeof (inbuf);
429 t_rdata.udata.len = 0;
430 t_rdata.opt.len = 0;
431 if (t_rcvudata(trans->tr_fd, &t_rdata, &flag) < 0) {
432 if (errno == EINTR)
433 goto try_again;
434 syslog(LOG_ERR, "nfscast: t_rcvudata: %s:%m",
435 trans->tr_device);
436 stat = RPC_CANTRECV;
437 continue;
439 if (t_rdata.udata.len < sizeof (ulong_t))
440 goto recv_again;
441 if (flag & T_MORE) {
442 syslog(LOG_ERR,
443 "nfscast: t_rcvudata: %s: buffer overflow",
444 trans->tr_device);
445 goto recv_again;
449 * see if reply transaction id matches sent id.
450 * If so, decode the results.
451 * Note: received addr is ignored, it could be
452 * different from the send addr if the host has
453 * more than one addr.
455 xdrmem_create(xdrs, inbuf, (uint_t)t_rdata.udata.len,
456 XDR_DECODE);
457 if (xdr_replymsg(xdrs, &msg)) {
458 if (msg.rm_reply.rp_stat == MSG_ACCEPTED &&
459 (msg.rm_xid & ~0xFF) == xid) {
460 struct addrs *curr_addr;
462 i = msg.rm_xid & 0xFF;
463 for (curr_addr = trans->tr_addrs; curr_addr;
464 curr_addr = curr_addr->addr_next) {
465 for (ts = curr_addr->addr_if_tstamps; ts;
466 ts = ts->ts_next)
467 if (ts->ts_inx == i && !ts->ts_rcvd) {
468 ts->ts_rcvd = 1;
469 calc_resp_time(&ts->ts_timeval);
470 stat = RPC_SUCCESS;
471 rcvd++;
472 break;
475 } /* otherwise, we just ignore the errors ... */
477 xdrs->x_op = XDR_FREE;
478 msg.acpted_rply.ar_results.proc = xdr_void;
479 (void) xdr_replymsg(xdrs, &msg);
480 XDR_DESTROY(xdrs);
481 if (rcvd == sent)
482 goto done_broad;
483 else
484 goto recv_again;
486 if (!rcvd)
487 stat = RPC_TIMEDOUT;
489 done_broad:
490 if (rcvd) {
491 *mfs_out = sort_responses(tr_head);
492 stat = RPC_SUCCESS;
494 if (nc)
495 endnetconfig(nc);
496 free_transports(tr_head);
497 AUTH_DESTROY(sys_auth);
498 return (stat);
502 * Go through all the responses and sort fastest to slowest.
503 * Note that any penalty is added to the response time - so the
504 * fastest response isn't necessarily the one that arrived first.
506 static struct mapfs *
507 sort_responses(trans)
508 struct transp *trans;
510 struct transp *t;
511 struct addrs *a;
512 struct tstamps *ti;
513 int i, size = 0, allocsize = 10;
514 struct mapfs *p, *mfs_head = NULL, *mfs_tail = NULL;
515 struct sm *buffer;
517 buffer = (struct sm *)malloc(allocsize * sizeof (struct sm));
518 if (!buffer) {
519 syslog(LOG_ERR, "sort_responses: malloc error.\n");
520 return (NULL);
523 for (t = trans; t; t = t->tr_next) {
524 for (a = t->tr_addrs; a; a = a->addr_next) {
525 for (ti = a->addr_if_tstamps;
526 ti; ti = ti->ts_next) {
527 if (!ti->ts_rcvd)
528 continue;
529 ti->ts_timeval.tv_usec +=
530 (ti->ts_penalty * PENALTY_WEIGHT);
531 if (ti->ts_timeval.tv_usec >= 1000000) {
532 ti->ts_timeval.tv_sec +=
533 (ti->ts_timeval.tv_usec / 1000000);
534 ti->ts_timeval.tv_usec =
535 (ti->ts_timeval.tv_usec % 1000000);
538 if (size >= allocsize) {
539 allocsize += 10;
540 buffer = reallocarray(buffer, allocsize,
541 sizeof (struct sm));
542 if (!buffer) {
543 syslog(LOG_ERR,
544 "sort_responses: malloc error.\n");
545 return (NULL);
548 buffer[size].timeval = ti->ts_timeval;
549 buffer[size].mfs = a->addr_mfs;
550 size++;
555 #ifdef DEBUG
556 if (trace > 3) {
557 trace_prt(1, " sort_responses: before host sort:\n");
558 for (i = 0; i < size; i++)
559 trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host,
560 buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
561 trace_prt(0, "\n");
563 #endif
565 qsort((void *)buffer, size, sizeof (struct sm), host_sm);
568 * Cope with multiply listed hosts by choosing first time
570 for (i = 1; i < size; i++) {
571 #ifdef DEBUG
572 if (trace > 3) {
573 trace_prt(1, " sort_responses: comparing %s and %s\n",
574 buffer[i-1].mfs->mfs_host,
575 buffer[i].mfs->mfs_host);
577 #endif
578 if (strcmp(buffer[i-1].mfs->mfs_host,
579 buffer[i].mfs->mfs_host) == 0)
580 memcpy(&buffer[i].timeval, &buffer[i-1].timeval,
581 sizeof (struct timeval));
583 if (trace > 3)
584 trace_prt(0, "\n");
586 #ifdef DEBUG
587 if (trace > 3) {
588 trace_prt(1, " sort_responses: before time sort:\n");
589 for (i = 0; i < size; i++)
590 trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host,
591 buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
592 trace_prt(0, "\n");
594 #endif
596 qsort((void *)buffer, size, sizeof (struct sm), time_sm);
598 #ifdef DEBUG
599 if (trace > 3) {
600 trace_prt(1, " sort_responses: after sort:\n");
601 for (i = 0; i < size; i++)
602 trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host,
603 buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
604 trace_prt(0, "\n");
606 #endif
608 for (i = 0; i < size; i++) {
609 #ifdef DEBUG
610 if (trace > 3) {
611 trace_prt(1, " sort_responses: adding %s\n",
612 buffer[i].mfs->mfs_host);
614 #endif
615 p = add_mfs(buffer[i].mfs, 0, &mfs_head, &mfs_tail);
616 if (!p)
617 return (NULL);
619 free(buffer);
621 return (mfs_head);
626 * Comparison routines called by qsort(3).
628 static int host_sm(const void *a, const void *b)
630 return (strcmp(((struct sm *)a)->mfs->mfs_host,
631 ((struct sm *)b)->mfs->mfs_host));
634 static int time_sm(const void *a, const void *b)
636 if (timercmp(&(((struct sm *)a)->timeval),
637 &(((struct sm *)b)->timeval), < /* cstyle */))
638 return (-1);
639 else if (timercmp(&(((struct sm *)a)->timeval),
640 &(((struct sm *)b)->timeval), > /* cstyle */))
641 return (1);
642 else
643 return (0);
/*
 * Given send_time which is the time a request
 * was transmitted to a server, subtract it
 * from the time "now" thereby converting it
 * to an elapsed time.
 *
 * send_time is overwritten with the result.  gettimeofday() reports wall
 * clock time, which can be stepped backwards; an elapsed time that would
 * come out negative is reported as zero.
 */
static void
calc_resp_time(struct timeval *send_time)
{
	struct timeval now;

	(void) gettimeofday(&now, NULL);

	/* Borrow one second so the microsecond difference stays >= 0. */
	if (now.tv_usec < send_time->tv_usec) {
		now.tv_sec--;
		now.tv_usec += 1000000;
	}
	send_time->tv_sec = now.tv_sec - send_time->tv_sec;
	send_time->tv_usec = now.tv_usec - send_time->tv_usec;

	if (send_time->tv_sec < 0) {
		send_time->tv_sec = 0;
		send_time->tv_usec = 0;
	}
}
667 static void
668 free_transports(trans)
669 struct transp *trans;
671 struct transp *t, *tmpt = NULL;
672 struct addrs *a, *tmpa = NULL;
673 struct tstamps *ts, *tmpts = NULL;
675 for (t = trans; t; t = tmpt) {
676 if (t->tr_taddr)
677 (void) t_free((char *)t->tr_taddr, T_BIND);
678 if (t->tr_fd > 0)
679 (void) t_close(t->tr_fd);
680 for (a = t->tr_addrs; a; a = tmpa) {
681 for (ts = a->addr_if_tstamps; ts; ts = tmpts) {
682 tmpts = ts->ts_next;
683 free(ts);
685 (void) netdir_free((char *)a->addr_addrs, ND_ADDRLIST);
686 tmpa = a->addr_next;
687 free(a);
689 tmpt = t->tr_next;
690 free(t);