all: check return value from daemon() call
[jleu-quagga.git] / watchquagga / watchquagga.c
blobfb628acca31640a1df139fe8d8aff9e6bd2778f4
1 /*
2 $Id$
4 Monitor status of quagga daemons and restart if necessary.
6 Copyright (C) 2004 Andrew J. Schorr
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 #include <zebra.h>
24 #include <thread.h>
25 #include <log.h>
26 #include <network.h>
27 #include <sigevent.h>
28 #include <lib/version.h>
29 #include <getopt.h>
30 #include <sys/un.h>
31 #include <sys/wait.h>
/* Smaller of two values.  Each argument may be evaluated more than once. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Timer randomization helpers.  For non-negative X, JITTER(X) yields a
   value between -(X)/2 and (X)-(X)/2; FUZZY(X) offsets X by the jitter of
   one twentieth of X. */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))

/* Built-in defaults for the tunables; the time values are in seconds. */
#define DEFAULT_PERIOD		5
#define DEFAULT_TIMEOUT		10
#define DEFAULT_RESTART_TIMEOUT	20
#define DEFAULT_LOGLEVEL	LOG_INFO
#define DEFAULT_MIN_RESTART	60
#define DEFAULT_MAX_RESTART	600

/* Pid file location: the build-supplied path wins when there is one. */
#ifdef PATH_WATCHQUAGGA_PID
#define DEFAULT_PIDFILE	PATH_WATCHQUAGGA_PID
#else
#define DEFAULT_PIDFILE	STATEDIR "/watchquagga.pid"
#endif

/* Directory searched for the daemons' "<name>.vty" sockets. */
#ifdef DAEMON_VTY_DIR
#define VTYDIR	DAEMON_VTY_DIR
#else
#define VTYDIR	STATEDIR
#endif

/* Token sent with the echo command and expected back in the reply. */
#define PING_TOKEN	"PING"
/* Thread scheduler handle passed to every thread_add_*() call in this
   file.  It has to stay a global symbol because code inside libzebra
   refers to it by name. */
struct thread_master *master;
/* Operating mode of the watchdog.  mode_str[] lists a description for
   each value in the same order. */
typedef enum
{
  MODE_MONITOR              = 0,
  MODE_GLOBAL_RESTART       = 1,
  MODE_SEPARATE_RESTART     = 2,
  MODE_PHASED_ZEBRA_RESTART = 3,
  MODE_PHASED_ALL_RESTART   = 4
} watch_mode_t;
/* Human-readable mode descriptions, printed by usage(). */
static const char *mode_str[] = {
  "monitor",				/* mode 0 */
  "global restart",			/* mode 1 */
  "individual daemon restart",		/* mode 2 */
  "phased zebra restart",		/* mode 3 */
  "phased global restart for any failure",	/* mode 4 */
};
/* Steps of a phased restart; the values are used to index phase_str[]. */
typedef enum
{
  PHASE_NONE                  = 0,
  PHASE_STOPS_PENDING         = 1,
  PHASE_WAITING_DOWN          = 2,
  PHASE_ZEBRA_RESTART_PENDING = 3,
  PHASE_WAITING_ZEBRA_UP      = 4
} restart_phase_t;
/* Log text for each restart phase, indexed by the phase value.  The table
   holds one more entry than restart_phase_t has values. */
static const char *phase_str[] = {
  "None",
  "Stop jobs running",
  "Waiting for other daemons to come down",
  "Zebra restart job running",
  "Waiting for zebra to come up",
  "Start jobs running",
};
/* How long one phase of a phased restart may last before phase_hanging()
   fires: three times the restart timeout. */
#define PHASE_TIMEOUT	(3*gs.restart_timeout)
/* Bookkeeping for one background command (start/stop/restart job). */
struct restart_info
{
  const char *name;		/* daemon name, or "all" for the global job */
  const char *what;		/* kind of command last launched, for logs */
  pid_t pid;			/* running child, or 0 when none */
  struct timeval time;		/* last launch or completion time */
  long interval;		/* minimum seconds between attempts */
  struct thread *t_kill;	/* timer that signals an overdue child */
  int kills;			/* signals sent to the current child */
};
113 static struct global_state
115 watch_mode_t mode;
116 restart_phase_t phase;
117 struct thread *t_phase_hanging;
118 const char *vtydir;
119 long period;
120 long timeout;
121 long restart_timeout;
122 long min_restart_interval;
123 long max_restart_interval;
124 int do_ping;
125 struct daemon *daemons;
126 const char *restart_command;
127 const char *start_command;
128 const char *stop_command;
129 struct restart_info restart;
130 int unresponsive_restart;
131 int loglevel;
132 struct daemon *special; /* points to zebra when doing phased restart */
133 int numdaemons;
134 int numpids;
135 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
136 } gs = {
137 .mode = MODE_MONITOR,
138 .phase = PHASE_NONE,
139 .vtydir = VTYDIR,
140 .period = 1000*DEFAULT_PERIOD,
141 .timeout = DEFAULT_TIMEOUT,
142 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
143 .loglevel = DEFAULT_LOGLEVEL,
144 .min_restart_interval = DEFAULT_MIN_RESTART,
145 .max_restart_interval = DEFAULT_MAX_RESTART,
146 .do_ping = 1,
/* Connection state of a monitored daemon; the values are used to index
   state_str[]. */
typedef enum
{
  DAEMON_INIT         = 0,
  DAEMON_DOWN         = 1,
  DAEMON_CONNECTING   = 2,
  DAEMON_UP           = 3,
  DAEMON_UNRESPONSIVE = 4
} daemon_state_t;
/* True when the daemon is connected, whether or not it is currently
   answering pings.  DMN is evaluated twice. */
#define IS_UP(DMN) \
  ((DAEMON_UP == (DMN)->state) || (DAEMON_UNRESPONSIVE == (DMN)->state))
/* Log text for each daemon state, indexed by the state value. */
static const char *state_str[] = {
  "Init",
  "Down",
  "Connecting",
  "Up",
  "Unresponsive",
};
170 struct daemon {
171 const char *name;
172 daemon_state_t state;
173 int fd;
174 struct timeval echo_sent;
175 u_int connect_tries;
176 struct thread *t_wakeup;
177 struct thread *t_read;
178 struct thread *t_write;
179 struct daemon *next;
180 struct restart_info restart;
/* Long option table for getopt_long(); each entry maps to the short
   option letter handled in main(). */
static const struct option longopts[] = {
  { "daemon",               no_argument,       NULL, 'd' },
  { "statedir",             required_argument, NULL, 'S' },
  { "no-echo",              no_argument,       NULL, 'e' },
  { "loglevel",             required_argument, NULL, 'l' },
  { "interval",             required_argument, NULL, 'i' },
  { "timeout",              required_argument, NULL, 't' },
  { "restart-timeout",      required_argument, NULL, 'T' },
  { "restart",              required_argument, NULL, 'r' },
  { "start-command",        required_argument, NULL, 's' },
  { "kill-command",         required_argument, NULL, 'k' },
  { "restart-all",          required_argument, NULL, 'R' },
  { "all-restart",          no_argument,       NULL, 'a' },
  { "always-all-restart",   no_argument,       NULL, 'A' },
  { "unresponsive-restart", no_argument,       NULL, 'z' },
  { "min-restart-interval", required_argument, NULL, 'm' },
  { "max-restart-interval", required_argument, NULL, 'M' },
  { "pid-file",             required_argument, NULL, 'p' },
  { "blank-string",         required_argument, NULL, 'b' },
  { "help",                 no_argument,       NULL, 'h' },
  { "version",              no_argument,       NULL, 'v' },
  { NULL,                   0,                 NULL, 0   }	/* terminator */
};
/* Forward declarations for functions that are used before they are
   defined. */
static int  try_connect(struct daemon *dmn);
static int  wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
/* Print usage information and hand back STATUS unchanged, so that callers
   can write "return usage(progname,1);".  A non-zero STATUS prints only a
   one-line hint to stderr; a zero STATUS prints the full help text to
   stdout.  The arguments after the format string fill in the program name,
   the mode descriptions and the built-in defaults. */
213 static int
214 usage(const char *progname, int status)
216 if (status != 0)
217 fprintf(stderr, "Try `%s --help' for more information.\n", progname);
218 else
219 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
220 Watchdog program to monitor status of quagga daemons and try to restart\n\
221 them if they are down or unresponsive. It determines whether a daemon is\n\
222 up based on whether it can connect to the daemon's vty unix stream socket.\n\
223 It then repeatedly sends echo commands over that socket to determine whether\n\
224 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
225 on the socket connection and know immediately that the daemon is down.\n\n\
226 The daemons to be monitored should be listed on the command line.\n\n\
227 This program can run in one of 5 modes:\n\n\
228 0. Mode: %s.\n\
229 Just monitor and report on status changes. Example:\n\
230 %s -d zebra ospfd bgpd\n\n\
231 1. Mode: %s.\n\
232 Whenever any daemon hangs or crashes, use the given command to restart\n\
233 them all. Example:\n\
234 %s -dz \\\n\
235 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
236 zebra ospfd\n\n\
237 2. Mode: %s.\n\
238 When any single daemon hangs or crashes, restart only the daemon that's\n\
239 in trouble using the supplied restart command. Example:\n\
240 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
241 3. Mode: %s.\n\
242 The same as the previous mode, except that there is special treatment when\n\
243 the zebra daemon is in trouble. In that case, a phased restart approach\n\
244 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
245 daemons. Example:\n\
246 %s -adz -r '/sbin/service %%s restart' \\\n\
247 -s '/sbin/service %%s start' \\\n\
248 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
249 4. Mode: %s.\n\
250 This is the same as the previous mode, except that the phased restart\n\
251 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
252 %s -Adz -r '/sbin/service %%s restart' \\\n\
253 -s '/sbin/service %%s start' \\\n\
254 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
255 As of this writing, it is believed that mode 2 [%s]\n\
256 is not safe, and mode 3 [%s] may not be safe with some of the\n\
257 routing daemons.\n\n\
258 In order to avoid attempting to restart the daemons in a fast loop,\n\
259 the -m and -M options allow you to control the minimum delay between\n\
260 restart commands. The minimum restart delay is recalculated each time\n\
261 a restart is attempted: if the time since the last restart attempt exceeds\n\
262 twice the -M value, then the restart delay is set to the -m value.\n\
263 Otherwise, the interval is doubled (but capped at the -M value).\n\n\
264 Options:\n\
265 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
266 to syslog instead of stdout.\n\
267 -S, --statedir Set the vty socket directory (default is %s)\n\
268 -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
269 option is necessary if the daemons do not support the\n\
270 echo command)\n\
271 -l, --loglevel Set the logging level (default is %d).\n\
272 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
273 but it can be set higher than %d if extra-verbose debugging\n\
274 messages are desired.\n\
275 -m, --min-restart-interval\n\
276 Set the minimum seconds to wait between invocations of daemon\n\
277 restart commands (default is %d).\n\
278 -M, --max-restart-interval\n\
279 Set the maximum seconds to wait between invocations of daemon\n\
280 restart commands (default is %d).\n\
281 -i, --interval Set the status polling interval in seconds (default is %d)\n\
282 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
283 -T, --restart-timeout\n\
284 Set the restart (kill) timeout in seconds (default is %d).\n\
285 If any background jobs are still running after this much\n\
286 time has elapsed, they will be killed.\n\
287 -r, --restart Supply a Bourne shell command to use to restart a single\n\
288 daemon. The command string should include '%%s' where the\n\
289 name of the daemon should be substituted.\n\
290 Note that -r and -R are incompatible.\n\
291 -s, --start-command\n\
292 Supply a Bourne shell to command to use to start a single\n\
293 daemon. The command string should include '%%s' where the\n\
294 name of the daemon should be substituted.\n\
295 -k, --kill-command\n\
296 Supply a Bourne shell to command to use to stop a single\n\
297 daemon. The command string should include '%%s' where the\n\
298 name of the daemon should be substituted.\n\
299 -R, --restart-all\n\
300 When one or more daemons is down, try to restart everything\n\
301 using the Bourne shell command supplied as the argument.\n\
302 Note that -r and -R are incompatible.\n\
303 -z, --unresponsive-restart\n\
304 When a daemon is unresponsive, treat it as being down for\n\
305 restart purposes.\n\
306 -a, --all-restart\n\
307 When zebra hangs or crashes, restart all daemons using\n\
308 this phased approach: 1. stop all other daemons; 2. restart\n\
309 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
310 -A, --always-all-restart\n\
311 When any daemon (not just zebra) hangs or crashes, use the\n\
312 same phased restart mechanism described above for -a.\n\
313 Requires -r, -s, and -k.\n\
314 -p, --pid-file Set process identifier file name\n\
315 (default is %s).\n\
316 -b, --blank-string\n\
317 When the supplied argument string is found in any of the\n\
318 various shell command arguments (-r, -s, -k, or -R), replace\n\
319 it with a space. This is an ugly hack to circumvent problems\n\
320 passing command-line arguments with embedded spaces.\n\
321 -v, --version Print program version\n\
322 -h, --help Display this help and exit\n\
323 ", progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
324 progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],mode_str[3],
325 VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
326 DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
327 DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,DEFAULT_PIDFILE);
329 return status;
/* Launch SHELL_CMD through "/bin/sh -c" in a forked child.
   Returns the child's pid in the parent, or -1 if fork() failed.  The
   child is not waited for here; sigchild() reaps it later. */
static pid_t
run_background(const char *shell_cmd)
{
  pid_t child = fork();

  if (child == -1)
    {
      zlog_err("fork failed, cannot run command [%s]: %s",
	       shell_cmd,safe_strerror(errno));
      return -1;
    }

  if (child == 0)
    {
      /* Child process. */
      const char *argv[4] = { "sh", "-c", shell_cmd, NULL};

      /* Give the job its own process group, so that the whole group can be
         signalled at once if it overruns. */
      if (setpgid(0,0) < 0)
	zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));

      execv("/bin/sh",(char *const *)argv);
      /* Only reached when execv itself failed. */
      zlog_err("execv(/bin/sh -c '%s') failed: %s",
	       shell_cmd,safe_strerror(errno));
      _exit(127);
    }

  /* Parent process: we will reap the child later. */
  zlog_err("Forked background command [pid %d]: %s",(int)child,shell_cmd);
  return child;
}
/* Store in *RESULT the wall-clock time that has passed since *START_TIME
   and return RESULT.  The microsecond field is normalized so that it is
   never negative. */
static struct timeval *
time_elapsed(struct timeval *result, const struct timeval *start_time)
{
  gettimeofday(result,NULL);
  result->tv_sec -= start_time->tv_sec;
  result->tv_usec -= start_time->tv_usec;

  /* Borrow whole seconds until the microsecond field is non-negative. */
  for (; result->tv_usec < 0; result->tv_sec--)
    result->tv_usec += 1000000L;

  return result;
}
376 static int
377 restart_kill(struct thread *t_kill)
379 struct restart_info *restart = THREAD_ARG(t_kill);
380 struct timeval delay;
382 time_elapsed(&delay,&restart->time);
383 zlog_warn("Warning: %s %s child process %d still running after "
384 "%ld seconds, sending signal %d",
385 restart->what,restart->name,(int)restart->pid,delay.tv_sec,
386 (restart->kills ? SIGKILL : SIGTERM));
387 kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
388 restart->kills++;
389 restart->t_kill = thread_add_timer(master,restart_kill,restart,
390 gs.restart_timeout);
391 return 0;
394 static struct restart_info *
395 find_child(pid_t child)
397 if (gs.mode == MODE_GLOBAL_RESTART)
399 if (gs.restart.pid == child)
400 return &gs.restart;
402 else
404 struct daemon *dmn;
405 for (dmn = gs.daemons; dmn; dmn = dmn->next)
407 if (dmn->restart.pid == child)
408 return &dmn->restart;
411 return NULL;
414 static void
415 sigchild(void)
417 pid_t child;
418 int status;
419 const char *name;
420 const char *what;
421 struct restart_info *restart;
423 switch (child = waitpid(-1,&status,WNOHANG))
425 case -1:
426 zlog_err("waitpid failed: %s",safe_strerror(errno));
427 return;
428 case 0:
429 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
430 return;
433 if ((restart = find_child(child)) != NULL)
435 name = restart->name;
436 what = restart->what;
437 restart->pid = 0;
438 gs.numpids--;
439 thread_cancel(restart->t_kill);
440 restart->t_kill = NULL;
441 /* Update restart time to reflect the time the command completed. */
442 gettimeofday(&restart->time,NULL);
444 else
446 zlog_err("waitpid returned status for an unknown child process %d",
447 (int)child);
448 name = "(unknown)";
449 what = "background";
451 if (WIFSTOPPED(status))
452 zlog_warn("warning: %s %s process %d is stopped",
453 what,name,(int)child);
454 else if (WIFSIGNALED(status))
455 zlog_warn("%s %s process %d terminated due to signal %d",
456 what,name,(int)child,WTERMSIG(status));
457 else if (WIFEXITED(status))
459 if (WEXITSTATUS(status) != 0)
460 zlog_warn("%s %s process %d exited with non-zero status %d",
461 what,name,(int)child,WEXITSTATUS(status));
462 else
463 zlog_debug("%s %s process %d exited normally",what,name,(int)child);
465 else
466 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
467 what,name,(int)child,status);
468 phase_check();
471 static int
472 run_job(struct restart_info *restart, const char *cmdtype, const char *command,
473 int force, int update_interval)
475 struct timeval delay;
477 if (gs.loglevel > LOG_DEBUG+1)
478 zlog_debug("attempting to %s %s",cmdtype,restart->name);
480 if (restart->pid)
482 if (gs.loglevel > LOG_DEBUG+1)
483 zlog_debug("cannot %s %s, previous pid %d still running",
484 cmdtype,restart->name,(int)restart->pid);
485 return -1;
488 /* Note: time_elapsed test must come before the force test, since we need
489 to make sure that delay is initialized for use below in updating the
490 restart interval. */
491 if ((time_elapsed(&delay,&restart->time)->tv_sec < restart->interval) &&
492 !force)
494 if (gs.loglevel > LOG_DEBUG+1)
495 zlog_debug("postponing %s %s: "
496 "elapsed time %ld < retry interval %ld",
497 cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
498 return -1;
501 gettimeofday(&restart->time,NULL);
502 restart->kills = 0;
504 char cmd[strlen(command)+strlen(restart->name)+1];
505 snprintf(cmd,sizeof(cmd),command,restart->name);
506 if ((restart->pid = run_background(cmd)) > 0)
508 restart->t_kill = thread_add_timer(master,restart_kill,restart,
509 gs.restart_timeout);
510 restart->what = cmdtype;
511 gs.numpids++;
513 else
514 restart->pid = 0;
517 /* Calculate the new restart interval. */
518 if (update_interval)
520 if (delay.tv_sec > 2*gs.max_restart_interval)
521 restart->interval = gs.min_restart_interval;
522 else if ((restart->interval *= 2) > gs.max_restart_interval)
523 restart->interval = gs.max_restart_interval;
524 if (gs.loglevel > LOG_DEBUG+1)
525 zlog_debug("restart %s interval is now %ld",
526 restart->name,restart->interval);
528 return restart->pid;
/* Register handle_read() for input on the daemon's socket. */
#define SET_READ_HANDLER(DMN) \
  (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)

/* The three macros below arm the daemon's wakeup timer to fire after one
   randomized polling period; they differ only in the callback. */

/* Next action: retry the connection (wakeup_down). */
#define SET_WAKEUP_DOWN(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
					  FUZZY(gs.period))

/* Next action: reconsider a restart (wakeup_unresponsive). */
#define SET_WAKEUP_UNRESPONSIVE(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
					  FUZZY(gs.period))

/* Next action: send an echo ping (wakeup_send_echo). */
#define SET_WAKEUP_ECHO(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
					  FUZZY(gs.period))
546 static int
547 wakeup_down(struct thread *t_wakeup)
549 struct daemon *dmn = THREAD_ARG(t_wakeup);
551 dmn->t_wakeup = NULL;
552 if (try_connect(dmn) < 0)
553 SET_WAKEUP_DOWN(dmn);
554 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
555 try_restart(dmn);
556 return 0;
559 static int
560 wakeup_init(struct thread *t_wakeup)
562 struct daemon *dmn = THREAD_ARG(t_wakeup);
564 dmn->t_wakeup = NULL;
565 if (try_connect(dmn) < 0)
567 SET_WAKEUP_DOWN(dmn);
568 zlog_err("%s state -> down : initial connection attempt failed",
569 dmn->name);
570 dmn->state = DAEMON_DOWN;
572 return 0;
575 static void
576 daemon_down(struct daemon *dmn, const char *why)
578 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
579 zlog_err("%s state -> down : %s",dmn->name,why);
580 else if (gs.loglevel > LOG_DEBUG)
581 zlog_debug("%s still down : %s",dmn->name,why);
582 if (IS_UP(dmn))
583 gs.numdown++;
584 dmn->state = DAEMON_DOWN;
585 if (dmn->fd >= 0)
587 close(dmn->fd);
588 dmn->fd = -1;
590 THREAD_OFF(dmn->t_read);
591 THREAD_OFF(dmn->t_write);
592 THREAD_OFF(dmn->t_wakeup);
593 if (try_connect(dmn) < 0)
594 SET_WAKEUP_DOWN(dmn);
595 phase_check();
598 static int
599 handle_read(struct thread *t_read)
601 struct daemon *dmn = THREAD_ARG(t_read);
602 static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
603 char buf[sizeof(resp)+100];
604 ssize_t rc;
605 struct timeval delay;
607 dmn->t_read = NULL;
608 if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
610 char why[100];
612 if (ERRNO_IO_RETRY(errno))
614 /* Pretend it never happened. */
615 SET_READ_HANDLER(dmn);
616 return 0;
618 snprintf(why,sizeof(why),"unexpected read error: %s",
619 safe_strerror(errno));
620 daemon_down(dmn,why);
621 return 0;
623 if (rc == 0)
625 daemon_down(dmn,"read returned EOF");
626 return 0;
628 if (!dmn->echo_sent.tv_sec)
630 char why[sizeof(buf)+100];
631 snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
632 (int)rc,(int)rc,buf);
633 daemon_down(dmn,why);
634 return 0;
637 /* We are expecting an echo response: is there any chance that the
638 response would not be returned entirely in the first read? That
639 seems inconceivable... */
640 if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
642 char why[100+sizeof(buf)];
643 snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
644 "(expecting %u): %.*s",
645 (int)rc,(u_int)sizeof(resp),(int)rc,buf);
646 daemon_down(dmn,why);
647 return 0;
650 time_elapsed(&delay,&dmn->echo_sent);
651 dmn->echo_sent.tv_sec = 0;
652 if (dmn->state == DAEMON_UNRESPONSIVE)
654 if (delay.tv_sec < gs.timeout)
656 dmn->state = DAEMON_UP;
657 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
658 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
660 else
661 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
662 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
664 else if (gs.loglevel > LOG_DEBUG+1)
665 zlog_debug("%s: echo response received after %ld.%06ld seconds",
666 dmn->name,delay.tv_sec,delay.tv_usec);
668 SET_READ_HANDLER(dmn);
669 if (dmn->t_wakeup)
670 thread_cancel(dmn->t_wakeup);
671 SET_WAKEUP_ECHO(dmn);
673 return 0;
676 static void
677 daemon_up(struct daemon *dmn, const char *why)
679 dmn->state = DAEMON_UP;
680 gs.numdown--;
681 dmn->connect_tries = 0;
682 zlog_notice("%s state -> up : %s",dmn->name,why);
683 if (gs.do_ping)
684 SET_WAKEUP_ECHO(dmn);
685 phase_check();
688 static int
689 check_connect(struct thread *t_write)
691 struct daemon *dmn = THREAD_ARG(t_write);
692 int sockerr;
693 socklen_t reslen = sizeof(sockerr);
695 dmn->t_write = NULL;
696 if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
698 zlog_warn("%s: check_connect: getsockopt failed: %s",
699 dmn->name,safe_strerror(errno));
700 daemon_down(dmn,"getsockopt failed checking connection success");
701 return 0;
703 if ((reslen == sizeof(sockerr)) && sockerr)
705 char why[100];
706 snprintf(why,sizeof(why),
707 "getsockopt reports that connection attempt failed: %s",
708 safe_strerror(sockerr));
709 daemon_down(dmn,why);
710 return 0;
713 daemon_up(dmn,"delayed connect succeeded");
714 return 0;
717 static int
718 wakeup_connect_hanging(struct thread *t_wakeup)
720 struct daemon *dmn = THREAD_ARG(t_wakeup);
721 char why[100];
723 dmn->t_wakeup = NULL;
724 snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
725 gs.timeout);
726 daemon_down(dmn,why);
727 return 0;
730 /* Making connection to protocol daemon. */
731 static int
732 try_connect(struct daemon *dmn)
734 int sock;
735 struct sockaddr_un addr;
736 socklen_t len;
738 if (gs.loglevel > LOG_DEBUG+1)
739 zlog_debug("%s: attempting to connect",dmn->name);
740 dmn->connect_tries++;
742 memset (&addr, 0, sizeof (struct sockaddr_un));
743 addr.sun_family = AF_UNIX;
744 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
745 gs.vtydir,dmn->name);
746 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
747 len = addr.sun_len = SUN_LEN(&addr);
748 #else
749 len = sizeof (addr.sun_family) + strlen (addr.sun_path);
750 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
752 /* Quick check to see if we might succeed before we go to the trouble
753 of creating a socket. */
754 if (access(addr.sun_path, W_OK) < 0)
756 if (errno != ENOENT)
757 zlog_err("%s: access to socket %s denied: %s",
758 dmn->name,addr.sun_path,safe_strerror(errno));
759 return -1;
762 if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
764 zlog_err("%s(%s): cannot make socket: %s",
765 __func__,addr.sun_path, safe_strerror(errno));
766 return -1;
769 if (set_nonblocking(sock) < 0)
771 zlog_err("%s(%s): set_nonblocking(%d) failed",
772 __func__, addr.sun_path, sock);
773 close(sock);
774 return -1;
777 if (connect (sock, (struct sockaddr *) &addr, len) < 0)
779 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
781 if (gs.loglevel > LOG_DEBUG)
782 zlog_debug("%s(%s): connect failed: %s",
783 __func__,addr.sun_path, safe_strerror(errno));
784 close (sock);
785 return -1;
787 if (gs.loglevel > LOG_DEBUG)
788 zlog_debug("%s: connection in progress",dmn->name);
789 dmn->state = DAEMON_CONNECTING;
790 dmn->fd = sock;
791 dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
792 dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
793 gs.timeout);
794 SET_READ_HANDLER(dmn);
795 return 0;
798 dmn->fd = sock;
799 SET_READ_HANDLER(dmn);
800 daemon_up(dmn,"connect succeeded");
801 return 1;
804 static int
805 phase_hanging(struct thread *t_hanging)
807 gs.t_phase_hanging = NULL;
808 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
809 phase_str[gs.phase],PHASE_TIMEOUT);
810 gs.phase = PHASE_NONE;
811 return 0;
814 static void
815 set_phase(restart_phase_t new_phase)
817 gs.phase = new_phase;
818 if (gs.t_phase_hanging)
819 thread_cancel(gs.t_phase_hanging);
820 gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
821 PHASE_TIMEOUT);
824 static void
825 phase_check(void)
827 switch (gs.phase)
829 case PHASE_NONE:
830 break;
831 case PHASE_STOPS_PENDING:
832 if (gs.numpids)
833 break;
834 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
835 set_phase(PHASE_WAITING_DOWN);
836 /*FALLTHRU*/
837 case PHASE_WAITING_DOWN:
838 if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
839 break;
840 zlog_info("Phased restart: all routing daemons now down.");
841 run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
842 set_phase(PHASE_ZEBRA_RESTART_PENDING);
843 /*FALLTHRU*/
844 case PHASE_ZEBRA_RESTART_PENDING:
845 if (gs.special->restart.pid)
846 break;
847 zlog_info("Phased restart: %s restart job completed.",gs.special->name);
848 set_phase(PHASE_WAITING_ZEBRA_UP);
849 /*FALLTHRU*/
850 case PHASE_WAITING_ZEBRA_UP:
851 if (!IS_UP(gs.special))
852 break;
853 zlog_info("Phased restart: %s is now up.",gs.special->name);
855 struct daemon *dmn;
856 for (dmn = gs.daemons; dmn; dmn = dmn->next)
858 if (dmn != gs.special)
859 run_job(&dmn->restart,"start",gs.start_command,1,0);
862 gs.phase = PHASE_NONE;
863 THREAD_OFF(gs.t_phase_hanging);
864 zlog_notice("Phased global restart has completed.");
865 break;
869 static void
870 try_restart(struct daemon *dmn)
872 switch (gs.mode)
874 case MODE_MONITOR:
875 return;
876 case MODE_GLOBAL_RESTART:
877 run_job(&gs.restart,"restart",gs.restart_command,0,1);
878 break;
879 case MODE_SEPARATE_RESTART:
880 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
881 break;
882 case MODE_PHASED_ZEBRA_RESTART:
883 if (dmn != gs.special)
885 if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
886 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
887 else
888 zlog_debug("%s: postponing restart attempt because master %s daemon "
889 "not up [%s], or phased restart in progress",
890 dmn->name,gs.special->name,state_str[gs.special->state]);
891 break;
893 /*FALLTHRU*/
894 case MODE_PHASED_ALL_RESTART:
895 if ((gs.phase != PHASE_NONE) || gs.numpids)
897 if (gs.loglevel > LOG_DEBUG+1)
898 zlog_debug("postponing phased global restart: restart already in "
899 "progress [%s], or outstanding child processes [%d]",
900 phase_str[gs.phase],gs.numpids);
901 break;
903 /* Is it too soon for a restart? */
905 struct timeval delay;
906 if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
907 gs.special->restart.interval)
909 if (gs.loglevel > LOG_DEBUG+1)
910 zlog_debug("postponing phased global restart: "
911 "elapsed time %ld < retry interval %ld",
912 (long)delay.tv_sec,gs.special->restart.interval);
913 break;
916 zlog_info("Phased restart: stopping all routing daemons.");
917 /* First step: stop all other daemons. */
918 for (dmn = gs.daemons; dmn; dmn = dmn->next)
920 if (dmn != gs.special)
921 run_job(&dmn->restart,"stop",gs.stop_command,1,1);
923 set_phase(PHASE_STOPS_PENDING);
924 break;
925 default:
926 zlog_err("error: unknown restart mode %d",gs.mode);
927 break;
931 static int
932 wakeup_unresponsive(struct thread *t_wakeup)
934 struct daemon *dmn = THREAD_ARG(t_wakeup);
936 dmn->t_wakeup = NULL;
937 if (dmn->state != DAEMON_UNRESPONSIVE)
938 zlog_err("%s: no longer unresponsive (now %s), "
939 "wakeup should have been cancelled!",
940 dmn->name,state_str[dmn->state]);
941 else
943 SET_WAKEUP_UNRESPONSIVE(dmn);
944 try_restart(dmn);
946 return 0;
949 static int
950 wakeup_no_answer(struct thread *t_wakeup)
952 struct daemon *dmn = THREAD_ARG(t_wakeup);
954 dmn->t_wakeup = NULL;
955 dmn->state = DAEMON_UNRESPONSIVE;
956 zlog_err("%s state -> unresponsive : no response yet to ping "
957 "sent %ld seconds ago",dmn->name,gs.timeout);
958 if (gs.unresponsive_restart)
960 SET_WAKEUP_UNRESPONSIVE(dmn);
961 try_restart(dmn);
963 return 0;
966 static int
967 wakeup_send_echo(struct thread *t_wakeup)
969 static const char echocmd[] = "echo " PING_TOKEN;
970 ssize_t rc;
971 struct daemon *dmn = THREAD_ARG(t_wakeup);
973 dmn->t_wakeup = NULL;
974 if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
975 ((size_t)rc != sizeof(echocmd)))
977 char why[100+sizeof(echocmd)];
978 snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
979 echocmd,(int)rc,(u_int)sizeof(echocmd));
980 daemon_down(dmn,why);
982 else
984 gettimeofday(&dmn->echo_sent,NULL);
985 dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
987 return 0;
/* Handler installed for SIGINT and SIGTERM: log the event and exit with
   status 0. */
static void
sigint(void)
{
  zlog_notice("Terminating on signal");
  exit(0);
}
/* Check a per-daemon command template: it must contain exactly one '%'
   character, and that character must be immediately followed by 's'.
   Returns 1 when the template is acceptable, 0 otherwise. */
static int
valid_command(const char *cmd)
{
  const char *pct = strchr(cmd,'%');

  if (pct == NULL)
    return 0;		/* no conversion at all */
  if (pct[1] != 's')
    return 0;		/* some conversion other than %s */
  /* Any further '%' would mean a second conversion. */
  return strchr(pct+1,'%') == NULL;
}
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of CMD in which every occurrence of
   BLANKSTR has been replaced by a single space.  The caller owns the
   returned string.  The program exits with status 1 if memory cannot be
   allocated.

   The string is scanned once from left to right, and the search resumes
   after each inserted space.  This guarantees termination even when
   BLANKSTR itself contains a space.  An empty BLANKSTR matches nothing and
   yields an unchanged copy. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  char *res;
  char *p;
  size_t bslen = strlen(blankstr);

  if (!(res = strdup(cmd)))
    {
      perror("strdup");
      exit(1);
    }

  /* strstr() finds an empty needle at every position; replacing it would
     never make progress and would grow the string past its allocation. */
  if (bslen == 0)
    return res;

  p = res;
  while ((p = strstr(p,blankstr)) != NULL)
    {
      *p = ' ';
      /* Close the gap left by a marker longer than one character; the
         move includes the terminating NUL. */
      if (bslen != 1)
	memmove(p+1,p+bslen,strlen(p+bslen)+1);
      /* Continue after the space that was just written. */
      p++;
    }
  return res;
}
1029 main(int argc, char **argv)
1031 const char *progname;
1032 int opt;
1033 int daemon_mode = 0;
1034 const char *pidfile = DEFAULT_PIDFILE;
1035 const char *special = "zebra";
1036 const char *blankstr = NULL;
1037 static struct quagga_signal_t my_signals[] =
1040 .signal = SIGINT,
1041 .handler = sigint,
1044 .signal = SIGTERM,
1045 .handler = sigint,
1048 .signal = SIGCHLD,
1049 .handler = sigchild,
1053 if ((progname = strrchr (argv[0], '/')) != NULL)
1054 progname++;
1055 else
1056 progname = argv[0];
1058 gs.restart.name = "all";
1059 while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
1060 longopts, 0)) != EOF)
1062 switch (opt)
1064 case 0:
1065 break;
1066 case 'a':
1067 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1069 fputs("Ambiguous operating mode selected.\n",stderr);
1070 return usage(progname,1);
1072 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1073 break;
1074 case 'A':
1075 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1077 fputs("Ambiguous operating mode selected.\n",stderr);
1078 return usage(progname,1);
1080 gs.mode = MODE_PHASED_ALL_RESTART;
1081 break;
1082 case 'b':
1083 blankstr = optarg;
1084 break;
1085 case 'd':
1086 daemon_mode = 1;
1087 break;
1088 case 'e':
1089 gs.do_ping = 0;
1090 break;
1091 case 'k':
1092 if (!valid_command(optarg))
1094 fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
1095 optarg);
1096 return usage(progname,1);
1098 gs.stop_command = optarg;
1099 break;
1100 case 'l':
1102 char garbage[3];
1103 if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
1104 (gs.loglevel < LOG_EMERG))
1106 fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
1107 return usage(progname,1);
1110 break;
1111 case 'm':
1113 char garbage[3];
1114 if ((sscanf(optarg,"%ld%1s",
1115 &gs.min_restart_interval,garbage) != 1) ||
1116 (gs.min_restart_interval < 0))
1118 fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
1119 optarg);
1120 return usage(progname,1);
1123 break;
1124 case 'M':
1126 char garbage[3];
1127 if ((sscanf(optarg,"%ld%1s",
1128 &gs.max_restart_interval,garbage) != 1) ||
1129 (gs.max_restart_interval < 0))
1131 fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
1132 optarg);
1133 return usage(progname,1);
1136 break;
1137 case 'i':
1139 char garbage[3];
1140 int period;
1141 if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
1142 (gs.period < 1))
1144 fprintf(stderr,"Invalid interval argument: %s\n",optarg);
1145 return usage(progname,1);
1147 gs.period = 1000*period;
1149 break;
1150 case 'p':
1151 pidfile = optarg;
1152 break;
1153 case 'r':
1154 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1155 (gs.mode == MODE_SEPARATE_RESTART))
1157 fputs("Ambiguous operating mode selected.\n",stderr);
1158 return usage(progname,1);
1160 if (!valid_command(optarg))
1162 fprintf(stderr,
1163 "Invalid restart command, must contain '%%s': %s\n",
1164 optarg);
1165 return usage(progname,1);
1167 gs.restart_command = optarg;
1168 if (gs.mode == MODE_MONITOR)
1169 gs.mode = MODE_SEPARATE_RESTART;
1170 break;
1171 case 'R':
1172 if (gs.mode != MODE_MONITOR)
1174 fputs("Ambiguous operating mode selected.\n",stderr);
1175 return usage(progname,1);
1177 if (strchr(optarg,'%'))
1179 fprintf(stderr,
1180 "Invalid restart-all arg, must not contain '%%s': %s\n",
1181 optarg);
1182 return usage(progname,1);
1184 gs.restart_command = optarg;
1185 gs.mode = MODE_GLOBAL_RESTART;
1186 break;
1187 case 's':
1188 if (!valid_command(optarg))
1190 fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
1191 optarg);
1192 return usage(progname,1);
1194 gs.start_command = optarg;
1195 break;
1196 case 'S':
1197 gs.vtydir = optarg;
1198 break;
1199 case 't':
1201 char garbage[3];
1202 if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
1203 (gs.timeout < 1))
1205 fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
1206 return usage(progname,1);
1209 break;
1210 case 'T':
1212 char garbage[3];
1213 if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
1214 (gs.restart_timeout < 1))
1216 fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
1217 return usage(progname,1);
1220 break;
1221 case 'z':
1222 gs.unresponsive_restart = 1;
1223 break;
1224 case 'v':
1225 printf ("%s version %s\n", progname, QUAGGA_VERSION);
1226 puts("Copyright 2004 Andrew J. Schorr");
1227 return 0;
1228 case 'h':
1229 return usage(progname,0);
1230 default:
1231 fputs("Invalid option.\n",stderr);
1232 return usage(progname,1);
1236 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
1238 fputs("Option -z requires a -r or -R restart option.\n",stderr);
1239 return usage(progname,1);
1241 switch (gs.mode)
1243 case MODE_MONITOR:
1244 if (gs.restart_command || gs.start_command || gs.stop_command)
1246 fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
1247 mode_str[gs.mode]);
1248 return usage(progname,1);
1250 break;
1251 case MODE_GLOBAL_RESTART:
1252 case MODE_SEPARATE_RESTART:
1253 if (!gs.restart_command || gs.start_command || gs.stop_command)
1255 fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
1256 mode_str[gs.mode]);
1257 return usage(progname,1);
1259 break;
1260 case MODE_PHASED_ZEBRA_RESTART:
1261 case MODE_PHASED_ALL_RESTART:
1262 if (!gs.restart_command || !gs.start_command || !gs.stop_command)
1264 fprintf(stderr,
1265 "Need start, kill, and restart commands in [%s] mode.\n",
1266 mode_str[gs.mode]);
1267 return usage(progname,1);
1269 break;
1272 if (blankstr)
1274 if (gs.restart_command)
1275 gs.restart_command = translate_blanks(gs.restart_command,blankstr);
1276 if (gs.start_command)
1277 gs.start_command = translate_blanks(gs.start_command,blankstr);
1278 if (gs.stop_command)
1279 gs.stop_command = translate_blanks(gs.stop_command,blankstr);
1282 gs.restart.interval = gs.min_restart_interval;
1283 master = thread_master_create();
1284 signal_init (master, Q_SIGC(my_signals), my_signals);
1285 srandom(time(NULL));
1288 int i;
1289 struct daemon *tail = NULL;
1291 for (i = optind; i < argc; i++)
1293 struct daemon *dmn;
1295 if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
1297 fprintf(stderr,"calloc(1,%u) failed: %s\n",
1298 (u_int)sizeof(*dmn), safe_strerror(errno));
1299 return 1;
1301 dmn->name = dmn->restart.name = argv[i];
1302 dmn->state = DAEMON_INIT;
1303 gs.numdaemons++;
1304 gs.numdown++;
1305 dmn->fd = -1;
1306 dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
1307 100+(random() % 900));
1308 dmn->restart.interval = gs.min_restart_interval;
1309 if (tail)
1310 tail->next = dmn;
1311 else
1312 gs.daemons = dmn;
1313 tail = dmn;
1315 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1316 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1317 !strcmp(dmn->name,special))
1318 gs.special = dmn;
1321 if (!gs.daemons)
1323 fputs("Must specify one or more daemons to monitor.\n",stderr);
1324 return usage(progname,1);
1326 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1327 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
1329 fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
1330 mode_str[gs.mode],special);
1331 return usage(progname,1);
1333 if (gs.special && (gs.numdaemons < 2))
1335 fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
1336 "to watch.\n",mode_str[gs.mode]);
1337 return usage(progname,1);
1340 zlog_default = openzlog(progname, ZLOG_NONE,
1341 LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
1342 zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1343 if (daemon_mode)
1345 zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
1346 if (daemon (0, 0) < 0)
1348 fprintf(stderr, "Watchquagga daemon failed: %s", strerror(errno));
1349 exit (1);
1352 else
1353 zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
1355 /* Make sure we're not already running. */
1356 pid_output (pidfile);
1358 /* Announce which daemons are being monitored. */
1360 struct daemon *dmn;
1361 size_t len = 0;
1363 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1364 len += strlen(dmn->name)+1;
1367 char buf[len+1];
1368 char *p = buf;
1370 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1372 if (p != buf)
1373 *p++ = ' ';
1374 strcpy(p,dmn->name);
1375 p += strlen(p);
1377 zlog_notice("%s %s watching [%s], mode [%s]",
1378 progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
1383 struct thread thread;
1385 while (thread_fetch (master, &thread))
1386 thread_call (&thread);
1389 /* Not reached. */
1390 return 0;