4 Monitor status of quagga daemons and restart if necessary.
6 Copyright (C) 2004 Andrew J. Schorr
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28 #include <lib/version.h>
/* Smaller of X and Y.  Each argument may be evaluated twice, so do not pass
   expressions with side effects. */
34 #define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
37 /* Macros to help randomize timers. */
/* JITTER(X): random() % (X+1) shifted down by X/2, i.e. for non-negative X a
   value between -(X/2) and X-(X/2). */
38 #define JITTER(X) ((random() % ((X)+1))-((X)/2))
/* FUZZY(X): X plus a jitter whose span is one twentieth of X. */
39 #define FUZZY(X) ((X)+JITTER((X)/20))
41 #define DEFAULT_PERIOD 5
42 #define DEFAULT_TIMEOUT 10
43 #define DEFAULT_RESTART_TIMEOUT 20
44 #define DEFAULT_LOGLEVEL LOG_INFO
45 #define DEFAULT_MIN_RESTART 60
46 #define DEFAULT_MAX_RESTART 600
47 #ifdef PATH_WATCHQUAGGA_PID
48 #define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
50 #define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
53 #define VTYDIR DAEMON_VTY_DIR
55 #define VTYDIR STATEDIR
58 #define PING_TOKEN "PING"
60 /* Needs to be global, referenced somewhere inside libzebra. */
61 struct thread_master
*master
;
67 MODE_SEPARATE_RESTART
,
68 MODE_PHASED_ZEBRA_RESTART
,
69 MODE_PHASED_ALL_RESTART
72 static const char *mode_str
[] =
76 "individual daemon restart",
77 "phased zebra restart",
78 "phased global restart for any failure",
86 PHASE_ZEBRA_RESTART_PENDING
,
87 PHASE_WAITING_ZEBRA_UP
90 static const char *phase_str
[] =
94 "Waiting for other daemons to come down",
95 "Zebra restart job running",
96 "Waiting for zebra to come up",
100 #define PHASE_TIMEOUT (3*gs.restart_timeout)
109 struct thread
*t_kill
;
113 static struct global_state
116 restart_phase_t phase
;
117 struct thread
*t_phase_hanging
;
121 long restart_timeout
;
122 long min_restart_interval
;
123 long max_restart_interval
;
125 struct daemon
*daemons
;
126 const char *restart_command
;
127 const char *start_command
;
128 const char *stop_command
;
129 struct restart_info restart
;
130 int unresponsive_restart
;
132 struct daemon
*special
; /* points to zebra when doing phased restart */
135 int numdown
; /* # of daemons that are not UP or UNRESPONSIVE */
137 .mode
= MODE_MONITOR
,
140 .period
= 1000*DEFAULT_PERIOD
,
141 .timeout
= DEFAULT_TIMEOUT
,
142 .restart_timeout
= DEFAULT_RESTART_TIMEOUT
,
143 .loglevel
= DEFAULT_LOGLEVEL
,
144 .min_restart_interval
= DEFAULT_MIN_RESTART
,
145 .max_restart_interval
= DEFAULT_MAX_RESTART
,
159 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
161 static const char *state_str
[] =
172 daemon_state_t state
;
174 struct timeval echo_sent
;
176 struct thread
*t_wakeup
;
177 struct thread
*t_read
;
178 struct thread
*t_write
;
180 struct restart_info restart
;
/* Long-option table handed to getopt_long() in main().  Each entry gives the
   long name, whether it takes an argument, a NULL flag pointer, and the
   single-character code returned for it; the codes and their argument
   requirements match the short-option string used in main().
   NOTE(review): the terminating all-zero entry that getopt_long requires is
   not visible in this extract. */
183 static const struct option longopts
[] =
185 { "daemon", no_argument
, NULL
, 'd'},
186 { "statedir", required_argument
, NULL
, 'S'},
187 { "no-echo", no_argument
, NULL
, 'e'},
188 { "loglevel", required_argument
, NULL
, 'l'},
189 { "interval", required_argument
, NULL
, 'i'},
190 { "timeout", required_argument
, NULL
, 't'},
191 { "restart-timeout", required_argument
, NULL
, 'T'},
192 { "restart", required_argument
, NULL
, 'r'},
193 { "start-command", required_argument
, NULL
, 's'},
194 { "kill-command", required_argument
, NULL
, 'k'},
195 { "restart-all", required_argument
, NULL
, 'R'},
196 { "all-restart", no_argument
, NULL
, 'a'},
197 { "always-all-restart", no_argument
, NULL
, 'A'},
198 { "unresponsive-restart", no_argument
, NULL
, 'z'},
199 { "min-restart-interval", required_argument
, NULL
, 'm'},
200 { "max-restart-interval", required_argument
, NULL
, 'M'},
201 { "pid-file", required_argument
, NULL
, 'p'},
202 { "blank-string", required_argument
, NULL
, 'b'},
203 { "help", no_argument
, NULL
, 'h'},
204 { "version", no_argument
, NULL
, 'v'},
208 static int try_connect(struct daemon
*dmn
);
209 static int wakeup_send_echo(struct thread
*t_wakeup
);
210 static void try_restart(struct daemon
*dmn
);
211 static void phase_check(void);
/* Print usage information for progname.
   Two outputs are visible: a one-line "Try --help" hint written to stderr,
   and the full help text written to stdout, whose format arguments are
   progname, the mode_str[] descriptions, VTYDIR and the DEFAULT_* values.
   main() returns the result of usage(progname, status).
   NOTE(review): the condition choosing between the two outputs and the return
   statement are not visible in this extract; presumably a non-zero status
   selects the stderr hint -- confirm.
   NOTE(review): the help text for -s and -k reads "Bourne shell to command";
   "Bourne shell command" looks intended. */
214 usage(const char *progname
, int status
)
217 fprintf(stderr
, "Try `%s --help' for more information.\n", progname
);
219 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
220 Watchdog program to monitor status of quagga daemons and try to restart\n\
221 them if they are down or unresponsive. It determines whether a daemon is\n\
222 up based on whether it can connect to the daemon's vty unix stream socket.\n\
223 It then repeatedly sends echo commands over that socket to determine whether\n\
224 the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
225 on the socket connection and know immediately that the daemon is down.\n\n\
226 The daemons to be monitored should be listed on the command line.\n\n\
227 This program can run in one of 5 modes:\n\n\
229 Just monitor and report on status changes. Example:\n\
230 %s -d zebra ospfd bgpd\n\n\
232 Whenever any daemon hangs or crashes, use the given command to restart\n\
233 them all. Example:\n\
235 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
238 When any single daemon hangs or crashes, restart only the daemon that's\n\
239 in trouble using the supplied restart command. Example:\n\
240 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
242 The same as the previous mode, except that there is special treatment when\n\
243 the zebra daemon is in trouble. In that case, a phased restart approach\n\
244 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
246 %s -adz -r '/sbin/service %%s restart' \\\n\
247 -s '/sbin/service %%s start' \\\n\
248 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
250 This is the same as the previous mode, except that the phased restart\n\
251 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
252 %s -Adz -r '/sbin/service %%s restart' \\\n\
253 -s '/sbin/service %%s start' \\\n\
254 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
255 As of this writing, it is believed that mode 2 [%s]\n\
256 is not safe, and mode 3 [%s] may not be safe with some of the\n\
257 routing daemons.\n\n\
258 In order to avoid attempting to restart the daemons in a fast loop,\n\
259 the -m and -M options allow you to control the minimum delay between\n\
260 restart commands. The minimum restart delay is recalculated each time\n\
261 a restart is attempted: if the time since the last restart attempt exceeds\n\
262 twice the -M value, then the restart delay is set to the -m value.\n\
263 Otherwise, the interval is doubled (but capped at the -M value).\n\n\
265 -d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
266 to syslog instead of stdout.\n\
267 -S, --statedir Set the vty socket directory (default is %s)\n\
268 -e, --no-echo Do not ping the daemons to test responsiveness (this\n\
269 option is necessary if the daemons do not support the\n\
271 -l, --loglevel Set the logging level (default is %d).\n\
272 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
273 but it can be set higher than %d if extra-verbose debugging\n\
274 messages are desired.\n\
275 -m, --min-restart-interval\n\
276 Set the minimum seconds to wait between invocations of daemon\n\
277 restart commands (default is %d).\n\
278 -M, --max-restart-interval\n\
279 Set the maximum seconds to wait between invocations of daemon\n\
280 restart commands (default is %d).\n\
281 -i, --interval Set the status polling interval in seconds (default is %d)\n\
282 -t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
283 -T, --restart-timeout\n\
284 Set the restart (kill) timeout in seconds (default is %d).\n\
285 If any background jobs are still running after this much\n\
286 time has elapsed, they will be killed.\n\
287 -r, --restart Supply a Bourne shell command to use to restart a single\n\
288 daemon. The command string should include '%%s' where the\n\
289 name of the daemon should be substituted.\n\
290 Note that -r and -R are incompatible.\n\
291 -s, --start-command\n\
292 Supply a Bourne shell to command to use to start a single\n\
293 daemon. The command string should include '%%s' where the\n\
294 name of the daemon should be substituted.\n\
295 -k, --kill-command\n\
296 Supply a Bourne shell to command to use to stop a single\n\
297 daemon. The command string should include '%%s' where the\n\
298 name of the daemon should be substituted.\n\
300 When one or more daemons is down, try to restart everything\n\
301 using the Bourne shell command supplied as the argument.\n\
302 Note that -r and -R are incompatible.\n\
303 -z, --unresponsive-restart\n\
304 When a daemon is unresponsive, treat it as being down for\n\
307 When zebra hangs or crashes, restart all daemons using\n\
308 this phased approach: 1. stop all other daemons; 2. restart\n\
309 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
310 -A, --always-all-restart\n\
311 When any daemon (not just zebra) hangs or crashes, use the\n\
312 same phased restart mechanism described above for -a.\n\
313 Requires -r, -s, and -k.\n\
314 -p, --pid-file Set process identifier file name\n\
316 -b, --blank-string\n\
317 When the supplied argument string is found in any of the\n\
318 various shell command arguments (-r, -s, -k, or -R), replace\n\
319 it with a space. This is an ugly hack to circumvent problems\n\
320 passing command-line arguments with embedded spaces.\n\
321 -v, --version Print program version\n\
322 -h, --help Display this help and exit\n\
323 ", progname
,mode_str
[0],progname
,mode_str
[1],progname
,mode_str
[2],
324 progname
,mode_str
[3],progname
,mode_str
[4],progname
,mode_str
[2],mode_str
[3],
325 VTYDIR
,DEFAULT_LOGLEVEL
,LOG_EMERG
,LOG_DEBUG
,LOG_DEBUG
,
326 DEFAULT_MIN_RESTART
,DEFAULT_MAX_RESTART
,
327 DEFAULT_PERIOD
,DEFAULT_TIMEOUT
,DEFAULT_RESTART_TIMEOUT
,DEFAULT_PIDFILE
);
/* Run shell_cmd in a forked child.
   The child puts itself in its own process group with setpgid(0,0) (a
   failure there is only logged) and then executes "/bin/sh -c shell_cmd";
   if execv returns, the failure is logged.  The parent logs the child's pid.
   A failed fork is logged with the command text.
   run_job() treats a result greater than zero as the child's pid; the return
   statements themselves are not visible in this extract.
   NOTE(review): the parent reports a successful fork through zlog_err, i.e.
   at error severity. */
333 run_background(const char *shell_cmd
)
337 switch (child
= fork())
340 zlog_err("fork failed, cannot run command [%s]: %s",
341 shell_cmd
,safe_strerror(errno
));
345 /* Use separate process group so child processes can be killed easily. */
346 if (setpgid(0,0) < 0)
347 zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno
));
349 const char *argv
[4] = { "sh", "-c", shell_cmd
, NULL
};
350 execv("/bin/sh",(char *const *)argv
);
351 zlog_err("execv(/bin/sh -c '%s') failed: %s",
352 shell_cmd
,safe_strerror(errno
));
356 /* Parent process: we will reap the child later. */
357 zlog_err("Forked background command [pid %d]: %s",(int)child
,shell_cmd
);
/* Store in *result the time elapsed since *start_time: the current
   gettimeofday() value minus start_time, field by field, with 1000000
   microseconds added while tv_usec is negative.
   Callers dereference the return value directly
   (time_elapsed(...)->tv_sec), so it returns a struct timeval pointer.
   NOTE(review): the matching tv_sec adjustment inside the loop and the
   return statement are not visible in this extract.
   NOTE(review): gettimeofday() is wall-clock time, so a clock step changes
   the computed interval. */
362 static struct timeval
*
363 time_elapsed(struct timeval
*result
, const struct timeval
*start_time
)
365 gettimeofday(result
,NULL
);
366 result
->tv_sec
-= start_time
->tv_sec
;
367 result
->tv_usec
-= start_time
->tv_usec
;
368 while (result
->tv_usec
< 0)
370 result
->tv_usec
+= 1000000L;
/* Timer callback for a background job that is still running; the thread
   argument is the job's restart_info.
   Logs how long the job has been running, then signals the job's whole
   process group (kill with a negated pid): SIGTERM while restart->kills is
   zero, SIGKILL once it is non-zero.  Finally re-arms itself through
   thread_add_timer.
   NOTE(review): the update of restart->kills and the timer delay argument
   are not visible in this extract. */
377 restart_kill(struct thread
*t_kill
)
379 struct restart_info
*restart
= THREAD_ARG(t_kill
);
380 struct timeval delay
;
382 time_elapsed(&delay
,&restart
->time
);
383 zlog_warn("Warning: %s %s child process %d still running after "
384 "%ld seconds, sending signal %d",
385 restart
->what
,restart
->name
,(int)restart
->pid
,delay
.tv_sec
,
386 (restart
->kills
? SIGKILL
: SIGTERM
));
387 kill(-restart
->pid
,(restart
->kills
? SIGKILL
: SIGTERM
));
389 restart
->t_kill
= thread_add_timer(master
,restart_kill
,restart
,
/* Find the restart_info whose recorded pid equals child.
   In MODE_GLOBAL_RESTART the single gs.restart record is checked; the daemon
   list is walked and &dmn->restart is returned for the first daemon whose
   restart.pid matches.
   The SIGCHLD handling code compares the result against NULL, so NULL means
   "no such child".
   NOTE(review): the branch structure between the two checks and the
   remaining return statements are not visible in this extract. */
394 static struct restart_info
*
395 find_child(pid_t child
)
397 if (gs
.mode
== MODE_GLOBAL_RESTART
)
399 if (gs
.restart
.pid
== child
)
405 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
407 if (dmn
->restart
.pid
== child
)
408 return &dmn
->restart
;
421 struct restart_info
*restart
;
423 switch (child
= waitpid(-1,&status
,WNOHANG
))
426 zlog_err("waitpid failed: %s",safe_strerror(errno
));
429 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
433 if ((restart
= find_child(child
)) != NULL
)
435 name
= restart
->name
;
436 what
= restart
->what
;
439 thread_cancel(restart
->t_kill
);
440 restart
->t_kill
= NULL
;
441 /* Update restart time to reflect the time the command completed. */
442 gettimeofday(&restart
->time
,NULL
);
446 zlog_err("waitpid returned status for an unknown child process %d",
451 if (WIFSTOPPED(status
))
452 zlog_warn("warning: %s %s process %d is stopped",
453 what
,name
,(int)child
);
454 else if (WIFSIGNALED(status
))
455 zlog_warn("%s %s process %d terminated due to signal %d",
456 what
,name
,(int)child
,WTERMSIG(status
));
457 else if (WIFEXITED(status
))
459 if (WEXITSTATUS(status
) != 0)
460 zlog_warn("%s %s process %d exited with non-zero status %d",
461 what
,name
,(int)child
,WEXITSTATUS(status
));
463 zlog_debug("%s %s process %d exited normally",what
,name
,(int)child
);
466 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
467 what
,name
,(int)child
,status
);
/* Launch a background command for the job described by restart.
     restart          job record: name, pid, last run time, retry interval
     cmdtype          label used in log messages and stored in restart->what
     command          printf-style format; restart->name is its only argument
     force            see the retained comment about the "force test" below
     update_interval  NOTE(review): its use is not visible in this extract
   Visible behaviour: the job is postponed (with debug logging when
   gs.loglevel > LOG_DEBUG+1) while a previous pid is still recorded or while
   less than restart->interval seconds have elapsed since restart->time.
   Otherwise restart->time is set to now, the command line is formatted and
   passed to run_background(), and on success a restart_kill timer is armed
   and restart->what is set.  The interval is then recalculated: reset to
   gs.min_restart_interval when the measured delay exceeds twice
   gs.max_restart_interval, otherwise doubled and capped at the maximum.
   NOTE(review): cmd is a variable-length array of
   strlen(command)+strlen(name)+1 bytes and command is used as the snprintf
   format, so it only fits when command contains at most one %s conversion;
   valid_command() enforces that for -r/-s/-k and -R rejects any '%'. */
472 run_job(struct restart_info
*restart
, const char *cmdtype
, const char *command
,
473 int force
, int update_interval
)
475 struct timeval delay
;
477 if (gs
.loglevel
> LOG_DEBUG
+1)
478 zlog_debug("attempting to %s %s",cmdtype
,restart
->name
);
482 if (gs
.loglevel
> LOG_DEBUG
+1)
483 zlog_debug("cannot %s %s, previous pid %d still running",
484 cmdtype
,restart
->name
,(int)restart
->pid
);
488 /* Note: time_elapsed test must come before the force test, since we need
489 to make sure that delay is initialized for use below in updating the
491 if ((time_elapsed(&delay
,&restart
->time
)->tv_sec
< restart
->interval
) &&
494 if (gs
.loglevel
> LOG_DEBUG
+1)
495 zlog_debug("postponing %s %s: "
496 "elapsed time %ld < retry interval %ld",
497 cmdtype
,restart
->name
,(long)delay
.tv_sec
,restart
->interval
);
501 gettimeofday(&restart
->time
,NULL
);
504 char cmd
[strlen(command
)+strlen(restart
->name
)+1];
505 snprintf(cmd
,sizeof(cmd
),command
,restart
->name
);
506 if ((restart
->pid
= run_background(cmd
)) > 0)
508 restart
->t_kill
= thread_add_timer(master
,restart_kill
,restart
,
510 restart
->what
= cmdtype
;
517 /* Calculate the new restart interval. */
520 if (delay
.tv_sec
> 2*gs
.max_restart_interval
)
521 restart
->interval
= gs
.min_restart_interval
;
522 else if ((restart
->interval
*= 2) > gs
.max_restart_interval
)
523 restart
->interval
= gs
.max_restart_interval
;
524 if (gs
.loglevel
> LOG_DEBUG
+1)
525 zlog_debug("restart %s interval is now %ld",
526 restart
->name
,restart
->interval
);
/* Scheduling helpers for a daemon DMN.  SET_READ_HANDLER registers
   handle_read on the daemon's fd and stores the thread in t_read.  The three
   SET_WAKEUP_* macros schedule wakeup_down, wakeup_unresponsive or
   wakeup_send_echo with thread_add_timer_msec and store the thread in
   t_wakeup.
   NOTE(review): the delay arguments sit on continuation lines that are not
   visible in this extract. */
531 #define SET_READ_HANDLER(DMN) \
532 (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)
534 #define SET_WAKEUP_DOWN(DMN) \
535 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
538 #define SET_WAKEUP_UNRESPONSIVE(DMN) \
539 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
542 #define SET_WAKEUP_ECHO(DMN) \
543 (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
/* Timer callback for a daemon that is down; the thread argument is the
   daemon.  Clears t_wakeup, retries the connection with try_connect() and
   re-arms SET_WAKEUP_DOWN when that fails.
   NOTE(review): the action taken when connect_tries > 1 and the daemon is
   still not DAEMON_UP is not visible in this extract. */
547 wakeup_down(struct thread
*t_wakeup
)
549 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
551 dmn
->t_wakeup
= NULL
;
552 if (try_connect(dmn
) < 0)
553 SET_WAKEUP_DOWN(dmn
);
554 if ((dmn
->connect_tries
> 1) && (dmn
->state
!= DAEMON_UP
))
/* Timer callback for the first connection attempt to a daemon (scheduled
   from main with a random delay).  Clears t_wakeup and calls try_connect();
   on failure it schedules a retry with SET_WAKEUP_DOWN, logs the failed
   initial attempt and marks the daemon DAEMON_DOWN. */
560 wakeup_init(struct thread
*t_wakeup
)
562 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
564 dmn
->t_wakeup
= NULL
;
565 if (try_connect(dmn
) < 0)
567 SET_WAKEUP_DOWN(dmn
);
568 zlog_err("%s state -> down : initial connection attempt failed",
570 dmn
->state
= DAEMON_DOWN
;
/* Record that dmn is down; why is a human-readable reason for the log.
   A transition from an up state (IS_UP) or from DAEMON_INIT is logged as an
   error; a daemon that was already down is only mentioned at debug level
   when gs.loglevel > LOG_DEBUG.  The state becomes DAEMON_DOWN, the read,
   write and wakeup threads are switched off, and a reconnect is attempted
   at once; if that fails a retry is scheduled with SET_WAKEUP_DOWN.
   NOTE(review): lines between the state change and the THREAD_OFF calls are
   not visible in this extract. */
576 daemon_down(struct daemon
*dmn
, const char *why
)
578 if (IS_UP(dmn
) || (dmn
->state
== DAEMON_INIT
))
579 zlog_err("%s state -> down : %s",dmn
->name
,why
);
580 else if (gs
.loglevel
> LOG_DEBUG
)
581 zlog_debug("%s still down : %s",dmn
->name
,why
);
584 dmn
->state
= DAEMON_DOWN
;
590 THREAD_OFF(dmn
->t_read
);
591 THREAD_OFF(dmn
->t_write
);
592 THREAD_OFF(dmn
->t_wakeup
);
593 if (try_connect(dmn
) < 0)
594 SET_WAKEUP_DOWN(dmn
);
/* Read callback for a daemon's vty socket; the thread argument is the
   daemon.
   - read() error: a retryable errno (ERRNO_IO_RETRY) just re-arms the read
     handler; any other error takes the daemon down with the errno text.
   - EOF takes the daemon down.
   - Data arriving while no echo is outstanding (echo_sent.tv_sec == 0) is
     unexpected and takes the daemon down.
   - Otherwise the data must be exactly sizeof(resp) bytes equal to resp
     (PING_TOKEN, a newline, then zero bytes up to the array size); anything
     else takes the daemon down as a bad echo response.
   On a good response the round-trip delay is computed, echo_sent is cleared,
   and an UNRESPONSIVE daemon whose delay is below gs.timeout is marked
   DAEMON_UP again.  The read handler and the echo timer are then re-armed.
   NOTE(review): as the retained comment says, a response split across two
   reads would be rejected as a bad echo. */
599 handle_read(struct thread
*t_read
)
601 struct daemon
*dmn
= THREAD_ARG(t_read
);
602 static const char resp
[sizeof(PING_TOKEN
)+4] = PING_TOKEN
"\n";
603 char buf
[sizeof(resp
)+100];
605 struct timeval delay
;
608 if ((rc
= read(dmn
->fd
,buf
,sizeof(buf
))) < 0)
612 if (ERRNO_IO_RETRY(errno
))
614 /* Pretend it never happened. */
615 SET_READ_HANDLER(dmn
);
618 snprintf(why
,sizeof(why
),"unexpected read error: %s",
619 safe_strerror(errno
));
620 daemon_down(dmn
,why
);
625 daemon_down(dmn
,"read returned EOF");
628 if (!dmn
->echo_sent
.tv_sec
)
630 char why
[sizeof(buf
)+100];
631 snprintf(why
,sizeof(why
),"unexpected read returns %d bytes: %.*s",
632 (int)rc
,(int)rc
,buf
);
633 daemon_down(dmn
,why
);
637 /* We are expecting an echo response: is there any chance that the
638 response would not be returned entirely in the first read? That
639 seems inconceivable... */
640 if ((rc
!= sizeof(resp
)) || memcmp(buf
,resp
,sizeof(resp
)))
642 char why
[100+sizeof(buf
)];
643 snprintf(why
,sizeof(why
),"read returned bad echo response of %d bytes "
644 "(expecting %u): %.*s",
645 (int)rc
,(u_int
)sizeof(resp
),(int)rc
,buf
);
646 daemon_down(dmn
,why
);
650 time_elapsed(&delay
,&dmn
->echo_sent
);
651 dmn
->echo_sent
.tv_sec
= 0;
652 if (dmn
->state
== DAEMON_UNRESPONSIVE
)
654 if (delay
.tv_sec
< gs
.timeout
)
656 dmn
->state
= DAEMON_UP
;
657 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
658 "seconds", dmn
->name
,delay
.tv_sec
,delay
.tv_usec
);
661 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
662 "seconds", dmn
->name
,delay
.tv_sec
,delay
.tv_usec
);
664 else if (gs
.loglevel
> LOG_DEBUG
+1)
665 zlog_debug("%s: echo response received after %ld.%06ld seconds",
666 dmn
->name
,delay
.tv_sec
,delay
.tv_usec
);
668 SET_READ_HANDLER(dmn
);
670 thread_cancel(dmn
->t_wakeup
);
671 SET_WAKEUP_ECHO(dmn
);
/* Record that dmn is up; why is a human-readable reason for the log.
   Sets the state to DAEMON_UP, resets connect_tries, logs the transition at
   notice level and schedules the next echo with SET_WAKEUP_ECHO.
   NOTE(review): two lines of this function are not visible in this
   extract. */
677 daemon_up(struct daemon
*dmn
, const char *why
)
679 dmn
->state
= DAEMON_UP
;
681 dmn
->connect_tries
= 0;
682 zlog_notice("%s state -> up : %s",dmn
->name
,why
);
684 SET_WAKEUP_ECHO(dmn
);
/* Write-ready callback registered by try_connect() for a connection that
   was still in progress.  Fetches the pending socket error with
   getsockopt(SO_ERROR): a getsockopt failure or a non-zero socket error
   takes the daemon down with an explanatory reason, otherwise the daemon is
   marked up ("delayed connect succeeded"). */
689 check_connect(struct thread
*t_write
)
691 struct daemon
*dmn
= THREAD_ARG(t_write
);
693 socklen_t reslen
= sizeof(sockerr
);
696 if (getsockopt(dmn
->fd
,SOL_SOCKET
,SO_ERROR
,(char *)&sockerr
,&reslen
) < 0)
698 zlog_warn("%s: check_connect: getsockopt failed: %s",
699 dmn
->name
,safe_strerror(errno
));
700 daemon_down(dmn
,"getsockopt failed checking connection success");
703 if ((reslen
== sizeof(sockerr
)) && sockerr
)
706 snprintf(why
,sizeof(why
),
707 "getsockopt reports that connection attempt failed: %s",
708 safe_strerror(sockerr
));
709 daemon_down(dmn
,why
);
713 daemon_up(dmn
,"delayed connect succeeded");
/* Timer callback armed by try_connect() alongside check_connect: if it
   fires, the connection attempt is considered timed out.  Clears t_wakeup
   and takes the daemon down with a "timed out" reason.
   NOTE(review): the seconds value passed to the message is not visible in
   this extract. */
718 wakeup_connect_hanging(struct thread
*t_wakeup
)
720 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
723 dmn
->t_wakeup
= NULL
;
724 snprintf(why
,sizeof(why
),"connection attempt timed out after %ld seconds",
726 daemon_down(dmn
,why
);
/* Start a non-blocking connection to the daemon's vty unix socket at
   "<gs.vtydir>/<name>.vty".
   Steps visible here: count the attempt in connect_tries, build the socket
   address, check write access to the path with access(), create an AF_UNIX
   stream socket, make it non-blocking and call connect().
   - connect() failing with anything other than EINPROGRESS/EWOULDBLOCK is
     logged at debug level as a failed connect.
   - A connection still in progress sets DAEMON_CONNECTING and registers
     check_connect (write-ready) plus the wakeup_connect_hanging timer.
   - An immediate success registers the read handler and calls daemon_up().
   Callers treat a negative return value as failure.
   NOTE(review): the return statements, the assignment of dmn->fd and any
   close() of the socket on the error paths are not visible in this extract.
   NOTE(review): the snprintf result is not checked, so an over-long path
   would be silently truncated to sizeof(addr.sun_path). */
730 /* Making connection to protocol daemon. */
732 try_connect(struct daemon
*dmn
)
735 struct sockaddr_un addr
;
738 if (gs
.loglevel
> LOG_DEBUG
+1)
739 zlog_debug("%s: attempting to connect",dmn
->name
);
740 dmn
->connect_tries
++;
742 memset (&addr
, 0, sizeof (struct sockaddr_un
));
743 addr
.sun_family
= AF_UNIX
;
744 snprintf(addr
.sun_path
, sizeof(addr
.sun_path
), "%s/%s.vty",
745 gs
.vtydir
,dmn
->name
);
746 #ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
747 len
= addr
.sun_len
= SUN_LEN(&addr
);
749 len
= sizeof (addr
.sun_family
) + strlen (addr
.sun_path
);
750 #endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
752 /* Quick check to see if we might succeed before we go to the trouble
753 of creating a socket. */
754 if (access(addr
.sun_path
, W_OK
) < 0)
757 zlog_err("%s: access to socket %s denied: %s",
758 dmn
->name
,addr
.sun_path
,safe_strerror(errno
));
762 if ((sock
= socket (AF_UNIX
, SOCK_STREAM
, 0)) < 0)
764 zlog_err("%s(%s): cannot make socket: %s",
765 __func__
,addr
.sun_path
, safe_strerror(errno
));
769 if (set_nonblocking(sock
) < 0)
771 zlog_err("%s(%s): set_nonblocking(%d) failed",
772 __func__
, addr
.sun_path
, sock
);
777 if (connect (sock
, (struct sockaddr
*) &addr
, len
) < 0)
779 if ((errno
!= EINPROGRESS
) && (errno
!= EWOULDBLOCK
))
781 if (gs
.loglevel
> LOG_DEBUG
)
782 zlog_debug("%s(%s): connect failed: %s",
783 __func__
,addr
.sun_path
, safe_strerror(errno
));
787 if (gs
.loglevel
> LOG_DEBUG
)
788 zlog_debug("%s: connection in progress",dmn
->name
);
789 dmn
->state
= DAEMON_CONNECTING
;
791 dmn
->t_write
= thread_add_write(master
,check_connect
,dmn
,dmn
->fd
);
792 dmn
->t_wakeup
= thread_add_timer(master
,wakeup_connect_hanging
,dmn
,
794 SET_READ_HANDLER(dmn
);
799 SET_READ_HANDLER(dmn
);
800 daemon_up(dmn
,"connect succeeded");
/* Timer callback armed by set_phase(): the current restart phase has lasted
   PHASE_TIMEOUT seconds.  Clears the timer pointer, logs the hanging phase
   by name and aborts the phased restart by resetting gs.phase to
   PHASE_NONE.  The thread argument is not used in the visible lines. */
805 phase_hanging(struct thread
*t_hanging
)
807 gs
.t_phase_hanging
= NULL
;
808 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
809 phase_str
[gs
.phase
],PHASE_TIMEOUT
);
810 gs
.phase
= PHASE_NONE
;
/* Enter restart phase new_phase and restart the hang watchdog: any pending
   phase_hanging timer is cancelled and a fresh one is armed.
   NOTE(review): the timer delay argument is not visible in this extract;
   phase_hanging reports PHASE_TIMEOUT, so that is presumably the value. */
815 set_phase(restart_phase_t new_phase
)
817 gs
.phase
= new_phase
;
818 if (gs
.t_phase_hanging
)
819 thread_cancel(gs
.t_phase_hanging
);
820 gs
.t_phase_hanging
= thread_add_timer(master
,phase_hanging
,NULL
,
831 case PHASE_STOPS_PENDING
:
834 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
835 set_phase(PHASE_WAITING_DOWN
);
837 case PHASE_WAITING_DOWN
:
838 if (gs
.numdown
+IS_UP(gs
.special
) < gs
.numdaemons
)
840 zlog_info("Phased restart: all routing daemons now down.");
841 run_job(&gs
.special
->restart
,"restart",gs
.restart_command
,1,1);
842 set_phase(PHASE_ZEBRA_RESTART_PENDING
);
844 case PHASE_ZEBRA_RESTART_PENDING
:
845 if (gs
.special
->restart
.pid
)
847 zlog_info("Phased restart: %s restart job completed.",gs
.special
->name
);
848 set_phase(PHASE_WAITING_ZEBRA_UP
);
850 case PHASE_WAITING_ZEBRA_UP
:
851 if (!IS_UP(gs
.special
))
853 zlog_info("Phased restart: %s is now up.",gs
.special
->name
);
856 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
858 if (dmn
!= gs
.special
)
859 run_job(&dmn
->restart
,"start",gs
.start_command
,1,0);
862 gs
.phase
= PHASE_NONE
;
863 THREAD_OFF(gs
.t_phase_hanging
);
864 zlog_notice("Phased global restart has completed.");
/* Try to restart after dmn was found down or unresponsive, according to
   gs.mode:
   - MODE_GLOBAL_RESTART: run the restart command for the global job.
   - MODE_SEPARATE_RESTART: run the restart command for this daemon only.
   - MODE_PHASED_ZEBRA_RESTART: for a daemon other than gs.special, restart
     it individually only when the special daemon is DAEMON_UP and no phased
     restart is in progress; otherwise the attempt is postponed and logged.
   - MODE_PHASED_ALL_RESTART: postponed while a phase is active or child
     processes are outstanding (gs.numpids), or while the special daemon's
     retry interval has not elapsed.  Otherwise a stop job is forced for
     every daemon except gs.special and the phase becomes
     PHASE_STOPS_PENDING.
   - Any other mode is logged as an error.
   NOTE(review): the switch header, the break/fall-through structure between
   cases and the handling of dmn == gs.special in the phased-zebra case are
   not visible in this extract. */
870 try_restart(struct daemon
*dmn
)
876 case MODE_GLOBAL_RESTART
:
877 run_job(&gs
.restart
,"restart",gs
.restart_command
,0,1);
879 case MODE_SEPARATE_RESTART
:
880 run_job(&dmn
->restart
,"restart",gs
.restart_command
,0,1);
882 case MODE_PHASED_ZEBRA_RESTART
:
883 if (dmn
!= gs
.special
)
885 if ((gs
.special
->state
== DAEMON_UP
) && (gs
.phase
== PHASE_NONE
))
886 run_job(&dmn
->restart
,"restart",gs
.restart_command
,0,1);
888 zlog_debug("%s: postponing restart attempt because master %s daemon "
889 "not up [%s], or phased restart in progress",
890 dmn
->name
,gs
.special
->name
,state_str
[gs
.special
->state
]);
894 case MODE_PHASED_ALL_RESTART
:
895 if ((gs
.phase
!= PHASE_NONE
) || gs
.numpids
)
897 if (gs
.loglevel
> LOG_DEBUG
+1)
898 zlog_debug("postponing phased global restart: restart already in "
899 "progress [%s], or outstanding child processes [%d]",
900 phase_str
[gs
.phase
],gs
.numpids
);
903 /* Is it too soon for a restart? */
905 struct timeval delay
;
906 if (time_elapsed(&delay
,&gs
.special
->restart
.time
)->tv_sec
<
907 gs
.special
->restart
.interval
)
909 if (gs
.loglevel
> LOG_DEBUG
+1)
910 zlog_debug("postponing phased global restart: "
911 "elapsed time %ld < retry interval %ld",
912 (long)delay
.tv_sec
,gs
.special
->restart
.interval
);
916 zlog_info("Phased restart: stopping all routing daemons.");
917 /* First step: stop all other daemons. */
918 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
920 if (dmn
!= gs
.special
)
921 run_job(&dmn
->restart
,"stop",gs
.stop_command
,1,1);
923 set_phase(PHASE_STOPS_PENDING
);
926 zlog_err("error: unknown restart mode %d",gs
.mode
);
/* Periodic timer callback for a daemon in the DAEMON_UNRESPONSIVE state.
   Clears t_wakeup; if the daemon is no longer unresponsive the stale timer
   is reported as an error, and SET_WAKEUP_UNRESPONSIVE re-arms the timer.
   NOTE(review): the else branch structure and any restart attempt made here
   are not visible in this extract. */
932 wakeup_unresponsive(struct thread
*t_wakeup
)
934 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
936 dmn
->t_wakeup
= NULL
;
937 if (dmn
->state
!= DAEMON_UNRESPONSIVE
)
938 zlog_err("%s: no longer unresponsive (now %s), "
939 "wakeup should have been cancelled!",
940 dmn
->name
,state_str
[dmn
->state
]);
943 SET_WAKEUP_UNRESPONSIVE(dmn
);
/* Timer callback armed by wakeup_send_echo(): no echo response arrived
   within gs.timeout seconds.  Clears t_wakeup, marks the daemon
   DAEMON_UNRESPONSIVE and logs it.  When gs.unresponsive_restart is set the
   periodic SET_WAKEUP_UNRESPONSIVE timer is armed.
   NOTE(review): lines inside the unresponsive_restart branch are not
   visible in this extract. */
950 wakeup_no_answer(struct thread
*t_wakeup
)
952 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
954 dmn
->t_wakeup
= NULL
;
955 dmn
->state
= DAEMON_UNRESPONSIVE
;
956 zlog_err("%s state -> unresponsive : no response yet to ping "
957 "sent %ld seconds ago",dmn
->name
,gs
.timeout
);
958 if (gs
.unresponsive_restart
)
960 SET_WAKEUP_UNRESPONSIVE(dmn
);
/* Timer callback that pings a daemon: writes the command
   "echo " PING_TOKEN, including its terminating NUL byte (sizeof(echocmd)),
   to the daemon's socket.  A failed or short write takes the daemon down.
   Otherwise the send time is recorded in echo_sent and a wakeup_no_answer
   timer is armed for gs.timeout seconds. */
967 wakeup_send_echo(struct thread
*t_wakeup
)
969 static const char echocmd
[] = "echo " PING_TOKEN
;
971 struct daemon
*dmn
= THREAD_ARG(t_wakeup
);
973 dmn
->t_wakeup
= NULL
;
974 if (((rc
= write(dmn
->fd
,echocmd
,sizeof(echocmd
))) < 0) ||
975 ((size_t)rc
!= sizeof(echocmd
)))
977 char why
[100+sizeof(echocmd
)];
978 snprintf(why
,sizeof(why
),"write '%s' returned %d instead of %u",
979 echocmd
,(int)rc
,(u_int
)sizeof(echocmd
));
980 daemon_down(dmn
,why
);
984 gettimeofday(&dmn
->echo_sent
,NULL
);
985 dmn
->t_wakeup
= thread_add_timer(master
,wakeup_no_answer
,dmn
,gs
.timeout
);
993 zlog_notice("Terminating on signal");
/* Check a per-daemon command template: true when the first '%' in cmd is
   immediately followed by 's' and no further '%' appears after it, i.e. the
   string contains exactly one conversion and it is "%s".  A string without
   any '%' is rejected.
   NOTE(review): the return type and the declaration of p are not visible in
   this extract. */
998 valid_command(const char *cmd
)
1002 return ((p
= strchr(cmd
,'%')) != NULL
) && (*(p
+1) == 's') && !strchr(p
+1,'%');
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Return a newly allocated copy of cmd in which every occurrence of
   blankstr has been replaced by a single space.  The caller owns the
   result (main keeps it for the life of the process).  Exits with status 1
   if memory cannot be allocated.

   An empty blankstr leaves the text unchanged: it would otherwise match at
   every position, growing the buffer past its allocation without end.  The
   scan resumes just after each inserted space, so a blankstr that is itself
   a single space terminates instead of matching its own replacement
   forever. */
static const char *
translate_blanks(const char *cmd, const char *blankstr)
{
  size_t bslen = strlen(blankstr);
  size_t cmdlen = strlen(cmd);
  char *res;
  char *p;

  if (!(res = malloc(cmdlen+1)))
    {
      perror("malloc");
      exit(1);
    }
  memcpy(res,cmd,cmdlen+1);

  /* Nothing to translate for an empty token. */
  if (bslen == 0)
    return res;

  p = res;
  while ((p = strstr(p,blankstr)) != NULL)
    {
      *p = ' ';
      /* Close the gap left by the rest of the token (text only shrinks). */
      if (bslen != 1)
        memmove(p+1,p+bslen,strlen(p+bslen)+1);
      /* Continue after the space we just wrote. */
      p++;
    }
  return res;
}
/* Program entry point.  Visible steps:
   1. Derive progname from argv[0] and parse options with getopt_long()
      into the global state gs: operating mode (-a/-A/-r/-R, rejecting
      ambiguous combinations), command templates (-r/-s/-k must satisfy
      valid_command(); -R must not contain '%'), numeric settings checked
      with sscanf, -z, version and help.
   2. Cross-check the selected mode against the commands supplied and
      translate the blank string in each command with translate_blanks().
   3. Create the thread master, install signal handlers, seed random().
   4. Build the daemon list from the remaining arguments, giving each daemon
      an initial wakeup_init timer with a random delay of 100..999 ms, and
      locate the special daemon ("zebra") for the phased modes.
   5. Set up logging, optionally daemonize, write the pid file, announce the
      monitored daemons and run the thread_fetch/thread_call event loop.
   NOTE(review): the option switch labels and several declarations are not
   visible in this extract.
   NOTE(review): the getopt_long() result is compared against EOF; the
   documented end-of-options value is -1.
   NOTE(review): the "Watchquagga daemon failed: %s" message has no trailing
   newline. */
1029 main(int argc
, char **argv
)
1031 const char *progname
;
1033 int daemon_mode
= 0;
1034 const char *pidfile
= DEFAULT_PIDFILE
;
1035 const char *special
= "zebra";
1036 const char *blankstr
= NULL
;
1037 static struct quagga_signal_t my_signals
[] =
1049 .handler
= sigchild
,
1053 if ((progname
= strrchr (argv
[0], '/')) != NULL
)
1058 gs
.restart
.name
= "all";
1059 while ((opt
= getopt_long(argc
, argv
, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
1060 longopts
, 0)) != EOF
)
1067 if ((gs
.mode
!= MODE_MONITOR
) && (gs
.mode
!= MODE_SEPARATE_RESTART
))
1069 fputs("Ambiguous operating mode selected.\n",stderr
);
1070 return usage(progname
,1);
1072 gs
.mode
= MODE_PHASED_ZEBRA_RESTART
;
1075 if ((gs
.mode
!= MODE_MONITOR
) && (gs
.mode
!= MODE_SEPARATE_RESTART
))
1077 fputs("Ambiguous operating mode selected.\n",stderr
);
1078 return usage(progname
,1);
1080 gs
.mode
= MODE_PHASED_ALL_RESTART
;
1092 if (!valid_command(optarg
))
1094 fprintf(stderr
,"Invalid kill command, must contain '%%s': %s\n",
1096 return usage(progname
,1);
1098 gs
.stop_command
= optarg
;
1103 if ((sscanf(optarg
,"%d%1s",&gs
.loglevel
,garbage
) != 1) ||
1104 (gs
.loglevel
< LOG_EMERG
))
1106 fprintf(stderr
,"Invalid loglevel argument: %s\n",optarg
);
1107 return usage(progname
,1);
1114 if ((sscanf(optarg
,"%ld%1s",
1115 &gs
.min_restart_interval
,garbage
) != 1) ||
1116 (gs
.min_restart_interval
< 0))
1118 fprintf(stderr
,"Invalid min_restart_interval argument: %s\n",
1120 return usage(progname
,1);
1127 if ((sscanf(optarg
,"%ld%1s",
1128 &gs
.max_restart_interval
,garbage
) != 1) ||
1129 (gs
.max_restart_interval
< 0))
1131 fprintf(stderr
,"Invalid max_restart_interval argument: %s\n",
1133 return usage(progname
,1);
1141 if ((sscanf(optarg
,"%d%1s",&period
,garbage
) != 1) ||
1144 fprintf(stderr
,"Invalid interval argument: %s\n",optarg
);
1145 return usage(progname
,1);
1147 gs
.period
= 1000*period
;
1154 if ((gs
.mode
== MODE_GLOBAL_RESTART
) ||
1155 (gs
.mode
== MODE_SEPARATE_RESTART
))
1157 fputs("Ambiguous operating mode selected.\n",stderr
);
1158 return usage(progname
,1);
1160 if (!valid_command(optarg
))
1163 "Invalid restart command, must contain '%%s': %s\n",
1165 return usage(progname
,1);
1167 gs
.restart_command
= optarg
;
1168 if (gs
.mode
== MODE_MONITOR
)
1169 gs
.mode
= MODE_SEPARATE_RESTART
;
1172 if (gs
.mode
!= MODE_MONITOR
)
1174 fputs("Ambiguous operating mode selected.\n",stderr
);
1175 return usage(progname
,1);
1177 if (strchr(optarg
,'%'))
1180 "Invalid restart-all arg, must not contain '%%s': %s\n",
1182 return usage(progname
,1);
1184 gs
.restart_command
= optarg
;
1185 gs
.mode
= MODE_GLOBAL_RESTART
;
1188 if (!valid_command(optarg
))
1190 fprintf(stderr
,"Invalid start command, must contain '%%s': %s\n",
1192 return usage(progname
,1);
1194 gs
.start_command
= optarg
;
1202 if ((sscanf(optarg
,"%ld%1s",&gs
.timeout
,garbage
) != 1) ||
1205 fprintf(stderr
,"Invalid timeout argument: %s\n",optarg
);
1206 return usage(progname
,1);
1213 if ((sscanf(optarg
,"%ld%1s",&gs
.restart_timeout
,garbage
) != 1) ||
1214 (gs
.restart_timeout
< 1))
1216 fprintf(stderr
,"Invalid restart timeout argument: %s\n",optarg
);
1217 return usage(progname
,1);
1222 gs
.unresponsive_restart
= 1;
1225 printf ("%s version %s\n", progname
, QUAGGA_VERSION
);
1226 puts("Copyright 2004 Andrew J. Schorr");
1229 return usage(progname
,0);
1231 fputs("Invalid option.\n",stderr
);
1232 return usage(progname
,1);
1236 if (gs
.unresponsive_restart
&& (gs
.mode
== MODE_MONITOR
))
1238 fputs("Option -z requires a -r or -R restart option.\n",stderr
);
1239 return usage(progname
,1);
1244 if (gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
)
1246 fprintf(stderr
,"No kill/(re)start commands needed for %s mode.\n",
1248 return usage(progname
,1);
1251 case MODE_GLOBAL_RESTART
:
1252 case MODE_SEPARATE_RESTART
:
1253 if (!gs
.restart_command
|| gs
.start_command
|| gs
.stop_command
)
1255 fprintf(stderr
,"No start/kill commands needed in [%s] mode.\n",
1257 return usage(progname
,1);
1260 case MODE_PHASED_ZEBRA_RESTART
:
1261 case MODE_PHASED_ALL_RESTART
:
1262 if (!gs
.restart_command
|| !gs
.start_command
|| !gs
.stop_command
)
1265 "Need start, kill, and restart commands in [%s] mode.\n",
1267 return usage(progname
,1);
1274 if (gs
.restart_command
)
1275 gs
.restart_command
= translate_blanks(gs
.restart_command
,blankstr
);
1276 if (gs
.start_command
)
1277 gs
.start_command
= translate_blanks(gs
.start_command
,blankstr
);
1278 if (gs
.stop_command
)
1279 gs
.stop_command
= translate_blanks(gs
.stop_command
,blankstr
);
1282 gs
.restart
.interval
= gs
.min_restart_interval
;
1283 master
= thread_master_create();
1284 signal_init (master
, Q_SIGC(my_signals
), my_signals
);
1285 srandom(time(NULL
));
1289 struct daemon
*tail
= NULL
;
1291 for (i
= optind
; i
< argc
; i
++)
1295 if (!(dmn
= (struct daemon
*)calloc(1,sizeof(*dmn
))))
1297 fprintf(stderr
,"calloc(1,%u) failed: %s\n",
1298 (u_int
)sizeof(*dmn
), safe_strerror(errno
));
1301 dmn
->name
= dmn
->restart
.name
= argv
[i
];
1302 dmn
->state
= DAEMON_INIT
;
1306 dmn
->t_wakeup
= thread_add_timer_msec(master
,wakeup_init
,dmn
,
1307 100+(random() % 900));
1308 dmn
->restart
.interval
= gs
.min_restart_interval
;
1315 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1316 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) &&
1317 !strcmp(dmn
->name
,special
))
1323 fputs("Must specify one or more daemons to monitor.\n",stderr
);
1324 return usage(progname
,1);
1326 if (((gs
.mode
== MODE_PHASED_ZEBRA_RESTART
) ||
1327 (gs
.mode
== MODE_PHASED_ALL_RESTART
)) && !gs
.special
)
1329 fprintf(stderr
,"In mode [%s], but cannot find master daemon %s\n",
1330 mode_str
[gs
.mode
],special
);
1331 return usage(progname
,1);
1333 if (gs
.special
&& (gs
.numdaemons
< 2))
1335 fprintf(stderr
,"Mode [%s] does not make sense with only 1 daemon "
1336 "to watch.\n",mode_str
[gs
.mode
]);
1337 return usage(progname
,1);
1340 zlog_default
= openzlog(progname
, ZLOG_NONE
,
1341 LOG_CONS
|LOG_NDELAY
|LOG_PID
, LOG_DAEMON
);
1342 zlog_set_level(NULL
, ZLOG_DEST_MONITOR
, ZLOG_DISABLED
);
1345 zlog_set_level(NULL
, ZLOG_DEST_SYSLOG
, MIN(gs
.loglevel
,LOG_DEBUG
));
1346 if (daemon (0, 0) < 0)
1348 fprintf(stderr
, "Watchquagga daemon failed: %s", strerror(errno
));
1353 zlog_set_level(NULL
, ZLOG_DEST_STDOUT
, MIN(gs
.loglevel
,LOG_DEBUG
));
1355 /* Make sure we're not already running. */
1356 pid_output (pidfile
);
1358 /* Announce which daemons are being monitored. */
1363 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1364 len
+= strlen(dmn
->name
)+1;
1370 for (dmn
= gs
.daemons
; dmn
; dmn
= dmn
->next
)
1374 strcpy(p
,dmn
->name
);
1377 zlog_notice("%s %s watching [%s], mode [%s]",
1378 progname
, QUAGGA_VERSION
, buf
, mode_str
[gs
.mode
]);
1383 struct thread thread
;
1385 while (thread_fetch (master
, &thread
))
1386 thread_call (&thread
);