more safety checks in noshellinject; bugfix (perl string-comparision) in notashell...
[hband-tools.git] / admin-tools / dmaster
blob3cc037d2efcb3bbdb32fb67f85ec669edfb2e70b
1 #!/usr/bin/env perl
3 =pod
5 =head1 NAME
7 dmaster - Manage continuously running processes
9 =head1 SYNOPSIS
11 dmaster
13 =head1 DESCRIPTION
15 Daemon master.
16 Start programs defined in B<daemontab> file.
17 Restart them when ended.
19 Re-read the B<daemontab> file on B<HUP> signal,
20 stops processes which are no longer in it (or I<ID> changed),
21 and starts new ones.
22 Those daemons whose I<COMMAND> I<ARGS> changed, are not restarted automatically.
24 Automatically respawn a daemon when it exits.
25 If a daemon exits too often, suppress it a little while.
26 Default parameters: at most 5 spawns in 2 sec, hold up for 10 sec.
28 =head1 FILES
30 =over 4
32 =item F</etc/daemontab>
34 Format is I<ID> I<COMMAND> I<ARGS>, delimited by any amount of space/tab, one line per each.
35 I<ID> is an arbitrary word choosed by the user, to identify daemon commands across reloads.
36 Command arguments may enclosed in double quotes (C<">), but not part of the argument. See:
38 d1 my-daemon --param="foo bar"
40 is invalid (parsed as 2 parameters: C<--param="foo> and C<bar"> with dangling quotes).
42 d1 my-daemon "--param=foo bar"
44 is what you want.
46 It's not recommended to write complex scripts directly in daemontab.
47 Put them in F</etc/daemons> directory instead.
49 =item F</var/run/dmaster.sock>
51 dmaster(1) listens on a Unix Domain socket (see unix(7)).
52 When a client connects, internal state is dumped on the socket
53 in B<Single Value per Line with Hierarchical Keys> format.
55 Internal state includes all started daemon's B<PIDFD>,
56 which may be copied by an other program by pidfd_getfd(2),
57 in order to reliably send signals to it.
58 After the internal state dump is closed by a line containing a lone C<END>,
59 it reads one line on the management socket.
60 Currently you can not control dmaster(1) this way (use signals for this, see L</SIGNALS>),
61 but while it waits for that one line, does not do anything,
62 so it's time for the management client side to call pidfd_getfd(2)
63 to reliably copy the PIDFD representing a given daemon process.
65 B<Note>, this also means that daemon management is paused until
66 normal operation is resumed by sending a newline on the management socket.
67 Due to this potential DoS vector, management socket is chmod(2) to 0660.
69 =item F</var/run/shm/dmaster.state>
71 Dump internal state to it.
72 See USR1 signal below.
74 =back
76 =head1 SIGNALS
78 =over 4
80 =item USR1
82 Write internal state into F</var/run/shm/dmaster.state> file.
84 =item TERM, INT
86 Terminate managed daemons (always with B<TERM> signal),
87 wait for them to exit, then exit dmaster(1) itself.
88 If a daemon is a process group leader, sends signal to the whole process group.
90 =item HUP
92 Re-read B<daemontab>, stop daemons which are disappeared and start new ones.
93 Leave others running.
95 =back
97 =head1 ENVIRONMENT
99 =over 4
101 =item XDG_CONFIG_HOME
103 Where to find C<daemontab>.
104 Default is F</etc>.
106 =item XDG_CACHE_HOME
108 Where to create F<dmaster.sock>
109 Default is F</var/run>.
111 =item DAEMONS_RESPAWN_BURST_PERIOD
113 Measure this much time (seconds) to detect respawn-burst.
115 =item DAEMONS_RESPAWN_BURST_LIMIT
117 If a daemon respawns this many times within DAEMONS_RESPAWN_BURST_PERIOD,
118 consider it a respawn-burst, and hold it back a while.
120 =item DAEMONS_RESPAWN_BURST_HOLD
122 How much time (seconds) to wait before starting a daemon
123 after a respawn-burst.
125 =back
127 =head1 EXAMPLE INSTALLATION
129 Put this in your inittab(5):
131 dm:2345:respawn:loggerexec daemon dmaster /sbin/dmaster
133 This one removes the socket file in case of it's left there:
135 dm:2345:respawn:@multicmd rm /var/run/dmaster.sock ; loggerexec daemon dmaster dmaster
137 =head1 SEE ALSO
139 supervisor(1), supervise(8), daemon(1), runit(1)
141 =cut
144 use Data::Dumper;
145 use POSIX ":sys_wait_h";
146 no if ($] >= 5.018), 'warnings' => 'experimental::smartmatch';
147 use Socket;
148 use Config;
149 require 'syscall.ph';
150 @signal_names = split ' ', $Config{'sig_name'};
152 $pidfd_support = eval q{ use Linux::FD::Pid; 1; };
153 warn "deprecating PIDFD support: $@" if $@;
156 $0 = 'dmaster';
157 %Daemon = ();
158 $respawn_burst_period = $ENV{'DAEMONS_RESPAWN_BURST_PERIOD'} // 2;
159 $respawn_burst_limit = $ENV{'DAEMONS_RESPAWN_BURST_LIMIT'} // 5;
160 $respawn_burst_hold = $ENV{'DAEMONS_RESPAWN_BURST_HOLD'} // 10;
163 sub logger
165 my $s = shift;
166 warn "$0: $s\n";
169 sub append_fixsizelist
171 my ($listref, $size, @items) = @_;
172 push @$listref, @items;
173 while(scalar @$listref > $size)
175 shift @$listref;
179 sub getpgid
181 my $pid = shift;
182 return syscall(SYS_getpgid(), int $pid);
185 sub stop_daemon
187 my $did = shift;
188 my $pid = $Daemon{$did}->{'pid'};
189 if(defined $pid)
191 my $pgid = getpgid($pid);
192 if($pgid == $pid)
194 logger "daemon $did is a process group leader, terminating PGID $pgid";
195 $kid = -$pid;
197 else
199 logger "terminating $did PID $pid";
200 $kid = $pid;
202 kill 'TERM', $kid;
204 else
206 logger "daemon $did has no PID";
210 sub start_daemon
212 my $id = shift;
213 my $cmd_args = $Daemon{$id}->{'cmd_args'};
214 logger "starting $id: @$cmd_args";
216 my $pid = fork;
217 if($pid == 0)
219 close $management_socket;
220 close $_ for map {$Daemon->{$_}->{'pidfd'}} grep {exists $Daemon->{$_}->{'pidfd'}} keys %Daemon;
221 # leave STDIO there for convenience
223 exec @$cmd_args;
224 exit 127;
226 elsif(not defined $pid)
228 logger "fork: $!";
230 else
232 logger "started $id PID $pid";
233 $Daemon{$id}->{'pid'} = $pid;
234 $Daemon{$id}->{'pidfd'} = Linux::FD::Pid->new($pid) if $pidfd_support;
235 append_fixsizelist \@{$Daemon{$id}->{'starttimes'}}, $respawn_burst_limit, time;
239 sub parse_cmd_string
241 my $cmd = shift;
242 my @cmd_args = ();
243 while($cmd =~ /^("([^""]*)"|\S+)\s*/)
245 my $arg = defined $2 ? $2 : $1;
246 push @cmd_args, $arg;
247 $cmd = $';
249 return @cmd_args;
252 sub read_daemon_inittab
254 my $path = shift;
255 # TODO: lock
256 open my $fh, '<', $path or logger "open $path: $!";
257 return unless $fh;
258 my %new_daemon = ();
259 while(<$fh>)
261 next if /^\s*(#|$)/;
262 if(my ($id, $cmd) = /^(\S+)\s+(.+)$/)
264 my @cmd_args = parse_cmd_string $cmd;
265 $new_daemon{$id}->{'cmd_args'} = \@cmd_args;
267 else
269 logger "can not parse line $.";
272 close $fh;
273 return %new_daemon;
276 sub reload
278 my $daemontab = ($ENV{'XDG_CONFIG_HOME'} || '/etc').'/daemontab';
279 logger "reloading $daemontab";
280 my %new_daemon = read_daemon_inittab $daemontab;
282 for my $id (keys %Daemon)
284 if($id ~~ [keys %new_daemon])
286 logger "keep $id";
287 $Daemon{$id}->{'cmd_args'} = $new_daemon{$id}->{'cmd_args'};
288 delete $new_daemon{$id};
290 else
292 logger "terminate $id as it's no longer managed";
293 delete $Daemon{$id}->{'cmd_args'};
294 stop_daemon $id;
298 for my $id (keys %new_daemon)
300 $Daemon{$id}->{'cmd_args'} = $new_daemon{$id}->{'cmd_args'};
302 for my $id (keys %new_daemon)
304 start_daemon $id;
308 sub errstr
310 my $errno = shift;
311 $! = $errno;
312 return $^E;
315 sub handle_children
317 for my $id (keys %Daemon)
319 my $daemon_pid = $Daemon{$id}->{'pid'};
320 next unless defined $daemon_pid;
322 my $pid = waitpid($daemon_pid, WNOHANG);
323 my $status = ${^CHILD_ERROR_NATIVE};
324 if($pid == $daemon_pid)
326 my $exitcode = WIFEXITED($status) ? WEXITSTATUS($status) : undef;
327 my $exitsignal = WIFSIGNALED($status) ? WTERMSIG($status) : undef;
328 logger sprintf "daemon %s PID %d deceased, exit status %s (%s) signal %s (%s)",
329 $id, $daemon_pid, $exitcode // 'none', errstr($exitcode), $exitsignal // 'none', $signal_names[$exitsignal];
330 $Daemon{$id}->{'exitcode'} = $exitcode;
331 $Daemon{$id}->{'exitsignal'} = $exitsignal;
332 append_fixsizelist \@{$Daemon{$id}->{'exittimes'}}, $respawn_burst_limit, time;
333 delete $Daemon{$id}->{'pid'};
338 sub svlhk_dump
340 my $objref = shift;
341 my $prefix = shift;
342 my $cb_ref = shift;
344 my $type = ref $objref;
346 if($type eq '')
348 $cb_ref->("$prefix $objref\n");
350 elsif($type eq 'SCALAR')
352 $cb_ref->("$prefix $$objref\n");
354 elsif($type eq 'ARRAY')
356 my $idx = 0;
357 for my $elem (@$objref)
359 svlhk_dump($elem, "$prefix.$idx", $cb_ref);
360 $idx++;
363 elsif($type eq 'HASH')
365 for my $key (keys %$objref)
367 svlhk_dump($objref->{$key}, "$prefix.$key", $cb_ref);
370 else
372 $cb_ref->("$prefix $type\n");
373 if($type eq 'Linux::FD::Pid')
375 my $fileno = $objref->fileno;
376 $cb_ref->("$prefix.fileno $fileno\n");
381 sub init_management_socket
383 #my $management_socket_path = chr(0).'dmaster-management'.$ENV{'DMASTER_SOCKET_NAME_SUFFIX'};
384 #my $management_socket_addr = pack('S', AF_UNIX) . $management_socket_path;
385 my $management_socket_path = ($ENV{'XDG_CACHE_HOME'} || '/var/run').'/dmaster.sock';
386 my $management_socket_addr = sockaddr_un($management_socket_path);
387 socket(my $management_socket, AF_UNIX, SOCK_STREAM, 0) or die "socket: $!\n";
388 bind($management_socket, $management_socket_addr) or die "bind: $management_socket_path: $!\n";
389 chmod(0660, $management_socket_path) or die "chmod: $management_socket_path: $!\n";
390 listen($management_socket, SOMAXCONN) or die "listen: $!\n";
391 return $management_socket;
395 $NextAlarm_ts = 0;
397 sub wakeup_at_most
399 my $sec = shift;
400 my $now = time;
401 my $next_alarm_ts = $now + $sec;
402 if($NextAlarm_ts <= $now or $next_alarm_ts < $NextAlarm_ts)
404 $NextAlarm_ts = $next_alarm_ts;
405 alarm $sec;
410 $SIG{'HUP'} = sub { $do_reload = 1 unless $do_shutdown; };
411 $SIG{'CHLD'} = sub { $do_handle_children = 1; };
412 $SIG{'TERM'} = sub { $shutdown_signalname = $_[0]; $do_shutdown = 1; };
413 $SIG{'INT'} = $SIG{'TERM'};
414 $SIG{'ALRM'} = sub { 1; };
415 $SIG{'USR1'} = sub { $do_dump_state = 1; };
416 #$SIG{'PIPE'} = TODO
419 logger "init";
420 $management_socket = init_management_socket();
421 $shutdown_signalname = undef;
424 logger "start";
425 reload;
427 while(1)
429 # go idle. but it will be interrupted by signals.
430 my $management_client_addr = accept(my $management_client_socket, $management_socket);
432 if(defined $management_client_addr)
434 print {$management_client_socket} "dmaster.pid $$\n";
435 svlhk_dump(\%Daemon, 'daemon', sub { print {$management_client_socket} $_[0]; });
436 print {$management_client_socket} "END\n";
437 $management_client_socket->flush;
438 logger "holding on management command";
439 <$management_client_socket>;
440 logger "resume from management command";
441 close $management_client_socket;
442 undef $management_client_socket;
445 if($do_handle_children)
447 $do_handle_children = 0;
448 handle_children;
451 if($do_shutdown)
453 if(not $prev_do_shutdown)
455 logger "got SIG$shutdown_signalname, exiting...";
456 $prev_do_shutdown = $do_shutdown;
459 my $subprocesses = 0;
460 for my $id (keys %Daemon)
462 if(exists $Daemon{$id}->{'pid'})
464 stop_daemon $id;
465 $subprocesses++;
468 if($subprocesses == 0)
470 logger "exit";
471 exit;
474 else
476 if($do_reload)
478 $do_reload = 0;
479 reload;
482 # check if daemons need respawn
483 for my $id (keys %Daemon)
485 my $dmon = $Daemon{$id};
486 if(not exists $dmon->{'pid'})
488 if(exists $dmon->{'cmd_args'})
490 if(scalar @{$dmon->{'exittimes'}} >= $respawn_burst_limit and $dmon->{'exittimes'}->[0] > time - $respawn_burst_period)
492 my $hold_sec = $dmon->{'exittimes'}->[0] + $respawn_burst_hold - time;
493 $hold_sec = 1 if $hold_sec < 1; # alarm(0) takes int but alarm(0) means cancel
494 logger "daemon $id respawned too often, hold up for $hold_sec sec";
495 wakeup_at_most $hold_sec;
497 else
499 start_daemon $id;
502 else
504 # it's ended and command deleted, so
505 # we no longer manage this daemon
506 delete $Daemon{$id};
512 if($do_dump_state)
514 $do_dump_state = 0;
515 my $dump_path = '/var/run/shm/dmaster.state';
516 open my $fh, '>', $dump_path or warn "$dump_path: $!\n";
517 if($fh)
519 print {$fh} Dumper \%Daemon;
520 close $fh;