3 # check_status.pl Nagios Plugin - Version 1.3
4 # Last Updated: 1/9/2003
6 # Report any bugs/questions to Russell Scibetti at russell@quadrix.com
8 # check_status Change Log:
11 # - Better help and documentation (separate doc?)
12 # - Take argument (patterns to match) from a separate spec file
15 # - Added ChangeLog information and updated --help output
16 # - hostdown (hd) argument for how a service check should respond
17 # when its host is Down/Unreachable
18 # (--hostdown="ok|warning|critical|unknown")
19 # - Changed name from check_state to check_status
20 # - Set hostdown to default to OK when the argument isn't specified
21 # - Number of Hosts checked is now output in OK result
23 # Version 1.2 additions:
25 # - Added ability to handle ack'd and downtimed services differently
26 # depending on argument provided
27 # (--ack="ok|warning|critical|unknown|down|unreachable"
28 # --dt="ok|warning|critical|unknown|down|unreachable")
30 # Version 1.1 additions:
32 # - Added --host=<regex>, --servhost=<regex> to allow for specific field
33 # matching (host for matching hostname in host checks, servhost for
34 # matching the hostname in service checks, service for matching the
35 # service name in service checks)
36 # - Output the number of OK services for an OK output
38 # Version 1.0 features:
40 # - Freshness check of status.log (timestamp)
41 # - Match service or host checks
42 # - Can ignore acknowledged or downtimed services/hosts (--ack, --dt)
43 # - Can output different levels of detail dependent on # of problems
44 # - Can check for number of critical, warning, or unknowns
46 #############################################################
51 Getopt
::Long
::Configure
('bundling');
54 ("V" => \
$version, "version" => \
$version,
55 "h" => \
$help, "help" => \
$help,
56 "v" => \
$verbose, "verbose" => \
$verbose,
57 "w=s" => \
$warning, "warning=s" => \
$warning,
58 "c=s" => \
$critical, "critical=s" => \
$critical,
59 "u=s" => \
$unknown, "unknown=s" => \
$unknown,
60 "p=s" => \
$pattern, "pattern=s" => \
$pattern,
61 "S:s" => \
$service, "service:s" => \
$service,
62 "s=s" => \
$status, "status=s" => \
$status,
63 "d=s" => \
$dir, "dir=s" => \
$dir,
64 "D=s" => \
$details, "details=s" => \
$details,
65 "H:s" => \
$host, "host:s" => \
$host,
66 "f=s" => \
$freshness, "freshness=s" => \
$freshness,
67 "servhost=s" => \
$servhost,
68 "a:s" => \
$ack, "ack:s" => \
$ack,
69 "dt:s"=> \
$dt, "downtime:s" => \
$dt,
70 "hd:s"=> \
$hdown, "hostdown:s" => \
$hdown,
83 my $unreach="UNREACHABLE";
85 # Print out Help information
92 # Print out version information
98 # Check for status log or directory argument or print usage
101 print "Usage: $0 -s <status file> | -d <Nagios log dir>\n";
102 print "Use the --help option for full list of arguments\n";
105 elsif ($dir =~ m
#[^/]/$#) {
106 $status = $dir . "status.log";
109 $status = $dir . "/status.log";
119 if (!$host && !$servhost) {
123 if (!$host && !$service) {
131 elsif (!($ack =~ "ok|critical|warning|unknown|down|unreachable")) {
132 print "Invalid value for ack\n";
141 elsif (!($dt =~ "ok|critical|warning|unknown|down|unreachable")) {
142 print "Invalid value for dt\n";
147 if (defined $hdown) {
151 elsif (!($hdown =~ "ok|critical|warning|unknown|down|unreachable")) {
152 print "Invalid value for hostdown\n";
157 my $much_details = 0;
159 my $ServiceNotOK = "CRITICAL|WARNING|UNKNOWN";
160 my $HostNotOK = "DOWN|UNREACHABLE";
162 my %numprob = ("WARNING",0,"CRITICAL",0,"UNKNOWN",0,"DOWN",0,"UNREACHABLE",0);
171 my %warnlevel = ("WARNING",0,"CRITICAL",0,"UNKNOWN",0);
172 my %critlevel = ("WARNING",0,"CRITICAL",0,"UNKNOWN",0);
173 my %unklevel = ("WARNING",0,"CRITICAL",0,"UNKNOWN",0);
174 my %hostlevel = ("DOWN",0,"UNREACHABLE",0);
176 # Store Hosts in downtime
180 # Store Hosts in a Down/Unreachable state
184 # Hash for storing state-change to OK times for hosts:
187 # Number of matches in parsing
191 if ($warning =~ /,/) {
192 @wlev = split /,/,$warning;
193 $warnlevel{"WARNING"} = $wlev[0];
194 $warnlevel{"CRITICAL"} = $wlev[1];
196 $warnlevel{"UNKNOWN"} = $wlev[2];
200 $WarnOnly = $warning;
208 if ($critical =~ /,/) {
209 @clev = split /,/,$critical;
210 $critlevel{"WARNING"} = $clev[0];
211 $critlevel{"CRITICAL"} = $clev[1];
213 $critlevel{"UNKNOWN"} = $clev[2];
217 $CritOnly = $critical;
225 if ($unknown =~ /,/) {
226 @ulev = split /,/,$unknown;
227 $unklevel{"WARNING"} = $ulev[0];
228 $unklevel{"CRITICAL"} = $ulev[1];
230 $unklevel{"UNKNOWN"} = $ulev[2];
243 $freshness = 30 * 60;
246 $freshness = $freshness * 60;
249 my %ct = ("CRITICAL",0,"WARNING",0,"UNKNOWN",0,"DOWN",0,"UNREACHABLE",0);
250 my %much_ct = ("CRITICAL",0,"WARNING",0,"UNKNOWN",0,"DOWN",0,"UNREACHABLE",0);
252 my %output = ("CRITICAL","","WARNING","","UNKNOWN","","DOWN","","UNREACHABLE","");
253 my %much_output = ("CRITICAL","","WARNING","","UNKNOWN","","DOWN","","UNREACHABLE","");
256 if ($details =~ /,/) {
257 my @tempv = split /,/,$details;
258 $much_details = $tempv[0];
259 $details = $tempv[1];
263 open("sta","$status") || die "Cannot open status file $status!";
266 $file_time = stat($status)->mtime;
268 if ($curr_time - $file_time > $freshness) {
269 printf "State CRITICAL - Status file is stale!!!\n";
270 exitcheck
($CRITICAL);
275 if (/^[^\s]+[\s]+HOST;/) {
276 @hdata = split /;/,$_;
278 # If you care about matching hosts (not services):
279 if ($host && $hdata[1] =~ /$host/) {
281 if ( $hdata[2] =~ /$HostNotOK/ ) {
282 addproblem
($_,$hdata[2]);
286 # If you are matching services, gather host information:
288 if ( $hdata[2] =~ /$HostNotOK/ ) {
289 $hostdown[$numdown] = $hdata[1];
293 $hostoktimes{$hdata[1]} = $hdata[4];
295 if ( $hdata[17] ne "0" ) {
296 $hostdowntime[$numdowntime] = $hdata[1];
301 elsif (!$host && /^[^\s]+[\s]+SERVICE;/) {
302 @servdata = split /;/,$_;
303 if ( ( $pattern && ($_ =~ /$pattern/)) ||
304 (($servdata[1] =~ /$servhost/) && ($servdata[2] =~ /$service/)) ){
306 if (($servdata[5] eq "HARD") && ($servdata[3] =~ /$ServiceNotOK/)) {
307 addproblem
($_,$servdata[3]);
316 print "Nothing Matches your criteria!\n";
320 # Count the number of problems (for reference):
322 $total = $numprob{"DOWN"} + $numprob{"UNREACHABLE"};
325 $total = $numprob{"WARNING"} + $numprob{"CRITICAL"} + $numprob{"UNKNOWN"};
328 my $numok = $nummatch - $total;
330 # If this is a host state check:
332 if ($numprob{"DOWN"}>0 || $numprob{"UNREACHABLE"}>0 ) {
333 if ($details && ($total <= $details)) {
334 print "State CRITICAL - $total Host Problems: $output{$down} $output{$unreach}\n";
335 exitcheck
($CRITICAL);
338 print "State CRITICAL - $numprob{$down} Hosts Down, $numprob{$unreach} Hosts Unreachable\n";
339 exitcheck
($CRITICAL);
343 print "State OK - $numok Hosts Up, $total Problems\n";
348 #If you only defined a Critical level in terms of # of criticals...
349 elsif ($CritOnly && ($numprob{"CRITICAL"} >= $CritOnly)) {
350 countAndPrint
($crit,$numprob{$crit},0);
351 exitcheck
($CRITICAL);
354 #Critical in terms of # criticals and # warnings...
355 elsif (!$CritOnly && ($numprob{"WARNING"} >= $critlevel{"WARNING"} ||
356 $numprob{"CRITICAL"} >= $critlevel{"CRITICAL"} ||
357 $numprob{"UNKNOWN"} >= $critlevel{"UNKNOWN"} )) {
358 countAndPrint
($crit,$total,1);
359 exitcheck
($CRITICAL);
362 #Warning in terms of # warnings only...
363 elsif ($WarnOnly && ($numprob{"WARNING"} >= $WarnOnly)) {
364 countAndPrint
($warn,$numprob{$warn},0);
368 #Warning in terms of # warnings and # criticals...
369 elsif (!$WarnOnly && ($numprob{"WARNING"} >= $warnlevel{"WARNING"} ||
370 $numprob{"CRITICAL"} >= $warnlevel{"CRITICAL"} ||
371 $numprob{"UNKNOWN"} >= $warnlevel{"UNKNOWN"})) {
372 countAndPrint
($warn,$total,1);
376 #Unknown in terms of # unknown only...
377 elsif ( $UnkOnly && ($numprob{"UNKNOWN"}>=$UnkOnly) ) {
378 countAndPrint
($unk,$numprob{$unk},0);
382 #Unknown in terms of # warning, critical, and unknown...
383 elsif (!$UnkOnly && ($numprob{"WARNING"} >= $unklevel{"WARNING"} ||
384 $numprob{"CRITICAL"} >= $unklevel{"CRITICAL"} ||
385 $numprob{"UNKNOWN"} >= $unklevel{"UNKNOWN"})) {
386 countAndPrint
($unk,$total,1);
392 print "State OK - $numok OK, $total problems\n";
398 ############################
400 ############################
402 # Return the proper exit code for Critical, Warning, Unknown, or OK
412 # Decide what to print for services:
416 my $alltypes = $_[2];
417 my $output = "State $state - ";
420 if ($count<=$much_details) {
422 $output .= "$count problems: $much_output{$crit} $much_output{$warn} $much_output{$unk}";
425 $output .= "$count \L$state\E: $much_output{$state}";
428 elsif ($count<=$details) {
430 $output .= "$count problems: $output{$crit} $output{$warn} $output{$unk}";
433 $output .= "$count \L$state\E: $output{$state}";
438 $output .= "$numprob{$crit} critical, $numprob{$warn} warning, $numprob{$unk} unknown";
441 $output .= "$count \L$state\E";
446 $output .= "$count problems";
453 # Add in the problem found in the status log
460 my @values = split /;/,$_[0];
463 my $namehold = $values[1];
464 if ($ack && ($values[13] eq "1")) {
472 elsif ($hdown && grep /$namehold/, @hostdown) {
473 if ($hdown =~ "ok") {
478 $diffout = "$values[1] is down";
481 elsif ($dt && (($values[27] ne "0") || (grep /$namehold/, @hostdowntime))){
489 elsif (exists $hostoktimes{$namehold}) {
490 # If the state change time of the host is more recent than the last
491 # service check, must wait until the next service check runs!
492 if ($hostoktimes{$namehold} > $values[6]) {
498 if ($ack && $values[5]) {
506 elsif ($dt && ($values[17] ne "0")) {
516 if ($details && $test) {
519 $much_output{$type} .= " $diffout;";
520 $output{$type} .= "$diffout;";
525 if ($much_details && $much_ct{$type}<$much_details) {
526 $much_output{$type} .= " $values[2] on $values[1] $values[31];";
529 if ($ct{$type} < $details) {
530 $output{$type} .= " $values[2] on $values[1];";
536 $much_output{$type} .= " $values[1] $_[1] $values[20],";
538 $output{$type} .= " $values[1] HOST $_[1],";
547 ################################
549 # Version and Help Information
551 ################################
555 $0 (nagios-plugins) 1.3
556 The nagios plugins come with ABSOLUTELY NO WARRANTY. You may redistribute
557 copies of the plugins under the terms of the GNU General Public License.
558 For more information about these matters, see the file named COPYING.
565 This plugin parses through the Nagios status log and will return a
566 Critical, Warning, or Unknown state depending on the number of
567 Critical, Warning, and/or Unknown services found in the log
568 (or Down/Unreachable hosts when matching against hosts)
570 Usage: $0 -s <Status File> | -d <Nagios Log Directory>
571 [-w #[,#][,#]] [-c #[,#][,#]] [-u #[,#][,#]]
572 [--service=<RegEx> | --servhost=<RegEx> | --pattern=<RegEx> |
573 --host | --host=<RegEx>]
574 [--ack[=string]] [--dt[=string]] [--hostdown[=string]]
575 [-D #[,#]] [--ok] [-f <Log freshness in # minutes>]
578 NOTE: One of -s and -d must be specified
581 -s, --status=FILE_NAME
582 Location and name of status log (e.g. /usr/local/nagios/var/status.log)
583 -d, --dir=DIRECTORY_NAME
584 Directory that contains the nagios logs (e.g. /usr/local/nagios/var/)
585 -w, --warning=INTEGER[,INTEGER][,INTEGER]
586 #: Number of warnings to result in a WARNING state
588 #,#: Warning,Criticals to result in a WARNING state
590 #,#,#: Warning,Critical,Unknown to result in a WARNING state
592 -c, --critical=INTEGER[,INTEGER][,INTEGER]
593 #: Number of criticals to result in a CRITICAL state
595 #,#: Warning,Criticals to result in a CRITICAL state
597 #,#,#: Warning,Critical,Unknown to result in a CRITICAL state
599 -u, --unknown=INTEGER[,INTEGER][,INTEGER]
600 #: Number of unknowns to result in a UNKNOWN state
602 #,#: Warning,Criticals to result in a UNKNOWN state
604 #,#,#: Warning,Critical,Unknown to result in a UNKNOWN state
606 -r, --service[=REGEX]
607 Only match services [that match the RegEx]
608 (--service is default setting if no other matching arguments provided)
610 Only match services whose host match the RegEx
612 Only parse for this regular expression (services only, not hosts)
614 Report on the state of hosts (whose name matches the RegEx if provided)
615 -a, --ack[=ok|warning|critical|unknown|down|unreachable]
616 Handle Acknowledged problems [--ack defaults to ok]
617 --dt, --downtime[=ok|warning|critical|unknown|down|unreachable]
618 Handle problems in scheduled downtime [--dt defaults to ok]
619 --hd, --hostdown[=ok|warning|critical|unknown|down|unreachable]
620 Handle services whose Host is down [--hd defaults to ok]
621 -D, --details=INTEGER[,INTEGER]
622 Amount of verbosity to output
624 <= 1st integer, return full details (each plugin's output)
625 <= 2nd integer, return some details (list each service host pair)
626 > 2nd integer, return the # of problems
627 -f, --freshness=INTEGER
628 Number of minutes old the log can be to make sure Nagios is running
629 (Default = 30 minutes)
631 Return an OK exit code, regardless of number of problems found
633 Print detailed help screen
635 Print version information
637 For service checking (use --service and/or --servhost):
638 1. The values of warning, critical, and unknown default to 1, i.e.
639 $0 will return CRITICAL if there is at least 1 critical service,
640 WARNING if there is at least 1 warning service, and UNKNOWN if there is
641 at least one unknown service.
643 2. If a service's host is DOWN or UNREACHABLE, $0 will use the
644 value of --hostdown to determine how to treat the service. Without that
645 argument, $0 will count the service as OK.
647 3. If a service's host is OK, but the last host-state change occurred more
648 recently than the last service check, $0 will ignore that service
649 (want to wait until the service has been checked after a host has recovered
650 or you may get service alert for services that still need to be checked)
652 4. If the --dt, --ack, or --hd tags are used, $0 will use the value
653 of the arguments to determine how to handle services in downtime, acknowledged,
654 or with down hosts (default=OK). For service checks, --dt will also check
655 if the service's host is in a downtime.
657 For host checking (use --host):
658 1. Using the --host argument, $0 will look for DOWN and UNREACHABLE
659 hosts. If any are found, $0 will return a CRITICAL. You can provide
660 an REGEX for --host to only check hosts with matching host names.
662 2. If the --dt or --ack tags are used, $0 will use the value of the
663 --dt/--ack arguments to determine the state of the host (default is OK)