Fixed typo in check_disk
[monitoring-plugins.git] / contrib / check_remote_nagios_status.pl
blobdc99705e8ca025609cd894bcf5c79a0087287460
1 #!/usr/bin/perl -w
3 # check_status.pl Nagios Plugin - Version 1.3
4 # Last Updated: 1/9/2003
6 # Report any bugs/questions to Russell Scibetti at russell@quadrix.com
8 # check_status Change Log:
10 # To do for 1.4
11 # - Better help and documentation (separate doc?)
12 # - Take argument (patterns to match) from a separate spec file
14 # New Addition to 1.3
15 # - Added ChangeLog information and updated --help output
16 # - hostdown (hd) argument for how a service check should respond
17 # when its host is Down/Unreachable
18 # (--hostdown="ok|warning|critical|unknown")
19 # - Changed name from check_state to check_status
20 # - Set hostdown to default to OK when the argument isn't specified
21 # - Number of Hosts checked is now output in OK result
23 # Version 1.2 additions:
25 # - Added ability to handle ack'd and downtimed services differently
26 # depending on argument provided
27 # (--ack="ok|warning|critical|unknown|down|unreachable"
28 # --dt="ok|warning|critical|unknown|down|unreachable")
30 # Version 1.1 additions:
32 # - Added --host=<regex>, --servhost=<regex> to allow for specific field
33 # matching (host for matching hostname in host checks, servhost for
34 # matching the hostname in service checks, service for matching the
35 # service name in service checks)
36 # - Output the number of OK services for an OK output
38 # Version 1.0 features:
40 # - Freshness check of status.log (timestamp)
41 # - Match service or host checks
42 # - Can ignore acknowledged or downtimed services/hosts (--ack, --dt)
43 # - Can output different levels of detail dependent on # of problems
44 # - Can check for number of critical, warning, or unknowns
46 #############################################################
48 use Getopt::Long;
49 use File::stat;
# Allow single-letter options to be bundled; long options must then be
# written with a double dash.
Getopt::Long::Configure('bundling');

# Parse the command line into package globals that the rest of the script
# (including the subroutines further down) reads directly.  Options declared
# with ":s" take an optional value, those with "=s" require one.
#
# GetOptions returns false when it meets an unknown or malformed option.  A
# plugin should not carry on with a half-parsed command line, so print the
# usage hint and leave with exit status 3 (the value this script uses for
# UNKNOWN).
GetOptions
    ("V"    => \$version,   "version"     => \$version,
     "h"    => \$help,      "help"        => \$help,
     "v"    => \$verbose,   "verbose"     => \$verbose,
     "w=s"  => \$warning,   "warning=s"   => \$warning,
     "c=s"  => \$critical,  "critical=s"  => \$critical,
     "u=s"  => \$unknown,   "unknown=s"   => \$unknown,
     "p=s"  => \$pattern,   "pattern=s"   => \$pattern,
     "S:s"  => \$service,   "service:s"   => \$service,
     "s=s"  => \$status,    "status=s"    => \$status,
     "d=s"  => \$dir,       "dir=s"       => \$dir,
     "D=s"  => \$details,   "details=s"   => \$details,
     "H:s"  => \$host,      "host:s"      => \$host,
     "f=s"  => \$freshness, "freshness=s" => \$freshness,
     "servhost=s" => \$servhost,
     "a:s"  => \$ack,       "ack:s"       => \$ack,
     "dt:s" => \$dt,        "downtime:s"  => \$dt,
     "hd:s" => \$hdown,     "hostdown:s"  => \$hdown,
     "ok"   => \$ok)
    or do {
        print "Usage: $0 -s <status file> | -d <Nagios log dir>\n";
        print "Use the --help option for full list of arguments\n";
        exit 3;
    };
# Constants: exit statuses handed to exitcheck().
my ($OK, $WARNING, $CRITICAL, $UNKNOWN) = (0, 1, 2, 3);

# State names, used as keys of the counter/output hashes and in the
# plugin's output line.
my ($crit, $warn, $unk) = ("CRITICAL", "WARNING", "UNKNOWN");
my ($down, $unreach)    = ("DOWN", "UNREACHABLE");
# --help prints the version banner followed by the help text; --version
# prints the banner only.  Either way the run ends through exitcheck().
if ($help || $version) {
    printVersion();
    printHelp() if $help;
    exitcheck($UNKNOWN);
}

# Without an explicit status file, derive one from the log directory; with
# neither, print the usage hint and stop.
unless ($status) {
    unless ($dir) {
        print "Usage: $0 -s <status file> | -d <Nagios log dir>\n";
        print "Use the --help option for full list of arguments\n";
        exitcheck($UNKNOWN);
    }
    # Only insert a separator when the directory does not already end in one.
    my $separator = ($dir =~ m#[^/]/$#) ? "" : "/";
    $status = $dir . $separator . "status.log";
}
# Fill in match-anything patterns for the selectors that were left empty.
{
    my $any_name = "[^\\s]*";

    # --host given without a value: check every host.
    $host = $any_name if defined $host && !$host;

    # Not a host check: unspecified service selectors match everything.
    unless ($host) {
        $servhost = $any_name unless $servhost;
        $service  = $any_name unless $service;
    }
}
# Validate the optional values of --ack, --dt and --hostdown.
#
# An option given without a value defaults to "ok".  Any other value must be
# exactly one of the accepted state names.  The comparison is anchored: an
# unanchored match would accept strings that merely contain a state name
# (for example "notok"), which addproblem() would then upper-case and use as
# a counter key.
foreach my $option ( [ \$ack, "ack" ], [ \$dt, "dt" ], [ \$hdown, "hostdown" ] ) {
    my ($value_ref, $label) = @$option;
    next unless defined $$value_ref;

    if (!$$value_ref) {
        $$value_ref = "ok";
    }
    elsif (!is_valid_state_arg($$value_ref)) {
        print "Invalid value for $label\n";
        exitcheck($UNKNOWN);
    }
}

# Return true when the argument is exactly one of the state names accepted
# by --ack, --dt and --hostdown (lower case, as listed in the help text).
sub is_valid_state_arg {
    my ($value) = @_;
    return 0 unless defined $value;
    return $value =~ /\A(?:ok|critical|warning|unknown|down|unreachable)\z/ ? 1 : 0;
}
# Threshold for the most detailed output level; replaced when --details is
# given as two comma-separated numbers.
my $much_details = 0;

# Alternations matched against the state field of status-log entries.
my $ServiceNotOK = join "|", qw(CRITICAL WARNING UNKNOWN);
my $HostNotOK    = join "|", qw(DOWN UNREACHABLE);

# Problems counted so far, keyed by state name.
my %numprob = map { $_ => 0 } qw(WARNING CRITICAL UNKNOWN DOWN UNREACHABLE);

# Single-number thresholds; they stay 0 when the comma form is used.
my ($CritOnly, $WarnOnly, $UnkOnly) = (0, 0, 0);

# Comma-form thresholds: the split pieces and the per-state limits.
my (@wlev, @clev, @ulev);
my %warnlevel = map { $_ => 0 } qw(WARNING CRITICAL UNKNOWN);
my %critlevel = map { $_ => 0 } qw(WARNING CRITICAL UNKNOWN);
my %unklevel  = map { $_ => 0 } qw(WARNING CRITICAL UNKNOWN);
my %hostlevel = map { $_ => 0 } qw(DOWN UNREACHABLE);

# Store Hosts in downtime
my @hostdowntime;
my $numdowntime = 0;

# Store Hosts in a Down/Unreachable state
my @hostdown;
my $numdown = 0;

# Hash for storing state-change to OK times for hosts:
my %hostoktimes;

# Number of matches in parsing
my $nummatch = 0;
# Interpret -w, -c and -u.  Each one is handled the same way:
#   - not given:         the single-number threshold becomes 1
#   - a plain value:     it becomes the single-number threshold
#   - a comma list:      the pieces are stored as the WARNING, CRITICAL and
#                        (when present and non-zero) UNKNOWN limits
foreach my $spec (
    [ $warning,  \@wlev, \%warnlevel, \$WarnOnly ],
    [ $critical, \@clev, \%critlevel, \$CritOnly ],
    [ $unknown,  \@ulev, \%unklevel,  \$UnkOnly  ],
) {
    my ($argument, $pieces, $limits, $single) = @$spec;

    if (!$argument) {
        $$single = 1;
        next;
    }
    if ($argument !~ /,/) {
        $$single = $argument;
        next;
    }

    @$pieces = split /,/, $argument;
    @{$limits}{"WARNING", "CRITICAL"} = @{$pieces}[0, 1];
    $limits->{"UNKNOWN"} = $pieces->[2] if $pieces->[2];
}
# Freshness is given in minutes (30 when not specified) and kept in seconds.
$freshness = ($freshness ? $freshness : 30) * 60;

# Per-state bookkeeping for the detailed output: how many problems have
# been written to each listing, and the listings themselves.
my @report_states = qw(CRITICAL WARNING UNKNOWN DOWN UNREACHABLE);
my %ct            = map { $_ => 0 }  @report_states;
my %much_ct       = map { $_ => 0 }  @report_states;
my %output        = map { $_ => "" } @report_states;
my %much_output   = map { $_ => "" } @report_states;

# A comma-separated --details value is split into its first two pieces:
# the first becomes $much_details, the second replaces $details.
if ($details && $details =~ /,/) {
    ($much_details, $details) = (split /,/, $details)[0, 1];
}
# Open the status log with three-argument open and a lexical handle, so the
# file name can never be interpreted as an open mode.  A failure is reported
# on STDOUT and ends the run through exitcheck($UNKNOWN); die() would print
# to STDERR only and derive the exit status from errno.
my $status_fh;
unless (open($status_fh, '<', $status)) {
    print "State UNKNOWN - Cannot open status file $status: $!\n";
    exitcheck($UNKNOWN);
}

# Freshness check: a log whose modification time is older than $freshness
# seconds is reported as CRITICAL.
my $status_info = stat($status);
unless ($status_info) {
    print "State UNKNOWN - Cannot stat status file $status: $!\n";
    exitcheck($UNKNOWN);
}
my $curr_time = time;
my $file_time = $status_info->mtime;

if ($curr_time - $file_time > $freshness) {
    print "State CRITICAL - Status file is stale!!!\n";
    exitcheck($CRITICAL);
}

# Walk the log line by line.  HOST entries are either checked directly
# (host mode) or remembered for the service checks; SERVICE entries are only
# looked at when this is not a host check.
while (my $line = <$status_fh>) {
    chomp $line;

    if ($line =~ /^[^\s]+[\s]+HOST;/) {
        my @hdata = split /;/, $line;

        # If you care about matching hosts (not services):
        if ($host && $hdata[1] =~ /$host/) {
            $nummatch++;
            if ($hdata[2] =~ /$HostNotOK/) {
                addproblem($line, $hdata[2]);
            }
        }
        # If you are matching services, gather host information:
        else {
            if ($hdata[2] =~ /$HostNotOK/) {
                push @hostdown, $hdata[1];
                $numdown++;
            }
            else {
                # Field 4 is later compared against a service's field 6 in
                # addproblem() to skip services not re-checked since the
                # host's last state change.
                $hostoktimes{$hdata[1]} = $hdata[4];
            }
            # Field 17 is treated as the host's downtime indicator: any
            # value other than "0" marks the host as being in downtime.
            if ($hdata[17] ne "0") {
                push @hostdowntime, $hdata[1];
                $numdowntime++;
            }
        }
    }
    elsif (!$host && $line =~ /^[^\s]+[\s]+SERVICE;/) {
        my @servdata = split /;/, $line;

        # NOTE(review): $servhost and $service appear to default to
        # match-anything patterns, so --pattern on its own looks unable to
        # narrow the selection - confirm the intended behaviour.
        if ( ($pattern && ($line =~ /$pattern/)) ||
             (($servdata[1] =~ /$servhost/) && ($servdata[2] =~ /$service/)) ) {
            $nummatch++;
            # Only problems in a HARD state are counted.
            if (($servdata[5] eq "HARD") && ($servdata[3] =~ /$ServiceNotOK/)) {
                addproblem($line, $servdata[3]);
            }
        }
    }
}

close($status_fh);
# Nothing in the log matched the selection: that is an UNKNOWN result.
if ($nummatch == 0) {
    print "Nothing Matches your criteria!\n";
    exitcheck($UNKNOWN);
}

# Count the number of problems (for reference):
my $total = $host
    ? $numprob{"DOWN"} + $numprob{"UNREACHABLE"}
    : $numprob{"WARNING"} + $numprob{"CRITICAL"} + $numprob{"UNKNOWN"};

my $numok = $nummatch - $total;

# If this is a host state check:
if ($host) {
    if ($numprob{"DOWN"} > 0 || $numprob{"UNREACHABLE"} > 0) {
        if ($details && ($total <= $details)) {
            print "State CRITICAL - $total Host Problems: $output{$down} $output{$unreach}\n";
            exitcheck($CRITICAL);
        }
        else {
            print "State CRITICAL - $numprob{$down} Hosts Down, $numprob{$unreach} Hosts Unreachable\n";
            exitcheck($CRITICAL);
        }
    }
    else {
        print "State OK - $numok Hosts Up, $total Problems\n";
        exitcheck($OK);
    }
}

# If you only defined a Critical level in terms of # of criticals...
elsif ($CritOnly && ($numprob{"CRITICAL"} >= $CritOnly)) {
    countAndPrint($crit, $numprob{$crit}, 0);
    exitcheck($CRITICAL);
}

# Critical in terms of # criticals, # warnings and # unknowns...
elsif (!$CritOnly && threshold_reached(\%critlevel, \%numprob)) {
    countAndPrint($crit, $total, 1);
    exitcheck($CRITICAL);
}

# Warning in terms of # warnings only...
elsif ($WarnOnly && ($numprob{"WARNING"} >= $WarnOnly)) {
    countAndPrint($warn, $numprob{$warn}, 0);
    exitcheck($WARNING);
}

# Warning in terms of # warnings, # criticals and # unknowns...
elsif (!$WarnOnly && threshold_reached(\%warnlevel, \%numprob)) {
    countAndPrint($warn, $total, 1);
    exitcheck($WARNING);
}

# Unknown in terms of # unknown only...
elsif ($UnkOnly && ($numprob{"UNKNOWN"} >= $UnkOnly)) {
    countAndPrint($unk, $numprob{$unk}, 0);
    exitcheck($UNKNOWN);
}

# Unknown in terms of # warning, critical, and unknown...
elsif (!$UnkOnly && threshold_reached(\%unklevel, \%numprob)) {
    countAndPrint($unk, $total, 1);
    exitcheck($UNKNOWN);
}

# Everything is OK!
else {
    print "State OK - $numok OK, $total problems\n";
    exitcheck($OK);
}

# Return true when, for at least one of WARNING/CRITICAL/UNKNOWN, a limit is
# set and the matching problem count has reached it.
#
#   $limits - hash ref: state name => limit (0 or undef means "no limit")
#   $counts - hash ref: state name => number of problems found
#
# A limit that was never specified stays 0.  Comparing "count >= 0" would
# always succeed, so a partial comma list such as "-c 5,3" would report
# CRITICAL on every run; unset limits are therefore skipped.
sub threshold_reached {
    my ($limits, $counts) = @_;
    foreach my $state ("WARNING", "CRITICAL", "UNKNOWN") {
        my $limit = $limits->{$state};
        next unless $limit;
        return 1 if $counts->{$state} >= $limit;
    }
    return 0;
}
398 ############################
399 # Subroutines
400 ############################
# Leave the program with the given status.  When the --ok flag was given
# the status is forced to 0, whatever the caller asked for.
sub exitcheck {
    my ($requested) = @_;
    exit($ok ? 0 : $requested);
}
# Build and print the result line for a service check.
#
#   $_[0] - state being reported (used in the "State ..." prefix)
#   $_[1] - number of problems the report is about
#   $_[2] - true when all three problem types are reported together,
#           false when only the given state is reported
#
# How much is printed depends on --details: the fullest listing while the
# count is within $much_details, the short listing while it is within
# $details, bare counts beyond that, and just "<count> problems" when
# --details was not given.
sub countAndPrint {
    my ($state, $count, $alltypes) = @_;
    my $state_lc = lc $state;
    my $summary;

    if (!$details) {
        $summary = "$count problems";
    }
    else {
        # Pick the listing that fits the number of problems, if any does.
        my $listing = $count <= $much_details ? \%much_output
                    : $count <= $details      ? \%output
                    :                           undef;

        if ($listing) {
            $summary = $alltypes
                ? "$count problems: $listing->{$crit} $listing->{$warn} $listing->{$unk}"
                : "$count $state_lc: $listing->{$state}";
        }
        elsif ($alltypes) {
            $summary = "$numprob{$crit} critical, $numprob{$warn} warning, $numprob{$unk} unknown";
        }
        else {
            $summary = "$count $state_lc";
        }
    }

    print "State $state - $summary\n";
}
# Add-in the problem found in the status log
#
#   $_[0] - the raw status-log line (fields separated by ';')
#   $_[1] - the state found on that line (e.g. CRITICAL or DOWN)
#
# Decides whether the problem counts at all and under which state, using
# the --ack, --dt and --hostdown settings, then updates the detail listings
# (%output, %much_output and their counters) and the %numprob counter.
# Nothing useful is returned; the effect is on those file-level variables.
sub addproblem {
    my ($entry, $state) = @_;

    my $test    = 1;        # cleared when the problem must not be counted
    my $type    = $state;   # state the problem is finally counted under
    my $diffout = "";       # replacement text when the host itself is down
    my @values  = split /;/, $entry;

    # Apply an --ack/--dt/--hostdown setting: "ok" drops the problem, any
    # other value becomes the (upper-cased) state it is counted under.
    my $apply_setting = sub {
        my ($setting) = @_;
        if ($setting =~ "ok") {
            $test = 0;
        }
        else {
            $type = "\U$setting";
        }
    };

    if (!$host) {
        # Service entry.  The field positions used below (13, 27, 6, 31) are
        # presumably acknowledgement flag, downtime depth, last check time
        # and plugin output - TODO confirm against the status.log format.
        my $namehold = $values[1];

        # Host names are compared for equality.  A pattern match would treat
        # the name as a regular expression and let "web1" match "web10".
        if ($ack && ($values[13] eq "1")) {
            $apply_setting->($ack);
        }
        # NOTE(review): the help text says services on a down host count as
        # OK when --hostdown is absent, but this branch is only taken when
        # $hdown is set - confirm which behaviour is intended.
        elsif ($hdown && (grep { $_ eq $namehold } @hostdown)) {
            $apply_setting->($hdown);
            $diffout = "$values[1] is down" if $test;
        }
        elsif ($dt && (($values[27] ne "0") || (grep { $_ eq $namehold } @hostdowntime))) {
            $apply_setting->($dt);
        }
        elsif (exists $hostoktimes{$namehold}) {
            # If the state change time of the host is more recent than the last
            # service check, must wait until the next service check runs!
            if ($hostoktimes{$namehold} > $values[6]) {
                $test = 0;
            }
        }
    }
    else {
        # Host entry: field 5 is tested as the acknowledgement flag and
        # field 17 as the downtime indicator.
        if ($ack && $values[5]) {
            $apply_setting->($ack);
        }
        elsif ($dt && ($values[17] ne "0")) {
            $apply_setting->($dt);
        }
    }

    if ($details && $test) {
        if (!$host) {
            if ($diffout) {
                $much_output{$type} .= " $diffout;";
                $output{$type} .= "$diffout;";
                $much_ct{$type}++;
                $ct{$type}++;
            }
            else {
                # Each listing is capped at its own detail threshold.
                if ($much_details && $much_ct{$type} < $much_details) {
                    $much_output{$type} .= " $values[2] on $values[1] $values[31];";
                    $much_ct{$type}++;
                }
                if ($ct{$type} < $details) {
                    $output{$type} .= " $values[2] on $values[1];";
                    $ct{$type}++;
                }
            }
        }
        else {
            $much_output{$type} .= " $values[1] $state $values[20],";
            # Count under the problem's state; the literal key "type" would
            # create an unrelated counter.
            $much_ct{$type}++;
            $output{$type} .= " $values[1] HOST $state,";
            $ct{$type}++;
        }
    }

    if ($test) {
        $numprob{$type}++;
    }
}
547 ################################
549 # Version and Help Information
551 ################################
# Print the plugin name/version banner and the warranty notice to STDOUT.
sub printVersion {
    # print, not printf: the text interpolates $0, and a '%' in the script
    # path would otherwise be interpreted as a format directive.
    print <<EndVersion;
$0 (nagios-plugins) 1.3
The nagios plugins come with ABSOLUTELY NO WARRANTY. You may redistribute
copies of the plugins under the terms of the GNU General Public License.
For more information about these matters, see the file named COPYING.
EndVersion
}
562 sub printHelp {
563 printf <<EOF;
565 This plugin parses through the Nagios status log and will return a
566 Critical, Warning, or Unknown state depending on the number of
567 Critical, Warning, and/or Unknown services found in the log
568 (or Down/Unreachable hosts when matching against hosts)
570 Usage: $0 -s <Status File> | -d <Nagios Log Directory>
571 [-w #[,#][,#]] [-c #[,#][,#]] [-u #[,#][,#]]
572 [--service=<RegEx> | --servhost=<RegEx> | --pattern=<RegEx> |
573 --host | --host=<RegEx>]
574 [--ack[=string]] [--dt[=string]] [--hostdown[=string]]
575 [-D #[,#]] [--ok] [-f <Log freshness in # minutes>]
576 $0 --help
577 $0 --version
578 NOTE: One of -s and -d must be specified
580 Options:
581 -s, --status=FILE_NAME
582 Location and name of status log (e.g. /usr/local/nagios/var/status.log)
583 -d, --dir=DIRECTORY_NAME
584 Directory that contains the nagios logs (e.g. /usr/local/nagios/var/)
585 -w, --warning=INTEGER[,INTEGER][,INTEGER]
586 #: Number of warnings to result in a WARNING state
588 #,#: Warning,Criticals to result in a WARNING state
590 #,#,#: Warning,Critical,Unknown to result in a WARNING state
591 Default: -w=1
592 -c, --critical=INTEGER[,INTEGER][,INTEGER]
593 #: Number of criticals to result in a CRITICAL state
595 #,#: Warning,Criticals to result in a CRITICAL state
597 #,#,#: Warning,Critical,Unknown to result in a CRITICAL state
598 Default: -c=1
599 -u, --unknown=INTEGER[,INTEGER][,INTEGER]
600 #: Number of unknowns to result in a UNKNOWN state
602 #,#: Warning,Criticals to result in a UNKNOWN state
604 #,#,#: Warning,Critical,Unknown to result in a UNKNOWN state
605 Default: -u=1
606 -r, --service[=REGEX]
607 Only match services [that match the RegEx]
608 (--service is default setting if no other matching arguments provided)
609 --servhost=REGEX
610 Only match services whose host match the RegEx
611 -p, --pattern=REGEX
612 Only parse for this regular expression (services only, not hosts)
613 --host[=REGEX]
614 Report on the state of hosts (whose name matches the RegEx if provided)
615 -a, --ack[=ok|warning|critical|unknown|down|unreachable]
616 Handle Acknowledged problems [--ack defaults to ok]
617 --dt, --downtime[=ok|warning|critical|unknown|down|unreachable]
618 Handle problems in scheduled downtime [--dt defaults to ok]
619 --hd, --hostdown[=ok|warning|critical|unknown|down|unreachable]
620 Handle services whose Host is down [--hd defaults to ok]
621 -D, --details=INTEGER[,INTEGER]
622 Amount of verbosity to output
623 If # problems:
624 <= 1st integer, return full details (each plugin's output)
625 <= 2nd integer, return some details (list each service host pair)
626 > 2nd integer, return the # of problems
627 -f, --freshness=INTEGER
628 Number of minutes old the log can be to make sure Nagios is running
629 (Default = 30 minutes)
630 --ok
631 Return an OK exit code, regardless of number of problems found
632 -h, --help
633 Print detailed help screen
634 -V, --version
635 Print version information
637 For service checking (use --service and/or --servhost):
638 1. The values of warning, critical, and unknown default to 1, i.e.
639 $0 will return CRITICAL if there is at least 1 critical service,
640 WARNING if there is at least 1 warning service, and UNKNOWN if there is
641 at least one unknown service.
643 2. If a service's host is DOWN or UNREACHABLE, $0 will use the
644 value of --hostdown to determine how to treat the service. Without that
645 argument, $0 will count the service as OK.
647 3. If a service's host is OK, but the last host-state change occurred more
648 recently than the last service check, $0 will ignore that service
649 (want to wait until the service has been checked after a host has recovered
650 or you may get service alert for services that still need to be checked)
652 4. If the --dt, --ack, or --hd tags are used, $0 will use the value
653 of the arguments to determine how to handle services in downtime, acknowledged,
654 or with down hosts (default=OK). For service checks, --dt will also check
655 if the service's host is in a downtime.
657 For host checking (use --host):
658 1. Using the --host argument, $0 will look for DOWN and UNREACHABLE
659 hosts. If any are found, $0 will return a CRITICAL. You can provide
660 an REGEX for --host to only check hosts with matching host names.
662 2. If the --dt or --ack tags are used, $0 will use the value of the
663 --dt/--ack arguments to determine the state of the host (default is OK)