11 #include <nagios/broker.h>
12 #include <nagios/nebcallbacks.h>
20 #define CONCERNS_HOST 50
21 #define CONCERNS_SERVICE 60
24 #define HASH_TABLE_SIZE 128
26 /* for some reason these aren't defined inside Nagios' headers */
28 #define SERVICE_WARNING 1
29 #define SERVICE_CRITICAL 2
30 #define SERVICE_UNKNOWN 3
/*
 * File-scope import state.
 *  - imported: running byte count; parse_line() adds len + 1 per line and
 *    main() adds one more per file.
 *  - import_start: filled in by gettimeofday() in main() before the import
 *    loop starts.
 *  - debug_level: pdebug() compares it against the message level, and
 *    main() requires it to be non-zero before printing the warning total.
 *  - daemon_start / daemon_is_running: updated by parse_line() when it
 *    sees start and stop events.
 *  - incremental: parse_line() compares each line timestamp against it.
 * totsize, totlines and daemon_stop are not referenced in the fragments
 * visible in this listing.
 */
39 static size_t imported
, totsize
, totlines
;
40 struct timeval import_start
;
41 static int debug_level
;
43 static time_t daemon_start
, daemon_stop
, incremental
;
44 static int daemon_is_running
;
46 struct naglog_file
*cur_file
; /* the file we're currently importing */
48 static int ignore_process_events
;
50 static time_t first_time
, last_time
; /* first and last timestamp to show */
51 static time_t ltime
; /* the timestamp from the current log-line */
60 struct downtime_entry
{
74 struct downtime_entry
*next
;
/*
 * Table of log-line event prefixes.
 * add_code(n, s, c) expands to the initializer { n, s, sizeof(s) - 1, c, },
 * i.e. a count, the literal, the literal length without its NUL, and a code.
 * add_ignored(s) is add_code() with a count of 0 and the code IGNORE_LINE.
 * NOTE(review): the field order is presumably nvecs, str, len, code, matching
 * the members used by get_string_code() and parse_line(); struct string_code
 * is declared elsewhere, so confirm there.
 * The two downtime entries add CONCERNS_HOST or CONCERNS_SERVICE to
 * NEBTYPE_DOWNTIME_LOAD so that host and service alerts get distinct codes.
 */
77 #define add_code(n, s, c) { n, s, sizeof(s) - 1, c, }
78 #define add_ignored(s) add_code(0, s, IGNORE_LINE)
79 struct string_code event_codes
[] = {
81 add_ignored("Warning"),
82 add_ignored("LOG ROTATION"),
83 add_ignored("HOST NOTIFICATION"),
84 add_ignored("HOST FLAPPING ALERT"),
85 add_ignored("SERVICE NOTIFICATION"),
86 add_ignored("SERVICE FLAPPING ALERT"),
87 add_ignored("SERVICE EVENT HANDLER"),
88 add_ignored("HOST EVENT HANDLER"),
89 add_ignored("LOG VERSION"),
91 add_code(0, "EXTERNAL COMMAND", NEBTYPE_EXTERNALCOMMAND_END
),
92 add_code(5, "HOST ALERT", NEBTYPE_HOSTCHECK_PROCESSED
),
93 add_code(5, "INITIAL HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED
),
94 add_code(5, "CURRENT HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED
),
95 add_code(6, "SERVICE ALERT", NEBTYPE_SERVICECHECK_PROCESSED
),
96 add_code(6, "INITIAL SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED
),
97 add_code(6, "CURRENT SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED
),
98 add_code(3, "HOST DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_HOST
),
99 add_code(4, "SERVICE DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_SERVICE
),
/*
 * Numeric ids for the external commands this importer recognizes, followed
 * by the lookup table built from them.
 * add_cdef(__nvecs, __define) stringifies the macro name for the text field
 * and uses the macro value as the code, so each entry matches the command
 * name exactly as it is spelled in the #define.
 */
103 #define DEL_HOST_DOWNTIME 1
104 #define DEL_SVC_DOWNTIME 2
105 #define SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME 3
106 #define SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME 4
107 #define SCHEDULE_HOSTGROUP_HOST_DOWNTIME 5
108 #define SCHEDULE_HOSTGROUP_SVC_DOWNTIME 6
109 #define SCHEDULE_HOST_DOWNTIME 7
110 #define SCHEDULE_HOST_SVC_DOWNTIME 8
111 #define SCHEDULE_SERVICEGROUP_HOST_DOWNTIME 9
112 #define SCHEDULE_SERVICEGROUP_SVC_DOWNTIME 10
113 #define SCHEDULE_SVC_DOWNTIME 11
114 #define ACKNOWLEDGE_HOST_PROBLEM 12
115 #define ACKNOWLEDGE_SVC_PROBLEM 13
117 #define add_cdef(__nvecs, __define) add_code(__nvecs, #__define, __define)
118 struct string_code command_codes
[] = {
119 add_cdef(1, DEL_HOST_DOWNTIME
),
120 add_cdef(1, DEL_SVC_DOWNTIME
),
121 add_cdef(8, SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
),
122 add_cdef(8, SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
),
123 add_cdef(8, SCHEDULE_HOSTGROUP_HOST_DOWNTIME
),
124 add_cdef(8, SCHEDULE_HOSTGROUP_SVC_DOWNTIME
),
125 add_cdef(8, SCHEDULE_HOST_DOWNTIME
),
126 add_cdef(8, SCHEDULE_HOST_SVC_DOWNTIME
),
127 add_cdef(8, SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
),
128 add_cdef(8, SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
),
129 add_cdef(8, SCHEDULE_SVC_DOWNTIME
),
132 * These really have one more field than listed here. We omit one
133 * to make author and comment concatenated with a semi-colon by default.
135 add_cdef(6, ACKNOWLEDGE_SVC_PROBLEM
),
136 add_cdef(5, ACKNOWLEDGE_HOST_PROBLEM
),
/*
 * Debug helper: prints v[i] for every i in [0, n), one line per element in
 * the form "v[%2d]: %s".
 */
140 static inline void print_strvec(char **v
, int n
)
144 for (i
= 0; i
< n
; i
++)
145 printf("v[%2d]: %s\n", i
, v
[i
]);
/*
 * Formats the byte count n as text in one of two static 30-byte buffers
 * (tbuf[t]), either as "<n> bytes" or as a two-decimal value followed by a
 * K/M/G/T "iB" suffix selected by shift.
 * NOTE(review): "%d" is given a size_t argument; "%zu" is the matching
 * conversion.
 * NOTE(review): (1 << (shift * 10)) is an int shift. With four suffixes
 * available shift can presumably reach 4, and a shift by 40 overflows int.
 * NOTE(review): the loop condition uses "> 1024", so a quotient of exactly
 * 1024 is not promoted to the next unit - confirm that this is intended.
 */
149 const char *tobytes(size_t n
)
151 const char *suffix
= "KMGT";
152 static char tbuf
[2][30];
158 sprintf(tbuf
[t
], "%d bytes", n
);
162 while (n
>> (shift
* 10) > 1024)
165 sprintf(tbuf
[t
], "%0.2f %ciB",
166 (float)n
/ (float)(1 << (shift
* 10)), suffix
[shift
- 1]);
/*
 * Scans codes[] until it reaches an entry whose str member is NULL and
 * tests each entry for an exact match: stored length equal to len and
 * identical bytes (memcmp). get_event_type() and get_command_type() are
 * wrappers bound to event_codes and command_codes.
 * NOTE(review): the loop relies on each table ending in an entry with a
 * NULL str; those terminators are not visible in this listing.
 */
171 static inline struct string_code
*
172 get_string_code(struct string_code
*codes
, const char *str
, size_t len
)
176 for (i
= 0; codes
[i
].str
; i
++)
177 if (codes
[i
].len
== len
&& !memcmp(str
, codes
[i
].str
, len
))
182 #define get_event_type(str, len) get_string_code(event_codes, str, len)
183 #define get_command_type(str, len) get_string_code(command_codes, str, len)
/*
 * Fatal-error reporter, declared printf-like (format is argument 1, the
 * variadic arguments start at 2) and noreturn. The visible body forwards
 * fmt and the argument list to vfprintf(stderr, ...) and then reports
 * line_no and cur_file->path.
 * NOTE(review): main() calls crash() for allocation and usage errors,
 * presumably before any file is current. Confirm the cur_file->path report
 * is guarded against cur_file being NULL.
 */
185 static void crash(const char *fmt
, ...)
186 __attribute__((__format__(__printf__
, 1, 2), __noreturn__
));
188 static void __attribute__((__noreturn__
)) crash(const char *fmt
, ...)
193 vfprintf(stderr
, fmt
, ap
);
198 fprintf(stderr
, "crash() called when parsing line %d in %s\n",
199 line_no
, cur_file
->path
);
/*
 * Leveled debug output, declared printf-like (format is argument 2). The
 * visible body compares debug_level with lvl and checks whether fmt ends
 * in a newline. debug(...) is pdebug() at level 1.
 * NOTE(review): fmt[strlen(fmt) - 1] indexes before the start of the string
 * when fmt is empty.
 */
205 static void pdebug(int lvl
, const char *fmt
, ...)
206 __attribute__((__format__(__printf__
, 2, 3)));
207 #define debug(...) pdebug(1, __VA_ARGS__)
208 static void pdebug(int lvl
, const char *fmt
, ...)
212 if (debug_level
< lvl
)
218 if (fmt
[strlen(fmt
) - 1] != '\n')
/*
 * printf-like warning reporter (format is argument 1). Its body is not part
 * of this listing. warnings is the file-scope counter that main() prints
 * as "Total warnings".
 * NOTE(review): main() prints this unsigned counter with "%d".
 */
222 static void warn(const char *fmt
, ...)
223 __attribute__((__format__(__printf__
, 1, 2)));
225 static unsigned int warnings
;
226 static void warn(const char *fmt
, ...)
/*
 * prefixcmp(s1, s2) is strncmp() limited to strlen(s2): it yields 0 when
 * s1 begins with s2. Note that s2 is evaluated twice.
 * is_interesting() tests ptr against four known message prefixes, and
 * parse_line() checks its result before looking for an event type.
 * NOTE(review): the return statements are not part of this listing;
 * presumably a prefix match marks the line as not interesting - confirm.
 */
243 #define prefixcmp(s1, s2) strncmp(s1, s2, strlen(s2))
244 static int is_interesting(const char *ptr
)
246 if (!prefixcmp(ptr
, "Auto-save of retention data"))
248 if (!prefixcmp(ptr
, "Event broker module"))
250 if (!prefixcmp(ptr
, "You do not have permission"))
252 if (!prefixcmp(ptr
, "Local time is"))
/*
 * Tests ptr for the messages that mark a daemon start: three fixed
 * prefixes plus any line containing "starting...". When parse_line() gets
 * a true result it records ltime in daemon_start and sets
 * daemon_is_running.
 */
258 static int is_start_event(const char *ptr
)
260 if (!prefixcmp(ptr
, "Finished daemonizing..."))
262 if (!prefixcmp(ptr
, "PROGRAM_RESTART"))
264 if (!prefixcmp(ptr
, "Caught SIGHUP"))
266 if (strstr(ptr
, "starting..."))
/*
 * Tests ptr for the messages that mark a daemon stop: four fixed prefixes
 * plus any line containing "shutting down...". When parse_line() gets a
 * true result it clears daemon_is_running.
 */
272 static int is_stop_event(const char *ptr
)
274 if (!prefixcmp(ptr
, "Caught SIGTERM"))
276 if (!prefixcmp(ptr
, "Successfully shutdown..."))
278 if (!prefixcmp(ptr
, "Bailing out"))
280 if (!prefixcmp(ptr
, "Lockfile"))
282 if (strstr(ptr
, "shutting down..."))
/*
 * Singly linked list node for log lines that could not be classified.
 * handle_unknown_event() fills in the line, line_no, file and next
 * members. event_list is the list head that print_unhandled_events()
 * walks, and num_unhandled is the count it reports.
 */
288 struct unhandled_event
{
292 struct unhandled_event
*next
;
295 static struct unhandled_event
*event_list
;
296 static int num_unhandled
;
299 * This is a fairly toothless function, since we can encounter
300 * pretty much any kind of message in the logfiles. In order to
301 * make sure we don't miss anything important though, we should
302 * probably stash the messages away and print them at the end
303 * so the user can decide if he/she wants to make a re-import.
304 * In 99% of all cases, the user will just want to ignore the
305 * messages and keep going
/*
 * Records line as an unhandled event: allocates a node, duplicates the
 * line with strdup(), stores line_no and the current file path, and points
 * the node at the current head of event_list. An allocation failure ends
 * in crash().
 * NOTE(review): event->file keeps the cur_file->path pointer itself, so
 * that string has to stay valid until the events are printed.
 */
307 static void handle_unknown_event(const char *line
)
309 struct unhandled_event
*event
;
313 if (!(event
= malloc(sizeof(*event
))) || !(event
->line
= strdup(line
))) {
314 crash("Failed to allocate memory for unhandled event [%s]\n", line
);
318 event
->line_no
= line_no
;
319 event
->file
= cur_file
->path
;
321 /* add to "top" of list. we'll print in reverse order */
322 event
->next
= event_list
;
/*
 * Prints a header with the num_unhandled count, then walks event_list from
 * its head and prints each entry as file:line_no followed by the stored
 * line and a "----" separator.
 * NOTE(review): the loop over x runs once per extra decimal digit in
 * num_unhandled; presumably it lengthens the header underline - the loop
 * body is not visible here.
 * NOTE(review): the empty parameter list is an old-style declaration;
 * (void) states the intent explicitly.
 */
326 static void print_unhandled_events()
328 struct unhandled_event
*event
;
334 printf("\n%d Unhandled events encountered:\n" \
335 "------------------------------", num_unhandled
);
337 for (x
= 1; num_unhandled
> (x
* 10); x
*= 10)
341 for (event
= event_list
; event
; event
= event
->next
) {
342 printf("%s:%d:\n%s\n----\n", event
->file
, event
->line_no
, event
->line
);
/*
 * Walks str one character at a time until the terminating NUL or until the
 * field counter i reaches nvecs. parse_line() compares the return value
 * with the expected field count for the matched event or command.
 * NOTE(review): the loop body is not part of this listing; presumably it
 * splits str in place and stores field pointers in the strv array that
 * main() allocates - confirm against the full source.
 */
346 static int vectorize_string(char *str
, int nvecs
)
352 for (p
= str
; *p
&& i
< nvecs
; p
++) {
/*
 * interesting_hosts and interesting_services are lookup tables filled by
 * hash_one_line(); both stay NULL until a matching entry is loaded.
 * host_is_interesting() returns 1 or 0 depending on whether host is found
 * in interesting_hosts when that table exists.
 * NOTE(review): the result used when the table is NULL is not visible in
 * this listing.
 */
362 static hash_table
*interesting_hosts
, *interesting_services
;
363 static int host_is_interesting(const char *host
)
365 if (interesting_hosts
)
366 return !!hash_find(interesting_hosts
, host
);
/*
 * Returns 1 or 0 depending on whether the (host, service) pair is present
 * in interesting_services. When service is NULL or no service table has
 * been loaded, the decision is delegated to host_is_interesting(host).
 */
371 static int service_is_interesting(const char *host
, const char *service
)
373 /* fall back to just checking if host is interesting */
374 if (!service
|| !interesting_services
)
375 return host_is_interesting(host
);
377 return !!hash_find2(interesting_services
, host
, service
);
/*
 * Parses str as a base-10 number with strtoul() and stores it in *val.
 * Emits a warning naming str when it is not a valid time_t.
 * first_log_time() treats a non-zero return value as failure.
 * NOTE(review): the condition guarding the warning and the return values
 * are not visible in this listing.
 */
380 static int strtotimet(const char *str
, time_t *val
)
384 *val
= strtoul(str
, &endp
, 10);
386 warn("strtotimet(): %s is not a valid time_t\n", str
);
/*
 * Per-line callback that main() hands to lparse_path(). From the fragments
 * visible in this listing it:
 *  - adds len + 1 to imported;
 *  - warns about lines shorter than 12 bytes or not starting with an
 *    opening square bracket;
 *  - parses the timestamp after the bracket with strtoul() into ltime and
 *    calls crash() when no digits were consumed;
 *  - filters on first_time / last_time and on incremental;
 *  - skips closing brackets and spaces, then consults is_interesting();
 *  - for lines without a colon, checks for start and stop events and
 *    otherwise records the line with handle_unknown_event();
 *  - marks the daemon as running if an event arrives before a start event;
 *  - looks up the event type from the text before the colon, splits the
 *    rest with vectorize_string() and dispatches on sc->code. For external
 *    commands the command name is the text before the first semicolon.
 * NOTE(review): semi_colon comes from strchr() and is used in a pointer
 * subtraction; confirm that a NULL result is handled in the lines missing
 * from this listing.
 * NOTE(review): last_ltime is a function-local static, so the ordering
 * check against it carries over from one file to the next.
 */
393 static int parse_line(char *line
, size_t len
)
397 struct string_code
*sc
;
398 static time_t last_ltime
= 0;
401 imported
+= len
+ 1; /* make up for 1 lost byte per newline */
403 /* ignore empty lines. whitespace is trimmed in the cfg_* api */
407 /* skip obviously bogus lines */
408 if (len
< 12 || *line
!= '[') {
409 warn("line %d; len too short, or line doesn't start with '[' (%s)", line_no
, line
);
413 ltime
= strtoul(line
+ 1, &ptr
, 10);
414 if (line
+ 1 == ptr
) {
415 crash("Failed to parse log timestamp from '%s'. I can't handle malformed logdata", line
);
419 /* only print lines in the interesting interval */
420 if ((first_time
&& ltime
< first_time
) || (last_time
&& ltime
> last_time
))
423 /* more heuristics should go below, but we remain lazy for now */
427 if (ltime
< last_ltime
) {
434 * Incremental will be 0 if not set, or 1 if set but
435 * the database is currently empty.
436 * Note that this will not always do the correct thing,
437 * as downtime entries that might have been scheduled for
438 * purging may never show up as "stopped" in the database
439 * with this scheme. As such, incremental imports absolutely
440 * require that nothing is in scheduled downtime when the
441 * import is running (well, started really, but it amounts
442 * to the same thing).
444 if (ltime
< incremental
)
447 while (*ptr
== ']' || *ptr
== ' ')
450 if (!is_interesting(ptr
))
453 if (!(colon
= strchr(ptr
, ':'))) {
454 /* stupid heuristic, but might be good for something,
455 * somewhere, sometime. if nothing else, it should suppress
457 if (is_start_event(ptr
)) {
458 daemon_start
= ltime
;
459 daemon_is_running
= 1;
462 if (is_stop_event(ptr
)) {
463 daemon_is_running
= 0;
469 * An unhandled event. We should probably crash here
471 handle_unknown_event(line
);
475 /* an event happened without us having gotten a start-event */
476 if (!daemon_is_running
) {
477 daemon_start
= ltime
;
478 daemon_is_running
= 1;
481 if (!(sc
= get_event_type(ptr
, colon
- ptr
))) {
482 handle_unknown_event(line
);
486 if (sc
->code
== IGNORE_LINE
)
497 nvecs
= vectorize_string(ptr
, sc
->nvecs
);
499 if (nvecs
!= sc
->nvecs
) {
501 warn("Line %d in %s seems to not have all the fields it should",
502 line_no
, cur_file
->path
);
506 for (i
= 0; i
< sc
->nvecs
; i
++) {
508 /* this should never happen */
509 warn("Line %d in %s seems to be broken, or we failed to parse it into a vector",
510 line_no
, cur_file
->path
);
519 case NEBTYPE_EXTERNALCOMMAND_END
:
520 semi_colon
= strchr(ptr
, ';');
523 if (!(sc
= get_command_type(ptr
, semi_colon
- ptr
))) {
527 nvecs
= vectorize_string(semi_colon
+ 1, sc
->nvecs
);
528 if (nvecs
!= sc
->nvecs
) {
529 warn("nvecs discrepancy: %d vs %d (%s)\n", nvecs
, sc
->nvecs
, ptr
);
533 case NEBTYPE_HOSTCHECK_PROCESSED
:
534 case NEBTYPE_SERVICECHECK_PROCESSED
:
537 case NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_HOST
:
538 case NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_SERVICE
:
549 * Returns an increasing numeric value for a nagios logfile
550 * For a file with a name such as:
551 * nagios-12-01-2002-00.log
/*
 * Derives a sortable number from a log file path. Paths shorter than 18
 * characters or not ending in ".log" are special-cased, as is "nagios.log"
 * (or a run with a single input file). Otherwise NUM_PARTS dash-separated
 * numbers are read with strtoul() and range checked as 1-12, 1-31 and
 * 2000-2008 - presumably month, day and year - before being combined as
 * part[2] * 1000000 + part[0] * 10000 + part[1] * 100, plus whatever the
 * lines missing from this listing add.
 * NOTE(review): the upper bound of 2008 on part[2] treats every later year
 * as out of range.
 * NOTE(review): dash is taken from strrchr() on the slash and then passed
 * to strcmp(); confirm that a path without a slash is handled in the lines
 * missing from this listing.
 */
556 static size_t path_cmp_number(char *path
)
558 size_t ret
, len
= strlen(path
);
561 unsigned long part
[NUM_PARTS
];
563 if (len
< 18 || strcmp(&path
[len
- 4], ".log"))
565 dash
= strrchr(path
, '/');
571 * we special-case nagios.log as always being the
572 * last file to be parsed. It has to be, since it's
573 * the currently active logfile
575 if (!strcmp(dash
, "nagios.log") || num_nfile
== 1)
578 for (i
= 0; i
< NUM_PARTS
; i
++) {
581 dash
= strchr(dash
, '-');
583 crash("dash is not");
586 part
[i
] = strtoul(dash
, &endp
, 10);
587 if (!part
[i
] && dash
== endp
)
593 if (part
[0] < 1 || part
[0] > 12)
595 if (part
[1] < 1 || part
[1] > 31)
597 if (part
[2] < 2000 || part
[2] > 2008)
599 ret
= part
[2] * 1000000;
600 ret
+= part
[0] * 10000;
601 ret
+= part
[1] * 100;
/*
 * min(a, b) yields the smaller argument and evaluates its arguments more
 * than once.
 * first_log_time() opens nf->path read-only, records st_size in nf->size,
 * reads the head of the file into buf, skips leading CR/LF characters,
 * parses the first timestamp into nf->first with strtotimet() and stores
 * the result of path_cmp_number() in nf->cmp. Every failure goes through
 * crash().
 * NOTE(review): open() reports failure with -1, so !(fd = open(...)) is
 * only true for descriptor 0 and never for a failed open. The test should
 * be fd < 0.
 * NOTE(review): min(sizeof(buf), st.st_size) mixes an unsigned size with a
 * signed off_t, and the result is compared with the signed return value of
 * read(). On common LP64 targets a read() error of -1 converts to a huge
 * unsigned value and slips past the check.
 * NOTE(review): no close(fd) is visible in this listing; confirm the
 * descriptor is released.
 */
607 #define min(a, b) ((a) < (b) ? (a) : (b))
608 static void first_log_time(struct naglog_file
*nf
)
614 if (!(fd
= open(nf
->path
, O_RDONLY
)))
615 crash("Failed to open %s: %s", nf
->path
, strerror(errno
));
618 * since we're looking at every file in here anyway,
619 * we also determine the size of them so we can do an
620 * arena allocation large enough to fit the largest
621 * file + an added newline later
623 if (fstat(fd
, &st
) < 0)
624 crash("Failed to stat %s: %s", nf
->path
, strerror(errno
));
626 nf
->size
= st
.st_size
;
628 if (read(fd
, buf
, sizeof(buf
)) < min(sizeof(buf
), st
.st_size
))
629 crash("Incomplete read of %s", nf
->path
);
631 buf
[sizeof(buf
) - 1] = 0;
632 /* skip empty lines at top of file */
633 while (i
< sizeof(buf
) - 12 && (buf
[i
] == '\n' || buf
[i
] == '\r'))
636 if (strtotimet(buf
+ i
+ 1, &nf
->first
))
637 crash("'%s' has no timestamp for us to parse", buf
);
639 nf
->cmp
= path_cmp_number(nf
->path
);
/*
 * qsort() comparator for struct naglog_file entries; main() passes it when
 * sorting nfile. It compares the first timestamps of the two files in both
 * directions and calls crash() when two files tie on both first and cmp.
 * NOTE(review): the returned values and the cmp comparison itself are not
 * part of this listing.
 */
643 int nfile_cmp(const void *p1
, const void *p2
)
645 const struct naglog_file
*a
= p1
;
646 const struct naglog_file
*b
= p2
;
648 if (a
->first
> b
->first
)
650 if (b
->first
> a
->first
)
658 crash("Two files with same 'first' and 'cmp'? Bizarre...");
665 * hashes one line from an "interesting"-file. We use (void *)1
666 * to mark this as "present in hash-table" as we have no real
667 * data to lookup but still want hash_find{,2} to return non-NULL
668 * when it finds a match
/*
 * Line callback used by hash_interesting(). It looks for the first
 * semicolon in line and then either adds the (line, p) pair to
 * interesting_services or adds line to interesting_hosts. Each table is
 * created on first use with hash_init(16384), and (void *)1 is stored as
 * the value so that lookups return non-NULL.
 * NOTE(review): the test that picks the branch is not visible; presumably
 * a line containing a semicolon is a host;service pair - confirm.
 */
670 static int hash_one_line(char *line
, size_t len
)
674 p
= strchr(line
, ';');
677 if (!interesting_services
)
678 interesting_services
= hash_init(16384);
679 hash_add2(interesting_services
, line
, p
, (void *)1);
682 if (!interesting_hosts
)
683 interesting_hosts
= hash_init(16384);
685 hash_add(interesting_hosts
, line
, (void *)1);
/*
 * Loads a file of interesting hosts and services: stats path to get its
 * size, calls crash() if stat() fails, then feeds the file to lparse_path()
 * with hash_one_line() as the per-line callback.
 */
691 static int hash_interesting(const char *path
)
695 if (stat(path
, &st
) < 0)
696 crash("failed to stat %s: %s", path
, strerror(errno
));
698 lparse_path(path
, st
.st_size
, hash_one_line
);
/*
 * Program entry point. From the fragments visible in this listing it:
 *  - allocates strv (MAX_NVECS pointers) and nfile (argc - 1 entries);
 *  - walks argv, taking an option value either from the text after an
 *    equals sign or from the following argument, and handles
 *    --ignore-process-events, --debug / -d, --interesting / -i, --first
 *    and --last; anything else is treated as a log file and appended to
 *    nfile;
 *  - sorts nfile with qsort() and nfile_cmp(), records import_start, and
 *    runs lparse_path() with parse_line() on every file that falls inside
 *    the first_time / last_time window;
 *  - prints the warning total when debugging is on and finishes with
 *    print_unhandled_events().
 * NOTE(review): calloc() takes (count, size); the two calls pass the size
 * first. The product is the same, so this is cosmetic.
 * NOTE(review): options are matched with prefixcmp(), so "-d" and "-i"
 * also match any argument that merely begins with those two characters.
 * NOTE(review): the import debug message prints nf->first with "%lu" and
 * nf->cmp with "%u", and the warning total prints an unsigned int with
 * "%d"; confirm these conversions against the real member types.
 */
703 extern const char *__progname
;
704 int main(int argc
, char **argv
)
707 struct naglog_file
*nfile
;
709 strv
= calloc(sizeof(char *), MAX_NVECS
);
710 nfile
= calloc(sizeof(*nfile
), argc
- 1);
712 crash("Failed to alloc initial structs");
715 for (num_nfile
= 0,i
= 1; i
< argc
; i
++) {
716 char *opt
, *arg
= argv
[i
];
717 struct naglog_file
*nf
;
720 if ((opt
= strchr(arg
, '='))) {
724 else if (i
< argc
- 1) {
728 if (!prefixcmp(arg
, "--ignore-process-events")) {
729 ignore_process_events
= 1;
732 if (!prefixcmp(arg
, "--debug") || !prefixcmp(arg
, "-d")) {
736 if (!prefixcmp(arg
, "--interesting") || !prefixcmp(arg
, "-i")) {
738 crash("%s requires a filename as argument", arg
);
739 hash_interesting(opt
);
744 if (!prefixcmp(arg
, "--first") || !prefixcmp(arg
, "--last")) {
748 crash("%s requires a timestamp as argument", arg
);
749 when
= strtoul(opt
, NULL
, 0);
752 if (!prefixcmp(arg
, "--first"))
759 /* non-argument, so treat as file */
760 nf
= &nfile
[num_nfile
++];
766 log_grok_var("logfile", "/dev/null");
767 log_grok_var("log_levels", "warn");
770 crash("Usage: %s [--incremental] [--interesting <file>] [--truncate-db] logfiles\n",
774 crash("log_init() failed");
776 qsort(nfile
, num_nfile
, sizeof(*nfile
), nfile_cmp
);
778 gettimeofday(&import_start
, NULL
);
780 for (i
= 0; i
< num_nfile
; i
++) {
781 struct naglog_file
*nf
= &nfile
[i
];
782 if (last_time
&& nf
->first
> last_time
) {
783 debug("ignoring %s\n", nf
->path
);
786 if (first_time
&& i
< num_nfile
- 1 && nfile
[i
+ 1].first
< first_time
) {
787 debug("ignoring %s\n", nf
->path
);
792 debug("importing from %s (%lu : %u)\n", nf
->path
, nf
->first
, nf
->cmp
);
794 lparse_path(nf
->path
, nf
->size
, parse_line
);
796 imported
++; /* make up for one lost byte per file */
799 if (warnings
&& debug_level
)
800 fprintf(stderr
, "Total warnings: %d\n", warnings
);
802 print_unhandled_events();