Augment end_progress() with amount of bytes skipped
[nagios-reports-module.git] / import.c
blob2412cb421b2c8430619bf65109908960e9777e95
1 #define _GNU_SOURCE 1
2 #include <sys/types.h>
3 #include <signal.h>
5 #include "nagios/broker.h"
6 #include "nagios/nebcallbacks.h"
7 #include "sql.h"
8 #include "hooks.h"
9 #include "logging.h"
10 #include "hash.h"
11 #include "lparse.h"
12 #include "logutils.h"
13 #include "cfgfile.h"
15 #define IGNORE_LINE 0
17 #define CONCERNS_HOST 50
18 #define CONCERNS_SERVICE 60
20 #define MAX_NVECS 16
21 #define HASH_TABLE_SIZE 128
23 /* for some reason these aren't defined inside Nagios' headers */
24 #define SERVICE_OK 0
25 #define SERVICE_WARNING 1
26 #define SERVICE_CRITICAL 2
27 #define SERVICE_UNKNOWN 3
29 #define PROGRESS_INTERVAL 500 /* lines to parse between progress updates */
32 static int only_notifications;
33 static uint64_t imported, totsize, totlines, skipped;
34 static int lines_since_progress, do_progress;
35 static struct timeval import_start;
36 static time_t daemon_start, daemon_stop, incremental;
37 static int daemon_is_running;
38 static uint max_dt_depth, skipped_files;
40 static time_t next_dt_purge; /* when next to purge expired downtime */
41 #define DT_PURGE_GRACETIME 300 /* seconds to add to next_dt_purge */
43 static time_t ltime; /* the timestamp from the current log-line */
45 static int dt_start, dt_stop, dt_skip;
46 #define dt_depth (dt_start - dt_stop)
47 static hash_table *host_downtime;
48 static hash_table *service_downtime;
49 static int downtime_id;
50 static time_t probably_ignore_downtime;
/*
 * One downtime record. The same layout is used both for scheduling
 * commands stashed in dentry[] and for active downtimes kept in the
 * host_downtime / service_downtime hash tables.
 */
struct downtime_entry {
	int id;        /* id passed along to the downtime hook */
	int code;      /* command code of the scheduling command */
	char *host;    /* strdup()'ed host name, may be NULL for group commands */
	char *service; /* strdup()'ed service description, NULL for host downtime */
	time_t start;  /* scheduled start, parsed from the command */
	time_t stop;   /* scheduled end, parsed from the command */
	int fixed;     /* the command's "fixed" field as a digit value */
	time_t duration; /* duration in seconds */
	time_t started;  /* log timestamp of the matching downtime alert */
	time_t ended;
	int purged;    /* set once remove_downtime() has handled the entry */
	int trigger;   /* non-zero if the command carried a trigger id other than 0 */
	int slot;      /* index of the dentry[] chain the entry is stashed in */
	struct downtime_entry *next; /* next entry in the same dentry[] chain */
};
69 #define NUM_DENTRIES 1024
70 static struct downtime_entry **dentry;
71 static time_t last_downtime_start;
73 static struct string_code event_codes[] = {
74 add_ignored("Error"),
75 add_ignored("Warning"),
76 add_ignored("LOG ROTATION"),
77 add_ignored("HOST FLAPPING ALERT"),
78 add_ignored("SERVICE FLAPPING ALERT"),
79 add_ignored("SERVICE EVENT HANDLER"),
80 add_ignored("HOST EVENT HANDLER"),
81 add_ignored("LOG VERSION"),
83 add_code(5, "HOST NOTIFICATION", NEBTYPE_NOTIFICATION_END + CONCERNS_HOST),
84 add_code(6, "SERVICE NOTIFICATION", NEBTYPE_NOTIFICATION_END + CONCERNS_SERVICE),
85 add_code(3, "PASSIVE HOST CHECK", NEBTYPE_HOSTCHECK_PROCESSED),
86 add_code(4, "PASSIVE SERVICE CHECK", NEBTYPE_SERVICECHECK_PROCESSED),
87 add_code(0, "EXTERNAL COMMAND", NEBTYPE_EXTERNALCOMMAND_END),
88 add_code(5, "HOST ALERT", NEBTYPE_HOSTCHECK_PROCESSED),
89 add_code(5, "INITIAL HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED),
90 add_code(5, "CURRENT HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED),
91 add_code(6, "SERVICE ALERT", NEBTYPE_SERVICECHECK_PROCESSED),
92 add_code(6, "INITIAL SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED),
93 add_code(6, "CURRENT SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED),
94 add_code(3, "HOST DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD + CONCERNS_HOST),
95 add_code(4, "SERVICE DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD + CONCERNS_SERVICE),
96 { 0, NULL, 0, 0 },
99 static struct string_code command_codes[] = {
100 add_cdef(1, DEL_HOST_DOWNTIME),
101 add_cdef(1, DEL_SVC_DOWNTIME),
102 add_cdef(8, SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME),
103 add_cdef(8, SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME),
104 add_cdef(8, SCHEDULE_HOSTGROUP_HOST_DOWNTIME),
105 add_cdef(8, SCHEDULE_HOSTGROUP_SVC_DOWNTIME),
106 add_cdef(8, SCHEDULE_HOST_DOWNTIME),
107 add_cdef(8, SCHEDULE_HOST_SVC_DOWNTIME),
108 add_cdef(8, SCHEDULE_SERVICEGROUP_HOST_DOWNTIME),
109 add_cdef(8, SCHEDULE_SERVICEGROUP_SVC_DOWNTIME),
110 add_cdef(8, SCHEDULE_SVC_DOWNTIME),
113 * These really have one more field than listed here. We omit one
114 * to make author and comment concatenated with a semi-colon by default.
116 add_cdef(6, ACKNOWLEDGE_SVC_PROBLEM),
117 add_cdef(5, ACKNOWLEDGE_HOST_PROBLEM),
118 { 0, NULL, 0, 0 },
/*
 * Debug helper: prints the first n entries of the string vector v,
 * one per line, prefixed with their index.
 */
static inline void print_strvec(char **v, int n)
{
	int idx = 0;

	while (idx < n) {
		printf("v[%2d]: %s\n", idx, v[idx]);
		idx++;
	}
}
/*
 * Formats a byte count as a human readable string ("512 bytes",
 * "2.00 KiB", "5.00 TiB", ...).
 *
 * The result lives in one of two static buffers that are used in
 * turn, so two results can be passed to the same printf() call. A
 * returned string is overwritten by the second following call.
 */
static const char *tobytes(uint64_t n)
{
	/* an array, not a pointer, so that sizeof() yields the suffix count */
	static const char suffix[] = "KMGTP";
	static char tbuf[2][30];
	static int t = 0;
	unsigned int shift = 1;

	t ^= 1;
	if (n < 1024) {
		snprintf(tbuf[t], sizeof(tbuf[t]), "%llu bytes", (unsigned long long)n);
		return tbuf[t];
	}

	/* step up one unit for as long as the value would print as >= 1024 */
	while ((n >> (shift * 10)) >= 1024 && shift < sizeof(suffix) - 1)
		shift++;

	/* 64-bit divisor: a plain int shift overflows from TiB and up */
	snprintf(tbuf[t], sizeof(tbuf[t]), "%0.2f %ciB",
	         (double)n / (double)((uint64_t)1 << (shift * 10)), suffix[shift - 1]);

	return tbuf[t];
}
/*
 * Formats the time elapsed between start and stop as
 * "[Dd ][Hh ][Mm ]S.mmms", leaving out leading units that are zero.
 *
 * Returns a pointer to a static buffer which is overwritten by the
 * next call. A stop time earlier than start is reported as 0.
 */
static const char *tv_delta(struct timeval *start, struct timeval *stop)
{
	static char buf[30];
	double secs;
	unsigned int days, hours, mins;

	/*
	 * Fold the micro-seconds in before splitting into units. Adding
	 * them afterwards can push the seconds part below zero when
	 * stop->tv_usec < start->tv_usec.
	 */
	secs = (double)(stop->tv_sec - start->tv_sec)
	     + (double)(stop->tv_usec - start->tv_usec) / 1000000.0;
	if (secs < 0)
		secs = 0;

	days = secs / 86400;
	secs -= days * 86400.0;
	hours = secs / 3600;
	secs -= hours * 3600.0;
	mins = secs / 60;
	secs -= mins * 60.0;

	if (!mins && !hours && !days) {
		snprintf(buf, sizeof(buf), "%.3lfs", secs);
	} else if (!hours && !days) {
		snprintf(buf, sizeof(buf), "%um %.3lfs", mins, secs);
	} else if (!days) {
		snprintf(buf, sizeof(buf), "%uh %um %.3lfs", hours, mins, secs);
	} else {
		snprintf(buf, sizeof(buf), "%ud %uh %um %.3lfs", days, hours, mins, secs);
	}

	return buf;
}
183 static void show_progress(void)
185 time_t eta, elapsed;
186 float pct_done, real_pct_done;
188 totlines += lines_since_progress;
189 lines_since_progress = 0;
191 if (!do_progress)
192 return;
194 elapsed = time(NULL) - import_start.tv_sec;
195 if (!elapsed)
196 elapsed = 1;
198 real_pct_done = (float)imported / (float)(totsize - skipped) * 100;
199 pct_done = ((float)(imported + skipped) / (float)totsize) * 100;
200 eta = (elapsed / real_pct_done) * (100.0 - real_pct_done);
202 printf("\rImporting data: %.2f%% (%s) done ",
203 pct_done, tobytes(imported + skipped));
204 if (elapsed > 10) {
205 printf("ETA: ");
206 if (eta > 60)
207 printf("%lum%lus", eta / 60, eta % 60);
208 else
209 printf("%lus", eta);
211 printf(" ");
214 static void end_progress(void)
216 struct timeval tv;
218 gettimeofday(&tv, NULL);
221 * If any of the logfiles doesn't have a newline
222 * at end of file, imported will be slightly off.
223 * We set it hard here so as to make sure that
224 * the final progress output stops at exactly 100%
226 imported = totsize - skipped;
228 show_progress();
229 putchar('\n');
230 printf("%s, %llu lines imported in %s.",
231 tobytes(totsize), totlines, tv_delta(&import_start, &tv));
232 if (skipped)
233 printf(" %s in %u files skipped.", tobytes(skipped), skipped_files);
234 putchar('\n');
239 static int use_sql = 1;
240 static int insert_downtime_event(int type, char *host, char *service, int id)
242 nebstruct_downtime_data ds;
243 int result;
245 if (!is_interesting_service(host, service))
246 return 0;
248 dt_start += type == NEBTYPE_DOWNTIME_START;
249 dt_stop += type == NEBTYPE_DOWNTIME_STOP;
250 if (dt_depth > max_dt_depth)
251 max_dt_depth = dt_depth;
253 if (!use_sql || only_notifications)
254 return 0;
256 memset(&ds, 0, sizeof(ds));
258 ds.type = type;
259 ds.timestamp.tv_sec = ltime;
260 ds.host_name = host;
261 ds.service_description = service;
262 ds.downtime_id = id;
264 result = hook_downtime(NEBCALLBACK_DOWNTIME_DATA, (void *)&ds);
265 if (result < 0)
266 crash("Failed to insert downtime:\n type=%d, host=%s, service=%s, id=%d",
267 type, host, service, id);
269 return result;
/* The pieces of a notification log line that parse_import_notification() fills in */
typedef struct import_notification {
	int type;   /* SERVICE_NOTIFICATION or HOST_NOTIFICATION */
	int reason; /* result of parse_notification_reason() */
	int state;  /* parsed service or host state */
} import_notification;
276 static int parse_import_notification(char *str, import_notification *n)
278 char *state_str = str;
280 n->reason = parse_notification_reason(str);
281 if (n->reason != NOTIFICATION_NORMAL) {
282 char *space, *paren;
284 space = strchr(str, ' ');
285 if (!space)
286 return -1;
287 paren = strchr(space, ')');
288 if (!paren)
289 return -1;
290 *paren = '\0';
292 state_str = space + 2;
295 n->type = SERVICE_NOTIFICATION;
296 n->state = parse_service_state_gently(state_str);
297 if (n->state < 0) {
298 n->type = HOST_NOTIFICATION;
299 n->state = parse_host_state_gently(state_str);
302 return 0;
305 static int insert_notification(struct string_code *sc)
307 int base_idx;
308 const char *desc;
309 struct import_notification n;
311 if (!only_notifications)
312 return 0;
314 if (sc->code - NEBTYPE_NOTIFICATION_END == CONCERNS_SERVICE) {
315 base_idx = 1;
316 desc = strv[2];
317 } else {
318 base_idx = 0;
319 desc = 0;
321 if (parse_import_notification(strv[base_idx + 2], &n) < 0) {
322 handle_unknown_event(strv[base_idx + 2]);
323 return 0;
326 if (!use_sql)
327 return 0;
329 return sql_query
330 ("INSERT INTO %s.%s("
331 "notification_type, start_time, end_time, contact_name, "
332 "host_name, service_description, "
333 "command_name, output, "
334 "state, reason_type) "
335 "VALUES("
336 "%d, %lu, %lu, '%s', "
337 "'%s', '%s', "
338 "'%s', '%s', "
339 "%d, %d)",
340 sql_db_name(), sql_table_name(),
341 n.type, ltime, ltime, sql_escape(strv[0]),
342 sql_escape(strv[1]), desc ? sql_escape(desc) : "",
343 sql_escape(strv[base_idx + 3]), sql_escape(strv[base_idx + 4]),
344 n.state, n.reason);
347 static int insert_service_check(struct string_code *sc)
349 nebstruct_service_check_data ds;
351 if (!is_interesting_service(strv[0], strv[1]))
352 return 0;
354 memset(&ds, 0, sizeof(ds));
356 ds.timestamp.tv_sec = ltime;
357 ds.type = sc->code;
358 ds.host_name = strv[0];
359 ds.service_description = strv[1];
360 if (sc->nvecs == 4) {
361 /* passive service check result */
362 if (*strv[2] >= '0' && *strv[2] <= '9')
363 ds.state = atoi(strv[2]);
364 else
365 ds.state = parse_service_state(strv[2]);
366 ds.state_type = HARD_STATE;
367 ds.current_attempt = 1;
368 ds.output = strv[3];
369 } else {
370 ds.state = parse_service_state(strv[2]);
371 ds.state_type = soft_hard(strv[3]);
372 ds.current_attempt = atoi(strv[4]);
373 ds.output = strv[5];
376 if (!use_sql || only_notifications)
377 return 0;
379 return hook_service_result(NEBCALLBACK_SERVICE_CHECK_DATA, (void *)&ds);
382 static int insert_host_check(struct string_code *sc)
384 nebstruct_host_check_data ds;
386 if (!is_interesting_host(strv[0]))
387 return 0;
389 memset(&ds, 0, sizeof(ds));
391 ds.timestamp.tv_sec = ltime;
392 ds.type = sc->code;
393 ds.host_name = strv[0];
394 if (sc->nvecs == 3) {
395 if (*strv[1] >= '0' && *strv[1] <= '9')
396 ds.state = atoi(strv[1]);
397 else
398 ds.state = parse_host_state(strv[1]);
399 /* passive host check result */
400 ds.output = strv[2];
401 ds.current_attempt = 1;
402 ds.state_type = HARD_STATE;
403 } else {
404 ds.state = parse_host_state(strv[1]);
405 ds.state_type = soft_hard(strv[2]);
406 ds.current_attempt = atoi(strv[3]);
407 ds.output = strv[4];
410 if (!use_sql || only_notifications)
411 return 0;
413 return hook_host_result(NEBCALLBACK_HOST_CHECK_DATA, (void *)&ds);
416 static int insert_process_event(int type)
418 nebstruct_process_data ds;
420 if (!use_sql || only_notifications)
421 return 0;
423 memset(&ds, 0, sizeof(ds));
424 ds.timestamp.tv_sec = ltime;
425 ds.type = type;
426 return hook_process_data(NEBCALLBACK_PROCESS_DATA, (void *)&ds);
/* Acknowledgements are recognized but not imported; always returns 0 */
static int insert_acknowledgement(struct string_code *sc)
{
	(void)sc;

	return 0;
}
434 static void dt_print(char *tpc, time_t when, struct downtime_entry *dt)
436 if (!debug_level)
437 return;
439 printf("%s: time=%lu started=%lu start=%lu stop=%lu duration=%lu id=%d ",
440 tpc, when, dt->started, dt->start, dt->stop, dt->duration, dt->id);
441 printf("%s", dt->host);
442 if (dt->service)
443 printf(";%s", dt->service);
444 putchar('\n');
447 static struct downtime_entry *last_dte;
448 static struct downtime_entry *del_dte;
450 static void remove_downtime(struct downtime_entry *dt);
451 static int del_matching_dt(void *data)
453 struct downtime_entry *dt = data;
455 if (del_dte->id == dt->id) {
456 dt_print("ALSO", 0, dt);
457 remove_downtime(dt);
458 return HASH_WALK_REMOVE;
461 return 0;
464 static void stash_downtime_command(struct downtime_entry *dt)
466 dt->slot = dt->start % NUM_DENTRIES;
467 dt->next = dentry[dt->slot];
468 dentry[dt->slot] = dt;
471 static void remove_downtime(struct downtime_entry *dt)
473 if (!is_interesting_service(dt->host, dt->service))
474 return;
476 insert_downtime_event(NEBTYPE_DOWNTIME_STOP, dt->host, dt->service, dt->id);
478 dt_print("RM_DT", ltime, dt);
479 dt->purged = 1;
482 static struct downtime_entry *
483 dt_matches_command(struct downtime_entry *dt, char *host, char *service)
485 for (; dt; dt = dt->next) {
486 time_t diff;
488 if (ltime > dt->stop || ltime < dt->start) {
489 continue;
492 switch (dt->code) {
493 case SCHEDULE_SVC_DOWNTIME:
494 if (service && strcmp(service, dt->service))
495 continue;
497 /* fallthrough */
498 case SCHEDULE_HOST_DOWNTIME:
499 case SCHEDULE_HOST_SVC_DOWNTIME:
500 if (strcmp(host, dt->host)) {
501 continue;
504 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME:
505 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME:
506 /* these two have host set in dt, but
507 * it will not match all the possible hosts */
509 /* fallthrough */
510 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME:
511 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME:
512 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME:
513 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME:
514 break;
515 default:
516 crash("dt->code not set properly\n");
520 * Once we get here all the various other criteria have
521 * been matched, so we need to check if the daemon was
522 * running when this downtime was supposed to have
523 * started, and otherwise use the daemon start time
524 * as the value to diff against
526 if (daemon_stop < dt->start && daemon_start > dt->start) {
527 debug("Adjusting dt->start (%lu) to (%lu)\n",
528 dt->start, daemon_start);
529 dt->start = daemon_start;
530 if (dt->trigger && dt->duration)
531 dt->stop = dt->start + dt->duration;
534 diff = ltime - dt->start;
535 if (diff < 3 || dt->trigger || !dt->fixed)
536 return dt;
539 return NULL;
542 static struct downtime_entry *
543 find_downtime_command(char *host, char *service)
545 int i;
546 struct downtime_entry *shortcut = NULL;
548 if (last_dte && last_dte->start == ltime) {
549 shortcut = last_dte;
550 // return last_dte;
552 for (i = 0; i < NUM_DENTRIES; i++) {
553 struct downtime_entry *dt;
554 dt = dt_matches_command(dentry[i], host, service);
555 if (dt) {
556 if (shortcut && dt != shortcut)
557 if (debug_level)
558 printf("FIND shortcut no good\n");
559 last_dte = dt;
560 return dt;
564 debug("FIND not\n");
565 return NULL;
568 static int print_downtime(void *data)
570 struct downtime_entry *dt = (struct downtime_entry *)data;
572 dt_print("UNCLOSED", ltime, dt);
574 return 0;
577 static inline void set_next_dt_purge(time_t base, time_t add)
579 if (!next_dt_purge || next_dt_purge > base + add)
580 next_dt_purge = base + add;
582 if (next_dt_purge <= ltime)
583 next_dt_purge = ltime + 1;
586 static inline void add_downtime(char *host, char *service, int id)
588 struct downtime_entry *dt, *cmd, *old;
590 if (!is_interesting_service(host, service))
591 return;
593 dt = malloc(sizeof(*dt));
594 cmd = find_downtime_command(host, service);
595 if (!cmd) {
596 warn("DT with no ext cmd? %lu %s;%s", ltime, host, service);
597 memset(dt, 0, sizeof(*dt));
598 dt->duration = 7200; /* the default downtime duration in nagios */
599 dt->start = ltime;
600 dt->stop = dt->start + dt->duration;
602 else
603 memcpy(dt, cmd, sizeof(*dt));
605 dt->host = strdup(host);
606 dt->id = id;
607 dt->started = ltime;
609 set_next_dt_purge(ltime, dt->duration);
611 if (!service) {
612 dt->service = NULL;
613 old = hash_update(host_downtime, dt->host, dt);
615 else {
616 dt->service = strdup(service);
617 old = hash_update2(service_downtime, dt->host, dt->service, dt);
620 if (old && old != dt) {
621 free(old->host);
622 if (old->service)
623 free(old->service);
624 free(old);
627 dt_print("IN_DT", ltime, dt);
628 insert_downtime_event(NEBTYPE_DOWNTIME_START, dt->host, dt->service, dt->id);
631 static time_t last_host_dt_del, last_svc_dt_del;
632 static int register_downtime_command(struct string_code *sc)
634 struct downtime_entry *dt;
635 char *start_time, *end_time, *duration = NULL;
636 char *host = NULL, *service = NULL, *fixed, *triggered_by = NULL;
637 time_t foo;
639 switch (sc->code) {
640 case DEL_HOST_DOWNTIME:
641 last_host_dt_del = ltime;
642 return 0;
643 case DEL_SVC_DOWNTIME:
644 last_svc_dt_del = ltime;
645 return 0;
647 case SCHEDULE_HOST_DOWNTIME:
648 if (strtotimet(strv[5], &foo))
649 duration = strv[4];
650 /* fallthrough */
651 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME:
652 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME:
653 case SCHEDULE_HOST_SVC_DOWNTIME:
654 host = strv[0];
655 /* fallthrough */
656 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME:
657 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME:
658 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME:
659 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME:
660 start_time = strv[1];
661 end_time = strv[2];
662 fixed = strv[3];
663 if (strtotimet(strv[5], &foo))
664 triggered_by = strv[4];
665 if (!duration)
666 duration = strv[5];
668 break;
670 case SCHEDULE_SVC_DOWNTIME:
671 host = strv[0];
672 service = strv[1];
673 start_time = strv[2];
674 end_time = strv[3];
675 fixed = strv[4];
676 if (strtotimet(strv[6], &foo)) {
677 triggered_by = strv[5];
678 duration = strv[6];
680 else {
681 duration = strv[5];
683 break;
685 default:
686 crash("Unknown downtime type: %d", sc->code);
689 if (!(dt = calloc(sizeof(*dt), 1)))
690 crash("calloc(%u, 1) failed: %s", (uint)sizeof(*dt), strerror(errno));
692 dt->code = sc->code;
693 if (host)
694 dt->host = strdup(host);
695 if (service)
696 dt->service = strdup(service);
698 dt->trigger = triggered_by ? !!(*triggered_by - '0') : 0;
699 if (strtotimet(start_time, &dt->start) || strtotimet(end_time, &dt->stop))
701 print_strvec(strv, sc->nvecs);
702 crash("strtotime(): type: %s; start_time='%s'; end_time='%s'; duration='%s';",
703 command_codes[sc->code - 1].str, start_time, end_time, duration);
707 * sometimes downtime commands can be logged according to
708 * log version 1, while the log still claims to be version 2.
709 * Apparently, this happens when using a daemon supporting
710 * version 2 logging but a downtime command is added that
711 * follows the version 1 standard.
712 * As such, we simply ignore the result of the "duration"
713 * field conversion and just accept that it might not work
715 (void)strtotimet(duration, &dt->duration);
716 dt->fixed = *fixed - '0';
719 * ignore downtime scheduled to take place in the future.
720 * It will be picked up by the module anyways
722 if (dt->start > time(NULL)) {
723 free(dt);
724 return 0;
727 if (dt->duration > time(NULL)) {
728 warn("Bizarrely large duration (%lu)", dt->duration);
730 if (dt->start < ltime) {
731 if (dt->duration && dt->duration > ltime - dt->start)
732 dt->duration -= ltime - dt->start;
734 dt->start = ltime;
736 if (dt->stop < ltime || dt->stop < dt->start) {
737 /* retroactively scheduled downtime, or just plain wrong */
738 dt->stop = dt->start;
739 dt->duration = 0;
742 if (dt->fixed && dt->duration != dt->stop - dt->start) {
743 // warn("duration doesn't match stop - start: (%lu : %lu)",
744 // dt->duration, dt->stop - dt->start);
746 dt->duration = dt->stop - dt->start;
748 else if (dt->duration > 86400 * 14) {
749 warn("Oddly long duration: %lu", dt->duration);
752 debug("start=%lu; stop=%lu; duration=%lu; fixed=%d; trigger=%d; host=%s service=%s\n",
753 dt->start, dt->stop, dt->duration, dt->fixed, dt->trigger, dt->host, dt->service);
755 stash_downtime_command(dt);
756 return 0;
759 static int insert_downtime(struct string_code *sc)
761 int type;
762 struct downtime_entry *dt = NULL;
763 int id = 0;
764 time_t dt_del_cmd;
765 char *host, *service = NULL;
767 host = strv[0];
768 if (sc->nvecs == 4) {
769 service = strv[1];
770 dt = hash_find2(service_downtime, host, service);
772 else
773 dt = hash_find(host_downtime, host);
776 * to stop a downtime we can either get STOPPED or
777 * CANCELLED. So far, I've only ever seen STARTED
778 * for when it actually starts though, and since
779 * the Nagios daemon is reponsible for launching
780 * it, it's unlikely there are more variants of
781 * that string
783 type = NEBTYPE_DOWNTIME_STOP;
784 if (!strcmp(strv[sc->nvecs - 2], "STARTED"))
785 type = NEBTYPE_DOWNTIME_START;
787 switch (type) {
788 case NEBTYPE_DOWNTIME_START:
789 if (dt) {
790 if (!probably_ignore_downtime)
791 dt_print("ALRDY", ltime, dt);
792 return 0;
795 if (probably_ignore_downtime)
796 debug("Should probably ignore this downtime: %lu : %lu %s;%s\n",
797 probably_ignore_downtime, ltime, host, service);
799 if (ltime - last_downtime_start > 1)
800 downtime_id++;
802 id = downtime_id;
803 add_downtime(host, service, id);
804 last_downtime_start = ltime;
805 break;
807 case NEBTYPE_DOWNTIME_STOP:
808 if (!dt) {
810 * this can happen when overlapping downtime entries
811 * occur, and the start event for the second (or nth)
812 * downtime starts before the first downtime has had
813 * a stop event. It basically means we've almost
814 * certainly done something wrong.
816 //printf("no dt. ds.host_name == '%s'\n", ds.host_name);
817 //fprintf(stderr, "CRASHING: %s;%s\n", ds.host_name, ds.service_description);
818 //crash("DOWNTIME_STOP without matching DOWNTIME_START");
819 dt_skip++;
820 return 0;
823 dt_del_cmd = !dt->service ? last_host_dt_del : last_svc_dt_del;
825 if ((ltime - dt_del_cmd) > 1 && dt->duration - (ltime - dt->started) > 60) {
826 debug("Short dt duration (%lu) for %s;%s (dt->duration=%lu)\n",
827 ltime - dt->started, dt->host, dt->service, dt->duration);
829 if (ltime - dt->started > dt->duration + DT_PURGE_GRACETIME)
830 dt_print("Long", ltime, dt);
832 remove_downtime(dt);
834 * Now delete whatever matching downtimes we can find.
835 * this must be here, or we'll recurse like crazy into
836 * remove_downtime(), possibly exhausting the stack
837 * frame buffer
839 del_dte = dt;
840 if (!dt->service)
841 hash_walk_data(host_downtime, del_matching_dt);
842 else
843 hash_walk_data(service_downtime, del_matching_dt);
844 break;
846 default:
847 return -1;
850 return 0;
853 static int dt_purged;
854 static int purge_expired_dt(void *data)
856 struct downtime_entry *dt = data;
858 if (dt->purged) {
859 dt_skip++;
860 return 0;
863 set_next_dt_purge(dt->started, dt->duration);
865 if (ltime + DT_PURGE_GRACETIME > dt->stop) {
866 dt_purged++;
867 debug("PURGE %lu: purging expired dt %d (start=%lu; started=%lu; stop=%lu; duration=%lu; host=%s; service=%s",
868 ltime, dt->id, dt->start, dt->started, dt->stop, dt->duration, dt->host, dt->service);
869 remove_downtime(dt);
870 return HASH_WALK_REMOVE;
872 else {
873 dt_print("PURGED_NOT_TIME", ltime, dt);
876 return 0;
879 static int purged_downtimes;
880 static void purge_expired_downtime(void)
882 int tot_purged = 0;
884 next_dt_purge = 0;
885 dt_purged = 0;
886 hash_walk_data(host_downtime, purge_expired_dt);
887 if (dt_purged)
888 debug("PURGE %d host downtimes purged", dt_purged);
889 tot_purged += dt_purged;
890 dt_purged = 0;
891 hash_walk_data(service_downtime, purge_expired_dt);
892 if (dt_purged)
893 debug("PURGE %d service downtimes purged", dt_purged);
894 tot_purged += dt_purged;
895 if (tot_purged)
896 debug("PURGE total %d entries purged", tot_purged);
898 if (next_dt_purge)
899 debug("PURGE next downtime purge supposed to run @ %lu, in %lu seconds",
900 next_dt_purge, next_dt_purge - ltime);
902 purged_downtimes += tot_purged;
905 static inline void handle_start_event(void)
907 if (!daemon_is_running)
908 insert_process_event(NEBTYPE_PROCESS_START);
910 probably_ignore_downtime = daemon_start = ltime;
911 daemon_is_running = 1;
914 static inline void handle_stop_event(void)
916 if (daemon_is_running) {
917 insert_process_event(NEBTYPE_PROCESS_SHUTDOWN);
918 daemon_is_running = 0;
920 daemon_stop = ltime;
923 static int parse_line(char *line, uint len)
925 char *ptr, *colon;
926 int nvecs = 0;
927 struct string_code *sc;
928 static time_t last_ltime = 0;
930 imported += len + 1; /* make up for 1 lost byte per newline */
932 /* ignore empty lines */
933 if (!len)
934 return 0;
936 if (++lines_since_progress >= PROGRESS_INTERVAL)
937 show_progress();
939 /* skip obviously bogus lines */
940 if (len < 12 || *line != '[') {
941 warn("line %d; len too short, or line doesn't start with '[' (%s)", line_no, line);
942 return -1;
945 ltime = strtoul(line + 1, &ptr, 10);
946 if (line + 1 == ptr) {
947 crash("Failed to parse log timestamp from '%s'. I can't handle malformed logdata", line);
948 return -1;
951 if (ltime < last_ltime) {
952 // warn("ltime < last_ltime (%lu < %lu) by %lu. Compensating...",
953 // ltime, last_ltime, last_ltime - ltime);
954 ltime = last_ltime;
956 else
957 last_ltime = ltime;
960 * Incremental will be 0 if not set, or 1 if set but
961 * the database is currently empty.
962 * Note that this will not always do the correct thing,
963 * as downtime entries that might have been scheduled for
964 * purging may never show up as "stopped" in the database
965 * with this scheme. As such, incremental imports absolutely
966 * require that nothing is in scheduled downtime when the
967 * import is running (well, started really, but it amounts
968 * to the same thing).
970 if (ltime < incremental)
971 return 0;
973 if (next_dt_purge && ltime >= next_dt_purge)
974 purge_expired_downtime();
976 if (probably_ignore_downtime && ltime - probably_ignore_downtime > 1)
977 probably_ignore_downtime = 0;
979 while (*ptr == ']' || *ptr == ' ')
980 ptr++;
982 if (!is_interesting(ptr))
983 return 0;
985 if (!(colon = strchr(ptr, ':'))) {
986 /* stupid heuristic, but might be good for something,
987 * somewhere, sometime. if nothing else, it should suppress
988 * annoying output */
989 if (is_start_event(ptr)) {
990 handle_start_event();
991 return 0;
993 if (is_stop_event(ptr)) {
994 handle_stop_event();
995 return 0;
999 * An unhandled event. We should probably crash here
1001 handle_unknown_event(line);
1002 return -1;
1005 /* an event happened without us having gotten a start-event */
1006 if (!daemon_is_running) {
1007 insert_process_event(NEBTYPE_PROCESS_START);
1008 daemon_start = ltime;
1009 daemon_is_running = 1;
1012 if (!(sc = get_event_type(ptr, colon - ptr))) {
1013 handle_unknown_event(line);
1014 return -1;
1017 if (sc->code == IGNORE_LINE)
1018 return 0;
1020 *colon = 0;
1021 ptr = colon + 1;
1022 while (*ptr == ' ')
1023 ptr++;
1025 if (sc->nvecs) {
1026 int i;
1028 nvecs = vectorize_string(ptr, sc->nvecs);
1030 if (nvecs != sc->nvecs) {
1031 /* broken line */
1032 warn("Line %d in %s seems to not have all the fields it should",
1033 line_no, cur_file->path);
1034 return -1;
1037 for (i = 0; i < sc->nvecs; i++) {
1038 if (!strv[i]) {
1039 /* this should never happen */
1040 warn("Line %d in %s seems to be broken, or we failed to parse it into a vector",
1041 line_no, cur_file->path);
1042 return -1;
1047 switch (sc->code) {
1048 char *semi_colon;
1050 case NEBTYPE_EXTERNALCOMMAND_END:
1051 semi_colon = strchr(ptr, ';');
1052 if (!semi_colon)
1053 return 0;
1054 if (!(sc = get_command_type(ptr, semi_colon - ptr))) {
1055 return 0;
1057 if (sc->code == RESTART_PROGRAM) {
1058 handle_stop_event();
1059 return 0;
1062 nvecs = vectorize_string(semi_colon + 1, sc->nvecs);
1063 if (nvecs != sc->nvecs) {
1064 warn("nvecs discrepancy: %d vs %d (%s)\n", nvecs, sc->nvecs, ptr);
1066 if (sc->code != ACKNOWLEDGE_HOST_PROBLEM &&
1067 sc->code != ACKNOWLEDGE_SVC_PROBLEM)
1069 register_downtime_command(sc);
1070 } else {
1071 insert_acknowledgement(sc);
1073 break;
1075 case NEBTYPE_HOSTCHECK_PROCESSED:
1076 return insert_host_check(sc);
1078 case NEBTYPE_SERVICECHECK_PROCESSED:
1079 return insert_service_check(sc);
1081 case NEBTYPE_DOWNTIME_LOAD + CONCERNS_HOST:
1082 case NEBTYPE_DOWNTIME_LOAD + CONCERNS_SERVICE:
1083 return insert_downtime(sc);
1085 case NEBTYPE_NOTIFICATION_END + CONCERNS_HOST:
1086 case NEBTYPE_NOTIFICATION_END + CONCERNS_SERVICE:
1087 return insert_notification(sc);
1089 case IGNORE_LINE:
1090 return 0;
1093 return 0;
1096 static int parse_one_line(char *str, uint len)
1098 if (parse_line(str, len) && use_sql && sql_errno())
1099 crash("sql error: %s", sql_error());
1101 return 0;
1104 static int hash_one_line(char *line, uint len)
1106 return add_interesting_object(line);
1109 static int hash_interesting(const char *path)
1111 struct stat st;
1113 if (stat(path, &st) < 0)
1114 crash("failed to stat %s: %s", path, strerror(errno));
1116 lparse_path(path, st.st_size, hash_one_line);
1118 return 0;
extern const char *__progname;

/*
 * Prints an optional printf-style message followed by the usage text
 * on stdout, then exits: with status 1 if a non-empty message was
 * given, with status 0 otherwise.
 */
__attribute__((__format__(__printf__, 1, 2)))
static void usage(const char *fmt, ...)
{
	static const char *const help[] = {
		" [logfiles] refers to all the nagios logfiles you want to import\n",
		" If --nagios-cfg is given or can be inferred no logfiles need to be supplied\n",
		"\nOptions:\n",
		" --help this cruft\n",
		" --no-progress don't display progress output\n",
		" --no-sql don't access the database\n",
		" --db-name database name\n",
		" --db-table database table name\n",
		" --db-user database user\n",
		" --db-pass database password\n",
		" --incremental perform an incremental import\n",
		" --truncate-db truncate database before importing\n",
		" --only-notifications only import notifications\n",
		" --nagios-cfg=</path/to/nagios.cfg> path to nagios.cfg\n",
		"\n\n",
	};
	int have_msg = fmt && *fmt;
	unsigned int i;

	if (have_msg) {
		va_list ap;

		va_start(ap, fmt);
		vfprintf(stdout, fmt, ap);
		va_end(ap);
	}

	printf("Usage %s [options] [logfiles]\n\n", __progname);
	for (i = 0; i < sizeof(help) / sizeof(help[0]); i++)
		fputs(help[i], stdout);

	/* a message means we got here because of a usage error */
	exit(have_msg ? 1 : 0);
}
1156 int main(int argc, char **argv)
1158 int i, truncate_db = 0;
1159 const char *nagios_cfg = NULL;
1160 char *db_name, *db_user, *db_pass, *db_table;
1162 db_name = db_user = db_pass = db_table = NULL;
1164 do_progress = isatty(fileno(stdout));
1166 strv = calloc(sizeof(char *), MAX_NVECS);
1167 dentry = calloc(sizeof(*dentry), NUM_DENTRIES);
1168 if (!strv || !dentry)
1169 crash("Failed to alloc initial structs");
1172 for (num_nfile = 0,i = 1; i < argc; i++) {
1173 char *opt, *arg = argv[i];
1174 int arg_len, eq_opt = 0;
1176 if ((opt = strchr(arg, '='))) {
1177 *opt++ = '\0';
1178 eq_opt = 1;
1180 else if (i < argc - 1) {
1181 opt = argv[i + 1];
1184 if (!prefixcmp(arg, "-h") || !prefixcmp(arg, "--help")) {
1185 usage(NULL);
1187 if (!prefixcmp(arg, "--incremental")) {
1188 incremental = 1;
1191 * nifty for debugging --incremental skipping log-files
1192 * The value will be overwritten unless --no-sql is also
1193 * in effect
1195 if (eq_opt) {
1196 incremental = strtoul(opt, NULL, 0);
1197 if (!incremental)
1198 usage("--incremental= requires a parameter");
1200 continue;
1202 if (!prefixcmp(arg, "--no-sql")) {
1203 use_sql = 0;
1204 continue;
1206 if (!prefixcmp(arg, "--only-notifications")) {
1207 only_notifications = 1;
1208 db_name = db_name ? db_name : "merlin";
1209 db_user = db_user ? db_user : "merlin";
1210 db_pass = db_pass ? db_pass : "merlin";
1211 db_table = db_table ? db_table : "notification";
1212 continue;
1214 if (!prefixcmp(arg, "--no-progress")) {
1215 do_progress = 0;
1216 continue;
1218 if (!prefixcmp(arg, "--debug") || !prefixcmp(arg, "-d")) {
1219 do_progress = 0;
1220 debug_level++;
1221 continue;
1223 if (!prefixcmp(arg, "--truncate-db")) {
1224 truncate_db = 1;
1225 continue;
1227 if (!prefixcmp(arg, "--nagios-cfg")) {
1228 if (!opt || !*opt) {
1229 crash("%s requires the path to nagios.cfg as argument", arg);
1231 nagios_cfg = opt;
1232 if (opt && !eq_opt)
1233 i++;
1234 continue;
1236 if (!prefixcmp(arg, "--db-name")) {
1237 if (!opt || !*opt)
1238 crash("%s requires a database name as an argument", arg);
1239 db_name = opt;
1240 if (opt && !eq_opt)
1241 i++;
1242 continue;
1244 if (!prefixcmp(arg, "--db-user")) {
1245 if (!opt || !*opt)
1246 crash("%s requires a database username as argument", arg);
1247 db_user = opt;
1248 if (opt && !eq_opt)
1249 i++;
1250 continue;
1252 if (!prefixcmp(arg, "--db-pass")) {
1253 if (!opt || !*opt)
1254 crash("%s requires a database username as argument", arg);
1255 db_pass = opt;
1256 if (opt && !eq_opt)
1257 i++;
1258 continue;
1260 if (!prefixcmp(arg, "--db-table")) {
1261 if (!opt || !*opt)
1262 crash("%s requires a database table name as argument", arg);
1263 db_table = opt;
1264 if (opt && !eq_opt)
1265 i++;
1266 continue;
1268 if (!prefixcmp(arg, "--interesting") || !prefixcmp(arg, "-i")) {
1269 if (!opt || !*opt)
1270 crash("%s requires a filename as argument", arg);
1271 hash_interesting(opt);
1272 if (opt && !eq_opt)
1273 i++;
1274 continue;
1277 /* non-argument, so treat as a config- or log-file */
1278 arg_len = strlen(arg);
1279 if (arg_len >= 10 && !strcmp(&arg[arg_len - 10], "nagios.cfg")) {
1280 nagios_cfg = arg;
1281 } else {
1282 add_naglog_path(arg);
1286 /* fallback for op5 systems */
1287 if (!nagios_cfg && !num_nfile) {
1288 nagios_cfg = "/opt/monitor/etc/nagios.cfg";
1290 if (nagios_cfg) {
1291 struct cfg_comp *conf;
1292 conf = cfg_parse_file(nagios_cfg);
1293 for (i = 0; i < conf->vars; i++) {
1294 struct cfg_var *v = conf->vlist[i];
1295 if (!strcmp(v->key, "log_file")) {
1296 add_naglog_path(v->value);
1298 if (!strcmp(v->key, "log_archive_path")) {
1299 add_naglog_path(v->value);
1304 if (use_sql) {
1305 db_name = db_name ? db_name : "monitor_reports";
1306 db_user = db_user ? db_user : "monitor";
1307 db_pass = db_pass ? db_pass : "monitor";
1308 db_table = db_table ? db_table : "report_data";
1309 sql_config("db_database", db_name);
1310 sql_config("db_user", db_user);
1311 sql_config("db_pass", db_pass);
1312 sql_config("db_table", db_table);
1314 if (sql_init() < 0)
1315 crash("sql_init() failed");
1316 if (truncate_db)
1317 sql_query("TRUNCATE %s", sql_table_name());
1319 if (incremental) {
1320 MYSQL_RES *result;
1321 MYSQL_ROW row;
1322 sql_query("SELECT %s FROM %s.%s ORDER BY %s DESC LIMIT 1",
1323 only_notifications ? "end_time" : "timestamp",
1324 db_name, db_table,
1325 only_notifications ? "end_time" : "timestamp");
1327 if (!(result = sql_get_result()))
1328 crash("Failed to get last timestamp: %s\n", sql_error());
1330 /* someone might use --incremental with an empty
1331 * database. We shouldn't crash in that case */
1332 if ((row = sql_fetch_row(result)))
1333 incremental = strtoul(row[0], NULL, 0);
1335 sql_free_result(result);
1338 * We lock the table we'll be working with and disable
1339 * indexes on it. Otherwise doing the actual inserts
1340 * will take just about forever, as MySQL has to update
1341 * and flush the index cache between each operation.
1343 if (sql_query("ALTER TABLE %s DISABLE KEYS", sql_table_name()))
1344 crash("Failed to disable keys: %s", sql_error());
1345 if (sql_query("LOCK TABLES %s WRITE", sql_table_name()))
1346 crash("Failed to lock table %s: %s", sql_table_name(), sql_error());
1349 log_grok_var("logfile", "/dev/null");
1350 log_grok_var("log_levels", "warn");
1352 if (!num_nfile)
1353 crash("Usage: %s [--incremental] [--interesting <file>] [--truncate-db] logfiles\n",
1354 __progname);
1356 if (log_init() < 0)
1357 crash("log_init() failed");
1359 qsort(nfile, num_nfile, sizeof(*nfile), nfile_cmp);
1361 host_downtime = hash_init(HASH_TABLE_SIZE);
1362 service_downtime = hash_init(HASH_TABLE_SIZE);
1364 if (hook_init() < 0)
1365 crash("Failed to initialize hooks");
1367 /* go through them once to count the total size for progress output */
1368 for (i = 0; i < num_nfile; i++) {
1369 totsize += nfile[i].size;
1372 gettimeofday(&import_start, NULL);
1373 printf("Importing %s of data from %d files\n",
1374 tobytes(totsize), num_nfile);
1376 for (i = 0; i < num_nfile; i++) {
1377 struct naglog_file *nf = &nfile[i];
1378 cur_file = nf;
1379 show_progress();
1382 * skip parsing files if they're not interesting, such
1383 * as during incremental imports.
1384 * 'incremental' will be 0 if we're doing a full import,
1385 * 1 if we're doing an incremental but the database is
1386 * empty and will contain the timestamp of the latest
1387 * entry in the database if we're doing an incremental
1388 * import to a populated table.
1389 * Note that we can never skip the last file in the list,
1390 * although the lparse routine should sift through it
1391 * pretty quickly in case it has nothing interesting.
1393 if (i + 1 < num_nfile && incremental > nfile[i + 1].first) {
1394 skipped_files++;
1395 skipped += nf->size;
1396 continue;
1398 debug("importing from %s (%lu : %u)\n", nf->path, nf->first, nf->cmp);
1399 line_no = 0;
1400 lparse_path(nf->path, nf->size, parse_one_line);
1401 imported++; /* make up for one lost byte per file */
1404 ltime = time(NULL);
1405 purge_expired_downtime();
1406 end_progress();
1408 if (debug_level) {
1409 if (dt_depth) {
1410 printf("Unclosed host downtimes:\n");
1411 puts("------------------------");
1412 hash_walk_data(host_downtime, print_downtime);
1413 printf("Unclosed service downtimes:\n");
1414 puts("---------------------------");
1415 hash_walk_data(service_downtime, print_downtime);
1417 printf("dt_depth: %d\n", dt_depth);
1419 printf("purged downtimes: %d\n", purged_downtimes);
1420 printf("max simultaneous host downtime hashes: %u\n",
1421 hash_entries_max(host_downtime));
1422 printf("max simultaneous service downtime hashes: %u\n",
1423 hash_entries_max(service_downtime));
1424 printf("max downtime depth: %u\n", max_dt_depth);
1427 if (use_sql) {
1428 SQL_RESULT *res;
1429 SQL_ROW row;
1430 time_t start;
1431 unsigned long entries;
1433 sql_query("SELECT id FROM %s ORDER BY id DESC LIMIT 1", sql_table_name());
1434 if (!(res = sql_get_result()))
1435 entries = 0;
1436 else {
1437 row = sql_fetch_row(res);
1438 entries = strtoul(row[0], NULL, 0);
1439 sql_free_result(res);
1442 signal(SIGINT, SIG_IGN);
1443 sql_query("UNLOCK TABLES");
1444 start = time(NULL);
1445 printf("Creating sql table indexes. This will likely take ~%lu seconds\n",
1446 (entries / 50000) + 1);
1447 sql_query("ALTER TABLE %s ENABLE KEYS", sql_table_name());
1448 printf("%lu database entries indexed in %lu seconds\n",
1449 entries, time(NULL) - start);
1450 sql_close();
1453 if (warnings && debug_level)
1454 fprintf(stderr, "Total warnings: %d\n", warnings);
1456 if (debug_level || dt_start > dt_stop) {
1457 uint count;
1458 fprintf(stderr, "Downtime data %s\n started: %d\n stopped: %d\n delta : %d\n skipped: %d\n",
1459 dt_depth ? "mismatch!" : "consistent", dt_start, dt_stop, dt_depth, dt_skip);
1460 hash_debug_table(host_downtime, 0);
1461 hash_debug_table(service_downtime, 0);
1462 if ((count = hash_entries(host_downtime))) {
1463 fprintf(stderr, "host_downtime as %u entries remaining\n", count);
1465 if ((count = hash_entries(service_downtime))) {
1466 fprintf(stderr, "service_downtime has %u entries remaining\n", count);
1470 print_unhandled_events();
1472 return 0;