import: Add support for passive host and service checks
[nagios-reports-module.git] / import.c
blob8182a44df3392965aae840fe9d5c593e385d36a4
1 #define _GNU_SOURCE 1
2 #include <sys/types.h>
3 #include <signal.h>
5 #include <nagios/broker.h>
6 #include <nagios/nebcallbacks.h>
7 #include "sql.h"
8 #include "hooks.h"
9 #include "logging.h"
10 #include "hash.h"
11 #include "lparse.h"
12 #include "logutils.h"
/* Event code for log lines that are recognised but deliberately skipped;
 * parse_line() returns 0 as soon as it sees this code. */
14 #define IGNORE_LINE 0
/* Added to NEBTYPE_DOWNTIME_LOAD in event_codes so that host and service
 * downtime alerts get distinct codes. */
16 #define CONCERNS_HOST 50
17 #define CONCERNS_SERVICE 60
/* Number of char * slots allocated for the strv vector in main(). */
19 #define MAX_NVECS 16
/* Size handed to hash_init() for both downtime hash tables. */
20 #define HASH_TABLE_SIZE 128
22 /* for some reason these aren't defined inside Nagios' headers */
23 #define SERVICE_OK 0
24 #define SERVICE_WARNING 1
25 #define SERVICE_CRITICAL 2
26 #define SERVICE_UNKNOWN 3
28 #define PROGRESS_INTERVAL 500 /* lines to parse between progress updates */
/* imported: bytes consumed so far (advanced per line in parse_line());
 * totsize: sum of the sizes of all log files given on the command line;
 * totlines: lines counted so far (updated in show_progress()). */
31 static uint imported, totsize, totlines;
/* lines_since_progress: lines parsed since the last show_progress() call;
 * do_progress: non-zero when progress output should be printed. */
32 static int lines_since_progress, do_progress;
/* Wall-clock time at which the import loop started (set in main()). */
33 static struct timeval import_start;
/* daemon_start / daemon_stop: log timestamps of the most recent daemon
 * start and stop events; incremental: lines older than this are skipped. */
34 static time_t daemon_start, daemon_stop, incremental;
/* Non-zero while the log indicates that the Nagios daemon is running. */
35 static int daemon_is_running;
/* Highest value dt_depth has reached during the import. */
36 static uint max_dt_depth;
38 static time_t next_dt_purge; /* when next to purge expired downtime */
39 #define DT_PURGE_GRACETIME 300 /* seconds to add to next_dt_purge */
41 static time_t ltime; /* the timestamp from the current log-line */
/* Counters of downtime start and stop events passed through
 * insert_downtime_event(); dt_depth is their difference. */
43 static int dt_start, dt_stop;
44 #define dt_depth (dt_start - dt_stop)
/* Currently active downtimes, keyed by host and by host + service. */
45 static hash_table *host_downtime;
46 static hash_table *service_downtime;
/* Id assigned to started downtimes; bumped in insert_downtime() when a
 * start arrives more than one second after the previous one. */
47 static int downtime_id;
/* Set to the log time of a daemon start event and cleared again by
 * parse_line() once the log has moved more than one second past it. */
48 static time_t probably_ignore_downtime;
/*
 * One downtime record. The same structure is used both for scheduled
 * downtime commands stashed in dentry[] and for active downtimes stored
 * in the host_downtime / service_downtime hash tables.
 */
50 struct downtime_entry {
/* downtime id, assigned in add_downtime() */
51 int id;
/* external command code that scheduled the downtime (sc->code) */
52 int code;
/* strdup()ed host name and service description; service is NULL for
 * host downtimes */
53 char *host;
54 char *service;
/* scheduled start and end, parsed from the command's time fields */
55 time_t start;
56 time_t stop;
/* parsed from the command's "fixed" field */
57 int fixed;
/* duration in seconds */
58 time_t duration;
/* log timestamp at which the downtime actually started */
59 time_t started;
/* not written anywhere in the visible code */
60 time_t ended;
/* set to 1 by remove_downtime() */
61 int purged;
/* non-zero when the command's trigger field was non-zero */
62 int trigger;
/* index into dentry[] chosen by stash_downtime_command() */
63 int slot;
/* next entry in the same dentry[] slot */
64 struct downtime_entry *next;
/* Number of slots in the dentry[] table of stashed downtime commands. */
67 #define NUM_DENTRIES 1024
/* Table of stashed downtime commands, allocated in main(). */
68 static struct downtime_entry **dentry;
/* Log timestamp of the most recently started downtime. */
69 static time_t last_downtime_start;
/*
 * Table of log event names, terminated by an all-zero entry.
 * add_ignored() and add_code() are defined elsewhere; judging by how
 * parse_line() and the insert_*() functions use sc->nvecs and sc->code,
 * the first add_code() argument is presumably the number of fields to
 * split the line into and the last one the event code -- TODO confirm
 * against the macro definitions.
 * NOTE(review): the passive check entries share their code with the
 * regular alert entries; insert_host_check() and insert_service_check()
 * tell them apart by the field count (3 and 4 respectively).
 */
71 static struct string_code event_codes[] = {
72 add_ignored("Error"),
73 add_ignored("Warning"),
74 add_ignored("LOG ROTATION"),
75 add_ignored("HOST NOTIFICATION"),
76 add_ignored("HOST FLAPPING ALERT"),
77 add_ignored("SERVICE NOTIFICATION"),
78 add_ignored("SERVICE FLAPPING ALERT"),
79 add_ignored("SERVICE EVENT HANDLER"),
80 add_ignored("HOST EVENT HANDLER"),
81 add_ignored("LOG VERSION"),
83 add_code(3, "PASSIVE HOST CHECK", NEBTYPE_HOSTCHECK_PROCESSED),
84 add_code(4, "PASSIVE SERVICE CHECK", NEBTYPE_SERVICECHECK_PROCESSED),
85 add_code(0, "EXTERNAL COMMAND", NEBTYPE_EXTERNALCOMMAND_END),
86 add_code(5, "HOST ALERT", NEBTYPE_HOSTCHECK_PROCESSED),
87 add_code(5, "INITIAL HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED),
88 add_code(5, "CURRENT HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED),
89 add_code(6, "SERVICE ALERT", NEBTYPE_SERVICECHECK_PROCESSED),
90 add_code(6, "INITIAL SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED),
91 add_code(6, "CURRENT SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED),
92 add_code(3, "HOST DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD + CONCERNS_HOST),
93 add_code(4, "SERVICE DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD + CONCERNS_SERVICE),
94 { 0, NULL, 0, 0 },
/*
 * Table of external commands the importer handles, terminated by an
 * all-zero entry. add_cdef() is defined elsewhere; its first argument is
 * presumably the number of fields to split the command arguments into
 * (parse_line() compares it with the vectorize_string() result) -- TODO
 * confirm against the macro definition.
 * NOTE(review): register_downtime_command() indexes this table with
 * sc->code - 1 when building an error message, so the entry order
 * presumably has to follow the numeric command codes -- verify.
 */
97 static struct string_code command_codes[] = {
98 add_cdef(1, DEL_HOST_DOWNTIME),
99 add_cdef(1, DEL_SVC_DOWNTIME),
100 add_cdef(8, SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME),
101 add_cdef(8, SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME),
102 add_cdef(8, SCHEDULE_HOSTGROUP_HOST_DOWNTIME),
103 add_cdef(8, SCHEDULE_HOSTGROUP_SVC_DOWNTIME),
104 add_cdef(8, SCHEDULE_HOST_DOWNTIME),
105 add_cdef(8, SCHEDULE_HOST_SVC_DOWNTIME),
106 add_cdef(8, SCHEDULE_SERVICEGROUP_HOST_DOWNTIME),
107 add_cdef(8, SCHEDULE_SERVICEGROUP_SVC_DOWNTIME),
108 add_cdef(8, SCHEDULE_SVC_DOWNTIME),
111 * These really have one more field than listed here. We omit one
112 * to make author and comment concatenated with a semi-colon by default.
114 add_cdef(6, ACKNOWLEDGE_SVC_PROBLEM),
115 add_cdef(5, ACKNOWLEDGE_HOST_PROBLEM),
116 { 0, NULL, 0, 0 },
/*
 * Debug helper: print the first n entries of the string vector v to
 * stdout, one per line, each prefixed with its index.
 *
 * A NULL vector prints nothing. NULL entries are printed as "(null)"
 * rather than being handed to printf()'s %s conversion, which is
 * undefined behaviour; the vector can legitimately contain NULL slots
 * when a log line had fewer fields than expected.
 */
static inline void print_strvec(char **v, int n)
{
	int i;

	if (!v)
		return;

	for (i = 0; i < n; i++)
		printf("v[%2d]: %s\n", i, v[i] ? v[i] : "(null)");
}
/*
 * Format n as a human-readable byte count ("512 bytes", "1.50 KiB",
 * "1.00 MiB", ...).
 *
 * Returns a pointer to one of two static buffers that are used
 * alternately, so the results of two consecutive calls can be used in
 * the same printf() call. Not reentrant.
 */
static const char *tobytes(unsigned int n)
{
	static const char suffix[] = "KMGT";
	static char tbuf[2][30];
	static int t = 0;
	double val;
	int idx = 0;

	/* flip to the buffer that was not returned by the previous call */
	t ^= 1;
	if (n < 1024) {
		snprintf(tbuf[t], sizeof(tbuf[t]), "%u bytes", n);
		return tbuf[t];
	}

	/*
	 * Scale down by 1024 until the value fits the unit. Using >= makes
	 * exact powers of 1024 move to the next unit (1048576 is "1.00 MiB",
	 * not "1024.00 KiB"). Dividing a double by 1024 is exact, and no
	 * shift by a count close to the type width is needed.
	 */
	val = (double)n / 1024.0;
	while (val >= 1024.0 && idx < (int)sizeof(suffix) - 2) {
		val /= 1024.0;
		idx++;
	}

	snprintf(tbuf[t], sizeof(tbuf[t]), "%0.2f %ciB", val, suffix[idx]);

	return tbuf[t];
}
151 static void show_progress(void)
153 time_t eta, elapsed;
154 float pct_done;
156 totlines += lines_since_progress;
157 lines_since_progress = 0;
159 if (!do_progress)
160 return;
162 elapsed = time(NULL) - import_start.tv_sec;
163 if (!elapsed)
164 elapsed = 1;
166 pct_done = ((float)imported / (float)totsize) * 100;
167 eta = (elapsed / pct_done) * (100.0 - pct_done);
169 printf("\rImporting data: %.2f%% (%s) done ",
170 pct_done, tobytes(imported));
171 if (elapsed > 10) {
172 printf("ETA: ");
173 if (eta > 60)
174 printf("%lum%lus", eta / 60, eta % 60);
175 else
176 printf("%lus", eta);
178 printf(" ");
181 static void end_progress(void)
183 struct timeval tv;
184 int mins;
185 float secs;
187 gettimeofday(&tv, NULL);
190 * If any of the logfiles doesn't have a newline
191 * at end of file, imported will be slightly off.
192 * We set it hard here so as to make sure that
193 * the final progress output stops at exactly 100%
195 imported = totsize;
197 show_progress();
198 putchar('\n');
199 secs = (tv.tv_sec - import_start.tv_sec) * 1000000;
200 secs += tv.tv_usec - import_start.tv_usec;
201 mins = (tv.tv_sec - import_start.tv_sec) / 60;
202 secs /= 1000000;
203 secs -= (mins * 60);
204 printf("%s in %u lines imported in ", tobytes(totsize), totlines);
205 if (mins)
206 printf("%dm ", mins);
207 printf("%.3fs\n", secs);
210 static int use_sql = 1;
211 static int insert_downtime_event(int type, char *host, char *service, int id)
213 nebstruct_downtime_data ds;
214 int result;
216 if (!is_interesting_service(host, service))
217 return 0;
219 dt_start += type == NEBTYPE_DOWNTIME_START;
220 dt_stop += type == NEBTYPE_DOWNTIME_STOP;
221 if (dt_depth > max_dt_depth)
222 max_dt_depth = dt_depth;
224 if (!use_sql)
225 return 0;
227 memset(&ds, 0, sizeof(ds));
229 ds.type = type;
230 ds.timestamp.tv_sec = ltime;
231 ds.host_name = host;
232 ds.service_description = service;
233 ds.downtime_id = id;
235 result = hook_downtime(NEBCALLBACK_DOWNTIME_DATA, (void *)&ds);
236 if (result < 0)
237 crash("Failed to insert downtime:\n type=%d, host=%s, service=%s, id=%d",
238 type, host, service, id);
240 return result;
243 static int insert_service_check(struct string_code *sc)
245 nebstruct_service_check_data ds;
247 if (!use_sql)
248 return 0;
250 if (!is_interesting_service(strv[0], strv[1]))
251 return 0;
253 memset(&ds, 0, sizeof(ds));
255 ds.timestamp.tv_sec = ltime;
256 ds.type = sc->code;
257 ds.host_name = strv[0];
258 ds.service_description = strv[1];
259 ds.state = parse_service_state(strv[2]);
260 if (sc->nvecs == 4) {
261 /* passive service check result */
262 ds.state_type = HARD_STATE;
263 ds.current_attempt = 1;
264 ds.output = strv[3];
265 } else {
266 ds.state_type = soft_hard(strv[3]);
267 ds.current_attempt = atoi(strv[4]);
268 ds.output = strv[5];
271 return hook_service_result(NEBCALLBACK_SERVICE_CHECK_DATA, (void *)&ds);
274 static int insert_host_check(struct string_code *sc)
276 nebstruct_host_check_data ds;
278 if (!use_sql)
279 return 0;
281 if (!is_interesting_host(strv[0]))
282 return 0;
284 memset(&ds, 0, sizeof(ds));
286 ds.timestamp.tv_sec = ltime;
287 ds.type = sc->code;
288 ds.host_name = strv[0];
289 ds.state = parse_host_state(strv[1]);
290 if (sc->nvecs == 3) {
291 /* passive host check result */
292 ds.output = strv[2];
293 ds.current_attempt = 1;
294 ds.state_type = HARD_STATE;
295 } else {
296 ds.state_type = soft_hard(strv[2]);
297 ds.current_attempt = atoi(strv[3]);
298 ds.output = strv[4];
301 return hook_host_result(NEBCALLBACK_HOST_CHECK_DATA, (void *)&ds);
304 static int insert_process_event(int type)
306 nebstruct_process_data ds;
308 if (!use_sql)
309 return 0;
311 memset(&ds, 0, sizeof(ds));
312 ds.timestamp.tv_sec = ltime;
313 ds.type = type;
314 return hook_process_data(NEBCALLBACK_PROCESS_DATA, (void *)&ds);
/*
 * Placeholder for acknowledgement commands: the command is accepted
 * but nothing is stored. Always returns 0.
 */
static int insert_acknowledgement(struct string_code *sc)
{
	(void)sc; /* intentionally unused */

	return 0;
}
322 static void dt_print(char *tpc, time_t when, struct downtime_entry *dt)
324 if (!debug_level)
325 return;
327 printf("%s: time=%lu started=%lu start=%lu stop=%lu duration=%lu id=%d ",
328 tpc, when, dt->started, dt->start, dt->stop, dt->duration, dt->id);
329 printf("%s", dt->host);
330 if (dt->service)
331 printf(";%s", dt->service);
332 putchar('\n');
335 static struct downtime_entry *last_dte;
336 static struct downtime_entry *del_dte;
338 static void remove_downtime(struct downtime_entry *dt);
339 static int del_matching_dt(void *data)
341 struct downtime_entry *dt = data;
343 if (del_dte->id == dt->id) {
344 dt_print("ALSO", 0, dt);
345 remove_downtime(dt);
348 return 0;
351 static void stash_downtime_command(struct downtime_entry *dt)
353 dt->slot = dt->start % NUM_DENTRIES;
354 dt->next = dentry[dt->slot];
355 dentry[dt->slot] = dt;
358 static void remove_downtime(struct downtime_entry *dt)
360 struct downtime_entry *old;
362 if (!is_interesting_service(dt->host, dt->service))
363 return;
365 insert_downtime_event(NEBTYPE_DOWNTIME_STOP, dt->host, dt->service, dt->id);
367 if (!dt->service)
368 old = hash_remove(host_downtime, dt->host);
369 else
370 old = hash_remove2(service_downtime, dt->host, dt->service);
372 dt_print("RM_DT", ltime, dt);
373 dt->purged = 1;
/*
 * Walk the list of stashed downtime commands starting at dt and return
 * the first one that can explain a downtime starting now (at ltime) for
 * host / service, or NULL when none matches.
 *
 * An entry is skipped when ltime lies outside its [start, stop] window
 * or when its host / service does not match (only checked for the
 * command types that name a single host or service). A remaining entry
 * matches when ltime is less than 3 seconds after its (possibly
 * adjusted) start, or when it is triggered or not fixed.
 *
 * Side effect: dt->start (and for triggered entries dt->stop) may be
 * moved to daemon_start when the daemon was down at the scheduled start.
 *
 * NOTE(review): for SCHEDULE_SVC_DOWNTIME a NULL service skips the
 * service comparison, so a host downtime alert can match a service
 * downtime command for the same host -- confirm this is intended.
 */
376 static struct downtime_entry *
377 dt_matches_command(struct downtime_entry *dt, char *host, char *service)
379 for (; dt; dt = dt->next) {
380 time_t diff;
/* the command's window does not cover the current log time */
382 if (ltime > dt->stop || ltime < dt->start) {
383 continue;
386 switch (dt->code) {
387 case SCHEDULE_SVC_DOWNTIME:
388 if (service && strcmp(service, dt->service))
389 continue;
391 /* fallthrough */
392 case SCHEDULE_HOST_DOWNTIME:
393 case SCHEDULE_HOST_SVC_DOWNTIME:
394 if (strcmp(host, dt->host)) {
395 continue;
398 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME:
399 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME:
400 /* these two have host set in dt, but
401 * it will not match all the possible hosts */
403 /* fallthrough */
404 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME:
405 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME:
406 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME:
407 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME:
408 break;
409 default:
410 crash("dt->code not set properly\n");
414 * Once we get here all the various other criteria have
415 * been matched, so we need to check if the daemon was
416 * running when this downtime was supposed to have
417 * started, and otherwise use the daemon start time
418 * as the value to diff against
420 if (daemon_stop < dt->start && daemon_start > dt->start) {
421 debug("Adjusting dt->start (%lu) to (%lu)\n",
422 dt->start, daemon_start);
423 dt->start = daemon_start;
424 if (dt->trigger && dt->duration)
425 dt->stop = dt->start + dt->duration;
/* accept starts within 3 seconds of the scheduled time, and any
 * triggered or flexible downtime */
428 diff = ltime - dt->start;
429 if (diff < 3 || dt->trigger || !dt->fixed)
430 return dt;
433 return NULL;
436 static struct downtime_entry *
437 find_downtime_command(char *host, char *service)
439 int i;
440 struct downtime_entry *shortcut = NULL;
442 if (last_dte && last_dte->start == ltime) {
443 shortcut = last_dte;
444 // return last_dte;
446 for (i = 0; i < NUM_DENTRIES; i++) {
447 struct downtime_entry *dt;
448 dt = dt_matches_command(dentry[i], host, service);
449 if (dt) {
450 if (shortcut && dt != shortcut)
451 if (debug_level)
452 printf("FIND shortcut no good\n");
453 last_dte = dt;
454 return dt;
458 debug("FIND not\n");
459 return NULL;
462 static int print_downtime(void *data)
464 struct downtime_entry *dt = (struct downtime_entry *)data;
466 dt_print("UNCLOSED", ltime, dt);
468 return 0;
471 static inline void set_next_dt_purge(time_t base, time_t add)
473 if (!next_dt_purge || next_dt_purge > base + add)
474 next_dt_purge = base + add;
476 if (next_dt_purge <= ltime)
477 next_dt_purge = ltime + 1;
480 static inline void add_downtime(char *host, char *service, int id)
482 struct downtime_entry *dt, *cmd, *old;
484 if (!is_interesting_service(host, service))
485 return;
487 dt = malloc(sizeof(*dt));
488 cmd = find_downtime_command(host, service);
489 if (!cmd) {
490 warn("DT with no ext cmd? %lu %s;%s", ltime, host, service);
491 memset(dt, 0, sizeof(*dt));
492 dt->duration = 7200; /* the default downtime duration in nagios */
493 dt->start = ltime;
494 dt->stop = dt->start + dt->duration;
496 else
497 memcpy(dt, cmd, sizeof(*dt));
499 dt->host = strdup(host);
500 dt->id = id;
501 dt->started = ltime;
503 set_next_dt_purge(ltime, dt->duration);
505 if (!service) {
506 dt->service = NULL;
507 old = hash_update(host_downtime, dt->host, dt);
509 else {
510 dt->service = strdup(service);
511 old = hash_update2(service_downtime, dt->host, dt->service, dt);
514 if (old && old != dt) {
515 free(old->host);
516 if (old->service)
517 free(old->service);
518 free(old);
521 dt_print("IN_DT", ltime, dt);
522 insert_downtime_event(NEBTYPE_DOWNTIME_START, dt->host, dt->service, dt->id);
525 static time_t last_host_dt_del, last_svc_dt_del;
526 static int register_downtime_command(struct string_code *sc)
528 struct downtime_entry *dt;
529 char *start_time, *end_time, *duration = NULL;
530 char *host = NULL, *service = NULL, *fixed, *triggered_by = NULL;
531 time_t foo;
533 switch (sc->code) {
534 case DEL_HOST_DOWNTIME:
535 last_host_dt_del = ltime;
536 return 0;
537 case DEL_SVC_DOWNTIME:
538 last_svc_dt_del = ltime;
539 return 0;
541 case SCHEDULE_HOST_DOWNTIME:
542 if (strtotimet(strv[5], &foo))
543 duration = strv[4];
544 /* fallthrough */
545 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME:
546 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME:
547 case SCHEDULE_HOST_SVC_DOWNTIME:
548 host = strv[0];
549 /* fallthrough */
550 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME:
551 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME:
552 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME:
553 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME:
554 start_time = strv[1];
555 end_time = strv[2];
556 fixed = strv[3];
557 if (strtotimet(strv[5], &foo))
558 triggered_by = strv[4];
559 if (!duration)
560 duration = strv[5];
562 break;
564 case SCHEDULE_SVC_DOWNTIME:
565 host = strv[0];
566 service = strv[1];
567 start_time = strv[2];
568 end_time = strv[3];
569 fixed = strv[4];
570 if (strtotimet(strv[6], &foo)) {
571 triggered_by = strv[5];
572 duration = strv[6];
574 else {
575 duration = strv[5];
577 break;
579 default:
580 crash("Unknown downtime type: %d", sc->code);
583 if (!(dt = calloc(sizeof(*dt), 1)))
584 crash("calloc(%u, 1) failed: %s", (uint)sizeof(*dt), strerror(errno));
586 dt->code = sc->code;
587 if (host)
588 dt->host = strdup(host);
589 if (service)
590 dt->service = strdup(service);
592 dt->trigger = triggered_by ? !!(*triggered_by - '0') : 0;
593 if (strtotimet(start_time, &dt->start) || strtotimet(end_time, &dt->stop))
595 print_strvec(strv, sc->nvecs);
596 crash("strtotime(): type: %s; start_time='%s'; end_time='%s'; duration='%s';",
597 command_codes[sc->code - 1].str, start_time, end_time, duration);
601 * sometimes downtime commands can be logged according to
602 * log version 1, while the log still claims to be version 2.
603 * Apparently, this happens when using a daemon supporting
604 * version 2 logging but a downtime command is added that
605 * follows the version 1 standard.
606 * As such, we simply ignore the result of the "duration"
607 * field conversion and just accept that it might not work
609 (void)strtotimet(duration, &dt->duration);
610 dt->fixed = *fixed - '0';
613 * ignore downtime scheduled to take place in the future.
614 * It will be picked up by the module anyways
616 if (dt->start > time(NULL)) {
617 free(dt);
618 return 0;
621 if (dt->duration > time(NULL)) {
622 warn("Bizarrely large duration (%lu)", dt->duration);
624 if (dt->start < ltime) {
625 if (dt->duration && dt->duration > ltime - dt->start)
626 dt->duration -= ltime - dt->start;
628 dt->start = ltime;
630 if (dt->stop < ltime || dt->stop < dt->start) {
631 /* retroactively scheduled downtime, or just plain wrong */
632 dt->stop = dt->start;
633 dt->duration = 0;
636 if (dt->fixed && dt->duration != dt->stop - dt->start) {
637 // warn("duration doesn't match stop - start: (%lu : %lu)",
638 // dt->duration, dt->stop - dt->start);
640 dt->duration = dt->stop - dt->start;
642 else if (dt->duration > 86400 * 14) {
643 warn("Oddly long duration: %lu", dt->duration);
646 debug("start=%lu; stop=%lu; duration=%lu; fixed=%d; trigger=%d; host=%s service=%s\n",
647 dt->start, dt->stop, dt->duration, dt->fixed, dt->trigger, dt->host, dt->service);
649 stash_downtime_command(dt);
650 return 0;
/*
 * Handle a HOST/SERVICE DOWNTIME ALERT log line whose fields are in the
 * global strv vector.
 *
 * A "STARTED" alert registers a new downtime through add_downtime()
 * unless one is already active for the object. Any other alert is
 * treated as a stop: the active downtime is removed, followed by every
 * other active downtime of the same kind that shares its id.
 *
 * Returns 0 in all cases handled here; the -1 default branch cannot be
 * reached with the two types assigned below.
 */
653 static int insert_downtime(struct string_code *sc)
655 int type;
656 struct downtime_entry *dt = NULL;
657 int id = 0;
658 time_t dt_del_cmd;
659 char *host, *service = NULL;
/* a line split into four fields is a service alert, otherwise it
 * concerns a host; look up any downtime already active for it */
661 host = strv[0];
662 if (sc->nvecs == 4) {
663 service = strv[1];
664 dt = hash_find2(service_downtime, host, service);
666 else
667 dt = hash_find(host_downtime, host);
670 * to stop a downtime we can either get STOPPED or
671 * CANCELLED. So far, I've only ever seen STARTED
672 * for when it actually starts though, and since
673 * the Nagios daemon is reponsible for launching
674 * it, it's unlikely there are more variants of
675 * that string
677 type = NEBTYPE_DOWNTIME_STOP;
678 if (!strcmp(strv[sc->nvecs - 2], "STARTED"))
679 type = NEBTYPE_DOWNTIME_START;
681 switch (type) {
682 case NEBTYPE_DOWNTIME_START:
/* already in downtime: nothing to add */
683 if (dt) {
684 if (!probably_ignore_downtime)
685 dt_print("ALRDY", ltime, dt);
686 return 0;
689 if (probably_ignore_downtime)
690 debug("Should probably ignore this downtime: %lu : %lu %s;%s\n",
691 probably_ignore_downtime, ltime, host, service);
/* starts logged within one second of each other share one id */
693 if (ltime - last_downtime_start > 1)
694 downtime_id++;
696 id = downtime_id;
697 add_downtime(host, service, id);
698 last_downtime_start = ltime;
699 break;
701 case NEBTYPE_DOWNTIME_STOP:
702 if (!dt) {
704 * this can happen when overlapping downtime entries
705 * occur, and the start event for the second (or nth)
706 * downtime starts before the first downtime has had
707 * a stop event. It basically means we've almost
708 * certainly done something wrong.
710 //printf("no dt. ds.host_name == '%s'\n", ds.host_name);
711 //fprintf(stderr, "CRASHING: %s;%s\n", ds.host_name, ds.service_description);
712 //crash("DOWNTIME_STOP without matching DOWNTIME_START");
713 return 0;
/* time of the last delete command of the matching kind */
716 dt_del_cmd = !dt->service ? last_host_dt_del : last_svc_dt_del;
/* debug output only: the downtime ended more than a minute early
 * and no delete command was logged within the last second */
718 if ((ltime - dt_del_cmd) > 1 && dt->duration - (ltime - dt->started) > 60) {
719 debug("Short dt duration (%lu) for %s;%s (dt->duration=%lu)\n",
720 ltime - dt->started, dt->host, dt->service, dt->duration);
722 if (ltime - dt->started > dt->duration + DT_PURGE_GRACETIME)
723 dt_print("Long", ltime, dt);
725 remove_downtime(dt);
727 * Now delete whatever matching downtimes we can find.
728 * this must be here, or we'll recurse like crazy into
729 * remove_downtime(), possibly exhausting the stack
730 * frame buffer
732 del_dte = dt;
733 if (!dt->service)
734 hash_walk_data(host_downtime, del_matching_dt);
735 else
736 hash_walk_data(service_downtime, del_matching_dt);
737 break;
739 default:
740 return -1;
743 return 0;
/* Number of entries removed by purge_expired_dt() during the current
 * hash walk; reset by purge_expired_downtime(). */
746 static int dt_purged;
/*
 * hash_walk_data() callback used by purge_expired_downtime(): removes
 * the visited downtime when it is due for purging and updates
 * next_dt_purge. Entries already marked as purged are skipped.
 * Always returns 0.
 * NOTE(review): the test below fires once ltime + DT_PURGE_GRACETIME
 * exceeds dt->stop, i.e. up to DT_PURGE_GRACETIME seconds before the
 * scheduled stop rather than after it -- confirm this is intended.
 */
747 static int purge_expired_dt(void *data)
749 struct downtime_entry *dt = data;
751 if (dt->purged) {
752 return 0;
755 if (ltime + DT_PURGE_GRACETIME > dt->stop) {
756 dt_purged++;
757 debug("PURGE %lu: purging expired dt %d (start=%lu; started=%lu; stop=%lu; duration=%lu; host=%s; service=%s",
758 ltime, dt->id, dt->start, dt->started, dt->stop, dt->duration, dt->host, dt->service);
759 remove_downtime(dt);
761 else {
762 dt_print("PURGED_NOT_TIME", ltime, dt);
765 set_next_dt_purge(dt->started, dt->duration);
767 return 0;
770 static int purged_downtimes;
771 static void purge_expired_downtime(void)
773 int tot_purged = 0;
775 next_dt_purge = 0;
776 dt_purged = 0;
777 hash_walk_data(host_downtime, purge_expired_dt);
778 if (dt_purged)
779 debug("PURGE %d host downtimes purged", dt_purged);
780 tot_purged += dt_purged;
781 dt_purged = 0;
782 hash_walk_data(service_downtime, purge_expired_dt);
783 if (dt_purged)
784 debug("PURGE %d service downtimes purged", dt_purged);
785 tot_purged += dt_purged;
786 if (tot_purged)
787 debug("PURGE total %d entries purged", tot_purged);
789 if (next_dt_purge)
790 debug("PURGE next downtime purge supposed to run @ %lu, in %lu seconds",
791 next_dt_purge, next_dt_purge - ltime);
793 purged_downtimes += tot_purged;
796 static inline void handle_start_event(void)
798 if (!daemon_is_running)
799 insert_process_event(NEBTYPE_PROCESS_START);
801 probably_ignore_downtime = daemon_start = ltime;
802 daemon_is_running = 1;
805 static inline void handle_stop_event(void)
807 if (daemon_is_running) {
808 insert_process_event(NEBTYPE_PROCESS_SHUTDOWN);
809 daemon_is_running = 0;
811 daemon_stop = ltime;
/*
 * Parse one log line of len bytes and dispatch it to the matching
 * insert_*() / register_*() handler.
 *
 * The line is expected to start with "[<timestamp>]". The timestamp is
 * stored in ltime (never moving backwards), expired downtimes are purged
 * when due, and the event name before the first ':' selects the handler.
 *
 * Returns 0 for lines that were handled or deliberately skipped, -1 for
 * malformed or unknown lines, or the handler's result for check and
 * downtime events. The line buffer is modified in place.
 */
814 static int parse_line(char *line, uint len)
816 char *ptr, *colon;
817 int nvecs = 0;
818 struct string_code *sc;
/* highest timestamp seen so far, kept across calls */
819 static time_t last_ltime = 0;
821 imported += len + 1; /* make up for 1 lost byte per newline */
823 /* ignore empty lines */
824 if (!len)
825 return 0;
827 if (++lines_since_progress >= PROGRESS_INTERVAL)
828 show_progress();
830 /* skip obviously bogus lines */
831 if (len < 12 || *line != '[') {
832 warn("line %d; len too short, or line doesn't start with '[' (%s)", line_no, line);
833 return -1;
836 ltime = strtoul(line + 1, &ptr, 10);
837 if (line + 1 == ptr) {
838 crash("Failed to parse log timestamp from '%s'. I can't handle malformed logdata", line);
839 return -1;
/* timestamps must not go backwards: clamp to the last one seen */
842 if (ltime < last_ltime) {
843 // warn("ltime < last_ltime (%lu < %lu) by %lu. Compensating...",
844 // ltime, last_ltime, last_ltime - ltime);
845 ltime = last_ltime;
847 else
848 last_ltime = ltime;
851 * Incremental will be 0 if not set, or 1 if set but
852 * the database is currently empty.
853 * Note that this will not always do the correct thing,
854 * as downtime entries that might have been scheduled for
855 * purging may never show up as "stopped" in the database
856 * with this scheme. As such, incremental imports absolutely
857 * require that nothing is in scheduled downtime when the
858 * import is running (well, started really, but it amounts
859 * to the same thing).
861 if (ltime < incremental)
862 return 0;
864 if (next_dt_purge && ltime >= next_dt_purge)
865 purge_expired_downtime();
/* stop flagging downtimes once the log is more than a second past
 * the daemon start */
867 if (probably_ignore_downtime && ltime - probably_ignore_downtime > 1)
868 probably_ignore_downtime = 0;
/* skip the closing bracket and the space after the timestamp */
870 while (*ptr == ']' || *ptr == ' ')
871 ptr++;
873 if (!is_interesting(ptr))
874 return 0;
876 if (!(colon = strchr(ptr, ':'))) {
877 /* stupid heuristic, but might be good for something,
878 * somewhere, sometime. if nothing else, it should suppress
879 * annoying output */
880 if (is_start_event(ptr)) {
881 handle_start_event();
882 return 0;
884 if (is_stop_event(ptr)) {
885 handle_stop_event();
886 return 0;
890 * An unhandled event. We should probably crash here
892 handle_unknown_event(line);
893 return -1;
896 /* an event happened without us having gotten a start-event */
897 if (!daemon_is_running) {
898 insert_process_event(NEBTYPE_PROCESS_START);
899 daemon_start = ltime;
900 daemon_is_running = 1;
903 if (!(sc = get_event_type(ptr, colon - ptr))) {
904 handle_unknown_event(line);
905 return -1;
908 if (sc->code == IGNORE_LINE)
909 return 0;
/* terminate the event name and move on to its arguments */
911 *colon = 0;
912 ptr = colon + 1;
913 while (*ptr == ' ')
914 ptr++;
916 if (sc->nvecs) {
917 int i;
919 nvecs = vectorize_string(ptr, sc->nvecs);
921 if (nvecs != sc->nvecs) {
922 /* broken line */
923 warn("Line %d in %s seems to not have all the fields it should",
924 line_no, cur_file->path);
925 return -1;
928 for (i = 0; i < sc->nvecs; i++) {
929 if (!strv[i]) {
930 /* this should never happen */
931 warn("Line %d in %s seems to be broken, or we failed to parse it into a vector",
932 line_no, cur_file->path);
933 return -1;
938 switch (sc->code) {
939 char *semi_colon;
941 case NEBTYPE_EXTERNALCOMMAND_END:
/* external commands look like "NAME;arg;arg..."; commands that are
 * not in command_codes are silently ignored */
942 semi_colon = strchr(ptr, ';');
943 if (!semi_colon)
944 return 0;
945 if (!(sc = get_command_type(ptr, semi_colon - ptr))) {
946 return 0;
948 if (sc->code == RESTART_PROGRAM) {
949 handle_stop_event();
950 return 0;
/* NOTE(review): a field count mismatch is only warned about here;
 * processing continues with whatever strv holds -- confirm the
 * handlers cope with missing fields */
953 nvecs = vectorize_string(semi_colon + 1, sc->nvecs);
954 if (nvecs != sc->nvecs) {
955 warn("nvecs discrepancy: %d vs %d (%s)\n", nvecs, sc->nvecs, ptr);
957 if (sc->code != ACKNOWLEDGE_HOST_PROBLEM &&
958 sc->code != ACKNOWLEDGE_SVC_PROBLEM)
960 register_downtime_command(sc);
961 } else {
962 insert_acknowledgement(sc);
964 break;
966 case NEBTYPE_HOSTCHECK_PROCESSED:
967 return insert_host_check(sc);
969 case NEBTYPE_SERVICECHECK_PROCESSED:
970 return insert_service_check(sc);
972 case NEBTYPE_DOWNTIME_LOAD + CONCERNS_HOST:
973 case NEBTYPE_DOWNTIME_LOAD + CONCERNS_SERVICE:
974 return insert_downtime(sc);
976 case IGNORE_LINE:
977 return 0;
980 return 0;
983 static int parse_one_line(char *str, uint len)
985 if (parse_line(str, len) && use_sql && sql_errno())
986 crash("sql error: %s", sql_error());
988 return 0;
991 static int hash_one_line(char *line, uint len)
993 return add_interesting_object(line);
996 static int hash_interesting(const char *path)
998 struct stat st;
1000 if (stat(path, &st) < 0)
1001 crash("failed to stat %s: %s", path, strerror(errno));
1003 lparse_path(path, st.st_size, hash_one_line);
1005 return 0;
1008 extern const char *__progname;
/*
 * Entry point: parses the command line, optionally prepares the
 * database, imports every log file given as an argument in sorted
 * order, then prints statistics and re-enables the table indexes.
 *
 * Options handled below: --incremental, --no-sql, --no-progress,
 * --debug / -d, --truncate-db, --db-name, --db-user, --db-pass,
 * --db-table and --interesting / -i. Options taking a value accept
 * both "--opt=value" and "--opt value". Anything else is treated as a
 * log file path.
 */
1009 int main(int argc, char **argv)
1011 int i, truncate_db = 0;
1012 struct naglog_file *nfile;
1013 char *db_name = "monitor_reports";
1014 char *db_user = "monitor";
1015 char *db_pass = "monitor";
1016 char *db_table = "report_data";
/* progress output defaults to on only when stdout is a terminal */
1018 do_progress = isatty(fileno(stdout));
/* NOTE(review): with no arguments argc - 1 is 0; calloc() may then
 * return NULL and trigger the allocation error instead of the usage
 * message further down -- verify */
1020 strv = calloc(sizeof(char *), MAX_NVECS);
1021 nfile = calloc(sizeof(*nfile), argc - 1);
1022 dentry = calloc(sizeof(*dentry), NUM_DENTRIES);
1023 if (!strv || !nfile || !dentry)
1024 crash("Failed to alloc initial structs");
1027 for (num_nfile = 0,i = 1; i < argc; i++) {
1028 char *opt, *arg = argv[i];
1029 struct naglog_file *nf;
1030 int eq_opt = 0;
/* "--opt=value": split at '='; otherwise the value, if any, is the
 * next argument */
1032 if ((opt = strchr(arg, '='))) {
1033 *opt++ = '\0';
1034 eq_opt = 1;
1036 else if (i < argc - 1) {
1037 opt = argv[i + 1];
1040 if (!prefixcmp(arg, "--incremental")) {
1041 incremental = 1;
1042 continue;
1044 if (!prefixcmp(arg, "--no-sql")) {
1045 use_sql = 0;
1046 continue;
1048 if (!prefixcmp(arg, "--no-progress")) {
1049 do_progress = 0;
1050 continue;
1052 if (!prefixcmp(arg, "--debug") || !prefixcmp(arg, "-d")) {
1053 do_progress = 0;
1054 debug_level++;
1055 continue;
1057 if (!prefixcmp(arg, "--truncate-db")) {
1058 truncate_db = 1;
1059 continue;
1061 if (!prefixcmp(arg, "--db-name")) {
1062 if (!opt || !*opt)
1063 crash("%s requires a database name as an argument", arg);
1064 db_name = opt;
/* the value was a separate argument: consume it */
1065 if (opt && !eq_opt)
1066 i++;
1067 continue;
1069 if (!prefixcmp(arg, "--db-user")) {
1070 if (!opt || !*opt)
1071 crash("%s requires a database username as argument", arg);
1072 db_user = opt;
1073 if (opt && !eq_opt)
1074 i++;
1075 continue;
1077 if (!prefixcmp(arg, "--db-pass")) {
/* NOTE(review): the message below says "username" although this
 * option sets the password -- likely a copy/paste slip */
1078 if (!opt || !*opt)
1079 crash("%s requires a database username as argument", arg);
1080 db_pass = opt;
1081 if (opt && !eq_opt)
1082 i++;
1083 continue;
1085 if (!prefixcmp(arg, "--db-table")) {
1086 if (!opt || !*opt)
1087 crash("%s requires a database table name as argument", arg);
1088 db_table = opt;
1089 if (opt && !eq_opt)
1090 i++;
1091 continue;
1093 if (!prefixcmp(arg, "--interesting") || !prefixcmp(arg, "-i")) {
1094 if (!opt || !*opt)
1095 crash("%s requires a filename as argument", arg);
1096 hash_interesting(opt);
1097 if (opt && !eq_opt)
1098 i++;
1099 continue;
1102 /* non-argument, so treat as file */
1103 nf = &nfile[num_nfile++];
1104 nf->path = arg;
1105 first_log_time(nf);
1106 totsize += nf->size;
1109 if (use_sql) {
1110 sql_config("db_database", db_name);
1111 sql_config("db_user", db_user);
1112 sql_config("db_pass", db_pass);
1113 sql_config("db_table", db_table);
1115 if (sql_init() < 0)
1116 crash("sql_init() failed");
1117 if (truncate_db)
1118 sql_query("TRUNCATE %s", sql_table_name());
/* for incremental imports, resume from the newest timestamp that is
 * already in the table */
1120 if (incremental) {
1121 MYSQL_RES *result;
1122 MYSQL_ROW row;
1123 sql_query("SELECT timestamp FROM %s.%s ORDER BY timestamp DESC LIMIT 1",
1124 db_name, db_table);
1126 if (!(result = sql_get_result()))
1127 crash("Failed to get last timestamp: %s\n", sql_error());
1129 /* someone might use --incremental with an empty
1130 * database. We shouldn't crash in that case */
1131 if ((row = sql_fetch_row(result)))
1132 incremental = strtoul(row[0], NULL, 0);
1134 sql_free_result(result);
1137 * We lock the table we'll be working with and disable
1138 * indexes on it. Otherwise doing the actual inserts
1139 * will take just about forever, as MySQL has to update
1140 * and flush the index cache between each operation.
1142 if (sql_query("ALTER TABLE %s DISABLE KEYS", sql_table_name()))
1143 crash("Failed to disable keys: %s", sql_error());
1144 if (sql_query("LOCK TABLES %s WRITE", sql_table_name()))
1145 crash("Failed to lock table %s: %s", sql_table_name(), sql_error());
1148 log_grok_var("logfile", "/dev/null");
1149 log_grok_var("log_levels", "warn");
1151 if (!num_nfile)
1152 crash("Usage: %s [--incremental] [--interesting <file>] [--truncate-db] logfiles\n",
1153 __progname);
1155 if (log_init() < 0)
1156 crash("log_init() failed");
/* import the files in the order defined by nfile_cmp() */
1158 qsort(nfile, num_nfile, sizeof(*nfile), nfile_cmp);
1160 host_downtime = hash_init(HASH_TABLE_SIZE);
1161 service_downtime = hash_init(HASH_TABLE_SIZE);
1163 if (hook_init() < 0)
1164 crash("Failed to initialize hooks");
1166 gettimeofday(&import_start, NULL);
1167 printf("Importing %s of data from %d files\n",
1168 tobytes(totsize), num_nfile);
1170 for (i = 0; i < num_nfile; i++) {
1171 struct naglog_file *nf = &nfile[i];
1172 cur_file = nf;
1173 show_progress();
1174 debug("importing from %s (%lu : %u)\n", nf->path, nf->first, nf->cmp);
1175 line_no = 0;
1176 lparse_path(nf->path, nf->size, parse_one_line);
1177 imported++; /* make up for one lost byte per file */
1180 end_progress();
/* downtime statistics, only shown when debugging */
1182 if (debug_level) {
1183 if (dt_depth) {
1184 printf("Unclosed host downtimes:\n");
1185 puts("------------------------");
1186 hash_walk_data(host_downtime, print_downtime);
1187 printf("Unclosed service downtimes:\n");
1188 puts("---------------------------");
1189 hash_walk_data(service_downtime, print_downtime);
1191 printf("dt_depth: %d\n", dt_depth);
1193 printf("purged downtimes: %d\n", purged_downtimes);
1194 printf("max simultaneous host downtime hashes: %u\n",
1195 hash_get_max_entries(host_downtime));
1196 printf("max simultaneous service downtime hashes: %u\n",
1197 hash_get_max_entries(service_downtime));
1198 printf("max downtime depth: %u\n", max_dt_depth);
1201 if (use_sql) {
1202 SQL_RESULT *res;
1203 SQL_ROW row;
1204 time_t start;
1205 unsigned long entries;
/* NOTE(review): the row returned by sql_fetch_row() is dereferenced
 * below without a NULL check, unlike the --incremental query above;
 * an empty table would presumably yield no row -- verify */
1207 sql_query("SELECT id FROM %s ORDER BY id DESC LIMIT 1", sql_table_name());
1208 if (!(res = sql_get_result()))
1209 entries = 0;
1210 else {
1211 row = sql_fetch_row(res);
1212 entries = strtoul(row[0], NULL, 0);
1213 sql_free_result(res);
/* SIGINT is ignored from here on, while the indexes are rebuilt */
1216 signal(SIGINT, SIG_IGN);
1217 sql_query("UNLOCK TABLES");
1218 start = time(NULL);
1219 printf("Creating sql table indexes. This will likely take ~%lu seconds\n",
1220 (entries / 50000) + 1);
1221 sql_query("ALTER TABLE %s ENABLE KEYS", sql_table_name());
1222 printf("%lu database entries indexed in %lu seconds\n",
1223 entries, time(NULL) - start);
1224 sql_close();
1227 if (warnings && debug_level)
1228 fprintf(stderr, "Total warnings: %d\n", warnings);
/* report the start/stop balance when debugging or when it is off */
1230 if (debug_level || dt_start != dt_stop)
1231 fprintf(stderr, "Downtime data %s\n started: %d\n stopped: %d\n",
1232 dt_depth ? "mismatch!" : "consistent", dt_start, dt_stop);
1233 if (hash_check_table(host_downtime))
1234 fprintf(stderr, "Hash table inconsistencies for host_downtime\n");
1235 if (hash_check_table(service_downtime))
1236 fprintf(stderr, "Hash table inconsistencies for service_downtime\n");
1238 print_unhandled_events();
1240 return 0;