5 #include <nagios/broker.h>
6 #include <nagios/nebcallbacks.h>
17 #define CONCERNS_HOST 50
18 #define CONCERNS_SERVICE 60
21 #define HASH_TABLE_SIZE 128
23 /* for some reason these aren't defined inside Nagios' headers */
25 #define SERVICE_WARNING 1
26 #define SERVICE_CRITICAL 2
27 #define SERVICE_UNKNOWN 3
29 #define PROGRESS_INTERVAL 500 /* lines to parse between progress updates */
32 static uint imported
, totsize
, totlines
;
33 static int lines_since_progress
, do_progress
;
34 static struct timeval import_start
;
35 static time_t daemon_start
, daemon_stop
, incremental
;
36 static int daemon_is_running
;
37 static uint max_dt_depth
;
39 static time_t next_dt_purge
; /* when next to purge expired downtime */
40 #define DT_PURGE_GRACETIME 300 /* seconds to add to next_dt_purge */
42 static time_t ltime
; /* the timestamp from the current log-line */
44 static int dt_start
, dt_stop
;
45 #define dt_depth (dt_start - dt_stop)
46 static hash_table
*host_downtime
;
47 static hash_table
*service_downtime
;
48 static int downtime_id
;
49 static time_t probably_ignore_downtime
;
51 struct downtime_entry
{
65 struct downtime_entry
*next
;
68 #define NUM_DENTRIES 1024
69 static struct downtime_entry
**dentry
;
70 static time_t last_downtime_start
;
72 static struct string_code event_codes
[] = {
74 add_ignored("Warning"),
75 add_ignored("LOG ROTATION"),
76 add_ignored("HOST NOTIFICATION"),
77 add_ignored("HOST FLAPPING ALERT"),
78 add_ignored("SERVICE NOTIFICATION"),
79 add_ignored("SERVICE FLAPPING ALERT"),
80 add_ignored("SERVICE EVENT HANDLER"),
81 add_ignored("HOST EVENT HANDLER"),
82 add_ignored("LOG VERSION"),
84 add_code(3, "PASSIVE HOST CHECK", NEBTYPE_HOSTCHECK_PROCESSED
),
85 add_code(4, "PASSIVE SERVICE CHECK", NEBTYPE_SERVICECHECK_PROCESSED
),
86 add_code(0, "EXTERNAL COMMAND", NEBTYPE_EXTERNALCOMMAND_END
),
87 add_code(5, "HOST ALERT", NEBTYPE_HOSTCHECK_PROCESSED
),
88 add_code(5, "INITIAL HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED
),
89 add_code(5, "CURRENT HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED
),
90 add_code(6, "SERVICE ALERT", NEBTYPE_SERVICECHECK_PROCESSED
),
91 add_code(6, "INITIAL SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED
),
92 add_code(6, "CURRENT SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED
),
93 add_code(3, "HOST DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_HOST
),
94 add_code(4, "SERVICE DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_SERVICE
),
98 static struct string_code command_codes
[] = {
99 add_cdef(1, DEL_HOST_DOWNTIME
),
100 add_cdef(1, DEL_SVC_DOWNTIME
),
101 add_cdef(8, SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
),
102 add_cdef(8, SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
),
103 add_cdef(8, SCHEDULE_HOSTGROUP_HOST_DOWNTIME
),
104 add_cdef(8, SCHEDULE_HOSTGROUP_SVC_DOWNTIME
),
105 add_cdef(8, SCHEDULE_HOST_DOWNTIME
),
106 add_cdef(8, SCHEDULE_HOST_SVC_DOWNTIME
),
107 add_cdef(8, SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
),
108 add_cdef(8, SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
),
109 add_cdef(8, SCHEDULE_SVC_DOWNTIME
),
112 * These really have one more field than listed here. We omit one
113 * to make author and comment concatenated with a semi-colon by default.
115 add_cdef(6, ACKNOWLEDGE_SVC_PROBLEM
),
116 add_cdef(5, ACKNOWLEDGE_HOST_PROBLEM
),
121 static inline void print_strvec(char **v
, int n
)
125 for (i
= 0; i
< n
; i
++)
126 printf("v[%2d]: %s\n", i
, v
[i
]);
130 static const char *tobytes(uint n
)
132 const char *suffix
= "KMGT";
133 static char tbuf
[2][30];
139 sprintf(tbuf
[t
], "%d bytes", n
);
143 while (n
>> (shift
* 10) > 1024)
146 sprintf(tbuf
[t
], "%0.2f %ciB",
147 (float)n
/ (float)(1 << (shift
* 10)), suffix
[shift
- 1]);
152 static void show_progress(void)
157 totlines
+= lines_since_progress
;
158 lines_since_progress
= 0;
163 elapsed
= time(NULL
) - import_start
.tv_sec
;
167 pct_done
= ((float)imported
/ (float)totsize
) * 100;
168 eta
= (elapsed
/ pct_done
) * (100.0 - pct_done
);
170 printf("\rImporting data: %.2f%% (%s) done ",
171 pct_done
, tobytes(imported
));
175 printf("%lum%lus", eta
/ 60, eta
% 60);
182 static void end_progress(void)
188 gettimeofday(&tv
, NULL
);
191 * If any of the logfiles doesn't have a newline
192 * at end of file, imported will be slightly off.
193 * We set it hard here so as to make sure that
194 * the final progress output stops at exactly 100%
200 secs
= (tv
.tv_sec
- import_start
.tv_sec
) * 1000000;
201 secs
+= tv
.tv_usec
- import_start
.tv_usec
;
202 mins
= (tv
.tv_sec
- import_start
.tv_sec
) / 60;
205 printf("%s in %u lines imported in ", tobytes(totsize
), totlines
);
207 printf("%dm ", mins
);
208 printf("%.3fs\n", secs
);
211 static int use_sql
= 1;
212 static int insert_downtime_event(int type
, char *host
, char *service
, int id
)
214 nebstruct_downtime_data ds
;
217 if (!is_interesting_service(host
, service
))
220 dt_start
+= type
== NEBTYPE_DOWNTIME_START
;
221 dt_stop
+= type
== NEBTYPE_DOWNTIME_STOP
;
222 if (dt_depth
> max_dt_depth
)
223 max_dt_depth
= dt_depth
;
228 memset(&ds
, 0, sizeof(ds
));
231 ds
.timestamp
.tv_sec
= ltime
;
233 ds
.service_description
= service
;
236 result
= hook_downtime(NEBCALLBACK_DOWNTIME_DATA
, (void *)&ds
);
238 crash("Failed to insert downtime:\n type=%d, host=%s, service=%s, id=%d",
239 type
, host
, service
, id
);
244 static int insert_service_check(struct string_code
*sc
)
246 nebstruct_service_check_data ds
;
248 if (!is_interesting_service(strv
[0], strv
[1]))
251 memset(&ds
, 0, sizeof(ds
));
253 ds
.timestamp
.tv_sec
= ltime
;
255 ds
.host_name
= strv
[0];
256 ds
.service_description
= strv
[1];
257 if (sc
->nvecs
== 4) {
258 /* passive service check result */
259 if (*strv
[2] >= '0' && *strv
[2] <= '9')
260 ds
.state
= atoi(strv
[2]);
262 ds
.state
= parse_service_state(strv
[2]);
263 ds
.state_type
= HARD_STATE
;
264 ds
.current_attempt
= 1;
267 ds
.state
= parse_service_state(strv
[2]);
268 ds
.state_type
= soft_hard(strv
[3]);
269 ds
.current_attempt
= atoi(strv
[4]);
276 return hook_service_result(NEBCALLBACK_SERVICE_CHECK_DATA
, (void *)&ds
);
279 static int insert_host_check(struct string_code
*sc
)
281 nebstruct_host_check_data ds
;
283 if (!is_interesting_host(strv
[0]))
286 memset(&ds
, 0, sizeof(ds
));
288 ds
.timestamp
.tv_sec
= ltime
;
290 ds
.host_name
= strv
[0];
291 if (sc
->nvecs
== 3) {
292 if (*strv
[1] >= '0' && *strv
[1] <= '9')
293 ds
.state
= atoi(strv
[1]);
295 ds
.state
= parse_host_state(strv
[1]);
296 /* passive host check result */
298 ds
.current_attempt
= 1;
299 ds
.state_type
= HARD_STATE
;
301 ds
.state
= parse_host_state(strv
[1]);
302 ds
.state_type
= soft_hard(strv
[2]);
303 ds
.current_attempt
= atoi(strv
[3]);
310 return hook_host_result(NEBCALLBACK_HOST_CHECK_DATA
, (void *)&ds
);
313 static int insert_process_event(int type
)
315 nebstruct_process_data ds
;
320 memset(&ds
, 0, sizeof(ds
));
321 ds
.timestamp
.tv_sec
= ltime
;
323 return hook_process_data(NEBCALLBACK_PROCESS_DATA
, (void *)&ds
);
326 static int insert_acknowledgement(struct string_code
*sc
)
331 static void dt_print(char *tpc
, time_t when
, struct downtime_entry
*dt
)
336 printf("%s: time=%lu started=%lu start=%lu stop=%lu duration=%lu id=%d ",
337 tpc
, when
, dt
->started
, dt
->start
, dt
->stop
, dt
->duration
, dt
->id
);
338 printf("%s", dt
->host
);
340 printf(";%s", dt
->service
);
344 static struct downtime_entry
*last_dte
;
345 static struct downtime_entry
*del_dte
;
347 static void remove_downtime(struct downtime_entry
*dt
);
348 static int del_matching_dt(void *data
)
350 struct downtime_entry
*dt
= data
;
352 if (del_dte
->id
== dt
->id
) {
353 dt_print("ALSO", 0, dt
);
360 static void stash_downtime_command(struct downtime_entry
*dt
)
362 dt
->slot
= dt
->start
% NUM_DENTRIES
;
363 dt
->next
= dentry
[dt
->slot
];
364 dentry
[dt
->slot
] = dt
;
367 static void remove_downtime(struct downtime_entry
*dt
)
369 struct downtime_entry
*old
;
371 if (!is_interesting_service(dt
->host
, dt
->service
))
374 insert_downtime_event(NEBTYPE_DOWNTIME_STOP
, dt
->host
, dt
->service
, dt
->id
);
377 old
= hash_remove(host_downtime
, dt
->host
);
379 old
= hash_remove2(service_downtime
, dt
->host
, dt
->service
);
381 dt_print("RM_DT", ltime
, dt
);
385 static struct downtime_entry
*
386 dt_matches_command(struct downtime_entry
*dt
, char *host
, char *service
)
388 for (; dt
; dt
= dt
->next
) {
391 if (ltime
> dt
->stop
|| ltime
< dt
->start
) {
396 case SCHEDULE_SVC_DOWNTIME
:
397 if (service
&& strcmp(service
, dt
->service
))
401 case SCHEDULE_HOST_DOWNTIME
:
402 case SCHEDULE_HOST_SVC_DOWNTIME
:
403 if (strcmp(host
, dt
->host
)) {
407 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
:
408 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
:
409 /* these two have host set in dt, but
410 * it will not match all the possible hosts */
413 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME
:
414 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME
:
415 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
:
416 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
:
419 crash("dt->code not set properly\n");
423 * Once we get here all the various other criteria have
424 * been matched, so we need to check if the daemon was
425 * running when this downtime was supposed to have
426 * started, and otherwise use the daemon start time
427 * as the value to diff against
429 if (daemon_stop
< dt
->start
&& daemon_start
> dt
->start
) {
430 debug("Adjusting dt->start (%lu) to (%lu)\n",
431 dt
->start
, daemon_start
);
432 dt
->start
= daemon_start
;
433 if (dt
->trigger
&& dt
->duration
)
434 dt
->stop
= dt
->start
+ dt
->duration
;
437 diff
= ltime
- dt
->start
;
438 if (diff
< 3 || dt
->trigger
|| !dt
->fixed
)
445 static struct downtime_entry
*
446 find_downtime_command(char *host
, char *service
)
449 struct downtime_entry
*shortcut
= NULL
;
451 if (last_dte
&& last_dte
->start
== ltime
) {
455 for (i
= 0; i
< NUM_DENTRIES
; i
++) {
456 struct downtime_entry
*dt
;
457 dt
= dt_matches_command(dentry
[i
], host
, service
);
459 if (shortcut
&& dt
!= shortcut
)
461 printf("FIND shortcut no good\n");
471 static int print_downtime(void *data
)
473 struct downtime_entry
*dt
= (struct downtime_entry
*)data
;
475 dt_print("UNCLOSED", ltime
, dt
);
480 static inline void set_next_dt_purge(time_t base
, time_t add
)
482 if (!next_dt_purge
|| next_dt_purge
> base
+ add
)
483 next_dt_purge
= base
+ add
;
485 if (next_dt_purge
<= ltime
)
486 next_dt_purge
= ltime
+ 1;
489 static inline void add_downtime(char *host
, char *service
, int id
)
491 struct downtime_entry
*dt
, *cmd
, *old
;
493 if (!is_interesting_service(host
, service
))
496 dt
= malloc(sizeof(*dt
));
497 cmd
= find_downtime_command(host
, service
);
499 warn("DT with no ext cmd? %lu %s;%s", ltime
, host
, service
);
500 memset(dt
, 0, sizeof(*dt
));
501 dt
->duration
= 7200; /* the default downtime duration in nagios */
503 dt
->stop
= dt
->start
+ dt
->duration
;
506 memcpy(dt
, cmd
, sizeof(*dt
));
508 dt
->host
= strdup(host
);
512 set_next_dt_purge(ltime
, dt
->duration
);
516 old
= hash_update(host_downtime
, dt
->host
, dt
);
519 dt
->service
= strdup(service
);
520 old
= hash_update2(service_downtime
, dt
->host
, dt
->service
, dt
);
523 if (old
&& old
!= dt
) {
530 dt_print("IN_DT", ltime
, dt
);
531 insert_downtime_event(NEBTYPE_DOWNTIME_START
, dt
->host
, dt
->service
, dt
->id
);
534 static time_t last_host_dt_del
, last_svc_dt_del
;
535 static int register_downtime_command(struct string_code
*sc
)
537 struct downtime_entry
*dt
;
538 char *start_time
, *end_time
, *duration
= NULL
;
539 char *host
= NULL
, *service
= NULL
, *fixed
, *triggered_by
= NULL
;
543 case DEL_HOST_DOWNTIME
:
544 last_host_dt_del
= ltime
;
546 case DEL_SVC_DOWNTIME
:
547 last_svc_dt_del
= ltime
;
550 case SCHEDULE_HOST_DOWNTIME
:
551 if (strtotimet(strv
[5], &foo
))
554 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
:
555 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
:
556 case SCHEDULE_HOST_SVC_DOWNTIME
:
559 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME
:
560 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME
:
561 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
:
562 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
:
563 start_time
= strv
[1];
566 if (strtotimet(strv
[5], &foo
))
567 triggered_by
= strv
[4];
573 case SCHEDULE_SVC_DOWNTIME
:
576 start_time
= strv
[2];
579 if (strtotimet(strv
[6], &foo
)) {
580 triggered_by
= strv
[5];
589 crash("Unknown downtime type: %d", sc
->code
);
592 if (!(dt
= calloc(sizeof(*dt
), 1)))
593 crash("calloc(%u, 1) failed: %s", (uint
)sizeof(*dt
), strerror(errno
));
597 dt
->host
= strdup(host
);
599 dt
->service
= strdup(service
);
601 dt
->trigger
= triggered_by
? !!(*triggered_by
- '0') : 0;
602 if (strtotimet(start_time
, &dt
->start
) || strtotimet(end_time
, &dt
->stop
))
604 print_strvec(strv
, sc
->nvecs
);
605 crash("strtotime(): type: %s; start_time='%s'; end_time='%s'; duration='%s';",
606 command_codes
[sc
->code
- 1].str
, start_time
, end_time
, duration
);
610 * sometimes downtime commands can be logged according to
611 * log version 1, while the log still claims to be version 2.
612 * Apparently, this happens when using a daemon supporting
613 * version 2 logging but a downtime command is added that
614 * follows the version 1 standard.
615 * As such, we simply ignore the result of the "duration"
616 * field conversion and just accept that it might not work
618 (void)strtotimet(duration
, &dt
->duration
);
619 dt
->fixed
= *fixed
- '0';
622 * ignore downtime scheduled to take place in the future.
623 * It will be picked up by the module anyways
625 if (dt
->start
> time(NULL
)) {
630 if (dt
->duration
> time(NULL
)) {
631 warn("Bizarrely large duration (%lu)", dt
->duration
);
633 if (dt
->start
< ltime
) {
634 if (dt
->duration
&& dt
->duration
> ltime
- dt
->start
)
635 dt
->duration
-= ltime
- dt
->start
;
639 if (dt
->stop
< ltime
|| dt
->stop
< dt
->start
) {
640 /* retroactively scheduled downtime, or just plain wrong */
641 dt
->stop
= dt
->start
;
645 if (dt
->fixed
&& dt
->duration
!= dt
->stop
- dt
->start
) {
646 // warn("duration doesn't match stop - start: (%lu : %lu)",
647 // dt->duration, dt->stop - dt->start);
649 dt
->duration
= dt
->stop
- dt
->start
;
651 else if (dt
->duration
> 86400 * 14) {
652 warn("Oddly long duration: %lu", dt
->duration
);
655 debug("start=%lu; stop=%lu; duration=%lu; fixed=%d; trigger=%d; host=%s service=%s\n",
656 dt
->start
, dt
->stop
, dt
->duration
, dt
->fixed
, dt
->trigger
, dt
->host
, dt
->service
);
658 stash_downtime_command(dt
);
662 static int insert_downtime(struct string_code
*sc
)
665 struct downtime_entry
*dt
= NULL
;
668 char *host
, *service
= NULL
;
671 if (sc
->nvecs
== 4) {
673 dt
= hash_find2(service_downtime
, host
, service
);
676 dt
= hash_find(host_downtime
, host
);
679 * to stop a downtime we can either get STOPPED or
680 * CANCELLED. So far, I've only ever seen STARTED
681 * for when it actually starts though, and since
682 * the Nagios daemon is reponsible for launching
683 * it, it's unlikely there are more variants of
686 type
= NEBTYPE_DOWNTIME_STOP
;
687 if (!strcmp(strv
[sc
->nvecs
- 2], "STARTED"))
688 type
= NEBTYPE_DOWNTIME_START
;
691 case NEBTYPE_DOWNTIME_START
:
693 if (!probably_ignore_downtime
)
694 dt_print("ALRDY", ltime
, dt
);
698 if (probably_ignore_downtime
)
699 debug("Should probably ignore this downtime: %lu : %lu %s;%s\n",
700 probably_ignore_downtime
, ltime
, host
, service
);
702 if (ltime
- last_downtime_start
> 1)
706 add_downtime(host
, service
, id
);
707 last_downtime_start
= ltime
;
710 case NEBTYPE_DOWNTIME_STOP
:
713 * this can happen when overlapping downtime entries
714 * occur, and the start event for the second (or nth)
715 * downtime starts before the first downtime has had
716 * a stop event. It basically means we've almost
717 * certainly done something wrong.
719 //printf("no dt. ds.host_name == '%s'\n", ds.host_name);
720 //fprintf(stderr, "CRASHING: %s;%s\n", ds.host_name, ds.service_description);
721 //crash("DOWNTIME_STOP without matching DOWNTIME_START");
725 dt_del_cmd
= !dt
->service
? last_host_dt_del
: last_svc_dt_del
;
727 if ((ltime
- dt_del_cmd
) > 1 && dt
->duration
- (ltime
- dt
->started
) > 60) {
728 debug("Short dt duration (%lu) for %s;%s (dt->duration=%lu)\n",
729 ltime
- dt
->started
, dt
->host
, dt
->service
, dt
->duration
);
731 if (ltime
- dt
->started
> dt
->duration
+ DT_PURGE_GRACETIME
)
732 dt_print("Long", ltime
, dt
);
736 * Now delete whatever matching downtimes we can find.
737 * this must be here, or we'll recurse like crazy into
738 * remove_downtime(), possibly exhausting the stack
743 hash_walk_data(host_downtime
, del_matching_dt
);
745 hash_walk_data(service_downtime
, del_matching_dt
);
755 static int dt_purged
;
756 static int purge_expired_dt(void *data
)
758 struct downtime_entry
*dt
= data
;
764 if (ltime
+ DT_PURGE_GRACETIME
> dt
->stop
) {
766 debug("PURGE %lu: purging expired dt %d (start=%lu; started=%lu; stop=%lu; duration=%lu; host=%s; service=%s",
767 ltime
, dt
->id
, dt
->start
, dt
->started
, dt
->stop
, dt
->duration
, dt
->host
, dt
->service
);
771 dt_print("PURGED_NOT_TIME", ltime
, dt
);
774 set_next_dt_purge(dt
->started
, dt
->duration
);
779 static int purged_downtimes
;
780 static void purge_expired_downtime(void)
786 hash_walk_data(host_downtime
, purge_expired_dt
);
788 debug("PURGE %d host downtimes purged", dt_purged
);
789 tot_purged
+= dt_purged
;
791 hash_walk_data(service_downtime
, purge_expired_dt
);
793 debug("PURGE %d service downtimes purged", dt_purged
);
794 tot_purged
+= dt_purged
;
796 debug("PURGE total %d entries purged", tot_purged
);
799 debug("PURGE next downtime purge supposed to run @ %lu, in %lu seconds",
800 next_dt_purge
, next_dt_purge
- ltime
);
802 purged_downtimes
+= tot_purged
;
805 static inline void handle_start_event(void)
807 if (!daemon_is_running
)
808 insert_process_event(NEBTYPE_PROCESS_START
);
810 probably_ignore_downtime
= daemon_start
= ltime
;
811 daemon_is_running
= 1;
814 static inline void handle_stop_event(void)
816 if (daemon_is_running
) {
817 insert_process_event(NEBTYPE_PROCESS_SHUTDOWN
);
818 daemon_is_running
= 0;
823 static int parse_line(char *line
, uint len
)
827 struct string_code
*sc
;
828 static time_t last_ltime
= 0;
830 imported
+= len
+ 1; /* make up for 1 lost byte per newline */
832 /* ignore empty lines */
836 if (++lines_since_progress
>= PROGRESS_INTERVAL
)
839 /* skip obviously bogus lines */
840 if (len
< 12 || *line
!= '[') {
841 warn("line %d; len too short, or line doesn't start with '[' (%s)", line_no
, line
);
845 ltime
= strtoul(line
+ 1, &ptr
, 10);
846 if (line
+ 1 == ptr
) {
847 crash("Failed to parse log timestamp from '%s'. I can't handle malformed logdata", line
);
851 if (ltime
< last_ltime
) {
852 // warn("ltime < last_ltime (%lu < %lu) by %lu. Compensating...",
853 // ltime, last_ltime, last_ltime - ltime);
860 * Incremental will be 0 if not set, or 1 if set but
861 * the database is currently empty.
862 * Note that this will not always do the correct thing,
863 * as downtime entries that might have been scheduled for
864 * purging may never show up as "stopped" in the database
865 * with this scheme. As such, incremental imports absolutely
866 * require that nothing is in scheduled downtime when the
867 * import is running (well, started really, but it amounts
868 * to the same thing).
870 if (ltime
< incremental
)
873 if (next_dt_purge
&& ltime
>= next_dt_purge
)
874 purge_expired_downtime();
876 if (probably_ignore_downtime
&& ltime
- probably_ignore_downtime
> 1)
877 probably_ignore_downtime
= 0;
879 while (*ptr
== ']' || *ptr
== ' ')
882 if (!is_interesting(ptr
))
885 if (!(colon
= strchr(ptr
, ':'))) {
886 /* stupid heuristic, but might be good for something,
887 * somewhere, sometime. if nothing else, it should suppress
889 if (is_start_event(ptr
)) {
890 handle_start_event();
893 if (is_stop_event(ptr
)) {
899 * An unhandled event. We should probably crash here
901 handle_unknown_event(line
);
905 /* an event happened without us having gotten a start-event */
906 if (!daemon_is_running
) {
907 insert_process_event(NEBTYPE_PROCESS_START
);
908 daemon_start
= ltime
;
909 daemon_is_running
= 1;
912 if (!(sc
= get_event_type(ptr
, colon
- ptr
))) {
913 handle_unknown_event(line
);
917 if (sc
->code
== IGNORE_LINE
)
928 nvecs
= vectorize_string(ptr
, sc
->nvecs
);
930 if (nvecs
!= sc
->nvecs
) {
932 warn("Line %d in %s seems to not have all the fields it should",
933 line_no
, cur_file
->path
);
937 for (i
= 0; i
< sc
->nvecs
; i
++) {
939 /* this should never happen */
940 warn("Line %d in %s seems to be broken, or we failed to parse it into a vector",
941 line_no
, cur_file
->path
);
950 case NEBTYPE_EXTERNALCOMMAND_END
:
951 semi_colon
= strchr(ptr
, ';');
954 if (!(sc
= get_command_type(ptr
, semi_colon
- ptr
))) {
957 if (sc
->code
== RESTART_PROGRAM
) {
962 nvecs
= vectorize_string(semi_colon
+ 1, sc
->nvecs
);
963 if (nvecs
!= sc
->nvecs
) {
964 warn("nvecs discrepancy: %d vs %d (%s)\n", nvecs
, sc
->nvecs
, ptr
);
966 if (sc
->code
!= ACKNOWLEDGE_HOST_PROBLEM
&&
967 sc
->code
!= ACKNOWLEDGE_SVC_PROBLEM
)
969 register_downtime_command(sc
);
971 insert_acknowledgement(sc
);
975 case NEBTYPE_HOSTCHECK_PROCESSED
:
976 return insert_host_check(sc
);
978 case NEBTYPE_SERVICECHECK_PROCESSED
:
979 return insert_service_check(sc
);
981 case NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_HOST
:
982 case NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_SERVICE
:
983 return insert_downtime(sc
);
992 static int parse_one_line(char *str
, uint len
)
994 if (parse_line(str
, len
) && use_sql
&& sql_errno())
995 crash("sql error: %s", sql_error());
1000 static int hash_one_line(char *line
, uint len
)
1002 return add_interesting_object(line
);
1005 static int hash_interesting(const char *path
)
1009 if (stat(path
, &st
) < 0)
1010 crash("failed to stat %s: %s", path
, strerror(errno
));
1012 lparse_path(path
, st
.st_size
, hash_one_line
);
1017 extern const char *__progname
;
1018 int main(int argc
, char **argv
)
1020 int i
, truncate_db
= 0;
1021 const char *nagios_cfg
= NULL
;
1022 char *db_name
= "monitor_reports";
1023 char *db_user
= "monitor";
1024 char *db_pass
= "monitor";
1025 char *db_table
= "report_data";
1027 do_progress
= isatty(fileno(stdout
));
1029 strv
= calloc(sizeof(char *), MAX_NVECS
);
1030 dentry
= calloc(sizeof(*dentry
), NUM_DENTRIES
);
1031 if (!strv
|| !dentry
)
1032 crash("Failed to alloc initial structs");
1035 for (num_nfile
= 0,i
= 1; i
< argc
; i
++) {
1036 char *opt
, *arg
= argv
[i
];
1039 if ((opt
= strchr(arg
, '='))) {
1043 else if (i
< argc
- 1) {
1047 if (!prefixcmp(arg
, "--incremental")) {
1051 if (!prefixcmp(arg
, "--no-sql")) {
1055 if (!prefixcmp(arg
, "--no-progress")) {
1059 if (!prefixcmp(arg
, "--debug") || !prefixcmp(arg
, "-d")) {
1064 if (!prefixcmp(arg
, "--truncate-db")) {
1068 if (!prefixcmp(arg
, "--nagios-cfg")) {
1069 if (!opt
|| !*opt
) {
1070 crash("%s requires the path to nagios.cfg as argument", arg
);
1077 if (!prefixcmp(arg
, "--db-name")) {
1079 crash("%s requires a database name as an argument", arg
);
1085 if (!prefixcmp(arg
, "--db-user")) {
1087 crash("%s requires a database username as argument", arg
);
1093 if (!prefixcmp(arg
, "--db-pass")) {
1095 crash("%s requires a database username as argument", arg
);
1101 if (!prefixcmp(arg
, "--db-table")) {
1103 crash("%s requires a database table name as argument", arg
);
1109 if (!prefixcmp(arg
, "--interesting") || !prefixcmp(arg
, "-i")) {
1111 crash("%s requires a filename as argument", arg
);
1112 hash_interesting(opt
);
1118 /* non-argument, so treat as either nagios.cfg or a logfile */
1119 if (!strcmp(&arg
[strlen(arg
) - 11], "/nagios.cfg")) {
1122 add_naglog_path(arg
);
1126 /* fallback for op5 systems */
1127 if (!nagios_cfg
&& !num_nfile
) {
1128 nagios_cfg
= "/opt/monitor/etc/nagios.cfg";
1131 struct cfg_comp
*conf
;
1132 conf
= cfg_parse_file(nagios_cfg
);
1133 for (i
= 0; i
< conf
->vars
; i
++) {
1134 struct cfg_var
*v
= conf
->vlist
[i
];
1135 if (!strcmp(v
->key
, "log_file")) {
1136 add_naglog_path(v
->value
);
1138 if (!strcmp(v
->key
, "log_archive_path")) {
1139 add_naglog_path(v
->value
);
1145 sql_config("db_database", db_name
);
1146 sql_config("db_user", db_user
);
1147 sql_config("db_pass", db_pass
);
1148 sql_config("db_table", db_table
);
1151 crash("sql_init() failed");
1153 sql_query("TRUNCATE %s", sql_table_name());
1158 sql_query("SELECT timestamp FROM %s.%s ORDER BY timestamp DESC LIMIT 1",
1161 if (!(result
= sql_get_result()))
1162 crash("Failed to get last timestamp: %s\n", sql_error());
1164 /* someone might use --incremental with an empty
1165 * database. We shouldn't crash in that case */
1166 if ((row
= sql_fetch_row(result
)))
1167 incremental
= strtoul(row
[0], NULL
, 0);
1169 sql_free_result(result
);
1172 * We lock the table we'll be working with and disable
1173 * indexes on it. Otherwise doing the actual inserts
1174 * will take just about forever, as MySQL has to update
1175 * and flush the index cache between each operation.
1177 if (sql_query("ALTER TABLE %s DISABLE KEYS", sql_table_name()))
1178 crash("Failed to disable keys: %s", sql_error());
1179 if (sql_query("LOCK TABLES %s WRITE", sql_table_name()))
1180 crash("Failed to lock table %s: %s", sql_table_name(), sql_error());
1183 log_grok_var("logfile", "/dev/null");
1184 log_grok_var("log_levels", "warn");
1187 crash("Usage: %s [--incremental] [--interesting <file>] [--truncate-db] logfiles\n",
1191 crash("log_init() failed");
1193 qsort(nfile
, num_nfile
, sizeof(*nfile
), nfile_cmp
);
1195 host_downtime
= hash_init(HASH_TABLE_SIZE
);
1196 service_downtime
= hash_init(HASH_TABLE_SIZE
);
1198 if (hook_init() < 0)
1199 crash("Failed to initialize hooks");
1201 /* go through them once to count the total size for progress output */
1202 for (i
= 0; i
< num_nfile
; i
++) {
1203 totsize
+= nfile
[i
].size
;
1206 gettimeofday(&import_start
, NULL
);
1207 printf("Importing %s of data from %d files\n",
1208 tobytes(totsize
), num_nfile
);
1210 for (i
= 0; i
< num_nfile
; i
++) {
1211 struct naglog_file
*nf
= &nfile
[i
];
1214 debug("importing from %s (%lu : %u)\n", nf
->path
, nf
->first
, nf
->cmp
);
1216 lparse_path(nf
->path
, nf
->size
, parse_one_line
);
1217 imported
++; /* make up for one lost byte per file */
1224 printf("Unclosed host downtimes:\n");
1225 puts("------------------------");
1226 hash_walk_data(host_downtime
, print_downtime
);
1227 printf("Unclosed service downtimes:\n");
1228 puts("---------------------------");
1229 hash_walk_data(service_downtime
, print_downtime
);
1231 printf("dt_depth: %d\n", dt_depth
);
1233 printf("purged downtimes: %d\n", purged_downtimes
);
1234 printf("max simultaneous host downtime hashes: %u\n",
1235 hash_get_max_entries(host_downtime
));
1236 printf("max simultaneous service downtime hashes: %u\n",
1237 hash_get_max_entries(service_downtime
));
1238 printf("max downtime depth: %u\n", max_dt_depth
);
1245 unsigned long entries
;
1247 sql_query("SELECT id FROM %s ORDER BY id DESC LIMIT 1", sql_table_name());
1248 if (!(res
= sql_get_result()))
1251 row
= sql_fetch_row(res
);
1252 entries
= strtoul(row
[0], NULL
, 0);
1253 sql_free_result(res
);
1256 signal(SIGINT
, SIG_IGN
);
1257 sql_query("UNLOCK TABLES");
1259 printf("Creating sql table indexes. This will likely take ~%lu seconds\n",
1260 (entries
/ 50000) + 1);
1261 sql_query("ALTER TABLE %s ENABLE KEYS", sql_table_name());
1262 printf("%lu database entries indexed in %lu seconds\n",
1263 entries
, time(NULL
) - start
);
1267 if (warnings
&& debug_level
)
1268 fprintf(stderr
, "Total warnings: %d\n", warnings
);
1270 if (debug_level
|| dt_start
!= dt_stop
)
1271 fprintf(stderr
, "Downtime data %s\n started: %d\n stopped: %d\n",
1272 dt_depth
? "mismatch!" : "consistent", dt_start
, dt_stop
);
1273 if (hash_check_table(host_downtime
))
1274 fprintf(stderr
, "Hash table inconsistencies for host_downtime\n");
1275 if (hash_check_table(service_downtime
))
1276 fprintf(stderr
, "Hash table inconsistencies for service_downtime\n");
1278 print_unhandled_events();