10 #include <nagios/broker.h>
11 #include <nagios/nebcallbacks.h>
19 #define CONCERNS_HOST 50
20 #define CONCERNS_SERVICE 60
23 #define HASH_TABLE_SIZE 128
25 /* for some reason these aren't defined inside Nagios' headers */
27 #define SERVICE_WARNING 1
28 #define SERVICE_CRITICAL 2
29 #define SERVICE_UNKNOWN 3
31 #define PROGRESS_INTERVAL 500 /* lines to parse between progress updates */
40 static size_t imported
, totsize
, totlines
;
41 static int lines_since_progress
, do_progress
;
42 static time_t start_time
;
43 static int log_version
;
44 static int debug_level
;
46 static time_t daemon_start
, daemon_stop
, incremental
;
47 static int daemon_is_running
;
49 static int max_dt_depth
;
50 struct naglog_file
*cur_file
; /* the file we're currently importing */
53 static time_t next_dt_purge
; /* when next to purge expired downtime */
54 #define DT_PURGE_GRACETIME 300 /* seconds to add to next_dt_purge */
56 static time_t ltime
; /* the timestamp from the current log-line */
65 static int dt_start
, dt_stop
;
66 #define dt_depth (dt_start - dt_stop)
67 static hash_table
*host_downtime
;
68 static hash_table
*service_downtime
;
69 static int downtime_id
;
70 static time_t probably_ignore_downtime
;
72 #define svc_downtime(h, s) hash_find2(service_downtime, h, s)
73 #define host_downtime(h) hash_find(host_downtime, h)
74 struct downtime_entry
{
88 struct downtime_entry
*next
;
91 #define NUM_DENTRIES 1024
92 static struct downtime_entry
**dentry
;
93 static time_t last_downtime_start
;
95 #define LOG_VERSION (-2)
96 #define add_code(n, s, c) { n, s, sizeof(s) - 1, c, }
97 #define add_ignored(s) add_code(0, s, IGNORE_LINE)
98 struct string_code event_codes
[] = {
100 add_ignored("Warning"),
101 add_ignored("LOG ROTATION"),
102 add_ignored("HOST NOTIFICATION"),
103 add_ignored("HOST FLAPPING ALERT"),
104 add_ignored("SERVICE NOTIFICATION"),
105 add_ignored("SERVICE FLAPPING ALERT"),
106 add_ignored("SERVICE EVENT HANDLER"),
107 add_ignored("HOST EVENT HANDLER"),
109 add_code(0, "LOG VERSION", LOG_VERSION
),
110 add_code(0, "EXTERNAL COMMAND", NEBTYPE_EXTERNALCOMMAND_END
),
111 add_code(5, "HOST ALERT", NEBTYPE_HOSTCHECK_PROCESSED
),
112 add_code(5, "INITIAL HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED
),
113 add_code(5, "CURRENT HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED
),
114 add_code(6, "SERVICE ALERT", NEBTYPE_SERVICECHECK_PROCESSED
),
115 add_code(6, "INITIAL SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED
),
116 add_code(6, "CURRENT SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED
),
117 add_code(3, "HOST DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_HOST
),
118 add_code(4, "SERVICE DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_SERVICE
),
122 #define DEL_HOST_DOWNTIME 1
123 #define DEL_SVC_DOWNTIME 2
124 #define SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME 3
125 #define SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME 4
126 #define SCHEDULE_HOSTGROUP_HOST_DOWNTIME 5
127 #define SCHEDULE_HOSTGROUP_SVC_DOWNTIME 6
128 #define SCHEDULE_HOST_DOWNTIME 7
129 #define SCHEDULE_HOST_SVC_DOWNTIME 8
130 #define SCHEDULE_SERVICEGROUP_HOST_DOWNTIME 9
131 #define SCHEDULE_SERVICEGROUP_SVC_DOWNTIME 10
132 #define SCHEDULE_SVC_DOWNTIME 11
134 #define add_cdef(__nvecs, __define) add_code(__nvecs, #__define, __define)
135 struct string_code command_codes
[] = {
136 add_cdef(1, DEL_HOST_DOWNTIME
),
137 add_cdef(1, DEL_SVC_DOWNTIME
),
138 add_cdef(8, SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
),
139 add_cdef(8, SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
),
140 add_cdef(8, SCHEDULE_HOSTGROUP_HOST_DOWNTIME
),
141 add_cdef(8, SCHEDULE_HOSTGROUP_SVC_DOWNTIME
),
142 add_cdef(8, SCHEDULE_HOST_DOWNTIME
),
143 add_cdef(8, SCHEDULE_HOST_SVC_DOWNTIME
),
144 add_cdef(8, SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
),
145 add_cdef(8, SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
),
146 add_cdef(8, SCHEDULE_SVC_DOWNTIME
),
150 static void set_log_version(int version
)
154 command_codes
[SCHEDULE_HOST_DOWNTIME
- 1].nvecs
= 7;
157 command_codes
[SCHEDULE_HOST_DOWNTIME
- 1].nvecs
= 8;
161 log_version
= version
;
/*
 * Debugging aid: print the first n entries of the string vector v to
 * stdout, one per line, each prefixed with its index.
 */
static inline void print_strvec(char **v, int n)
{
	int idx = 0;

	while (idx < n) {
		printf("v[%2d]: %s\n", idx, v[idx]);
		idx++;
	}
}
/*
 * Format a byte count as a short human-readable string.
 *
 * Values below 1024 are printed as "<n> bytes"; larger values are scaled
 * to KiB, MiB, GiB or TiB with two decimals. Anything beyond the TiB
 * range is still expressed in TiB rather than indexing past the suffix
 * table.
 *
 * The result lives in one of two static buffers that are used
 * alternately, so two results can be used in the same printf() call.
 * A third call overwrites the first result. Not thread-safe.
 */
const char *tobytes(size_t n)
{
	static const char suffix[] = "KMGT";
	static char tbuf[2][30];
	static int t = 0;
	unsigned int shift = 1;

	/* flip to the buffer that was not handed out by the previous call */
	t ^= 1;
	if (n < 1024) {
		/* %zu is the matching conversion for size_t */
		snprintf(tbuf[t], sizeof(tbuf[t]), "%zu bytes", n);
		return tbuf[t];
	}

	/*
	 * Pick the largest unit that keeps the value below 1024, but never
	 * step past the last entry in the suffix table.
	 */
	while (shift < sizeof(suffix) - 1 && (n >> (shift * 10)) >= 1024)
		shift++;

	/* do the divisor shift in 64 bits; 1 << 30 and up overflows int */
	snprintf(tbuf[t], sizeof(tbuf[t]), "%0.2f %ciB",
	         (double)n / (double)(1ULL << (shift * 10)), suffix[shift - 1]);

	return tbuf[t];
}
195 static void show_progress(void)
200 totlines
+= lines_since_progress
;
201 lines_since_progress
= 0;
206 elapsed
= time(NULL
) - start_time
;
210 pct_done
= ((float)imported
/ (float)totsize
) * 100;
211 eta
= (elapsed
/ pct_done
) * (100.0 - pct_done
);
213 printf("\rImporting data: %.2f%% (%s) done ",
214 pct_done
, tobytes(imported
));
218 printf("%lum%lus", eta
/ 60, eta
% 60);
225 static void end_progress(void)
227 time_t now
= time(NULL
);
231 * If any of the logfiles doesn't have a newline
232 * at end of file, imported will be slightly off.
233 * We set it hard here so as to make sure that
234 * the final progress output stops at exactly 100%
240 secs
= now
- start_time
;
243 printf("%s in %u lines imported in ", tobytes(totsize
), totlines
);
245 printf("%dm ", mins
);
246 printf("%ds\n", secs
);
249 static inline struct string_code
*
250 get_string_code(struct string_code
*codes
, const char *str
, size_t len
)
254 for (i
= 0; codes
[i
].str
; i
++)
255 if (codes
[i
].len
== len
&& !memcmp(str
, codes
[i
].str
, len
))
260 #define get_event_type(str, len) get_string_code(event_codes, str, len)
261 #define get_command_type(str, len) get_string_code(command_codes, str, len)
263 static void crash(const char *fmt
, ...)
264 __attribute__((__format__(__printf__
, 1, 2), __noreturn__
));
266 static void __attribute__((__noreturn__
)) crash(const char *fmt
, ...)
271 vfprintf(stderr
, fmt
, ap
);
276 fprintf(stderr
, "crash() called when parsing line %d in %s\n",
277 line_no
, cur_file
->path
);
283 static void pdebug(int lvl
, const char *fmt
, ...)
284 __attribute__((__format__(__printf__
, 2, 3)));
285 #define debug(...) pdebug(1, __VA_ARGS__)
286 static void pdebug(int lvl
, const char *fmt
, ...)
290 if (debug_level
< lvl
)
296 if (fmt
[strlen(fmt
) - 1] != '\n')
300 static void warn(const char *fmt
, ...)
301 __attribute__((__format__(__printf__
, 1, 2)));
303 static unsigned int warnings
;
304 static void warn(const char *fmt
, ...)
321 #define prefixcmp(s1, s2) strncmp(s1, s2, strlen(s2))
322 static int is_interesting(const char *ptr
)
324 if (!prefixcmp(ptr
, "Auto-save of retention data"))
326 if (!prefixcmp(ptr
, "Event broker module"))
328 if (!prefixcmp(ptr
, "You do not have permission"))
334 static int is_start_event(const char *ptr
)
336 if (!prefixcmp(ptr
, "Finished daemonizing..."))
338 if (!prefixcmp(ptr
, "PROGRAM_RESTART"))
340 if (!prefixcmp(ptr
, "Caught SIGHUP"))
342 if (strstr(ptr
, "starting..."))
348 static int is_stop_event(const char *ptr
)
350 if (!prefixcmp(ptr
, "Caught SIGTERM"))
352 if (!prefixcmp(ptr
, "Successfully shutdown..."))
354 if (!prefixcmp(ptr
, "Bailing out"))
356 if (!prefixcmp(ptr
, "Lockfile"))
358 if (strstr(ptr
, "shutting down..."))
364 struct unhandled_event
{
368 struct unhandled_event
*next
;
371 static struct unhandled_event
*event_list
;
372 static int num_unhandled
;
375 * This is a fairly toothless function, since we can encounter
376 * pretty much any kind of message in the logfiles. In order to
377 * make sure we don't miss anything important though, we should
378 * probably stash the messages away and print them at the end
379 * so the user can decide if he/she wants to make a re-import.
380 * In 99% of all cases, the user will just want to ignore the
381 * messages and keep going
383 static void handle_unknown_event(const char *line
)
385 struct unhandled_event
*event
;
389 if (!(event
= malloc(sizeof(*event
))) || !(event
->line
= strdup(line
))) {
390 crash("Failed to allocate memory for unhandled event [%s]\n", line
);
394 event
->line_no
= line_no
;
395 event
->file
= cur_file
->path
;
397 /* add to "top" of list. we'll print in reverse order */
398 event
->next
= event_list
;
402 static void print_unhandled_events()
404 struct unhandled_event
*event
;
410 printf("\n%d Unhandled events encountered:\n" \
411 "------------------------------", num_unhandled
);
413 for (x
= 1; num_unhandled
> (x
* 10); x
*= 10)
417 for (event
= event_list
; event
; event
= event
->next
) {
418 printf("%s:%d:\n%s\n----\n", event
->file
, event
->line_no
, event
->line
);
422 static int vectorize_string(char *str
, int nvecs
)
428 for (p
= str
; *p
&& i
< nvecs
; p
++) {
438 static int parse_service_state(const char *str
)
440 if (!strcmp(str
, "OK"))
442 if (!strcmp(str
, "WARNING"))
443 return SERVICE_WARNING
;
444 if (!strcmp(str
, "UNKNOWN"))
445 return SERVICE_UNKNOWN
;
446 if (!strcmp(str
, "CRITICAL"))
447 return SERVICE_CRITICAL
;
449 crash("bad value for service state: '%s'", str
);
452 static int parse_host_state(const char *str
)
454 if (!strcmp(str
, "UP"))
456 if (!strcmp(str
, "DOWN"))
458 if (!strcmp(str
, "UNREACHABLE"))
459 return HOST_UNREACHABLE
;
461 crash("bad value for host state: '%s'", str
);
464 static int soft_hard(const char *str
)
466 if (!strcmp(str
, "HARD"))
469 if (!strcmp(str
, "SOFT"))
471 crash("wtf kind of value is '%s' to determin 'soft' or 'hard' from?", str
);
474 static int use_sql
= 1;
475 static hash_table
*interesting_hosts
, *interesting_services
;
476 static int insert_downtime_event(int type
, char *host
, char *service
, int id
)
478 nebstruct_downtime_data ds
;
481 dt_start
+= type
== NEBTYPE_DOWNTIME_START
;
482 dt_stop
+= type
== NEBTYPE_DOWNTIME_STOP
;
483 if (dt_depth
> max_dt_depth
)
484 max_dt_depth
= dt_depth
;
489 memset(&ds
, 0, sizeof(ds
));
492 ds
.timestamp
.tv_sec
= ltime
;
494 ds
.service_description
= service
;
497 result
= hook_downtime(NEBCALLBACK_DOWNTIME_DATA
, (void *)&ds
);
499 crash("Failed to insert downtime:\n type=%d, host=%s, service=%s, id=%d",
500 type
, host
, service
, id
);
505 static int insert_service_check(struct string_code
*sc
)
507 nebstruct_service_check_data ds
;
512 if (interesting_services
&& !hash_find2(interesting_services
, strv
[0], strv
[1]))
515 memset(&ds
, 0, sizeof(ds
));
517 ds
.timestamp
.tv_sec
= ltime
;
519 ds
.host_name
= strv
[0];
520 ds
.service_description
= strv
[1];
521 ds
.state
= parse_service_state(strv
[2]);
522 ds
.state_type
= soft_hard(strv
[3]);
523 ds
.current_attempt
= atoi(strv
[4]);
526 return hook_service_result(NEBCALLBACK_SERVICE_CHECK_DATA
, (void *)&ds
);
529 static int insert_host_check(struct string_code
*sc
)
531 nebstruct_host_check_data ds
;
536 if (interesting_hosts
&& !hash_find(interesting_hosts
, strv
[0]))
539 memset(&ds
, 0, sizeof(ds
));
541 ds
.timestamp
.tv_sec
= ltime
;
543 ds
.host_name
= strv
[0];
544 ds
.state
= parse_host_state(strv
[1]);
545 ds
.state_type
= soft_hard(strv
[2]);
546 ds
.current_attempt
= atoi(strv
[3]);
549 return hook_host_result(NEBCALLBACK_HOST_CHECK_DATA
, (void *)&ds
);
552 static int insert_process_event(int type
)
554 nebstruct_process_data ds
;
559 memset(&ds
, 0, sizeof(ds
));
560 ds
.timestamp
.tv_sec
= ltime
;
562 return hook_process_data(NEBCALLBACK_PROCESS_DATA
, (void *)&ds
);
565 void dt_print(char *tpc
, time_t when
, struct downtime_entry
*dt
)
570 printf("%s: time=%lu started=%lu start=%lu stop=%lu duration=%lu id=%d ",
571 tpc
, when
, dt
->started
, dt
->start
, dt
->stop
, dt
->duration
, dt
->id
);
572 printf("%s", dt
->host
);
574 printf(";%s", dt
->service
);
578 static struct downtime_entry
*last_dte
;
579 static struct downtime_entry
*del_dte
;
581 static void remove_downtime(struct downtime_entry
*dt
);
582 static int del_matching_dt(void *data
)
584 struct downtime_entry
*dt
= data
;
586 if (del_dte
->id
== dt
->id
) {
587 dt_print("ALSO", 0, dt
);
594 static void stash_downtime_command(struct downtime_entry
*dt
)
596 dt
->slot
= dt
->start
% NUM_DENTRIES
;
597 dt
->next
= dentry
[dt
->slot
];
598 dentry
[dt
->slot
] = dt
;
601 static void remove_downtime(struct downtime_entry
*dt
)
603 struct downtime_entry
*old
;
605 insert_downtime_event(NEBTYPE_DOWNTIME_STOP
, dt
->host
, dt
->service
, dt
->id
);
608 old
= hash_remove(host_downtime
, dt
->host
);
610 old
= hash_remove2(service_downtime
, dt
->host
, dt
->service
);
612 dt_print("RM_DT", ltime
, dt
);
616 static struct downtime_entry
*
617 dt_matches_command(struct downtime_entry
*dt
, char *host
, char *service
)
619 for (; dt
; dt
= dt
->next
) {
622 if (ltime
> dt
->stop
|| ltime
< dt
->start
) {
627 case SCHEDULE_SVC_DOWNTIME
:
628 if (service
&& strcmp(service
, dt
->service
))
632 case SCHEDULE_HOST_DOWNTIME
:
633 case SCHEDULE_HOST_SVC_DOWNTIME
:
634 if (strcmp(host
, dt
->host
)) {
638 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
:
639 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
:
640 /* these two have host set in dt, but
641 * it will not match all the possible hosts */
644 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME
:
645 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME
:
646 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
:
647 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
:
650 crash("dt->code not set properly\n");
654 * Once we get here all the various other criteria have
655 * been matched, so we need to check if the daemon was
656 * running when this downtime was supposed to have
657 * started, and otherwise use the daemon start time
658 * as the value to diff against
660 if (daemon_stop
< dt
->start
&& daemon_start
> dt
->start
) {
661 debug("Adjusting dt->start (%lu) to (%lu)\n",
662 dt
->start
, daemon_start
);
663 dt
->start
= daemon_start
;
664 if (dt
->trigger
&& dt
->duration
)
665 dt
->stop
= dt
->start
+ dt
->duration
;
668 diff
= ltime
- dt
->start
;
669 if (diff
< 3 || dt
->trigger
|| !dt
->fixed
)
676 static struct downtime_entry
*
677 find_downtime_command(char *host
, char *service
)
680 struct downtime_entry
*shortcut
= NULL
;
682 if (last_dte
&& last_dte
->start
== ltime
) {
686 for (i
= 0; i
< NUM_DENTRIES
; i
++) {
687 struct downtime_entry
*dt
;
688 dt
= dt_matches_command(dentry
[i
], host
, service
);
690 if (shortcut
&& dt
!= shortcut
)
692 printf("FIND shortcut no good\n");
702 static int print_downtime(void *data
)
704 struct downtime_entry
*dt
= (struct downtime_entry
*)data
;
706 dt_print("UNCLOSED", ltime
, dt
);
711 static inline void set_next_dt_purge(time_t base
, time_t add
)
713 if (!next_dt_purge
|| next_dt_purge
> base
+ add
)
714 next_dt_purge
= base
+ add
;
716 if (next_dt_purge
<= ltime
)
717 next_dt_purge
= ltime
+ 1;
720 static inline void add_downtime(char *host
, char *service
, int id
)
722 struct downtime_entry
*dt
, *cmd
, *old
;
724 dt
= malloc(sizeof(*dt
));
725 cmd
= find_downtime_command(host
, service
);
727 warn("DT with no ext cmd? %lu %s;%s", ltime
, host
, service
);
728 memset(dt
, 0, sizeof(*dt
));
729 dt
->duration
= 7200; /* the default downtime duration in nagios */
731 dt
->stop
= dt
->start
+ dt
->duration
;
734 memcpy(dt
, cmd
, sizeof(*dt
));
736 dt
->host
= strdup(host
);
740 set_next_dt_purge(ltime
, dt
->duration
);
744 old
= hash_update(host_downtime
, dt
->host
, dt
);
747 dt
->service
= strdup(service
);
748 old
= hash_update2(service_downtime
, dt
->host
, dt
->service
, dt
);
751 if (old
&& old
!= dt
) {
758 dt_print("IN_DT", ltime
, dt
);
759 insert_downtime_event(NEBTYPE_DOWNTIME_START
, dt
->host
, dt
->service
, dt
->id
);
762 static int strtotimet(const char *str
, time_t *val
)
766 *val
= strtoul(str
, &endp
, 10);
768 warn("strtotimet(): %s is not a valid time_t\n", str
);
775 static time_t last_host_dt_del
, last_svc_dt_del
;
776 static int register_downtime_command(struct string_code
*sc
)
778 struct downtime_entry
*dt
;
779 char *start_time
, *end_time
, *duration
= NULL
;
780 char *host
= NULL
, *service
= NULL
, *fixed
, *triggered_by
= NULL
;
784 case DEL_HOST_DOWNTIME
:
785 last_host_dt_del
= ltime
;
787 case DEL_SVC_DOWNTIME
:
788 last_svc_dt_del
= ltime
;
791 case SCHEDULE_HOST_DOWNTIME
:
792 if (strtotimet(strv
[5], &foo
))
795 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
:
796 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
:
797 case SCHEDULE_HOST_SVC_DOWNTIME
:
798 host
= strdup(strv
[0]);
800 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME
:
801 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME
:
802 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
:
803 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
:
804 start_time
= strv
[1];
807 if (strtotimet(strv
[5], &foo
))
808 triggered_by
= strv
[4];
814 case SCHEDULE_SVC_DOWNTIME
:
815 host
= strdup(strv
[0]);
816 service
= strdup(strv
[1]);
817 start_time
= strv
[2];
820 if (strtotimet(strv
[6], &foo
)) {
821 triggered_by
= strv
[5];
830 crash("Unknown downtime type: %d", sc
->code
);
833 if (!(dt
= calloc(sizeof(*dt
), 1)))
834 crash("calloc(%d, 1) failed: %s", sizeof(*dt
), strerror(errno
));
838 dt
->host
= strdup(host
);
840 dt
->service
= strdup(service
);
842 dt
->trigger
= triggered_by
? !!(*triggered_by
- '0') : 0;
843 if (strtotimet(start_time
, &dt
->start
) || strtotimet(end_time
, &dt
->stop
))
845 print_strvec(strv
, sc
->nvecs
);
846 crash("strtotime(): type: %s; start_time='%s'; end_time='%s'; duration='%s'; log_version=%d",
847 command_codes
[sc
->code
- 1].str
, start_time
, end_time
, duration
, log_version
);
851 * sometimes downtime commands can be logged according to
852 * log version 1, while the log still claims to be version 2.
853 * Apparently, this happens when using a daemon supporting
854 * version 2 logging but a downtime command is added that
855 * follows the version 1 standard.
856 * As such, we simply ignore the result of the "duration"
857 * field conversion and just accept that it might not work
859 (void)strtotimet(duration
, &dt
->duration
);
860 dt
->fixed
= *fixed
- '0';
863 * ignore downtime scheduled to take place in the future.
864 * It will be picked up by the module anyways
866 if (dt
->start
> time(NULL
)) {
871 if (dt
->duration
> time(NULL
)) {
872 warn("Bizarrely large duration (%lu)", dt
->duration
);
874 if (dt
->start
< ltime
) {
875 if (dt
->duration
&& dt
->duration
> ltime
- dt
->start
)
876 dt
->duration
-= ltime
- dt
->start
;
880 if (dt
->stop
< ltime
|| dt
->stop
< dt
->start
) {
881 /* retroactively scheduled downtime, or just plain wrong */
882 dt
->stop
= dt
->start
;
886 if (dt
->fixed
&& dt
->duration
!= dt
->stop
- dt
->start
) {
887 // warn("duration doesn't match stop - start: (%lu : %lu)",
888 // dt->duration, dt->stop - dt->start);
890 dt
->duration
= dt
->stop
- dt
->start
;
892 else if (dt
->duration
> 86400 * 14) {
893 warn("Oddly long duration: %lu", dt
->duration
);
896 debug("start=%lu; stop=%lu; duration=%lu; fixed=%d; trigger=%d; host=%s service=%s\n",
897 dt
->start
, dt
->stop
, dt
->duration
, dt
->fixed
, dt
->trigger
, dt
->host
, dt
->service
);
899 stash_downtime_command(dt
);
903 static int insert_downtime(struct string_code
*sc
)
906 struct downtime_entry
*dt
= NULL
;
909 char *host
, *service
= NULL
;
912 if (sc
->nvecs
== 4) {
914 dt
= svc_downtime(host
, service
);
917 dt
= host_downtime(host
);
920 * to stop a downtime we can either get STOPPED or
921 * CANCELLED. So far, I've only ever seen STARTED
922 * for when it actually starts though, and since
923 * the Nagios daemon is responsible for launching
924 * it, it's unlikely there are more variants of
927 type
= NEBTYPE_DOWNTIME_STOP
;
928 if (!strcmp(strv
[sc
->nvecs
- 2], "STARTED"))
929 type
= NEBTYPE_DOWNTIME_START
;
932 case NEBTYPE_DOWNTIME_START
:
934 if (!probably_ignore_downtime
)
935 dt_print("ALRDY", ltime
, dt
);
939 if (probably_ignore_downtime
)
940 debug("Should probably ignore this downtime: %lu : %lu %s;%s\n",
941 probably_ignore_downtime
, ltime
, host
, service
);
943 if (ltime
- last_downtime_start
> 1)
947 add_downtime(host
, service
, id
);
948 last_downtime_start
= ltime
;
951 case NEBTYPE_DOWNTIME_STOP
:
954 * this can happen when overlapping downtime entries
955 * occur, and the start event for the second (or nth)
956 * downtime starts before the first downtime has had
957 * a stop event. It basically means we've almost
958 * certainly done something wrong.
960 //printf("no dt. ds.host_name == '%s'\n", ds.host_name);
961 //fprintf(stderr, "CRASHING: %s;%s\n", ds.host_name, ds.service_description);
962 //crash("DOWNTIME_STOP without matching DOWNTIME_START");
966 dt_del_cmd
= !dt
->service
? last_host_dt_del
: last_svc_dt_del
;
968 if ((ltime
- dt_del_cmd
) > 1 && dt
->duration
- (ltime
- dt
->started
) > 60) {
969 debug("Short dt duration (%lu) for %s;%s (dt->duration=%lu)\n",
970 ltime
- dt
->started
, dt
->host
, dt
->service
, dt
->duration
);
972 if (ltime
- dt
->started
> dt
->duration
+ DT_PURGE_GRACETIME
)
973 dt_print("Long", ltime
, dt
);
977 * Now delete whatever matching downtimes we can find.
978 * this must be here, or we'll recurse like crazy into
979 * remove_downtime(), possibly exhausting the stack
984 hash_walk_data(host_downtime
, del_matching_dt
);
986 hash_walk_data(service_downtime
, del_matching_dt
);
996 static int dt_purged
;
997 static int purge_expired_dt(void *data
)
999 struct downtime_entry
*dt
= data
;
1005 if (ltime
+ DT_PURGE_GRACETIME
> dt
->stop
) {
1007 debug("PURGE %lu: purging expired dt %d (start=%lu; started=%lu; stop=%lu; duration=%lu; host=%s; service=%s",
1008 ltime
, dt
->id
, dt
->start
, dt
->started
, dt
->stop
, dt
->duration
, dt
->host
, dt
->service
);
1009 remove_downtime(dt
);
1012 dt_print("PURGED_NOT_TIME", ltime
, dt
);
1015 set_next_dt_purge(dt
->started
, dt
->duration
);
1020 static int purged_downtimes
;
1021 static void purge_expired_downtime(void)
1027 hash_walk_data(host_downtime
, purge_expired_dt
);
1029 debug("PURGE %d host downtimes purged", dt_purged
);
1030 tot_purged
+= dt_purged
;
1032 hash_walk_data(service_downtime
, purge_expired_dt
);
1034 debug("PURGE %d service downtimes purged", dt_purged
);
1035 tot_purged
+= dt_purged
;
1037 debug("PURGE total %d entries purged", tot_purged
);
1040 debug("PURGE next downtime purge supposed to run @ %lu, in %lu seconds",
1041 next_dt_purge
, next_dt_purge
- ltime
);
1043 purged_downtimes
+= tot_purged
;
1046 static int parse_line(char *line
, size_t len
)
1050 struct string_code
*sc
;
1051 static time_t last_ltime
= 0;
1055 imported
+= len
+ 1; /* make up for 1 lost byte per newline */
1056 if (++lines_since_progress
>= PROGRESS_INTERVAL
)
1059 /* skip obviously bogus lines */
1060 if (len
< 12 || *line
!= '[') {
1061 warn("line %d; len too short, or line doesn't start with '[' (%s)", line_no
, line
);
1065 ltime
= strtoul(line
+ 1, &ptr
, 10);
1066 if (line
+ 1 == ptr
) {
1067 crash("Failed to parse log timestamp from '%s'. I can't handle malformed logdata", line
);
1071 if (ltime
< last_ltime
) {
1072 // warn("ltime < last_ltime (%lu < %lu) by %lu. Compensating...",
1073 // ltime, last_ltime, last_ltime - ltime);
1080 * Incremental will be 0 if not set, or 1 if set but
1081 * the database is currently empty.
1082 * Note that this will not always do the correct thing,
1083 * as downtime entries that might have been scheduled for
1084 * purging may never show up as "stopped" in the database
1085 * with this scheme. As such, incremental imports absolutely
1086 * require that nothing is in scheduled downtime when the
1087 * import is running (well, started really, but it amounts
1088 * to the same thing).
1090 if (ltime
< incremental
)
1093 if (next_dt_purge
&& ltime
>= next_dt_purge
)
1094 purge_expired_downtime();
1096 if (probably_ignore_downtime
&& ltime
- probably_ignore_downtime
> 1)
1097 probably_ignore_downtime
= 0;
1099 while (*ptr
== ']' || *ptr
== ' ')
1102 if (!(colon
= strchr(ptr
, ':'))) {
1103 if (!is_interesting(ptr
))
1106 /* stupid heuristic, but might be good for something,
1107 * somewhere, sometime. if nothing else, it should suppress
1108 * annoying output */
1109 if (is_start_event(ptr
)) {
1110 if (!daemon_is_running
)
1111 insert_process_event(NEBTYPE_PROCESS_START
);
1113 probably_ignore_downtime
= daemon_start
= ltime
;
1114 daemon_is_running
= 1;
1117 if (is_stop_event(ptr
)) {
1118 if (daemon_is_running
) {
1119 insert_process_event(NEBTYPE_PROCESS_SHUTDOWN
);
1120 daemon_is_running
= 0;
1122 daemon_stop
= ltime
;
1127 * An unhandled event. We should probably crash here
1129 handle_unknown_event(line
);
1133 /* an event happened without us having gotten a start-event */
1134 if (!daemon_is_running
) {
1135 insert_process_event(NEBTYPE_PROCESS_START
);
1136 daemon_start
= ltime
;
1137 daemon_is_running
= 1;
1140 if (!(sc
= get_event_type(ptr
, colon
- ptr
))) {
1141 handle_unknown_event(line
);
1145 if (sc
->code
== IGNORE_LINE
)
1156 nvecs
= vectorize_string(ptr
, sc
->nvecs
);
1158 if (nvecs
!= sc
->nvecs
) /* broken line */
1159 crash("It appears we ran into a broken line (1)");
1161 for (i
= 0; i
< sc
->nvecs
; i
++)
1163 crash("It appears we ran into a broken line");
1170 set_log_version(*ptr
- '0');
1172 case NEBTYPE_EXTERNALCOMMAND_END
:
1173 semi_colon
= strchr(ptr
, ';');
1176 if (!(sc
= get_command_type(ptr
, semi_colon
- ptr
))) {
1180 nvecs
= vectorize_string(semi_colon
+ 1, sc
->nvecs
);
1181 if (nvecs
!= sc
->nvecs
) {
1182 warn("nvecs discrepancy: %d vs %d (%s)\n", nvecs
, sc
->nvecs
, ptr
);
1184 register_downtime_command(sc
);
1187 case NEBTYPE_HOSTCHECK_PROCESSED
:
1188 return insert_host_check(sc
);
1190 case NEBTYPE_SERVICECHECK_PROCESSED
:
1191 return insert_service_check(sc
);
1193 case NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_HOST
:
1194 case NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_SERVICE
:
1195 return insert_downtime(sc
);
1205 static size_t bufsize
;
1207 static void lparse_file(const char *path
, size_t size
, int (*parse
)(char *, size_t))
1212 /* zero size files are never interesting */
1216 if ((fd
= open(path
, O_RDONLY
)) < 0)
1217 crash("open %s failed: %s", path
, strerror(errno
));
1222 if (!buf
&& !(buf
= calloc(bufsize
+ 1, 1)))
1223 crash("Failed to allocate %u bytes\n", bufsize
+ 1);
1225 if (read(fd
, buf
, size
) != size
)
1226 crash("partial read, or read of %s faied: %s", path
, strerror(errno
));
1229 /* enforce a newline at end of buffer */
1233 while ((next
= strchr(cur
, '\n'))) {
1234 size_t len
= next
- cur
;
1237 if (cur
>= &buf
[size
])
1239 if (parse(cur
, len
) && use_sql
&& sql_errno())
1240 crash("sql error: %s", sql_error());
1246 * Returns an increasing numeric value for a nagios logfile
1247 * For a file with a name such as:
1248 * nagios-12-01-2002-00.log
1253 static size_t path_cmp_number(char *path
)
1255 size_t ret
, len
= strlen(path
);
1258 unsigned long part
[NUM_PARTS
];
1260 if (len
< 18 || strcmp(&path
[len
- 4], ".log"))
1262 dash
= strrchr(path
, '/');
1268 * we special-case nagios.log as always being the
1269 * last file to be parsed. It has to be, since it's
1270 * the currently active logfile
1272 if (!strcmp(dash
, "nagios.log") || num_nfile
== 1)
1275 for (i
= 0; i
< NUM_PARTS
; i
++) {
1278 dash
= strchr(dash
, '-');
1280 crash("dash is not");
1283 part
[i
] = strtoul(dash
, &endp
, 10);
1284 if (!part
[i
] && dash
== endp
)
1290 if (part
[0] < 1 || part
[0] > 12)
1292 if (part
[1] < 1 || part
[1] > 31)
1294 if (part
[2] < 2000 || part
[2] > 2008)
1296 ret
= part
[2] * 1000000;
1297 ret
+= part
[0] * 10000;
1298 ret
+= part
[1] * 100;
1304 static void first_log_time(struct naglog_file
*nf
)
1310 if (!(fd
= open(nf
->path
, O_RDONLY
)))
1311 crash("Failed to open %s: %s", nf
->path
, strerror(errno
));
1314 * since we're looking at every file in here anyway,
1315 * we also determine the size of them so we can do an
1316 * arena allocation large enough to fit the largest
1317 * file + an added newline later
1319 if (fstat(fd
, &st
) < 0)
1320 crash("Failed to stat %s: %s", nf
->path
, strerror(errno
));
1321 if (bufsize
<= st
.st_size
)
1322 bufsize
= st
.st_size
+ 1;
1324 nf
->size
= st
.st_size
;
1326 if (read(fd
, buf
, sizeof(buf
)) != sizeof(buf
))
1327 crash("Incomplete read of %s", nf
->path
);
1329 buf
[sizeof(buf
) - 1] = 0;
1330 if (!(nf
->first
= strtoul(buf
+ 1, NULL
, 10)))
1331 crash("'%s' has no timestamp for us to parse", buf
);
1333 nf
->cmp
= path_cmp_number(nf
->path
);
1337 int nfile_cmp(const void *p1
, const void *p2
)
1339 const struct naglog_file
*a
= p1
;
1340 const struct naglog_file
*b
= p2
;
1342 if (a
->first
> b
->first
)
1344 if (b
->first
> a
->first
)
1347 if (a
->cmp
> b
->cmp
)
1349 if (b
->cmp
> a
->cmp
)
1352 crash("Two files with same 'first' and 'cmp'? Bizarre...");
1359 * hashes one line from an "interesting"-file. We use (void *)1
1360 * to mark this as "present in hash-table" as we have no real
1361 * data to lookup but still want hash_find{,2} to return non-NULL
1362 * when it finds a match
1364 static int hash_one_line(char *line
, size_t len
)
1368 p
= strchr(line
, ';');
1371 hash_add2(interesting_services
, line
, p
, (void *)1);
1374 hash_add(interesting_hosts
, line
, (void *)1);
1379 static int hash_interesting(const char *path
)
1383 if (stat(path
, &st
) < 0) {
1384 crash("failed to stat %s: %s", path
, strerror(errno
));
1387 interesting_hosts
= hash_init(st
.st_size
/ 20);
1388 interesting_services
= hash_init(st
.st_size
/ 20);
1389 lparse_file(path
, st
.st_size
, hash_one_line
);
1395 extern const char *__progname
;
1396 int main(int argc
, char **argv
)
1398 int i
, truncate_db
= 0;
1399 struct naglog_file
*nfile
;
1400 char *db_name
= "monitor_reports";
1401 char *db_user
= "monitor";
1402 char *db_pass
= "monitor";
1403 char *db_table
= "report_data";
1405 do_progress
= isatty(fileno(stdout
));
1407 strv
= calloc(sizeof(char *), MAX_NVECS
);
1408 nfile
= calloc(sizeof(*nfile
), argc
- 1);
1409 dentry
= calloc(sizeof(*dentry
), NUM_DENTRIES
);
1410 if (!strv
|| !nfile
|| !dentry
)
1411 crash("Failed to alloc initial structs");
1414 for (num_nfile
= 0,i
= 1; i
< argc
; i
++) {
1415 char *opt
, *arg
= argv
[i
];
1416 struct naglog_file
*nf
;
1419 if ((opt
= strchr(arg
, '='))) {
1423 else if (i
< argc
- 1) {
1427 if (!prefixcmp(arg
, "--incremental")) {
1431 if (!prefixcmp(arg
, "--no-sql")) {
1435 if (!prefixcmp(arg
, "--no-progress")) {
1439 if (!prefixcmp(arg
, "--debug") || !prefixcmp(arg
, "-d")) {
1444 if (!prefixcmp(arg
, "--truncate-db")) {
1448 if (!prefixcmp(arg
, "--db-name")) {
1450 crash("%s requires a database name as an argument", arg
);
1456 if (!prefixcmp(arg
, "--db-user")) {
1458 crash("%s requires a database username as argument", arg
);
1464 if (!prefixcmp(arg
, "--db-pass")) {
1466 crash("%s requires a database username as argument", arg
);
1472 if (!prefixcmp(arg
, "--db-table")) {
1474 crash("%s requires a database table name as argument", arg
);
1480 if (!prefixcmp(arg
, "--interesting") || !prefixcmp(arg
, "-i")) {
1482 crash("%s requires a filename as argument", arg
);
1483 hash_interesting(opt
);
1489 /* non-argument, so treat as file */
1490 nf
= &nfile
[num_nfile
++];
1493 totsize
+= nf
->size
;
1497 sql_config("db_database", db_name
);
1498 sql_config("db_user", db_user
);
1499 sql_config("db_pass", db_pass
);
1500 sql_config("db_table", db_table
);
1503 crash("sql_init() failed");
1505 sql_query("TRUNCATE %s", sql_table_name());
1510 sql_query("SELECT timestamp FROM %s.%s ORDER BY timestamp DESC LIMIT 1",
1513 if (!(result
= sql_get_result()))
1514 crash("Failed to get last timestamp: %s\n", sql_error());
1516 /* someone might use --incremental with an empty
1517 * database. We shouldn't crash in that case */
1518 if ((row
= sql_fetch_row(result
)))
1519 incremental
= strtoul(row
[0], NULL
, 0);
1521 sql_free_result(result
);
1524 * We lock the table we'll be working with and disable
1525 * indexes on it. Otherwise doing the actual inserts
1526 * will take just about forever, as MySQL has to update
1527 * and flush the index cache between each operation.
1529 if (sql_query("ALTER TABLE %s DISABLE KEYS", sql_table_name()))
1530 crash("Failed to disable keys: %s", sql_error());
1531 if (sql_query("LOCK TABLES %s WRITE", sql_table_name()))
1532 crash("Failed to lock report_data table: %s", sql_error());
1535 log_grok_var("logfile", "/dev/null");
1536 log_grok_var("log_levels", "warn");
1539 crash("Usage: %s [--incremental] [--interesting <file>] logfiles\n",
1543 crash("log_init() failed");
1545 qsort(nfile
, num_nfile
, sizeof(*nfile
), nfile_cmp
);
1547 if (!(buf
= calloc(bufsize
+ 1, 1)))
1548 crash("Failed to malloc %u bytes for buffer: %s",
1549 bufsize
+ 1, strerror(errno
));
1551 host_downtime
= hash_init(HASH_TABLE_SIZE
);
1552 service_downtime
= hash_init(HASH_TABLE_SIZE
);
1554 if (hook_init() < 0)
1555 crash("Failed to initialize hooks");
1557 start_time
= time(NULL
);
1558 printf("Importing %s of data from %d files\n",
1559 tobytes(totsize
), num_nfile
);
1561 for (i
= 0; i
< num_nfile
; i
++) {
1562 struct naglog_file
*nf
= &nfile
[i
];
1566 debug("importing from %s (%lu : %u)\n", nf
->path
, nf
->first
, nf
->cmp
);
1568 lparse_file(nf
->path
, nf
->size
, parse_line
);
1569 imported
++; /* make up for one lost byte per file */
1576 printf("Unclosed host downtimes:\n");
1577 puts("------------------------");
1578 hash_walk_data(host_downtime
, print_downtime
);
1579 printf("Unclosed service downtimes:\n");
1580 puts("---------------------------");
1581 hash_walk_data(service_downtime
, print_downtime
);
1583 printf("dt_depth: %d\n", dt_depth
);
1585 printf("purged downtimes: %d\n", purged_downtimes
);
1586 printf("max simultaneous host downtime hashes: %u\n", host_downtime
->max_entries
);
1587 printf("max simultaneous service downtime hashes: %u\n", service_downtime
->max_entries
);
1588 printf("max downtime depth: %d\n", max_dt_depth
);
1595 unsigned long entries
;
1597 sql_query("SELECT id FROM report_data ORDER BY id DESC LIMIT 1");
1598 if (!(res
= sql_get_result()))
1601 row
= sql_fetch_row(res
);
1602 entries
= strtoul(row
[0], NULL
, 0);
1603 sql_free_result(res
);
1606 signal(SIGINT
, SIG_IGN
);
1607 sql_query("UNLOCK TABLES");
1609 printf("Creating sql table indexes. This will likely take ~%lu seconds\n",
1610 (entries
/ 50000) + 1);
1611 sql_query("ALTER TABLE %s ENABLE KEYS", sql_table_name());
1612 printf("%lu database entries indexed in %lu seconds\n",
1613 entries
, time(NULL
) - start
);
1617 if (warnings
&& debug_level
)
1618 fprintf(stderr
, "Total warnings: %d\n", warnings
);
1620 if (debug_level
|| dt_start
!= dt_stop
)
1621 fprintf(stderr
, "Downtime data %s\n started: %d\n stopped: %d\n",
1622 dt_depth
? "mismatch!" : "consistent", dt_start
, dt_stop
);
1623 if (hash_check_table(host_downtime
))
1624 fprintf(stderr
, "Hash table inconsistencies for host_downtime\n");
1625 if (hash_check_table(service_downtime
))
1626 fprintf(stderr
, "Hash table inconsistencies for service_downtime\n");
1628 print_unhandled_events();