5 #include "nagios/broker.h"
6 #include "nagios/nebcallbacks.h"
17 #define CONCERNS_HOST 50
18 #define CONCERNS_SERVICE 60
21 #define HASH_TABLE_SIZE 128
23 /* for some reason these aren't defined inside Nagios' headers */
25 #define SERVICE_WARNING 1
26 #define SERVICE_CRITICAL 2
27 #define SERVICE_UNKNOWN 3
29 #define PROGRESS_INTERVAL 25000 /* lines to parse between progress updates */
32 static int only_notifications
;
33 static unsigned long long imported
, totsize
, totlines
, skipped
;
34 static int lines_since_progress
, do_progress
, list_files
;
35 static struct timeval import_start
;
36 static time_t daemon_start
, daemon_stop
, incremental
;
37 static int daemon_is_running
;
38 static uint max_dt_depth
, skipped_files
;
40 static time_t next_dt_purge
; /* when next to purge expired downtime */
41 #define DT_PURGE_GRACETIME 300 /* seconds to add to next_dt_purge */
43 static time_t ltime
; /* the timestamp from the current log-line */
45 static int dt_start
, dt_stop
, dt_skip
;
46 #define dt_depth (dt_start - dt_stop)
47 static hash_table
*host_downtime
;
48 static hash_table
*service_downtime
;
49 static int downtime_id
;
50 static time_t probably_ignore_downtime
;
52 struct downtime_entry
{
66 struct downtime_entry
*next
;
69 #define NUM_DENTRIES 1024
70 static struct downtime_entry
**dentry
;
71 static time_t last_downtime_start
;
73 static struct string_code event_codes
[] = {
75 add_ignored("Warning"),
76 add_ignored("LOG ROTATION"),
77 add_ignored("HOST FLAPPING ALERT"),
78 add_ignored("SERVICE FLAPPING ALERT"),
79 add_ignored("SERVICE EVENT HANDLER"),
80 add_ignored("HOST EVENT HANDLER"),
81 add_ignored("LOG VERSION"),
83 add_code(5, "HOST NOTIFICATION", NEBTYPE_NOTIFICATION_END
+ CONCERNS_HOST
),
84 add_code(6, "SERVICE NOTIFICATION", NEBTYPE_NOTIFICATION_END
+ CONCERNS_SERVICE
),
85 add_code(3, "PASSIVE HOST CHECK", NEBTYPE_HOSTCHECK_PROCESSED
),
86 add_code(4, "PASSIVE SERVICE CHECK", NEBTYPE_SERVICECHECK_PROCESSED
),
87 add_code(0, "EXTERNAL COMMAND", NEBTYPE_EXTERNALCOMMAND_END
),
88 add_code(5, "HOST ALERT", NEBTYPE_HOSTCHECK_PROCESSED
),
89 add_code(5, "INITIAL HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED
),
90 add_code(5, "CURRENT HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED
),
91 add_code(6, "SERVICE ALERT", NEBTYPE_SERVICECHECK_PROCESSED
),
92 add_code(6, "INITIAL SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED
),
93 add_code(6, "CURRENT SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED
),
94 add_code(3, "HOST DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_HOST
),
95 add_code(4, "SERVICE DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_SERVICE
),
99 static struct string_code command_codes
[] = {
100 add_cdef(1, DEL_HOST_DOWNTIME
),
101 add_cdef(1, DEL_SVC_DOWNTIME
),
102 add_cdef(8, SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
),
103 add_cdef(8, SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
),
104 add_cdef(8, SCHEDULE_HOSTGROUP_HOST_DOWNTIME
),
105 add_cdef(8, SCHEDULE_HOSTGROUP_SVC_DOWNTIME
),
106 add_cdef(8, SCHEDULE_HOST_DOWNTIME
),
107 add_cdef(8, SCHEDULE_HOST_SVC_DOWNTIME
),
108 add_cdef(8, SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
),
109 add_cdef(8, SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
),
110 add_cdef(8, SCHEDULE_SVC_DOWNTIME
),
113 * These really have one more field than listed here. We omit one
114 * to make author and comment concatenated with a semi-colon by default.
116 add_cdef(6, ACKNOWLEDGE_SVC_PROBLEM
),
117 add_cdef(5, ACKNOWLEDGE_HOST_PROBLEM
),
/*
 * Debug helper: print the first n entries of the string vector v to
 * stdout, one per line, each prefixed with its index.
 */
static inline void print_strvec(char **v, int n)
{
	int idx = 0;

	while (idx < n) {
		printf("v[%2d]: %s\n", idx, v[idx]);
		idx++;
	}
}
/*
 * Format a byte count as a short human-readable string.
 *
 * Values below 1024 are printed as "<n> bytes"; larger values are scaled
 * to the largest binary unit (KiB, MiB, GiB, TiB, PiB) that keeps the
 * number at or above 1.00 and printed with two decimals.
 *
 * The result lives in one of two alternating static buffers, so up to two
 * results may be used in the same printf() call. A third call overwrites
 * the oldest result. Not reentrant.
 */
static const char *tobytes(unsigned long long n)
{
	/* an array (not a pointer) so that sizeof() yields the string size */
	static const char suffix[] = "KMGTP";
	static char tbuf[2][30];
	static int t = 0;
	unsigned int shift = 1;

	t ^= 1;
	if (n < 1024) {
		snprintf(tbuf[t], sizeof(tbuf[t]), "%llu bytes", n);
		return tbuf[t];
	}

	/*
	 * Check the bound first so we never step past the last suffix, and
	 * use >= so exact multiples move up a unit ("1.00 MiB" rather than
	 * "1024.00 KiB").
	 */
	while (shift < sizeof(suffix) - 1 && (n >> (shift * 10)) >= 1024)
		shift++;

	/* the divisor must be 64 bits wide: shift * 10 reaches 50 for PiB */
	snprintf(tbuf[t], sizeof(tbuf[t]), "%0.2f %ciB",
			(double)n / (double)(1ULL << (shift * 10)), suffix[shift - 1]);

	return tbuf[t];
}
/*
 * Format the time elapsed between *start and *stop as a human-readable
 * string such as "12.345s", "3m 2.000s", "1h 0m 5.250s" or
 * "2d 3h 4m 5.000s". Leading units that are zero are omitted.
 *
 * The whole interval is first converted to microseconds, so a stop
 * timestamp whose tv_usec is smaller than start's borrows correctly from
 * the seconds instead of producing a negative seconds field. A negative
 * interval (stop before start) is reported as zero.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call. Not reentrant.
 */
static const char *tv_delta(struct timeval *start, struct timeval *stop)
{
	static char buf[50];
	const long long usec_per_sec = 1000000LL;
	long long usecs;
	unsigned int days, hours, mins;
	double secs;

	usecs = ((long long)stop->tv_sec - (long long)start->tv_sec) * usec_per_sec
		+ ((long long)stop->tv_usec - (long long)start->tv_usec);
	if (usecs < 0)
		usecs = 0;

	days = (unsigned int)(usecs / (86400LL * usec_per_sec));
	usecs %= 86400LL * usec_per_sec;
	hours = (unsigned int)(usecs / (3600LL * usec_per_sec));
	usecs %= 3600LL * usec_per_sec;
	mins = (unsigned int)(usecs / (60LL * usec_per_sec));
	usecs %= 60LL * usec_per_sec;

	/* what remains is less than a minute, including the fraction */
	secs = (double)usecs / (double)usec_per_sec;

	if (!mins && !hours && !days) {
		snprintf(buf, sizeof(buf), "%.3lfs", secs);
	} else if (!hours && !days) {
		snprintf(buf, sizeof(buf), "%um %.3lfs", mins, secs);
	} else if (!days) {
		snprintf(buf, sizeof(buf), "%uh %um %.3lfs", hours, mins, secs);
	} else {
		snprintf(buf, sizeof(buf), "%ud %uh %um %.3lfs", days, hours, mins, secs);
	}

	return buf;
}
183 static void show_progress(void)
186 float pct_done
, real_pct_done
;
188 totlines
+= lines_since_progress
;
189 lines_since_progress
= 0;
194 elapsed
= time(NULL
) - import_start
.tv_sec
;
198 real_pct_done
= (float)imported
/ (float)(totsize
- skipped
) * 100;
199 pct_done
= ((float)(imported
+ skipped
) / (float)totsize
) * 100;
200 eta
= (elapsed
/ real_pct_done
) * (100.0 - real_pct_done
);
202 printf("Importing data: %.2f%% (%s) done ",
203 pct_done
, tobytes(imported
+ skipped
));
207 printf("%lum%lus", eta
/ 60, eta
% 60);
215 static void end_progress(void)
222 gettimeofday(&tv
, NULL
);
225 * If any of the logfiles doesn't have a newline
226 * at end of file, imported will be slightly off.
227 * We set it hard here so as to make sure that
228 * the final progress output stops at exactly 100%
230 imported
= totsize
- skipped
;
234 printf("%s, %llu lines imported in %s.",
235 tobytes(totsize
), totlines
, tv_delta(&import_start
, &tv
));
237 printf(" %s in %u files skipped.", tobytes(skipped
), skipped_files
);
241 static int use_sql
= 1, indexes_disabled
;
242 static void disable_indexes(void)
244 if (indexes_disabled
)
248 * if we're more than 95% done before inserting anything,
249 * such as might be the case when running an incremental
250 * import, we might as well not bother with disabling
251 * the indexes, since enabling them again can take quite
254 if (((float)(skipped
+ imported
) / (float)totsize
) * 100 >= 95.0)
258 * We lock the table we'll be working with and disable
259 * indexes on it. Otherwise doing the actual inserts
260 * will take just about forever, as MySQL has to update
261 * and flush the index cache between each operation.
263 if (sql_query("ALTER TABLE %s DISABLE KEYS", sql_table_name()))
264 crash("Failed to disable keys: %s", sql_error());
265 if (sql_query("LOCK TABLES %s WRITE", sql_table_name()))
266 crash("Failed to lock table %s: %s", sql_table_name(), sql_error());
268 indexes_disabled
= 1;
271 static void enable_indexes(void)
275 unsigned long entries
;
278 /* if we haven't disabled the indexes we can quit early */
279 if (!indexes_disabled
)
282 sql_query("SELECT id FROM %s ORDER BY id DESC LIMIT 1", sql_table_name());
283 if (!(res
= sql_get_result()))
286 row
= sql_fetch_row(res
);
287 entries
= strtoul(row
[0], NULL
, 0);
288 sql_free_result(res
);
291 signal(SIGINT
, SIG_IGN
);
292 sql_query("UNLOCK TABLES");
294 printf("Creating sql table indexes. This will likely take ~%lu seconds\n",
295 (entries
/ 50000) + 1);
296 sql_query("ALTER TABLE %s ENABLE KEYS", sql_table_name());
297 printf("%lu database entries indexed in %lu seconds\n",
298 entries
, time(NULL
) - start
);
301 static int insert_downtime_event(int type
, char *host
, char *service
, int id
)
303 nebstruct_downtime_data ds
;
306 if (!is_interesting_service(host
, service
))
309 dt_start
+= type
== NEBTYPE_DOWNTIME_START
;
310 dt_stop
+= type
== NEBTYPE_DOWNTIME_STOP
;
311 if (dt_depth
> max_dt_depth
)
312 max_dt_depth
= dt_depth
;
314 if (!use_sql
|| only_notifications
)
317 memset(&ds
, 0, sizeof(ds
));
320 ds
.timestamp
.tv_sec
= ltime
;
322 ds
.service_description
= service
;
326 result
= hook_downtime(NEBCALLBACK_DOWNTIME_DATA
, (void *)&ds
);
328 crash("Failed to insert downtime:\n type=%d, host=%s, service=%s, id=%d",
329 type
, host
, service
, id
);
/*
 * Result of parsing the notification field of a logged
 * HOST/SERVICE NOTIFICATION line.
 */
typedef struct import_notification {
	int type;   /* HOST_NOTIFICATION or SERVICE_NOTIFICATION */
	int reason; /* value returned by parse_notification_reason() */
	int state;  /* value returned by parse_{host,service}_state_gently() */
} import_notification;
338 static int parse_import_notification(char *str
, import_notification
*n
)
340 char *state_str
= str
;
342 n
->reason
= parse_notification_reason(str
);
343 if (n
->reason
!= NOTIFICATION_NORMAL
) {
346 space
= strchr(str
, ' ');
349 paren
= strchr(space
, ')');
354 state_str
= space
+ 2;
357 n
->type
= SERVICE_NOTIFICATION
;
358 n
->state
= parse_service_state_gently(state_str
);
360 n
->type
= HOST_NOTIFICATION
;
361 n
->state
= parse_host_state_gently(state_str
);
367 static int insert_notification(struct string_code
*sc
)
371 struct import_notification n
;
373 if (!only_notifications
)
376 if (sc
->code
- NEBTYPE_NOTIFICATION_END
== CONCERNS_SERVICE
) {
383 if (parse_import_notification(strv
[base_idx
+ 2], &n
) < 0) {
384 handle_unknown_event(strv
[base_idx
+ 2]);
393 ("INSERT INTO %s.%s("
394 "notification_type, start_time, end_time, contact_name, "
395 "host_name, service_description, "
396 "command_name, output, "
397 "state, reason_type) "
399 "%d, %lu, %lu, '%s', "
403 sql_db_name(), sql_table_name(),
404 n
.type
, ltime
, ltime
, sql_escape(strv
[0]),
405 sql_escape(strv
[1]), desc
? sql_escape(desc
) : "",
406 sql_escape(strv
[base_idx
+ 3]), sql_escape(strv
[base_idx
+ 4]),
410 static int insert_service_check(struct string_code
*sc
)
412 nebstruct_service_check_data ds
;
414 if (!is_interesting_service(strv
[0], strv
[1]))
417 memset(&ds
, 0, sizeof(ds
));
419 ds
.timestamp
.tv_sec
= ltime
;
421 ds
.host_name
= strv
[0];
422 ds
.service_description
= strv
[1];
423 if (sc
->nvecs
== 4) {
424 /* passive service check result */
425 if (*strv
[2] >= '0' && *strv
[2] <= '9')
426 ds
.state
= atoi(strv
[2]);
428 ds
.state
= parse_service_state(strv
[2]);
429 ds
.state_type
= HARD_STATE
;
430 ds
.current_attempt
= 1;
433 ds
.state
= parse_service_state(strv
[2]);
434 ds
.state_type
= soft_hard(strv
[3]);
435 ds
.current_attempt
= atoi(strv
[4]);
439 if (!use_sql
|| only_notifications
)
443 return hook_service_result(NEBCALLBACK_SERVICE_CHECK_DATA
, (void *)&ds
);
446 static int insert_host_check(struct string_code
*sc
)
448 nebstruct_host_check_data ds
;
450 if (!is_interesting_host(strv
[0]))
453 memset(&ds
, 0, sizeof(ds
));
455 ds
.timestamp
.tv_sec
= ltime
;
457 ds
.host_name
= strv
[0];
458 if (sc
->nvecs
== 3) {
459 if (*strv
[1] >= '0' && *strv
[1] <= '9')
460 ds
.state
= atoi(strv
[1]);
462 ds
.state
= parse_host_state(strv
[1]);
463 /* passive host check result */
465 ds
.current_attempt
= 1;
466 ds
.state_type
= HARD_STATE
;
468 ds
.state
= parse_host_state(strv
[1]);
469 ds
.state_type
= soft_hard(strv
[2]);
470 ds
.current_attempt
= atoi(strv
[3]);
474 if (!use_sql
|| only_notifications
)
478 return hook_host_result(NEBCALLBACK_HOST_CHECK_DATA
, (void *)&ds
);
481 static int insert_process_event(int type
)
483 nebstruct_process_data ds
;
485 if (!use_sql
|| only_notifications
)
488 memset(&ds
, 0, sizeof(ds
));
489 ds
.timestamp
.tv_sec
= ltime
;
492 return hook_process_data(NEBCALLBACK_PROCESS_DATA
, (void *)&ds
);
495 static int insert_acknowledgement(struct string_code
*sc
)
500 static void dt_print(char *tpc
, time_t when
, struct downtime_entry
*dt
)
505 printf("%s: time=%lu started=%lu start=%lu stop=%lu duration=%lu id=%d ",
506 tpc
, when
, dt
->started
, dt
->start
, dt
->stop
, dt
->duration
, dt
->id
);
507 printf("%s", dt
->host
);
509 printf(";%s", dt
->service
);
513 static struct downtime_entry
*last_dte
;
514 static struct downtime_entry
*del_dte
;
516 static void remove_downtime(struct downtime_entry
*dt
);
517 static int del_matching_dt(void *data
)
519 struct downtime_entry
*dt
= data
;
521 if (del_dte
->id
== dt
->id
) {
522 dt_print("ALSO", 0, dt
);
524 return HASH_WALK_REMOVE
;
530 static void stash_downtime_command(struct downtime_entry
*dt
)
532 dt
->slot
= dt
->start
% NUM_DENTRIES
;
533 dt
->next
= dentry
[dt
->slot
];
534 dentry
[dt
->slot
] = dt
;
537 static void remove_downtime(struct downtime_entry
*dt
)
539 if (!is_interesting_service(dt
->host
, dt
->service
))
542 insert_downtime_event(NEBTYPE_DOWNTIME_STOP
, dt
->host
, dt
->service
, dt
->id
);
544 dt_print("RM_DT", ltime
, dt
);
548 static struct downtime_entry
*
549 dt_matches_command(struct downtime_entry
*dt
, char *host
, char *service
)
551 for (; dt
; dt
= dt
->next
) {
554 if (ltime
> dt
->stop
|| ltime
< dt
->start
) {
559 case SCHEDULE_SVC_DOWNTIME
:
560 if (service
&& strcmp(service
, dt
->service
))
564 case SCHEDULE_HOST_DOWNTIME
:
565 case SCHEDULE_HOST_SVC_DOWNTIME
:
566 if (strcmp(host
, dt
->host
)) {
570 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
:
571 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
:
572 /* these two have host set in dt, but
573 * it will not match all the possible hosts */
576 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME
:
577 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME
:
578 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
:
579 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
:
582 crash("dt->code not set properly\n");
586 * Once we get here all the various other criteria have
587 * been matched, so we need to check if the daemon was
588 * running when this downtime was supposed to have
589 * started, and otherwise use the daemon start time
590 * as the value to diff against
592 if (daemon_stop
< dt
->start
&& daemon_start
> dt
->start
) {
593 debug("Adjusting dt->start (%lu) to (%lu)\n",
594 dt
->start
, daemon_start
);
595 dt
->start
= daemon_start
;
596 if (dt
->trigger
&& dt
->duration
)
597 dt
->stop
= dt
->start
+ dt
->duration
;
600 diff
= ltime
- dt
->start
;
601 if (diff
< 3 || dt
->trigger
|| !dt
->fixed
)
608 static struct downtime_entry
*
609 find_downtime_command(char *host
, char *service
)
612 struct downtime_entry
*shortcut
= NULL
;
614 if (last_dte
&& last_dte
->start
== ltime
) {
618 for (i
= 0; i
< NUM_DENTRIES
; i
++) {
619 struct downtime_entry
*dt
;
620 dt
= dt_matches_command(dentry
[i
], host
, service
);
622 if (shortcut
&& dt
!= shortcut
)
624 printf("FIND shortcut no good\n");
634 static int print_downtime(void *data
)
636 struct downtime_entry
*dt
= (struct downtime_entry
*)data
;
638 dt_print("UNCLOSED", ltime
, dt
);
643 static inline void set_next_dt_purge(time_t base
, time_t add
)
645 if (!next_dt_purge
|| next_dt_purge
> base
+ add
)
646 next_dt_purge
= base
+ add
;
648 if (next_dt_purge
<= ltime
)
649 next_dt_purge
= ltime
+ 1;
652 static inline void add_downtime(char *host
, char *service
, int id
)
654 struct downtime_entry
*dt
, *cmd
, *old
;
656 if (!is_interesting_service(host
, service
))
659 dt
= malloc(sizeof(*dt
));
660 cmd
= find_downtime_command(host
, service
);
662 warn("DT with no ext cmd? %lu %s;%s", ltime
, host
, service
);
663 memset(dt
, 0, sizeof(*dt
));
664 dt
->duration
= 7200; /* the default downtime duration in nagios */
666 dt
->stop
= dt
->start
+ dt
->duration
;
669 memcpy(dt
, cmd
, sizeof(*dt
));
671 dt
->host
= strdup(host
);
675 set_next_dt_purge(ltime
, dt
->duration
);
679 old
= hash_update(host_downtime
, dt
->host
, dt
);
682 dt
->service
= strdup(service
);
683 old
= hash_update2(service_downtime
, dt
->host
, dt
->service
, dt
);
686 if (old
&& old
!= dt
) {
693 dt_print("IN_DT", ltime
, dt
);
694 insert_downtime_event(NEBTYPE_DOWNTIME_START
, dt
->host
, dt
->service
, dt
->id
);
697 static time_t last_host_dt_del
, last_svc_dt_del
;
698 static int register_downtime_command(struct string_code
*sc
)
700 struct downtime_entry
*dt
;
701 char *start_time
, *end_time
, *duration
= NULL
;
702 char *host
= NULL
, *service
= NULL
, *fixed
, *triggered_by
= NULL
;
706 case DEL_HOST_DOWNTIME
:
707 last_host_dt_del
= ltime
;
709 case DEL_SVC_DOWNTIME
:
710 last_svc_dt_del
= ltime
;
713 case SCHEDULE_HOST_DOWNTIME
:
714 if (strtotimet(strv
[5], &foo
))
717 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
:
718 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
:
719 case SCHEDULE_HOST_SVC_DOWNTIME
:
722 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME
:
723 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME
:
724 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
:
725 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
:
726 start_time
= strv
[1];
729 if (strtotimet(strv
[5], &foo
))
730 triggered_by
= strv
[4];
736 case SCHEDULE_SVC_DOWNTIME
:
739 start_time
= strv
[2];
742 if (strtotimet(strv
[6], &foo
)) {
743 triggered_by
= strv
[5];
752 crash("Unknown downtime type: %d", sc
->code
);
755 if (!(dt
= calloc(sizeof(*dt
), 1)))
756 crash("calloc(%u, 1) failed: %s", (uint
)sizeof(*dt
), strerror(errno
));
760 dt
->host
= strdup(host
);
762 dt
->service
= strdup(service
);
764 dt
->trigger
= triggered_by
? !!(*triggered_by
- '0') : 0;
765 if (strtotimet(start_time
, &dt
->start
) || strtotimet(end_time
, &dt
->stop
))
767 print_strvec(strv
, sc
->nvecs
);
768 crash("strtotime(): type: %s; start_time='%s'; end_time='%s'; duration='%s';",
769 command_codes
[sc
->code
- 1].str
, start_time
, end_time
, duration
);
773 * sometimes downtime commands can be logged according to
774 * log version 1, while the log still claims to be version 2.
775 * Apparently, this happens when using a daemon supporting
776 * version 2 logging but a downtime command is added that
777 * follows the version 1 standard.
778 * As such, we simply ignore the result of the "duration"
779 * field conversion and just accept that it might not work
781 (void)strtotimet(duration
, &dt
->duration
);
782 dt
->fixed
= *fixed
- '0';
785 * ignore downtime scheduled to take place in the future.
786 * It will be picked up by the module anyways
788 if (dt
->start
> time(NULL
)) {
793 if (dt
->duration
> time(NULL
)) {
794 warn("Bizarrely large duration (%lu)", dt
->duration
);
796 if (dt
->start
< ltime
) {
797 if (dt
->duration
&& dt
->duration
> ltime
- dt
->start
)
798 dt
->duration
-= ltime
- dt
->start
;
802 if (dt
->stop
< ltime
|| dt
->stop
< dt
->start
) {
803 /* retroactively scheduled downtime, or just plain wrong */
804 dt
->stop
= dt
->start
;
808 if (dt
->fixed
&& dt
->duration
!= dt
->stop
- dt
->start
) {
809 // warn("duration doesn't match stop - start: (%lu : %lu)",
810 // dt->duration, dt->stop - dt->start);
812 dt
->duration
= dt
->stop
- dt
->start
;
814 else if (dt
->duration
> 86400 * 14) {
815 warn("Oddly long duration: %lu", dt
->duration
);
818 debug("start=%lu; stop=%lu; duration=%lu; fixed=%d; trigger=%d; host=%s service=%s\n",
819 dt
->start
, dt
->stop
, dt
->duration
, dt
->fixed
, dt
->trigger
, dt
->host
, dt
->service
);
821 stash_downtime_command(dt
);
825 static int insert_downtime(struct string_code
*sc
)
828 struct downtime_entry
*dt
= NULL
;
831 char *host
, *service
= NULL
;
834 if (sc
->nvecs
== 4) {
836 dt
= hash_find2(service_downtime
, host
, service
);
839 dt
= hash_find(host_downtime
, host
);
842 * to stop a downtime we can either get STOPPED or
843 * CANCELLED. So far, I've only ever seen STARTED
844 * for when it actually starts though, and since
845 * the Nagios daemon is responsible for launching
846 * it, it's unlikely there are more variants of
849 type
= NEBTYPE_DOWNTIME_STOP
;
850 if (!strcmp(strv
[sc
->nvecs
- 2], "STARTED"))
851 type
= NEBTYPE_DOWNTIME_START
;
854 case NEBTYPE_DOWNTIME_START
:
856 if (!probably_ignore_downtime
)
857 dt_print("ALRDY", ltime
, dt
);
861 if (probably_ignore_downtime
)
862 debug("Should probably ignore this downtime: %lu : %lu %s;%s\n",
863 probably_ignore_downtime
, ltime
, host
, service
);
865 if (ltime
- last_downtime_start
> 1)
869 add_downtime(host
, service
, id
);
870 last_downtime_start
= ltime
;
873 case NEBTYPE_DOWNTIME_STOP
:
876 * this can happen when overlapping downtime entries
877 * occur, and the start event for the second (or nth)
878 * downtime starts before the first downtime has had
879 * a stop event. It basically means we've almost
880 * certainly done something wrong.
882 //printf("no dt. ds.host_name == '%s'\n", ds.host_name);
883 //fprintf(stderr, "CRASHING: %s;%s\n", ds.host_name, ds.service_description);
884 //crash("DOWNTIME_STOP without matching DOWNTIME_START");
889 dt_del_cmd
= !dt
->service
? last_host_dt_del
: last_svc_dt_del
;
891 if ((ltime
- dt_del_cmd
) > 1 && dt
->duration
- (ltime
- dt
->started
) > 60) {
892 debug("Short dt duration (%lu) for %s;%s (dt->duration=%lu)\n",
893 ltime
- dt
->started
, dt
->host
, dt
->service
, dt
->duration
);
895 if (ltime
- dt
->started
> dt
->duration
+ DT_PURGE_GRACETIME
)
896 dt_print("Long", ltime
, dt
);
900 * Now delete whatever matching downtimes we can find.
901 * this must be here, or we'll recurse like crazy into
902 * remove_downtime(), possibly exhausting the stack
907 hash_walk_data(host_downtime
, del_matching_dt
);
909 hash_walk_data(service_downtime
, del_matching_dt
);
919 static int dt_purged
;
920 static int purge_expired_dt(void *data
)
922 struct downtime_entry
*dt
= data
;
929 set_next_dt_purge(dt
->started
, dt
->duration
);
931 if (ltime
+ DT_PURGE_GRACETIME
> dt
->stop
) {
933 debug("PURGE %lu: purging expired dt %d (start=%lu; started=%lu; stop=%lu; duration=%lu; host=%s; service=%s",
934 ltime
, dt
->id
, dt
->start
, dt
->started
, dt
->stop
, dt
->duration
, dt
->host
, dt
->service
);
936 return HASH_WALK_REMOVE
;
939 dt_print("PURGED_NOT_TIME", ltime
, dt
);
945 static int purged_downtimes
;
946 static void purge_expired_downtime(void)
952 hash_walk_data(host_downtime
, purge_expired_dt
);
954 debug("PURGE %d host downtimes purged", dt_purged
);
955 tot_purged
+= dt_purged
;
957 hash_walk_data(service_downtime
, purge_expired_dt
);
959 debug("PURGE %d service downtimes purged", dt_purged
);
960 tot_purged
+= dt_purged
;
962 debug("PURGE total %d entries purged", tot_purged
);
965 debug("PURGE next downtime purge supposed to run @ %lu, in %lu seconds",
966 next_dt_purge
, next_dt_purge
- ltime
);
968 purged_downtimes
+= tot_purged
;
971 static inline void handle_start_event(void)
973 if (!daemon_is_running
)
974 insert_process_event(NEBTYPE_PROCESS_START
);
976 probably_ignore_downtime
= daemon_start
= ltime
;
977 daemon_is_running
= 1;
980 static inline void handle_stop_event(void)
982 if (daemon_is_running
) {
983 insert_process_event(NEBTYPE_PROCESS_SHUTDOWN
);
984 daemon_is_running
= 0;
989 static int parse_line(char *line
, uint len
)
993 struct string_code
*sc
;
994 static time_t last_ltime
= 0;
996 imported
+= len
+ 1; /* make up for 1 lost byte per newline */
998 /* ignore empty lines */
1002 if (++lines_since_progress
>= PROGRESS_INTERVAL
)
1005 /* skip obviously bogus lines */
1006 if (len
< 12 || *line
!= '[') {
1007 warn("line %d; len too short, or line doesn't start with '[' (%s)", line_no
, line
);
1011 ltime
= strtoul(line
+ 1, &ptr
, 10);
1012 if (line
+ 1 == ptr
) {
1013 crash("Failed to parse log timestamp from '%s'. I can't handle malformed logdata", line
);
1017 if (ltime
< last_ltime
) {
1018 // warn("ltime < last_ltime (%lu < %lu) by %lu. Compensating...",
1019 // ltime, last_ltime, last_ltime - ltime);
1026 * Incremental will be 0 if not set, or 1 if set but
1027 * the database is currently empty.
1028 * Note that this will not always do the correct thing,
1029 * as downtime entries that might have been scheduled for
1030 * purging may never show up as "stopped" in the database
1031 * with this scheme. As such, incremental imports absolutely
1032 * require that nothing is in scheduled downtime when the
1033 * import is running (well, started really, but it amounts
1034 * to the same thing).
1036 if (ltime
< incremental
)
1039 if (next_dt_purge
&& ltime
>= next_dt_purge
)
1040 purge_expired_downtime();
1042 if (probably_ignore_downtime
&& ltime
- probably_ignore_downtime
> 1)
1043 probably_ignore_downtime
= 0;
1045 while (*ptr
== ']' || *ptr
== ' ')
1048 if (!is_interesting(ptr
))
1051 if (!(colon
= strchr(ptr
, ':'))) {
1052 /* stupid heuristic, but might be good for something,
1053 * somewhere, sometime. if nothing else, it should suppress
1054 * annoying output */
1055 if (is_start_event(ptr
)) {
1056 handle_start_event();
1059 if (is_stop_event(ptr
)) {
1060 handle_stop_event();
1065 * An unhandled event. We should probably crash here
1067 handle_unknown_event(line
);
1071 /* an event happened without us having gotten a start-event */
1072 if (!daemon_is_running
) {
1073 insert_process_event(NEBTYPE_PROCESS_START
);
1074 daemon_start
= ltime
;
1075 daemon_is_running
= 1;
1078 if (!(sc
= get_event_type(ptr
, colon
- ptr
))) {
1079 handle_unknown_event(line
);
1083 if (sc
->code
== IGNORE_LINE
)
1094 nvecs
= vectorize_string(ptr
, sc
->nvecs
);
1096 if (nvecs
!= sc
->nvecs
) {
1098 warn("Line %d in %s seems to not have all the fields it should",
1099 line_no
, cur_file
->path
);
1103 for (i
= 0; i
< sc
->nvecs
; i
++) {
1105 /* this should never happen */
1106 warn("Line %d in %s seems to be broken, or we failed to parse it into a vector",
1107 line_no
, cur_file
->path
);
1116 case NEBTYPE_EXTERNALCOMMAND_END
:
1117 semi_colon
= strchr(ptr
, ';');
1120 if (!(sc
= get_command_type(ptr
, semi_colon
- ptr
))) {
1123 if (sc
->code
== RESTART_PROGRAM
) {
1124 handle_stop_event();
1128 nvecs
= vectorize_string(semi_colon
+ 1, sc
->nvecs
);
1129 if (nvecs
!= sc
->nvecs
) {
1130 warn("nvecs discrepancy: %d vs %d (%s)\n", nvecs
, sc
->nvecs
, ptr
);
1132 if (sc
->code
!= ACKNOWLEDGE_HOST_PROBLEM
&&
1133 sc
->code
!= ACKNOWLEDGE_SVC_PROBLEM
)
1135 register_downtime_command(sc
);
1137 insert_acknowledgement(sc
);
1141 case NEBTYPE_HOSTCHECK_PROCESSED
:
1142 return insert_host_check(sc
);
1144 case NEBTYPE_SERVICECHECK_PROCESSED
:
1145 return insert_service_check(sc
);
1147 case NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_HOST
:
1148 case NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_SERVICE
:
1149 return insert_downtime(sc
);
1151 case NEBTYPE_NOTIFICATION_END
+ CONCERNS_HOST
:
1152 case NEBTYPE_NOTIFICATION_END
+ CONCERNS_SERVICE
:
1153 return insert_notification(sc
);
1162 static int parse_one_line(char *str
, uint len
)
1164 if (parse_line(str
, len
) && use_sql
&& sql_errno())
1165 crash("sql error: %s", sql_error());
1170 static int hash_one_line(char *line
, uint len
)
1172 return add_interesting_object(line
);
1175 static int hash_interesting(const char *path
)
1179 if (stat(path
, &st
) < 0)
1180 crash("failed to stat %s: %s", path
, strerror(errno
));
1182 lparse_path(path
, st
.st_size
, hash_one_line
);
1187 extern const char *__progname
;
1188 __attribute__((__format__(__printf__
, 1, 2)))
1189 static void usage(const char *fmt
, ...)
1195 vfprintf(stdout
, fmt
, ap
);
1199 printf("Usage %s [options] [logfiles]\n\n", __progname
);
1200 printf(" [logfiles] refers to all the nagios logfiles you want to import\n");
1201 printf(" If --nagios-cfg is given or can be inferred no logfiles need to be supplied\n");
1202 printf("\nOptions:\n");
1203 printf(" --help this cruft\n");
1204 printf(" --no-progress don't display progress output\n");
1205 printf(" --no-sql don't access the database\n");
1206 printf(" --db-name database name\n");
1207 printf(" --db-table database table name\n");
1208 printf(" --db-user database user\n");
1209 printf(" --db-pass database password\n");
1210 printf(" --incremental[=<when>] do an incremental import (since $when)\n");
1211 printf(" --truncate-db truncate database before importing\n");
1212 printf(" --only-notifications only import notifications\n");
1213 printf(" --nagios-cfg=</path/to/nagios.cfg> path to nagios.cfg\n");
1214 printf(" --list-files list files to import\n");
1223 int main(int argc
, char **argv
)
1225 int i
, truncate_db
= 0;
1226 const char *nagios_cfg
= NULL
;
1227 char *db_name
, *db_user
, *db_pass
, *db_table
;
1229 db_name
= db_user
= db_pass
= db_table
= NULL
;
1231 do_progress
= isatty(fileno(stdout
));
1233 strv
= calloc(sizeof(char *), MAX_NVECS
);
1234 dentry
= calloc(sizeof(*dentry
), NUM_DENTRIES
);
1235 if (!strv
|| !dentry
)
1236 crash("Failed to alloc initial structs");
1239 for (num_nfile
= 0,i
= 1; i
< argc
; i
++) {
1240 char *opt
, *arg
= argv
[i
];
1241 int arg_len
, eq_opt
= 0;
1243 if ((opt
= strchr(arg
, '='))) {
1247 else if (i
< argc
- 1) {
1251 if (!prefixcmp(arg
, "-h") || !prefixcmp(arg
, "--help")) {
1254 if (!prefixcmp(arg
, "--incremental")) {
1258 * nifty for debugging --incremental skipping log-files
1259 * The value will be overwritten unless --no-sql is also
1263 incremental
= strtoul(opt
, NULL
, 0);
1265 usage("--incremental= requires a parameter");
1269 if (!prefixcmp(arg
, "--no-sql")) {
1273 if (!prefixcmp(arg
, "--only-notifications")) {
1274 only_notifications
= 1;
1275 db_name
= db_name
? db_name
: "merlin";
1276 db_user
= db_user
? db_user
: "merlin";
1277 db_pass
= db_pass
? db_pass
: "merlin";
1278 db_table
= db_table
? db_table
: "notification";
1281 if (!prefixcmp(arg
, "--no-progress")) {
1285 if (!prefixcmp(arg
, "--debug") || !prefixcmp(arg
, "-d")) {
1290 if (!prefixcmp(arg
, "--truncate-db")) {
1294 if (!prefixcmp(arg
, "--list-files")) {
1299 if (!prefixcmp(arg
, "--nagios-cfg")) {
1300 if (!opt
|| !*opt
) {
1301 crash("%s requires the path to nagios.cfg as argument", arg
);
1308 if (!prefixcmp(arg
, "--db-name")) {
1310 crash("%s requires a database name as an argument", arg
);
1316 if (!prefixcmp(arg
, "--db-user")) {
1318 crash("%s requires a database username as argument", arg
);
1324 if (!prefixcmp(arg
, "--db-pass")) {
1326 crash("%s requires a database username as argument", arg
);
1332 if (!prefixcmp(arg
, "--db-table")) {
1334 crash("%s requires a database table name as argument", arg
);
1340 if (!prefixcmp(arg
, "--interesting") || !prefixcmp(arg
, "-i")) {
1342 crash("%s requires a filename as argument", arg
);
1343 hash_interesting(opt
);
1349 /* non-argument, so treat as a config- or log-file */
1350 arg_len
= strlen(arg
);
1351 if (arg_len
>= 10 && !strcmp(&arg
[arg_len
- 10], "nagios.cfg")) {
1354 add_naglog_path(arg
);
1358 /* fallback for op5 systems */
1359 if (!nagios_cfg
&& !num_nfile
) {
1360 nagios_cfg
= "/opt/monitor/etc/nagios.cfg";
1363 struct cfg_comp
*conf
;
1364 conf
= cfg_parse_file(nagios_cfg
);
1365 for (i
= 0; i
< conf
->vars
; i
++) {
1366 struct cfg_var
*v
= conf
->vlist
[i
];
1367 if (!strcmp(v
->key
, "log_file")) {
1368 add_naglog_path(v
->value
);
1370 if (!strcmp(v
->key
, "log_archive_path")) {
1371 add_naglog_path(v
->value
);
1377 db_name
= db_name
? db_name
: "monitor_reports";
1378 db_user
= db_user
? db_user
: "monitor";
1379 db_pass
= db_pass
? db_pass
: "monitor";
1380 db_table
= db_table
? db_table
: "report_data";
1381 sql_config("db_database", db_name
);
1382 sql_config("db_user", db_user
);
1383 sql_config("db_pass", db_pass
);
1384 sql_config("db_table", db_table
);
1387 crash("sql_init() failed");
1389 sql_query("TRUNCATE %s", sql_table_name());
1391 if (incremental
== 1) {
1394 sql_query("SELECT %s FROM %s.%s ORDER BY %s DESC LIMIT 1",
1395 only_notifications
? "end_time" : "timestamp",
1397 only_notifications
? "end_time" : "timestamp");
1399 if (!(result
= sql_get_result()))
1400 crash("Failed to get last timestamp: %s\n", sql_error());
1402 /* someone might use --incremental with an empty
1403 * database. We shouldn't crash in that case */
1404 if ((row
= sql_fetch_row(result
)))
1405 incremental
= strtoul(row
[0], NULL
, 0);
1407 sql_free_result(result
);
1411 log_grok_var("logfile", "/dev/null");
1412 log_grok_var("log_levels", "warn");
1415 crash("Usage: %s [--incremental] [--interesting <file>] [--truncate-db] logfiles\n",
1419 crash("log_init() failed");
1421 qsort(nfile
, num_nfile
, sizeof(*nfile
), nfile_cmp
);
1423 host_downtime
= hash_init(HASH_TABLE_SIZE
);
1424 service_downtime
= hash_init(HASH_TABLE_SIZE
);
1426 if (hook_init() < 0)
1427 crash("Failed to initialize hooks");
1429 /* go through them once to count the total size for progress output */
1430 for (i
= 0; i
< num_nfile
; i
++) {
1431 totsize
+= nfile
[i
].size
;
1435 gettimeofday(&import_start
, NULL
);
1436 printf("Importing %s of data from %d files\n",
1437 tobytes(totsize
), num_nfile
);
1440 for (i
= 0; i
< num_nfile
; i
++) {
1441 struct naglog_file
*nf
= &nfile
[i
];
1446 * skip parsing files if they're not interesting, such
1447 * as during incremental imports.
1448 * 'incremental' will be 0 if we're doing a full import,
1449 * 1 if we're doing an incremental but the database is
1450 * empty and will contain the timestamp of the latest
1451 * entry in the database if we're doing an incremental
1452 * import to a populated table.
1453 * Note that we can never skip the last file in the list,
1454 * although the lparse routine should sift through it
1455 * pretty quickly in case it has nothing interesting.
1457 if (i
+ 1 < num_nfile
&& incremental
> nfile
[i
+ 1].first
) {
1459 skipped
+= nf
->size
;
1463 printf("%s\n", nf
->path
);
1466 debug("importing from %s (%lu : %u)\n", nf
->path
, nf
->first
, nf
->cmp
);
1468 lparse_path(nf
->path
, nf
->size
, parse_one_line
);
1469 imported
++; /* make up for one lost byte per file */
1473 purge_expired_downtime();
1478 printf("Unclosed host downtimes:\n");
1479 puts("------------------------");
1480 hash_walk_data(host_downtime
, print_downtime
);
1481 printf("Unclosed service downtimes:\n");
1482 puts("---------------------------");
1483 hash_walk_data(service_downtime
, print_downtime
);
1485 printf("dt_depth: %d\n", dt_depth
);
1487 printf("purged downtimes: %d\n", purged_downtimes
);
1488 printf("max simultaneous host downtime hashes: %u\n",
1489 hash_entries_max(host_downtime
));
1490 printf("max simultaneous service downtime hashes: %u\n",
1491 hash_entries_max(service_downtime
));
1492 printf("max downtime depth: %u\n", max_dt_depth
);
1500 if (warnings
&& debug_level
)
1501 fprintf(stderr
, "Total warnings: %d\n", warnings
);
1503 if (debug_level
|| dt_start
> dt_stop
) {
1505 fprintf(stderr
, "Downtime data %s\n started: %d\n stopped: %d\n delta : %d\n skipped: %d\n",
1506 dt_depth
? "mismatch!" : "consistent", dt_start
, dt_stop
, dt_depth
, dt_skip
);
1507 hash_debug_table(host_downtime
, 0);
1508 hash_debug_table(service_downtime
, 0);
1509 if ((count
= hash_entries(host_downtime
))) {
1510 fprintf(stderr
, "host_downtime as %u entries remaining\n", count
);
1512 if ((count
= hash_entries(service_downtime
))) {
1513 fprintf(stderr
, "service_downtime has %u entries remaining\n", count
);
1517 print_unhandled_events();