5 #include <nagios/broker.h>
6 #include <nagios/nebcallbacks.h>
13 #include <stdint.h> /* standard fixed-size integer types. */
14 #include <inttypes.h> /* PRIxxx printf specifiers. */
/*
 * File-scope constants and importer state shared by the functions below.
 */
/* Offsets added to a NEBTYPE_* value in event_codes to tell host entries
 * from service entries. */
17 #define CONCERNS_HOST 50
18 #define CONCERNS_SERVICE 60
21 #define HASH_TABLE_SIZE 128
23 /* for some reason these aren't defined inside Nagios' headers */
25 #define SERVICE_WARNING 1
26 #define SERVICE_CRITICAL 2
27 #define SERVICE_UNKNOWN 3
29 #define PROGRESS_INTERVAL 25000 /* lines to parse between progress updates */
/* Program name: basename of argv[0], set in main(). */
32 static const char *progname
;
/* Name of the table receiving the imported rows.
 * NOTE(review): main() assigns the string literal "notification" to this
 * non-const pointer; consider const char *. */
33 static char *db_table
;
/* Set by --only-notifications; the check, downtime and process insert
 * paths test it before touching the database. */
34 static int only_notifications
;
/* Progress accounting. imported, skipped and totsize are passed to
 * human_bytes() and compared as percentages; totlines accumulates
 * lines_since_progress. */
35 static unsigned long long imported
, totsize
, totlines
, skipped
;
36 static int lines_since_progress
, do_progress
, list_files
;
/* Start of the import; used for elapsed-time and ETA output. */
37 static struct timeval import_start
;
/* ltime of the most recent daemon start/stop event seen, plus the
 * --incremental cut-off (lines with ltime below it are tested in parse_line). */
38 static time_t daemon_start
, daemon_stop
, incremental
;
39 static int daemon_is_running
;
/* max_dt_depth: highest dt_depth observed by insert_downtime_event(). */
40 static uint max_dt_depth
, skipped_files
;
41 static int repair_table
;
43 static time_t next_dt_purge
; /* when next to purge expired downtime */
44 #define DT_PURGE_GRACETIME 300 /* seconds to add to next_dt_purge */
46 static time_t ltime
; /* the timestamp from the current log-line */
/* Counters of downtime start/stop events.
 * NOTE(review): both are unsigned, so dt_depth wraps to a huge value if
 * more stops than starts are ever counted. */
48 static uint dt_start
, dt_stop
, dt_skip
;
49 #define dt_depth (dt_start - dt_stop)
/* Active downtimes, looked up by host (and service) name. */
50 static dkhash_table
*host_downtime
;
51 static dkhash_table
*service_downtime
;
52 static int downtime_id
;
/*
 * Bookkeeping record for one downtime. Only the 'next' link is visible in
 * this listing; other code in the file reads id, host, service, start,
 * stop, started, duration, fixed, trigger and slot from it.
 *
 * dentry is an array of NUM_DENTRIES chain heads (allocated in main());
 * stash_downtime_command() files an entry under dt->start % NUM_DENTRIES
 * and links it through 'next'.
 */
54 struct downtime_entry
{
68 struct downtime_entry
*next
;
71 #define NUM_DENTRIES 1024
72 static struct downtime_entry
**dentry
;
73 static time_t last_downtime_start
;
/*
 * Table of the log-line event names this importer recognises.
 *
 * add_ignored() entries name events that are recognised but not imported
 * (presumably mapped to IGNORE_LINE, which parse_line() tests for --
 * the macro is defined elsewhere; confirm).
 * add_code() entries pair an event name with a NEBTYPE_* code; host and
 * service variants of the same NEBTYPE are told apart by adding
 * CONCERNS_HOST or CONCERNS_SERVICE. The leading number appears to be the
 * field count later read back as sc->nvecs (e.g. the passive check
 * handlers compare sc->nvecs with 3 and 4) -- confirm against add_code().
 */
75 static struct string_code event_codes
[] = {
77 add_ignored("Warning"),
78 add_ignored("LOG ROTATION"),
79 add_ignored("HOST FLAPPING ALERT"),
80 add_ignored("SERVICE FLAPPING ALERT"),
81 add_ignored("SERVICE EVENT HANDLER"),
82 add_ignored("HOST EVENT HANDLER"),
83 add_ignored("LOG VERSION"),
84 add_ignored("livestatus"),
85 add_ignored("TIMEPERIOD TRANSITION"),
90 add_code(5, "HOST NOTIFICATION", NEBTYPE_NOTIFICATION_END
+ CONCERNS_HOST
),
91 add_code(6, "SERVICE NOTIFICATION", NEBTYPE_NOTIFICATION_END
+ CONCERNS_SERVICE
),
92 add_code(3, "PASSIVE HOST CHECK", NEBTYPE_HOSTCHECK_PROCESSED
),
93 add_code(4, "PASSIVE SERVICE CHECK", NEBTYPE_SERVICECHECK_PROCESSED
),
94 add_code(0, "EXTERNAL COMMAND", NEBTYPE_EXTERNALCOMMAND_END
),
95 add_code(5, "HOST ALERT", NEBTYPE_HOSTCHECK_PROCESSED
),
96 add_code(5, "INITIAL HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED
),
97 add_code(5, "CURRENT HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED
),
98 add_code(6, "SERVICE ALERT", NEBTYPE_SERVICECHECK_PROCESSED
),
99 add_code(6, "INITIAL SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED
),
100 add_code(6, "CURRENT SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED
),
101 add_code(3, "HOST DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_HOST
),
102 add_code(4, "SERVICE DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_SERVICE
),
/*
 * Table of the external commands this importer acts on: downtime
 * deletion, the downtime scheduling family, and problem acknowledgements.
 *
 * The leading number of each add_cdef() entry appears to be the number of
 * fields the command line is split into (parse_line() passes sc->nvecs to
 * vectorize_string()) -- confirm against add_cdef(), which is defined
 * elsewhere.
 */
106 static struct string_code command_codes
[] = {
107 add_cdef(1, DEL_HOST_DOWNTIME
),
108 add_cdef(1, DEL_SVC_DOWNTIME
),
109 add_cdef(8, SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
),
110 add_cdef(8, SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
),
111 add_cdef(8, SCHEDULE_HOSTGROUP_HOST_DOWNTIME
),
112 add_cdef(8, SCHEDULE_HOSTGROUP_SVC_DOWNTIME
),
113 add_cdef(8, SCHEDULE_HOST_DOWNTIME
),
114 add_cdef(8, SCHEDULE_HOST_SVC_DOWNTIME
),
115 add_cdef(8, SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
),
116 add_cdef(8, SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
),
117 add_cdef(8, SCHEDULE_SVC_DOWNTIME
),
120 * These really have one more field than listed here. We omit one
121 * to make author and comment concatenated with a semi-colon by default.
123 add_cdef(6, ACKNOWLEDGE_SVC_PROBLEM
),
124 add_cdef(5, ACKNOWLEDGE_HOST_PROBLEM
),
/*
 * Reacts to the outcome of a database operation on 'table'.
 *
 * errors: result of the preceding insert.
 * table:  name of the table the insert targeted.
 *
 * The first test checks for "no errors" or "sql_table_crashed not set".
 * For a crashed table the visible code has two reactions: announce and
 * run sql_repair_table(), or call crash() with a hint to run mysqlrepair
 * by hand.
 * NOTE(review): the condition choosing between repair and crash() is not
 * visible in this listing -- presumably the repair_table option; confirm.
 */
129 static void handle_sql_result(int errors
, const char *table
)
131 if (!errors
|| !sql_table_crashed
)
135 printf("Repairing table '%s'. This may take a very long time. Please be patient\n", table
);
136 sql_repair_table(table
);
139 crash("Database table '%s' appears to have crashed. Please run\n mysqlrepair %s.%s",
140 table
, sql_db_name(), table
);
/*
 * Stores one host check result in the database.
 *
 * ds: host check data; host_name, state, state_type, output,
 *     timestamp.tv_sec, type and current_attempt are read.
 *
 * host_has_new_state() is consulted first and an unchanged state is
 * logged through linfo(). host_name and output go through sql_quote()
 * and are formatted with a bare %s (presumably sql_quote() supplies the
 * quotes -- confirm). The "hard" column receives
 * (ds->state_type == HARD_STATE), i.e. 0 or 1.
 *
 * NOTE(review): tv_sec is formatted with %lu; verify this matches its
 * type on every supported platform.
 * NOTE(review): no release of the two sql_quote() results is visible in
 * this listing; confirm they are freed.
 */
144 static int insert_host_result(nebstruct_host_check_data
*ds
)
147 char *host_name
= NULL
, *output
= NULL
;
149 if (!host_has_new_state(ds
->host_name
, ds
->state
, ds
->state_type
)) {
150 linfo("state not changed for host '%s'", ds
->host_name
);
154 sql_quote(ds
->host_name
, &host_name
);
155 sql_quote(ds
->output
, &output
);
158 "timestamp, event_type, host_name, state, "
159 "hard, retry, output"
160 ") VALUES(%lu, %d, %s, %d, %d, %d, %s)",
162 ds
->timestamp
.tv_sec
, ds
->type
, host_name
, ds
->state
,
163 ds
->state_type
== HARD_STATE
, ds
->current_attempt
,
/*
 * Stores one service check result in the database.
 *
 * ds: service check data; host_name, service_description, state,
 *     state_type, output, timestamp.tv_sec, type and current_attempt
 *     are read.
 *
 * service_has_new_state() is consulted first and an unchanged state is
 * logged through linfo(). The three string fields go through sql_quote()
 * before being formatted into the VALUES list; the quoted
 * service_description is released with free().
 *
 * NOTE(review): state, hard and retry are wrapped in single quotes here
 * ('%d') while insert_host_result() writes them bare (%d); confirm the
 * difference is intended.
 * NOTE(review): tv_sec is formatted with %lu; verify this matches its
 * type on every supported platform.
 */
172 static int insert_service_result(nebstruct_service_check_data
*ds
)
175 char *host_name
, *service_description
, *output
;
177 if (!service_has_new_state(ds
->host_name
, ds
->service_description
, ds
->state
, ds
->state_type
)) {
178 linfo("state not changed for service '%s' on host '%s'",
179 ds
->service_description
, ds
->host_name
);
183 sql_quote(ds
->host_name
, &host_name
);
184 sql_quote(ds
->service_description
, &service_description
);
185 sql_quote(ds
->output
, &output
);
188 "timestamp, event_type, host_name, service_description, state, "
189 "hard, retry, output) "
190 "VALUES(%lu, %d, %s, %s, '%d', '%d', '%d', %s)",
192 ds
->timestamp
.tv_sec
, ds
->type
, host_name
,
193 service_description
, ds
->state
,
194 ds
->state_type
== HARD_STATE
, ds
->current_attempt
,
197 free(service_description
);
/*
 * Writes one downtime event to the database.
 *
 * ds: downtime data; type, start_time, host_name, service_description
 *     and timestamp.tv_sec are read, and type may be rewritten.
 *
 * The event type is dispatched over NEBTYPE_DOWNTIME_START, _STOP and
 * _DELETE. For _DELETE, ds->start_time is compared with time(NULL) and
 * the event is turned into NEBTYPE_DOWNTIME_STOP (per the existing
 * comment, only when the downtime had already started).
 * 'depth' starts at 0 and is the value written to downtime_depth.
 *
 * Two INSERT shapes exist: one including service_description (used when
 * ds->service_description is non-NULL) and one without it.
 * NOTE(review): in the host-only statement the column list and "VALUES"
 * are concatenated without a separating space (")VALUES(").
 * NOTE(review): only service_description is visibly freed here; confirm
 * host_name is released as well.
 */
202 static int sql_insert_downtime(nebstruct_downtime_data
*ds
)
204 int depth
= 0, result
;
205 char *host_name
, *service_description
;
208 case NEBTYPE_DOWNTIME_START
:
210 * If downtime is starting, it will always be at least
211 * 1 deep. Since the report UI doesn't care about the
212 * actual depth but only whether downtime is in effect
213 * or not we can get away with cheating here.
216 case NEBTYPE_DOWNTIME_STOP
:
218 case NEBTYPE_DOWNTIME_DELETE
:
220 * if we're deleting a downtime that hasn't started yet, nothing
221 * should be added to the database. Otherwise, transform it to a
222 * NEBTYPE_DOWNTIME_STOP event to mark the downtime as stopped.
224 if (ds
->start_time
> time(NULL
))
226 ds
->type
= NEBTYPE_DOWNTIME_STOP
;
232 sql_quote(ds
->host_name
, &host_name
);
233 if (ds
->service_description
) {
234 sql_quote(ds
->service_description
, &service_description
);
238 "timestamp, event_type, host_name,"
239 "service_description, downtime_depth) "
240 "VALUES(%lu, %d, %s, %s, %d)",
242 ds
->timestamp
.tv_sec
, ds
->type
, host_name
,
243 service_description
, depth
);
244 free(service_description
);
248 "timestamp, event_type, host_name, downtime_depth)"
249 "VALUES(%lu, %d, %s, %d)",
251 ds
->timestamp
.tv_sec
, ds
->type
, host_name
, depth
);
/*
 * Writes a daemon process event (timestamp and event_type only) into
 * db_table.
 *
 * ds: process data; type and timestamp.tv_sec are read.
 *
 * NEBTYPE_PROCESS_START and NEBTYPE_PROCESS_SHUTDOWN are handled as they
 * are; NEBTYPE_PROCESS_RESTART is rewritten to NEBTYPE_PROCESS_SHUTDOWN
 * before the insert. What happens for other types is not visible here.
 */
257 static int insert_process_data(nebstruct_process_data
*ds
)
260 case NEBTYPE_PROCESS_START
:
261 case NEBTYPE_PROCESS_SHUTDOWN
:
263 case NEBTYPE_PROCESS_RESTART
:
264 ds
->type
= NEBTYPE_PROCESS_SHUTDOWN
;
271 ("INSERT INTO %s(timestamp, event_type) "
273 db_table
, ds
->timestamp
.tv_sec
, ds
->type
);
/*
 * Debug helper: prints the first n entries of the string vector v to
 * stdout, one per line, as "v[<index>]: <string>".
 *
 * v: array of at least n strings.
 * n: number of entries to print.
 *
 * NOTE(review): entries are passed straight to %s, so a NULL entry is
 * undefined behaviour; callers must only pass populated slots.
 */
276 static inline void print_strvec(char **v
, int n
)
280 for (i
= 0; i
< n
; i
++)
281 printf("v[%2d]: %s\n", i
, v
[i
]);
/*
 * Prints a one-line progress report for the running import.
 *
 * Folds lines_since_progress into totlines and resets it, then derives:
 *   real_pct_done - imported bytes as a percentage of the bytes that
 *                   are not skipped (totsize - skipped);
 *   pct_done      - imported plus skipped bytes as a percentage of
 *                   totsize (this is the figure shown);
 *   eta           - elapsed seconds scaled by the remaining share of
 *                   real_pct_done, printed as minutes and seconds.
 *
 * NOTE(review): no guard is visible against totsize == skipped,
 * totsize == 0 or imported == 0; each makes a divisor zero, and
 * converting the resulting inf/NaN to the integer eta is undefined.
 */
285 static void show_progress(void)
288 float pct_done
, real_pct_done
;
290 totlines
+= lines_since_progress
;
291 lines_since_progress
= 0;
296 elapsed
= time(NULL
) - import_start
.tv_sec
;
300 real_pct_done
= (float)imported
/ (float)(totsize
- skipped
) * 100;
301 pct_done
= ((float)(imported
+ skipped
) / (float)totsize
) * 100;
302 eta
= (elapsed
/ real_pct_done
) * (100.0 - real_pct_done
);
304 printf("Importing data: %.2f%% (%s) done ",
305 pct_done
, human_bytes(imported
+ skipped
));
309 printf("%lum%lus", eta
/ 60, eta
% 60);
/*
 * Prints the final import summary.
 *
 * Takes the current time with gettimeofday(), forces imported to
 * totsize - skipped so the last progress figure lands on 100%, then
 * prints the total size, the number of lines and the elapsed time
 * (tv_delta() between import_start and now). A second message reports
 * the amount and number of skipped files; the condition guarding it is
 * not visible in this listing.
 */
317 static void end_progress(void)
324 gettimeofday(&tv
, NULL
);
327 * If any of the logfiles doesn't have a newline
328 * at end of file, imported will be slightly off.
329 * We set it hard here so as to make sure that
330 * the final progress output stops at exactly 100%
332 imported
= totsize
- skipped
;
336 printf("%s, %llu lines imported in %s.",
337 human_bytes(totsize
), totlines
, tv_delta(&import_start
, &tv
));
339 printf(" %s in %u files skipped.", human_bytes(skipped
), skipped_files
);
/*
 * Disables the indexes on db_table and takes a write lock before the
 * bulk insert starts.
 *
 * indexes_disabled records that this has been done: it is tested on
 * entry and set to 1 at the end, and enable_indexes() tests it too.
 * When skipped + imported already amounts to at least 95% of totsize the
 * function tests for that and (per the existing comment) leaves the
 * indexes alone.
 *
 * Either query failing ends the program through crash().
 * NOTE(review): the existing comment speaks of locking and then
 * disabling, but the visible order is ALTER TABLE ... DISABLE KEYS
 * first, LOCK TABLES second.
 * NOTE(review): totsize == 0 makes the percentage a division by zero.
 */
343 static int indexes_disabled
;
344 static void disable_indexes(void)
346 if (indexes_disabled
)
350 * if we're more than 95% done before inserting anything,
351 * such as might be the case when running an incremental
352 * import, we might as well not bother with disabling
353 * the indexes, since enabling them again can take quite
356 if (((float)(skipped
+ imported
) / (float)totsize
) * 100 >= 95.0)
360 * We lock the table we'll be working with and disable
361 * indexes on it. Otherwise doing the actual inserts
362 * will take just about forever, as MySQL has to update
363 * and flush the index cache between each operation.
365 if (sql_query("ALTER TABLE %s DISABLE KEYS", db_table
))
366 crash("Failed to disable keys: %s", sql_error_msg());
367 if (sql_query("LOCK TABLES %s WRITE, report_data_extras WRITE", db_table
))
368 crash("Failed to lock table %s: %s", db_table
, sql_error_msg());
370 indexes_disabled
= 1;
/*
 * Copies every row of report_data_extras into db_table, column for
 * column.
 *
 * NOTE(review): the result of sql_query() is not examined, so a failed
 * copy goes unnoticed.
 */
373 static void insert_extras(void)
375 sql_query("INSERT INTO %s (`timestamp`, `event_type`, `flags`, `attrib`, `host_name`, `service_description`, `state`, `hard`, `retry`, `downtime_depth`, `output`) SELECT `timestamp`, `event_type`, `flags`, `attrib`, `host_name`, `service_description`, `state`, `hard`, `retry`, `downtime_depth`, `output` FROM report_data_extras;", db_table
);
/*
 * Counterpart of disable_indexes(): releases the table lock and
 * re-enables the indexes on db_table.
 *
 * Tests indexes_disabled first. The row count is then read with
 * "SELECT count(1)" through the db_wrap result API (step() followed by
 * get_int64_ndx()) and is used only for the time estimate
 * (entries / 50000 + 1 seconds) and the closing message. SIGINT is set
 * to SIG_IGN before "UNLOCK TABLES" and "ALTER TABLE ... ENABLE KEYS"
 * are issued.
 *
 * NOTE(review): 'entries' is printed with PRIi64 in the first message
 * but with %lu in the second; the two cannot both match its type.
 * NOTE(review): time(NULL) - start is also printed with %lu; check it
 * against time_t.
 * NOTE(review): none of the sql_query() results are examined here.
 */
378 static void enable_indexes(void)
380 db_wrap_result
*result
= NULL
;
384 /* if we haven't disabled the indexes we can quit early */
385 if (!indexes_disabled
)
388 sql_query("SELECT count(1) FROM %s", db_table
);
389 if (!(result
= sql_get_result()))
392 if (0 == result
->api
->step(result
)) {
393 result
->api
->get_int64_ndx(result
, 0, &entries
);
400 signal(SIGINT
, SIG_IGN
);
401 sql_query("UNLOCK TABLES");
403 printf("Creating sql table indexes. This will likely take ~%"PRIi64
" seconds\n",
404 (entries
/ 50000) + 1);
405 sql_query("ALTER TABLE %s ENABLE KEYS", db_table
);
406 printf("%lu database entries indexed in %lu seconds\n",
407 entries
, time(NULL
) - start
);
/*
 * Accounts for one downtime start/stop event and, when a database is in
 * use, records it.
 *
 * type:    NEBTYPE_DOWNTIME_START or NEBTYPE_DOWNTIME_STOP.
 * host:    host name.
 * service: service description; passed on as given (callers pass the
 *          entry's service pointer, which may be NULL for host downtime).
 * id:      downtime id, used in the failure message.
 *
 * Objects rejected by is_interesting_service() are tested for first.
 * dt_start / dt_stop are bumped according to 'type' and max_dt_depth
 * tracks the largest dt_depth seen. The database part is tested against
 * !use_database || only_notifications; otherwise a zeroed
 * nebstruct_downtime_data is filled in (timestamp from ltime) and handed
 * to sql_insert_downtime(). lp_crash() reports a failed insert; the test
 * around it is not visible in this listing.
 */
410 static int insert_downtime_event(int type
, char *host
, char *service
, int id
)
412 nebstruct_downtime_data ds
;
415 if (!is_interesting_service(host
, service
))
418 dt_start
+= type
== NEBTYPE_DOWNTIME_START
;
419 dt_stop
+= type
== NEBTYPE_DOWNTIME_STOP
;
420 if (dt_depth
> max_dt_depth
)
421 max_dt_depth
= dt_depth
;
423 if (!use_database
|| only_notifications
)
426 memset(&ds
, 0, sizeof(ds
));
429 ds
.timestamp
.tv_sec
= ltime
;
431 ds
.service_description
= service
;
435 result
= sql_insert_downtime(&ds
);
437 lp_crash("Failed to insert downtime:\n type=%d, host=%s, service=%s, id=%d",
438 type
, host
, service
, id
);
/*
 * Result of parsing the state field of a notification log line; filled
 * in by parse_import_notification().
 *   type   - SERVICE_NOTIFICATION or HOST_NOTIFICATION
 *   reason - value returned by parse_notification_reason()
 *   state  - value returned by parse_service_state_gently() or
 *            parse_host_state_gently()
 */
443 typedef struct import_notification
{
444 int type
, reason
, state
;
445 } import_notification
;
/*
 * Splits the state field of a notification line into reason, type and
 * state.
 *
 * str: the state field text.
 * n:   receives the parsed values.
 *
 * n->reason comes from parse_notification_reason(str). For any reason
 * other than NOTIFICATION_NORMAL the state text is taken from two
 * characters past the first space, with a ')' searched for after that
 * space (presumably the field then looks like "REASON (STATE)" --
 * confirm). The state text is parsed as a service state first
 * (type SERVICE_NOTIFICATION) and as a host state otherwise
 * (type HOST_NOTIFICATION); the test choosing between them is not
 * visible in this listing. insert_notification() treats a negative
 * return as a parse failure.
 *
 * NOTE(review): no NULL check on the strchr(str, ' ') result is visible
 * before it is passed to the second strchr(); confirm one exists.
 */
447 static int parse_import_notification(char *str
, import_notification
*n
)
449 char *state_str
= str
;
451 n
->reason
= parse_notification_reason(str
);
452 if (n
->reason
!= NOTIFICATION_NORMAL
) {
455 space
= strchr(str
, ' ');
458 paren
= strchr(space
, ')');
463 state_str
= space
+ 2;
466 n
->type
= SERVICE_NOTIFICATION
;
467 n
->state
= parse_service_state_gently(state_str
);
469 n
->type
= HOST_NOTIFICATION
;
470 n
->state
= parse_host_state_gently(state_str
);
/*
 * Records one host or service notification read from the log.
 *
 * sc: the matched event_codes entry; sc->code tells host from service
 *     (code - NEBTYPE_NOTIFICATION_END == CONCERNS_SERVICE).
 *
 * The function starts by testing only_notifications. Fields are taken
 * from the global strv vector: [0] contact, [1] host, [2] service (for
 * service notifications; service_description is NULL otherwise), and,
 * relative to base_idx, [+2] state, [+3] command and [+4] output. The
 * assignment of base_idx is not visible in this listing.
 * A state field that parse_import_notification() rejects is reported via
 * handle_unknown_event(). All strings go through sql_quote(); ltime is
 * written as both start_time and end_time.
 *
 * NOTE(review): only service_description is visibly released
 * (safe_free); confirm the other quoted strings are freed too.
 */
476 static int insert_notification(struct string_code
*sc
)
478 int base_idx
, result
;
479 char *contact_name
, *host_name
, *service_description
;
480 char *command_name
, *output
;
481 struct import_notification n
;
483 if (!only_notifications
)
486 if (sc
->code
- NEBTYPE_NOTIFICATION_END
== CONCERNS_SERVICE
) {
491 if (parse_import_notification(strv
[base_idx
+ 2], &n
) < 0) {
492 handle_unknown_event(strv
[base_idx
+ 2]);
500 sql_quote(strv
[0], &contact_name
);
501 sql_quote(strv
[1], &host_name
);
503 sql_quote(strv
[2], &service_description
);
505 service_description
= NULL
;
507 sql_quote(strv
[base_idx
+ 3], &command_name
);
508 sql_quote(strv
[base_idx
+ 4], &output
);
511 "notification_type, start_time, end_time, contact_name, "
512 "host_name, service_description, "
513 "command_name, output, "
514 "state, reason_type) "
521 n
.type
, ltime
, ltime
, contact_name
,
522 host_name
, safe_str(service_description
),
523 command_name
, output
,
527 safe_free(service_description
);
/*
 * Builds a service check record from the current log line (global strv)
 * and passes it to insert_service_result().
 *
 * sc: the matched event_codes entry; sc->nvecs == 4 marks a passive
 *     service check result.
 *
 * strv[0] is the host name and strv[1] the service description; objects
 * rejected by is_interesting_service() are tested for first.
 * Passive results: strv[2] is the state, taken numerically with atoi()
 * when it starts with a digit and through parse_service_state()
 * otherwise; such results are recorded as HARD_STATE, attempt 1.
 * Other lines: state from strv[2], soft/hard from strv[3], attempt
 * number from strv[4].
 * The database call is preceded by a test on
 * !use_database || only_notifications.
 *
 * NOTE(review): atoi() gives no error indication for malformed numbers.
 */
533 static int insert_service_check(struct string_code
*sc
)
535 nebstruct_service_check_data ds
;
537 if (!is_interesting_service(strv
[0], strv
[1]))
540 memset(&ds
, 0, sizeof(ds
));
542 ds
.timestamp
.tv_sec
= ltime
;
544 ds
.host_name
= strv
[0];
545 ds
.service_description
= strv
[1];
546 if (sc
->nvecs
== 4) {
547 /* passive service check result */
548 if (*strv
[2] >= '0' && *strv
[2] <= '9')
549 ds
.state
= atoi(strv
[2]);
551 ds
.state
= parse_service_state(strv
[2]);
552 ds
.state_type
= HARD_STATE
;
553 ds
.current_attempt
= 1;
556 ds
.state
= parse_service_state(strv
[2]);
557 ds
.state_type
= soft_hard(strv
[3]);
558 ds
.current_attempt
= atoi(strv
[4]);
562 if (!use_database
|| only_notifications
)
566 return insert_service_result(&ds
);
/*
 * Builds a host check record from the current log line (global strv)
 * and passes it to insert_host_result().
 *
 * sc: the matched event_codes entry; sc->nvecs == 3 marks a passive
 *     host check result.
 *
 * strv[0] is the host name; hosts rejected by is_interesting_host() are
 * tested for first.
 * Passive results: strv[1] is the state, taken numerically with atoi()
 * when it starts with a digit and through parse_host_state() otherwise;
 * such results are recorded as HARD_STATE, attempt 1.
 * Other lines: state from strv[1], soft/hard from strv[2], attempt
 * number from strv[3].
 * The database call is preceded by a test on
 * !use_database || only_notifications.
 *
 * NOTE(review): atoi() gives no error indication for malformed numbers.
 */
569 static int insert_host_check(struct string_code
*sc
)
571 nebstruct_host_check_data ds
;
573 if (!is_interesting_host(strv
[0]))
576 memset(&ds
, 0, sizeof(ds
));
578 ds
.timestamp
.tv_sec
= ltime
;
580 ds
.host_name
= strv
[0];
581 if (sc
->nvecs
== 3) {
582 if (*strv
[1] >= '0' && *strv
[1] <= '9')
583 ds
.state
= atoi(strv
[1]);
585 ds
.state
= parse_host_state(strv
[1]);
586 /* passive host check result */
588 ds
.current_attempt
= 1;
589 ds
.state_type
= HARD_STATE
;
591 ds
.state
= parse_host_state(strv
[1]);
592 ds
.state_type
= soft_hard(strv
[2]);
593 ds
.current_attempt
= atoi(strv
[3]);
597 if (!use_database
|| only_notifications
)
601 return insert_host_result(&ds
);
/*
 * Records a daemon process event stamped with the current log time.
 *
 * type: NEBTYPE_PROCESS_* value to record.
 *
 * Begins with a test on !use_database || only_notifications. Otherwise a
 * zeroed nebstruct_process_data gets ltime as its timestamp and is
 * passed to insert_process_data(), whose result is returned.
 * NOTE(review): the line storing 'type' into the struct is not visible
 * in this listing; confirm it is set before the call.
 */
604 static int insert_process_event(int type
)
606 nebstruct_process_data ds
;
608 if (!use_database
|| only_notifications
)
611 memset(&ds
, 0, sizeof(ds
));
612 ds
.timestamp
.tv_sec
= ltime
;
615 return insert_process_data(&ds
);
/*
 * Acknowledgement import. Two variants are visible: a real function
 * taking the matched command entry, and a macro of the same name that
 * expands to an empty statement.
 * NOTE(review): the preprocessor condition selecting between the two and
 * the function body are not visible in this listing.
 */
619 static int insert_acknowledgement(struct string_code
*sc
)
624 # define insert_acknowledgement(foo) /* nothing */ ;
/*
 * Debug helper: prints one downtime entry to stdout.
 *
 * tpc:  short tag printed first (callers pass e.g. "IN_DT", "RM_DT").
 * when: timestamp printed as "time=".
 * dt:   entry to print; started, start, stop, duration, id and host are
 *       printed, and the service follows the host after a ';'.
 *
 * NOTE(review): the conditions around the printf calls are not visible
 * in this listing (presumably a debug switch, and a NULL test before the
 * service is printed); confirm.
 * NOTE(review): the time fields are printed with %lu; check against
 * time_t.
 */
627 static void dt_print(char *tpc
, time_t when
, struct downtime_entry
*dt
)
632 printf("%s: time=%lu started=%lu start=%lu stop=%lu duration=%lu id=%d ",
633 tpc
, when
, dt
->started
, dt
->start
, dt
->stop
, dt
->duration
, dt
->id
);
634 printf("%s", dt
->host
);
636 printf(";%s", dt
->service
);
/*
 * last_dte: compared against ltime by find_downtime_command() as a
 *           shortcut candidate.
 * del_dte:  the entry whose id del_matching_dt() looks for.
 *
 * del_matching_dt() is a dkhash_walk_data() callback (insert_downtime()
 * runs it over both downtime tables). 'data' is a downtime entry; when
 * its id equals del_dte->id the entry is printed with the tag "ALSO" and
 * DKHASH_WALK_REMOVE is returned. What else happens to a matching entry,
 * and the return value for non-matching ones, is not visible here.
 *
 * NOTE(review): del_dte is dereferenced without a visible NULL check, so
 * it must be set before the walk starts.
 */
640 static struct downtime_entry
*last_dte
;
641 static struct downtime_entry
*del_dte
;
643 static void remove_downtime(struct downtime_entry
*dt
);
644 static int del_matching_dt(void *data
)
646 struct downtime_entry
*dt
= data
;
648 if (del_dte
->id
== dt
->id
) {
649 dt_print("ALSO", 0, dt
);
651 return DKHASH_WALK_REMOVE
;
/*
 * Files a parsed downtime command in the dentry table.
 *
 * dt: entry to store; dt->slot and dt->next are overwritten.
 *
 * The slot is dt->start modulo NUM_DENTRIES, and the entry is pushed
 * onto the front of that slot's chain.
 */
657 static void stash_downtime_command(struct downtime_entry
*dt
)
659 dt
->slot
= dt
->start
% NUM_DENTRIES
;
660 dt
->next
= dentry
[dt
->slot
];
661 dentry
[dt
->slot
] = dt
;
/*
 * Marks a downtime as ended.
 *
 * dt: the downtime entry that is ending.
 *
 * After the is_interesting_service() test, a NEBTYPE_DOWNTIME_STOP event
 * is recorded through insert_downtime_event() and the entry is printed
 * with the tag "RM_DT". Whether the entry's memory is released here is
 * not visible in this listing.
 */
664 static void remove_downtime(struct downtime_entry
*dt
)
666 if (!is_interesting_service(dt
->host
, dt
->service
))
669 insert_downtime_event(NEBTYPE_DOWNTIME_STOP
, dt
->host
, dt
->service
, dt
->id
);
671 dt_print("RM_DT", ltime
, dt
);
/*
 * Walks a chain of stashed downtime commands looking for the one that
 * explains a downtime starting now (at ltime) for host/service.
 *
 * dt:      head of a dentry chain; followed through dt->next.
 * host:    host the downtime started for.
 * service: service the downtime started for; NULL is tolerated by the
 *          SCHEDULE_SVC_DOWNTIME comparison.
 *
 * An entry is first tested against its own window (ltime later than
 * dt->stop or earlier than dt->start). A switch over the command kind
 * (presumably dt->code, going by the lp_crash() message) then compares
 * the service name for SCHEDULE_SVC_DOWNTIME and the host name for the
 * host-level commands; the propagate and group commands have no name
 * comparison visible.
 * If the daemon was down when the downtime should have begun
 * (daemon_stop < dt->start < daemon_start), dt->start is moved to
 * daemon_start, and dt->stop is recomputed from the duration for
 * triggered entries that have one.
 * The final test is on the difference ltime - dt->start being under
 * 3 seconds, or the entry being triggered, or not fixed.
 * The return statements are not visible in this listing.
 */
675 static struct downtime_entry
*
676 dt_matches_command(struct downtime_entry
*dt
, char *host
, char *service
)
678 for (; dt
; dt
= dt
->next
) {
681 if (ltime
> dt
->stop
|| ltime
< dt
->start
) {
686 case SCHEDULE_SVC_DOWNTIME
:
687 if (service
&& strcmp(service
, dt
->service
))
691 case SCHEDULE_HOST_DOWNTIME
:
692 case SCHEDULE_HOST_SVC_DOWNTIME
:
693 if (strcmp(host
, dt
->host
)) {
697 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
:
698 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
:
699 /* these two have host set in dt, but
700 * it will not match all the possible hosts */
703 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME
:
704 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME
:
705 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
:
706 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
:
709 lp_crash("dt->code not set properly\n");
713 * Once we get here all the various other criteria have
714 * been matched, so we need to check if the daemon was
715 * running when this downtime was supposed to have
716 * started, and otherwise use the daemon start time
717 * as the value to diff against
719 if (daemon_stop
< dt
->start
&& daemon_start
> dt
->start
) {
720 debug("Adjusting dt->start (%lu) to (%lu)\n",
721 dt
->start
, daemon_start
);
722 dt
->start
= daemon_start
;
723 if (dt
->trigger
&& dt
->duration
)
724 dt
->stop
= dt
->start
+ dt
->duration
;
727 diff
= ltime
- dt
->start
;
728 if (diff
< 3 || dt
->trigger
|| !dt
->fixed
)
/*
 * Finds the stashed downtime command matching a downtime that starts now
 * for host/service.
 *
 * host, service: passed on to dt_matches_command().
 *
 * last_dte is considered as a shortcut when its start equals ltime.
 * Every one of the NUM_DENTRIES slots is then offered to
 * dt_matches_command(); a result that differs from a non-NULL shortcut
 * is reported with "FIND shortcut no good". How 'shortcut' is assigned
 * and what is returned is not visible in this listing.
 */
735 static struct downtime_entry
*
736 find_downtime_command(char *host
, char *service
)
739 struct downtime_entry
*shortcut
= NULL
;
741 if (last_dte
&& last_dte
->start
== ltime
) {
745 for (i
= 0; i
< NUM_DENTRIES
; i
++) {
746 struct downtime_entry
*dt
;
747 dt
= dt_matches_command(dentry
[i
], host
, service
);
749 if (shortcut
&& dt
!= shortcut
)
751 printf("FIND shortcut no good\n");
/*
 * Callback taking a downtime entry as void *: prints it through
 * dt_print() with the tag "UNCLOSED" and the current log time.
 * The signature matches the dkhash_walk_data() callbacks used elsewhere
 * in this file; the return value is not visible in this listing.
 */
761 static int print_downtime(void *data
)
763 struct downtime_entry
*dt
= (struct downtime_entry
*)data
;
765 dt_print("UNCLOSED", ltime
, dt
);
/*
 * Moves next_dt_purge earlier when base + add lies before it.
 *
 * base: reference timestamp.
 * add:  seconds after 'base' at which a purge becomes due.
 *
 * next_dt_purge is set to base + add when it is unset (0) or later than
 * that value. A result that is not after the current log time is then
 * bumped to ltime + 1, so the purge time never lies in the past.
 */
770 static inline void set_next_dt_purge(time_t base
, time_t add
)
772 if (!next_dt_purge
|| next_dt_purge
> base
+ add
)
773 next_dt_purge
= base
+ add
;
775 if (next_dt_purge
<= ltime
)
776 next_dt_purge
= ltime
+ 1;
/*
 * Registers a downtime that has just started for host/service.
 *
 * host:    host name (copied with strdup()).
 * service: service description; copied with strdup() on the
 *          service_downtime path, host_downtime is used otherwise.
 * id:      downtime id from the caller.
 *
 * A new entry is allocated and find_downtime_command() is asked for the
 * command that scheduled it. Without one a warning is logged and the
 * entry is zeroed and given the default duration of 7200 seconds;
 * with one the command entry is copied wholesale with memcpy().
 * set_next_dt_purge() is told about the duration, any previous entry for
 * the same key is looked up and removed from the table, the new entry is
 * inserted, printed ("IN_DT") and recorded as a
 * NEBTYPE_DOWNTIME_START event.
 *
 * NOTE(review): the malloc() result is used by memset()/memcpy() with no
 * NULL check visible in this listing; the strdup() results are likewise
 * unchecked.
 * NOTE(review): the memcpy() also copies the command's host, service and
 * next pointers; host is visibly replaced afterwards, confirm the others
 * are reset too.
 */
779 static inline void mrln_add_downtime(char *host
, char *service
, int id
)
781 struct downtime_entry
*dt
, *cmd
, *old
;
782 dkhash_table
*the_table
;
784 if (!is_interesting_service(host
, service
))
787 dt
= malloc(sizeof(*dt
));
788 cmd
= find_downtime_command(host
, service
);
790 warn("DT with no ext cmd? %lu %s;%s", ltime
, host
, service
);
791 memset(dt
, 0, sizeof(*dt
));
792 dt
->duration
= 7200; /* the default downtime duration in nagios */
794 dt
->stop
= dt
->start
+ dt
->duration
;
797 memcpy(dt
, cmd
, sizeof(*dt
));
799 dt
->host
= strdup(host
);
803 set_next_dt_purge(ltime
, dt
->duration
);
806 dt
->service
= strdup(service
);
807 the_table
= service_downtime
;
811 the_table
= host_downtime
;
814 old
= dkhash_get(the_table
, dt
->host
, dt
->service
);
816 dkhash_remove(the_table
, old
->host
, old
->service
);
822 dkhash_insert(the_table
, dt
->host
, dt
->service
, dt
);
824 dt_print("IN_DT", ltime
, dt
);
825 insert_downtime_event(NEBTYPE_DOWNTIME_START
, dt
->host
, dt
->service
, dt
->id
);
/*
 * last_host_dt_del / last_svc_dt_del: log time of the most recent
 * DEL_HOST_DOWNTIME / DEL_SVC_DOWNTIME command; insert_downtime() uses
 * them to tell a deliberate cancellation from an early stop.
 *
 * register_downtime_command() turns a parsed downtime external command
 * (fields in the global strv) into a downtime_entry and stashes it for
 * later matching.
 *
 * sc:    the matched command_codes entry.
 * nvecs: number of fields actually parsed; tested against sc->nvecs
 *        before any field is read.
 *
 * Field positions depend on the command: the service variant reads its
 * times one index later than the host/group variants (start_time from
 * strv[2] instead of strv[1]). strtotimet() on the field after the
 * trigger position is used to decide whether a trigger field is present
 * at all. Unknown command codes end in lp_crash().
 *
 * The entry is then normalised: a command with neither start nor stop is
 * logged and dropped from processing; an unparsable duration is
 * tolerated (the existing comment names 7200 as the fallback); a missing
 * start or stop is derived from the other plus the duration; a start in
 * the past relative to ltime, and a stop before ltime or before start,
 * are adjusted; for fixed downtime the duration is forced to
 * stop - start. Durations above 14 days only produce a warning.
 * Commands whose start lies after time(NULL) are tested for separately
 * (the existing comment says they are ignored).
 *
 * NOTE(review): calloc() is called as calloc(size, 1); harmless, but the
 * conventional order is (count, size).
 * NOTE(review): the strdup() results for host/service are not checked.
 * NOTE(review): "dt->duration > time(NULL)" compares a duration with an
 * absolute timestamp; it works as a sanity bound only.
 * NOTE(review): time values are logged with %lu; check against time_t.
 */
828 static time_t last_host_dt_del
, last_svc_dt_del
;
829 static int register_downtime_command(struct string_code
*sc
, int nvecs
)
831 struct downtime_entry
*dt
;
832 char *start_time
, *end_time
, *duration
= NULL
;
833 char *host
= NULL
, *service
= NULL
, *fixed
, *triggered_by
= NULL
;
837 * this could cause crashes if we let it go on, so
838 * bail early if we didn't parse enough fields from
841 if (nvecs
< sc
->nvecs
) {
846 case DEL_HOST_DOWNTIME
:
847 last_host_dt_del
= ltime
;
849 case DEL_SVC_DOWNTIME
:
850 last_svc_dt_del
= ltime
;
853 case SCHEDULE_HOST_DOWNTIME
:
854 if (strtotimet(strv
[5], &foo
))
857 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME
:
858 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME
:
859 case SCHEDULE_HOST_SVC_DOWNTIME
:
862 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME
:
863 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME
:
864 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME
:
865 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME
:
866 start_time
= strv
[1];
869 if (strtotimet(strv
[5], &foo
))
870 triggered_by
= strv
[4];
876 case SCHEDULE_SVC_DOWNTIME
:
879 start_time
= strv
[2];
882 if (strtotimet(strv
[6], &foo
)) {
883 triggered_by
= strv
[5];
892 lp_crash("Unknown downtime type: %d", sc
->code
);
895 if (!(dt
= calloc(sizeof(*dt
), 1)))
896 lp_crash("calloc(%u, 1) failed: %s", (uint
)sizeof(*dt
), strerror(errno
));
900 dt
->host
= strdup(host
);
902 dt
->service
= strdup(service
);
904 dt
->trigger
= triggered_by
? !!(*triggered_by
- '0') : 0;
905 dt
->start
= dt
->stop
= 0;
906 strtotimet(start_time
, &dt
->start
);
907 strtotimet(end_time
, &dt
->stop
);
910 * if neither of these is set, we can't use this command,
911 * so log it as an unknown event and move on. We really
912 * shouldn't crash here no matter what anyways.
914 if (!dt
->start
&& !dt
->stop
) {
915 devectorize_string(strv
, nvecs
);
916 warn("No dt->start or dt->stop in: %s", strv
[0]);
921 * sometimes downtime commands can be logged according to
922 * log version 1, while the log still claims to be version 2.
923 * Apparently, this happens when using a daemon supporting
924 * version 2 logging but a downtime command is added that
925 * follows the version 1 standard.
926 * As such, we simply ignore the result of the "duration"
927 * field conversion and just accept that it might not work.
928 * If it doesn't, we force-set it to 7200, since that's what
929 * Nagios uses as a default, and we'll need two of duration,
930 * start_time and end_time in order to make some sense of
931 * this downtime entry
933 if (strtotimet(duration
, &dt
->duration
) < 0)
935 dt
->fixed
= *fixed
- '0';
938 * we know we have a duration and at least one of stop
939 * and start. Calculate the other if one is missing.
942 dt
->stop
= dt
->start
+ dt
->duration
;
943 } else if (!dt
->start
) {
944 dt
->start
= dt
->stop
- dt
->duration
;
947 /* make sure we're not starting timeperiod in the past */
948 if (dt
->start
< ltime
) {
950 if (dt
->stop
<= dt
->start
)
953 /* if fixed, we alter duration. Otherwise we alter 'stop' */
955 dt
->duration
= dt
->stop
- dt
->start
;
957 dt
->stop
= dt
->start
+ dt
->duration
;
961 * ignore downtime scheduled to take place in the future.
962 * It will be picked up by the module anyways
964 if (dt
->start
> time(NULL
)) {
969 if (dt
->duration
> time(NULL
)) {
970 warn("Bizarrely large duration (%lu)", dt
->duration
);
972 if (dt
->start
< ltime
) {
973 if (dt
->duration
&& dt
->duration
> ltime
- dt
->start
)
974 dt
->duration
-= ltime
- dt
->start
;
978 if (dt
->stop
< ltime
|| dt
->stop
< dt
->start
) {
979 /* retroactively scheduled downtime, or just plain wrong */
980 dt
->stop
= dt
->start
;
984 if (dt
->fixed
&& dt
->duration
!= dt
->stop
- dt
->start
) {
985 // warn("duration doesn't match stop - start: (%lu : %lu)",
986 // dt->duration, dt->stop - dt->start);
988 dt
->duration
= dt
->stop
- dt
->start
;
990 else if (dt
->duration
> 86400 * 14) {
991 warn("Oddly long duration: %lu", dt
->duration
);
994 debug("start=%lu; stop=%lu; duration=%lu; fixed=%d; trigger=%d; host=%s service=%s\n",
995 dt
->start
, dt
->stop
, dt
->duration
, dt
->fixed
, dt
->trigger
, dt
->host
, dt
->service
);
997 stash_downtime_command(dt
);
/*
 * Handles a "HOST DOWNTIME ALERT" / "SERVICE DOWNTIME ALERT" log line.
 *
 * sc: the matched event_codes entry; sc->nvecs == 4 marks the service
 *     form, in which case the entry is looked up in service_downtime,
 *     otherwise in host_downtime.
 *
 * The field at index sc->nvecs - 2 decides the event: "STARTED" means
 * NEBTYPE_DOWNTIME_START, anything else is treated as
 * NEBTYPE_DOWNTIME_STOP.
 * Start: the gap since last_downtime_start is tested (more than one
 * second), then mrln_add_downtime() registers the downtime and
 * last_downtime_start is updated.
 * Stop: the time of the last matching delete command is chosen by
 * whether the entry has a service. A stop that came more than a second
 * after that command and more than 60 seconds short of the planned
 * duration is logged as short; one that overran the duration by more
 * than DT_PURGE_GRACETIME is printed as "Long". The entry is ended via
 * remove_downtime(), after which both tables are walked with
 * del_matching_dt().
 *
 * NOTE(review): how host, service, id and del_dte are assigned, and the
 * handling of a stop without a known entry, are not visible in this
 * listing.
 */
1001 static int insert_downtime(struct string_code
*sc
)
1004 struct downtime_entry
*dt
= NULL
;
1007 char *host
, *service
= NULL
;
1010 if (sc
->nvecs
== 4) {
1012 dt
= dkhash_get(service_downtime
, host
, service
);
1015 dt
= dkhash_get(host_downtime
, host
, NULL
);
1021 * to stop a downtime we can either get STOPPED or
1022 * CANCELLED. So far, I've only ever seen STARTED
1023 * for when it actually starts though, and since
1024 * the Nagios daemon is reponsible for launching
1025 * it, it's unlikely there are more variants of
1028 type
= NEBTYPE_DOWNTIME_STOP
;
1029 if (!strcmp(strv
[sc
->nvecs
- 2], "STARTED"))
1030 type
= NEBTYPE_DOWNTIME_START
;
1033 case NEBTYPE_DOWNTIME_START
:
1034 if (ltime
- last_downtime_start
> 1)
1038 mrln_add_downtime(host
, service
, id
);
1039 last_downtime_start
= ltime
;
1042 case NEBTYPE_DOWNTIME_STOP
:
1045 * this can happen when overlapping downtime entries
1046 * occur, and the start event for the second (or nth)
1047 * downtime starts before the first downtime has had
1048 * a stop event. It basically means we've almost
1049 * certainly done something wrong.
1051 //printf("no dt. ds.host_name == '%s'\n", ds.host_name);
1052 //fprintf(stderr, "CRASHING: %s;%s\n", ds.host_name, ds.service_description);
1053 //crash("DOWNTIME_STOP without matching DOWNTIME_START");
1058 dt_del_cmd
= !dt
->service
? last_host_dt_del
: last_svc_dt_del
;
1060 if ((ltime
- dt_del_cmd
) > 1 && dt
->duration
- (ltime
- dt
->started
) > 60) {
1061 debug("Short dt duration (%lu) for %s;%s (dt->duration=%lu)\n",
1062 ltime
- dt
->started
, dt
->host
, dt
->service
, dt
->duration
);
1064 if (ltime
- dt
->started
> dt
->duration
+ DT_PURGE_GRACETIME
)
1065 dt_print("Long", ltime
, dt
);
1067 remove_downtime(dt
);
1069 * Now delete whatever matching downtimes we can find.
1070 * this must be here, or we'll recurse like crazy into
1071 * remove_downtime(), possibly exhausting the stack
1076 dkhash_walk_data(host_downtime
, del_matching_dt
);
1078 dkhash_walk_data(service_downtime
, del_matching_dt
);
/*
 * dt_purged: per-walk counter read by purge_expired_downtime() after
 * each table walk.
 *
 * purge_expired_dt() is the dkhash_walk_data() callback used for that
 * walk. 'data' is a downtime entry. set_next_dt_purge() is first told
 * when this entry is due (started + duration). An entry whose stop time
 * lies before ltime + DT_PURGE_GRACETIME is logged, ended through
 * remove_downtime() and dropped from the table by returning
 * DKHASH_WALK_REMOVE; other entries are printed with the tag
 * "PURGED_NOT_TIME".
 *
 * NOTE(review): the debug() format string opens a parenthesis that it
 * never closes.
 * NOTE(review): the increment of dt_purged and the return value for
 * entries that are kept are not visible in this listing.
 */
1088 static int dt_purged
;
1089 static int purge_expired_dt(void *data
)
1091 struct downtime_entry
*dt
= data
;
1098 set_next_dt_purge(dt
->started
, dt
->duration
);
1100 if (ltime
+ DT_PURGE_GRACETIME
> dt
->stop
) {
1102 debug("PURGE %lu: purging expired dt %d (start=%lu; started=%lu; stop=%lu; duration=%lu; host=%s; service=%s",
1103 ltime
, dt
->id
, dt
->start
, dt
->started
, dt
->stop
, dt
->duration
, dt
->host
, dt
->service
);
1104 remove_downtime(dt
);
1105 return DKHASH_WALK_REMOVE
;
1108 dt_print("PURGED_NOT_TIME", ltime
, dt
);
/*
 * purged_downtimes: running total of purged entries across all calls.
 *
 * purge_expired_downtime() walks host_downtime and then service_downtime
 * with purge_expired_dt(), logs how many entries each walk purged
 * (dt_purged) and the combined total, logs when the next purge is due,
 * and adds the total to purged_downtimes.
 *
 * NOTE(review): the resetting of dt_purged and next_dt_purge between the
 * walks is not visible in this listing; confirm.
 */
1114 static int purged_downtimes
;
1115 static void purge_expired_downtime(void)
1121 dkhash_walk_data(host_downtime
, purge_expired_dt
);
1123 debug("PURGE %d host downtimes purged", dt_purged
);
1124 tot_purged
+= dt_purged
;
1126 dkhash_walk_data(service_downtime
, purge_expired_dt
);
1128 debug("PURGE %d service downtimes purged", dt_purged
);
1129 tot_purged
+= dt_purged
;
1131 debug("PURGE total %d entries purged", tot_purged
);
1134 debug("PURGE next downtime purge supposed to run @ %lu, in %lu seconds",
1135 next_dt_purge
, next_dt_purge
- ltime
);
1137 purged_downtimes
+= tot_purged
;
/*
 * Notes that the monitored daemon started at the current log time.
 *
 * A NEBTYPE_PROCESS_START event is recorded only when the daemon was not
 * already considered running; daemon_start and daemon_is_running are
 * updated in either case.
 */
1140 static inline void handle_start_event(void)
1142 if (!daemon_is_running
)
1143 insert_process_event(NEBTYPE_PROCESS_START
);
1145 daemon_start
= ltime
;
1146 daemon_is_running
= 1;
/*
 * Notes that the monitored daemon stopped at the current log time.
 *
 * A NEBTYPE_PROCESS_SHUTDOWN event is recorded, and daemon_is_running
 * cleared, only when the daemon was considered running; daemon_stop is
 * set to ltime.
 */
1149 static inline void handle_stop_event(void)
1151 if (daemon_is_running
) {
1152 insert_process_event(NEBTYPE_PROCESS_SHUTDOWN
);
1153 daemon_is_running
= 0;
1155 daemon_stop
= ltime
;
/*
 * Parses one log line and dispatches it to the matching insert handler.
 *
 * line: the log line (modified in place by the vectorize helpers).
 * len:  its length; len + 1 is added to 'imported' to account for the
 *       newline.
 *
 * Steps visible in this listing:
 *  - progress accounting via lines_since_progress / PROGRESS_INTERVAL;
 *  - lines shorter than 12 characters or not starting with '[' are
 *    warned about;
 *  - the timestamp after '[' is read with strtoul() into ltime; a line
 *    without one ends in lp_crash();
 *  - ltime is compared with the previous line's time (last_ltime) and
 *    with the --incremental cut-off;
 *  - purge_expired_downtime() runs once ltime reaches next_dt_purge;
 *  - lines without a ':' are checked for daemon start/stop messages and
 *    otherwise reported as unknown events;
 *  - any other event implies a running daemon: a missing start event is
 *    synthesised;
 *  - the event name is looked up with get_event_type(); ignored and
 *    (depending on only_notifications) uninteresting events are tested
 *    for before the line is split with vectorize_string();
 *  - external commands are looked up with get_command_type(), with
 *    RESTART_PROGRAM treated as a stop event, acknowledgements sent to
 *    insert_acknowledgement() and everything else to
 *    register_downtime_command();
 *  - check, downtime and notification events go to their insert_*
 *    handlers and the result is passed to handle_sql_result().
 *
 * NOTE(review): the "event without a start-event" branch repeats the
 * body of handle_start_event() inline.
 * NOTE(review): no NULL test on the strchr(ptr, ';') result is visible
 * before it is used in pointer arithmetic; confirm one exists.
 * NOTE(review): strtoul() returns unsigned long while ltime is time_t.
 */
1158 static int parse_line(char *line
, uint len
)
1161 int result
= 0, nvecs
= 0;
1162 struct string_code
*sc
;
1163 static time_t last_ltime
= 0;
1165 imported
+= len
+ 1; /* make up for 1 lost byte per newline */
1168 /* ignore empty lines */
1172 if (++lines_since_progress
>= PROGRESS_INTERVAL
)
1175 /* skip obviously bogus lines */
1176 if (len
< 12 || *line
!= '[') {
1177 warn("line %d; len too short, or line doesn't start with '[' (%s)", line_no
, line
);
1181 ltime
= strtoul(line
+ 1, &ptr
, 10);
1182 if (line
+ 1 == ptr
) {
1183 lp_crash("Failed to parse log timestamp from '%s'. I can't handle malformed logdata", line
);
1187 if (ltime
< last_ltime
) {
1188 // warn("ltime < last_ltime (%lu < %lu) by %lu. Compensating...",
1189 // ltime, last_ltime, last_ltime - ltime);
1196 * Incremental will be 0 if not set, or 1 if set but
1197 * the database is currently empty.
1198 * Note that this will not always do the correct thing,
1199 * as downtime entries that might have been scheduled for
1200 * purging may never show up as "stopped" in the database
1201 * with this scheme. As such, incremental imports absolutely
1202 * require that nothing is in scheduled downtime when the
1203 * import is running (well, started really, but it amounts
1204 * to the same thing).
1206 if (ltime
< incremental
)
1209 if (next_dt_purge
&& ltime
>= next_dt_purge
)
1210 purge_expired_downtime();
1212 while (*ptr
== ']' || *ptr
== ' ')
1215 if (!is_interesting(ptr
))
1218 if (!(colon
= strchr(ptr
, ':'))) {
1219 /* stupid heuristic, but might be good for something,
1220 * somewhere, sometime. if nothing else, it should suppress
1221 * annoying output */
1222 if (is_start_event(ptr
)) {
1223 handle_start_event();
1226 if (is_stop_event(ptr
)) {
1227 handle_stop_event();
1232 * An unhandled event. We should probably crash here
1234 handle_unknown_event(line
);
1238 /* an event happened without us having gotten a start-event */
1239 if (!daemon_is_running
) {
1240 insert_process_event(NEBTYPE_PROCESS_START
);
1241 daemon_start
= ltime
;
1242 daemon_is_running
= 1;
1245 if (!(sc
= get_event_type(ptr
, colon
- ptr
))) {
1246 handle_unknown_event(line
);
1250 if (sc
->code
== IGNORE_LINE
)
1254 * break out early if we know we won't handle this event
1255 * There's no point in parsing a potentially huge amount
1256 * of lines we're not even interested in
1259 case NEBTYPE_NOTIFICATION_END
+ CONCERNS_HOST
:
1260 case NEBTYPE_NOTIFICATION_END
+ CONCERNS_SERVICE
:
1261 if (only_notifications
)
1265 if (only_notifications
)
1278 nvecs
= vectorize_string(ptr
, sc
->nvecs
);
1280 if (nvecs
!= sc
->nvecs
) {
1282 warn("Line %d in %s seems to not have all the fields it should",
1283 line_no
, cur_file
->path
);
1287 for (i
= 0; i
< sc
->nvecs
; i
++) {
1289 /* this should never happen */
1290 warn("Line %d in %s seems to be broken, or we failed to parse it into a vector",
1291 line_no
, cur_file
->path
);
1300 case NEBTYPE_EXTERNALCOMMAND_END
:
1301 semi_colon
= strchr(ptr
, ';');
1304 if (!(sc
= get_command_type(ptr
, semi_colon
- ptr
))) {
1307 if (sc
->code
== RESTART_PROGRAM
) {
1308 handle_stop_event();
1312 nvecs
= vectorize_string(semi_colon
+ 1, sc
->nvecs
);
1313 if (nvecs
!= sc
->nvecs
) {
1314 warn("nvecs discrepancy: %d vs %d (%s)\n", nvecs
, sc
->nvecs
, ptr
);
1316 if (sc
->code
!= ACKNOWLEDGE_HOST_PROBLEM
&&
1317 sc
->code
!= ACKNOWLEDGE_SVC_PROBLEM
)
1319 register_downtime_command(sc
, nvecs
);
1321 insert_acknowledgement(sc
);
1325 case NEBTYPE_HOSTCHECK_PROCESSED
:
1326 result
= insert_host_check(sc
);
1329 case NEBTYPE_SERVICECHECK_PROCESSED
:
1330 result
= insert_service_check(sc
);
1333 case NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_HOST
:
1334 case NEBTYPE_DOWNTIME_LOAD
+ CONCERNS_SERVICE
:
1335 result
= insert_downtime(sc
);
1338 case NEBTYPE_NOTIFICATION_END
+ CONCERNS_HOST
:
1339 case NEBTYPE_NOTIFICATION_END
+ CONCERNS_SERVICE
:
1340 result
= insert_notification(sc
);
1347 handle_sql_result(result
, db_table
);
/*
 * Wrapper around parse_line() that aborts on database errors.
 *
 * str, len: the line and its length, passed straight to parse_line().
 *
 * When parse_line() returns non-zero, a database is in use and
 * sql_error() reports an error message, the program ends through
 * lp_crash() with that message. The return value is not visible in this
 * listing.
 */
1351 static int parse_one_line(char *str
, uint len
)
1355 if (parse_line(str
, len
) && use_database
&& sql_error(&msg
))
1356 lp_crash("sql error: %s", msg
);
/*
 * Line callback (passed to lparse_path() by hash_interesting()):
 * registers the object named on 'line' via add_interesting_object() and
 * returns its result. 'len' is not used.
 */
1361 static int hash_one_line(char *line
, uint len
)
1363 return add_interesting_object(line
);
/*
 * Loads the list of interesting objects from the file at 'path'.
 *
 * path: file to read.
 *
 * The file is stat()ed to obtain its size (failure ends the program via
 * lp_crash() with the errno text) and then fed line by line to
 * hash_one_line() through lparse_path(). The return value is not visible
 * in this listing.
 */
1366 static int hash_interesting(const char *path
)
1370 if (stat(path
, &st
) < 0)
1371 lp_crash("failed to stat %s: %s", path
, strerror(errno
));
1373 lparse_path(path
, st
.st_size
, hash_one_line
);
/*
 * Prints an optional printf-style message followed by the command-line
 * help text, all on stdout.
 *
 * fmt, ...: message and arguments, checked by the compiler through the
 *           printf format attribute (format string is argument 1,
 *           variadic arguments start at 2).
 *
 * NOTE(review): the help line for the repair option reads
 * "--[no-]repair]" -- the brackets are unbalanced.
 * NOTE(review): the condition around vfprintf() and whatever follows the
 * option list (presumably the program exits) are not visible in this
 * listing.
 */
1379 __attribute__((__format__(__printf__
, 1, 2)))
1380 static void usage(const char *fmt
, ...)
1386 vfprintf(stdout
, fmt
, ap
);
1390 printf("Usage %s [options] [logfiles]\n\n", progname
);
1391 printf(" [logfiles] refers to all the nagios logfiles you want to import\n");
1392 printf(" If --nagios-cfg is given or can be inferred no logfiles need to be supplied\n");
1393 printf("\nOptions:\n");
1394 printf(" --help this cruft\n");
1395 printf(" --no-progress don't display progress output\n");
1396 printf(" --no-sql don't access the database\n");
1397 printf(" --db-name database name\n");
1398 printf(" --db-table database table name\n");
1399 printf(" --db-user database user\n");
1400 printf(" --db-pass database password\n");
1401 printf(" --db-host database host\n");
1402 printf(" --db-port database port\n");
1403 printf(" --db-type database type\n");
1404 printf(" --db-conn-str database connection string\n");
1405 printf(" --[no-]repair] should we autorepair tables?\n");
1406 printf(" --incremental[=<when>] do an incremental import (since $when)\n");
1407 printf(" --truncate-db truncate database before importing\n");
1408 printf(" --only-notifications only import notifications\n");
1409 printf(" --nagios-cfg=</path/to/nagios.cfg> path to nagios.cfg\n");
1410 printf(" --list-files list files to import\n");
1419 int main(int argc
, char **argv
)
1421 int i
, truncate_db
= 0;
1422 const char *nagios_cfg
= NULL
;
1423 char *db_name
, *db_user
, *db_pass
;
1424 char *db_conn_str
, *db_host
, *db_port
, *db_type
;
1426 progname
= strrchr(argv
[0], '/');
1427 progname
= progname
? progname
+ 1 : argv
[0];
1430 db_name
= db_user
= db_pass
= NULL
;
1431 db_conn_str
= db_host
= db_port
= db_type
= NULL
;
1433 do_progress
= isatty(fileno(stdout
));
1435 strv
= calloc(sizeof(char *), MAX_NVECS
);
1436 dentry
= calloc(sizeof(*dentry
), NUM_DENTRIES
);
1437 if (!strv
|| !dentry
)
1438 crash("Failed to alloc initial structs");
1441 for (num_nfile
= 0,i
= 1; i
< argc
; i
++) {
1442 char *opt
, *arg
= argv
[i
];
1443 int arg_len
, eq_opt
= 0;
1445 if ((opt
= strchr(arg
, '='))) {
1449 else if (i
< argc
- 1) {
1453 if (!prefixcmp(arg
, "-h") || !prefixcmp(arg
, "--help")) {
1456 if (!prefixcmp(arg
, "--incremental")) {
1460 * nifty for debugging --incremental skipping log-files
1461 * The value will be overwritten unless --no-sql is also
1465 incremental
= strtoul(opt
, NULL
, 0);
1467 usage("--incremental= requires a parameter");
1469 * since we use '1' to mean "determine automatically",
1470 * we magic a '1' from userspace to '2'. In practice,
1471 * this just means the user doesn't need to know a
1472 * thing about this program's internals.
1474 if (incremental
== 1)
1479 if (!prefixcmp(arg
, "--no-sql")) {
1483 if (!prefixcmp(arg
, "--no-repair")) {
1487 if (!prefixcmp(arg
, "--repair")) {
1491 if (!prefixcmp(arg
, "--only-notifications")) {
1492 only_notifications
= 1;
1493 db_table
= db_table
? db_table
: "notification";
1496 if (!prefixcmp(arg
, "--no-progress")) {
1500 if (!prefixcmp(arg
, "--debug") || !prefixcmp(arg
, "-d")) {
1505 if (!prefixcmp(arg
, "--truncate-db")) {
1509 if (!prefixcmp(arg
, "--list-files")) {
1514 if (!prefixcmp(arg
, "--nagios-cfg")) {
1515 if (!opt
|| !*opt
) {
1516 crash("%s requires the path to nagios.cfg as argument", arg
);
1523 if (!prefixcmp(arg
, "--db-name")) {
1525 crash("%s requires a database name as an argument", arg
);
1531 if (!prefixcmp(arg
, "--db-user")) {
1533 crash("%s requires a database username as argument", arg
);
1539 if (!prefixcmp(arg
, "--db-pass")) {
1541 crash("%s requires a database username as argument", arg
);
1547 if (!prefixcmp(arg
, "--db-table")) {
1549 crash("%s requires a database table name as argument", arg
);
1555 if (!prefixcmp(arg
, "--db-conn-str")) {
1557 crash("%s requires a connection string as argument", arg
);
1563 if (!prefixcmp(arg
, "--db-host")) {
1565 crash("%s requires a host as argument", arg
);
1571 if (!prefixcmp(arg
, "--db-port")) {
1573 crash("%s requires a port as argument", arg
);
1579 if (!prefixcmp(arg
, "--db-type")) {
1581 crash("%s requires a database type as an argument", arg
);
1587 if (!prefixcmp(arg
, "--interesting") || !prefixcmp(arg
, "-i")) {
1589 crash("%s requires a filename as argument", arg
);
1590 hash_interesting(opt
);
1596 /* non-argument, so treat as a config- or log-file */
1597 arg_len
= strlen(arg
);
1598 if (arg_len
>= 10 && !strcmp(&arg
[arg_len
- 10], "nagios.cfg")) {
1601 add_naglog_path(arg
);
1605 /* fallback for op5 systems */
1606 if (!nagios_cfg
&& !num_nfile
) {
1607 nagios_cfg
= "/opt/monitor/etc/nagios.cfg";
1610 struct cfg_comp
*conf
;
1613 conf
= cfg_parse_file(nagios_cfg
);
1614 for (vi
= 0; vi
< conf
->vars
; vi
++) {
1615 struct cfg_var
*v
= conf
->vlist
[vi
];
1616 if (!strcmp(v
->key
, "log_file")) {
1617 add_naglog_path(v
->value
);
1619 if (!strcmp(v
->key
, "log_archive_path")) {
1620 add_naglog_path(v
->value
);
1625 if (!list_files
&& use_database
&& (!truncate_db
&& !incremental
)) {
1626 printf("Defaulting to incremental mode\n");
1631 db_user
= db_user
? db_user
: "merlin";
1632 db_pass
= db_pass
? db_pass
: "merlin";
1633 db_type
= db_type
? db_type
: "mysql";
1634 sql_config("user", db_user
);
1635 sql_config("pass", db_pass
);
1636 sql_config("type", db_type
);
1638 sql_config("conn_str", db_conn_str
);
1640 db_name
= db_name
? db_name
: "merlin";
1641 db_table
= db_table
? db_table
: "report_data";
1642 db_host
= db_host
? db_host
: "localhost";
1643 sql_config("database", db_name
);
1644 sql_config("host", db_host
);
1645 sql_config("port", db_port
);
1648 sql_config("commit_interval", "0");
1649 sql_config("commit_queries", "10000");
1651 if (sql_init() < 0) {
1652 crash("sql_init() failed. db=%s, table=%s, user=%s, db msg=[%s]",
1653 db_name
, db_table
, db_user
, sql_error_msg());
1656 sql_query("TRUNCATE %s", db_table
);
1658 if (incremental
== 1) {
1659 db_wrap_result
* result
= NULL
;
1660 sql_query("SELECT MAX(%s) FROM %s.%s",
1661 only_notifications
? "end_time" : "timestamp",
1664 if (!(result
= sql_get_result()))
1665 crash("Failed to get last timestamp: %s\n", sql_error_msg());
1667 * someone might use --incremental with an empty
1668 * database. We shouldn't crash in that case
1670 if (0 == result
->api
->step(result
)) {
1671 /* reminder: incremental is a time_t, which may be 32 or 64 bits wide.
1672 Thus we read into a separate int32_t object here to avoid passing an
1673 invalid pointer (&incremental) on platforms where time_t is not 32 bits.
1675 int32_t inctime
= 0;
1676 result
->api
->get_int32_ndx(result
, 0, &inctime
);
1677 incremental
= inctime
;
1683 log_grok_var("logfile", "/dev/null");
1684 log_grok_var("log_levels", "warn");
1687 usage("No files or directories specified, or nagios.cfg not found");
1690 crash("log_init() failed");
1692 qsort(nfile
, num_nfile
, sizeof(*nfile
), nfile_cmp
);
1694 host_downtime
= dkhash_create(HASH_TABLE_SIZE
);
1695 service_downtime
= dkhash_create(HASH_TABLE_SIZE
);
1697 if (state_init() < 0)
1698 crash("Failed to initialize state machinery");
1700 /* go through them once to count the total size for progress output */
1701 for (i
= 0; i
< num_nfile
; i
++) {
1702 totsize
+= nfile
[i
].size
;
1706 gettimeofday(&import_start
, NULL
);
1707 printf("Importing %s of data from %d files\n",
1708 human_bytes(totsize
), num_nfile
);
1711 for (i
= 0; i
< num_nfile
; i
++) {
1712 struct naglog_file
*nf
= &nfile
[i
];
1717 * skip parsing files if they're not interesting, such
1718 * as during incremental imports.
1719 * 'incremental' will be 0 if we're doing a full import,
1720 * 1 if we're doing an incremental but the database is
1721 * empty and will contain the timestamp of the latest
1722 * entry in the database if we're doing an incremental
1723 * import to a populated table.
1724 * Note that we can never skip the last file in the list,
1725 * although the lparse routine should sift through it
1726 * pretty quickly in case it has nothing interesting.
1728 if (i
+ 1 < num_nfile
&& incremental
> nfile
[i
+ 1].first
) {
1730 skipped
+= nf
->size
;
1734 printf("%s\n", nf
->path
);
1737 debug("importing from %s (%lu : %u)\n", nf
->path
, nf
->first
, nf
->cmp
);
1739 lparse_path(nf
->path
, nf
->size
, parse_one_line
);
1740 imported
++; /* make up for one lost byte per file */
1744 purge_expired_downtime();
1749 printf("Unclosed host downtimes:\n");
1750 puts("------------------------");
1751 dkhash_walk_data(host_downtime
, print_downtime
);
1752 printf("Unclosed service downtimes:\n");
1753 puts("---------------------------");
1754 dkhash_walk_data(service_downtime
, print_downtime
);
1756 printf("dt_depth: %d\n", dt_depth
);
1758 printf("purged downtimes: %d\n", purged_downtimes
);
1759 printf("max simultaneous host downtime hashes: %u\n",
1760 dkhash_num_entries_max(host_downtime
));
1761 printf("max simultaneous service downtime hashes: %u\n",
1762 dkhash_num_entries_max(service_downtime
));
1763 printf("max downtime depth: %u\n", max_dt_depth
);
1767 if (!only_notifications
)
1768 insert_extras(); /* must be before indexing */
1773 if (warnings
&& debug_level
)
1774 fprintf(stderr
, "Total warnings: %d\n", warnings
);
1776 if (debug_level
|| dt_start
> dt_stop
) {
1778 fprintf(stderr
, "Downtime data %s\n started: %d\n stopped: %d\n delta : %d\n skipped: %d\n",
1779 dt_depth
? "mismatch!" : "consistent", dt_start
, dt_stop
, dt_depth
, dt_skip
);
1780 if ((count
= dkhash_num_entries(host_downtime
))) {
1781 fprintf(stderr
, "host_downtime as %u entries remaining\n", count
);
1783 if ((count
= dkhash_num_entries(service_downtime
))) {
1784 fprintf(stderr
, "service_downtime has %u entries remaining\n", count
);
1788 print_unhandled_events();