# Copyright (C) 2009-2010 :
#     Gabes Jean, naparuba@gmail.com
#     Gerhard Lausser, Gerhard.Lausser@consol.de
#
# This file is part of Shinken.
#
# Shinken is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Shinken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Shinken. If not, see <http://www.gnu.org/licenses/>.

21 """ This class is a common one for service/host. Here you
22 will find all scheduling related functions, like the schedule
23 or the consume_check ones. It's a quite important class!
import time
import random

from shinken.item import Item
from shinken.check import Check
from shinken.notification import Notification
from shinken.macroresolver import MacroResolver
from shinken.eventhandler import EventHandler
from shinken.dependencynode import DependencyNodeFactory, DependencyNode

class SchedulingItem(Item):

    # global counters used for [current|last]_[host|service]_[event|problem]_id
    current_event_id = 0
    current_problem_id = 0

    # Add a flapping change, but no more than 20 states
    # Then update the self.is_flapping bool by calling update_flapping
    def add_flapping_change(self, b):
        self.flapping_changes.append(b)

        # Keep just 20 changes (global flap_history value)
        flap_history = self.__class__.flap_history

        if len(self.flapping_changes) > flap_history:
            self.flapping_changes.pop(0)

        # Now we add a value, we update the is_flapping prop
        self.update_flapping()

    # We update the is_flapping prop with value in self.flapping_states
    # Old values have less weight than new ones
    def update_flapping(self):
        flap_history = self.__class__.flap_history
        # We compute the flapping change in %
        r = 0.0
        i = 0
        for b in self.flapping_changes:
            i += 1
            if b:
                r += i * (1.2 - 0.8) / flap_history + 0.8

        # Now we get the low_flap_threshold and high_flap_threshold values
        # They can be from self, or class
        (low_flap_threshold, high_flap_threshold) = (self.low_flap_threshold, self.high_flap_threshold)
        if low_flap_threshold == -1:
            cls = self.__class__
            low_flap_threshold = cls.low_flap_threshold
        if high_flap_threshold == -1:
            cls = self.__class__
            high_flap_threshold = cls.high_flap_threshold

        # Now we check if the flapping state changed
        if self.is_flapping and r < low_flap_threshold:
            self.is_flapping = False
            # We also raise a log entry
            self.raise_flapping_stop_log_entry(r, low_flap_threshold)
        if not self.is_flapping and r >= high_flap_threshold:
            self.is_flapping = True
            # We also raise a log entry
            self.raise_flapping_start_log_entry(r, high_flap_threshold)
        self.percent_state_change = r

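    # Illustrative sketch (not from the original source) of how the weighting
    # above behaves: each recorded state change contributes between roughly
    # 0.8 (oldest entry) and 1.2 (newest entry), so recent instability counts
    # more than old instability. Assuming flap_history = 20 and a hypothetical
    # list of booleans `changes`, the same accumulation could be written as:
    #
    #     weights = [i * (1.2 - 0.8) / 20 + 0.8 for i in range(1, len(changes) + 1)]
    #     r = sum(w for w, changed in zip(weights, changes) if changed)
    #
    # With all 20 entries True this sums to about 20.2; the original code may
    # additionally normalise this into a percentage, only the weighting is
    # shown here.
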
    # Add an attempt, but it cannot be more than max_check_attempts
    def add_attempt(self):
        self.attempt += 1
        self.attempt = min(self.attempt, self.max_check_attempts)

    # Return True if attempt is at max
    def is_max_attempts(self):
        return self.attempt >= self.max_check_attempts

    # Called by the scheduler to see if the last state is older than
    # freshness_threshold; if check_freshness is set, raise a check
    # even if active checks are disabled
    def do_check_freshness(self):
        now = time.time()
        # Before, check if the class (host or service) has check_freshness OK
        # Then check if the item wants freshness, then check freshness
        cls = self.__class__
        if not self.in_checking:
            if cls.check_freshness:
                if self.check_freshness and self.freshness_threshold != 0:
                    if self.last_state_update < now - (self.freshness_threshold + cls.additional_freshness_latency):
                        # Raise a log entry
                        self.raise_freshness_log_entry(int(now - self.last_state_update), int(now - self.freshness_threshold))
                        return self.launch_check(now)
        return None

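    # Small worked example (illustrative, not from the original source): with
    # freshness_threshold = 300s and additional_freshness_latency = 15s, an item
    # whose last_state_update is older than now - 315s gets a forced check even
    # though its active checks are disabled. Assuming a hypothetical item `svc`:
    #
    #     svc.check_freshness = True
    #     svc.freshness_threshold = 300
    #     svc.last_state_update = time.time() - 400   # stale result
    #     svc.do_check_freshness()   # would log staleness and launch a check
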
    # Raise all impacts from my error. I'm setting myself
    # as a problem, and I register myself as this in all
    # hosts/services that depend_on_me. So they are now my
    # impacts
    def set_myself_as_problem(self):
        now = time.time()

        self.is_problem = True
        # we should warn potential impacts of our problem
        # and they should be cool to register them so I've got
        # my impacts list
        for (impact, status, dep_type, tp, inh_par) in self.act_depend_of_me:
            # Check if the status is ok for impact
            for s in status:
                if self.is_state(s):
                    # now check if we should bailout because of a
                    # not good timeperiod for dep
                    if tp is None or tp.is_time_valid(now):
                        new_impacts = impact.register_a_problem(self)
                        self.impacts.extend(new_impacts)
                        # Make elements unique in this list
                        self.impacts = list(set(self.impacts))

        # We can update our criticity value now
        self.update_criticity_value()

        # And we register a new brok for update status
        b = self.get_update_status_brok()
        self.broks.append(b)

    # We update our 'criticity' value with the max of
    # the impacts' criticity if we got impacts. And save our 'configuration'
    # criticity if we have not done it before.
    # If we do not have impacts, we revert our value
    def update_criticity_value(self):
        # First save our criticity if not already done
        if self.my_own_criticity == -1:
            self.my_own_criticity = self.criticity

        # If we truly have impacts, we take the max criticity
        # if it's higher than our own
        if len(self.impacts) != 0:
            self.criticity = max(self.criticity, max([e.criticity for e in self.impacts]))
        elif self.my_own_criticity != -1:
            self.criticity = self.my_own_criticity

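    # Worked example (illustrative, not from the original source): a host with
    # criticity 2 that impacts a service with criticity 5 gets bumped to 5
    # while the problem lasts, then reverts to its saved my_own_criticity once
    # the impacts list is empty again:
    #
    #     host.criticity = 2            # configured value
    #     host.my_own_criticity = -1    # not saved yet
    #     host.impacts = [svc]          # svc.criticity == 5
    #     host.update_criticity_value() # host.criticity -> 5
    #     host.impacts = []
    #     host.update_criticity_value() # host.criticity -> 2 again
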
    # Look for my impacts, and remove me from their problems list
    def no_more_a_problem(self):
        self.is_problem = False

        # we warn impacts that we are no more a problem
        for impact in self.impacts:
            impact.deregister_a_problem(self)

        # we can just drop our impacts list
        self.impacts = []

        # And we register a new brok for update status
        b = self.get_update_status_brok()
        self.broks.append(b)

        # We update our criticity value, it's not a huge thing :)
        self.update_criticity_value()

    # called recursively by potential impacts so they
    # update their source_problems list. But do not
    # go below if the problem is not a real one for me,
    # like if I've got multiple parents for example
    def register_a_problem(self, pb):
        now = time.time()
        was_an_impact = self.is_impact
        # Our father already looked at whether he impacts us. So if we are here,
        # it's that we really are impacted
        self.is_impact = True

        # Ok, if we are impacted, we can add it in our
        # impacts list
        # TODO : remove this unused check
        impacts = []

        # Maybe I was a problem myself, now I can say : not my fault!
        if self.is_problem:
            self.no_more_a_problem()

        # Ok, we are now an impact, we should take the good state
        # but only when we just go in impact state
        if not was_an_impact:
            self.set_impact_state()

        # Ok now we can be a simple impact
        impacts.append(self)
        if pb not in self.source_problems:
            self.source_problems.append(pb)
        # we should send this problem to all potential impacts that
        # depend on us
        for (impact, status, dep_type, tp, inh_par) in self.act_depend_of_me:
            # Check if the status is ok for impact
            for s in status:
                if self.is_state(s):
                    # now check if we should bailout because of a
                    # not good timeperiod for dep
                    if tp is None or tp.is_time_valid(now):
                        new_impacts = impact.register_a_problem(pb)
                        impacts.extend(new_impacts)

        # And we register a new brok for update status
        b = self.get_update_status_brok()
        self.broks.append(b)

        # now we return all impacts (can be void of course)
        return impacts

    # Just remove the problem from our problems list
    # and check if we are still 'impacted'. It's not recursive because the problem
    # has the list of all its impacts
    def deregister_a_problem(self, pb):
        self.source_problems.remove(pb)

        # To know if we are still an impact: maybe our dependencies
        # are not aware of the removal of the impact state because it's not ordered,
        # so we can just look at whether we still have some problem in our list
        if len(self.source_problems) == 0:
            self.is_impact = False
            # No more an impact, we can unset the impact state
            self.unset_impact_state()

        # And we register a new brok for update status
        b = self.get_update_status_brok()
        self.broks.append(b)

    # When all deps are resolved, this function says if
    # an action can be raised or not by viewing dep statuses:
    # network_dep have to be all raised to be "no action";
    # logic_dep : just one is enough
    def is_no_action_dependant(self):
        # Used to know if notif is raised or not
        parent_is_down = []
        # So if one logic_dep is raised, it is dep;
        # if one network_dep is not ok, it is not dep;
        # at the end, raise no dep
        for (dep, status, type, tp, inh_par) in self.act_depend_of:
            # For logic_dep, only one raised state puts no action
            if type == 'logic_dep':
                for s in status:
                    if dep.is_state(s):
                        return True
            # more complicated: if none of the states match, the host is down
            else:
                p_is_down = False
                dep_match = [dep.is_state(s) for s in status]
                # check if the parent matches a case, so he is down
                if True in dep_match:
                    p_is_down = True
                parent_is_down.append(p_is_down)
        # if a parent is not down, no dep can explain the pb
        if False in parent_is_down:
            return False
        else:  # every parent is dead, so... It's not my fault :)
            return True

    # We check if we are "no action" just because of our parents (or host for
    # a service)
    # TODO : factorize with previous check?
    def check_and_set_unreachability(self):
        parent_is_down = []
        # We must have all parents raised to be unreachable
        for (dep, status, type, tp, inh_par) in self.act_depend_of:
            # Only network_dep parents count here
            if type == 'network_dep':
                p_is_down = False
                dep_match = [dep.is_state(s) for s in status]
                if True in dep_match:  # the parent matches a case, so he is down
                    p_is_down = True
                parent_is_down.append(p_is_down)

        # if a parent is not down, no dep can explain the pb
        if False in parent_is_down:
            return
        else:  # every parent is dead, so... It's not my fault :)
            self.set_unreachable()
            return

    # Used to know if I raise a dependency for someone else (with status).
    # If I do not raise dep, maybe my deps raise me. If so, I raise dep.
    # So it's a recursive function
    def do_i_raise_dependency(self, status, inherit_parents):
        # Do I raise dep?
        for s in status:
            if self.is_state(s):
                return True

        # If we do not inherit parent, we have no reason to be blocking
        if not inherit_parents:
            return False

        # Ok, I do not raise dep, but my deps maybe raise me
        now = time.time()
        for (dep, status, type, tp, inh_parent) in self.chk_depend_of:
            if dep.do_i_raise_dependency(status, inh_parent):
                if tp is None or tp.is_time_valid(now):
                    return True

        # No, I really do not raise...
        return False

    # Used to know if my deps force me not to be checked.
    # So check the chk_depend_of to see if they raise me
    def is_no_check_dependant(self):
        now = time.time()
        for (dep, status, type, tp, inh_parent) in self.chk_depend_of:
            if tp is None or tp.is_time_valid(now):
                if dep.do_i_raise_dependency(status, inh_parent):
                    return True
        return False

    # called by a bad consume check where the item sees that it has deps
    # and maybe it is not in real fault.
    def raise_dependancies_check(self, ref_check):
        now = time.time()
        cls = self.__class__
        checks = []
        for (dep, status, type, tp, inh_par) in self.act_depend_of:
            # If the dep timeperiod is not valid, do not raise the dep;
            # None means "every time"
            if tp is None or tp.is_time_valid(now):
                # if the update is 'fresh', do not raise dep,
                # cached_check_horizon = cached_service_check_horizon for service
                if dep.last_state_update < now - cls.cached_check_horizon:
                    i = dep.launch_check(now, ref_check)
                    if i is not None:
                        checks.append(i)
                # else:
                #     print "DBG: **************** The state is FRESH", dep.host_name, time.asctime(time.localtime(dep.last_state_update))
        return checks

    # Main scheduling function.
    # If a check is in progress, or active checks are disabled, do
    # not schedule a check.
    # The check interval changes with HARD state or not:
    # SOFT: retry_interval
    # HARD: check_interval
    # The first scheduling is a little random, so all checks
    # are not launched at the same time...
    def schedule(self, force=False, force_time=None):
        # if last_chk == 0, put it in a random way so all checks
        # are not at the same time

        now = time.time()
        cls = self.__class__

        # next_chk is already set, do not change
        # if self.next_chk >= now or self.in_checking and not force:
        if self.in_checking and not force:
            return None

        # if no active check and no force, no check
        if (not self.active_checks_enabled or not cls.execute_checks) and not force:
            return None

        # If the check_interval is 0, we should not add it
        if self.check_interval == 0 and not force:
            return None

        # If I do not have a check_period and no force time, I do nothing
        if (not hasattr(self, 'check_period') or self.check_period == None and force_time == None):
            return None

        # Interval change is in a HARD state or not.
        # If the retry is 0, take the normal value
        if self.state_type == 'HARD' or self.retry_interval == 0:
            interval = self.check_interval * 60
        else:  # TODO : if no retry_interval?
            interval = self.retry_interval * 60

        # The next_chk is past, so we need a new one
        # so we got a check_interval
        if self.next_chk == 0:
            # At the start, we cannot have an interval more than cls.max_check_spread
            # (service_max_check_spread or host_max_check_spread in the config)
            interval = min(interval, cls.max_check_spread * 60)
            r = interval * (random.random() - 0.5)
            time_add = interval / 2 + r
        else:
            time_add = interval

        if force_time is None:
            self.next_chk = self.check_period.get_next_valid_time_from_t(now + time_add)
        else:
            self.next_chk = force_time

        # If next time is None, do not go
        if self.next_chk == None:
            # Nagios does not raise it; I'm wondering if we should
            # self.raise_no_next_check_log_entry()
            return None

        # Get the command to launch, and put it in the queue
        self.launch_check(self.next_chk)

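    # Worked example (illustrative, not from the original source): for a service
    # in a HARD state with check_interval = 5 and max_check_spread = 15, the
    # first scheduling (next_chk == 0) uses an interval of 5 * 60 = 300s and
    # picks a point spread randomly inside that window:
    #
    #     interval = min(300, 15 * 60)          # -> 300
    #     r = 300 * (random.random() - 0.5)     # -150 .. +150
    #     time_add = 300 / 2 + r                # 0 .. 300 seconds from now
    #
    # so a freshly loaded configuration does not fire every check at once.
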
    # If we've got a system time change, we need to compensate for it.
    # We modify all past values; for active ones like next_chk, it's the current
    # checks that will give us the new value
    def compensate_system_time_change(self, difference):
        # We only need to change some values
        need_change = ['last_notification', 'last_state_change', 'last_hard_state_change']
        for p in need_change:
            val = getattr(self, p)  # current value
            # Do not go below 1970 :)
            val = max(0, val + difference)  # diff can be negative
            setattr(self, p, val)

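    # Example (illustrative, not from the original source): if the system clock
    # jumps back one hour, the scheduler would call this with difference = -3600
    # and every stored timestamp moves back with it, clamped at the epoch:
    #
    #     item.last_state_change = 1280000000
    #     item.compensate_system_time_change(-3600)
    #     # item.last_state_change == 1279996400
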
    # Remove a consumed check from the in-progress list
    def remove_in_progress_check(self, c):
        # The check is consumed, update the in_checking property
        if c in self.checks_in_progress:
            self.checks_in_progress.remove(c)
        else:
            print "Not removing check", c, "for ", self.get_name()
        self.update_in_checking()

    # Is in checking if and only if there are still checks not consumed
    def update_in_checking(self):
        self.in_checking = (len(self.checks_in_progress) != 0)

    # Remove just one notification that has returned
    def remove_in_progress_notification(self, n):
        if n.id in self.notifications_in_progress:
            n.status = 'zombie'
            del self.notifications_in_progress[n.id]

    # We do not need our current pending notifications,
    # so we zombify them and clean our list
    def remove_in_progress_notifications(self):
        for n in self.notifications_in_progress.values():
            self.remove_in_progress_notification(n)

    # Get an event handler if the item has an event handler
    # command. It must be enabled locally and globally
    def get_event_handlers(self, externalcmd=False):
        cls = self.__class__

        # The external command always passes;
        # if not, only if we enabled them (auto launch)
        if self.event_handler == None or ((not self.event_handler_enabled or not cls.enable_event_handlers) and not externalcmd):
            return

        print self.event_handler.__dict__
        m = MacroResolver()
        data = self.get_data_for_event_handler()
        cmd = m.resolve_command(self.event_handler, data)
        e = EventHandler(cmd, timeout=cls.event_handler_timeout)
        #print "DBG: Event handler call created"
        #print "DBG: ",e.__dict__
        self.raise_event_handler_log_entry(self.event_handler)

        # ok we can put it in our temp action queue
        self.actions.append(e)

    # Whenever a non-ok hard state is reached, we must check whether this
    # host/service has a flexible downtime waiting to be activated
    def check_for_flexible_downtime(self):
        status_updated = False
        for dt in self.downtimes:
            # activate flexible downtimes (do not activate triggered downtimes)
            if dt.fixed == False and dt.is_in_effect == False and dt.start_time <= self.last_chk and self.state_id != 0 and dt.trigger_id == 0:
                n = dt.enter()  # returns downtimestart notifications
                if n is not None:
                    self.actions.append(n)
                status_updated = True
        if status_updated == True:
            self.broks.append(self.get_update_status_brok())

    # Consume a check return and send actions in return:
    # main function of reaction to checks, like raising notifications.
    # Special cases:
    # is_flapping : immediate notif when problem
    # is_in_scheduled_downtime : no notification
    # is_volatile : notif immediately (service only)
    def consume_result(self, c):
        OK_UP = self.__class__.ok_up  # OK for service, UP for host

        # We check for stalking if necessary
        # so if check is here
        self.manage_stalking(c)

        # Latency can be <0 if we get a check from the retention file,
        # so in this case, set 0
        self.latency = max(0, c.check_time - c.t_to_go)

        # Ok, the first check is done
        self.has_been_checked = 1

        # Now get data from the check
        self.execution_time = c.execution_time
        self.last_chk = int(c.check_time)
        self.output = c.output
        self.long_output = c.long_output

        # Get the perf_data only if we want it in the configuration
        if self.__class__.process_performance_data and self.process_perf_data:
            self.last_perf_data = self.perf_data
            self.perf_data = c.perf_data

        # Before setting the state, apply result modulations
        for rm in self.resultmodulations:
            if rm is not None:
                c.exit_status = rm.module_return(c.exit_status)

        # If we got a bad result on a normal check, and we have deps,
        # we raise dep checks,
        # put the actual check in waitdep and we return all new checks
        if c.exit_status != 0 and c.status == 'waitconsume' and len(self.act_depend_of) != 0:
            c.status = 'waitdep'
            # Make sure the check knows about its deps.
            # C is my check, and he wants dependencies
            checks_id = self.raise_dependancies_check(c)
            for check_id in checks_id:
                # Get checks_id of dep
                c.depend_on.append(check_id)
            # Ok, no more need because checks are not
            # taken by host/service, and not returned

        # remember how we were before this check
        last_state_type = self.state_type

        self.set_state_from_exit_status(c.exit_status)

        # we changed the state; whether we are or are not in
        # an impact mode, we can note it
        self.state_changed_since_impact = True

        # The check is consumed, update the in_checking property
        self.remove_in_progress_check(c)

        # C is a check and someone waits for it
        if c.status == 'waitconsume' and c.depend_on_me != []:
            c.status = 'havetoresolvedep'

        # if finished, the check needs to be set to a zombie state to be removed;
        # it can be changed if necessary before return, like for dependencies
        if c.status == 'waitconsume' and c.depend_on_me == []:
            c.status = 'zombie'

        # Used to know if notif is raised or not
        no_action = False

        # C was waitdep, but now all deps are resolved, so check for deps
        if c.status == 'waitdep':
            if c.depend_on_me != []:
                c.status = 'havetoresolvedep'
            else:
                c.status = 'zombie'
            no_action = self.is_no_action_dependant()
            # We recheck just for network_dep. Maybe we are just unreachable
            # and we need to override the state_id
            self.check_and_set_unreachability()

        # OK following a previous OK. Perfect if we were not in SOFT
        if c.exit_status == 0 and self.last_state in (OK_UP, 'PENDING'):
            #print "Case 1 (OK following a previous OK) : code:%s last_state:%s" % (c.exit_status, self.last_state)
            self.unacknowledge_problem()
            # action in return can be notification or other checks (dependencies)
            if (self.state_type == 'SOFT') and self.last_state != 'PENDING':
                if self.is_max_attempts() and self.state_type == 'SOFT':
                    self.state_type = 'HARD'
                else:
                    self.state_type = 'SOFT'
            else:
                self.state_type = 'HARD'

        # OK following a NON-OK.
        elif c.exit_status == 0 and (self.last_state != OK_UP and self.last_state != 'PENDING'):
            self.unacknowledge_problem()
            #print "Case 2 (OK following a NON-OK) : code:%s last_state:%s" % (c.exit_status, self.last_state)
            if self.state_type == 'SOFT':
                # OK following a NON-OK still in SOFT state
                self.add_attempt()
                self.raise_alert_log_entry()
                # Eventhandler gets OK;SOFT;++attempt, no notification needed
                self.get_event_handlers()
                # Internally it is a hard OK
                self.state_type = 'HARD'
            elif self.state_type == 'HARD':
                # OK following a HARD NON-OK
                self.raise_alert_log_entry()
                # Eventhandler and notifications get OK;HARD;maxattempts
                # Ok, so current notifications are not needed, we 'zombie' them
                self.remove_in_progress_notifications()
                if not no_action:
                    self.create_notifications('RECOVERY')
                self.get_event_handlers()
                # Internally it is a hard OK
                self.state_type = 'HARD'

            # I'm no more a problem if I was one
            self.no_more_a_problem()

        # Volatile part: only for services
        elif c.exit_status != 0 and hasattr(self, 'is_volatile') and self.is_volatile:
            #print "Case 3 (volatile only)"
            # There are no repeated attempts, so the first non-ok result
            # is already a hard one
            self.state_type = 'HARD'
            # status != 0 so add a log entry (before actions that can also raise log;
            # it is smarter to log the error before the notification)
            self.raise_alert_log_entry()
            self.check_for_flexible_downtime()
            self.remove_in_progress_notifications()
            if not no_action:
                self.create_notifications('PROBLEM')
            # Ok, event handlers here too
            self.get_event_handlers()

            # I'm a problem only if I'm the root problem,
            # so not no_action:
            if not no_action:
                self.set_myself_as_problem()

        # NON-OK follows OK. Everything was fine, but now trouble is ahead
        elif c.exit_status != 0 and self.last_state in (OK_UP, 'PENDING'):
            #print "Case 4 : NON-OK follows OK : code:%s last_state:%s" % (c.exit_status, self.last_state)
            if self.is_max_attempts():
                # if max_attempts == 1 we're already in deep trouble
                self.state_type = 'HARD'
                self.raise_alert_log_entry()
                self.remove_in_progress_notifications()
                self.check_for_flexible_downtime()
                if not no_action:
                    self.create_notifications('PROBLEM')
                # Oh? This is the typical go for an event handler :)
                self.get_event_handlers()

                # I'm a problem only if I'm the root problem,
                # so not no_action:
                if not no_action:
                    self.set_myself_as_problem()
            else:
                # This is the first NON-OK result. Initiate the SOFT-sequence.
                # Also launch the event handler, he might fix it.
                self.state_type = 'SOFT'
                self.raise_alert_log_entry()
                self.get_event_handlers()

        # If non-OK in a non-OK: if hard, still hard; if soft,
        # check against self.max_check_attempts.
        # When we go into hard, we send a notification
        elif c.exit_status != 0 and self.last_state != OK_UP:
            #print "Case 5 (no OK in a no OK) : code:%s last_state:%s state_type:%s" % (c.exit_status, self.last_state,self.state_type)
            if self.state_type == 'SOFT':
                self.add_attempt()
                if self.is_max_attempts():
                    # Ok here is when we just go to the hard state
                    self.state_type = 'HARD'
                    self.raise_alert_log_entry()
                    self.remove_in_progress_notifications()
                    # There is a request in the Nagios trac to enter downtimes
                    # on soft states which does make sense. If this becomes
                    # the default behavior, just move the following line
                    # into the else-branch below.
                    self.check_for_flexible_downtime()
                    if not no_action:
                        self.create_notifications('PROBLEM')
                    # So event handlers here too
                    self.get_event_handlers()

                    # I'm a problem only if I'm the root problem,
                    # so not no_action:
                    if not no_action:
                        self.set_myself_as_problem()
                else:
                    self.raise_alert_log_entry()
                    # eventhandler is launched each time during the soft state
                    self.get_event_handlers()
            else:  # state_type == 'HARD'
                # Send notifications whenever the state has changed. (W -> C)
                if self.state != self.last_state:
                    self.unacknowledge_problem_if_not_sticky()
                    self.raise_alert_log_entry()
                    self.remove_in_progress_notifications()
                    if not no_action:
                        self.create_notifications('PROBLEM')

                    # Maybe our new state can raise the problem
                    # when the last one did not.
                    # I'm a problem only if I'm the root problem,
                    # so not no_action:
                    if not no_action:
                        self.set_myself_as_problem()

                elif self.in_scheduled_downtime_during_last_check == True:
                    # during the last check I was in a downtime. but now
                    # the status is still critical and notifications
                    # are possible again. send an alert immediately
                    self.remove_in_progress_notifications()
                    if not no_action:
                        self.create_notifications('PROBLEM')

        # Reset this flag. If it was true, actions were already taken
        self.in_scheduled_downtime_during_last_check = False

        # now is the time to update state_type_id
        # and our last_hard_state
        if self.state_type == 'HARD':
            self.state_type_id = 1
            self.last_hard_state = self.state
            self.last_hard_state_id = self.state_id
        else:
            self.state_type_id = 0

        # update event/problem-counters
        self.update_event_and_problem_id()
        self.broks.append(self.get_check_result_brok())
        self.get_obsessive_compulsive_processor_command()
        self.get_perfdata_command()

        # fill last_hard_state_change with now
        # if we just changed from SOFT->HARD, or
        # if in HARD we changed state (Warning->Critical, or Critical->OK, etc.)
        if self.state_type == 'HARD' and (last_state_type == 'SOFT' or self.last_state != self.state):
            self.last_hard_state_change = int(time.time())

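    # Worked example (illustrative, not from the original source) of the
    # SOFT/HARD sequence driven by consume_result for a service with
    # max_check_attempts = 3 that starts OK and then keeps returning CRITICAL:
    #
    #     check 1: CRITICAL   -> "NON-OK follows OK": not yet max attempts,
    #                            so state_type becomes 'SOFT' (no notification)
    #     next checks: CRITICAL -> "non-OK in a non-OK": add_attempt() until
    #                            is_max_attempts() is True, then state_type
    #                            becomes 'HARD', notifications are created and
    #                            the item registers itself as a problem
    #     later check: OK     -> "OK following a NON-OK" in HARD: a RECOVERY
    #                            notification is created and the item is no
    #                            more a problem
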
    def update_event_and_problem_id(self):
        OK_UP = self.__class__.ok_up  # OK for service, UP for host
        if self.state != self.last_state and self.last_state != 'PENDING' or self.state != OK_UP and self.last_state == 'PENDING':
            SchedulingItem.current_event_id += 1
            self.last_event_id = self.current_event_id
            self.current_event_id = SchedulingItem.current_event_id

        if self.state != OK_UP and self.last_state == 'PENDING':
            # broken ever since i can remember
            SchedulingItem.current_problem_id += 1
            self.last_problem_id = self.current_problem_id
            self.current_problem_id = SchedulingItem.current_problem_id
        elif self.state != OK_UP and self.last_state != OK_UP:
            # State transitions between non-OK states
            # (e.g. WARNING to CRITICAL) do not cause
            # this problem id to increase.
            pass
        elif self.state == OK_UP:
            # If the service is currently in an OK state,
            # this macro will be set to zero (0).
            self.last_problem_id = self.current_problem_id
            self.current_problem_id = 0
        else:
            # Every time a service (or host) transitions from
            # an OK or UP state to a problem state, a global
            # problem ID number is incremented by one (1).
            SchedulingItem.current_problem_id += 1
            self.last_problem_id = self.current_problem_id
            self.current_problem_id = SchedulingItem.current_problem_id

    # Called by the scheduler when a notification is
    # ok to be sent (so fully prepared to be sent
    # to the reactionner). Here we update the command with
    # the status of now, and we add the contact to the set of
    # contacts we notified. And we raise the log entry
    def prepare_notification_for_sending(self, n):
        if n.status == 'inpoller':
            self.update_notification_command(n)
            self.notified_contacts.add(n.contact)
            self.raise_notification_log_entry(n)

    # Just update the notification command by resolving macros.
    # And because we are just launching the notification, we can say
    # that this contact has been notified
    def update_notification_command(self, n):
        m = MacroResolver()
        data = self.get_data_for_notifications(n.contact, n)
        n.command = m.resolve_command(n.command_call, data)

    # See if an escalation is eligible at t and notif nb=n
    def is_escalable(self, n):
        cls = self.__class__

        # We search since when we are in notification for escalations
        # that are based on time
        in_notif_time = cls.interval_length * self.first_notification_delay + (n.notif_nb - 1) * self.notification_interval
        print "In notif time orig:", in_notif_time
        in_notif_time = time.time() - n.creation_time
        print "In notif time mod:", in_notif_time

        # Check if an escalation matches the current_notification_number
        for es in self.escalations:
            if es.is_eligible(n.t_to_go, self.state, n.notif_nb, in_notif_time, cls.interval_length):
                return True
        return False

    # Give for a notification the next notification time,
    # by taking the standard notification_interval or asking
    # our escalations if one of them needs a smaller value to escalate
    def get_next_notification_time(self, n):
        now = time.time()
        cls = self.__class__

        # Get the standard time like if we got no escalations
        std_time = n.t_to_go + self.notification_interval * cls.interval_length

        # standard time is a good one
        res = std_time

        creation_time = n.creation_time
        in_notif_time = now - n.creation_time

        for es in self.escalations:
            r = es.get_next_notif_time(std_time, self.state, creation_time, cls.interval_length)
            # If we got a real result (time based escalation), we add it
            if r is not None:
                res = min(res, r)

        # And we take the minimum of this result. Can be standard or escalation asked
        return res

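    # Example (illustrative, not from the original source): with
    # notification_interval = 30 and interval_length = 60, the standard
    # re-notification time is n.t_to_go + 1800 seconds; if an escalation
    # answers with an earlier time (say n.t_to_go + 600), that smaller value
    # wins, so the escalated contacts are notified sooner.
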
    # Get all contacts (uniq) from eligible escalations
    def get_escalable_contacts(self, n):
        cls = self.__class__

        # We search since when we are in notification for escalations
        # that are based on this time
        in_notif_time = time.time() - n.creation_time

        contacts = set()
        for es in self.escalations:
            if es.is_eligible(n.t_to_go, self.state, n.notif_nb, in_notif_time, cls.interval_length):
                contacts.update(es.contacts)
        return list(contacts)

    # Create a "master" notification here, which will later
    # (immediately before the reactionner gets it) be split up
    # in many "child" notifications, one for each contact.
    def create_notifications(self, type, t_wished=None):
        cls = self.__class__
        # t_wished==None for the first notification launch after consume;
        # here we must look at the self.notification_period
        if t_wished == None:
            now = time.time()
            t_wished = now
            # if first notification, we must add first_notification_delay
            if self.current_notification_number == 0 and type == 'PROBLEM':
                last_time_non_ok_or_up = self.last_time_non_ok_or_up()
                if last_time_non_ok_or_up == 0:
                    # this happens at initial
                    t_wished = now + self.first_notification_delay * cls.interval_length
                else:
                    t_wished = last_time_non_ok_or_up + self.first_notification_delay * cls.interval_length
            t = self.notification_period.get_next_valid_time_from_t(t_wished)
        else:
            # We follow our order
            t = t_wished

        if self.notification_is_blocked_by_item(type, t_wished) and self.first_notification_delay == 0 and self.notification_interval == 0:
            # If notifications are blocked on the host/service level somehow
            # and repeated notifications are not configured,
            # we can silently drop this one
            return

        if type == 'PROBLEM':
            # Create the notification with an incremented notification_number.
            # The current_notification_number of the item itself will only
            # be incremented when this notification (or its children)
            # have actually been sent.
            next_notif_nb = self.current_notification_number + 1
        elif type == 'RECOVERY':
            # Recovery resets the notification counter to zero
            self.current_notification_number = 0
            next_notif_nb = self.current_notification_number
        else:
            # downtime/flap/etc do not change the notification number
            next_notif_nb = self.current_notification_number

        n = Notification(type, 'scheduled', 'VOID', None, self, None, t,
                         timeout=cls.notification_timeout,
                         notif_nb=next_notif_nb)

        # Keep a trace in our notifications queue
        self.notifications_in_progress[n.id] = n
        # and put it in the temp queue for the scheduler
        self.actions.append(n)

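    # Example (illustrative, not from the original source): a PROBLEM master
    # notification is created with notif_nb = current_notification_number + 1,
    # so the first problem notification of an item carries number 1; a RECOVERY
    # resets the counter, so the recovery notification carries number 0. The
    # counter on the item itself only moves once the notification is really sent.
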
    # In create_notifications we created a notification "template". When it's
    # time to hand it over to the reactionner, this master notification needs
    # to be split in several child notifications, one for each contact.
    # To be more exact, one for each contact who is willing to accept
    # notifications of this type and at this time
    def scatter_notification(self, n):
        cls = self.__class__
        childnotifications = []

        # only master notifications can be split up
        if n.type == 'RECOVERY':
            if self.first_notification_delay != 0 and len(self.notified_contacts) == 0:
                # Recovered during first_notification_delay. No notifications
                # have been sent yet, so we keep quiet
                contacts = []
            else:
                # The old way. Only send recover notifications to those contacts
                # who also got problem notifications
                contacts = list(self.notified_contacts)
            self.notified_contacts.clear()
        else:
            # Check if an escalation matches. If yes, get all contacts from escalations
            if self.is_escalable(n):
                contacts = self.get_escalable_contacts(n)
            # else take normal contacts
            else:
                contacts = self.contacts

        print "Finally raise for contacts"
        for contact in contacts:
            print contact.get_name()

        for contact in contacts:
            # Get the property name for notif commands, like
            # service_notification_commands for service
            notif_commands = contact.get_notification_commands(cls.my_type)

            for cmd in notif_commands:
                child_n = Notification(n.type, 'scheduled', 'VOID', cmd, self,
                                       contact, n.t_to_go, timeout=cls.notification_timeout,
                                       notif_nb=n.notif_nb)
                if not self.notification_is_blocked_by_contact(child_n, contact):
                    # Update the notification with fresh status information
                    # of the item. Example: during the notification_delay
                    # the status of a service may have changed from WARNING to CRITICAL
                    self.update_notification_command(child_n)
                    self.raise_notification_log_entry(child_n)
                    self.notifications_in_progress[child_n.id] = child_n
                    childnotifications.append(child_n)

                    if n.type == 'PROBLEM':
                        # Remember the contacts. We might need them later in the
                        # recovery code some lines above
                        self.notified_contacts.add(contact)

        return childnotifications

    # Return a check to check the host/service,
    # and return the id of the check
    def launch_check(self, t, ref_check=None, force=False):
        c = None
        cls = self.__class__

        # if I'm already in checking, why launch a new check?
        # If ref_check_id is not None, this is a dependency check;
        # if None, it might be a forced check, so OK, I do a new one
        if not force and (self.in_checking and ref_check != None):
            c_in_progress = self.checks_in_progress[0]  # 0 is OK because in_checking is True
            if c_in_progress.t_to_go > time.time():  # Very far?
                c_in_progress.t_to_go = time.time()  # No, I want a check right NOW
            c_in_progress.depend_on_me.append(ref_check)
            return c_in_progress.id

        if force or (not self.is_no_check_dependant()):
            # Get the command to launch
            m = MacroResolver()
            data = self.get_data_for_checks()
            command_line = m.resolve_command(self.check_command, data)

            # By default env is void
            env = {}

            # And get all environment variables only if needed
            if not cls.use_large_installation_tweaks and cls.enable_environment_macros:
                env = m.get_env_macros(data)

            # Make the Check object and put the service in checking
            # Make the check inherit poller_tag from the command
            c = Check('scheduled', command_line, self, t, ref_check,
                      timeout=cls.check_timeout,
                      poller_tag=self.check_command.poller_tag, env=env)

            # We keep a trace of all checks in progress
            # to know if we are in checking or not
            self.checks_in_progress.append(c)
            self.update_in_checking()

        # We need to put this new check in our actions queue
        # so the scheduler can take it
        if c is not None:
            self.actions.append(c)
            return c.id
        # None means I already took it into account
        return None

    # Get the perfdata command with macros resolved for this item
    def get_perfdata_command(self):
        cls = self.__class__
        if not cls.process_performance_data or not self.process_perf_data:
            return

        if cls.perfdata_command != None:
            m = MacroResolver()
            data = self.get_data_for_event_handler()
            cmd = m.resolve_command(cls.perfdata_command, data)
            e = EventHandler(cmd, timeout=cls.perfdata_timeout)

            # ok we can put it in our temp action queue
            self.actions.append(e)

    # Create the whole business rule tree
    # if we need it
    def create_business_rules(self, hosts, services):
        cmdCall = getattr(self, 'check_command', None)

        # If we do not have a command, we bail out
        if cmdCall is None:
            return

        # we get our base command, like
        # check_tcp!80 -> check_tcp
        cmd = cmdCall.call
        elts = cmd.split('!')
        base_cmd = elts[0]

        # If it's bp_rule, we got a rule :)
        if base_cmd == 'bp_rule':
            print "Got rule", elts, cmd
            self.got_business_rule = True
            rule = elts[1]
            print "Got rules", rule
            fact = DependencyNodeFactory()
            node = fact.eval_cor_patern(rule, hosts, services)
            print "got node", node
            self.business_rule = node

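    # Example (illustrative, not from the original source): a service whose
    # check_command is defined as
    #
    #     check_command    bp_rule!srv1,Http & srv2,Http
    #
    # is split on '!' into ['bp_rule', 'srv1,Http & srv2,Http']; the factory
    # then builds a dependency tree whose state is the AND of both services.
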
    # We are asked to manage our own internal check,
    # like a business rule based one
    def manage_internal_check(self, c):
        #print "DBG, ask me to manage a check!"
        if c.command.startswith('bp_'):
            state = self.business_rule.get_state()
        elif c.command == '_internal_host_up':
            state = 0
            c.execution_time = 0
            c.check_time = time.time()
            c.output = 'Host assumed to be UP'
            c.long_output = c.output
        c.exit_status = state
        #print "DBG, setting state", state

    # If I'm a business rule service/host, I register myself to the
    # elements I will depend on, so they will have ME as an impact
    def create_business_rules_dependencies(self):
        if self.got_business_rule:
            #print "DBG: ask me to register me in my dependencies", self.get_name()
            elts = self.business_rule.list_all_elements()
            # I will register myself in these
            for e in elts:
                #print "I register to the element", e.get_name()
                # all states, every timeperiod, and inherit parents
                e.add_business_rule_act_dependancy(self, ['d', 'u', 's', 'f', 'c', 'w'], None, True)