shinken/schedulingitem.py

   1 #!/usr/bin/env python
   2 # Copyright (C) 2009-2010 :
   3 #     Gabes Jean, naparuba@gmail.com
   4 #     Gerhard Lausser, Gerhard.Lausser@consol.de
   5 #
   6 # This file is part of Shinken.
   7 #
   8 # Shinken is free software: you can redistribute it and/or modify
   9 # it under the terms of the GNU Affero General Public License as published by
  10 # the Free Software Foundation, either version 3 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # Shinken is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU Affero General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU Affero General Public License
  19 # along with Shinken.  If not, see <http://www.gnu.org/licenses/>.
  20
  21 """ This class is a common one for service/host. Here you
  22 will find all scheduling related functions, like the schedule
  23 or the consume_check ones. It's a quite important class!
  24 """
  25
  26
  27 import random
  28 import time
  29
  30 from shinken.item import Item
  31 from shinken.check import Check
  32 from shinken.notification import Notification
  33 from shinken.macroresolver import MacroResolver
  34 from shinken.eventhandler import EventHandler
  35 from shinken.dependencynode import DependencyNodeFactory, DependencyNode
  36
  37 class SchedulingItem(Item):
  38
  39     # global counters used for [current|last]_[host|service]_[event|problem]_id
  40     current_event_id = 0
  41     current_problem_id = 0
  42
  43     # Add a flapping change, but no more than 20 states
  44     # Then update the self.is_flapping bool by calling update_flapping
  45     def add_flapping_change(self, b):
  46         self.flapping_changes.append(b)
  47
  48         # Keep just 20 changes (global flap_history value)
  49         flap_history = self.__class__.flap_history
  50
  51         if len(self.flapping_changes) > flap_history:
  52             self.flapping_changes.pop(0)
  53         # Now we add a value, we update the is_flapping prop
  54         self.update_flapping()
  55
  56
  57     # We update the is_flapping prop with value in self.flapping_states
  58     # Old values have less weight than new ones
  59     def update_flapping(self):
  60         flap_history = self.__class__.flap_history
  61         # We compute the flapping change in %
  62         r = 0.0
  63         i = 0
  64         for b in self.flapping_changes:
  65             i += 1
  66             if b:
  67                 r += i*(1.2-0.8)/flap_history + 0.8
  68         r = r / flap_history
  69
  70         # Now we get the low_flap_threshold and high_flap_threshold values
  71         # They can be from self, or class
  72         (low_flap_threshold, high_flap_threshold) = (self.low_flap_threshold, self.high_flap_threshold)
  73         if low_flap_threshold == -1:
  74             cls = self.__class__
  75             low_flap_threshold = cls.low_flap_threshold
  76         if high_flap_threshold  == -1:
  77             cls = self.__class__
  78             high_flap_threshold = cls.high_flap_threshold
  79
  80         # Now we check is flapping change
  81         if self.is_flapping and r < low_flap_threshold:
  82             self.is_flapping = False
  83             #We also raise a log entry
  84             self.raise_flapping_stop_log_entry(r, low_flap_threshold)
  85         if not self.is_flapping and r >= high_flap_threshold:
  86             self.is_flapping = True
  87             # We also raise a log entry
  88             self.raise_flapping_start_log_entry(r, high_flap_threshold)
  89         self.percent_state_change = r
  90
  91
  92     # Add an attempt but cannot be more than max_check_attempts
  93     def add_attempt(self):
  94         self.attempt += 1
  95         self.attempt = min(self.attempt, self.max_check_attempts)
  96
  97
  98     # Return True if attempt is at max
  99     def is_max_attempts(self):
 100         return self.attempt >= self.max_check_attempts
 101
 102
 103     # Call by scheduler to see if last state is older than
 104     # freshness_threshold if check_freshness, then raise a check
 105     # even if active check is disabled
 106     def do_check_freshness(self):
 107         now = time.time()
 108         # Before, check if class (host or service) have check_freshness OK
 109         # Then check if item whant fressness, then check fressness
 110         cls = self.__class__
 111         if not self.in_checking:
 112             if cls.check_freshness:
 113                 if self.check_freshness and self.freshness_threshold != 0:
 114                     if self.last_state_update < now - (self.freshness_threshold + cls.additional_freshness_latency):
 115                         # Raise a log
 116                         self.raise_freshness_log_entry(int(now-self.last_state_update), int(now-self.freshness_threshold))
 117                         # And a new check
 118                         return self.launch_check(now)
 119         return None
 120
 121
 122     # Raise all impact from my error. I'm setting myself
 123     # as a problem, and I register myself as this in all
 124     # hosts/services that depend_on_me. So they are now my
 125     # impacts
 126     def set_myself_as_problem(self):
 127         now = time.time()
 128
 129         self.is_problem = True
 130         # we should warn potentials impact of our problem
 131         # and they should be cool to register them so I've got
 132         # my impacts list
 133         for (impact, status, dep_type, tp, inh_par) in self.act_depend_of_me:
 134             # Check if the status is ok for impact
 135             for s in status:
 136                 if self.is_state(s):
 137                     # now check if we should bailout because of a
 138                     # not good timeperiod for dep
 139                     if tp is None or tp.is_time_valid(now):
 140                         new_impacts = impact.register_a_problem(self)
 141                         self.impacts.extend(new_impacts)
 142                         # Make element unique in this list
 143                         self.impacts = list(set(self.impacts))
 144
 145         # We can update our criticity value now
 146         self.update_criticity_value()
 147
 148         # And we register a new broks for update status
 149         b = self.get_update_status_brok()
 150         self.broks.append(b)
 151
 152
 153     # We update our 'criticity' value with the max of
 154     # the impacts criticy if we got impacts. And save our 'configuration'
 155     # criticity if we do not have do it before
 156     # If we do not have impacts, we revert our value
 157     def update_criticity_value(self):
 158         # First save our criticity if not already do
 159         if self.my_own_criticity == -1:
 160             self.my_own_criticity = self.criticity
 161
 162         # If we trully have impacts, we get the max criticity
 163         # if it's huge than ourselve
 164         if len(self.impacts) != 0:
 165             self.criticity = max(self.criticity, max([e.criticity for e in self.impacts]))
 166         elif self.my_own_criticity != -1:
 167             self.criticity = self.my_own_criticity
 168
 169
 170     # Look for my impacts, and remove me from theirs problems list
 171     def no_more_a_problem(self):
 172         if self.is_problem:
 173             self.is_problem = False
 174
 175             # we warn impacts that we are no more a problem
 176             for impact in self.impacts:
 177                 impact.deregister_a_problem(self)
 178
 179             # we can just drop our impacts list
 180             self.impacts = []
 181
 182             # And we register a new broks for update status
 183             b = self.get_update_status_brok()
 184             self.broks.append(b)
 185
 186         # We update our criticy value, it's not a huge thing :)
 187         self.update_criticity_value()
 188
 189
 190     # call recursively by potentials impacts so they
 191     # update their source_problems list. But do not
 192     # go below if the problem is not a real one for me
 193     # like If I've got multiple parents for examples
 194     def register_a_problem(self, pb):
 195         now = time.time()
 196         was_an_impact = self.is_impact
 197         # Our father already look of he impacts us. So if we are here,
 198         # it's that we really are impacted
 199         self.is_impact = True
 200
 201         impacts = []
 202         # Ok, if we are impacted, we can add it in our
 203         # problem list
 204         # TODO : remove this unused check
 205         if self.is_impact:
 206             # Maybe I was a problem myself, now I can say : not my fault!
 207             if self.is_problem:
 208                 self.no_more_a_problem()
 209
 210             # Ok, we are now an impact, we should take the good state
 211             # but only when we just go in impact state
 212             if not was_an_impact:
 213                 self.set_impact_state()
 214
 215             # Ok now we can be a simple impact
 216             impacts.append(self)
 217             if pb not in self.source_problems:
 218                 self.source_problems.append(pb)
 219             # we should send this problem to all potential impact that
 220             # depend on us
 221             for (impact, status, dep_type, tp, inh_par) in self.act_depend_of_me:
 222                 # Check if the status is ok for impact
 223                 for s in status:
 224                     if self.is_state(s):
 225                         # now check if we should bailout because of a
 226                         # not good timeperiod for dep
 227                         if tp is None or tp.is_time_valid(now):
 228                             new_impacts = impact.register_a_problem(pb)
 229                             impacts.extend(new_impacts)
 230
 231             # And we register a new broks for update status
 232             b = self.get_update_status_brok()
 233             self.broks.append(b)
 234
 235         # now we return all impacts (can be void of course)
 236         return impacts
 237
 238
 239     # Just remove the problem from our problems list
 240     # and check if we are still 'impacted'. It's not recursif because problem
 241     # got the lsit of all its impacts
 242     def deregister_a_problem(self, pb):
 243         self.source_problems.remove(pb)
 244
 245         # For know if we are still an impact, maybe our dependancies
 246         # are not aware of teh remove of the impact state because it's not ordered
 247         # so we can just look at if we still have some problem in our list
 248         if len(self.source_problems) == 0:
 249             self.is_impact = False
 250             # No more an impact, we can unset the impact state
 251             self.unset_impact_state()
 252
 253         # And we register a new broks for update status
 254         b = self.get_update_status_brok()
 255         self.broks.append(b)
 256
 257
 258     # When all dep are resolved, this function say if
 259     # action can be raise or not by viewing dep status
 260     # network_dep have to be all raise to be no action
 261     # logic_dep : just one is enouth
 262     def is_no_action_dependant(self):
 263         # Use to know if notif is raise or not
 264         # no_action = False
 265         parent_is_down = []
 266         # So if one logic is Raise, is dep
 267         # is one network is no ok, is not dep
 268         # at the end, raise no dep
 269         for (dep, status, type, tp, inh_par) in self.act_depend_of:
 270             # For logic_dep, only one state raise put no action
 271             if type == 'logic_dep':
 272                 for s in status:
 273                     if dep.is_state(s):
 274                         return True
 275             # more complicated: if none of the states are match, the host is down
 276             # so -> network_dep
 277             else:
 278                 p_is_down = False
 279                 dep_match = [dep.is_state(s) for s in status]
 280                 #check if the parent match a case, so he is down
 281                 if True in dep_match:
 282                     p_is_down = True
 283                 parent_is_down.append(p_is_down)
 284         # if a parent is not down, no dep can explain the pb
 285         if False in parent_is_down:
 286             return False
 287         else:# every parents are dead, so... It's not my fault :)
 288             return True
 289
 290
 291     # We check if we are no action just because of ours parents (or host for
 292     # service)
 293     # TODO : factorize with previous check?
 294     def check_and_set_unreachability(self):
 295         parent_is_down = []
 296         # We must have all parents raised to be unreachable
 297         for (dep, status, type, tp, inh_par) in self.act_depend_of:
 298             # For logic_dep, only one state raise put no action
 299             if type == 'network_dep':
 300                 p_is_down = False
 301                 dep_match = [dep.is_state(s) for s in status]
 302                 if True in dep_match:#the parent match a case, so he is down
 303                     p_is_down = True
 304                 parent_is_down.append(p_is_down)
 305
 306         # if a parent is not down, no dep can explain the pb
 307         if False in parent_is_down:
 308             return
 309         else:# every parents are dead, so... It's not my fault :)
 310             self.set_unreachable()
 311             return
 312
 313
 314     # Use to know if I raise dependency for soneone else (with status)
 315     # If I do not raise dep, maybe my dep raise me. If so, I raise dep.
 316     # So it's a recursive function
 317     def do_i_raise_dependency(self, status, inherit_parents):
 318         # Do I raise dep?
 319         for s in status:
 320             if self.is_state(s):
 321                 return True
 322
 323         # If we do not inherit parent, we have no reason to be blocking
 324         if not inherit_parents:
 325             return False
 326
 327         # Ok, I do not raise dep, but my dep maybe raise me
 328         now = time.time()
 329         for (dep, status, type, tp, inh_parent) in self.chk_depend_of:
 330             if dep.do_i_raise_dependency(status, inh_parent):
 331                 if tp is None or tp.is_time_valid(now):
 332                     return True
 333
 334         # No, I relly do not raise...
 335         return False
 336
 337
 338     # Use to know if my dep force me not to be checked
 339     # So check the chk_depend_of if they raise me
 340     def is_no_check_dependant(self):
 341         now = time.time()
 342         for (dep, status, type, tp, inh_parent) in self.chk_depend_of:
 343             if tp is None or tp.is_time_valid(now):
 344                 if dep.do_i_raise_dependency(status, inh_parent):
 345                     return True
 346         return False
 347
 348
 349     # call by a bad consume check where item see that he have dep
 350     # and maybe he is not in real fault.
 351     def raise_dependancies_check(self, ref_check):
 352         now = time.time()
 353         cls = self.__class__
 354         checks = []
 355         for (dep, status, type, tp, inh_par) in self.act_depend_of:
 356             # If the dep timeperiod is not valid, do notraise the dep,
 357             # None=everytime
 358             if tp is None or tp.is_time_valid(now):
 359                 # if the update is 'fresh', do not raise dep,
 360                 # cached_check_horizon = cached_service_check_horizon for service
 361                 if dep.last_state_update < now - cls.cached_check_horizon:
 362                     i = dep.launch_check(now, ref_check)
 363                     if i != None:
 364                         checks.append(i)
 365 #                else:
 366 #                    print "DBG: **************** The state is FRESH", dep.host_name, time.asctime(time.localtime(dep.last_state_update))
 367         return checks
 368
 369
 370     # Main scheduling function
 371     # If a check is in progress, or active cehck are disabled, do
 372     # not schedule a check.
 373     # The check interval change with HARD state or not:
 374     # SOFT: retry_interval
 375     # HARD: check_interval
 376     # The first scheduling is a little random, so all checks
 377     # are not launch in the same time...
 378     def schedule(self, force=False, force_time=None):
 379         # if last_chk == 0 put in a random way so all checks
 380         # are not in the same time
 381
 382         now = time.time()
 383         # next_chk il already set, do not change
 384         # if self.next_chk >= now or self.in_checking and not force:
 385         if self.in_checking and not force:
 386             return None
 387
 388         cls = self.__class__
 389         # if no active check and no force, no check
 390         if (not self.active_checks_enabled or not cls.execute_checks) and not force:
 391             return None
 392
 393         # If the check_interval is 0, we should not add it
 394         if self.check_interval == 0 and not force:
 395             return None
 396
 397         # If I do not have an check_timeperiod and no force time, i do nothing
 398         if (not hasattr(self, 'check_period') or self.check_period == None and force_time==None):
 399             return None
 400
 401         # Interval change is in a HARD state or not
 402         # If the retry is 0, take the normal value
 403         if self.state_type == 'HARD' or self.retry_interval == 0:
 404             interval = self.check_interval * 60
 405         else: #TODO : if no retry_interval?
 406             interval = self.retry_interval * 60
 407
 408         # The next_chk is pass so we need a new one
 409         # so we got a check_interval
 410         if self.next_chk == 0:
 411             # At the start, we cannot have a interval more than cls.max_check_spread
 412             # is service_max_check_spread or host_max_check_spread in config
 413             interval = min(interval, cls.max_check_spread * 60)
 414             r = interval * (random.random() - 0.5)
 415             time_add = interval/2 + r
 416         else:
 417             time_add = interval
 418
 419         if force_time is None:
 420             self.next_chk = self.check_period.get_next_valid_time_from_t(now + time_add)
 421         else:
 422             self.next_chk = force_time
 423
 424         # If next time is None, do not go
 425         if self.next_chk == None:
 426             # Nagios do not raise it, I'm wondering if we should
 427             # self.raise_no_next_check_log_entry()
 428             return None
 429
 430         # Get the command to launch, and put it in queue
 431         self.launch_check(self.next_chk)
 432
 433
 434     # If we've got a system time change, we need to compensate it
 435     # If modify all past value. For active one like next_chk, it's the current
 436     # checks that will give us the new value
 437     def compensate_system_time_change(self, difference):
 438         # We only need to change some value
 439         need_change = ['last_notification', 'last_state_change', 'last_hard_state_change']
 440         for p in need_change:
 441             val = getattr(self, p) # current value
 442             #Do not go below 1970 :)
 443             val = max(0, val + difference) #diff can be -
 444             setattr(self, p, val)
 445
 446
 447     def remove_in_progress_check(self, c):
 448         # The check is consume, uptade the in_checking propertie
 449         if c in self.checks_in_progress:
 450             self.checks_in_progress.remove(c)
 451         else:
 452             print "Not removing check", c, "for ", self.get_name()
 453         self.update_in_checking()
 454
 455
 456     # Is in checking if and ony if there are still checks no consumed
 457     def update_in_checking(self):
 458         self.in_checking = (len(self.checks_in_progress) != 0)
 459
 460
 461     # Del just a notification that is retured
 462     def remove_in_progress_notification(self, n):
 463         if n.id in self.notifications_in_progress:
 464             n.status = 'zombie'
 465             del self.notifications_in_progress[n.id]
 466
 467
 468     # We do not need ours currents pending notifications,
 469     # so we zombify them and clean our list
 470     def remove_in_progress_notifications(self):
 471         for n in self.notifications_in_progress.values():
 472             self.remove_in_progress_notification(n)
 473
 474
 475     # Get a event handler if item got an event handler
 476     # command. It must be enabled locally and globally
 477     def get_event_handlers(self, externalcmd=False):
 478         cls = self.__class__
 479
 480         # The external command always pass
 481         # if not, only if we enable them (auto launch)
 482         if self.event_handler == None or ((not self.event_handler_enabled or not cls.enable_event_handlers) and not externalcmd):
 483             return
 484
 485         print self.event_handler.__dict__
 486         m = MacroResolver()
 487         data = self.get_data_for_event_handler()
 488         cmd = m.resolve_command(self.event_handler, data)
 489         e = EventHandler(cmd, timeout=cls.event_handler_timeout)
 490         #print "DBG: Event handler call created"
 491         #print "DBG: ",e.__dict__
 492         self.raise_event_handler_log_entry(self.event_handler)
 493
 494         # ok we can put it in our temp action queue
 495         self.actions.append(e)
 496
 497
 498     # Whenever a non-ok hard state is reached, we must check whether this
 499     # host/service has a flexible downtime waiting to be activated
 500     def check_for_flexible_downtime(self):
 501         status_updated = False
 502         for dt in self.downtimes:
 503             # activate flexible downtimes (do not activate triggered downtimes)
 504             if dt.fixed == False and dt.is_in_effect == False and dt.start_time <= self.last_chk and self.state_id != 0 and dt.trigger_id == 0:
 505                 n = dt.enter() # returns downtimestart notifications
 506                 if n is not None:
 507                     self.actions.append(n)
 508                 status_updated = True
 509         if status_updated == True:
 510             self.broks.append(self.get_update_status_brok())
 511
 512
 513     # consume a check return and send action in return
 514     # main function of reaction of checks like raise notifications
 515     # Special case:
 516     # is_flapping : immediate notif when problem
 517     # is_in_scheduled_downtime : no notification
 518     # is_volatile : notif immediatly (service only)
 519     def consume_result(self, c):
 520         OK_UP = self.__class__.ok_up #OK for service, UP for host
 521
 522         # We check for stalking if necessery
 523         # so if check is here
 524         self.manage_stalking(c)
 525
 526         # Latency can be <0 is we get a check from the retention file
 527         # so if <0, set 0
 528         try:
 529             self.latency = max(0, c.check_time - c.t_to_go)
 530         except TypeError:
 531             pass
 532
 533         # Ok, the first check is done
 534         self.has_been_checked = 1
 535
 536         # Now get data from check
 537         self.execution_time = c.execution_time
 538         self.last_chk = int(c.check_time)
 539         self.output = c.output
 540         self.long_output = c.long_output
 541
 542         # Get the perf_data only if we want it in the configuration
 543         if self.__class__.process_performance_data and self.process_perf_data:
 544             self.last_perf_data = self.perf_data
 545             self.perf_data = c.perf_data
 546
 547         # Before set state, module thems
 548         for rm in self.resultmodulations:
 549             if rm != None:
 550                 c.exit_status = rm.module_return(c.exit_status)
 551
 552         # If we got a bad result on a normal check, and we have dep,
 553         # we raise dep checks
 554         # put the actual check in waitdep and we return all new checks
 555         if c.exit_status != 0 and c.status == 'waitconsume' and len(self.act_depend_of) != 0:
 556             c.status = 'waitdep'
 557             # Make sure the check know about his dep
 558             # C is my check, and he wants dependancies
 559             checks_id = self.raise_dependancies_check(c)
 560             for check_id in checks_id:
 561                 # Get checks_id of dep
 562                 c.depend_on.append(check_id)
 563             # Ok, no more need because checks are not
 564             # take by host/service, and not returned
 565
 566         # remember how we was before this check
 567         last_state_type = self.state_type
 568
 569         self.set_state_from_exit_status(c.exit_status)
 570
 571         # we change the state, do whatever we are or not in
 572         # an impact mode, we can put it
 573         self.state_changed_since_impact = True
 574
 575         # The check is consume, uptade the in_checking propertie
 576         self.remove_in_progress_check(c)
 577
 578         # C is a check and someone wait for it
 579         if c.status == 'waitconsume' and c.depend_on_me != []:
 580             c.status = 'havetoresolvedep'
 581
 582         # if finish, check need to be set to a zombie state to be removed
 583         # it can be change if necessery before return, like for dependancies
 584         if c.status == 'waitconsume' and c.depend_on_me == []:
 585             c.status = 'zombie'
 586
 587         # Use to know if notif is raise or not
 588         no_action = False
 589
 590         # C was waitdep, but now all dep are resolved, so check for deps
 591         if c.status == 'waitdep':
 592             if c.depend_on_me != []:
 593                 c.status = 'havetoresolvedep'
 594             else:
 595                 c.status = 'zombie'
 596             # Check deps
 597             no_action = self.is_no_action_dependant()
 598             # We recheck just for network_dep. Maybe we are just unreachable
 599             # and we need to overide the state_id
 600             self.check_and_set_unreachability()
 601
 602         # OK following a previous OK. perfect if we were not in SOFT
 603         if c.exit_status == 0 and self.last_state in (OK_UP, 'PENDING'):
 604             #print "Case 1 (OK following a previous OK) : code:%s last_state:%s" % (c.exit_status, self.last_state)
 605             self.unacknowledge_problem()
 606             # action in return can be notification or other checks (dependancies)
 607             if (self.state_type == 'SOFT') and self.last_state != 'PENDING':
 608                 if self.is_max_attempts() and self.state_type == 'SOFT':
 609                     self.state_type = 'HARD'
 610                 else:
 611                     self.state_type = 'SOFT'
 612             else:
 613                 self.attempt = 1
 614                 self.state_type = 'HARD'
 615
 616         # OK following a NON-OK.
 617         elif c.exit_status == 0 and (self.last_state != OK_UP and self.last_state != 'PENDING'):
 618             self.unacknowledge_problem()
 619             #print "Case 2 (OK following a NON-OK) : code:%s last_state:%s" % (c.exit_status, self.last_state)
 620             if self.state_type == 'SOFT':
 621                 # OK following a NON-OK still in SOFT state
 622                 self.add_attempt()
 623                 self.raise_alert_log_entry()
 624                 # Eventhandler gets OK;SOFT;++attempt, no notification needed
 625                 self.get_event_handlers()
 626                 # Internally it is a hard OK
 627                 self.state_type = 'HARD'
 628                 self.attempt = 1
 629             elif self.state_type == 'HARD':
 630                 # OK following a HARD NON-OK
 631                 self.raise_alert_log_entry()
 632                 # Eventhandler and notifications get OK;HARD;maxattempts
 633                 # Ok, so current notifications are not need, we 'zombie' thems
 634                 self.remove_in_progress_notifications()
 635                 if not no_action:
 636                     self.create_notifications('RECOVERY')
 637                 self.get_event_handlers()
 638                 # Internally it is a hard OK
 639                 self.state_type = 'HARD'
 640                 self.attempt = 1
 641
 642                 # I'm no more a problem if I was one
 643                 self.no_more_a_problem()
 644
 645         # Volatile part
 646         # Only for service
 647         elif c.exit_status != 0 and hasattr(self, 'is_volatile') and self.is_volatile:
 648             #print "Case 3 (volatile only)"
 649             # There are no repeated attempts, so the first non-ok results
 650             # in a hard state
 651             self.attempt = 1
 652             self.state_type = 'HARD'
 653             # status != 0 so add a log entry (before actions that can also raise log
 654             # it is smarter to log error before notification)
 655             self.raise_alert_log_entry()
 656             self.check_for_flexible_downtime()
 657             self.remove_in_progress_notifications()
 658             if not no_action:
 659                 self.create_notifications('PROBLEM')
 660             # Ok, event handlers here too
 661             self.get_event_handlers()
 662
 663             #PROBLEM/IMPACT
 664             # I'm a problem only if I'm the root problem,
 665             # so not no_action:
 666             if not no_action:
 667                 self.set_myself_as_problem()
 668
 669         # NON-OK follows OK. Everything was fine, but now trouble is ahead
 670         elif c.exit_status != 0 and self.last_state in (OK_UP, 'PENDING'):
 671             #print "Case 4 : NON-OK follows OK : code:%s last_state:%s" % (c.exit_status, self.last_state)
 672             if self.is_max_attempts():
 673                 # if max_attempts == 1 we're already in deep trouble
 674                 self.state_type = 'HARD'
 675                 self.raise_alert_log_entry()
 676                 self.remove_in_progress_notifications()
 677                 self.check_for_flexible_downtime()
 678                 if not no_action:
 679                     self.create_notifications('PROBLEM')
 680                 # Oh? This is the typical go for a event handler :)
 681                 self.get_event_handlers()
 682
 683                 # PROBLEM/IMPACT
 684                 # I'm a problem only if I'm the root problem,
 685                 # so not no_action:
 686                 if not no_action:
 687                     self.set_myself_as_problem()
 688
 689             else:
 690                 # This is the first NON-OK result. Initiate the SOFT-sequence
 691                 # Also launch the event handler, he might fix it.
 692                 self.attempt = 1
 693                 self.state_type = 'SOFT'
 694                 self.raise_alert_log_entry()
 695                 self.get_event_handlers()
 696
 697         # If no OK in a no OK : if hard, still hard, if soft,
 698         # check at self.max_check_attempts
 699         # when we go in hard, we send notification
 700         elif c.exit_status != 0 and self.last_state != OK_UP:
 701             #print "Case 5 (no OK in a no OK) : code:%s last_state:%s state_type:%s" % (c.exit_status, self.last_state,self.state_type)
 702             if self.state_type == 'SOFT':
 703                 self.add_attempt()
 704                 if self.is_max_attempts():
 705                     # Ok here is when we just go to the hard state
 706                     self.state_type = 'HARD'
 707                     self.raise_alert_log_entry()
 708                     self.remove_in_progress_notifications()
 709                     # There is a request in the Nagios trac to enter downtimes
 710                     # on soft states which does make sense. If this becomes
 711                     # the default behavior, just move the following line
 712                     # into the else-branch below.
 713                     self.check_for_flexible_downtime()
 714                     if not no_action:
 715                         self.create_notifications('PROBLEM')
 716                     # So event handlers here too
 717                     self.get_event_handlers()
 718
 719                     # PROBLEM/IMPACT
 720                     # I'm a problem only if I'm the root problem,
 721                     # so not no_action:
 722                     if not no_action:
 723                         self.set_myself_as_problem()
 724
 725                 else:
 726                     self.raise_alert_log_entry()
 727                     # eventhandler is launched each time during the soft state
 728                     self.get_event_handlers()
 729             else:
 730                 # Send notifications whenever the state has changed. (W -> C)
 731                 if self.state != self.last_state:
 732                     self.unacknowledge_problem_if_not_sticky()
 733                     self.raise_alert_log_entry()
 734                     self.remove_in_progress_notifications()
 735                     if not no_action:
 736                         self.create_notifications('PROBLEM')
 737
 738                     # PROBLEM/IMPACT
 739                     # Maybe our new state can raise the problem
 740                     # when the last one was not
 741                     # I'm a problem only if I'm the root problem,
 742                     # so not no_action:
 743                     if not no_action:
 744                         self.set_myself_as_problem()
 745
 746                 elif self.in_scheduled_downtime_during_last_check == True:
 747                     # during the last check i was in a downtime. but now
 748                     # the status is still critical and notifications
 749                     # are possible again. send an alert immediately
 750                     self.remove_in_progress_notifications()
 751                     if not no_action:
 752                         self.create_notifications('PROBLEM')
 753
 754         # Reset this flag. If it was true, actions were already taken
 755         self.in_scheduled_downtime_during_last_check == False
 756
 757         # now is the time to update state_type_id
 758         # and our last_hard_state
 759         if self.state_type == 'HARD':
 760             self.state_type_id = 1
 761             self.last_hard_state = self.state
 762             self.last_hard_state_id = self.state_id
 763         else:
 764             self.state_type_id = 0
 765
 766         # update event/problem-counters
 767         self.update_event_and_problem_id()
 768         self.broks.append(self.get_check_result_brok())
 769         self.get_obsessive_compulsive_processor_command()
 770         self.get_perfdata_command()
 771
 772         # fill last_hard_state_change to now
 773         # if we just change from SOFT->HARD or
 774         # in HARD we change of state (Warning->critical, or critical->ok, etc etc)
 775         if self.state_type == 'HARD' and (last_state_type == 'SOFT' or self.last_state != self.state):
 776             self.last_hard_state_change = int(time.time())
 777
 778
 779
 780     def update_event_and_problem_id(self):
 781         OK_UP = self.__class__.ok_up #OK for service, UP for host
 782         if self.state != self.last_state and self.last_state != 'PENDING' or self.state != OK_UP and self.last_state == 'PENDING':
 783             SchedulingItem.current_event_id += 1
 784             self.last_event_id = self.current_event_id
 785             self.current_event_id = SchedulingItem.current_event_id
 786             # now the problem_id
 787             if self.state != OK_UP and self.last_state == 'PENDING':
 788                 # broken ever since i can remember
 789                 SchedulingItem.current_problem_id += 1
 790                 self.last_problem_id = self.current_problem_id
 791                 self.current_problem_id = SchedulingItem.current_problem_id
 792             elif self.state != OK_UP and self.last_state != OK_UP:
 793                 # State transitions between non-OK states
 794                 # (e.g. WARNING to CRITICAL) do not cause
 795                 # this problem id to increase.
 796                 pass
 797             elif self.state == OK_UP:
 798                 # If the service is currently in an OK state,
 799                 # this macro will be set to zero (0).
 800                 self.last_problem_id = self.current_problem_id
 801                 self.current_problem_id = 0
 802             else:
 803                 # Every time a service (or host) transitions from
 804                 # an OK or UP state to a problem state, a global
 805                 # problem ID number is incremented by one (1).
 806                 SchedulingItem.current_problem_id += 1
 807                 self.last_problem_id = self.current_problem_id
 808                 self.current_problem_id = SchedulingItem.current_problem_id
 809
 810
 811     # Called by scheduler when a notification is
 812     # ok to be send (so fuilly prepared to be send
 813     # to reactionner). Here we update the command with
 814     # status of now, and we add the contact to set of
 815     # contact we notified. And we raise the log entry
 816     def prepare_notification_for_sending(self, n):
 817         if n.status == 'inpoller':
 818             self.update_notification_command(n)
 819             self.notified_contacts.add(n.contact)
 820             self.raise_notification_log_entry(n)
 821
 822
 823     # Just update the notification command by resolving Macros
 824     # And because we are just launching the notification, we can say
 825     # that this contact have been notified
 826     def update_notification_command(self, n):
 827         m = MacroResolver()
 828         data = self.get_data_for_notifications(n.contact, n)
 829         n.command = m.resolve_command(n.command_call, data)
 830
 831
 832
 833     # See if an escalation is eligible at t and notif nb=n
 834     def is_escalable(self, n):#t, n):
 835         cls = self.__class__
 836
 837         # We search since when we are in notification for escalations
 838         # that are based on time
 839         in_notif_time = cls.interval_length * self.first_notification_delay + (n.notif_nb-1) * self.notification_interval
 840         print "In notif time orig:", in_notif_time
 841         in_notif_time = time.time() - n.creation_time
 842         print "In notif time mod:", in_notif_time
 843
 844         # Check is an escalation match the current_notification_number
 845         for es in self.escalations:
 846             if es.is_eligible(n.t_to_go, self.state, n.notif_nb, in_notif_time, cls.interval_length):
 847                 return True
 848         return False
 849
 850
 851     # Give for a notification the next notification time
 852     # by taking the standard notification_interval or ask for
 853     # our escalation if one of them need a smaller value to escalade
 854     def get_next_notification_time(self, n):
 855         t = []
 856         now = time.time()
 857         cls = self.__class__
 858
 859         # Get the standard time like if we got no escalations
 860         std_time = n.t_to_go + self.notification_interval * cls.interval_length
 861
 862         # standard time is a good one
 863         t.append(std_time)
 864
 865         creation_time = n.creation_time
 866         in_notif_time = time.time() - n.creation_time
 867
 868         for es in self.escalations:
 869             r = es.get_next_notif_time(std_time, self.state, creation_time, cls.interval_length)
 870             # If we got a real result (time base escalation), we add it
 871             if r != None:
 872                 t.append(r)
 873
 874         #And we take the minimum of this result. Can be standard or escalation asked
 875         return min(t)
 876
 877
 878     # Get all contacts (uniq) from eligible escalations
 879     def get_escalable_contacts(self,n):
 880         cls = self.__class__
 881
 882         # We search since when we are in notification for escalations
 883         # that are based on this time
 884         in_notif_time = time.time() - n.creation_time
 885
 886         contacts = set()
 887         for es in self.escalations:
 888             if es.is_eligible(n.t_to_go, self.state, n.notif_nb, in_notif_time, cls.interval_length):
 889                 contacts.update(es.contacts)
 890         return list(contacts)
 891
 892
 893     # Create a "master" notification here, which will later
 894     # (immediately before the reactionner gets it) be split up
 895     # in many "child" notifications, one for each contact.
 896     def create_notifications(self, type, t_wished = None):
 897         cls = self.__class__
 898         # t_wished==None for the first notification launch after consume
 899         # here we must look at the self.notification_period
 900         if t_wished == None:
 901             now = time.time()
 902             t_wished = now
 903             # if first notification, we must add first_notification_delay
 904             if self.current_notification_number == 0 and type == 'PROBLEM':
 905                 last_time_non_ok_or_up = self.last_time_non_ok_or_up()
 906                 if last_time_non_ok_or_up == 0:
 907                     # this happens at initial
 908                     t_wished = now + self.first_notification_delay * cls.interval_length
 909                 else:
 910                     t_wished = last_time_non_ok_or_up + self.first_notification_delay * cls.interval_length
 911             t = self.notification_period.get_next_valid_time_from_t(t_wished)
 912         else:
 913             # We follow our order
 914             t = t_wished
 915
 916         if self.notification_is_blocked_by_item(type, t_wished) and self.first_notification_delay == 0 and self.notification_interval == 0:
 917             # If notifications are blocked on the host/service level somehow
 918             # and repeated notifications are not configured,
 919             # we can silently drop this one
 920             return
 921
 922         if type == 'PROBLEM':
 923             # Create the notification with an incremented notification_number.
 924             # The current_notification_number  of the item itself will only
 925             # be incremented when this notification (or its children)
 926             # have actually be sent.
 927             next_notif_nb = self.current_notification_number + 1
 928         elif type == 'RECOVERY':
 929             # Recovery resets the notification counter to zero
 930             self.current_notification_number = 0
 931             next_notif_nb = self.current_notification_number
 932         else:
 933             # downtime/flap/etc do not change the notification number
 934             next_notif_nb = self.current_notification_number
 935         n = Notification(type, 'scheduled', 'VOID', None, self, None, t, \
 936             timeout=cls.notification_timeout, \
 937             notif_nb=next_notif_nb)
 938
 939         # Keep a trace in our notifications queue
 940         self.notifications_in_progress[n.id] = n
 941         # and put it in the temp queue for scheduler
 942         self.actions.append(n)
 943
 944
 945     # In create_notifications we created a notification "template". When it's
 946     # time to hand it over to the reactionner, this master notification needs
 947     # to be split in several child notifications, one for each contact
 948     # To be more exact, one for each contact who is willing to accept
 949     # notifications of this type and at this time
 950     def scatter_notification(self, n):
 951         cls = self.__class__
 952         childnotifications = []
 953
 954         if n.contact:
 955             # only master notifications can be split up
 956             return []
 957         if n.type == 'RECOVERY':
 958             if self.first_notification_delay != 0 and len(self.notified_contacts) == 0:
 959                 # Recovered during first_notification_delay. No notifications
 960                 # have been sent yet, so we keep quiet
 961                 contacts = []
 962             else:
 963                 # The old way. Only send recover notifications to those contacts
 964                 # who also got problem notifications
 965                 contacts = list(self.notified_contacts)
 966             self.notified_contacts.clear()
 967         else:
 968             # Check is an escalation match. If yes, get all contacts from escalations
 969             if self.is_escalable(n):#.t_to_go, n.notif_nb):
 970                 contacts = self.get_escalable_contacts(n)#.t_to_go, n.notif_nb)
 971             # else take normal contacts
 972             else:
 973                 contacts = self.contacts
 974         print "Finally raise for contacts"
 975         for contact in contacts:
 976             print contact.get_name()
 977
 978         for contact in contacts:
 979             # Get the property name for notif commands, like
 980             # service_notification_commands for service
 981             notif_commands = contact.get_notification_commands(cls.my_type)
 982
 983             for cmd in notif_commands:
 984                 child_n = Notification(n.type, 'scheduled', 'VOID', cmd, self,
 985                     contact, n.t_to_go, timeout=cls.notification_timeout,
 986                     notif_nb=n.notif_nb )
 987                 if not self.notification_is_blocked_by_contact(child_n, contact):
 988                     # Update the notification with fresh status information
 989                     # of the item. Example: during the notification_delay
 990                     # the status of a service may have changed from WARNING to CRITICAL
 991                     self.update_notification_command(child_n)
 992                     self.raise_notification_log_entry(child_n)
 993                     self.notifications_in_progress[child_n.id] = child_n
 994                     childnotifications.append(child_n)
 995
 996                     if n.type == 'PROBLEM':
 997                     # Remember the contacts. We might need them later in the
 998                     # recovery code some lines above
 999                         self.notified_contacts.add(contact)
1000
1001         return childnotifications
1002
1003
1004     # return a check to check the host/service
1005     # and return id of the check
1006     def launch_check(self, t, ref_check = None, force=False):
1007         c = None
1008         cls = self.__class__
1009
1010         # if I'm already in checking, Why launch a new check?
1011         # If ref_check_id is not None , this is a dependancy_ check
1012         # If none, it might be a forced check, so OK, I do a new
1013         if not force and (self.in_checking and ref_check != None):
1014             c_in_progress = self.checks_in_progress[0] #0 is OK because in_checking is True
1015             if c_in_progress.t_to_go > time.time(): #Very far?
1016                 c_in_progress.t_to_go = time.time() #No, I want a check right NOW
1017             c_in_progress.depend_on_me.append(ref_check)
1018             return c_in_progress.id
1019
1020         if force or (not self.is_no_check_dependant()):
1021             # Get the command to launch
1022             m = MacroResolver()
1023             data = self.get_data_for_checks()
1024             command_line = m.resolve_command(self.check_command, data)
1025
1026             # By default env is void
1027             env = {}
1028
1029             # And get all environnement varialbe only if need
1030             if not cls.use_large_installation_tweaks and cls.enable_environment_macros:
1031                 env = m.get_env_macros(data)
1032
1033             # Make the Check object and put the service in checking
1034             # Make the check inherit poller_tag from the command
1035             c = Check('scheduled', command_line, self, t, ref_check, \
1036                           timeout=cls.check_timeout, \
1037                           poller_tag=self.check_command.poller_tag, env=env)
1038
1039             # We keep a trace of all checks in progress
1040             # to know if we are in checking_or not
1041             self.checks_in_progress.append(c)
1042         self.update_in_checking()
1043
1044         # We need to put this new check in our actions queue
1045         # so scheduler can take it
1046         if c != None:
1047             self.actions.append(c)
1048             return c.id
1049         # None mean I already take it into account
1050         return None
1051
1052
1053     # Get the perfdata command with macro resolved for this
1054     def get_perfdata_command(self):
1055         cls = self.__class__
1056         if not cls.process_performance_data or not self.process_perf_data:
1057             return
1058
1059         if cls.perfdata_command != None:
1060             m = MacroResolver()
1061             data = self.get_data_for_event_handler()
1062             cmd = m.resolve_command(cls.perfdata_command, data)
1063             e = EventHandler(cmd, timeout=cls.perfdata_timeout)
1064
1065             # ok we can put it in our temp action queue
1066             self.actions.append(e)
1067
1068
1069     # Create the whole business rule tree
1070     # if we need it
1071     def create_business_rules(self, hosts, services):
1072         cmdCall = getattr(self, 'check_command', None)
1073
1074         # If we do not have a command, we bailout
1075         if cmdCall == None:
1076             return
1077
1078         # we get our based command, like
1079         # check_tcp!80 -> check_tcp
1080         cmd = cmdCall.call
1081         elts = cmd.split('!')
1082         base_cmd = elts[0]
1083
1084         # If it's bp_rule, we got a rule :)
1085         if base_cmd == 'bp_rule':
1086             print "Got rule", elts, cmd
1087             self.got_business_rule = True
1088             rule = ''
1089             if len(elts) >= 2:
1090                 rule = elts[1]
1091                 print "Got rules", rule
1092             fact = DependencyNodeFactory()
1093             node = fact.eval_cor_patern(rule, hosts, services)
1094             print "got node", node
1095             self.business_rule = node
1096
1097
1098     # We ask us to manage our own internal check,
1099     # like a business based one
1100     def manage_internal_check(self, c):
1101         #print "DBG, ask me to manage a check!"
1102         if c.command.startswith('bp_'):
1103             state = self.business_rule.get_state()
1104         elif c.command == '_internal_host_up':
1105             state = 0
1106             c.execution_time = 0
1107             c.check_time = time.time()
1108             c.output = 'Host assumed to be UP'
1109             c.long_output = c.output
1110         c.exit_status = state
1111         #print "DBG, setting state", state
1112
1113
1114     # If I'm a business rule service/hsot, I register myself to the
1115     # elements I will depend on, so They will have ME as an impact
1116     def create_business_rules_dependencies(self):
1117         if self.got_business_rule:
1118             #print "DBG: ask me to register me in my dependencies", self.get_name()
1119             elts = self.business_rule.list_all_elements()
1120             # I will register myself in this
1121             for e in elts:
1122                 #print "I register to the element", e.get_name()
1123                 # all states, every timeperiod, and inherit parents
1124                 e.add_business_rule_act_dependancy(self, ['d', 'u', 's', 'f', 'c', 'w'], None, True)